diff --git "a/checkpoint-32000/trainer_state.json" "b/checkpoint-32000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-32000/trainer_state.json" @@ -0,0 +1,224033 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.7919139881285697, + "eval_steps": 500, + "global_step": 32000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 5.599731212901781e-05, + "grad_norm": 1.5208414793014526, + "learning_rate": 5e-09, + "loss": 0.5181, + "step": 1 + }, + { + "epoch": 0.00011199462425803562, + "grad_norm": 1.3712694644927979, + "learning_rate": 1e-08, + "loss": 0.376, + "step": 2 + }, + { + "epoch": 0.0001679919363870534, + "grad_norm": 1.4184719324111938, + "learning_rate": 1.5e-08, + "loss": 0.4098, + "step": 3 + }, + { + "epoch": 0.00022398924851607123, + "grad_norm": 1.4291776418685913, + "learning_rate": 2e-08, + "loss": 0.4771, + "step": 4 + }, + { + "epoch": 0.00027998656064508903, + "grad_norm": 1.239039659500122, + "learning_rate": 2.5000000000000002e-08, + "loss": 0.5122, + "step": 5 + }, + { + "epoch": 0.0003359838727741068, + "grad_norm": 1.3924657106399536, + "learning_rate": 3e-08, + "loss": 0.5332, + "step": 6 + }, + { + "epoch": 0.00039198118490312467, + "grad_norm": 1.0782504081726074, + "learning_rate": 3.5e-08, + "loss": 0.3749, + "step": 7 + }, + { + "epoch": 0.00044797849703214247, + "grad_norm": 1.5758692026138306, + "learning_rate": 4e-08, + "loss": 0.5218, + "step": 8 + }, + { + "epoch": 0.0005039758091611603, + "grad_norm": 1.3339658975601196, + "learning_rate": 4.5e-08, + "loss": 0.5131, + "step": 9 + }, + { + "epoch": 0.0005599731212901781, + "grad_norm": 1.5984278917312622, + "learning_rate": 5.0000000000000004e-08, + "loss": 0.5228, + "step": 10 + }, + { + "epoch": 0.0006159704334191959, + "grad_norm": 1.2421988248825073, + "learning_rate": 5.5e-08, + "loss": 0.3985, + "step": 11 + }, + { + "epoch": 
0.0006719677455482136, + "grad_norm": 1.3071264028549194, + "learning_rate": 6e-08, + "loss": 0.4958, + "step": 12 + }, + { + "epoch": 0.0007279650576772315, + "grad_norm": 1.7665575742721558, + "learning_rate": 6.5e-08, + "loss": 0.5927, + "step": 13 + }, + { + "epoch": 0.0007839623698062493, + "grad_norm": 1.494536280632019, + "learning_rate": 7e-08, + "loss": 0.4472, + "step": 14 + }, + { + "epoch": 0.0008399596819352671, + "grad_norm": 1.1757087707519531, + "learning_rate": 7.500000000000001e-08, + "loss": 0.4116, + "step": 15 + }, + { + "epoch": 0.0008959569940642849, + "grad_norm": 1.6829992532730103, + "learning_rate": 8e-08, + "loss": 0.5333, + "step": 16 + }, + { + "epoch": 0.0009519543061933027, + "grad_norm": 1.836796522140503, + "learning_rate": 8.5e-08, + "loss": 0.4205, + "step": 17 + }, + { + "epoch": 0.0010079516183223206, + "grad_norm": 1.6840550899505615, + "learning_rate": 9e-08, + "loss": 0.4523, + "step": 18 + }, + { + "epoch": 0.0010639489304513383, + "grad_norm": 1.5171109437942505, + "learning_rate": 9.5e-08, + "loss": 0.5163, + "step": 19 + }, + { + "epoch": 0.0011199462425803561, + "grad_norm": 1.4950929880142212, + "learning_rate": 1.0000000000000001e-07, + "loss": 0.4597, + "step": 20 + }, + { + "epoch": 0.001175943554709374, + "grad_norm": 1.510266900062561, + "learning_rate": 1.05e-07, + "loss": 0.4991, + "step": 21 + }, + { + "epoch": 0.0012319408668383918, + "grad_norm": 1.2690305709838867, + "learning_rate": 1.1e-07, + "loss": 0.4515, + "step": 22 + }, + { + "epoch": 0.0012879381789674097, + "grad_norm": 1.4038795232772827, + "learning_rate": 1.15e-07, + "loss": 0.4929, + "step": 23 + }, + { + "epoch": 0.0013439354910964273, + "grad_norm": 1.4907742738723755, + "learning_rate": 1.2e-07, + "loss": 0.766, + "step": 24 + }, + { + "epoch": 0.0013999328032254451, + "grad_norm": 1.5836879014968872, + "learning_rate": 1.2500000000000002e-07, + "loss": 0.5249, + "step": 25 + }, + { + "epoch": 0.001455930115354463, + "grad_norm": 
2.3851399421691895, + "learning_rate": 1.3e-07, + "loss": 0.5666, + "step": 26 + }, + { + "epoch": 0.0015119274274834808, + "grad_norm": 1.2829978466033936, + "learning_rate": 1.35e-07, + "loss": 0.4299, + "step": 27 + }, + { + "epoch": 0.0015679247396124987, + "grad_norm": 1.6352183818817139, + "learning_rate": 1.4e-07, + "loss": 0.5148, + "step": 28 + }, + { + "epoch": 0.0016239220517415163, + "grad_norm": 1.3571499586105347, + "learning_rate": 1.45e-07, + "loss": 0.4276, + "step": 29 + }, + { + "epoch": 0.0016799193638705342, + "grad_norm": 1.4235525131225586, + "learning_rate": 1.5000000000000002e-07, + "loss": 0.6501, + "step": 30 + }, + { + "epoch": 0.001735916675999552, + "grad_norm": 1.5542964935302734, + "learning_rate": 1.55e-07, + "loss": 0.504, + "step": 31 + }, + { + "epoch": 0.0017919139881285699, + "grad_norm": 1.2780183553695679, + "learning_rate": 1.6e-07, + "loss": 0.4262, + "step": 32 + }, + { + "epoch": 0.0018479113002575877, + "grad_norm": 1.2457411289215088, + "learning_rate": 1.65e-07, + "loss": 0.4542, + "step": 33 + }, + { + "epoch": 0.0019039086123866053, + "grad_norm": 1.409238576889038, + "learning_rate": 1.7e-07, + "loss": 0.5514, + "step": 34 + }, + { + "epoch": 0.0019599059245156234, + "grad_norm": 1.564186453819275, + "learning_rate": 1.7500000000000002e-07, + "loss": 0.4696, + "step": 35 + }, + { + "epoch": 0.0020159032366446413, + "grad_norm": 1.1453927755355835, + "learning_rate": 1.8e-07, + "loss": 0.4193, + "step": 36 + }, + { + "epoch": 0.0020719005487736587, + "grad_norm": 1.3826923370361328, + "learning_rate": 1.8500000000000003e-07, + "loss": 0.457, + "step": 37 + }, + { + "epoch": 0.0021278978609026765, + "grad_norm": 1.4963278770446777, + "learning_rate": 1.9e-07, + "loss": 0.6355, + "step": 38 + }, + { + "epoch": 0.0021838951730316944, + "grad_norm": 2.089097261428833, + "learning_rate": 1.95e-07, + "loss": 0.4094, + "step": 39 + }, + { + "epoch": 0.0022398924851607122, + "grad_norm": 1.8463817834854126, + 
"learning_rate": 2.0000000000000002e-07, + "loss": 0.5716, + "step": 40 + }, + { + "epoch": 0.00229588979728973, + "grad_norm": 1.464522123336792, + "learning_rate": 2.0500000000000002e-07, + "loss": 0.519, + "step": 41 + }, + { + "epoch": 0.002351887109418748, + "grad_norm": 1.4796329736709595, + "learning_rate": 2.1e-07, + "loss": 0.5112, + "step": 42 + }, + { + "epoch": 0.0024078844215477658, + "grad_norm": 1.595494270324707, + "learning_rate": 2.15e-07, + "loss": 0.6506, + "step": 43 + }, + { + "epoch": 0.0024638817336767836, + "grad_norm": 1.8938987255096436, + "learning_rate": 2.2e-07, + "loss": 0.4963, + "step": 44 + }, + { + "epoch": 0.0025198790458058015, + "grad_norm": 1.4932397603988647, + "learning_rate": 2.25e-07, + "loss": 0.4818, + "step": 45 + }, + { + "epoch": 0.0025758763579348193, + "grad_norm": 1.2402743101119995, + "learning_rate": 2.3e-07, + "loss": 0.4129, + "step": 46 + }, + { + "epoch": 0.0026318736700638367, + "grad_norm": 1.3202379941940308, + "learning_rate": 2.3500000000000003e-07, + "loss": 0.4353, + "step": 47 + }, + { + "epoch": 0.0026878709821928546, + "grad_norm": 1.5665568113327026, + "learning_rate": 2.4e-07, + "loss": 0.6673, + "step": 48 + }, + { + "epoch": 0.0027438682943218724, + "grad_norm": 1.557573676109314, + "learning_rate": 2.45e-07, + "loss": 0.4768, + "step": 49 + }, + { + "epoch": 0.0027998656064508903, + "grad_norm": 1.5255318880081177, + "learning_rate": 2.5000000000000004e-07, + "loss": 0.528, + "step": 50 + }, + { + "epoch": 0.002855862918579908, + "grad_norm": 1.3243638277053833, + "learning_rate": 2.5500000000000005e-07, + "loss": 0.5488, + "step": 51 + }, + { + "epoch": 0.002911860230708926, + "grad_norm": 1.2211617231369019, + "learning_rate": 2.6e-07, + "loss": 0.44, + "step": 52 + }, + { + "epoch": 0.002967857542837944, + "grad_norm": 2.0557689666748047, + "learning_rate": 2.65e-07, + "loss": 0.5321, + "step": 53 + }, + { + "epoch": 0.0030238548549669617, + "grad_norm": 1.4214329719543457, + 
"learning_rate": 2.7e-07, + "loss": 0.3911, + "step": 54 + }, + { + "epoch": 0.0030798521670959795, + "grad_norm": 1.35947847366333, + "learning_rate": 2.75e-07, + "loss": 0.4427, + "step": 55 + }, + { + "epoch": 0.0031358494792249974, + "grad_norm": 1.4284133911132812, + "learning_rate": 2.8e-07, + "loss": 0.4277, + "step": 56 + }, + { + "epoch": 0.0031918467913540152, + "grad_norm": 1.4069606065750122, + "learning_rate": 2.85e-07, + "loss": 0.4372, + "step": 57 + }, + { + "epoch": 0.0032478441034830326, + "grad_norm": 2.8065545558929443, + "learning_rate": 2.9e-07, + "loss": 0.4607, + "step": 58 + }, + { + "epoch": 0.0033038414156120505, + "grad_norm": 1.6844050884246826, + "learning_rate": 2.95e-07, + "loss": 0.6309, + "step": 59 + }, + { + "epoch": 0.0033598387277410683, + "grad_norm": 1.4104372262954712, + "learning_rate": 3.0000000000000004e-07, + "loss": 0.4987, + "step": 60 + }, + { + "epoch": 0.003415836039870086, + "grad_norm": 1.5404714345932007, + "learning_rate": 3.0500000000000004e-07, + "loss": 0.5942, + "step": 61 + }, + { + "epoch": 0.003471833351999104, + "grad_norm": 1.5271331071853638, + "learning_rate": 3.1e-07, + "loss": 0.4563, + "step": 62 + }, + { + "epoch": 0.003527830664128122, + "grad_norm": 1.4416390657424927, + "learning_rate": 3.15e-07, + "loss": 0.5131, + "step": 63 + }, + { + "epoch": 0.0035838279762571397, + "grad_norm": 1.416885256767273, + "learning_rate": 3.2e-07, + "loss": 0.4983, + "step": 64 + }, + { + "epoch": 0.0036398252883861576, + "grad_norm": 1.6866464614868164, + "learning_rate": 3.25e-07, + "loss": 0.4553, + "step": 65 + }, + { + "epoch": 0.0036958226005151754, + "grad_norm": 1.4955333471298218, + "learning_rate": 3.3e-07, + "loss": 0.4709, + "step": 66 + }, + { + "epoch": 0.0037518199126441933, + "grad_norm": 1.3900035619735718, + "learning_rate": 3.35e-07, + "loss": 0.5626, + "step": 67 + }, + { + "epoch": 0.0038078172247732107, + "grad_norm": 1.3770145177841187, + "learning_rate": 3.4e-07, + "loss": 0.5259, + 
"step": 68 + }, + { + "epoch": 0.0038638145369022285, + "grad_norm": 1.2875261306762695, + "learning_rate": 3.4500000000000003e-07, + "loss": 0.4388, + "step": 69 + }, + { + "epoch": 0.003919811849031247, + "grad_norm": 1.3735551834106445, + "learning_rate": 3.5000000000000004e-07, + "loss": 0.5785, + "step": 70 + }, + { + "epoch": 0.003975809161160264, + "grad_norm": 1.9509772062301636, + "learning_rate": 3.5500000000000004e-07, + "loss": 0.5238, + "step": 71 + }, + { + "epoch": 0.0040318064732892825, + "grad_norm": 1.8565374612808228, + "learning_rate": 3.6e-07, + "loss": 0.7029, + "step": 72 + }, + { + "epoch": 0.0040878037854183, + "grad_norm": 1.4239002466201782, + "learning_rate": 3.65e-07, + "loss": 0.4697, + "step": 73 + }, + { + "epoch": 0.004143801097547317, + "grad_norm": 1.467603087425232, + "learning_rate": 3.7000000000000006e-07, + "loss": 0.4178, + "step": 74 + }, + { + "epoch": 0.004199798409676336, + "grad_norm": 1.1795387268066406, + "learning_rate": 3.75e-07, + "loss": 0.4297, + "step": 75 + }, + { + "epoch": 0.004255795721805353, + "grad_norm": 1.1643917560577393, + "learning_rate": 3.8e-07, + "loss": 0.4823, + "step": 76 + }, + { + "epoch": 0.004311793033934371, + "grad_norm": 1.4003437757492065, + "learning_rate": 3.85e-07, + "loss": 0.4341, + "step": 77 + }, + { + "epoch": 0.004367790346063389, + "grad_norm": 1.246448278427124, + "learning_rate": 3.9e-07, + "loss": 0.4935, + "step": 78 + }, + { + "epoch": 0.004423787658192407, + "grad_norm": 1.449789047241211, + "learning_rate": 3.950000000000001e-07, + "loss": 0.4214, + "step": 79 + }, + { + "epoch": 0.0044797849703214244, + "grad_norm": 1.5086668729782104, + "learning_rate": 4.0000000000000003e-07, + "loss": 0.5524, + "step": 80 + }, + { + "epoch": 0.004535782282450443, + "grad_norm": 1.5325603485107422, + "learning_rate": 4.05e-07, + "loss": 0.4823, + "step": 81 + }, + { + "epoch": 0.00459177959457946, + "grad_norm": 1.292855143547058, + "learning_rate": 4.1000000000000004e-07, + "loss": 
0.5164, + "step": 82 + }, + { + "epoch": 0.0046477769067084776, + "grad_norm": 1.3947184085845947, + "learning_rate": 4.1500000000000005e-07, + "loss": 0.5706, + "step": 83 + }, + { + "epoch": 0.004703774218837496, + "grad_norm": 2.1091814041137695, + "learning_rate": 4.2e-07, + "loss": 0.5663, + "step": 84 + }, + { + "epoch": 0.004759771530966513, + "grad_norm": 1.1866250038146973, + "learning_rate": 4.2500000000000006e-07, + "loss": 0.4289, + "step": 85 + }, + { + "epoch": 0.0048157688430955315, + "grad_norm": 1.5813519954681396, + "learning_rate": 4.3e-07, + "loss": 0.8407, + "step": 86 + }, + { + "epoch": 0.004871766155224549, + "grad_norm": 1.4272483587265015, + "learning_rate": 4.3499999999999996e-07, + "loss": 0.5547, + "step": 87 + }, + { + "epoch": 0.004927763467353567, + "grad_norm": 1.3648320436477661, + "learning_rate": 4.4e-07, + "loss": 0.6105, + "step": 88 + }, + { + "epoch": 0.004983760779482585, + "grad_norm": 1.4899494647979736, + "learning_rate": 4.4500000000000003e-07, + "loss": 0.5103, + "step": 89 + }, + { + "epoch": 0.005039758091611603, + "grad_norm": 1.5137723684310913, + "learning_rate": 4.5e-07, + "loss": 0.6131, + "step": 90 + }, + { + "epoch": 0.00509575540374062, + "grad_norm": 2.122572660446167, + "learning_rate": 4.5500000000000004e-07, + "loss": 0.4913, + "step": 91 + }, + { + "epoch": 0.005151752715869639, + "grad_norm": 1.4125502109527588, + "learning_rate": 4.6e-07, + "loss": 0.4826, + "step": 92 + }, + { + "epoch": 0.005207750027998656, + "grad_norm": 1.275382399559021, + "learning_rate": 4.65e-07, + "loss": 0.4789, + "step": 93 + }, + { + "epoch": 0.0052637473401276735, + "grad_norm": 1.5469411611557007, + "learning_rate": 4.7000000000000005e-07, + "loss": 0.5324, + "step": 94 + }, + { + "epoch": 0.005319744652256692, + "grad_norm": 1.503063440322876, + "learning_rate": 4.75e-07, + "loss": 0.5175, + "step": 95 + }, + { + "epoch": 0.005375741964385709, + "grad_norm": 1.4756922721862793, + "learning_rate": 4.8e-07, + "loss": 
0.6135, + "step": 96 + }, + { + "epoch": 0.0054317392765147274, + "grad_norm": 1.2535488605499268, + "learning_rate": 4.85e-07, + "loss": 0.4542, + "step": 97 + }, + { + "epoch": 0.005487736588643745, + "grad_norm": 1.3292878866195679, + "learning_rate": 4.9e-07, + "loss": 0.6138, + "step": 98 + }, + { + "epoch": 0.005543733900772763, + "grad_norm": 1.4198826551437378, + "learning_rate": 4.95e-07, + "loss": 0.5016, + "step": 99 + }, + { + "epoch": 0.0055997312129017806, + "grad_norm": 1.4598007202148438, + "learning_rate": 5.000000000000001e-07, + "loss": 0.4207, + "step": 100 + }, + { + "epoch": 0.005655728525030799, + "grad_norm": 1.2995636463165283, + "learning_rate": 5.05e-07, + "loss": 0.6009, + "step": 101 + }, + { + "epoch": 0.005711725837159816, + "grad_norm": 1.3568147420883179, + "learning_rate": 5.100000000000001e-07, + "loss": 0.405, + "step": 102 + }, + { + "epoch": 0.0057677231492888345, + "grad_norm": 1.1702243089675903, + "learning_rate": 5.15e-07, + "loss": 0.3981, + "step": 103 + }, + { + "epoch": 0.005823720461417852, + "grad_norm": 1.4444057941436768, + "learning_rate": 5.2e-07, + "loss": 0.4031, + "step": 104 + }, + { + "epoch": 0.005879717773546869, + "grad_norm": 1.5215481519699097, + "learning_rate": 5.250000000000001e-07, + "loss": 0.6405, + "step": 105 + }, + { + "epoch": 0.005935715085675888, + "grad_norm": 1.5095003843307495, + "learning_rate": 5.3e-07, + "loss": 0.6124, + "step": 106 + }, + { + "epoch": 0.005991712397804905, + "grad_norm": 1.3150241374969482, + "learning_rate": 5.35e-07, + "loss": 0.4757, + "step": 107 + }, + { + "epoch": 0.006047709709933923, + "grad_norm": 1.1489843130111694, + "learning_rate": 5.4e-07, + "loss": 0.4294, + "step": 108 + }, + { + "epoch": 0.006103707022062941, + "grad_norm": 1.4761924743652344, + "learning_rate": 5.450000000000001e-07, + "loss": 0.5482, + "step": 109 + }, + { + "epoch": 0.006159704334191959, + "grad_norm": 1.4262828826904297, + "learning_rate": 5.5e-07, + "loss": 0.4507, + "step": 110 
+ }, + { + "epoch": 0.0062157016463209765, + "grad_norm": 1.3081737756729126, + "learning_rate": 5.550000000000001e-07, + "loss": 0.4249, + "step": 111 + }, + { + "epoch": 0.006271698958449995, + "grad_norm": 2.10310435295105, + "learning_rate": 5.6e-07, + "loss": 0.5236, + "step": 112 + }, + { + "epoch": 0.006327696270579012, + "grad_norm": 1.2382475137710571, + "learning_rate": 5.65e-07, + "loss": 0.3968, + "step": 113 + }, + { + "epoch": 0.0063836935827080304, + "grad_norm": 2.0119569301605225, + "learning_rate": 5.7e-07, + "loss": 0.6051, + "step": 114 + }, + { + "epoch": 0.006439690894837048, + "grad_norm": 1.5267064571380615, + "learning_rate": 5.75e-07, + "loss": 0.5475, + "step": 115 + }, + { + "epoch": 0.006495688206966065, + "grad_norm": 2.7379636764526367, + "learning_rate": 5.8e-07, + "loss": 0.3968, + "step": 116 + }, + { + "epoch": 0.0065516855190950836, + "grad_norm": 1.4146085977554321, + "learning_rate": 5.85e-07, + "loss": 0.6878, + "step": 117 + }, + { + "epoch": 0.006607682831224101, + "grad_norm": 1.2528458833694458, + "learning_rate": 5.9e-07, + "loss": 0.4599, + "step": 118 + }, + { + "epoch": 0.006663680143353119, + "grad_norm": 1.3054895401000977, + "learning_rate": 5.95e-07, + "loss": 0.4442, + "step": 119 + }, + { + "epoch": 0.006719677455482137, + "grad_norm": 1.1482446193695068, + "learning_rate": 6.000000000000001e-07, + "loss": 0.4932, + "step": 120 + }, + { + "epoch": 0.006775674767611155, + "grad_norm": 1.2546861171722412, + "learning_rate": 6.05e-07, + "loss": 0.4436, + "step": 121 + }, + { + "epoch": 0.006831672079740172, + "grad_norm": 1.4483879804611206, + "learning_rate": 6.100000000000001e-07, + "loss": 0.5461, + "step": 122 + }, + { + "epoch": 0.006887669391869191, + "grad_norm": 1.6083807945251465, + "learning_rate": 6.15e-07, + "loss": 0.5924, + "step": 123 + }, + { + "epoch": 0.006943666703998208, + "grad_norm": 1.2077231407165527, + "learning_rate": 6.2e-07, + "loss": 0.4581, + "step": 124 + }, + { + "epoch": 
0.0069996640161272255, + "grad_norm": 1.2149949073791504, + "learning_rate": 6.25e-07, + "loss": 0.4393, + "step": 125 + }, + { + "epoch": 0.007055661328256244, + "grad_norm": 1.4178581237792969, + "learning_rate": 6.3e-07, + "loss": 0.422, + "step": 126 + }, + { + "epoch": 0.007111658640385261, + "grad_norm": 1.2476677894592285, + "learning_rate": 6.35e-07, + "loss": 0.496, + "step": 127 + }, + { + "epoch": 0.0071676559525142795, + "grad_norm": 1.5778127908706665, + "learning_rate": 6.4e-07, + "loss": 0.6051, + "step": 128 + }, + { + "epoch": 0.007223653264643297, + "grad_norm": 1.2792600393295288, + "learning_rate": 6.450000000000001e-07, + "loss": 0.467, + "step": 129 + }, + { + "epoch": 0.007279650576772315, + "grad_norm": 2.7617485523223877, + "learning_rate": 6.5e-07, + "loss": 0.486, + "step": 130 + }, + { + "epoch": 0.007335647888901333, + "grad_norm": 1.6663296222686768, + "learning_rate": 6.550000000000001e-07, + "loss": 0.5529, + "step": 131 + }, + { + "epoch": 0.007391645201030351, + "grad_norm": 1.3751574754714966, + "learning_rate": 6.6e-07, + "loss": 0.5267, + "step": 132 + }, + { + "epoch": 0.007447642513159368, + "grad_norm": 1.3827310800552368, + "learning_rate": 6.65e-07, + "loss": 0.4651, + "step": 133 + }, + { + "epoch": 0.0075036398252883866, + "grad_norm": 1.3962993621826172, + "learning_rate": 6.7e-07, + "loss": 0.5695, + "step": 134 + }, + { + "epoch": 0.007559637137417404, + "grad_norm": 1.3722511529922485, + "learning_rate": 6.75e-07, + "loss": 0.482, + "step": 135 + }, + { + "epoch": 0.007615634449546421, + "grad_norm": 1.6061739921569824, + "learning_rate": 6.8e-07, + "loss": 0.4326, + "step": 136 + }, + { + "epoch": 0.00767163176167544, + "grad_norm": 1.2789545059204102, + "learning_rate": 6.85e-07, + "loss": 0.3909, + "step": 137 + }, + { + "epoch": 0.007727629073804457, + "grad_norm": 1.573917269706726, + "learning_rate": 6.900000000000001e-07, + "loss": 0.5517, + "step": 138 + }, + { + "epoch": 0.007783626385933475, + "grad_norm": 
Infinity, + "learning_rate": 6.900000000000001e-07, + "loss": 0.5032, + "step": 139 + }, + { + "epoch": 0.007839623698062494, + "grad_norm": 1.4517173767089844, + "learning_rate": 6.95e-07, + "loss": 0.5369, + "step": 140 + }, + { + "epoch": 0.007895621010191511, + "grad_norm": 1.7139356136322021, + "learning_rate": 7.000000000000001e-07, + "loss": 0.5134, + "step": 141 + }, + { + "epoch": 0.007951618322320528, + "grad_norm": 2.1147677898406982, + "learning_rate": 7.05e-07, + "loss": 0.6408, + "step": 142 + }, + { + "epoch": 0.008007615634449546, + "grad_norm": 1.1497327089309692, + "learning_rate": 7.100000000000001e-07, + "loss": 0.4417, + "step": 143 + }, + { + "epoch": 0.008063612946578565, + "grad_norm": 1.518349051475525, + "learning_rate": 7.15e-07, + "loss": 0.5329, + "step": 144 + }, + { + "epoch": 0.008119610258707582, + "grad_norm": 1.3756242990493774, + "learning_rate": 7.2e-07, + "loss": 0.4445, + "step": 145 + }, + { + "epoch": 0.0081756075708366, + "grad_norm": 1.3819228410720825, + "learning_rate": 7.25e-07, + "loss": 0.4424, + "step": 146 + }, + { + "epoch": 0.008231604882965617, + "grad_norm": 1.4747849702835083, + "learning_rate": 7.3e-07, + "loss": 0.5919, + "step": 147 + }, + { + "epoch": 0.008287602195094635, + "grad_norm": 1.4428592920303345, + "learning_rate": 7.350000000000001e-07, + "loss": 0.6117, + "step": 148 + }, + { + "epoch": 0.008343599507223654, + "grad_norm": 1.2393968105316162, + "learning_rate": 7.400000000000001e-07, + "loss": 0.4504, + "step": 149 + }, + { + "epoch": 0.008399596819352671, + "grad_norm": 1.219472885131836, + "learning_rate": 7.450000000000001e-07, + "loss": 0.4756, + "step": 150 + }, + { + "epoch": 0.008455594131481689, + "grad_norm": 1.123125433921814, + "learning_rate": 7.5e-07, + "loss": 0.4298, + "step": 151 + }, + { + "epoch": 0.008511591443610706, + "grad_norm": 1.373967170715332, + "learning_rate": 7.550000000000001e-07, + "loss": 0.4997, + "step": 152 + }, + { + "epoch": 0.008567588755739725, + 
"grad_norm": 1.5358420610427856, + "learning_rate": 7.6e-07, + "loss": 0.6462, + "step": 153 + }, + { + "epoch": 0.008623586067868743, + "grad_norm": 2.188631296157837, + "learning_rate": 7.65e-07, + "loss": 0.4562, + "step": 154 + }, + { + "epoch": 0.00867958337999776, + "grad_norm": 1.5309580564498901, + "learning_rate": 7.7e-07, + "loss": 0.4204, + "step": 155 + }, + { + "epoch": 0.008735580692126777, + "grad_norm": 1.2409296035766602, + "learning_rate": 7.75e-07, + "loss": 0.4228, + "step": 156 + }, + { + "epoch": 0.008791578004255795, + "grad_norm": Infinity, + "learning_rate": 7.75e-07, + "loss": 0.5002, + "step": 157 + }, + { + "epoch": 0.008847575316384814, + "grad_norm": 1.2380938529968262, + "learning_rate": 7.8e-07, + "loss": 0.4157, + "step": 158 + }, + { + "epoch": 0.008903572628513831, + "grad_norm": 1.5206270217895508, + "learning_rate": 7.85e-07, + "loss": 0.5231, + "step": 159 + }, + { + "epoch": 0.008959569940642849, + "grad_norm": 1.4741681814193726, + "learning_rate": 7.900000000000002e-07, + "loss": 0.4099, + "step": 160 + }, + { + "epoch": 0.009015567252771866, + "grad_norm": 1.3287562131881714, + "learning_rate": 7.950000000000001e-07, + "loss": 0.3698, + "step": 161 + }, + { + "epoch": 0.009071564564900885, + "grad_norm": 1.2945431470870972, + "learning_rate": 8.000000000000001e-07, + "loss": 0.4772, + "step": 162 + }, + { + "epoch": 0.009127561877029903, + "grad_norm": 1.1704779863357544, + "learning_rate": 8.05e-07, + "loss": 0.5405, + "step": 163 + }, + { + "epoch": 0.00918355918915892, + "grad_norm": 1.4360202550888062, + "learning_rate": 8.1e-07, + "loss": 0.6144, + "step": 164 + }, + { + "epoch": 0.009239556501287938, + "grad_norm": 1.7820653915405273, + "learning_rate": 8.149999999999999e-07, + "loss": 0.6294, + "step": 165 + }, + { + "epoch": 0.009295553813416955, + "grad_norm": 1.488885760307312, + "learning_rate": 8.200000000000001e-07, + "loss": 0.5041, + "step": 166 + }, + { + "epoch": 0.009351551125545974, + "grad_norm": 
1.0918800830841064, + "learning_rate": 8.25e-07, + "loss": 0.3615, + "step": 167 + }, + { + "epoch": 0.009407548437674992, + "grad_norm": 1.3130441904067993, + "learning_rate": 8.300000000000001e-07, + "loss": 0.4489, + "step": 168 + }, + { + "epoch": 0.009463545749804009, + "grad_norm": 1.2139885425567627, + "learning_rate": 8.35e-07, + "loss": 0.4585, + "step": 169 + }, + { + "epoch": 0.009519543061933027, + "grad_norm": 1.5473610162734985, + "learning_rate": 8.4e-07, + "loss": 0.5744, + "step": 170 + }, + { + "epoch": 0.009575540374062046, + "grad_norm": 1.2141534090042114, + "learning_rate": 8.45e-07, + "loss": 0.4876, + "step": 171 + }, + { + "epoch": 0.009631537686191063, + "grad_norm": 1.2391549348831177, + "learning_rate": 8.500000000000001e-07, + "loss": 0.3436, + "step": 172 + }, + { + "epoch": 0.00968753499832008, + "grad_norm": 1.445334792137146, + "learning_rate": 8.550000000000001e-07, + "loss": 0.4885, + "step": 173 + }, + { + "epoch": 0.009743532310449098, + "grad_norm": 1.298776626586914, + "learning_rate": 8.6e-07, + "loss": 0.4514, + "step": 174 + }, + { + "epoch": 0.009799529622578117, + "grad_norm": 1.5566740036010742, + "learning_rate": 8.65e-07, + "loss": 0.4906, + "step": 175 + }, + { + "epoch": 0.009855526934707134, + "grad_norm": 1.181283712387085, + "learning_rate": 8.699999999999999e-07, + "loss": 0.5238, + "step": 176 + }, + { + "epoch": 0.009911524246836152, + "grad_norm": 1.2867265939712524, + "learning_rate": 8.750000000000001e-07, + "loss": 0.4365, + "step": 177 + }, + { + "epoch": 0.00996752155896517, + "grad_norm": 1.2805672883987427, + "learning_rate": 8.8e-07, + "loss": 0.4703, + "step": 178 + }, + { + "epoch": 0.010023518871094187, + "grad_norm": 1.4181572198867798, + "learning_rate": 8.850000000000001e-07, + "loss": 0.545, + "step": 179 + }, + { + "epoch": 0.010079516183223206, + "grad_norm": 1.273695945739746, + "learning_rate": 8.900000000000001e-07, + "loss": 0.4459, + "step": 180 + }, + { + "epoch": 0.010135513495352223, + 
"grad_norm": 1.3415385484695435, + "learning_rate": 8.95e-07, + "loss": 0.5312, + "step": 181 + }, + { + "epoch": 0.01019151080748124, + "grad_norm": 1.2442187070846558, + "learning_rate": 9e-07, + "loss": 0.4855, + "step": 182 + }, + { + "epoch": 0.010247508119610258, + "grad_norm": 1.607743740081787, + "learning_rate": 9.050000000000001e-07, + "loss": 0.636, + "step": 183 + }, + { + "epoch": 0.010303505431739277, + "grad_norm": 1.3131659030914307, + "learning_rate": 9.100000000000001e-07, + "loss": 0.4415, + "step": 184 + }, + { + "epoch": 0.010359502743868295, + "grad_norm": 1.1934609413146973, + "learning_rate": 9.15e-07, + "loss": 0.4209, + "step": 185 + }, + { + "epoch": 0.010415500055997312, + "grad_norm": 1.2915111780166626, + "learning_rate": 9.2e-07, + "loss": 0.4771, + "step": 186 + }, + { + "epoch": 0.01047149736812633, + "grad_norm": 1.5296729803085327, + "learning_rate": 9.25e-07, + "loss": 0.6426, + "step": 187 + }, + { + "epoch": 0.010527494680255347, + "grad_norm": 2.22491717338562, + "learning_rate": 9.3e-07, + "loss": 0.5904, + "step": 188 + }, + { + "epoch": 0.010583491992384366, + "grad_norm": 1.2684471607208252, + "learning_rate": 9.350000000000002e-07, + "loss": 0.5164, + "step": 189 + }, + { + "epoch": 0.010639489304513383, + "grad_norm": 1.7580739259719849, + "learning_rate": 9.400000000000001e-07, + "loss": 0.471, + "step": 190 + }, + { + "epoch": 0.010695486616642401, + "grad_norm": 1.3328614234924316, + "learning_rate": 9.450000000000001e-07, + "loss": 0.5199, + "step": 191 + }, + { + "epoch": 0.010751483928771418, + "grad_norm": 10.069008827209473, + "learning_rate": 9.5e-07, + "loss": 0.3754, + "step": 192 + }, + { + "epoch": 0.010807481240900437, + "grad_norm": 1.2109394073486328, + "learning_rate": 9.55e-07, + "loss": 0.5417, + "step": 193 + }, + { + "epoch": 0.010863478553029455, + "grad_norm": 1.279045581817627, + "learning_rate": 9.6e-07, + "loss": 0.6722, + "step": 194 + }, + { + "epoch": 0.010919475865158472, + "grad_norm": 
1.482125997543335, + "learning_rate": 9.65e-07, + "loss": 0.4102, + "step": 195 + }, + { + "epoch": 0.01097547317728749, + "grad_norm": 1.2239766120910645, + "learning_rate": 9.7e-07, + "loss": 0.4908, + "step": 196 + }, + { + "epoch": 0.011031470489416509, + "grad_norm": 1.3760238885879517, + "learning_rate": 9.75e-07, + "loss": 0.4594, + "step": 197 + }, + { + "epoch": 0.011087467801545526, + "grad_norm": 1.3297518491744995, + "learning_rate": 9.8e-07, + "loss": 0.5052, + "step": 198 + }, + { + "epoch": 0.011143465113674544, + "grad_norm": 1.2966132164001465, + "learning_rate": 9.849999999999999e-07, + "loss": 0.4952, + "step": 199 + }, + { + "epoch": 0.011199462425803561, + "grad_norm": 1.4916179180145264, + "learning_rate": 9.9e-07, + "loss": 0.4624, + "step": 200 + }, + { + "epoch": 0.011255459737932579, + "grad_norm": 1.8758313655853271, + "learning_rate": 9.95e-07, + "loss": 0.6089, + "step": 201 + }, + { + "epoch": 0.011311457050061598, + "grad_norm": 1.1678751707077026, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.4251, + "step": 202 + }, + { + "epoch": 0.011367454362190615, + "grad_norm": 1.1295413970947266, + "learning_rate": 1.0050000000000001e-06, + "loss": 0.394, + "step": 203 + }, + { + "epoch": 0.011423451674319633, + "grad_norm": 1.2641093730926514, + "learning_rate": 1.01e-06, + "loss": 0.4225, + "step": 204 + }, + { + "epoch": 0.01147944898644865, + "grad_norm": 1.326363205909729, + "learning_rate": 1.015e-06, + "loss": 0.5714, + "step": 205 + }, + { + "epoch": 0.011535446298577669, + "grad_norm": 0.9996131062507629, + "learning_rate": 1.0200000000000002e-06, + "loss": 0.3477, + "step": 206 + }, + { + "epoch": 0.011591443610706686, + "grad_norm": 1.3686496019363403, + "learning_rate": 1.0250000000000001e-06, + "loss": 0.4609, + "step": 207 + }, + { + "epoch": 0.011647440922835704, + "grad_norm": 1.1996912956237793, + "learning_rate": 1.03e-06, + "loss": 0.5012, + "step": 208 + }, + { + "epoch": 0.011703438234964721, + "grad_norm": 
1.2017878293991089, + "learning_rate": 1.035e-06, + "loss": 0.4679, + "step": 209 + }, + { + "epoch": 0.011759435547093739, + "grad_norm": 1.3492447137832642, + "learning_rate": 1.04e-06, + "loss": 0.6228, + "step": 210 + }, + { + "epoch": 0.011815432859222758, + "grad_norm": 1.0122253894805908, + "learning_rate": 1.045e-06, + "loss": 0.4258, + "step": 211 + }, + { + "epoch": 0.011871430171351775, + "grad_norm": 1.3060179948806763, + "learning_rate": 1.0500000000000001e-06, + "loss": 0.4753, + "step": 212 + }, + { + "epoch": 0.011927427483480793, + "grad_norm": 1.1057965755462646, + "learning_rate": 1.055e-06, + "loss": 0.372, + "step": 213 + }, + { + "epoch": 0.01198342479560981, + "grad_norm": 1.383988857269287, + "learning_rate": 1.06e-06, + "loss": 0.6251, + "step": 214 + }, + { + "epoch": 0.01203942210773883, + "grad_norm": 1.1000547409057617, + "learning_rate": 1.065e-06, + "loss": 0.3334, + "step": 215 + }, + { + "epoch": 0.012095419419867847, + "grad_norm": 1.307716727256775, + "learning_rate": 1.07e-06, + "loss": 0.4021, + "step": 216 + }, + { + "epoch": 0.012151416731996864, + "grad_norm": 1.3594478368759155, + "learning_rate": 1.0749999999999999e-06, + "loss": 0.4034, + "step": 217 + }, + { + "epoch": 0.012207414044125882, + "grad_norm": 1.0731086730957031, + "learning_rate": 1.08e-06, + "loss": 0.4315, + "step": 218 + }, + { + "epoch": 0.012263411356254899, + "grad_norm": 1.0504783391952515, + "learning_rate": 1.085e-06, + "loss": 0.3754, + "step": 219 + }, + { + "epoch": 0.012319408668383918, + "grad_norm": 1.1919739246368408, + "learning_rate": 1.0900000000000002e-06, + "loss": 0.4962, + "step": 220 + }, + { + "epoch": 0.012375405980512936, + "grad_norm": 1.6576035022735596, + "learning_rate": 1.095e-06, + "loss": 0.6879, + "step": 221 + }, + { + "epoch": 0.012431403292641953, + "grad_norm": 1.518970251083374, + "learning_rate": 1.1e-06, + "loss": 0.6722, + "step": 222 + }, + { + "epoch": 0.01248740060477097, + "grad_norm": 1.672376036643982, + 
"learning_rate": 1.1050000000000002e-06, + "loss": 0.5444, + "step": 223 + }, + { + "epoch": 0.01254339791689999, + "grad_norm": 1.0940489768981934, + "learning_rate": 1.1100000000000002e-06, + "loss": 0.3958, + "step": 224 + }, + { + "epoch": 0.012599395229029007, + "grad_norm": 1.4551310539245605, + "learning_rate": 1.1150000000000001e-06, + "loss": 0.5054, + "step": 225 + }, + { + "epoch": 0.012655392541158024, + "grad_norm": 1.3679637908935547, + "learning_rate": 1.12e-06, + "loss": 0.4151, + "step": 226 + }, + { + "epoch": 0.012711389853287042, + "grad_norm": 1.242140293121338, + "learning_rate": 1.125e-06, + "loss": 0.401, + "step": 227 + }, + { + "epoch": 0.012767387165416061, + "grad_norm": 1.0123270750045776, + "learning_rate": 1.13e-06, + "loss": 0.4147, + "step": 228 + }, + { + "epoch": 0.012823384477545078, + "grad_norm": 1.3932843208312988, + "learning_rate": 1.1350000000000001e-06, + "loss": 0.4361, + "step": 229 + }, + { + "epoch": 0.012879381789674096, + "grad_norm": 1.0555158853530884, + "learning_rate": 1.14e-06, + "loss": 0.4174, + "step": 230 + }, + { + "epoch": 0.012935379101803113, + "grad_norm": 1.380471110343933, + "learning_rate": 1.145e-06, + "loss": 0.4259, + "step": 231 + }, + { + "epoch": 0.01299137641393213, + "grad_norm": 1.2852363586425781, + "learning_rate": 1.15e-06, + "loss": 0.4641, + "step": 232 + }, + { + "epoch": 0.01304737372606115, + "grad_norm": 1.1543710231781006, + "learning_rate": 1.155e-06, + "loss": 0.5133, + "step": 233 + }, + { + "epoch": 0.013103371038190167, + "grad_norm": 1.174673318862915, + "learning_rate": 1.16e-06, + "loss": 0.3946, + "step": 234 + }, + { + "epoch": 0.013159368350319185, + "grad_norm": 1.1139841079711914, + "learning_rate": 1.165e-06, + "loss": 0.4283, + "step": 235 + }, + { + "epoch": 0.013215365662448202, + "grad_norm": 1.6161155700683594, + "learning_rate": 1.17e-06, + "loss": 0.5241, + "step": 236 + }, + { + "epoch": 0.013271362974577221, + "grad_norm": 1.313589334487915, + 
"learning_rate": 1.175e-06, + "loss": 0.5173, + "step": 237 + }, + { + "epoch": 0.013327360286706239, + "grad_norm": 1.4072622060775757, + "learning_rate": 1.18e-06, + "loss": 0.7632, + "step": 238 + }, + { + "epoch": 0.013383357598835256, + "grad_norm": 1.611053228378296, + "learning_rate": 1.185e-06, + "loss": 0.526, + "step": 239 + }, + { + "epoch": 0.013439354910964273, + "grad_norm": 1.2626253366470337, + "learning_rate": 1.19e-06, + "loss": 0.6914, + "step": 240 + }, + { + "epoch": 0.01349535222309329, + "grad_norm": 1.1688967943191528, + "learning_rate": 1.1950000000000002e-06, + "loss": 0.5033, + "step": 241 + }, + { + "epoch": 0.01355134953522231, + "grad_norm": 1.1805068254470825, + "learning_rate": 1.2000000000000002e-06, + "loss": 0.5048, + "step": 242 + }, + { + "epoch": 0.013607346847351327, + "grad_norm": 1.3243757486343384, + "learning_rate": 1.2050000000000001e-06, + "loss": 0.5549, + "step": 243 + }, + { + "epoch": 0.013663344159480345, + "grad_norm": 1.4326049089431763, + "learning_rate": 1.21e-06, + "loss": 0.733, + "step": 244 + }, + { + "epoch": 0.013719341471609362, + "grad_norm": 1.2837655544281006, + "learning_rate": 1.215e-06, + "loss": 0.5065, + "step": 245 + }, + { + "epoch": 0.013775338783738381, + "grad_norm": 1.687691330909729, + "learning_rate": 1.2200000000000002e-06, + "loss": 0.605, + "step": 246 + }, + { + "epoch": 0.013831336095867399, + "grad_norm": 1.4184305667877197, + "learning_rate": 1.2250000000000001e-06, + "loss": 0.4447, + "step": 247 + }, + { + "epoch": 0.013887333407996416, + "grad_norm": 1.5195060968399048, + "learning_rate": 1.23e-06, + "loss": 0.4551, + "step": 248 + }, + { + "epoch": 0.013943330720125434, + "grad_norm": 1.5335530042648315, + "learning_rate": 1.235e-06, + "loss": 0.5239, + "step": 249 + }, + { + "epoch": 0.013999328032254451, + "grad_norm": 1.2263363599777222, + "learning_rate": 1.24e-06, + "loss": 0.4089, + "step": 250 + }, + { + "epoch": 0.01405532534438347, + "grad_norm": 1.3511929512023926, + 
"learning_rate": 1.245e-06, + "loss": 0.5311, + "step": 251 + }, + { + "epoch": 0.014111322656512488, + "grad_norm": 1.4361425638198853, + "learning_rate": 1.25e-06, + "loss": 0.5113, + "step": 252 + }, + { + "epoch": 0.014167319968641505, + "grad_norm": 1.3185176849365234, + "learning_rate": 1.255e-06, + "loss": 0.5712, + "step": 253 + }, + { + "epoch": 0.014223317280770522, + "grad_norm": 1.7490395307540894, + "learning_rate": 1.26e-06, + "loss": 0.4938, + "step": 254 + }, + { + "epoch": 0.014279314592899542, + "grad_norm": 1.2518575191497803, + "learning_rate": 1.265e-06, + "loss": 0.4094, + "step": 255 + }, + { + "epoch": 0.014335311905028559, + "grad_norm": 1.9215528964996338, + "learning_rate": 1.27e-06, + "loss": 0.5999, + "step": 256 + }, + { + "epoch": 0.014391309217157576, + "grad_norm": 1.1665399074554443, + "learning_rate": 1.275e-06, + "loss": 0.4608, + "step": 257 + }, + { + "epoch": 0.014447306529286594, + "grad_norm": 1.461685299873352, + "learning_rate": 1.28e-06, + "loss": 0.593, + "step": 258 + }, + { + "epoch": 0.014503303841415613, + "grad_norm": 1.0450584888458252, + "learning_rate": 1.2850000000000002e-06, + "loss": 0.3811, + "step": 259 + }, + { + "epoch": 0.01455930115354463, + "grad_norm": 1.2816405296325684, + "learning_rate": 1.2900000000000001e-06, + "loss": 0.4941, + "step": 260 + }, + { + "epoch": 0.014615298465673648, + "grad_norm": 1.2557902336120605, + "learning_rate": 1.295e-06, + "loss": 0.459, + "step": 261 + }, + { + "epoch": 0.014671295777802665, + "grad_norm": 1.1175843477249146, + "learning_rate": 1.3e-06, + "loss": 0.4936, + "step": 262 + }, + { + "epoch": 0.014727293089931683, + "grad_norm": 1.479242205619812, + "learning_rate": 1.3050000000000002e-06, + "loss": 0.4825, + "step": 263 + }, + { + "epoch": 0.014783290402060702, + "grad_norm": 1.5817683935165405, + "learning_rate": 1.3100000000000002e-06, + "loss": 0.3805, + "step": 264 + }, + { + "epoch": 0.01483928771418972, + "grad_norm": 1.5696752071380615, + 
"learning_rate": 1.3150000000000001e-06, + "loss": 0.6506, + "step": 265 + }, + { + "epoch": 0.014895285026318737, + "grad_norm": 1.4629154205322266, + "learning_rate": 1.32e-06, + "loss": 0.4585, + "step": 266 + }, + { + "epoch": 0.014951282338447754, + "grad_norm": 1.5436128377914429, + "learning_rate": 1.325e-06, + "loss": 0.5556, + "step": 267 + }, + { + "epoch": 0.015007279650576773, + "grad_norm": 1.4215129613876343, + "learning_rate": 1.33e-06, + "loss": 0.4365, + "step": 268 + }, + { + "epoch": 0.01506327696270579, + "grad_norm": 1.3057754039764404, + "learning_rate": 1.3350000000000001e-06, + "loss": 0.4787, + "step": 269 + }, + { + "epoch": 0.015119274274834808, + "grad_norm": 1.1739662885665894, + "learning_rate": 1.34e-06, + "loss": 0.5333, + "step": 270 + }, + { + "epoch": 0.015175271586963825, + "grad_norm": 1.51952064037323, + "learning_rate": 1.345e-06, + "loss": 0.3872, + "step": 271 + }, + { + "epoch": 0.015231268899092843, + "grad_norm": 1.7925301790237427, + "learning_rate": 1.35e-06, + "loss": 0.5211, + "step": 272 + }, + { + "epoch": 0.015287266211221862, + "grad_norm": 1.2592473030090332, + "learning_rate": 1.355e-06, + "loss": 0.4538, + "step": 273 + }, + { + "epoch": 0.01534326352335088, + "grad_norm": 1.247963547706604, + "learning_rate": 1.36e-06, + "loss": 0.4779, + "step": 274 + }, + { + "epoch": 0.015399260835479897, + "grad_norm": 1.4560402631759644, + "learning_rate": 1.365e-06, + "loss": 0.4341, + "step": 275 + }, + { + "epoch": 0.015455258147608914, + "grad_norm": 1.4166802167892456, + "learning_rate": 1.37e-06, + "loss": 0.5152, + "step": 276 + }, + { + "epoch": 0.015511255459737933, + "grad_norm": 1.4385263919830322, + "learning_rate": 1.3750000000000002e-06, + "loss": 0.4547, + "step": 277 + }, + { + "epoch": 0.01556725277186695, + "grad_norm": 1.5150035619735718, + "learning_rate": 1.3800000000000001e-06, + "loss": 0.5362, + "step": 278 + }, + { + "epoch": 0.015623250083995968, + "grad_norm": 3.9275166988372803, + 
"learning_rate": 1.385e-06, + "loss": 0.5216, + "step": 279 + }, + { + "epoch": 0.015679247396124987, + "grad_norm": 1.3215997219085693, + "learning_rate": 1.39e-06, + "loss": 0.6029, + "step": 280 + }, + { + "epoch": 0.015735244708254003, + "grad_norm": 1.7647961378097534, + "learning_rate": 1.3950000000000002e-06, + "loss": 0.4077, + "step": 281 + }, + { + "epoch": 0.015791242020383022, + "grad_norm": 1.5197356939315796, + "learning_rate": 1.4000000000000001e-06, + "loss": 0.6688, + "step": 282 + }, + { + "epoch": 0.015847239332512038, + "grad_norm": 1.4369826316833496, + "learning_rate": 1.405e-06, + "loss": 0.4598, + "step": 283 + }, + { + "epoch": 0.015903236644641057, + "grad_norm": 1.3526933193206787, + "learning_rate": 1.41e-06, + "loss": 0.5358, + "step": 284 + }, + { + "epoch": 0.015959233956770076, + "grad_norm": 1.2903668880462646, + "learning_rate": 1.415e-06, + "loss": 0.3934, + "step": 285 + }, + { + "epoch": 0.016015231268899092, + "grad_norm": 1.2543387413024902, + "learning_rate": 1.4200000000000002e-06, + "loss": 0.496, + "step": 286 + }, + { + "epoch": 0.01607122858102811, + "grad_norm": 1.3359887599945068, + "learning_rate": 1.4250000000000001e-06, + "loss": 0.5165, + "step": 287 + }, + { + "epoch": 0.01612722589315713, + "grad_norm": 1.1500910520553589, + "learning_rate": 1.43e-06, + "loss": 0.3095, + "step": 288 + }, + { + "epoch": 0.016183223205286146, + "grad_norm": 1.53554368019104, + "learning_rate": 1.435e-06, + "loss": 0.6023, + "step": 289 + }, + { + "epoch": 0.016239220517415165, + "grad_norm": 1.2718192338943481, + "learning_rate": 1.44e-06, + "loss": 0.4301, + "step": 290 + }, + { + "epoch": 0.01629521782954418, + "grad_norm": 1.4737380743026733, + "learning_rate": 1.445e-06, + "loss": 0.4529, + "step": 291 + }, + { + "epoch": 0.0163512151416732, + "grad_norm": 1.2656103372573853, + "learning_rate": 1.45e-06, + "loss": 0.5206, + "step": 292 + }, + { + "epoch": 0.01640721245380222, + "grad_norm": 1.2045220136642456, + 
"learning_rate": 1.455e-06, + "loss": 0.4806, + "step": 293 + }, + { + "epoch": 0.016463209765931235, + "grad_norm": 1.7034382820129395, + "learning_rate": 1.46e-06, + "loss": 0.4972, + "step": 294 + }, + { + "epoch": 0.016519207078060254, + "grad_norm": 1.1198574304580688, + "learning_rate": 1.465e-06, + "loss": 0.5209, + "step": 295 + }, + { + "epoch": 0.01657520439018927, + "grad_norm": 1.133908748626709, + "learning_rate": 1.4700000000000001e-06, + "loss": 0.5018, + "step": 296 + }, + { + "epoch": 0.01663120170231829, + "grad_norm": 1.2515907287597656, + "learning_rate": 1.475e-06, + "loss": 0.5826, + "step": 297 + }, + { + "epoch": 0.016687199014447308, + "grad_norm": 1.297999382019043, + "learning_rate": 1.4800000000000002e-06, + "loss": 0.4961, + "step": 298 + }, + { + "epoch": 0.016743196326576323, + "grad_norm": 1.4551876783370972, + "learning_rate": 1.4850000000000002e-06, + "loss": 0.5899, + "step": 299 + }, + { + "epoch": 0.016799193638705343, + "grad_norm": 1.346625566482544, + "learning_rate": 1.4900000000000001e-06, + "loss": 0.5737, + "step": 300 + }, + { + "epoch": 0.016855190950834358, + "grad_norm": 1.3084245920181274, + "learning_rate": 1.495e-06, + "loss": 0.5083, + "step": 301 + }, + { + "epoch": 0.016911188262963377, + "grad_norm": 1.501230001449585, + "learning_rate": 1.5e-06, + "loss": 0.5419, + "step": 302 + }, + { + "epoch": 0.016967185575092397, + "grad_norm": 1.3253297805786133, + "learning_rate": 1.505e-06, + "loss": 0.5299, + "step": 303 + }, + { + "epoch": 0.017023182887221412, + "grad_norm": 1.3683420419692993, + "learning_rate": 1.5100000000000002e-06, + "loss": 0.5981, + "step": 304 + }, + { + "epoch": 0.01707918019935043, + "grad_norm": 1.3841209411621094, + "learning_rate": 1.5150000000000001e-06, + "loss": 0.4693, + "step": 305 + }, + { + "epoch": 0.01713517751147945, + "grad_norm": 1.3428679704666138, + "learning_rate": 1.52e-06, + "loss": 0.445, + "step": 306 + }, + { + "epoch": 0.017191174823608466, + "grad_norm": 
1.248568058013916, + "learning_rate": 1.525e-06, + "loss": 0.3911, + "step": 307 + }, + { + "epoch": 0.017247172135737485, + "grad_norm": 1.3845674991607666, + "learning_rate": 1.53e-06, + "loss": 0.546, + "step": 308 + }, + { + "epoch": 0.0173031694478665, + "grad_norm": 1.465867519378662, + "learning_rate": 1.5350000000000001e-06, + "loss": 0.6043, + "step": 309 + }, + { + "epoch": 0.01735916675999552, + "grad_norm": 1.1764181852340698, + "learning_rate": 1.54e-06, + "loss": 0.377, + "step": 310 + }, + { + "epoch": 0.01741516407212454, + "grad_norm": 1.1894500255584717, + "learning_rate": 1.545e-06, + "loss": 0.4326, + "step": 311 + }, + { + "epoch": 0.017471161384253555, + "grad_norm": 1.270615577697754, + "learning_rate": 1.55e-06, + "loss": 0.448, + "step": 312 + }, + { + "epoch": 0.017527158696382574, + "grad_norm": 1.9738068580627441, + "learning_rate": 1.555e-06, + "loss": 0.5594, + "step": 313 + }, + { + "epoch": 0.01758315600851159, + "grad_norm": 1.4984630346298218, + "learning_rate": 1.56e-06, + "loss": 0.5236, + "step": 314 + }, + { + "epoch": 0.01763915332064061, + "grad_norm": 1.8723793029785156, + "learning_rate": 1.565e-06, + "loss": 0.6548, + "step": 315 + }, + { + "epoch": 0.017695150632769628, + "grad_norm": 1.298327922821045, + "learning_rate": 1.57e-06, + "loss": 0.4244, + "step": 316 + }, + { + "epoch": 0.017751147944898644, + "grad_norm": 1.0812171697616577, + "learning_rate": 1.5750000000000002e-06, + "loss": 0.4079, + "step": 317 + }, + { + "epoch": 0.017807145257027663, + "grad_norm": 1.3327000141143799, + "learning_rate": 1.5800000000000003e-06, + "loss": 0.6322, + "step": 318 + }, + { + "epoch": 0.017863142569156682, + "grad_norm": 1.8076618909835815, + "learning_rate": 1.585e-06, + "loss": 0.409, + "step": 319 + }, + { + "epoch": 0.017919139881285698, + "grad_norm": 1.5142481327056885, + "learning_rate": 1.5900000000000002e-06, + "loss": 0.5574, + "step": 320 + }, + { + "epoch": 0.017975137193414717, + "grad_norm": 1.940686821937561, + 
"learning_rate": 1.595e-06, + "loss": 0.533, + "step": 321 + }, + { + "epoch": 0.018031134505543733, + "grad_norm": 1.33735990524292, + "learning_rate": 1.6000000000000001e-06, + "loss": 0.4811, + "step": 322 + }, + { + "epoch": 0.018087131817672752, + "grad_norm": 1.296083927154541, + "learning_rate": 1.6049999999999999e-06, + "loss": 0.4361, + "step": 323 + }, + { + "epoch": 0.01814312912980177, + "grad_norm": 1.1967061758041382, + "learning_rate": 1.61e-06, + "loss": 0.4532, + "step": 324 + }, + { + "epoch": 0.018199126441930787, + "grad_norm": 1.6449378728866577, + "learning_rate": 1.6150000000000002e-06, + "loss": 0.7581, + "step": 325 + }, + { + "epoch": 0.018255123754059806, + "grad_norm": 1.3829392194747925, + "learning_rate": 1.62e-06, + "loss": 0.5744, + "step": 326 + }, + { + "epoch": 0.01831112106618882, + "grad_norm": 1.4353978633880615, + "learning_rate": 1.6250000000000001e-06, + "loss": 0.5701, + "step": 327 + }, + { + "epoch": 0.01836711837831784, + "grad_norm": 1.405627965927124, + "learning_rate": 1.6299999999999999e-06, + "loss": 0.4762, + "step": 328 + }, + { + "epoch": 0.01842311569044686, + "grad_norm": 1.2175483703613281, + "learning_rate": 1.635e-06, + "loss": 0.4396, + "step": 329 + }, + { + "epoch": 0.018479113002575875, + "grad_norm": 1.2880176305770874, + "learning_rate": 1.6400000000000002e-06, + "loss": 0.3867, + "step": 330 + }, + { + "epoch": 0.018535110314704895, + "grad_norm": 1.3141285181045532, + "learning_rate": 1.645e-06, + "loss": 0.4472, + "step": 331 + }, + { + "epoch": 0.01859110762683391, + "grad_norm": 1.2125033140182495, + "learning_rate": 1.65e-06, + "loss": 0.5351, + "step": 332 + }, + { + "epoch": 0.01864710493896293, + "grad_norm": 1.4135538339614868, + "learning_rate": 1.655e-06, + "loss": 0.6807, + "step": 333 + }, + { + "epoch": 0.01870310225109195, + "grad_norm": 1.2327980995178223, + "learning_rate": 1.6600000000000002e-06, + "loss": 0.4119, + "step": 334 + }, + { + "epoch": 0.018759099563220964, + "grad_norm": 
1.4258291721343994, + "learning_rate": 1.6650000000000002e-06, + "loss": 0.6192, + "step": 335 + }, + { + "epoch": 0.018815096875349983, + "grad_norm": 1.2788861989974976, + "learning_rate": 1.67e-06, + "loss": 0.5347, + "step": 336 + }, + { + "epoch": 0.018871094187479003, + "grad_norm": 1.8292855024337769, + "learning_rate": 1.6750000000000003e-06, + "loss": 0.5791, + "step": 337 + }, + { + "epoch": 0.018927091499608018, + "grad_norm": 1.2971700429916382, + "learning_rate": 1.68e-06, + "loss": 0.7015, + "step": 338 + }, + { + "epoch": 0.018983088811737037, + "grad_norm": 1.2348295450210571, + "learning_rate": 1.6850000000000002e-06, + "loss": 0.3874, + "step": 339 + }, + { + "epoch": 0.019039086123866053, + "grad_norm": 1.1175373792648315, + "learning_rate": 1.69e-06, + "loss": 0.3748, + "step": 340 + }, + { + "epoch": 0.019095083435995072, + "grad_norm": 1.308707594871521, + "learning_rate": 1.695e-06, + "loss": 0.6144, + "step": 341 + }, + { + "epoch": 0.01915108074812409, + "grad_norm": 1.1951675415039062, + "learning_rate": 1.7000000000000002e-06, + "loss": 0.4266, + "step": 342 + }, + { + "epoch": 0.019207078060253107, + "grad_norm": 1.2016880512237549, + "learning_rate": 1.705e-06, + "loss": 0.457, + "step": 343 + }, + { + "epoch": 0.019263075372382126, + "grad_norm": 1.9292396306991577, + "learning_rate": 1.7100000000000001e-06, + "loss": 0.508, + "step": 344 + }, + { + "epoch": 0.019319072684511142, + "grad_norm": 1.3462649583816528, + "learning_rate": 1.7149999999999999e-06, + "loss": 0.5321, + "step": 345 + }, + { + "epoch": 0.01937506999664016, + "grad_norm": 2.1647658348083496, + "learning_rate": 1.72e-06, + "loss": 0.6202, + "step": 346 + }, + { + "epoch": 0.01943106730876918, + "grad_norm": 1.140655755996704, + "learning_rate": 1.7250000000000002e-06, + "loss": 0.486, + "step": 347 + }, + { + "epoch": 0.019487064620898196, + "grad_norm": 1.2482906579971313, + "learning_rate": 1.73e-06, + "loss": 0.446, + "step": 348 + }, + { + "epoch": 
0.019543061933027215, + "grad_norm": 1.4118690490722656, + "learning_rate": 1.7350000000000001e-06, + "loss": 0.7418, + "step": 349 + }, + { + "epoch": 0.019599059245156234, + "grad_norm": 1.7058286666870117, + "learning_rate": 1.7399999999999999e-06, + "loss": 0.4212, + "step": 350 + }, + { + "epoch": 0.01965505655728525, + "grad_norm": 1.3479478359222412, + "learning_rate": 1.745e-06, + "loss": 0.4631, + "step": 351 + }, + { + "epoch": 0.01971105386941427, + "grad_norm": 1.2280378341674805, + "learning_rate": 1.7500000000000002e-06, + "loss": 0.6453, + "step": 352 + }, + { + "epoch": 0.019767051181543285, + "grad_norm": 1.2790160179138184, + "learning_rate": 1.7550000000000001e-06, + "loss": 0.4572, + "step": 353 + }, + { + "epoch": 0.019823048493672304, + "grad_norm": 1.3132696151733398, + "learning_rate": 1.76e-06, + "loss": 0.612, + "step": 354 + }, + { + "epoch": 0.019879045805801323, + "grad_norm": 1.4868576526641846, + "learning_rate": 1.765e-06, + "loss": 0.5342, + "step": 355 + }, + { + "epoch": 0.01993504311793034, + "grad_norm": 1.3354017734527588, + "learning_rate": 1.7700000000000002e-06, + "loss": 0.5269, + "step": 356 + }, + { + "epoch": 0.019991040430059358, + "grad_norm": 1.2551268339157104, + "learning_rate": 1.775e-06, + "loss": 0.4749, + "step": 357 + }, + { + "epoch": 0.020047037742188373, + "grad_norm": 1.1117607355117798, + "learning_rate": 1.7800000000000001e-06, + "loss": 0.3936, + "step": 358 + }, + { + "epoch": 0.020103035054317393, + "grad_norm": 1.2144349813461304, + "learning_rate": 1.7850000000000003e-06, + "loss": 0.5228, + "step": 359 + }, + { + "epoch": 0.020159032366446412, + "grad_norm": 1.3747631311416626, + "learning_rate": 1.79e-06, + "loss": 0.3782, + "step": 360 + }, + { + "epoch": 0.020215029678575427, + "grad_norm": 1.2662698030471802, + "learning_rate": 1.7950000000000002e-06, + "loss": 0.4512, + "step": 361 + }, + { + "epoch": 0.020271026990704447, + "grad_norm": 1.4247366189956665, + "learning_rate": 1.8e-06, + "loss": 
0.5507, + "step": 362 + }, + { + "epoch": 0.020327024302833462, + "grad_norm": 1.3011784553527832, + "learning_rate": 1.805e-06, + "loss": 0.4161, + "step": 363 + }, + { + "epoch": 0.02038302161496248, + "grad_norm": 1.6431331634521484, + "learning_rate": 1.8100000000000002e-06, + "loss": 0.4019, + "step": 364 + }, + { + "epoch": 0.0204390189270915, + "grad_norm": 1.3516236543655396, + "learning_rate": 1.815e-06, + "loss": 0.5224, + "step": 365 + }, + { + "epoch": 0.020495016239220516, + "grad_norm": 1.4772320985794067, + "learning_rate": 1.8200000000000002e-06, + "loss": 0.4568, + "step": 366 + }, + { + "epoch": 0.020551013551349535, + "grad_norm": 1.2806977033615112, + "learning_rate": 1.8249999999999999e-06, + "loss": 0.3767, + "step": 367 + }, + { + "epoch": 0.020607010863478555, + "grad_norm": 1.4820971488952637, + "learning_rate": 1.83e-06, + "loss": 0.5307, + "step": 368 + }, + { + "epoch": 0.02066300817560757, + "grad_norm": 1.237572193145752, + "learning_rate": 1.8350000000000002e-06, + "loss": 0.4643, + "step": 369 + }, + { + "epoch": 0.02071900548773659, + "grad_norm": 1.4231642484664917, + "learning_rate": 1.84e-06, + "loss": 0.5078, + "step": 370 + }, + { + "epoch": 0.020775002799865605, + "grad_norm": 1.2499034404754639, + "learning_rate": 1.8450000000000001e-06, + "loss": 0.5165, + "step": 371 + }, + { + "epoch": 0.020831000111994624, + "grad_norm": 1.2820899486541748, + "learning_rate": 1.85e-06, + "loss": 0.5586, + "step": 372 + }, + { + "epoch": 0.020886997424123643, + "grad_norm": 1.3260475397109985, + "learning_rate": 1.8550000000000002e-06, + "loss": 0.3488, + "step": 373 + }, + { + "epoch": 0.02094299473625266, + "grad_norm": 2.5342912673950195, + "learning_rate": 1.86e-06, + "loss": 0.7005, + "step": 374 + }, + { + "epoch": 0.020998992048381678, + "grad_norm": 1.2824656963348389, + "learning_rate": 1.8650000000000001e-06, + "loss": 0.4665, + "step": 375 + }, + { + "epoch": 0.021054989360510694, + "grad_norm": 1.4731974601745605, + 
"learning_rate": 1.8700000000000003e-06, + "loss": 0.5446, + "step": 376 + }, + { + "epoch": 0.021110986672639713, + "grad_norm": 1.3010905981063843, + "learning_rate": 1.875e-06, + "loss": 0.5301, + "step": 377 + }, + { + "epoch": 0.021166983984768732, + "grad_norm": 1.2400481700897217, + "learning_rate": 1.8800000000000002e-06, + "loss": 0.6083, + "step": 378 + }, + { + "epoch": 0.021222981296897748, + "grad_norm": 1.2115167379379272, + "learning_rate": 1.885e-06, + "loss": 0.4465, + "step": 379 + }, + { + "epoch": 0.021278978609026767, + "grad_norm": 1.2475694417953491, + "learning_rate": 1.8900000000000001e-06, + "loss": 0.5166, + "step": 380 + }, + { + "epoch": 0.021334975921155786, + "grad_norm": 1.2450370788574219, + "learning_rate": 1.8950000000000003e-06, + "loss": 0.4768, + "step": 381 + }, + { + "epoch": 0.021390973233284802, + "grad_norm": 1.3486833572387695, + "learning_rate": 1.9e-06, + "loss": 0.63, + "step": 382 + }, + { + "epoch": 0.02144697054541382, + "grad_norm": 1.5615993738174438, + "learning_rate": 1.9050000000000002e-06, + "loss": 0.5499, + "step": 383 + }, + { + "epoch": 0.021502967857542837, + "grad_norm": 1.0564674139022827, + "learning_rate": 1.91e-06, + "loss": 0.4291, + "step": 384 + }, + { + "epoch": 0.021558965169671856, + "grad_norm": 1.1341657638549805, + "learning_rate": 1.9150000000000003e-06, + "loss": 0.4337, + "step": 385 + }, + { + "epoch": 0.021614962481800875, + "grad_norm": 1.2850117683410645, + "learning_rate": 1.92e-06, + "loss": 0.3848, + "step": 386 + }, + { + "epoch": 0.02167095979392989, + "grad_norm": 1.4669718742370605, + "learning_rate": 1.925e-06, + "loss": 0.4459, + "step": 387 + }, + { + "epoch": 0.02172695710605891, + "grad_norm": 1.3067212104797363, + "learning_rate": 1.93e-06, + "loss": 0.4179, + "step": 388 + }, + { + "epoch": 0.021782954418187925, + "grad_norm": 1.338586688041687, + "learning_rate": 1.935e-06, + "loss": 0.468, + "step": 389 + }, + { + "epoch": 0.021838951730316945, + "grad_norm": 
1.3039277791976929, + "learning_rate": 1.94e-06, + "loss": 0.4632, + "step": 390 + }, + { + "epoch": 0.021894949042445964, + "grad_norm": 1.2103965282440186, + "learning_rate": 1.945e-06, + "loss": 0.4425, + "step": 391 + }, + { + "epoch": 0.02195094635457498, + "grad_norm": 1.1756149530410767, + "learning_rate": 1.95e-06, + "loss": 0.5389, + "step": 392 + }, + { + "epoch": 0.022006943666704, + "grad_norm": 1.6295759677886963, + "learning_rate": 1.9550000000000003e-06, + "loss": 0.5089, + "step": 393 + }, + { + "epoch": 0.022062940978833018, + "grad_norm": 1.2471020221710205, + "learning_rate": 1.96e-06, + "loss": 0.4591, + "step": 394 + }, + { + "epoch": 0.022118938290962033, + "grad_norm": 1.276649832725525, + "learning_rate": 1.9650000000000002e-06, + "loss": 0.4814, + "step": 395 + }, + { + "epoch": 0.022174935603091053, + "grad_norm": 1.3960410356521606, + "learning_rate": 1.9699999999999998e-06, + "loss": 0.3976, + "step": 396 + }, + { + "epoch": 0.022230932915220068, + "grad_norm": 1.5828909873962402, + "learning_rate": 1.975e-06, + "loss": 0.4629, + "step": 397 + }, + { + "epoch": 0.022286930227349087, + "grad_norm": 1.1500638723373413, + "learning_rate": 1.98e-06, + "loss": 0.4718, + "step": 398 + }, + { + "epoch": 0.022342927539478107, + "grad_norm": 1.217605471611023, + "learning_rate": 1.985e-06, + "loss": 0.5962, + "step": 399 + }, + { + "epoch": 0.022398924851607122, + "grad_norm": 1.4575655460357666, + "learning_rate": 1.99e-06, + "loss": 0.5309, + "step": 400 + }, + { + "epoch": 0.02245492216373614, + "grad_norm": 1.4009506702423096, + "learning_rate": 1.995e-06, + "loss": 0.5567, + "step": 401 + }, + { + "epoch": 0.022510919475865157, + "grad_norm": 1.2377105951309204, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.4204, + "step": 402 + }, + { + "epoch": 0.022566916787994176, + "grad_norm": 1.214349389076233, + "learning_rate": 2.005e-06, + "loss": 0.5886, + "step": 403 + }, + { + "epoch": 0.022622914100123195, + "grad_norm": 
1.275333046913147, + "learning_rate": 2.0100000000000002e-06, + "loss": 0.4612, + "step": 404 + }, + { + "epoch": 0.02267891141225221, + "grad_norm": 1.3972492218017578, + "learning_rate": 2.015e-06, + "loss": 0.6052, + "step": 405 + }, + { + "epoch": 0.02273490872438123, + "grad_norm": 1.0658378601074219, + "learning_rate": 2.02e-06, + "loss": 0.3883, + "step": 406 + }, + { + "epoch": 0.022790906036510246, + "grad_norm": 1.2928189039230347, + "learning_rate": 2.025e-06, + "loss": 0.6167, + "step": 407 + }, + { + "epoch": 0.022846903348639265, + "grad_norm": 1.3358932733535767, + "learning_rate": 2.03e-06, + "loss": 0.3844, + "step": 408 + }, + { + "epoch": 0.022902900660768284, + "grad_norm": 1.479500412940979, + "learning_rate": 2.035e-06, + "loss": 0.4249, + "step": 409 + }, + { + "epoch": 0.0229588979728973, + "grad_norm": 1.2579445838928223, + "learning_rate": 2.0400000000000004e-06, + "loss": 0.4825, + "step": 410 + }, + { + "epoch": 0.02301489528502632, + "grad_norm": 1.2056101560592651, + "learning_rate": 2.045e-06, + "loss": 0.4273, + "step": 411 + }, + { + "epoch": 0.023070892597155338, + "grad_norm": 1.3829147815704346, + "learning_rate": 2.0500000000000003e-06, + "loss": 0.6324, + "step": 412 + }, + { + "epoch": 0.023126889909284354, + "grad_norm": 1.4959462881088257, + "learning_rate": 2.055e-06, + "loss": 0.5117, + "step": 413 + }, + { + "epoch": 0.023182887221413373, + "grad_norm": 1.593402624130249, + "learning_rate": 2.06e-06, + "loss": 0.573, + "step": 414 + }, + { + "epoch": 0.02323888453354239, + "grad_norm": 1.2119688987731934, + "learning_rate": 2.065e-06, + "loss": 0.4263, + "step": 415 + }, + { + "epoch": 0.023294881845671408, + "grad_norm": 1.2972121238708496, + "learning_rate": 2.07e-06, + "loss": 0.4812, + "step": 416 + }, + { + "epoch": 0.023350879157800427, + "grad_norm": 1.3286548852920532, + "learning_rate": 2.075e-06, + "loss": 0.4082, + "step": 417 + }, + { + "epoch": 0.023406876469929443, + "grad_norm": 1.6062296628952026, + 
"learning_rate": 2.08e-06, + "loss": 0.5704, + "step": 418 + }, + { + "epoch": 0.023462873782058462, + "grad_norm": 1.2025117874145508, + "learning_rate": 2.085e-06, + "loss": 0.4505, + "step": 419 + }, + { + "epoch": 0.023518871094187477, + "grad_norm": 1.1478592157363892, + "learning_rate": 2.09e-06, + "loss": 0.4226, + "step": 420 + }, + { + "epoch": 0.023574868406316497, + "grad_norm": 1.5359517335891724, + "learning_rate": 2.0950000000000003e-06, + "loss": 0.5153, + "step": 421 + }, + { + "epoch": 0.023630865718445516, + "grad_norm": 1.400523066520691, + "learning_rate": 2.1000000000000002e-06, + "loss": 0.5782, + "step": 422 + }, + { + "epoch": 0.02368686303057453, + "grad_norm": 1.2018365859985352, + "learning_rate": 2.105e-06, + "loss": 0.4398, + "step": 423 + }, + { + "epoch": 0.02374286034270355, + "grad_norm": 1.286759853363037, + "learning_rate": 2.11e-06, + "loss": 0.4881, + "step": 424 + }, + { + "epoch": 0.02379885765483257, + "grad_norm": 1.2115166187286377, + "learning_rate": 2.115e-06, + "loss": 0.5765, + "step": 425 + }, + { + "epoch": 0.023854854966961585, + "grad_norm": 1.3774797916412354, + "learning_rate": 2.12e-06, + "loss": 0.4356, + "step": 426 + }, + { + "epoch": 0.023910852279090605, + "grad_norm": 1.211519479751587, + "learning_rate": 2.1250000000000004e-06, + "loss": 0.5455, + "step": 427 + }, + { + "epoch": 0.02396684959121962, + "grad_norm": 1.1984440088272095, + "learning_rate": 2.13e-06, + "loss": 0.4587, + "step": 428 + }, + { + "epoch": 0.02402284690334864, + "grad_norm": 1.484640121459961, + "learning_rate": 2.1350000000000003e-06, + "loss": 0.4316, + "step": 429 + }, + { + "epoch": 0.02407884421547766, + "grad_norm": 1.4276765584945679, + "learning_rate": 2.14e-06, + "loss": 0.5961, + "step": 430 + }, + { + "epoch": 0.024134841527606674, + "grad_norm": 0.9489923715591431, + "learning_rate": 2.1450000000000002e-06, + "loss": 0.3301, + "step": 431 + }, + { + "epoch": 0.024190838839735693, + "grad_norm": 1.2276958227157593, + 
"learning_rate": 2.1499999999999997e-06, + "loss": 0.3866, + "step": 432 + }, + { + "epoch": 0.02424683615186471, + "grad_norm": 1.308908462524414, + "learning_rate": 2.155e-06, + "loss": 0.4123, + "step": 433 + }, + { + "epoch": 0.024302833463993728, + "grad_norm": 1.2939643859863281, + "learning_rate": 2.16e-06, + "loss": 0.4278, + "step": 434 + }, + { + "epoch": 0.024358830776122747, + "grad_norm": 1.3001083135604858, + "learning_rate": 2.165e-06, + "loss": 0.657, + "step": 435 + }, + { + "epoch": 0.024414828088251763, + "grad_norm": 1.6597774028778076, + "learning_rate": 2.17e-06, + "loss": 0.6464, + "step": 436 + }, + { + "epoch": 0.024470825400380782, + "grad_norm": 1.2446776628494263, + "learning_rate": 2.175e-06, + "loss": 0.4831, + "step": 437 + }, + { + "epoch": 0.024526822712509798, + "grad_norm": 0.962810218334198, + "learning_rate": 2.1800000000000003e-06, + "loss": 0.3997, + "step": 438 + }, + { + "epoch": 0.024582820024638817, + "grad_norm": 1.5243562459945679, + "learning_rate": 2.1850000000000003e-06, + "loss": 0.3864, + "step": 439 + }, + { + "epoch": 0.024638817336767836, + "grad_norm": 1.2688652276992798, + "learning_rate": 2.19e-06, + "loss": 0.4964, + "step": 440 + }, + { + "epoch": 0.024694814648896852, + "grad_norm": 1.4606717824935913, + "learning_rate": 2.195e-06, + "loss": 0.6258, + "step": 441 + }, + { + "epoch": 0.02475081196102587, + "grad_norm": 1.2517849206924438, + "learning_rate": 2.2e-06, + "loss": 0.4499, + "step": 442 + }, + { + "epoch": 0.02480680927315489, + "grad_norm": 1.210240125656128, + "learning_rate": 2.205e-06, + "loss": 0.5199, + "step": 443 + }, + { + "epoch": 0.024862806585283906, + "grad_norm": 1.3706921339035034, + "learning_rate": 2.2100000000000004e-06, + "loss": 0.4387, + "step": 444 + }, + { + "epoch": 0.024918803897412925, + "grad_norm": 1.144190788269043, + "learning_rate": 2.215e-06, + "loss": 0.3526, + "step": 445 + }, + { + "epoch": 0.02497480120954194, + "grad_norm": 1.22667396068573, + "learning_rate": 
2.2200000000000003e-06, + "loss": 0.5217, + "step": 446 + }, + { + "epoch": 0.02503079852167096, + "grad_norm": 1.5109798908233643, + "learning_rate": 2.225e-06, + "loss": 0.4878, + "step": 447 + }, + { + "epoch": 0.02508679583379998, + "grad_norm": 1.2600774765014648, + "learning_rate": 2.2300000000000002e-06, + "loss": 0.4519, + "step": 448 + }, + { + "epoch": 0.025142793145928995, + "grad_norm": 1.29214346408844, + "learning_rate": 2.2349999999999998e-06, + "loss": 0.459, + "step": 449 + }, + { + "epoch": 0.025198790458058014, + "grad_norm": 1.1631250381469727, + "learning_rate": 2.24e-06, + "loss": 0.4691, + "step": 450 + }, + { + "epoch": 0.02525478777018703, + "grad_norm": 1.379044771194458, + "learning_rate": 2.245e-06, + "loss": 0.524, + "step": 451 + }, + { + "epoch": 0.02531078508231605, + "grad_norm": 1.169918417930603, + "learning_rate": 2.25e-06, + "loss": 0.4829, + "step": 452 + }, + { + "epoch": 0.025366782394445068, + "grad_norm": 1.133783221244812, + "learning_rate": 2.255e-06, + "loss": 0.536, + "step": 453 + }, + { + "epoch": 0.025422779706574083, + "grad_norm": 1.431279182434082, + "learning_rate": 2.26e-06, + "loss": 0.5463, + "step": 454 + }, + { + "epoch": 0.025478777018703103, + "grad_norm": 1.282882809638977, + "learning_rate": 2.265e-06, + "loss": 0.4981, + "step": 455 + }, + { + "epoch": 0.025534774330832122, + "grad_norm": 1.1804708242416382, + "learning_rate": 2.2700000000000003e-06, + "loss": 0.419, + "step": 456 + }, + { + "epoch": 0.025590771642961137, + "grad_norm": 1.298541784286499, + "learning_rate": 2.2750000000000002e-06, + "loss": 0.4705, + "step": 457 + }, + { + "epoch": 0.025646768955090157, + "grad_norm": 1.1998291015625, + "learning_rate": 2.28e-06, + "loss": 0.3948, + "step": 458 + }, + { + "epoch": 0.025702766267219172, + "grad_norm": 1.3842189311981201, + "learning_rate": 2.285e-06, + "loss": 0.4904, + "step": 459 + }, + { + "epoch": 0.02575876357934819, + "grad_norm": 1.264269232749939, + "learning_rate": 2.29e-06, + 
"loss": 0.4574, + "step": 460 + }, + { + "epoch": 0.02581476089147721, + "grad_norm": 1.2546792030334473, + "learning_rate": 2.2950000000000005e-06, + "loss": 0.3574, + "step": 461 + }, + { + "epoch": 0.025870758203606226, + "grad_norm": 1.0641919374465942, + "learning_rate": 2.3e-06, + "loss": 0.5158, + "step": 462 + }, + { + "epoch": 0.025926755515735245, + "grad_norm": 1.35035240650177, + "learning_rate": 2.3050000000000004e-06, + "loss": 0.5019, + "step": 463 + }, + { + "epoch": 0.02598275282786426, + "grad_norm": 1.248283863067627, + "learning_rate": 2.31e-06, + "loss": 0.4133, + "step": 464 + }, + { + "epoch": 0.02603875013999328, + "grad_norm": 1.6036901473999023, + "learning_rate": 2.3150000000000003e-06, + "loss": 0.6371, + "step": 465 + }, + { + "epoch": 0.0260947474521223, + "grad_norm": 1.4229565858840942, + "learning_rate": 2.32e-06, + "loss": 0.4239, + "step": 466 + }, + { + "epoch": 0.026150744764251315, + "grad_norm": 1.245241403579712, + "learning_rate": 2.325e-06, + "loss": 0.4052, + "step": 467 + }, + { + "epoch": 0.026206742076380334, + "grad_norm": 1.4726648330688477, + "learning_rate": 2.33e-06, + "loss": 0.5263, + "step": 468 + }, + { + "epoch": 0.02626273938850935, + "grad_norm": 1.1756625175476074, + "learning_rate": 2.335e-06, + "loss": 0.4515, + "step": 469 + }, + { + "epoch": 0.02631873670063837, + "grad_norm": 1.3073421716690063, + "learning_rate": 2.34e-06, + "loss": 0.6833, + "step": 470 + }, + { + "epoch": 0.026374734012767388, + "grad_norm": 1.1485313177108765, + "learning_rate": 2.345e-06, + "loss": 0.3888, + "step": 471 + }, + { + "epoch": 0.026430731324896404, + "grad_norm": 1.120802402496338, + "learning_rate": 2.35e-06, + "loss": 0.3704, + "step": 472 + }, + { + "epoch": 0.026486728637025423, + "grad_norm": 1.3014438152313232, + "learning_rate": 2.3550000000000003e-06, + "loss": 0.6238, + "step": 473 + }, + { + "epoch": 0.026542725949154442, + "grad_norm": 1.2936598062515259, + "learning_rate": 2.36e-06, + "loss": 0.5, + 
"step": 474 + }, + { + "epoch": 0.026598723261283458, + "grad_norm": 1.2880456447601318, + "learning_rate": 2.3650000000000002e-06, + "loss": 0.41, + "step": 475 + }, + { + "epoch": 0.026654720573412477, + "grad_norm": 1.2736504077911377, + "learning_rate": 2.37e-06, + "loss": 0.4159, + "step": 476 + }, + { + "epoch": 0.026710717885541493, + "grad_norm": 1.3126493692398071, + "learning_rate": 2.375e-06, + "loss": 0.3981, + "step": 477 + }, + { + "epoch": 0.026766715197670512, + "grad_norm": 1.1644043922424316, + "learning_rate": 2.38e-06, + "loss": 0.4143, + "step": 478 + }, + { + "epoch": 0.02682271250979953, + "grad_norm": 1.2235571146011353, + "learning_rate": 2.385e-06, + "loss": 0.4354, + "step": 479 + }, + { + "epoch": 0.026878709821928547, + "grad_norm": 1.503503680229187, + "learning_rate": 2.3900000000000004e-06, + "loss": 0.4693, + "step": 480 + }, + { + "epoch": 0.026934707134057566, + "grad_norm": 1.1442761421203613, + "learning_rate": 2.395e-06, + "loss": 0.4132, + "step": 481 + }, + { + "epoch": 0.02699070444618658, + "grad_norm": 1.382543921470642, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.4063, + "step": 482 + }, + { + "epoch": 0.0270467017583156, + "grad_norm": 1.2548270225524902, + "learning_rate": 2.405e-06, + "loss": 0.3863, + "step": 483 + }, + { + "epoch": 0.02710269907044462, + "grad_norm": 1.2861868143081665, + "learning_rate": 2.4100000000000002e-06, + "loss": 0.4733, + "step": 484 + }, + { + "epoch": 0.027158696382573635, + "grad_norm": 1.3512135744094849, + "learning_rate": 2.415e-06, + "loss": 0.4445, + "step": 485 + }, + { + "epoch": 0.027214693694702655, + "grad_norm": 16.69654655456543, + "learning_rate": 2.42e-06, + "loss": 0.5016, + "step": 486 + }, + { + "epoch": 0.027270691006831674, + "grad_norm": 1.4881073236465454, + "learning_rate": 2.425e-06, + "loss": 0.5744, + "step": 487 + }, + { + "epoch": 0.02732668831896069, + "grad_norm": 1.1898785829544067, + "learning_rate": 2.43e-06, + "loss": 0.526, + "step": 488 + }, 
+ { + "epoch": 0.02738268563108971, + "grad_norm": 1.2967259883880615, + "learning_rate": 2.435e-06, + "loss": 0.5289, + "step": 489 + }, + { + "epoch": 0.027438682943218724, + "grad_norm": 1.49234139919281, + "learning_rate": 2.4400000000000004e-06, + "loss": 0.6374, + "step": 490 + }, + { + "epoch": 0.027494680255347743, + "grad_norm": 1.1433918476104736, + "learning_rate": 2.445e-06, + "loss": 0.3643, + "step": 491 + }, + { + "epoch": 0.027550677567476763, + "grad_norm": 1.375954508781433, + "learning_rate": 2.4500000000000003e-06, + "loss": 0.655, + "step": 492 + }, + { + "epoch": 0.02760667487960578, + "grad_norm": 1.0267375707626343, + "learning_rate": 2.4550000000000002e-06, + "loss": 0.5066, + "step": 493 + }, + { + "epoch": 0.027662672191734797, + "grad_norm": 1.6544849872589111, + "learning_rate": 2.46e-06, + "loss": 0.4229, + "step": 494 + }, + { + "epoch": 0.027718669503863813, + "grad_norm": 1.0966876745224, + "learning_rate": 2.465e-06, + "loss": 0.446, + "step": 495 + }, + { + "epoch": 0.027774666815992832, + "grad_norm": 1.3126972913742065, + "learning_rate": 2.47e-06, + "loss": 0.4629, + "step": 496 + }, + { + "epoch": 0.02783066412812185, + "grad_norm": 1.245058298110962, + "learning_rate": 2.4750000000000004e-06, + "loss": 0.618, + "step": 497 + }, + { + "epoch": 0.027886661440250867, + "grad_norm": 1.700477957725525, + "learning_rate": 2.48e-06, + "loss": 0.5081, + "step": 498 + }, + { + "epoch": 0.027942658752379886, + "grad_norm": 1.3252248764038086, + "learning_rate": 2.4850000000000003e-06, + "loss": 0.4537, + "step": 499 + }, + { + "epoch": 0.027998656064508902, + "grad_norm": 1.6630816459655762, + "learning_rate": 2.49e-06, + "loss": 0.3542, + "step": 500 + }, + { + "epoch": 0.02805465337663792, + "grad_norm": 1.546900749206543, + "learning_rate": 2.4950000000000003e-06, + "loss": 0.6161, + "step": 501 + }, + { + "epoch": 0.02811065068876694, + "grad_norm": 1.234569787979126, + "learning_rate": 2.5e-06, + "loss": 0.436, + "step": 502 + }, 
+ { + "epoch": 0.028166648000895956, + "grad_norm": 1.4984447956085205, + "learning_rate": 2.505e-06, + "loss": 0.4401, + "step": 503 + }, + { + "epoch": 0.028222645313024975, + "grad_norm": 1.3634111881256104, + "learning_rate": 2.51e-06, + "loss": 0.4091, + "step": 504 + }, + { + "epoch": 0.028278642625153994, + "grad_norm": 1.5010645389556885, + "learning_rate": 2.515e-06, + "loss": 0.4773, + "step": 505 + }, + { + "epoch": 0.02833463993728301, + "grad_norm": 1.1970478296279907, + "learning_rate": 2.52e-06, + "loss": 0.4239, + "step": 506 + }, + { + "epoch": 0.02839063724941203, + "grad_norm": 1.2129427194595337, + "learning_rate": 2.5250000000000004e-06, + "loss": 0.444, + "step": 507 + }, + { + "epoch": 0.028446634561541045, + "grad_norm": 1.586907148361206, + "learning_rate": 2.53e-06, + "loss": 0.5241, + "step": 508 + }, + { + "epoch": 0.028502631873670064, + "grad_norm": 1.338651418685913, + "learning_rate": 2.5350000000000003e-06, + "loss": 0.5268, + "step": 509 + }, + { + "epoch": 0.028558629185799083, + "grad_norm": 1.1952580213546753, + "learning_rate": 2.54e-06, + "loss": 0.4454, + "step": 510 + }, + { + "epoch": 0.0286146264979281, + "grad_norm": 1.2241628170013428, + "learning_rate": 2.545e-06, + "loss": 0.4019, + "step": 511 + }, + { + "epoch": 0.028670623810057118, + "grad_norm": 1.2646491527557373, + "learning_rate": 2.55e-06, + "loss": 0.5136, + "step": 512 + }, + { + "epoch": 0.028726621122186134, + "grad_norm": 1.3502599000930786, + "learning_rate": 2.555e-06, + "loss": 0.557, + "step": 513 + }, + { + "epoch": 0.028782618434315153, + "grad_norm": 1.107664704322815, + "learning_rate": 2.56e-06, + "loss": 0.3778, + "step": 514 + }, + { + "epoch": 0.028838615746444172, + "grad_norm": 1.457395076751709, + "learning_rate": 2.565e-06, + "loss": 0.7625, + "step": 515 + }, + { + "epoch": 0.028894613058573188, + "grad_norm": 1.2146413326263428, + "learning_rate": 2.5700000000000004e-06, + "loss": 0.4772, + "step": 516 + }, + { + "epoch": 
0.028950610370702207, + "grad_norm": 1.2828574180603027, + "learning_rate": 2.575e-06, + "loss": 0.4514, + "step": 517 + }, + { + "epoch": 0.029006607682831226, + "grad_norm": 1.3281506299972534, + "learning_rate": 2.5800000000000003e-06, + "loss": 0.5961, + "step": 518 + }, + { + "epoch": 0.02906260499496024, + "grad_norm": 1.1464653015136719, + "learning_rate": 2.5850000000000002e-06, + "loss": 0.3494, + "step": 519 + }, + { + "epoch": 0.02911860230708926, + "grad_norm": 1.368131399154663, + "learning_rate": 2.59e-06, + "loss": 0.5817, + "step": 520 + }, + { + "epoch": 0.029174599619218276, + "grad_norm": 1.3296947479248047, + "learning_rate": 2.595e-06, + "loss": 0.615, + "step": 521 + }, + { + "epoch": 0.029230596931347295, + "grad_norm": 1.1147074699401855, + "learning_rate": 2.6e-06, + "loss": 0.437, + "step": 522 + }, + { + "epoch": 0.029286594243476315, + "grad_norm": 1.301010012626648, + "learning_rate": 2.605e-06, + "loss": 0.4163, + "step": 523 + }, + { + "epoch": 0.02934259155560533, + "grad_norm": 1.564185380935669, + "learning_rate": 2.6100000000000004e-06, + "loss": 0.6379, + "step": 524 + }, + { + "epoch": 0.02939858886773435, + "grad_norm": 1.2892937660217285, + "learning_rate": 2.615e-06, + "loss": 0.5075, + "step": 525 + }, + { + "epoch": 0.029454586179863365, + "grad_norm": 1.5146722793579102, + "learning_rate": 2.6200000000000003e-06, + "loss": 0.5486, + "step": 526 + }, + { + "epoch": 0.029510583491992384, + "grad_norm": 1.3844255208969116, + "learning_rate": 2.625e-06, + "loss": 0.5322, + "step": 527 + }, + { + "epoch": 0.029566580804121403, + "grad_norm": 1.4252188205718994, + "learning_rate": 2.6300000000000002e-06, + "loss": 0.696, + "step": 528 + }, + { + "epoch": 0.02962257811625042, + "grad_norm": 1.1514140367507935, + "learning_rate": 2.6349999999999998e-06, + "loss": 0.3972, + "step": 529 + }, + { + "epoch": 0.02967857542837944, + "grad_norm": 1.384243369102478, + "learning_rate": 2.64e-06, + "loss": 0.4538, + "step": 530 + }, + { + 
"epoch": 0.029734572740508454, + "grad_norm": 1.5517767667770386, + "learning_rate": 2.645e-06, + "loss": 0.7234, + "step": 531 + }, + { + "epoch": 0.029790570052637473, + "grad_norm": 1.220450520515442, + "learning_rate": 2.65e-06, + "loss": 0.4006, + "step": 532 + }, + { + "epoch": 0.029846567364766492, + "grad_norm": 1.5837041139602661, + "learning_rate": 2.655e-06, + "loss": 0.5086, + "step": 533 + }, + { + "epoch": 0.029902564676895508, + "grad_norm": 2.34853196144104, + "learning_rate": 2.66e-06, + "loss": 0.5333, + "step": 534 + }, + { + "epoch": 0.029958561989024527, + "grad_norm": 1.433725118637085, + "learning_rate": 2.6650000000000003e-06, + "loss": 0.6396, + "step": 535 + }, + { + "epoch": 0.030014559301153546, + "grad_norm": 1.336133599281311, + "learning_rate": 2.6700000000000003e-06, + "loss": 0.5378, + "step": 536 + }, + { + "epoch": 0.030070556613282562, + "grad_norm": 1.3267064094543457, + "learning_rate": 2.6750000000000002e-06, + "loss": 0.6231, + "step": 537 + }, + { + "epoch": 0.03012655392541158, + "grad_norm": 1.4553563594818115, + "learning_rate": 2.68e-06, + "loss": 0.5325, + "step": 538 + }, + { + "epoch": 0.030182551237540597, + "grad_norm": 1.1272870302200317, + "learning_rate": 2.685e-06, + "loss": 0.4525, + "step": 539 + }, + { + "epoch": 0.030238548549669616, + "grad_norm": 1.8186594247817993, + "learning_rate": 2.69e-06, + "loss": 0.4983, + "step": 540 + }, + { + "epoch": 0.030294545861798635, + "grad_norm": 1.2522361278533936, + "learning_rate": 2.6950000000000005e-06, + "loss": 0.5709, + "step": 541 + }, + { + "epoch": 0.03035054317392765, + "grad_norm": 1.3136264085769653, + "learning_rate": 2.7e-06, + "loss": 0.4878, + "step": 542 + }, + { + "epoch": 0.03040654048605667, + "grad_norm": 1.222240686416626, + "learning_rate": 2.7050000000000004e-06, + "loss": 0.4881, + "step": 543 + }, + { + "epoch": 0.030462537798185686, + "grad_norm": 1.3070313930511475, + "learning_rate": 2.71e-06, + "loss": 0.4469, + "step": 544 + }, + { + 
"epoch": 0.030518535110314705, + "grad_norm": 1.2443084716796875, + "learning_rate": 2.7150000000000003e-06, + "loss": 0.5433, + "step": 545 + }, + { + "epoch": 0.030574532422443724, + "grad_norm": 1.4681586027145386, + "learning_rate": 2.72e-06, + "loss": 0.5622, + "step": 546 + }, + { + "epoch": 0.03063052973457274, + "grad_norm": 1.0752031803131104, + "learning_rate": 2.725e-06, + "loss": 0.3916, + "step": 547 + }, + { + "epoch": 0.03068652704670176, + "grad_norm": 1.338870644569397, + "learning_rate": 2.73e-06, + "loss": 0.4114, + "step": 548 + }, + { + "epoch": 0.030742524358830778, + "grad_norm": 1.2860264778137207, + "learning_rate": 2.735e-06, + "loss": 0.4996, + "step": 549 + }, + { + "epoch": 0.030798521670959794, + "grad_norm": 10.62662124633789, + "learning_rate": 2.74e-06, + "loss": 0.474, + "step": 550 + }, + { + "epoch": 0.030854518983088813, + "grad_norm": 1.2164727449417114, + "learning_rate": 2.745e-06, + "loss": 0.5298, + "step": 551 + }, + { + "epoch": 0.03091051629521783, + "grad_norm": 1.0920971632003784, + "learning_rate": 2.7500000000000004e-06, + "loss": 0.4692, + "step": 552 + }, + { + "epoch": 0.030966513607346847, + "grad_norm": 1.2918709516525269, + "learning_rate": 2.7550000000000003e-06, + "loss": 0.3756, + "step": 553 + }, + { + "epoch": 0.031022510919475867, + "grad_norm": 1.6300508975982666, + "learning_rate": 2.7600000000000003e-06, + "loss": 0.4695, + "step": 554 + }, + { + "epoch": 0.031078508231604882, + "grad_norm": 1.2380396127700806, + "learning_rate": 2.765e-06, + "loss": 0.5224, + "step": 555 + }, + { + "epoch": 0.0311345055437339, + "grad_norm": 1.748967170715332, + "learning_rate": 2.77e-06, + "loss": 0.6471, + "step": 556 + }, + { + "epoch": 0.031190502855862917, + "grad_norm": 1.3673230409622192, + "learning_rate": 2.775e-06, + "loss": 0.576, + "step": 557 + }, + { + "epoch": 0.031246500167991936, + "grad_norm": 1.1652144193649292, + "learning_rate": 2.78e-06, + "loss": 0.4436, + "step": 558 + }, + { + "epoch": 
0.031302497480120955, + "grad_norm": 1.121286153793335, + "learning_rate": 2.785e-06, + "loss": 0.4163, + "step": 559 + }, + { + "epoch": 0.031358494792249975, + "grad_norm": 1.2735199928283691, + "learning_rate": 2.7900000000000004e-06, + "loss": 0.4716, + "step": 560 + }, + { + "epoch": 0.03141449210437899, + "grad_norm": 1.420231819152832, + "learning_rate": 2.795e-06, + "loss": 0.618, + "step": 561 + }, + { + "epoch": 0.031470489416508006, + "grad_norm": 1.2376223802566528, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.3637, + "step": 562 + }, + { + "epoch": 0.031526486728637025, + "grad_norm": 1.3908500671386719, + "learning_rate": 2.805e-06, + "loss": 0.6301, + "step": 563 + }, + { + "epoch": 0.031582484040766044, + "grad_norm": 1.2173155546188354, + "learning_rate": 2.81e-06, + "loss": 0.4616, + "step": 564 + }, + { + "epoch": 0.03163848135289506, + "grad_norm": 1.752326488494873, + "learning_rate": 2.815e-06, + "loss": 0.403, + "step": 565 + }, + { + "epoch": 0.031694478665024076, + "grad_norm": 1.6780214309692383, + "learning_rate": 2.82e-06, + "loss": 0.5124, + "step": 566 + }, + { + "epoch": 0.031750475977153095, + "grad_norm": 1.2604048252105713, + "learning_rate": 2.825e-06, + "loss": 0.4851, + "step": 567 + }, + { + "epoch": 0.031806473289282114, + "grad_norm": 1.31491219997406, + "learning_rate": 2.83e-06, + "loss": 0.4711, + "step": 568 + }, + { + "epoch": 0.03186247060141113, + "grad_norm": 4.062485218048096, + "learning_rate": 2.835e-06, + "loss": 0.5685, + "step": 569 + }, + { + "epoch": 0.03191846791354015, + "grad_norm": 1.2817109823226929, + "learning_rate": 2.8400000000000003e-06, + "loss": 0.427, + "step": 570 + }, + { + "epoch": 0.031974465225669164, + "grad_norm": 1.2257922887802124, + "learning_rate": 2.8450000000000003e-06, + "loss": 0.485, + "step": 571 + }, + { + "epoch": 0.032030462537798184, + "grad_norm": 1.1841542720794678, + "learning_rate": 2.8500000000000002e-06, + "loss": 0.4409, + "step": 572 + }, + { + "epoch": 
0.0320864598499272, + "grad_norm": 1.4808175563812256, + "learning_rate": 2.855e-06, + "loss": 0.4201, + "step": 573 + }, + { + "epoch": 0.03214245716205622, + "grad_norm": 1.2412053346633911, + "learning_rate": 2.86e-06, + "loss": 0.4371, + "step": 574 + }, + { + "epoch": 0.03219845447418524, + "grad_norm": 1.184926986694336, + "learning_rate": 2.865e-06, + "loss": 0.5311, + "step": 575 + }, + { + "epoch": 0.03225445178631426, + "grad_norm": 1.2667714357376099, + "learning_rate": 2.87e-06, + "loss": 0.4015, + "step": 576 + }, + { + "epoch": 0.03231044909844327, + "grad_norm": 1.4195163249969482, + "learning_rate": 2.8750000000000004e-06, + "loss": 0.4566, + "step": 577 + }, + { + "epoch": 0.03236644641057229, + "grad_norm": 1.3507108688354492, + "learning_rate": 2.88e-06, + "loss": 0.5713, + "step": 578 + }, + { + "epoch": 0.03242244372270131, + "grad_norm": 1.4129518270492554, + "learning_rate": 2.8850000000000003e-06, + "loss": 0.5334, + "step": 579 + }, + { + "epoch": 0.03247844103483033, + "grad_norm": 1.3056820631027222, + "learning_rate": 2.89e-06, + "loss": 0.4025, + "step": 580 + }, + { + "epoch": 0.03253443834695935, + "grad_norm": 1.4413847923278809, + "learning_rate": 2.8950000000000002e-06, + "loss": 0.5258, + "step": 581 + }, + { + "epoch": 0.03259043565908836, + "grad_norm": 1.270606279373169, + "learning_rate": 2.9e-06, + "loss": 0.4603, + "step": 582 + }, + { + "epoch": 0.03264643297121738, + "grad_norm": 1.3087241649627686, + "learning_rate": 2.905e-06, + "loss": 0.5114, + "step": 583 + }, + { + "epoch": 0.0327024302833464, + "grad_norm": 1.163053274154663, + "learning_rate": 2.91e-06, + "loss": 0.4819, + "step": 584 + }, + { + "epoch": 0.03275842759547542, + "grad_norm": 1.5739001035690308, + "learning_rate": 2.915e-06, + "loss": 0.6085, + "step": 585 + }, + { + "epoch": 0.03281442490760444, + "grad_norm": 1.167637825012207, + "learning_rate": 2.92e-06, + "loss": 0.4609, + "step": 586 + }, + { + "epoch": 0.03287042221973345, + "grad_norm": 
1.22612464427948, + "learning_rate": 2.9250000000000004e-06, + "loss": 0.5268, + "step": 587 + }, + { + "epoch": 0.03292641953186247, + "grad_norm": 1.5374791622161865, + "learning_rate": 2.93e-06, + "loss": 0.5035, + "step": 588 + }, + { + "epoch": 0.03298241684399149, + "grad_norm": 1.3516424894332886, + "learning_rate": 2.9350000000000003e-06, + "loss": 0.4111, + "step": 589 + }, + { + "epoch": 0.03303841415612051, + "grad_norm": 1.4715403318405151, + "learning_rate": 2.9400000000000002e-06, + "loss": 0.371, + "step": 590 + }, + { + "epoch": 0.03309441146824953, + "grad_norm": 1.4850103855133057, + "learning_rate": 2.945e-06, + "loss": 0.4527, + "step": 591 + }, + { + "epoch": 0.03315040878037854, + "grad_norm": 1.4048868417739868, + "learning_rate": 2.95e-06, + "loss": 0.458, + "step": 592 + }, + { + "epoch": 0.03320640609250756, + "grad_norm": 1.741652011871338, + "learning_rate": 2.955e-06, + "loss": 0.4839, + "step": 593 + }, + { + "epoch": 0.03326240340463658, + "grad_norm": 1.2577733993530273, + "learning_rate": 2.9600000000000005e-06, + "loss": 0.5411, + "step": 594 + }, + { + "epoch": 0.033318400716765596, + "grad_norm": 1.3856233358383179, + "learning_rate": 2.965e-06, + "loss": 0.5322, + "step": 595 + }, + { + "epoch": 0.033374398028894615, + "grad_norm": 1.447217345237732, + "learning_rate": 2.9700000000000004e-06, + "loss": 0.6493, + "step": 596 + }, + { + "epoch": 0.03343039534102363, + "grad_norm": 1.4229252338409424, + "learning_rate": 2.975e-06, + "loss": 0.3921, + "step": 597 + }, + { + "epoch": 0.03348639265315265, + "grad_norm": 11.00639820098877, + "learning_rate": 2.9800000000000003e-06, + "loss": 0.3584, + "step": 598 + }, + { + "epoch": 0.033542389965281666, + "grad_norm": 1.2174385786056519, + "learning_rate": 2.9850000000000002e-06, + "loss": 0.3824, + "step": 599 + }, + { + "epoch": 0.033598387277410685, + "grad_norm": 1.1561325788497925, + "learning_rate": 2.99e-06, + "loss": 0.4915, + "step": 600 + }, + { + "epoch": 
0.033654384589539704, + "grad_norm": 1.4047815799713135, + "learning_rate": 2.995e-06, + "loss": 0.5279, + "step": 601 + }, + { + "epoch": 0.033710381901668716, + "grad_norm": 4.640164852142334, + "learning_rate": 3e-06, + "loss": 0.4194, + "step": 602 + }, + { + "epoch": 0.033766379213797736, + "grad_norm": 1.154270052909851, + "learning_rate": 3.005e-06, + "loss": 0.4438, + "step": 603 + }, + { + "epoch": 0.033822376525926755, + "grad_norm": 1.1494734287261963, + "learning_rate": 3.01e-06, + "loss": 0.4764, + "step": 604 + }, + { + "epoch": 0.033878373838055774, + "grad_norm": 1.1763715744018555, + "learning_rate": 3.015e-06, + "loss": 0.5438, + "step": 605 + }, + { + "epoch": 0.03393437115018479, + "grad_norm": 1.0460625886917114, + "learning_rate": 3.0200000000000003e-06, + "loss": 0.4528, + "step": 606 + }, + { + "epoch": 0.03399036846231381, + "grad_norm": 5.100392818450928, + "learning_rate": 3.0250000000000003e-06, + "loss": 0.4503, + "step": 607 + }, + { + "epoch": 0.034046365774442824, + "grad_norm": 1.0152342319488525, + "learning_rate": 3.0300000000000002e-06, + "loss": 0.4112, + "step": 608 + }, + { + "epoch": 0.034102363086571844, + "grad_norm": 1.1221469640731812, + "learning_rate": 3.035e-06, + "loss": 0.4216, + "step": 609 + }, + { + "epoch": 0.03415836039870086, + "grad_norm": 2.1545469760894775, + "learning_rate": 3.04e-06, + "loss": 0.5182, + "step": 610 + }, + { + "epoch": 0.03421435771082988, + "grad_norm": 1.3261489868164062, + "learning_rate": 3.0450000000000005e-06, + "loss": 0.7225, + "step": 611 + }, + { + "epoch": 0.0342703550229589, + "grad_norm": 1.443620204925537, + "learning_rate": 3.05e-06, + "loss": 0.6039, + "step": 612 + }, + { + "epoch": 0.03432635233508791, + "grad_norm": 1.3014883995056152, + "learning_rate": 3.0550000000000004e-06, + "loss": 0.5219, + "step": 613 + }, + { + "epoch": 0.03438234964721693, + "grad_norm": 2.6879069805145264, + "learning_rate": 3.06e-06, + "loss": 0.501, + "step": 614 + }, + { + "epoch": 
0.03443834695934595, + "grad_norm": 1.3093355894088745, + "learning_rate": 3.0650000000000003e-06, + "loss": 0.4093, + "step": 615 + }, + { + "epoch": 0.03449434427147497, + "grad_norm": 1.1495851278305054, + "learning_rate": 3.0700000000000003e-06, + "loss": 0.388, + "step": 616 + }, + { + "epoch": 0.03455034158360399, + "grad_norm": 1.318150281906128, + "learning_rate": 3.075e-06, + "loss": 0.4241, + "step": 617 + }, + { + "epoch": 0.034606338895733, + "grad_norm": 1.5912282466888428, + "learning_rate": 3.08e-06, + "loss": 0.5434, + "step": 618 + }, + { + "epoch": 0.03466233620786202, + "grad_norm": 1.113492488861084, + "learning_rate": 3.085e-06, + "loss": 0.5174, + "step": 619 + }, + { + "epoch": 0.03471833351999104, + "grad_norm": 1.3931667804718018, + "learning_rate": 3.09e-06, + "loss": 0.3684, + "step": 620 + }, + { + "epoch": 0.03477433083212006, + "grad_norm": 1.3992459774017334, + "learning_rate": 3.095e-06, + "loss": 0.472, + "step": 621 + }, + { + "epoch": 0.03483032814424908, + "grad_norm": 1.2402055263519287, + "learning_rate": 3.1e-06, + "loss": 0.3491, + "step": 622 + }, + { + "epoch": 0.03488632545637809, + "grad_norm": 1.3450154066085815, + "learning_rate": 3.1050000000000003e-06, + "loss": 0.5987, + "step": 623 + }, + { + "epoch": 0.03494232276850711, + "grad_norm": 1.214292287826538, + "learning_rate": 3.11e-06, + "loss": 0.4254, + "step": 624 + }, + { + "epoch": 0.03499832008063613, + "grad_norm": 1.2098424434661865, + "learning_rate": 3.1150000000000002e-06, + "loss": 0.3846, + "step": 625 + }, + { + "epoch": 0.03505431739276515, + "grad_norm": 1.3445996046066284, + "learning_rate": 3.12e-06, + "loss": 0.5421, + "step": 626 + }, + { + "epoch": 0.03511031470489417, + "grad_norm": 1.2277252674102783, + "learning_rate": 3.125e-06, + "loss": 0.6699, + "step": 627 + }, + { + "epoch": 0.03516631201702318, + "grad_norm": 1.179675817489624, + "learning_rate": 3.13e-06, + "loss": 0.3626, + "step": 628 + }, + { + "epoch": 0.0352223093291522, + 
"grad_norm": 12.111026763916016, + "learning_rate": 3.1350000000000005e-06, + "loss": 0.3788, + "step": 629 + }, + { + "epoch": 0.03527830664128122, + "grad_norm": 1.2397090196609497, + "learning_rate": 3.14e-06, + "loss": 0.4611, + "step": 630 + }, + { + "epoch": 0.03533430395341024, + "grad_norm": 1.2845584154129028, + "learning_rate": 3.145e-06, + "loss": 0.4117, + "step": 631 + }, + { + "epoch": 0.035390301265539256, + "grad_norm": 1.0865263938903809, + "learning_rate": 3.1500000000000003e-06, + "loss": 0.3984, + "step": 632 + }, + { + "epoch": 0.03544629857766827, + "grad_norm": 1.6106657981872559, + "learning_rate": 3.1550000000000003e-06, + "loss": 0.5957, + "step": 633 + }, + { + "epoch": 0.03550229588979729, + "grad_norm": 1.0239503383636475, + "learning_rate": 3.1600000000000007e-06, + "loss": 0.4252, + "step": 634 + }, + { + "epoch": 0.03555829320192631, + "grad_norm": 1.4274102449417114, + "learning_rate": 3.1649999999999998e-06, + "loss": 0.5052, + "step": 635 + }, + { + "epoch": 0.035614290514055326, + "grad_norm": 1.3243954181671143, + "learning_rate": 3.17e-06, + "loss": 0.5587, + "step": 636 + }, + { + "epoch": 0.035670287826184345, + "grad_norm": 1.0345747470855713, + "learning_rate": 3.175e-06, + "loss": 0.3972, + "step": 637 + }, + { + "epoch": 0.035726285138313364, + "grad_norm": 1.2989063262939453, + "learning_rate": 3.1800000000000005e-06, + "loss": 0.536, + "step": 638 + }, + { + "epoch": 0.035782282450442376, + "grad_norm": 1.3336299657821655, + "learning_rate": 3.1850000000000004e-06, + "loss": 0.4359, + "step": 639 + }, + { + "epoch": 0.035838279762571396, + "grad_norm": 1.2301009893417358, + "learning_rate": 3.19e-06, + "loss": 0.4369, + "step": 640 + }, + { + "epoch": 0.035894277074700415, + "grad_norm": 1.0255048274993896, + "learning_rate": 3.195e-06, + "loss": 0.4257, + "step": 641 + }, + { + "epoch": 0.035950274386829434, + "grad_norm": 1.262315273284912, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.409, + "step": 642 + }, 
+ { + "epoch": 0.03600627169895845, + "grad_norm": 1.1630877256393433, + "learning_rate": 3.2050000000000002e-06, + "loss": 0.3283, + "step": 643 + }, + { + "epoch": 0.036062269011087465, + "grad_norm": 1.3334583044052124, + "learning_rate": 3.2099999999999998e-06, + "loss": 0.4425, + "step": 644 + }, + { + "epoch": 0.036118266323216484, + "grad_norm": 1.649351954460144, + "learning_rate": 3.215e-06, + "loss": 0.5322, + "step": 645 + }, + { + "epoch": 0.036174263635345504, + "grad_norm": 1.298487901687622, + "learning_rate": 3.22e-06, + "loss": 0.4998, + "step": 646 + }, + { + "epoch": 0.03623026094747452, + "grad_norm": 1.2624866962432861, + "learning_rate": 3.225e-06, + "loss": 0.3944, + "step": 647 + }, + { + "epoch": 0.03628625825960354, + "grad_norm": 1.4487106800079346, + "learning_rate": 3.2300000000000004e-06, + "loss": 0.4767, + "step": 648 + }, + { + "epoch": 0.036342255571732554, + "grad_norm": 1.2317790985107422, + "learning_rate": 3.235e-06, + "loss": 0.4973, + "step": 649 + }, + { + "epoch": 0.03639825288386157, + "grad_norm": 1.002759337425232, + "learning_rate": 3.24e-06, + "loss": 0.3591, + "step": 650 + }, + { + "epoch": 0.03645425019599059, + "grad_norm": 1.2259923219680786, + "learning_rate": 3.2450000000000003e-06, + "loss": 0.3969, + "step": 651 + }, + { + "epoch": 0.03651024750811961, + "grad_norm": 1.3137469291687012, + "learning_rate": 3.2500000000000002e-06, + "loss": 0.4246, + "step": 652 + }, + { + "epoch": 0.03656624482024863, + "grad_norm": 1.1844868659973145, + "learning_rate": 3.2550000000000006e-06, + "loss": 0.4375, + "step": 653 + }, + { + "epoch": 0.03662224213237764, + "grad_norm": 1.4780162572860718, + "learning_rate": 3.2599999999999997e-06, + "loss": 0.5578, + "step": 654 + }, + { + "epoch": 0.03667823944450666, + "grad_norm": 1.3112592697143555, + "learning_rate": 3.265e-06, + "loss": 0.354, + "step": 655 + }, + { + "epoch": 0.03673423675663568, + "grad_norm": 1.0690807104110718, + "learning_rate": 3.27e-06, + "loss": 
0.4815, + "step": 656 + }, + { + "epoch": 0.0367902340687647, + "grad_norm": 1.209903359413147, + "learning_rate": 3.2750000000000004e-06, + "loss": 0.3635, + "step": 657 + }, + { + "epoch": 0.03684623138089372, + "grad_norm": 1.1705541610717773, + "learning_rate": 3.2800000000000004e-06, + "loss": 0.5315, + "step": 658 + }, + { + "epoch": 0.03690222869302273, + "grad_norm": 1.2070939540863037, + "learning_rate": 3.285e-06, + "loss": 0.3196, + "step": 659 + }, + { + "epoch": 0.03695822600515175, + "grad_norm": 1.1473920345306396, + "learning_rate": 3.29e-06, + "loss": 0.4252, + "step": 660 + }, + { + "epoch": 0.03701422331728077, + "grad_norm": 1.2592459917068481, + "learning_rate": 3.2950000000000002e-06, + "loss": 0.4727, + "step": 661 + }, + { + "epoch": 0.03707022062940979, + "grad_norm": 2.0471949577331543, + "learning_rate": 3.3e-06, + "loss": 0.6152, + "step": 662 + }, + { + "epoch": 0.03712621794153881, + "grad_norm": 1.3031131029129028, + "learning_rate": 3.3050000000000005e-06, + "loss": 0.7303, + "step": 663 + }, + { + "epoch": 0.03718221525366782, + "grad_norm": 1.2334582805633545, + "learning_rate": 3.31e-06, + "loss": 0.5149, + "step": 664 + }, + { + "epoch": 0.03723821256579684, + "grad_norm": 1.376380205154419, + "learning_rate": 3.315e-06, + "loss": 0.4672, + "step": 665 + }, + { + "epoch": 0.03729420987792586, + "grad_norm": 1.150626301765442, + "learning_rate": 3.3200000000000004e-06, + "loss": 0.506, + "step": 666 + }, + { + "epoch": 0.03735020719005488, + "grad_norm": 1.3962228298187256, + "learning_rate": 3.3250000000000004e-06, + "loss": 0.6496, + "step": 667 + }, + { + "epoch": 0.0374062045021839, + "grad_norm": 1.3976243734359741, + "learning_rate": 3.3300000000000003e-06, + "loss": 0.5132, + "step": 668 + }, + { + "epoch": 0.037462201814312916, + "grad_norm": 1.2252672910690308, + "learning_rate": 3.335e-06, + "loss": 0.3773, + "step": 669 + }, + { + "epoch": 0.03751819912644193, + "grad_norm": 1.3803831338882446, + "learning_rate": 
3.34e-06, + "loss": 0.4245, + "step": 670 + }, + { + "epoch": 0.03757419643857095, + "grad_norm": 1.45225989818573, + "learning_rate": 3.345e-06, + "loss": 0.7511, + "step": 671 + }, + { + "epoch": 0.03763019375069997, + "grad_norm": 1.3100789785385132, + "learning_rate": 3.3500000000000005e-06, + "loss": 0.4476, + "step": 672 + }, + { + "epoch": 0.037686191062828986, + "grad_norm": 1.2988783121109009, + "learning_rate": 3.3550000000000005e-06, + "loss": 0.5123, + "step": 673 + }, + { + "epoch": 0.037742188374958005, + "grad_norm": 1.1171373128890991, + "learning_rate": 3.36e-06, + "loss": 0.4446, + "step": 674 + }, + { + "epoch": 0.03779818568708702, + "grad_norm": 1.5306018590927124, + "learning_rate": 3.365e-06, + "loss": 0.4832, + "step": 675 + }, + { + "epoch": 0.037854182999216036, + "grad_norm": 1.277156114578247, + "learning_rate": 3.3700000000000003e-06, + "loss": 0.4448, + "step": 676 + }, + { + "epoch": 0.037910180311345056, + "grad_norm": 1.3082318305969238, + "learning_rate": 3.3750000000000003e-06, + "loss": 0.4844, + "step": 677 + }, + { + "epoch": 0.037966177623474075, + "grad_norm": 1.2136180400848389, + "learning_rate": 3.38e-06, + "loss": 0.5297, + "step": 678 + }, + { + "epoch": 0.038022174935603094, + "grad_norm": 1.3228617906570435, + "learning_rate": 3.3849999999999998e-06, + "loss": 0.4339, + "step": 679 + }, + { + "epoch": 0.038078172247732106, + "grad_norm": 1.2094823122024536, + "learning_rate": 3.39e-06, + "loss": 0.4614, + "step": 680 + }, + { + "epoch": 0.038134169559861125, + "grad_norm": 1.2327871322631836, + "learning_rate": 3.395e-06, + "loss": 0.4556, + "step": 681 + }, + { + "epoch": 0.038190166871990144, + "grad_norm": 1.4336551427841187, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.5025, + "step": 682 + }, + { + "epoch": 0.038246164184119164, + "grad_norm": 1.1964848041534424, + "learning_rate": 3.405e-06, + "loss": 0.3432, + "step": 683 + }, + { + "epoch": 0.03830216149624818, + "grad_norm": 1.4089807271957397, + 
"learning_rate": 3.41e-06, + "loss": 0.5415, + "step": 684 + }, + { + "epoch": 0.038358158808377195, + "grad_norm": 1.3728796243667603, + "learning_rate": 3.4150000000000003e-06, + "loss": 0.4414, + "step": 685 + }, + { + "epoch": 0.038414156120506214, + "grad_norm": 1.1719013452529907, + "learning_rate": 3.4200000000000003e-06, + "loss": 0.4603, + "step": 686 + }, + { + "epoch": 0.03847015343263523, + "grad_norm": 1.244240164756775, + "learning_rate": 3.4250000000000002e-06, + "loss": 0.5535, + "step": 687 + }, + { + "epoch": 0.03852615074476425, + "grad_norm": 1.2287418842315674, + "learning_rate": 3.4299999999999998e-06, + "loss": 0.4683, + "step": 688 + }, + { + "epoch": 0.03858214805689327, + "grad_norm": 1.5462092161178589, + "learning_rate": 3.435e-06, + "loss": 0.5485, + "step": 689 + }, + { + "epoch": 0.038638145369022284, + "grad_norm": 1.2673622369766235, + "learning_rate": 3.44e-06, + "loss": 0.6002, + "step": 690 + }, + { + "epoch": 0.0386941426811513, + "grad_norm": 1.442499041557312, + "learning_rate": 3.4450000000000005e-06, + "loss": 0.6001, + "step": 691 + }, + { + "epoch": 0.03875013999328032, + "grad_norm": 1.2574551105499268, + "learning_rate": 3.4500000000000004e-06, + "loss": 0.4971, + "step": 692 + }, + { + "epoch": 0.03880613730540934, + "grad_norm": 1.633691668510437, + "learning_rate": 3.455e-06, + "loss": 0.6757, + "step": 693 + }, + { + "epoch": 0.03886213461753836, + "grad_norm": 1.447800636291504, + "learning_rate": 3.46e-06, + "loss": 0.506, + "step": 694 + }, + { + "epoch": 0.03891813192966737, + "grad_norm": 1.336168646812439, + "learning_rate": 3.4650000000000003e-06, + "loss": 0.4252, + "step": 695 + }, + { + "epoch": 0.03897412924179639, + "grad_norm": 1.0794219970703125, + "learning_rate": 3.4700000000000002e-06, + "loss": 0.4782, + "step": 696 + }, + { + "epoch": 0.03903012655392541, + "grad_norm": 1.2878397703170776, + "learning_rate": 3.4750000000000006e-06, + "loss": 0.6454, + "step": 697 + }, + { + "epoch": 
0.03908612386605443, + "grad_norm": 1.3281662464141846, + "learning_rate": 3.4799999999999997e-06, + "loss": 0.4986, + "step": 698 + }, + { + "epoch": 0.03914212117818345, + "grad_norm": 1.8202744722366333, + "learning_rate": 3.485e-06, + "loss": 0.6661, + "step": 699 + }, + { + "epoch": 0.03919811849031247, + "grad_norm": 1.1826694011688232, + "learning_rate": 3.49e-06, + "loss": 0.4021, + "step": 700 + }, + { + "epoch": 0.03925411580244148, + "grad_norm": 2.1669232845306396, + "learning_rate": 3.4950000000000004e-06, + "loss": 0.4714, + "step": 701 + }, + { + "epoch": 0.0393101131145705, + "grad_norm": 1.3901138305664062, + "learning_rate": 3.5000000000000004e-06, + "loss": 0.5407, + "step": 702 + }, + { + "epoch": 0.03936611042669952, + "grad_norm": 1.0436351299285889, + "learning_rate": 3.505e-06, + "loss": 0.3554, + "step": 703 + }, + { + "epoch": 0.03942210773882854, + "grad_norm": 1.0835562944412231, + "learning_rate": 3.5100000000000003e-06, + "loss": 0.4768, + "step": 704 + }, + { + "epoch": 0.03947810505095756, + "grad_norm": 1.1899420022964478, + "learning_rate": 3.5150000000000002e-06, + "loss": 0.539, + "step": 705 + }, + { + "epoch": 0.03953410236308657, + "grad_norm": 1.0693700313568115, + "learning_rate": 3.52e-06, + "loss": 0.4778, + "step": 706 + }, + { + "epoch": 0.03959009967521559, + "grad_norm": 1.595470666885376, + "learning_rate": 3.5249999999999997e-06, + "loss": 0.5446, + "step": 707 + }, + { + "epoch": 0.03964609698734461, + "grad_norm": 1.35414719581604, + "learning_rate": 3.53e-06, + "loss": 0.5125, + "step": 708 + }, + { + "epoch": 0.03970209429947363, + "grad_norm": 1.272335410118103, + "learning_rate": 3.535e-06, + "loss": 0.4132, + "step": 709 + }, + { + "epoch": 0.039758091611602646, + "grad_norm": 3.9252395629882812, + "learning_rate": 3.5400000000000004e-06, + "loss": 0.488, + "step": 710 + }, + { + "epoch": 0.03981408892373166, + "grad_norm": 1.3578699827194214, + "learning_rate": 3.5450000000000004e-06, + "loss": 0.6098, + 
"step": 711 + }, + { + "epoch": 0.03987008623586068, + "grad_norm": 1.3184001445770264, + "learning_rate": 3.55e-06, + "loss": 0.3705, + "step": 712 + }, + { + "epoch": 0.039926083547989696, + "grad_norm": 1.26827871799469, + "learning_rate": 3.555e-06, + "loss": 0.4091, + "step": 713 + }, + { + "epoch": 0.039982080860118716, + "grad_norm": 1.2401190996170044, + "learning_rate": 3.5600000000000002e-06, + "loss": 0.5061, + "step": 714 + }, + { + "epoch": 0.040038078172247735, + "grad_norm": 1.6993461847305298, + "learning_rate": 3.565e-06, + "loss": 0.5484, + "step": 715 + }, + { + "epoch": 0.04009407548437675, + "grad_norm": 1.1220078468322754, + "learning_rate": 3.5700000000000005e-06, + "loss": 0.5662, + "step": 716 + }, + { + "epoch": 0.040150072796505766, + "grad_norm": 1.306207299232483, + "learning_rate": 3.575e-06, + "loss": 0.4847, + "step": 717 + }, + { + "epoch": 0.040206070108634785, + "grad_norm": 1.3487942218780518, + "learning_rate": 3.58e-06, + "loss": 0.428, + "step": 718 + }, + { + "epoch": 0.040262067420763804, + "grad_norm": 1.2893807888031006, + "learning_rate": 3.585e-06, + "loss": 0.439, + "step": 719 + }, + { + "epoch": 0.040318064732892823, + "grad_norm": 1.2305127382278442, + "learning_rate": 3.5900000000000004e-06, + "loss": 0.4885, + "step": 720 + }, + { + "epoch": 0.040374062045021836, + "grad_norm": 1.2074695825576782, + "learning_rate": 3.5950000000000003e-06, + "loss": 0.4735, + "step": 721 + }, + { + "epoch": 0.040430059357150855, + "grad_norm": 1.2309809923171997, + "learning_rate": 3.6e-06, + "loss": 0.4212, + "step": 722 + }, + { + "epoch": 0.040486056669279874, + "grad_norm": 1.4647796154022217, + "learning_rate": 3.6050000000000002e-06, + "loss": 0.4403, + "step": 723 + }, + { + "epoch": 0.04054205398140889, + "grad_norm": 2.394265651702881, + "learning_rate": 3.61e-06, + "loss": 0.5968, + "step": 724 + }, + { + "epoch": 0.04059805129353791, + "grad_norm": 1.4066598415374756, + "learning_rate": 3.6150000000000005e-06, + "loss": 
0.4851, + "step": 725 + }, + { + "epoch": 0.040654048605666925, + "grad_norm": 1.1987708806991577, + "learning_rate": 3.6200000000000005e-06, + "loss": 0.4466, + "step": 726 + }, + { + "epoch": 0.040710045917795944, + "grad_norm": 1.0690430402755737, + "learning_rate": 3.625e-06, + "loss": 0.5593, + "step": 727 + }, + { + "epoch": 0.04076604322992496, + "grad_norm": 1.1197233200073242, + "learning_rate": 3.63e-06, + "loss": 0.5191, + "step": 728 + }, + { + "epoch": 0.04082204054205398, + "grad_norm": 1.168726921081543, + "learning_rate": 3.6350000000000003e-06, + "loss": 0.3816, + "step": 729 + }, + { + "epoch": 0.040878037854183, + "grad_norm": 1.2563281059265137, + "learning_rate": 3.6400000000000003e-06, + "loss": 0.4761, + "step": 730 + }, + { + "epoch": 0.04093403516631202, + "grad_norm": 1.3992208242416382, + "learning_rate": 3.6450000000000007e-06, + "loss": 0.5434, + "step": 731 + }, + { + "epoch": 0.04099003247844103, + "grad_norm": 0.9997865557670593, + "learning_rate": 3.6499999999999998e-06, + "loss": 0.4953, + "step": 732 + }, + { + "epoch": 0.04104602979057005, + "grad_norm": 1.3411892652511597, + "learning_rate": 3.655e-06, + "loss": 0.4478, + "step": 733 + }, + { + "epoch": 0.04110202710269907, + "grad_norm": 6.3067450523376465, + "learning_rate": 3.66e-06, + "loss": 0.4111, + "step": 734 + }, + { + "epoch": 0.04115802441482809, + "grad_norm": 1.3528656959533691, + "learning_rate": 3.6650000000000005e-06, + "loss": 0.522, + "step": 735 + }, + { + "epoch": 0.04121402172695711, + "grad_norm": 1.1481785774230957, + "learning_rate": 3.6700000000000004e-06, + "loss": 0.4935, + "step": 736 + }, + { + "epoch": 0.04127001903908612, + "grad_norm": 1.2303099632263184, + "learning_rate": 3.675e-06, + "loss": 0.4304, + "step": 737 + }, + { + "epoch": 0.04132601635121514, + "grad_norm": 1.5268962383270264, + "learning_rate": 3.68e-06, + "loss": 0.4484, + "step": 738 + }, + { + "epoch": 0.04138201366334416, + "grad_norm": 1.2627424001693726, + "learning_rate": 
3.6850000000000003e-06, + "loss": 0.489, + "step": 739 + }, + { + "epoch": 0.04143801097547318, + "grad_norm": 1.397262692451477, + "learning_rate": 3.6900000000000002e-06, + "loss": 0.5543, + "step": 740 + }, + { + "epoch": 0.0414940082876022, + "grad_norm": 1.2485219240188599, + "learning_rate": 3.6949999999999998e-06, + "loss": 0.4055, + "step": 741 + }, + { + "epoch": 0.04155000559973121, + "grad_norm": 1.1135332584381104, + "learning_rate": 3.7e-06, + "loss": 0.3795, + "step": 742 + }, + { + "epoch": 0.04160600291186023, + "grad_norm": 1.4126280546188354, + "learning_rate": 3.705e-06, + "loss": 0.6212, + "step": 743 + }, + { + "epoch": 0.04166200022398925, + "grad_norm": 1.4079738855361938, + "learning_rate": 3.7100000000000005e-06, + "loss": 0.7047, + "step": 744 + }, + { + "epoch": 0.04171799753611827, + "grad_norm": 1.4724724292755127, + "learning_rate": 3.7150000000000004e-06, + "loss": 0.4877, + "step": 745 + }, + { + "epoch": 0.04177399484824729, + "grad_norm": 1.661381721496582, + "learning_rate": 3.72e-06, + "loss": 0.4769, + "step": 746 + }, + { + "epoch": 0.0418299921603763, + "grad_norm": 1.7881790399551392, + "learning_rate": 3.725e-06, + "loss": 0.563, + "step": 747 + }, + { + "epoch": 0.04188598947250532, + "grad_norm": 1.3039181232452393, + "learning_rate": 3.7300000000000003e-06, + "loss": 0.5058, + "step": 748 + }, + { + "epoch": 0.04194198678463434, + "grad_norm": 1.1592111587524414, + "learning_rate": 3.7350000000000002e-06, + "loss": 0.4011, + "step": 749 + }, + { + "epoch": 0.041997984096763356, + "grad_norm": 1.2834731340408325, + "learning_rate": 3.7400000000000006e-06, + "loss": 0.4299, + "step": 750 + }, + { + "epoch": 0.042053981408892376, + "grad_norm": 1.2477951049804688, + "learning_rate": 3.7449999999999997e-06, + "loss": 0.4805, + "step": 751 + }, + { + "epoch": 0.04210997872102139, + "grad_norm": 1.5107780694961548, + "learning_rate": 3.75e-06, + "loss": 0.4794, + "step": 752 + }, + { + "epoch": 0.04216597603315041, + 
"grad_norm": 1.288833737373352, + "learning_rate": 3.755e-06, + "loss": 0.5569, + "step": 753 + }, + { + "epoch": 0.042221973345279426, + "grad_norm": 1.2187424898147583, + "learning_rate": 3.7600000000000004e-06, + "loss": 0.3778, + "step": 754 + }, + { + "epoch": 0.042277970657408445, + "grad_norm": 1.2177382707595825, + "learning_rate": 3.7650000000000004e-06, + "loss": 0.4573, + "step": 755 + }, + { + "epoch": 0.042333967969537464, + "grad_norm": 1.2576156854629517, + "learning_rate": 3.77e-06, + "loss": 0.443, + "step": 756 + }, + { + "epoch": 0.042389965281666483, + "grad_norm": 1.2027933597564697, + "learning_rate": 3.775e-06, + "loss": 0.5188, + "step": 757 + }, + { + "epoch": 0.042445962593795496, + "grad_norm": 1.3134757280349731, + "learning_rate": 3.7800000000000002e-06, + "loss": 0.5296, + "step": 758 + }, + { + "epoch": 0.042501959905924515, + "grad_norm": 1.1032532453536987, + "learning_rate": 3.785e-06, + "loss": 0.4721, + "step": 759 + }, + { + "epoch": 0.042557957218053534, + "grad_norm": 1.1537388563156128, + "learning_rate": 3.7900000000000006e-06, + "loss": 0.3652, + "step": 760 + }, + { + "epoch": 0.04261395453018255, + "grad_norm": 1.1453958749771118, + "learning_rate": 3.795e-06, + "loss": 0.3934, + "step": 761 + }, + { + "epoch": 0.04266995184231157, + "grad_norm": 1.3031656742095947, + "learning_rate": 3.8e-06, + "loss": 0.4986, + "step": 762 + }, + { + "epoch": 0.042725949154440584, + "grad_norm": 1.8433433771133423, + "learning_rate": 3.8050000000000004e-06, + "loss": 0.615, + "step": 763 + }, + { + "epoch": 0.042781946466569604, + "grad_norm": 1.5257457494735718, + "learning_rate": 3.8100000000000004e-06, + "loss": 0.5185, + "step": 764 + }, + { + "epoch": 0.04283794377869862, + "grad_norm": 1.3897265195846558, + "learning_rate": 3.815000000000001e-06, + "loss": 0.5186, + "step": 765 + }, + { + "epoch": 0.04289394109082764, + "grad_norm": 1.1871623992919922, + "learning_rate": 3.82e-06, + "loss": 0.4082, + "step": 766 + }, + { + 
"epoch": 0.04294993840295666, + "grad_norm": 2.2760584354400635, + "learning_rate": 3.825e-06, + "loss": 0.8009, + "step": 767 + }, + { + "epoch": 0.04300593571508567, + "grad_norm": 1.3228431940078735, + "learning_rate": 3.830000000000001e-06, + "loss": 0.4165, + "step": 768 + }, + { + "epoch": 0.04306193302721469, + "grad_norm": 1.8395787477493286, + "learning_rate": 3.8350000000000006e-06, + "loss": 0.6306, + "step": 769 + }, + { + "epoch": 0.04311793033934371, + "grad_norm": 1.3083232641220093, + "learning_rate": 3.84e-06, + "loss": 0.393, + "step": 770 + }, + { + "epoch": 0.04317392765147273, + "grad_norm": 1.3711607456207275, + "learning_rate": 3.845e-06, + "loss": 0.5634, + "step": 771 + }, + { + "epoch": 0.04322992496360175, + "grad_norm": 1.2909704446792603, + "learning_rate": 3.85e-06, + "loss": 0.4414, + "step": 772 + }, + { + "epoch": 0.04328592227573076, + "grad_norm": 1.2462844848632812, + "learning_rate": 3.855e-06, + "loss": 0.4788, + "step": 773 + }, + { + "epoch": 0.04334191958785978, + "grad_norm": 1.2240833044052124, + "learning_rate": 3.86e-06, + "loss": 0.5835, + "step": 774 + }, + { + "epoch": 0.0433979168999888, + "grad_norm": 1.207229733467102, + "learning_rate": 3.865e-06, + "loss": 0.4087, + "step": 775 + }, + { + "epoch": 0.04345391421211782, + "grad_norm": 1.5298811197280884, + "learning_rate": 3.87e-06, + "loss": 0.5086, + "step": 776 + }, + { + "epoch": 0.04350991152424684, + "grad_norm": 1.5431842803955078, + "learning_rate": 3.875e-06, + "loss": 0.5255, + "step": 777 + }, + { + "epoch": 0.04356590883637585, + "grad_norm": 1.2514537572860718, + "learning_rate": 3.88e-06, + "loss": 0.4644, + "step": 778 + }, + { + "epoch": 0.04362190614850487, + "grad_norm": 1.1557564735412598, + "learning_rate": 3.885e-06, + "loss": 0.4376, + "step": 779 + }, + { + "epoch": 0.04367790346063389, + "grad_norm": 1.2817264795303345, + "learning_rate": 3.89e-06, + "loss": 0.5082, + "step": 780 + }, + { + "epoch": 0.04373390077276291, + "grad_norm": 
1.2424750328063965, + "learning_rate": 3.895e-06, + "loss": 0.506, + "step": 781 + }, + { + "epoch": 0.04378989808489193, + "grad_norm": 1.014085054397583, + "learning_rate": 3.9e-06, + "loss": 0.5267, + "step": 782 + }, + { + "epoch": 0.04384589539702094, + "grad_norm": 1.3631815910339355, + "learning_rate": 3.905000000000001e-06, + "loss": 0.4526, + "step": 783 + }, + { + "epoch": 0.04390189270914996, + "grad_norm": 1.462369441986084, + "learning_rate": 3.910000000000001e-06, + "loss": 0.3593, + "step": 784 + }, + { + "epoch": 0.04395789002127898, + "grad_norm": 1.1282445192337036, + "learning_rate": 3.915e-06, + "loss": 0.4777, + "step": 785 + }, + { + "epoch": 0.044013887333408, + "grad_norm": 1.2128945589065552, + "learning_rate": 3.92e-06, + "loss": 0.3664, + "step": 786 + }, + { + "epoch": 0.044069884645537016, + "grad_norm": 1.27531099319458, + "learning_rate": 3.9250000000000005e-06, + "loss": 0.5199, + "step": 787 + }, + { + "epoch": 0.044125881957666035, + "grad_norm": 1.6103917360305786, + "learning_rate": 3.9300000000000005e-06, + "loss": 0.7764, + "step": 788 + }, + { + "epoch": 0.04418187926979505, + "grad_norm": 1.2535672187805176, + "learning_rate": 3.9350000000000004e-06, + "loss": 0.5684, + "step": 789 + }, + { + "epoch": 0.04423787658192407, + "grad_norm": 1.2681461572647095, + "learning_rate": 3.9399999999999995e-06, + "loss": 0.4236, + "step": 790 + }, + { + "epoch": 0.044293873894053086, + "grad_norm": 1.3095355033874512, + "learning_rate": 3.945e-06, + "loss": 0.5838, + "step": 791 + }, + { + "epoch": 0.044349871206182105, + "grad_norm": 1.3122279644012451, + "learning_rate": 3.95e-06, + "loss": 0.4812, + "step": 792 + }, + { + "epoch": 0.044405868518311124, + "grad_norm": 1.420548915863037, + "learning_rate": 3.955e-06, + "loss": 0.512, + "step": 793 + }, + { + "epoch": 0.044461865830440137, + "grad_norm": 1.1349029541015625, + "learning_rate": 3.96e-06, + "loss": 0.3778, + "step": 794 + }, + { + "epoch": 0.044517863142569156, + 
"grad_norm": 1.3369938135147095, + "learning_rate": 3.965e-06, + "loss": 0.4454, + "step": 795 + }, + { + "epoch": 0.044573860454698175, + "grad_norm": 1.3689488172531128, + "learning_rate": 3.97e-06, + "loss": 0.4819, + "step": 796 + }, + { + "epoch": 0.044629857766827194, + "grad_norm": 6.419103145599365, + "learning_rate": 3.975e-06, + "loss": 0.5101, + "step": 797 + }, + { + "epoch": 0.04468585507895621, + "grad_norm": 1.4806280136108398, + "learning_rate": 3.98e-06, + "loss": 0.6097, + "step": 798 + }, + { + "epoch": 0.044741852391085225, + "grad_norm": 1.527851939201355, + "learning_rate": 3.985e-06, + "loss": 0.6501, + "step": 799 + }, + { + "epoch": 0.044797849703214244, + "grad_norm": 1.3300751447677612, + "learning_rate": 3.99e-06, + "loss": 0.4639, + "step": 800 + }, + { + "epoch": 0.044853847015343264, + "grad_norm": 1.3300977945327759, + "learning_rate": 3.995e-06, + "loss": 0.4535, + "step": 801 + }, + { + "epoch": 0.04490984432747228, + "grad_norm": 1.2579883337020874, + "learning_rate": 4.000000000000001e-06, + "loss": 0.5186, + "step": 802 + }, + { + "epoch": 0.0449658416396013, + "grad_norm": 1.0359623432159424, + "learning_rate": 4.005000000000001e-06, + "loss": 0.4602, + "step": 803 + }, + { + "epoch": 0.045021838951730314, + "grad_norm": 1.2562466859817505, + "learning_rate": 4.01e-06, + "loss": 0.6528, + "step": 804 + }, + { + "epoch": 0.04507783626385933, + "grad_norm": 1.155367374420166, + "learning_rate": 4.015e-06, + "loss": 0.4138, + "step": 805 + }, + { + "epoch": 0.04513383357598835, + "grad_norm": 1.1500927209854126, + "learning_rate": 4.0200000000000005e-06, + "loss": 0.4477, + "step": 806 + }, + { + "epoch": 0.04518983088811737, + "grad_norm": 1.2277973890304565, + "learning_rate": 4.0250000000000004e-06, + "loss": 0.4754, + "step": 807 + }, + { + "epoch": 0.04524582820024639, + "grad_norm": 1.1716548204421997, + "learning_rate": 4.03e-06, + "loss": 0.3876, + "step": 808 + }, + { + "epoch": 0.0453018255123754, + "grad_norm": 
1.2428687810897827, + "learning_rate": 4.0349999999999995e-06, + "loss": 0.4257, + "step": 809 + }, + { + "epoch": 0.04535782282450442, + "grad_norm": 1.1642229557037354, + "learning_rate": 4.04e-06, + "loss": 0.4966, + "step": 810 + }, + { + "epoch": 0.04541382013663344, + "grad_norm": 1.3695844411849976, + "learning_rate": 4.045e-06, + "loss": 0.4443, + "step": 811 + }, + { + "epoch": 0.04546981744876246, + "grad_norm": 1.215888500213623, + "learning_rate": 4.05e-06, + "loss": 0.4461, + "step": 812 + }, + { + "epoch": 0.04552581476089148, + "grad_norm": 1.8594948053359985, + "learning_rate": 4.055e-06, + "loss": 0.5258, + "step": 813 + }, + { + "epoch": 0.04558181207302049, + "grad_norm": 1.3100166320800781, + "learning_rate": 4.06e-06, + "loss": 0.4898, + "step": 814 + }, + { + "epoch": 0.04563780938514951, + "grad_norm": 1.0542774200439453, + "learning_rate": 4.065e-06, + "loss": 0.3433, + "step": 815 + }, + { + "epoch": 0.04569380669727853, + "grad_norm": 1.1762754917144775, + "learning_rate": 4.07e-06, + "loss": 0.4285, + "step": 816 + }, + { + "epoch": 0.04574980400940755, + "grad_norm": 1.7509896755218506, + "learning_rate": 4.075e-06, + "loss": 0.7001, + "step": 817 + }, + { + "epoch": 0.04580580132153657, + "grad_norm": 2.022645950317383, + "learning_rate": 4.080000000000001e-06, + "loss": 0.4217, + "step": 818 + }, + { + "epoch": 0.04586179863366559, + "grad_norm": 1.2015148401260376, + "learning_rate": 4.085e-06, + "loss": 0.4548, + "step": 819 + }, + { + "epoch": 0.0459177959457946, + "grad_norm": 1.2890264987945557, + "learning_rate": 4.09e-06, + "loss": 0.4291, + "step": 820 + }, + { + "epoch": 0.04597379325792362, + "grad_norm": 1.2706619501113892, + "learning_rate": 4.095000000000001e-06, + "loss": 0.4252, + "step": 821 + }, + { + "epoch": 0.04602979057005264, + "grad_norm": 1.3321216106414795, + "learning_rate": 4.1000000000000006e-06, + "loss": 0.4343, + "step": 822 + }, + { + "epoch": 0.04608578788218166, + "grad_norm": 1.4125432968139648, + 
"learning_rate": 4.1050000000000005e-06, + "loss": 0.6235, + "step": 823 + }, + { + "epoch": 0.046141785194310676, + "grad_norm": 1.304635763168335, + "learning_rate": 4.11e-06, + "loss": 0.6007, + "step": 824 + }, + { + "epoch": 0.04619778250643969, + "grad_norm": 1.1754714250564575, + "learning_rate": 4.115e-06, + "loss": 0.4203, + "step": 825 + }, + { + "epoch": 0.04625377981856871, + "grad_norm": 1.2974853515625, + "learning_rate": 4.12e-06, + "loss": 0.6425, + "step": 826 + }, + { + "epoch": 0.04630977713069773, + "grad_norm": 1.3489078283309937, + "learning_rate": 4.125e-06, + "loss": 0.5505, + "step": 827 + }, + { + "epoch": 0.046365774442826746, + "grad_norm": 1.4762914180755615, + "learning_rate": 4.13e-06, + "loss": 0.4373, + "step": 828 + }, + { + "epoch": 0.046421771754955765, + "grad_norm": 1.038651943206787, + "learning_rate": 4.135e-06, + "loss": 0.4102, + "step": 829 + }, + { + "epoch": 0.04647776906708478, + "grad_norm": 1.2074174880981445, + "learning_rate": 4.14e-06, + "loss": 0.557, + "step": 830 + }, + { + "epoch": 0.046533766379213796, + "grad_norm": 1.2075752019882202, + "learning_rate": 4.145e-06, + "loss": 0.4114, + "step": 831 + }, + { + "epoch": 0.046589763691342816, + "grad_norm": 1.296036720275879, + "learning_rate": 4.15e-06, + "loss": 0.5151, + "step": 832 + }, + { + "epoch": 0.046645761003471835, + "grad_norm": 1.5932459831237793, + "learning_rate": 4.155e-06, + "loss": 0.6072, + "step": 833 + }, + { + "epoch": 0.046701758315600854, + "grad_norm": 1.1555140018463135, + "learning_rate": 4.16e-06, + "loss": 0.4633, + "step": 834 + }, + { + "epoch": 0.046757755627729866, + "grad_norm": 1.7015200853347778, + "learning_rate": 4.165e-06, + "loss": 0.5737, + "step": 835 + }, + { + "epoch": 0.046813752939858885, + "grad_norm": 1.2817109823226929, + "learning_rate": 4.17e-06, + "loss": 0.6031, + "step": 836 + }, + { + "epoch": 0.046869750251987904, + "grad_norm": 1.0388692617416382, + "learning_rate": 4.175000000000001e-06, + "loss": 0.3362, 
+ "step": 837 + }, + { + "epoch": 0.046925747564116924, + "grad_norm": 1.156796932220459, + "learning_rate": 4.18e-06, + "loss": 0.4485, + "step": 838 + }, + { + "epoch": 0.04698174487624594, + "grad_norm": 1.2893918752670288, + "learning_rate": 4.185e-06, + "loss": 0.5426, + "step": 839 + }, + { + "epoch": 0.047037742188374955, + "grad_norm": 1.2383168935775757, + "learning_rate": 4.1900000000000005e-06, + "loss": 0.4145, + "step": 840 + }, + { + "epoch": 0.047093739500503974, + "grad_norm": 1.4720534086227417, + "learning_rate": 4.1950000000000005e-06, + "loss": 0.5713, + "step": 841 + }, + { + "epoch": 0.04714973681263299, + "grad_norm": 5.215790748596191, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.4335, + "step": 842 + }, + { + "epoch": 0.04720573412476201, + "grad_norm": 1.6487631797790527, + "learning_rate": 4.2049999999999996e-06, + "loss": 0.466, + "step": 843 + }, + { + "epoch": 0.04726173143689103, + "grad_norm": 1.4052884578704834, + "learning_rate": 4.21e-06, + "loss": 0.5244, + "step": 844 + }, + { + "epoch": 0.047317728749020044, + "grad_norm": 1.196442723274231, + "learning_rate": 4.215e-06, + "loss": 0.5315, + "step": 845 + }, + { + "epoch": 0.04737372606114906, + "grad_norm": 1.341271996498108, + "learning_rate": 4.22e-06, + "loss": 0.5769, + "step": 846 + }, + { + "epoch": 0.04742972337327808, + "grad_norm": 1.5218042135238647, + "learning_rate": 4.225e-06, + "loss": 0.5436, + "step": 847 + }, + { + "epoch": 0.0474857206854071, + "grad_norm": 1.12349271774292, + "learning_rate": 4.23e-06, + "loss": 0.4701, + "step": 848 + }, + { + "epoch": 0.04754171799753612, + "grad_norm": 1.3710609674453735, + "learning_rate": 4.235e-06, + "loss": 0.5771, + "step": 849 + }, + { + "epoch": 0.04759771530966514, + "grad_norm": 1.0842112302780151, + "learning_rate": 4.24e-06, + "loss": 0.3687, + "step": 850 + }, + { + "epoch": 0.04765371262179415, + "grad_norm": 1.6881595849990845, + "learning_rate": 4.245e-06, + "loss": 0.5157, + "step": 851 + }, + { + 
"epoch": 0.04770970993392317, + "grad_norm": 5.3729047775268555, + "learning_rate": 4.250000000000001e-06, + "loss": 0.5579, + "step": 852 + }, + { + "epoch": 0.04776570724605219, + "grad_norm": 1.4211618900299072, + "learning_rate": 4.255e-06, + "loss": 0.288, + "step": 853 + }, + { + "epoch": 0.04782170455818121, + "grad_norm": 1.1432451009750366, + "learning_rate": 4.26e-06, + "loss": 0.6627, + "step": 854 + }, + { + "epoch": 0.04787770187031023, + "grad_norm": 1.4285887479782104, + "learning_rate": 4.265e-06, + "loss": 0.5766, + "step": 855 + }, + { + "epoch": 0.04793369918243924, + "grad_norm": 1.2693086862564087, + "learning_rate": 4.270000000000001e-06, + "loss": 0.5203, + "step": 856 + }, + { + "epoch": 0.04798969649456826, + "grad_norm": 1.0911688804626465, + "learning_rate": 4.2750000000000006e-06, + "loss": 0.373, + "step": 857 + }, + { + "epoch": 0.04804569380669728, + "grad_norm": 1.2234435081481934, + "learning_rate": 4.28e-06, + "loss": 0.4711, + "step": 858 + }, + { + "epoch": 0.0481016911188263, + "grad_norm": 1.1317834854125977, + "learning_rate": 4.2850000000000005e-06, + "loss": 0.3715, + "step": 859 + }, + { + "epoch": 0.04815768843095532, + "grad_norm": 1.4064030647277832, + "learning_rate": 4.2900000000000004e-06, + "loss": 0.4611, + "step": 860 + }, + { + "epoch": 0.04821368574308433, + "grad_norm": 1.5491019487380981, + "learning_rate": 4.295e-06, + "loss": 0.4954, + "step": 861 + }, + { + "epoch": 0.04826968305521335, + "grad_norm": 1.1596893072128296, + "learning_rate": 4.2999999999999995e-06, + "loss": 0.4147, + "step": 862 + }, + { + "epoch": 0.04832568036734237, + "grad_norm": 1.39393150806427, + "learning_rate": 4.305e-06, + "loss": 0.4504, + "step": 863 + }, + { + "epoch": 0.04838167767947139, + "grad_norm": 0.9863626956939697, + "learning_rate": 4.31e-06, + "loss": 0.4018, + "step": 864 + }, + { + "epoch": 0.048437674991600406, + "grad_norm": 1.2500860691070557, + "learning_rate": 4.315e-06, + "loss": 0.3177, + "step": 865 + }, + { 
+ "epoch": 0.04849367230372942, + "grad_norm": 1.2027941942214966, + "learning_rate": 4.32e-06, + "loss": 0.5849, + "step": 866 + }, + { + "epoch": 0.04854966961585844, + "grad_norm": 1.3106149435043335, + "learning_rate": 4.325e-06, + "loss": 0.3912, + "step": 867 + }, + { + "epoch": 0.048605666927987456, + "grad_norm": 2.14656400680542, + "learning_rate": 4.33e-06, + "loss": 0.5411, + "step": 868 + }, + { + "epoch": 0.048661664240116476, + "grad_norm": 1.165299415588379, + "learning_rate": 4.335e-06, + "loss": 0.3608, + "step": 869 + }, + { + "epoch": 0.048717661552245495, + "grad_norm": 1.5042529106140137, + "learning_rate": 4.34e-06, + "loss": 0.3919, + "step": 870 + }, + { + "epoch": 0.04877365886437451, + "grad_norm": 1.0934454202651978, + "learning_rate": 4.345000000000001e-06, + "loss": 0.3231, + "step": 871 + }, + { + "epoch": 0.048829656176503526, + "grad_norm": 1.1957106590270996, + "learning_rate": 4.35e-06, + "loss": 0.5768, + "step": 872 + }, + { + "epoch": 0.048885653488632545, + "grad_norm": 1.435402750968933, + "learning_rate": 4.355e-06, + "loss": 0.5095, + "step": 873 + }, + { + "epoch": 0.048941650800761564, + "grad_norm": 1.366929531097412, + "learning_rate": 4.360000000000001e-06, + "loss": 0.4974, + "step": 874 + }, + { + "epoch": 0.048997648112890584, + "grad_norm": 1.3479841947555542, + "learning_rate": 4.3650000000000006e-06, + "loss": 0.4395, + "step": 875 + }, + { + "epoch": 0.049053645425019596, + "grad_norm": 1.4555615186691284, + "learning_rate": 4.3700000000000005e-06, + "loss": 0.7476, + "step": 876 + }, + { + "epoch": 0.049109642737148615, + "grad_norm": 1.3223801851272583, + "learning_rate": 4.375e-06, + "loss": 0.4592, + "step": 877 + }, + { + "epoch": 0.049165640049277634, + "grad_norm": 1.015773057937622, + "learning_rate": 4.38e-06, + "loss": 0.3394, + "step": 878 + }, + { + "epoch": 0.04922163736140665, + "grad_norm": 1.224155068397522, + "learning_rate": 4.385e-06, + "loss": 0.4811, + "step": 879 + }, + { + "epoch": 
0.04927763467353567, + "grad_norm": 1.174251675605774, + "learning_rate": 4.39e-06, + "loss": 0.3923, + "step": 880 + }, + { + "epoch": 0.04933363198566469, + "grad_norm": 1.5335127115249634, + "learning_rate": 4.395e-06, + "loss": 0.531, + "step": 881 + }, + { + "epoch": 0.049389629297793704, + "grad_norm": 1.2524313926696777, + "learning_rate": 4.4e-06, + "loss": 0.5577, + "step": 882 + }, + { + "epoch": 0.04944562660992272, + "grad_norm": 1.1989326477050781, + "learning_rate": 4.405e-06, + "loss": 0.5136, + "step": 883 + }, + { + "epoch": 0.04950162392205174, + "grad_norm": 1.1970378160476685, + "learning_rate": 4.41e-06, + "loss": 0.3874, + "step": 884 + }, + { + "epoch": 0.04955762123418076, + "grad_norm": 1.2088544368743896, + "learning_rate": 4.415e-06, + "loss": 0.4222, + "step": 885 + }, + { + "epoch": 0.04961361854630978, + "grad_norm": 1.228928565979004, + "learning_rate": 4.420000000000001e-06, + "loss": 0.4155, + "step": 886 + }, + { + "epoch": 0.04966961585843879, + "grad_norm": 1.324442744255066, + "learning_rate": 4.425e-06, + "loss": 0.4051, + "step": 887 + }, + { + "epoch": 0.04972561317056781, + "grad_norm": 1.2726175785064697, + "learning_rate": 4.43e-06, + "loss": 0.3889, + "step": 888 + }, + { + "epoch": 0.04978161048269683, + "grad_norm": 1.198268175125122, + "learning_rate": 4.435e-06, + "loss": 0.4054, + "step": 889 + }, + { + "epoch": 0.04983760779482585, + "grad_norm": 1.3631871938705444, + "learning_rate": 4.440000000000001e-06, + "loss": 0.5341, + "step": 890 + }, + { + "epoch": 0.04989360510695487, + "grad_norm": 1.1524522304534912, + "learning_rate": 4.445000000000001e-06, + "loss": 0.584, + "step": 891 + }, + { + "epoch": 0.04994960241908388, + "grad_norm": 1.4735164642333984, + "learning_rate": 4.45e-06, + "loss": 0.5243, + "step": 892 + }, + { + "epoch": 0.0500055997312129, + "grad_norm": 1.1556566953659058, + "learning_rate": 4.4550000000000005e-06, + "loss": 0.3543, + "step": 893 + }, + { + "epoch": 0.05006159704334192, + 
"grad_norm": 1.2251462936401367, + "learning_rate": 4.4600000000000005e-06, + "loss": 0.4228, + "step": 894 + }, + { + "epoch": 0.05011759435547094, + "grad_norm": 1.2216525077819824, + "learning_rate": 4.4650000000000004e-06, + "loss": 0.5462, + "step": 895 + }, + { + "epoch": 0.05017359166759996, + "grad_norm": 3.547142744064331, + "learning_rate": 4.4699999999999996e-06, + "loss": 0.7819, + "step": 896 + }, + { + "epoch": 0.05022958897972897, + "grad_norm": 1.2285093069076538, + "learning_rate": 4.475e-06, + "loss": 0.6265, + "step": 897 + }, + { + "epoch": 0.05028558629185799, + "grad_norm": 1.074223518371582, + "learning_rate": 4.48e-06, + "loss": 0.3546, + "step": 898 + }, + { + "epoch": 0.05034158360398701, + "grad_norm": 1.1250219345092773, + "learning_rate": 4.485e-06, + "loss": 0.42, + "step": 899 + }, + { + "epoch": 0.05039758091611603, + "grad_norm": 1.5750139951705933, + "learning_rate": 4.49e-06, + "loss": 0.5449, + "step": 900 + }, + { + "epoch": 0.05045357822824505, + "grad_norm": 1.213463306427002, + "learning_rate": 4.495e-06, + "loss": 0.4779, + "step": 901 + }, + { + "epoch": 0.05050957554037406, + "grad_norm": 1.3777439594268799, + "learning_rate": 4.5e-06, + "loss": 0.4854, + "step": 902 + }, + { + "epoch": 0.05056557285250308, + "grad_norm": 1.180437445640564, + "learning_rate": 4.505e-06, + "loss": 0.5171, + "step": 903 + }, + { + "epoch": 0.0506215701646321, + "grad_norm": 1.2369575500488281, + "learning_rate": 4.51e-06, + "loss": 0.5179, + "step": 904 + }, + { + "epoch": 0.050677567476761116, + "grad_norm": 1.1850531101226807, + "learning_rate": 4.515000000000001e-06, + "loss": 0.4336, + "step": 905 + }, + { + "epoch": 0.050733564788890136, + "grad_norm": 1.65920889377594, + "learning_rate": 4.52e-06, + "loss": 0.4392, + "step": 906 + }, + { + "epoch": 0.05078956210101915, + "grad_norm": 1.313531517982483, + "learning_rate": 4.525e-06, + "loss": 0.4201, + "step": 907 + }, + { + "epoch": 0.05084555941314817, + "grad_norm": 
1.7814490795135498, + "learning_rate": 4.53e-06, + "loss": 0.424, + "step": 908 + }, + { + "epoch": 0.050901556725277186, + "grad_norm": 1.093064785003662, + "learning_rate": 4.535000000000001e-06, + "loss": 0.4134, + "step": 909 + }, + { + "epoch": 0.050957554037406205, + "grad_norm": 1.2600054740905762, + "learning_rate": 4.540000000000001e-06, + "loss": 0.4557, + "step": 910 + }, + { + "epoch": 0.051013551349535224, + "grad_norm": 1.131227970123291, + "learning_rate": 4.545e-06, + "loss": 0.4015, + "step": 911 + }, + { + "epoch": 0.051069548661664244, + "grad_norm": 1.0976067781448364, + "learning_rate": 4.5500000000000005e-06, + "loss": 0.4971, + "step": 912 + }, + { + "epoch": 0.051125545973793256, + "grad_norm": 1.1147174835205078, + "learning_rate": 4.5550000000000004e-06, + "loss": 0.3695, + "step": 913 + }, + { + "epoch": 0.051181543285922275, + "grad_norm": 1.4934289455413818, + "learning_rate": 4.56e-06, + "loss": 0.6644, + "step": 914 + }, + { + "epoch": 0.051237540598051294, + "grad_norm": 1.0716238021850586, + "learning_rate": 4.565e-06, + "loss": 0.3123, + "step": 915 + }, + { + "epoch": 0.05129353791018031, + "grad_norm": 1.3327045440673828, + "learning_rate": 4.57e-06, + "loss": 0.5055, + "step": 916 + }, + { + "epoch": 0.05134953522230933, + "grad_norm": 1.1724621057510376, + "learning_rate": 4.575e-06, + "loss": 0.5311, + "step": 917 + }, + { + "epoch": 0.051405532534438345, + "grad_norm": 1.1683303117752075, + "learning_rate": 4.58e-06, + "loss": 0.5039, + "step": 918 + }, + { + "epoch": 0.051461529846567364, + "grad_norm": 1.2588891983032227, + "learning_rate": 4.585e-06, + "loss": 0.5484, + "step": 919 + }, + { + "epoch": 0.05151752715869638, + "grad_norm": 1.3223018646240234, + "learning_rate": 4.590000000000001e-06, + "loss": 0.5563, + "step": 920 + }, + { + "epoch": 0.0515735244708254, + "grad_norm": 1.3173508644104004, + "learning_rate": 4.595e-06, + "loss": 0.466, + "step": 921 + }, + { + "epoch": 0.05162952178295442, + "grad_norm": 
1.736454963684082, + "learning_rate": 4.6e-06, + "loss": 0.5395, + "step": 922 + }, + { + "epoch": 0.05168551909508343, + "grad_norm": 1.2006267309188843, + "learning_rate": 4.605e-06, + "loss": 0.5065, + "step": 923 + }, + { + "epoch": 0.05174151640721245, + "grad_norm": 1.3227877616882324, + "learning_rate": 4.610000000000001e-06, + "loss": 0.4938, + "step": 924 + }, + { + "epoch": 0.05179751371934147, + "grad_norm": 1.1532106399536133, + "learning_rate": 4.615e-06, + "loss": 0.6579, + "step": 925 + }, + { + "epoch": 0.05185351103147049, + "grad_norm": 1.1137820482254028, + "learning_rate": 4.62e-06, + "loss": 0.4266, + "step": 926 + }, + { + "epoch": 0.05190950834359951, + "grad_norm": 1.6235989332199097, + "learning_rate": 4.625e-06, + "loss": 0.4395, + "step": 927 + }, + { + "epoch": 0.05196550565572852, + "grad_norm": 1.167560338973999, + "learning_rate": 4.6300000000000006e-06, + "loss": 0.3794, + "step": 928 + }, + { + "epoch": 0.05202150296785754, + "grad_norm": 1.2593872547149658, + "learning_rate": 4.6350000000000005e-06, + "loss": 0.4781, + "step": 929 + }, + { + "epoch": 0.05207750027998656, + "grad_norm": 1.4556719064712524, + "learning_rate": 4.64e-06, + "loss": 0.5134, + "step": 930 + }, + { + "epoch": 0.05213349759211558, + "grad_norm": 1.1882364749908447, + "learning_rate": 4.645e-06, + "loss": 0.4005, + "step": 931 + }, + { + "epoch": 0.0521894949042446, + "grad_norm": 1.14823579788208, + "learning_rate": 4.65e-06, + "loss": 0.4556, + "step": 932 + }, + { + "epoch": 0.05224549221637361, + "grad_norm": 1.373916506767273, + "learning_rate": 4.655e-06, + "loss": 0.4299, + "step": 933 + }, + { + "epoch": 0.05230148952850263, + "grad_norm": 1.1341737508773804, + "learning_rate": 4.66e-06, + "loss": 0.4239, + "step": 934 + }, + { + "epoch": 0.05235748684063165, + "grad_norm": 1.2210662364959717, + "learning_rate": 4.665e-06, + "loss": 0.5539, + "step": 935 + }, + { + "epoch": 0.05241348415276067, + "grad_norm": 1.1791493892669678, + "learning_rate": 
4.67e-06, + "loss": 0.4105, + "step": 936 + }, + { + "epoch": 0.05246948146488969, + "grad_norm": 1.2142149209976196, + "learning_rate": 4.675e-06, + "loss": 0.4108, + "step": 937 + }, + { + "epoch": 0.0525254787770187, + "grad_norm": 1.3496472835540771, + "learning_rate": 4.68e-06, + "loss": 0.396, + "step": 938 + }, + { + "epoch": 0.05258147608914772, + "grad_norm": 1.1289809942245483, + "learning_rate": 4.685000000000001e-06, + "loss": 0.3934, + "step": 939 + }, + { + "epoch": 0.05263747340127674, + "grad_norm": 1.210899829864502, + "learning_rate": 4.69e-06, + "loss": 0.4609, + "step": 940 + }, + { + "epoch": 0.05269347071340576, + "grad_norm": 1.090699553489685, + "learning_rate": 4.695e-06, + "loss": 0.5039, + "step": 941 + }, + { + "epoch": 0.052749468025534776, + "grad_norm": 1.3170802593231201, + "learning_rate": 4.7e-06, + "loss": 0.498, + "step": 942 + }, + { + "epoch": 0.052805465337663796, + "grad_norm": 1.2018908262252808, + "learning_rate": 4.705000000000001e-06, + "loss": 0.393, + "step": 943 + }, + { + "epoch": 0.05286146264979281, + "grad_norm": 1.3873302936553955, + "learning_rate": 4.710000000000001e-06, + "loss": 0.478, + "step": 944 + }, + { + "epoch": 0.05291745996192183, + "grad_norm": 1.3100378513336182, + "learning_rate": 4.715e-06, + "loss": 0.4197, + "step": 945 + }, + { + "epoch": 0.052973457274050846, + "grad_norm": 1.6671725511550903, + "learning_rate": 4.72e-06, + "loss": 0.6221, + "step": 946 + }, + { + "epoch": 0.053029454586179865, + "grad_norm": 1.6387559175491333, + "learning_rate": 4.7250000000000005e-06, + "loss": 0.5495, + "step": 947 + }, + { + "epoch": 0.053085451898308884, + "grad_norm": 1.2692862749099731, + "learning_rate": 4.7300000000000005e-06, + "loss": 0.4587, + "step": 948 + }, + { + "epoch": 0.0531414492104379, + "grad_norm": 1.299742341041565, + "learning_rate": 4.735e-06, + "loss": 0.4027, + "step": 949 + }, + { + "epoch": 0.053197446522566916, + "grad_norm": 1.3480663299560547, + "learning_rate": 4.74e-06, + 
"loss": 0.4369, + "step": 950 + }, + { + "epoch": 0.053253443834695935, + "grad_norm": 1.2166800498962402, + "learning_rate": 4.745e-06, + "loss": 0.5011, + "step": 951 + }, + { + "epoch": 0.053309441146824954, + "grad_norm": 1.2832262516021729, + "learning_rate": 4.75e-06, + "loss": 0.402, + "step": 952 + }, + { + "epoch": 0.05336543845895397, + "grad_norm": 1.4792896509170532, + "learning_rate": 4.755e-06, + "loss": 0.5747, + "step": 953 + }, + { + "epoch": 0.053421435771082985, + "grad_norm": 1.568776249885559, + "learning_rate": 4.76e-06, + "loss": 0.4724, + "step": 954 + }, + { + "epoch": 0.053477433083212005, + "grad_norm": 1.1982817649841309, + "learning_rate": 4.765e-06, + "loss": 0.5061, + "step": 955 + }, + { + "epoch": 0.053533430395341024, + "grad_norm": 1.3185970783233643, + "learning_rate": 4.77e-06, + "loss": 0.5286, + "step": 956 + }, + { + "epoch": 0.05358942770747004, + "grad_norm": 1.147926688194275, + "learning_rate": 4.775e-06, + "loss": 0.3612, + "step": 957 + }, + { + "epoch": 0.05364542501959906, + "grad_norm": 1.26372230052948, + "learning_rate": 4.780000000000001e-06, + "loss": 0.5493, + "step": 958 + }, + { + "epoch": 0.053701422331728074, + "grad_norm": 1.3605045080184937, + "learning_rate": 4.785e-06, + "loss": 0.6715, + "step": 959 + }, + { + "epoch": 0.05375741964385709, + "grad_norm": 1.3014461994171143, + "learning_rate": 4.79e-06, + "loss": 0.4488, + "step": 960 + }, + { + "epoch": 0.05381341695598611, + "grad_norm": 1.37398362159729, + "learning_rate": 4.795e-06, + "loss": 0.4222, + "step": 961 + }, + { + "epoch": 0.05386941426811513, + "grad_norm": 1.230643391609192, + "learning_rate": 4.800000000000001e-06, + "loss": 0.4864, + "step": 962 + }, + { + "epoch": 0.05392541158024415, + "grad_norm": 1.095284104347229, + "learning_rate": 4.805000000000001e-06, + "loss": 0.5956, + "step": 963 + }, + { + "epoch": 0.05398140889237316, + "grad_norm": 1.1946420669555664, + "learning_rate": 4.81e-06, + "loss": 0.4388, + "step": 964 + }, + { 
+ "epoch": 0.05403740620450218, + "grad_norm": 1.0814542770385742, + "learning_rate": 4.8150000000000005e-06, + "loss": 0.4001, + "step": 965 + }, + { + "epoch": 0.0540934035166312, + "grad_norm": 1.1272335052490234, + "learning_rate": 4.8200000000000004e-06, + "loss": 0.4352, + "step": 966 + }, + { + "epoch": 0.05414940082876022, + "grad_norm": 1.2352651357650757, + "learning_rate": 4.825e-06, + "loss": 0.4212, + "step": 967 + }, + { + "epoch": 0.05420539814088924, + "grad_norm": 1.3018606901168823, + "learning_rate": 4.83e-06, + "loss": 0.429, + "step": 968 + }, + { + "epoch": 0.05426139545301825, + "grad_norm": 1.3863848447799683, + "learning_rate": 4.835e-06, + "loss": 0.5049, + "step": 969 + }, + { + "epoch": 0.05431739276514727, + "grad_norm": 1.1036901473999023, + "learning_rate": 4.84e-06, + "loss": 0.4213, + "step": 970 + }, + { + "epoch": 0.05437339007727629, + "grad_norm": 1.420832633972168, + "learning_rate": 4.845e-06, + "loss": 0.42, + "step": 971 + }, + { + "epoch": 0.05442938738940531, + "grad_norm": 1.2023344039916992, + "learning_rate": 4.85e-06, + "loss": 0.5156, + "step": 972 + }, + { + "epoch": 0.05448538470153433, + "grad_norm": 1.184357762336731, + "learning_rate": 4.855e-06, + "loss": 0.4431, + "step": 973 + }, + { + "epoch": 0.05454138201366335, + "grad_norm": 1.2339346408843994, + "learning_rate": 4.86e-06, + "loss": 0.4456, + "step": 974 + }, + { + "epoch": 0.05459737932579236, + "grad_norm": 1.1052863597869873, + "learning_rate": 4.865e-06, + "loss": 0.3762, + "step": 975 + }, + { + "epoch": 0.05465337663792138, + "grad_norm": 1.15488600730896, + "learning_rate": 4.87e-06, + "loss": 0.4208, + "step": 976 + }, + { + "epoch": 0.0547093739500504, + "grad_norm": 1.5858561992645264, + "learning_rate": 4.875000000000001e-06, + "loss": 0.4824, + "step": 977 + }, + { + "epoch": 0.05476537126217942, + "grad_norm": 1.2476688623428345, + "learning_rate": 4.880000000000001e-06, + "loss": 0.6081, + "step": 978 + }, + { + "epoch": 
0.054821368574308436, + "grad_norm": 0.9850094318389893, + "learning_rate": 4.885e-06, + "loss": 0.4135, + "step": 979 + }, + { + "epoch": 0.05487736588643745, + "grad_norm": 1.3759442567825317, + "learning_rate": 4.89e-06, + "loss": 0.5223, + "step": 980 + }, + { + "epoch": 0.05493336319856647, + "grad_norm": 1.2599742412567139, + "learning_rate": 4.8950000000000006e-06, + "loss": 0.5832, + "step": 981 + }, + { + "epoch": 0.05498936051069549, + "grad_norm": 2.0648694038391113, + "learning_rate": 4.9000000000000005e-06, + "loss": 0.8202, + "step": 982 + }, + { + "epoch": 0.055045357822824506, + "grad_norm": 1.2609869241714478, + "learning_rate": 4.9050000000000005e-06, + "loss": 0.4878, + "step": 983 + }, + { + "epoch": 0.055101355134953525, + "grad_norm": 1.1653976440429688, + "learning_rate": 4.9100000000000004e-06, + "loss": 0.4716, + "step": 984 + }, + { + "epoch": 0.05515735244708254, + "grad_norm": 1.4410059452056885, + "learning_rate": 4.915e-06, + "loss": 0.5038, + "step": 985 + }, + { + "epoch": 0.05521334975921156, + "grad_norm": 1.3019509315490723, + "learning_rate": 4.92e-06, + "loss": 0.417, + "step": 986 + }, + { + "epoch": 0.055269347071340576, + "grad_norm": 1.4015498161315918, + "learning_rate": 4.925e-06, + "loss": 0.6574, + "step": 987 + }, + { + "epoch": 0.055325344383469595, + "grad_norm": 1.9603328704833984, + "learning_rate": 4.93e-06, + "loss": 0.5267, + "step": 988 + }, + { + "epoch": 0.055381341695598614, + "grad_norm": 1.3965656757354736, + "learning_rate": 4.935e-06, + "loss": 0.4231, + "step": 989 + }, + { + "epoch": 0.055437339007727626, + "grad_norm": 1.1022210121154785, + "learning_rate": 4.94e-06, + "loss": 0.4136, + "step": 990 + }, + { + "epoch": 0.055493336319856645, + "grad_norm": 1.0762064456939697, + "learning_rate": 4.945e-06, + "loss": 0.4043, + "step": 991 + }, + { + "epoch": 0.055549333631985665, + "grad_norm": 1.375928282737732, + "learning_rate": 4.950000000000001e-06, + "loss": 0.4881, + "step": 992 + }, + { + "epoch": 
0.055605330944114684, + "grad_norm": 1.385292649269104, + "learning_rate": 4.955e-06, + "loss": 0.4528, + "step": 993 + }, + { + "epoch": 0.0556613282562437, + "grad_norm": 1.17954683303833, + "learning_rate": 4.96e-06, + "loss": 0.4497, + "step": 994 + }, + { + "epoch": 0.055717325568372715, + "grad_norm": 1.412698745727539, + "learning_rate": 4.965e-06, + "loss": 0.4283, + "step": 995 + }, + { + "epoch": 0.055773322880501734, + "grad_norm": 1.191758155822754, + "learning_rate": 4.970000000000001e-06, + "loss": 0.3965, + "step": 996 + }, + { + "epoch": 0.05582932019263075, + "grad_norm": 6.146461486816406, + "learning_rate": 4.975000000000001e-06, + "loss": 0.5264, + "step": 997 + }, + { + "epoch": 0.05588531750475977, + "grad_norm": 1.2692173719406128, + "learning_rate": 4.98e-06, + "loss": 0.4529, + "step": 998 + }, + { + "epoch": 0.05594131481688879, + "grad_norm": 1.5443811416625977, + "learning_rate": 4.985e-06, + "loss": 0.4728, + "step": 999 + }, + { + "epoch": 0.055997312129017804, + "grad_norm": 1.2640265226364136, + "learning_rate": 4.9900000000000005e-06, + "loss": 0.5565, + "step": 1000 + }, + { + "epoch": 0.05605330944114682, + "grad_norm": 1.2944355010986328, + "learning_rate": 4.9950000000000005e-06, + "loss": 0.5138, + "step": 1001 + }, + { + "epoch": 0.05610930675327584, + "grad_norm": 1.1413021087646484, + "learning_rate": 5e-06, + "loss": 0.4785, + "step": 1002 + }, + { + "epoch": 0.05616530406540486, + "grad_norm": 1.3369749784469604, + "learning_rate": 5.005e-06, + "loss": 0.4315, + "step": 1003 + }, + { + "epoch": 0.05622130137753388, + "grad_norm": 1.085232138633728, + "learning_rate": 5.01e-06, + "loss": 0.3798, + "step": 1004 + }, + { + "epoch": 0.0562772986896629, + "grad_norm": 1.620424747467041, + "learning_rate": 5.015e-06, + "loss": 0.4397, + "step": 1005 + }, + { + "epoch": 0.05633329600179191, + "grad_norm": 1.3571701049804688, + "learning_rate": 5.02e-06, + "loss": 0.5498, + "step": 1006 + }, + { + "epoch": 0.05638929331392093, + 
"grad_norm": 1.3521087169647217, + "learning_rate": 5.025e-06, + "loss": 0.3857, + "step": 1007 + }, + { + "epoch": 0.05644529062604995, + "grad_norm": 1.1394175291061401, + "learning_rate": 5.03e-06, + "loss": 0.5012, + "step": 1008 + }, + { + "epoch": 0.05650128793817897, + "grad_norm": 1.5971336364746094, + "learning_rate": 5.035e-06, + "loss": 0.3897, + "step": 1009 + }, + { + "epoch": 0.05655728525030799, + "grad_norm": 1.226791262626648, + "learning_rate": 5.04e-06, + "loss": 0.4669, + "step": 1010 + }, + { + "epoch": 0.056613282562437, + "grad_norm": 1.140771746635437, + "learning_rate": 5.045000000000001e-06, + "loss": 0.4582, + "step": 1011 + }, + { + "epoch": 0.05666927987456602, + "grad_norm": 1.2240629196166992, + "learning_rate": 5.050000000000001e-06, + "loss": 0.5498, + "step": 1012 + }, + { + "epoch": 0.05672527718669504, + "grad_norm": 1.2744507789611816, + "learning_rate": 5.055e-06, + "loss": 0.5201, + "step": 1013 + }, + { + "epoch": 0.05678127449882406, + "grad_norm": 1.2416425943374634, + "learning_rate": 5.06e-06, + "loss": 0.455, + "step": 1014 + }, + { + "epoch": 0.05683727181095308, + "grad_norm": 1.136209487915039, + "learning_rate": 5.065000000000001e-06, + "loss": 0.4787, + "step": 1015 + }, + { + "epoch": 0.05689326912308209, + "grad_norm": 1.156378149986267, + "learning_rate": 5.070000000000001e-06, + "loss": 0.4328, + "step": 1016 + }, + { + "epoch": 0.05694926643521111, + "grad_norm": 1.2695443630218506, + "learning_rate": 5.0750000000000005e-06, + "loss": 0.4819, + "step": 1017 + }, + { + "epoch": 0.05700526374734013, + "grad_norm": 1.191765308380127, + "learning_rate": 5.08e-06, + "loss": 0.3793, + "step": 1018 + }, + { + "epoch": 0.05706126105946915, + "grad_norm": 1.2647852897644043, + "learning_rate": 5.0850000000000004e-06, + "loss": 0.3896, + "step": 1019 + }, + { + "epoch": 0.057117258371598166, + "grad_norm": 1.2763571739196777, + "learning_rate": 5.09e-06, + "loss": 0.6245, + "step": 1020 + }, + { + "epoch": 
0.05717325568372718, + "grad_norm": 1.239418864250183, + "learning_rate": 5.095e-06, + "loss": 0.4452, + "step": 1021 + }, + { + "epoch": 0.0572292529958562, + "grad_norm": 1.3589504957199097, + "learning_rate": 5.1e-06, + "loss": 0.514, + "step": 1022 + }, + { + "epoch": 0.05728525030798522, + "grad_norm": 1.1839314699172974, + "learning_rate": 5.105e-06, + "loss": 0.4385, + "step": 1023 + }, + { + "epoch": 0.057341247620114236, + "grad_norm": 1.2520166635513306, + "learning_rate": 5.11e-06, + "loss": 0.5166, + "step": 1024 + }, + { + "epoch": 0.057397244932243255, + "grad_norm": 1.7305189371109009, + "learning_rate": 5.115e-06, + "loss": 0.4474, + "step": 1025 + }, + { + "epoch": 0.05745324224437227, + "grad_norm": 1.2060898542404175, + "learning_rate": 5.12e-06, + "loss": 0.509, + "step": 1026 + }, + { + "epoch": 0.057509239556501286, + "grad_norm": 1.473137617111206, + "learning_rate": 5.125e-06, + "loss": 0.4312, + "step": 1027 + }, + { + "epoch": 0.057565236868630305, + "grad_norm": 1.0966969728469849, + "learning_rate": 5.13e-06, + "loss": 0.4443, + "step": 1028 + }, + { + "epoch": 0.057621234180759325, + "grad_norm": 1.2823864221572876, + "learning_rate": 5.135e-06, + "loss": 0.533, + "step": 1029 + }, + { + "epoch": 0.057677231492888344, + "grad_norm": 1.2967363595962524, + "learning_rate": 5.140000000000001e-06, + "loss": 0.4279, + "step": 1030 + }, + { + "epoch": 0.057733228805017356, + "grad_norm": 1.336930751800537, + "learning_rate": 5.145000000000001e-06, + "loss": 0.5224, + "step": 1031 + }, + { + "epoch": 0.057789226117146375, + "grad_norm": 1.5669735670089722, + "learning_rate": 5.15e-06, + "loss": 0.5194, + "step": 1032 + }, + { + "epoch": 0.057845223429275394, + "grad_norm": 1.1038568019866943, + "learning_rate": 5.155e-06, + "loss": 0.4, + "step": 1033 + }, + { + "epoch": 0.05790122074140441, + "grad_norm": 1.100191354751587, + "learning_rate": 5.1600000000000006e-06, + "loss": 0.4871, + "step": 1034 + }, + { + "epoch": 0.05795721805353343, + 
"grad_norm": 1.2605558633804321, + "learning_rate": 5.1650000000000005e-06, + "loss": 0.4253, + "step": 1035 + }, + { + "epoch": 0.05801321536566245, + "grad_norm": 1.4248777627944946, + "learning_rate": 5.1700000000000005e-06, + "loss": 0.3997, + "step": 1036 + }, + { + "epoch": 0.058069212677791464, + "grad_norm": 1.2265772819519043, + "learning_rate": 5.175e-06, + "loss": 0.4736, + "step": 1037 + }, + { + "epoch": 0.05812520998992048, + "grad_norm": 1.3616083860397339, + "learning_rate": 5.18e-06, + "loss": 0.4686, + "step": 1038 + }, + { + "epoch": 0.0581812073020495, + "grad_norm": 1.1219232082366943, + "learning_rate": 5.185e-06, + "loss": 0.3539, + "step": 1039 + }, + { + "epoch": 0.05823720461417852, + "grad_norm": 1.4267109632492065, + "learning_rate": 5.19e-06, + "loss": 0.5069, + "step": 1040 + }, + { + "epoch": 0.05829320192630754, + "grad_norm": 1.2390908002853394, + "learning_rate": 5.195e-06, + "loss": 0.5129, + "step": 1041 + }, + { + "epoch": 0.05834919923843655, + "grad_norm": 1.2516577243804932, + "learning_rate": 5.2e-06, + "loss": 0.5162, + "step": 1042 + }, + { + "epoch": 0.05840519655056557, + "grad_norm": 1.876985788345337, + "learning_rate": 5.205e-06, + "loss": 0.3696, + "step": 1043 + }, + { + "epoch": 0.05846119386269459, + "grad_norm": 1.3517183065414429, + "learning_rate": 5.21e-06, + "loss": 0.5062, + "step": 1044 + }, + { + "epoch": 0.05851719117482361, + "grad_norm": 1.160207748413086, + "learning_rate": 5.215e-06, + "loss": 0.4457, + "step": 1045 + }, + { + "epoch": 0.05857318848695263, + "grad_norm": 1.3615148067474365, + "learning_rate": 5.220000000000001e-06, + "loss": 0.5174, + "step": 1046 + }, + { + "epoch": 0.05862918579908164, + "grad_norm": 1.04086434841156, + "learning_rate": 5.225e-06, + "loss": 0.4562, + "step": 1047 + }, + { + "epoch": 0.05868518311121066, + "grad_norm": 1.3939971923828125, + "learning_rate": 5.23e-06, + "loss": 0.4531, + "step": 1048 + }, + { + "epoch": 0.05874118042333968, + "grad_norm": 
1.3616175651550293, + "learning_rate": 5.235000000000001e-06, + "loss": 0.478, + "step": 1049 + }, + { + "epoch": 0.0587971777354687, + "grad_norm": 1.2056399583816528, + "learning_rate": 5.240000000000001e-06, + "loss": 0.5392, + "step": 1050 + }, + { + "epoch": 0.05885317504759772, + "grad_norm": 1.2128673791885376, + "learning_rate": 5.245e-06, + "loss": 0.3881, + "step": 1051 + }, + { + "epoch": 0.05890917235972673, + "grad_norm": 1.4392180442810059, + "learning_rate": 5.25e-06, + "loss": 0.4251, + "step": 1052 + }, + { + "epoch": 0.05896516967185575, + "grad_norm": 1.199028730392456, + "learning_rate": 5.2550000000000005e-06, + "loss": 0.3948, + "step": 1053 + }, + { + "epoch": 0.05902116698398477, + "grad_norm": 1.0078922510147095, + "learning_rate": 5.2600000000000005e-06, + "loss": 0.3426, + "step": 1054 + }, + { + "epoch": 0.05907716429611379, + "grad_norm": 1.1445984840393066, + "learning_rate": 5.265e-06, + "loss": 0.3774, + "step": 1055 + }, + { + "epoch": 0.05913316160824281, + "grad_norm": 1.3585809469223022, + "learning_rate": 5.2699999999999995e-06, + "loss": 0.6656, + "step": 1056 + }, + { + "epoch": 0.05918915892037182, + "grad_norm": 1.065910816192627, + "learning_rate": 5.275e-06, + "loss": 0.4658, + "step": 1057 + }, + { + "epoch": 0.05924515623250084, + "grad_norm": 1.2044261693954468, + "learning_rate": 5.28e-06, + "loss": 0.5031, + "step": 1058 + }, + { + "epoch": 0.05930115354462986, + "grad_norm": 1.6254397630691528, + "learning_rate": 5.285e-06, + "loss": 0.4542, + "step": 1059 + }, + { + "epoch": 0.05935715085675888, + "grad_norm": 1.1306949853897095, + "learning_rate": 5.29e-06, + "loss": 0.4671, + "step": 1060 + }, + { + "epoch": 0.059413148168887896, + "grad_norm": 1.2099978923797607, + "learning_rate": 5.295e-06, + "loss": 0.4892, + "step": 1061 + }, + { + "epoch": 0.05946914548101691, + "grad_norm": 1.401309609413147, + "learning_rate": 5.3e-06, + "loss": 0.5684, + "step": 1062 + }, + { + "epoch": 0.05952514279314593, + "grad_norm": 
1.8093961477279663, + "learning_rate": 5.305e-06, + "loss": 0.4251, + "step": 1063 + }, + { + "epoch": 0.059581140105274946, + "grad_norm": 1.5952280759811401, + "learning_rate": 5.31e-06, + "loss": 0.5887, + "step": 1064 + }, + { + "epoch": 0.059637137417403965, + "grad_norm": 1.223723292350769, + "learning_rate": 5.315000000000001e-06, + "loss": 0.3764, + "step": 1065 + }, + { + "epoch": 0.059693134729532984, + "grad_norm": 1.25835120677948, + "learning_rate": 5.32e-06, + "loss": 0.4321, + "step": 1066 + }, + { + "epoch": 0.059749132041662004, + "grad_norm": 1.1497814655303955, + "learning_rate": 5.325e-06, + "loss": 0.4415, + "step": 1067 + }, + { + "epoch": 0.059805129353791016, + "grad_norm": 1.0686076879501343, + "learning_rate": 5.330000000000001e-06, + "loss": 0.3571, + "step": 1068 + }, + { + "epoch": 0.059861126665920035, + "grad_norm": 1.2597800493240356, + "learning_rate": 5.335000000000001e-06, + "loss": 0.4686, + "step": 1069 + }, + { + "epoch": 0.059917123978049054, + "grad_norm": 1.2790300846099854, + "learning_rate": 5.3400000000000005e-06, + "loss": 0.4832, + "step": 1070 + }, + { + "epoch": 0.05997312129017807, + "grad_norm": 1.1553374528884888, + "learning_rate": 5.345e-06, + "loss": 0.5143, + "step": 1071 + }, + { + "epoch": 0.06002911860230709, + "grad_norm": 1.1005035638809204, + "learning_rate": 5.3500000000000004e-06, + "loss": 0.4896, + "step": 1072 + }, + { + "epoch": 0.060085115914436105, + "grad_norm": 1.3469942808151245, + "learning_rate": 5.355e-06, + "loss": 0.4591, + "step": 1073 + }, + { + "epoch": 0.060141113226565124, + "grad_norm": 1.2604517936706543, + "learning_rate": 5.36e-06, + "loss": 0.5521, + "step": 1074 + }, + { + "epoch": 0.06019711053869414, + "grad_norm": 1.243295431137085, + "learning_rate": 5.365e-06, + "loss": 0.6024, + "step": 1075 + }, + { + "epoch": 0.06025310785082316, + "grad_norm": 1.3653740882873535, + "learning_rate": 5.37e-06, + "loss": 0.4207, + "step": 1076 + }, + { + "epoch": 0.06030910516295218, + 
"grad_norm": 1.1624484062194824, + "learning_rate": 5.375e-06, + "loss": 0.4241, + "step": 1077 + }, + { + "epoch": 0.060365102475081193, + "grad_norm": 1.260948657989502, + "learning_rate": 5.38e-06, + "loss": 0.4985, + "step": 1078 + }, + { + "epoch": 0.06042109978721021, + "grad_norm": 1.099338173866272, + "learning_rate": 5.385e-06, + "loss": 0.4063, + "step": 1079 + }, + { + "epoch": 0.06047709709933923, + "grad_norm": 1.4310381412506104, + "learning_rate": 5.390000000000001e-06, + "loss": 0.5739, + "step": 1080 + }, + { + "epoch": 0.06053309441146825, + "grad_norm": 1.2879993915557861, + "learning_rate": 5.395e-06, + "loss": 0.4956, + "step": 1081 + }, + { + "epoch": 0.06058909172359727, + "grad_norm": 1.1941678524017334, + "learning_rate": 5.4e-06, + "loss": 0.3758, + "step": 1082 + }, + { + "epoch": 0.06064508903572628, + "grad_norm": 1.1191668510437012, + "learning_rate": 5.405e-06, + "loss": 0.5986, + "step": 1083 + }, + { + "epoch": 0.0607010863478553, + "grad_norm": 1.4496634006500244, + "learning_rate": 5.410000000000001e-06, + "loss": 0.4977, + "step": 1084 + }, + { + "epoch": 0.06075708365998432, + "grad_norm": 1.4725685119628906, + "learning_rate": 5.415e-06, + "loss": 0.5158, + "step": 1085 + }, + { + "epoch": 0.06081308097211334, + "grad_norm": 1.0966806411743164, + "learning_rate": 5.42e-06, + "loss": 0.4611, + "step": 1086 + }, + { + "epoch": 0.06086907828424236, + "grad_norm": 1.06184720993042, + "learning_rate": 5.4250000000000006e-06, + "loss": 0.4319, + "step": 1087 + }, + { + "epoch": 0.06092507559637137, + "grad_norm": 1.874305248260498, + "learning_rate": 5.4300000000000005e-06, + "loss": 0.5097, + "step": 1088 + }, + { + "epoch": 0.06098107290850039, + "grad_norm": 2.8478968143463135, + "learning_rate": 5.4350000000000005e-06, + "loss": 0.3575, + "step": 1089 + }, + { + "epoch": 0.06103707022062941, + "grad_norm": 1.3469611406326294, + "learning_rate": 5.44e-06, + "loss": 0.428, + "step": 1090 + }, + { + "epoch": 0.06109306753275843, + 
"grad_norm": 1.0713883638381958, + "learning_rate": 5.445e-06, + "loss": 0.3892, + "step": 1091 + }, + { + "epoch": 0.06114906484488745, + "grad_norm": 1.3371243476867676, + "learning_rate": 5.45e-06, + "loss": 0.577, + "step": 1092 + }, + { + "epoch": 0.06120506215701646, + "grad_norm": 1.6126753091812134, + "learning_rate": 5.455e-06, + "loss": 0.4554, + "step": 1093 + }, + { + "epoch": 0.06126105946914548, + "grad_norm": 1.1165636777877808, + "learning_rate": 5.46e-06, + "loss": 0.358, + "step": 1094 + }, + { + "epoch": 0.0613170567812745, + "grad_norm": 1.371865153312683, + "learning_rate": 5.465e-06, + "loss": 0.4045, + "step": 1095 + }, + { + "epoch": 0.06137305409340352, + "grad_norm": 1.3662270307540894, + "learning_rate": 5.47e-06, + "loss": 0.3902, + "step": 1096 + }, + { + "epoch": 0.061429051405532537, + "grad_norm": 1.5756611824035645, + "learning_rate": 5.475e-06, + "loss": 0.8939, + "step": 1097 + }, + { + "epoch": 0.061485048717661556, + "grad_norm": 1.336142897605896, + "learning_rate": 5.48e-06, + "loss": 0.549, + "step": 1098 + }, + { + "epoch": 0.06154104602979057, + "grad_norm": 1.1127017736434937, + "learning_rate": 5.485000000000001e-06, + "loss": 0.462, + "step": 1099 + }, + { + "epoch": 0.06159704334191959, + "grad_norm": 1.4991849660873413, + "learning_rate": 5.49e-06, + "loss": 0.4478, + "step": 1100 + }, + { + "epoch": 0.061653040654048606, + "grad_norm": 1.2078429460525513, + "learning_rate": 5.495e-06, + "loss": 0.3859, + "step": 1101 + }, + { + "epoch": 0.061709037966177625, + "grad_norm": 1.0736446380615234, + "learning_rate": 5.500000000000001e-06, + "loss": 0.3938, + "step": 1102 + }, + { + "epoch": 0.061765035278306644, + "grad_norm": 1.1046690940856934, + "learning_rate": 5.505000000000001e-06, + "loss": 0.335, + "step": 1103 + }, + { + "epoch": 0.06182103259043566, + "grad_norm": 1.3569049835205078, + "learning_rate": 5.510000000000001e-06, + "loss": 0.3924, + "step": 1104 + }, + { + "epoch": 0.061877029902564676, + "grad_norm": 
2.3091189861297607, + "learning_rate": 5.515e-06, + "loss": 0.5528, + "step": 1105 + }, + { + "epoch": 0.061933027214693695, + "grad_norm": 1.2597846984863281, + "learning_rate": 5.5200000000000005e-06, + "loss": 0.4925, + "step": 1106 + }, + { + "epoch": 0.061989024526822714, + "grad_norm": 1.195900321006775, + "learning_rate": 5.5250000000000005e-06, + "loss": 0.5321, + "step": 1107 + }, + { + "epoch": 0.06204502183895173, + "grad_norm": 1.2691329717636108, + "learning_rate": 5.53e-06, + "loss": 0.4351, + "step": 1108 + }, + { + "epoch": 0.062101019151080745, + "grad_norm": 1.3498635292053223, + "learning_rate": 5.535e-06, + "loss": 0.4116, + "step": 1109 + }, + { + "epoch": 0.062157016463209765, + "grad_norm": 1.189966082572937, + "learning_rate": 5.54e-06, + "loss": 0.4857, + "step": 1110 + }, + { + "epoch": 0.062213013775338784, + "grad_norm": 1.262734293937683, + "learning_rate": 5.545e-06, + "loss": 0.6124, + "step": 1111 + }, + { + "epoch": 0.0622690110874678, + "grad_norm": 1.9112929105758667, + "learning_rate": 5.55e-06, + "loss": 0.4752, + "step": 1112 + }, + { + "epoch": 0.06232500839959682, + "grad_norm": 1.508002519607544, + "learning_rate": 5.555e-06, + "loss": 0.4645, + "step": 1113 + }, + { + "epoch": 0.062381005711725834, + "grad_norm": 1.417464017868042, + "learning_rate": 5.56e-06, + "loss": 0.449, + "step": 1114 + }, + { + "epoch": 0.06243700302385485, + "grad_norm": 1.1880978345870972, + "learning_rate": 5.565e-06, + "loss": 0.4766, + "step": 1115 + }, + { + "epoch": 0.06249300033598387, + "grad_norm": 1.3594896793365479, + "learning_rate": 5.57e-06, + "loss": 0.4782, + "step": 1116 + }, + { + "epoch": 0.06254899764811289, + "grad_norm": 1.158437967300415, + "learning_rate": 5.575e-06, + "loss": 0.5021, + "step": 1117 + }, + { + "epoch": 0.06260499496024191, + "grad_norm": 1.074735164642334, + "learning_rate": 5.580000000000001e-06, + "loss": 0.4648, + "step": 1118 + }, + { + "epoch": 0.06266099227237093, + "grad_norm": 1.4043564796447754, + 
"learning_rate": 5.585e-06, + "loss": 0.5121, + "step": 1119 + }, + { + "epoch": 0.06271698958449995, + "grad_norm": 1.1679229736328125, + "learning_rate": 5.59e-06, + "loss": 0.3579, + "step": 1120 + }, + { + "epoch": 0.06277298689662897, + "grad_norm": 1.1285709142684937, + "learning_rate": 5.595000000000001e-06, + "loss": 0.2937, + "step": 1121 + }, + { + "epoch": 0.06282898420875797, + "grad_norm": 1.2264790534973145, + "learning_rate": 5.600000000000001e-06, + "loss": 0.5157, + "step": 1122 + }, + { + "epoch": 0.06288498152088699, + "grad_norm": 1.1236133575439453, + "learning_rate": 5.6050000000000005e-06, + "loss": 0.4314, + "step": 1123 + }, + { + "epoch": 0.06294097883301601, + "grad_norm": 1.0496784448623657, + "learning_rate": 5.61e-06, + "loss": 0.3789, + "step": 1124 + }, + { + "epoch": 0.06299697614514503, + "grad_norm": 1.0616122484207153, + "learning_rate": 5.6150000000000005e-06, + "loss": 0.4315, + "step": 1125 + }, + { + "epoch": 0.06305297345727405, + "grad_norm": 1.397759199142456, + "learning_rate": 5.62e-06, + "loss": 0.7762, + "step": 1126 + }, + { + "epoch": 0.06310897076940307, + "grad_norm": 1.163167119026184, + "learning_rate": 5.625e-06, + "loss": 0.4815, + "step": 1127 + }, + { + "epoch": 0.06316496808153209, + "grad_norm": 1.3902819156646729, + "learning_rate": 5.63e-06, + "loss": 0.5662, + "step": 1128 + }, + { + "epoch": 0.06322096539366111, + "grad_norm": 1.2462685108184814, + "learning_rate": 5.635e-06, + "loss": 0.458, + "step": 1129 + }, + { + "epoch": 0.06327696270579013, + "grad_norm": 1.4905333518981934, + "learning_rate": 5.64e-06, + "loss": 0.4204, + "step": 1130 + }, + { + "epoch": 0.06333296001791915, + "grad_norm": 1.259095549583435, + "learning_rate": 5.645e-06, + "loss": 0.4723, + "step": 1131 + }, + { + "epoch": 0.06338895733004815, + "grad_norm": 1.1537303924560547, + "learning_rate": 5.65e-06, + "loss": 0.4517, + "step": 1132 + }, + { + "epoch": 0.06344495464217717, + "grad_norm": 1.472391963005066, + 
"learning_rate": 5.655000000000001e-06, + "loss": 0.451, + "step": 1133 + }, + { + "epoch": 0.06350095195430619, + "grad_norm": 1.1070297956466675, + "learning_rate": 5.66e-06, + "loss": 0.4294, + "step": 1134 + }, + { + "epoch": 0.06355694926643521, + "grad_norm": 1.4566556215286255, + "learning_rate": 5.665e-06, + "loss": 0.6163, + "step": 1135 + }, + { + "epoch": 0.06361294657856423, + "grad_norm": 1.146721363067627, + "learning_rate": 5.67e-06, + "loss": 0.3609, + "step": 1136 + }, + { + "epoch": 0.06366894389069325, + "grad_norm": 1.228152871131897, + "learning_rate": 5.675000000000001e-06, + "loss": 0.5157, + "step": 1137 + }, + { + "epoch": 0.06372494120282227, + "grad_norm": 1.262759804725647, + "learning_rate": 5.680000000000001e-06, + "loss": 0.5129, + "step": 1138 + }, + { + "epoch": 0.06378093851495129, + "grad_norm": 1.3428313732147217, + "learning_rate": 5.685e-06, + "loss": 0.4894, + "step": 1139 + }, + { + "epoch": 0.0638369358270803, + "grad_norm": 1.4424748420715332, + "learning_rate": 5.690000000000001e-06, + "loss": 0.4817, + "step": 1140 + }, + { + "epoch": 0.06389293313920932, + "grad_norm": 1.4872840642929077, + "learning_rate": 5.6950000000000005e-06, + "loss": 0.4142, + "step": 1141 + }, + { + "epoch": 0.06394893045133833, + "grad_norm": 1.3789427280426025, + "learning_rate": 5.7000000000000005e-06, + "loss": 0.4699, + "step": 1142 + }, + { + "epoch": 0.06400492776346735, + "grad_norm": 1.0845141410827637, + "learning_rate": 5.705e-06, + "loss": 0.4109, + "step": 1143 + }, + { + "epoch": 0.06406092507559637, + "grad_norm": 1.1307657957077026, + "learning_rate": 5.71e-06, + "loss": 0.4811, + "step": 1144 + }, + { + "epoch": 0.06411692238772539, + "grad_norm": 1.0611789226531982, + "learning_rate": 5.715e-06, + "loss": 0.4281, + "step": 1145 + }, + { + "epoch": 0.0641729196998544, + "grad_norm": 1.1467761993408203, + "learning_rate": 5.72e-06, + "loss": 0.3959, + "step": 1146 + }, + { + "epoch": 0.06422891701198342, + "grad_norm": 
1.3303271532058716, + "learning_rate": 5.725e-06, + "loss": 0.4611, + "step": 1147 + }, + { + "epoch": 0.06428491432411244, + "grad_norm": 1.106090784072876, + "learning_rate": 5.73e-06, + "loss": 0.451, + "step": 1148 + }, + { + "epoch": 0.06434091163624146, + "grad_norm": 1.0233514308929443, + "learning_rate": 5.735e-06, + "loss": 0.4053, + "step": 1149 + }, + { + "epoch": 0.06439690894837048, + "grad_norm": 0.9405644536018372, + "learning_rate": 5.74e-06, + "loss": 0.3488, + "step": 1150 + }, + { + "epoch": 0.0644529062604995, + "grad_norm": 1.0472259521484375, + "learning_rate": 5.745e-06, + "loss": 0.4616, + "step": 1151 + }, + { + "epoch": 0.06450890357262852, + "grad_norm": 1.146802544593811, + "learning_rate": 5.750000000000001e-06, + "loss": 0.4724, + "step": 1152 + }, + { + "epoch": 0.06456490088475753, + "grad_norm": 1.3439722061157227, + "learning_rate": 5.755e-06, + "loss": 0.5802, + "step": 1153 + }, + { + "epoch": 0.06462089819688654, + "grad_norm": 1.4531556367874146, + "learning_rate": 5.76e-06, + "loss": 0.5034, + "step": 1154 + }, + { + "epoch": 0.06467689550901556, + "grad_norm": 1.0436081886291504, + "learning_rate": 5.765e-06, + "loss": 0.3639, + "step": 1155 + }, + { + "epoch": 0.06473289282114458, + "grad_norm": 1.1610552072525024, + "learning_rate": 5.770000000000001e-06, + "loss": 0.4778, + "step": 1156 + }, + { + "epoch": 0.0647888901332736, + "grad_norm": 1.209583044052124, + "learning_rate": 5.775000000000001e-06, + "loss": 0.4406, + "step": 1157 + }, + { + "epoch": 0.06484488744540262, + "grad_norm": 1.3594990968704224, + "learning_rate": 5.78e-06, + "loss": 0.5006, + "step": 1158 + }, + { + "epoch": 0.06490088475753164, + "grad_norm": 1.2716442346572876, + "learning_rate": 5.7850000000000005e-06, + "loss": 0.5065, + "step": 1159 + }, + { + "epoch": 0.06495688206966066, + "grad_norm": 1.2829830646514893, + "learning_rate": 5.7900000000000005e-06, + "loss": 0.6007, + "step": 1160 + }, + { + "epoch": 0.06501287938178968, + "grad_norm": 
1.3459588289260864, + "learning_rate": 5.795e-06, + "loss": 0.5943, + "step": 1161 + }, + { + "epoch": 0.0650688766939187, + "grad_norm": 1.4075367450714111, + "learning_rate": 5.8e-06, + "loss": 0.5364, + "step": 1162 + }, + { + "epoch": 0.0651248740060477, + "grad_norm": 1.419149398803711, + "learning_rate": 5.805e-06, + "loss": 0.4434, + "step": 1163 + }, + { + "epoch": 0.06518087131817672, + "grad_norm": 1.4249473810195923, + "learning_rate": 5.81e-06, + "loss": 0.5376, + "step": 1164 + }, + { + "epoch": 0.06523686863030574, + "grad_norm": 1.5048781633377075, + "learning_rate": 5.815e-06, + "loss": 0.5568, + "step": 1165 + }, + { + "epoch": 0.06529286594243476, + "grad_norm": 1.059531807899475, + "learning_rate": 5.82e-06, + "loss": 0.3689, + "step": 1166 + }, + { + "epoch": 0.06534886325456378, + "grad_norm": 1.1879932880401611, + "learning_rate": 5.825000000000001e-06, + "loss": 0.4556, + "step": 1167 + }, + { + "epoch": 0.0654048605666928, + "grad_norm": 2.4681849479675293, + "learning_rate": 5.83e-06, + "loss": 0.4627, + "step": 1168 + }, + { + "epoch": 0.06546085787882182, + "grad_norm": 1.6035898923873901, + "learning_rate": 5.835e-06, + "loss": 0.4652, + "step": 1169 + }, + { + "epoch": 0.06551685519095084, + "grad_norm": 1.147926688194275, + "learning_rate": 5.84e-06, + "loss": 0.3389, + "step": 1170 + }, + { + "epoch": 0.06557285250307986, + "grad_norm": 1.3321911096572876, + "learning_rate": 5.845000000000001e-06, + "loss": 0.5156, + "step": 1171 + }, + { + "epoch": 0.06562884981520888, + "grad_norm": 1.3565380573272705, + "learning_rate": 5.850000000000001e-06, + "loss": 0.4251, + "step": 1172 + }, + { + "epoch": 0.06568484712733788, + "grad_norm": 1.0869241952896118, + "learning_rate": 5.855e-06, + "loss": 0.4787, + "step": 1173 + }, + { + "epoch": 0.0657408444394669, + "grad_norm": 1.4895256757736206, + "learning_rate": 5.86e-06, + "loss": 0.6179, + "step": 1174 + }, + { + "epoch": 0.06579684175159592, + "grad_norm": 1.591769814491272, + 
"learning_rate": 5.865000000000001e-06, + "loss": 0.5253, + "step": 1175 + }, + { + "epoch": 0.06585283906372494, + "grad_norm": 1.162840485572815, + "learning_rate": 5.8700000000000005e-06, + "loss": 0.4676, + "step": 1176 + }, + { + "epoch": 0.06590883637585396, + "grad_norm": 1.1410831212997437, + "learning_rate": 5.875e-06, + "loss": 0.3855, + "step": 1177 + }, + { + "epoch": 0.06596483368798298, + "grad_norm": 1.4543075561523438, + "learning_rate": 5.8800000000000005e-06, + "loss": 0.4941, + "step": 1178 + }, + { + "epoch": 0.066020831000112, + "grad_norm": 1.4013811349868774, + "learning_rate": 5.885e-06, + "loss": 0.476, + "step": 1179 + }, + { + "epoch": 0.06607682831224101, + "grad_norm": 2.4917755126953125, + "learning_rate": 5.89e-06, + "loss": 0.3922, + "step": 1180 + }, + { + "epoch": 0.06613282562437003, + "grad_norm": 1.086756944656372, + "learning_rate": 5.895e-06, + "loss": 0.4143, + "step": 1181 + }, + { + "epoch": 0.06618882293649905, + "grad_norm": 1.2095462083816528, + "learning_rate": 5.9e-06, + "loss": 0.4772, + "step": 1182 + }, + { + "epoch": 0.06624482024862807, + "grad_norm": 1.4608217477798462, + "learning_rate": 5.905e-06, + "loss": 0.4348, + "step": 1183 + }, + { + "epoch": 0.06630081756075708, + "grad_norm": 1.523728609085083, + "learning_rate": 5.91e-06, + "loss": 0.6067, + "step": 1184 + }, + { + "epoch": 0.0663568148728861, + "grad_norm": 1.1437195539474487, + "learning_rate": 5.915e-06, + "loss": 0.3993, + "step": 1185 + }, + { + "epoch": 0.06641281218501512, + "grad_norm": 1.146959662437439, + "learning_rate": 5.920000000000001e-06, + "loss": 0.474, + "step": 1186 + }, + { + "epoch": 0.06646880949714414, + "grad_norm": 1.1326645612716675, + "learning_rate": 5.925e-06, + "loss": 0.3891, + "step": 1187 + }, + { + "epoch": 0.06652480680927315, + "grad_norm": 1.0731227397918701, + "learning_rate": 5.93e-06, + "loss": 0.3448, + "step": 1188 + }, + { + "epoch": 0.06658080412140217, + "grad_norm": 1.418973684310913, + "learning_rate": 
5.935e-06, + "loss": 0.4704, + "step": 1189 + }, + { + "epoch": 0.06663680143353119, + "grad_norm": 1.1930632591247559, + "learning_rate": 5.940000000000001e-06, + "loss": 0.3829, + "step": 1190 + }, + { + "epoch": 0.06669279874566021, + "grad_norm": 1.1238776445388794, + "learning_rate": 5.945000000000001e-06, + "loss": 0.3387, + "step": 1191 + }, + { + "epoch": 0.06674879605778923, + "grad_norm": 1.3016777038574219, + "learning_rate": 5.95e-06, + "loss": 0.5368, + "step": 1192 + }, + { + "epoch": 0.06680479336991825, + "grad_norm": 1.2945044040679932, + "learning_rate": 5.955000000000001e-06, + "loss": 0.4609, + "step": 1193 + }, + { + "epoch": 0.06686079068204726, + "grad_norm": 1.4390751123428345, + "learning_rate": 5.9600000000000005e-06, + "loss": 0.476, + "step": 1194 + }, + { + "epoch": 0.06691678799417627, + "grad_norm": 1.038458228111267, + "learning_rate": 5.9650000000000005e-06, + "loss": 0.3525, + "step": 1195 + }, + { + "epoch": 0.0669727853063053, + "grad_norm": 1.1566381454467773, + "learning_rate": 5.9700000000000004e-06, + "loss": 0.4043, + "step": 1196 + }, + { + "epoch": 0.06702878261843431, + "grad_norm": 1.3829346895217896, + "learning_rate": 5.975e-06, + "loss": 0.3849, + "step": 1197 + }, + { + "epoch": 0.06708477993056333, + "grad_norm": 1.5022857189178467, + "learning_rate": 5.98e-06, + "loss": 0.5313, + "step": 1198 + }, + { + "epoch": 0.06714077724269235, + "grad_norm": 1.3618924617767334, + "learning_rate": 5.985e-06, + "loss": 0.4925, + "step": 1199 + }, + { + "epoch": 0.06719677455482137, + "grad_norm": 1.1289489269256592, + "learning_rate": 5.99e-06, + "loss": 0.4691, + "step": 1200 + }, + { + "epoch": 0.06725277186695039, + "grad_norm": 1.4186307191848755, + "learning_rate": 5.995e-06, + "loss": 0.5976, + "step": 1201 + }, + { + "epoch": 0.06730876917907941, + "grad_norm": 1.2453892230987549, + "learning_rate": 6e-06, + "loss": 0.7152, + "step": 1202 + }, + { + "epoch": 0.06736476649120843, + "grad_norm": 1.31748628616333, + 
"learning_rate": 6.005e-06, + "loss": 0.6554, + "step": 1203 + }, + { + "epoch": 0.06742076380333743, + "grad_norm": 1.271621584892273, + "learning_rate": 6.01e-06, + "loss": 0.5174, + "step": 1204 + }, + { + "epoch": 0.06747676111546645, + "grad_norm": 1.4273734092712402, + "learning_rate": 6.015000000000001e-06, + "loss": 0.5568, + "step": 1205 + }, + { + "epoch": 0.06753275842759547, + "grad_norm": 1.1785576343536377, + "learning_rate": 6.02e-06, + "loss": 0.4569, + "step": 1206 + }, + { + "epoch": 0.06758875573972449, + "grad_norm": 1.4180866479873657, + "learning_rate": 6.025e-06, + "loss": 0.5308, + "step": 1207 + }, + { + "epoch": 0.06764475305185351, + "grad_norm": 1.1484123468399048, + "learning_rate": 6.03e-06, + "loss": 0.3796, + "step": 1208 + }, + { + "epoch": 0.06770075036398253, + "grad_norm": 1.2170040607452393, + "learning_rate": 6.035000000000001e-06, + "loss": 0.4532, + "step": 1209 + }, + { + "epoch": 0.06775674767611155, + "grad_norm": 1.2905791997909546, + "learning_rate": 6.040000000000001e-06, + "loss": 0.4595, + "step": 1210 + }, + { + "epoch": 0.06781274498824057, + "grad_norm": 1.2938952445983887, + "learning_rate": 6.045e-06, + "loss": 0.4142, + "step": 1211 + }, + { + "epoch": 0.06786874230036959, + "grad_norm": 1.3278106451034546, + "learning_rate": 6.0500000000000005e-06, + "loss": 0.4678, + "step": 1212 + }, + { + "epoch": 0.0679247396124986, + "grad_norm": 1.311219334602356, + "learning_rate": 6.0550000000000005e-06, + "loss": 0.3523, + "step": 1213 + }, + { + "epoch": 0.06798073692462762, + "grad_norm": 1.160986304283142, + "learning_rate": 6.0600000000000004e-06, + "loss": 0.4355, + "step": 1214 + }, + { + "epoch": 0.06803673423675663, + "grad_norm": 1.3731441497802734, + "learning_rate": 6.065e-06, + "loss": 0.5311, + "step": 1215 + }, + { + "epoch": 0.06809273154888565, + "grad_norm": 1.191652536392212, + "learning_rate": 6.07e-06, + "loss": 0.4536, + "step": 1216 + }, + { + "epoch": 0.06814872886101467, + "grad_norm": 
1.0599191188812256, + "learning_rate": 6.075e-06, + "loss": 0.4172, + "step": 1217 + }, + { + "epoch": 0.06820472617314369, + "grad_norm": 1.1530780792236328, + "learning_rate": 6.08e-06, + "loss": 0.4636, + "step": 1218 + }, + { + "epoch": 0.0682607234852727, + "grad_norm": 1.4733500480651855, + "learning_rate": 6.085e-06, + "loss": 0.6163, + "step": 1219 + }, + { + "epoch": 0.06831672079740173, + "grad_norm": 1.335119605064392, + "learning_rate": 6.090000000000001e-06, + "loss": 0.5694, + "step": 1220 + }, + { + "epoch": 0.06837271810953074, + "grad_norm": 1.1457488536834717, + "learning_rate": 6.095e-06, + "loss": 0.4272, + "step": 1221 + }, + { + "epoch": 0.06842871542165976, + "grad_norm": 1.125653862953186, + "learning_rate": 6.1e-06, + "loss": 0.3819, + "step": 1222 + }, + { + "epoch": 0.06848471273378878, + "grad_norm": 1.4815268516540527, + "learning_rate": 6.105e-06, + "loss": 0.6059, + "step": 1223 + }, + { + "epoch": 0.0685407100459178, + "grad_norm": 1.072304606437683, + "learning_rate": 6.110000000000001e-06, + "loss": 0.5435, + "step": 1224 + }, + { + "epoch": 0.06859670735804681, + "grad_norm": 1.2699904441833496, + "learning_rate": 6.115000000000001e-06, + "loss": 0.4013, + "step": 1225 + }, + { + "epoch": 0.06865270467017583, + "grad_norm": 1.4592626094818115, + "learning_rate": 6.12e-06, + "loss": 0.4995, + "step": 1226 + }, + { + "epoch": 0.06870870198230485, + "grad_norm": 1.1897928714752197, + "learning_rate": 6.125e-06, + "loss": 0.3772, + "step": 1227 + }, + { + "epoch": 0.06876469929443386, + "grad_norm": 1.142614483833313, + "learning_rate": 6.130000000000001e-06, + "loss": 0.3635, + "step": 1228 + }, + { + "epoch": 0.06882069660656288, + "grad_norm": 1.2323251962661743, + "learning_rate": 6.1350000000000006e-06, + "loss": 0.4454, + "step": 1229 + }, + { + "epoch": 0.0688766939186919, + "grad_norm": 1.40288507938385, + "learning_rate": 6.1400000000000005e-06, + "loss": 0.5085, + "step": 1230 + }, + { + "epoch": 0.06893269123082092, + 
"grad_norm": 1.1993368864059448, + "learning_rate": 6.1450000000000005e-06, + "loss": 0.3992, + "step": 1231 + }, + { + "epoch": 0.06898868854294994, + "grad_norm": 1.6297578811645508, + "learning_rate": 6.15e-06, + "loss": 0.5697, + "step": 1232 + }, + { + "epoch": 0.06904468585507896, + "grad_norm": 1.4021323919296265, + "learning_rate": 6.155e-06, + "loss": 0.3727, + "step": 1233 + }, + { + "epoch": 0.06910068316720798, + "grad_norm": 1.0993765592575073, + "learning_rate": 6.16e-06, + "loss": 0.6237, + "step": 1234 + }, + { + "epoch": 0.06915668047933698, + "grad_norm": 1.3789044618606567, + "learning_rate": 6.165e-06, + "loss": 0.5462, + "step": 1235 + }, + { + "epoch": 0.069212677791466, + "grad_norm": 1.5529453754425049, + "learning_rate": 6.17e-06, + "loss": 0.5599, + "step": 1236 + }, + { + "epoch": 0.06926867510359502, + "grad_norm": 1.2780702114105225, + "learning_rate": 6.175e-06, + "loss": 0.6287, + "step": 1237 + }, + { + "epoch": 0.06932467241572404, + "grad_norm": 0.9809923768043518, + "learning_rate": 6.18e-06, + "loss": 0.3876, + "step": 1238 + }, + { + "epoch": 0.06938066972785306, + "grad_norm": 1.0424203872680664, + "learning_rate": 6.185000000000001e-06, + "loss": 0.3569, + "step": 1239 + }, + { + "epoch": 0.06943666703998208, + "grad_norm": 1.1957716941833496, + "learning_rate": 6.19e-06, + "loss": 0.5428, + "step": 1240 + }, + { + "epoch": 0.0694926643521111, + "grad_norm": 1.3562878370285034, + "learning_rate": 6.195e-06, + "loss": 0.4914, + "step": 1241 + }, + { + "epoch": 0.06954866166424012, + "grad_norm": 1.4439308643341064, + "learning_rate": 6.2e-06, + "loss": 0.5092, + "step": 1242 + }, + { + "epoch": 0.06960465897636914, + "grad_norm": 1.3014452457427979, + "learning_rate": 6.205000000000001e-06, + "loss": 0.4253, + "step": 1243 + }, + { + "epoch": 0.06966065628849816, + "grad_norm": 1.09708571434021, + "learning_rate": 6.210000000000001e-06, + "loss": 0.4735, + "step": 1244 + }, + { + "epoch": 0.06971665360062718, + "grad_norm": 
1.2579667568206787, + "learning_rate": 6.215e-06, + "loss": 0.3417, + "step": 1245 + }, + { + "epoch": 0.06977265091275618, + "grad_norm": 1.1711158752441406, + "learning_rate": 6.22e-06, + "loss": 0.5578, + "step": 1246 + }, + { + "epoch": 0.0698286482248852, + "grad_norm": 1.3848479986190796, + "learning_rate": 6.2250000000000005e-06, + "loss": 0.3832, + "step": 1247 + }, + { + "epoch": 0.06988464553701422, + "grad_norm": 1.2990936040878296, + "learning_rate": 6.2300000000000005e-06, + "loss": 0.5252, + "step": 1248 + }, + { + "epoch": 0.06994064284914324, + "grad_norm": 1.3192658424377441, + "learning_rate": 6.2350000000000004e-06, + "loss": 0.6234, + "step": 1249 + }, + { + "epoch": 0.06999664016127226, + "grad_norm": 1.1719008684158325, + "learning_rate": 6.24e-06, + "loss": 0.4339, + "step": 1250 + }, + { + "epoch": 0.07005263747340128, + "grad_norm": 1.21258544921875, + "learning_rate": 6.245e-06, + "loss": 0.4835, + "step": 1251 + }, + { + "epoch": 0.0701086347855303, + "grad_norm": 1.2766406536102295, + "learning_rate": 6.25e-06, + "loss": 0.5146, + "step": 1252 + }, + { + "epoch": 0.07016463209765932, + "grad_norm": 1.305820107460022, + "learning_rate": 6.254999999999999e-06, + "loss": 0.3805, + "step": 1253 + }, + { + "epoch": 0.07022062940978833, + "grad_norm": 1.2711191177368164, + "learning_rate": 6.26e-06, + "loss": 0.4137, + "step": 1254 + }, + { + "epoch": 0.07027662672191735, + "grad_norm": 1.3470959663391113, + "learning_rate": 6.265e-06, + "loss": 0.4417, + "step": 1255 + }, + { + "epoch": 0.07033262403404636, + "grad_norm": 0.9316703677177429, + "learning_rate": 6.270000000000001e-06, + "loss": 0.3569, + "step": 1256 + }, + { + "epoch": 0.07038862134617538, + "grad_norm": 1.3561193943023682, + "learning_rate": 6.275e-06, + "loss": 0.5043, + "step": 1257 + }, + { + "epoch": 0.0704446186583044, + "grad_norm": 1.0972591638565063, + "learning_rate": 6.28e-06, + "loss": 0.4823, + "step": 1258 + }, + { + "epoch": 0.07050061597043342, + "grad_norm": 
1.1062606573104858, + "learning_rate": 6.285000000000001e-06, + "loss": 0.3591, + "step": 1259 + }, + { + "epoch": 0.07055661328256244, + "grad_norm": 1.186181902885437, + "learning_rate": 6.29e-06, + "loss": 0.6183, + "step": 1260 + }, + { + "epoch": 0.07061261059469146, + "grad_norm": 1.2585527896881104, + "learning_rate": 6.295000000000001e-06, + "loss": 0.5375, + "step": 1261 + }, + { + "epoch": 0.07066860790682047, + "grad_norm": 1.2941310405731201, + "learning_rate": 6.300000000000001e-06, + "loss": 0.4806, + "step": 1262 + }, + { + "epoch": 0.0707246052189495, + "grad_norm": 1.3581117391586304, + "learning_rate": 6.305e-06, + "loss": 0.5579, + "step": 1263 + }, + { + "epoch": 0.07078060253107851, + "grad_norm": 1.5740511417388916, + "learning_rate": 6.3100000000000006e-06, + "loss": 0.5084, + "step": 1264 + }, + { + "epoch": 0.07083659984320753, + "grad_norm": 1.3985000848770142, + "learning_rate": 6.315e-06, + "loss": 0.4166, + "step": 1265 + }, + { + "epoch": 0.07089259715533654, + "grad_norm": 1.2136940956115723, + "learning_rate": 6.320000000000001e-06, + "loss": 0.4584, + "step": 1266 + }, + { + "epoch": 0.07094859446746556, + "grad_norm": 1.1227129697799683, + "learning_rate": 6.3250000000000004e-06, + "loss": 0.388, + "step": 1267 + }, + { + "epoch": 0.07100459177959458, + "grad_norm": 1.2944831848144531, + "learning_rate": 6.3299999999999995e-06, + "loss": 0.5666, + "step": 1268 + }, + { + "epoch": 0.0710605890917236, + "grad_norm": 1.0914361476898193, + "learning_rate": 6.335e-06, + "loss": 0.3782, + "step": 1269 + }, + { + "epoch": 0.07111658640385261, + "grad_norm": 1.227116346359253, + "learning_rate": 6.34e-06, + "loss": 0.5257, + "step": 1270 + }, + { + "epoch": 0.07117258371598163, + "grad_norm": 1.1540474891662598, + "learning_rate": 6.345000000000001e-06, + "loss": 0.3348, + "step": 1271 + }, + { + "epoch": 0.07122858102811065, + "grad_norm": 1.340309977531433, + "learning_rate": 6.35e-06, + "loss": 0.465, + "step": 1272 + }, + { + "epoch": 
0.07128457834023967, + "grad_norm": 1.0309977531433105, + "learning_rate": 6.355e-06, + "loss": 0.4303, + "step": 1273 + }, + { + "epoch": 0.07134057565236869, + "grad_norm": 1.1087217330932617, + "learning_rate": 6.360000000000001e-06, + "loss": 0.3617, + "step": 1274 + }, + { + "epoch": 0.07139657296449771, + "grad_norm": 1.0575168132781982, + "learning_rate": 6.365e-06, + "loss": 0.4675, + "step": 1275 + }, + { + "epoch": 0.07145257027662673, + "grad_norm": 3.266204833984375, + "learning_rate": 6.370000000000001e-06, + "loss": 0.5266, + "step": 1276 + }, + { + "epoch": 0.07150856758875573, + "grad_norm": 1.163043737411499, + "learning_rate": 6.375000000000001e-06, + "loss": 0.3054, + "step": 1277 + }, + { + "epoch": 0.07156456490088475, + "grad_norm": 1.5021004676818848, + "learning_rate": 6.38e-06, + "loss": 0.6384, + "step": 1278 + }, + { + "epoch": 0.07162056221301377, + "grad_norm": 1.5890547037124634, + "learning_rate": 6.385000000000001e-06, + "loss": 0.6123, + "step": 1279 + }, + { + "epoch": 0.07167655952514279, + "grad_norm": 1.2791961431503296, + "learning_rate": 6.39e-06, + "loss": 0.6111, + "step": 1280 + }, + { + "epoch": 0.07173255683727181, + "grad_norm": 1.395918846130371, + "learning_rate": 6.395000000000001e-06, + "loss": 0.3987, + "step": 1281 + }, + { + "epoch": 0.07178855414940083, + "grad_norm": 1.292610764503479, + "learning_rate": 6.4000000000000006e-06, + "loss": 0.4165, + "step": 1282 + }, + { + "epoch": 0.07184455146152985, + "grad_norm": 1.6642959117889404, + "learning_rate": 6.405e-06, + "loss": 0.4285, + "step": 1283 + }, + { + "epoch": 0.07190054877365887, + "grad_norm": 1.4356571435928345, + "learning_rate": 6.4100000000000005e-06, + "loss": 0.4188, + "step": 1284 + }, + { + "epoch": 0.07195654608578789, + "grad_norm": 1.263612985610962, + "learning_rate": 6.415e-06, + "loss": 0.4058, + "step": 1285 + }, + { + "epoch": 0.0720125433979169, + "grad_norm": 1.3215962648391724, + "learning_rate": 6.4199999999999995e-06, + "loss": 
0.5013, + "step": 1286 + }, + { + "epoch": 0.07206854071004591, + "grad_norm": 1.2451215982437134, + "learning_rate": 6.425e-06, + "loss": 0.5058, + "step": 1287 + }, + { + "epoch": 0.07212453802217493, + "grad_norm": 1.1814186573028564, + "learning_rate": 6.43e-06, + "loss": 0.3944, + "step": 1288 + }, + { + "epoch": 0.07218053533430395, + "grad_norm": 1.1279220581054688, + "learning_rate": 6.435000000000001e-06, + "loss": 0.4033, + "step": 1289 + }, + { + "epoch": 0.07223653264643297, + "grad_norm": 1.166107416152954, + "learning_rate": 6.44e-06, + "loss": 0.4462, + "step": 1290 + }, + { + "epoch": 0.07229252995856199, + "grad_norm": 1.1797864437103271, + "learning_rate": 6.444999999999999e-06, + "loss": 0.3905, + "step": 1291 + }, + { + "epoch": 0.07234852727069101, + "grad_norm": 1.0709209442138672, + "learning_rate": 6.45e-06, + "loss": 0.3772, + "step": 1292 + }, + { + "epoch": 0.07240452458282003, + "grad_norm": 1.115615963935852, + "learning_rate": 6.455e-06, + "loss": 0.3869, + "step": 1293 + }, + { + "epoch": 0.07246052189494905, + "grad_norm": 1.7949514389038086, + "learning_rate": 6.460000000000001e-06, + "loss": 0.5933, + "step": 1294 + }, + { + "epoch": 0.07251651920707806, + "grad_norm": 1.6825324296951294, + "learning_rate": 6.465e-06, + "loss": 0.5273, + "step": 1295 + }, + { + "epoch": 0.07257251651920708, + "grad_norm": 1.1637367010116577, + "learning_rate": 6.47e-06, + "loss": 0.4111, + "step": 1296 + }, + { + "epoch": 0.07262851383133609, + "grad_norm": 1.2039698362350464, + "learning_rate": 6.475000000000001e-06, + "loss": 0.5101, + "step": 1297 + }, + { + "epoch": 0.07268451114346511, + "grad_norm": 1.3971374034881592, + "learning_rate": 6.48e-06, + "loss": 0.4956, + "step": 1298 + }, + { + "epoch": 0.07274050845559413, + "grad_norm": 1.3009939193725586, + "learning_rate": 6.485000000000001e-06, + "loss": 0.371, + "step": 1299 + }, + { + "epoch": 0.07279650576772315, + "grad_norm": 1.4169080257415771, + "learning_rate": 
6.4900000000000005e-06, + "loss": 0.7171, + "step": 1300 + }, + { + "epoch": 0.07285250307985217, + "grad_norm": 2.2091469764709473, + "learning_rate": 6.495e-06, + "loss": 0.565, + "step": 1301 + }, + { + "epoch": 0.07290850039198118, + "grad_norm": 1.427626371383667, + "learning_rate": 6.5000000000000004e-06, + "loss": 0.39, + "step": 1302 + }, + { + "epoch": 0.0729644977041102, + "grad_norm": 1.2434399127960205, + "learning_rate": 6.505e-06, + "loss": 0.4394, + "step": 1303 + }, + { + "epoch": 0.07302049501623922, + "grad_norm": 1.6202256679534912, + "learning_rate": 6.510000000000001e-06, + "loss": 0.5287, + "step": 1304 + }, + { + "epoch": 0.07307649232836824, + "grad_norm": 2.5532824993133545, + "learning_rate": 6.515e-06, + "loss": 0.577, + "step": 1305 + }, + { + "epoch": 0.07313248964049726, + "grad_norm": 1.2064380645751953, + "learning_rate": 6.519999999999999e-06, + "loss": 0.6086, + "step": 1306 + }, + { + "epoch": 0.07318848695262628, + "grad_norm": 1.3083627223968506, + "learning_rate": 6.525e-06, + "loss": 0.4166, + "step": 1307 + }, + { + "epoch": 0.07324448426475529, + "grad_norm": 1.352712631225586, + "learning_rate": 6.53e-06, + "loss": 0.4438, + "step": 1308 + }, + { + "epoch": 0.0733004815768843, + "grad_norm": 1.2328424453735352, + "learning_rate": 6.535000000000001e-06, + "loss": 0.4751, + "step": 1309 + }, + { + "epoch": 0.07335647888901332, + "grad_norm": 1.2588063478469849, + "learning_rate": 6.54e-06, + "loss": 0.5114, + "step": 1310 + }, + { + "epoch": 0.07341247620114234, + "grad_norm": 1.082762360572815, + "learning_rate": 6.545e-06, + "loss": 0.4373, + "step": 1311 + }, + { + "epoch": 0.07346847351327136, + "grad_norm": 1.2910832166671753, + "learning_rate": 6.550000000000001e-06, + "loss": 0.4089, + "step": 1312 + }, + { + "epoch": 0.07352447082540038, + "grad_norm": 1.1955432891845703, + "learning_rate": 6.555e-06, + "loss": 0.41, + "step": 1313 + }, + { + "epoch": 0.0735804681375294, + "grad_norm": 1.530601143836975, + 
"learning_rate": 6.560000000000001e-06, + "loss": 0.7009, + "step": 1314 + }, + { + "epoch": 0.07363646544965842, + "grad_norm": 1.209916353225708, + "learning_rate": 6.565000000000001e-06, + "loss": 0.6077, + "step": 1315 + }, + { + "epoch": 0.07369246276178744, + "grad_norm": 1.1398082971572876, + "learning_rate": 6.57e-06, + "loss": 0.4088, + "step": 1316 + }, + { + "epoch": 0.07374846007391646, + "grad_norm": 1.4278721809387207, + "learning_rate": 6.5750000000000006e-06, + "loss": 0.5536, + "step": 1317 + }, + { + "epoch": 0.07380445738604546, + "grad_norm": 1.253266453742981, + "learning_rate": 6.58e-06, + "loss": 0.4177, + "step": 1318 + }, + { + "epoch": 0.07386045469817448, + "grad_norm": 1.234006404876709, + "learning_rate": 6.5850000000000005e-06, + "loss": 0.3958, + "step": 1319 + }, + { + "epoch": 0.0739164520103035, + "grad_norm": 1.3969980478286743, + "learning_rate": 6.5900000000000004e-06, + "loss": 0.5297, + "step": 1320 + }, + { + "epoch": 0.07397244932243252, + "grad_norm": 1.5530579090118408, + "learning_rate": 6.5949999999999995e-06, + "loss": 0.5444, + "step": 1321 + }, + { + "epoch": 0.07402844663456154, + "grad_norm": 1.1098066568374634, + "learning_rate": 6.6e-06, + "loss": 0.4511, + "step": 1322 + }, + { + "epoch": 0.07408444394669056, + "grad_norm": 1.1944104433059692, + "learning_rate": 6.605e-06, + "loss": 0.3559, + "step": 1323 + }, + { + "epoch": 0.07414044125881958, + "grad_norm": 1.2156531810760498, + "learning_rate": 6.610000000000001e-06, + "loss": 0.4142, + "step": 1324 + }, + { + "epoch": 0.0741964385709486, + "grad_norm": 1.2404552698135376, + "learning_rate": 6.615e-06, + "loss": 0.4893, + "step": 1325 + }, + { + "epoch": 0.07425243588307762, + "grad_norm": 1.3357646465301514, + "learning_rate": 6.62e-06, + "loss": 0.4867, + "step": 1326 + }, + { + "epoch": 0.07430843319520664, + "grad_norm": 1.24103844165802, + "learning_rate": 6.625000000000001e-06, + "loss": 0.563, + "step": 1327 + }, + { + "epoch": 0.07436443050733564, + 
"grad_norm": 0.9767637848854065, + "learning_rate": 6.63e-06, + "loss": 0.4116, + "step": 1328 + }, + { + "epoch": 0.07442042781946466, + "grad_norm": 1.257356882095337, + "learning_rate": 6.635000000000001e-06, + "loss": 0.487, + "step": 1329 + }, + { + "epoch": 0.07447642513159368, + "grad_norm": 1.5664671659469604, + "learning_rate": 6.640000000000001e-06, + "loss": 0.4498, + "step": 1330 + }, + { + "epoch": 0.0745324224437227, + "grad_norm": 1.2948386669158936, + "learning_rate": 6.645e-06, + "loss": 0.4589, + "step": 1331 + }, + { + "epoch": 0.07458841975585172, + "grad_norm": 1.3933333158493042, + "learning_rate": 6.650000000000001e-06, + "loss": 0.5161, + "step": 1332 + }, + { + "epoch": 0.07464441706798074, + "grad_norm": 1.3152962923049927, + "learning_rate": 6.655e-06, + "loss": 0.539, + "step": 1333 + }, + { + "epoch": 0.07470041438010976, + "grad_norm": 1.059032678604126, + "learning_rate": 6.660000000000001e-06, + "loss": 0.5141, + "step": 1334 + }, + { + "epoch": 0.07475641169223878, + "grad_norm": 1.3351902961730957, + "learning_rate": 6.6650000000000006e-06, + "loss": 0.4759, + "step": 1335 + }, + { + "epoch": 0.0748124090043678, + "grad_norm": 1.3436874151229858, + "learning_rate": 6.67e-06, + "loss": 0.5057, + "step": 1336 + }, + { + "epoch": 0.07486840631649681, + "grad_norm": 1.3562463521957397, + "learning_rate": 6.6750000000000005e-06, + "loss": 0.419, + "step": 1337 + }, + { + "epoch": 0.07492440362862583, + "grad_norm": 1.434891939163208, + "learning_rate": 6.68e-06, + "loss": 0.4996, + "step": 1338 + }, + { + "epoch": 0.07498040094075484, + "grad_norm": 1.249218463897705, + "learning_rate": 6.685000000000001e-06, + "loss": 0.3987, + "step": 1339 + }, + { + "epoch": 0.07503639825288386, + "grad_norm": 1.236860990524292, + "learning_rate": 6.69e-06, + "loss": 0.5033, + "step": 1340 + }, + { + "epoch": 0.07509239556501288, + "grad_norm": 1.1586852073669434, + "learning_rate": 6.695e-06, + "loss": 0.4026, + "step": 1341 + }, + { + "epoch": 
0.0751483928771419, + "grad_norm": 1.1438720226287842, + "learning_rate": 6.700000000000001e-06, + "loss": 0.411, + "step": 1342 + }, + { + "epoch": 0.07520439018927091, + "grad_norm": 1.0562424659729004, + "learning_rate": 6.705e-06, + "loss": 0.346, + "step": 1343 + }, + { + "epoch": 0.07526038750139993, + "grad_norm": 1.7857998609542847, + "learning_rate": 6.710000000000001e-06, + "loss": 0.3514, + "step": 1344 + }, + { + "epoch": 0.07531638481352895, + "grad_norm": 1.415966510772705, + "learning_rate": 6.715e-06, + "loss": 0.4802, + "step": 1345 + }, + { + "epoch": 0.07537238212565797, + "grad_norm": 1.238795280456543, + "learning_rate": 6.72e-06, + "loss": 0.4715, + "step": 1346 + }, + { + "epoch": 0.07542837943778699, + "grad_norm": 1.2985560894012451, + "learning_rate": 6.725000000000001e-06, + "loss": 0.4027, + "step": 1347 + }, + { + "epoch": 0.07548437674991601, + "grad_norm": 1.053097128868103, + "learning_rate": 6.73e-06, + "loss": 0.3804, + "step": 1348 + }, + { + "epoch": 0.07554037406204502, + "grad_norm": 1.1594445705413818, + "learning_rate": 6.735e-06, + "loss": 0.4952, + "step": 1349 + }, + { + "epoch": 0.07559637137417403, + "grad_norm": 1.3159594535827637, + "learning_rate": 6.740000000000001e-06, + "loss": 0.3801, + "step": 1350 + }, + { + "epoch": 0.07565236868630305, + "grad_norm": 1.2488571405410767, + "learning_rate": 6.745e-06, + "loss": 0.5075, + "step": 1351 + }, + { + "epoch": 0.07570836599843207, + "grad_norm": 1.1797436475753784, + "learning_rate": 6.750000000000001e-06, + "loss": 0.4398, + "step": 1352 + }, + { + "epoch": 0.07576436331056109, + "grad_norm": 1.1828269958496094, + "learning_rate": 6.7550000000000005e-06, + "loss": 0.5035, + "step": 1353 + }, + { + "epoch": 0.07582036062269011, + "grad_norm": 1.5088539123535156, + "learning_rate": 6.76e-06, + "loss": 0.4993, + "step": 1354 + }, + { + "epoch": 0.07587635793481913, + "grad_norm": 1.2271095514297485, + "learning_rate": 6.7650000000000005e-06, + "loss": 0.4532, + "step": 
1355 + }, + { + "epoch": 0.07593235524694815, + "grad_norm": 1.0573952198028564, + "learning_rate": 6.7699999999999996e-06, + "loss": 0.3829, + "step": 1356 + }, + { + "epoch": 0.07598835255907717, + "grad_norm": 1.1782804727554321, + "learning_rate": 6.775000000000001e-06, + "loss": 0.4338, + "step": 1357 + }, + { + "epoch": 0.07604434987120619, + "grad_norm": 1.1794538497924805, + "learning_rate": 6.78e-06, + "loss": 0.4669, + "step": 1358 + }, + { + "epoch": 0.07610034718333519, + "grad_norm": 1.2195823192596436, + "learning_rate": 6.784999999999999e-06, + "loss": 0.614, + "step": 1359 + }, + { + "epoch": 0.07615634449546421, + "grad_norm": 1.3329391479492188, + "learning_rate": 6.79e-06, + "loss": 0.4395, + "step": 1360 + }, + { + "epoch": 0.07621234180759323, + "grad_norm": 1.1854500770568848, + "learning_rate": 6.795e-06, + "loss": 0.5083, + "step": 1361 + }, + { + "epoch": 0.07626833911972225, + "grad_norm": 1.340144395828247, + "learning_rate": 6.800000000000001e-06, + "loss": 0.4426, + "step": 1362 + }, + { + "epoch": 0.07632433643185127, + "grad_norm": 1.565130352973938, + "learning_rate": 6.805e-06, + "loss": 0.4845, + "step": 1363 + }, + { + "epoch": 0.07638033374398029, + "grad_norm": 2.765289306640625, + "learning_rate": 6.81e-06, + "loss": 0.5165, + "step": 1364 + }, + { + "epoch": 0.07643633105610931, + "grad_norm": 1.2596156597137451, + "learning_rate": 6.815000000000001e-06, + "loss": 0.619, + "step": 1365 + }, + { + "epoch": 0.07649232836823833, + "grad_norm": 1.4560434818267822, + "learning_rate": 6.82e-06, + "loss": 0.4165, + "step": 1366 + }, + { + "epoch": 0.07654832568036735, + "grad_norm": 1.0805444717407227, + "learning_rate": 6.825000000000001e-06, + "loss": 0.4847, + "step": 1367 + }, + { + "epoch": 0.07660432299249637, + "grad_norm": 1.2908236980438232, + "learning_rate": 6.830000000000001e-06, + "loss": 0.4831, + "step": 1368 + }, + { + "epoch": 0.07666032030462538, + "grad_norm": 1.1790556907653809, + "learning_rate": 6.835e-06, + 
"loss": 0.4685, + "step": 1369 + }, + { + "epoch": 0.07671631761675439, + "grad_norm": 1.2401238679885864, + "learning_rate": 6.840000000000001e-06, + "loss": 0.4726, + "step": 1370 + }, + { + "epoch": 0.07677231492888341, + "grad_norm": 1.1133661270141602, + "learning_rate": 6.845e-06, + "loss": 0.3749, + "step": 1371 + }, + { + "epoch": 0.07682831224101243, + "grad_norm": 1.0683152675628662, + "learning_rate": 6.8500000000000005e-06, + "loss": 0.3595, + "step": 1372 + }, + { + "epoch": 0.07688430955314145, + "grad_norm": 1.19899582862854, + "learning_rate": 6.8550000000000004e-06, + "loss": 0.4688, + "step": 1373 + }, + { + "epoch": 0.07694030686527047, + "grad_norm": 1.1337486505508423, + "learning_rate": 6.8599999999999995e-06, + "loss": 0.454, + "step": 1374 + }, + { + "epoch": 0.07699630417739949, + "grad_norm": 1.4259730577468872, + "learning_rate": 6.865e-06, + "loss": 0.5977, + "step": 1375 + }, + { + "epoch": 0.0770523014895285, + "grad_norm": 0.9646322727203369, + "learning_rate": 6.87e-06, + "loss": 0.4321, + "step": 1376 + }, + { + "epoch": 0.07710829880165752, + "grad_norm": 1.3179805278778076, + "learning_rate": 6.875000000000001e-06, + "loss": 0.4031, + "step": 1377 + }, + { + "epoch": 0.07716429611378654, + "grad_norm": 1.463568925857544, + "learning_rate": 6.88e-06, + "loss": 0.5436, + "step": 1378 + }, + { + "epoch": 0.07722029342591556, + "grad_norm": 1.2384649515151978, + "learning_rate": 6.885e-06, + "loss": 0.4111, + "step": 1379 + }, + { + "epoch": 0.07727629073804457, + "grad_norm": 1.4996534585952759, + "learning_rate": 6.890000000000001e-06, + "loss": 0.4076, + "step": 1380 + }, + { + "epoch": 0.07733228805017359, + "grad_norm": 1.232329249382019, + "learning_rate": 6.895e-06, + "loss": 0.4954, + "step": 1381 + }, + { + "epoch": 0.0773882853623026, + "grad_norm": 1.4218552112579346, + "learning_rate": 6.900000000000001e-06, + "loss": 0.5603, + "step": 1382 + }, + { + "epoch": 0.07744428267443162, + "grad_norm": 1.2759547233581543, + 
"learning_rate": 6.905e-06, + "loss": 0.5696, + "step": 1383 + }, + { + "epoch": 0.07750027998656064, + "grad_norm": 1.9820356369018555, + "learning_rate": 6.91e-06, + "loss": 0.3646, + "step": 1384 + }, + { + "epoch": 0.07755627729868966, + "grad_norm": 1.2480307817459106, + "learning_rate": 6.915000000000001e-06, + "loss": 0.4816, + "step": 1385 + }, + { + "epoch": 0.07761227461081868, + "grad_norm": 1.3854848146438599, + "learning_rate": 6.92e-06, + "loss": 0.4986, + "step": 1386 + }, + { + "epoch": 0.0776682719229477, + "grad_norm": 1.5433881282806396, + "learning_rate": 6.925000000000001e-06, + "loss": 0.4568, + "step": 1387 + }, + { + "epoch": 0.07772426923507672, + "grad_norm": 1.2369163036346436, + "learning_rate": 6.9300000000000006e-06, + "loss": 0.5573, + "step": 1388 + }, + { + "epoch": 0.07778026654720574, + "grad_norm": 1.256779432296753, + "learning_rate": 6.935e-06, + "loss": 0.436, + "step": 1389 + }, + { + "epoch": 0.07783626385933475, + "grad_norm": 1.3208945989608765, + "learning_rate": 6.9400000000000005e-06, + "loss": 0.5535, + "step": 1390 + }, + { + "epoch": 0.07789226117146376, + "grad_norm": 1.238890290260315, + "learning_rate": 6.945e-06, + "loss": 0.4379, + "step": 1391 + }, + { + "epoch": 0.07794825848359278, + "grad_norm": 1.3645505905151367, + "learning_rate": 6.950000000000001e-06, + "loss": 0.6061, + "step": 1392 + }, + { + "epoch": 0.0780042557957218, + "grad_norm": 1.1675653457641602, + "learning_rate": 6.955e-06, + "loss": 0.3521, + "step": 1393 + }, + { + "epoch": 0.07806025310785082, + "grad_norm": 1.2153840065002441, + "learning_rate": 6.9599999999999994e-06, + "loss": 0.4654, + "step": 1394 + }, + { + "epoch": 0.07811625041997984, + "grad_norm": 1.3639699220657349, + "learning_rate": 6.965000000000001e-06, + "loss": 0.497, + "step": 1395 + }, + { + "epoch": 0.07817224773210886, + "grad_norm": 1.2933604717254639, + "learning_rate": 6.97e-06, + "loss": 0.6267, + "step": 1396 + }, + { + "epoch": 0.07822824504423788, + 
"grad_norm": 1.2269926071166992, + "learning_rate": 6.975000000000001e-06, + "loss": 0.4726, + "step": 1397 + }, + { + "epoch": 0.0782842423563669, + "grad_norm": 1.5721561908721924, + "learning_rate": 6.98e-06, + "loss": 0.4109, + "step": 1398 + }, + { + "epoch": 0.07834023966849592, + "grad_norm": 1.0359388589859009, + "learning_rate": 6.985e-06, + "loss": 0.4389, + "step": 1399 + }, + { + "epoch": 0.07839623698062494, + "grad_norm": 4.911037445068359, + "learning_rate": 6.990000000000001e-06, + "loss": 0.45, + "step": 1400 + }, + { + "epoch": 0.07845223429275394, + "grad_norm": 1.179355263710022, + "learning_rate": 6.995e-06, + "loss": 0.4637, + "step": 1401 + }, + { + "epoch": 0.07850823160488296, + "grad_norm": 1.384724736213684, + "learning_rate": 7.000000000000001e-06, + "loss": 0.5596, + "step": 1402 + }, + { + "epoch": 0.07856422891701198, + "grad_norm": 1.3643035888671875, + "learning_rate": 7.005000000000001e-06, + "loss": 0.5343, + "step": 1403 + }, + { + "epoch": 0.078620226229141, + "grad_norm": 1.1445242166519165, + "learning_rate": 7.01e-06, + "loss": 0.5881, + "step": 1404 + }, + { + "epoch": 0.07867622354127002, + "grad_norm": 1.1189980506896973, + "learning_rate": 7.015000000000001e-06, + "loss": 0.379, + "step": 1405 + }, + { + "epoch": 0.07873222085339904, + "grad_norm": 1.3632959127426147, + "learning_rate": 7.0200000000000006e-06, + "loss": 0.4533, + "step": 1406 + }, + { + "epoch": 0.07878821816552806, + "grad_norm": 1.0132023096084595, + "learning_rate": 7.025000000000001e-06, + "loss": 0.4028, + "step": 1407 + }, + { + "epoch": 0.07884421547765708, + "grad_norm": 1.574580192565918, + "learning_rate": 7.0300000000000005e-06, + "loss": 0.7049, + "step": 1408 + }, + { + "epoch": 0.0789002127897861, + "grad_norm": 1.207859754562378, + "learning_rate": 7.0349999999999996e-06, + "loss": 0.3993, + "step": 1409 + }, + { + "epoch": 0.07895621010191511, + "grad_norm": 1.1706397533416748, + "learning_rate": 7.04e-06, + "loss": 0.3902, + "step": 1410 
+ }, + { + "epoch": 0.07901220741404412, + "grad_norm": 1.36715829372406, + "learning_rate": 7.045e-06, + "loss": 0.5922, + "step": 1411 + }, + { + "epoch": 0.07906820472617314, + "grad_norm": 3.39699649810791, + "learning_rate": 7.049999999999999e-06, + "loss": 0.4428, + "step": 1412 + }, + { + "epoch": 0.07912420203830216, + "grad_norm": 1.1224071979522705, + "learning_rate": 7.055e-06, + "loss": 0.4317, + "step": 1413 + }, + { + "epoch": 0.07918019935043118, + "grad_norm": 1.2464991807937622, + "learning_rate": 7.06e-06, + "loss": 0.4403, + "step": 1414 + }, + { + "epoch": 0.0792361966625602, + "grad_norm": 1.354768991470337, + "learning_rate": 7.065000000000001e-06, + "loss": 0.4857, + "step": 1415 + }, + { + "epoch": 0.07929219397468922, + "grad_norm": 1.5229969024658203, + "learning_rate": 7.07e-06, + "loss": 0.5088, + "step": 1416 + }, + { + "epoch": 0.07934819128681823, + "grad_norm": 1.3952661752700806, + "learning_rate": 7.075e-06, + "loss": 0.6225, + "step": 1417 + }, + { + "epoch": 0.07940418859894725, + "grad_norm": 1.2597805261611938, + "learning_rate": 7.080000000000001e-06, + "loss": 0.3716, + "step": 1418 + }, + { + "epoch": 0.07946018591107627, + "grad_norm": 1.479173183441162, + "learning_rate": 7.085e-06, + "loss": 0.6558, + "step": 1419 + }, + { + "epoch": 0.07951618322320529, + "grad_norm": 1.175714135169983, + "learning_rate": 7.090000000000001e-06, + "loss": 0.6377, + "step": 1420 + }, + { + "epoch": 0.0795721805353343, + "grad_norm": 1.1564451456069946, + "learning_rate": 7.095000000000001e-06, + "loss": 0.5012, + "step": 1421 + }, + { + "epoch": 0.07962817784746332, + "grad_norm": 1.0812937021255493, + "learning_rate": 7.1e-06, + "loss": 0.4637, + "step": 1422 + }, + { + "epoch": 0.07968417515959234, + "grad_norm": 1.143661379814148, + "learning_rate": 7.105000000000001e-06, + "loss": 0.3334, + "step": 1423 + }, + { + "epoch": 0.07974017247172135, + "grad_norm": 1.3311735391616821, + "learning_rate": 7.11e-06, + "loss": 0.4989, + "step": 
1424 + }, + { + "epoch": 0.07979616978385037, + "grad_norm": 1.2069625854492188, + "learning_rate": 7.1150000000000005e-06, + "loss": 0.3392, + "step": 1425 + }, + { + "epoch": 0.07985216709597939, + "grad_norm": 1.180067539215088, + "learning_rate": 7.1200000000000004e-06, + "loss": 0.3681, + "step": 1426 + }, + { + "epoch": 0.07990816440810841, + "grad_norm": 1.2177414894104004, + "learning_rate": 7.1249999999999995e-06, + "loss": 0.4858, + "step": 1427 + }, + { + "epoch": 0.07996416172023743, + "grad_norm": 1.3053133487701416, + "learning_rate": 7.13e-06, + "loss": 0.4826, + "step": 1428 + }, + { + "epoch": 0.08002015903236645, + "grad_norm": 1.080035924911499, + "learning_rate": 7.135e-06, + "loss": 0.3143, + "step": 1429 + }, + { + "epoch": 0.08007615634449547, + "grad_norm": 1.509613275527954, + "learning_rate": 7.140000000000001e-06, + "loss": 0.4389, + "step": 1430 + }, + { + "epoch": 0.08013215365662449, + "grad_norm": 1.1446751356124878, + "learning_rate": 7.145e-06, + "loss": 0.4379, + "step": 1431 + }, + { + "epoch": 0.0801881509687535, + "grad_norm": 1.4199235439300537, + "learning_rate": 7.15e-06, + "loss": 0.4777, + "step": 1432 + }, + { + "epoch": 0.08024414828088251, + "grad_norm": 1.5133382081985474, + "learning_rate": 7.155000000000001e-06, + "loss": 0.6484, + "step": 1433 + }, + { + "epoch": 0.08030014559301153, + "grad_norm": 1.3345826864242554, + "learning_rate": 7.16e-06, + "loss": 0.4601, + "step": 1434 + }, + { + "epoch": 0.08035614290514055, + "grad_norm": 1.524214267730713, + "learning_rate": 7.165000000000001e-06, + "loss": 0.4004, + "step": 1435 + }, + { + "epoch": 0.08041214021726957, + "grad_norm": 1.2183082103729248, + "learning_rate": 7.17e-06, + "loss": 0.4514, + "step": 1436 + }, + { + "epoch": 0.08046813752939859, + "grad_norm": 1.2108083963394165, + "learning_rate": 7.175e-06, + "loss": 0.4932, + "step": 1437 + }, + { + "epoch": 0.08052413484152761, + "grad_norm": 1.309736728668213, + "learning_rate": 7.180000000000001e-06, + 
"loss": 0.3938, + "step": 1438 + }, + { + "epoch": 0.08058013215365663, + "grad_norm": 0.9972428679466248, + "learning_rate": 7.185e-06, + "loss": 0.322, + "step": 1439 + }, + { + "epoch": 0.08063612946578565, + "grad_norm": 1.2740402221679688, + "learning_rate": 7.190000000000001e-06, + "loss": 0.531, + "step": 1440 + }, + { + "epoch": 0.08069212677791467, + "grad_norm": 1.4080537557601929, + "learning_rate": 7.1950000000000006e-06, + "loss": 0.5996, + "step": 1441 + }, + { + "epoch": 0.08074812409004367, + "grad_norm": 1.7083243131637573, + "learning_rate": 7.2e-06, + "loss": 0.6026, + "step": 1442 + }, + { + "epoch": 0.08080412140217269, + "grad_norm": 1.209126591682434, + "learning_rate": 7.2050000000000005e-06, + "loss": 0.4386, + "step": 1443 + }, + { + "epoch": 0.08086011871430171, + "grad_norm": 1.3964226245880127, + "learning_rate": 7.2100000000000004e-06, + "loss": 0.4701, + "step": 1444 + }, + { + "epoch": 0.08091611602643073, + "grad_norm": 1.3449779748916626, + "learning_rate": 7.215000000000001e-06, + "loss": 0.4253, + "step": 1445 + }, + { + "epoch": 0.08097211333855975, + "grad_norm": 1.313974380493164, + "learning_rate": 7.22e-06, + "loss": 0.4444, + "step": 1446 + }, + { + "epoch": 0.08102811065068877, + "grad_norm": 1.2853071689605713, + "learning_rate": 7.2249999999999994e-06, + "loss": 0.5527, + "step": 1447 + }, + { + "epoch": 0.08108410796281779, + "grad_norm": 1.1168649196624756, + "learning_rate": 7.230000000000001e-06, + "loss": 0.4293, + "step": 1448 + }, + { + "epoch": 0.0811401052749468, + "grad_norm": 1.1112141609191895, + "learning_rate": 7.235e-06, + "loss": 0.5183, + "step": 1449 + }, + { + "epoch": 0.08119610258707582, + "grad_norm": 1.6637715101242065, + "learning_rate": 7.240000000000001e-06, + "loss": 0.4883, + "step": 1450 + }, + { + "epoch": 0.08125209989920484, + "grad_norm": 1.3402663469314575, + "learning_rate": 7.245e-06, + "loss": 0.3671, + "step": 1451 + }, + { + "epoch": 0.08130809721133385, + "grad_norm": 
1.248765230178833, + "learning_rate": 7.25e-06, + "loss": 0.3786, + "step": 1452 + }, + { + "epoch": 0.08136409452346287, + "grad_norm": 1.4092042446136475, + "learning_rate": 7.255000000000001e-06, + "loss": 0.4625, + "step": 1453 + }, + { + "epoch": 0.08142009183559189, + "grad_norm": 1.480547547340393, + "learning_rate": 7.26e-06, + "loss": 0.6167, + "step": 1454 + }, + { + "epoch": 0.0814760891477209, + "grad_norm": 1.3180210590362549, + "learning_rate": 7.265000000000001e-06, + "loss": 0.4995, + "step": 1455 + }, + { + "epoch": 0.08153208645984993, + "grad_norm": 1.1281007528305054, + "learning_rate": 7.270000000000001e-06, + "loss": 0.4142, + "step": 1456 + }, + { + "epoch": 0.08158808377197894, + "grad_norm": 1.519881248474121, + "learning_rate": 7.275e-06, + "loss": 0.5344, + "step": 1457 + }, + { + "epoch": 0.08164408108410796, + "grad_norm": 1.4132120609283447, + "learning_rate": 7.280000000000001e-06, + "loss": 0.5994, + "step": 1458 + }, + { + "epoch": 0.08170007839623698, + "grad_norm": 1.0918807983398438, + "learning_rate": 7.2850000000000006e-06, + "loss": 0.3358, + "step": 1459 + }, + { + "epoch": 0.081756075708366, + "grad_norm": 1.292568325996399, + "learning_rate": 7.290000000000001e-06, + "loss": 0.4402, + "step": 1460 + }, + { + "epoch": 0.08181207302049502, + "grad_norm": 1.1813576221466064, + "learning_rate": 7.2950000000000005e-06, + "loss": 0.4083, + "step": 1461 + }, + { + "epoch": 0.08186807033262404, + "grad_norm": 1.139785647392273, + "learning_rate": 7.2999999999999996e-06, + "loss": 0.5056, + "step": 1462 + }, + { + "epoch": 0.08192406764475305, + "grad_norm": 1.500008225440979, + "learning_rate": 7.305e-06, + "loss": 0.4989, + "step": 1463 + }, + { + "epoch": 0.08198006495688206, + "grad_norm": 1.4263228178024292, + "learning_rate": 7.31e-06, + "loss": 0.4714, + "step": 1464 + }, + { + "epoch": 0.08203606226901108, + "grad_norm": 1.2919131517410278, + "learning_rate": 7.315000000000001e-06, + "loss": 0.4517, + "step": 1465 + }, + { + 
"epoch": 0.0820920595811401, + "grad_norm": 1.3762054443359375, + "learning_rate": 7.32e-06, + "loss": 0.4877, + "step": 1466 + }, + { + "epoch": 0.08214805689326912, + "grad_norm": 1.3548452854156494, + "learning_rate": 7.325e-06, + "loss": 0.4628, + "step": 1467 + }, + { + "epoch": 0.08220405420539814, + "grad_norm": 1.3240776062011719, + "learning_rate": 7.330000000000001e-06, + "loss": 0.5224, + "step": 1468 + }, + { + "epoch": 0.08226005151752716, + "grad_norm": 1.2442258596420288, + "learning_rate": 7.335e-06, + "loss": 0.4429, + "step": 1469 + }, + { + "epoch": 0.08231604882965618, + "grad_norm": 1.2015458345413208, + "learning_rate": 7.340000000000001e-06, + "loss": 0.4401, + "step": 1470 + }, + { + "epoch": 0.0823720461417852, + "grad_norm": 1.0796226263046265, + "learning_rate": 7.345000000000001e-06, + "loss": 0.3873, + "step": 1471 + }, + { + "epoch": 0.08242804345391422, + "grad_norm": 1.2062454223632812, + "learning_rate": 7.35e-06, + "loss": 0.4338, + "step": 1472 + }, + { + "epoch": 0.08248404076604322, + "grad_norm": 1.2619911432266235, + "learning_rate": 7.355000000000001e-06, + "loss": 0.4712, + "step": 1473 + }, + { + "epoch": 0.08254003807817224, + "grad_norm": 1.6455378532409668, + "learning_rate": 7.36e-06, + "loss": 0.4193, + "step": 1474 + }, + { + "epoch": 0.08259603539030126, + "grad_norm": 1.209715485572815, + "learning_rate": 7.365e-06, + "loss": 0.5695, + "step": 1475 + }, + { + "epoch": 0.08265203270243028, + "grad_norm": 1.3279260396957397, + "learning_rate": 7.370000000000001e-06, + "loss": 0.5192, + "step": 1476 + }, + { + "epoch": 0.0827080300145593, + "grad_norm": 1.1530653238296509, + "learning_rate": 7.375e-06, + "loss": 0.3153, + "step": 1477 + }, + { + "epoch": 0.08276402732668832, + "grad_norm": 1.9557344913482666, + "learning_rate": 7.3800000000000005e-06, + "loss": 0.6641, + "step": 1478 + }, + { + "epoch": 0.08282002463881734, + "grad_norm": 1.6863083839416504, + "learning_rate": 7.3850000000000004e-06, + "loss": 0.566, + 
"step": 1479 + }, + { + "epoch": 0.08287602195094636, + "grad_norm": 1.282794713973999, + "learning_rate": 7.3899999999999995e-06, + "loss": 0.5705, + "step": 1480 + }, + { + "epoch": 0.08293201926307538, + "grad_norm": 1.4731370210647583, + "learning_rate": 7.395e-06, + "loss": 0.5086, + "step": 1481 + }, + { + "epoch": 0.0829880165752044, + "grad_norm": 1.4147073030471802, + "learning_rate": 7.4e-06, + "loss": 0.5286, + "step": 1482 + }, + { + "epoch": 0.0830440138873334, + "grad_norm": 1.2918366193771362, + "learning_rate": 7.405000000000001e-06, + "loss": 0.3988, + "step": 1483 + }, + { + "epoch": 0.08310001119946242, + "grad_norm": 0.9884164929389954, + "learning_rate": 7.41e-06, + "loss": 0.4509, + "step": 1484 + }, + { + "epoch": 0.08315600851159144, + "grad_norm": 1.1498831510543823, + "learning_rate": 7.414999999999999e-06, + "loss": 0.3887, + "step": 1485 + }, + { + "epoch": 0.08321200582372046, + "grad_norm": 1.2837716341018677, + "learning_rate": 7.420000000000001e-06, + "loss": 0.52, + "step": 1486 + }, + { + "epoch": 0.08326800313584948, + "grad_norm": 1.303924560546875, + "learning_rate": 7.425e-06, + "loss": 0.5503, + "step": 1487 + }, + { + "epoch": 0.0833240004479785, + "grad_norm": 1.507111668586731, + "learning_rate": 7.430000000000001e-06, + "loss": 0.6404, + "step": 1488 + }, + { + "epoch": 0.08337999776010752, + "grad_norm": 1.2963229417800903, + "learning_rate": 7.435e-06, + "loss": 0.5778, + "step": 1489 + }, + { + "epoch": 0.08343599507223654, + "grad_norm": 1.512855052947998, + "learning_rate": 7.44e-06, + "loss": 0.5745, + "step": 1490 + }, + { + "epoch": 0.08349199238436555, + "grad_norm": 1.5575251579284668, + "learning_rate": 7.445000000000001e-06, + "loss": 0.4233, + "step": 1491 + }, + { + "epoch": 0.08354798969649457, + "grad_norm": 1.0272166728973389, + "learning_rate": 7.45e-06, + "loss": 0.3971, + "step": 1492 + }, + { + "epoch": 0.08360398700862359, + "grad_norm": 1.458645224571228, + "learning_rate": 7.455000000000001e-06, + 
"loss": 0.5315, + "step": 1493 + }, + { + "epoch": 0.0836599843207526, + "grad_norm": 1.2564878463745117, + "learning_rate": 7.4600000000000006e-06, + "loss": 0.498, + "step": 1494 + }, + { + "epoch": 0.08371598163288162, + "grad_norm": 1.3385682106018066, + "learning_rate": 7.465e-06, + "loss": 0.6363, + "step": 1495 + }, + { + "epoch": 0.08377197894501064, + "grad_norm": 1.3317980766296387, + "learning_rate": 7.4700000000000005e-06, + "loss": 0.4346, + "step": 1496 + }, + { + "epoch": 0.08382797625713966, + "grad_norm": 1.1899913549423218, + "learning_rate": 7.4750000000000004e-06, + "loss": 0.5111, + "step": 1497 + }, + { + "epoch": 0.08388397356926867, + "grad_norm": 1.3336282968521118, + "learning_rate": 7.480000000000001e-06, + "loss": 0.5527, + "step": 1498 + }, + { + "epoch": 0.0839399708813977, + "grad_norm": 1.2331520318984985, + "learning_rate": 7.485e-06, + "loss": 0.4771, + "step": 1499 + }, + { + "epoch": 0.08399596819352671, + "grad_norm": 1.0488063097000122, + "learning_rate": 7.4899999999999994e-06, + "loss": 0.4086, + "step": 1500 + }, + { + "epoch": 0.08405196550565573, + "grad_norm": 1.606632947921753, + "learning_rate": 7.495e-06, + "loss": 0.5363, + "step": 1501 + }, + { + "epoch": 0.08410796281778475, + "grad_norm": 1.2820321321487427, + "learning_rate": 7.5e-06, + "loss": 0.4214, + "step": 1502 + }, + { + "epoch": 0.08416396012991377, + "grad_norm": 1.3019225597381592, + "learning_rate": 7.505000000000001e-06, + "loss": 0.5302, + "step": 1503 + }, + { + "epoch": 0.08421995744204278, + "grad_norm": 1.2436977624893188, + "learning_rate": 7.51e-06, + "loss": 0.555, + "step": 1504 + }, + { + "epoch": 0.0842759547541718, + "grad_norm": 1.3431636095046997, + "learning_rate": 7.515e-06, + "loss": 0.4918, + "step": 1505 + }, + { + "epoch": 0.08433195206630081, + "grad_norm": 1.1182070970535278, + "learning_rate": 7.520000000000001e-06, + "loss": 0.4375, + "step": 1506 + }, + { + "epoch": 0.08438794937842983, + "grad_norm": 1.193709135055542, + 
"learning_rate": 7.525e-06, + "loss": 0.3783, + "step": 1507 + }, + { + "epoch": 0.08444394669055885, + "grad_norm": 1.1308382749557495, + "learning_rate": 7.530000000000001e-06, + "loss": 0.4268, + "step": 1508 + }, + { + "epoch": 0.08449994400268787, + "grad_norm": 0.9366526007652283, + "learning_rate": 7.535000000000001e-06, + "loss": 0.4236, + "step": 1509 + }, + { + "epoch": 0.08455594131481689, + "grad_norm": 1.1923974752426147, + "learning_rate": 7.54e-06, + "loss": 0.3839, + "step": 1510 + }, + { + "epoch": 0.08461193862694591, + "grad_norm": 1.1978589296340942, + "learning_rate": 7.545000000000001e-06, + "loss": 0.3851, + "step": 1511 + }, + { + "epoch": 0.08466793593907493, + "grad_norm": 1.1501808166503906, + "learning_rate": 7.55e-06, + "loss": 0.4263, + "step": 1512 + }, + { + "epoch": 0.08472393325120395, + "grad_norm": 1.415138840675354, + "learning_rate": 7.555000000000001e-06, + "loss": 0.5748, + "step": 1513 + }, + { + "epoch": 0.08477993056333297, + "grad_norm": 1.2347288131713867, + "learning_rate": 7.5600000000000005e-06, + "loss": 0.4694, + "step": 1514 + }, + { + "epoch": 0.08483592787546197, + "grad_norm": 1.1840616464614868, + "learning_rate": 7.5649999999999996e-06, + "loss": 0.5046, + "step": 1515 + }, + { + "epoch": 0.08489192518759099, + "grad_norm": 1.046579122543335, + "learning_rate": 7.57e-06, + "loss": 0.4277, + "step": 1516 + }, + { + "epoch": 0.08494792249972001, + "grad_norm": 1.0324609279632568, + "learning_rate": 7.575e-06, + "loss": 0.3577, + "step": 1517 + }, + { + "epoch": 0.08500391981184903, + "grad_norm": 1.2944605350494385, + "learning_rate": 7.580000000000001e-06, + "loss": 0.6079, + "step": 1518 + }, + { + "epoch": 0.08505991712397805, + "grad_norm": 1.3902816772460938, + "learning_rate": 7.585e-06, + "loss": 0.4648, + "step": 1519 + }, + { + "epoch": 0.08511591443610707, + "grad_norm": 1.3388081789016724, + "learning_rate": 7.59e-06, + "loss": 0.4821, + "step": 1520 + }, + { + "epoch": 0.08517191174823609, + 
"grad_norm": 1.249360203742981, + "learning_rate": 7.595000000000001e-06, + "loss": 0.4765, + "step": 1521 + }, + { + "epoch": 0.0852279090603651, + "grad_norm": 1.153415560722351, + "learning_rate": 7.6e-06, + "loss": 0.3804, + "step": 1522 + }, + { + "epoch": 0.08528390637249413, + "grad_norm": 1.3876703977584839, + "learning_rate": 7.605000000000001e-06, + "loss": 0.6177, + "step": 1523 + }, + { + "epoch": 0.08533990368462314, + "grad_norm": 1.1727885007858276, + "learning_rate": 7.610000000000001e-06, + "loss": 0.4695, + "step": 1524 + }, + { + "epoch": 0.08539590099675215, + "grad_norm": 1.174000859260559, + "learning_rate": 7.615e-06, + "loss": 0.4969, + "step": 1525 + }, + { + "epoch": 0.08545189830888117, + "grad_norm": 1.5841765403747559, + "learning_rate": 7.620000000000001e-06, + "loss": 0.3613, + "step": 1526 + }, + { + "epoch": 0.08550789562101019, + "grad_norm": 1.0650402307510376, + "learning_rate": 7.625e-06, + "loss": 0.4859, + "step": 1527 + }, + { + "epoch": 0.08556389293313921, + "grad_norm": 1.0996922254562378, + "learning_rate": 7.630000000000001e-06, + "loss": 0.4277, + "step": 1528 + }, + { + "epoch": 0.08561989024526823, + "grad_norm": 1.2042404413223267, + "learning_rate": 7.635e-06, + "loss": 0.4736, + "step": 1529 + }, + { + "epoch": 0.08567588755739725, + "grad_norm": 1.3029634952545166, + "learning_rate": 7.64e-06, + "loss": 0.4859, + "step": 1530 + }, + { + "epoch": 0.08573188486952626, + "grad_norm": 1.5617334842681885, + "learning_rate": 7.645e-06, + "loss": 0.5699, + "step": 1531 + }, + { + "epoch": 0.08578788218165528, + "grad_norm": 1.1617907285690308, + "learning_rate": 7.65e-06, + "loss": 0.4353, + "step": 1532 + }, + { + "epoch": 0.0858438794937843, + "grad_norm": 1.383780837059021, + "learning_rate": 7.655e-06, + "loss": 0.5618, + "step": 1533 + }, + { + "epoch": 0.08589987680591332, + "grad_norm": 1.1526728868484497, + "learning_rate": 7.660000000000001e-06, + "loss": 0.4117, + "step": 1534 + }, + { + "epoch": 
0.08595587411804233, + "grad_norm": 1.1477508544921875, + "learning_rate": 7.665e-06, + "loss": 0.3695, + "step": 1535 + }, + { + "epoch": 0.08601187143017135, + "grad_norm": 1.674328327178955, + "learning_rate": 7.670000000000001e-06, + "loss": 0.4579, + "step": 1536 + }, + { + "epoch": 0.08606786874230037, + "grad_norm": 1.103920817375183, + "learning_rate": 7.675e-06, + "loss": 0.4256, + "step": 1537 + }, + { + "epoch": 0.08612386605442938, + "grad_norm": 1.1993048191070557, + "learning_rate": 7.68e-06, + "loss": 0.4485, + "step": 1538 + }, + { + "epoch": 0.0861798633665584, + "grad_norm": 1.079773187637329, + "learning_rate": 7.685e-06, + "loss": 0.3587, + "step": 1539 + }, + { + "epoch": 0.08623586067868742, + "grad_norm": 1.1587178707122803, + "learning_rate": 7.69e-06, + "loss": 0.4912, + "step": 1540 + }, + { + "epoch": 0.08629185799081644, + "grad_norm": 1.2015721797943115, + "learning_rate": 7.695e-06, + "loss": 0.5274, + "step": 1541 + }, + { + "epoch": 0.08634785530294546, + "grad_norm": 1.0844711065292358, + "learning_rate": 7.7e-06, + "loss": 0.3939, + "step": 1542 + }, + { + "epoch": 0.08640385261507448, + "grad_norm": 1.1569340229034424, + "learning_rate": 7.705e-06, + "loss": 0.5162, + "step": 1543 + }, + { + "epoch": 0.0864598499272035, + "grad_norm": 1.127315640449524, + "learning_rate": 7.71e-06, + "loss": 0.4135, + "step": 1544 + }, + { + "epoch": 0.08651584723933252, + "grad_norm": 1.184778094291687, + "learning_rate": 7.715e-06, + "loss": 0.477, + "step": 1545 + }, + { + "epoch": 0.08657184455146152, + "grad_norm": 1.3671724796295166, + "learning_rate": 7.72e-06, + "loss": 0.4705, + "step": 1546 + }, + { + "epoch": 0.08662784186359054, + "grad_norm": 1.2034422159194946, + "learning_rate": 7.725e-06, + "loss": 0.4117, + "step": 1547 + }, + { + "epoch": 0.08668383917571956, + "grad_norm": 2.615835428237915, + "learning_rate": 7.73e-06, + "loss": 0.4523, + "step": 1548 + }, + { + "epoch": 0.08673983648784858, + "grad_norm": 1.4498186111450195, + 
"learning_rate": 7.735000000000001e-06, + "loss": 0.4638, + "step": 1549 + }, + { + "epoch": 0.0867958337999776, + "grad_norm": 1.169029951095581, + "learning_rate": 7.74e-06, + "loss": 0.3688, + "step": 1550 + }, + { + "epoch": 0.08685183111210662, + "grad_norm": 1.2343640327453613, + "learning_rate": 7.745000000000001e-06, + "loss": 0.476, + "step": 1551 + }, + { + "epoch": 0.08690782842423564, + "grad_norm": 1.3974204063415527, + "learning_rate": 7.75e-06, + "loss": 0.422, + "step": 1552 + }, + { + "epoch": 0.08696382573636466, + "grad_norm": 1.21696937084198, + "learning_rate": 7.755e-06, + "loss": 0.4368, + "step": 1553 + }, + { + "epoch": 0.08701982304849368, + "grad_norm": 1.798683524131775, + "learning_rate": 7.76e-06, + "loss": 0.4386, + "step": 1554 + }, + { + "epoch": 0.0870758203606227, + "grad_norm": 1.2375297546386719, + "learning_rate": 7.765e-06, + "loss": 0.6109, + "step": 1555 + }, + { + "epoch": 0.0871318176727517, + "grad_norm": 1.1786304712295532, + "learning_rate": 7.77e-06, + "loss": 0.4902, + "step": 1556 + }, + { + "epoch": 0.08718781498488072, + "grad_norm": 1.4570947885513306, + "learning_rate": 7.775000000000001e-06, + "loss": 0.7328, + "step": 1557 + }, + { + "epoch": 0.08724381229700974, + "grad_norm": 1.12009859085083, + "learning_rate": 7.78e-06, + "loss": 0.4181, + "step": 1558 + }, + { + "epoch": 0.08729980960913876, + "grad_norm": 1.328429102897644, + "learning_rate": 7.785000000000001e-06, + "loss": 0.7427, + "step": 1559 + }, + { + "epoch": 0.08735580692126778, + "grad_norm": 1.4115002155303955, + "learning_rate": 7.79e-06, + "loss": 0.4477, + "step": 1560 + }, + { + "epoch": 0.0874118042333968, + "grad_norm": 1.53958261013031, + "learning_rate": 7.795e-06, + "loss": 0.5461, + "step": 1561 + }, + { + "epoch": 0.08746780154552582, + "grad_norm": 2.6513330936431885, + "learning_rate": 7.8e-06, + "loss": 0.5128, + "step": 1562 + }, + { + "epoch": 0.08752379885765484, + "grad_norm": 1.325663685798645, + "learning_rate": 7.805e-06, + 
"loss": 0.5435, + "step": 1563 + }, + { + "epoch": 0.08757979616978386, + "grad_norm": 0.9753627777099609, + "learning_rate": 7.810000000000001e-06, + "loss": 0.3439, + "step": 1564 + }, + { + "epoch": 0.08763579348191287, + "grad_norm": 1.3109617233276367, + "learning_rate": 7.815e-06, + "loss": 0.4922, + "step": 1565 + }, + { + "epoch": 0.08769179079404188, + "grad_norm": 1.272173523902893, + "learning_rate": 7.820000000000001e-06, + "loss": 0.4498, + "step": 1566 + }, + { + "epoch": 0.0877477881061709, + "grad_norm": 1.77103853225708, + "learning_rate": 7.825e-06, + "loss": 0.7537, + "step": 1567 + }, + { + "epoch": 0.08780378541829992, + "grad_norm": 1.1595690250396729, + "learning_rate": 7.83e-06, + "loss": 0.4236, + "step": 1568 + }, + { + "epoch": 0.08785978273042894, + "grad_norm": 1.303318738937378, + "learning_rate": 7.835e-06, + "loss": 0.4757, + "step": 1569 + }, + { + "epoch": 0.08791578004255796, + "grad_norm": 1.357015609741211, + "learning_rate": 7.84e-06, + "loss": 0.659, + "step": 1570 + }, + { + "epoch": 0.08797177735468698, + "grad_norm": 1.5197786092758179, + "learning_rate": 7.845e-06, + "loss": 0.5869, + "step": 1571 + }, + { + "epoch": 0.088027774666816, + "grad_norm": 1.1078104972839355, + "learning_rate": 7.850000000000001e-06, + "loss": 0.3845, + "step": 1572 + }, + { + "epoch": 0.08808377197894501, + "grad_norm": 1.304028868675232, + "learning_rate": 7.855e-06, + "loss": 0.4724, + "step": 1573 + }, + { + "epoch": 0.08813976929107403, + "grad_norm": 1.3804655075073242, + "learning_rate": 7.860000000000001e-06, + "loss": 0.4343, + "step": 1574 + }, + { + "epoch": 0.08819576660320305, + "grad_norm": 1.1329188346862793, + "learning_rate": 7.865e-06, + "loss": 0.4534, + "step": 1575 + }, + { + "epoch": 0.08825176391533207, + "grad_norm": 1.2403630018234253, + "learning_rate": 7.870000000000001e-06, + "loss": 0.4721, + "step": 1576 + }, + { + "epoch": 0.08830776122746108, + "grad_norm": 1.1998299360275269, + "learning_rate": 7.875e-06, + 
"loss": 0.6336, + "step": 1577 + }, + { + "epoch": 0.0883637585395901, + "grad_norm": 1.285543441772461, + "learning_rate": 7.879999999999999e-06, + "loss": 0.5604, + "step": 1578 + }, + { + "epoch": 0.08841975585171911, + "grad_norm": 1.1370717287063599, + "learning_rate": 7.885e-06, + "loss": 0.3956, + "step": 1579 + }, + { + "epoch": 0.08847575316384813, + "grad_norm": 1.230810284614563, + "learning_rate": 7.89e-06, + "loss": 0.5079, + "step": 1580 + }, + { + "epoch": 0.08853175047597715, + "grad_norm": 2.2659618854522705, + "learning_rate": 7.895000000000001e-06, + "loss": 0.5141, + "step": 1581 + }, + { + "epoch": 0.08858774778810617, + "grad_norm": 1.1236450672149658, + "learning_rate": 7.9e-06, + "loss": 0.3948, + "step": 1582 + }, + { + "epoch": 0.08864374510023519, + "grad_norm": 1.294722557067871, + "learning_rate": 7.905e-06, + "loss": 0.444, + "step": 1583 + }, + { + "epoch": 0.08869974241236421, + "grad_norm": 1.1199108362197876, + "learning_rate": 7.91e-06, + "loss": 0.5846, + "step": 1584 + }, + { + "epoch": 0.08875573972449323, + "grad_norm": 1.7658997774124146, + "learning_rate": 7.915e-06, + "loss": 0.649, + "step": 1585 + }, + { + "epoch": 0.08881173703662225, + "grad_norm": 1.241074800491333, + "learning_rate": 7.92e-06, + "loss": 0.4549, + "step": 1586 + }, + { + "epoch": 0.08886773434875125, + "grad_norm": 1.1961077451705933, + "learning_rate": 7.925000000000001e-06, + "loss": 0.4492, + "step": 1587 + }, + { + "epoch": 0.08892373166088027, + "grad_norm": 1.2907549142837524, + "learning_rate": 7.93e-06, + "loss": 0.3696, + "step": 1588 + }, + { + "epoch": 0.08897972897300929, + "grad_norm": 1.470672845840454, + "learning_rate": 7.935000000000001e-06, + "loss": 0.4875, + "step": 1589 + }, + { + "epoch": 0.08903572628513831, + "grad_norm": 1.1282093524932861, + "learning_rate": 7.94e-06, + "loss": 0.348, + "step": 1590 + }, + { + "epoch": 0.08909172359726733, + "grad_norm": 1.446938395500183, + "learning_rate": 7.945000000000001e-06, + "loss": 
0.4021, + "step": 1591 + }, + { + "epoch": 0.08914772090939635, + "grad_norm": 1.3912911415100098, + "learning_rate": 7.95e-06, + "loss": 0.6479, + "step": 1592 + }, + { + "epoch": 0.08920371822152537, + "grad_norm": 1.2886724472045898, + "learning_rate": 7.955e-06, + "loss": 0.4321, + "step": 1593 + }, + { + "epoch": 0.08925971553365439, + "grad_norm": 1.1775058507919312, + "learning_rate": 7.96e-06, + "loss": 0.5141, + "step": 1594 + }, + { + "epoch": 0.08931571284578341, + "grad_norm": 1.33034348487854, + "learning_rate": 7.965e-06, + "loss": 0.4683, + "step": 1595 + }, + { + "epoch": 0.08937171015791243, + "grad_norm": 1.1250934600830078, + "learning_rate": 7.97e-06, + "loss": 0.5636, + "step": 1596 + }, + { + "epoch": 0.08942770747004143, + "grad_norm": 1.177208423614502, + "learning_rate": 7.975e-06, + "loss": 0.4972, + "step": 1597 + }, + { + "epoch": 0.08948370478217045, + "grad_norm": 1.2766159772872925, + "learning_rate": 7.98e-06, + "loss": 0.4352, + "step": 1598 + }, + { + "epoch": 0.08953970209429947, + "grad_norm": 1.3314900398254395, + "learning_rate": 7.985e-06, + "loss": 0.5088, + "step": 1599 + }, + { + "epoch": 0.08959569940642849, + "grad_norm": 1.1538054943084717, + "learning_rate": 7.99e-06, + "loss": 0.4706, + "step": 1600 + }, + { + "epoch": 0.08965169671855751, + "grad_norm": 1.2058525085449219, + "learning_rate": 7.995e-06, + "loss": 0.4178, + "step": 1601 + }, + { + "epoch": 0.08970769403068653, + "grad_norm": 1.1523019075393677, + "learning_rate": 8.000000000000001e-06, + "loss": 0.5341, + "step": 1602 + }, + { + "epoch": 0.08976369134281555, + "grad_norm": 1.2555052042007446, + "learning_rate": 8.005e-06, + "loss": 0.4499, + "step": 1603 + }, + { + "epoch": 0.08981968865494457, + "grad_norm": 1.2619866132736206, + "learning_rate": 8.010000000000001e-06, + "loss": 0.4104, + "step": 1604 + }, + { + "epoch": 0.08987568596707358, + "grad_norm": 1.5733071565628052, + "learning_rate": 8.015e-06, + "loss": 0.4929, + "step": 1605 + }, + { + 
"epoch": 0.0899316832792026, + "grad_norm": 1.2147235870361328, + "learning_rate": 8.02e-06, + "loss": 0.5144, + "step": 1606 + }, + { + "epoch": 0.08998768059133162, + "grad_norm": 1.2371256351470947, + "learning_rate": 8.025e-06, + "loss": 0.4795, + "step": 1607 + }, + { + "epoch": 0.09004367790346063, + "grad_norm": 1.2636479139328003, + "learning_rate": 8.03e-06, + "loss": 0.4319, + "step": 1608 + }, + { + "epoch": 0.09009967521558965, + "grad_norm": 1.4651589393615723, + "learning_rate": 8.035e-06, + "loss": 0.6121, + "step": 1609 + }, + { + "epoch": 0.09015567252771867, + "grad_norm": 1.4587225914001465, + "learning_rate": 8.040000000000001e-06, + "loss": 0.3772, + "step": 1610 + }, + { + "epoch": 0.09021166983984769, + "grad_norm": 1.1905932426452637, + "learning_rate": 8.045e-06, + "loss": 0.464, + "step": 1611 + }, + { + "epoch": 0.0902676671519767, + "grad_norm": 1.5001569986343384, + "learning_rate": 8.050000000000001e-06, + "loss": 0.4785, + "step": 1612 + }, + { + "epoch": 0.09032366446410572, + "grad_norm": 1.0893656015396118, + "learning_rate": 8.055e-06, + "loss": 0.4889, + "step": 1613 + }, + { + "epoch": 0.09037966177623474, + "grad_norm": 1.2985970973968506, + "learning_rate": 8.06e-06, + "loss": 0.4723, + "step": 1614 + }, + { + "epoch": 0.09043565908836376, + "grad_norm": 1.1527200937271118, + "learning_rate": 8.065e-06, + "loss": 0.4792, + "step": 1615 + }, + { + "epoch": 0.09049165640049278, + "grad_norm": 1.253238558769226, + "learning_rate": 8.069999999999999e-06, + "loss": 0.4372, + "step": 1616 + }, + { + "epoch": 0.0905476537126218, + "grad_norm": 1.2246673107147217, + "learning_rate": 8.075000000000001e-06, + "loss": 0.4444, + "step": 1617 + }, + { + "epoch": 0.0906036510247508, + "grad_norm": 0.9724764823913574, + "learning_rate": 8.08e-06, + "loss": 0.426, + "step": 1618 + }, + { + "epoch": 0.09065964833687983, + "grad_norm": 1.9928661584854126, + "learning_rate": 8.085000000000001e-06, + "loss": 0.5301, + "step": 1619 + }, + { + 
"epoch": 0.09071564564900884, + "grad_norm": 1.286141276359558, + "learning_rate": 8.09e-06, + "loss": 0.5638, + "step": 1620 + }, + { + "epoch": 0.09077164296113786, + "grad_norm": 1.0990967750549316, + "learning_rate": 8.095e-06, + "loss": 0.421, + "step": 1621 + }, + { + "epoch": 0.09082764027326688, + "grad_norm": 1.2366746664047241, + "learning_rate": 8.1e-06, + "loss": 0.4869, + "step": 1622 + }, + { + "epoch": 0.0908836375853959, + "grad_norm": 1.4013638496398926, + "learning_rate": 8.105e-06, + "loss": 0.4185, + "step": 1623 + }, + { + "epoch": 0.09093963489752492, + "grad_norm": 1.1708909273147583, + "learning_rate": 8.11e-06, + "loss": 0.4643, + "step": 1624 + }, + { + "epoch": 0.09099563220965394, + "grad_norm": 1.323397159576416, + "learning_rate": 8.115000000000001e-06, + "loss": 0.4741, + "step": 1625 + }, + { + "epoch": 0.09105162952178296, + "grad_norm": 1.1295795440673828, + "learning_rate": 8.12e-06, + "loss": 0.4907, + "step": 1626 + }, + { + "epoch": 0.09110762683391198, + "grad_norm": 1.4290260076522827, + "learning_rate": 8.125000000000001e-06, + "loss": 0.3986, + "step": 1627 + }, + { + "epoch": 0.09116362414604098, + "grad_norm": 1.2899596691131592, + "learning_rate": 8.13e-06, + "loss": 0.4426, + "step": 1628 + }, + { + "epoch": 0.09121962145817, + "grad_norm": 1.6422119140625, + "learning_rate": 8.135000000000001e-06, + "loss": 0.5782, + "step": 1629 + }, + { + "epoch": 0.09127561877029902, + "grad_norm": 1.2225149869918823, + "learning_rate": 8.14e-06, + "loss": 0.4683, + "step": 1630 + }, + { + "epoch": 0.09133161608242804, + "grad_norm": 1.1593647003173828, + "learning_rate": 8.144999999999999e-06, + "loss": 0.445, + "step": 1631 + }, + { + "epoch": 0.09138761339455706, + "grad_norm": 1.1714057922363281, + "learning_rate": 8.15e-06, + "loss": 0.4355, + "step": 1632 + }, + { + "epoch": 0.09144361070668608, + "grad_norm": 1.0047953128814697, + "learning_rate": 8.155e-06, + "loss": 0.4127, + "step": 1633 + }, + { + "epoch": 
0.0914996080188151, + "grad_norm": 1.2258692979812622, + "learning_rate": 8.160000000000001e-06, + "loss": 0.5346, + "step": 1634 + }, + { + "epoch": 0.09155560533094412, + "grad_norm": 1.2532273530960083, + "learning_rate": 8.165e-06, + "loss": 0.4842, + "step": 1635 + }, + { + "epoch": 0.09161160264307314, + "grad_norm": 1.0819934606552124, + "learning_rate": 8.17e-06, + "loss": 0.495, + "step": 1636 + }, + { + "epoch": 0.09166759995520216, + "grad_norm": 1.4493647813796997, + "learning_rate": 8.175e-06, + "loss": 0.5362, + "step": 1637 + }, + { + "epoch": 0.09172359726733118, + "grad_norm": 1.7464864253997803, + "learning_rate": 8.18e-06, + "loss": 0.4703, + "step": 1638 + }, + { + "epoch": 0.09177959457946018, + "grad_norm": 1.2716431617736816, + "learning_rate": 8.185e-06, + "loss": 0.5344, + "step": 1639 + }, + { + "epoch": 0.0918355918915892, + "grad_norm": 1.320268154144287, + "learning_rate": 8.190000000000001e-06, + "loss": 0.3806, + "step": 1640 + }, + { + "epoch": 0.09189158920371822, + "grad_norm": 1.4997364282608032, + "learning_rate": 8.195e-06, + "loss": 0.4835, + "step": 1641 + }, + { + "epoch": 0.09194758651584724, + "grad_norm": 1.2244665622711182, + "learning_rate": 8.200000000000001e-06, + "loss": 0.4452, + "step": 1642 + }, + { + "epoch": 0.09200358382797626, + "grad_norm": 1.1460756063461304, + "learning_rate": 8.205e-06, + "loss": 0.4294, + "step": 1643 + }, + { + "epoch": 0.09205958114010528, + "grad_norm": 1.1829346418380737, + "learning_rate": 8.210000000000001e-06, + "loss": 0.4801, + "step": 1644 + }, + { + "epoch": 0.0921155784522343, + "grad_norm": 1.3419283628463745, + "learning_rate": 8.215e-06, + "loss": 0.5275, + "step": 1645 + }, + { + "epoch": 0.09217157576436331, + "grad_norm": 1.430458426475525, + "learning_rate": 8.22e-06, + "loss": 0.5378, + "step": 1646 + }, + { + "epoch": 0.09222757307649233, + "grad_norm": 1.3106675148010254, + "learning_rate": 8.225e-06, + "loss": 0.4741, + "step": 1647 + }, + { + "epoch": 
0.09228357038862135, + "grad_norm": 1.2035247087478638, + "learning_rate": 8.23e-06, + "loss": 0.5643, + "step": 1648 + }, + { + "epoch": 0.09233956770075036, + "grad_norm": 1.4303395748138428, + "learning_rate": 8.235000000000002e-06, + "loss": 0.4945, + "step": 1649 + }, + { + "epoch": 0.09239556501287938, + "grad_norm": 1.3392964601516724, + "learning_rate": 8.24e-06, + "loss": 0.3851, + "step": 1650 + }, + { + "epoch": 0.0924515623250084, + "grad_norm": 1.2980291843414307, + "learning_rate": 8.245e-06, + "loss": 0.5338, + "step": 1651 + }, + { + "epoch": 0.09250755963713742, + "grad_norm": 1.2684897184371948, + "learning_rate": 8.25e-06, + "loss": 0.4215, + "step": 1652 + }, + { + "epoch": 0.09256355694926643, + "grad_norm": 1.3237746953964233, + "learning_rate": 8.255e-06, + "loss": 0.5577, + "step": 1653 + }, + { + "epoch": 0.09261955426139545, + "grad_norm": 1.4197481870651245, + "learning_rate": 8.26e-06, + "loss": 0.4287, + "step": 1654 + }, + { + "epoch": 0.09267555157352447, + "grad_norm": 1.2387175559997559, + "learning_rate": 8.265000000000001e-06, + "loss": 0.5273, + "step": 1655 + }, + { + "epoch": 0.09273154888565349, + "grad_norm": 20.208898544311523, + "learning_rate": 8.27e-06, + "loss": 0.4626, + "step": 1656 + }, + { + "epoch": 0.09278754619778251, + "grad_norm": 1.0856859683990479, + "learning_rate": 8.275000000000001e-06, + "loss": 0.4981, + "step": 1657 + }, + { + "epoch": 0.09284354350991153, + "grad_norm": 1.3090143203735352, + "learning_rate": 8.28e-06, + "loss": 0.5757, + "step": 1658 + }, + { + "epoch": 0.09289954082204054, + "grad_norm": 1.2446850538253784, + "learning_rate": 8.285e-06, + "loss": 0.4142, + "step": 1659 + }, + { + "epoch": 0.09295553813416955, + "grad_norm": 1.3336029052734375, + "learning_rate": 8.29e-06, + "loss": 0.395, + "step": 1660 + }, + { + "epoch": 0.09301153544629857, + "grad_norm": 1.2740191221237183, + "learning_rate": 8.295e-06, + "loss": 0.4564, + "step": 1661 + }, + { + "epoch": 0.09306753275842759, + 
"grad_norm": 1.667121410369873, + "learning_rate": 8.3e-06, + "loss": 0.4583, + "step": 1662 + }, + { + "epoch": 0.09312353007055661, + "grad_norm": 1.074404001235962, + "learning_rate": 8.305000000000001e-06, + "loss": 0.4812, + "step": 1663 + }, + { + "epoch": 0.09317952738268563, + "grad_norm": 1.130149245262146, + "learning_rate": 8.31e-06, + "loss": 0.4388, + "step": 1664 + }, + { + "epoch": 0.09323552469481465, + "grad_norm": 1.5548359155654907, + "learning_rate": 8.315000000000001e-06, + "loss": 0.4344, + "step": 1665 + }, + { + "epoch": 0.09329152200694367, + "grad_norm": 2.0083305835723877, + "learning_rate": 8.32e-06, + "loss": 0.4691, + "step": 1666 + }, + { + "epoch": 0.09334751931907269, + "grad_norm": 1.2877039909362793, + "learning_rate": 8.325e-06, + "loss": 0.6106, + "step": 1667 + }, + { + "epoch": 0.09340351663120171, + "grad_norm": 1.2804948091506958, + "learning_rate": 8.33e-06, + "loss": 0.4049, + "step": 1668 + }, + { + "epoch": 0.09345951394333073, + "grad_norm": 1.088975429534912, + "learning_rate": 8.334999999999999e-06, + "loss": 0.3796, + "step": 1669 + }, + { + "epoch": 0.09351551125545973, + "grad_norm": 1.1175159215927124, + "learning_rate": 8.34e-06, + "loss": 0.4282, + "step": 1670 + }, + { + "epoch": 0.09357150856758875, + "grad_norm": 1.0835729837417603, + "learning_rate": 8.345e-06, + "loss": 0.372, + "step": 1671 + }, + { + "epoch": 0.09362750587971777, + "grad_norm": 1.7679165601730347, + "learning_rate": 8.350000000000001e-06, + "loss": 0.9209, + "step": 1672 + }, + { + "epoch": 0.09368350319184679, + "grad_norm": 1.1253948211669922, + "learning_rate": 8.355e-06, + "loss": 0.4427, + "step": 1673 + }, + { + "epoch": 0.09373950050397581, + "grad_norm": 1.2487722635269165, + "learning_rate": 8.36e-06, + "loss": 0.4141, + "step": 1674 + }, + { + "epoch": 0.09379549781610483, + "grad_norm": 1.62444269657135, + "learning_rate": 8.365e-06, + "loss": 0.3576, + "step": 1675 + }, + { + "epoch": 0.09385149512823385, + "grad_norm": 
1.2208722829818726, + "learning_rate": 8.37e-06, + "loss": 0.4182, + "step": 1676 + }, + { + "epoch": 0.09390749244036287, + "grad_norm": 1.2999380826950073, + "learning_rate": 8.375e-06, + "loss": 0.4973, + "step": 1677 + }, + { + "epoch": 0.09396348975249189, + "grad_norm": 1.1313492059707642, + "learning_rate": 8.380000000000001e-06, + "loss": 0.5136, + "step": 1678 + }, + { + "epoch": 0.0940194870646209, + "grad_norm": 1.0610212087631226, + "learning_rate": 8.385e-06, + "loss": 0.3811, + "step": 1679 + }, + { + "epoch": 0.09407548437674991, + "grad_norm": 1.2607392072677612, + "learning_rate": 8.390000000000001e-06, + "loss": 0.4823, + "step": 1680 + }, + { + "epoch": 0.09413148168887893, + "grad_norm": 1.3894352912902832, + "learning_rate": 8.395e-06, + "loss": 0.5465, + "step": 1681 + }, + { + "epoch": 0.09418747900100795, + "grad_norm": 1.0513899326324463, + "learning_rate": 8.400000000000001e-06, + "loss": 0.4264, + "step": 1682 + }, + { + "epoch": 0.09424347631313697, + "grad_norm": 0.9940479397773743, + "learning_rate": 8.405e-06, + "loss": 0.3418, + "step": 1683 + }, + { + "epoch": 0.09429947362526599, + "grad_norm": 1.2676581144332886, + "learning_rate": 8.409999999999999e-06, + "loss": 0.4437, + "step": 1684 + }, + { + "epoch": 0.094355470937395, + "grad_norm": 1.2914938926696777, + "learning_rate": 8.415e-06, + "loss": 0.5123, + "step": 1685 + }, + { + "epoch": 0.09441146824952402, + "grad_norm": 1.2333265542984009, + "learning_rate": 8.42e-06, + "loss": 0.5305, + "step": 1686 + }, + { + "epoch": 0.09446746556165304, + "grad_norm": 1.0795105695724487, + "learning_rate": 8.425000000000001e-06, + "loss": 0.441, + "step": 1687 + }, + { + "epoch": 0.09452346287378206, + "grad_norm": 1.2224481105804443, + "learning_rate": 8.43e-06, + "loss": 0.3985, + "step": 1688 + }, + { + "epoch": 0.09457946018591108, + "grad_norm": 1.626680612564087, + "learning_rate": 8.435e-06, + "loss": 0.5514, + "step": 1689 + }, + { + "epoch": 0.09463545749804009, + "grad_norm": 
1.2383500337600708, + "learning_rate": 8.44e-06, + "loss": 0.4813, + "step": 1690 + }, + { + "epoch": 0.0946914548101691, + "grad_norm": 1.3884074687957764, + "learning_rate": 8.445e-06, + "loss": 0.4207, + "step": 1691 + }, + { + "epoch": 0.09474745212229813, + "grad_norm": 1.1471067667007446, + "learning_rate": 8.45e-06, + "loss": 0.4253, + "step": 1692 + }, + { + "epoch": 0.09480344943442715, + "grad_norm": 1.3975670337677002, + "learning_rate": 8.455000000000001e-06, + "loss": 0.5008, + "step": 1693 + }, + { + "epoch": 0.09485944674655616, + "grad_norm": 1.1151567697525024, + "learning_rate": 8.46e-06, + "loss": 0.3075, + "step": 1694 + }, + { + "epoch": 0.09491544405868518, + "grad_norm": 0.992013692855835, + "learning_rate": 8.465000000000001e-06, + "loss": 0.348, + "step": 1695 + }, + { + "epoch": 0.0949714413708142, + "grad_norm": 1.1896450519561768, + "learning_rate": 8.47e-06, + "loss": 0.5221, + "step": 1696 + }, + { + "epoch": 0.09502743868294322, + "grad_norm": 1.4490216970443726, + "learning_rate": 8.475000000000001e-06, + "loss": 0.4325, + "step": 1697 + }, + { + "epoch": 0.09508343599507224, + "grad_norm": 1.1228501796722412, + "learning_rate": 8.48e-06, + "loss": 0.4076, + "step": 1698 + }, + { + "epoch": 0.09513943330720126, + "grad_norm": 1.3249083757400513, + "learning_rate": 8.485e-06, + "loss": 0.456, + "step": 1699 + }, + { + "epoch": 0.09519543061933028, + "grad_norm": 1.3075578212738037, + "learning_rate": 8.49e-06, + "loss": 0.5684, + "step": 1700 + }, + { + "epoch": 0.09525142793145928, + "grad_norm": 1.3035677671432495, + "learning_rate": 8.495e-06, + "loss": 0.5176, + "step": 1701 + }, + { + "epoch": 0.0953074252435883, + "grad_norm": 1.1941081285476685, + "learning_rate": 8.500000000000002e-06, + "loss": 0.4353, + "step": 1702 + }, + { + "epoch": 0.09536342255571732, + "grad_norm": 1.3645747900009155, + "learning_rate": 8.505e-06, + "loss": 0.4457, + "step": 1703 + }, + { + "epoch": 0.09541941986784634, + "grad_norm": 
1.3234845399856567, + "learning_rate": 8.51e-06, + "loss": 0.5028, + "step": 1704 + }, + { + "epoch": 0.09547541717997536, + "grad_norm": 1.5293045043945312, + "learning_rate": 8.515e-06, + "loss": 0.6054, + "step": 1705 + }, + { + "epoch": 0.09553141449210438, + "grad_norm": 1.2742122411727905, + "learning_rate": 8.52e-06, + "loss": 0.3425, + "step": 1706 + }, + { + "epoch": 0.0955874118042334, + "grad_norm": 1.2540256977081299, + "learning_rate": 8.525e-06, + "loss": 0.4417, + "step": 1707 + }, + { + "epoch": 0.09564340911636242, + "grad_norm": 1.1341816186904907, + "learning_rate": 8.53e-06, + "loss": 0.4395, + "step": 1708 + }, + { + "epoch": 0.09569940642849144, + "grad_norm": 1.4354275465011597, + "learning_rate": 8.535e-06, + "loss": 0.5021, + "step": 1709 + }, + { + "epoch": 0.09575540374062046, + "grad_norm": 1.21577787399292, + "learning_rate": 8.540000000000001e-06, + "loss": 0.4162, + "step": 1710 + }, + { + "epoch": 0.09581140105274946, + "grad_norm": 1.385469913482666, + "learning_rate": 8.545e-06, + "loss": 0.3978, + "step": 1711 + }, + { + "epoch": 0.09586739836487848, + "grad_norm": 1.0504804849624634, + "learning_rate": 8.550000000000001e-06, + "loss": 0.46, + "step": 1712 + }, + { + "epoch": 0.0959233956770075, + "grad_norm": 1.2518787384033203, + "learning_rate": 8.555e-06, + "loss": 0.4717, + "step": 1713 + }, + { + "epoch": 0.09597939298913652, + "grad_norm": 1.2391563653945923, + "learning_rate": 8.56e-06, + "loss": 0.3796, + "step": 1714 + }, + { + "epoch": 0.09603539030126554, + "grad_norm": 1.352744460105896, + "learning_rate": 8.565e-06, + "loss": 0.4809, + "step": 1715 + }, + { + "epoch": 0.09609138761339456, + "grad_norm": 1.2199307680130005, + "learning_rate": 8.570000000000001e-06, + "loss": 0.5013, + "step": 1716 + }, + { + "epoch": 0.09614738492552358, + "grad_norm": 2.9449145793914795, + "learning_rate": 8.575000000000002e-06, + "loss": 0.4527, + "step": 1717 + }, + { + "epoch": 0.0962033822376526, + "grad_norm": 
1.0867520570755005, + "learning_rate": 8.580000000000001e-06, + "loss": 0.5052, + "step": 1718 + }, + { + "epoch": 0.09625937954978162, + "grad_norm": 1.5014264583587646, + "learning_rate": 8.585e-06, + "loss": 0.4327, + "step": 1719 + }, + { + "epoch": 0.09631537686191063, + "grad_norm": 1.358459711074829, + "learning_rate": 8.59e-06, + "loss": 0.4432, + "step": 1720 + }, + { + "epoch": 0.09637137417403964, + "grad_norm": 1.1366547346115112, + "learning_rate": 8.595e-06, + "loss": 0.4062, + "step": 1721 + }, + { + "epoch": 0.09642737148616866, + "grad_norm": 1.6252096891403198, + "learning_rate": 8.599999999999999e-06, + "loss": 0.4598, + "step": 1722 + }, + { + "epoch": 0.09648336879829768, + "grad_norm": 1.2072232961654663, + "learning_rate": 8.605e-06, + "loss": 0.49, + "step": 1723 + }, + { + "epoch": 0.0965393661104267, + "grad_norm": 1.2367745637893677, + "learning_rate": 8.61e-06, + "loss": 0.4035, + "step": 1724 + }, + { + "epoch": 0.09659536342255572, + "grad_norm": 1.3461731672286987, + "learning_rate": 8.615000000000001e-06, + "loss": 0.4942, + "step": 1725 + }, + { + "epoch": 0.09665136073468474, + "grad_norm": 1.1359554529190063, + "learning_rate": 8.62e-06, + "loss": 0.4236, + "step": 1726 + }, + { + "epoch": 0.09670735804681375, + "grad_norm": 1.090421438217163, + "learning_rate": 8.625e-06, + "loss": 0.4333, + "step": 1727 + }, + { + "epoch": 0.09676335535894277, + "grad_norm": 5.683356285095215, + "learning_rate": 8.63e-06, + "loss": 0.6335, + "step": 1728 + }, + { + "epoch": 0.09681935267107179, + "grad_norm": 1.271315336227417, + "learning_rate": 8.635e-06, + "loss": 0.4435, + "step": 1729 + }, + { + "epoch": 0.09687534998320081, + "grad_norm": 1.2029268741607666, + "learning_rate": 8.64e-06, + "loss": 0.3945, + "step": 1730 + }, + { + "epoch": 0.09693134729532983, + "grad_norm": 1.723079800605774, + "learning_rate": 8.645000000000001e-06, + "loss": 0.4809, + "step": 1731 + }, + { + "epoch": 0.09698734460745884, + "grad_norm": 1.122281551361084, 
+ "learning_rate": 8.65e-06, + "loss": 0.4269, + "step": 1732 + }, + { + "epoch": 0.09704334191958786, + "grad_norm": 1.2079497575759888, + "learning_rate": 8.655000000000001e-06, + "loss": 0.4232, + "step": 1733 + }, + { + "epoch": 0.09709933923171687, + "grad_norm": 1.2280787229537964, + "learning_rate": 8.66e-06, + "loss": 0.3697, + "step": 1734 + }, + { + "epoch": 0.0971553365438459, + "grad_norm": 1.0389596223831177, + "learning_rate": 8.665000000000001e-06, + "loss": 0.4221, + "step": 1735 + }, + { + "epoch": 0.09721133385597491, + "grad_norm": 1.22295343875885, + "learning_rate": 8.67e-06, + "loss": 0.4129, + "step": 1736 + }, + { + "epoch": 0.09726733116810393, + "grad_norm": 1.3335331678390503, + "learning_rate": 8.674999999999999e-06, + "loss": 0.4507, + "step": 1737 + }, + { + "epoch": 0.09732332848023295, + "grad_norm": 1.3346177339553833, + "learning_rate": 8.68e-06, + "loss": 0.3771, + "step": 1738 + }, + { + "epoch": 0.09737932579236197, + "grad_norm": 1.0787492990493774, + "learning_rate": 8.685e-06, + "loss": 0.4032, + "step": 1739 + }, + { + "epoch": 0.09743532310449099, + "grad_norm": 1.352504849433899, + "learning_rate": 8.690000000000002e-06, + "loss": 0.4517, + "step": 1740 + }, + { + "epoch": 0.09749132041662001, + "grad_norm": 1.0957409143447876, + "learning_rate": 8.695e-06, + "loss": 0.3548, + "step": 1741 + }, + { + "epoch": 0.09754731772874901, + "grad_norm": 1.1434050798416138, + "learning_rate": 8.7e-06, + "loss": 0.4702, + "step": 1742 + }, + { + "epoch": 0.09760331504087803, + "grad_norm": 1.2777423858642578, + "learning_rate": 8.705e-06, + "loss": 0.3896, + "step": 1743 + }, + { + "epoch": 0.09765931235300705, + "grad_norm": 1.4034701585769653, + "learning_rate": 8.71e-06, + "loss": 0.4506, + "step": 1744 + }, + { + "epoch": 0.09771530966513607, + "grad_norm": 1.207825779914856, + "learning_rate": 8.715e-06, + "loss": 0.5022, + "step": 1745 + }, + { + "epoch": 0.09777130697726509, + "grad_norm": 1.3076139688491821, + 
"learning_rate": 8.720000000000001e-06, + "loss": 0.3915, + "step": 1746 + }, + { + "epoch": 0.09782730428939411, + "grad_norm": 1.2862523794174194, + "learning_rate": 8.725e-06, + "loss": 0.5892, + "step": 1747 + }, + { + "epoch": 0.09788330160152313, + "grad_norm": 1.442941665649414, + "learning_rate": 8.730000000000001e-06, + "loss": 0.4577, + "step": 1748 + }, + { + "epoch": 0.09793929891365215, + "grad_norm": 1.251220941543579, + "learning_rate": 8.735e-06, + "loss": 0.4834, + "step": 1749 + }, + { + "epoch": 0.09799529622578117, + "grad_norm": 1.1590315103530884, + "learning_rate": 8.740000000000001e-06, + "loss": 0.5074, + "step": 1750 + }, + { + "epoch": 0.09805129353791019, + "grad_norm": 1.3057984113693237, + "learning_rate": 8.745e-06, + "loss": 0.4082, + "step": 1751 + }, + { + "epoch": 0.09810729085003919, + "grad_norm": 1.29001784324646, + "learning_rate": 8.75e-06, + "loss": 0.4628, + "step": 1752 + }, + { + "epoch": 0.09816328816216821, + "grad_norm": 1.0873615741729736, + "learning_rate": 8.755e-06, + "loss": 0.3739, + "step": 1753 + }, + { + "epoch": 0.09821928547429723, + "grad_norm": 1.2104002237319946, + "learning_rate": 8.76e-06, + "loss": 0.4093, + "step": 1754 + }, + { + "epoch": 0.09827528278642625, + "grad_norm": 1.1916160583496094, + "learning_rate": 8.765000000000002e-06, + "loss": 0.5169, + "step": 1755 + }, + { + "epoch": 0.09833128009855527, + "grad_norm": 1.1350438594818115, + "learning_rate": 8.77e-06, + "loss": 0.4636, + "step": 1756 + }, + { + "epoch": 0.09838727741068429, + "grad_norm": 1.4811171293258667, + "learning_rate": 8.775e-06, + "loss": 0.5471, + "step": 1757 + }, + { + "epoch": 0.0984432747228133, + "grad_norm": 1.5752403736114502, + "learning_rate": 8.78e-06, + "loss": 0.5765, + "step": 1758 + }, + { + "epoch": 0.09849927203494233, + "grad_norm": 2.1089720726013184, + "learning_rate": 8.785e-06, + "loss": 0.4019, + "step": 1759 + }, + { + "epoch": 0.09855526934707134, + "grad_norm": 1.8355612754821777, + 
"learning_rate": 8.79e-06, + "loss": 0.7225, + "step": 1760 + }, + { + "epoch": 0.09861126665920036, + "grad_norm": 1.0810399055480957, + "learning_rate": 8.795e-06, + "loss": 0.3344, + "step": 1761 + }, + { + "epoch": 0.09866726397132938, + "grad_norm": 1.1854368448257446, + "learning_rate": 8.8e-06, + "loss": 0.3407, + "step": 1762 + }, + { + "epoch": 0.09872326128345839, + "grad_norm": 1.2420506477355957, + "learning_rate": 8.805000000000001e-06, + "loss": 0.415, + "step": 1763 + }, + { + "epoch": 0.09877925859558741, + "grad_norm": 1.075506329536438, + "learning_rate": 8.81e-06, + "loss": 0.4851, + "step": 1764 + }, + { + "epoch": 0.09883525590771643, + "grad_norm": 1.1143125295639038, + "learning_rate": 8.815000000000001e-06, + "loss": 0.4585, + "step": 1765 + }, + { + "epoch": 0.09889125321984545, + "grad_norm": 1.1370161771774292, + "learning_rate": 8.82e-06, + "loss": 0.3993, + "step": 1766 + }, + { + "epoch": 0.09894725053197446, + "grad_norm": 1.1423094272613525, + "learning_rate": 8.825e-06, + "loss": 0.3881, + "step": 1767 + }, + { + "epoch": 0.09900324784410348, + "grad_norm": 1.2995854616165161, + "learning_rate": 8.83e-06, + "loss": 0.4175, + "step": 1768 + }, + { + "epoch": 0.0990592451562325, + "grad_norm": 1.3148881196975708, + "learning_rate": 8.835000000000001e-06, + "loss": 0.6037, + "step": 1769 + }, + { + "epoch": 0.09911524246836152, + "grad_norm": 1.084141731262207, + "learning_rate": 8.840000000000002e-06, + "loss": 0.3905, + "step": 1770 + }, + { + "epoch": 0.09917123978049054, + "grad_norm": 1.2053523063659668, + "learning_rate": 8.845000000000001e-06, + "loss": 0.3216, + "step": 1771 + }, + { + "epoch": 0.09922723709261956, + "grad_norm": 1.2783927917480469, + "learning_rate": 8.85e-06, + "loss": 0.4215, + "step": 1772 + }, + { + "epoch": 0.09928323440474857, + "grad_norm": 1.163581132888794, + "learning_rate": 8.855e-06, + "loss": 0.5082, + "step": 1773 + }, + { + "epoch": 0.09933923171687759, + "grad_norm": 1.0954021215438843, + 
"learning_rate": 8.86e-06, + "loss": 0.372, + "step": 1774 + }, + { + "epoch": 0.0993952290290066, + "grad_norm": 1.236472487449646, + "learning_rate": 8.865e-06, + "loss": 0.3974, + "step": 1775 + }, + { + "epoch": 0.09945122634113562, + "grad_norm": 1.2931112051010132, + "learning_rate": 8.87e-06, + "loss": 0.43, + "step": 1776 + }, + { + "epoch": 0.09950722365326464, + "grad_norm": 1.1061400175094604, + "learning_rate": 8.875e-06, + "loss": 0.4575, + "step": 1777 + }, + { + "epoch": 0.09956322096539366, + "grad_norm": 1.1908855438232422, + "learning_rate": 8.880000000000001e-06, + "loss": 0.4363, + "step": 1778 + }, + { + "epoch": 0.09961921827752268, + "grad_norm": 1.2525837421417236, + "learning_rate": 8.885e-06, + "loss": 0.5029, + "step": 1779 + }, + { + "epoch": 0.0996752155896517, + "grad_norm": 1.2395910024642944, + "learning_rate": 8.890000000000001e-06, + "loss": 0.4891, + "step": 1780 + }, + { + "epoch": 0.09973121290178072, + "grad_norm": 1.4050637483596802, + "learning_rate": 8.895e-06, + "loss": 0.5142, + "step": 1781 + }, + { + "epoch": 0.09978721021390974, + "grad_norm": 1.4121829271316528, + "learning_rate": 8.9e-06, + "loss": 0.6583, + "step": 1782 + }, + { + "epoch": 0.09984320752603874, + "grad_norm": 1.1940996646881104, + "learning_rate": 8.905e-06, + "loss": 0.4022, + "step": 1783 + }, + { + "epoch": 0.09989920483816776, + "grad_norm": 1.557766318321228, + "learning_rate": 8.910000000000001e-06, + "loss": 0.6801, + "step": 1784 + }, + { + "epoch": 0.09995520215029678, + "grad_norm": 1.3310229778289795, + "learning_rate": 8.915e-06, + "loss": 0.3957, + "step": 1785 + }, + { + "epoch": 0.1000111994624258, + "grad_norm": 1.4797648191452026, + "learning_rate": 8.920000000000001e-06, + "loss": 0.5639, + "step": 1786 + }, + { + "epoch": 0.10006719677455482, + "grad_norm": 0.9999483227729797, + "learning_rate": 8.925e-06, + "loss": 0.4347, + "step": 1787 + }, + { + "epoch": 0.10012319408668384, + "grad_norm": 1.196457862854004, + "learning_rate": 
8.930000000000001e-06, + "loss": 0.5074, + "step": 1788 + }, + { + "epoch": 0.10017919139881286, + "grad_norm": 1.206497073173523, + "learning_rate": 8.935e-06, + "loss": 0.2889, + "step": 1789 + }, + { + "epoch": 0.10023518871094188, + "grad_norm": 1.3581793308258057, + "learning_rate": 8.939999999999999e-06, + "loss": 0.3974, + "step": 1790 + }, + { + "epoch": 0.1002911860230709, + "grad_norm": 1.3118083477020264, + "learning_rate": 8.945e-06, + "loss": 0.5182, + "step": 1791 + }, + { + "epoch": 0.10034718333519992, + "grad_norm": 1.631662368774414, + "learning_rate": 8.95e-06, + "loss": 0.4934, + "step": 1792 + }, + { + "epoch": 0.10040318064732894, + "grad_norm": 1.0491358041763306, + "learning_rate": 8.955000000000002e-06, + "loss": 0.4696, + "step": 1793 + }, + { + "epoch": 0.10045917795945794, + "grad_norm": 1.2039273977279663, + "learning_rate": 8.96e-06, + "loss": 0.5036, + "step": 1794 + }, + { + "epoch": 0.10051517527158696, + "grad_norm": 1.1414377689361572, + "learning_rate": 8.965e-06, + "loss": 0.5919, + "step": 1795 + }, + { + "epoch": 0.10057117258371598, + "grad_norm": 1.2074891328811646, + "learning_rate": 8.97e-06, + "loss": 0.5084, + "step": 1796 + }, + { + "epoch": 0.100627169895845, + "grad_norm": 1.3807848691940308, + "learning_rate": 8.975e-06, + "loss": 0.373, + "step": 1797 + }, + { + "epoch": 0.10068316720797402, + "grad_norm": 1.1689538955688477, + "learning_rate": 8.98e-06, + "loss": 0.4797, + "step": 1798 + }, + { + "epoch": 0.10073916452010304, + "grad_norm": 1.133975625038147, + "learning_rate": 8.985e-06, + "loss": 0.4301, + "step": 1799 + }, + { + "epoch": 0.10079516183223206, + "grad_norm": 1.4214868545532227, + "learning_rate": 8.99e-06, + "loss": 0.4385, + "step": 1800 + }, + { + "epoch": 0.10085115914436107, + "grad_norm": 1.0573962926864624, + "learning_rate": 8.995000000000001e-06, + "loss": 0.3093, + "step": 1801 + }, + { + "epoch": 0.1009071564564901, + "grad_norm": 1.1686052083969116, + "learning_rate": 9e-06, + "loss": 
0.4314, + "step": 1802 + }, + { + "epoch": 0.10096315376861911, + "grad_norm": 1.1324526071548462, + "learning_rate": 9.005000000000001e-06, + "loss": 0.4602, + "step": 1803 + }, + { + "epoch": 0.10101915108074812, + "grad_norm": 1.3244366645812988, + "learning_rate": 9.01e-06, + "loss": 0.4623, + "step": 1804 + }, + { + "epoch": 0.10107514839287714, + "grad_norm": 1.2081764936447144, + "learning_rate": 9.015e-06, + "loss": 0.6177, + "step": 1805 + }, + { + "epoch": 0.10113114570500616, + "grad_norm": 1.2845309972763062, + "learning_rate": 9.02e-06, + "loss": 0.4678, + "step": 1806 + }, + { + "epoch": 0.10118714301713518, + "grad_norm": 1.27948796749115, + "learning_rate": 9.025e-06, + "loss": 0.4194, + "step": 1807 + }, + { + "epoch": 0.1012431403292642, + "grad_norm": 1.3915820121765137, + "learning_rate": 9.030000000000002e-06, + "loss": 0.4707, + "step": 1808 + }, + { + "epoch": 0.10129913764139321, + "grad_norm": 1.4097074270248413, + "learning_rate": 9.035e-06, + "loss": 0.5651, + "step": 1809 + }, + { + "epoch": 0.10135513495352223, + "grad_norm": 1.1681487560272217, + "learning_rate": 9.04e-06, + "loss": 0.4335, + "step": 1810 + }, + { + "epoch": 0.10141113226565125, + "grad_norm": 1.217890977859497, + "learning_rate": 9.045e-06, + "loss": 0.522, + "step": 1811 + }, + { + "epoch": 0.10146712957778027, + "grad_norm": 1.242827296257019, + "learning_rate": 9.05e-06, + "loss": 0.5948, + "step": 1812 + }, + { + "epoch": 0.10152312688990929, + "grad_norm": 1.3299055099487305, + "learning_rate": 9.055e-06, + "loss": 0.656, + "step": 1813 + }, + { + "epoch": 0.1015791242020383, + "grad_norm": 1.1744288206100464, + "learning_rate": 9.06e-06, + "loss": 0.4626, + "step": 1814 + }, + { + "epoch": 0.10163512151416731, + "grad_norm": 1.298693299293518, + "learning_rate": 9.065e-06, + "loss": 0.5209, + "step": 1815 + }, + { + "epoch": 0.10169111882629633, + "grad_norm": 1.4027745723724365, + "learning_rate": 9.070000000000001e-06, + "loss": 0.4836, + "step": 1816 + }, + { 
+ "epoch": 0.10174711613842535, + "grad_norm": 1.2797088623046875, + "learning_rate": 9.075e-06, + "loss": 0.5703, + "step": 1817 + }, + { + "epoch": 0.10180311345055437, + "grad_norm": 1.4140253067016602, + "learning_rate": 9.080000000000001e-06, + "loss": 0.5811, + "step": 1818 + }, + { + "epoch": 0.10185911076268339, + "grad_norm": 1.6313427686691284, + "learning_rate": 9.085e-06, + "loss": 0.5481, + "step": 1819 + }, + { + "epoch": 0.10191510807481241, + "grad_norm": 1.1847879886627197, + "learning_rate": 9.09e-06, + "loss": 0.4082, + "step": 1820 + }, + { + "epoch": 0.10197110538694143, + "grad_norm": 1.5288350582122803, + "learning_rate": 9.095e-06, + "loss": 0.4765, + "step": 1821 + }, + { + "epoch": 0.10202710269907045, + "grad_norm": 1.3034580945968628, + "learning_rate": 9.100000000000001e-06, + "loss": 0.3948, + "step": 1822 + }, + { + "epoch": 0.10208310001119947, + "grad_norm": 1.2626841068267822, + "learning_rate": 9.105000000000002e-06, + "loss": 0.4911, + "step": 1823 + }, + { + "epoch": 0.10213909732332849, + "grad_norm": 1.275468111038208, + "learning_rate": 9.110000000000001e-06, + "loss": 0.4396, + "step": 1824 + }, + { + "epoch": 0.10219509463545749, + "grad_norm": 1.1918511390686035, + "learning_rate": 9.115e-06, + "loss": 0.4379, + "step": 1825 + }, + { + "epoch": 0.10225109194758651, + "grad_norm": 1.9813170433044434, + "learning_rate": 9.12e-06, + "loss": 0.7549, + "step": 1826 + }, + { + "epoch": 0.10230708925971553, + "grad_norm": 1.3841813802719116, + "learning_rate": 9.125e-06, + "loss": 0.4712, + "step": 1827 + }, + { + "epoch": 0.10236308657184455, + "grad_norm": 1.1758801937103271, + "learning_rate": 9.13e-06, + "loss": 0.5183, + "step": 1828 + }, + { + "epoch": 0.10241908388397357, + "grad_norm": 1.1382752656936646, + "learning_rate": 9.135e-06, + "loss": 0.3649, + "step": 1829 + }, + { + "epoch": 0.10247508119610259, + "grad_norm": 1.2009247541427612, + "learning_rate": 9.14e-06, + "loss": 0.4377, + "step": 1830 + }, + { + "epoch": 
0.10253107850823161, + "grad_norm": 1.2643797397613525, + "learning_rate": 9.145000000000001e-06, + "loss": 0.4512, + "step": 1831 + }, + { + "epoch": 0.10258707582036063, + "grad_norm": 1.1410797834396362, + "learning_rate": 9.15e-06, + "loss": 0.4931, + "step": 1832 + }, + { + "epoch": 0.10264307313248965, + "grad_norm": 1.101846694946289, + "learning_rate": 9.155000000000001e-06, + "loss": 0.5352, + "step": 1833 + }, + { + "epoch": 0.10269907044461866, + "grad_norm": 1.570682168006897, + "learning_rate": 9.16e-06, + "loss": 0.4423, + "step": 1834 + }, + { + "epoch": 0.10275506775674767, + "grad_norm": 1.3165384531021118, + "learning_rate": 9.165e-06, + "loss": 0.4251, + "step": 1835 + }, + { + "epoch": 0.10281106506887669, + "grad_norm": 1.0519487857818604, + "learning_rate": 9.17e-06, + "loss": 0.4528, + "step": 1836 + }, + { + "epoch": 0.10286706238100571, + "grad_norm": 1.3688673973083496, + "learning_rate": 9.175000000000001e-06, + "loss": 0.4195, + "step": 1837 + }, + { + "epoch": 0.10292305969313473, + "grad_norm": 1.401411533355713, + "learning_rate": 9.180000000000002e-06, + "loss": 0.55, + "step": 1838 + }, + { + "epoch": 0.10297905700526375, + "grad_norm": 1.0831941366195679, + "learning_rate": 9.185000000000001e-06, + "loss": 0.3569, + "step": 1839 + }, + { + "epoch": 0.10303505431739277, + "grad_norm": 1.2040417194366455, + "learning_rate": 9.19e-06, + "loss": 0.4522, + "step": 1840 + }, + { + "epoch": 0.10309105162952178, + "grad_norm": 1.1933237314224243, + "learning_rate": 9.195000000000001e-06, + "loss": 0.6391, + "step": 1841 + }, + { + "epoch": 0.1031470489416508, + "grad_norm": 1.2009390592575073, + "learning_rate": 9.2e-06, + "loss": 0.5783, + "step": 1842 + }, + { + "epoch": 0.10320304625377982, + "grad_norm": 1.155548095703125, + "learning_rate": 9.205e-06, + "loss": 0.3664, + "step": 1843 + }, + { + "epoch": 0.10325904356590884, + "grad_norm": 1.0223307609558105, + "learning_rate": 9.21e-06, + "loss": 0.3759, + "step": 1844 + }, + { + 
"epoch": 0.10331504087803785, + "grad_norm": 1.281187891960144, + "learning_rate": 9.215e-06, + "loss": 0.43, + "step": 1845 + }, + { + "epoch": 0.10337103819016687, + "grad_norm": 1.2086399793624878, + "learning_rate": 9.220000000000002e-06, + "loss": 0.3932, + "step": 1846 + }, + { + "epoch": 0.10342703550229589, + "grad_norm": 1.6127561330795288, + "learning_rate": 9.225e-06, + "loss": 0.5527, + "step": 1847 + }, + { + "epoch": 0.1034830328144249, + "grad_norm": 1.1773858070373535, + "learning_rate": 9.23e-06, + "loss": 0.4325, + "step": 1848 + }, + { + "epoch": 0.10353903012655392, + "grad_norm": 1.2475886344909668, + "learning_rate": 9.235e-06, + "loss": 0.4141, + "step": 1849 + }, + { + "epoch": 0.10359502743868294, + "grad_norm": 1.3503400087356567, + "learning_rate": 9.24e-06, + "loss": 0.5307, + "step": 1850 + }, + { + "epoch": 0.10365102475081196, + "grad_norm": 1.1808948516845703, + "learning_rate": 9.245e-06, + "loss": 0.5951, + "step": 1851 + }, + { + "epoch": 0.10370702206294098, + "grad_norm": 1.2310532331466675, + "learning_rate": 9.25e-06, + "loss": 0.5327, + "step": 1852 + }, + { + "epoch": 0.10376301937507, + "grad_norm": 1.1280689239501953, + "learning_rate": 9.255e-06, + "loss": 0.4972, + "step": 1853 + }, + { + "epoch": 0.10381901668719902, + "grad_norm": 1.2415132522583008, + "learning_rate": 9.260000000000001e-06, + "loss": 0.5418, + "step": 1854 + }, + { + "epoch": 0.10387501399932804, + "grad_norm": 1.0911405086517334, + "learning_rate": 9.265e-06, + "loss": 0.402, + "step": 1855 + }, + { + "epoch": 0.10393101131145704, + "grad_norm": 1.1728242635726929, + "learning_rate": 9.270000000000001e-06, + "loss": 0.3286, + "step": 1856 + }, + { + "epoch": 0.10398700862358606, + "grad_norm": 1.2124569416046143, + "learning_rate": 9.275e-06, + "loss": 0.4761, + "step": 1857 + }, + { + "epoch": 0.10404300593571508, + "grad_norm": 1.1276865005493164, + "learning_rate": 9.28e-06, + "loss": 0.3177, + "step": 1858 + }, + { + "epoch": 0.1040990032478441, 
+ "grad_norm": 1.0694518089294434, + "learning_rate": 9.285e-06, + "loss": 0.2832, + "step": 1859 + }, + { + "epoch": 0.10415500055997312, + "grad_norm": 1.132310390472412, + "learning_rate": 9.29e-06, + "loss": 0.4409, + "step": 1860 + }, + { + "epoch": 0.10421099787210214, + "grad_norm": 1.23562490940094, + "learning_rate": 9.295000000000002e-06, + "loss": 0.4543, + "step": 1861 + }, + { + "epoch": 0.10426699518423116, + "grad_norm": 1.2050089836120605, + "learning_rate": 9.3e-06, + "loss": 0.4191, + "step": 1862 + }, + { + "epoch": 0.10432299249636018, + "grad_norm": 1.4085443019866943, + "learning_rate": 9.305e-06, + "loss": 0.4678, + "step": 1863 + }, + { + "epoch": 0.1043789898084892, + "grad_norm": 1.2895890474319458, + "learning_rate": 9.31e-06, + "loss": 0.5659, + "step": 1864 + }, + { + "epoch": 0.10443498712061822, + "grad_norm": 1.4111061096191406, + "learning_rate": 9.315e-06, + "loss": 0.3764, + "step": 1865 + }, + { + "epoch": 0.10449098443274722, + "grad_norm": 1.1714211702346802, + "learning_rate": 9.32e-06, + "loss": 0.4289, + "step": 1866 + }, + { + "epoch": 0.10454698174487624, + "grad_norm": 1.271815299987793, + "learning_rate": 9.325e-06, + "loss": 0.4254, + "step": 1867 + }, + { + "epoch": 0.10460297905700526, + "grad_norm": 1.2217464447021484, + "learning_rate": 9.33e-06, + "loss": 0.4228, + "step": 1868 + }, + { + "epoch": 0.10465897636913428, + "grad_norm": 1.1268996000289917, + "learning_rate": 9.335000000000001e-06, + "loss": 0.4438, + "step": 1869 + }, + { + "epoch": 0.1047149736812633, + "grad_norm": 1.1250314712524414, + "learning_rate": 9.34e-06, + "loss": 0.3935, + "step": 1870 + }, + { + "epoch": 0.10477097099339232, + "grad_norm": 1.4297596216201782, + "learning_rate": 9.345000000000001e-06, + "loss": 0.5209, + "step": 1871 + }, + { + "epoch": 0.10482696830552134, + "grad_norm": 1.139918565750122, + "learning_rate": 9.35e-06, + "loss": 0.4825, + "step": 1872 + }, + { + "epoch": 0.10488296561765036, + "grad_norm": 
1.2917416095733643, + "learning_rate": 9.355e-06, + "loss": 0.5089, + "step": 1873 + }, + { + "epoch": 0.10493896292977938, + "grad_norm": 1.4938721656799316, + "learning_rate": 9.36e-06, + "loss": 0.5283, + "step": 1874 + }, + { + "epoch": 0.1049949602419084, + "grad_norm": 1.5548522472381592, + "learning_rate": 9.365000000000001e-06, + "loss": 0.4841, + "step": 1875 + }, + { + "epoch": 0.1050509575540374, + "grad_norm": 1.1434475183486938, + "learning_rate": 9.370000000000002e-06, + "loss": 0.4877, + "step": 1876 + }, + { + "epoch": 0.10510695486616642, + "grad_norm": 2.284370183944702, + "learning_rate": 9.375000000000001e-06, + "loss": 0.4799, + "step": 1877 + }, + { + "epoch": 0.10516295217829544, + "grad_norm": 1.4365899562835693, + "learning_rate": 9.38e-06, + "loss": 0.5354, + "step": 1878 + }, + { + "epoch": 0.10521894949042446, + "grad_norm": 1.1886478662490845, + "learning_rate": 9.385e-06, + "loss": 0.4485, + "step": 1879 + }, + { + "epoch": 0.10527494680255348, + "grad_norm": 1.3608007431030273, + "learning_rate": 9.39e-06, + "loss": 0.4481, + "step": 1880 + }, + { + "epoch": 0.1053309441146825, + "grad_norm": 1.356736421585083, + "learning_rate": 9.395e-06, + "loss": 0.5918, + "step": 1881 + }, + { + "epoch": 0.10538694142681151, + "grad_norm": 1.3387682437896729, + "learning_rate": 9.4e-06, + "loss": 0.5101, + "step": 1882 + }, + { + "epoch": 0.10544293873894053, + "grad_norm": 1.227960467338562, + "learning_rate": 9.405e-06, + "loss": 0.4004, + "step": 1883 + }, + { + "epoch": 0.10549893605106955, + "grad_norm": 1.068472146987915, + "learning_rate": 9.410000000000001e-06, + "loss": 0.4885, + "step": 1884 + }, + { + "epoch": 0.10555493336319857, + "grad_norm": 1.46451735496521, + "learning_rate": 9.415e-06, + "loss": 0.4849, + "step": 1885 + }, + { + "epoch": 0.10561093067532759, + "grad_norm": 1.1203140020370483, + "learning_rate": 9.420000000000001e-06, + "loss": 0.3321, + "step": 1886 + }, + { + "epoch": 0.1056669279874566, + "grad_norm": 
1.3179823160171509, + "learning_rate": 9.425e-06, + "loss": 0.3947, + "step": 1887 + }, + { + "epoch": 0.10572292529958562, + "grad_norm": 1.2362351417541504, + "learning_rate": 9.43e-06, + "loss": 0.4712, + "step": 1888 + }, + { + "epoch": 0.10577892261171463, + "grad_norm": 1.349965214729309, + "learning_rate": 9.435e-06, + "loss": 0.4748, + "step": 1889 + }, + { + "epoch": 0.10583491992384365, + "grad_norm": 1.158205509185791, + "learning_rate": 9.44e-06, + "loss": 0.3502, + "step": 1890 + }, + { + "epoch": 0.10589091723597267, + "grad_norm": 1.2987258434295654, + "learning_rate": 9.445000000000002e-06, + "loss": 0.4718, + "step": 1891 + }, + { + "epoch": 0.10594691454810169, + "grad_norm": 1.2764886617660522, + "learning_rate": 9.450000000000001e-06, + "loss": 0.4185, + "step": 1892 + }, + { + "epoch": 0.10600291186023071, + "grad_norm": 1.5539253950119019, + "learning_rate": 9.455e-06, + "loss": 0.4658, + "step": 1893 + }, + { + "epoch": 0.10605890917235973, + "grad_norm": 1.2473183870315552, + "learning_rate": 9.460000000000001e-06, + "loss": 0.3696, + "step": 1894 + }, + { + "epoch": 0.10611490648448875, + "grad_norm": 1.3843179941177368, + "learning_rate": 9.465e-06, + "loss": 0.4765, + "step": 1895 + }, + { + "epoch": 0.10617090379661777, + "grad_norm": 1.3900247812271118, + "learning_rate": 9.47e-06, + "loss": 0.606, + "step": 1896 + }, + { + "epoch": 0.10622690110874677, + "grad_norm": 1.1595515012741089, + "learning_rate": 9.475e-06, + "loss": 0.4623, + "step": 1897 + }, + { + "epoch": 0.1062828984208758, + "grad_norm": 2.569775104522705, + "learning_rate": 9.48e-06, + "loss": 0.3973, + "step": 1898 + }, + { + "epoch": 0.10633889573300481, + "grad_norm": 1.197704792022705, + "learning_rate": 9.485000000000002e-06, + "loss": 0.4226, + "step": 1899 + }, + { + "epoch": 0.10639489304513383, + "grad_norm": 2.009443521499634, + "learning_rate": 9.49e-06, + "loss": 0.5467, + "step": 1900 + }, + { + "epoch": 0.10645089035726285, + "grad_norm": 
1.1895710229873657, + "learning_rate": 9.495000000000001e-06, + "loss": 0.4143, + "step": 1901 + }, + { + "epoch": 0.10650688766939187, + "grad_norm": 1.5191071033477783, + "learning_rate": 9.5e-06, + "loss": 0.4761, + "step": 1902 + }, + { + "epoch": 0.10656288498152089, + "grad_norm": 1.3583176136016846, + "learning_rate": 9.505e-06, + "loss": 0.5819, + "step": 1903 + }, + { + "epoch": 0.10661888229364991, + "grad_norm": 1.112642526626587, + "learning_rate": 9.51e-06, + "loss": 0.4729, + "step": 1904 + }, + { + "epoch": 0.10667487960577893, + "grad_norm": 1.0252685546875, + "learning_rate": 9.515e-06, + "loss": 0.4811, + "step": 1905 + }, + { + "epoch": 0.10673087691790795, + "grad_norm": 1.255096673965454, + "learning_rate": 9.52e-06, + "loss": 0.441, + "step": 1906 + }, + { + "epoch": 0.10678687423003695, + "grad_norm": 1.3634824752807617, + "learning_rate": 9.525000000000001e-06, + "loss": 0.4749, + "step": 1907 + }, + { + "epoch": 0.10684287154216597, + "grad_norm": 1.1139448881149292, + "learning_rate": 9.53e-06, + "loss": 0.3083, + "step": 1908 + }, + { + "epoch": 0.10689886885429499, + "grad_norm": 1.3352259397506714, + "learning_rate": 9.535000000000001e-06, + "loss": 0.3185, + "step": 1909 + }, + { + "epoch": 0.10695486616642401, + "grad_norm": 1.2440032958984375, + "learning_rate": 9.54e-06, + "loss": 0.474, + "step": 1910 + }, + { + "epoch": 0.10701086347855303, + "grad_norm": 1.1798661947250366, + "learning_rate": 9.545e-06, + "loss": 0.4714, + "step": 1911 + }, + { + "epoch": 0.10706686079068205, + "grad_norm": 1.2166504859924316, + "learning_rate": 9.55e-06, + "loss": 0.5616, + "step": 1912 + }, + { + "epoch": 0.10712285810281107, + "grad_norm": 1.2180215120315552, + "learning_rate": 9.555e-06, + "loss": 0.4465, + "step": 1913 + }, + { + "epoch": 0.10717885541494009, + "grad_norm": 1.3125149011611938, + "learning_rate": 9.560000000000002e-06, + "loss": 0.5186, + "step": 1914 + }, + { + "epoch": 0.1072348527270691, + "grad_norm": 1.1238898038864136, 
+ "learning_rate": 9.565e-06, + "loss": 0.3985, + "step": 1915 + }, + { + "epoch": 0.10729085003919812, + "grad_norm": 1.2942389249801636, + "learning_rate": 9.57e-06, + "loss": 0.4492, + "step": 1916 + }, + { + "epoch": 0.10734684735132714, + "grad_norm": 1.153488039970398, + "learning_rate": 9.575e-06, + "loss": 0.5233, + "step": 1917 + }, + { + "epoch": 0.10740284466345615, + "grad_norm": 1.1370173692703247, + "learning_rate": 9.58e-06, + "loss": 0.4421, + "step": 1918 + }, + { + "epoch": 0.10745884197558517, + "grad_norm": 1.3307583332061768, + "learning_rate": 9.585e-06, + "loss": 0.4855, + "step": 1919 + }, + { + "epoch": 0.10751483928771419, + "grad_norm": 1.103548526763916, + "learning_rate": 9.59e-06, + "loss": 0.39, + "step": 1920 + }, + { + "epoch": 0.1075708365998432, + "grad_norm": 1.2443008422851562, + "learning_rate": 9.595e-06, + "loss": 0.5393, + "step": 1921 + }, + { + "epoch": 0.10762683391197223, + "grad_norm": 1.0949368476867676, + "learning_rate": 9.600000000000001e-06, + "loss": 0.3842, + "step": 1922 + }, + { + "epoch": 0.10768283122410124, + "grad_norm": 1.3167270421981812, + "learning_rate": 9.605e-06, + "loss": 0.4478, + "step": 1923 + }, + { + "epoch": 0.10773882853623026, + "grad_norm": 1.3839845657348633, + "learning_rate": 9.610000000000001e-06, + "loss": 0.3765, + "step": 1924 + }, + { + "epoch": 0.10779482584835928, + "grad_norm": 1.3814969062805176, + "learning_rate": 9.615e-06, + "loss": 0.5044, + "step": 1925 + }, + { + "epoch": 0.1078508231604883, + "grad_norm": 1.2561522722244263, + "learning_rate": 9.62e-06, + "loss": 0.4028, + "step": 1926 + }, + { + "epoch": 0.10790682047261732, + "grad_norm": 1.1426435708999634, + "learning_rate": 9.625e-06, + "loss": 0.3741, + "step": 1927 + }, + { + "epoch": 0.10796281778474633, + "grad_norm": 1.2376185655593872, + "learning_rate": 9.630000000000001e-06, + "loss": 0.639, + "step": 1928 + }, + { + "epoch": 0.10801881509687535, + "grad_norm": 1.2048944234848022, + "learning_rate": 
9.635000000000002e-06, + "loss": 0.4114, + "step": 1929 + }, + { + "epoch": 0.10807481240900436, + "grad_norm": 1.0896660089492798, + "learning_rate": 9.640000000000001e-06, + "loss": 0.4479, + "step": 1930 + }, + { + "epoch": 0.10813080972113338, + "grad_norm": 1.411189079284668, + "learning_rate": 9.645e-06, + "loss": 0.4999, + "step": 1931 + }, + { + "epoch": 0.1081868070332624, + "grad_norm": 1.2659348249435425, + "learning_rate": 9.65e-06, + "loss": 0.5425, + "step": 1932 + }, + { + "epoch": 0.10824280434539142, + "grad_norm": 3.3288135528564453, + "learning_rate": 9.655e-06, + "loss": 0.3585, + "step": 1933 + }, + { + "epoch": 0.10829880165752044, + "grad_norm": 1.1483445167541504, + "learning_rate": 9.66e-06, + "loss": 0.4003, + "step": 1934 + }, + { + "epoch": 0.10835479896964946, + "grad_norm": 1.2844260931015015, + "learning_rate": 9.665e-06, + "loss": 0.5146, + "step": 1935 + }, + { + "epoch": 0.10841079628177848, + "grad_norm": 1.4117387533187866, + "learning_rate": 9.67e-06, + "loss": 0.5105, + "step": 1936 + }, + { + "epoch": 0.1084667935939075, + "grad_norm": 1.4861122369766235, + "learning_rate": 9.675000000000001e-06, + "loss": 0.5692, + "step": 1937 + }, + { + "epoch": 0.1085227909060365, + "grad_norm": 1.534714937210083, + "learning_rate": 9.68e-06, + "loss": 0.4468, + "step": 1938 + }, + { + "epoch": 0.10857878821816552, + "grad_norm": 1.1656982898712158, + "learning_rate": 9.685000000000001e-06, + "loss": 0.4292, + "step": 1939 + }, + { + "epoch": 0.10863478553029454, + "grad_norm": 1.238632321357727, + "learning_rate": 9.69e-06, + "loss": 0.4331, + "step": 1940 + }, + { + "epoch": 0.10869078284242356, + "grad_norm": 1.5374995470046997, + "learning_rate": 9.695e-06, + "loss": 0.6631, + "step": 1941 + }, + { + "epoch": 0.10874678015455258, + "grad_norm": 1.3231433629989624, + "learning_rate": 9.7e-06, + "loss": 0.3684, + "step": 1942 + }, + { + "epoch": 0.1088027774666816, + "grad_norm": 1.2549045085906982, + "learning_rate": 9.705e-06, + 
"loss": 0.3888, + "step": 1943 + }, + { + "epoch": 0.10885877477881062, + "grad_norm": 0.9933781623840332, + "learning_rate": 9.71e-06, + "loss": 0.4307, + "step": 1944 + }, + { + "epoch": 0.10891477209093964, + "grad_norm": 1.3064879179000854, + "learning_rate": 9.715000000000001e-06, + "loss": 0.495, + "step": 1945 + }, + { + "epoch": 0.10897076940306866, + "grad_norm": 1.2886368036270142, + "learning_rate": 9.72e-06, + "loss": 0.4827, + "step": 1946 + }, + { + "epoch": 0.10902676671519768, + "grad_norm": 1.11863112449646, + "learning_rate": 9.725000000000001e-06, + "loss": 0.39, + "step": 1947 + }, + { + "epoch": 0.1090827640273267, + "grad_norm": 1.2331268787384033, + "learning_rate": 9.73e-06, + "loss": 0.4058, + "step": 1948 + }, + { + "epoch": 0.1091387613394557, + "grad_norm": 4.416565895080566, + "learning_rate": 9.735e-06, + "loss": 0.5548, + "step": 1949 + }, + { + "epoch": 0.10919475865158472, + "grad_norm": 1.183388113975525, + "learning_rate": 9.74e-06, + "loss": 0.4693, + "step": 1950 + }, + { + "epoch": 0.10925075596371374, + "grad_norm": 1.4820784330368042, + "learning_rate": 9.745e-06, + "loss": 0.4546, + "step": 1951 + }, + { + "epoch": 0.10930675327584276, + "grad_norm": 1.2141631841659546, + "learning_rate": 9.750000000000002e-06, + "loss": 0.4229, + "step": 1952 + }, + { + "epoch": 0.10936275058797178, + "grad_norm": 1.192245602607727, + "learning_rate": 9.755e-06, + "loss": 0.4096, + "step": 1953 + }, + { + "epoch": 0.1094187479001008, + "grad_norm": 1.2988064289093018, + "learning_rate": 9.760000000000001e-06, + "loss": 0.5635, + "step": 1954 + }, + { + "epoch": 0.10947474521222982, + "grad_norm": 1.2981138229370117, + "learning_rate": 9.765e-06, + "loss": 0.4759, + "step": 1955 + }, + { + "epoch": 0.10953074252435883, + "grad_norm": 1.0608023405075073, + "learning_rate": 9.77e-06, + "loss": 0.4461, + "step": 1956 + }, + { + "epoch": 0.10958673983648785, + "grad_norm": 1.3462153673171997, + "learning_rate": 9.775e-06, + "loss": 0.4433, + 
"step": 1957 + }, + { + "epoch": 0.10964273714861687, + "grad_norm": 1.4526351690292358, + "learning_rate": 9.78e-06, + "loss": 0.4635, + "step": 1958 + }, + { + "epoch": 0.10969873446074588, + "grad_norm": 1.1988767385482788, + "learning_rate": 9.785e-06, + "loss": 0.4356, + "step": 1959 + }, + { + "epoch": 0.1097547317728749, + "grad_norm": 1.0474293231964111, + "learning_rate": 9.790000000000001e-06, + "loss": 0.3663, + "step": 1960 + }, + { + "epoch": 0.10981072908500392, + "grad_norm": 1.190447211265564, + "learning_rate": 9.795e-06, + "loss": 0.459, + "step": 1961 + }, + { + "epoch": 0.10986672639713294, + "grad_norm": 1.1437945365905762, + "learning_rate": 9.800000000000001e-06, + "loss": 0.4235, + "step": 1962 + }, + { + "epoch": 0.10992272370926195, + "grad_norm": 1.2026954889297485, + "learning_rate": 9.805e-06, + "loss": 0.5103, + "step": 1963 + }, + { + "epoch": 0.10997872102139097, + "grad_norm": 1.231103539466858, + "learning_rate": 9.810000000000001e-06, + "loss": 0.5811, + "step": 1964 + }, + { + "epoch": 0.11003471833351999, + "grad_norm": 1.1028512716293335, + "learning_rate": 9.815e-06, + "loss": 0.381, + "step": 1965 + }, + { + "epoch": 0.11009071564564901, + "grad_norm": 1.0779800415039062, + "learning_rate": 9.820000000000001e-06, + "loss": 0.3742, + "step": 1966 + }, + { + "epoch": 0.11014671295777803, + "grad_norm": 1.243393063545227, + "learning_rate": 9.825000000000002e-06, + "loss": 0.4914, + "step": 1967 + }, + { + "epoch": 0.11020271026990705, + "grad_norm": 1.29071044921875, + "learning_rate": 9.83e-06, + "loss": 0.4967, + "step": 1968 + }, + { + "epoch": 0.11025870758203606, + "grad_norm": 1.126316785812378, + "learning_rate": 9.835000000000002e-06, + "loss": 0.4539, + "step": 1969 + }, + { + "epoch": 0.11031470489416507, + "grad_norm": 1.5042952299118042, + "learning_rate": 9.84e-06, + "loss": 0.6822, + "step": 1970 + }, + { + "epoch": 0.1103707022062941, + "grad_norm": 1.3173643350601196, + "learning_rate": 9.845e-06, + "loss": 
0.4644, + "step": 1971 + }, + { + "epoch": 0.11042669951842311, + "grad_norm": 1.1041350364685059, + "learning_rate": 9.85e-06, + "loss": 0.3427, + "step": 1972 + }, + { + "epoch": 0.11048269683055213, + "grad_norm": 1.0766938924789429, + "learning_rate": 9.855e-06, + "loss": 0.4033, + "step": 1973 + }, + { + "epoch": 0.11053869414268115, + "grad_norm": 1.1862468719482422, + "learning_rate": 9.86e-06, + "loss": 0.477, + "step": 1974 + }, + { + "epoch": 0.11059469145481017, + "grad_norm": 1.4422067403793335, + "learning_rate": 9.865000000000001e-06, + "loss": 0.5174, + "step": 1975 + }, + { + "epoch": 0.11065068876693919, + "grad_norm": 1.438631296157837, + "learning_rate": 9.87e-06, + "loss": 0.4254, + "step": 1976 + }, + { + "epoch": 0.11070668607906821, + "grad_norm": 1.2510143518447876, + "learning_rate": 9.875000000000001e-06, + "loss": 0.4414, + "step": 1977 + }, + { + "epoch": 0.11076268339119723, + "grad_norm": 1.0611276626586914, + "learning_rate": 9.88e-06, + "loss": 0.3732, + "step": 1978 + }, + { + "epoch": 0.11081868070332625, + "grad_norm": 1.4915348291397095, + "learning_rate": 9.885e-06, + "loss": 0.5958, + "step": 1979 + }, + { + "epoch": 0.11087467801545525, + "grad_norm": 1.314444899559021, + "learning_rate": 9.89e-06, + "loss": 0.4871, + "step": 1980 + }, + { + "epoch": 0.11093067532758427, + "grad_norm": 1.2102175951004028, + "learning_rate": 9.895e-06, + "loss": 0.4774, + "step": 1981 + }, + { + "epoch": 0.11098667263971329, + "grad_norm": 1.1883585453033447, + "learning_rate": 9.900000000000002e-06, + "loss": 0.3676, + "step": 1982 + }, + { + "epoch": 0.11104266995184231, + "grad_norm": 1.1065423488616943, + "learning_rate": 9.905000000000001e-06, + "loss": 0.3954, + "step": 1983 + }, + { + "epoch": 0.11109866726397133, + "grad_norm": 1.7444603443145752, + "learning_rate": 9.91e-06, + "loss": 0.7777, + "step": 1984 + }, + { + "epoch": 0.11115466457610035, + "grad_norm": 1.5046581029891968, + "learning_rate": 9.915e-06, + "loss": 0.535, + 
"step": 1985 + }, + { + "epoch": 0.11121066188822937, + "grad_norm": 1.9591126441955566, + "learning_rate": 9.92e-06, + "loss": 0.4007, + "step": 1986 + }, + { + "epoch": 0.11126665920035839, + "grad_norm": 1.109308123588562, + "learning_rate": 9.925e-06, + "loss": 0.4983, + "step": 1987 + }, + { + "epoch": 0.1113226565124874, + "grad_norm": 1.1649729013442993, + "learning_rate": 9.93e-06, + "loss": 0.4802, + "step": 1988 + }, + { + "epoch": 0.11137865382461642, + "grad_norm": 1.3624181747436523, + "learning_rate": 9.935e-06, + "loss": 0.5429, + "step": 1989 + }, + { + "epoch": 0.11143465113674543, + "grad_norm": 1.2421095371246338, + "learning_rate": 9.940000000000001e-06, + "loss": 0.4532, + "step": 1990 + }, + { + "epoch": 0.11149064844887445, + "grad_norm": 1.2075271606445312, + "learning_rate": 9.945e-06, + "loss": 0.4258, + "step": 1991 + }, + { + "epoch": 0.11154664576100347, + "grad_norm": 1.2682304382324219, + "learning_rate": 9.950000000000001e-06, + "loss": 0.5251, + "step": 1992 + }, + { + "epoch": 0.11160264307313249, + "grad_norm": 1.2718613147735596, + "learning_rate": 9.955e-06, + "loss": 0.4165, + "step": 1993 + }, + { + "epoch": 0.1116586403852615, + "grad_norm": 1.4027260541915894, + "learning_rate": 9.96e-06, + "loss": 0.6875, + "step": 1994 + }, + { + "epoch": 0.11171463769739053, + "grad_norm": 1.2574266195297241, + "learning_rate": 9.965e-06, + "loss": 0.4169, + "step": 1995 + }, + { + "epoch": 0.11177063500951955, + "grad_norm": 1.2845044136047363, + "learning_rate": 9.97e-06, + "loss": 0.4741, + "step": 1996 + }, + { + "epoch": 0.11182663232164856, + "grad_norm": 1.383315086364746, + "learning_rate": 9.975e-06, + "loss": 0.5904, + "step": 1997 + }, + { + "epoch": 0.11188262963377758, + "grad_norm": 1.2629704475402832, + "learning_rate": 9.980000000000001e-06, + "loss": 0.4515, + "step": 1998 + }, + { + "epoch": 0.1119386269459066, + "grad_norm": 1.3212579488754272, + "learning_rate": 9.985e-06, + "loss": 0.5825, + "step": 1999 + }, + { + 
"epoch": 0.11199462425803561, + "grad_norm": 1.274903655052185, + "learning_rate": 9.990000000000001e-06, + "loss": 0.5872, + "step": 2000 + }, + { + "epoch": 0.11205062157016463, + "grad_norm": 1.2747188806533813, + "learning_rate": 9.995e-06, + "loss": 0.3708, + "step": 2001 + }, + { + "epoch": 0.11210661888229365, + "grad_norm": 1.472104549407959, + "learning_rate": 1e-05, + "loss": 0.5602, + "step": 2002 + }, + { + "epoch": 0.11216261619442267, + "grad_norm": 1.256666660308838, + "learning_rate": 1.0005e-05, + "loss": 0.4248, + "step": 2003 + }, + { + "epoch": 0.11221861350655168, + "grad_norm": 1.092176079750061, + "learning_rate": 1.001e-05, + "loss": 0.4048, + "step": 2004 + }, + { + "epoch": 0.1122746108186807, + "grad_norm": 1.1835278272628784, + "learning_rate": 1.0015000000000002e-05, + "loss": 0.5522, + "step": 2005 + }, + { + "epoch": 0.11233060813080972, + "grad_norm": 17.31488800048828, + "learning_rate": 1.002e-05, + "loss": 0.354, + "step": 2006 + }, + { + "epoch": 0.11238660544293874, + "grad_norm": 1.2700695991516113, + "learning_rate": 1.0025000000000001e-05, + "loss": 0.6672, + "step": 2007 + }, + { + "epoch": 0.11244260275506776, + "grad_norm": 1.2675044536590576, + "learning_rate": 1.003e-05, + "loss": 0.5727, + "step": 2008 + }, + { + "epoch": 0.11249860006719678, + "grad_norm": 1.354222059249878, + "learning_rate": 1.0035e-05, + "loss": 0.553, + "step": 2009 + }, + { + "epoch": 0.1125545973793258, + "grad_norm": 1.2412958145141602, + "learning_rate": 1.004e-05, + "loss": 0.3739, + "step": 2010 + }, + { + "epoch": 0.1126105946914548, + "grad_norm": 1.1372991800308228, + "learning_rate": 1.0045e-05, + "loss": 0.3767, + "step": 2011 + }, + { + "epoch": 0.11266659200358382, + "grad_norm": 1.4014700651168823, + "learning_rate": 1.005e-05, + "loss": 0.611, + "step": 2012 + }, + { + "epoch": 0.11272258931571284, + "grad_norm": 1.648236632347107, + "learning_rate": 1.0055000000000001e-05, + "loss": 0.546, + "step": 2013 + }, + { + "epoch": 
0.11277858662784186, + "grad_norm": 1.0424588918685913, + "learning_rate": 1.006e-05, + "loss": 0.3135, + "step": 2014 + }, + { + "epoch": 0.11283458393997088, + "grad_norm": 1.1952458620071411, + "learning_rate": 1.0065000000000001e-05, + "loss": 0.4185, + "step": 2015 + }, + { + "epoch": 0.1128905812520999, + "grad_norm": 1.2783581018447876, + "learning_rate": 1.007e-05, + "loss": 0.5532, + "step": 2016 + }, + { + "epoch": 0.11294657856422892, + "grad_norm": 1.3510452508926392, + "learning_rate": 1.0075000000000001e-05, + "loss": 0.3781, + "step": 2017 + }, + { + "epoch": 0.11300257587635794, + "grad_norm": 1.534055471420288, + "learning_rate": 1.008e-05, + "loss": 0.5066, + "step": 2018 + }, + { + "epoch": 0.11305857318848696, + "grad_norm": 1.1314103603363037, + "learning_rate": 1.0085e-05, + "loss": 0.3968, + "step": 2019 + }, + { + "epoch": 0.11311457050061598, + "grad_norm": 1.1797354221343994, + "learning_rate": 1.0090000000000002e-05, + "loss": 0.2986, + "step": 2020 + }, + { + "epoch": 0.11317056781274498, + "grad_norm": 1.1401501893997192, + "learning_rate": 1.0095e-05, + "loss": 0.4593, + "step": 2021 + }, + { + "epoch": 0.113226565124874, + "grad_norm": 1.2980574369430542, + "learning_rate": 1.0100000000000002e-05, + "loss": 0.6112, + "step": 2022 + }, + { + "epoch": 0.11328256243700302, + "grad_norm": 1.022275447845459, + "learning_rate": 1.0105e-05, + "loss": 0.4415, + "step": 2023 + }, + { + "epoch": 0.11333855974913204, + "grad_norm": 1.181807518005371, + "learning_rate": 1.011e-05, + "loss": 0.4308, + "step": 2024 + }, + { + "epoch": 0.11339455706126106, + "grad_norm": 1.1795766353607178, + "learning_rate": 1.0115e-05, + "loss": 0.4049, + "step": 2025 + }, + { + "epoch": 0.11345055437339008, + "grad_norm": 1.262852668762207, + "learning_rate": 1.012e-05, + "loss": 0.4993, + "step": 2026 + }, + { + "epoch": 0.1135065516855191, + "grad_norm": 1.3481334447860718, + "learning_rate": 1.0125e-05, + "loss": 0.5144, + "step": 2027 + }, + { + "epoch": 
0.11356254899764812, + "grad_norm": 1.1121824979782104, + "learning_rate": 1.0130000000000001e-05, + "loss": 0.3884, + "step": 2028 + }, + { + "epoch": 0.11361854630977714, + "grad_norm": 1.3804322481155396, + "learning_rate": 1.0135e-05, + "loss": 0.4579, + "step": 2029 + }, + { + "epoch": 0.11367454362190615, + "grad_norm": 1.271407127380371, + "learning_rate": 1.0140000000000001e-05, + "loss": 0.6424, + "step": 2030 + }, + { + "epoch": 0.11373054093403516, + "grad_norm": 1.1223245859146118, + "learning_rate": 1.0145e-05, + "loss": 0.4934, + "step": 2031 + }, + { + "epoch": 0.11378653824616418, + "grad_norm": 1.3676866292953491, + "learning_rate": 1.0150000000000001e-05, + "loss": 0.4716, + "step": 2032 + }, + { + "epoch": 0.1138425355582932, + "grad_norm": 1.60161292552948, + "learning_rate": 1.0155e-05, + "loss": 0.5407, + "step": 2033 + }, + { + "epoch": 0.11389853287042222, + "grad_norm": 1.5236585140228271, + "learning_rate": 1.016e-05, + "loss": 0.4677, + "step": 2034 + }, + { + "epoch": 0.11395453018255124, + "grad_norm": 1.2690157890319824, + "learning_rate": 1.0165e-05, + "loss": 0.4658, + "step": 2035 + }, + { + "epoch": 0.11401052749468026, + "grad_norm": 1.9310686588287354, + "learning_rate": 1.0170000000000001e-05, + "loss": 0.3959, + "step": 2036 + }, + { + "epoch": 0.11406652480680927, + "grad_norm": 1.4214739799499512, + "learning_rate": 1.0175e-05, + "loss": 0.5339, + "step": 2037 + }, + { + "epoch": 0.1141225221189383, + "grad_norm": 1.6591894626617432, + "learning_rate": 1.018e-05, + "loss": 0.8932, + "step": 2038 + }, + { + "epoch": 0.11417851943106731, + "grad_norm": 1.1825146675109863, + "learning_rate": 1.0185e-05, + "loss": 0.4305, + "step": 2039 + }, + { + "epoch": 0.11423451674319633, + "grad_norm": 1.151872158050537, + "learning_rate": 1.019e-05, + "loss": 0.4143, + "step": 2040 + }, + { + "epoch": 0.11429051405532535, + "grad_norm": 1.2419713735580444, + "learning_rate": 1.0195e-05, + "loss": 0.6054, + "step": 2041 + }, + { + "epoch": 
0.11434651136745436, + "grad_norm": 2.2976205348968506, + "learning_rate": 1.02e-05, + "loss": 0.5601, + "step": 2042 + }, + { + "epoch": 0.11440250867958338, + "grad_norm": 1.0336729288101196, + "learning_rate": 1.0205000000000001e-05, + "loss": 0.3298, + "step": 2043 + }, + { + "epoch": 0.1144585059917124, + "grad_norm": 13.27386474609375, + "learning_rate": 1.021e-05, + "loss": 0.4519, + "step": 2044 + }, + { + "epoch": 0.11451450330384141, + "grad_norm": 1.9197330474853516, + "learning_rate": 1.0215000000000001e-05, + "loss": 0.4625, + "step": 2045 + }, + { + "epoch": 0.11457050061597043, + "grad_norm": 1.1858235597610474, + "learning_rate": 1.022e-05, + "loss": 0.4263, + "step": 2046 + }, + { + "epoch": 0.11462649792809945, + "grad_norm": 1.409785509109497, + "learning_rate": 1.0225e-05, + "loss": 0.5989, + "step": 2047 + }, + { + "epoch": 0.11468249524022847, + "grad_norm": 1.406874656677246, + "learning_rate": 1.023e-05, + "loss": 0.4339, + "step": 2048 + }, + { + "epoch": 0.11473849255235749, + "grad_norm": 2.0559372901916504, + "learning_rate": 1.0235e-05, + "loss": 0.4359, + "step": 2049 + }, + { + "epoch": 0.11479448986448651, + "grad_norm": 1.2511943578720093, + "learning_rate": 1.024e-05, + "loss": 0.4325, + "step": 2050 + }, + { + "epoch": 0.11485048717661553, + "grad_norm": 1.1281684637069702, + "learning_rate": 1.0245000000000001e-05, + "loss": 0.378, + "step": 2051 + }, + { + "epoch": 0.11490648448874453, + "grad_norm": 1.5144761800765991, + "learning_rate": 1.025e-05, + "loss": 0.409, + "step": 2052 + }, + { + "epoch": 0.11496248180087355, + "grad_norm": 1.0558453798294067, + "learning_rate": 1.0255000000000001e-05, + "loss": 0.3365, + "step": 2053 + }, + { + "epoch": 0.11501847911300257, + "grad_norm": 1.1261556148529053, + "learning_rate": 1.026e-05, + "loss": 0.3968, + "step": 2054 + }, + { + "epoch": 0.11507447642513159, + "grad_norm": 1.3315730094909668, + "learning_rate": 1.0265e-05, + "loss": 0.5828, + "step": 2055 + }, + { + "epoch": 
0.11513047373726061, + "grad_norm": 1.191011905670166, + "learning_rate": 1.027e-05, + "loss": 0.5525, + "step": 2056 + }, + { + "epoch": 0.11518647104938963, + "grad_norm": 1.2746089696884155, + "learning_rate": 1.0275e-05, + "loss": 0.4807, + "step": 2057 + }, + { + "epoch": 0.11524246836151865, + "grad_norm": 1.04375159740448, + "learning_rate": 1.0280000000000002e-05, + "loss": 0.3857, + "step": 2058 + }, + { + "epoch": 0.11529846567364767, + "grad_norm": 1.2364025115966797, + "learning_rate": 1.0285e-05, + "loss": 0.4123, + "step": 2059 + }, + { + "epoch": 0.11535446298577669, + "grad_norm": 1.1593568325042725, + "learning_rate": 1.0290000000000001e-05, + "loss": 0.3481, + "step": 2060 + }, + { + "epoch": 0.1154104602979057, + "grad_norm": 1.2940306663513184, + "learning_rate": 1.0295e-05, + "loss": 0.4622, + "step": 2061 + }, + { + "epoch": 0.11546645761003471, + "grad_norm": 1.0955636501312256, + "learning_rate": 1.03e-05, + "loss": 0.3879, + "step": 2062 + }, + { + "epoch": 0.11552245492216373, + "grad_norm": 1.4514633417129517, + "learning_rate": 1.0305e-05, + "loss": 0.3795, + "step": 2063 + }, + { + "epoch": 0.11557845223429275, + "grad_norm": 1.148901104927063, + "learning_rate": 1.031e-05, + "loss": 0.4643, + "step": 2064 + }, + { + "epoch": 0.11563444954642177, + "grad_norm": 1.4800533056259155, + "learning_rate": 1.0315e-05, + "loss": 0.4183, + "step": 2065 + }, + { + "epoch": 0.11569044685855079, + "grad_norm": 1.5105128288269043, + "learning_rate": 1.0320000000000001e-05, + "loss": 0.6919, + "step": 2066 + }, + { + "epoch": 0.11574644417067981, + "grad_norm": 1.1953701972961426, + "learning_rate": 1.0325e-05, + "loss": 0.4238, + "step": 2067 + }, + { + "epoch": 0.11580244148280883, + "grad_norm": 1.1766806840896606, + "learning_rate": 1.0330000000000001e-05, + "loss": 0.3962, + "step": 2068 + }, + { + "epoch": 0.11585843879493785, + "grad_norm": 1.2753911018371582, + "learning_rate": 1.0335e-05, + "loss": 0.5049, + "step": 2069 + }, + { + "epoch": 
0.11591443610706686, + "grad_norm": 1.0954228639602661, + "learning_rate": 1.0340000000000001e-05, + "loss": 0.4322, + "step": 2070 + }, + { + "epoch": 0.11597043341919588, + "grad_norm": 1.5553898811340332, + "learning_rate": 1.0345e-05, + "loss": 0.4716, + "step": 2071 + }, + { + "epoch": 0.1160264307313249, + "grad_norm": 1.0009772777557373, + "learning_rate": 1.035e-05, + "loss": 0.3666, + "step": 2072 + }, + { + "epoch": 0.11608242804345391, + "grad_norm": 1.017662525177002, + "learning_rate": 1.0355000000000002e-05, + "loss": 0.4066, + "step": 2073 + }, + { + "epoch": 0.11613842535558293, + "grad_norm": 1.2353122234344482, + "learning_rate": 1.036e-05, + "loss": 0.5978, + "step": 2074 + }, + { + "epoch": 0.11619442266771195, + "grad_norm": 1.3603179454803467, + "learning_rate": 1.0365000000000002e-05, + "loss": 0.5137, + "step": 2075 + }, + { + "epoch": 0.11625041997984097, + "grad_norm": 1.1561442613601685, + "learning_rate": 1.037e-05, + "loss": 0.5091, + "step": 2076 + }, + { + "epoch": 0.11630641729196999, + "grad_norm": 1.3805582523345947, + "learning_rate": 1.0375e-05, + "loss": 0.3834, + "step": 2077 + }, + { + "epoch": 0.116362414604099, + "grad_norm": 1.298583745956421, + "learning_rate": 1.038e-05, + "loss": 0.4891, + "step": 2078 + }, + { + "epoch": 0.11641841191622802, + "grad_norm": 1.1770060062408447, + "learning_rate": 1.0385e-05, + "loss": 0.513, + "step": 2079 + }, + { + "epoch": 0.11647440922835704, + "grad_norm": 1.1579817533493042, + "learning_rate": 1.039e-05, + "loss": 0.3686, + "step": 2080 + }, + { + "epoch": 0.11653040654048606, + "grad_norm": 0.9921775460243225, + "learning_rate": 1.0395000000000001e-05, + "loss": 0.3811, + "step": 2081 + }, + { + "epoch": 0.11658640385261508, + "grad_norm": 1.3118376731872559, + "learning_rate": 1.04e-05, + "loss": 0.4713, + "step": 2082 + }, + { + "epoch": 0.11664240116474409, + "grad_norm": 1.1961724758148193, + "learning_rate": 1.0405000000000001e-05, + "loss": 0.507, + "step": 2083 + }, + { + 
"epoch": 0.1166983984768731, + "grad_norm": 1.0339444875717163, + "learning_rate": 1.041e-05, + "loss": 0.4746, + "step": 2084 + }, + { + "epoch": 0.11675439578900212, + "grad_norm": 1.3444161415100098, + "learning_rate": 1.0415000000000001e-05, + "loss": 0.5601, + "step": 2085 + }, + { + "epoch": 0.11681039310113114, + "grad_norm": 1.2207037210464478, + "learning_rate": 1.042e-05, + "loss": 0.6489, + "step": 2086 + }, + { + "epoch": 0.11686639041326016, + "grad_norm": 1.1908115148544312, + "learning_rate": 1.0425e-05, + "loss": 0.4842, + "step": 2087 + }, + { + "epoch": 0.11692238772538918, + "grad_norm": 1.1665292978286743, + "learning_rate": 1.043e-05, + "loss": 0.3457, + "step": 2088 + }, + { + "epoch": 0.1169783850375182, + "grad_norm": 1.1667604446411133, + "learning_rate": 1.0435000000000001e-05, + "loss": 0.4657, + "step": 2089 + }, + { + "epoch": 0.11703438234964722, + "grad_norm": 1.1318879127502441, + "learning_rate": 1.0440000000000002e-05, + "loss": 0.5898, + "step": 2090 + }, + { + "epoch": 0.11709037966177624, + "grad_norm": 1.0579155683517456, + "learning_rate": 1.0445e-05, + "loss": 0.3713, + "step": 2091 + }, + { + "epoch": 0.11714637697390526, + "grad_norm": 1.1594823598861694, + "learning_rate": 1.045e-05, + "loss": 0.5215, + "step": 2092 + }, + { + "epoch": 0.11720237428603426, + "grad_norm": 1.0567612648010254, + "learning_rate": 1.0455e-05, + "loss": 0.3901, + "step": 2093 + }, + { + "epoch": 0.11725837159816328, + "grad_norm": 1.176743507385254, + "learning_rate": 1.046e-05, + "loss": 0.522, + "step": 2094 + }, + { + "epoch": 0.1173143689102923, + "grad_norm": 1.27535879611969, + "learning_rate": 1.0465e-05, + "loss": 0.3966, + "step": 2095 + }, + { + "epoch": 0.11737036622242132, + "grad_norm": 1.0443687438964844, + "learning_rate": 1.0470000000000001e-05, + "loss": 0.3516, + "step": 2096 + }, + { + "epoch": 0.11742636353455034, + "grad_norm": 1.2144715785980225, + "learning_rate": 1.0475e-05, + "loss": 0.3338, + "step": 2097 + }, + { + 
"epoch": 0.11748236084667936, + "grad_norm": 1.1741278171539307, + "learning_rate": 1.0480000000000001e-05, + "loss": 0.5254, + "step": 2098 + }, + { + "epoch": 0.11753835815880838, + "grad_norm": 1.267101526260376, + "learning_rate": 1.0485e-05, + "loss": 0.5502, + "step": 2099 + }, + { + "epoch": 0.1175943554709374, + "grad_norm": 1.103272557258606, + "learning_rate": 1.049e-05, + "loss": 0.4883, + "step": 2100 + }, + { + "epoch": 0.11765035278306642, + "grad_norm": 1.1672580242156982, + "learning_rate": 1.0495e-05, + "loss": 0.407, + "step": 2101 + }, + { + "epoch": 0.11770635009519544, + "grad_norm": 1.3263310194015503, + "learning_rate": 1.05e-05, + "loss": 0.5444, + "step": 2102 + }, + { + "epoch": 0.11776234740732446, + "grad_norm": 1.1278438568115234, + "learning_rate": 1.0505e-05, + "loss": 0.4182, + "step": 2103 + }, + { + "epoch": 0.11781834471945346, + "grad_norm": 1.1948771476745605, + "learning_rate": 1.0510000000000001e-05, + "loss": 0.3452, + "step": 2104 + }, + { + "epoch": 0.11787434203158248, + "grad_norm": 1.6579550504684448, + "learning_rate": 1.0515e-05, + "loss": 0.4885, + "step": 2105 + }, + { + "epoch": 0.1179303393437115, + "grad_norm": 1.0571820735931396, + "learning_rate": 1.0520000000000001e-05, + "loss": 0.3004, + "step": 2106 + }, + { + "epoch": 0.11798633665584052, + "grad_norm": 1.228864073753357, + "learning_rate": 1.0525e-05, + "loss": 0.4447, + "step": 2107 + }, + { + "epoch": 0.11804233396796954, + "grad_norm": 1.2009637355804443, + "learning_rate": 1.053e-05, + "loss": 0.5708, + "step": 2108 + }, + { + "epoch": 0.11809833128009856, + "grad_norm": 1.3223775625228882, + "learning_rate": 1.0535e-05, + "loss": 0.4903, + "step": 2109 + }, + { + "epoch": 0.11815432859222758, + "grad_norm": 1.6229612827301025, + "learning_rate": 1.0539999999999999e-05, + "loss": 0.4234, + "step": 2110 + }, + { + "epoch": 0.1182103259043566, + "grad_norm": 1.176546573638916, + "learning_rate": 1.0545000000000002e-05, + "loss": 0.4466, + "step": 2111 + 
}, + { + "epoch": 0.11826632321648561, + "grad_norm": 1.011500358581543, + "learning_rate": 1.055e-05, + "loss": 0.3573, + "step": 2112 + }, + { + "epoch": 0.11832232052861463, + "grad_norm": 1.4040199518203735, + "learning_rate": 1.0555000000000001e-05, + "loss": 0.5961, + "step": 2113 + }, + { + "epoch": 0.11837831784074364, + "grad_norm": 1.226555585861206, + "learning_rate": 1.056e-05, + "loss": 0.386, + "step": 2114 + }, + { + "epoch": 0.11843431515287266, + "grad_norm": 0.9699138402938843, + "learning_rate": 1.0565e-05, + "loss": 0.3447, + "step": 2115 + }, + { + "epoch": 0.11849031246500168, + "grad_norm": 1.2999788522720337, + "learning_rate": 1.057e-05, + "loss": 0.4002, + "step": 2116 + }, + { + "epoch": 0.1185463097771307, + "grad_norm": 1.2441914081573486, + "learning_rate": 1.0575e-05, + "loss": 0.4244, + "step": 2117 + }, + { + "epoch": 0.11860230708925971, + "grad_norm": 1.3216584920883179, + "learning_rate": 1.058e-05, + "loss": 0.753, + "step": 2118 + }, + { + "epoch": 0.11865830440138873, + "grad_norm": 1.2601038217544556, + "learning_rate": 1.0585000000000001e-05, + "loss": 0.4507, + "step": 2119 + }, + { + "epoch": 0.11871430171351775, + "grad_norm": 1.233767032623291, + "learning_rate": 1.059e-05, + "loss": 0.44, + "step": 2120 + }, + { + "epoch": 0.11877029902564677, + "grad_norm": 1.421810269355774, + "learning_rate": 1.0595000000000001e-05, + "loss": 0.4378, + "step": 2121 + }, + { + "epoch": 0.11882629633777579, + "grad_norm": 1.1413922309875488, + "learning_rate": 1.06e-05, + "loss": 0.4579, + "step": 2122 + }, + { + "epoch": 0.11888229364990481, + "grad_norm": 1.8250983953475952, + "learning_rate": 1.0605000000000001e-05, + "loss": 0.4029, + "step": 2123 + }, + { + "epoch": 0.11893829096203382, + "grad_norm": 1.1074143648147583, + "learning_rate": 1.061e-05, + "loss": 0.4235, + "step": 2124 + }, + { + "epoch": 0.11899428827416283, + "grad_norm": 1.3174351453781128, + "learning_rate": 1.0615e-05, + "loss": 0.4614, + "step": 2125 + }, + { + 
"epoch": 0.11905028558629185, + "grad_norm": 1.3979164361953735, + "learning_rate": 1.062e-05, + "loss": 0.5064, + "step": 2126 + }, + { + "epoch": 0.11910628289842087, + "grad_norm": 1.1409600973129272, + "learning_rate": 1.0625e-05, + "loss": 0.3579, + "step": 2127 + }, + { + "epoch": 0.11916228021054989, + "grad_norm": 1.1139148473739624, + "learning_rate": 1.0630000000000002e-05, + "loss": 0.4408, + "step": 2128 + }, + { + "epoch": 0.11921827752267891, + "grad_norm": 1.3749010562896729, + "learning_rate": 1.0635e-05, + "loss": 0.4861, + "step": 2129 + }, + { + "epoch": 0.11927427483480793, + "grad_norm": 1.289667010307312, + "learning_rate": 1.064e-05, + "loss": 0.4676, + "step": 2130 + }, + { + "epoch": 0.11933027214693695, + "grad_norm": 1.2668863534927368, + "learning_rate": 1.0645e-05, + "loss": 0.6976, + "step": 2131 + }, + { + "epoch": 0.11938626945906597, + "grad_norm": 1.4886201620101929, + "learning_rate": 1.065e-05, + "loss": 0.5122, + "step": 2132 + }, + { + "epoch": 0.11944226677119499, + "grad_norm": 15.780389785766602, + "learning_rate": 1.0655e-05, + "loss": 0.3455, + "step": 2133 + }, + { + "epoch": 0.11949826408332401, + "grad_norm": 1.3982264995574951, + "learning_rate": 1.0660000000000001e-05, + "loss": 0.569, + "step": 2134 + }, + { + "epoch": 0.11955426139545301, + "grad_norm": 1.2477631568908691, + "learning_rate": 1.0665e-05, + "loss": 0.4225, + "step": 2135 + }, + { + "epoch": 0.11961025870758203, + "grad_norm": 1.1074628829956055, + "learning_rate": 1.0670000000000001e-05, + "loss": 0.3849, + "step": 2136 + }, + { + "epoch": 0.11966625601971105, + "grad_norm": 1.1565700769424438, + "learning_rate": 1.0675e-05, + "loss": 0.3819, + "step": 2137 + }, + { + "epoch": 0.11972225333184007, + "grad_norm": 1.2876352071762085, + "learning_rate": 1.0680000000000001e-05, + "loss": 0.4623, + "step": 2138 + }, + { + "epoch": 0.11977825064396909, + "grad_norm": 1.0950559377670288, + "learning_rate": 1.0685e-05, + "loss": 0.3622, + "step": 2139 + }, + 
{ + "epoch": 0.11983424795609811, + "grad_norm": 1.1870139837265015, + "learning_rate": 1.069e-05, + "loss": 0.3958, + "step": 2140 + }, + { + "epoch": 0.11989024526822713, + "grad_norm": 1.1585975885391235, + "learning_rate": 1.0695e-05, + "loss": 0.3928, + "step": 2141 + }, + { + "epoch": 0.11994624258035615, + "grad_norm": 1.1703457832336426, + "learning_rate": 1.0700000000000001e-05, + "loss": 0.3748, + "step": 2142 + }, + { + "epoch": 0.12000223989248517, + "grad_norm": 1.3534200191497803, + "learning_rate": 1.0705000000000002e-05, + "loss": 0.4758, + "step": 2143 + }, + { + "epoch": 0.12005823720461418, + "grad_norm": 1.3202478885650635, + "learning_rate": 1.071e-05, + "loss": 0.461, + "step": 2144 + }, + { + "epoch": 0.12011423451674319, + "grad_norm": 1.2206354141235352, + "learning_rate": 1.0715e-05, + "loss": 0.3884, + "step": 2145 + }, + { + "epoch": 0.12017023182887221, + "grad_norm": 1.0906764268875122, + "learning_rate": 1.072e-05, + "loss": 0.431, + "step": 2146 + }, + { + "epoch": 0.12022622914100123, + "grad_norm": 1.2466380596160889, + "learning_rate": 1.0725e-05, + "loss": 0.4785, + "step": 2147 + }, + { + "epoch": 0.12028222645313025, + "grad_norm": 1.2545379400253296, + "learning_rate": 1.073e-05, + "loss": 0.4105, + "step": 2148 + }, + { + "epoch": 0.12033822376525927, + "grad_norm": 1.18149733543396, + "learning_rate": 1.0735000000000001e-05, + "loss": 0.5563, + "step": 2149 + }, + { + "epoch": 0.12039422107738829, + "grad_norm": 1.1253278255462646, + "learning_rate": 1.074e-05, + "loss": 0.4171, + "step": 2150 + }, + { + "epoch": 0.1204502183895173, + "grad_norm": 1.4532430171966553, + "learning_rate": 1.0745000000000001e-05, + "loss": 0.4619, + "step": 2151 + }, + { + "epoch": 0.12050621570164632, + "grad_norm": 1.6640323400497437, + "learning_rate": 1.075e-05, + "loss": 0.494, + "step": 2152 + }, + { + "epoch": 0.12056221301377534, + "grad_norm": 1.1227400302886963, + "learning_rate": 1.0755000000000001e-05, + "loss": 0.362, + "step": 2153 
+ }, + { + "epoch": 0.12061821032590436, + "grad_norm": 0.9881584048271179, + "learning_rate": 1.076e-05, + "loss": 0.3443, + "step": 2154 + }, + { + "epoch": 0.12067420763803337, + "grad_norm": 1.3102385997772217, + "learning_rate": 1.0765e-05, + "loss": 0.489, + "step": 2155 + }, + { + "epoch": 0.12073020495016239, + "grad_norm": 1.0849230289459229, + "learning_rate": 1.077e-05, + "loss": 0.4264, + "step": 2156 + }, + { + "epoch": 0.1207862022622914, + "grad_norm": 1.1320273876190186, + "learning_rate": 1.0775000000000001e-05, + "loss": 0.4323, + "step": 2157 + }, + { + "epoch": 0.12084219957442043, + "grad_norm": 1.3121912479400635, + "learning_rate": 1.0780000000000002e-05, + "loss": 0.5436, + "step": 2158 + }, + { + "epoch": 0.12089819688654944, + "grad_norm": 1.8350551128387451, + "learning_rate": 1.0785000000000001e-05, + "loss": 0.4331, + "step": 2159 + }, + { + "epoch": 0.12095419419867846, + "grad_norm": 1.2972077131271362, + "learning_rate": 1.079e-05, + "loss": 0.3937, + "step": 2160 + }, + { + "epoch": 0.12101019151080748, + "grad_norm": 1.2910444736480713, + "learning_rate": 1.0795e-05, + "loss": 0.4605, + "step": 2161 + }, + { + "epoch": 0.1210661888229365, + "grad_norm": 1.1541708707809448, + "learning_rate": 1.08e-05, + "loss": 0.3115, + "step": 2162 + }, + { + "epoch": 0.12112218613506552, + "grad_norm": 1.3426966667175293, + "learning_rate": 1.0804999999999999e-05, + "loss": 0.4149, + "step": 2163 + }, + { + "epoch": 0.12117818344719454, + "grad_norm": 1.162108302116394, + "learning_rate": 1.081e-05, + "loss": 0.3712, + "step": 2164 + }, + { + "epoch": 0.12123418075932356, + "grad_norm": 1.4027280807495117, + "learning_rate": 1.0815e-05, + "loss": 0.4655, + "step": 2165 + }, + { + "epoch": 0.12129017807145256, + "grad_norm": 1.7777239084243774, + "learning_rate": 1.0820000000000001e-05, + "loss": 0.5956, + "step": 2166 + }, + { + "epoch": 0.12134617538358158, + "grad_norm": 1.2932366132736206, + "learning_rate": 1.0825e-05, + "loss": 0.4147, + 
"step": 2167 + }, + { + "epoch": 0.1214021726957106, + "grad_norm": 1.4258068799972534, + "learning_rate": 1.083e-05, + "loss": 0.5727, + "step": 2168 + }, + { + "epoch": 0.12145817000783962, + "grad_norm": 1.4387284517288208, + "learning_rate": 1.0835e-05, + "loss": 0.4624, + "step": 2169 + }, + { + "epoch": 0.12151416731996864, + "grad_norm": 1.250396490097046, + "learning_rate": 1.084e-05, + "loss": 0.4046, + "step": 2170 + }, + { + "epoch": 0.12157016463209766, + "grad_norm": 1.2278938293457031, + "learning_rate": 1.0845e-05, + "loss": 0.4158, + "step": 2171 + }, + { + "epoch": 0.12162616194422668, + "grad_norm": 1.2678710222244263, + "learning_rate": 1.0850000000000001e-05, + "loss": 0.3746, + "step": 2172 + }, + { + "epoch": 0.1216821592563557, + "grad_norm": 1.1719387769699097, + "learning_rate": 1.0855e-05, + "loss": 0.3382, + "step": 2173 + }, + { + "epoch": 0.12173815656848472, + "grad_norm": 1.2299120426177979, + "learning_rate": 1.0860000000000001e-05, + "loss": 0.4751, + "step": 2174 + }, + { + "epoch": 0.12179415388061374, + "grad_norm": 1.2288856506347656, + "learning_rate": 1.0865e-05, + "loss": 0.5048, + "step": 2175 + }, + { + "epoch": 0.12185015119274274, + "grad_norm": 1.4430570602416992, + "learning_rate": 1.0870000000000001e-05, + "loss": 0.4628, + "step": 2176 + }, + { + "epoch": 0.12190614850487176, + "grad_norm": 1.255171775817871, + "learning_rate": 1.0875e-05, + "loss": 0.405, + "step": 2177 + }, + { + "epoch": 0.12196214581700078, + "grad_norm": 1.0752614736557007, + "learning_rate": 1.088e-05, + "loss": 0.5331, + "step": 2178 + }, + { + "epoch": 0.1220181431291298, + "grad_norm": 1.3410518169403076, + "learning_rate": 1.0885e-05, + "loss": 0.4597, + "step": 2179 + }, + { + "epoch": 0.12207414044125882, + "grad_norm": 1.2840832471847534, + "learning_rate": 1.089e-05, + "loss": 0.5856, + "step": 2180 + }, + { + "epoch": 0.12213013775338784, + "grad_norm": 1.24722158908844, + "learning_rate": 1.0895000000000002e-05, + "loss": 0.4622, + 
"step": 2181 + }, + { + "epoch": 0.12218613506551686, + "grad_norm": 1.6612390279769897, + "learning_rate": 1.09e-05, + "loss": 0.605, + "step": 2182 + }, + { + "epoch": 0.12224213237764588, + "grad_norm": 1.183188796043396, + "learning_rate": 1.0905e-05, + "loss": 0.4119, + "step": 2183 + }, + { + "epoch": 0.1222981296897749, + "grad_norm": 1.1382944583892822, + "learning_rate": 1.091e-05, + "loss": 0.4015, + "step": 2184 + }, + { + "epoch": 0.12235412700190391, + "grad_norm": 1.2495747804641724, + "learning_rate": 1.0915e-05, + "loss": 0.3978, + "step": 2185 + }, + { + "epoch": 0.12241012431403292, + "grad_norm": 1.3227219581604004, + "learning_rate": 1.092e-05, + "loss": 0.4539, + "step": 2186 + }, + { + "epoch": 0.12246612162616194, + "grad_norm": 1.1677227020263672, + "learning_rate": 1.0925000000000001e-05, + "loss": 0.35, + "step": 2187 + }, + { + "epoch": 0.12252211893829096, + "grad_norm": 1.3256142139434814, + "learning_rate": 1.093e-05, + "loss": 0.4132, + "step": 2188 + }, + { + "epoch": 0.12257811625041998, + "grad_norm": 1.0301586389541626, + "learning_rate": 1.0935000000000001e-05, + "loss": 0.4447, + "step": 2189 + }, + { + "epoch": 0.122634113562549, + "grad_norm": 1.344541311264038, + "learning_rate": 1.094e-05, + "loss": 0.5965, + "step": 2190 + }, + { + "epoch": 0.12269011087467802, + "grad_norm": 1.0082032680511475, + "learning_rate": 1.0945000000000001e-05, + "loss": 0.3608, + "step": 2191 + }, + { + "epoch": 0.12274610818680703, + "grad_norm": 1.1168889999389648, + "learning_rate": 1.095e-05, + "loss": 0.4109, + "step": 2192 + }, + { + "epoch": 0.12280210549893605, + "grad_norm": 1.1556227207183838, + "learning_rate": 1.0955e-05, + "loss": 0.357, + "step": 2193 + }, + { + "epoch": 0.12285810281106507, + "grad_norm": 1.1768267154693604, + "learning_rate": 1.096e-05, + "loss": 0.527, + "step": 2194 + }, + { + "epoch": 0.12291410012319409, + "grad_norm": 1.3603123426437378, + "learning_rate": 1.0965000000000001e-05, + "loss": 0.5147, + "step": 
2195 + }, + { + "epoch": 0.12297009743532311, + "grad_norm": 1.364448070526123, + "learning_rate": 1.0970000000000002e-05, + "loss": 0.451, + "step": 2196 + }, + { + "epoch": 0.12302609474745212, + "grad_norm": 1.1654924154281616, + "learning_rate": 1.0975e-05, + "loss": 0.3761, + "step": 2197 + }, + { + "epoch": 0.12308209205958114, + "grad_norm": 1.1860575675964355, + "learning_rate": 1.098e-05, + "loss": 0.4211, + "step": 2198 + }, + { + "epoch": 0.12313808937171015, + "grad_norm": 1.3204054832458496, + "learning_rate": 1.0985e-05, + "loss": 0.6236, + "step": 2199 + }, + { + "epoch": 0.12319408668383917, + "grad_norm": 1.0727225542068481, + "learning_rate": 1.099e-05, + "loss": 0.4035, + "step": 2200 + }, + { + "epoch": 0.1232500839959682, + "grad_norm": 1.1432641744613647, + "learning_rate": 1.0995e-05, + "loss": 0.3848, + "step": 2201 + }, + { + "epoch": 0.12330608130809721, + "grad_norm": 1.2635337114334106, + "learning_rate": 1.1000000000000001e-05, + "loss": 0.4212, + "step": 2202 + }, + { + "epoch": 0.12336207862022623, + "grad_norm": 1.2043837308883667, + "learning_rate": 1.1005e-05, + "loss": 0.5291, + "step": 2203 + }, + { + "epoch": 0.12341807593235525, + "grad_norm": 1.2756997346878052, + "learning_rate": 1.1010000000000001e-05, + "loss": 0.4819, + "step": 2204 + }, + { + "epoch": 0.12347407324448427, + "grad_norm": 1.1286869049072266, + "learning_rate": 1.1015e-05, + "loss": 0.3912, + "step": 2205 + }, + { + "epoch": 0.12353007055661329, + "grad_norm": 1.1184715032577515, + "learning_rate": 1.1020000000000001e-05, + "loss": 0.3552, + "step": 2206 + }, + { + "epoch": 0.1235860678687423, + "grad_norm": 1.805359959602356, + "learning_rate": 1.1025e-05, + "loss": 0.5879, + "step": 2207 + }, + { + "epoch": 0.12364206518087131, + "grad_norm": 1.4591259956359863, + "learning_rate": 1.103e-05, + "loss": 0.6414, + "step": 2208 + }, + { + "epoch": 0.12369806249300033, + "grad_norm": 1.0697208642959595, + "learning_rate": 1.1035e-05, + "loss": 0.3618, + "step": 
2209 + }, + { + "epoch": 0.12375405980512935, + "grad_norm": 1.4364018440246582, + "learning_rate": 1.1040000000000001e-05, + "loss": 0.4088, + "step": 2210 + }, + { + "epoch": 0.12381005711725837, + "grad_norm": 1.1499199867248535, + "learning_rate": 1.1045000000000002e-05, + "loss": 0.371, + "step": 2211 + }, + { + "epoch": 0.12386605442938739, + "grad_norm": 1.1446573734283447, + "learning_rate": 1.1050000000000001e-05, + "loss": 0.3733, + "step": 2212 + }, + { + "epoch": 0.12392205174151641, + "grad_norm": 1.2948927879333496, + "learning_rate": 1.1055e-05, + "loss": 0.5075, + "step": 2213 + }, + { + "epoch": 0.12397804905364543, + "grad_norm": 1.4152624607086182, + "learning_rate": 1.106e-05, + "loss": 0.5764, + "step": 2214 + }, + { + "epoch": 0.12403404636577445, + "grad_norm": 1.479748249053955, + "learning_rate": 1.1065e-05, + "loss": 0.5262, + "step": 2215 + }, + { + "epoch": 0.12409004367790347, + "grad_norm": 1.7374500036239624, + "learning_rate": 1.107e-05, + "loss": 0.5617, + "step": 2216 + }, + { + "epoch": 0.12414604099003247, + "grad_norm": 1.1393375396728516, + "learning_rate": 1.1075e-05, + "loss": 0.4754, + "step": 2217 + }, + { + "epoch": 0.12420203830216149, + "grad_norm": 1.1788643598556519, + "learning_rate": 1.108e-05, + "loss": 0.4177, + "step": 2218 + }, + { + "epoch": 0.12425803561429051, + "grad_norm": 1.2186473608016968, + "learning_rate": 1.1085000000000001e-05, + "loss": 0.6167, + "step": 2219 + }, + { + "epoch": 0.12431403292641953, + "grad_norm": 1.2026934623718262, + "learning_rate": 1.109e-05, + "loss": 0.3933, + "step": 2220 + }, + { + "epoch": 0.12437003023854855, + "grad_norm": 1.1073105335235596, + "learning_rate": 1.1095e-05, + "loss": 0.5107, + "step": 2221 + }, + { + "epoch": 0.12442602755067757, + "grad_norm": 1.339430570602417, + "learning_rate": 1.11e-05, + "loss": 0.4354, + "step": 2222 + }, + { + "epoch": 0.12448202486280659, + "grad_norm": 1.357672095298767, + "learning_rate": 1.1105e-05, + "loss": 0.4647, + "step": 
2223 + }, + { + "epoch": 0.1245380221749356, + "grad_norm": 1.2059109210968018, + "learning_rate": 1.111e-05, + "loss": 0.4529, + "step": 2224 + }, + { + "epoch": 0.12459401948706463, + "grad_norm": 1.2800805568695068, + "learning_rate": 1.1115000000000001e-05, + "loss": 0.4615, + "step": 2225 + }, + { + "epoch": 0.12465001679919364, + "grad_norm": 1.1725817918777466, + "learning_rate": 1.112e-05, + "loss": 0.3642, + "step": 2226 + }, + { + "epoch": 0.12470601411132266, + "grad_norm": 1.4158438444137573, + "learning_rate": 1.1125000000000001e-05, + "loss": 0.3824, + "step": 2227 + }, + { + "epoch": 0.12476201142345167, + "grad_norm": 1.0686370134353638, + "learning_rate": 1.113e-05, + "loss": 0.4248, + "step": 2228 + }, + { + "epoch": 0.12481800873558069, + "grad_norm": 1.151619553565979, + "learning_rate": 1.1135000000000001e-05, + "loss": 0.4378, + "step": 2229 + }, + { + "epoch": 0.1248740060477097, + "grad_norm": 1.1525603532791138, + "learning_rate": 1.114e-05, + "loss": 0.3432, + "step": 2230 + }, + { + "epoch": 0.12493000335983873, + "grad_norm": 1.104702353477478, + "learning_rate": 1.1145e-05, + "loss": 0.4535, + "step": 2231 + }, + { + "epoch": 0.12498600067196775, + "grad_norm": 1.0402547121047974, + "learning_rate": 1.115e-05, + "loss": 0.3269, + "step": 2232 + }, + { + "epoch": 0.12504199798409676, + "grad_norm": 1.2609214782714844, + "learning_rate": 1.1155e-05, + "loss": 0.4081, + "step": 2233 + }, + { + "epoch": 0.12509799529622578, + "grad_norm": 1.3451021909713745, + "learning_rate": 1.1160000000000002e-05, + "loss": 0.5328, + "step": 2234 + }, + { + "epoch": 0.1251539926083548, + "grad_norm": 1.1217625141143799, + "learning_rate": 1.1165e-05, + "loss": 0.4598, + "step": 2235 + }, + { + "epoch": 0.12520998992048382, + "grad_norm": 1.4307420253753662, + "learning_rate": 1.117e-05, + "loss": 0.4804, + "step": 2236 + }, + { + "epoch": 0.12526598723261284, + "grad_norm": 1.2126822471618652, + "learning_rate": 1.1175e-05, + "loss": 0.5237, + "step": 
2237 + }, + { + "epoch": 0.12532198454474186, + "grad_norm": 2.062070846557617, + "learning_rate": 1.118e-05, + "loss": 0.3978, + "step": 2238 + }, + { + "epoch": 0.12537798185687088, + "grad_norm": 1.2071523666381836, + "learning_rate": 1.1185e-05, + "loss": 0.421, + "step": 2239 + }, + { + "epoch": 0.1254339791689999, + "grad_norm": 1.3621774911880493, + "learning_rate": 1.1190000000000001e-05, + "loss": 0.4159, + "step": 2240 + }, + { + "epoch": 0.12548997648112892, + "grad_norm": 1.3251384496688843, + "learning_rate": 1.1195e-05, + "loss": 0.5032, + "step": 2241 + }, + { + "epoch": 0.12554597379325794, + "grad_norm": 1.3219131231307983, + "learning_rate": 1.1200000000000001e-05, + "loss": 0.4851, + "step": 2242 + }, + { + "epoch": 0.12560197110538693, + "grad_norm": 1.1742477416992188, + "learning_rate": 1.1205e-05, + "loss": 0.415, + "step": 2243 + }, + { + "epoch": 0.12565796841751595, + "grad_norm": 1.6238731145858765, + "learning_rate": 1.1210000000000001e-05, + "loss": 0.5411, + "step": 2244 + }, + { + "epoch": 0.12571396572964497, + "grad_norm": 1.1181552410125732, + "learning_rate": 1.1215e-05, + "loss": 0.5741, + "step": 2245 + }, + { + "epoch": 0.12576996304177399, + "grad_norm": 1.1299830675125122, + "learning_rate": 1.122e-05, + "loss": 0.3773, + "step": 2246 + }, + { + "epoch": 0.125825960353903, + "grad_norm": 1.1737785339355469, + "learning_rate": 1.1225e-05, + "loss": 0.4077, + "step": 2247 + }, + { + "epoch": 0.12588195766603202, + "grad_norm": 1.0765039920806885, + "learning_rate": 1.1230000000000001e-05, + "loss": 0.5425, + "step": 2248 + }, + { + "epoch": 0.12593795497816104, + "grad_norm": 1.0482008457183838, + "learning_rate": 1.1235000000000002e-05, + "loss": 0.403, + "step": 2249 + }, + { + "epoch": 0.12599395229029006, + "grad_norm": 1.0707420110702515, + "learning_rate": 1.124e-05, + "loss": 0.4714, + "step": 2250 + }, + { + "epoch": 0.12604994960241908, + "grad_norm": 1.2540937662124634, + "learning_rate": 1.1245e-05, + "loss": 0.3512, 
+ "step": 2251 + }, + { + "epoch": 0.1261059469145481, + "grad_norm": 1.2286819219589233, + "learning_rate": 1.125e-05, + "loss": 0.397, + "step": 2252 + }, + { + "epoch": 0.12616194422667712, + "grad_norm": 1.150970458984375, + "learning_rate": 1.1255e-05, + "loss": 0.4503, + "step": 2253 + }, + { + "epoch": 0.12621794153880614, + "grad_norm": 1.4410957098007202, + "learning_rate": 1.126e-05, + "loss": 0.5917, + "step": 2254 + }, + { + "epoch": 0.12627393885093516, + "grad_norm": 1.1711270809173584, + "learning_rate": 1.1265e-05, + "loss": 0.4428, + "step": 2255 + }, + { + "epoch": 0.12632993616306418, + "grad_norm": 1.3283476829528809, + "learning_rate": 1.127e-05, + "loss": 0.5402, + "step": 2256 + }, + { + "epoch": 0.1263859334751932, + "grad_norm": 1.4431616067886353, + "learning_rate": 1.1275000000000001e-05, + "loss": 0.5657, + "step": 2257 + }, + { + "epoch": 0.12644193078732222, + "grad_norm": 1.122452974319458, + "learning_rate": 1.128e-05, + "loss": 0.4835, + "step": 2258 + }, + { + "epoch": 0.12649792809945123, + "grad_norm": 1.05092191696167, + "learning_rate": 1.1285000000000001e-05, + "loss": 0.4517, + "step": 2259 + }, + { + "epoch": 0.12655392541158025, + "grad_norm": 1.0658460855484009, + "learning_rate": 1.129e-05, + "loss": 0.4475, + "step": 2260 + }, + { + "epoch": 0.12660992272370927, + "grad_norm": 1.3967729806900024, + "learning_rate": 1.1295e-05, + "loss": 0.4501, + "step": 2261 + }, + { + "epoch": 0.1266659200358383, + "grad_norm": 1.4232945442199707, + "learning_rate": 1.13e-05, + "loss": 0.5232, + "step": 2262 + }, + { + "epoch": 0.1267219173479673, + "grad_norm": 1.4301387071609497, + "learning_rate": 1.1305000000000001e-05, + "loss": 0.6171, + "step": 2263 + }, + { + "epoch": 0.1267779146600963, + "grad_norm": 1.4325543642044067, + "learning_rate": 1.1310000000000002e-05, + "loss": 0.5432, + "step": 2264 + }, + { + "epoch": 0.12683391197222532, + "grad_norm": 1.292751669883728, + "learning_rate": 1.1315000000000001e-05, + "loss": 
0.5031, + "step": 2265 + }, + { + "epoch": 0.12688990928435434, + "grad_norm": 1.2938718795776367, + "learning_rate": 1.132e-05, + "loss": 0.5866, + "step": 2266 + }, + { + "epoch": 0.12694590659648336, + "grad_norm": 1.285792589187622, + "learning_rate": 1.1325e-05, + "loss": 0.4874, + "step": 2267 + }, + { + "epoch": 0.12700190390861238, + "grad_norm": 1.2472320795059204, + "learning_rate": 1.133e-05, + "loss": 0.5701, + "step": 2268 + }, + { + "epoch": 0.1270579012207414, + "grad_norm": 1.1968580484390259, + "learning_rate": 1.1335e-05, + "loss": 0.4281, + "step": 2269 + }, + { + "epoch": 0.12711389853287042, + "grad_norm": 1.3408206701278687, + "learning_rate": 1.134e-05, + "loss": 0.4181, + "step": 2270 + }, + { + "epoch": 0.12716989584499944, + "grad_norm": 1.2121702432632446, + "learning_rate": 1.1345e-05, + "loss": 0.4036, + "step": 2271 + }, + { + "epoch": 0.12722589315712846, + "grad_norm": 1.0712432861328125, + "learning_rate": 1.1350000000000001e-05, + "loss": 0.4192, + "step": 2272 + }, + { + "epoch": 0.12728189046925747, + "grad_norm": 1.1871991157531738, + "learning_rate": 1.1355e-05, + "loss": 0.3987, + "step": 2273 + }, + { + "epoch": 0.1273378877813865, + "grad_norm": 1.0942261219024658, + "learning_rate": 1.1360000000000001e-05, + "loss": 0.4153, + "step": 2274 + }, + { + "epoch": 0.1273938850935155, + "grad_norm": 1.2354952096939087, + "learning_rate": 1.1365e-05, + "loss": 0.5993, + "step": 2275 + }, + { + "epoch": 0.12744988240564453, + "grad_norm": 1.4431953430175781, + "learning_rate": 1.137e-05, + "loss": 0.4021, + "step": 2276 + }, + { + "epoch": 0.12750587971777355, + "grad_norm": 1.1220649480819702, + "learning_rate": 1.1375e-05, + "loss": 0.4295, + "step": 2277 + }, + { + "epoch": 0.12756187702990257, + "grad_norm": 1.1681586503982544, + "learning_rate": 1.1380000000000001e-05, + "loss": 0.4202, + "step": 2278 + }, + { + "epoch": 0.1276178743420316, + "grad_norm": 1.1978394985198975, + "learning_rate": 1.1385000000000002e-05, + "loss": 
0.5624, + "step": 2279 + }, + { + "epoch": 0.1276738716541606, + "grad_norm": 1.1839028596878052, + "learning_rate": 1.1390000000000001e-05, + "loss": 0.4514, + "step": 2280 + }, + { + "epoch": 0.12772986896628963, + "grad_norm": 1.1288633346557617, + "learning_rate": 1.1395e-05, + "loss": 0.5819, + "step": 2281 + }, + { + "epoch": 0.12778586627841865, + "grad_norm": 0.9166799783706665, + "learning_rate": 1.1400000000000001e-05, + "loss": 0.3726, + "step": 2282 + }, + { + "epoch": 0.12784186359054767, + "grad_norm": 1.317753791809082, + "learning_rate": 1.1405e-05, + "loss": 0.3707, + "step": 2283 + }, + { + "epoch": 0.12789786090267666, + "grad_norm": 1.3430582284927368, + "learning_rate": 1.141e-05, + "loss": 0.5908, + "step": 2284 + }, + { + "epoch": 0.12795385821480568, + "grad_norm": 1.2576439380645752, + "learning_rate": 1.1415e-05, + "loss": 0.5255, + "step": 2285 + }, + { + "epoch": 0.1280098555269347, + "grad_norm": 13.189178466796875, + "learning_rate": 1.142e-05, + "loss": 0.4606, + "step": 2286 + }, + { + "epoch": 0.12806585283906372, + "grad_norm": 1.1022242307662964, + "learning_rate": 1.1425000000000002e-05, + "loss": 0.3685, + "step": 2287 + }, + { + "epoch": 0.12812185015119273, + "grad_norm": 1.0560503005981445, + "learning_rate": 1.143e-05, + "loss": 0.4096, + "step": 2288 + }, + { + "epoch": 0.12817784746332175, + "grad_norm": 1.2018636465072632, + "learning_rate": 1.1435e-05, + "loss": 0.4187, + "step": 2289 + }, + { + "epoch": 0.12823384477545077, + "grad_norm": 1.204622745513916, + "learning_rate": 1.144e-05, + "loss": 0.4063, + "step": 2290 + }, + { + "epoch": 0.1282898420875798, + "grad_norm": 1.3194224834442139, + "learning_rate": 1.1445e-05, + "loss": 0.6101, + "step": 2291 + }, + { + "epoch": 0.1283458393997088, + "grad_norm": 1.1247808933258057, + "learning_rate": 1.145e-05, + "loss": 0.4507, + "step": 2292 + }, + { + "epoch": 0.12840183671183783, + "grad_norm": 1.17900550365448, + "learning_rate": 1.1455000000000001e-05, + "loss": 
0.4489, + "step": 2293 + }, + { + "epoch": 0.12845783402396685, + "grad_norm": 1.0201923847198486, + "learning_rate": 1.146e-05, + "loss": 0.4107, + "step": 2294 + }, + { + "epoch": 0.12851383133609587, + "grad_norm": 1.1454983949661255, + "learning_rate": 1.1465000000000001e-05, + "loss": 0.4719, + "step": 2295 + }, + { + "epoch": 0.1285698286482249, + "grad_norm": 1.003904104232788, + "learning_rate": 1.147e-05, + "loss": 0.3516, + "step": 2296 + }, + { + "epoch": 0.1286258259603539, + "grad_norm": 1.0594944953918457, + "learning_rate": 1.1475000000000001e-05, + "loss": 0.4612, + "step": 2297 + }, + { + "epoch": 0.12868182327248293, + "grad_norm": 1.2085590362548828, + "learning_rate": 1.148e-05, + "loss": 0.4192, + "step": 2298 + }, + { + "epoch": 0.12873782058461194, + "grad_norm": 1.2151544094085693, + "learning_rate": 1.1485e-05, + "loss": 0.3859, + "step": 2299 + }, + { + "epoch": 0.12879381789674096, + "grad_norm": 1.1104528903961182, + "learning_rate": 1.149e-05, + "loss": 0.387, + "step": 2300 + }, + { + "epoch": 0.12884981520886998, + "grad_norm": 1.1801403760910034, + "learning_rate": 1.1495000000000001e-05, + "loss": 0.4741, + "step": 2301 + }, + { + "epoch": 0.128905812520999, + "grad_norm": 1.357081651687622, + "learning_rate": 1.1500000000000002e-05, + "loss": 0.5217, + "step": 2302 + }, + { + "epoch": 0.12896180983312802, + "grad_norm": 1.403161883354187, + "learning_rate": 1.1505e-05, + "loss": 0.5856, + "step": 2303 + }, + { + "epoch": 0.12901780714525704, + "grad_norm": 1.230513334274292, + "learning_rate": 1.151e-05, + "loss": 0.4272, + "step": 2304 + }, + { + "epoch": 0.12907380445738603, + "grad_norm": 1.083959937095642, + "learning_rate": 1.1515e-05, + "loss": 0.4267, + "step": 2305 + }, + { + "epoch": 0.12912980176951505, + "grad_norm": 1.5055561065673828, + "learning_rate": 1.152e-05, + "loss": 0.5365, + "step": 2306 + }, + { + "epoch": 0.12918579908164407, + "grad_norm": 1.0667353868484497, + "learning_rate": 1.1525e-05, + "loss": 0.3674, 
+ "step": 2307 + }, + { + "epoch": 0.1292417963937731, + "grad_norm": 1.5784897804260254, + "learning_rate": 1.153e-05, + "loss": 0.5412, + "step": 2308 + }, + { + "epoch": 0.1292977937059021, + "grad_norm": 1.2324721813201904, + "learning_rate": 1.1535e-05, + "loss": 0.4253, + "step": 2309 + }, + { + "epoch": 0.12935379101803113, + "grad_norm": 1.1168930530548096, + "learning_rate": 1.1540000000000001e-05, + "loss": 0.565, + "step": 2310 + }, + { + "epoch": 0.12940978833016015, + "grad_norm": 4.88847541809082, + "learning_rate": 1.1545e-05, + "loss": 0.3856, + "step": 2311 + }, + { + "epoch": 0.12946578564228917, + "grad_norm": 1.3440314531326294, + "learning_rate": 1.1550000000000001e-05, + "loss": 0.4282, + "step": 2312 + }, + { + "epoch": 0.12952178295441819, + "grad_norm": 1.2424455881118774, + "learning_rate": 1.1555e-05, + "loss": 0.5565, + "step": 2313 + }, + { + "epoch": 0.1295777802665472, + "grad_norm": 1.1632091999053955, + "learning_rate": 1.156e-05, + "loss": 0.5432, + "step": 2314 + }, + { + "epoch": 0.12963377757867622, + "grad_norm": 1.2187398672103882, + "learning_rate": 1.1565e-05, + "loss": 0.5051, + "step": 2315 + }, + { + "epoch": 0.12968977489080524, + "grad_norm": 1.359572172164917, + "learning_rate": 1.1570000000000001e-05, + "loss": 0.4581, + "step": 2316 + }, + { + "epoch": 0.12974577220293426, + "grad_norm": 1.4037069082260132, + "learning_rate": 1.1575000000000002e-05, + "loss": 0.5692, + "step": 2317 + }, + { + "epoch": 0.12980176951506328, + "grad_norm": 1.292794942855835, + "learning_rate": 1.1580000000000001e-05, + "loss": 0.4582, + "step": 2318 + }, + { + "epoch": 0.1298577668271923, + "grad_norm": 1.0918874740600586, + "learning_rate": 1.1585e-05, + "loss": 0.3554, + "step": 2319 + }, + { + "epoch": 0.12991376413932132, + "grad_norm": 1.0295218229293823, + "learning_rate": 1.159e-05, + "loss": 0.3185, + "step": 2320 + }, + { + "epoch": 0.12996976145145034, + "grad_norm": 1.1888872385025024, + "learning_rate": 1.1595e-05, + "loss": 
0.3827, + "step": 2321 + }, + { + "epoch": 0.13002575876357936, + "grad_norm": 1.218995451927185, + "learning_rate": 1.16e-05, + "loss": 0.4396, + "step": 2322 + }, + { + "epoch": 0.13008175607570838, + "grad_norm": 1.046060562133789, + "learning_rate": 1.1605e-05, + "loss": 0.3439, + "step": 2323 + }, + { + "epoch": 0.1301377533878374, + "grad_norm": 1.1816738843917847, + "learning_rate": 1.161e-05, + "loss": 0.4889, + "step": 2324 + }, + { + "epoch": 0.13019375069996642, + "grad_norm": 1.1769605875015259, + "learning_rate": 1.1615000000000001e-05, + "loss": 0.5111, + "step": 2325 + }, + { + "epoch": 0.1302497480120954, + "grad_norm": 11.965676307678223, + "learning_rate": 1.162e-05, + "loss": 0.4498, + "step": 2326 + }, + { + "epoch": 0.13030574532422443, + "grad_norm": 1.245835542678833, + "learning_rate": 1.1625000000000001e-05, + "loss": 0.4435, + "step": 2327 + }, + { + "epoch": 0.13036174263635344, + "grad_norm": 1.261703372001648, + "learning_rate": 1.163e-05, + "loss": 0.3417, + "step": 2328 + }, + { + "epoch": 0.13041773994848246, + "grad_norm": 1.4906578063964844, + "learning_rate": 1.1635e-05, + "loss": 0.4779, + "step": 2329 + }, + { + "epoch": 0.13047373726061148, + "grad_norm": 1.5169082880020142, + "learning_rate": 1.164e-05, + "loss": 0.5212, + "step": 2330 + }, + { + "epoch": 0.1305297345727405, + "grad_norm": 1.599611520767212, + "learning_rate": 1.1645000000000001e-05, + "loss": 0.4286, + "step": 2331 + }, + { + "epoch": 0.13058573188486952, + "grad_norm": 1.9279402494430542, + "learning_rate": 1.1650000000000002e-05, + "loss": 0.466, + "step": 2332 + }, + { + "epoch": 0.13064172919699854, + "grad_norm": 1.1912360191345215, + "learning_rate": 1.1655000000000001e-05, + "loss": 0.4664, + "step": 2333 + }, + { + "epoch": 0.13069772650912756, + "grad_norm": 1.3635510206222534, + "learning_rate": 1.166e-05, + "loss": 0.4613, + "step": 2334 + }, + { + "epoch": 0.13075372382125658, + "grad_norm": 1.300800085067749, + "learning_rate": 
1.1665000000000001e-05, + "loss": 0.5828, + "step": 2335 + }, + { + "epoch": 0.1308097211333856, + "grad_norm": 1.1869381666183472, + "learning_rate": 1.167e-05, + "loss": 0.3331, + "step": 2336 + }, + { + "epoch": 0.13086571844551462, + "grad_norm": 1.4955719709396362, + "learning_rate": 1.1675000000000001e-05, + "loss": 0.5642, + "step": 2337 + }, + { + "epoch": 0.13092171575764364, + "grad_norm": 1.167914867401123, + "learning_rate": 1.168e-05, + "loss": 0.4144, + "step": 2338 + }, + { + "epoch": 0.13097771306977266, + "grad_norm": 1.3202574253082275, + "learning_rate": 1.1685e-05, + "loss": 0.4789, + "step": 2339 + }, + { + "epoch": 0.13103371038190167, + "grad_norm": 1.2649132013320923, + "learning_rate": 1.1690000000000002e-05, + "loss": 0.4761, + "step": 2340 + }, + { + "epoch": 0.1310897076940307, + "grad_norm": 1.065508484840393, + "learning_rate": 1.1695e-05, + "loss": 0.4014, + "step": 2341 + }, + { + "epoch": 0.1311457050061597, + "grad_norm": 0.9914081692695618, + "learning_rate": 1.1700000000000001e-05, + "loss": 0.3879, + "step": 2342 + }, + { + "epoch": 0.13120170231828873, + "grad_norm": 1.1917837858200073, + "learning_rate": 1.1705e-05, + "loss": 0.4464, + "step": 2343 + }, + { + "epoch": 0.13125769963041775, + "grad_norm": 1.1857242584228516, + "learning_rate": 1.171e-05, + "loss": 0.3995, + "step": 2344 + }, + { + "epoch": 0.13131369694254677, + "grad_norm": 1.0622613430023193, + "learning_rate": 1.1715e-05, + "loss": 0.3963, + "step": 2345 + }, + { + "epoch": 0.13136969425467576, + "grad_norm": 1.1169847249984741, + "learning_rate": 1.172e-05, + "loss": 0.3781, + "step": 2346 + }, + { + "epoch": 0.13142569156680478, + "grad_norm": 1.2596418857574463, + "learning_rate": 1.1725e-05, + "loss": 0.4689, + "step": 2347 + }, + { + "epoch": 0.1314816888789338, + "grad_norm": 1.2164725065231323, + "learning_rate": 1.1730000000000001e-05, + "loss": 0.4248, + "step": 2348 + }, + { + "epoch": 0.13153768619106282, + "grad_norm": 1.1801409721374512, + 
"learning_rate": 1.1735e-05, + "loss": 0.4889, + "step": 2349 + }, + { + "epoch": 0.13159368350319184, + "grad_norm": 0.9698452949523926, + "learning_rate": 1.1740000000000001e-05, + "loss": 0.4187, + "step": 2350 + }, + { + "epoch": 0.13164968081532086, + "grad_norm": 1.3726561069488525, + "learning_rate": 1.1745e-05, + "loss": 0.6082, + "step": 2351 + }, + { + "epoch": 0.13170567812744988, + "grad_norm": 1.310638666152954, + "learning_rate": 1.175e-05, + "loss": 0.4059, + "step": 2352 + }, + { + "epoch": 0.1317616754395789, + "grad_norm": 1.0972820520401, + "learning_rate": 1.1755e-05, + "loss": 0.5182, + "step": 2353 + }, + { + "epoch": 0.13181767275170791, + "grad_norm": 1.407310128211975, + "learning_rate": 1.1760000000000001e-05, + "loss": 0.4274, + "step": 2354 + }, + { + "epoch": 0.13187367006383693, + "grad_norm": 1.238159418106079, + "learning_rate": 1.1765000000000002e-05, + "loss": 0.4569, + "step": 2355 + }, + { + "epoch": 0.13192966737596595, + "grad_norm": 1.4061623811721802, + "learning_rate": 1.177e-05, + "loss": 0.5208, + "step": 2356 + }, + { + "epoch": 0.13198566468809497, + "grad_norm": 1.0815753936767578, + "learning_rate": 1.1775e-05, + "loss": 0.5048, + "step": 2357 + }, + { + "epoch": 0.132041662000224, + "grad_norm": 1.5622210502624512, + "learning_rate": 1.178e-05, + "loss": 0.4615, + "step": 2358 + }, + { + "epoch": 0.132097659312353, + "grad_norm": 1.1877309083938599, + "learning_rate": 1.1785e-05, + "loss": 0.4608, + "step": 2359 + }, + { + "epoch": 0.13215365662448203, + "grad_norm": 1.7222483158111572, + "learning_rate": 1.179e-05, + "loss": 0.4566, + "step": 2360 + }, + { + "epoch": 0.13220965393661105, + "grad_norm": 0.9789606928825378, + "learning_rate": 1.1795e-05, + "loss": 0.3139, + "step": 2361 + }, + { + "epoch": 0.13226565124874007, + "grad_norm": 1.3129633665084839, + "learning_rate": 1.18e-05, + "loss": 0.5787, + "step": 2362 + }, + { + "epoch": 0.1323216485608691, + "grad_norm": 1.207702398300171, + "learning_rate": 
1.1805000000000001e-05, + "loss": 0.5146, + "step": 2363 + }, + { + "epoch": 0.1323776458729981, + "grad_norm": 1.3435063362121582, + "learning_rate": 1.181e-05, + "loss": 0.4662, + "step": 2364 + }, + { + "epoch": 0.13243364318512713, + "grad_norm": 1.13418710231781, + "learning_rate": 1.1815000000000001e-05, + "loss": 0.3667, + "step": 2365 + }, + { + "epoch": 0.13248964049725614, + "grad_norm": 1.2030316591262817, + "learning_rate": 1.182e-05, + "loss": 0.3939, + "step": 2366 + }, + { + "epoch": 0.13254563780938514, + "grad_norm": 1.1721278429031372, + "learning_rate": 1.1825e-05, + "loss": 0.5194, + "step": 2367 + }, + { + "epoch": 0.13260163512151416, + "grad_norm": 1.1768015623092651, + "learning_rate": 1.183e-05, + "loss": 0.3973, + "step": 2368 + }, + { + "epoch": 0.13265763243364317, + "grad_norm": 1.225443720817566, + "learning_rate": 1.1835000000000001e-05, + "loss": 0.4661, + "step": 2369 + }, + { + "epoch": 0.1327136297457722, + "grad_norm": 1.093108892440796, + "learning_rate": 1.1840000000000002e-05, + "loss": 0.3354, + "step": 2370 + }, + { + "epoch": 0.1327696270579012, + "grad_norm": 1.1921272277832031, + "learning_rate": 1.1845000000000001e-05, + "loss": 0.4253, + "step": 2371 + }, + { + "epoch": 0.13282562437003023, + "grad_norm": 1.2796928882598877, + "learning_rate": 1.185e-05, + "loss": 0.4149, + "step": 2372 + }, + { + "epoch": 0.13288162168215925, + "grad_norm": 1.2408664226531982, + "learning_rate": 1.1855e-05, + "loss": 0.4325, + "step": 2373 + }, + { + "epoch": 0.13293761899428827, + "grad_norm": 1.502134919166565, + "learning_rate": 1.186e-05, + "loss": 0.5105, + "step": 2374 + }, + { + "epoch": 0.1329936163064173, + "grad_norm": 1.290805459022522, + "learning_rate": 1.1865e-05, + "loss": 0.5634, + "step": 2375 + }, + { + "epoch": 0.1330496136185463, + "grad_norm": 1.6113225221633911, + "learning_rate": 1.187e-05, + "loss": 0.5746, + "step": 2376 + }, + { + "epoch": 0.13310561093067533, + "grad_norm": 1.0114905834197998, + 
"learning_rate": 1.1875e-05, + "loss": 0.4165, + "step": 2377 + }, + { + "epoch": 0.13316160824280435, + "grad_norm": 1.1047070026397705, + "learning_rate": 1.1880000000000001e-05, + "loss": 0.4955, + "step": 2378 + }, + { + "epoch": 0.13321760555493337, + "grad_norm": 1.0724822282791138, + "learning_rate": 1.1885e-05, + "loss": 0.4241, + "step": 2379 + }, + { + "epoch": 0.13327360286706239, + "grad_norm": 1.5702725648880005, + "learning_rate": 1.1890000000000001e-05, + "loss": 0.3546, + "step": 2380 + }, + { + "epoch": 0.1333296001791914, + "grad_norm": 1.3315016031265259, + "learning_rate": 1.1895e-05, + "loss": 0.4321, + "step": 2381 + }, + { + "epoch": 0.13338559749132042, + "grad_norm": 2.0095887184143066, + "learning_rate": 1.19e-05, + "loss": 0.4651, + "step": 2382 + }, + { + "epoch": 0.13344159480344944, + "grad_norm": 1.2531713247299194, + "learning_rate": 1.1905e-05, + "loss": 0.4597, + "step": 2383 + }, + { + "epoch": 0.13349759211557846, + "grad_norm": 1.2469711303710938, + "learning_rate": 1.1910000000000001e-05, + "loss": 0.4425, + "step": 2384 + }, + { + "epoch": 0.13355358942770748, + "grad_norm": 1.7293294668197632, + "learning_rate": 1.1915000000000002e-05, + "loss": 0.4855, + "step": 2385 + }, + { + "epoch": 0.1336095867398365, + "grad_norm": 1.332313895225525, + "learning_rate": 1.1920000000000001e-05, + "loss": 0.3875, + "step": 2386 + }, + { + "epoch": 0.13366558405196552, + "grad_norm": 1.1652700901031494, + "learning_rate": 1.1925e-05, + "loss": 0.5291, + "step": 2387 + }, + { + "epoch": 0.1337215813640945, + "grad_norm": 1.0502643585205078, + "learning_rate": 1.1930000000000001e-05, + "loss": 0.3097, + "step": 2388 + }, + { + "epoch": 0.13377757867622353, + "grad_norm": 1.5209193229675293, + "learning_rate": 1.1935e-05, + "loss": 0.4903, + "step": 2389 + }, + { + "epoch": 0.13383357598835255, + "grad_norm": 1.1833019256591797, + "learning_rate": 1.1940000000000001e-05, + "loss": 0.4808, + "step": 2390 + }, + { + "epoch": 
0.13388957330048157, + "grad_norm": 1.3718583583831787, + "learning_rate": 1.1945e-05, + "loss": 0.4008, + "step": 2391 + }, + { + "epoch": 0.1339455706126106, + "grad_norm": 1.2104849815368652, + "learning_rate": 1.195e-05, + "loss": 0.4653, + "step": 2392 + }, + { + "epoch": 0.1340015679247396, + "grad_norm": 1.029047966003418, + "learning_rate": 1.1955000000000002e-05, + "loss": 0.4049, + "step": 2393 + }, + { + "epoch": 0.13405756523686863, + "grad_norm": 3.4568305015563965, + "learning_rate": 1.196e-05, + "loss": 0.4246, + "step": 2394 + }, + { + "epoch": 0.13411356254899764, + "grad_norm": 1.1154710054397583, + "learning_rate": 1.1965000000000001e-05, + "loss": 0.4391, + "step": 2395 + }, + { + "epoch": 0.13416955986112666, + "grad_norm": 1.150034785270691, + "learning_rate": 1.197e-05, + "loss": 0.3921, + "step": 2396 + }, + { + "epoch": 0.13422555717325568, + "grad_norm": 1.2795981168746948, + "learning_rate": 1.1975e-05, + "loss": 0.3264, + "step": 2397 + }, + { + "epoch": 0.1342815544853847, + "grad_norm": 1.226135015487671, + "learning_rate": 1.198e-05, + "loss": 0.5624, + "step": 2398 + }, + { + "epoch": 0.13433755179751372, + "grad_norm": 1.088365077972412, + "learning_rate": 1.1985e-05, + "loss": 0.3313, + "step": 2399 + }, + { + "epoch": 0.13439354910964274, + "grad_norm": 1.1446475982666016, + "learning_rate": 1.199e-05, + "loss": 0.4507, + "step": 2400 + }, + { + "epoch": 0.13444954642177176, + "grad_norm": 1.3302208185195923, + "learning_rate": 1.1995000000000001e-05, + "loss": 0.4239, + "step": 2401 + }, + { + "epoch": 0.13450554373390078, + "grad_norm": 1.1120394468307495, + "learning_rate": 1.2e-05, + "loss": 0.4509, + "step": 2402 + }, + { + "epoch": 0.1345615410460298, + "grad_norm": 1.155068278312683, + "learning_rate": 1.2005000000000001e-05, + "loss": 0.434, + "step": 2403 + }, + { + "epoch": 0.13461753835815882, + "grad_norm": 1.1789742708206177, + "learning_rate": 1.201e-05, + "loss": 0.2487, + "step": 2404 + }, + { + "epoch": 
0.13467353567028784, + "grad_norm": 1.2293139696121216, + "learning_rate": 1.2015000000000001e-05, + "loss": 0.5815, + "step": 2405 + }, + { + "epoch": 0.13472953298241686, + "grad_norm": 1.2889957427978516, + "learning_rate": 1.202e-05, + "loss": 0.6689, + "step": 2406 + }, + { + "epoch": 0.13478553029454587, + "grad_norm": 1.4162606000900269, + "learning_rate": 1.2025000000000001e-05, + "loss": 0.4166, + "step": 2407 + }, + { + "epoch": 0.13484152760667487, + "grad_norm": 1.2048757076263428, + "learning_rate": 1.2030000000000002e-05, + "loss": 0.4165, + "step": 2408 + }, + { + "epoch": 0.13489752491880388, + "grad_norm": 1.0300843715667725, + "learning_rate": 1.2035e-05, + "loss": 0.3927, + "step": 2409 + }, + { + "epoch": 0.1349535222309329, + "grad_norm": 1.2152204513549805, + "learning_rate": 1.204e-05, + "loss": 0.507, + "step": 2410 + }, + { + "epoch": 0.13500951954306192, + "grad_norm": 1.2543202638626099, + "learning_rate": 1.2045e-05, + "loss": 0.3755, + "step": 2411 + }, + { + "epoch": 0.13506551685519094, + "grad_norm": 1.1340587139129639, + "learning_rate": 1.205e-05, + "loss": 0.4536, + "step": 2412 + }, + { + "epoch": 0.13512151416731996, + "grad_norm": 1.5051394701004028, + "learning_rate": 1.2055e-05, + "loss": 0.4643, + "step": 2413 + }, + { + "epoch": 0.13517751147944898, + "grad_norm": 1.0842450857162476, + "learning_rate": 1.206e-05, + "loss": 0.3299, + "step": 2414 + }, + { + "epoch": 0.135233508791578, + "grad_norm": 1.1488196849822998, + "learning_rate": 1.2065e-05, + "loss": 0.445, + "step": 2415 + }, + { + "epoch": 0.13528950610370702, + "grad_norm": 1.1640756130218506, + "learning_rate": 1.2070000000000001e-05, + "loss": 0.4106, + "step": 2416 + }, + { + "epoch": 0.13534550341583604, + "grad_norm": 1.4307230710983276, + "learning_rate": 1.2075e-05, + "loss": 0.4711, + "step": 2417 + }, + { + "epoch": 0.13540150072796506, + "grad_norm": 1.7485276460647583, + "learning_rate": 1.2080000000000001e-05, + "loss": 0.6179, + "step": 2418 + }, + { 
+ "epoch": 0.13545749804009408, + "grad_norm": 1.2945975065231323, + "learning_rate": 1.2085e-05, + "loss": 0.5587, + "step": 2419 + }, + { + "epoch": 0.1355134953522231, + "grad_norm": 1.1517056226730347, + "learning_rate": 1.209e-05, + "loss": 0.3638, + "step": 2420 + }, + { + "epoch": 0.13556949266435211, + "grad_norm": 1.4271259307861328, + "learning_rate": 1.2095e-05, + "loss": 0.4494, + "step": 2421 + }, + { + "epoch": 0.13562548997648113, + "grad_norm": 1.8111350536346436, + "learning_rate": 1.2100000000000001e-05, + "loss": 0.5689, + "step": 2422 + }, + { + "epoch": 0.13568148728861015, + "grad_norm": 1.0819214582443237, + "learning_rate": 1.2105000000000002e-05, + "loss": 0.4173, + "step": 2423 + }, + { + "epoch": 0.13573748460073917, + "grad_norm": 1.2219858169555664, + "learning_rate": 1.2110000000000001e-05, + "loss": 0.4282, + "step": 2424 + }, + { + "epoch": 0.1357934819128682, + "grad_norm": 1.6722608804702759, + "learning_rate": 1.2115e-05, + "loss": 0.5747, + "step": 2425 + }, + { + "epoch": 0.1358494792249972, + "grad_norm": 1.2269113063812256, + "learning_rate": 1.2120000000000001e-05, + "loss": 0.3372, + "step": 2426 + }, + { + "epoch": 0.13590547653712623, + "grad_norm": 1.5652239322662354, + "learning_rate": 1.2125e-05, + "loss": 0.4925, + "step": 2427 + }, + { + "epoch": 0.13596147384925525, + "grad_norm": 1.2126359939575195, + "learning_rate": 1.213e-05, + "loss": 0.3988, + "step": 2428 + }, + { + "epoch": 0.13601747116138424, + "grad_norm": 1.5335955619812012, + "learning_rate": 1.2135e-05, + "loss": 0.4273, + "step": 2429 + }, + { + "epoch": 0.13607346847351326, + "grad_norm": 1.3891562223434448, + "learning_rate": 1.214e-05, + "loss": 0.4148, + "step": 2430 + }, + { + "epoch": 0.13612946578564228, + "grad_norm": 1.6771533489227295, + "learning_rate": 1.2145000000000001e-05, + "loss": 0.5713, + "step": 2431 + }, + { + "epoch": 0.1361854630977713, + "grad_norm": 1.255646824836731, + "learning_rate": 1.215e-05, + "loss": 0.4457, + "step": 
2432 + }, + { + "epoch": 0.13624146040990032, + "grad_norm": 1.1224887371063232, + "learning_rate": 1.2155000000000001e-05, + "loss": 0.3463, + "step": 2433 + }, + { + "epoch": 0.13629745772202934, + "grad_norm": 1.122126817703247, + "learning_rate": 1.216e-05, + "loss": 0.4545, + "step": 2434 + }, + { + "epoch": 0.13635345503415836, + "grad_norm": 1.541749119758606, + "learning_rate": 1.2165e-05, + "loss": 0.4453, + "step": 2435 + }, + { + "epoch": 0.13640945234628737, + "grad_norm": 1.2225465774536133, + "learning_rate": 1.217e-05, + "loss": 0.5011, + "step": 2436 + }, + { + "epoch": 0.1364654496584164, + "grad_norm": 1.091254711151123, + "learning_rate": 1.2175e-05, + "loss": 0.4753, + "step": 2437 + }, + { + "epoch": 0.1365214469705454, + "grad_norm": 1.122660517692566, + "learning_rate": 1.2180000000000002e-05, + "loss": 0.438, + "step": 2438 + }, + { + "epoch": 0.13657744428267443, + "grad_norm": 1.0282984972000122, + "learning_rate": 1.2185000000000001e-05, + "loss": 0.4516, + "step": 2439 + }, + { + "epoch": 0.13663344159480345, + "grad_norm": 1.0135828256607056, + "learning_rate": 1.219e-05, + "loss": 0.4791, + "step": 2440 + }, + { + "epoch": 0.13668943890693247, + "grad_norm": 1.4245688915252686, + "learning_rate": 1.2195000000000001e-05, + "loss": 0.462, + "step": 2441 + }, + { + "epoch": 0.1367454362190615, + "grad_norm": 1.4579600095748901, + "learning_rate": 1.22e-05, + "loss": 0.5111, + "step": 2442 + }, + { + "epoch": 0.1368014335311905, + "grad_norm": 1.064908742904663, + "learning_rate": 1.2205000000000001e-05, + "loss": 0.368, + "step": 2443 + }, + { + "epoch": 0.13685743084331953, + "grad_norm": 1.4000297784805298, + "learning_rate": 1.221e-05, + "loss": 0.3009, + "step": 2444 + }, + { + "epoch": 0.13691342815544855, + "grad_norm": 1.2969307899475098, + "learning_rate": 1.2215e-05, + "loss": 0.3928, + "step": 2445 + }, + { + "epoch": 0.13696942546757757, + "grad_norm": 3.748359441757202, + "learning_rate": 1.2220000000000002e-05, + "loss": 
0.3718, + "step": 2446 + }, + { + "epoch": 0.13702542277970658, + "grad_norm": 1.4786757230758667, + "learning_rate": 1.2225e-05, + "loss": 0.4797, + "step": 2447 + }, + { + "epoch": 0.1370814200918356, + "grad_norm": 1.3004447221755981, + "learning_rate": 1.2230000000000001e-05, + "loss": 0.5356, + "step": 2448 + }, + { + "epoch": 0.13713741740396462, + "grad_norm": 1.1472911834716797, + "learning_rate": 1.2235e-05, + "loss": 0.3354, + "step": 2449 + }, + { + "epoch": 0.13719341471609361, + "grad_norm": 1.274685025215149, + "learning_rate": 1.224e-05, + "loss": 0.4278, + "step": 2450 + }, + { + "epoch": 0.13724941202822263, + "grad_norm": 1.412546157836914, + "learning_rate": 1.2245e-05, + "loss": 0.5157, + "step": 2451 + }, + { + "epoch": 0.13730540934035165, + "grad_norm": 1.0137228965759277, + "learning_rate": 1.225e-05, + "loss": 0.4446, + "step": 2452 + }, + { + "epoch": 0.13736140665248067, + "grad_norm": 2.3677423000335693, + "learning_rate": 1.2255e-05, + "loss": 0.5029, + "step": 2453 + }, + { + "epoch": 0.1374174039646097, + "grad_norm": 1.2410162687301636, + "learning_rate": 1.2260000000000001e-05, + "loss": 0.4446, + "step": 2454 + }, + { + "epoch": 0.1374734012767387, + "grad_norm": 1.2138422727584839, + "learning_rate": 1.2265e-05, + "loss": 0.4604, + "step": 2455 + }, + { + "epoch": 0.13752939858886773, + "grad_norm": 1.2067878246307373, + "learning_rate": 1.2270000000000001e-05, + "loss": 0.4757, + "step": 2456 + }, + { + "epoch": 0.13758539590099675, + "grad_norm": 1.2064911127090454, + "learning_rate": 1.2275e-05, + "loss": 0.4175, + "step": 2457 + }, + { + "epoch": 0.13764139321312577, + "grad_norm": 1.3723119497299194, + "learning_rate": 1.2280000000000001e-05, + "loss": 0.4115, + "step": 2458 + }, + { + "epoch": 0.1376973905252548, + "grad_norm": 1.29324209690094, + "learning_rate": 1.2285e-05, + "loss": 0.4133, + "step": 2459 + }, + { + "epoch": 0.1377533878373838, + "grad_norm": 1.12079656124115, + "learning_rate": 1.2290000000000001e-05, + 
"loss": 0.4476, + "step": 2460 + }, + { + "epoch": 0.13780938514951283, + "grad_norm": 1.2613892555236816, + "learning_rate": 1.2295000000000002e-05, + "loss": 0.5619, + "step": 2461 + }, + { + "epoch": 0.13786538246164184, + "grad_norm": 1.4234511852264404, + "learning_rate": 1.23e-05, + "loss": 0.5854, + "step": 2462 + }, + { + "epoch": 0.13792137977377086, + "grad_norm": 1.1991769075393677, + "learning_rate": 1.2305000000000002e-05, + "loss": 0.4518, + "step": 2463 + }, + { + "epoch": 0.13797737708589988, + "grad_norm": 1.281947135925293, + "learning_rate": 1.231e-05, + "loss": 0.4044, + "step": 2464 + }, + { + "epoch": 0.1380333743980289, + "grad_norm": 1.3853332996368408, + "learning_rate": 1.2315e-05, + "loss": 0.6167, + "step": 2465 + }, + { + "epoch": 0.13808937171015792, + "grad_norm": 1.23692786693573, + "learning_rate": 1.232e-05, + "loss": 0.4404, + "step": 2466 + }, + { + "epoch": 0.13814536902228694, + "grad_norm": 14.155194282531738, + "learning_rate": 1.2325e-05, + "loss": 0.7027, + "step": 2467 + }, + { + "epoch": 0.13820136633441596, + "grad_norm": 1.47590172290802, + "learning_rate": 1.233e-05, + "loss": 0.3997, + "step": 2468 + }, + { + "epoch": 0.13825736364654498, + "grad_norm": 1.3184940814971924, + "learning_rate": 1.2335000000000001e-05, + "loss": 0.3687, + "step": 2469 + }, + { + "epoch": 0.13831336095867397, + "grad_norm": 1.1062796115875244, + "learning_rate": 1.234e-05, + "loss": 0.4121, + "step": 2470 + }, + { + "epoch": 0.138369358270803, + "grad_norm": 1.1457158327102661, + "learning_rate": 1.2345000000000001e-05, + "loss": 0.5695, + "step": 2471 + }, + { + "epoch": 0.138425355582932, + "grad_norm": 1.1479376554489136, + "learning_rate": 1.235e-05, + "loss": 0.5058, + "step": 2472 + }, + { + "epoch": 0.13848135289506103, + "grad_norm": 1.411997675895691, + "learning_rate": 1.2355e-05, + "loss": 0.5817, + "step": 2473 + }, + { + "epoch": 0.13853735020719005, + "grad_norm": 1.0818930864334106, + "learning_rate": 1.236e-05, + "loss": 
0.5058, + "step": 2474 + }, + { + "epoch": 0.13859334751931907, + "grad_norm": 1.1070023775100708, + "learning_rate": 1.2365e-05, + "loss": 0.3422, + "step": 2475 + }, + { + "epoch": 0.13864934483144808, + "grad_norm": 1.2352498769760132, + "learning_rate": 1.2370000000000002e-05, + "loss": 0.3878, + "step": 2476 + }, + { + "epoch": 0.1387053421435771, + "grad_norm": 1.0397839546203613, + "learning_rate": 1.2375000000000001e-05, + "loss": 0.3501, + "step": 2477 + }, + { + "epoch": 0.13876133945570612, + "grad_norm": 1.1352453231811523, + "learning_rate": 1.238e-05, + "loss": 0.3688, + "step": 2478 + }, + { + "epoch": 0.13881733676783514, + "grad_norm": 1.3301494121551514, + "learning_rate": 1.2385000000000001e-05, + "loss": 0.5199, + "step": 2479 + }, + { + "epoch": 0.13887333407996416, + "grad_norm": 1.426438570022583, + "learning_rate": 1.239e-05, + "loss": 0.453, + "step": 2480 + }, + { + "epoch": 0.13892933139209318, + "grad_norm": 1.164884090423584, + "learning_rate": 1.2395e-05, + "loss": 0.323, + "step": 2481 + }, + { + "epoch": 0.1389853287042222, + "grad_norm": 1.1007789373397827, + "learning_rate": 1.24e-05, + "loss": 0.3785, + "step": 2482 + }, + { + "epoch": 0.13904132601635122, + "grad_norm": 1.163779854774475, + "learning_rate": 1.2405e-05, + "loss": 0.4914, + "step": 2483 + }, + { + "epoch": 0.13909732332848024, + "grad_norm": 1.3361643552780151, + "learning_rate": 1.2410000000000001e-05, + "loss": 0.4419, + "step": 2484 + }, + { + "epoch": 0.13915332064060926, + "grad_norm": 1.0904892683029175, + "learning_rate": 1.2415e-05, + "loss": 0.4141, + "step": 2485 + }, + { + "epoch": 0.13920931795273828, + "grad_norm": 1.2341094017028809, + "learning_rate": 1.2420000000000001e-05, + "loss": 0.4362, + "step": 2486 + }, + { + "epoch": 0.1392653152648673, + "grad_norm": 1.0510798692703247, + "learning_rate": 1.2425e-05, + "loss": 0.415, + "step": 2487 + }, + { + "epoch": 0.13932131257699631, + "grad_norm": 0.9391849637031555, + "learning_rate": 1.243e-05, + 
"loss": 0.4643, + "step": 2488 + }, + { + "epoch": 0.13937730988912533, + "grad_norm": 1.0622152090072632, + "learning_rate": 1.2435e-05, + "loss": 0.3353, + "step": 2489 + }, + { + "epoch": 0.13943330720125435, + "grad_norm": 1.3571791648864746, + "learning_rate": 1.244e-05, + "loss": 0.4687, + "step": 2490 + }, + { + "epoch": 0.13948930451338334, + "grad_norm": 1.1573200225830078, + "learning_rate": 1.2445e-05, + "loss": 0.3999, + "step": 2491 + }, + { + "epoch": 0.13954530182551236, + "grad_norm": 1.1886022090911865, + "learning_rate": 1.2450000000000001e-05, + "loss": 0.3521, + "step": 2492 + }, + { + "epoch": 0.13960129913764138, + "grad_norm": 2.901169538497925, + "learning_rate": 1.2455e-05, + "loss": 0.4332, + "step": 2493 + }, + { + "epoch": 0.1396572964497704, + "grad_norm": 2.1086270809173584, + "learning_rate": 1.2460000000000001e-05, + "loss": 0.4126, + "step": 2494 + }, + { + "epoch": 0.13971329376189942, + "grad_norm": 1.3888367414474487, + "learning_rate": 1.2465e-05, + "loss": 0.4817, + "step": 2495 + }, + { + "epoch": 0.13976929107402844, + "grad_norm": 1.083674669265747, + "learning_rate": 1.2470000000000001e-05, + "loss": 0.383, + "step": 2496 + }, + { + "epoch": 0.13982528838615746, + "grad_norm": 1.3220771551132202, + "learning_rate": 1.2475e-05, + "loss": 0.5275, + "step": 2497 + }, + { + "epoch": 0.13988128569828648, + "grad_norm": 1.1915440559387207, + "learning_rate": 1.248e-05, + "loss": 0.4635, + "step": 2498 + }, + { + "epoch": 0.1399372830104155, + "grad_norm": 1.0927143096923828, + "learning_rate": 1.2485000000000002e-05, + "loss": 0.4208, + "step": 2499 + }, + { + "epoch": 0.13999328032254452, + "grad_norm": 1.2048420906066895, + "learning_rate": 1.249e-05, + "loss": 0.4241, + "step": 2500 + }, + { + "epoch": 0.14004927763467354, + "grad_norm": 1.1832644939422607, + "learning_rate": 1.2495000000000001e-05, + "loss": 0.5051, + "step": 2501 + }, + { + "epoch": 0.14010527494680255, + "grad_norm": 1.1782584190368652, + "learning_rate": 
1.25e-05, + "loss": 0.5329, + "step": 2502 + }, + { + "epoch": 0.14016127225893157, + "grad_norm": 1.2893174886703491, + "learning_rate": 1.2505e-05, + "loss": 0.4063, + "step": 2503 + }, + { + "epoch": 0.1402172695710606, + "grad_norm": 1.0172559022903442, + "learning_rate": 1.2509999999999999e-05, + "loss": 0.3683, + "step": 2504 + }, + { + "epoch": 0.1402732668831896, + "grad_norm": 1.2625501155853271, + "learning_rate": 1.2515000000000001e-05, + "loss": 0.3565, + "step": 2505 + }, + { + "epoch": 0.14032926419531863, + "grad_norm": 1.2588109970092773, + "learning_rate": 1.252e-05, + "loss": 0.4849, + "step": 2506 + }, + { + "epoch": 0.14038526150744765, + "grad_norm": 1.3349969387054443, + "learning_rate": 1.2525000000000001e-05, + "loss": 0.5433, + "step": 2507 + }, + { + "epoch": 0.14044125881957667, + "grad_norm": 1.1840282678604126, + "learning_rate": 1.253e-05, + "loss": 0.5035, + "step": 2508 + }, + { + "epoch": 0.1404972561317057, + "grad_norm": 1.236934781074524, + "learning_rate": 1.2535e-05, + "loss": 0.3751, + "step": 2509 + }, + { + "epoch": 0.1405532534438347, + "grad_norm": 1.3336421251296997, + "learning_rate": 1.2540000000000002e-05, + "loss": 0.5805, + "step": 2510 + }, + { + "epoch": 0.14060925075596373, + "grad_norm": 1.1406078338623047, + "learning_rate": 1.2545000000000001e-05, + "loss": 0.4124, + "step": 2511 + }, + { + "epoch": 0.14066524806809272, + "grad_norm": 1.195516586303711, + "learning_rate": 1.255e-05, + "loss": 0.4189, + "step": 2512 + }, + { + "epoch": 0.14072124538022174, + "grad_norm": 1.338142991065979, + "learning_rate": 1.2555000000000001e-05, + "loss": 0.46, + "step": 2513 + }, + { + "epoch": 0.14077724269235076, + "grad_norm": 1.2000536918640137, + "learning_rate": 1.256e-05, + "loss": 0.5513, + "step": 2514 + }, + { + "epoch": 0.14083324000447978, + "grad_norm": 1.066293716430664, + "learning_rate": 1.2565000000000003e-05, + "loss": 0.3948, + "step": 2515 + }, + { + "epoch": 0.1408892373166088, + "grad_norm": 
1.0766243934631348, + "learning_rate": 1.2570000000000002e-05, + "loss": 0.5143, + "step": 2516 + }, + { + "epoch": 0.14094523462873781, + "grad_norm": 1.296629786491394, + "learning_rate": 1.2575e-05, + "loss": 0.4408, + "step": 2517 + }, + { + "epoch": 0.14100123194086683, + "grad_norm": 1.11188542842865, + "learning_rate": 1.258e-05, + "loss": 0.4171, + "step": 2518 + }, + { + "epoch": 0.14105722925299585, + "grad_norm": 1.1227508783340454, + "learning_rate": 1.2584999999999999e-05, + "loss": 0.571, + "step": 2519 + }, + { + "epoch": 0.14111322656512487, + "grad_norm": 1.1131638288497925, + "learning_rate": 1.2590000000000001e-05, + "loss": 0.5317, + "step": 2520 + }, + { + "epoch": 0.1411692238772539, + "grad_norm": 1.5281466245651245, + "learning_rate": 1.2595e-05, + "loss": 0.3774, + "step": 2521 + }, + { + "epoch": 0.1412252211893829, + "grad_norm": 1.1421079635620117, + "learning_rate": 1.2600000000000001e-05, + "loss": 0.4091, + "step": 2522 + }, + { + "epoch": 0.14128121850151193, + "grad_norm": 1.5237642526626587, + "learning_rate": 1.2605e-05, + "loss": 0.446, + "step": 2523 + }, + { + "epoch": 0.14133721581364095, + "grad_norm": 1.4897043704986572, + "learning_rate": 1.261e-05, + "loss": 0.5319, + "step": 2524 + }, + { + "epoch": 0.14139321312576997, + "grad_norm": 1.6626853942871094, + "learning_rate": 1.2615000000000002e-05, + "loss": 0.5172, + "step": 2525 + }, + { + "epoch": 0.141449210437899, + "grad_norm": 1.3545503616333008, + "learning_rate": 1.2620000000000001e-05, + "loss": 0.5435, + "step": 2526 + }, + { + "epoch": 0.141505207750028, + "grad_norm": 3.209622383117676, + "learning_rate": 1.2625e-05, + "loss": 0.528, + "step": 2527 + }, + { + "epoch": 0.14156120506215703, + "grad_norm": 1.1368006467819214, + "learning_rate": 1.263e-05, + "loss": 0.4018, + "step": 2528 + }, + { + "epoch": 0.14161720237428604, + "grad_norm": 1.019187092781067, + "learning_rate": 1.2635e-05, + "loss": 0.329, + "step": 2529 + }, + { + "epoch": 0.14167319968641506, 
+ "grad_norm": 1.2222051620483398, + "learning_rate": 1.2640000000000003e-05, + "loss": 0.3975, + "step": 2530 + }, + { + "epoch": 0.14172919699854408, + "grad_norm": 1.1408919095993042, + "learning_rate": 1.2645000000000002e-05, + "loss": 0.4006, + "step": 2531 + }, + { + "epoch": 0.14178519431067307, + "grad_norm": 1.2872763872146606, + "learning_rate": 1.2650000000000001e-05, + "loss": 0.3789, + "step": 2532 + }, + { + "epoch": 0.1418411916228021, + "grad_norm": 2.181398868560791, + "learning_rate": 1.2655e-05, + "loss": 0.3909, + "step": 2533 + }, + { + "epoch": 0.1418971889349311, + "grad_norm": 1.4929990768432617, + "learning_rate": 1.2659999999999999e-05, + "loss": 0.4905, + "step": 2534 + }, + { + "epoch": 0.14195318624706013, + "grad_norm": 1.4117430448532104, + "learning_rate": 1.2665000000000002e-05, + "loss": 0.4487, + "step": 2535 + }, + { + "epoch": 0.14200918355918915, + "grad_norm": 1.4963791370391846, + "learning_rate": 1.267e-05, + "loss": 0.5112, + "step": 2536 + }, + { + "epoch": 0.14206518087131817, + "grad_norm": 1.105371117591858, + "learning_rate": 1.2675000000000001e-05, + "loss": 0.3966, + "step": 2537 + }, + { + "epoch": 0.1421211781834472, + "grad_norm": 1.4377449750900269, + "learning_rate": 1.268e-05, + "loss": 0.4421, + "step": 2538 + }, + { + "epoch": 0.1421771754955762, + "grad_norm": 1.1643390655517578, + "learning_rate": 1.2685e-05, + "loss": 0.3825, + "step": 2539 + }, + { + "epoch": 0.14223317280770523, + "grad_norm": 1.2283012866973877, + "learning_rate": 1.2690000000000002e-05, + "loss": 0.4719, + "step": 2540 + }, + { + "epoch": 0.14228917011983425, + "grad_norm": 1.1322824954986572, + "learning_rate": 1.2695000000000001e-05, + "loss": 0.3118, + "step": 2541 + }, + { + "epoch": 0.14234516743196327, + "grad_norm": 1.1129308938980103, + "learning_rate": 1.27e-05, + "loss": 0.3436, + "step": 2542 + }, + { + "epoch": 0.14240116474409228, + "grad_norm": 1.2738077640533447, + "learning_rate": 1.2705e-05, + "loss": 0.3745, + "step": 
2543 + }, + { + "epoch": 0.1424571620562213, + "grad_norm": 1.1257846355438232, + "learning_rate": 1.271e-05, + "loss": 0.3746, + "step": 2544 + }, + { + "epoch": 0.14251315936835032, + "grad_norm": 1.2664601802825928, + "learning_rate": 1.2715000000000001e-05, + "loss": 0.5354, + "step": 2545 + }, + { + "epoch": 0.14256915668047934, + "grad_norm": 1.3434889316558838, + "learning_rate": 1.2720000000000002e-05, + "loss": 0.4078, + "step": 2546 + }, + { + "epoch": 0.14262515399260836, + "grad_norm": 1.602239727973938, + "learning_rate": 1.2725000000000001e-05, + "loss": 0.4273, + "step": 2547 + }, + { + "epoch": 0.14268115130473738, + "grad_norm": 1.0283304452896118, + "learning_rate": 1.273e-05, + "loss": 0.3713, + "step": 2548 + }, + { + "epoch": 0.1427371486168664, + "grad_norm": 1.1225849390029907, + "learning_rate": 1.2735e-05, + "loss": 0.5288, + "step": 2549 + }, + { + "epoch": 0.14279314592899542, + "grad_norm": 1.2365565299987793, + "learning_rate": 1.2740000000000002e-05, + "loss": 0.3731, + "step": 2550 + }, + { + "epoch": 0.14284914324112444, + "grad_norm": 1.0750316381454468, + "learning_rate": 1.2745e-05, + "loss": 0.3926, + "step": 2551 + }, + { + "epoch": 0.14290514055325346, + "grad_norm": 1.3363348245620728, + "learning_rate": 1.2750000000000002e-05, + "loss": 0.4665, + "step": 2552 + }, + { + "epoch": 0.14296113786538245, + "grad_norm": 1.021571159362793, + "learning_rate": 1.2755e-05, + "loss": 0.4289, + "step": 2553 + }, + { + "epoch": 0.14301713517751147, + "grad_norm": 1.1585955619812012, + "learning_rate": 1.276e-05, + "loss": 0.3102, + "step": 2554 + }, + { + "epoch": 0.1430731324896405, + "grad_norm": 1.2394684553146362, + "learning_rate": 1.2765000000000002e-05, + "loss": 0.4, + "step": 2555 + }, + { + "epoch": 0.1431291298017695, + "grad_norm": 1.216558575630188, + "learning_rate": 1.2770000000000001e-05, + "loss": 0.4016, + "step": 2556 + }, + { + "epoch": 0.14318512711389852, + "grad_norm": 1.206903338432312, + "learning_rate": 
1.2775e-05, + "loss": 0.4442, + "step": 2557 + }, + { + "epoch": 0.14324112442602754, + "grad_norm": 0.900111198425293, + "learning_rate": 1.278e-05, + "loss": 0.3385, + "step": 2558 + }, + { + "epoch": 0.14329712173815656, + "grad_norm": 1.3177870512008667, + "learning_rate": 1.2785e-05, + "loss": 0.3729, + "step": 2559 + }, + { + "epoch": 0.14335311905028558, + "grad_norm": 1.273997187614441, + "learning_rate": 1.2790000000000001e-05, + "loss": 0.3623, + "step": 2560 + }, + { + "epoch": 0.1434091163624146, + "grad_norm": 1.188696026802063, + "learning_rate": 1.2795000000000002e-05, + "loss": 0.3905, + "step": 2561 + }, + { + "epoch": 0.14346511367454362, + "grad_norm": 1.4021480083465576, + "learning_rate": 1.2800000000000001e-05, + "loss": 0.5325, + "step": 2562 + }, + { + "epoch": 0.14352111098667264, + "grad_norm": 1.4283920526504517, + "learning_rate": 1.2805e-05, + "loss": 0.4506, + "step": 2563 + }, + { + "epoch": 0.14357710829880166, + "grad_norm": 1.5277773141860962, + "learning_rate": 1.281e-05, + "loss": 0.5579, + "step": 2564 + }, + { + "epoch": 0.14363310561093068, + "grad_norm": 1.1661988496780396, + "learning_rate": 1.2814999999999998e-05, + "loss": 0.3988, + "step": 2565 + }, + { + "epoch": 0.1436891029230597, + "grad_norm": 1.3075827360153198, + "learning_rate": 1.2820000000000001e-05, + "loss": 0.4105, + "step": 2566 + }, + { + "epoch": 0.14374510023518872, + "grad_norm": 1.5823769569396973, + "learning_rate": 1.2825000000000002e-05, + "loss": 0.5679, + "step": 2567 + }, + { + "epoch": 0.14380109754731774, + "grad_norm": 1.0940037965774536, + "learning_rate": 1.283e-05, + "loss": 0.3605, + "step": 2568 + }, + { + "epoch": 0.14385709485944675, + "grad_norm": 1.1388133764266968, + "learning_rate": 1.2835e-05, + "loss": 0.3079, + "step": 2569 + }, + { + "epoch": 0.14391309217157577, + "grad_norm": 1.4478360414505005, + "learning_rate": 1.2839999999999999e-05, + "loss": 0.4968, + "step": 2570 + }, + { + "epoch": 0.1439690894837048, + "grad_norm": 
1.2097203731536865, + "learning_rate": 1.2845000000000002e-05, + "loss": 0.4297, + "step": 2571 + }, + { + "epoch": 0.1440250867958338, + "grad_norm": 1.618012547492981, + "learning_rate": 1.285e-05, + "loss": 0.419, + "step": 2572 + }, + { + "epoch": 0.14408108410796283, + "grad_norm": 1.4660046100616455, + "learning_rate": 1.2855e-05, + "loss": 0.3765, + "step": 2573 + }, + { + "epoch": 0.14413708142009182, + "grad_norm": 1.2173787355422974, + "learning_rate": 1.286e-05, + "loss": 0.3228, + "step": 2574 + }, + { + "epoch": 0.14419307873222084, + "grad_norm": 1.3873306512832642, + "learning_rate": 1.2865e-05, + "loss": 0.5869, + "step": 2575 + }, + { + "epoch": 0.14424907604434986, + "grad_norm": 1.5580289363861084, + "learning_rate": 1.2870000000000002e-05, + "loss": 0.4932, + "step": 2576 + }, + { + "epoch": 0.14430507335647888, + "grad_norm": 1.7026492357254028, + "learning_rate": 1.2875000000000001e-05, + "loss": 0.5149, + "step": 2577 + }, + { + "epoch": 0.1443610706686079, + "grad_norm": 1.3750256299972534, + "learning_rate": 1.288e-05, + "loss": 0.495, + "step": 2578 + }, + { + "epoch": 0.14441706798073692, + "grad_norm": 1.2004586458206177, + "learning_rate": 1.2885e-05, + "loss": 0.4588, + "step": 2579 + }, + { + "epoch": 0.14447306529286594, + "grad_norm": 1.4534201622009277, + "learning_rate": 1.2889999999999999e-05, + "loss": 0.5208, + "step": 2580 + }, + { + "epoch": 0.14452906260499496, + "grad_norm": 1.2705984115600586, + "learning_rate": 1.2895000000000001e-05, + "loss": 0.3537, + "step": 2581 + }, + { + "epoch": 0.14458505991712398, + "grad_norm": 1.1209765672683716, + "learning_rate": 1.29e-05, + "loss": 0.4387, + "step": 2582 + }, + { + "epoch": 0.144641057229253, + "grad_norm": 1.1241347789764404, + "learning_rate": 1.2905000000000001e-05, + "loss": 0.3922, + "step": 2583 + }, + { + "epoch": 0.14469705454138201, + "grad_norm": 1.6147980690002441, + "learning_rate": 1.291e-05, + "loss": 0.4985, + "step": 2584 + }, + { + "epoch": 
0.14475305185351103, + "grad_norm": 1.542745590209961, + "learning_rate": 1.2915e-05, + "loss": 0.6905, + "step": 2585 + }, + { + "epoch": 0.14480904916564005, + "grad_norm": 1.2592793703079224, + "learning_rate": 1.2920000000000002e-05, + "loss": 0.4949, + "step": 2586 + }, + { + "epoch": 0.14486504647776907, + "grad_norm": 1.1622910499572754, + "learning_rate": 1.2925e-05, + "loss": 0.3956, + "step": 2587 + }, + { + "epoch": 0.1449210437898981, + "grad_norm": 1.2540090084075928, + "learning_rate": 1.293e-05, + "loss": 0.5471, + "step": 2588 + }, + { + "epoch": 0.1449770411020271, + "grad_norm": 1.256731390953064, + "learning_rate": 1.2935e-05, + "loss": 0.3732, + "step": 2589 + }, + { + "epoch": 0.14503303841415613, + "grad_norm": 1.5238062143325806, + "learning_rate": 1.294e-05, + "loss": 0.6349, + "step": 2590 + }, + { + "epoch": 0.14508903572628515, + "grad_norm": 1.0773160457611084, + "learning_rate": 1.2945000000000002e-05, + "loss": 0.4166, + "step": 2591 + }, + { + "epoch": 0.14514503303841417, + "grad_norm": 1.1376806497573853, + "learning_rate": 1.2950000000000001e-05, + "loss": 0.4292, + "step": 2592 + }, + { + "epoch": 0.1452010303505432, + "grad_norm": 1.2171424627304077, + "learning_rate": 1.2955e-05, + "loss": 0.3987, + "step": 2593 + }, + { + "epoch": 0.14525702766267218, + "grad_norm": 1.1606625318527222, + "learning_rate": 1.296e-05, + "loss": 0.4594, + "step": 2594 + }, + { + "epoch": 0.1453130249748012, + "grad_norm": 1.031876802444458, + "learning_rate": 1.2964999999999999e-05, + "loss": 0.5854, + "step": 2595 + }, + { + "epoch": 0.14536902228693022, + "grad_norm": 1.1607109308242798, + "learning_rate": 1.2970000000000001e-05, + "loss": 0.4157, + "step": 2596 + }, + { + "epoch": 0.14542501959905924, + "grad_norm": 1.0473979711532593, + "learning_rate": 1.2975e-05, + "loss": 0.426, + "step": 2597 + }, + { + "epoch": 0.14548101691118825, + "grad_norm": 1.1430045366287231, + "learning_rate": 1.2980000000000001e-05, + "loss": 0.4142, + "step": 
2598 + }, + { + "epoch": 0.14553701422331727, + "grad_norm": 1.156032919883728, + "learning_rate": 1.2985e-05, + "loss": 0.4853, + "step": 2599 + }, + { + "epoch": 0.1455930115354463, + "grad_norm": 1.1408398151397705, + "learning_rate": 1.299e-05, + "loss": 0.4186, + "step": 2600 + }, + { + "epoch": 0.1456490088475753, + "grad_norm": 1.059762716293335, + "learning_rate": 1.2995000000000002e-05, + "loss": 0.4544, + "step": 2601 + }, + { + "epoch": 0.14570500615970433, + "grad_norm": 1.4854395389556885, + "learning_rate": 1.3000000000000001e-05, + "loss": 0.7862, + "step": 2602 + }, + { + "epoch": 0.14576100347183335, + "grad_norm": 1.234014868736267, + "learning_rate": 1.3005e-05, + "loss": 0.4272, + "step": 2603 + }, + { + "epoch": 0.14581700078396237, + "grad_norm": 5.808340072631836, + "learning_rate": 1.301e-05, + "loss": 0.4147, + "step": 2604 + }, + { + "epoch": 0.1458729980960914, + "grad_norm": 1.2030842304229736, + "learning_rate": 1.3015e-05, + "loss": 0.3735, + "step": 2605 + }, + { + "epoch": 0.1459289954082204, + "grad_norm": 1.3522212505340576, + "learning_rate": 1.3020000000000002e-05, + "loss": 0.3872, + "step": 2606 + }, + { + "epoch": 0.14598499272034943, + "grad_norm": 1.1150144338607788, + "learning_rate": 1.3025000000000002e-05, + "loss": 0.5558, + "step": 2607 + }, + { + "epoch": 0.14604099003247845, + "grad_norm": 1.0971496105194092, + "learning_rate": 1.303e-05, + "loss": 0.3065, + "step": 2608 + }, + { + "epoch": 0.14609698734460747, + "grad_norm": 1.3269139528274536, + "learning_rate": 1.3035e-05, + "loss": 0.4228, + "step": 2609 + }, + { + "epoch": 0.14615298465673648, + "grad_norm": 1.054341435432434, + "learning_rate": 1.3039999999999999e-05, + "loss": 0.3975, + "step": 2610 + }, + { + "epoch": 0.1462089819688655, + "grad_norm": 1.7736667394638062, + "learning_rate": 1.3045000000000001e-05, + "loss": 0.4209, + "step": 2611 + }, + { + "epoch": 0.14626497928099452, + "grad_norm": 1.1408703327178955, + "learning_rate": 1.305e-05, + "loss": 
0.3248, + "step": 2612 + }, + { + "epoch": 0.14632097659312354, + "grad_norm": 1.2324352264404297, + "learning_rate": 1.3055000000000001e-05, + "loss": 0.6046, + "step": 2613 + }, + { + "epoch": 0.14637697390525256, + "grad_norm": 1.1829913854599, + "learning_rate": 1.306e-05, + "loss": 0.364, + "step": 2614 + }, + { + "epoch": 0.14643297121738155, + "grad_norm": 1.1477619409561157, + "learning_rate": 1.3065e-05, + "loss": 0.3573, + "step": 2615 + }, + { + "epoch": 0.14648896852951057, + "grad_norm": 1.2593798637390137, + "learning_rate": 1.3070000000000002e-05, + "loss": 0.3189, + "step": 2616 + }, + { + "epoch": 0.1465449658416396, + "grad_norm": 1.5565099716186523, + "learning_rate": 1.3075000000000001e-05, + "loss": 0.4649, + "step": 2617 + }, + { + "epoch": 0.1466009631537686, + "grad_norm": 1.105677604675293, + "learning_rate": 1.308e-05, + "loss": 0.4293, + "step": 2618 + }, + { + "epoch": 0.14665696046589763, + "grad_norm": 1.015621304512024, + "learning_rate": 1.3085e-05, + "loss": 0.3262, + "step": 2619 + }, + { + "epoch": 0.14671295777802665, + "grad_norm": 1.252323031425476, + "learning_rate": 1.309e-05, + "loss": 0.4461, + "step": 2620 + }, + { + "epoch": 0.14676895509015567, + "grad_norm": 1.135851502418518, + "learning_rate": 1.3095000000000003e-05, + "loss": 0.4974, + "step": 2621 + }, + { + "epoch": 0.1468249524022847, + "grad_norm": 1.3516892194747925, + "learning_rate": 1.3100000000000002e-05, + "loss": 0.5207, + "step": 2622 + }, + { + "epoch": 0.1468809497144137, + "grad_norm": 1.4236551523208618, + "learning_rate": 1.3105e-05, + "loss": 0.448, + "step": 2623 + }, + { + "epoch": 0.14693694702654272, + "grad_norm": 1.2360954284667969, + "learning_rate": 1.311e-05, + "loss": 0.4699, + "step": 2624 + }, + { + "epoch": 0.14699294433867174, + "grad_norm": 1.2243598699569702, + "learning_rate": 1.3114999999999999e-05, + "loss": 0.4643, + "step": 2625 + }, + { + "epoch": 0.14704894165080076, + "grad_norm": 1.3075956106185913, + "learning_rate": 
1.3120000000000001e-05, + "loss": 0.6034, + "step": 2626 + }, + { + "epoch": 0.14710493896292978, + "grad_norm": 1.52328360080719, + "learning_rate": 1.3125e-05, + "loss": 0.4696, + "step": 2627 + }, + { + "epoch": 0.1471609362750588, + "grad_norm": 1.355431318283081, + "learning_rate": 1.3130000000000001e-05, + "loss": 0.5116, + "step": 2628 + }, + { + "epoch": 0.14721693358718782, + "grad_norm": 1.3428106307983398, + "learning_rate": 1.3135e-05, + "loss": 0.3957, + "step": 2629 + }, + { + "epoch": 0.14727293089931684, + "grad_norm": 1.2088851928710938, + "learning_rate": 1.314e-05, + "loss": 0.5184, + "step": 2630 + }, + { + "epoch": 0.14732892821144586, + "grad_norm": 1.3796359300613403, + "learning_rate": 1.3145000000000002e-05, + "loss": 0.3684, + "step": 2631 + }, + { + "epoch": 0.14738492552357488, + "grad_norm": 1.294643759727478, + "learning_rate": 1.3150000000000001e-05, + "loss": 0.4408, + "step": 2632 + }, + { + "epoch": 0.1474409228357039, + "grad_norm": 1.2840946912765503, + "learning_rate": 1.3155e-05, + "loss": 0.4452, + "step": 2633 + }, + { + "epoch": 0.14749692014783292, + "grad_norm": 1.0767340660095215, + "learning_rate": 1.316e-05, + "loss": 0.3368, + "step": 2634 + }, + { + "epoch": 0.14755291745996194, + "grad_norm": 1.0889029502868652, + "learning_rate": 1.3165e-05, + "loss": 0.3129, + "step": 2635 + }, + { + "epoch": 0.14760891477209093, + "grad_norm": 1.5532374382019043, + "learning_rate": 1.3170000000000001e-05, + "loss": 0.4134, + "step": 2636 + }, + { + "epoch": 0.14766491208421995, + "grad_norm": 1.4018138647079468, + "learning_rate": 1.3175000000000002e-05, + "loss": 0.6566, + "step": 2637 + }, + { + "epoch": 0.14772090939634897, + "grad_norm": 1.1475685834884644, + "learning_rate": 1.3180000000000001e-05, + "loss": 0.4866, + "step": 2638 + }, + { + "epoch": 0.14777690670847798, + "grad_norm": 1.2613779306411743, + "learning_rate": 1.3185e-05, + "loss": 0.3669, + "step": 2639 + }, + { + "epoch": 0.147832904020607, + "grad_norm": 
1.3189911842346191, + "learning_rate": 1.3189999999999999e-05, + "loss": 0.4983, + "step": 2640 + }, + { + "epoch": 0.14788890133273602, + "grad_norm": 1.0426316261291504, + "learning_rate": 1.3195000000000002e-05, + "loss": 0.3501, + "step": 2641 + }, + { + "epoch": 0.14794489864486504, + "grad_norm": 1.1406067609786987, + "learning_rate": 1.32e-05, + "loss": 0.3735, + "step": 2642 + }, + { + "epoch": 0.14800089595699406, + "grad_norm": 1.1951979398727417, + "learning_rate": 1.3205000000000001e-05, + "loss": 0.4212, + "step": 2643 + }, + { + "epoch": 0.14805689326912308, + "grad_norm": 1.0712980031967163, + "learning_rate": 1.321e-05, + "loss": 0.3888, + "step": 2644 + }, + { + "epoch": 0.1481128905812521, + "grad_norm": 1.1973940134048462, + "learning_rate": 1.3215e-05, + "loss": 0.4139, + "step": 2645 + }, + { + "epoch": 0.14816888789338112, + "grad_norm": 1.3590928316116333, + "learning_rate": 1.3220000000000002e-05, + "loss": 0.5349, + "step": 2646 + }, + { + "epoch": 0.14822488520551014, + "grad_norm": 1.2918621301651, + "learning_rate": 1.3225000000000001e-05, + "loss": 0.442, + "step": 2647 + }, + { + "epoch": 0.14828088251763916, + "grad_norm": 1.199952483177185, + "learning_rate": 1.323e-05, + "loss": 0.5537, + "step": 2648 + }, + { + "epoch": 0.14833687982976818, + "grad_norm": 1.1175743341445923, + "learning_rate": 1.3235e-05, + "loss": 0.4777, + "step": 2649 + }, + { + "epoch": 0.1483928771418972, + "grad_norm": 1.4558801651000977, + "learning_rate": 1.324e-05, + "loss": 0.6484, + "step": 2650 + }, + { + "epoch": 0.14844887445402621, + "grad_norm": 1.170845866203308, + "learning_rate": 1.3245000000000001e-05, + "loss": 0.4357, + "step": 2651 + }, + { + "epoch": 0.14850487176615523, + "grad_norm": 1.2923656702041626, + "learning_rate": 1.3250000000000002e-05, + "loss": 0.5244, + "step": 2652 + }, + { + "epoch": 0.14856086907828425, + "grad_norm": 1.153900146484375, + "learning_rate": 1.3255000000000001e-05, + "loss": 0.4252, + "step": 2653 + }, + { + 
"epoch": 0.14861686639041327, + "grad_norm": 1.2607039213180542, + "learning_rate": 1.326e-05, + "loss": 0.4234, + "step": 2654 + }, + { + "epoch": 0.1486728637025423, + "grad_norm": 1.284332513809204, + "learning_rate": 1.3265e-05, + "loss": 0.4065, + "step": 2655 + }, + { + "epoch": 0.14872886101467128, + "grad_norm": 1.0736377239227295, + "learning_rate": 1.3270000000000002e-05, + "loss": 0.4597, + "step": 2656 + }, + { + "epoch": 0.1487848583268003, + "grad_norm": 1.2907679080963135, + "learning_rate": 1.3275e-05, + "loss": 0.4447, + "step": 2657 + }, + { + "epoch": 0.14884085563892932, + "grad_norm": 1.1318539381027222, + "learning_rate": 1.3280000000000002e-05, + "loss": 0.3366, + "step": 2658 + }, + { + "epoch": 0.14889685295105834, + "grad_norm": 1.27273690700531, + "learning_rate": 1.3285e-05, + "loss": 0.5273, + "step": 2659 + }, + { + "epoch": 0.14895285026318736, + "grad_norm": 1.3122705221176147, + "learning_rate": 1.329e-05, + "loss": 0.5537, + "step": 2660 + }, + { + "epoch": 0.14900884757531638, + "grad_norm": 1.3192979097366333, + "learning_rate": 1.3295000000000002e-05, + "loss": 0.5078, + "step": 2661 + }, + { + "epoch": 0.1490648448874454, + "grad_norm": 1.4099100828170776, + "learning_rate": 1.3300000000000001e-05, + "loss": 0.4531, + "step": 2662 + }, + { + "epoch": 0.14912084219957442, + "grad_norm": 6.571522235870361, + "learning_rate": 1.3305e-05, + "loss": 0.4682, + "step": 2663 + }, + { + "epoch": 0.14917683951170344, + "grad_norm": 1.0490186214447021, + "learning_rate": 1.331e-05, + "loss": 0.34, + "step": 2664 + }, + { + "epoch": 0.14923283682383245, + "grad_norm": 1.4035292863845825, + "learning_rate": 1.3315e-05, + "loss": 0.4918, + "step": 2665 + }, + { + "epoch": 0.14928883413596147, + "grad_norm": 1.2334212064743042, + "learning_rate": 1.3320000000000001e-05, + "loss": 0.543, + "step": 2666 + }, + { + "epoch": 0.1493448314480905, + "grad_norm": 1.3419967889785767, + "learning_rate": 1.3325000000000002e-05, + "loss": 0.3962, + 
"step": 2667 + }, + { + "epoch": 0.1494008287602195, + "grad_norm": 1.2629648447036743, + "learning_rate": 1.3330000000000001e-05, + "loss": 0.5449, + "step": 2668 + }, + { + "epoch": 0.14945682607234853, + "grad_norm": 1.0612081289291382, + "learning_rate": 1.3335e-05, + "loss": 0.3091, + "step": 2669 + }, + { + "epoch": 0.14951282338447755, + "grad_norm": 1.045214056968689, + "learning_rate": 1.334e-05, + "loss": 0.3962, + "step": 2670 + }, + { + "epoch": 0.14956882069660657, + "grad_norm": 1.0915015935897827, + "learning_rate": 1.3345000000000002e-05, + "loss": 0.4691, + "step": 2671 + }, + { + "epoch": 0.1496248180087356, + "grad_norm": 1.229937195777893, + "learning_rate": 1.3350000000000001e-05, + "loss": 0.3836, + "step": 2672 + }, + { + "epoch": 0.1496808153208646, + "grad_norm": 1.303233027458191, + "learning_rate": 1.3355e-05, + "loss": 0.4157, + "step": 2673 + }, + { + "epoch": 0.14973681263299363, + "grad_norm": 1.1198750734329224, + "learning_rate": 1.336e-05, + "loss": 0.4337, + "step": 2674 + }, + { + "epoch": 0.14979280994512265, + "grad_norm": 1.3870460987091064, + "learning_rate": 1.3365e-05, + "loss": 0.5762, + "step": 2675 + }, + { + "epoch": 0.14984880725725166, + "grad_norm": 1.2103842496871948, + "learning_rate": 1.3370000000000002e-05, + "loss": 0.4496, + "step": 2676 + }, + { + "epoch": 0.14990480456938066, + "grad_norm": 0.9885027408599854, + "learning_rate": 1.3375000000000002e-05, + "loss": 0.3836, + "step": 2677 + }, + { + "epoch": 0.14996080188150968, + "grad_norm": 1.1948119401931763, + "learning_rate": 1.338e-05, + "loss": 0.441, + "step": 2678 + }, + { + "epoch": 0.1500167991936387, + "grad_norm": 1.1679539680480957, + "learning_rate": 1.3385e-05, + "loss": 0.4136, + "step": 2679 + }, + { + "epoch": 0.1500727965057677, + "grad_norm": 1.3134140968322754, + "learning_rate": 1.339e-05, + "loss": 0.7125, + "step": 2680 + }, + { + "epoch": 0.15012879381789673, + "grad_norm": 1.2312588691711426, + "learning_rate": 1.3395000000000001e-05, 
+ "loss": 0.3682, + "step": 2681 + }, + { + "epoch": 0.15018479113002575, + "grad_norm": 1.028341293334961, + "learning_rate": 1.3400000000000002e-05, + "loss": 0.2944, + "step": 2682 + }, + { + "epoch": 0.15024078844215477, + "grad_norm": 1.3490902185440063, + "learning_rate": 1.3405000000000001e-05, + "loss": 0.4722, + "step": 2683 + }, + { + "epoch": 0.1502967857542838, + "grad_norm": 1.3073066473007202, + "learning_rate": 1.341e-05, + "loss": 0.4111, + "step": 2684 + }, + { + "epoch": 0.1503527830664128, + "grad_norm": 1.1631131172180176, + "learning_rate": 1.3415e-05, + "loss": 0.4215, + "step": 2685 + }, + { + "epoch": 0.15040878037854183, + "grad_norm": 1.4284385442733765, + "learning_rate": 1.3420000000000002e-05, + "loss": 0.5716, + "step": 2686 + }, + { + "epoch": 0.15046477769067085, + "grad_norm": 1.2655913829803467, + "learning_rate": 1.3425000000000001e-05, + "loss": 0.353, + "step": 2687 + }, + { + "epoch": 0.15052077500279987, + "grad_norm": 1.0791429281234741, + "learning_rate": 1.343e-05, + "loss": 0.4433, + "step": 2688 + }, + { + "epoch": 0.1505767723149289, + "grad_norm": 1.2483781576156616, + "learning_rate": 1.3435000000000001e-05, + "loss": 0.4055, + "step": 2689 + }, + { + "epoch": 0.1506327696270579, + "grad_norm": 1.4182034730911255, + "learning_rate": 1.344e-05, + "loss": 0.4274, + "step": 2690 + }, + { + "epoch": 0.15068876693918692, + "grad_norm": 1.1914207935333252, + "learning_rate": 1.3445e-05, + "loss": 0.3799, + "step": 2691 + }, + { + "epoch": 0.15074476425131594, + "grad_norm": 1.1741597652435303, + "learning_rate": 1.3450000000000002e-05, + "loss": 0.4406, + "step": 2692 + }, + { + "epoch": 0.15080076156344496, + "grad_norm": 1.1351898908615112, + "learning_rate": 1.3455e-05, + "loss": 0.3233, + "step": 2693 + }, + { + "epoch": 0.15085675887557398, + "grad_norm": 1.2189592123031616, + "learning_rate": 1.346e-05, + "loss": 0.4002, + "step": 2694 + }, + { + "epoch": 0.150912756187703, + "grad_norm": 1.1392837762832642, + 
"learning_rate": 1.3465e-05, + "loss": 0.3374, + "step": 2695 + }, + { + "epoch": 0.15096875349983202, + "grad_norm": 1.401028037071228, + "learning_rate": 1.347e-05, + "loss": 0.4197, + "step": 2696 + }, + { + "epoch": 0.15102475081196104, + "grad_norm": 1.1850882768630981, + "learning_rate": 1.3475000000000002e-05, + "loss": 0.4155, + "step": 2697 + }, + { + "epoch": 0.15108074812409003, + "grad_norm": 1.4200987815856934, + "learning_rate": 1.3480000000000001e-05, + "loss": 0.381, + "step": 2698 + }, + { + "epoch": 0.15113674543621905, + "grad_norm": 1.0592139959335327, + "learning_rate": 1.3485e-05, + "loss": 0.3618, + "step": 2699 + }, + { + "epoch": 0.15119274274834807, + "grad_norm": 1.2360087633132935, + "learning_rate": 1.349e-05, + "loss": 0.3661, + "step": 2700 + }, + { + "epoch": 0.1512487400604771, + "grad_norm": 1.242908239364624, + "learning_rate": 1.3494999999999999e-05, + "loss": 0.514, + "step": 2701 + }, + { + "epoch": 0.1513047373726061, + "grad_norm": 1.3020732402801514, + "learning_rate": 1.3500000000000001e-05, + "loss": 0.4683, + "step": 2702 + }, + { + "epoch": 0.15136073468473513, + "grad_norm": 1.5411075353622437, + "learning_rate": 1.3505e-05, + "loss": 0.421, + "step": 2703 + }, + { + "epoch": 0.15141673199686415, + "grad_norm": 1.2832776308059692, + "learning_rate": 1.3510000000000001e-05, + "loss": 0.4048, + "step": 2704 + }, + { + "epoch": 0.15147272930899316, + "grad_norm": 1.2762134075164795, + "learning_rate": 1.3515e-05, + "loss": 0.5117, + "step": 2705 + }, + { + "epoch": 0.15152872662112218, + "grad_norm": 1.3318418264389038, + "learning_rate": 1.352e-05, + "loss": 0.4159, + "step": 2706 + }, + { + "epoch": 0.1515847239332512, + "grad_norm": 1.4038443565368652, + "learning_rate": 1.3525000000000002e-05, + "loss": 0.5075, + "step": 2707 + }, + { + "epoch": 0.15164072124538022, + "grad_norm": 1.3062500953674316, + "learning_rate": 1.3530000000000001e-05, + "loss": 0.6217, + "step": 2708 + }, + { + "epoch": 0.15169671855750924, + 
"grad_norm": 1.1369431018829346, + "learning_rate": 1.3535e-05, + "loss": 0.4208, + "step": 2709 + }, + { + "epoch": 0.15175271586963826, + "grad_norm": 1.464113473892212, + "learning_rate": 1.3539999999999999e-05, + "loss": 0.4137, + "step": 2710 + }, + { + "epoch": 0.15180871318176728, + "grad_norm": 1.2051522731781006, + "learning_rate": 1.3545e-05, + "loss": 0.5511, + "step": 2711 + }, + { + "epoch": 0.1518647104938963, + "grad_norm": 1.232174038887024, + "learning_rate": 1.3550000000000002e-05, + "loss": 0.4952, + "step": 2712 + }, + { + "epoch": 0.15192070780602532, + "grad_norm": 1.2316820621490479, + "learning_rate": 1.3555000000000002e-05, + "loss": 0.4071, + "step": 2713 + }, + { + "epoch": 0.15197670511815434, + "grad_norm": 1.3428975343704224, + "learning_rate": 1.356e-05, + "loss": 0.486, + "step": 2714 + }, + { + "epoch": 0.15203270243028336, + "grad_norm": 1.1460320949554443, + "learning_rate": 1.3565e-05, + "loss": 0.4263, + "step": 2715 + }, + { + "epoch": 0.15208869974241238, + "grad_norm": 1.3443206548690796, + "learning_rate": 1.3569999999999999e-05, + "loss": 0.4956, + "step": 2716 + }, + { + "epoch": 0.1521446970545414, + "grad_norm": 1.0955899953842163, + "learning_rate": 1.3575000000000001e-05, + "loss": 0.3872, + "step": 2717 + }, + { + "epoch": 0.15220069436667039, + "grad_norm": 1.1686948537826538, + "learning_rate": 1.358e-05, + "loss": 0.487, + "step": 2718 + }, + { + "epoch": 0.1522566916787994, + "grad_norm": 1.1902086734771729, + "learning_rate": 1.3585000000000001e-05, + "loss": 0.4301, + "step": 2719 + }, + { + "epoch": 0.15231268899092842, + "grad_norm": 1.2101625204086304, + "learning_rate": 1.359e-05, + "loss": 0.4894, + "step": 2720 + }, + { + "epoch": 0.15236868630305744, + "grad_norm": 1.1747902631759644, + "learning_rate": 1.3595e-05, + "loss": 0.3868, + "step": 2721 + }, + { + "epoch": 0.15242468361518646, + "grad_norm": 1.752328634262085, + "learning_rate": 1.3600000000000002e-05, + "loss": 0.5366, + "step": 2722 + }, + { 
+ "epoch": 0.15248068092731548, + "grad_norm": 1.3041211366653442, + "learning_rate": 1.3605000000000001e-05, + "loss": 0.4857, + "step": 2723 + }, + { + "epoch": 0.1525366782394445, + "grad_norm": 1.0765955448150635, + "learning_rate": 1.361e-05, + "loss": 0.345, + "step": 2724 + }, + { + "epoch": 0.15259267555157352, + "grad_norm": 1.4420080184936523, + "learning_rate": 1.3615e-05, + "loss": 0.5749, + "step": 2725 + }, + { + "epoch": 0.15264867286370254, + "grad_norm": 1.1885454654693604, + "learning_rate": 1.362e-05, + "loss": 0.5055, + "step": 2726 + }, + { + "epoch": 0.15270467017583156, + "grad_norm": 1.2062809467315674, + "learning_rate": 1.3625e-05, + "loss": 0.4599, + "step": 2727 + }, + { + "epoch": 0.15276066748796058, + "grad_norm": 1.3495934009552002, + "learning_rate": 1.3630000000000002e-05, + "loss": 0.4824, + "step": 2728 + }, + { + "epoch": 0.1528166648000896, + "grad_norm": 1.6765187978744507, + "learning_rate": 1.3635e-05, + "loss": 0.5063, + "step": 2729 + }, + { + "epoch": 0.15287266211221862, + "grad_norm": 1.3617475032806396, + "learning_rate": 1.364e-05, + "loss": 0.475, + "step": 2730 + }, + { + "epoch": 0.15292865942434763, + "grad_norm": 1.0638370513916016, + "learning_rate": 1.3644999999999999e-05, + "loss": 0.3662, + "step": 2731 + }, + { + "epoch": 0.15298465673647665, + "grad_norm": 1.3291374444961548, + "learning_rate": 1.3650000000000001e-05, + "loss": 0.4763, + "step": 2732 + }, + { + "epoch": 0.15304065404860567, + "grad_norm": 0.9607083797454834, + "learning_rate": 1.3655e-05, + "loss": 0.3983, + "step": 2733 + }, + { + "epoch": 0.1530966513607347, + "grad_norm": 1.1476843357086182, + "learning_rate": 1.3660000000000001e-05, + "loss": 0.5913, + "step": 2734 + }, + { + "epoch": 0.1531526486728637, + "grad_norm": 1.1318851709365845, + "learning_rate": 1.3665e-05, + "loss": 0.4258, + "step": 2735 + }, + { + "epoch": 0.15320864598499273, + "grad_norm": 1.1070514917373657, + "learning_rate": 1.367e-05, + "loss": 0.4925, + "step": 
2736 + }, + { + "epoch": 0.15326464329712175, + "grad_norm": 1.3675343990325928, + "learning_rate": 1.3675000000000002e-05, + "loss": 0.5905, + "step": 2737 + }, + { + "epoch": 0.15332064060925077, + "grad_norm": 1.3040297031402588, + "learning_rate": 1.3680000000000001e-05, + "loss": 0.4395, + "step": 2738 + }, + { + "epoch": 0.15337663792137976, + "grad_norm": 1.1907511949539185, + "learning_rate": 1.3685e-05, + "loss": 0.3728, + "step": 2739 + }, + { + "epoch": 0.15343263523350878, + "grad_norm": 1.1401329040527344, + "learning_rate": 1.369e-05, + "loss": 0.446, + "step": 2740 + }, + { + "epoch": 0.1534886325456378, + "grad_norm": 1.0914613008499146, + "learning_rate": 1.3695e-05, + "loss": 0.3913, + "step": 2741 + }, + { + "epoch": 0.15354462985776682, + "grad_norm": 1.1461143493652344, + "learning_rate": 1.3700000000000001e-05, + "loss": 0.4044, + "step": 2742 + }, + { + "epoch": 0.15360062716989584, + "grad_norm": 1.229974627494812, + "learning_rate": 1.3705000000000002e-05, + "loss": 0.5203, + "step": 2743 + }, + { + "epoch": 0.15365662448202486, + "grad_norm": 1.1887822151184082, + "learning_rate": 1.3710000000000001e-05, + "loss": 0.3773, + "step": 2744 + }, + { + "epoch": 0.15371262179415388, + "grad_norm": 1.0333051681518555, + "learning_rate": 1.3715e-05, + "loss": 0.4415, + "step": 2745 + }, + { + "epoch": 0.1537686191062829, + "grad_norm": 1.1656396389007568, + "learning_rate": 1.3719999999999999e-05, + "loss": 0.4399, + "step": 2746 + }, + { + "epoch": 0.1538246164184119, + "grad_norm": 1.2455742359161377, + "learning_rate": 1.3725000000000002e-05, + "loss": 0.4364, + "step": 2747 + }, + { + "epoch": 0.15388061373054093, + "grad_norm": 1.0791863203048706, + "learning_rate": 1.373e-05, + "loss": 0.3751, + "step": 2748 + }, + { + "epoch": 0.15393661104266995, + "grad_norm": 1.1460646390914917, + "learning_rate": 1.3735000000000001e-05, + "loss": 0.4815, + "step": 2749 + }, + { + "epoch": 0.15399260835479897, + "grad_norm": 1.4752322435379028, + 
"learning_rate": 1.374e-05, + "loss": 0.6131, + "step": 2750 + }, + { + "epoch": 0.154048605666928, + "grad_norm": 1.1885502338409424, + "learning_rate": 1.3745e-05, + "loss": 0.5409, + "step": 2751 + }, + { + "epoch": 0.154104602979057, + "grad_norm": 1.6608283519744873, + "learning_rate": 1.3750000000000002e-05, + "loss": 0.4526, + "step": 2752 + }, + { + "epoch": 0.15416060029118603, + "grad_norm": 1.4488664865493774, + "learning_rate": 1.3755000000000001e-05, + "loss": 0.3633, + "step": 2753 + }, + { + "epoch": 0.15421659760331505, + "grad_norm": 1.2881131172180176, + "learning_rate": 1.376e-05, + "loss": 0.5018, + "step": 2754 + }, + { + "epoch": 0.15427259491544407, + "grad_norm": 1.3838692903518677, + "learning_rate": 1.3765e-05, + "loss": 0.5169, + "step": 2755 + }, + { + "epoch": 0.15432859222757309, + "grad_norm": 1.2695553302764893, + "learning_rate": 1.377e-05, + "loss": 0.5031, + "step": 2756 + }, + { + "epoch": 0.1543845895397021, + "grad_norm": 1.3294891119003296, + "learning_rate": 1.3775000000000001e-05, + "loss": 0.5022, + "step": 2757 + }, + { + "epoch": 0.15444058685183112, + "grad_norm": 1.100566029548645, + "learning_rate": 1.3780000000000002e-05, + "loss": 0.4378, + "step": 2758 + }, + { + "epoch": 0.15449658416396014, + "grad_norm": 1.1493943929672241, + "learning_rate": 1.3785000000000001e-05, + "loss": 0.5951, + "step": 2759 + }, + { + "epoch": 0.15455258147608913, + "grad_norm": 1.3388559818267822, + "learning_rate": 1.379e-05, + "loss": 0.4385, + "step": 2760 + }, + { + "epoch": 0.15460857878821815, + "grad_norm": 1.2471098899841309, + "learning_rate": 1.3795e-05, + "loss": 0.3845, + "step": 2761 + }, + { + "epoch": 0.15466457610034717, + "grad_norm": 1.0742604732513428, + "learning_rate": 1.3800000000000002e-05, + "loss": 0.343, + "step": 2762 + }, + { + "epoch": 0.1547205734124762, + "grad_norm": 1.1153868436813354, + "learning_rate": 1.3805e-05, + "loss": 0.3902, + "step": 2763 + }, + { + "epoch": 0.1547765707246052, + "grad_norm": 
1.2439930438995361, + "learning_rate": 1.381e-05, + "loss": 0.4284, + "step": 2764 + }, + { + "epoch": 0.15483256803673423, + "grad_norm": 2.0175135135650635, + "learning_rate": 1.3815e-05, + "loss": 0.3932, + "step": 2765 + }, + { + "epoch": 0.15488856534886325, + "grad_norm": 0.9710941910743713, + "learning_rate": 1.382e-05, + "loss": 0.2777, + "step": 2766 + }, + { + "epoch": 0.15494456266099227, + "grad_norm": 1.301509976387024, + "learning_rate": 1.3825000000000002e-05, + "loss": 0.418, + "step": 2767 + }, + { + "epoch": 0.1550005599731213, + "grad_norm": 1.266956090927124, + "learning_rate": 1.3830000000000001e-05, + "loss": 0.4642, + "step": 2768 + }, + { + "epoch": 0.1550565572852503, + "grad_norm": 1.3388291597366333, + "learning_rate": 1.3835e-05, + "loss": 0.5071, + "step": 2769 + }, + { + "epoch": 0.15511255459737933, + "grad_norm": 1.240614414215088, + "learning_rate": 1.384e-05, + "loss": 0.4912, + "step": 2770 + }, + { + "epoch": 0.15516855190950835, + "grad_norm": 1.2789167165756226, + "learning_rate": 1.3845e-05, + "loss": 0.4411, + "step": 2771 + }, + { + "epoch": 0.15522454922163736, + "grad_norm": 1.1685004234313965, + "learning_rate": 1.3850000000000001e-05, + "loss": 0.4822, + "step": 2772 + }, + { + "epoch": 0.15528054653376638, + "grad_norm": 1.5487463474273682, + "learning_rate": 1.3855000000000002e-05, + "loss": 0.4153, + "step": 2773 + }, + { + "epoch": 0.1553365438458954, + "grad_norm": 1.0213351249694824, + "learning_rate": 1.3860000000000001e-05, + "loss": 0.4142, + "step": 2774 + }, + { + "epoch": 0.15539254115802442, + "grad_norm": 1.3269951343536377, + "learning_rate": 1.3865e-05, + "loss": 0.3837, + "step": 2775 + }, + { + "epoch": 0.15544853847015344, + "grad_norm": 1.3169128894805908, + "learning_rate": 1.387e-05, + "loss": 0.3842, + "step": 2776 + }, + { + "epoch": 0.15550453578228246, + "grad_norm": 1.3252760171890259, + "learning_rate": 1.3875000000000002e-05, + "loss": 0.3117, + "step": 2777 + }, + { + "epoch": 
0.15556053309441148, + "grad_norm": 1.3185728788375854, + "learning_rate": 1.3880000000000001e-05, + "loss": 0.5022, + "step": 2778 + }, + { + "epoch": 0.1556165304065405, + "grad_norm": 1.1187517642974854, + "learning_rate": 1.3885e-05, + "loss": 0.4589, + "step": 2779 + }, + { + "epoch": 0.1556725277186695, + "grad_norm": 1.306776762008667, + "learning_rate": 1.389e-05, + "loss": 0.6016, + "step": 2780 + }, + { + "epoch": 0.1557285250307985, + "grad_norm": 1.2258596420288086, + "learning_rate": 1.3895e-05, + "loss": 0.506, + "step": 2781 + }, + { + "epoch": 0.15578452234292753, + "grad_norm": 1.2695173025131226, + "learning_rate": 1.3900000000000002e-05, + "loss": 0.5451, + "step": 2782 + }, + { + "epoch": 0.15584051965505655, + "grad_norm": 1.2983871698379517, + "learning_rate": 1.3905000000000002e-05, + "loss": 0.4632, + "step": 2783 + }, + { + "epoch": 0.15589651696718557, + "grad_norm": 1.0823630094528198, + "learning_rate": 1.391e-05, + "loss": 0.3751, + "step": 2784 + }, + { + "epoch": 0.15595251427931459, + "grad_norm": 1.118971586227417, + "learning_rate": 1.3915e-05, + "loss": 0.4565, + "step": 2785 + }, + { + "epoch": 0.1560085115914436, + "grad_norm": 1.221749186515808, + "learning_rate": 1.3919999999999999e-05, + "loss": 0.415, + "step": 2786 + }, + { + "epoch": 0.15606450890357262, + "grad_norm": 1.352028727531433, + "learning_rate": 1.3925000000000001e-05, + "loss": 0.4915, + "step": 2787 + }, + { + "epoch": 0.15612050621570164, + "grad_norm": 1.284001111984253, + "learning_rate": 1.3930000000000002e-05, + "loss": 0.4595, + "step": 2788 + }, + { + "epoch": 0.15617650352783066, + "grad_norm": 1.2041031122207642, + "learning_rate": 1.3935000000000001e-05, + "loss": 0.4143, + "step": 2789 + }, + { + "epoch": 0.15623250083995968, + "grad_norm": 1.3998245000839233, + "learning_rate": 1.394e-05, + "loss": 0.5409, + "step": 2790 + }, + { + "epoch": 0.1562884981520887, + "grad_norm": 1.4810839891433716, + "learning_rate": 1.3945e-05, + "loss": 0.4637, + 
"step": 2791 + }, + { + "epoch": 0.15634449546421772, + "grad_norm": 1.242527961730957, + "learning_rate": 1.3950000000000002e-05, + "loss": 0.3989, + "step": 2792 + }, + { + "epoch": 0.15640049277634674, + "grad_norm": 1.5063345432281494, + "learning_rate": 1.3955000000000001e-05, + "loss": 0.4764, + "step": 2793 + }, + { + "epoch": 0.15645649008847576, + "grad_norm": 1.2918951511383057, + "learning_rate": 1.396e-05, + "loss": 0.469, + "step": 2794 + }, + { + "epoch": 0.15651248740060478, + "grad_norm": 1.5124133825302124, + "learning_rate": 1.3965000000000001e-05, + "loss": 0.6527, + "step": 2795 + }, + { + "epoch": 0.1565684847127338, + "grad_norm": 1.2914239168167114, + "learning_rate": 1.397e-05, + "loss": 0.4341, + "step": 2796 + }, + { + "epoch": 0.15662448202486282, + "grad_norm": 1.5263826847076416, + "learning_rate": 1.3975000000000003e-05, + "loss": 0.5055, + "step": 2797 + }, + { + "epoch": 0.15668047933699183, + "grad_norm": 1.289559245109558, + "learning_rate": 1.3980000000000002e-05, + "loss": 0.4069, + "step": 2798 + }, + { + "epoch": 0.15673647664912085, + "grad_norm": 1.3386255502700806, + "learning_rate": 1.3985e-05, + "loss": 0.5132, + "step": 2799 + }, + { + "epoch": 0.15679247396124987, + "grad_norm": 1.1883652210235596, + "learning_rate": 1.399e-05, + "loss": 0.5696, + "step": 2800 + }, + { + "epoch": 0.15684847127337886, + "grad_norm": 1.2485971450805664, + "learning_rate": 1.3994999999999999e-05, + "loss": 0.3799, + "step": 2801 + }, + { + "epoch": 0.15690446858550788, + "grad_norm": 1.091121792793274, + "learning_rate": 1.4000000000000001e-05, + "loss": 0.3849, + "step": 2802 + }, + { + "epoch": 0.1569604658976369, + "grad_norm": 1.335418462753296, + "learning_rate": 1.4005000000000002e-05, + "loss": 0.5873, + "step": 2803 + }, + { + "epoch": 0.15701646320976592, + "grad_norm": 1.2520933151245117, + "learning_rate": 1.4010000000000001e-05, + "loss": 0.388, + "step": 2804 + }, + { + "epoch": 0.15707246052189494, + "grad_norm": 
1.3190689086914062, + "learning_rate": 1.4015e-05, + "loss": 0.5136, + "step": 2805 + }, + { + "epoch": 0.15712845783402396, + "grad_norm": 1.1924923658370972, + "learning_rate": 1.402e-05, + "loss": 0.5447, + "step": 2806 + }, + { + "epoch": 0.15718445514615298, + "grad_norm": 1.170854091644287, + "learning_rate": 1.4025000000000002e-05, + "loss": 0.5238, + "step": 2807 + }, + { + "epoch": 0.157240452458282, + "grad_norm": 1.2387036085128784, + "learning_rate": 1.4030000000000001e-05, + "loss": 0.5482, + "step": 2808 + }, + { + "epoch": 0.15729644977041102, + "grad_norm": 1.2129141092300415, + "learning_rate": 1.4035e-05, + "loss": 0.4282, + "step": 2809 + }, + { + "epoch": 0.15735244708254004, + "grad_norm": 1.2074551582336426, + "learning_rate": 1.4040000000000001e-05, + "loss": 0.4376, + "step": 2810 + }, + { + "epoch": 0.15740844439466906, + "grad_norm": 1.2769153118133545, + "learning_rate": 1.4045e-05, + "loss": 0.5132, + "step": 2811 + }, + { + "epoch": 0.15746444170679808, + "grad_norm": 1.108242154121399, + "learning_rate": 1.4050000000000003e-05, + "loss": 0.4453, + "step": 2812 + }, + { + "epoch": 0.1575204390189271, + "grad_norm": 1.230486512184143, + "learning_rate": 1.4055000000000002e-05, + "loss": 0.3676, + "step": 2813 + }, + { + "epoch": 0.1575764363310561, + "grad_norm": 1.2049204111099243, + "learning_rate": 1.4060000000000001e-05, + "loss": 0.4022, + "step": 2814 + }, + { + "epoch": 0.15763243364318513, + "grad_norm": 1.43659508228302, + "learning_rate": 1.4065e-05, + "loss": 0.3815, + "step": 2815 + }, + { + "epoch": 0.15768843095531415, + "grad_norm": 1.3100559711456299, + "learning_rate": 1.4069999999999999e-05, + "loss": 0.4313, + "step": 2816 + }, + { + "epoch": 0.15774442826744317, + "grad_norm": 1.345204472541809, + "learning_rate": 1.4075e-05, + "loss": 0.3449, + "step": 2817 + }, + { + "epoch": 0.1578004255795722, + "grad_norm": 1.1812174320220947, + "learning_rate": 1.408e-05, + "loss": 0.4178, + "step": 2818 + }, + { + "epoch": 
0.1578564228917012, + "grad_norm": 1.0329145193099976, + "learning_rate": 1.4085000000000002e-05, + "loss": 0.3366, + "step": 2819 + }, + { + "epoch": 0.15791242020383023, + "grad_norm": 1.1096454858779907, + "learning_rate": 1.409e-05, + "loss": 0.4205, + "step": 2820 + }, + { + "epoch": 0.15796841751595925, + "grad_norm": 1.0906081199645996, + "learning_rate": 1.4095e-05, + "loss": 0.41, + "step": 2821 + }, + { + "epoch": 0.15802441482808824, + "grad_norm": 1.1314079761505127, + "learning_rate": 1.4099999999999999e-05, + "loss": 0.4788, + "step": 2822 + }, + { + "epoch": 0.15808041214021726, + "grad_norm": 1.343037486076355, + "learning_rate": 1.4105000000000001e-05, + "loss": 0.4959, + "step": 2823 + }, + { + "epoch": 0.15813640945234628, + "grad_norm": 1.146909236907959, + "learning_rate": 1.411e-05, + "loss": 0.4298, + "step": 2824 + }, + { + "epoch": 0.1581924067644753, + "grad_norm": 1.0961363315582275, + "learning_rate": 1.4115000000000001e-05, + "loss": 0.3701, + "step": 2825 + }, + { + "epoch": 0.15824840407660432, + "grad_norm": 1.1320148706436157, + "learning_rate": 1.412e-05, + "loss": 0.365, + "step": 2826 + }, + { + "epoch": 0.15830440138873333, + "grad_norm": 1.2287229299545288, + "learning_rate": 1.4125e-05, + "loss": 0.3599, + "step": 2827 + }, + { + "epoch": 0.15836039870086235, + "grad_norm": 1.4046834707260132, + "learning_rate": 1.4130000000000002e-05, + "loss": 0.4867, + "step": 2828 + }, + { + "epoch": 0.15841639601299137, + "grad_norm": 1.079779863357544, + "learning_rate": 1.4135000000000001e-05, + "loss": 0.3229, + "step": 2829 + }, + { + "epoch": 0.1584723933251204, + "grad_norm": 1.482107400894165, + "learning_rate": 1.414e-05, + "loss": 0.5059, + "step": 2830 + }, + { + "epoch": 0.1585283906372494, + "grad_norm": 1.0793694257736206, + "learning_rate": 1.4145e-05, + "loss": 0.3989, + "step": 2831 + }, + { + "epoch": 0.15858438794937843, + "grad_norm": 1.906036376953125, + "learning_rate": 1.415e-05, + "loss": 0.4956, + "step": 2832 + }, 
+ { + "epoch": 0.15864038526150745, + "grad_norm": 1.4872894287109375, + "learning_rate": 1.4155000000000001e-05, + "loss": 0.4636, + "step": 2833 + }, + { + "epoch": 0.15869638257363647, + "grad_norm": 1.1432147026062012, + "learning_rate": 1.4160000000000002e-05, + "loss": 0.4302, + "step": 2834 + }, + { + "epoch": 0.1587523798857655, + "grad_norm": 1.4246662855148315, + "learning_rate": 1.4165e-05, + "loss": 0.3449, + "step": 2835 + }, + { + "epoch": 0.1588083771978945, + "grad_norm": 1.111423134803772, + "learning_rate": 1.417e-05, + "loss": 0.3431, + "step": 2836 + }, + { + "epoch": 0.15886437451002353, + "grad_norm": 1.2800912857055664, + "learning_rate": 1.4174999999999999e-05, + "loss": 0.5825, + "step": 2837 + }, + { + "epoch": 0.15892037182215255, + "grad_norm": 1.2523735761642456, + "learning_rate": 1.4180000000000001e-05, + "loss": 0.5138, + "step": 2838 + }, + { + "epoch": 0.15897636913428156, + "grad_norm": 1.3163225650787354, + "learning_rate": 1.4185e-05, + "loss": 0.4908, + "step": 2839 + }, + { + "epoch": 0.15903236644641058, + "grad_norm": 1.0494143962860107, + "learning_rate": 1.4190000000000001e-05, + "loss": 0.3915, + "step": 2840 + }, + { + "epoch": 0.1590883637585396, + "grad_norm": 1.2246289253234863, + "learning_rate": 1.4195e-05, + "loss": 0.4301, + "step": 2841 + }, + { + "epoch": 0.1591443610706686, + "grad_norm": 1.6092647314071655, + "learning_rate": 1.42e-05, + "loss": 0.6129, + "step": 2842 + }, + { + "epoch": 0.1592003583827976, + "grad_norm": 1.4280298948287964, + "learning_rate": 1.4205000000000002e-05, + "loss": 0.517, + "step": 2843 + }, + { + "epoch": 0.15925635569492663, + "grad_norm": 1.171018362045288, + "learning_rate": 1.4210000000000001e-05, + "loss": 0.3967, + "step": 2844 + }, + { + "epoch": 0.15931235300705565, + "grad_norm": 1.0142732858657837, + "learning_rate": 1.4215e-05, + "loss": 0.3814, + "step": 2845 + }, + { + "epoch": 0.15936835031918467, + "grad_norm": 1.1765437126159668, + "learning_rate": 1.422e-05, + 
"loss": 0.4179, + "step": 2846 + }, + { + "epoch": 0.1594243476313137, + "grad_norm": 1.0797767639160156, + "learning_rate": 1.4225e-05, + "loss": 0.3455, + "step": 2847 + }, + { + "epoch": 0.1594803449434427, + "grad_norm": 1.056195855140686, + "learning_rate": 1.4230000000000001e-05, + "loss": 0.4499, + "step": 2848 + }, + { + "epoch": 0.15953634225557173, + "grad_norm": 1.177141785621643, + "learning_rate": 1.4235000000000002e-05, + "loss": 0.4186, + "step": 2849 + }, + { + "epoch": 0.15959233956770075, + "grad_norm": 1.1607415676116943, + "learning_rate": 1.4240000000000001e-05, + "loss": 0.4834, + "step": 2850 + }, + { + "epoch": 0.15964833687982977, + "grad_norm": 1.601799488067627, + "learning_rate": 1.4245e-05, + "loss": 0.455, + "step": 2851 + }, + { + "epoch": 0.15970433419195879, + "grad_norm": 1.3250458240509033, + "learning_rate": 1.4249999999999999e-05, + "loss": 0.4554, + "step": 2852 + }, + { + "epoch": 0.1597603315040878, + "grad_norm": 1.2698512077331543, + "learning_rate": 1.4255000000000002e-05, + "loss": 0.5106, + "step": 2853 + }, + { + "epoch": 0.15981632881621682, + "grad_norm": 1.3289108276367188, + "learning_rate": 1.426e-05, + "loss": 0.4777, + "step": 2854 + }, + { + "epoch": 0.15987232612834584, + "grad_norm": 1.2498283386230469, + "learning_rate": 1.4265e-05, + "loss": 0.471, + "step": 2855 + }, + { + "epoch": 0.15992832344047486, + "grad_norm": 1.1666674613952637, + "learning_rate": 1.427e-05, + "loss": 0.4196, + "step": 2856 + }, + { + "epoch": 0.15998432075260388, + "grad_norm": 1.1459076404571533, + "learning_rate": 1.4275e-05, + "loss": 0.4377, + "step": 2857 + }, + { + "epoch": 0.1600403180647329, + "grad_norm": 1.3304330110549927, + "learning_rate": 1.4280000000000002e-05, + "loss": 0.6097, + "step": 2858 + }, + { + "epoch": 0.16009631537686192, + "grad_norm": 1.171502947807312, + "learning_rate": 1.4285000000000001e-05, + "loss": 0.3562, + "step": 2859 + }, + { + "epoch": 0.16015231268899094, + "grad_norm": 1.9733786582946777, 
+ "learning_rate": 1.429e-05, + "loss": 0.4004, + "step": 2860 + }, + { + "epoch": 0.16020831000111996, + "grad_norm": 1.3946818113327026, + "learning_rate": 1.4295e-05, + "loss": 0.4732, + "step": 2861 + }, + { + "epoch": 0.16026430731324898, + "grad_norm": 1.0983641147613525, + "learning_rate": 1.43e-05, + "loss": 0.3928, + "step": 2862 + }, + { + "epoch": 0.16032030462537797, + "grad_norm": 1.106260895729065, + "learning_rate": 1.4305000000000001e-05, + "loss": 0.3349, + "step": 2863 + }, + { + "epoch": 0.160376301937507, + "grad_norm": 1.3615952730178833, + "learning_rate": 1.4310000000000002e-05, + "loss": 0.4601, + "step": 2864 + }, + { + "epoch": 0.160432299249636, + "grad_norm": 1.24410879611969, + "learning_rate": 1.4315000000000001e-05, + "loss": 0.4041, + "step": 2865 + }, + { + "epoch": 0.16048829656176503, + "grad_norm": 1.5198066234588623, + "learning_rate": 1.432e-05, + "loss": 0.4549, + "step": 2866 + }, + { + "epoch": 0.16054429387389405, + "grad_norm": 1.0579211711883545, + "learning_rate": 1.4325e-05, + "loss": 0.414, + "step": 2867 + }, + { + "epoch": 0.16060029118602306, + "grad_norm": 1.3212554454803467, + "learning_rate": 1.4330000000000002e-05, + "loss": 0.6388, + "step": 2868 + }, + { + "epoch": 0.16065628849815208, + "grad_norm": 1.2160145044326782, + "learning_rate": 1.4335e-05, + "loss": 0.4994, + "step": 2869 + }, + { + "epoch": 0.1607122858102811, + "grad_norm": 1.075178623199463, + "learning_rate": 1.434e-05, + "loss": 0.3752, + "step": 2870 + }, + { + "epoch": 0.16076828312241012, + "grad_norm": 1.1443240642547607, + "learning_rate": 1.4345e-05, + "loss": 0.5189, + "step": 2871 + }, + { + "epoch": 0.16082428043453914, + "grad_norm": 1.1939789056777954, + "learning_rate": 1.435e-05, + "loss": 0.4313, + "step": 2872 + }, + { + "epoch": 0.16088027774666816, + "grad_norm": 1.0715328454971313, + "learning_rate": 1.4355000000000002e-05, + "loss": 0.3825, + "step": 2873 + }, + { + "epoch": 0.16093627505879718, + "grad_norm": 
1.2756158113479614, + "learning_rate": 1.4360000000000001e-05, + "loss": 0.5611, + "step": 2874 + }, + { + "epoch": 0.1609922723709262, + "grad_norm": 1.0364662408828735, + "learning_rate": 1.4365e-05, + "loss": 0.3575, + "step": 2875 + }, + { + "epoch": 0.16104826968305522, + "grad_norm": 1.26275634765625, + "learning_rate": 1.437e-05, + "loss": 0.5086, + "step": 2876 + }, + { + "epoch": 0.16110426699518424, + "grad_norm": 1.209248423576355, + "learning_rate": 1.4374999999999999e-05, + "loss": 0.4786, + "step": 2877 + }, + { + "epoch": 0.16116026430731326, + "grad_norm": 1.212764859199524, + "learning_rate": 1.4380000000000001e-05, + "loss": 0.404, + "step": 2878 + }, + { + "epoch": 0.16121626161944227, + "grad_norm": 1.119140625, + "learning_rate": 1.4385000000000002e-05, + "loss": 0.3736, + "step": 2879 + }, + { + "epoch": 0.1612722589315713, + "grad_norm": 2.300510883331299, + "learning_rate": 1.4390000000000001e-05, + "loss": 0.4957, + "step": 2880 + }, + { + "epoch": 0.1613282562437003, + "grad_norm": 1.0119082927703857, + "learning_rate": 1.4395e-05, + "loss": 0.4613, + "step": 2881 + }, + { + "epoch": 0.16138425355582933, + "grad_norm": 1.2868393659591675, + "learning_rate": 1.44e-05, + "loss": 0.7153, + "step": 2882 + }, + { + "epoch": 0.16144025086795835, + "grad_norm": 1.080954909324646, + "learning_rate": 1.4405000000000002e-05, + "loss": 0.4354, + "step": 2883 + }, + { + "epoch": 0.16149624818008734, + "grad_norm": 1.0970935821533203, + "learning_rate": 1.4410000000000001e-05, + "loss": 0.4083, + "step": 2884 + }, + { + "epoch": 0.16155224549221636, + "grad_norm": 2.6635384559631348, + "learning_rate": 1.4415e-05, + "loss": 0.5151, + "step": 2885 + }, + { + "epoch": 0.16160824280434538, + "grad_norm": 1.5100364685058594, + "learning_rate": 1.4420000000000001e-05, + "loss": 0.414, + "step": 2886 + }, + { + "epoch": 0.1616642401164744, + "grad_norm": 1.3002201318740845, + "learning_rate": 1.4425e-05, + "loss": 0.5592, + "step": 2887 + }, + { + "epoch": 
0.16172023742860342, + "grad_norm": 1.2284952402114868, + "learning_rate": 1.4430000000000002e-05, + "loss": 0.4932, + "step": 2888 + }, + { + "epoch": 0.16177623474073244, + "grad_norm": 1.1353243589401245, + "learning_rate": 1.4435000000000002e-05, + "loss": 0.455, + "step": 2889 + }, + { + "epoch": 0.16183223205286146, + "grad_norm": 1.4194884300231934, + "learning_rate": 1.444e-05, + "loss": 0.4574, + "step": 2890 + }, + { + "epoch": 0.16188822936499048, + "grad_norm": 0.9726737141609192, + "learning_rate": 1.4445e-05, + "loss": 0.4064, + "step": 2891 + }, + { + "epoch": 0.1619442266771195, + "grad_norm": 1.0060579776763916, + "learning_rate": 1.4449999999999999e-05, + "loss": 0.3954, + "step": 2892 + }, + { + "epoch": 0.16200022398924852, + "grad_norm": 1.358452320098877, + "learning_rate": 1.4455000000000001e-05, + "loss": 0.5166, + "step": 2893 + }, + { + "epoch": 0.16205622130137753, + "grad_norm": 1.0128717422485352, + "learning_rate": 1.4460000000000002e-05, + "loss": 0.364, + "step": 2894 + }, + { + "epoch": 0.16211221861350655, + "grad_norm": 1.2332308292388916, + "learning_rate": 1.4465000000000001e-05, + "loss": 0.3993, + "step": 2895 + }, + { + "epoch": 0.16216821592563557, + "grad_norm": 1.2258899211883545, + "learning_rate": 1.447e-05, + "loss": 0.4307, + "step": 2896 + }, + { + "epoch": 0.1622242132377646, + "grad_norm": 1.2921556234359741, + "learning_rate": 1.4475e-05, + "loss": 0.4426, + "step": 2897 + }, + { + "epoch": 0.1622802105498936, + "grad_norm": 1.0930753946304321, + "learning_rate": 1.4480000000000002e-05, + "loss": 0.3127, + "step": 2898 + }, + { + "epoch": 0.16233620786202263, + "grad_norm": 1.4140493869781494, + "learning_rate": 1.4485000000000001e-05, + "loss": 0.4241, + "step": 2899 + }, + { + "epoch": 0.16239220517415165, + "grad_norm": 1.1093642711639404, + "learning_rate": 1.449e-05, + "loss": 0.4391, + "step": 2900 + }, + { + "epoch": 0.16244820248628067, + "grad_norm": 0.9811723232269287, + "learning_rate": 
1.4495000000000001e-05, + "loss": 0.3002, + "step": 2901 + }, + { + "epoch": 0.1625041997984097, + "grad_norm": 1.0897340774536133, + "learning_rate": 1.45e-05, + "loss": 0.4447, + "step": 2902 + }, + { + "epoch": 0.1625601971105387, + "grad_norm": 1.60068941116333, + "learning_rate": 1.4505000000000003e-05, + "loss": 0.4836, + "step": 2903 + }, + { + "epoch": 0.1626161944226677, + "grad_norm": 1.0105966329574585, + "learning_rate": 1.4510000000000002e-05, + "loss": 0.4024, + "step": 2904 + }, + { + "epoch": 0.16267219173479672, + "grad_norm": 1.241045594215393, + "learning_rate": 1.4515e-05, + "loss": 0.452, + "step": 2905 + }, + { + "epoch": 0.16272818904692574, + "grad_norm": 1.3478405475616455, + "learning_rate": 1.452e-05, + "loss": 0.6681, + "step": 2906 + }, + { + "epoch": 0.16278418635905476, + "grad_norm": 1.3035074472427368, + "learning_rate": 1.4524999999999999e-05, + "loss": 0.4031, + "step": 2907 + }, + { + "epoch": 0.16284018367118377, + "grad_norm": 1.215703010559082, + "learning_rate": 1.4530000000000001e-05, + "loss": 0.4691, + "step": 2908 + }, + { + "epoch": 0.1628961809833128, + "grad_norm": 1.289297103881836, + "learning_rate": 1.4535e-05, + "loss": 0.3669, + "step": 2909 + }, + { + "epoch": 0.1629521782954418, + "grad_norm": 1.149066686630249, + "learning_rate": 1.4540000000000001e-05, + "loss": 0.4264, + "step": 2910 + }, + { + "epoch": 0.16300817560757083, + "grad_norm": 1.2812926769256592, + "learning_rate": 1.4545e-05, + "loss": 0.4938, + "step": 2911 + }, + { + "epoch": 0.16306417291969985, + "grad_norm": 1.0730267763137817, + "learning_rate": 1.455e-05, + "loss": 0.4693, + "step": 2912 + }, + { + "epoch": 0.16312017023182887, + "grad_norm": 1.1804014444351196, + "learning_rate": 1.4555000000000002e-05, + "loss": 0.4372, + "step": 2913 + }, + { + "epoch": 0.1631761675439579, + "grad_norm": 1.2339531183242798, + "learning_rate": 1.4560000000000001e-05, + "loss": 0.4266, + "step": 2914 + }, + { + "epoch": 0.1632321648560869, + "grad_norm": 
1.1102869510650635, + "learning_rate": 1.4565e-05, + "loss": 0.4657, + "step": 2915 + }, + { + "epoch": 0.16328816216821593, + "grad_norm": 1.4539523124694824, + "learning_rate": 1.4570000000000001e-05, + "loss": 0.4176, + "step": 2916 + }, + { + "epoch": 0.16334415948034495, + "grad_norm": 1.2633984088897705, + "learning_rate": 1.4575e-05, + "loss": 0.4214, + "step": 2917 + }, + { + "epoch": 0.16340015679247397, + "grad_norm": 1.2158887386322021, + "learning_rate": 1.4580000000000003e-05, + "loss": 0.4991, + "step": 2918 + }, + { + "epoch": 0.16345615410460299, + "grad_norm": 1.266831636428833, + "learning_rate": 1.4585000000000002e-05, + "loss": 0.5787, + "step": 2919 + }, + { + "epoch": 0.163512151416732, + "grad_norm": 1.1851109266281128, + "learning_rate": 1.4590000000000001e-05, + "loss": 0.5018, + "step": 2920 + }, + { + "epoch": 0.16356814872886102, + "grad_norm": 1.096369981765747, + "learning_rate": 1.4595e-05, + "loss": 0.3453, + "step": 2921 + }, + { + "epoch": 0.16362414604099004, + "grad_norm": 1.7543258666992188, + "learning_rate": 1.4599999999999999e-05, + "loss": 0.5218, + "step": 2922 + }, + { + "epoch": 0.16368014335311906, + "grad_norm": 1.2711646556854248, + "learning_rate": 1.4605000000000002e-05, + "loss": 0.3713, + "step": 2923 + }, + { + "epoch": 0.16373614066524808, + "grad_norm": 1.3825998306274414, + "learning_rate": 1.461e-05, + "loss": 0.4534, + "step": 2924 + }, + { + "epoch": 0.16379213797737707, + "grad_norm": 1.39090096950531, + "learning_rate": 1.4615000000000002e-05, + "loss": 0.6809, + "step": 2925 + }, + { + "epoch": 0.1638481352895061, + "grad_norm": 1.1341646909713745, + "learning_rate": 1.462e-05, + "loss": 0.2352, + "step": 2926 + }, + { + "epoch": 0.1639041326016351, + "grad_norm": 1.1790781021118164, + "learning_rate": 1.4625e-05, + "loss": 0.4092, + "step": 2927 + }, + { + "epoch": 0.16396012991376413, + "grad_norm": 1.4217054843902588, + "learning_rate": 1.4630000000000002e-05, + "loss": 0.5113, + "step": 2928 + }, + { 
+ "epoch": 0.16401612722589315, + "grad_norm": 1.1839795112609863, + "learning_rate": 1.4635000000000001e-05, + "loss": 0.4317, + "step": 2929 + }, + { + "epoch": 0.16407212453802217, + "grad_norm": 1.1309460401535034, + "learning_rate": 1.464e-05, + "loss": 0.3266, + "step": 2930 + }, + { + "epoch": 0.1641281218501512, + "grad_norm": 1.3069939613342285, + "learning_rate": 1.4645e-05, + "loss": 0.5372, + "step": 2931 + }, + { + "epoch": 0.1641841191622802, + "grad_norm": 1.2711175680160522, + "learning_rate": 1.465e-05, + "loss": 0.4304, + "step": 2932 + }, + { + "epoch": 0.16424011647440923, + "grad_norm": 1.1943514347076416, + "learning_rate": 1.4655000000000003e-05, + "loss": 0.3806, + "step": 2933 + }, + { + "epoch": 0.16429611378653824, + "grad_norm": 1.2070180177688599, + "learning_rate": 1.4660000000000002e-05, + "loss": 0.4196, + "step": 2934 + }, + { + "epoch": 0.16435211109866726, + "grad_norm": 1.223737120628357, + "learning_rate": 1.4665000000000001e-05, + "loss": 0.4812, + "step": 2935 + }, + { + "epoch": 0.16440810841079628, + "grad_norm": 1.440280556678772, + "learning_rate": 1.467e-05, + "loss": 0.4457, + "step": 2936 + }, + { + "epoch": 0.1644641057229253, + "grad_norm": 1.0822083950042725, + "learning_rate": 1.4675e-05, + "loss": 0.3355, + "step": 2937 + }, + { + "epoch": 0.16452010303505432, + "grad_norm": 1.3040459156036377, + "learning_rate": 1.4680000000000002e-05, + "loss": 0.4158, + "step": 2938 + }, + { + "epoch": 0.16457610034718334, + "grad_norm": 1.1827491521835327, + "learning_rate": 1.4685000000000001e-05, + "loss": 0.4322, + "step": 2939 + }, + { + "epoch": 0.16463209765931236, + "grad_norm": 1.0448668003082275, + "learning_rate": 1.4690000000000002e-05, + "loss": 0.3462, + "step": 2940 + }, + { + "epoch": 0.16468809497144138, + "grad_norm": 1.0867873430252075, + "learning_rate": 1.4695e-05, + "loss": 0.4471, + "step": 2941 + }, + { + "epoch": 0.1647440922835704, + "grad_norm": 1.357195496559143, + "learning_rate": 1.47e-05, + "loss": 
0.491, + "step": 2942 + }, + { + "epoch": 0.16480008959569942, + "grad_norm": 1.3898372650146484, + "learning_rate": 1.4704999999999999e-05, + "loss": 0.5863, + "step": 2943 + }, + { + "epoch": 0.16485608690782844, + "grad_norm": 1.168304681777954, + "learning_rate": 1.4710000000000001e-05, + "loss": 0.3575, + "step": 2944 + }, + { + "epoch": 0.16491208421995746, + "grad_norm": 1.2201783657073975, + "learning_rate": 1.4715e-05, + "loss": 0.4379, + "step": 2945 + }, + { + "epoch": 0.16496808153208645, + "grad_norm": 1.1552715301513672, + "learning_rate": 1.472e-05, + "loss": 0.4842, + "step": 2946 + }, + { + "epoch": 0.16502407884421547, + "grad_norm": 1.0281933546066284, + "learning_rate": 1.4725e-05, + "loss": 0.344, + "step": 2947 + }, + { + "epoch": 0.16508007615634449, + "grad_norm": 1.1922563314437866, + "learning_rate": 1.473e-05, + "loss": 0.4426, + "step": 2948 + }, + { + "epoch": 0.1651360734684735, + "grad_norm": 1.4431538581848145, + "learning_rate": 1.4735000000000002e-05, + "loss": 0.5686, + "step": 2949 + }, + { + "epoch": 0.16519207078060252, + "grad_norm": 1.0972483158111572, + "learning_rate": 1.4740000000000001e-05, + "loss": 0.36, + "step": 2950 + }, + { + "epoch": 0.16524806809273154, + "grad_norm": 1.3279143571853638, + "learning_rate": 1.4745e-05, + "loss": 0.4609, + "step": 2951 + }, + { + "epoch": 0.16530406540486056, + "grad_norm": 1.2672828435897827, + "learning_rate": 1.475e-05, + "loss": 0.5439, + "step": 2952 + }, + { + "epoch": 0.16536006271698958, + "grad_norm": 1.2774341106414795, + "learning_rate": 1.4755e-05, + "loss": 0.3992, + "step": 2953 + }, + { + "epoch": 0.1654160600291186, + "grad_norm": 1.1626168489456177, + "learning_rate": 1.4760000000000001e-05, + "loss": 0.4066, + "step": 2954 + }, + { + "epoch": 0.16547205734124762, + "grad_norm": 1.351238489151001, + "learning_rate": 1.4765000000000002e-05, + "loss": 0.4526, + "step": 2955 + }, + { + "epoch": 0.16552805465337664, + "grad_norm": 1.3523439168930054, + "learning_rate": 
1.4770000000000001e-05, + "loss": 0.2999, + "step": 2956 + }, + { + "epoch": 0.16558405196550566, + "grad_norm": 1.2778282165527344, + "learning_rate": 1.4775e-05, + "loss": 0.4493, + "step": 2957 + }, + { + "epoch": 0.16564004927763468, + "grad_norm": 1.4594963788986206, + "learning_rate": 1.4779999999999999e-05, + "loss": 0.4177, + "step": 2958 + }, + { + "epoch": 0.1656960465897637, + "grad_norm": 1.1730430126190186, + "learning_rate": 1.4785000000000002e-05, + "loss": 0.4972, + "step": 2959 + }, + { + "epoch": 0.16575204390189271, + "grad_norm": 1.3805487155914307, + "learning_rate": 1.479e-05, + "loss": 0.4852, + "step": 2960 + }, + { + "epoch": 0.16580804121402173, + "grad_norm": 1.6004854440689087, + "learning_rate": 1.4795e-05, + "loss": 0.4528, + "step": 2961 + }, + { + "epoch": 0.16586403852615075, + "grad_norm": 1.24202299118042, + "learning_rate": 1.48e-05, + "loss": 0.4216, + "step": 2962 + }, + { + "epoch": 0.16592003583827977, + "grad_norm": 1.1655060052871704, + "learning_rate": 1.4805e-05, + "loss": 0.298, + "step": 2963 + }, + { + "epoch": 0.1659760331504088, + "grad_norm": 1.1057674884796143, + "learning_rate": 1.4810000000000002e-05, + "loss": 0.3616, + "step": 2964 + }, + { + "epoch": 0.1660320304625378, + "grad_norm": 1.1026337146759033, + "learning_rate": 1.4815000000000001e-05, + "loss": 0.4615, + "step": 2965 + }, + { + "epoch": 0.1660880277746668, + "grad_norm": 1.2011353969573975, + "learning_rate": 1.482e-05, + "loss": 0.4535, + "step": 2966 + }, + { + "epoch": 0.16614402508679582, + "grad_norm": 1.1062843799591064, + "learning_rate": 1.4825e-05, + "loss": 0.3815, + "step": 2967 + }, + { + "epoch": 0.16620002239892484, + "grad_norm": 1.2528876066207886, + "learning_rate": 1.4829999999999999e-05, + "loss": 0.4937, + "step": 2968 + }, + { + "epoch": 0.16625601971105386, + "grad_norm": 1.081746220588684, + "learning_rate": 1.4835000000000001e-05, + "loss": 0.4291, + "step": 2969 + }, + { + "epoch": 0.16631201702318288, + "grad_norm": 
1.4553874731063843, + "learning_rate": 1.4840000000000002e-05, + "loss": 0.5446, + "step": 2970 + }, + { + "epoch": 0.1663680143353119, + "grad_norm": 1.3763076066970825, + "learning_rate": 1.4845000000000001e-05, + "loss": 0.4954, + "step": 2971 + }, + { + "epoch": 0.16642401164744092, + "grad_norm": 1.634247899055481, + "learning_rate": 1.485e-05, + "loss": 0.5409, + "step": 2972 + }, + { + "epoch": 0.16648000895956994, + "grad_norm": 1.0740395784378052, + "learning_rate": 1.4855e-05, + "loss": 0.4845, + "step": 2973 + }, + { + "epoch": 0.16653600627169896, + "grad_norm": 1.0615090131759644, + "learning_rate": 1.4860000000000002e-05, + "loss": 0.431, + "step": 2974 + }, + { + "epoch": 0.16659200358382797, + "grad_norm": 1.1746183633804321, + "learning_rate": 1.4865e-05, + "loss": 0.4175, + "step": 2975 + }, + { + "epoch": 0.166648000895957, + "grad_norm": 1.1579493284225464, + "learning_rate": 1.487e-05, + "loss": 0.5676, + "step": 2976 + }, + { + "epoch": 0.166703998208086, + "grad_norm": 1.1894301176071167, + "learning_rate": 1.4875e-05, + "loss": 0.3935, + "step": 2977 + }, + { + "epoch": 0.16675999552021503, + "grad_norm": 1.4352726936340332, + "learning_rate": 1.488e-05, + "loss": 0.525, + "step": 2978 + }, + { + "epoch": 0.16681599283234405, + "grad_norm": 1.2767298221588135, + "learning_rate": 1.4885000000000002e-05, + "loss": 0.4904, + "step": 2979 + }, + { + "epoch": 0.16687199014447307, + "grad_norm": 1.2601221799850464, + "learning_rate": 1.4890000000000001e-05, + "loss": 0.4483, + "step": 2980 + }, + { + "epoch": 0.1669279874566021, + "grad_norm": 1.524616003036499, + "learning_rate": 1.4895e-05, + "loss": 0.6168, + "step": 2981 + }, + { + "epoch": 0.1669839847687311, + "grad_norm": 1.2325519323349, + "learning_rate": 1.49e-05, + "loss": 0.4345, + "step": 2982 + }, + { + "epoch": 0.16703998208086013, + "grad_norm": 1.2944282293319702, + "learning_rate": 1.4904999999999999e-05, + "loss": 0.5398, + "step": 2983 + }, + { + "epoch": 0.16709597939298915, + 
"grad_norm": 1.2927069664001465, + "learning_rate": 1.4910000000000001e-05, + "loss": 0.6528, + "step": 2984 + }, + { + "epoch": 0.16715197670511817, + "grad_norm": 1.38869047164917, + "learning_rate": 1.4915000000000002e-05, + "loss": 0.4316, + "step": 2985 + }, + { + "epoch": 0.16720797401724719, + "grad_norm": 1.2885438203811646, + "learning_rate": 1.4920000000000001e-05, + "loss": 0.4888, + "step": 2986 + }, + { + "epoch": 0.16726397132937618, + "grad_norm": 1.0417410135269165, + "learning_rate": 1.4925e-05, + "loss": 0.3589, + "step": 2987 + }, + { + "epoch": 0.1673199686415052, + "grad_norm": 1.3367822170257568, + "learning_rate": 1.493e-05, + "loss": 0.495, + "step": 2988 + }, + { + "epoch": 0.16737596595363421, + "grad_norm": 1.2681429386138916, + "learning_rate": 1.4935000000000002e-05, + "loss": 0.411, + "step": 2989 + }, + { + "epoch": 0.16743196326576323, + "grad_norm": 1.2044123411178589, + "learning_rate": 1.4940000000000001e-05, + "loss": 0.4475, + "step": 2990 + }, + { + "epoch": 0.16748796057789225, + "grad_norm": 1.0016268491744995, + "learning_rate": 1.4945e-05, + "loss": 0.3603, + "step": 2991 + }, + { + "epoch": 0.16754395789002127, + "grad_norm": 1.2403982877731323, + "learning_rate": 1.4950000000000001e-05, + "loss": 0.3746, + "step": 2992 + }, + { + "epoch": 0.1675999552021503, + "grad_norm": 1.594496250152588, + "learning_rate": 1.4955e-05, + "loss": 0.625, + "step": 2993 + }, + { + "epoch": 0.1676559525142793, + "grad_norm": 1.1329015493392944, + "learning_rate": 1.4960000000000002e-05, + "loss": 0.3695, + "step": 2994 + }, + { + "epoch": 0.16771194982640833, + "grad_norm": 1.2726589441299438, + "learning_rate": 1.4965000000000002e-05, + "loss": 0.4215, + "step": 2995 + }, + { + "epoch": 0.16776794713853735, + "grad_norm": 1.1244759559631348, + "learning_rate": 1.497e-05, + "loss": 0.4554, + "step": 2996 + }, + { + "epoch": 0.16782394445066637, + "grad_norm": 1.028041958808899, + "learning_rate": 1.4975e-05, + "loss": 0.4082, + "step": 
2997 + }, + { + "epoch": 0.1678799417627954, + "grad_norm": 1.3320670127868652, + "learning_rate": 1.4979999999999999e-05, + "loss": 0.5336, + "step": 2998 + }, + { + "epoch": 0.1679359390749244, + "grad_norm": 1.1810829639434814, + "learning_rate": 1.4985000000000001e-05, + "loss": 0.4698, + "step": 2999 + }, + { + "epoch": 0.16799193638705343, + "grad_norm": 1.2294703722000122, + "learning_rate": 1.499e-05, + "loss": 0.5342, + "step": 3000 + }, + { + "epoch": 0.16804793369918244, + "grad_norm": 13.666152954101562, + "learning_rate": 1.4995000000000001e-05, + "loss": 0.4374, + "step": 3001 + }, + { + "epoch": 0.16810393101131146, + "grad_norm": 1.1388001441955566, + "learning_rate": 1.5e-05, + "loss": 0.4293, + "step": 3002 + }, + { + "epoch": 0.16815992832344048, + "grad_norm": 1.140563726425171, + "learning_rate": 1.5005e-05, + "loss": 0.411, + "step": 3003 + }, + { + "epoch": 0.1682159256355695, + "grad_norm": 1.1854054927825928, + "learning_rate": 1.5010000000000002e-05, + "loss": 0.3789, + "step": 3004 + }, + { + "epoch": 0.16827192294769852, + "grad_norm": 1.3162811994552612, + "learning_rate": 1.5015000000000001e-05, + "loss": 0.5886, + "step": 3005 + }, + { + "epoch": 0.16832792025982754, + "grad_norm": 1.1015150547027588, + "learning_rate": 1.502e-05, + "loss": 0.3127, + "step": 3006 + }, + { + "epoch": 0.16838391757195656, + "grad_norm": 1.1969358921051025, + "learning_rate": 1.5025000000000001e-05, + "loss": 0.4865, + "step": 3007 + }, + { + "epoch": 0.16843991488408555, + "grad_norm": 1.3611640930175781, + "learning_rate": 1.503e-05, + "loss": 0.5478, + "step": 3008 + }, + { + "epoch": 0.16849591219621457, + "grad_norm": 1.1446624994277954, + "learning_rate": 1.5035000000000003e-05, + "loss": 0.4432, + "step": 3009 + }, + { + "epoch": 0.1685519095083436, + "grad_norm": 1.3180516958236694, + "learning_rate": 1.5040000000000002e-05, + "loss": 0.4095, + "step": 3010 + }, + { + "epoch": 0.1686079068204726, + "grad_norm": 1.0477768182754517, + 
"learning_rate": 1.5045e-05, + "loss": 0.4066, + "step": 3011 + }, + { + "epoch": 0.16866390413260163, + "grad_norm": 1.051212191581726, + "learning_rate": 1.505e-05, + "loss": 0.3413, + "step": 3012 + }, + { + "epoch": 0.16871990144473065, + "grad_norm": 1.187883734703064, + "learning_rate": 1.5054999999999999e-05, + "loss": 0.3827, + "step": 3013 + }, + { + "epoch": 0.16877589875685967, + "grad_norm": 1.0885032415390015, + "learning_rate": 1.5060000000000001e-05, + "loss": 0.4144, + "step": 3014 + }, + { + "epoch": 0.16883189606898868, + "grad_norm": 1.3088181018829346, + "learning_rate": 1.5065e-05, + "loss": 0.4413, + "step": 3015 + }, + { + "epoch": 0.1688878933811177, + "grad_norm": 1.3181278705596924, + "learning_rate": 1.5070000000000001e-05, + "loss": 0.3683, + "step": 3016 + }, + { + "epoch": 0.16894389069324672, + "grad_norm": 1.3163723945617676, + "learning_rate": 1.5075e-05, + "loss": 0.4511, + "step": 3017 + }, + { + "epoch": 0.16899988800537574, + "grad_norm": 1.29971444606781, + "learning_rate": 1.508e-05, + "loss": 0.4471, + "step": 3018 + }, + { + "epoch": 0.16905588531750476, + "grad_norm": 1.2885290384292603, + "learning_rate": 1.5085000000000002e-05, + "loss": 0.4663, + "step": 3019 + }, + { + "epoch": 0.16911188262963378, + "grad_norm": 1.0854727029800415, + "learning_rate": 1.5090000000000001e-05, + "loss": 0.4764, + "step": 3020 + }, + { + "epoch": 0.1691678799417628, + "grad_norm": 1.3393666744232178, + "learning_rate": 1.5095e-05, + "loss": 0.4239, + "step": 3021 + }, + { + "epoch": 0.16922387725389182, + "grad_norm": 1.4022681713104248, + "learning_rate": 1.51e-05, + "loss": 0.5483, + "step": 3022 + }, + { + "epoch": 0.16927987456602084, + "grad_norm": 1.2516348361968994, + "learning_rate": 1.5105e-05, + "loss": 0.4618, + "step": 3023 + }, + { + "epoch": 0.16933587187814986, + "grad_norm": 1.2458224296569824, + "learning_rate": 1.5110000000000003e-05, + "loss": 0.4085, + "step": 3024 + }, + { + "epoch": 0.16939186919027888, + "grad_norm": 
1.6810840368270874, + "learning_rate": 1.5115000000000002e-05, + "loss": 0.3476, + "step": 3025 + }, + { + "epoch": 0.1694478665024079, + "grad_norm": 1.6540249586105347, + "learning_rate": 1.5120000000000001e-05, + "loss": 0.4504, + "step": 3026 + }, + { + "epoch": 0.16950386381453691, + "grad_norm": 1.420183777809143, + "learning_rate": 1.5125e-05, + "loss": 0.5645, + "step": 3027 + }, + { + "epoch": 0.16955986112666593, + "grad_norm": 1.0648268461227417, + "learning_rate": 1.5129999999999999e-05, + "loss": 0.3991, + "step": 3028 + }, + { + "epoch": 0.16961585843879493, + "grad_norm": 1.2405067682266235, + "learning_rate": 1.5135000000000002e-05, + "loss": 0.5124, + "step": 3029 + }, + { + "epoch": 0.16967185575092394, + "grad_norm": 1.7114293575286865, + "learning_rate": 1.514e-05, + "loss": 0.5756, + "step": 3030 + }, + { + "epoch": 0.16972785306305296, + "grad_norm": 1.1942071914672852, + "learning_rate": 1.5145000000000002e-05, + "loss": 0.4559, + "step": 3031 + }, + { + "epoch": 0.16978385037518198, + "grad_norm": 1.2123992443084717, + "learning_rate": 1.515e-05, + "loss": 0.3711, + "step": 3032 + }, + { + "epoch": 0.169839847687311, + "grad_norm": 1.1135144233703613, + "learning_rate": 1.5155e-05, + "loss": 0.5915, + "step": 3033 + }, + { + "epoch": 0.16989584499944002, + "grad_norm": 1.4512019157409668, + "learning_rate": 1.5160000000000002e-05, + "loss": 0.4455, + "step": 3034 + }, + { + "epoch": 0.16995184231156904, + "grad_norm": 2.1916909217834473, + "learning_rate": 1.5165000000000001e-05, + "loss": 0.4776, + "step": 3035 + }, + { + "epoch": 0.17000783962369806, + "grad_norm": 1.0539445877075195, + "learning_rate": 1.517e-05, + "loss": 0.4514, + "step": 3036 + }, + { + "epoch": 0.17006383693582708, + "grad_norm": 1.3222472667694092, + "learning_rate": 1.5175e-05, + "loss": 0.4183, + "step": 3037 + }, + { + "epoch": 0.1701198342479561, + "grad_norm": 1.3580297231674194, + "learning_rate": 1.518e-05, + "loss": 0.5505, + "step": 3038 + }, + { + "epoch": 
0.17017583156008512, + "grad_norm": 1.196560025215149, + "learning_rate": 1.5185000000000003e-05, + "loss": 0.437, + "step": 3039 + }, + { + "epoch": 0.17023182887221414, + "grad_norm": 1.0886143445968628, + "learning_rate": 1.5190000000000002e-05, + "loss": 0.332, + "step": 3040 + }, + { + "epoch": 0.17028782618434316, + "grad_norm": 1.1416614055633545, + "learning_rate": 1.5195000000000001e-05, + "loss": 0.3313, + "step": 3041 + }, + { + "epoch": 0.17034382349647217, + "grad_norm": 1.246839165687561, + "learning_rate": 1.52e-05, + "loss": 0.5246, + "step": 3042 + }, + { + "epoch": 0.1703998208086012, + "grad_norm": 1.0934566259384155, + "learning_rate": 1.5205e-05, + "loss": 0.488, + "step": 3043 + }, + { + "epoch": 0.1704558181207302, + "grad_norm": 1.2822872400283813, + "learning_rate": 1.5210000000000002e-05, + "loss": 0.4606, + "step": 3044 + }, + { + "epoch": 0.17051181543285923, + "grad_norm": 1.2843599319458008, + "learning_rate": 1.5215000000000001e-05, + "loss": 0.4081, + "step": 3045 + }, + { + "epoch": 0.17056781274498825, + "grad_norm": 1.3425087928771973, + "learning_rate": 1.5220000000000002e-05, + "loss": 0.4268, + "step": 3046 + }, + { + "epoch": 0.17062381005711727, + "grad_norm": 1.0990785360336304, + "learning_rate": 1.5225e-05, + "loss": 0.3835, + "step": 3047 + }, + { + "epoch": 0.1706798073692463, + "grad_norm": 1.4088256359100342, + "learning_rate": 1.523e-05, + "loss": 0.8543, + "step": 3048 + }, + { + "epoch": 0.17073580468137528, + "grad_norm": 2.091273546218872, + "learning_rate": 1.5235000000000002e-05, + "loss": 0.4991, + "step": 3049 + }, + { + "epoch": 0.1707918019935043, + "grad_norm": 1.6073461771011353, + "learning_rate": 1.5240000000000001e-05, + "loss": 0.3343, + "step": 3050 + }, + { + "epoch": 0.17084779930563332, + "grad_norm": 1.0276155471801758, + "learning_rate": 1.5245e-05, + "loss": 0.3591, + "step": 3051 + }, + { + "epoch": 0.17090379661776234, + "grad_norm": 1.3987497091293335, + "learning_rate": 1.525e-05, + "loss": 
0.7108, + "step": 3052 + }, + { + "epoch": 0.17095979392989136, + "grad_norm": 1.3775502443313599, + "learning_rate": 1.5255e-05, + "loss": 0.3814, + "step": 3053 + }, + { + "epoch": 0.17101579124202038, + "grad_norm": 1.2856470346450806, + "learning_rate": 1.5260000000000003e-05, + "loss": 0.4444, + "step": 3054 + }, + { + "epoch": 0.1710717885541494, + "grad_norm": 1.4564297199249268, + "learning_rate": 1.5265e-05, + "loss": 0.5069, + "step": 3055 + }, + { + "epoch": 0.17112778586627841, + "grad_norm": 1.328133225440979, + "learning_rate": 1.527e-05, + "loss": 0.5168, + "step": 3056 + }, + { + "epoch": 0.17118378317840743, + "grad_norm": 1.12852144241333, + "learning_rate": 1.5275000000000002e-05, + "loss": 0.4805, + "step": 3057 + }, + { + "epoch": 0.17123978049053645, + "grad_norm": 1.114654541015625, + "learning_rate": 1.528e-05, + "loss": 0.4181, + "step": 3058 + }, + { + "epoch": 0.17129577780266547, + "grad_norm": 1.1266412734985352, + "learning_rate": 1.5285000000000004e-05, + "loss": 0.3826, + "step": 3059 + }, + { + "epoch": 0.1713517751147945, + "grad_norm": 1.3138937950134277, + "learning_rate": 1.529e-05, + "loss": 0.4112, + "step": 3060 + }, + { + "epoch": 0.1714077724269235, + "grad_norm": 1.2326487302780151, + "learning_rate": 1.5295000000000002e-05, + "loss": 0.4183, + "step": 3061 + }, + { + "epoch": 0.17146376973905253, + "grad_norm": 1.2712304592132568, + "learning_rate": 1.53e-05, + "loss": 0.5426, + "step": 3062 + }, + { + "epoch": 0.17151976705118155, + "grad_norm": 2.414008617401123, + "learning_rate": 1.5305e-05, + "loss": 0.5064, + "step": 3063 + }, + { + "epoch": 0.17157576436331057, + "grad_norm": 1.1649055480957031, + "learning_rate": 1.531e-05, + "loss": 0.4934, + "step": 3064 + }, + { + "epoch": 0.1716317616754396, + "grad_norm": 1.3338874578475952, + "learning_rate": 1.5315e-05, + "loss": 0.3964, + "step": 3065 + }, + { + "epoch": 0.1716877589875686, + "grad_norm": 1.398659348487854, + "learning_rate": 1.5320000000000002e-05, + 
"loss": 0.4128, + "step": 3066 + }, + { + "epoch": 0.17174375629969763, + "grad_norm": 1.2956663370132446, + "learning_rate": 1.5325e-05, + "loss": 0.4504, + "step": 3067 + }, + { + "epoch": 0.17179975361182664, + "grad_norm": 1.2123852968215942, + "learning_rate": 1.533e-05, + "loss": 0.3877, + "step": 3068 + }, + { + "epoch": 0.17185575092395566, + "grad_norm": 1.2996089458465576, + "learning_rate": 1.5334999999999998e-05, + "loss": 0.3504, + "step": 3069 + }, + { + "epoch": 0.17191174823608465, + "grad_norm": 1.2874212265014648, + "learning_rate": 1.5340000000000002e-05, + "loss": 0.4792, + "step": 3070 + }, + { + "epoch": 0.17196774554821367, + "grad_norm": 1.4098880290985107, + "learning_rate": 1.5345e-05, + "loss": 0.5226, + "step": 3071 + }, + { + "epoch": 0.1720237428603427, + "grad_norm": 1.5210481882095337, + "learning_rate": 1.535e-05, + "loss": 0.6552, + "step": 3072 + }, + { + "epoch": 0.1720797401724717, + "grad_norm": 1.165636658668518, + "learning_rate": 1.5355e-05, + "loss": 0.3838, + "step": 3073 + }, + { + "epoch": 0.17213573748460073, + "grad_norm": 1.2327104806900024, + "learning_rate": 1.536e-05, + "loss": 0.5711, + "step": 3074 + }, + { + "epoch": 0.17219173479672975, + "grad_norm": 1.2012931108474731, + "learning_rate": 1.5365000000000003e-05, + "loss": 0.3197, + "step": 3075 + }, + { + "epoch": 0.17224773210885877, + "grad_norm": 1.347421407699585, + "learning_rate": 1.537e-05, + "loss": 0.4665, + "step": 3076 + }, + { + "epoch": 0.1723037294209878, + "grad_norm": 1.07391357421875, + "learning_rate": 1.5375e-05, + "loss": 0.3609, + "step": 3077 + }, + { + "epoch": 0.1723597267331168, + "grad_norm": 1.2701846361160278, + "learning_rate": 1.538e-05, + "loss": 0.4394, + "step": 3078 + }, + { + "epoch": 0.17241572404524583, + "grad_norm": 1.3560216426849365, + "learning_rate": 1.5385e-05, + "loss": 0.3959, + "step": 3079 + }, + { + "epoch": 0.17247172135737485, + "grad_norm": 1.2126895189285278, + "learning_rate": 1.539e-05, + "loss": 0.4248, + 
"step": 3080 + }, + { + "epoch": 0.17252771866950387, + "grad_norm": 1.192703127861023, + "learning_rate": 1.5395e-05, + "loss": 0.4845, + "step": 3081 + }, + { + "epoch": 0.17258371598163288, + "grad_norm": 1.2717024087905884, + "learning_rate": 1.54e-05, + "loss": 0.3706, + "step": 3082 + }, + { + "epoch": 0.1726397132937619, + "grad_norm": 1.38326096534729, + "learning_rate": 1.5405e-05, + "loss": 0.3618, + "step": 3083 + }, + { + "epoch": 0.17269571060589092, + "grad_norm": 1.286408543586731, + "learning_rate": 1.541e-05, + "loss": 0.4486, + "step": 3084 + }, + { + "epoch": 0.17275170791801994, + "grad_norm": 1.0604596138000488, + "learning_rate": 1.5415e-05, + "loss": 0.3375, + "step": 3085 + }, + { + "epoch": 0.17280770523014896, + "grad_norm": 1.3629469871520996, + "learning_rate": 1.542e-05, + "loss": 0.4036, + "step": 3086 + }, + { + "epoch": 0.17286370254227798, + "grad_norm": 1.4825447797775269, + "learning_rate": 1.5425000000000002e-05, + "loss": 0.4507, + "step": 3087 + }, + { + "epoch": 0.172919699854407, + "grad_norm": 1.2464237213134766, + "learning_rate": 1.543e-05, + "loss": 0.4137, + "step": 3088 + }, + { + "epoch": 0.17297569716653602, + "grad_norm": 1.3638365268707275, + "learning_rate": 1.5435e-05, + "loss": 0.3957, + "step": 3089 + }, + { + "epoch": 0.17303169447866504, + "grad_norm": 1.085571050643921, + "learning_rate": 1.544e-05, + "loss": 0.4331, + "step": 3090 + }, + { + "epoch": 0.17308769179079403, + "grad_norm": 1.0439016819000244, + "learning_rate": 1.5445000000000002e-05, + "loss": 0.4425, + "step": 3091 + }, + { + "epoch": 0.17314368910292305, + "grad_norm": 1.42640221118927, + "learning_rate": 1.545e-05, + "loss": 0.5665, + "step": 3092 + }, + { + "epoch": 0.17319968641505207, + "grad_norm": 1.3006322383880615, + "learning_rate": 1.5455e-05, + "loss": 0.5638, + "step": 3093 + }, + { + "epoch": 0.1732556837271811, + "grad_norm": 1.4400304555892944, + "learning_rate": 1.546e-05, + "loss": 0.4891, + "step": 3094 + }, + { + "epoch": 
0.1733116810393101, + "grad_norm": 1.1101337671279907, + "learning_rate": 1.5465000000000002e-05, + "loss": 0.3336, + "step": 3095 + }, + { + "epoch": 0.17336767835143913, + "grad_norm": 1.6164960861206055, + "learning_rate": 1.5470000000000003e-05, + "loss": 0.3791, + "step": 3096 + }, + { + "epoch": 0.17342367566356814, + "grad_norm": 1.3061712980270386, + "learning_rate": 1.5475e-05, + "loss": 0.4327, + "step": 3097 + }, + { + "epoch": 0.17347967297569716, + "grad_norm": 1.2732435464859009, + "learning_rate": 1.548e-05, + "loss": 0.4474, + "step": 3098 + }, + { + "epoch": 0.17353567028782618, + "grad_norm": 1.1919748783111572, + "learning_rate": 1.5484999999999998e-05, + "loss": 0.3302, + "step": 3099 + }, + { + "epoch": 0.1735916675999552, + "grad_norm": 1.0183470249176025, + "learning_rate": 1.5490000000000002e-05, + "loss": 0.3668, + "step": 3100 + }, + { + "epoch": 0.17364766491208422, + "grad_norm": 1.2144869565963745, + "learning_rate": 1.5495e-05, + "loss": 0.3694, + "step": 3101 + }, + { + "epoch": 0.17370366222421324, + "grad_norm": 1.2006837129592896, + "learning_rate": 1.55e-05, + "loss": 0.3806, + "step": 3102 + }, + { + "epoch": 0.17375965953634226, + "grad_norm": 1.1483407020568848, + "learning_rate": 1.5505e-05, + "loss": 0.4532, + "step": 3103 + }, + { + "epoch": 0.17381565684847128, + "grad_norm": 1.499038577079773, + "learning_rate": 1.551e-05, + "loss": 0.4937, + "step": 3104 + }, + { + "epoch": 0.1738716541606003, + "grad_norm": 1.1571745872497559, + "learning_rate": 1.5515000000000003e-05, + "loss": 0.3937, + "step": 3105 + }, + { + "epoch": 0.17392765147272932, + "grad_norm": 1.2323365211486816, + "learning_rate": 1.552e-05, + "loss": 0.407, + "step": 3106 + }, + { + "epoch": 0.17398364878485834, + "grad_norm": 4.923003673553467, + "learning_rate": 1.5525e-05, + "loss": 0.3192, + "step": 3107 + }, + { + "epoch": 0.17403964609698735, + "grad_norm": 1.1060408353805542, + "learning_rate": 1.553e-05, + "loss": 0.4702, + "step": 3108 + }, + { + 
"epoch": 0.17409564340911637, + "grad_norm": 1.3280766010284424, + "learning_rate": 1.5535e-05, + "loss": 0.6155, + "step": 3109 + }, + { + "epoch": 0.1741516407212454, + "grad_norm": 1.5248066186904907, + "learning_rate": 1.554e-05, + "loss": 0.5402, + "step": 3110 + }, + { + "epoch": 0.17420763803337438, + "grad_norm": 1.3715664148330688, + "learning_rate": 1.5545e-05, + "loss": 0.4532, + "step": 3111 + }, + { + "epoch": 0.1742636353455034, + "grad_norm": 1.1363284587860107, + "learning_rate": 1.5550000000000002e-05, + "loss": 0.5191, + "step": 3112 + }, + { + "epoch": 0.17431963265763242, + "grad_norm": 1.1898376941680908, + "learning_rate": 1.5555e-05, + "loss": 0.3865, + "step": 3113 + }, + { + "epoch": 0.17437562996976144, + "grad_norm": 1.2565094232559204, + "learning_rate": 1.556e-05, + "loss": 0.762, + "step": 3114 + }, + { + "epoch": 0.17443162728189046, + "grad_norm": 1.0949702262878418, + "learning_rate": 1.5565e-05, + "loss": 0.4829, + "step": 3115 + }, + { + "epoch": 0.17448762459401948, + "grad_norm": 1.1513077020645142, + "learning_rate": 1.5570000000000002e-05, + "loss": 0.5075, + "step": 3116 + }, + { + "epoch": 0.1745436219061485, + "grad_norm": 1.1390165090560913, + "learning_rate": 1.5575e-05, + "loss": 0.4113, + "step": 3117 + }, + { + "epoch": 0.17459961921827752, + "grad_norm": 1.1877434253692627, + "learning_rate": 1.558e-05, + "loss": 0.3997, + "step": 3118 + }, + { + "epoch": 0.17465561653040654, + "grad_norm": 1.2542484998703003, + "learning_rate": 1.5585e-05, + "loss": 0.3045, + "step": 3119 + }, + { + "epoch": 0.17471161384253556, + "grad_norm": 1.3228293657302856, + "learning_rate": 1.559e-05, + "loss": 0.4405, + "step": 3120 + }, + { + "epoch": 0.17476761115466458, + "grad_norm": 1.401941180229187, + "learning_rate": 1.5595000000000002e-05, + "loss": 0.4046, + "step": 3121 + }, + { + "epoch": 0.1748236084667936, + "grad_norm": 1.18621826171875, + "learning_rate": 1.56e-05, + "loss": 0.3908, + "step": 3122 + }, + { + "epoch": 
0.17487960577892261, + "grad_norm": 1.0453542470932007, + "learning_rate": 1.5605e-05, + "loss": 0.2901, + "step": 3123 + }, + { + "epoch": 0.17493560309105163, + "grad_norm": 1.1331298351287842, + "learning_rate": 1.561e-05, + "loss": 0.3826, + "step": 3124 + }, + { + "epoch": 0.17499160040318065, + "grad_norm": 1.3581092357635498, + "learning_rate": 1.5615000000000002e-05, + "loss": 0.585, + "step": 3125 + }, + { + "epoch": 0.17504759771530967, + "grad_norm": 1.1543517112731934, + "learning_rate": 1.5620000000000003e-05, + "loss": 0.3554, + "step": 3126 + }, + { + "epoch": 0.1751035950274387, + "grad_norm": 1.265600562095642, + "learning_rate": 1.5625e-05, + "loss": 0.4692, + "step": 3127 + }, + { + "epoch": 0.1751595923395677, + "grad_norm": 1.3256622552871704, + "learning_rate": 1.563e-05, + "loss": 0.4647, + "step": 3128 + }, + { + "epoch": 0.17521558965169673, + "grad_norm": 1.303101897239685, + "learning_rate": 1.5635e-05, + "loss": 0.359, + "step": 3129 + }, + { + "epoch": 0.17527158696382575, + "grad_norm": 1.1615647077560425, + "learning_rate": 1.5640000000000003e-05, + "loss": 0.5086, + "step": 3130 + }, + { + "epoch": 0.17532758427595477, + "grad_norm": 1.257786750793457, + "learning_rate": 1.5645e-05, + "loss": 0.4834, + "step": 3131 + }, + { + "epoch": 0.17538358158808376, + "grad_norm": 1.3199831247329712, + "learning_rate": 1.565e-05, + "loss": 0.4673, + "step": 3132 + }, + { + "epoch": 0.17543957890021278, + "grad_norm": 1.4349507093429565, + "learning_rate": 1.5655000000000002e-05, + "loss": 0.5471, + "step": 3133 + }, + { + "epoch": 0.1754955762123418, + "grad_norm": 1.3560909032821655, + "learning_rate": 1.566e-05, + "loss": 0.4311, + "step": 3134 + }, + { + "epoch": 0.17555157352447082, + "grad_norm": 1.2484694719314575, + "learning_rate": 1.5665000000000003e-05, + "loss": 0.4119, + "step": 3135 + }, + { + "epoch": 0.17560757083659984, + "grad_norm": 1.0213234424591064, + "learning_rate": 1.567e-05, + "loss": 0.3423, + "step": 3136 + }, + { + 
"epoch": 0.17566356814872885, + "grad_norm": 1.153052806854248, + "learning_rate": 1.5675e-05, + "loss": 0.4602, + "step": 3137 + }, + { + "epoch": 0.17571956546085787, + "grad_norm": 1.1225149631500244, + "learning_rate": 1.568e-05, + "loss": 0.3676, + "step": 3138 + }, + { + "epoch": 0.1757755627729869, + "grad_norm": 1.1485469341278076, + "learning_rate": 1.5685e-05, + "loss": 0.5003, + "step": 3139 + }, + { + "epoch": 0.1758315600851159, + "grad_norm": 1.3653138875961304, + "learning_rate": 1.569e-05, + "loss": 0.497, + "step": 3140 + }, + { + "epoch": 0.17588755739724493, + "grad_norm": 1.5089391469955444, + "learning_rate": 1.5695e-05, + "loss": 0.3692, + "step": 3141 + }, + { + "epoch": 0.17594355470937395, + "grad_norm": 1.2435405254364014, + "learning_rate": 1.5700000000000002e-05, + "loss": 0.4499, + "step": 3142 + }, + { + "epoch": 0.17599955202150297, + "grad_norm": 1.225533366203308, + "learning_rate": 1.5705e-05, + "loss": 0.4798, + "step": 3143 + }, + { + "epoch": 0.176055549333632, + "grad_norm": 1.2281639575958252, + "learning_rate": 1.571e-05, + "loss": 0.4124, + "step": 3144 + }, + { + "epoch": 0.176111546645761, + "grad_norm": 1.0934711694717407, + "learning_rate": 1.5715e-05, + "loss": 0.385, + "step": 3145 + }, + { + "epoch": 0.17616754395789003, + "grad_norm": 1.3133177757263184, + "learning_rate": 1.5720000000000002e-05, + "loss": 0.6233, + "step": 3146 + }, + { + "epoch": 0.17622354127001905, + "grad_norm": 0.9715774059295654, + "learning_rate": 1.5725e-05, + "loss": 0.3162, + "step": 3147 + }, + { + "epoch": 0.17627953858214807, + "grad_norm": 1.1965703964233398, + "learning_rate": 1.573e-05, + "loss": 0.3621, + "step": 3148 + }, + { + "epoch": 0.17633553589427708, + "grad_norm": 1.2315959930419922, + "learning_rate": 1.5735e-05, + "loss": 0.4043, + "step": 3149 + }, + { + "epoch": 0.1763915332064061, + "grad_norm": 0.9554809927940369, + "learning_rate": 1.5740000000000002e-05, + "loss": 0.3764, + "step": 3150 + }, + { + "epoch": 
0.17644753051853512, + "grad_norm": 1.1647918224334717, + "learning_rate": 1.5745000000000003e-05, + "loss": 0.6171, + "step": 3151 + }, + { + "epoch": 0.17650352783066414, + "grad_norm": 1.220215082168579, + "learning_rate": 1.575e-05, + "loss": 0.5622, + "step": 3152 + }, + { + "epoch": 0.17655952514279313, + "grad_norm": 1.1583800315856934, + "learning_rate": 1.5755e-05, + "loss": 0.3858, + "step": 3153 + }, + { + "epoch": 0.17661552245492215, + "grad_norm": 1.3775138854980469, + "learning_rate": 1.5759999999999998e-05, + "loss": 0.5959, + "step": 3154 + }, + { + "epoch": 0.17667151976705117, + "grad_norm": 1.2710376977920532, + "learning_rate": 1.5765000000000002e-05, + "loss": 0.4621, + "step": 3155 + }, + { + "epoch": 0.1767275170791802, + "grad_norm": 1.2439836263656616, + "learning_rate": 1.577e-05, + "loss": 0.4974, + "step": 3156 + }, + { + "epoch": 0.1767835143913092, + "grad_norm": 1.2212399244308472, + "learning_rate": 1.5775e-05, + "loss": 0.5128, + "step": 3157 + }, + { + "epoch": 0.17683951170343823, + "grad_norm": 1.3426244258880615, + "learning_rate": 1.578e-05, + "loss": 0.5533, + "step": 3158 + }, + { + "epoch": 0.17689550901556725, + "grad_norm": 1.1366629600524902, + "learning_rate": 1.5785e-05, + "loss": 0.4811, + "step": 3159 + }, + { + "epoch": 0.17695150632769627, + "grad_norm": 1.3456504344940186, + "learning_rate": 1.5790000000000003e-05, + "loss": 0.4341, + "step": 3160 + }, + { + "epoch": 0.1770075036398253, + "grad_norm": 1.1240496635437012, + "learning_rate": 1.5795e-05, + "loss": 0.4233, + "step": 3161 + }, + { + "epoch": 0.1770635009519543, + "grad_norm": 1.4094661474227905, + "learning_rate": 1.58e-05, + "loss": 0.5234, + "step": 3162 + }, + { + "epoch": 0.17711949826408332, + "grad_norm": 1.1591876745224, + "learning_rate": 1.5805000000000002e-05, + "loss": 0.5352, + "step": 3163 + }, + { + "epoch": 0.17717549557621234, + "grad_norm": 1.3503307104110718, + "learning_rate": 1.581e-05, + "loss": 0.5129, + "step": 3164 + }, + { + 
"epoch": 0.17723149288834136, + "grad_norm": 1.1911805868148804, + "learning_rate": 1.5815000000000004e-05, + "loss": 0.4462, + "step": 3165 + }, + { + "epoch": 0.17728749020047038, + "grad_norm": 1.2154439687728882, + "learning_rate": 1.582e-05, + "loss": 0.4113, + "step": 3166 + }, + { + "epoch": 0.1773434875125994, + "grad_norm": 1.424020528793335, + "learning_rate": 1.5825000000000002e-05, + "loss": 0.4254, + "step": 3167 + }, + { + "epoch": 0.17739948482472842, + "grad_norm": 1.2356969118118286, + "learning_rate": 1.583e-05, + "loss": 0.4238, + "step": 3168 + }, + { + "epoch": 0.17745548213685744, + "grad_norm": 1.2835613489151, + "learning_rate": 1.5835e-05, + "loss": 0.4716, + "step": 3169 + }, + { + "epoch": 0.17751147944898646, + "grad_norm": 1.5604387521743774, + "learning_rate": 1.584e-05, + "loss": 0.5455, + "step": 3170 + }, + { + "epoch": 0.17756747676111548, + "grad_norm": 1.066110372543335, + "learning_rate": 1.5845e-05, + "loss": 0.4306, + "step": 3171 + }, + { + "epoch": 0.1776234740732445, + "grad_norm": 1.3697227239608765, + "learning_rate": 1.5850000000000002e-05, + "loss": 0.4028, + "step": 3172 + }, + { + "epoch": 0.1776794713853735, + "grad_norm": 1.1536370515823364, + "learning_rate": 1.5855e-05, + "loss": 0.5904, + "step": 3173 + }, + { + "epoch": 0.1777354686975025, + "grad_norm": 1.0349966287612915, + "learning_rate": 1.586e-05, + "loss": 0.3471, + "step": 3174 + }, + { + "epoch": 0.17779146600963153, + "grad_norm": 1.278933048248291, + "learning_rate": 1.5865e-05, + "loss": 0.4645, + "step": 3175 + }, + { + "epoch": 0.17784746332176055, + "grad_norm": 1.2004108428955078, + "learning_rate": 1.5870000000000002e-05, + "loss": 0.5068, + "step": 3176 + }, + { + "epoch": 0.17790346063388957, + "grad_norm": 1.3695133924484253, + "learning_rate": 1.5875e-05, + "loss": 0.5353, + "step": 3177 + }, + { + "epoch": 0.17795945794601858, + "grad_norm": 1.1981608867645264, + "learning_rate": 1.588e-05, + "loss": 0.3492, + "step": 3178 + }, + { + 
"epoch": 0.1780154552581476, + "grad_norm": 1.4457505941390991, + "learning_rate": 1.5885e-05, + "loss": 0.6995, + "step": 3179 + }, + { + "epoch": 0.17807145257027662, + "grad_norm": 1.5259352922439575, + "learning_rate": 1.5890000000000002e-05, + "loss": 0.7483, + "step": 3180 + }, + { + "epoch": 0.17812744988240564, + "grad_norm": 1.0876466035842896, + "learning_rate": 1.5895000000000003e-05, + "loss": 0.4789, + "step": 3181 + }, + { + "epoch": 0.17818344719453466, + "grad_norm": 1.0859925746917725, + "learning_rate": 1.59e-05, + "loss": 0.4018, + "step": 3182 + }, + { + "epoch": 0.17823944450666368, + "grad_norm": 1.1666775941848755, + "learning_rate": 1.5905e-05, + "loss": 0.4115, + "step": 3183 + }, + { + "epoch": 0.1782954418187927, + "grad_norm": 1.1463675498962402, + "learning_rate": 1.591e-05, + "loss": 0.3401, + "step": 3184 + }, + { + "epoch": 0.17835143913092172, + "grad_norm": 1.154799461364746, + "learning_rate": 1.5915000000000003e-05, + "loss": 0.3368, + "step": 3185 + }, + { + "epoch": 0.17840743644305074, + "grad_norm": 1.1332404613494873, + "learning_rate": 1.592e-05, + "loss": 0.4372, + "step": 3186 + }, + { + "epoch": 0.17846343375517976, + "grad_norm": 1.2234429121017456, + "learning_rate": 1.5925e-05, + "loss": 0.4924, + "step": 3187 + }, + { + "epoch": 0.17851943106730878, + "grad_norm": 1.1957119703292847, + "learning_rate": 1.593e-05, + "loss": 0.4729, + "step": 3188 + }, + { + "epoch": 0.1785754283794378, + "grad_norm": 1.2825809717178345, + "learning_rate": 1.5935e-05, + "loss": 0.4555, + "step": 3189 + }, + { + "epoch": 0.17863142569156681, + "grad_norm": 3.089897871017456, + "learning_rate": 1.594e-05, + "loss": 0.4048, + "step": 3190 + }, + { + "epoch": 0.17868742300369583, + "grad_norm": 1.099593997001648, + "learning_rate": 1.5945e-05, + "loss": 0.3809, + "step": 3191 + }, + { + "epoch": 0.17874342031582485, + "grad_norm": 1.1275124549865723, + "learning_rate": 1.595e-05, + "loss": 0.3926, + "step": 3192 + }, + { + "epoch": 
0.17879941762795387, + "grad_norm": 1.413540005683899, + "learning_rate": 1.5955e-05, + "loss": 0.4899, + "step": 3193 + }, + { + "epoch": 0.17885541494008286, + "grad_norm": 1.2551677227020264, + "learning_rate": 1.596e-05, + "loss": 0.4831, + "step": 3194 + }, + { + "epoch": 0.17891141225221188, + "grad_norm": 1.3640986680984497, + "learning_rate": 1.5965e-05, + "loss": 0.6391, + "step": 3195 + }, + { + "epoch": 0.1789674095643409, + "grad_norm": 1.2376562356948853, + "learning_rate": 1.597e-05, + "loss": 0.4542, + "step": 3196 + }, + { + "epoch": 0.17902340687646992, + "grad_norm": 1.3746341466903687, + "learning_rate": 1.5975000000000002e-05, + "loss": 0.5066, + "step": 3197 + }, + { + "epoch": 0.17907940418859894, + "grad_norm": 0.9634360074996948, + "learning_rate": 1.598e-05, + "loss": 0.3555, + "step": 3198 + }, + { + "epoch": 0.17913540150072796, + "grad_norm": 1.433107852935791, + "learning_rate": 1.5985e-05, + "loss": 0.4511, + "step": 3199 + }, + { + "epoch": 0.17919139881285698, + "grad_norm": 1.2713160514831543, + "learning_rate": 1.599e-05, + "loss": 0.3922, + "step": 3200 + }, + { + "epoch": 0.179247396124986, + "grad_norm": 1.3845282793045044, + "learning_rate": 1.5995000000000002e-05, + "loss": 0.5921, + "step": 3201 + }, + { + "epoch": 0.17930339343711502, + "grad_norm": 1.1872634887695312, + "learning_rate": 1.6000000000000003e-05, + "loss": 0.4316, + "step": 3202 + }, + { + "epoch": 0.17935939074924404, + "grad_norm": 1.2111296653747559, + "learning_rate": 1.6005e-05, + "loss": 0.3921, + "step": 3203 + }, + { + "epoch": 0.17941538806137305, + "grad_norm": 1.0769726037979126, + "learning_rate": 1.601e-05, + "loss": 0.3636, + "step": 3204 + }, + { + "epoch": 0.17947138537350207, + "grad_norm": 1.1083983182907104, + "learning_rate": 1.6014999999999998e-05, + "loss": 0.3965, + "step": 3205 + }, + { + "epoch": 0.1795273826856311, + "grad_norm": 1.2002310752868652, + "learning_rate": 1.6020000000000002e-05, + "loss": 0.369, + "step": 3206 + }, + { + 
"epoch": 0.1795833799977601, + "grad_norm": 1.1439322233200073, + "learning_rate": 1.6025e-05, + "loss": 0.369, + "step": 3207 + }, + { + "epoch": 0.17963937730988913, + "grad_norm": 1.3816206455230713, + "learning_rate": 1.603e-05, + "loss": 0.5202, + "step": 3208 + }, + { + "epoch": 0.17969537462201815, + "grad_norm": 1.112925410270691, + "learning_rate": 1.6035e-05, + "loss": 0.3286, + "step": 3209 + }, + { + "epoch": 0.17975137193414717, + "grad_norm": 1.315228819847107, + "learning_rate": 1.604e-05, + "loss": 0.4827, + "step": 3210 + }, + { + "epoch": 0.1798073692462762, + "grad_norm": 2.721423387527466, + "learning_rate": 1.6045000000000003e-05, + "loss": 0.5205, + "step": 3211 + }, + { + "epoch": 0.1798633665584052, + "grad_norm": 1.248304009437561, + "learning_rate": 1.605e-05, + "loss": 0.4123, + "step": 3212 + }, + { + "epoch": 0.17991936387053423, + "grad_norm": 1.1161116361618042, + "learning_rate": 1.6055e-05, + "loss": 0.4418, + "step": 3213 + }, + { + "epoch": 0.17997536118266325, + "grad_norm": 1.2112042903900146, + "learning_rate": 1.606e-05, + "loss": 0.5014, + "step": 3214 + }, + { + "epoch": 0.18003135849479224, + "grad_norm": 1.0611644983291626, + "learning_rate": 1.6065e-05, + "loss": 0.3713, + "step": 3215 + }, + { + "epoch": 0.18008735580692126, + "grad_norm": 1.357844352722168, + "learning_rate": 1.607e-05, + "loss": 0.5438, + "step": 3216 + }, + { + "epoch": 0.18014335311905028, + "grad_norm": 0.9168925285339355, + "learning_rate": 1.6075e-05, + "loss": 0.3858, + "step": 3217 + }, + { + "epoch": 0.1801993504311793, + "grad_norm": 1.3100770711898804, + "learning_rate": 1.6080000000000002e-05, + "loss": 0.5269, + "step": 3218 + }, + { + "epoch": 0.18025534774330831, + "grad_norm": 1.1514203548431396, + "learning_rate": 1.6085e-05, + "loss": 0.4493, + "step": 3219 + }, + { + "epoch": 0.18031134505543733, + "grad_norm": 1.176634669303894, + "learning_rate": 1.609e-05, + "loss": 0.4187, + "step": 3220 + }, + { + "epoch": 0.18036734236756635, + 
"grad_norm": 1.0469194650650024, + "learning_rate": 1.6095e-05, + "loss": 0.43, + "step": 3221 + }, + { + "epoch": 0.18042333967969537, + "grad_norm": 1.5243301391601562, + "learning_rate": 1.6100000000000002e-05, + "loss": 0.4707, + "step": 3222 + }, + { + "epoch": 0.1804793369918244, + "grad_norm": 1.0046426057815552, + "learning_rate": 1.6105e-05, + "loss": 0.3883, + "step": 3223 + }, + { + "epoch": 0.1805353343039534, + "grad_norm": 1.3109506368637085, + "learning_rate": 1.611e-05, + "loss": 0.5103, + "step": 3224 + }, + { + "epoch": 0.18059133161608243, + "grad_norm": 1.446347951889038, + "learning_rate": 1.6115e-05, + "loss": 0.4054, + "step": 3225 + }, + { + "epoch": 0.18064732892821145, + "grad_norm": 1.2103548049926758, + "learning_rate": 1.612e-05, + "loss": 0.5018, + "step": 3226 + }, + { + "epoch": 0.18070332624034047, + "grad_norm": 1.17299222946167, + "learning_rate": 1.6125000000000002e-05, + "loss": 0.4382, + "step": 3227 + }, + { + "epoch": 0.1807593235524695, + "grad_norm": 1.3501638174057007, + "learning_rate": 1.613e-05, + "loss": 0.5293, + "step": 3228 + }, + { + "epoch": 0.1808153208645985, + "grad_norm": 1.369954228401184, + "learning_rate": 1.6135e-05, + "loss": 0.4372, + "step": 3229 + }, + { + "epoch": 0.18087131817672752, + "grad_norm": 1.442979097366333, + "learning_rate": 1.6139999999999998e-05, + "loss": 0.3399, + "step": 3230 + }, + { + "epoch": 0.18092731548885654, + "grad_norm": 1.3130812644958496, + "learning_rate": 1.6145000000000002e-05, + "loss": 0.3832, + "step": 3231 + }, + { + "epoch": 0.18098331280098556, + "grad_norm": 1.3480647802352905, + "learning_rate": 1.6150000000000003e-05, + "loss": 0.4964, + "step": 3232 + }, + { + "epoch": 0.18103931011311458, + "grad_norm": 1.1867642402648926, + "learning_rate": 1.6155e-05, + "loss": 0.3486, + "step": 3233 + }, + { + "epoch": 0.1810953074252436, + "grad_norm": 1.3922346830368042, + "learning_rate": 1.616e-05, + "loss": 0.5325, + "step": 3234 + }, + { + "epoch": 
0.1811513047373726, + "grad_norm": 1.1357862949371338, + "learning_rate": 1.6165e-05, + "loss": 0.637, + "step": 3235 + }, + { + "epoch": 0.1812073020495016, + "grad_norm": 1.2021393775939941, + "learning_rate": 1.6170000000000003e-05, + "loss": 0.4567, + "step": 3236 + }, + { + "epoch": 0.18126329936163063, + "grad_norm": 1.1709843873977661, + "learning_rate": 1.6175e-05, + "loss": 0.4624, + "step": 3237 + }, + { + "epoch": 0.18131929667375965, + "grad_norm": 1.1683293581008911, + "learning_rate": 1.618e-05, + "loss": 0.3118, + "step": 3238 + }, + { + "epoch": 0.18137529398588867, + "grad_norm": 1.096571922302246, + "learning_rate": 1.6185000000000002e-05, + "loss": 0.3676, + "step": 3239 + }, + { + "epoch": 0.1814312912980177, + "grad_norm": 1.2501559257507324, + "learning_rate": 1.619e-05, + "loss": 0.4443, + "step": 3240 + }, + { + "epoch": 0.1814872886101467, + "grad_norm": 1.254988670349121, + "learning_rate": 1.6195000000000003e-05, + "loss": 0.5011, + "step": 3241 + }, + { + "epoch": 0.18154328592227573, + "grad_norm": 1.2877148389816284, + "learning_rate": 1.62e-05, + "loss": 0.4825, + "step": 3242 + }, + { + "epoch": 0.18159928323440475, + "grad_norm": 1.2468076944351196, + "learning_rate": 1.6205e-05, + "loss": 0.4051, + "step": 3243 + }, + { + "epoch": 0.18165528054653376, + "grad_norm": 1.257940411567688, + "learning_rate": 1.621e-05, + "loss": 0.5026, + "step": 3244 + }, + { + "epoch": 0.18171127785866278, + "grad_norm": 1.0413216352462769, + "learning_rate": 1.6215e-05, + "loss": 0.3557, + "step": 3245 + }, + { + "epoch": 0.1817672751707918, + "grad_norm": 1.2333520650863647, + "learning_rate": 1.622e-05, + "loss": 0.4615, + "step": 3246 + }, + { + "epoch": 0.18182327248292082, + "grad_norm": 1.1986799240112305, + "learning_rate": 1.6225e-05, + "loss": 0.4174, + "step": 3247 + }, + { + "epoch": 0.18187926979504984, + "grad_norm": 1.9241962432861328, + "learning_rate": 1.6230000000000002e-05, + "loss": 0.4915, + "step": 3248 + }, + { + "epoch": 
0.18193526710717886, + "grad_norm": 1.2649354934692383, + "learning_rate": 1.6235e-05, + "loss": 0.654, + "step": 3249 + }, + { + "epoch": 0.18199126441930788, + "grad_norm": 1.273526906967163, + "learning_rate": 1.624e-05, + "loss": 0.413, + "step": 3250 + }, + { + "epoch": 0.1820472617314369, + "grad_norm": 1.438672661781311, + "learning_rate": 1.6245e-05, + "loss": 0.406, + "step": 3251 + }, + { + "epoch": 0.18210325904356592, + "grad_norm": 1.3299928903579712, + "learning_rate": 1.6250000000000002e-05, + "loss": 0.4551, + "step": 3252 + }, + { + "epoch": 0.18215925635569494, + "grad_norm": 1.1286715269088745, + "learning_rate": 1.6255e-05, + "loss": 0.437, + "step": 3253 + }, + { + "epoch": 0.18221525366782396, + "grad_norm": 1.193117618560791, + "learning_rate": 1.626e-05, + "loss": 0.5152, + "step": 3254 + }, + { + "epoch": 0.18227125097995298, + "grad_norm": 2.6552906036376953, + "learning_rate": 1.6265e-05, + "loss": 0.4532, + "step": 3255 + }, + { + "epoch": 0.18232724829208197, + "grad_norm": 1.1225849390029907, + "learning_rate": 1.6270000000000002e-05, + "loss": 0.4489, + "step": 3256 + }, + { + "epoch": 0.182383245604211, + "grad_norm": 1.1070002317428589, + "learning_rate": 1.6275000000000003e-05, + "loss": 0.2876, + "step": 3257 + }, + { + "epoch": 0.18243924291634, + "grad_norm": 2.4725143909454346, + "learning_rate": 1.628e-05, + "loss": 0.3886, + "step": 3258 + }, + { + "epoch": 0.18249524022846902, + "grad_norm": 1.3732391595840454, + "learning_rate": 1.6285e-05, + "loss": 0.5453, + "step": 3259 + }, + { + "epoch": 0.18255123754059804, + "grad_norm": 1.3322089910507202, + "learning_rate": 1.6289999999999998e-05, + "loss": 0.4839, + "step": 3260 + }, + { + "epoch": 0.18260723485272706, + "grad_norm": 1.0850098133087158, + "learning_rate": 1.6295000000000002e-05, + "loss": 0.4464, + "step": 3261 + }, + { + "epoch": 0.18266323216485608, + "grad_norm": 1.1595327854156494, + "learning_rate": 1.63e-05, + "loss": 0.3385, + "step": 3262 + }, + { + 
"epoch": 0.1827192294769851, + "grad_norm": 1.194762945175171, + "learning_rate": 1.6305e-05, + "loss": 0.4882, + "step": 3263 + }, + { + "epoch": 0.18277522678911412, + "grad_norm": 1.1758840084075928, + "learning_rate": 1.631e-05, + "loss": 0.437, + "step": 3264 + }, + { + "epoch": 0.18283122410124314, + "grad_norm": 1.1860482692718506, + "learning_rate": 1.6315e-05, + "loss": 0.6276, + "step": 3265 + }, + { + "epoch": 0.18288722141337216, + "grad_norm": 1.4705352783203125, + "learning_rate": 1.6320000000000003e-05, + "loss": 0.4491, + "step": 3266 + }, + { + "epoch": 0.18294321872550118, + "grad_norm": 1.955003023147583, + "learning_rate": 1.6325e-05, + "loss": 0.5621, + "step": 3267 + }, + { + "epoch": 0.1829992160376302, + "grad_norm": 3.5246810913085938, + "learning_rate": 1.633e-05, + "loss": 0.4047, + "step": 3268 + }, + { + "epoch": 0.18305521334975922, + "grad_norm": 1.4596385955810547, + "learning_rate": 1.6335e-05, + "loss": 0.4728, + "step": 3269 + }, + { + "epoch": 0.18311121066188824, + "grad_norm": 1.2304860353469849, + "learning_rate": 1.634e-05, + "loss": 0.4617, + "step": 3270 + }, + { + "epoch": 0.18316720797401725, + "grad_norm": 1.2931616306304932, + "learning_rate": 1.6345000000000004e-05, + "loss": 0.4159, + "step": 3271 + }, + { + "epoch": 0.18322320528614627, + "grad_norm": 1.710740566253662, + "learning_rate": 1.635e-05, + "loss": 0.3767, + "step": 3272 + }, + { + "epoch": 0.1832792025982753, + "grad_norm": 1.0282319784164429, + "learning_rate": 1.6355000000000002e-05, + "loss": 0.3537, + "step": 3273 + }, + { + "epoch": 0.1833351999104043, + "grad_norm": 1.1052045822143555, + "learning_rate": 1.636e-05, + "loss": 0.5242, + "step": 3274 + }, + { + "epoch": 0.18339119722253333, + "grad_norm": 1.0453317165374756, + "learning_rate": 1.6365e-05, + "loss": 0.4622, + "step": 3275 + }, + { + "epoch": 0.18344719453466235, + "grad_norm": 1.3618289232254028, + "learning_rate": 1.637e-05, + "loss": 0.3347, + "step": 3276 + }, + { + "epoch": 
0.18350319184679134, + "grad_norm": 1.2078756093978882, + "learning_rate": 1.6375e-05, + "loss": 0.3666, + "step": 3277 + }, + { + "epoch": 0.18355918915892036, + "grad_norm": 1.0806200504302979, + "learning_rate": 1.6380000000000002e-05, + "loss": 0.3669, + "step": 3278 + }, + { + "epoch": 0.18361518647104938, + "grad_norm": 1.468646764755249, + "learning_rate": 1.6385e-05, + "loss": 0.4764, + "step": 3279 + }, + { + "epoch": 0.1836711837831784, + "grad_norm": 1.1237543821334839, + "learning_rate": 1.639e-05, + "loss": 0.4137, + "step": 3280 + }, + { + "epoch": 0.18372718109530742, + "grad_norm": 1.1732734441757202, + "learning_rate": 1.6395e-05, + "loss": 0.4237, + "step": 3281 + }, + { + "epoch": 0.18378317840743644, + "grad_norm": 1.1597936153411865, + "learning_rate": 1.6400000000000002e-05, + "loss": 0.3797, + "step": 3282 + }, + { + "epoch": 0.18383917571956546, + "grad_norm": 2.0686988830566406, + "learning_rate": 1.6405e-05, + "loss": 0.7229, + "step": 3283 + }, + { + "epoch": 0.18389517303169448, + "grad_norm": 1.333763599395752, + "learning_rate": 1.641e-05, + "loss": 0.4357, + "step": 3284 + }, + { + "epoch": 0.1839511703438235, + "grad_norm": 1.0924245119094849, + "learning_rate": 1.6415e-05, + "loss": 0.3907, + "step": 3285 + }, + { + "epoch": 0.1840071676559525, + "grad_norm": 1.6450637578964233, + "learning_rate": 1.6420000000000002e-05, + "loss": 0.4214, + "step": 3286 + }, + { + "epoch": 0.18406316496808153, + "grad_norm": 1.07072114944458, + "learning_rate": 1.6425000000000003e-05, + "loss": 0.4035, + "step": 3287 + }, + { + "epoch": 0.18411916228021055, + "grad_norm": 1.138118028640747, + "learning_rate": 1.643e-05, + "loss": 0.3068, + "step": 3288 + }, + { + "epoch": 0.18417515959233957, + "grad_norm": 1.1400213241577148, + "learning_rate": 1.6435e-05, + "loss": 0.4134, + "step": 3289 + }, + { + "epoch": 0.1842311569044686, + "grad_norm": 1.229209303855896, + "learning_rate": 1.644e-05, + "loss": 0.6559, + "step": 3290 + }, + { + "epoch": 
0.1842871542165976, + "grad_norm": 1.224164605140686, + "learning_rate": 1.6445000000000003e-05, + "loss": 0.4453, + "step": 3291 + }, + { + "epoch": 0.18434315152872663, + "grad_norm": 1.1119213104248047, + "learning_rate": 1.645e-05, + "loss": 0.3873, + "step": 3292 + }, + { + "epoch": 0.18439914884085565, + "grad_norm": 1.228772759437561, + "learning_rate": 1.6455e-05, + "loss": 0.5373, + "step": 3293 + }, + { + "epoch": 0.18445514615298467, + "grad_norm": 1.1952736377716064, + "learning_rate": 1.646e-05, + "loss": 0.4511, + "step": 3294 + }, + { + "epoch": 0.1845111434651137, + "grad_norm": 1.3519757986068726, + "learning_rate": 1.6465e-05, + "loss": 0.5203, + "step": 3295 + }, + { + "epoch": 0.1845671407772427, + "grad_norm": 1.3381385803222656, + "learning_rate": 1.6470000000000003e-05, + "loss": 0.4271, + "step": 3296 + }, + { + "epoch": 0.1846231380893717, + "grad_norm": 1.0474534034729004, + "learning_rate": 1.6475e-05, + "loss": 0.4076, + "step": 3297 + }, + { + "epoch": 0.18467913540150072, + "grad_norm": 1.3297818899154663, + "learning_rate": 1.648e-05, + "loss": 0.4738, + "step": 3298 + }, + { + "epoch": 0.18473513271362973, + "grad_norm": 1.1280972957611084, + "learning_rate": 1.6485e-05, + "loss": 0.4053, + "step": 3299 + }, + { + "epoch": 0.18479113002575875, + "grad_norm": 1.257645606994629, + "learning_rate": 1.649e-05, + "loss": 0.4157, + "step": 3300 + }, + { + "epoch": 0.18484712733788777, + "grad_norm": 1.3946285247802734, + "learning_rate": 1.6495e-05, + "loss": 0.4413, + "step": 3301 + }, + { + "epoch": 0.1849031246500168, + "grad_norm": 1.6009902954101562, + "learning_rate": 1.65e-05, + "loss": 0.6021, + "step": 3302 + }, + { + "epoch": 0.1849591219621458, + "grad_norm": 1.2088947296142578, + "learning_rate": 1.6505000000000002e-05, + "loss": 0.4489, + "step": 3303 + }, + { + "epoch": 0.18501511927427483, + "grad_norm": 1.2706345319747925, + "learning_rate": 1.651e-05, + "loss": 0.5796, + "step": 3304 + }, + { + "epoch": 
0.18507111658640385, + "grad_norm": 1.1603660583496094, + "learning_rate": 1.6515e-05, + "loss": 0.4107, + "step": 3305 + }, + { + "epoch": 0.18512711389853287, + "grad_norm": 1.2458375692367554, + "learning_rate": 1.652e-05, + "loss": 0.4275, + "step": 3306 + }, + { + "epoch": 0.1851831112106619, + "grad_norm": 1.3546783924102783, + "learning_rate": 1.6525000000000002e-05, + "loss": 0.4559, + "step": 3307 + }, + { + "epoch": 0.1852391085227909, + "grad_norm": 1.1928722858428955, + "learning_rate": 1.6530000000000003e-05, + "loss": 0.455, + "step": 3308 + }, + { + "epoch": 0.18529510583491993, + "grad_norm": 0.9799600839614868, + "learning_rate": 1.6535e-05, + "loss": 0.3385, + "step": 3309 + }, + { + "epoch": 0.18535110314704895, + "grad_norm": 1.222354769706726, + "learning_rate": 1.654e-05, + "loss": 0.4966, + "step": 3310 + }, + { + "epoch": 0.18540710045917796, + "grad_norm": 1.098609209060669, + "learning_rate": 1.6545e-05, + "loss": 0.3555, + "step": 3311 + }, + { + "epoch": 0.18546309777130698, + "grad_norm": 1.2803016901016235, + "learning_rate": 1.6550000000000002e-05, + "loss": 0.4859, + "step": 3312 + }, + { + "epoch": 0.185519095083436, + "grad_norm": 1.0965261459350586, + "learning_rate": 1.6555e-05, + "loss": 0.498, + "step": 3313 + }, + { + "epoch": 0.18557509239556502, + "grad_norm": 1.2349803447723389, + "learning_rate": 1.656e-05, + "loss": 0.3836, + "step": 3314 + }, + { + "epoch": 0.18563108970769404, + "grad_norm": 1.229987621307373, + "learning_rate": 1.6565e-05, + "loss": 0.4132, + "step": 3315 + }, + { + "epoch": 0.18568708701982306, + "grad_norm": 1.1460660696029663, + "learning_rate": 1.657e-05, + "loss": 0.4971, + "step": 3316 + }, + { + "epoch": 0.18574308433195208, + "grad_norm": 1.3114126920700073, + "learning_rate": 1.6575000000000003e-05, + "loss": 0.6052, + "step": 3317 + }, + { + "epoch": 0.18579908164408107, + "grad_norm": 1.1734764575958252, + "learning_rate": 1.658e-05, + "loss": 0.4576, + "step": 3318 + }, + { + "epoch": 
0.1858550789562101, + "grad_norm": 1.2834408283233643, + "learning_rate": 1.6585e-05, + "loss": 0.4598, + "step": 3319 + }, + { + "epoch": 0.1859110762683391, + "grad_norm": 1.509110689163208, + "learning_rate": 1.659e-05, + "loss": 0.4597, + "step": 3320 + }, + { + "epoch": 0.18596707358046813, + "grad_norm": 1.226487398147583, + "learning_rate": 1.6595e-05, + "loss": 0.3691, + "step": 3321 + }, + { + "epoch": 0.18602307089259715, + "grad_norm": 1.135722041130066, + "learning_rate": 1.66e-05, + "loss": 0.5141, + "step": 3322 + }, + { + "epoch": 0.18607906820472617, + "grad_norm": 1.3855825662612915, + "learning_rate": 1.6605e-05, + "loss": 0.4158, + "step": 3323 + }, + { + "epoch": 0.18613506551685519, + "grad_norm": 1.1391615867614746, + "learning_rate": 1.6610000000000002e-05, + "loss": 0.4244, + "step": 3324 + }, + { + "epoch": 0.1861910628289842, + "grad_norm": 1.1902307271957397, + "learning_rate": 1.6615e-05, + "loss": 0.4648, + "step": 3325 + }, + { + "epoch": 0.18624706014111322, + "grad_norm": 1.1830710172653198, + "learning_rate": 1.662e-05, + "loss": 0.4394, + "step": 3326 + }, + { + "epoch": 0.18630305745324224, + "grad_norm": 1.219680905342102, + "learning_rate": 1.6625e-05, + "loss": 0.3814, + "step": 3327 + }, + { + "epoch": 0.18635905476537126, + "grad_norm": 1.0361963510513306, + "learning_rate": 1.6630000000000002e-05, + "loss": 0.3575, + "step": 3328 + }, + { + "epoch": 0.18641505207750028, + "grad_norm": 1.4810428619384766, + "learning_rate": 1.6635e-05, + "loss": 0.5595, + "step": 3329 + }, + { + "epoch": 0.1864710493896293, + "grad_norm": 1.082762598991394, + "learning_rate": 1.664e-05, + "loss": 0.5685, + "step": 3330 + }, + { + "epoch": 0.18652704670175832, + "grad_norm": 1.3045333623886108, + "learning_rate": 1.6645e-05, + "loss": 0.6185, + "step": 3331 + }, + { + "epoch": 0.18658304401388734, + "grad_norm": 1.1538302898406982, + "learning_rate": 1.665e-05, + "loss": 0.4312, + "step": 3332 + }, + { + "epoch": 0.18663904132601636, + 
"grad_norm": 1.336602807044983, + "learning_rate": 1.6655000000000002e-05, + "loss": 0.3832, + "step": 3333 + }, + { + "epoch": 0.18669503863814538, + "grad_norm": 1.2260713577270508, + "learning_rate": 1.666e-05, + "loss": 0.3917, + "step": 3334 + }, + { + "epoch": 0.1867510359502744, + "grad_norm": 1.078220248222351, + "learning_rate": 1.6665e-05, + "loss": 0.4161, + "step": 3335 + }, + { + "epoch": 0.18680703326240342, + "grad_norm": 1.0990285873413086, + "learning_rate": 1.6669999999999998e-05, + "loss": 0.4115, + "step": 3336 + }, + { + "epoch": 0.18686303057453243, + "grad_norm": 1.2535604238510132, + "learning_rate": 1.6675000000000002e-05, + "loss": 0.4842, + "step": 3337 + }, + { + "epoch": 0.18691902788666145, + "grad_norm": 1.0767483711242676, + "learning_rate": 1.668e-05, + "loss": 0.3885, + "step": 3338 + }, + { + "epoch": 0.18697502519879045, + "grad_norm": 1.1079645156860352, + "learning_rate": 1.6685e-05, + "loss": 0.4516, + "step": 3339 + }, + { + "epoch": 0.18703102251091946, + "grad_norm": 1.3449302911758423, + "learning_rate": 1.669e-05, + "loss": 0.4821, + "step": 3340 + }, + { + "epoch": 0.18708701982304848, + "grad_norm": 1.2095441818237305, + "learning_rate": 1.6695e-05, + "loss": 0.3834, + "step": 3341 + }, + { + "epoch": 0.1871430171351775, + "grad_norm": 1.2906239032745361, + "learning_rate": 1.6700000000000003e-05, + "loss": 0.7066, + "step": 3342 + }, + { + "epoch": 0.18719901444730652, + "grad_norm": 1.172589659690857, + "learning_rate": 1.6705e-05, + "loss": 0.4537, + "step": 3343 + }, + { + "epoch": 0.18725501175943554, + "grad_norm": 1.1754515171051025, + "learning_rate": 1.671e-05, + "loss": 0.4595, + "step": 3344 + }, + { + "epoch": 0.18731100907156456, + "grad_norm": 1.052424669265747, + "learning_rate": 1.6715000000000002e-05, + "loss": 0.3947, + "step": 3345 + }, + { + "epoch": 0.18736700638369358, + "grad_norm": 1.1512207984924316, + "learning_rate": 1.672e-05, + "loss": 0.3732, + "step": 3346 + }, + { + "epoch": 
0.1874230036958226, + "grad_norm": 1.1828744411468506, + "learning_rate": 1.6725000000000003e-05, + "loss": 0.2665, + "step": 3347 + }, + { + "epoch": 0.18747900100795162, + "grad_norm": 1.670555830001831, + "learning_rate": 1.673e-05, + "loss": 0.5061, + "step": 3348 + }, + { + "epoch": 0.18753499832008064, + "grad_norm": 1.185208797454834, + "learning_rate": 1.6735e-05, + "loss": 0.3772, + "step": 3349 + }, + { + "epoch": 0.18759099563220966, + "grad_norm": 1.3466157913208008, + "learning_rate": 1.674e-05, + "loss": 0.5246, + "step": 3350 + }, + { + "epoch": 0.18764699294433868, + "grad_norm": 1.374879240989685, + "learning_rate": 1.6745e-05, + "loss": 0.5011, + "step": 3351 + }, + { + "epoch": 0.1877029902564677, + "grad_norm": 1.1384507417678833, + "learning_rate": 1.675e-05, + "loss": 0.3486, + "step": 3352 + }, + { + "epoch": 0.1877589875685967, + "grad_norm": 1.1417534351348877, + "learning_rate": 1.6755e-05, + "loss": 0.3765, + "step": 3353 + }, + { + "epoch": 0.18781498488072573, + "grad_norm": 1.1946250200271606, + "learning_rate": 1.6760000000000002e-05, + "loss": 0.3828, + "step": 3354 + }, + { + "epoch": 0.18787098219285475, + "grad_norm": 1.1479038000106812, + "learning_rate": 1.6765e-05, + "loss": 0.4193, + "step": 3355 + }, + { + "epoch": 0.18792697950498377, + "grad_norm": 1.2469680309295654, + "learning_rate": 1.677e-05, + "loss": 0.3967, + "step": 3356 + }, + { + "epoch": 0.1879829768171128, + "grad_norm": 1.0604054927825928, + "learning_rate": 1.6775e-05, + "loss": 0.3472, + "step": 3357 + }, + { + "epoch": 0.1880389741292418, + "grad_norm": 1.4340620040893555, + "learning_rate": 1.6780000000000002e-05, + "loss": 0.4665, + "step": 3358 + }, + { + "epoch": 0.1880949714413708, + "grad_norm": 1.5655089616775513, + "learning_rate": 1.6785e-05, + "loss": 0.4712, + "step": 3359 + }, + { + "epoch": 0.18815096875349982, + "grad_norm": 1.0947039127349854, + "learning_rate": 1.679e-05, + "loss": 0.4547, + "step": 3360 + }, + { + "epoch": 
0.18820696606562884, + "grad_norm": 1.078403353691101, + "learning_rate": 1.6795e-05, + "loss": 0.4219, + "step": 3361 + }, + { + "epoch": 0.18826296337775786, + "grad_norm": 1.148415446281433, + "learning_rate": 1.6800000000000002e-05, + "loss": 0.4868, + "step": 3362 + }, + { + "epoch": 0.18831896068988688, + "grad_norm": 1.633833646774292, + "learning_rate": 1.6805000000000003e-05, + "loss": 0.5315, + "step": 3363 + }, + { + "epoch": 0.1883749580020159, + "grad_norm": 1.1244282722473145, + "learning_rate": 1.681e-05, + "loss": 0.3964, + "step": 3364 + }, + { + "epoch": 0.18843095531414492, + "grad_norm": 1.420272946357727, + "learning_rate": 1.6815e-05, + "loss": 0.4208, + "step": 3365 + }, + { + "epoch": 0.18848695262627393, + "grad_norm": 1.3265632390975952, + "learning_rate": 1.6819999999999998e-05, + "loss": 0.3931, + "step": 3366 + }, + { + "epoch": 0.18854294993840295, + "grad_norm": 1.3832519054412842, + "learning_rate": 1.6825000000000002e-05, + "loss": 0.4967, + "step": 3367 + }, + { + "epoch": 0.18859894725053197, + "grad_norm": 1.217711329460144, + "learning_rate": 1.683e-05, + "loss": 0.4278, + "step": 3368 + }, + { + "epoch": 0.188654944562661, + "grad_norm": 1.2732466459274292, + "learning_rate": 1.6835e-05, + "loss": 0.5072, + "step": 3369 + }, + { + "epoch": 0.18871094187479, + "grad_norm": 1.0388083457946777, + "learning_rate": 1.684e-05, + "loss": 0.438, + "step": 3370 + }, + { + "epoch": 0.18876693918691903, + "grad_norm": 1.2363613843917847, + "learning_rate": 1.6845e-05, + "loss": 0.4776, + "step": 3371 + }, + { + "epoch": 0.18882293649904805, + "grad_norm": 1.2334297895431519, + "learning_rate": 1.6850000000000003e-05, + "loss": 0.4249, + "step": 3372 + }, + { + "epoch": 0.18887893381117707, + "grad_norm": 1.1314977407455444, + "learning_rate": 1.6855e-05, + "loss": 0.3967, + "step": 3373 + }, + { + "epoch": 0.1889349311233061, + "grad_norm": 1.0096185207366943, + "learning_rate": 1.686e-05, + "loss": 0.3685, + "step": 3374 + }, + { + 
"epoch": 0.1889909284354351, + "grad_norm": 1.3609317541122437, + "learning_rate": 1.6865e-05, + "loss": 0.4632, + "step": 3375 + }, + { + "epoch": 0.18904692574756413, + "grad_norm": 1.0902884006500244, + "learning_rate": 1.687e-05, + "loss": 0.3116, + "step": 3376 + }, + { + "epoch": 0.18910292305969315, + "grad_norm": 1.6312155723571777, + "learning_rate": 1.6875000000000004e-05, + "loss": 0.4591, + "step": 3377 + }, + { + "epoch": 0.18915892037182216, + "grad_norm": 1.2797203063964844, + "learning_rate": 1.688e-05, + "loss": 0.3855, + "step": 3378 + }, + { + "epoch": 0.18921491768395118, + "grad_norm": 1.2569507360458374, + "learning_rate": 1.6885000000000002e-05, + "loss": 0.4277, + "step": 3379 + }, + { + "epoch": 0.18927091499608018, + "grad_norm": 1.1022034883499146, + "learning_rate": 1.689e-05, + "loss": 0.3475, + "step": 3380 + }, + { + "epoch": 0.1893269123082092, + "grad_norm": 1.084113359451294, + "learning_rate": 1.6895e-05, + "loss": 0.415, + "step": 3381 + }, + { + "epoch": 0.1893829096203382, + "grad_norm": 1.3184095621109009, + "learning_rate": 1.69e-05, + "loss": 0.3926, + "step": 3382 + }, + { + "epoch": 0.18943890693246723, + "grad_norm": 1.2320349216461182, + "learning_rate": 1.6905e-05, + "loss": 0.4394, + "step": 3383 + }, + { + "epoch": 0.18949490424459625, + "grad_norm": 2.3696770668029785, + "learning_rate": 1.6910000000000002e-05, + "loss": 0.3514, + "step": 3384 + }, + { + "epoch": 0.18955090155672527, + "grad_norm": 1.1212445497512817, + "learning_rate": 1.6915e-05, + "loss": 0.6016, + "step": 3385 + }, + { + "epoch": 0.1896068988688543, + "grad_norm": 1.0768115520477295, + "learning_rate": 1.692e-05, + "loss": 0.4472, + "step": 3386 + }, + { + "epoch": 0.1896628961809833, + "grad_norm": 1.5229803323745728, + "learning_rate": 1.6925e-05, + "loss": 0.4502, + "step": 3387 + }, + { + "epoch": 0.18971889349311233, + "grad_norm": 1.1422045230865479, + "learning_rate": 1.6930000000000002e-05, + "loss": 0.2921, + "step": 3388 + }, + { + 
"epoch": 0.18977489080524135, + "grad_norm": 1.3426090478897095, + "learning_rate": 1.6935e-05, + "loss": 0.4891, + "step": 3389 + }, + { + "epoch": 0.18983088811737037, + "grad_norm": 1.250209093093872, + "learning_rate": 1.694e-05, + "loss": 0.446, + "step": 3390 + }, + { + "epoch": 0.18988688542949939, + "grad_norm": 1.2611923217773438, + "learning_rate": 1.6945e-05, + "loss": 0.4895, + "step": 3391 + }, + { + "epoch": 0.1899428827416284, + "grad_norm": 1.2335516214370728, + "learning_rate": 1.6950000000000002e-05, + "loss": 0.4183, + "step": 3392 + }, + { + "epoch": 0.18999888005375742, + "grad_norm": 1.0964800119400024, + "learning_rate": 1.6955000000000003e-05, + "loss": 0.4134, + "step": 3393 + }, + { + "epoch": 0.19005487736588644, + "grad_norm": 1.8114066123962402, + "learning_rate": 1.696e-05, + "loss": 0.5505, + "step": 3394 + }, + { + "epoch": 0.19011087467801546, + "grad_norm": 1.3944091796875, + "learning_rate": 1.6965e-05, + "loss": 0.6571, + "step": 3395 + }, + { + "epoch": 0.19016687199014448, + "grad_norm": 1.537448763847351, + "learning_rate": 1.697e-05, + "loss": 0.486, + "step": 3396 + }, + { + "epoch": 0.1902228693022735, + "grad_norm": 1.3557907342910767, + "learning_rate": 1.6975000000000003e-05, + "loss": 0.4855, + "step": 3397 + }, + { + "epoch": 0.19027886661440252, + "grad_norm": 1.2067703008651733, + "learning_rate": 1.698e-05, + "loss": 0.5486, + "step": 3398 + }, + { + "epoch": 0.19033486392653154, + "grad_norm": 1.1005481481552124, + "learning_rate": 1.6985e-05, + "loss": 0.4072, + "step": 3399 + }, + { + "epoch": 0.19039086123866056, + "grad_norm": 1.3335630893707275, + "learning_rate": 1.699e-05, + "loss": 0.4883, + "step": 3400 + }, + { + "epoch": 0.19044685855078955, + "grad_norm": 1.3925936222076416, + "learning_rate": 1.6995e-05, + "loss": 0.4764, + "step": 3401 + }, + { + "epoch": 0.19050285586291857, + "grad_norm": 2.9519593715667725, + "learning_rate": 1.7000000000000003e-05, + "loss": 0.5681, + "step": 3402 + }, + { + 
"epoch": 0.1905588531750476, + "grad_norm": 1.0862815380096436, + "learning_rate": 1.7005e-05, + "loss": 0.3684, + "step": 3403 + }, + { + "epoch": 0.1906148504871766, + "grad_norm": 1.3254729509353638, + "learning_rate": 1.701e-05, + "loss": 0.4242, + "step": 3404 + }, + { + "epoch": 0.19067084779930563, + "grad_norm": 1.2939419746398926, + "learning_rate": 1.7015e-05, + "loss": 0.6702, + "step": 3405 + }, + { + "epoch": 0.19072684511143465, + "grad_norm": 1.1743439435958862, + "learning_rate": 1.702e-05, + "loss": 0.4808, + "step": 3406 + }, + { + "epoch": 0.19078284242356366, + "grad_norm": 1.2036492824554443, + "learning_rate": 1.7025e-05, + "loss": 0.3705, + "step": 3407 + }, + { + "epoch": 0.19083883973569268, + "grad_norm": 1.1415599584579468, + "learning_rate": 1.703e-05, + "loss": 0.5025, + "step": 3408 + }, + { + "epoch": 0.1908948370478217, + "grad_norm": 1.1346678733825684, + "learning_rate": 1.7035000000000002e-05, + "loss": 0.5311, + "step": 3409 + }, + { + "epoch": 0.19095083435995072, + "grad_norm": 1.3246341943740845, + "learning_rate": 1.704e-05, + "loss": 0.5128, + "step": 3410 + }, + { + "epoch": 0.19100683167207974, + "grad_norm": 1.2443675994873047, + "learning_rate": 1.7045e-05, + "loss": 0.6092, + "step": 3411 + }, + { + "epoch": 0.19106282898420876, + "grad_norm": 1.1285314559936523, + "learning_rate": 1.705e-05, + "loss": 0.3824, + "step": 3412 + }, + { + "epoch": 0.19111882629633778, + "grad_norm": 1.1717637777328491, + "learning_rate": 1.7055000000000002e-05, + "loss": 0.446, + "step": 3413 + }, + { + "epoch": 0.1911748236084668, + "grad_norm": 1.0986918210983276, + "learning_rate": 1.706e-05, + "loss": 0.4009, + "step": 3414 + }, + { + "epoch": 0.19123082092059582, + "grad_norm": 1.2700912952423096, + "learning_rate": 1.7065e-05, + "loss": 0.4518, + "step": 3415 + }, + { + "epoch": 0.19128681823272484, + "grad_norm": 1.0021355152130127, + "learning_rate": 1.707e-05, + "loss": 0.3094, + "step": 3416 + }, + { + "epoch": 
0.19134281554485386, + "grad_norm": 1.3428599834442139, + "learning_rate": 1.7075e-05, + "loss": 0.3574, + "step": 3417 + }, + { + "epoch": 0.19139881285698288, + "grad_norm": 1.2496545314788818, + "learning_rate": 1.7080000000000002e-05, + "loss": 0.4016, + "step": 3418 + }, + { + "epoch": 0.1914548101691119, + "grad_norm": 1.1841208934783936, + "learning_rate": 1.7085e-05, + "loss": 0.4462, + "step": 3419 + }, + { + "epoch": 0.1915108074812409, + "grad_norm": 1.4078991413116455, + "learning_rate": 1.709e-05, + "loss": 0.4315, + "step": 3420 + }, + { + "epoch": 0.1915668047933699, + "grad_norm": 1.3215162754058838, + "learning_rate": 1.7095e-05, + "loss": 0.4588, + "step": 3421 + }, + { + "epoch": 0.19162280210549892, + "grad_norm": 1.2057838439941406, + "learning_rate": 1.7100000000000002e-05, + "loss": 0.4078, + "step": 3422 + }, + { + "epoch": 0.19167879941762794, + "grad_norm": 1.2553930282592773, + "learning_rate": 1.7105000000000003e-05, + "loss": 0.3858, + "step": 3423 + }, + { + "epoch": 0.19173479672975696, + "grad_norm": 1.4046380519866943, + "learning_rate": 1.711e-05, + "loss": 0.6379, + "step": 3424 + }, + { + "epoch": 0.19179079404188598, + "grad_norm": 1.4144824743270874, + "learning_rate": 1.7115e-05, + "loss": 0.4791, + "step": 3425 + }, + { + "epoch": 0.191846791354015, + "grad_norm": 1.804913878440857, + "learning_rate": 1.712e-05, + "loss": 0.5213, + "step": 3426 + }, + { + "epoch": 0.19190278866614402, + "grad_norm": 1.282642126083374, + "learning_rate": 1.7125000000000003e-05, + "loss": 0.5367, + "step": 3427 + }, + { + "epoch": 0.19195878597827304, + "grad_norm": 1.3015738725662231, + "learning_rate": 1.713e-05, + "loss": 0.4267, + "step": 3428 + }, + { + "epoch": 0.19201478329040206, + "grad_norm": 1.5885628461837769, + "learning_rate": 1.7135e-05, + "loss": 0.5507, + "step": 3429 + }, + { + "epoch": 0.19207078060253108, + "grad_norm": 1.3165570497512817, + "learning_rate": 1.7140000000000002e-05, + "loss": 0.4275, + "step": 3430 + }, + { + 
"epoch": 0.1921267779146601, + "grad_norm": 1.299818992614746, + "learning_rate": 1.7145e-05, + "loss": 0.5218, + "step": 3431 + }, + { + "epoch": 0.19218277522678912, + "grad_norm": 1.2481540441513062, + "learning_rate": 1.7150000000000004e-05, + "loss": 0.4117, + "step": 3432 + }, + { + "epoch": 0.19223877253891813, + "grad_norm": 1.2244377136230469, + "learning_rate": 1.7155e-05, + "loss": 0.4895, + "step": 3433 + }, + { + "epoch": 0.19229476985104715, + "grad_norm": 1.1141343116760254, + "learning_rate": 1.7160000000000002e-05, + "loss": 0.488, + "step": 3434 + }, + { + "epoch": 0.19235076716317617, + "grad_norm": 1.5054800510406494, + "learning_rate": 1.7165e-05, + "loss": 0.5853, + "step": 3435 + }, + { + "epoch": 0.1924067644753052, + "grad_norm": 1.1070597171783447, + "learning_rate": 1.717e-05, + "loss": 0.4004, + "step": 3436 + }, + { + "epoch": 0.1924627617874342, + "grad_norm": 1.2049047946929932, + "learning_rate": 1.7175e-05, + "loss": 0.437, + "step": 3437 + }, + { + "epoch": 0.19251875909956323, + "grad_norm": 1.857879638671875, + "learning_rate": 1.718e-05, + "loss": 0.4563, + "step": 3438 + }, + { + "epoch": 0.19257475641169225, + "grad_norm": 1.1342371702194214, + "learning_rate": 1.7185000000000002e-05, + "loss": 0.3864, + "step": 3439 + }, + { + "epoch": 0.19263075372382127, + "grad_norm": 1.1872518062591553, + "learning_rate": 1.719e-05, + "loss": 0.5765, + "step": 3440 + }, + { + "epoch": 0.1926867510359503, + "grad_norm": 1.1592533588409424, + "learning_rate": 1.7195e-05, + "loss": 0.3613, + "step": 3441 + }, + { + "epoch": 0.19274274834807928, + "grad_norm": 1.2872483730316162, + "learning_rate": 1.7199999999999998e-05, + "loss": 0.4437, + "step": 3442 + }, + { + "epoch": 0.1927987456602083, + "grad_norm": NaN, + "learning_rate": 1.7199999999999998e-05, + "loss": 0.2918, + "step": 3443 + }, + { + "epoch": 0.19285474297233732, + "grad_norm": 1.2049381732940674, + "learning_rate": 1.7205000000000002e-05, + "loss": 0.383, + "step": 3444 + }, + 
{ + "epoch": 0.19291074028446634, + "grad_norm": 1.1656842231750488, + "learning_rate": 1.721e-05, + "loss": 0.4182, + "step": 3445 + }, + { + "epoch": 0.19296673759659536, + "grad_norm": 1.127423644065857, + "learning_rate": 1.7215e-05, + "loss": 0.4357, + "step": 3446 + }, + { + "epoch": 0.19302273490872437, + "grad_norm": 1.9054218530654907, + "learning_rate": 1.722e-05, + "loss": 0.3881, + "step": 3447 + }, + { + "epoch": 0.1930787322208534, + "grad_norm": 1.3814903497695923, + "learning_rate": 1.7225e-05, + "loss": 0.6128, + "step": 3448 + }, + { + "epoch": 0.1931347295329824, + "grad_norm": 1.302217960357666, + "learning_rate": 1.7230000000000003e-05, + "loss": 0.4546, + "step": 3449 + }, + { + "epoch": 0.19319072684511143, + "grad_norm": 1.1936508417129517, + "learning_rate": 1.7235e-05, + "loss": 0.4362, + "step": 3450 + }, + { + "epoch": 0.19324672415724045, + "grad_norm": 1.2464027404785156, + "learning_rate": 1.724e-05, + "loss": 0.4663, + "step": 3451 + }, + { + "epoch": 0.19330272146936947, + "grad_norm": 1.2320516109466553, + "learning_rate": 1.7245e-05, + "loss": 0.497, + "step": 3452 + }, + { + "epoch": 0.1933587187814985, + "grad_norm": 1.2153738737106323, + "learning_rate": 1.725e-05, + "loss": 0.5814, + "step": 3453 + }, + { + "epoch": 0.1934147160936275, + "grad_norm": 1.3801615238189697, + "learning_rate": 1.7255000000000003e-05, + "loss": 0.478, + "step": 3454 + }, + { + "epoch": 0.19347071340575653, + "grad_norm": 1.465854525566101, + "learning_rate": 1.726e-05, + "loss": 0.4406, + "step": 3455 + }, + { + "epoch": 0.19352671071788555, + "grad_norm": 2.0683443546295166, + "learning_rate": 1.7265e-05, + "loss": 0.5539, + "step": 3456 + }, + { + "epoch": 0.19358270803001457, + "grad_norm": 1.6779159307479858, + "learning_rate": 1.727e-05, + "loss": 0.5283, + "step": 3457 + }, + { + "epoch": 0.19363870534214359, + "grad_norm": 1.1865346431732178, + "learning_rate": 1.7275e-05, + "loss": 0.5156, + "step": 3458 + }, + { + "epoch": 
0.1936947026542726, + "grad_norm": 1.2769807577133179, + "learning_rate": 1.728e-05, + "loss": 0.5397, + "step": 3459 + }, + { + "epoch": 0.19375069996640162, + "grad_norm": 1.1956596374511719, + "learning_rate": 1.7285e-05, + "loss": 0.3929, + "step": 3460 + }, + { + "epoch": 0.19380669727853064, + "grad_norm": 1.1267244815826416, + "learning_rate": 1.7290000000000002e-05, + "loss": 0.3968, + "step": 3461 + }, + { + "epoch": 0.19386269459065966, + "grad_norm": 1.2932826280593872, + "learning_rate": 1.7295e-05, + "loss": 0.4719, + "step": 3462 + }, + { + "epoch": 0.19391869190278865, + "grad_norm": 1.1786478757858276, + "learning_rate": 1.73e-05, + "loss": 0.4781, + "step": 3463 + }, + { + "epoch": 0.19397468921491767, + "grad_norm": 1.1676743030548096, + "learning_rate": 1.7305e-05, + "loss": 0.3853, + "step": 3464 + }, + { + "epoch": 0.1940306865270467, + "grad_norm": 1.2847338914871216, + "learning_rate": 1.7310000000000002e-05, + "loss": 0.5865, + "step": 3465 + }, + { + "epoch": 0.1940866838391757, + "grad_norm": 1.2821464538574219, + "learning_rate": 1.7315e-05, + "loss": 0.4091, + "step": 3466 + }, + { + "epoch": 0.19414268115130473, + "grad_norm": 1.1550301313400269, + "learning_rate": 1.732e-05, + "loss": 0.4035, + "step": 3467 + }, + { + "epoch": 0.19419867846343375, + "grad_norm": 1.3662973642349243, + "learning_rate": 1.7325e-05, + "loss": 0.4234, + "step": 3468 + }, + { + "epoch": 0.19425467577556277, + "grad_norm": 1.301316261291504, + "learning_rate": 1.7330000000000002e-05, + "loss": 0.4344, + "step": 3469 + }, + { + "epoch": 0.1943106730876918, + "grad_norm": 1.1564757823944092, + "learning_rate": 1.7335000000000003e-05, + "loss": 0.5202, + "step": 3470 + }, + { + "epoch": 0.1943666703998208, + "grad_norm": 1.756524682044983, + "learning_rate": 1.734e-05, + "loss": 0.5682, + "step": 3471 + }, + { + "epoch": 0.19442266771194983, + "grad_norm": 1.184188961982727, + "learning_rate": 1.7345e-05, + "loss": 0.4627, + "step": 3472 + }, + { + "epoch": 
0.19447866502407885, + "grad_norm": 1.332406997680664, + "learning_rate": 1.7349999999999998e-05, + "loss": 0.4546, + "step": 3473 + }, + { + "epoch": 0.19453466233620786, + "grad_norm": 1.2890806198120117, + "learning_rate": 1.7355000000000002e-05, + "loss": 0.4304, + "step": 3474 + }, + { + "epoch": 0.19459065964833688, + "grad_norm": 1.150179386138916, + "learning_rate": 1.736e-05, + "loss": 0.4157, + "step": 3475 + }, + { + "epoch": 0.1946466569604659, + "grad_norm": 1.432388186454773, + "learning_rate": 1.7365e-05, + "loss": 0.5514, + "step": 3476 + }, + { + "epoch": 0.19470265427259492, + "grad_norm": 1.2167162895202637, + "learning_rate": 1.737e-05, + "loss": 0.4673, + "step": 3477 + }, + { + "epoch": 0.19475865158472394, + "grad_norm": 1.4046660661697388, + "learning_rate": 1.7375e-05, + "loss": 0.5461, + "step": 3478 + }, + { + "epoch": 0.19481464889685296, + "grad_norm": 1.2196552753448486, + "learning_rate": 1.7380000000000003e-05, + "loss": 0.425, + "step": 3479 + }, + { + "epoch": 0.19487064620898198, + "grad_norm": 1.8962860107421875, + "learning_rate": 1.7385e-05, + "loss": 0.5941, + "step": 3480 + }, + { + "epoch": 0.194926643521111, + "grad_norm": 1.1011854410171509, + "learning_rate": 1.739e-05, + "loss": 0.4297, + "step": 3481 + }, + { + "epoch": 0.19498264083324002, + "grad_norm": 1.023834466934204, + "learning_rate": 1.7395e-05, + "loss": 0.3723, + "step": 3482 + }, + { + "epoch": 0.195038638145369, + "grad_norm": 1.4861829280853271, + "learning_rate": 1.74e-05, + "loss": 0.4788, + "step": 3483 + }, + { + "epoch": 0.19509463545749803, + "grad_norm": 1.1437004804611206, + "learning_rate": 1.7405e-05, + "loss": 0.364, + "step": 3484 + }, + { + "epoch": 0.19515063276962705, + "grad_norm": 1.8231146335601807, + "learning_rate": 1.741e-05, + "loss": 0.5312, + "step": 3485 + }, + { + "epoch": 0.19520663008175607, + "grad_norm": 1.5955363512039185, + "learning_rate": 1.7415000000000002e-05, + "loss": 0.4753, + "step": 3486 + }, + { + "epoch": 
0.19526262739388509, + "grad_norm": 1.3428277969360352, + "learning_rate": 1.742e-05, + "loss": 0.4872, + "step": 3487 + }, + { + "epoch": 0.1953186247060141, + "grad_norm": 1.1213557720184326, + "learning_rate": 1.7425e-05, + "loss": 0.3974, + "step": 3488 + }, + { + "epoch": 0.19537462201814312, + "grad_norm": 1.1225180625915527, + "learning_rate": 1.743e-05, + "loss": 0.3424, + "step": 3489 + }, + { + "epoch": 0.19543061933027214, + "grad_norm": 1.5477772951126099, + "learning_rate": 1.7435e-05, + "loss": 0.6466, + "step": 3490 + }, + { + "epoch": 0.19548661664240116, + "grad_norm": 1.1222776174545288, + "learning_rate": 1.7440000000000002e-05, + "loss": 0.4342, + "step": 3491 + }, + { + "epoch": 0.19554261395453018, + "grad_norm": 1.058448314666748, + "learning_rate": 1.7445e-05, + "loss": 0.3581, + "step": 3492 + }, + { + "epoch": 0.1955986112666592, + "grad_norm": 1.4802191257476807, + "learning_rate": 1.745e-05, + "loss": 0.4271, + "step": 3493 + }, + { + "epoch": 0.19565460857878822, + "grad_norm": 1.29079008102417, + "learning_rate": 1.7455e-05, + "loss": 0.4136, + "step": 3494 + }, + { + "epoch": 0.19571060589091724, + "grad_norm": 1.099254846572876, + "learning_rate": 1.7460000000000002e-05, + "loss": 0.3975, + "step": 3495 + }, + { + "epoch": 0.19576660320304626, + "grad_norm": 1.9378408193588257, + "learning_rate": 1.7465e-05, + "loss": 0.5841, + "step": 3496 + }, + { + "epoch": 0.19582260051517528, + "grad_norm": 1.5394763946533203, + "learning_rate": 1.747e-05, + "loss": 0.4734, + "step": 3497 + }, + { + "epoch": 0.1958785978273043, + "grad_norm": 1.3304824829101562, + "learning_rate": 1.7475e-05, + "loss": 0.3832, + "step": 3498 + }, + { + "epoch": 0.19593459513943332, + "grad_norm": 1.273561954498291, + "learning_rate": 1.7480000000000002e-05, + "loss": 0.5094, + "step": 3499 + }, + { + "epoch": 0.19599059245156233, + "grad_norm": 1.340970516204834, + "learning_rate": 1.7485000000000003e-05, + "loss": 0.4204, + "step": 3500 + }, + { + "epoch": 
0.19604658976369135, + "grad_norm": 1.1625409126281738, + "learning_rate": 1.749e-05, + "loss": 0.4155, + "step": 3501 + }, + { + "epoch": 0.19610258707582037, + "grad_norm": 0.9949480295181274, + "learning_rate": 1.7495e-05, + "loss": 0.4649, + "step": 3502 + }, + { + "epoch": 0.1961585843879494, + "grad_norm": 1.329535961151123, + "learning_rate": 1.75e-05, + "loss": 0.5627, + "step": 3503 + }, + { + "epoch": 0.19621458170007838, + "grad_norm": 1.1956413984298706, + "learning_rate": 1.7505000000000003e-05, + "loss": 0.3473, + "step": 3504 + }, + { + "epoch": 0.1962705790122074, + "grad_norm": 1.3843954801559448, + "learning_rate": 1.751e-05, + "loss": 0.5833, + "step": 3505 + }, + { + "epoch": 0.19632657632433642, + "grad_norm": 1.2624551057815552, + "learning_rate": 1.7515e-05, + "loss": 0.3751, + "step": 3506 + }, + { + "epoch": 0.19638257363646544, + "grad_norm": 2.117694854736328, + "learning_rate": 1.752e-05, + "loss": 0.3899, + "step": 3507 + }, + { + "epoch": 0.19643857094859446, + "grad_norm": 1.484227180480957, + "learning_rate": 1.7525e-05, + "loss": 0.4206, + "step": 3508 + }, + { + "epoch": 0.19649456826072348, + "grad_norm": 1.1634652614593506, + "learning_rate": 1.7530000000000003e-05, + "loss": 0.3729, + "step": 3509 + }, + { + "epoch": 0.1965505655728525, + "grad_norm": 1.2138384580612183, + "learning_rate": 1.7535e-05, + "loss": 0.3716, + "step": 3510 + }, + { + "epoch": 0.19660656288498152, + "grad_norm": 1.0454835891723633, + "learning_rate": 1.754e-05, + "loss": 0.3648, + "step": 3511 + }, + { + "epoch": 0.19666256019711054, + "grad_norm": 1.5565184354782104, + "learning_rate": 1.7545e-05, + "loss": 0.5028, + "step": 3512 + }, + { + "epoch": 0.19671855750923956, + "grad_norm": 1.8158458471298218, + "learning_rate": 1.755e-05, + "loss": 0.4712, + "step": 3513 + }, + { + "epoch": 0.19677455482136857, + "grad_norm": 0.9922124743461609, + "learning_rate": 1.7555e-05, + "loss": 0.4019, + "step": 3514 + }, + { + "epoch": 0.1968305521334976, + 
"grad_norm": 1.0502554178237915, + "learning_rate": 1.756e-05, + "loss": 0.3467, + "step": 3515 + }, + { + "epoch": 0.1968865494456266, + "grad_norm": 4.462704181671143, + "learning_rate": 1.7565000000000002e-05, + "loss": 0.5373, + "step": 3516 + }, + { + "epoch": 0.19694254675775563, + "grad_norm": 1.2560471296310425, + "learning_rate": 1.757e-05, + "loss": 0.4816, + "step": 3517 + }, + { + "epoch": 0.19699854406988465, + "grad_norm": 1.1651909351348877, + "learning_rate": 1.7575e-05, + "loss": 0.4926, + "step": 3518 + }, + { + "epoch": 0.19705454138201367, + "grad_norm": 1.382475733757019, + "learning_rate": 1.758e-05, + "loss": 0.4079, + "step": 3519 + }, + { + "epoch": 0.1971105386941427, + "grad_norm": 1.362947940826416, + "learning_rate": 1.7585000000000002e-05, + "loss": 0.6844, + "step": 3520 + }, + { + "epoch": 0.1971665360062717, + "grad_norm": 1.4244499206542969, + "learning_rate": 1.759e-05, + "loss": 0.3927, + "step": 3521 + }, + { + "epoch": 0.19722253331840073, + "grad_norm": 1.4018924236297607, + "learning_rate": 1.7595e-05, + "loss": 0.4086, + "step": 3522 + }, + { + "epoch": 0.19727853063052975, + "grad_norm": 1.0669325590133667, + "learning_rate": 1.76e-05, + "loss": 0.3776, + "step": 3523 + }, + { + "epoch": 0.19733452794265877, + "grad_norm": 1.1211423873901367, + "learning_rate": 1.7605000000000002e-05, + "loss": 0.3565, + "step": 3524 + }, + { + "epoch": 0.19739052525478776, + "grad_norm": 1.3023756742477417, + "learning_rate": 1.7610000000000002e-05, + "loss": 0.4846, + "step": 3525 + }, + { + "epoch": 0.19744652256691678, + "grad_norm": 1.5412284135818481, + "learning_rate": 1.7615e-05, + "loss": 0.6275, + "step": 3526 + }, + { + "epoch": 0.1975025198790458, + "grad_norm": 1.2327158451080322, + "learning_rate": 1.762e-05, + "loss": 0.3694, + "step": 3527 + }, + { + "epoch": 0.19755851719117482, + "grad_norm": 1.0797861814498901, + "learning_rate": 1.7625e-05, + "loss": 0.3528, + "step": 3528 + }, + { + "epoch": 0.19761451450330383, + 
"grad_norm": 1.3560792207717896, + "learning_rate": 1.7630000000000002e-05, + "loss": 0.5039, + "step": 3529 + }, + { + "epoch": 0.19767051181543285, + "grad_norm": 1.2430589199066162, + "learning_rate": 1.7635000000000003e-05, + "loss": 0.4804, + "step": 3530 + }, + { + "epoch": 0.19772650912756187, + "grad_norm": 1.2607671022415161, + "learning_rate": 1.764e-05, + "loss": 0.4166, + "step": 3531 + }, + { + "epoch": 0.1977825064396909, + "grad_norm": 1.0509237051010132, + "learning_rate": 1.7645e-05, + "loss": 0.3564, + "step": 3532 + }, + { + "epoch": 0.1978385037518199, + "grad_norm": 1.234286904335022, + "learning_rate": 1.765e-05, + "loss": 0.4267, + "step": 3533 + }, + { + "epoch": 0.19789450106394893, + "grad_norm": 1.2905102968215942, + "learning_rate": 1.7655000000000003e-05, + "loss": 0.425, + "step": 3534 + }, + { + "epoch": 0.19795049837607795, + "grad_norm": 1.1756755113601685, + "learning_rate": 1.766e-05, + "loss": 0.3897, + "step": 3535 + }, + { + "epoch": 0.19800649568820697, + "grad_norm": 1.2536176443099976, + "learning_rate": 1.7665e-05, + "loss": 0.4033, + "step": 3536 + }, + { + "epoch": 0.198062493000336, + "grad_norm": 1.1823030710220337, + "learning_rate": 1.7670000000000002e-05, + "loss": 0.4084, + "step": 3537 + }, + { + "epoch": 0.198118490312465, + "grad_norm": 1.295543909072876, + "learning_rate": 1.7675e-05, + "loss": 0.3927, + "step": 3538 + }, + { + "epoch": 0.19817448762459403, + "grad_norm": 1.1278620958328247, + "learning_rate": 1.7680000000000004e-05, + "loss": 0.4166, + "step": 3539 + }, + { + "epoch": 0.19823048493672304, + "grad_norm": 1.3221131563186646, + "learning_rate": 1.7685e-05, + "loss": 0.4792, + "step": 3540 + }, + { + "epoch": 0.19828648224885206, + "grad_norm": 1.1316249370574951, + "learning_rate": 1.7690000000000002e-05, + "loss": 0.6107, + "step": 3541 + }, + { + "epoch": 0.19834247956098108, + "grad_norm": 1.281059741973877, + "learning_rate": 1.7695e-05, + "loss": 0.4603, + "step": 3542 + }, + { + "epoch": 
0.1983984768731101, + "grad_norm": 1.519079327583313, + "learning_rate": 1.77e-05, + "loss": 0.9461, + "step": 3543 + }, + { + "epoch": 0.19845447418523912, + "grad_norm": 1.1021186113357544, + "learning_rate": 1.7705e-05, + "loss": 0.4054, + "step": 3544 + }, + { + "epoch": 0.1985104714973681, + "grad_norm": 1.2137590646743774, + "learning_rate": 1.771e-05, + "loss": 0.3485, + "step": 3545 + }, + { + "epoch": 0.19856646880949713, + "grad_norm": 1.8117284774780273, + "learning_rate": 1.7715000000000002e-05, + "loss": 0.4094, + "step": 3546 + }, + { + "epoch": 0.19862246612162615, + "grad_norm": 1.588281273841858, + "learning_rate": 1.772e-05, + "loss": 0.4485, + "step": 3547 + }, + { + "epoch": 0.19867846343375517, + "grad_norm": 1.4057315587997437, + "learning_rate": 1.7725e-05, + "loss": 0.5001, + "step": 3548 + }, + { + "epoch": 0.1987344607458842, + "grad_norm": 1.0868313312530518, + "learning_rate": 1.773e-05, + "loss": 0.42, + "step": 3549 + }, + { + "epoch": 0.1987904580580132, + "grad_norm": 1.1870468854904175, + "learning_rate": 1.7735000000000002e-05, + "loss": 0.4441, + "step": 3550 + }, + { + "epoch": 0.19884645537014223, + "grad_norm": 1.0263683795928955, + "learning_rate": 1.774e-05, + "loss": 0.4931, + "step": 3551 + }, + { + "epoch": 0.19890245268227125, + "grad_norm": 1.1085573434829712, + "learning_rate": 1.7745e-05, + "loss": 0.4928, + "step": 3552 + }, + { + "epoch": 0.19895844999440027, + "grad_norm": 1.0907138586044312, + "learning_rate": 1.775e-05, + "loss": 0.481, + "step": 3553 + }, + { + "epoch": 0.19901444730652929, + "grad_norm": 1.4030075073242188, + "learning_rate": 1.7755000000000002e-05, + "loss": 0.6967, + "step": 3554 + }, + { + "epoch": 0.1990704446186583, + "grad_norm": 1.2095900774002075, + "learning_rate": 1.7760000000000003e-05, + "loss": 0.3484, + "step": 3555 + }, + { + "epoch": 0.19912644193078732, + "grad_norm": 1.1291601657867432, + "learning_rate": 1.7765e-05, + "loss": 0.3844, + "step": 3556 + }, + { + "epoch": 
0.19918243924291634, + "grad_norm": 1.094610571861267, + "learning_rate": 1.777e-05, + "loss": 0.4705, + "step": 3557 + }, + { + "epoch": 0.19923843655504536, + "grad_norm": 1.1452468633651733, + "learning_rate": 1.7775e-05, + "loss": 0.4597, + "step": 3558 + }, + { + "epoch": 0.19929443386717438, + "grad_norm": 1.232744812965393, + "learning_rate": 1.7780000000000003e-05, + "loss": 0.5608, + "step": 3559 + }, + { + "epoch": 0.1993504311793034, + "grad_norm": 1.1906026601791382, + "learning_rate": 1.7785e-05, + "loss": 0.3942, + "step": 3560 + }, + { + "epoch": 0.19940642849143242, + "grad_norm": 0.969206690788269, + "learning_rate": 1.779e-05, + "loss": 0.2845, + "step": 3561 + }, + { + "epoch": 0.19946242580356144, + "grad_norm": 1.2132002115249634, + "learning_rate": 1.7795e-05, + "loss": 0.4097, + "step": 3562 + }, + { + "epoch": 0.19951842311569046, + "grad_norm": 1.3831021785736084, + "learning_rate": 1.78e-05, + "loss": 0.4446, + "step": 3563 + }, + { + "epoch": 0.19957442042781948, + "grad_norm": 1.3817284107208252, + "learning_rate": 1.7805000000000003e-05, + "loss": 0.3767, + "step": 3564 + }, + { + "epoch": 0.1996304177399485, + "grad_norm": 1.3107645511627197, + "learning_rate": 1.781e-05, + "loss": 0.4725, + "step": 3565 + }, + { + "epoch": 0.1996864150520775, + "grad_norm": 1.0882208347320557, + "learning_rate": 1.7815e-05, + "loss": 0.4093, + "step": 3566 + }, + { + "epoch": 0.1997424123642065, + "grad_norm": 1.105525255203247, + "learning_rate": 1.7820000000000002e-05, + "loss": 0.3445, + "step": 3567 + }, + { + "epoch": 0.19979840967633553, + "grad_norm": 1.329720377922058, + "learning_rate": 1.7825e-05, + "loss": 0.4393, + "step": 3568 + }, + { + "epoch": 0.19985440698846454, + "grad_norm": 1.0822575092315674, + "learning_rate": 1.783e-05, + "loss": 0.3839, + "step": 3569 + }, + { + "epoch": 0.19991040430059356, + "grad_norm": 1.0894161462783813, + "learning_rate": 1.7835e-05, + "loss": 0.344, + "step": 3570 + }, + { + "epoch": 
0.19996640161272258, + "grad_norm": 1.1637275218963623, + "learning_rate": 1.7840000000000002e-05, + "loss": 0.4248, + "step": 3571 + }, + { + "epoch": 0.2000223989248516, + "grad_norm": 1.4276413917541504, + "learning_rate": 1.7845e-05, + "loss": 0.4069, + "step": 3572 + }, + { + "epoch": 0.20007839623698062, + "grad_norm": 1.1614675521850586, + "learning_rate": 1.785e-05, + "loss": 0.3726, + "step": 3573 + }, + { + "epoch": 0.20013439354910964, + "grad_norm": 1.1339586973190308, + "learning_rate": 1.7855e-05, + "loss": 0.6507, + "step": 3574 + }, + { + "epoch": 0.20019039086123866, + "grad_norm": 1.5252258777618408, + "learning_rate": 1.7860000000000002e-05, + "loss": 0.4611, + "step": 3575 + }, + { + "epoch": 0.20024638817336768, + "grad_norm": 1.1716711521148682, + "learning_rate": 1.7865000000000003e-05, + "loss": 0.3847, + "step": 3576 + }, + { + "epoch": 0.2003023854854967, + "grad_norm": 1.0965520143508911, + "learning_rate": 1.787e-05, + "loss": 0.4186, + "step": 3577 + }, + { + "epoch": 0.20035838279762572, + "grad_norm": 1.3718756437301636, + "learning_rate": 1.7875e-05, + "loss": 0.5508, + "step": 3578 + }, + { + "epoch": 0.20041438010975474, + "grad_norm": 1.2677582502365112, + "learning_rate": 1.7879999999999998e-05, + "loss": 0.4108, + "step": 3579 + }, + { + "epoch": 0.20047037742188376, + "grad_norm": 1.3012787103652954, + "learning_rate": 1.7885000000000002e-05, + "loss": 0.5334, + "step": 3580 + }, + { + "epoch": 0.20052637473401277, + "grad_norm": 1.152340292930603, + "learning_rate": 1.789e-05, + "loss": 0.3269, + "step": 3581 + }, + { + "epoch": 0.2005823720461418, + "grad_norm": 1.3426048755645752, + "learning_rate": 1.7895e-05, + "loss": 0.5069, + "step": 3582 + }, + { + "epoch": 0.2006383693582708, + "grad_norm": 1.2388484477996826, + "learning_rate": 1.79e-05, + "loss": 0.544, + "step": 3583 + }, + { + "epoch": 0.20069436667039983, + "grad_norm": 1.2543814182281494, + "learning_rate": 1.7905e-05, + "loss": 0.4034, + "step": 3584 + }, + { + 
"epoch": 0.20075036398252885, + "grad_norm": 1.060953140258789, + "learning_rate": 1.7910000000000003e-05, + "loss": 0.4458, + "step": 3585 + }, + { + "epoch": 0.20080636129465787, + "grad_norm": 1.3129549026489258, + "learning_rate": 1.7915e-05, + "loss": 0.5294, + "step": 3586 + }, + { + "epoch": 0.20086235860678686, + "grad_norm": 1.1931536197662354, + "learning_rate": 1.792e-05, + "loss": 0.4113, + "step": 3587 + }, + { + "epoch": 0.20091835591891588, + "grad_norm": 1.254481315612793, + "learning_rate": 1.7925e-05, + "loss": 0.4261, + "step": 3588 + }, + { + "epoch": 0.2009743532310449, + "grad_norm": 1.1662412881851196, + "learning_rate": 1.793e-05, + "loss": 0.348, + "step": 3589 + }, + { + "epoch": 0.20103035054317392, + "grad_norm": 1.0701323747634888, + "learning_rate": 1.7935e-05, + "loss": 0.4522, + "step": 3590 + }, + { + "epoch": 0.20108634785530294, + "grad_norm": 1.308609127998352, + "learning_rate": 1.794e-05, + "loss": 0.4527, + "step": 3591 + }, + { + "epoch": 0.20114234516743196, + "grad_norm": 1.1434340476989746, + "learning_rate": 1.7945000000000002e-05, + "loss": 0.3729, + "step": 3592 + }, + { + "epoch": 0.20119834247956098, + "grad_norm": 1.10639488697052, + "learning_rate": 1.795e-05, + "loss": 0.402, + "step": 3593 + }, + { + "epoch": 0.20125433979169, + "grad_norm": 2.03780460357666, + "learning_rate": 1.7955e-05, + "loss": 0.4185, + "step": 3594 + }, + { + "epoch": 0.20131033710381901, + "grad_norm": 1.222267508506775, + "learning_rate": 1.796e-05, + "loss": 0.4905, + "step": 3595 + }, + { + "epoch": 0.20136633441594803, + "grad_norm": 1.2427198886871338, + "learning_rate": 1.7965e-05, + "loss": 0.4275, + "step": 3596 + }, + { + "epoch": 0.20142233172807705, + "grad_norm": 1.4295467138290405, + "learning_rate": 1.797e-05, + "loss": 0.4698, + "step": 3597 + }, + { + "epoch": 0.20147832904020607, + "grad_norm": 1.0984424352645874, + "learning_rate": 1.7975e-05, + "loss": 0.4006, + "step": 3598 + }, + { + "epoch": 0.2015343263523351, + 
"grad_norm": 1.083338975906372, + "learning_rate": 1.798e-05, + "loss": 0.4357, + "step": 3599 + }, + { + "epoch": 0.2015903236644641, + "grad_norm": 1.1363531351089478, + "learning_rate": 1.7985e-05, + "loss": 0.4991, + "step": 3600 + }, + { + "epoch": 0.20164632097659313, + "grad_norm": 1.5071083307266235, + "learning_rate": 1.7990000000000002e-05, + "loss": 0.4102, + "step": 3601 + }, + { + "epoch": 0.20170231828872215, + "grad_norm": 1.1894534826278687, + "learning_rate": 1.7995e-05, + "loss": 0.3997, + "step": 3602 + }, + { + "epoch": 0.20175831560085117, + "grad_norm": 1.1389291286468506, + "learning_rate": 1.8e-05, + "loss": 0.5243, + "step": 3603 + }, + { + "epoch": 0.2018143129129802, + "grad_norm": 1.225938081741333, + "learning_rate": 1.8005e-05, + "loss": 0.3694, + "step": 3604 + }, + { + "epoch": 0.2018703102251092, + "grad_norm": 1.445648431777954, + "learning_rate": 1.8010000000000002e-05, + "loss": 0.3522, + "step": 3605 + }, + { + "epoch": 0.20192630753723823, + "grad_norm": 1.427200436592102, + "learning_rate": 1.8015000000000003e-05, + "loss": 0.5047, + "step": 3606 + }, + { + "epoch": 0.20198230484936722, + "grad_norm": 1.5686529874801636, + "learning_rate": 1.802e-05, + "loss": 0.6975, + "step": 3607 + }, + { + "epoch": 0.20203830216149624, + "grad_norm": 1.1451834440231323, + "learning_rate": 1.8025e-05, + "loss": 0.4568, + "step": 3608 + }, + { + "epoch": 0.20209429947362526, + "grad_norm": 1.3459903001785278, + "learning_rate": 1.803e-05, + "loss": 0.5214, + "step": 3609 + }, + { + "epoch": 0.20215029678575427, + "grad_norm": 1.2125604152679443, + "learning_rate": 1.8035000000000003e-05, + "loss": 0.3619, + "step": 3610 + }, + { + "epoch": 0.2022062940978833, + "grad_norm": 0.9751166105270386, + "learning_rate": 1.804e-05, + "loss": 0.5279, + "step": 3611 + }, + { + "epoch": 0.2022622914100123, + "grad_norm": 1.3576555252075195, + "learning_rate": 1.8045e-05, + "loss": 0.4481, + "step": 3612 + }, + { + "epoch": 0.20231828872214133, + 
"grad_norm": 2.4999935626983643, + "learning_rate": 1.805e-05, + "loss": 0.4593, + "step": 3613 + }, + { + "epoch": 0.20237428603427035, + "grad_norm": 0.9820563197135925, + "learning_rate": 1.8055e-05, + "loss": 0.3046, + "step": 3614 + }, + { + "epoch": 0.20243028334639937, + "grad_norm": 1.426029920578003, + "learning_rate": 1.8060000000000003e-05, + "loss": 0.4896, + "step": 3615 + }, + { + "epoch": 0.2024862806585284, + "grad_norm": 1.2235302925109863, + "learning_rate": 1.8065e-05, + "loss": 0.3901, + "step": 3616 + }, + { + "epoch": 0.2025422779706574, + "grad_norm": 1.4202797412872314, + "learning_rate": 1.807e-05, + "loss": 0.4293, + "step": 3617 + }, + { + "epoch": 0.20259827528278643, + "grad_norm": 1.1697888374328613, + "learning_rate": 1.8075e-05, + "loss": 0.4151, + "step": 3618 + }, + { + "epoch": 0.20265427259491545, + "grad_norm": 1.2334222793579102, + "learning_rate": 1.808e-05, + "loss": 0.4224, + "step": 3619 + }, + { + "epoch": 0.20271026990704447, + "grad_norm": 1.3797261714935303, + "learning_rate": 1.8085e-05, + "loss": 0.488, + "step": 3620 + }, + { + "epoch": 0.20276626721917348, + "grad_norm": 0.9996871948242188, + "learning_rate": 1.809e-05, + "loss": 0.3075, + "step": 3621 + }, + { + "epoch": 0.2028222645313025, + "grad_norm": 1.453730583190918, + "learning_rate": 1.8095000000000002e-05, + "loss": 0.394, + "step": 3622 + }, + { + "epoch": 0.20287826184343152, + "grad_norm": 1.36322820186615, + "learning_rate": 1.81e-05, + "loss": 0.3731, + "step": 3623 + }, + { + "epoch": 0.20293425915556054, + "grad_norm": 1.352899432182312, + "learning_rate": 1.8105e-05, + "loss": 0.5815, + "step": 3624 + }, + { + "epoch": 0.20299025646768956, + "grad_norm": 1.3098249435424805, + "learning_rate": 1.811e-05, + "loss": 0.5279, + "step": 3625 + }, + { + "epoch": 0.20304625377981858, + "grad_norm": 1.2320561408996582, + "learning_rate": 1.8115000000000002e-05, + "loss": 0.4337, + "step": 3626 + }, + { + "epoch": 0.2031022510919476, + "grad_norm": 
1.6283172369003296, + "learning_rate": 1.812e-05, + "loss": 0.3657, + "step": 3627 + }, + { + "epoch": 0.2031582484040766, + "grad_norm": 1.1894358396530151, + "learning_rate": 1.8125e-05, + "loss": 0.4657, + "step": 3628 + }, + { + "epoch": 0.2032142457162056, + "grad_norm": 1.1958972215652466, + "learning_rate": 1.813e-05, + "loss": 0.4009, + "step": 3629 + }, + { + "epoch": 0.20327024302833463, + "grad_norm": 1.127791404724121, + "learning_rate": 1.8135000000000002e-05, + "loss": 0.4063, + "step": 3630 + }, + { + "epoch": 0.20332624034046365, + "grad_norm": 1.1458134651184082, + "learning_rate": 1.8140000000000003e-05, + "loss": 0.4677, + "step": 3631 + }, + { + "epoch": 0.20338223765259267, + "grad_norm": 1.2265417575836182, + "learning_rate": 1.8145e-05, + "loss": 0.4175, + "step": 3632 + }, + { + "epoch": 0.2034382349647217, + "grad_norm": 1.3974417448043823, + "learning_rate": 1.815e-05, + "loss": 0.4813, + "step": 3633 + }, + { + "epoch": 0.2034942322768507, + "grad_norm": 1.2176661491394043, + "learning_rate": 1.8154999999999998e-05, + "loss": 0.4138, + "step": 3634 + }, + { + "epoch": 0.20355022958897973, + "grad_norm": 1.1092934608459473, + "learning_rate": 1.8160000000000002e-05, + "loss": 0.4615, + "step": 3635 + }, + { + "epoch": 0.20360622690110874, + "grad_norm": 1.218031883239746, + "learning_rate": 1.8165000000000003e-05, + "loss": 0.5333, + "step": 3636 + }, + { + "epoch": 0.20366222421323776, + "grad_norm": 1.1778126955032349, + "learning_rate": 1.817e-05, + "loss": 0.4052, + "step": 3637 + }, + { + "epoch": 0.20371822152536678, + "grad_norm": 1.1710573434829712, + "learning_rate": 1.8175e-05, + "loss": 0.3803, + "step": 3638 + }, + { + "epoch": 0.2037742188374958, + "grad_norm": 1.1477383375167847, + "learning_rate": 1.818e-05, + "loss": 0.4362, + "step": 3639 + }, + { + "epoch": 0.20383021614962482, + "grad_norm": 1.2956640720367432, + "learning_rate": 1.8185000000000003e-05, + "loss": 0.4826, + "step": 3640 + }, + { + "epoch": 
0.20388621346175384, + "grad_norm": 1.3115049600601196, + "learning_rate": 1.819e-05, + "loss": 0.56, + "step": 3641 + }, + { + "epoch": 0.20394221077388286, + "grad_norm": 1.4746919870376587, + "learning_rate": 1.8195e-05, + "loss": 0.4711, + "step": 3642 + }, + { + "epoch": 0.20399820808601188, + "grad_norm": 1.923878788948059, + "learning_rate": 1.8200000000000002e-05, + "loss": 0.5686, + "step": 3643 + }, + { + "epoch": 0.2040542053981409, + "grad_norm": 1.1285382509231567, + "learning_rate": 1.8205e-05, + "loss": 0.4183, + "step": 3644 + }, + { + "epoch": 0.20411020271026992, + "grad_norm": 1.5769509077072144, + "learning_rate": 1.8210000000000004e-05, + "loss": 0.4979, + "step": 3645 + }, + { + "epoch": 0.20416620002239894, + "grad_norm": 1.2398681640625, + "learning_rate": 1.8215e-05, + "loss": 0.4363, + "step": 3646 + }, + { + "epoch": 0.20422219733452796, + "grad_norm": 1.1484941244125366, + "learning_rate": 1.8220000000000002e-05, + "loss": 0.391, + "step": 3647 + }, + { + "epoch": 0.20427819464665697, + "grad_norm": 1.3237804174423218, + "learning_rate": 1.8225e-05, + "loss": 0.6652, + "step": 3648 + }, + { + "epoch": 0.20433419195878597, + "grad_norm": 1.0015528202056885, + "learning_rate": 1.823e-05, + "loss": 0.3516, + "step": 3649 + }, + { + "epoch": 0.20439018927091498, + "grad_norm": 1.793995976448059, + "learning_rate": 1.8235e-05, + "loss": 0.7392, + "step": 3650 + }, + { + "epoch": 0.204446186583044, + "grad_norm": 1.415885329246521, + "learning_rate": 1.824e-05, + "loss": 0.3627, + "step": 3651 + }, + { + "epoch": 0.20450218389517302, + "grad_norm": 1.268105149269104, + "learning_rate": 1.8245000000000002e-05, + "loss": 0.4041, + "step": 3652 + }, + { + "epoch": 0.20455818120730204, + "grad_norm": 1.5501044988632202, + "learning_rate": 1.825e-05, + "loss": 0.4487, + "step": 3653 + }, + { + "epoch": 0.20461417851943106, + "grad_norm": 2.0684571266174316, + "learning_rate": 1.8255e-05, + "loss": 0.4629, + "step": 3654 + }, + { + "epoch": 
0.20467017583156008, + "grad_norm": 1.2597365379333496, + "learning_rate": 1.826e-05, + "loss": 0.4913, + "step": 3655 + }, + { + "epoch": 0.2047261731436891, + "grad_norm": 0.9715607166290283, + "learning_rate": 1.8265000000000002e-05, + "loss": 0.5863, + "step": 3656 + }, + { + "epoch": 0.20478217045581812, + "grad_norm": 1.2507647275924683, + "learning_rate": 1.827e-05, + "loss": 0.4449, + "step": 3657 + }, + { + "epoch": 0.20483816776794714, + "grad_norm": 1.1853148937225342, + "learning_rate": 1.8275e-05, + "loss": 0.4402, + "step": 3658 + }, + { + "epoch": 0.20489416508007616, + "grad_norm": 1.6511564254760742, + "learning_rate": 1.828e-05, + "loss": 0.4787, + "step": 3659 + }, + { + "epoch": 0.20495016239220518, + "grad_norm": 1.2150965929031372, + "learning_rate": 1.8285000000000002e-05, + "loss": 0.4149, + "step": 3660 + }, + { + "epoch": 0.2050061597043342, + "grad_norm": 1.2021454572677612, + "learning_rate": 1.8290000000000003e-05, + "loss": 0.4282, + "step": 3661 + }, + { + "epoch": 0.20506215701646321, + "grad_norm": 1.0750913619995117, + "learning_rate": 1.8295e-05, + "loss": 0.411, + "step": 3662 + }, + { + "epoch": 0.20511815432859223, + "grad_norm": 1.0906314849853516, + "learning_rate": 1.83e-05, + "loss": 0.4069, + "step": 3663 + }, + { + "epoch": 0.20517415164072125, + "grad_norm": 1.1319752931594849, + "learning_rate": 1.8305e-05, + "loss": 0.4575, + "step": 3664 + }, + { + "epoch": 0.20523014895285027, + "grad_norm": 1.0121747255325317, + "learning_rate": 1.8310000000000003e-05, + "loss": 0.4482, + "step": 3665 + }, + { + "epoch": 0.2052861462649793, + "grad_norm": 1.1749929189682007, + "learning_rate": 1.8315e-05, + "loss": 0.3555, + "step": 3666 + }, + { + "epoch": 0.2053421435771083, + "grad_norm": 1.1696017980575562, + "learning_rate": 1.832e-05, + "loss": 0.3813, + "step": 3667 + }, + { + "epoch": 0.20539814088923733, + "grad_norm": 1.187969446182251, + "learning_rate": 1.8325e-05, + "loss": 0.5685, + "step": 3668 + }, + { + "epoch": 
0.20545413820136632, + "grad_norm": 1.239634394645691, + "learning_rate": 1.833e-05, + "loss": 0.4809, + "step": 3669 + }, + { + "epoch": 0.20551013551349534, + "grad_norm": 1.0861446857452393, + "learning_rate": 1.8335000000000003e-05, + "loss": 0.3965, + "step": 3670 + }, + { + "epoch": 0.20556613282562436, + "grad_norm": 1.1064029932022095, + "learning_rate": 1.834e-05, + "loss": 0.5242, + "step": 3671 + }, + { + "epoch": 0.20562213013775338, + "grad_norm": 1.0597418546676636, + "learning_rate": 1.8345e-05, + "loss": 0.4466, + "step": 3672 + }, + { + "epoch": 0.2056781274498824, + "grad_norm": 1.2875657081604004, + "learning_rate": 1.8350000000000002e-05, + "loss": 0.3883, + "step": 3673 + }, + { + "epoch": 0.20573412476201142, + "grad_norm": 1.3540278673171997, + "learning_rate": 1.8355e-05, + "loss": 0.5198, + "step": 3674 + }, + { + "epoch": 0.20579012207414044, + "grad_norm": 1.4080015420913696, + "learning_rate": 1.8360000000000004e-05, + "loss": 0.4586, + "step": 3675 + }, + { + "epoch": 0.20584611938626945, + "grad_norm": 1.2323344945907593, + "learning_rate": 1.8365e-05, + "loss": 0.5165, + "step": 3676 + }, + { + "epoch": 0.20590211669839847, + "grad_norm": 1.1104493141174316, + "learning_rate": 1.8370000000000002e-05, + "loss": 0.3237, + "step": 3677 + }, + { + "epoch": 0.2059581140105275, + "grad_norm": 1.1640260219573975, + "learning_rate": 1.8375e-05, + "loss": 0.4496, + "step": 3678 + }, + { + "epoch": 0.2060141113226565, + "grad_norm": 1.1987717151641846, + "learning_rate": 1.838e-05, + "loss": 0.383, + "step": 3679 + }, + { + "epoch": 0.20607010863478553, + "grad_norm": 1.0047123432159424, + "learning_rate": 1.8385e-05, + "loss": 0.4024, + "step": 3680 + }, + { + "epoch": 0.20612610594691455, + "grad_norm": 1.3158748149871826, + "learning_rate": 1.8390000000000002e-05, + "loss": 0.3559, + "step": 3681 + }, + { + "epoch": 0.20618210325904357, + "grad_norm": 1.124266505241394, + "learning_rate": 1.8395000000000003e-05, + "loss": 0.3636, + "step": 
3682 + }, + { + "epoch": 0.2062381005711726, + "grad_norm": 1.1218291521072388, + "learning_rate": 1.84e-05, + "loss": 0.4406, + "step": 3683 + }, + { + "epoch": 0.2062940978833016, + "grad_norm": 1.4445165395736694, + "learning_rate": 1.8405e-05, + "loss": 0.6085, + "step": 3684 + }, + { + "epoch": 0.20635009519543063, + "grad_norm": 1.0143694877624512, + "learning_rate": 1.841e-05, + "loss": 0.3806, + "step": 3685 + }, + { + "epoch": 0.20640609250755965, + "grad_norm": 1.389467477798462, + "learning_rate": 1.8415000000000002e-05, + "loss": 0.5131, + "step": 3686 + }, + { + "epoch": 0.20646208981968867, + "grad_norm": 1.1578280925750732, + "learning_rate": 1.842e-05, + "loss": 0.4549, + "step": 3687 + }, + { + "epoch": 0.20651808713181768, + "grad_norm": 1.009496808052063, + "learning_rate": 1.8425e-05, + "loss": 0.3502, + "step": 3688 + }, + { + "epoch": 0.2065740844439467, + "grad_norm": 1.3251539468765259, + "learning_rate": 1.843e-05, + "loss": 0.4412, + "step": 3689 + }, + { + "epoch": 0.2066300817560757, + "grad_norm": 1.3728529214859009, + "learning_rate": 1.8435000000000002e-05, + "loss": 0.4851, + "step": 3690 + }, + { + "epoch": 0.20668607906820471, + "grad_norm": 1.215119481086731, + "learning_rate": 1.8440000000000003e-05, + "loss": 0.4707, + "step": 3691 + }, + { + "epoch": 0.20674207638033373, + "grad_norm": 1.2378907203674316, + "learning_rate": 1.8445e-05, + "loss": 0.4372, + "step": 3692 + }, + { + "epoch": 0.20679807369246275, + "grad_norm": 1.4404093027114868, + "learning_rate": 1.845e-05, + "loss": 0.4095, + "step": 3693 + }, + { + "epoch": 0.20685407100459177, + "grad_norm": 1.1341023445129395, + "learning_rate": 1.8455e-05, + "loss": 0.5088, + "step": 3694 + }, + { + "epoch": 0.2069100683167208, + "grad_norm": 1.326324701309204, + "learning_rate": 1.846e-05, + "loss": 0.3672, + "step": 3695 + }, + { + "epoch": 0.2069660656288498, + "grad_norm": 1.1963605880737305, + "learning_rate": 1.8465e-05, + "loss": 0.4715, + "step": 3696 + }, + { + 
"epoch": 0.20702206294097883, + "grad_norm": 1.351576566696167, + "learning_rate": 1.847e-05, + "loss": 0.538, + "step": 3697 + }, + { + "epoch": 0.20707806025310785, + "grad_norm": 1.4372308254241943, + "learning_rate": 1.8475000000000002e-05, + "loss": 0.3944, + "step": 3698 + }, + { + "epoch": 0.20713405756523687, + "grad_norm": 1.6565874814987183, + "learning_rate": 1.848e-05, + "loss": 0.4954, + "step": 3699 + }, + { + "epoch": 0.2071900548773659, + "grad_norm": 1.2670085430145264, + "learning_rate": 1.8485e-05, + "loss": 0.3647, + "step": 3700 + }, + { + "epoch": 0.2072460521894949, + "grad_norm": 1.227007508277893, + "learning_rate": 1.849e-05, + "loss": 0.3455, + "step": 3701 + }, + { + "epoch": 0.20730204950162393, + "grad_norm": 1.4911811351776123, + "learning_rate": 1.8495e-05, + "loss": 0.5222, + "step": 3702 + }, + { + "epoch": 0.20735804681375294, + "grad_norm": 1.2862108945846558, + "learning_rate": 1.85e-05, + "loss": 0.4129, + "step": 3703 + }, + { + "epoch": 0.20741404412588196, + "grad_norm": 1.2085920572280884, + "learning_rate": 1.8505e-05, + "loss": 0.4963, + "step": 3704 + }, + { + "epoch": 0.20747004143801098, + "grad_norm": 1.2827470302581787, + "learning_rate": 1.851e-05, + "loss": 0.4467, + "step": 3705 + }, + { + "epoch": 0.20752603875014, + "grad_norm": 1.2292492389678955, + "learning_rate": 1.8515e-05, + "loss": 0.5528, + "step": 3706 + }, + { + "epoch": 0.20758203606226902, + "grad_norm": 1.120884895324707, + "learning_rate": 1.8520000000000002e-05, + "loss": 0.3977, + "step": 3707 + }, + { + "epoch": 0.20763803337439804, + "grad_norm": 1.2557817697525024, + "learning_rate": 1.8525e-05, + "loss": 0.3356, + "step": 3708 + }, + { + "epoch": 0.20769403068652706, + "grad_norm": 1.2214707136154175, + "learning_rate": 1.853e-05, + "loss": 0.3781, + "step": 3709 + }, + { + "epoch": 0.20775002799865608, + "grad_norm": 1.42682945728302, + "learning_rate": 1.8535e-05, + "loss": 0.5651, + "step": 3710 + }, + { + "epoch": 0.20780602531078507, + 
"grad_norm": 1.4732400178909302, + "learning_rate": 1.8540000000000002e-05, + "loss": 0.5905, + "step": 3711 + }, + { + "epoch": 0.2078620226229141, + "grad_norm": 1.1726034879684448, + "learning_rate": 1.8545000000000003e-05, + "loss": 0.5278, + "step": 3712 + }, + { + "epoch": 0.2079180199350431, + "grad_norm": 1.4470715522766113, + "learning_rate": 1.855e-05, + "loss": 0.4314, + "step": 3713 + }, + { + "epoch": 0.20797401724717213, + "grad_norm": 2.5902199745178223, + "learning_rate": 1.8555e-05, + "loss": 0.3967, + "step": 3714 + }, + { + "epoch": 0.20803001455930115, + "grad_norm": 1.2021496295928955, + "learning_rate": 1.856e-05, + "loss": 0.3446, + "step": 3715 + }, + { + "epoch": 0.20808601187143017, + "grad_norm": 1.0416159629821777, + "learning_rate": 1.8565000000000003e-05, + "loss": 0.3733, + "step": 3716 + }, + { + "epoch": 0.20814200918355918, + "grad_norm": 1.1497995853424072, + "learning_rate": 1.857e-05, + "loss": 0.4278, + "step": 3717 + }, + { + "epoch": 0.2081980064956882, + "grad_norm": 1.2033779621124268, + "learning_rate": 1.8575e-05, + "loss": 0.4386, + "step": 3718 + }, + { + "epoch": 0.20825400380781722, + "grad_norm": 1.0341720581054688, + "learning_rate": 1.858e-05, + "loss": 0.4024, + "step": 3719 + }, + { + "epoch": 0.20831000111994624, + "grad_norm": 1.464414358139038, + "learning_rate": 1.8585e-05, + "loss": 0.3869, + "step": 3720 + }, + { + "epoch": 0.20836599843207526, + "grad_norm": 1.1235709190368652, + "learning_rate": 1.8590000000000003e-05, + "loss": 0.5638, + "step": 3721 + }, + { + "epoch": 0.20842199574420428, + "grad_norm": 1.870194911956787, + "learning_rate": 1.8595e-05, + "loss": 0.4259, + "step": 3722 + }, + { + "epoch": 0.2084779930563333, + "grad_norm": 1.689988613128662, + "learning_rate": 1.86e-05, + "loss": 0.5645, + "step": 3723 + }, + { + "epoch": 0.20853399036846232, + "grad_norm": 1.531330943107605, + "learning_rate": 1.8605e-05, + "loss": 0.3632, + "step": 3724 + }, + { + "epoch": 0.20858998768059134, + 
"grad_norm": 1.2564024925231934, + "learning_rate": 1.861e-05, + "loss": 0.3609, + "step": 3725 + }, + { + "epoch": 0.20864598499272036, + "grad_norm": 1.226952314376831, + "learning_rate": 1.8615e-05, + "loss": 0.4338, + "step": 3726 + }, + { + "epoch": 0.20870198230484938, + "grad_norm": 1.4002666473388672, + "learning_rate": 1.862e-05, + "loss": 0.4459, + "step": 3727 + }, + { + "epoch": 0.2087579796169784, + "grad_norm": 1.1037551164627075, + "learning_rate": 1.8625000000000002e-05, + "loss": 0.4196, + "step": 3728 + }, + { + "epoch": 0.20881397692910741, + "grad_norm": 1.1568371057510376, + "learning_rate": 1.863e-05, + "loss": 0.6159, + "step": 3729 + }, + { + "epoch": 0.20886997424123643, + "grad_norm": 1.2428112030029297, + "learning_rate": 1.8635e-05, + "loss": 0.5167, + "step": 3730 + }, + { + "epoch": 0.20892597155336542, + "grad_norm": 1.3571763038635254, + "learning_rate": 1.864e-05, + "loss": 0.4118, + "step": 3731 + }, + { + "epoch": 0.20898196886549444, + "grad_norm": 1.3883105516433716, + "learning_rate": 1.8645000000000002e-05, + "loss": 0.4858, + "step": 3732 + }, + { + "epoch": 0.20903796617762346, + "grad_norm": 1.0853148698806763, + "learning_rate": 1.865e-05, + "loss": 0.4217, + "step": 3733 + }, + { + "epoch": 0.20909396348975248, + "grad_norm": 1.1567766666412354, + "learning_rate": 1.8655e-05, + "loss": 0.4391, + "step": 3734 + }, + { + "epoch": 0.2091499608018815, + "grad_norm": 1.1084656715393066, + "learning_rate": 1.866e-05, + "loss": 0.4222, + "step": 3735 + }, + { + "epoch": 0.20920595811401052, + "grad_norm": 1.4658235311508179, + "learning_rate": 1.8665000000000002e-05, + "loss": 0.4633, + "step": 3736 + }, + { + "epoch": 0.20926195542613954, + "grad_norm": 1.1171648502349854, + "learning_rate": 1.8670000000000003e-05, + "loss": 0.4406, + "step": 3737 + }, + { + "epoch": 0.20931795273826856, + "grad_norm": 1.4457217454910278, + "learning_rate": 1.8675e-05, + "loss": 0.4031, + "step": 3738 + }, + { + "epoch": 0.20937395005039758, + 
"grad_norm": 1.199464201927185, + "learning_rate": 1.868e-05, + "loss": 0.4826, + "step": 3739 + }, + { + "epoch": 0.2094299473625266, + "grad_norm": 1.1101138591766357, + "learning_rate": 1.8684999999999998e-05, + "loss": 0.4125, + "step": 3740 + }, + { + "epoch": 0.20948594467465562, + "grad_norm": 1.6544722318649292, + "learning_rate": 1.8690000000000002e-05, + "loss": 0.4482, + "step": 3741 + }, + { + "epoch": 0.20954194198678464, + "grad_norm": 1.0128848552703857, + "learning_rate": 1.8695e-05, + "loss": 0.3927, + "step": 3742 + }, + { + "epoch": 0.20959793929891365, + "grad_norm": 1.020386815071106, + "learning_rate": 1.87e-05, + "loss": 0.317, + "step": 3743 + }, + { + "epoch": 0.20965393661104267, + "grad_norm": 1.2744553089141846, + "learning_rate": 1.8705e-05, + "loss": 0.4031, + "step": 3744 + }, + { + "epoch": 0.2097099339231717, + "grad_norm": 1.2086420059204102, + "learning_rate": 1.871e-05, + "loss": 0.4046, + "step": 3745 + }, + { + "epoch": 0.2097659312353007, + "grad_norm": 1.2720385789871216, + "learning_rate": 1.8715000000000003e-05, + "loss": 0.4306, + "step": 3746 + }, + { + "epoch": 0.20982192854742973, + "grad_norm": 1.245012640953064, + "learning_rate": 1.872e-05, + "loss": 0.4181, + "step": 3747 + }, + { + "epoch": 0.20987792585955875, + "grad_norm": 1.219249963760376, + "learning_rate": 1.8725e-05, + "loss": 0.4987, + "step": 3748 + }, + { + "epoch": 0.20993392317168777, + "grad_norm": 1.035294532775879, + "learning_rate": 1.8730000000000002e-05, + "loss": 0.3916, + "step": 3749 + }, + { + "epoch": 0.2099899204838168, + "grad_norm": 1.2753604650497437, + "learning_rate": 1.8735e-05, + "loss": 0.4066, + "step": 3750 + }, + { + "epoch": 0.2100459177959458, + "grad_norm": 2.5616674423217773, + "learning_rate": 1.8740000000000004e-05, + "loss": 0.5153, + "step": 3751 + }, + { + "epoch": 0.2101019151080748, + "grad_norm": 1.2599841356277466, + "learning_rate": 1.8745e-05, + "loss": 0.4677, + "step": 3752 + }, + { + "epoch": 
0.21015791242020382, + "grad_norm": 1.5064753293991089, + "learning_rate": 1.8750000000000002e-05, + "loss": 0.559, + "step": 3753 + }, + { + "epoch": 0.21021390973233284, + "grad_norm": 1.3997769355773926, + "learning_rate": 1.8755e-05, + "loss": 0.4659, + "step": 3754 + }, + { + "epoch": 0.21026990704446186, + "grad_norm": 1.1182029247283936, + "learning_rate": 1.876e-05, + "loss": 0.2785, + "step": 3755 + }, + { + "epoch": 0.21032590435659088, + "grad_norm": 1.0288310050964355, + "learning_rate": 1.8765e-05, + "loss": 0.3188, + "step": 3756 + }, + { + "epoch": 0.2103819016687199, + "grad_norm": 1.3543319702148438, + "learning_rate": 1.877e-05, + "loss": 0.4312, + "step": 3757 + }, + { + "epoch": 0.21043789898084891, + "grad_norm": 1.339989423751831, + "learning_rate": 1.8775000000000002e-05, + "loss": 0.5552, + "step": 3758 + }, + { + "epoch": 0.21049389629297793, + "grad_norm": 1.3205184936523438, + "learning_rate": 1.878e-05, + "loss": 0.6186, + "step": 3759 + }, + { + "epoch": 0.21054989360510695, + "grad_norm": 1.0515899658203125, + "learning_rate": 1.8785e-05, + "loss": 0.2768, + "step": 3760 + }, + { + "epoch": 0.21060589091723597, + "grad_norm": 1.117658257484436, + "learning_rate": 1.879e-05, + "loss": 0.5759, + "step": 3761 + }, + { + "epoch": 0.210661888229365, + "grad_norm": 1.1721341609954834, + "learning_rate": 1.8795000000000002e-05, + "loss": 0.369, + "step": 3762 + }, + { + "epoch": 0.210717885541494, + "grad_norm": 1.2887459993362427, + "learning_rate": 1.88e-05, + "loss": 0.4338, + "step": 3763 + }, + { + "epoch": 0.21077388285362303, + "grad_norm": 1.096023440361023, + "learning_rate": 1.8805e-05, + "loss": 0.3805, + "step": 3764 + }, + { + "epoch": 0.21082988016575205, + "grad_norm": 1.596562147140503, + "learning_rate": 1.881e-05, + "loss": 0.4453, + "step": 3765 + }, + { + "epoch": 0.21088587747788107, + "grad_norm": 1.3703455924987793, + "learning_rate": 1.8815000000000002e-05, + "loss": 0.4875, + "step": 3766 + }, + { + "epoch": 
0.2109418747900101, + "grad_norm": 1.202178716659546, + "learning_rate": 1.8820000000000003e-05, + "loss": 0.4183, + "step": 3767 + }, + { + "epoch": 0.2109978721021391, + "grad_norm": 1.2643823623657227, + "learning_rate": 1.8825e-05, + "loss": 0.3607, + "step": 3768 + }, + { + "epoch": 0.21105386941426812, + "grad_norm": 1.2469091415405273, + "learning_rate": 1.883e-05, + "loss": 0.5585, + "step": 3769 + }, + { + "epoch": 0.21110986672639714, + "grad_norm": 1.277901530265808, + "learning_rate": 1.8835e-05, + "loss": 0.4921, + "step": 3770 + }, + { + "epoch": 0.21116586403852616, + "grad_norm": 1.6774253845214844, + "learning_rate": 1.8840000000000003e-05, + "loss": 0.7642, + "step": 3771 + }, + { + "epoch": 0.21122186135065518, + "grad_norm": 1.413717269897461, + "learning_rate": 1.8845e-05, + "loss": 0.4913, + "step": 3772 + }, + { + "epoch": 0.21127785866278417, + "grad_norm": 1.151466965675354, + "learning_rate": 1.885e-05, + "loss": 0.3912, + "step": 3773 + }, + { + "epoch": 0.2113338559749132, + "grad_norm": 1.195562720298767, + "learning_rate": 1.8855e-05, + "loss": 0.3835, + "step": 3774 + }, + { + "epoch": 0.2113898532870422, + "grad_norm": 1.1106895208358765, + "learning_rate": 1.886e-05, + "loss": 0.4649, + "step": 3775 + }, + { + "epoch": 0.21144585059917123, + "grad_norm": 1.3207541704177856, + "learning_rate": 1.8865000000000003e-05, + "loss": 0.5763, + "step": 3776 + }, + { + "epoch": 0.21150184791130025, + "grad_norm": 1.8818163871765137, + "learning_rate": 1.887e-05, + "loss": 0.3591, + "step": 3777 + }, + { + "epoch": 0.21155784522342927, + "grad_norm": 1.256289005279541, + "learning_rate": 1.8875e-05, + "loss": 0.4754, + "step": 3778 + }, + { + "epoch": 0.2116138425355583, + "grad_norm": 1.5186047554016113, + "learning_rate": 1.888e-05, + "loss": 0.6175, + "step": 3779 + }, + { + "epoch": 0.2116698398476873, + "grad_norm": 1.3236587047576904, + "learning_rate": 1.8885e-05, + "loss": 0.4407, + "step": 3780 + }, + { + "epoch": 0.21172583715981633, 
+ "grad_norm": 1.3085664510726929, + "learning_rate": 1.8890000000000004e-05, + "loss": 0.4197, + "step": 3781 + }, + { + "epoch": 0.21178183447194535, + "grad_norm": 1.1369835138320923, + "learning_rate": 1.8895e-05, + "loss": 0.3776, + "step": 3782 + }, + { + "epoch": 0.21183783178407437, + "grad_norm": 1.1113334894180298, + "learning_rate": 1.8900000000000002e-05, + "loss": 0.3816, + "step": 3783 + }, + { + "epoch": 0.21189382909620338, + "grad_norm": 1.2961411476135254, + "learning_rate": 1.8905e-05, + "loss": 0.4108, + "step": 3784 + }, + { + "epoch": 0.2119498264083324, + "grad_norm": 1.045507550239563, + "learning_rate": 1.891e-05, + "loss": 0.3978, + "step": 3785 + }, + { + "epoch": 0.21200582372046142, + "grad_norm": 1.2163691520690918, + "learning_rate": 1.8915e-05, + "loss": 0.4174, + "step": 3786 + }, + { + "epoch": 0.21206182103259044, + "grad_norm": 1.1749398708343506, + "learning_rate": 1.8920000000000002e-05, + "loss": 0.5375, + "step": 3787 + }, + { + "epoch": 0.21211781834471946, + "grad_norm": 1.2830992937088013, + "learning_rate": 1.8925000000000003e-05, + "loss": 0.4793, + "step": 3788 + }, + { + "epoch": 0.21217381565684848, + "grad_norm": 1.1235780715942383, + "learning_rate": 1.893e-05, + "loss": 0.4108, + "step": 3789 + }, + { + "epoch": 0.2122298129689775, + "grad_norm": 1.4002350568771362, + "learning_rate": 1.8935e-05, + "loss": 0.6034, + "step": 3790 + }, + { + "epoch": 0.21228581028110652, + "grad_norm": 1.2956151962280273, + "learning_rate": 1.894e-05, + "loss": 0.5659, + "step": 3791 + }, + { + "epoch": 0.21234180759323554, + "grad_norm": 1.1184310913085938, + "learning_rate": 1.8945000000000002e-05, + "loss": 0.3904, + "step": 3792 + }, + { + "epoch": 0.21239780490536453, + "grad_norm": 1.0486329793930054, + "learning_rate": 1.895e-05, + "loss": 0.3441, + "step": 3793 + }, + { + "epoch": 0.21245380221749355, + "grad_norm": 1.1602412462234497, + "learning_rate": 1.8955e-05, + "loss": 0.4124, + "step": 3794 + }, + { + "epoch": 
0.21250979952962257, + "grad_norm": 1.1501165628433228, + "learning_rate": 1.896e-05, + "loss": 0.4024, + "step": 3795 + }, + { + "epoch": 0.2125657968417516, + "grad_norm": 1.0928531885147095, + "learning_rate": 1.8965000000000002e-05, + "loss": 0.3906, + "step": 3796 + }, + { + "epoch": 0.2126217941538806, + "grad_norm": 1.1243789196014404, + "learning_rate": 1.8970000000000003e-05, + "loss": 0.3966, + "step": 3797 + }, + { + "epoch": 0.21267779146600962, + "grad_norm": 1.264003872871399, + "learning_rate": 1.8975e-05, + "loss": 0.5284, + "step": 3798 + }, + { + "epoch": 0.21273378877813864, + "grad_norm": 1.400215983390808, + "learning_rate": 1.898e-05, + "loss": 0.4005, + "step": 3799 + }, + { + "epoch": 0.21278978609026766, + "grad_norm": 1.1315284967422485, + "learning_rate": 1.8985e-05, + "loss": 0.3573, + "step": 3800 + }, + { + "epoch": 0.21284578340239668, + "grad_norm": 1.10823392868042, + "learning_rate": 1.8990000000000003e-05, + "loss": 0.4071, + "step": 3801 + }, + { + "epoch": 0.2129017807145257, + "grad_norm": 1.2107127904891968, + "learning_rate": 1.8995e-05, + "loss": 0.4024, + "step": 3802 + }, + { + "epoch": 0.21295777802665472, + "grad_norm": 1.29421067237854, + "learning_rate": 1.9e-05, + "loss": 0.4936, + "step": 3803 + }, + { + "epoch": 0.21301377533878374, + "grad_norm": 2.305823802947998, + "learning_rate": 1.9005000000000002e-05, + "loss": 0.326, + "step": 3804 + }, + { + "epoch": 0.21306977265091276, + "grad_norm": 1.4411567449569702, + "learning_rate": 1.901e-05, + "loss": 0.5825, + "step": 3805 + }, + { + "epoch": 0.21312576996304178, + "grad_norm": 0.961778461933136, + "learning_rate": 1.9015000000000003e-05, + "loss": 0.3066, + "step": 3806 + }, + { + "epoch": 0.2131817672751708, + "grad_norm": 1.2145847082138062, + "learning_rate": 1.902e-05, + "loss": 0.4319, + "step": 3807 + }, + { + "epoch": 0.21323776458729982, + "grad_norm": 1.2316093444824219, + "learning_rate": 1.9025e-05, + "loss": 0.3642, + "step": 3808 + }, + { + "epoch": 
0.21329376189942884, + "grad_norm": 1.1941778659820557, + "learning_rate": 1.903e-05, + "loss": 0.4714, + "step": 3809 + }, + { + "epoch": 0.21334975921155785, + "grad_norm": 1.402944564819336, + "learning_rate": 1.9035e-05, + "loss": 0.4408, + "step": 3810 + }, + { + "epoch": 0.21340575652368687, + "grad_norm": 1.1802160739898682, + "learning_rate": 1.904e-05, + "loss": 0.3552, + "step": 3811 + }, + { + "epoch": 0.2134617538358159, + "grad_norm": 1.1805145740509033, + "learning_rate": 1.9045e-05, + "loss": 0.4566, + "step": 3812 + }, + { + "epoch": 0.2135177511479449, + "grad_norm": 1.1233844757080078, + "learning_rate": 1.9050000000000002e-05, + "loss": 0.3404, + "step": 3813 + }, + { + "epoch": 0.2135737484600739, + "grad_norm": 1.4245500564575195, + "learning_rate": 1.9055e-05, + "loss": 0.4563, + "step": 3814 + }, + { + "epoch": 0.21362974577220292, + "grad_norm": 1.065510869026184, + "learning_rate": 1.906e-05, + "loss": 0.3891, + "step": 3815 + }, + { + "epoch": 0.21368574308433194, + "grad_norm": 1.2466912269592285, + "learning_rate": 1.9064999999999998e-05, + "loss": 0.4299, + "step": 3816 + }, + { + "epoch": 0.21374174039646096, + "grad_norm": 1.3753843307495117, + "learning_rate": 1.9070000000000002e-05, + "loss": 0.5251, + "step": 3817 + }, + { + "epoch": 0.21379773770858998, + "grad_norm": 1.0985254049301147, + "learning_rate": 1.9075000000000003e-05, + "loss": 0.4184, + "step": 3818 + }, + { + "epoch": 0.213853735020719, + "grad_norm": 1.2996129989624023, + "learning_rate": 1.908e-05, + "loss": 0.3592, + "step": 3819 + }, + { + "epoch": 0.21390973233284802, + "grad_norm": 1.3616174459457397, + "learning_rate": 1.9085e-05, + "loss": 0.4883, + "step": 3820 + }, + { + "epoch": 0.21396572964497704, + "grad_norm": 1.2674696445465088, + "learning_rate": 1.909e-05, + "loss": 0.5054, + "step": 3821 + }, + { + "epoch": 0.21402172695710606, + "grad_norm": 1.243328332901001, + "learning_rate": 1.9095000000000003e-05, + "loss": 0.3761, + "step": 3822 + }, + { + 
"epoch": 0.21407772426923508, + "grad_norm": 1.2152774333953857, + "learning_rate": 1.91e-05, + "loss": 0.4544, + "step": 3823 + }, + { + "epoch": 0.2141337215813641, + "grad_norm": 1.3071707487106323, + "learning_rate": 1.9105e-05, + "loss": 0.4506, + "step": 3824 + }, + { + "epoch": 0.21418971889349311, + "grad_norm": 1.2705812454223633, + "learning_rate": 1.911e-05, + "loss": 0.4259, + "step": 3825 + }, + { + "epoch": 0.21424571620562213, + "grad_norm": 1.104065179824829, + "learning_rate": 1.9115e-05, + "loss": 0.4035, + "step": 3826 + }, + { + "epoch": 0.21430171351775115, + "grad_norm": 1.5568335056304932, + "learning_rate": 1.9120000000000003e-05, + "loss": 0.3943, + "step": 3827 + }, + { + "epoch": 0.21435771082988017, + "grad_norm": 1.2089629173278809, + "learning_rate": 1.9125e-05, + "loss": 0.4169, + "step": 3828 + }, + { + "epoch": 0.2144137081420092, + "grad_norm": 1.3171229362487793, + "learning_rate": 1.913e-05, + "loss": 0.4887, + "step": 3829 + }, + { + "epoch": 0.2144697054541382, + "grad_norm": 1.2432467937469482, + "learning_rate": 1.9135e-05, + "loss": 0.3612, + "step": 3830 + }, + { + "epoch": 0.21452570276626723, + "grad_norm": 1.291901707649231, + "learning_rate": 1.914e-05, + "loss": 0.4334, + "step": 3831 + }, + { + "epoch": 0.21458170007839625, + "grad_norm": 1.3384559154510498, + "learning_rate": 1.9145e-05, + "loss": 0.648, + "step": 3832 + }, + { + "epoch": 0.21463769739052527, + "grad_norm": 1.176081657409668, + "learning_rate": 1.915e-05, + "loss": 0.4228, + "step": 3833 + }, + { + "epoch": 0.2146936947026543, + "grad_norm": 1.0742981433868408, + "learning_rate": 1.9155000000000002e-05, + "loss": 0.386, + "step": 3834 + }, + { + "epoch": 0.21474969201478328, + "grad_norm": 1.1499167680740356, + "learning_rate": 1.916e-05, + "loss": 0.457, + "step": 3835 + }, + { + "epoch": 0.2148056893269123, + "grad_norm": 1.1521497964859009, + "learning_rate": 1.9165e-05, + "loss": 0.4305, + "step": 3836 + }, + { + "epoch": 0.21486168663904132, + 
"grad_norm": 1.220842957496643, + "learning_rate": 1.917e-05, + "loss": 0.4883, + "step": 3837 + }, + { + "epoch": 0.21491768395117034, + "grad_norm": 1.277765154838562, + "learning_rate": 1.9175000000000002e-05, + "loss": 0.5959, + "step": 3838 + }, + { + "epoch": 0.21497368126329935, + "grad_norm": 1.2280707359313965, + "learning_rate": 1.918e-05, + "loss": 0.3969, + "step": 3839 + }, + { + "epoch": 0.21502967857542837, + "grad_norm": 1.1846415996551514, + "learning_rate": 1.9185e-05, + "loss": 0.4591, + "step": 3840 + }, + { + "epoch": 0.2150856758875574, + "grad_norm": 1.2512959241867065, + "learning_rate": 1.919e-05, + "loss": 0.4162, + "step": 3841 + }, + { + "epoch": 0.2151416731996864, + "grad_norm": 1.3382731676101685, + "learning_rate": 1.9195000000000002e-05, + "loss": 0.4439, + "step": 3842 + }, + { + "epoch": 0.21519767051181543, + "grad_norm": 1.1869760751724243, + "learning_rate": 1.9200000000000003e-05, + "loss": 0.3983, + "step": 3843 + }, + { + "epoch": 0.21525366782394445, + "grad_norm": 1.209533929824829, + "learning_rate": 1.9205e-05, + "loss": 0.4118, + "step": 3844 + }, + { + "epoch": 0.21530966513607347, + "grad_norm": 1.108127474784851, + "learning_rate": 1.921e-05, + "loss": 0.4376, + "step": 3845 + }, + { + "epoch": 0.2153656624482025, + "grad_norm": 1.2668966054916382, + "learning_rate": 1.9214999999999998e-05, + "loss": 0.4207, + "step": 3846 + }, + { + "epoch": 0.2154216597603315, + "grad_norm": 1.006713628768921, + "learning_rate": 1.9220000000000002e-05, + "loss": 0.4193, + "step": 3847 + }, + { + "epoch": 0.21547765707246053, + "grad_norm": 1.2234450578689575, + "learning_rate": 1.9225e-05, + "loss": 0.4321, + "step": 3848 + }, + { + "epoch": 0.21553365438458955, + "grad_norm": 1.158087134361267, + "learning_rate": 1.923e-05, + "loss": 0.4531, + "step": 3849 + }, + { + "epoch": 0.21558965169671856, + "grad_norm": 1.0506086349487305, + "learning_rate": 1.9235e-05, + "loss": 0.3995, + "step": 3850 + }, + { + "epoch": 
0.21564564900884758, + "grad_norm": 1.504860520362854, + "learning_rate": 1.924e-05, + "loss": 0.4866, + "step": 3851 + }, + { + "epoch": 0.2157016463209766, + "grad_norm": 1.173839807510376, + "learning_rate": 1.9245000000000003e-05, + "loss": 0.3839, + "step": 3852 + }, + { + "epoch": 0.21575764363310562, + "grad_norm": 1.140916347503662, + "learning_rate": 1.925e-05, + "loss": 0.4344, + "step": 3853 + }, + { + "epoch": 0.21581364094523464, + "grad_norm": 1.2121093273162842, + "learning_rate": 1.9255e-05, + "loss": 0.4094, + "step": 3854 + }, + { + "epoch": 0.21586963825736363, + "grad_norm": 0.9509993195533752, + "learning_rate": 1.9260000000000002e-05, + "loss": 0.4051, + "step": 3855 + }, + { + "epoch": 0.21592563556949265, + "grad_norm": 1.1066420078277588, + "learning_rate": 1.9265e-05, + "loss": 0.5323, + "step": 3856 + }, + { + "epoch": 0.21598163288162167, + "grad_norm": 1.3245099782943726, + "learning_rate": 1.9270000000000004e-05, + "loss": 0.3931, + "step": 3857 + }, + { + "epoch": 0.2160376301937507, + "grad_norm": 1.0722681283950806, + "learning_rate": 1.9275e-05, + "loss": 0.3758, + "step": 3858 + }, + { + "epoch": 0.2160936275058797, + "grad_norm": 1.0039154291152954, + "learning_rate": 1.9280000000000002e-05, + "loss": 0.294, + "step": 3859 + }, + { + "epoch": 0.21614962481800873, + "grad_norm": 1.2737786769866943, + "learning_rate": 1.9285e-05, + "loss": 0.3626, + "step": 3860 + }, + { + "epoch": 0.21620562213013775, + "grad_norm": 1.5499736070632935, + "learning_rate": 1.929e-05, + "loss": 0.4792, + "step": 3861 + }, + { + "epoch": 0.21626161944226677, + "grad_norm": 1.185977578163147, + "learning_rate": 1.9295e-05, + "loss": 0.3455, + "step": 3862 + }, + { + "epoch": 0.2163176167543958, + "grad_norm": 1.3004982471466064, + "learning_rate": 1.93e-05, + "loss": 0.4465, + "step": 3863 + }, + { + "epoch": 0.2163736140665248, + "grad_norm": 1.3049736022949219, + "learning_rate": 1.9305000000000002e-05, + "loss": 0.4994, + "step": 3864 + }, + { + 
"epoch": 0.21642961137865382, + "grad_norm": 1.0772696733474731, + "learning_rate": 1.931e-05, + "loss": 0.3288, + "step": 3865 + }, + { + "epoch": 0.21648560869078284, + "grad_norm": 1.2598570585250854, + "learning_rate": 1.9315e-05, + "loss": 0.4872, + "step": 3866 + }, + { + "epoch": 0.21654160600291186, + "grad_norm": 1.141111135482788, + "learning_rate": 1.932e-05, + "loss": 0.3861, + "step": 3867 + }, + { + "epoch": 0.21659760331504088, + "grad_norm": 1.1527847051620483, + "learning_rate": 1.9325000000000002e-05, + "loss": 0.3721, + "step": 3868 + }, + { + "epoch": 0.2166536006271699, + "grad_norm": 1.2805637121200562, + "learning_rate": 1.933e-05, + "loss": 0.4316, + "step": 3869 + }, + { + "epoch": 0.21670959793929892, + "grad_norm": 1.1219383478164673, + "learning_rate": 1.9335e-05, + "loss": 0.3921, + "step": 3870 + }, + { + "epoch": 0.21676559525142794, + "grad_norm": 1.264259696006775, + "learning_rate": 1.934e-05, + "loss": 0.518, + "step": 3871 + }, + { + "epoch": 0.21682159256355696, + "grad_norm": 1.3242542743682861, + "learning_rate": 1.9345000000000002e-05, + "loss": 0.7451, + "step": 3872 + }, + { + "epoch": 0.21687758987568598, + "grad_norm": 1.1558231115341187, + "learning_rate": 1.9350000000000003e-05, + "loss": 0.3191, + "step": 3873 + }, + { + "epoch": 0.216933587187815, + "grad_norm": 1.0650345087051392, + "learning_rate": 1.9355e-05, + "loss": 0.3282, + "step": 3874 + }, + { + "epoch": 0.21698958449994402, + "grad_norm": 1.3427801132202148, + "learning_rate": 1.936e-05, + "loss": 0.4338, + "step": 3875 + }, + { + "epoch": 0.217045581812073, + "grad_norm": 1.2418391704559326, + "learning_rate": 1.9365e-05, + "loss": 0.3861, + "step": 3876 + }, + { + "epoch": 0.21710157912420203, + "grad_norm": 1.1236191987991333, + "learning_rate": 1.9370000000000003e-05, + "loss": 0.3656, + "step": 3877 + }, + { + "epoch": 0.21715757643633105, + "grad_norm": 1.2333683967590332, + "learning_rate": 1.9375e-05, + "loss": 0.455, + "step": 3878 + }, + { + 
"epoch": 0.21721357374846006, + "grad_norm": 1.2741773128509521, + "learning_rate": 1.938e-05, + "loss": 0.5378, + "step": 3879 + }, + { + "epoch": 0.21726957106058908, + "grad_norm": 1.2516076564788818, + "learning_rate": 1.9385e-05, + "loss": 0.5302, + "step": 3880 + }, + { + "epoch": 0.2173255683727181, + "grad_norm": 1.4284601211547852, + "learning_rate": 1.939e-05, + "loss": 0.5134, + "step": 3881 + }, + { + "epoch": 0.21738156568484712, + "grad_norm": 1.147981882095337, + "learning_rate": 1.9395000000000003e-05, + "loss": 0.4111, + "step": 3882 + }, + { + "epoch": 0.21743756299697614, + "grad_norm": 1.3357466459274292, + "learning_rate": 1.94e-05, + "loss": 0.4595, + "step": 3883 + }, + { + "epoch": 0.21749356030910516, + "grad_norm": 1.2503118515014648, + "learning_rate": 1.9405e-05, + "loss": 0.4039, + "step": 3884 + }, + { + "epoch": 0.21754955762123418, + "grad_norm": 1.1886709928512573, + "learning_rate": 1.941e-05, + "loss": 0.5563, + "step": 3885 + }, + { + "epoch": 0.2176055549333632, + "grad_norm": 1.2082996368408203, + "learning_rate": 1.9415e-05, + "loss": 0.386, + "step": 3886 + }, + { + "epoch": 0.21766155224549222, + "grad_norm": 1.1673897504806519, + "learning_rate": 1.942e-05, + "loss": 0.4305, + "step": 3887 + }, + { + "epoch": 0.21771754955762124, + "grad_norm": 1.125820279121399, + "learning_rate": 1.9425e-05, + "loss": 0.3498, + "step": 3888 + }, + { + "epoch": 0.21777354686975026, + "grad_norm": 1.2262734174728394, + "learning_rate": 1.9430000000000002e-05, + "loss": 0.4673, + "step": 3889 + }, + { + "epoch": 0.21782954418187928, + "grad_norm": 1.3357839584350586, + "learning_rate": 1.9435e-05, + "loss": 0.3753, + "step": 3890 + }, + { + "epoch": 0.2178855414940083, + "grad_norm": 1.0969672203063965, + "learning_rate": 1.944e-05, + "loss": 0.3779, + "step": 3891 + }, + { + "epoch": 0.2179415388061373, + "grad_norm": 1.1183667182922363, + "learning_rate": 1.9445e-05, + "loss": 0.5111, + "step": 3892 + }, + { + "epoch": 0.21799753611826633, 
+ "grad_norm": 1.522701621055603, + "learning_rate": 1.9450000000000002e-05, + "loss": 0.5003, + "step": 3893 + }, + { + "epoch": 0.21805353343039535, + "grad_norm": 1.3292375802993774, + "learning_rate": 1.9455000000000003e-05, + "loss": 0.4663, + "step": 3894 + }, + { + "epoch": 0.21810953074252437, + "grad_norm": 1.110558032989502, + "learning_rate": 1.946e-05, + "loss": 0.4689, + "step": 3895 + }, + { + "epoch": 0.2181655280546534, + "grad_norm": 1.293588638305664, + "learning_rate": 1.9465e-05, + "loss": 0.3785, + "step": 3896 + }, + { + "epoch": 0.21822152536678238, + "grad_norm": 1.0624083280563354, + "learning_rate": 1.947e-05, + "loss": 0.3824, + "step": 3897 + }, + { + "epoch": 0.2182775226789114, + "grad_norm": 1.5652928352355957, + "learning_rate": 1.9475000000000002e-05, + "loss": 0.491, + "step": 3898 + }, + { + "epoch": 0.21833351999104042, + "grad_norm": 1.1526103019714355, + "learning_rate": 1.948e-05, + "loss": 0.4683, + "step": 3899 + }, + { + "epoch": 0.21838951730316944, + "grad_norm": 0.9201048612594604, + "learning_rate": 1.9485e-05, + "loss": 0.2474, + "step": 3900 + }, + { + "epoch": 0.21844551461529846, + "grad_norm": 1.3379740715026855, + "learning_rate": 1.949e-05, + "loss": 0.4053, + "step": 3901 + }, + { + "epoch": 0.21850151192742748, + "grad_norm": 1.0218881368637085, + "learning_rate": 1.9495000000000002e-05, + "loss": 0.3582, + "step": 3902 + }, + { + "epoch": 0.2185575092395565, + "grad_norm": 1.1562334299087524, + "learning_rate": 1.9500000000000003e-05, + "loss": 0.6028, + "step": 3903 + }, + { + "epoch": 0.21861350655168552, + "grad_norm": 1.180961012840271, + "learning_rate": 1.9505e-05, + "loss": 0.5034, + "step": 3904 + }, + { + "epoch": 0.21866950386381453, + "grad_norm": 1.1061652898788452, + "learning_rate": 1.951e-05, + "loss": 0.3482, + "step": 3905 + }, + { + "epoch": 0.21872550117594355, + "grad_norm": 1.178895115852356, + "learning_rate": 1.9515e-05, + "loss": 0.4866, + "step": 3906 + }, + { + "epoch": 
0.21878149848807257, + "grad_norm": 1.2782617807388306, + "learning_rate": 1.9520000000000003e-05, + "loss": 0.4267, + "step": 3907 + }, + { + "epoch": 0.2188374958002016, + "grad_norm": 1.481217622756958, + "learning_rate": 1.9525e-05, + "loss": 0.4508, + "step": 3908 + }, + { + "epoch": 0.2188934931123306, + "grad_norm": 1.2187516689300537, + "learning_rate": 1.953e-05, + "loss": 0.5677, + "step": 3909 + }, + { + "epoch": 0.21894949042445963, + "grad_norm": 1.1630263328552246, + "learning_rate": 1.9535000000000002e-05, + "loss": 0.4421, + "step": 3910 + }, + { + "epoch": 0.21900548773658865, + "grad_norm": 1.4248945713043213, + "learning_rate": 1.954e-05, + "loss": 0.4959, + "step": 3911 + }, + { + "epoch": 0.21906148504871767, + "grad_norm": 1.3881953954696655, + "learning_rate": 1.9545000000000003e-05, + "loss": 0.5092, + "step": 3912 + }, + { + "epoch": 0.2191174823608467, + "grad_norm": 1.2719542980194092, + "learning_rate": 1.955e-05, + "loss": 0.5339, + "step": 3913 + }, + { + "epoch": 0.2191734796729757, + "grad_norm": 1.3698889017105103, + "learning_rate": 1.9555e-05, + "loss": 0.452, + "step": 3914 + }, + { + "epoch": 0.21922947698510473, + "grad_norm": 1.281928300857544, + "learning_rate": 1.956e-05, + "loss": 0.4297, + "step": 3915 + }, + { + "epoch": 0.21928547429723375, + "grad_norm": 1.2747917175292969, + "learning_rate": 1.9565e-05, + "loss": 0.4263, + "step": 3916 + }, + { + "epoch": 0.21934147160936274, + "grad_norm": 1.3958020210266113, + "learning_rate": 1.957e-05, + "loss": 0.4027, + "step": 3917 + }, + { + "epoch": 0.21939746892149176, + "grad_norm": 1.191336989402771, + "learning_rate": 1.9575e-05, + "loss": 0.4204, + "step": 3918 + }, + { + "epoch": 0.21945346623362078, + "grad_norm": 1.225386142730713, + "learning_rate": 1.9580000000000002e-05, + "loss": 0.4535, + "step": 3919 + }, + { + "epoch": 0.2195094635457498, + "grad_norm": 1.052686095237732, + "learning_rate": 1.9585e-05, + "loss": 0.3126, + "step": 3920 + }, + { + "epoch": 
0.2195654608578788, + "grad_norm": 1.013994574546814, + "learning_rate": 1.959e-05, + "loss": 0.4611, + "step": 3921 + }, + { + "epoch": 0.21962145817000783, + "grad_norm": 1.1470279693603516, + "learning_rate": 1.9595e-05, + "loss": 0.3578, + "step": 3922 + }, + { + "epoch": 0.21967745548213685, + "grad_norm": 1.25358247756958, + "learning_rate": 1.9600000000000002e-05, + "loss": 0.3754, + "step": 3923 + }, + { + "epoch": 0.21973345279426587, + "grad_norm": 1.0770961046218872, + "learning_rate": 1.9605e-05, + "loss": 0.4533, + "step": 3924 + }, + { + "epoch": 0.2197894501063949, + "grad_norm": 1.4587650299072266, + "learning_rate": 1.961e-05, + "loss": 0.465, + "step": 3925 + }, + { + "epoch": 0.2198454474185239, + "grad_norm": 1.3744086027145386, + "learning_rate": 1.9615e-05, + "loss": 0.5234, + "step": 3926 + }, + { + "epoch": 0.21990144473065293, + "grad_norm": 5.708791732788086, + "learning_rate": 1.9620000000000002e-05, + "loss": 0.3558, + "step": 3927 + }, + { + "epoch": 0.21995744204278195, + "grad_norm": 1.502600908279419, + "learning_rate": 1.9625000000000003e-05, + "loss": 0.6503, + "step": 3928 + }, + { + "epoch": 0.22001343935491097, + "grad_norm": 1.393741488456726, + "learning_rate": 1.963e-05, + "loss": 0.4353, + "step": 3929 + }, + { + "epoch": 0.22006943666703999, + "grad_norm": 1.2048612833023071, + "learning_rate": 1.9635e-05, + "loss": 0.4566, + "step": 3930 + }, + { + "epoch": 0.220125433979169, + "grad_norm": 1.2519006729125977, + "learning_rate": 1.9640000000000002e-05, + "loss": 0.6378, + "step": 3931 + }, + { + "epoch": 0.22018143129129802, + "grad_norm": 1.1331911087036133, + "learning_rate": 1.9645000000000002e-05, + "loss": 0.3402, + "step": 3932 + }, + { + "epoch": 0.22023742860342704, + "grad_norm": 1.2613248825073242, + "learning_rate": 1.9650000000000003e-05, + "loss": 0.4721, + "step": 3933 + }, + { + "epoch": 0.22029342591555606, + "grad_norm": 1.1872540712356567, + "learning_rate": 1.9655e-05, + "loss": 0.4616, + "step": 3934 + 
}, + { + "epoch": 0.22034942322768508, + "grad_norm": 1.1382859945297241, + "learning_rate": 1.966e-05, + "loss": 0.3649, + "step": 3935 + }, + { + "epoch": 0.2204054205398141, + "grad_norm": 1.139190673828125, + "learning_rate": 1.9665e-05, + "loss": 0.4429, + "step": 3936 + }, + { + "epoch": 0.22046141785194312, + "grad_norm": 1.1289645433425903, + "learning_rate": 1.9670000000000003e-05, + "loss": 0.4831, + "step": 3937 + }, + { + "epoch": 0.2205174151640721, + "grad_norm": 1.1161208152770996, + "learning_rate": 1.9675e-05, + "loss": 0.516, + "step": 3938 + }, + { + "epoch": 0.22057341247620113, + "grad_norm": 1.1716969013214111, + "learning_rate": 1.968e-05, + "loss": 0.3657, + "step": 3939 + }, + { + "epoch": 0.22062940978833015, + "grad_norm": 1.2149205207824707, + "learning_rate": 1.9685000000000002e-05, + "loss": 0.5391, + "step": 3940 + }, + { + "epoch": 0.22068540710045917, + "grad_norm": 1.528893232345581, + "learning_rate": 1.969e-05, + "loss": 0.4852, + "step": 3941 + }, + { + "epoch": 0.2207414044125882, + "grad_norm": 1.6157225370407104, + "learning_rate": 1.9695e-05, + "loss": 0.4268, + "step": 3942 + }, + { + "epoch": 0.2207974017247172, + "grad_norm": 1.2216227054595947, + "learning_rate": 1.97e-05, + "loss": 0.4365, + "step": 3943 + }, + { + "epoch": 0.22085339903684623, + "grad_norm": 1.2056119441986084, + "learning_rate": 1.9705000000000002e-05, + "loss": 0.3892, + "step": 3944 + }, + { + "epoch": 0.22090939634897525, + "grad_norm": 1.2248709201812744, + "learning_rate": 1.971e-05, + "loss": 0.4097, + "step": 3945 + }, + { + "epoch": 0.22096539366110426, + "grad_norm": 1.1583555936813354, + "learning_rate": 1.9715e-05, + "loss": 0.3983, + "step": 3946 + }, + { + "epoch": 0.22102139097323328, + "grad_norm": 1.0924626588821411, + "learning_rate": 1.972e-05, + "loss": 0.3504, + "step": 3947 + }, + { + "epoch": 0.2210773882853623, + "grad_norm": 1.202951431274414, + "learning_rate": 1.9725000000000002e-05, + "loss": 0.5088, + "step": 3948 + }, + { 
+ "epoch": 0.22113338559749132, + "grad_norm": 1.0647072792053223, + "learning_rate": 1.9730000000000003e-05, + "loss": 0.4107, + "step": 3949 + }, + { + "epoch": 0.22118938290962034, + "grad_norm": 1.2809765338897705, + "learning_rate": 1.9735e-05, + "loss": 0.4364, + "step": 3950 + }, + { + "epoch": 0.22124538022174936, + "grad_norm": 1.095226526260376, + "learning_rate": 1.974e-05, + "loss": 0.4434, + "step": 3951 + }, + { + "epoch": 0.22130137753387838, + "grad_norm": 1.2039659023284912, + "learning_rate": 1.9744999999999998e-05, + "loss": 0.4319, + "step": 3952 + }, + { + "epoch": 0.2213573748460074, + "grad_norm": 1.1134363412857056, + "learning_rate": 1.9750000000000002e-05, + "loss": 0.3769, + "step": 3953 + }, + { + "epoch": 0.22141337215813642, + "grad_norm": 1.2964189052581787, + "learning_rate": 1.9755e-05, + "loss": 0.4453, + "step": 3954 + }, + { + "epoch": 0.22146936947026544, + "grad_norm": 1.1339832544326782, + "learning_rate": 1.976e-05, + "loss": 0.4056, + "step": 3955 + }, + { + "epoch": 0.22152536678239446, + "grad_norm": 1.169204831123352, + "learning_rate": 1.9765e-05, + "loss": 0.5727, + "step": 3956 + }, + { + "epoch": 0.22158136409452348, + "grad_norm": 1.220909833908081, + "learning_rate": 1.977e-05, + "loss": 0.4262, + "step": 3957 + }, + { + "epoch": 0.2216373614066525, + "grad_norm": 3.000483512878418, + "learning_rate": 1.9775000000000003e-05, + "loss": 0.5093, + "step": 3958 + }, + { + "epoch": 0.22169335871878149, + "grad_norm": 1.0850926637649536, + "learning_rate": 1.978e-05, + "loss": 0.3535, + "step": 3959 + }, + { + "epoch": 0.2217493560309105, + "grad_norm": 1.3514493703842163, + "learning_rate": 1.9785e-05, + "loss": 0.5564, + "step": 3960 + }, + { + "epoch": 0.22180535334303952, + "grad_norm": 1.3057911396026611, + "learning_rate": 1.979e-05, + "loss": 0.4899, + "step": 3961 + }, + { + "epoch": 0.22186135065516854, + "grad_norm": 1.3611317873001099, + "learning_rate": 1.9795e-05, + "loss": 0.461, + "step": 3962 + }, + { + 
"epoch": 0.22191734796729756, + "grad_norm": 0.9021110534667969, + "learning_rate": 1.9800000000000004e-05, + "loss": 0.3009, + "step": 3963 + }, + { + "epoch": 0.22197334527942658, + "grad_norm": 1.1194170713424683, + "learning_rate": 1.9805e-05, + "loss": 0.4004, + "step": 3964 + }, + { + "epoch": 0.2220293425915556, + "grad_norm": 1.1084462404251099, + "learning_rate": 1.9810000000000002e-05, + "loss": 0.4546, + "step": 3965 + }, + { + "epoch": 0.22208533990368462, + "grad_norm": 1.375201940536499, + "learning_rate": 1.9815e-05, + "loss": 0.4415, + "step": 3966 + }, + { + "epoch": 0.22214133721581364, + "grad_norm": 1.2064272165298462, + "learning_rate": 1.982e-05, + "loss": 0.3728, + "step": 3967 + }, + { + "epoch": 0.22219733452794266, + "grad_norm": 1.4434298276901245, + "learning_rate": 1.9825e-05, + "loss": 0.3807, + "step": 3968 + }, + { + "epoch": 0.22225333184007168, + "grad_norm": 1.2069168090820312, + "learning_rate": 1.983e-05, + "loss": 0.616, + "step": 3969 + }, + { + "epoch": 0.2223093291522007, + "grad_norm": 1.4005426168441772, + "learning_rate": 1.9835000000000002e-05, + "loss": 0.4133, + "step": 3970 + }, + { + "epoch": 0.22236532646432972, + "grad_norm": 1.2770768404006958, + "learning_rate": 1.984e-05, + "loss": 0.4056, + "step": 3971 + }, + { + "epoch": 0.22242132377645873, + "grad_norm": 1.7044603824615479, + "learning_rate": 1.9845e-05, + "loss": 0.5747, + "step": 3972 + }, + { + "epoch": 0.22247732108858775, + "grad_norm": 1.3524123430252075, + "learning_rate": 1.985e-05, + "loss": 0.5833, + "step": 3973 + }, + { + "epoch": 0.22253331840071677, + "grad_norm": 1.2796204090118408, + "learning_rate": 1.9855000000000002e-05, + "loss": 0.472, + "step": 3974 + }, + { + "epoch": 0.2225893157128458, + "grad_norm": 1.0586299896240234, + "learning_rate": 1.986e-05, + "loss": 0.4362, + "step": 3975 + }, + { + "epoch": 0.2226453130249748, + "grad_norm": 1.1387319564819336, + "learning_rate": 1.9865e-05, + "loss": 0.5358, + "step": 3976 + }, + { + 
"epoch": 0.22270131033710383, + "grad_norm": 1.3744721412658691, + "learning_rate": 1.987e-05, + "loss": 0.5624, + "step": 3977 + }, + { + "epoch": 0.22275730764923285, + "grad_norm": 1.2997620105743408, + "learning_rate": 1.9875000000000002e-05, + "loss": 0.4796, + "step": 3978 + }, + { + "epoch": 0.22281330496136184, + "grad_norm": 1.1262180805206299, + "learning_rate": 1.9880000000000003e-05, + "loss": 0.5009, + "step": 3979 + }, + { + "epoch": 0.22286930227349086, + "grad_norm": 1.2399729490280151, + "learning_rate": 1.9885e-05, + "loss": 0.4173, + "step": 3980 + }, + { + "epoch": 0.22292529958561988, + "grad_norm": 1.1392662525177002, + "learning_rate": 1.989e-05, + "loss": 0.5183, + "step": 3981 + }, + { + "epoch": 0.2229812968977489, + "grad_norm": 1.3595386743545532, + "learning_rate": 1.9895e-05, + "loss": 0.4768, + "step": 3982 + }, + { + "epoch": 0.22303729420987792, + "grad_norm": 1.2113465070724487, + "learning_rate": 1.9900000000000003e-05, + "loss": 0.4527, + "step": 3983 + }, + { + "epoch": 0.22309329152200694, + "grad_norm": 1.2641816139221191, + "learning_rate": 1.9905e-05, + "loss": 0.4014, + "step": 3984 + }, + { + "epoch": 0.22314928883413596, + "grad_norm": 1.269322156906128, + "learning_rate": 1.991e-05, + "loss": 0.4136, + "step": 3985 + }, + { + "epoch": 0.22320528614626498, + "grad_norm": 1.7250635623931885, + "learning_rate": 1.9915e-05, + "loss": 0.5067, + "step": 3986 + }, + { + "epoch": 0.223261283458394, + "grad_norm": 1.4223933219909668, + "learning_rate": 1.992e-05, + "loss": 0.3975, + "step": 3987 + }, + { + "epoch": 0.223317280770523, + "grad_norm": 1.2370339632034302, + "learning_rate": 1.9925000000000003e-05, + "loss": 0.4593, + "step": 3988 + }, + { + "epoch": 0.22337327808265203, + "grad_norm": 1.584632396697998, + "learning_rate": 1.993e-05, + "loss": 0.4526, + "step": 3989 + }, + { + "epoch": 0.22342927539478105, + "grad_norm": 1.1744674444198608, + "learning_rate": 1.9935e-05, + "loss": 0.4607, + "step": 3990 + }, + { + 
"epoch": 0.22348527270691007, + "grad_norm": 1.1270041465759277, + "learning_rate": 1.994e-05, + "loss": 0.3616, + "step": 3991 + }, + { + "epoch": 0.2235412700190391, + "grad_norm": 1.543769359588623, + "learning_rate": 1.9945e-05, + "loss": 0.439, + "step": 3992 + }, + { + "epoch": 0.2235972673311681, + "grad_norm": 1.010480284690857, + "learning_rate": 1.995e-05, + "loss": 0.3787, + "step": 3993 + }, + { + "epoch": 0.22365326464329713, + "grad_norm": 1.2416563034057617, + "learning_rate": 1.9955e-05, + "loss": 0.4614, + "step": 3994 + }, + { + "epoch": 0.22370926195542615, + "grad_norm": 1.3135097026824951, + "learning_rate": 1.9960000000000002e-05, + "loss": 0.499, + "step": 3995 + }, + { + "epoch": 0.22376525926755517, + "grad_norm": 1.3679999113082886, + "learning_rate": 1.9965e-05, + "loss": 0.3731, + "step": 3996 + }, + { + "epoch": 0.22382125657968419, + "grad_norm": 1.2174789905548096, + "learning_rate": 1.997e-05, + "loss": 0.3717, + "step": 3997 + }, + { + "epoch": 0.2238772538918132, + "grad_norm": 1.1787962913513184, + "learning_rate": 1.9975e-05, + "loss": 0.4345, + "step": 3998 + }, + { + "epoch": 0.22393325120394222, + "grad_norm": 1.3835020065307617, + "learning_rate": 1.9980000000000002e-05, + "loss": 0.4765, + "step": 3999 + }, + { + "epoch": 0.22398924851607122, + "grad_norm": 1.0391042232513428, + "learning_rate": 1.9985000000000003e-05, + "loss": 0.4041, + "step": 4000 + }, + { + "epoch": 0.22404524582820023, + "grad_norm": 1.2916306257247925, + "learning_rate": 1.999e-05, + "loss": 0.5135, + "step": 4001 + }, + { + "epoch": 0.22410124314032925, + "grad_norm": 1.3005999326705933, + "learning_rate": 1.9995e-05, + "loss": 0.3908, + "step": 4002 + }, + { + "epoch": 0.22415724045245827, + "grad_norm": 1.5253877639770508, + "learning_rate": 2e-05, + "loss": 0.5192, + "step": 4003 + }, + { + "epoch": 0.2242132377645873, + "grad_norm": 1.100231647491455, + "learning_rate": 2.0005000000000002e-05, + "loss": 0.4016, + "step": 4004 + }, + { + "epoch": 
0.2242692350767163, + "grad_norm": 1.447638988494873, + "learning_rate": 2.001e-05, + "loss": 0.4658, + "step": 4005 + }, + { + "epoch": 0.22432523238884533, + "grad_norm": 1.2481557130813599, + "learning_rate": 2.0015e-05, + "loss": 0.4316, + "step": 4006 + }, + { + "epoch": 0.22438122970097435, + "grad_norm": 1.5969074964523315, + "learning_rate": 2.002e-05, + "loss": 0.5882, + "step": 4007 + }, + { + "epoch": 0.22443722701310337, + "grad_norm": 1.2705366611480713, + "learning_rate": 2.0025000000000002e-05, + "loss": 0.5284, + "step": 4008 + }, + { + "epoch": 0.2244932243252324, + "grad_norm": 1.133697509765625, + "learning_rate": 2.0030000000000003e-05, + "loss": 0.4072, + "step": 4009 + }, + { + "epoch": 0.2245492216373614, + "grad_norm": 0.9591661691665649, + "learning_rate": 2.0035e-05, + "loss": 0.4083, + "step": 4010 + }, + { + "epoch": 0.22460521894949043, + "grad_norm": 1.754600167274475, + "learning_rate": 2.004e-05, + "loss": 0.4694, + "step": 4011 + }, + { + "epoch": 0.22466121626161945, + "grad_norm": 1.204201340675354, + "learning_rate": 2.0045e-05, + "loss": 0.4599, + "step": 4012 + }, + { + "epoch": 0.22471721357374846, + "grad_norm": 1.0342845916748047, + "learning_rate": 2.0050000000000003e-05, + "loss": 0.2756, + "step": 4013 + }, + { + "epoch": 0.22477321088587748, + "grad_norm": 1.1289149522781372, + "learning_rate": 2.0055e-05, + "loss": 0.2926, + "step": 4014 + }, + { + "epoch": 0.2248292081980065, + "grad_norm": 1.4004709720611572, + "learning_rate": 2.006e-05, + "loss": 0.4329, + "step": 4015 + }, + { + "epoch": 0.22488520551013552, + "grad_norm": 1.1849826574325562, + "learning_rate": 2.0065000000000002e-05, + "loss": 0.4633, + "step": 4016 + }, + { + "epoch": 0.22494120282226454, + "grad_norm": 1.387434482574463, + "learning_rate": 2.007e-05, + "loss": 0.535, + "step": 4017 + }, + { + "epoch": 0.22499720013439356, + "grad_norm": 1.1173462867736816, + "learning_rate": 2.0075000000000003e-05, + "loss": 0.385, + "step": 4018 + }, + { + 
"epoch": 0.22505319744652258, + "grad_norm": 1.1251437664031982, + "learning_rate": 2.008e-05, + "loss": 0.4063, + "step": 4019 + }, + { + "epoch": 0.2251091947586516, + "grad_norm": 1.127577304840088, + "learning_rate": 2.0085e-05, + "loss": 0.3449, + "step": 4020 + }, + { + "epoch": 0.2251651920707806, + "grad_norm": 1.073022723197937, + "learning_rate": 2.009e-05, + "loss": 0.4331, + "step": 4021 + }, + { + "epoch": 0.2252211893829096, + "grad_norm": 1.1744555234909058, + "learning_rate": 2.0095e-05, + "loss": 0.3727, + "step": 4022 + }, + { + "epoch": 0.22527718669503863, + "grad_norm": 1.1534401178359985, + "learning_rate": 2.01e-05, + "loss": 0.4825, + "step": 4023 + }, + { + "epoch": 0.22533318400716765, + "grad_norm": 3.915041208267212, + "learning_rate": 2.0105e-05, + "loss": 0.3737, + "step": 4024 + }, + { + "epoch": 0.22538918131929667, + "grad_norm": 1.0764929056167603, + "learning_rate": 2.0110000000000002e-05, + "loss": 0.3697, + "step": 4025 + }, + { + "epoch": 0.22544517863142569, + "grad_norm": 1.2173855304718018, + "learning_rate": 2.0115e-05, + "loss": 0.4043, + "step": 4026 + }, + { + "epoch": 0.2255011759435547, + "grad_norm": 1.309613585472107, + "learning_rate": 2.012e-05, + "loss": 0.4685, + "step": 4027 + }, + { + "epoch": 0.22555717325568372, + "grad_norm": 1.1317449808120728, + "learning_rate": 2.0125e-05, + "loss": 0.5362, + "step": 4028 + }, + { + "epoch": 0.22561317056781274, + "grad_norm": 1.1389638185501099, + "learning_rate": 2.0130000000000002e-05, + "loss": 0.336, + "step": 4029 + }, + { + "epoch": 0.22566916787994176, + "grad_norm": 1.1004079580307007, + "learning_rate": 2.0135e-05, + "loss": 0.3205, + "step": 4030 + }, + { + "epoch": 0.22572516519207078, + "grad_norm": 1.545257329940796, + "learning_rate": 2.014e-05, + "loss": 0.5128, + "step": 4031 + }, + { + "epoch": 0.2257811625041998, + "grad_norm": 1.1331626176834106, + "learning_rate": 2.0145e-05, + "loss": 0.4678, + "step": 4032 + }, + { + "epoch": 0.22583715981632882, + 
"grad_norm": 1.08872652053833, + "learning_rate": 2.0150000000000002e-05, + "loss": 0.4114, + "step": 4033 + }, + { + "epoch": 0.22589315712845784, + "grad_norm": 1.0941650867462158, + "learning_rate": 2.0155000000000003e-05, + "loss": 0.4096, + "step": 4034 + }, + { + "epoch": 0.22594915444058686, + "grad_norm": 1.186246633529663, + "learning_rate": 2.016e-05, + "loss": 0.407, + "step": 4035 + }, + { + "epoch": 0.22600515175271588, + "grad_norm": 1.2471370697021484, + "learning_rate": 2.0165e-05, + "loss": 0.4056, + "step": 4036 + }, + { + "epoch": 0.2260611490648449, + "grad_norm": 1.18431556224823, + "learning_rate": 2.017e-05, + "loss": 0.3996, + "step": 4037 + }, + { + "epoch": 0.22611714637697392, + "grad_norm": 1.24180006980896, + "learning_rate": 2.0175000000000003e-05, + "loss": 0.4061, + "step": 4038 + }, + { + "epoch": 0.22617314368910293, + "grad_norm": 1.2733039855957031, + "learning_rate": 2.0180000000000003e-05, + "loss": 0.4062, + "step": 4039 + }, + { + "epoch": 0.22622914100123195, + "grad_norm": 1.229092001914978, + "learning_rate": 2.0185e-05, + "loss": 0.4002, + "step": 4040 + }, + { + "epoch": 0.22628513831336095, + "grad_norm": 1.216475009918213, + "learning_rate": 2.019e-05, + "loss": 0.4704, + "step": 4041 + }, + { + "epoch": 0.22634113562548996, + "grad_norm": 1.1875768899917603, + "learning_rate": 2.0195e-05, + "loss": 0.3753, + "step": 4042 + }, + { + "epoch": 0.22639713293761898, + "grad_norm": 1.3118387460708618, + "learning_rate": 2.0200000000000003e-05, + "loss": 0.3932, + "step": 4043 + }, + { + "epoch": 0.226453130249748, + "grad_norm": 1.1569195985794067, + "learning_rate": 2.0205e-05, + "loss": 0.4384, + "step": 4044 + }, + { + "epoch": 0.22650912756187702, + "grad_norm": 1.3034249544143677, + "learning_rate": 2.021e-05, + "loss": 0.4687, + "step": 4045 + }, + { + "epoch": 0.22656512487400604, + "grad_norm": 1.364123821258545, + "learning_rate": 2.0215000000000002e-05, + "loss": 0.4705, + "step": 4046 + }, + { + "epoch": 
0.22662112218613506, + "grad_norm": 1.1235162019729614, + "learning_rate": 2.022e-05, + "loss": 0.373, + "step": 4047 + }, + { + "epoch": 0.22667711949826408, + "grad_norm": 1.688078761100769, + "learning_rate": 2.0225000000000004e-05, + "loss": 0.4103, + "step": 4048 + }, + { + "epoch": 0.2267331168103931, + "grad_norm": 1.3599395751953125, + "learning_rate": 2.023e-05, + "loss": 0.4554, + "step": 4049 + }, + { + "epoch": 0.22678911412252212, + "grad_norm": 1.1683887243270874, + "learning_rate": 2.0235000000000002e-05, + "loss": 0.5024, + "step": 4050 + }, + { + "epoch": 0.22684511143465114, + "grad_norm": 1.1350637674331665, + "learning_rate": 2.024e-05, + "loss": 0.3886, + "step": 4051 + }, + { + "epoch": 0.22690110874678016, + "grad_norm": 1.2215251922607422, + "learning_rate": 2.0245e-05, + "loss": 0.4779, + "step": 4052 + }, + { + "epoch": 0.22695710605890917, + "grad_norm": 1.2877695560455322, + "learning_rate": 2.025e-05, + "loss": 0.5126, + "step": 4053 + }, + { + "epoch": 0.2270131033710382, + "grad_norm": 1.397469162940979, + "learning_rate": 2.0255000000000002e-05, + "loss": 0.5584, + "step": 4054 + }, + { + "epoch": 0.2270691006831672, + "grad_norm": 1.0369408130645752, + "learning_rate": 2.0260000000000003e-05, + "loss": 0.3528, + "step": 4055 + }, + { + "epoch": 0.22712509799529623, + "grad_norm": 1.805389165878296, + "learning_rate": 2.0265e-05, + "loss": 0.4645, + "step": 4056 + }, + { + "epoch": 0.22718109530742525, + "grad_norm": 1.299943208694458, + "learning_rate": 2.027e-05, + "loss": 0.4551, + "step": 4057 + }, + { + "epoch": 0.22723709261955427, + "grad_norm": 1.1022294759750366, + "learning_rate": 2.0275e-05, + "loss": 0.3104, + "step": 4058 + }, + { + "epoch": 0.2272930899316833, + "grad_norm": 1.477035641670227, + "learning_rate": 2.0280000000000002e-05, + "loss": 0.4018, + "step": 4059 + }, + { + "epoch": 0.2273490872438123, + "grad_norm": 1.0145372152328491, + "learning_rate": 2.0285e-05, + "loss": 0.3613, + "step": 4060 + }, + { + 
"epoch": 0.22740508455594133, + "grad_norm": 1.1505661010742188, + "learning_rate": 2.029e-05, + "loss": 0.4782, + "step": 4061 + }, + { + "epoch": 0.22746108186807032, + "grad_norm": 1.1380447149276733, + "learning_rate": 2.0295e-05, + "loss": 0.3925, + "step": 4062 + }, + { + "epoch": 0.22751707918019934, + "grad_norm": 1.3335957527160645, + "learning_rate": 2.0300000000000002e-05, + "loss": 0.3813, + "step": 4063 + }, + { + "epoch": 0.22757307649232836, + "grad_norm": 1.0564080476760864, + "learning_rate": 2.0305000000000003e-05, + "loss": 0.4491, + "step": 4064 + }, + { + "epoch": 0.22762907380445738, + "grad_norm": 1.1848081350326538, + "learning_rate": 2.031e-05, + "loss": 0.4345, + "step": 4065 + }, + { + "epoch": 0.2276850711165864, + "grad_norm": 1.1922739744186401, + "learning_rate": 2.0315e-05, + "loss": 0.4108, + "step": 4066 + }, + { + "epoch": 0.22774106842871542, + "grad_norm": 1.2298710346221924, + "learning_rate": 2.032e-05, + "loss": 0.474, + "step": 4067 + }, + { + "epoch": 0.22779706574084443, + "grad_norm": 1.4893807172775269, + "learning_rate": 2.0325e-05, + "loss": 0.567, + "step": 4068 + }, + { + "epoch": 0.22785306305297345, + "grad_norm": 1.437177062034607, + "learning_rate": 2.033e-05, + "loss": 0.4056, + "step": 4069 + }, + { + "epoch": 0.22790906036510247, + "grad_norm": 1.269019365310669, + "learning_rate": 2.0335e-05, + "loss": 0.4033, + "step": 4070 + }, + { + "epoch": 0.2279650576772315, + "grad_norm": 1.282313585281372, + "learning_rate": 2.0340000000000002e-05, + "loss": 0.4129, + "step": 4071 + }, + { + "epoch": 0.2280210549893605, + "grad_norm": 1.614296317100525, + "learning_rate": 2.0345e-05, + "loss": 0.4413, + "step": 4072 + }, + { + "epoch": 0.22807705230148953, + "grad_norm": 1.1022065877914429, + "learning_rate": 2.035e-05, + "loss": 0.2764, + "step": 4073 + }, + { + "epoch": 0.22813304961361855, + "grad_norm": 1.1078407764434814, + "learning_rate": 2.0355e-05, + "loss": 0.4086, + "step": 4074 + }, + { + "epoch": 
0.22818904692574757, + "grad_norm": 1.0584485530853271, + "learning_rate": 2.036e-05, + "loss": 0.4224, + "step": 4075 + }, + { + "epoch": 0.2282450442378766, + "grad_norm": 11.007890701293945, + "learning_rate": 2.0365000000000002e-05, + "loss": 0.6179, + "step": 4076 + }, + { + "epoch": 0.2283010415500056, + "grad_norm": 1.19613778591156, + "learning_rate": 2.037e-05, + "loss": 0.4075, + "step": 4077 + }, + { + "epoch": 0.22835703886213463, + "grad_norm": 1.2499363422393799, + "learning_rate": 2.0375e-05, + "loss": 0.3791, + "step": 4078 + }, + { + "epoch": 0.22841303617426365, + "grad_norm": 1.310246229171753, + "learning_rate": 2.038e-05, + "loss": 0.4022, + "step": 4079 + }, + { + "epoch": 0.22846903348639266, + "grad_norm": 1.4538041353225708, + "learning_rate": 2.0385000000000002e-05, + "loss": 0.5483, + "step": 4080 + }, + { + "epoch": 0.22852503079852168, + "grad_norm": 1.1844452619552612, + "learning_rate": 2.039e-05, + "loss": 0.4425, + "step": 4081 + }, + { + "epoch": 0.2285810281106507, + "grad_norm": 1.2127856016159058, + "learning_rate": 2.0395e-05, + "loss": 0.4141, + "step": 4082 + }, + { + "epoch": 0.2286370254227797, + "grad_norm": 1.1689798831939697, + "learning_rate": 2.04e-05, + "loss": 0.4201, + "step": 4083 + }, + { + "epoch": 0.2286930227349087, + "grad_norm": 1.2544913291931152, + "learning_rate": 2.0405000000000002e-05, + "loss": 0.5006, + "step": 4084 + }, + { + "epoch": 0.22874902004703773, + "grad_norm": 1.3648995161056519, + "learning_rate": 2.0410000000000003e-05, + "loss": 0.386, + "step": 4085 + }, + { + "epoch": 0.22880501735916675, + "grad_norm": 1.1508864164352417, + "learning_rate": 2.0415e-05, + "loss": 0.364, + "step": 4086 + }, + { + "epoch": 0.22886101467129577, + "grad_norm": 1.1088776588439941, + "learning_rate": 2.042e-05, + "loss": 0.397, + "step": 4087 + }, + { + "epoch": 0.2289170119834248, + "grad_norm": 1.291202425956726, + "learning_rate": 2.0425e-05, + "loss": 0.4527, + "step": 4088 + }, + { + "epoch": 
0.2289730092955538, + "grad_norm": 1.4738807678222656, + "learning_rate": 2.0430000000000003e-05, + "loss": 0.4579, + "step": 4089 + }, + { + "epoch": 0.22902900660768283, + "grad_norm": 1.0973998308181763, + "learning_rate": 2.0435e-05, + "loss": 0.2876, + "step": 4090 + }, + { + "epoch": 0.22908500391981185, + "grad_norm": 1.024002194404602, + "learning_rate": 2.044e-05, + "loss": 0.4462, + "step": 4091 + }, + { + "epoch": 0.22914100123194087, + "grad_norm": 1.2808518409729004, + "learning_rate": 2.0445e-05, + "loss": 0.3622, + "step": 4092 + }, + { + "epoch": 0.22919699854406989, + "grad_norm": 1.0888512134552002, + "learning_rate": 2.045e-05, + "loss": 0.3681, + "step": 4093 + }, + { + "epoch": 0.2292529958561989, + "grad_norm": 0.9593205451965332, + "learning_rate": 2.0455000000000003e-05, + "loss": 0.3167, + "step": 4094 + }, + { + "epoch": 0.22930899316832792, + "grad_norm": 1.2199854850769043, + "learning_rate": 2.046e-05, + "loss": 0.441, + "step": 4095 + }, + { + "epoch": 0.22936499048045694, + "grad_norm": 1.281033992767334, + "learning_rate": 2.0465e-05, + "loss": 0.5411, + "step": 4096 + }, + { + "epoch": 0.22942098779258596, + "grad_norm": 1.123522162437439, + "learning_rate": 2.047e-05, + "loss": 0.3339, + "step": 4097 + }, + { + "epoch": 0.22947698510471498, + "grad_norm": 0.994236409664154, + "learning_rate": 2.0475e-05, + "loss": 0.3565, + "step": 4098 + }, + { + "epoch": 0.229532982416844, + "grad_norm": 1.1076732873916626, + "learning_rate": 2.048e-05, + "loss": 0.4364, + "step": 4099 + }, + { + "epoch": 0.22958897972897302, + "grad_norm": 1.6202208995819092, + "learning_rate": 2.0485e-05, + "loss": 0.4889, + "step": 4100 + }, + { + "epoch": 0.22964497704110204, + "grad_norm": 1.2983157634735107, + "learning_rate": 2.0490000000000002e-05, + "loss": 0.5233, + "step": 4101 + }, + { + "epoch": 0.22970097435323106, + "grad_norm": 1.2962462902069092, + "learning_rate": 2.0495e-05, + "loss": 0.5092, + "step": 4102 + }, + { + "epoch": 
0.22975697166536005, + "grad_norm": 1.4778438806533813, + "learning_rate": 2.05e-05, + "loss": 0.5732, + "step": 4103 + }, + { + "epoch": 0.22981296897748907, + "grad_norm": 1.2814544439315796, + "learning_rate": 2.0505e-05, + "loss": 0.4424, + "step": 4104 + }, + { + "epoch": 0.2298689662896181, + "grad_norm": 1.4536662101745605, + "learning_rate": 2.0510000000000002e-05, + "loss": 0.5741, + "step": 4105 + }, + { + "epoch": 0.2299249636017471, + "grad_norm": 1.1617083549499512, + "learning_rate": 2.0515e-05, + "loss": 0.4934, + "step": 4106 + }, + { + "epoch": 0.22998096091387613, + "grad_norm": 1.3995189666748047, + "learning_rate": 2.052e-05, + "loss": 0.3733, + "step": 4107 + }, + { + "epoch": 0.23003695822600514, + "grad_norm": 0.9614766836166382, + "learning_rate": 2.0525e-05, + "loss": 0.3484, + "step": 4108 + }, + { + "epoch": 0.23009295553813416, + "grad_norm": 1.136983036994934, + "learning_rate": 2.053e-05, + "loss": 0.3716, + "step": 4109 + }, + { + "epoch": 0.23014895285026318, + "grad_norm": 1.257845163345337, + "learning_rate": 2.0535000000000002e-05, + "loss": 0.4114, + "step": 4110 + }, + { + "epoch": 0.2302049501623922, + "grad_norm": 1.6189639568328857, + "learning_rate": 2.054e-05, + "loss": 0.6255, + "step": 4111 + }, + { + "epoch": 0.23026094747452122, + "grad_norm": 1.0244381427764893, + "learning_rate": 2.0545e-05, + "loss": 0.2547, + "step": 4112 + }, + { + "epoch": 0.23031694478665024, + "grad_norm": 1.1978487968444824, + "learning_rate": 2.055e-05, + "loss": 0.3887, + "step": 4113 + }, + { + "epoch": 0.23037294209877926, + "grad_norm": 1.0918502807617188, + "learning_rate": 2.0555000000000002e-05, + "loss": 0.3757, + "step": 4114 + }, + { + "epoch": 0.23042893941090828, + "grad_norm": 1.8906546831130981, + "learning_rate": 2.0560000000000003e-05, + "loss": 0.4998, + "step": 4115 + }, + { + "epoch": 0.2304849367230373, + "grad_norm": 1.2287076711654663, + "learning_rate": 2.0565e-05, + "loss": 0.4845, + "step": 4116 + }, + { + "epoch": 
0.23054093403516632, + "grad_norm": 1.193910002708435, + "learning_rate": 2.057e-05, + "loss": 0.3618, + "step": 4117 + }, + { + "epoch": 0.23059693134729534, + "grad_norm": 1.2260141372680664, + "learning_rate": 2.0575e-05, + "loss": 0.414, + "step": 4118 + }, + { + "epoch": 0.23065292865942436, + "grad_norm": 1.1606512069702148, + "learning_rate": 2.0580000000000003e-05, + "loss": 0.3471, + "step": 4119 + }, + { + "epoch": 0.23070892597155337, + "grad_norm": 1.0225188732147217, + "learning_rate": 2.0585e-05, + "loss": 0.4305, + "step": 4120 + }, + { + "epoch": 0.2307649232836824, + "grad_norm": 1.2030072212219238, + "learning_rate": 2.059e-05, + "loss": 0.3368, + "step": 4121 + }, + { + "epoch": 0.2308209205958114, + "grad_norm": 1.2328826189041138, + "learning_rate": 2.0595000000000002e-05, + "loss": 0.5293, + "step": 4122 + }, + { + "epoch": 0.23087691790794043, + "grad_norm": 1.2912899255752563, + "learning_rate": 2.06e-05, + "loss": 0.5007, + "step": 4123 + }, + { + "epoch": 0.23093291522006942, + "grad_norm": 1.3425554037094116, + "learning_rate": 2.0605000000000003e-05, + "loss": 0.5251, + "step": 4124 + }, + { + "epoch": 0.23098891253219844, + "grad_norm": 1.1544756889343262, + "learning_rate": 2.061e-05, + "loss": 0.3617, + "step": 4125 + }, + { + "epoch": 0.23104490984432746, + "grad_norm": 1.1377413272857666, + "learning_rate": 2.0615e-05, + "loss": 0.3603, + "step": 4126 + }, + { + "epoch": 0.23110090715645648, + "grad_norm": 1.1988937854766846, + "learning_rate": 2.062e-05, + "loss": 0.473, + "step": 4127 + }, + { + "epoch": 0.2311569044685855, + "grad_norm": 1.0015439987182617, + "learning_rate": 2.0625e-05, + "loss": 0.4049, + "step": 4128 + }, + { + "epoch": 0.23121290178071452, + "grad_norm": 1.2456467151641846, + "learning_rate": 2.063e-05, + "loss": 0.403, + "step": 4129 + }, + { + "epoch": 0.23126889909284354, + "grad_norm": 1.124685287475586, + "learning_rate": 2.0635e-05, + "loss": 0.405, + "step": 4130 + }, + { + "epoch": 
0.23132489640497256, + "grad_norm": 1.2655459642410278, + "learning_rate": 2.0640000000000002e-05, + "loss": 0.354, + "step": 4131 + }, + { + "epoch": 0.23138089371710158, + "grad_norm": 1.039529800415039, + "learning_rate": 2.0645e-05, + "loss": 0.354, + "step": 4132 + }, + { + "epoch": 0.2314368910292306, + "grad_norm": 1.238182783126831, + "learning_rate": 2.065e-05, + "loss": 0.3983, + "step": 4133 + }, + { + "epoch": 0.23149288834135962, + "grad_norm": 1.2212363481521606, + "learning_rate": 2.0655e-05, + "loss": 0.4486, + "step": 4134 + }, + { + "epoch": 0.23154888565348863, + "grad_norm": 1.1392161846160889, + "learning_rate": 2.0660000000000002e-05, + "loss": 0.3777, + "step": 4135 + }, + { + "epoch": 0.23160488296561765, + "grad_norm": 1.199353575706482, + "learning_rate": 2.0665e-05, + "loss": 0.2687, + "step": 4136 + }, + { + "epoch": 0.23166088027774667, + "grad_norm": 1.4285528659820557, + "learning_rate": 2.067e-05, + "loss": 0.2833, + "step": 4137 + }, + { + "epoch": 0.2317168775898757, + "grad_norm": 1.139581561088562, + "learning_rate": 2.0675e-05, + "loss": 0.4413, + "step": 4138 + }, + { + "epoch": 0.2317728749020047, + "grad_norm": 1.2959959506988525, + "learning_rate": 2.0680000000000002e-05, + "loss": 0.4027, + "step": 4139 + }, + { + "epoch": 0.23182887221413373, + "grad_norm": 1.315169095993042, + "learning_rate": 2.0685000000000003e-05, + "loss": 0.4878, + "step": 4140 + }, + { + "epoch": 0.23188486952626275, + "grad_norm": 1.1356548070907593, + "learning_rate": 2.069e-05, + "loss": 0.4531, + "step": 4141 + }, + { + "epoch": 0.23194086683839177, + "grad_norm": 1.150536060333252, + "learning_rate": 2.0695e-05, + "loss": 0.4032, + "step": 4142 + }, + { + "epoch": 0.2319968641505208, + "grad_norm": 1.3828458786010742, + "learning_rate": 2.07e-05, + "loss": 0.6142, + "step": 4143 + }, + { + "epoch": 0.2320528614626498, + "grad_norm": 1.0190892219543457, + "learning_rate": 2.0705000000000003e-05, + "loss": 0.4076, + "step": 4144 + }, + { + 
"epoch": 0.2321088587747788, + "grad_norm": 1.242693543434143, + "learning_rate": 2.0710000000000003e-05, + "loss": 0.4358, + "step": 4145 + }, + { + "epoch": 0.23216485608690782, + "grad_norm": 1.4580390453338623, + "learning_rate": 2.0715e-05, + "loss": 0.5017, + "step": 4146 + }, + { + "epoch": 0.23222085339903684, + "grad_norm": 1.630743384361267, + "learning_rate": 2.072e-05, + "loss": 0.5403, + "step": 4147 + }, + { + "epoch": 0.23227685071116586, + "grad_norm": 1.3127927780151367, + "learning_rate": 2.0725e-05, + "loss": 0.5772, + "step": 4148 + }, + { + "epoch": 0.23233284802329487, + "grad_norm": 1.32776939868927, + "learning_rate": 2.0730000000000003e-05, + "loss": 0.4103, + "step": 4149 + }, + { + "epoch": 0.2323888453354239, + "grad_norm": 1.1280535459518433, + "learning_rate": 2.0735e-05, + "loss": 0.3944, + "step": 4150 + }, + { + "epoch": 0.2324448426475529, + "grad_norm": 1.0871868133544922, + "learning_rate": 2.074e-05, + "loss": 0.4578, + "step": 4151 + }, + { + "epoch": 0.23250083995968193, + "grad_norm": 1.211596965789795, + "learning_rate": 2.0745000000000002e-05, + "loss": 0.3587, + "step": 4152 + }, + { + "epoch": 0.23255683727181095, + "grad_norm": 1.4957422018051147, + "learning_rate": 2.075e-05, + "loss": 0.4731, + "step": 4153 + }, + { + "epoch": 0.23261283458393997, + "grad_norm": 1.413071632385254, + "learning_rate": 2.0755000000000004e-05, + "loss": 0.5488, + "step": 4154 + }, + { + "epoch": 0.232668831896069, + "grad_norm": 1.5196342468261719, + "learning_rate": 2.076e-05, + "loss": 0.4164, + "step": 4155 + }, + { + "epoch": 0.232724829208198, + "grad_norm": 4.492232799530029, + "learning_rate": 2.0765000000000002e-05, + "loss": 0.589, + "step": 4156 + }, + { + "epoch": 0.23278082652032703, + "grad_norm": 1.2077269554138184, + "learning_rate": 2.077e-05, + "loss": 0.3548, + "step": 4157 + }, + { + "epoch": 0.23283682383245605, + "grad_norm": 0.9782938957214355, + "learning_rate": 2.0775e-05, + "loss": 0.301, + "step": 4158 + }, + { + 
"epoch": 0.23289282114458507, + "grad_norm": 1.1544853448867798, + "learning_rate": 2.078e-05, + "loss": 0.428, + "step": 4159 + }, + { + "epoch": 0.23294881845671409, + "grad_norm": 1.3924914598464966, + "learning_rate": 2.0785000000000002e-05, + "loss": 0.3908, + "step": 4160 + }, + { + "epoch": 0.2330048157688431, + "grad_norm": 1.404755711555481, + "learning_rate": 2.0790000000000003e-05, + "loss": 0.5763, + "step": 4161 + }, + { + "epoch": 0.23306081308097212, + "grad_norm": 1.3203518390655518, + "learning_rate": 2.0795e-05, + "loss": 0.4953, + "step": 4162 + }, + { + "epoch": 0.23311681039310114, + "grad_norm": 1.2612299919128418, + "learning_rate": 2.08e-05, + "loss": 0.3418, + "step": 4163 + }, + { + "epoch": 0.23317280770523016, + "grad_norm": 1.3685733079910278, + "learning_rate": 2.0805e-05, + "loss": 0.4777, + "step": 4164 + }, + { + "epoch": 0.23322880501735915, + "grad_norm": 1.4660611152648926, + "learning_rate": 2.0810000000000002e-05, + "loss": 0.5761, + "step": 4165 + }, + { + "epoch": 0.23328480232948817, + "grad_norm": 2.6750452518463135, + "learning_rate": 2.0815e-05, + "loss": 0.3986, + "step": 4166 + }, + { + "epoch": 0.2333407996416172, + "grad_norm": 1.106176495552063, + "learning_rate": 2.082e-05, + "loss": 0.4198, + "step": 4167 + }, + { + "epoch": 0.2333967969537462, + "grad_norm": 1.3173162937164307, + "learning_rate": 2.0825e-05, + "loss": 0.6655, + "step": 4168 + }, + { + "epoch": 0.23345279426587523, + "grad_norm": 1.1135119199752808, + "learning_rate": 2.0830000000000002e-05, + "loss": 0.4669, + "step": 4169 + }, + { + "epoch": 0.23350879157800425, + "grad_norm": 1.1758356094360352, + "learning_rate": 2.0835000000000003e-05, + "loss": 0.4082, + "step": 4170 + }, + { + "epoch": 0.23356478889013327, + "grad_norm": 1.1846275329589844, + "learning_rate": 2.084e-05, + "loss": 0.4446, + "step": 4171 + }, + { + "epoch": 0.2336207862022623, + "grad_norm": 1.242761492729187, + "learning_rate": 2.0845e-05, + "loss": 0.3611, + "step": 4172 + 
}, + { + "epoch": 0.2336767835143913, + "grad_norm": 1.046606183052063, + "learning_rate": 2.085e-05, + "loss": 0.424, + "step": 4173 + }, + { + "epoch": 0.23373278082652033, + "grad_norm": 3.2558372020721436, + "learning_rate": 2.0855000000000003e-05, + "loss": 0.4671, + "step": 4174 + }, + { + "epoch": 0.23378877813864934, + "grad_norm": 1.3819587230682373, + "learning_rate": 2.086e-05, + "loss": 0.4429, + "step": 4175 + }, + { + "epoch": 0.23384477545077836, + "grad_norm": 0.9712210297584534, + "learning_rate": 2.0865e-05, + "loss": 0.3406, + "step": 4176 + }, + { + "epoch": 0.23390077276290738, + "grad_norm": 1.188122034072876, + "learning_rate": 2.0870000000000002e-05, + "loss": 0.4788, + "step": 4177 + }, + { + "epoch": 0.2339567700750364, + "grad_norm": 1.0752432346343994, + "learning_rate": 2.0875e-05, + "loss": 0.471, + "step": 4178 + }, + { + "epoch": 0.23401276738716542, + "grad_norm": 1.4987022876739502, + "learning_rate": 2.0880000000000003e-05, + "loss": 0.4276, + "step": 4179 + }, + { + "epoch": 0.23406876469929444, + "grad_norm": 1.4521417617797852, + "learning_rate": 2.0885e-05, + "loss": 0.3481, + "step": 4180 + }, + { + "epoch": 0.23412476201142346, + "grad_norm": 1.2257338762283325, + "learning_rate": 2.089e-05, + "loss": 0.4603, + "step": 4181 + }, + { + "epoch": 0.23418075932355248, + "grad_norm": 1.2818821668624878, + "learning_rate": 2.0895e-05, + "loss": 0.3515, + "step": 4182 + }, + { + "epoch": 0.2342367566356815, + "grad_norm": 1.5747284889221191, + "learning_rate": 2.09e-05, + "loss": 0.4984, + "step": 4183 + }, + { + "epoch": 0.23429275394781052, + "grad_norm": 1.2921171188354492, + "learning_rate": 2.0905000000000004e-05, + "loss": 0.5154, + "step": 4184 + }, + { + "epoch": 0.23434875125993954, + "grad_norm": 1.3256133794784546, + "learning_rate": 2.091e-05, + "loss": 0.4911, + "step": 4185 + }, + { + "epoch": 0.23440474857206853, + "grad_norm": 1.235826015472412, + "learning_rate": 2.0915000000000002e-05, + "loss": 0.3948, + "step": 
4186 + }, + { + "epoch": 0.23446074588419755, + "grad_norm": 1.1025710105895996, + "learning_rate": 2.092e-05, + "loss": 0.3685, + "step": 4187 + }, + { + "epoch": 0.23451674319632657, + "grad_norm": 1.6832563877105713, + "learning_rate": 2.0925e-05, + "loss": 0.4591, + "step": 4188 + }, + { + "epoch": 0.23457274050845558, + "grad_norm": 1.313822627067566, + "learning_rate": 2.093e-05, + "loss": 0.483, + "step": 4189 + }, + { + "epoch": 0.2346287378205846, + "grad_norm": 1.299112319946289, + "learning_rate": 2.0935000000000002e-05, + "loss": 0.5008, + "step": 4190 + }, + { + "epoch": 0.23468473513271362, + "grad_norm": 1.257529616355896, + "learning_rate": 2.0940000000000003e-05, + "loss": 0.5007, + "step": 4191 + }, + { + "epoch": 0.23474073244484264, + "grad_norm": 3.786053419113159, + "learning_rate": 2.0945e-05, + "loss": 0.4312, + "step": 4192 + }, + { + "epoch": 0.23479672975697166, + "grad_norm": 1.1573549509048462, + "learning_rate": 2.095e-05, + "loss": 0.3606, + "step": 4193 + }, + { + "epoch": 0.23485272706910068, + "grad_norm": 1.4084893465042114, + "learning_rate": 2.0955e-05, + "loss": 0.5302, + "step": 4194 + }, + { + "epoch": 0.2349087243812297, + "grad_norm": 1.0694756507873535, + "learning_rate": 2.0960000000000003e-05, + "loss": 0.309, + "step": 4195 + }, + { + "epoch": 0.23496472169335872, + "grad_norm": 1.0330634117126465, + "learning_rate": 2.0965e-05, + "loss": 0.3656, + "step": 4196 + }, + { + "epoch": 0.23502071900548774, + "grad_norm": 1.2291839122772217, + "learning_rate": 2.097e-05, + "loss": 0.3507, + "step": 4197 + }, + { + "epoch": 0.23507671631761676, + "grad_norm": 1.1577256917953491, + "learning_rate": 2.0975e-05, + "loss": 0.3888, + "step": 4198 + }, + { + "epoch": 0.23513271362974578, + "grad_norm": 1.1568584442138672, + "learning_rate": 2.098e-05, + "loss": 0.5337, + "step": 4199 + }, + { + "epoch": 0.2351887109418748, + "grad_norm": 1.1812026500701904, + "learning_rate": 2.0985000000000003e-05, + "loss": 0.4119, + "step": 4200 
+ }, + { + "epoch": 0.23524470825400381, + "grad_norm": 1.2275822162628174, + "learning_rate": 2.099e-05, + "loss": 0.353, + "step": 4201 + }, + { + "epoch": 0.23530070556613283, + "grad_norm": 1.1405919790267944, + "learning_rate": 2.0995e-05, + "loss": 0.4058, + "step": 4202 + }, + { + "epoch": 0.23535670287826185, + "grad_norm": 1.1904031038284302, + "learning_rate": 2.1e-05, + "loss": 0.3385, + "step": 4203 + }, + { + "epoch": 0.23541270019039087, + "grad_norm": 1.063535213470459, + "learning_rate": 2.1005e-05, + "loss": 0.3502, + "step": 4204 + }, + { + "epoch": 0.2354686975025199, + "grad_norm": 1.3525657653808594, + "learning_rate": 2.101e-05, + "loss": 0.5297, + "step": 4205 + }, + { + "epoch": 0.2355246948146489, + "grad_norm": 1.166908621788025, + "learning_rate": 2.1015e-05, + "loss": 0.3891, + "step": 4206 + }, + { + "epoch": 0.2355806921267779, + "grad_norm": 1.3686550855636597, + "learning_rate": 2.1020000000000002e-05, + "loss": 0.497, + "step": 4207 + }, + { + "epoch": 0.23563668943890692, + "grad_norm": 1.311750888824463, + "learning_rate": 2.1025e-05, + "loss": 0.4479, + "step": 4208 + }, + { + "epoch": 0.23569268675103594, + "grad_norm": 1.2289323806762695, + "learning_rate": 2.103e-05, + "loss": 0.4434, + "step": 4209 + }, + { + "epoch": 0.23574868406316496, + "grad_norm": 1.266406536102295, + "learning_rate": 2.1035e-05, + "loss": 0.4205, + "step": 4210 + }, + { + "epoch": 0.23580468137529398, + "grad_norm": 1.1448001861572266, + "learning_rate": 2.1040000000000002e-05, + "loss": 0.3897, + "step": 4211 + }, + { + "epoch": 0.235860678687423, + "grad_norm": 1.3236624002456665, + "learning_rate": 2.1045e-05, + "loss": 0.4088, + "step": 4212 + }, + { + "epoch": 0.23591667599955202, + "grad_norm": 1.2581989765167236, + "learning_rate": 2.105e-05, + "loss": 0.379, + "step": 4213 + }, + { + "epoch": 0.23597267331168104, + "grad_norm": 1.0380505323410034, + "learning_rate": 2.1055e-05, + "loss": 0.3879, + "step": 4214 + }, + { + "epoch": 
0.23602867062381006, + "grad_norm": 1.1456139087677002, + "learning_rate": 2.106e-05, + "loss": 0.4532, + "step": 4215 + }, + { + "epoch": 0.23608466793593907, + "grad_norm": 1.1638354063034058, + "learning_rate": 2.1065000000000002e-05, + "loss": 0.4703, + "step": 4216 + }, + { + "epoch": 0.2361406652480681, + "grad_norm": 1.608616590499878, + "learning_rate": 2.107e-05, + "loss": 0.5295, + "step": 4217 + }, + { + "epoch": 0.2361966625601971, + "grad_norm": 1.3021283149719238, + "learning_rate": 2.1075e-05, + "loss": 0.3815, + "step": 4218 + }, + { + "epoch": 0.23625265987232613, + "grad_norm": 5.347463607788086, + "learning_rate": 2.1079999999999998e-05, + "loss": 0.3893, + "step": 4219 + }, + { + "epoch": 0.23630865718445515, + "grad_norm": 1.0997823476791382, + "learning_rate": 2.1085000000000002e-05, + "loss": 0.3264, + "step": 4220 + }, + { + "epoch": 0.23636465449658417, + "grad_norm": 1.0885735750198364, + "learning_rate": 2.1090000000000003e-05, + "loss": 0.3894, + "step": 4221 + }, + { + "epoch": 0.2364206518087132, + "grad_norm": 1.0570690631866455, + "learning_rate": 2.1095e-05, + "loss": 0.374, + "step": 4222 + }, + { + "epoch": 0.2364766491208422, + "grad_norm": 1.38384211063385, + "learning_rate": 2.11e-05, + "loss": 0.4647, + "step": 4223 + }, + { + "epoch": 0.23653264643297123, + "grad_norm": 1.2475320100784302, + "learning_rate": 2.1105e-05, + "loss": 0.5061, + "step": 4224 + }, + { + "epoch": 0.23658864374510025, + "grad_norm": 1.3273953199386597, + "learning_rate": 2.1110000000000003e-05, + "loss": 0.4165, + "step": 4225 + }, + { + "epoch": 0.23664464105722927, + "grad_norm": 1.0664968490600586, + "learning_rate": 2.1115e-05, + "loss": 0.3721, + "step": 4226 + }, + { + "epoch": 0.23670063836935826, + "grad_norm": 1.255656361579895, + "learning_rate": 2.112e-05, + "loss": 0.4174, + "step": 4227 + }, + { + "epoch": 0.23675663568148728, + "grad_norm": 1.2060269117355347, + "learning_rate": 2.1125000000000002e-05, + "loss": 0.4766, + "step": 4228 + 
}, + { + "epoch": 0.2368126329936163, + "grad_norm": 1.2741271257400513, + "learning_rate": 2.113e-05, + "loss": 0.4044, + "step": 4229 + }, + { + "epoch": 0.23686863030574531, + "grad_norm": 1.3632087707519531, + "learning_rate": 2.1135000000000003e-05, + "loss": 0.4712, + "step": 4230 + }, + { + "epoch": 0.23692462761787433, + "grad_norm": 1.3004961013793945, + "learning_rate": 2.114e-05, + "loss": 0.4695, + "step": 4231 + }, + { + "epoch": 0.23698062493000335, + "grad_norm": 1.64434814453125, + "learning_rate": 2.1145e-05, + "loss": 0.463, + "step": 4232 + }, + { + "epoch": 0.23703662224213237, + "grad_norm": 1.0939937829971313, + "learning_rate": 2.115e-05, + "loss": 0.4293, + "step": 4233 + }, + { + "epoch": 0.2370926195542614, + "grad_norm": 1.2327302694320679, + "learning_rate": 2.1155e-05, + "loss": 0.4478, + "step": 4234 + }, + { + "epoch": 0.2371486168663904, + "grad_norm": 1.1831134557724, + "learning_rate": 2.116e-05, + "loss": 0.4402, + "step": 4235 + }, + { + "epoch": 0.23720461417851943, + "grad_norm": 1.0770121812820435, + "learning_rate": 2.1165e-05, + "loss": 0.6058, + "step": 4236 + }, + { + "epoch": 0.23726061149064845, + "grad_norm": 1.3046811819076538, + "learning_rate": 2.1170000000000002e-05, + "loss": 0.4307, + "step": 4237 + }, + { + "epoch": 0.23731660880277747, + "grad_norm": 1.263245701789856, + "learning_rate": 2.1175e-05, + "loss": 0.4071, + "step": 4238 + }, + { + "epoch": 0.2373726061149065, + "grad_norm": 1.2050762176513672, + "learning_rate": 2.118e-05, + "loss": 0.5376, + "step": 4239 + }, + { + "epoch": 0.2374286034270355, + "grad_norm": 1.1449065208435059, + "learning_rate": 2.1185e-05, + "loss": 0.4948, + "step": 4240 + }, + { + "epoch": 0.23748460073916453, + "grad_norm": 1.1334929466247559, + "learning_rate": 2.1190000000000002e-05, + "loss": 0.3879, + "step": 4241 + }, + { + "epoch": 0.23754059805129354, + "grad_norm": 1.0854896306991577, + "learning_rate": 2.1195e-05, + "loss": 0.3292, + "step": 4242 + }, + { + "epoch": 
0.23759659536342256, + "grad_norm": 1.4370067119598389, + "learning_rate": 2.12e-05, + "loss": 0.6236, + "step": 4243 + }, + { + "epoch": 0.23765259267555158, + "grad_norm": 1.1262603998184204, + "learning_rate": 2.1205e-05, + "loss": 0.4683, + "step": 4244 + }, + { + "epoch": 0.2377085899876806, + "grad_norm": 1.2429581880569458, + "learning_rate": 2.1210000000000002e-05, + "loss": 0.4212, + "step": 4245 + }, + { + "epoch": 0.23776458729980962, + "grad_norm": 1.2421106100082397, + "learning_rate": 2.1215000000000003e-05, + "loss": 0.6051, + "step": 4246 + }, + { + "epoch": 0.23782058461193864, + "grad_norm": 1.3536938428878784, + "learning_rate": 2.122e-05, + "loss": 0.5184, + "step": 4247 + }, + { + "epoch": 0.23787658192406763, + "grad_norm": 1.1302820444107056, + "learning_rate": 2.1225e-05, + "loss": 0.4098, + "step": 4248 + }, + { + "epoch": 0.23793257923619665, + "grad_norm": 1.2246413230895996, + "learning_rate": 2.123e-05, + "loss": 0.4006, + "step": 4249 + }, + { + "epoch": 0.23798857654832567, + "grad_norm": 2.1079390048980713, + "learning_rate": 2.1235000000000003e-05, + "loss": 0.483, + "step": 4250 + }, + { + "epoch": 0.2380445738604547, + "grad_norm": 1.4226256608963013, + "learning_rate": 2.124e-05, + "loss": 0.4082, + "step": 4251 + }, + { + "epoch": 0.2381005711725837, + "grad_norm": 1.766867995262146, + "learning_rate": 2.1245e-05, + "loss": 0.3793, + "step": 4252 + }, + { + "epoch": 0.23815656848471273, + "grad_norm": 1.7468613386154175, + "learning_rate": 2.125e-05, + "loss": 0.6336, + "step": 4253 + }, + { + "epoch": 0.23821256579684175, + "grad_norm": 1.261499285697937, + "learning_rate": 2.1255e-05, + "loss": 0.4244, + "step": 4254 + }, + { + "epoch": 0.23826856310897077, + "grad_norm": 2.6207706928253174, + "learning_rate": 2.1260000000000003e-05, + "loss": 0.4324, + "step": 4255 + }, + { + "epoch": 0.23832456042109978, + "grad_norm": 1.1398931741714478, + "learning_rate": 2.1265e-05, + "loss": 0.4442, + "step": 4256 + }, + { + "epoch": 
0.2383805577332288, + "grad_norm": 1.4394181966781616, + "learning_rate": 2.127e-05, + "loss": 0.4277, + "step": 4257 + }, + { + "epoch": 0.23843655504535782, + "grad_norm": 1.1773593425750732, + "learning_rate": 2.1275000000000002e-05, + "loss": 0.4644, + "step": 4258 + }, + { + "epoch": 0.23849255235748684, + "grad_norm": 1.4122238159179688, + "learning_rate": 2.128e-05, + "loss": 0.5031, + "step": 4259 + }, + { + "epoch": 0.23854854966961586, + "grad_norm": 1.206392765045166, + "learning_rate": 2.1285000000000004e-05, + "loss": 0.407, + "step": 4260 + }, + { + "epoch": 0.23860454698174488, + "grad_norm": 1.189375638961792, + "learning_rate": 2.129e-05, + "loss": 0.3432, + "step": 4261 + }, + { + "epoch": 0.2386605442938739, + "grad_norm": 1.1893759965896606, + "learning_rate": 2.1295000000000002e-05, + "loss": 0.4062, + "step": 4262 + }, + { + "epoch": 0.23871654160600292, + "grad_norm": 1.5574356317520142, + "learning_rate": 2.13e-05, + "loss": 0.3862, + "step": 4263 + }, + { + "epoch": 0.23877253891813194, + "grad_norm": 1.6357659101486206, + "learning_rate": 2.1305e-05, + "loss": 0.4128, + "step": 4264 + }, + { + "epoch": 0.23882853623026096, + "grad_norm": 1.0961475372314453, + "learning_rate": 2.131e-05, + "loss": 0.303, + "step": 4265 + }, + { + "epoch": 0.23888453354238998, + "grad_norm": 1.287390947341919, + "learning_rate": 2.1315000000000002e-05, + "loss": 0.5198, + "step": 4266 + }, + { + "epoch": 0.238940530854519, + "grad_norm": 1.3111846446990967, + "learning_rate": 2.1320000000000003e-05, + "loss": 0.4619, + "step": 4267 + }, + { + "epoch": 0.23899652816664801, + "grad_norm": 1.5583844184875488, + "learning_rate": 2.1325e-05, + "loss": 0.3147, + "step": 4268 + }, + { + "epoch": 0.239052525478777, + "grad_norm": 1.2882252931594849, + "learning_rate": 2.133e-05, + "loss": 0.3848, + "step": 4269 + }, + { + "epoch": 0.23910852279090603, + "grad_norm": 1.083142638206482, + "learning_rate": 2.1335e-05, + "loss": 0.4959, + "step": 4270 + }, + { + 
"epoch": 0.23916452010303504, + "grad_norm": 1.1915721893310547, + "learning_rate": 2.1340000000000002e-05, + "loss": 0.3234, + "step": 4271 + }, + { + "epoch": 0.23922051741516406, + "grad_norm": 1.1333130598068237, + "learning_rate": 2.1345e-05, + "loss": 0.371, + "step": 4272 + }, + { + "epoch": 0.23927651472729308, + "grad_norm": 1.2794235944747925, + "learning_rate": 2.135e-05, + "loss": 0.415, + "step": 4273 + }, + { + "epoch": 0.2393325120394221, + "grad_norm": 1.1736987829208374, + "learning_rate": 2.1355e-05, + "loss": 0.3776, + "step": 4274 + }, + { + "epoch": 0.23938850935155112, + "grad_norm": 1.090275526046753, + "learning_rate": 2.1360000000000002e-05, + "loss": 0.433, + "step": 4275 + }, + { + "epoch": 0.23944450666368014, + "grad_norm": 1.1244940757751465, + "learning_rate": 2.1365000000000003e-05, + "loss": 0.5467, + "step": 4276 + }, + { + "epoch": 0.23950050397580916, + "grad_norm": 1.1872576475143433, + "learning_rate": 2.137e-05, + "loss": 0.4813, + "step": 4277 + }, + { + "epoch": 0.23955650128793818, + "grad_norm": 1.2008262872695923, + "learning_rate": 2.1375e-05, + "loss": 0.4253, + "step": 4278 + }, + { + "epoch": 0.2396124986000672, + "grad_norm": 1.2897939682006836, + "learning_rate": 2.138e-05, + "loss": 0.415, + "step": 4279 + }, + { + "epoch": 0.23966849591219622, + "grad_norm": 1.1022006273269653, + "learning_rate": 2.1385000000000003e-05, + "loss": 0.4679, + "step": 4280 + }, + { + "epoch": 0.23972449322432524, + "grad_norm": 1.2431297302246094, + "learning_rate": 2.139e-05, + "loss": 0.3932, + "step": 4281 + }, + { + "epoch": 0.23978049053645425, + "grad_norm": 1.5248690843582153, + "learning_rate": 2.1395e-05, + "loss": 0.4623, + "step": 4282 + }, + { + "epoch": 0.23983648784858327, + "grad_norm": 1.0249652862548828, + "learning_rate": 2.1400000000000002e-05, + "loss": 0.3644, + "step": 4283 + }, + { + "epoch": 0.2398924851607123, + "grad_norm": 1.4337689876556396, + "learning_rate": 2.1405e-05, + "loss": 0.4221, + "step": 4284 + 
}, + { + "epoch": 0.2399484824728413, + "grad_norm": 1.283555269241333, + "learning_rate": 2.1410000000000003e-05, + "loss": 0.453, + "step": 4285 + }, + { + "epoch": 0.24000447978497033, + "grad_norm": 1.3956892490386963, + "learning_rate": 2.1415e-05, + "loss": 0.4818, + "step": 4286 + }, + { + "epoch": 0.24006047709709935, + "grad_norm": 1.484602451324463, + "learning_rate": 2.142e-05, + "loss": 0.5885, + "step": 4287 + }, + { + "epoch": 0.24011647440922837, + "grad_norm": 1.120192050933838, + "learning_rate": 2.1425e-05, + "loss": 0.366, + "step": 4288 + }, + { + "epoch": 0.24017247172135736, + "grad_norm": 5.211393356323242, + "learning_rate": 2.143e-05, + "loss": 0.451, + "step": 4289 + }, + { + "epoch": 0.24022846903348638, + "grad_norm": 1.0974055528640747, + "learning_rate": 2.1435000000000004e-05, + "loss": 0.3342, + "step": 4290 + }, + { + "epoch": 0.2402844663456154, + "grad_norm": 1.5038377046585083, + "learning_rate": 2.144e-05, + "loss": 0.4989, + "step": 4291 + }, + { + "epoch": 0.24034046365774442, + "grad_norm": 0.933678150177002, + "learning_rate": 2.1445000000000002e-05, + "loss": 0.2991, + "step": 4292 + }, + { + "epoch": 0.24039646096987344, + "grad_norm": 1.149804949760437, + "learning_rate": 2.145e-05, + "loss": 0.5103, + "step": 4293 + }, + { + "epoch": 0.24045245828200246, + "grad_norm": 1.0830732583999634, + "learning_rate": 2.1455e-05, + "loss": 0.4144, + "step": 4294 + }, + { + "epoch": 0.24050845559413148, + "grad_norm": 1.2888356447219849, + "learning_rate": 2.146e-05, + "loss": 0.4835, + "step": 4295 + }, + { + "epoch": 0.2405644529062605, + "grad_norm": 5.33606481552124, + "learning_rate": 2.1465000000000002e-05, + "loss": 0.5273, + "step": 4296 + }, + { + "epoch": 0.24062045021838951, + "grad_norm": 1.4628833532333374, + "learning_rate": 2.1470000000000003e-05, + "loss": 0.4482, + "step": 4297 + }, + { + "epoch": 0.24067644753051853, + "grad_norm": 1.1477042436599731, + "learning_rate": 2.1475e-05, + "loss": 0.5156, + "step": 4298 
+ }, + { + "epoch": 0.24073244484264755, + "grad_norm": 1.2502121925354004, + "learning_rate": 2.148e-05, + "loss": 0.3495, + "step": 4299 + }, + { + "epoch": 0.24078844215477657, + "grad_norm": 1.1966519355773926, + "learning_rate": 2.1485000000000002e-05, + "loss": 0.4084, + "step": 4300 + }, + { + "epoch": 0.2408444394669056, + "grad_norm": 1.095430850982666, + "learning_rate": 2.1490000000000003e-05, + "loss": 0.4155, + "step": 4301 + }, + { + "epoch": 0.2409004367790346, + "grad_norm": 1.3317145109176636, + "learning_rate": 2.1495e-05, + "loss": 0.3883, + "step": 4302 + }, + { + "epoch": 0.24095643409116363, + "grad_norm": 1.2294490337371826, + "learning_rate": 2.15e-05, + "loss": 0.5213, + "step": 4303 + }, + { + "epoch": 0.24101243140329265, + "grad_norm": 1.0221933126449585, + "learning_rate": 2.1505e-05, + "loss": 0.2962, + "step": 4304 + }, + { + "epoch": 0.24106842871542167, + "grad_norm": 1.0682564973831177, + "learning_rate": 2.1510000000000002e-05, + "loss": 0.4275, + "step": 4305 + }, + { + "epoch": 0.2411244260275507, + "grad_norm": 1.2987785339355469, + "learning_rate": 2.1515000000000003e-05, + "loss": 0.3958, + "step": 4306 + }, + { + "epoch": 0.2411804233396797, + "grad_norm": 1.083464503288269, + "learning_rate": 2.152e-05, + "loss": 0.4409, + "step": 4307 + }, + { + "epoch": 0.24123642065180873, + "grad_norm": 1.1261757612228394, + "learning_rate": 2.1525e-05, + "loss": 0.4545, + "step": 4308 + }, + { + "epoch": 0.24129241796393774, + "grad_norm": 1.1840627193450928, + "learning_rate": 2.153e-05, + "loss": 0.4417, + "step": 4309 + }, + { + "epoch": 0.24134841527606674, + "grad_norm": 2.27750301361084, + "learning_rate": 2.1535000000000003e-05, + "loss": 0.5773, + "step": 4310 + }, + { + "epoch": 0.24140441258819575, + "grad_norm": 1.263756513595581, + "learning_rate": 2.154e-05, + "loss": 0.4707, + "step": 4311 + }, + { + "epoch": 0.24146040990032477, + "grad_norm": 1.3066948652267456, + "learning_rate": 2.1545e-05, + "loss": 0.4886, + "step": 
4312 + }, + { + "epoch": 0.2415164072124538, + "grad_norm": 1.117456078529358, + "learning_rate": 2.1550000000000002e-05, + "loss": 0.4034, + "step": 4313 + }, + { + "epoch": 0.2415724045245828, + "grad_norm": 1.612994909286499, + "learning_rate": 2.1555e-05, + "loss": 0.3843, + "step": 4314 + }, + { + "epoch": 0.24162840183671183, + "grad_norm": 1.3376786708831787, + "learning_rate": 2.1560000000000004e-05, + "loss": 0.5396, + "step": 4315 + }, + { + "epoch": 0.24168439914884085, + "grad_norm": 1.0367299318313599, + "learning_rate": 2.1565e-05, + "loss": 0.3505, + "step": 4316 + }, + { + "epoch": 0.24174039646096987, + "grad_norm": 1.2690515518188477, + "learning_rate": 2.1570000000000002e-05, + "loss": 0.4122, + "step": 4317 + }, + { + "epoch": 0.2417963937730989, + "grad_norm": 1.1069188117980957, + "learning_rate": 2.1575e-05, + "loss": 0.4649, + "step": 4318 + }, + { + "epoch": 0.2418523910852279, + "grad_norm": 1.1699316501617432, + "learning_rate": 2.158e-05, + "loss": 0.531, + "step": 4319 + }, + { + "epoch": 0.24190838839735693, + "grad_norm": 1.2313868999481201, + "learning_rate": 2.1585e-05, + "loss": 0.4042, + "step": 4320 + }, + { + "epoch": 0.24196438570948595, + "grad_norm": 1.5419354438781738, + "learning_rate": 2.159e-05, + "loss": 0.5642, + "step": 4321 + }, + { + "epoch": 0.24202038302161497, + "grad_norm": 1.1893784999847412, + "learning_rate": 2.1595000000000002e-05, + "loss": 0.3844, + "step": 4322 + }, + { + "epoch": 0.24207638033374398, + "grad_norm": 1.4173463582992554, + "learning_rate": 2.16e-05, + "loss": 0.5144, + "step": 4323 + }, + { + "epoch": 0.242132377645873, + "grad_norm": 0.9899412393569946, + "learning_rate": 2.1605e-05, + "loss": 0.3475, + "step": 4324 + }, + { + "epoch": 0.24218837495800202, + "grad_norm": 1.3944251537322998, + "learning_rate": 2.1609999999999998e-05, + "loss": 0.4864, + "step": 4325 + }, + { + "epoch": 0.24224437227013104, + "grad_norm": 1.2878979444503784, + "learning_rate": 2.1615000000000002e-05, + 
"loss": 0.3878, + "step": 4326 + }, + { + "epoch": 0.24230036958226006, + "grad_norm": 1.1655343770980835, + "learning_rate": 2.162e-05, + "loss": 0.284, + "step": 4327 + }, + { + "epoch": 0.24235636689438908, + "grad_norm": 1.329520583152771, + "learning_rate": 2.1625e-05, + "loss": 0.518, + "step": 4328 + }, + { + "epoch": 0.2424123642065181, + "grad_norm": 1.3053680658340454, + "learning_rate": 2.163e-05, + "loss": 0.5209, + "step": 4329 + }, + { + "epoch": 0.24246836151864712, + "grad_norm": 1.5850566625595093, + "learning_rate": 2.1635e-05, + "loss": 0.4758, + "step": 4330 + }, + { + "epoch": 0.2425243588307761, + "grad_norm": 1.0623127222061157, + "learning_rate": 2.1640000000000003e-05, + "loss": 0.4205, + "step": 4331 + }, + { + "epoch": 0.24258035614290513, + "grad_norm": 1.486602783203125, + "learning_rate": 2.1645e-05, + "loss": 0.5736, + "step": 4332 + }, + { + "epoch": 0.24263635345503415, + "grad_norm": 1.1264127492904663, + "learning_rate": 2.165e-05, + "loss": 0.3344, + "step": 4333 + }, + { + "epoch": 0.24269235076716317, + "grad_norm": 1.114925503730774, + "learning_rate": 2.1655000000000002e-05, + "loss": 0.3654, + "step": 4334 + }, + { + "epoch": 0.2427483480792922, + "grad_norm": 0.9926008582115173, + "learning_rate": 2.166e-05, + "loss": 0.4218, + "step": 4335 + }, + { + "epoch": 0.2428043453914212, + "grad_norm": 1.0779087543487549, + "learning_rate": 2.1665000000000003e-05, + "loss": 0.3215, + "step": 4336 + }, + { + "epoch": 0.24286034270355022, + "grad_norm": 1.2170718908309937, + "learning_rate": 2.167e-05, + "loss": 0.6787, + "step": 4337 + }, + { + "epoch": 0.24291634001567924, + "grad_norm": 1.7566055059432983, + "learning_rate": 2.1675e-05, + "loss": 0.5167, + "step": 4338 + }, + { + "epoch": 0.24297233732780826, + "grad_norm": 1.25746488571167, + "learning_rate": 2.168e-05, + "loss": 0.4718, + "step": 4339 + }, + { + "epoch": 0.24302833463993728, + "grad_norm": 1.1558181047439575, + "learning_rate": 2.1685e-05, + "loss": 0.475, + 
"step": 4340 + }, + { + "epoch": 0.2430843319520663, + "grad_norm": 9.28864860534668, + "learning_rate": 2.169e-05, + "loss": 0.4129, + "step": 4341 + }, + { + "epoch": 0.24314032926419532, + "grad_norm": 1.0949761867523193, + "learning_rate": 2.1695e-05, + "loss": 0.3915, + "step": 4342 + }, + { + "epoch": 0.24319632657632434, + "grad_norm": 1.1187132596969604, + "learning_rate": 2.1700000000000002e-05, + "loss": 0.3921, + "step": 4343 + }, + { + "epoch": 0.24325232388845336, + "grad_norm": 1.0224883556365967, + "learning_rate": 2.1705e-05, + "loss": 0.3613, + "step": 4344 + }, + { + "epoch": 0.24330832120058238, + "grad_norm": 1.2836576700210571, + "learning_rate": 2.171e-05, + "loss": 0.4491, + "step": 4345 + }, + { + "epoch": 0.2433643185127114, + "grad_norm": 1.13035249710083, + "learning_rate": 2.1715e-05, + "loss": 0.3769, + "step": 4346 + }, + { + "epoch": 0.24342031582484042, + "grad_norm": 1.0483314990997314, + "learning_rate": 2.1720000000000002e-05, + "loss": 0.4401, + "step": 4347 + }, + { + "epoch": 0.24347631313696944, + "grad_norm": 1.3753732442855835, + "learning_rate": 2.1725e-05, + "loss": 0.5379, + "step": 4348 + }, + { + "epoch": 0.24353231044909845, + "grad_norm": 4.958712100982666, + "learning_rate": 2.173e-05, + "loss": 0.3993, + "step": 4349 + }, + { + "epoch": 0.24358830776122747, + "grad_norm": 1.0264718532562256, + "learning_rate": 2.1735e-05, + "loss": 0.3764, + "step": 4350 + }, + { + "epoch": 0.24364430507335647, + "grad_norm": 1.439278483390808, + "learning_rate": 2.1740000000000002e-05, + "loss": 0.651, + "step": 4351 + }, + { + "epoch": 0.24370030238548548, + "grad_norm": 1.1682460308074951, + "learning_rate": 2.1745000000000003e-05, + "loss": 0.3624, + "step": 4352 + }, + { + "epoch": 0.2437562996976145, + "grad_norm": 1.3046634197235107, + "learning_rate": 2.175e-05, + "loss": 0.4369, + "step": 4353 + }, + { + "epoch": 0.24381229700974352, + "grad_norm": 1.123918890953064, + "learning_rate": 2.1755e-05, + "loss": 0.4421, + 
"step": 4354 + }, + { + "epoch": 0.24386829432187254, + "grad_norm": 1.194492220878601, + "learning_rate": 2.176e-05, + "loss": 0.3583, + "step": 4355 + }, + { + "epoch": 0.24392429163400156, + "grad_norm": 1.5091285705566406, + "learning_rate": 2.1765000000000003e-05, + "loss": 0.557, + "step": 4356 + }, + { + "epoch": 0.24398028894613058, + "grad_norm": 1.6220325231552124, + "learning_rate": 2.177e-05, + "loss": 0.563, + "step": 4357 + }, + { + "epoch": 0.2440362862582596, + "grad_norm": 1.1459437608718872, + "learning_rate": 2.1775e-05, + "loss": 0.4278, + "step": 4358 + }, + { + "epoch": 0.24409228357038862, + "grad_norm": 1.2954154014587402, + "learning_rate": 2.178e-05, + "loss": 0.5298, + "step": 4359 + }, + { + "epoch": 0.24414828088251764, + "grad_norm": 1.2371245622634888, + "learning_rate": 2.1785e-05, + "loss": 0.343, + "step": 4360 + }, + { + "epoch": 0.24420427819464666, + "grad_norm": 1.1733187437057495, + "learning_rate": 2.1790000000000003e-05, + "loss": 0.3858, + "step": 4361 + }, + { + "epoch": 0.24426027550677568, + "grad_norm": 1.2497729063034058, + "learning_rate": 2.1795e-05, + "loss": 0.3206, + "step": 4362 + }, + { + "epoch": 0.2443162728189047, + "grad_norm": 1.3116555213928223, + "learning_rate": 2.18e-05, + "loss": 0.5136, + "step": 4363 + }, + { + "epoch": 0.24437227013103371, + "grad_norm": 1.1510710716247559, + "learning_rate": 2.1805e-05, + "loss": 0.4088, + "step": 4364 + }, + { + "epoch": 0.24442826744316273, + "grad_norm": 1.3727792501449585, + "learning_rate": 2.181e-05, + "loss": 0.4046, + "step": 4365 + }, + { + "epoch": 0.24448426475529175, + "grad_norm": 1.2303552627563477, + "learning_rate": 2.1815000000000004e-05, + "loss": 0.3942, + "step": 4366 + }, + { + "epoch": 0.24454026206742077, + "grad_norm": 1.3398981094360352, + "learning_rate": 2.182e-05, + "loss": 0.4133, + "step": 4367 + }, + { + "epoch": 0.2445962593795498, + "grad_norm": 1.425965428352356, + "learning_rate": 2.1825000000000002e-05, + "loss": 0.4376, + 
"step": 4368 + }, + { + "epoch": 0.2446522566916788, + "grad_norm": 1.1586977243423462, + "learning_rate": 2.183e-05, + "loss": 0.4468, + "step": 4369 + }, + { + "epoch": 0.24470825400380783, + "grad_norm": 1.0181491374969482, + "learning_rate": 2.1835e-05, + "loss": 0.4791, + "step": 4370 + }, + { + "epoch": 0.24476425131593685, + "grad_norm": 1.1308424472808838, + "learning_rate": 2.184e-05, + "loss": 0.3961, + "step": 4371 + }, + { + "epoch": 0.24482024862806584, + "grad_norm": 1.4323550462722778, + "learning_rate": 2.1845000000000002e-05, + "loss": 0.5017, + "step": 4372 + }, + { + "epoch": 0.24487624594019486, + "grad_norm": 1.1525380611419678, + "learning_rate": 2.1850000000000003e-05, + "loss": 0.4749, + "step": 4373 + }, + { + "epoch": 0.24493224325232388, + "grad_norm": 1.2878707647323608, + "learning_rate": 2.1855e-05, + "loss": 0.4322, + "step": 4374 + }, + { + "epoch": 0.2449882405644529, + "grad_norm": 1.210702896118164, + "learning_rate": 2.186e-05, + "loss": 0.4982, + "step": 4375 + }, + { + "epoch": 0.24504423787658192, + "grad_norm": 1.2443805932998657, + "learning_rate": 2.1865e-05, + "loss": 0.3842, + "step": 4376 + }, + { + "epoch": 0.24510023518871094, + "grad_norm": 1.083103060722351, + "learning_rate": 2.1870000000000002e-05, + "loss": 0.3987, + "step": 4377 + }, + { + "epoch": 0.24515623250083995, + "grad_norm": 1.1591500043869019, + "learning_rate": 2.1875e-05, + "loss": 0.3353, + "step": 4378 + }, + { + "epoch": 0.24521222981296897, + "grad_norm": 1.045145869255066, + "learning_rate": 2.188e-05, + "loss": 0.4127, + "step": 4379 + }, + { + "epoch": 0.245268227125098, + "grad_norm": 1.5242241621017456, + "learning_rate": 2.1885e-05, + "loss": 0.5309, + "step": 4380 + }, + { + "epoch": 0.245324224437227, + "grad_norm": 1.0925313234329224, + "learning_rate": 2.1890000000000002e-05, + "loss": 0.456, + "step": 4381 + }, + { + "epoch": 0.24538022174935603, + "grad_norm": 1.1851149797439575, + "learning_rate": 2.1895000000000003e-05, + "loss": 
0.3898, + "step": 4382 + }, + { + "epoch": 0.24543621906148505, + "grad_norm": 1.1446529626846313, + "learning_rate": 2.19e-05, + "loss": 0.357, + "step": 4383 + }, + { + "epoch": 0.24549221637361407, + "grad_norm": 1.3524234294891357, + "learning_rate": 2.1905e-05, + "loss": 0.4854, + "step": 4384 + }, + { + "epoch": 0.2455482136857431, + "grad_norm": 1.198757290840149, + "learning_rate": 2.191e-05, + "loss": 0.4094, + "step": 4385 + }, + { + "epoch": 0.2456042109978721, + "grad_norm": 1.1887050867080688, + "learning_rate": 2.1915000000000003e-05, + "loss": 0.3984, + "step": 4386 + }, + { + "epoch": 0.24566020831000113, + "grad_norm": 1.2815239429473877, + "learning_rate": 2.192e-05, + "loss": 0.3801, + "step": 4387 + }, + { + "epoch": 0.24571620562213015, + "grad_norm": 1.220858097076416, + "learning_rate": 2.1925e-05, + "loss": 0.429, + "step": 4388 + }, + { + "epoch": 0.24577220293425917, + "grad_norm": 1.093279480934143, + "learning_rate": 2.1930000000000002e-05, + "loss": 0.3266, + "step": 4389 + }, + { + "epoch": 0.24582820024638818, + "grad_norm": 1.432355523109436, + "learning_rate": 2.1935e-05, + "loss": 0.4068, + "step": 4390 + }, + { + "epoch": 0.2458841975585172, + "grad_norm": 1.2643381357192993, + "learning_rate": 2.1940000000000003e-05, + "loss": 0.5241, + "step": 4391 + }, + { + "epoch": 0.24594019487064622, + "grad_norm": 1.1043672561645508, + "learning_rate": 2.1945e-05, + "loss": 0.4859, + "step": 4392 + }, + { + "epoch": 0.24599619218277521, + "grad_norm": 1.317258596420288, + "learning_rate": 2.195e-05, + "loss": 0.5257, + "step": 4393 + }, + { + "epoch": 0.24605218949490423, + "grad_norm": 1.2772438526153564, + "learning_rate": 2.1955e-05, + "loss": 0.4314, + "step": 4394 + }, + { + "epoch": 0.24610818680703325, + "grad_norm": 3.390718460083008, + "learning_rate": 2.196e-05, + "loss": 0.4308, + "step": 4395 + }, + { + "epoch": 0.24616418411916227, + "grad_norm": 1.1928256750106812, + "learning_rate": 2.1965e-05, + "loss": 0.4142, + "step": 
4396 + }, + { + "epoch": 0.2462201814312913, + "grad_norm": 1.1900676488876343, + "learning_rate": 2.197e-05, + "loss": 0.3713, + "step": 4397 + }, + { + "epoch": 0.2462761787434203, + "grad_norm": 1.3439291715621948, + "learning_rate": 2.1975000000000002e-05, + "loss": 0.5114, + "step": 4398 + }, + { + "epoch": 0.24633217605554933, + "grad_norm": 1.282044768333435, + "learning_rate": 2.198e-05, + "loss": 0.5487, + "step": 4399 + }, + { + "epoch": 0.24638817336767835, + "grad_norm": 1.2520606517791748, + "learning_rate": 2.1985e-05, + "loss": 0.4649, + "step": 4400 + }, + { + "epoch": 0.24644417067980737, + "grad_norm": 1.1088248491287231, + "learning_rate": 2.199e-05, + "loss": 0.3805, + "step": 4401 + }, + { + "epoch": 0.2465001679919364, + "grad_norm": 1.6109001636505127, + "learning_rate": 2.1995000000000002e-05, + "loss": 0.3532, + "step": 4402 + }, + { + "epoch": 0.2465561653040654, + "grad_norm": 1.2714914083480835, + "learning_rate": 2.2000000000000003e-05, + "loss": 0.4865, + "step": 4403 + }, + { + "epoch": 0.24661216261619442, + "grad_norm": 1.204676628112793, + "learning_rate": 2.2005e-05, + "loss": 0.5088, + "step": 4404 + }, + { + "epoch": 0.24666815992832344, + "grad_norm": 1.2741031646728516, + "learning_rate": 2.201e-05, + "loss": 0.4747, + "step": 4405 + }, + { + "epoch": 0.24672415724045246, + "grad_norm": 1.3649706840515137, + "learning_rate": 2.2015000000000002e-05, + "loss": 0.5744, + "step": 4406 + }, + { + "epoch": 0.24678015455258148, + "grad_norm": 1.3730049133300781, + "learning_rate": 2.2020000000000003e-05, + "loss": 0.5204, + "step": 4407 + }, + { + "epoch": 0.2468361518647105, + "grad_norm": 1.2699775695800781, + "learning_rate": 2.2025e-05, + "loss": 0.4023, + "step": 4408 + }, + { + "epoch": 0.24689214917683952, + "grad_norm": 1.1627721786499023, + "learning_rate": 2.203e-05, + "loss": 0.4179, + "step": 4409 + }, + { + "epoch": 0.24694814648896854, + "grad_norm": 1.1745370626449585, + "learning_rate": 2.2035e-05, + "loss": 0.4344, + 
"step": 4410 + }, + { + "epoch": 0.24700414380109756, + "grad_norm": 1.1994861364364624, + "learning_rate": 2.2040000000000002e-05, + "loss": 0.4245, + "step": 4411 + }, + { + "epoch": 0.24706014111322658, + "grad_norm": 0.9576858282089233, + "learning_rate": 2.2045000000000003e-05, + "loss": 0.3195, + "step": 4412 + }, + { + "epoch": 0.24711613842535557, + "grad_norm": 1.258575201034546, + "learning_rate": 2.205e-05, + "loss": 0.425, + "step": 4413 + }, + { + "epoch": 0.2471721357374846, + "grad_norm": 1.6441184282302856, + "learning_rate": 2.2055e-05, + "loss": 0.511, + "step": 4414 + }, + { + "epoch": 0.2472281330496136, + "grad_norm": 1.2258238792419434, + "learning_rate": 2.206e-05, + "loss": 0.391, + "step": 4415 + }, + { + "epoch": 0.24728413036174263, + "grad_norm": 1.3207824230194092, + "learning_rate": 2.2065000000000003e-05, + "loss": 0.4906, + "step": 4416 + }, + { + "epoch": 0.24734012767387165, + "grad_norm": 1.6287943124771118, + "learning_rate": 2.207e-05, + "loss": 0.6048, + "step": 4417 + }, + { + "epoch": 0.24739612498600067, + "grad_norm": 1.5828886032104492, + "learning_rate": 2.2075e-05, + "loss": 0.4398, + "step": 4418 + }, + { + "epoch": 0.24745212229812968, + "grad_norm": 1.2290335893630981, + "learning_rate": 2.2080000000000002e-05, + "loss": 0.4713, + "step": 4419 + }, + { + "epoch": 0.2475081196102587, + "grad_norm": 1.4252212047576904, + "learning_rate": 2.2085e-05, + "loss": 0.4958, + "step": 4420 + }, + { + "epoch": 0.24756411692238772, + "grad_norm": 1.0584359169006348, + "learning_rate": 2.2090000000000004e-05, + "loss": 0.4633, + "step": 4421 + }, + { + "epoch": 0.24762011423451674, + "grad_norm": 0.9757269024848938, + "learning_rate": 2.2095e-05, + "loss": 0.3672, + "step": 4422 + }, + { + "epoch": 0.24767611154664576, + "grad_norm": 1.0999170541763306, + "learning_rate": 2.2100000000000002e-05, + "loss": 0.4658, + "step": 4423 + }, + { + "epoch": 0.24773210885877478, + "grad_norm": 1.0702378749847412, + "learning_rate": 
2.2105e-05, + "loss": 0.3745, + "step": 4424 + }, + { + "epoch": 0.2477881061709038, + "grad_norm": 1.335506796836853, + "learning_rate": 2.211e-05, + "loss": 0.423, + "step": 4425 + }, + { + "epoch": 0.24784410348303282, + "grad_norm": 1.1725655794143677, + "learning_rate": 2.2115e-05, + "loss": 0.4771, + "step": 4426 + }, + { + "epoch": 0.24790010079516184, + "grad_norm": 1.1515663862228394, + "learning_rate": 2.212e-05, + "loss": 0.3655, + "step": 4427 + }, + { + "epoch": 0.24795609810729086, + "grad_norm": 1.1827774047851562, + "learning_rate": 2.2125000000000002e-05, + "loss": 0.4368, + "step": 4428 + }, + { + "epoch": 0.24801209541941988, + "grad_norm": 1.267818570137024, + "learning_rate": 2.213e-05, + "loss": 0.4739, + "step": 4429 + }, + { + "epoch": 0.2480680927315489, + "grad_norm": 1.034224510192871, + "learning_rate": 2.2135e-05, + "loss": 0.3464, + "step": 4430 + }, + { + "epoch": 0.24812409004367791, + "grad_norm": 1.0499874353408813, + "learning_rate": 2.214e-05, + "loss": 0.44, + "step": 4431 + }, + { + "epoch": 0.24818008735580693, + "grad_norm": 1.1445691585540771, + "learning_rate": 2.2145000000000002e-05, + "loss": 0.3484, + "step": 4432 + }, + { + "epoch": 0.24823608466793595, + "grad_norm": 1.0334786176681519, + "learning_rate": 2.215e-05, + "loss": 0.5041, + "step": 4433 + }, + { + "epoch": 0.24829208198006494, + "grad_norm": 1.2405943870544434, + "learning_rate": 2.2155e-05, + "loss": 0.4128, + "step": 4434 + }, + { + "epoch": 0.24834807929219396, + "grad_norm": 1.390884280204773, + "learning_rate": 2.216e-05, + "loss": 0.5113, + "step": 4435 + }, + { + "epoch": 0.24840407660432298, + "grad_norm": 1.0858657360076904, + "learning_rate": 2.2165000000000002e-05, + "loss": 0.363, + "step": 4436 + }, + { + "epoch": 0.248460073916452, + "grad_norm": 1.241275429725647, + "learning_rate": 2.2170000000000003e-05, + "loss": 0.3941, + "step": 4437 + }, + { + "epoch": 0.24851607122858102, + "grad_norm": 1.1823296546936035, + "learning_rate": 
2.2175e-05, + "loss": 0.3404, + "step": 4438 + }, + { + "epoch": 0.24857206854071004, + "grad_norm": 1.2853224277496338, + "learning_rate": 2.218e-05, + "loss": 0.3665, + "step": 4439 + }, + { + "epoch": 0.24862806585283906, + "grad_norm": 1.2950419187545776, + "learning_rate": 2.2185000000000002e-05, + "loss": 0.4266, + "step": 4440 + }, + { + "epoch": 0.24868406316496808, + "grad_norm": 1.5352532863616943, + "learning_rate": 2.219e-05, + "loss": 0.6076, + "step": 4441 + }, + { + "epoch": 0.2487400604770971, + "grad_norm": 1.2307060956954956, + "learning_rate": 2.2195000000000003e-05, + "loss": 0.4616, + "step": 4442 + }, + { + "epoch": 0.24879605778922612, + "grad_norm": 1.073006510734558, + "learning_rate": 2.22e-05, + "loss": 0.3265, + "step": 4443 + }, + { + "epoch": 0.24885205510135514, + "grad_norm": 1.1062570810317993, + "learning_rate": 2.2205000000000002e-05, + "loss": 0.3564, + "step": 4444 + }, + { + "epoch": 0.24890805241348415, + "grad_norm": 1.2196799516677856, + "learning_rate": 2.221e-05, + "loss": 0.4025, + "step": 4445 + }, + { + "epoch": 0.24896404972561317, + "grad_norm": 1.1566624641418457, + "learning_rate": 2.2215e-05, + "loss": 0.3357, + "step": 4446 + }, + { + "epoch": 0.2490200470377422, + "grad_norm": 1.2082751989364624, + "learning_rate": 2.222e-05, + "loss": 0.442, + "step": 4447 + }, + { + "epoch": 0.2490760443498712, + "grad_norm": 1.264952540397644, + "learning_rate": 2.2225e-05, + "loss": 0.5093, + "step": 4448 + }, + { + "epoch": 0.24913204166200023, + "grad_norm": 1.0614506006240845, + "learning_rate": 2.2230000000000002e-05, + "loss": 0.3028, + "step": 4449 + }, + { + "epoch": 0.24918803897412925, + "grad_norm": 1.1484986543655396, + "learning_rate": 2.2235e-05, + "loss": 0.4714, + "step": 4450 + }, + { + "epoch": 0.24924403628625827, + "grad_norm": 1.2769389152526855, + "learning_rate": 2.224e-05, + "loss": 0.5447, + "step": 4451 + }, + { + "epoch": 0.2493000335983873, + "grad_norm": 1.1708201169967651, + "learning_rate": 
2.2245e-05, + "loss": 0.3064, + "step": 4452 + }, + { + "epoch": 0.2493560309105163, + "grad_norm": 1.3028026819229126, + "learning_rate": 2.2250000000000002e-05, + "loss": 0.4678, + "step": 4453 + }, + { + "epoch": 0.24941202822264533, + "grad_norm": 1.1337742805480957, + "learning_rate": 2.2255e-05, + "loss": 0.4317, + "step": 4454 + }, + { + "epoch": 0.24946802553477432, + "grad_norm": 1.2387633323669434, + "learning_rate": 2.226e-05, + "loss": 0.4296, + "step": 4455 + }, + { + "epoch": 0.24952402284690334, + "grad_norm": 1.2744488716125488, + "learning_rate": 2.2265e-05, + "loss": 0.4337, + "step": 4456 + }, + { + "epoch": 0.24958002015903236, + "grad_norm": 1.4655698537826538, + "learning_rate": 2.2270000000000002e-05, + "loss": 0.3449, + "step": 4457 + }, + { + "epoch": 0.24963601747116138, + "grad_norm": 1.068646788597107, + "learning_rate": 2.2275000000000003e-05, + "loss": 0.3799, + "step": 4458 + }, + { + "epoch": 0.2496920147832904, + "grad_norm": 1.1122599840164185, + "learning_rate": 2.228e-05, + "loss": 0.4005, + "step": 4459 + }, + { + "epoch": 0.2497480120954194, + "grad_norm": 1.5223112106323242, + "learning_rate": 2.2285e-05, + "loss": 0.4872, + "step": 4460 + }, + { + "epoch": 0.24980400940754843, + "grad_norm": 1.0510022640228271, + "learning_rate": 2.229e-05, + "loss": 0.3729, + "step": 4461 + }, + { + "epoch": 0.24986000671967745, + "grad_norm": 1.342260718345642, + "learning_rate": 2.2295000000000003e-05, + "loss": 0.5689, + "step": 4462 + }, + { + "epoch": 0.24991600403180647, + "grad_norm": 1.2196329832077026, + "learning_rate": 2.23e-05, + "loss": 0.4905, + "step": 4463 + }, + { + "epoch": 0.2499720013439355, + "grad_norm": 1.9157620668411255, + "learning_rate": 2.2305e-05, + "loss": 0.4368, + "step": 4464 + }, + { + "epoch": 0.2500279986560645, + "grad_norm": 1.316776990890503, + "learning_rate": 2.231e-05, + "loss": 0.3954, + "step": 4465 + }, + { + "epoch": 0.25008399596819353, + "grad_norm": 1.326110601425171, + "learning_rate": 
2.2315e-05, + "loss": 0.4895, + "step": 4466 + }, + { + "epoch": 0.25013999328032255, + "grad_norm": 1.2966643571853638, + "learning_rate": 2.2320000000000003e-05, + "loss": 0.451, + "step": 4467 + }, + { + "epoch": 0.25019599059245157, + "grad_norm": 1.364499807357788, + "learning_rate": 2.2325e-05, + "loss": 0.4063, + "step": 4468 + }, + { + "epoch": 0.2502519879045806, + "grad_norm": 1.3014533519744873, + "learning_rate": 2.233e-05, + "loss": 0.3914, + "step": 4469 + }, + { + "epoch": 0.2503079852167096, + "grad_norm": 0.955682098865509, + "learning_rate": 2.2335e-05, + "loss": 0.4266, + "step": 4470 + }, + { + "epoch": 0.2503639825288386, + "grad_norm": 1.215097188949585, + "learning_rate": 2.234e-05, + "loss": 0.3631, + "step": 4471 + }, + { + "epoch": 0.25041997984096764, + "grad_norm": 1.2024786472320557, + "learning_rate": 2.2345e-05, + "loss": 0.4752, + "step": 4472 + }, + { + "epoch": 0.25047597715309666, + "grad_norm": 1.1775621175765991, + "learning_rate": 2.235e-05, + "loss": 0.5077, + "step": 4473 + }, + { + "epoch": 0.2505319744652257, + "grad_norm": 1.1632726192474365, + "learning_rate": 2.2355000000000002e-05, + "loss": 0.428, + "step": 4474 + }, + { + "epoch": 0.2505879717773547, + "grad_norm": 1.2444415092468262, + "learning_rate": 2.236e-05, + "loss": 0.4456, + "step": 4475 + }, + { + "epoch": 0.2506439690894837, + "grad_norm": 1.2137200832366943, + "learning_rate": 2.2365e-05, + "loss": 0.3735, + "step": 4476 + }, + { + "epoch": 0.25069996640161274, + "grad_norm": 1.350601315498352, + "learning_rate": 2.237e-05, + "loss": 0.4603, + "step": 4477 + }, + { + "epoch": 0.25075596371374176, + "grad_norm": 1.1135917901992798, + "learning_rate": 2.2375000000000002e-05, + "loss": 0.4001, + "step": 4478 + }, + { + "epoch": 0.2508119610258708, + "grad_norm": 1.1618399620056152, + "learning_rate": 2.2380000000000003e-05, + "loss": 0.4734, + "step": 4479 + }, + { + "epoch": 0.2508679583379998, + "grad_norm": 1.1615564823150635, + "learning_rate": 
2.2385e-05, + "loss": 0.4212, + "step": 4480 + }, + { + "epoch": 0.2509239556501288, + "grad_norm": 2.1652941703796387, + "learning_rate": 2.239e-05, + "loss": 0.424, + "step": 4481 + }, + { + "epoch": 0.25097995296225784, + "grad_norm": 1.3567856550216675, + "learning_rate": 2.2395e-05, + "loss": 0.7095, + "step": 4482 + }, + { + "epoch": 0.25103595027438685, + "grad_norm": 1.3855979442596436, + "learning_rate": 2.2400000000000002e-05, + "loss": 0.5495, + "step": 4483 + }, + { + "epoch": 0.2510919475865159, + "grad_norm": 1.2875161170959473, + "learning_rate": 2.2405e-05, + "loss": 0.3818, + "step": 4484 + }, + { + "epoch": 0.2511479448986449, + "grad_norm": 1.3533802032470703, + "learning_rate": 2.241e-05, + "loss": 0.4189, + "step": 4485 + }, + { + "epoch": 0.25120394221077386, + "grad_norm": 1.444612741470337, + "learning_rate": 2.2415e-05, + "loss": 0.4597, + "step": 4486 + }, + { + "epoch": 0.2512599395229029, + "grad_norm": 1.1549077033996582, + "learning_rate": 2.2420000000000002e-05, + "loss": 0.4765, + "step": 4487 + }, + { + "epoch": 0.2513159368350319, + "grad_norm": 1.3089532852172852, + "learning_rate": 2.2425000000000003e-05, + "loss": 0.5499, + "step": 4488 + }, + { + "epoch": 0.2513719341471609, + "grad_norm": 1.3971365690231323, + "learning_rate": 2.243e-05, + "loss": 0.6296, + "step": 4489 + }, + { + "epoch": 0.25142793145928993, + "grad_norm": 1.27325439453125, + "learning_rate": 2.2435e-05, + "loss": 0.4193, + "step": 4490 + }, + { + "epoch": 0.25148392877141895, + "grad_norm": 1.5754708051681519, + "learning_rate": 2.244e-05, + "loss": 0.4805, + "step": 4491 + }, + { + "epoch": 0.25153992608354797, + "grad_norm": 1.695016622543335, + "learning_rate": 2.2445000000000003e-05, + "loss": 0.5813, + "step": 4492 + }, + { + "epoch": 0.251595923395677, + "grad_norm": 1.0356920957565308, + "learning_rate": 2.245e-05, + "loss": 0.4802, + "step": 4493 + }, + { + "epoch": 0.251651920707806, + "grad_norm": 1.0675708055496216, + "learning_rate": 2.2455e-05, 
+ "loss": 0.3035, + "step": 4494 + }, + { + "epoch": 0.25170791801993503, + "grad_norm": 1.0566827058792114, + "learning_rate": 2.2460000000000002e-05, + "loss": 0.3935, + "step": 4495 + }, + { + "epoch": 0.25176391533206405, + "grad_norm": 1.114900827407837, + "learning_rate": 2.2465e-05, + "loss": 0.38, + "step": 4496 + }, + { + "epoch": 0.25181991264419307, + "grad_norm": 1.1412712335586548, + "learning_rate": 2.2470000000000003e-05, + "loss": 0.463, + "step": 4497 + }, + { + "epoch": 0.2518759099563221, + "grad_norm": 1.238297939300537, + "learning_rate": 2.2475e-05, + "loss": 0.3508, + "step": 4498 + }, + { + "epoch": 0.2519319072684511, + "grad_norm": 1.077498197555542, + "learning_rate": 2.248e-05, + "loss": 0.3731, + "step": 4499 + }, + { + "epoch": 0.2519879045805801, + "grad_norm": 1.1671607494354248, + "learning_rate": 2.2485e-05, + "loss": 0.2541, + "step": 4500 + }, + { + "epoch": 0.25204390189270914, + "grad_norm": 1.0954142808914185, + "learning_rate": 2.249e-05, + "loss": 0.3176, + "step": 4501 + }, + { + "epoch": 0.25209989920483816, + "grad_norm": 1.1871159076690674, + "learning_rate": 2.2495e-05, + "loss": 0.3175, + "step": 4502 + }, + { + "epoch": 0.2521558965169672, + "grad_norm": 1.5153894424438477, + "learning_rate": 2.25e-05, + "loss": 0.4673, + "step": 4503 + }, + { + "epoch": 0.2522118938290962, + "grad_norm": 1.1715959310531616, + "learning_rate": 2.2505000000000002e-05, + "loss": 0.4213, + "step": 4504 + }, + { + "epoch": 0.2522678911412252, + "grad_norm": 1.47342050075531, + "learning_rate": 2.251e-05, + "loss": 0.4968, + "step": 4505 + }, + { + "epoch": 0.25232388845335424, + "grad_norm": 12.813328742980957, + "learning_rate": 2.2515e-05, + "loss": 0.4618, + "step": 4506 + }, + { + "epoch": 0.25237988576548326, + "grad_norm": 1.367955207824707, + "learning_rate": 2.252e-05, + "loss": 0.4697, + "step": 4507 + }, + { + "epoch": 0.2524358830776123, + "grad_norm": 1.470633625984192, + "learning_rate": 2.2525000000000002e-05, + "loss": 
0.4447, + "step": 4508 + }, + { + "epoch": 0.2524918803897413, + "grad_norm": 1.6107702255249023, + "learning_rate": 2.253e-05, + "loss": 0.5733, + "step": 4509 + }, + { + "epoch": 0.2525478777018703, + "grad_norm": 1.2405142784118652, + "learning_rate": 2.2535e-05, + "loss": 0.5126, + "step": 4510 + }, + { + "epoch": 0.25260387501399933, + "grad_norm": 1.1081312894821167, + "learning_rate": 2.254e-05, + "loss": 0.326, + "step": 4511 + }, + { + "epoch": 0.25265987232612835, + "grad_norm": 1.2331016063690186, + "learning_rate": 2.2545000000000002e-05, + "loss": 0.4817, + "step": 4512 + }, + { + "epoch": 0.2527158696382574, + "grad_norm": 1.1779156923294067, + "learning_rate": 2.2550000000000003e-05, + "loss": 0.4658, + "step": 4513 + }, + { + "epoch": 0.2527718669503864, + "grad_norm": 1.5347821712493896, + "learning_rate": 2.2555e-05, + "loss": 0.4134, + "step": 4514 + }, + { + "epoch": 0.2528278642625154, + "grad_norm": 1.191646695137024, + "learning_rate": 2.256e-05, + "loss": 0.5631, + "step": 4515 + }, + { + "epoch": 0.25288386157464443, + "grad_norm": 1.340574860572815, + "learning_rate": 2.2565e-05, + "loss": 0.4469, + "step": 4516 + }, + { + "epoch": 0.25293985888677345, + "grad_norm": 1.1392805576324463, + "learning_rate": 2.2570000000000002e-05, + "loss": 0.4925, + "step": 4517 + }, + { + "epoch": 0.25299585619890247, + "grad_norm": 1.2708933353424072, + "learning_rate": 2.2575000000000003e-05, + "loss": 0.4406, + "step": 4518 + }, + { + "epoch": 0.2530518535110315, + "grad_norm": 1.3625980615615845, + "learning_rate": 2.258e-05, + "loss": 0.497, + "step": 4519 + }, + { + "epoch": 0.2531078508231605, + "grad_norm": 0.9875698685646057, + "learning_rate": 2.2585e-05, + "loss": 0.3762, + "step": 4520 + }, + { + "epoch": 0.2531638481352895, + "grad_norm": 1.1133662462234497, + "learning_rate": 2.259e-05, + "loss": 0.3702, + "step": 4521 + }, + { + "epoch": 0.25321984544741855, + "grad_norm": 1.538754940032959, + "learning_rate": 2.2595000000000003e-05, + 
"loss": 0.5965, + "step": 4522 + }, + { + "epoch": 0.25327584275954756, + "grad_norm": 1.3976726531982422, + "learning_rate": 2.26e-05, + "loss": 0.3976, + "step": 4523 + }, + { + "epoch": 0.2533318400716766, + "grad_norm": 1.1165637969970703, + "learning_rate": 2.2605e-05, + "loss": 0.389, + "step": 4524 + }, + { + "epoch": 0.2533878373838056, + "grad_norm": 1.373055338859558, + "learning_rate": 2.2610000000000002e-05, + "loss": 0.4778, + "step": 4525 + }, + { + "epoch": 0.2534438346959346, + "grad_norm": 1.2408130168914795, + "learning_rate": 2.2615e-05, + "loss": 0.6502, + "step": 4526 + }, + { + "epoch": 0.2534998320080636, + "grad_norm": 0.9629552364349365, + "learning_rate": 2.2620000000000004e-05, + "loss": 0.4091, + "step": 4527 + }, + { + "epoch": 0.2535558293201926, + "grad_norm": 1.3871678113937378, + "learning_rate": 2.2625e-05, + "loss": 0.4843, + "step": 4528 + }, + { + "epoch": 0.2536118266323216, + "grad_norm": 1.508987545967102, + "learning_rate": 2.2630000000000002e-05, + "loss": 0.4926, + "step": 4529 + }, + { + "epoch": 0.25366782394445064, + "grad_norm": 1.3006188869476318, + "learning_rate": 2.2635e-05, + "loss": 0.4367, + "step": 4530 + }, + { + "epoch": 0.25372382125657966, + "grad_norm": 1.3513522148132324, + "learning_rate": 2.264e-05, + "loss": 0.4645, + "step": 4531 + }, + { + "epoch": 0.2537798185687087, + "grad_norm": 1.414544939994812, + "learning_rate": 2.2645e-05, + "loss": 0.6809, + "step": 4532 + }, + { + "epoch": 0.2538358158808377, + "grad_norm": 1.3013323545455933, + "learning_rate": 2.265e-05, + "loss": 0.5165, + "step": 4533 + }, + { + "epoch": 0.2538918131929667, + "grad_norm": 1.2263951301574707, + "learning_rate": 2.2655000000000002e-05, + "loss": 0.5141, + "step": 4534 + }, + { + "epoch": 0.25394781050509574, + "grad_norm": 1.914273738861084, + "learning_rate": 2.266e-05, + "loss": 0.4634, + "step": 4535 + }, + { + "epoch": 0.25400380781722476, + "grad_norm": 1.4267395734786987, + "learning_rate": 2.2665e-05, + "loss": 
0.4487, + "step": 4536 + }, + { + "epoch": 0.2540598051293538, + "grad_norm": 1.1767939329147339, + "learning_rate": 2.267e-05, + "loss": 0.434, + "step": 4537 + }, + { + "epoch": 0.2541158024414828, + "grad_norm": 1.0303541421890259, + "learning_rate": 2.2675000000000002e-05, + "loss": 0.3804, + "step": 4538 + }, + { + "epoch": 0.2541717997536118, + "grad_norm": 1.2321490049362183, + "learning_rate": 2.268e-05, + "loss": 0.4092, + "step": 4539 + }, + { + "epoch": 0.25422779706574083, + "grad_norm": 1.1032359600067139, + "learning_rate": 2.2685e-05, + "loss": 0.4038, + "step": 4540 + }, + { + "epoch": 0.25428379437786985, + "grad_norm": 1.090529441833496, + "learning_rate": 2.269e-05, + "loss": 0.4244, + "step": 4541 + }, + { + "epoch": 0.2543397916899989, + "grad_norm": 1.335461139678955, + "learning_rate": 2.2695000000000002e-05, + "loss": 0.5703, + "step": 4542 + }, + { + "epoch": 0.2543957890021279, + "grad_norm": 1.1057159900665283, + "learning_rate": 2.2700000000000003e-05, + "loss": 0.3951, + "step": 4543 + }, + { + "epoch": 0.2544517863142569, + "grad_norm": 1.1145004034042358, + "learning_rate": 2.2705e-05, + "loss": 0.3985, + "step": 4544 + }, + { + "epoch": 0.25450778362638593, + "grad_norm": 1.1383142471313477, + "learning_rate": 2.271e-05, + "loss": 0.4392, + "step": 4545 + }, + { + "epoch": 0.25456378093851495, + "grad_norm": 1.4747370481491089, + "learning_rate": 2.2715e-05, + "loss": 0.4301, + "step": 4546 + }, + { + "epoch": 0.25461977825064397, + "grad_norm": 1.2272024154663086, + "learning_rate": 2.2720000000000003e-05, + "loss": 0.3965, + "step": 4547 + }, + { + "epoch": 0.254675775562773, + "grad_norm": 1.4545232057571411, + "learning_rate": 2.2725000000000003e-05, + "loss": 0.4497, + "step": 4548 + }, + { + "epoch": 0.254731772874902, + "grad_norm": 1.2840518951416016, + "learning_rate": 2.273e-05, + "loss": 0.3729, + "step": 4549 + }, + { + "epoch": 0.254787770187031, + "grad_norm": 1.1751893758773804, + "learning_rate": 
2.2735000000000002e-05, + "loss": 0.3626, + "step": 4550 + }, + { + "epoch": 0.25484376749916005, + "grad_norm": 1.1711511611938477, + "learning_rate": 2.274e-05, + "loss": 0.3928, + "step": 4551 + }, + { + "epoch": 0.25489976481128906, + "grad_norm": 1.4283241033554077, + "learning_rate": 2.2745000000000003e-05, + "loss": 0.4136, + "step": 4552 + }, + { + "epoch": 0.2549557621234181, + "grad_norm": 1.297680139541626, + "learning_rate": 2.275e-05, + "loss": 0.4825, + "step": 4553 + }, + { + "epoch": 0.2550117594355471, + "grad_norm": 1.3692355155944824, + "learning_rate": 2.2755e-05, + "loss": 0.4102, + "step": 4554 + }, + { + "epoch": 0.2550677567476761, + "grad_norm": 1.418385624885559, + "learning_rate": 2.2760000000000002e-05, + "loss": 0.5126, + "step": 4555 + }, + { + "epoch": 0.25512375405980514, + "grad_norm": 1.3031096458435059, + "learning_rate": 2.2765e-05, + "loss": 0.5565, + "step": 4556 + }, + { + "epoch": 0.25517975137193416, + "grad_norm": 1.1401340961456299, + "learning_rate": 2.2770000000000004e-05, + "loss": 0.4568, + "step": 4557 + }, + { + "epoch": 0.2552357486840632, + "grad_norm": 1.2938733100891113, + "learning_rate": 2.2775e-05, + "loss": 0.4326, + "step": 4558 + }, + { + "epoch": 0.2552917459961922, + "grad_norm": 1.1616764068603516, + "learning_rate": 2.2780000000000002e-05, + "loss": 0.36, + "step": 4559 + }, + { + "epoch": 0.2553477433083212, + "grad_norm": 1.2217586040496826, + "learning_rate": 2.2785e-05, + "loss": 0.5322, + "step": 4560 + }, + { + "epoch": 0.25540374062045024, + "grad_norm": 1.309910774230957, + "learning_rate": 2.279e-05, + "loss": 0.6896, + "step": 4561 + }, + { + "epoch": 0.25545973793257926, + "grad_norm": 1.2356170415878296, + "learning_rate": 2.2795e-05, + "loss": 0.4242, + "step": 4562 + }, + { + "epoch": 0.2555157352447083, + "grad_norm": 1.4275574684143066, + "learning_rate": 2.2800000000000002e-05, + "loss": 0.5966, + "step": 4563 + }, + { + "epoch": 0.2555717325568373, + "grad_norm": 1.1685105562210083, + 
"learning_rate": 2.2805000000000003e-05, + "loss": 0.3881, + "step": 4564 + }, + { + "epoch": 0.2556277298689663, + "grad_norm": 1.184205174446106, + "learning_rate": 2.281e-05, + "loss": 0.5306, + "step": 4565 + }, + { + "epoch": 0.25568372718109533, + "grad_norm": 1.4231535196304321, + "learning_rate": 2.2815e-05, + "loss": 0.3465, + "step": 4566 + }, + { + "epoch": 0.25573972449322435, + "grad_norm": 1.3884516954421997, + "learning_rate": 2.282e-05, + "loss": 0.4995, + "step": 4567 + }, + { + "epoch": 0.2557957218053533, + "grad_norm": 1.2389317750930786, + "learning_rate": 2.2825000000000003e-05, + "loss": 0.3583, + "step": 4568 + }, + { + "epoch": 0.25585171911748233, + "grad_norm": 1.3554555177688599, + "learning_rate": 2.283e-05, + "loss": 0.4887, + "step": 4569 + }, + { + "epoch": 0.25590771642961135, + "grad_norm": 1.494165301322937, + "learning_rate": 2.2835e-05, + "loss": 0.44, + "step": 4570 + }, + { + "epoch": 0.2559637137417404, + "grad_norm": 1.0958404541015625, + "learning_rate": 2.284e-05, + "loss": 0.499, + "step": 4571 + }, + { + "epoch": 0.2560197110538694, + "grad_norm": 1.5103360414505005, + "learning_rate": 2.2845e-05, + "loss": 0.5023, + "step": 4572 + }, + { + "epoch": 0.2560757083659984, + "grad_norm": 1.204907774925232, + "learning_rate": 2.2850000000000003e-05, + "loss": 0.4267, + "step": 4573 + }, + { + "epoch": 0.25613170567812743, + "grad_norm": 1.4339368343353271, + "learning_rate": 2.2855e-05, + "loss": 0.5432, + "step": 4574 + }, + { + "epoch": 0.25618770299025645, + "grad_norm": 1.1898574829101562, + "learning_rate": 2.286e-05, + "loss": 0.3848, + "step": 4575 + }, + { + "epoch": 0.25624370030238547, + "grad_norm": 1.053621530532837, + "learning_rate": 2.2865e-05, + "loss": 0.3681, + "step": 4576 + }, + { + "epoch": 0.2562996976145145, + "grad_norm": 1.312021017074585, + "learning_rate": 2.287e-05, + "loss": 0.464, + "step": 4577 + }, + { + "epoch": 0.2563556949266435, + "grad_norm": 1.321693778038025, + "learning_rate": 
2.2875e-05, + "loss": 0.4943, + "step": 4578 + }, + { + "epoch": 0.2564116922387725, + "grad_norm": 1.1725354194641113, + "learning_rate": 2.288e-05, + "loss": 0.3536, + "step": 4579 + }, + { + "epoch": 0.25646768955090155, + "grad_norm": 1.213531732559204, + "learning_rate": 2.2885000000000002e-05, + "loss": 0.3764, + "step": 4580 + }, + { + "epoch": 0.25652368686303056, + "grad_norm": 1.1212034225463867, + "learning_rate": 2.289e-05, + "loss": 0.5052, + "step": 4581 + }, + { + "epoch": 0.2565796841751596, + "grad_norm": 1.2362000942230225, + "learning_rate": 2.2895e-05, + "loss": 0.4205, + "step": 4582 + }, + { + "epoch": 0.2566356814872886, + "grad_norm": 1.1426849365234375, + "learning_rate": 2.29e-05, + "loss": 0.4204, + "step": 4583 + }, + { + "epoch": 0.2566916787994176, + "grad_norm": 1.246262788772583, + "learning_rate": 2.2905000000000002e-05, + "loss": 0.3154, + "step": 4584 + }, + { + "epoch": 0.25674767611154664, + "grad_norm": 1.2284398078918457, + "learning_rate": 2.2910000000000003e-05, + "loss": 0.4645, + "step": 4585 + }, + { + "epoch": 0.25680367342367566, + "grad_norm": 1.2259362936019897, + "learning_rate": 2.2915e-05, + "loss": 0.4662, + "step": 4586 + }, + { + "epoch": 0.2568596707358047, + "grad_norm": 1.1868433952331543, + "learning_rate": 2.292e-05, + "loss": 0.4249, + "step": 4587 + }, + { + "epoch": 0.2569156680479337, + "grad_norm": 1.2697373628616333, + "learning_rate": 2.2925e-05, + "loss": 0.4276, + "step": 4588 + }, + { + "epoch": 0.2569716653600627, + "grad_norm": 1.3835608959197998, + "learning_rate": 2.2930000000000002e-05, + "loss": 0.6027, + "step": 4589 + }, + { + "epoch": 0.25702766267219174, + "grad_norm": 1.2019926309585571, + "learning_rate": 2.2935e-05, + "loss": 0.4143, + "step": 4590 + }, + { + "epoch": 0.25708365998432076, + "grad_norm": 1.1837432384490967, + "learning_rate": 2.294e-05, + "loss": 0.4683, + "step": 4591 + }, + { + "epoch": 0.2571396572964498, + "grad_norm": 1.3229570388793945, + "learning_rate": 
2.2945e-05, + "loss": 0.6599, + "step": 4592 + }, + { + "epoch": 0.2571956546085788, + "grad_norm": 1.2265925407409668, + "learning_rate": 2.2950000000000002e-05, + "loss": 0.4594, + "step": 4593 + }, + { + "epoch": 0.2572516519207078, + "grad_norm": 1.4257705211639404, + "learning_rate": 2.2955000000000003e-05, + "loss": 0.3496, + "step": 4594 + }, + { + "epoch": 0.25730764923283683, + "grad_norm": 1.018993854522705, + "learning_rate": 2.296e-05, + "loss": 0.4226, + "step": 4595 + }, + { + "epoch": 0.25736364654496585, + "grad_norm": 1.0288788080215454, + "learning_rate": 2.2965e-05, + "loss": 0.3671, + "step": 4596 + }, + { + "epoch": 0.25741964385709487, + "grad_norm": 1.3952146768569946, + "learning_rate": 2.297e-05, + "loss": 0.3726, + "step": 4597 + }, + { + "epoch": 0.2574756411692239, + "grad_norm": 1.4173377752304077, + "learning_rate": 2.2975000000000003e-05, + "loss": 0.549, + "step": 4598 + }, + { + "epoch": 0.2575316384813529, + "grad_norm": 1.2097254991531372, + "learning_rate": 2.298e-05, + "loss": 0.5329, + "step": 4599 + }, + { + "epoch": 0.25758763579348193, + "grad_norm": 1.0523312091827393, + "learning_rate": 2.2985e-05, + "loss": 0.4327, + "step": 4600 + }, + { + "epoch": 0.25764363310561095, + "grad_norm": 1.171477198600769, + "learning_rate": 2.2990000000000002e-05, + "loss": 0.3262, + "step": 4601 + }, + { + "epoch": 0.25769963041773997, + "grad_norm": 1.3108364343643188, + "learning_rate": 2.2995e-05, + "loss": 0.4516, + "step": 4602 + }, + { + "epoch": 0.257755627729869, + "grad_norm": 1.0478096008300781, + "learning_rate": 2.3000000000000003e-05, + "loss": 0.3064, + "step": 4603 + }, + { + "epoch": 0.257811625041998, + "grad_norm": 1.145826816558838, + "learning_rate": 2.3005e-05, + "loss": 0.4085, + "step": 4604 + }, + { + "epoch": 0.257867622354127, + "grad_norm": 1.2956432104110718, + "learning_rate": 2.301e-05, + "loss": 0.4424, + "step": 4605 + }, + { + "epoch": 0.25792361966625604, + "grad_norm": 1.2124707698822021, + 
"learning_rate": 2.3015e-05, + "loss": 0.4248, + "step": 4606 + }, + { + "epoch": 0.25797961697838506, + "grad_norm": 1.0245577096939087, + "learning_rate": 2.302e-05, + "loss": 0.4546, + "step": 4607 + }, + { + "epoch": 0.2580356142905141, + "grad_norm": 1.367613434791565, + "learning_rate": 2.3025e-05, + "loss": 0.4073, + "step": 4608 + }, + { + "epoch": 0.2580916116026431, + "grad_norm": 1.5224664211273193, + "learning_rate": 2.303e-05, + "loss": 0.4822, + "step": 4609 + }, + { + "epoch": 0.25814760891477206, + "grad_norm": 1.2754371166229248, + "learning_rate": 2.3035000000000002e-05, + "loss": 0.5098, + "step": 4610 + }, + { + "epoch": 0.2582036062269011, + "grad_norm": 1.6209826469421387, + "learning_rate": 2.304e-05, + "loss": 0.4425, + "step": 4611 + }, + { + "epoch": 0.2582596035390301, + "grad_norm": 1.2482712268829346, + "learning_rate": 2.3045e-05, + "loss": 0.5628, + "step": 4612 + }, + { + "epoch": 0.2583156008511591, + "grad_norm": 1.3711938858032227, + "learning_rate": 2.305e-05, + "loss": 0.4381, + "step": 4613 + }, + { + "epoch": 0.25837159816328814, + "grad_norm": 1.0197200775146484, + "learning_rate": 2.3055000000000002e-05, + "loss": 0.3743, + "step": 4614 + }, + { + "epoch": 0.25842759547541716, + "grad_norm": 1.2958955764770508, + "learning_rate": 2.306e-05, + "loss": 0.386, + "step": 4615 + }, + { + "epoch": 0.2584835927875462, + "grad_norm": 1.3102233409881592, + "learning_rate": 2.3065e-05, + "loss": 0.5122, + "step": 4616 + }, + { + "epoch": 0.2585395900996752, + "grad_norm": 1.064424991607666, + "learning_rate": 2.307e-05, + "loss": 0.4189, + "step": 4617 + }, + { + "epoch": 0.2585955874118042, + "grad_norm": 1.2254605293273926, + "learning_rate": 2.3075000000000002e-05, + "loss": 0.4455, + "step": 4618 + }, + { + "epoch": 0.25865158472393324, + "grad_norm": 1.271575927734375, + "learning_rate": 2.3080000000000003e-05, + "loss": 0.4599, + "step": 4619 + }, + { + "epoch": 0.25870758203606226, + "grad_norm": 1.0416302680969238, + 
"learning_rate": 2.3085e-05, + "loss": 0.4125, + "step": 4620 + }, + { + "epoch": 0.2587635793481913, + "grad_norm": 1.469797134399414, + "learning_rate": 2.309e-05, + "loss": 0.4717, + "step": 4621 + }, + { + "epoch": 0.2588195766603203, + "grad_norm": 1.3082259893417358, + "learning_rate": 2.3095e-05, + "loss": 0.4036, + "step": 4622 + }, + { + "epoch": 0.2588755739724493, + "grad_norm": 1.3652042150497437, + "learning_rate": 2.3100000000000002e-05, + "loss": 0.5413, + "step": 4623 + }, + { + "epoch": 0.25893157128457833, + "grad_norm": 1.3426355123519897, + "learning_rate": 2.3105000000000003e-05, + "loss": 0.5378, + "step": 4624 + }, + { + "epoch": 0.25898756859670735, + "grad_norm": 1.111953854560852, + "learning_rate": 2.311e-05, + "loss": 0.2973, + "step": 4625 + }, + { + "epoch": 0.25904356590883637, + "grad_norm": 1.1020268201828003, + "learning_rate": 2.3115e-05, + "loss": 0.4528, + "step": 4626 + }, + { + "epoch": 0.2590995632209654, + "grad_norm": 1.0999373197555542, + "learning_rate": 2.312e-05, + "loss": 0.3916, + "step": 4627 + }, + { + "epoch": 0.2591555605330944, + "grad_norm": 1.2418850660324097, + "learning_rate": 2.3125000000000003e-05, + "loss": 0.483, + "step": 4628 + }, + { + "epoch": 0.25921155784522343, + "grad_norm": 1.1667850017547607, + "learning_rate": 2.313e-05, + "loss": 0.4024, + "step": 4629 + }, + { + "epoch": 0.25926755515735245, + "grad_norm": 1.1083531379699707, + "learning_rate": 2.3135e-05, + "loss": 0.4499, + "step": 4630 + }, + { + "epoch": 0.25932355246948147, + "grad_norm": 1.260475516319275, + "learning_rate": 2.3140000000000002e-05, + "loss": 0.5076, + "step": 4631 + }, + { + "epoch": 0.2593795497816105, + "grad_norm": 1.1606007814407349, + "learning_rate": 2.3145e-05, + "loss": 0.4015, + "step": 4632 + }, + { + "epoch": 0.2594355470937395, + "grad_norm": 1.4891753196716309, + "learning_rate": 2.3150000000000004e-05, + "loss": 0.5287, + "step": 4633 + }, + { + "epoch": 0.2594915444058685, + "grad_norm": 
1.2093613147735596, + "learning_rate": 2.3155e-05, + "loss": 0.5178, + "step": 4634 + }, + { + "epoch": 0.25954754171799754, + "grad_norm": 1.1132348775863647, + "learning_rate": 2.3160000000000002e-05, + "loss": 0.5762, + "step": 4635 + }, + { + "epoch": 0.25960353903012656, + "grad_norm": 1.8130476474761963, + "learning_rate": 2.3165e-05, + "loss": 0.553, + "step": 4636 + }, + { + "epoch": 0.2596595363422556, + "grad_norm": 1.2123457193374634, + "learning_rate": 2.317e-05, + "loss": 0.6021, + "step": 4637 + }, + { + "epoch": 0.2597155336543846, + "grad_norm": 1.457024335861206, + "learning_rate": 2.3175e-05, + "loss": 0.5136, + "step": 4638 + }, + { + "epoch": 0.2597715309665136, + "grad_norm": 1.319503664970398, + "learning_rate": 2.318e-05, + "loss": 0.4162, + "step": 4639 + }, + { + "epoch": 0.25982752827864264, + "grad_norm": 1.2477689981460571, + "learning_rate": 2.3185000000000002e-05, + "loss": 0.4707, + "step": 4640 + }, + { + "epoch": 0.25988352559077166, + "grad_norm": 1.1592180728912354, + "learning_rate": 2.319e-05, + "loss": 0.2992, + "step": 4641 + }, + { + "epoch": 0.2599395229029007, + "grad_norm": 1.3189729452133179, + "learning_rate": 2.3195e-05, + "loss": 0.52, + "step": 4642 + }, + { + "epoch": 0.2599955202150297, + "grad_norm": 1.237096905708313, + "learning_rate": 2.32e-05, + "loss": 0.6822, + "step": 4643 + }, + { + "epoch": 0.2600515175271587, + "grad_norm": 1.4841668605804443, + "learning_rate": 2.3205000000000002e-05, + "loss": 0.4308, + "step": 4644 + }, + { + "epoch": 0.26010751483928773, + "grad_norm": 1.2012847661972046, + "learning_rate": 2.321e-05, + "loss": 0.5243, + "step": 4645 + }, + { + "epoch": 0.26016351215141675, + "grad_norm": 1.1921275854110718, + "learning_rate": 2.3215e-05, + "loss": 0.4918, + "step": 4646 + }, + { + "epoch": 0.2602195094635458, + "grad_norm": 1.1845691204071045, + "learning_rate": 2.322e-05, + "loss": 0.384, + "step": 4647 + }, + { + "epoch": 0.2602755067756748, + "grad_norm": 0.9477851986885071, + 
"learning_rate": 2.3225000000000002e-05, + "loss": 0.3095, + "step": 4648 + }, + { + "epoch": 0.2603315040878038, + "grad_norm": 1.0472500324249268, + "learning_rate": 2.3230000000000003e-05, + "loss": 0.349, + "step": 4649 + }, + { + "epoch": 0.26038750139993283, + "grad_norm": 1.1473208665847778, + "learning_rate": 2.3235e-05, + "loss": 0.3444, + "step": 4650 + }, + { + "epoch": 0.2604434987120618, + "grad_norm": 1.0765466690063477, + "learning_rate": 2.324e-05, + "loss": 0.356, + "step": 4651 + }, + { + "epoch": 0.2604994960241908, + "grad_norm": 1.3327019214630127, + "learning_rate": 2.3245e-05, + "loss": 0.4309, + "step": 4652 + }, + { + "epoch": 0.26055549333631983, + "grad_norm": 1.0411831140518188, + "learning_rate": 2.3250000000000003e-05, + "loss": 0.4155, + "step": 4653 + }, + { + "epoch": 0.26061149064844885, + "grad_norm": 1.1315257549285889, + "learning_rate": 2.3255e-05, + "loss": 0.3979, + "step": 4654 + }, + { + "epoch": 0.26066748796057787, + "grad_norm": 1.1296149492263794, + "learning_rate": 2.326e-05, + "loss": 0.3826, + "step": 4655 + }, + { + "epoch": 0.2607234852727069, + "grad_norm": 1.3033127784729004, + "learning_rate": 2.3265000000000002e-05, + "loss": 0.4008, + "step": 4656 + }, + { + "epoch": 0.2607794825848359, + "grad_norm": 1.210599660873413, + "learning_rate": 2.327e-05, + "loss": 0.5096, + "step": 4657 + }, + { + "epoch": 0.26083547989696493, + "grad_norm": 1.4044979810714722, + "learning_rate": 2.3275000000000003e-05, + "loss": 0.4934, + "step": 4658 + }, + { + "epoch": 0.26089147720909395, + "grad_norm": 4.5497307777404785, + "learning_rate": 2.328e-05, + "loss": 0.3402, + "step": 4659 + }, + { + "epoch": 0.26094747452122297, + "grad_norm": 1.3142727613449097, + "learning_rate": 2.3285e-05, + "loss": 0.397, + "step": 4660 + }, + { + "epoch": 0.261003471833352, + "grad_norm": 1.4795187711715698, + "learning_rate": 2.3290000000000002e-05, + "loss": 0.4657, + "step": 4661 + }, + { + "epoch": 0.261059469145481, + "grad_norm": 
1.0734316110610962, + "learning_rate": 2.3295e-05, + "loss": 0.4536, + "step": 4662 + }, + { + "epoch": 0.26111546645761, + "grad_norm": 1.2687352895736694, + "learning_rate": 2.3300000000000004e-05, + "loss": 0.4132, + "step": 4663 + }, + { + "epoch": 0.26117146376973904, + "grad_norm": 1.1907485723495483, + "learning_rate": 2.3305e-05, + "loss": 0.5572, + "step": 4664 + }, + { + "epoch": 0.26122746108186806, + "grad_norm": 1.0302767753601074, + "learning_rate": 2.3310000000000002e-05, + "loss": 0.3827, + "step": 4665 + }, + { + "epoch": 0.2612834583939971, + "grad_norm": 2.662245750427246, + "learning_rate": 2.3315e-05, + "loss": 0.3729, + "step": 4666 + }, + { + "epoch": 0.2613394557061261, + "grad_norm": 1.0700167417526245, + "learning_rate": 2.332e-05, + "loss": 0.5333, + "step": 4667 + }, + { + "epoch": 0.2613954530182551, + "grad_norm": 1.3027064800262451, + "learning_rate": 2.3325e-05, + "loss": 0.449, + "step": 4668 + }, + { + "epoch": 0.26145145033038414, + "grad_norm": 1.1364097595214844, + "learning_rate": 2.3330000000000002e-05, + "loss": 0.3691, + "step": 4669 + }, + { + "epoch": 0.26150744764251316, + "grad_norm": 1.3689920902252197, + "learning_rate": 2.3335000000000003e-05, + "loss": 0.5063, + "step": 4670 + }, + { + "epoch": 0.2615634449546422, + "grad_norm": 1.2601542472839355, + "learning_rate": 2.334e-05, + "loss": 0.48, + "step": 4671 + }, + { + "epoch": 0.2616194422667712, + "grad_norm": 1.0073635578155518, + "learning_rate": 2.3345e-05, + "loss": 0.3379, + "step": 4672 + }, + { + "epoch": 0.2616754395789002, + "grad_norm": 1.1468099355697632, + "learning_rate": 2.3350000000000002e-05, + "loss": 0.3992, + "step": 4673 + }, + { + "epoch": 0.26173143689102923, + "grad_norm": 1.1332155466079712, + "learning_rate": 2.3355000000000003e-05, + "loss": 0.4549, + "step": 4674 + }, + { + "epoch": 0.26178743420315825, + "grad_norm": 4.049678325653076, + "learning_rate": 2.336e-05, + "loss": 0.4229, + "step": 4675 + }, + { + "epoch": 0.2618434315152873, 
+ "grad_norm": 0.9749440550804138, + "learning_rate": 2.3365e-05, + "loss": 0.3084, + "step": 4676 + }, + { + "epoch": 0.2618994288274163, + "grad_norm": 1.0869070291519165, + "learning_rate": 2.337e-05, + "loss": 0.4879, + "step": 4677 + }, + { + "epoch": 0.2619554261395453, + "grad_norm": 1.350502848625183, + "learning_rate": 2.3375000000000002e-05, + "loss": 0.483, + "step": 4678 + }, + { + "epoch": 0.26201142345167433, + "grad_norm": 1.2524774074554443, + "learning_rate": 2.3380000000000003e-05, + "loss": 0.4149, + "step": 4679 + }, + { + "epoch": 0.26206742076380335, + "grad_norm": 1.0579578876495361, + "learning_rate": 2.3385e-05, + "loss": 0.3859, + "step": 4680 + }, + { + "epoch": 0.26212341807593237, + "grad_norm": 1.2938880920410156, + "learning_rate": 2.339e-05, + "loss": 0.4728, + "step": 4681 + }, + { + "epoch": 0.2621794153880614, + "grad_norm": 1.2401838302612305, + "learning_rate": 2.3395e-05, + "loss": 0.4062, + "step": 4682 + }, + { + "epoch": 0.2622354127001904, + "grad_norm": 1.2220298051834106, + "learning_rate": 2.3400000000000003e-05, + "loss": 0.3831, + "step": 4683 + }, + { + "epoch": 0.2622914100123194, + "grad_norm": 1.1159085035324097, + "learning_rate": 2.3405e-05, + "loss": 0.4694, + "step": 4684 + }, + { + "epoch": 0.26234740732444845, + "grad_norm": 1.4172265529632568, + "learning_rate": 2.341e-05, + "loss": 0.376, + "step": 4685 + }, + { + "epoch": 0.26240340463657746, + "grad_norm": 1.4832614660263062, + "learning_rate": 2.3415000000000002e-05, + "loss": 0.4544, + "step": 4686 + }, + { + "epoch": 0.2624594019487065, + "grad_norm": 1.2928845882415771, + "learning_rate": 2.342e-05, + "loss": 0.5758, + "step": 4687 + }, + { + "epoch": 0.2625153992608355, + "grad_norm": 1.3649362325668335, + "learning_rate": 2.3425000000000004e-05, + "loss": 0.435, + "step": 4688 + }, + { + "epoch": 0.2625713965729645, + "grad_norm": 1.32871413230896, + "learning_rate": 2.343e-05, + "loss": 0.4684, + "step": 4689 + }, + { + "epoch": 
0.26262739388509354, + "grad_norm": 1.632665753364563, + "learning_rate": 2.3435000000000002e-05, + "loss": 0.5861, + "step": 4690 + }, + { + "epoch": 0.26268339119722256, + "grad_norm": 1.1872894763946533, + "learning_rate": 2.344e-05, + "loss": 0.4616, + "step": 4691 + }, + { + "epoch": 0.2627393885093515, + "grad_norm": 1.101766586303711, + "learning_rate": 2.3445e-05, + "loss": 0.3728, + "step": 4692 + }, + { + "epoch": 0.26279538582148054, + "grad_norm": 1.2853742837905884, + "learning_rate": 2.345e-05, + "loss": 0.4049, + "step": 4693 + }, + { + "epoch": 0.26285138313360956, + "grad_norm": 1.1566184759140015, + "learning_rate": 2.3455e-05, + "loss": 0.3942, + "step": 4694 + }, + { + "epoch": 0.2629073804457386, + "grad_norm": 1.2635637521743774, + "learning_rate": 2.3460000000000002e-05, + "loss": 0.5816, + "step": 4695 + }, + { + "epoch": 0.2629633777578676, + "grad_norm": 1.4154095649719238, + "learning_rate": 2.3465e-05, + "loss": 0.4768, + "step": 4696 + }, + { + "epoch": 0.2630193750699966, + "grad_norm": 1.0704032182693481, + "learning_rate": 2.347e-05, + "loss": 0.3733, + "step": 4697 + }, + { + "epoch": 0.26307537238212564, + "grad_norm": 1.040293574333191, + "learning_rate": 2.3475e-05, + "loss": 0.4653, + "step": 4698 + }, + { + "epoch": 0.26313136969425466, + "grad_norm": 1.2215741872787476, + "learning_rate": 2.3480000000000002e-05, + "loss": 0.3926, + "step": 4699 + }, + { + "epoch": 0.2631873670063837, + "grad_norm": 1.3420178890228271, + "learning_rate": 2.3485000000000003e-05, + "loss": 0.5665, + "step": 4700 + }, + { + "epoch": 0.2632433643185127, + "grad_norm": 1.307134985923767, + "learning_rate": 2.349e-05, + "loss": 0.3772, + "step": 4701 + }, + { + "epoch": 0.2632993616306417, + "grad_norm": 1.4651503562927246, + "learning_rate": 2.3495e-05, + "loss": 0.5187, + "step": 4702 + }, + { + "epoch": 0.26335535894277073, + "grad_norm": 1.3955353498458862, + "learning_rate": 2.35e-05, + "loss": 0.3472, + "step": 4703 + }, + { + "epoch": 
0.26341135625489975, + "grad_norm": 1.3448361158370972, + "learning_rate": 2.3505000000000003e-05, + "loss": 0.5255, + "step": 4704 + }, + { + "epoch": 0.2634673535670288, + "grad_norm": 1.1564198732376099, + "learning_rate": 2.351e-05, + "loss": 0.4308, + "step": 4705 + }, + { + "epoch": 0.2635233508791578, + "grad_norm": 1.274255633354187, + "learning_rate": 2.3515e-05, + "loss": 0.4997, + "step": 4706 + }, + { + "epoch": 0.2635793481912868, + "grad_norm": 1.2160266637802124, + "learning_rate": 2.3520000000000002e-05, + "loss": 0.4688, + "step": 4707 + }, + { + "epoch": 0.26363534550341583, + "grad_norm": 1.0255680084228516, + "learning_rate": 2.3525e-05, + "loss": 0.291, + "step": 4708 + }, + { + "epoch": 0.26369134281554485, + "grad_norm": 1.1800979375839233, + "learning_rate": 2.3530000000000003e-05, + "loss": 0.4176, + "step": 4709 + }, + { + "epoch": 0.26374734012767387, + "grad_norm": 1.404802918434143, + "learning_rate": 2.3535e-05, + "loss": 0.4543, + "step": 4710 + }, + { + "epoch": 0.2638033374398029, + "grad_norm": 3.526089668273926, + "learning_rate": 2.354e-05, + "loss": 0.4589, + "step": 4711 + }, + { + "epoch": 0.2638593347519319, + "grad_norm": 1.169753909111023, + "learning_rate": 2.3545e-05, + "loss": 0.3912, + "step": 4712 + }, + { + "epoch": 0.2639153320640609, + "grad_norm": 1.3709254264831543, + "learning_rate": 2.355e-05, + "loss": 0.5337, + "step": 4713 + }, + { + "epoch": 0.26397132937618994, + "grad_norm": 1.6002558469772339, + "learning_rate": 2.3555e-05, + "loss": 0.3886, + "step": 4714 + }, + { + "epoch": 0.26402732668831896, + "grad_norm": 1.47130286693573, + "learning_rate": 2.356e-05, + "loss": 0.4143, + "step": 4715 + }, + { + "epoch": 0.264083324000448, + "grad_norm": 1.2650200128555298, + "learning_rate": 2.3565000000000002e-05, + "loss": 0.4887, + "step": 4716 + }, + { + "epoch": 0.264139321312577, + "grad_norm": 1.438278317451477, + "learning_rate": 2.357e-05, + "loss": 0.5166, + "step": 4717 + }, + { + "epoch": 
0.264195318624706, + "grad_norm": 1.1774498224258423, + "learning_rate": 2.3575e-05, + "loss": 0.3873, + "step": 4718 + }, + { + "epoch": 0.26425131593683504, + "grad_norm": 1.2495230436325073, + "learning_rate": 2.358e-05, + "loss": 0.4097, + "step": 4719 + }, + { + "epoch": 0.26430731324896406, + "grad_norm": 1.2106897830963135, + "learning_rate": 2.3585000000000002e-05, + "loss": 0.4468, + "step": 4720 + }, + { + "epoch": 0.2643633105610931, + "grad_norm": 1.0755109786987305, + "learning_rate": 2.359e-05, + "loss": 0.4026, + "step": 4721 + }, + { + "epoch": 0.2644193078732221, + "grad_norm": 1.3256511688232422, + "learning_rate": 2.3595e-05, + "loss": 0.4691, + "step": 4722 + }, + { + "epoch": 0.2644753051853511, + "grad_norm": 1.1157100200653076, + "learning_rate": 2.36e-05, + "loss": 0.4084, + "step": 4723 + }, + { + "epoch": 0.26453130249748014, + "grad_norm": 1.227482795715332, + "learning_rate": 2.3605000000000002e-05, + "loss": 0.463, + "step": 4724 + }, + { + "epoch": 0.26458729980960916, + "grad_norm": 1.2092044353485107, + "learning_rate": 2.3610000000000003e-05, + "loss": 0.4472, + "step": 4725 + }, + { + "epoch": 0.2646432971217382, + "grad_norm": 1.1431828737258911, + "learning_rate": 2.3615e-05, + "loss": 0.3422, + "step": 4726 + }, + { + "epoch": 0.2646992944338672, + "grad_norm": 1.1278163194656372, + "learning_rate": 2.362e-05, + "loss": 0.3907, + "step": 4727 + }, + { + "epoch": 0.2647552917459962, + "grad_norm": 2.944944381713867, + "learning_rate": 2.3624999999999998e-05, + "loss": 0.3977, + "step": 4728 + }, + { + "epoch": 0.26481128905812523, + "grad_norm": 1.0838154554367065, + "learning_rate": 2.3630000000000002e-05, + "loss": 0.4222, + "step": 4729 + }, + { + "epoch": 0.26486728637025425, + "grad_norm": 1.3145902156829834, + "learning_rate": 2.3635000000000003e-05, + "loss": 0.4228, + "step": 4730 + }, + { + "epoch": 0.26492328368238327, + "grad_norm": 1.237244725227356, + "learning_rate": 2.364e-05, + "loss": 0.496, + "step": 4731 + }, + 
{ + "epoch": 0.2649792809945123, + "grad_norm": 1.3676848411560059, + "learning_rate": 2.3645e-05, + "loss": 0.4149, + "step": 4732 + }, + { + "epoch": 0.2650352783066413, + "grad_norm": 1.361025333404541, + "learning_rate": 2.365e-05, + "loss": 0.3989, + "step": 4733 + }, + { + "epoch": 0.2650912756187703, + "grad_norm": 1.2569867372512817, + "learning_rate": 2.3655000000000003e-05, + "loss": 0.4032, + "step": 4734 + }, + { + "epoch": 0.2651472729308993, + "grad_norm": 1.121619701385498, + "learning_rate": 2.366e-05, + "loss": 0.4134, + "step": 4735 + }, + { + "epoch": 0.2652032702430283, + "grad_norm": 1.3902980089187622, + "learning_rate": 2.3665e-05, + "loss": 0.4361, + "step": 4736 + }, + { + "epoch": 0.26525926755515733, + "grad_norm": 1.2956088781356812, + "learning_rate": 2.3670000000000002e-05, + "loss": 0.5531, + "step": 4737 + }, + { + "epoch": 0.26531526486728635, + "grad_norm": 1.6822242736816406, + "learning_rate": 2.3675e-05, + "loss": 0.5115, + "step": 4738 + }, + { + "epoch": 0.26537126217941537, + "grad_norm": 1.040383219718933, + "learning_rate": 2.3680000000000004e-05, + "loss": 0.4566, + "step": 4739 + }, + { + "epoch": 0.2654272594915444, + "grad_norm": 1.2056008577346802, + "learning_rate": 2.3685e-05, + "loss": 0.3719, + "step": 4740 + }, + { + "epoch": 0.2654832568036734, + "grad_norm": 1.303937554359436, + "learning_rate": 2.3690000000000002e-05, + "loss": 0.4788, + "step": 4741 + }, + { + "epoch": 0.2655392541158024, + "grad_norm": 1.337628960609436, + "learning_rate": 2.3695e-05, + "loss": 0.45, + "step": 4742 + }, + { + "epoch": 0.26559525142793144, + "grad_norm": 1.2148423194885254, + "learning_rate": 2.37e-05, + "loss": 0.448, + "step": 4743 + }, + { + "epoch": 0.26565124874006046, + "grad_norm": 1.4554351568222046, + "learning_rate": 2.3705e-05, + "loss": 0.3779, + "step": 4744 + }, + { + "epoch": 0.2657072460521895, + "grad_norm": 1.3510658740997314, + "learning_rate": 2.371e-05, + "loss": 0.4273, + "step": 4745 + }, + { + "epoch": 
0.2657632433643185, + "grad_norm": 1.185594081878662, + "learning_rate": 2.3715000000000002e-05, + "loss": 0.5057, + "step": 4746 + }, + { + "epoch": 0.2658192406764475, + "grad_norm": 1.1551557779312134, + "learning_rate": 2.372e-05, + "loss": 0.3967, + "step": 4747 + }, + { + "epoch": 0.26587523798857654, + "grad_norm": 1.1626313924789429, + "learning_rate": 2.3725e-05, + "loss": 0.423, + "step": 4748 + }, + { + "epoch": 0.26593123530070556, + "grad_norm": 1.1862657070159912, + "learning_rate": 2.373e-05, + "loss": 0.4447, + "step": 4749 + }, + { + "epoch": 0.2659872326128346, + "grad_norm": 0.9451472163200378, + "learning_rate": 2.3735000000000002e-05, + "loss": 0.3167, + "step": 4750 + }, + { + "epoch": 0.2660432299249636, + "grad_norm": 1.0458323955535889, + "learning_rate": 2.374e-05, + "loss": 0.5032, + "step": 4751 + }, + { + "epoch": 0.2660992272370926, + "grad_norm": 1.0098243951797485, + "learning_rate": 2.3745e-05, + "loss": 0.399, + "step": 4752 + }, + { + "epoch": 0.26615522454922164, + "grad_norm": 1.2983150482177734, + "learning_rate": 2.375e-05, + "loss": 0.4968, + "step": 4753 + }, + { + "epoch": 0.26621122186135066, + "grad_norm": 1.238157868385315, + "learning_rate": 2.3755000000000002e-05, + "loss": 0.4248, + "step": 4754 + }, + { + "epoch": 0.2662672191734797, + "grad_norm": 1.2264575958251953, + "learning_rate": 2.3760000000000003e-05, + "loss": 0.4707, + "step": 4755 + }, + { + "epoch": 0.2663232164856087, + "grad_norm": 1.0791478157043457, + "learning_rate": 2.3765e-05, + "loss": 0.3632, + "step": 4756 + }, + { + "epoch": 0.2663792137977377, + "grad_norm": 1.2311959266662598, + "learning_rate": 2.377e-05, + "loss": 0.4551, + "step": 4757 + }, + { + "epoch": 0.26643521110986673, + "grad_norm": 1.0826047658920288, + "learning_rate": 2.3775e-05, + "loss": 0.3833, + "step": 4758 + }, + { + "epoch": 0.26649120842199575, + "grad_norm": 1.1972943544387817, + "learning_rate": 2.3780000000000003e-05, + "loss": 0.4986, + "step": 4759 + }, + { + 
"epoch": 0.26654720573412477, + "grad_norm": 1.5173351764678955, + "learning_rate": 2.3785e-05, + "loss": 0.4581, + "step": 4760 + }, + { + "epoch": 0.2666032030462538, + "grad_norm": 1.081595540046692, + "learning_rate": 2.379e-05, + "loss": 0.4176, + "step": 4761 + }, + { + "epoch": 0.2666592003583828, + "grad_norm": 1.485235333442688, + "learning_rate": 2.3795000000000002e-05, + "loss": 0.4463, + "step": 4762 + }, + { + "epoch": 0.2667151976705118, + "grad_norm": 1.043321132659912, + "learning_rate": 2.38e-05, + "loss": 0.3156, + "step": 4763 + }, + { + "epoch": 0.26677119498264085, + "grad_norm": 1.2274316549301147, + "learning_rate": 2.3805000000000003e-05, + "loss": 0.6176, + "step": 4764 + }, + { + "epoch": 0.26682719229476987, + "grad_norm": 1.2730541229248047, + "learning_rate": 2.381e-05, + "loss": 0.4126, + "step": 4765 + }, + { + "epoch": 0.2668831896068989, + "grad_norm": 1.2899019718170166, + "learning_rate": 2.3815e-05, + "loss": 0.3958, + "step": 4766 + }, + { + "epoch": 0.2669391869190279, + "grad_norm": 1.1440935134887695, + "learning_rate": 2.3820000000000002e-05, + "loss": 0.3716, + "step": 4767 + }, + { + "epoch": 0.2669951842311569, + "grad_norm": 1.1878410577774048, + "learning_rate": 2.3825e-05, + "loss": 0.4676, + "step": 4768 + }, + { + "epoch": 0.26705118154328594, + "grad_norm": 1.21403169631958, + "learning_rate": 2.3830000000000004e-05, + "loss": 0.4589, + "step": 4769 + }, + { + "epoch": 0.26710717885541496, + "grad_norm": 1.1536064147949219, + "learning_rate": 2.3835e-05, + "loss": 0.4081, + "step": 4770 + }, + { + "epoch": 0.267163176167544, + "grad_norm": 1.207689642906189, + "learning_rate": 2.3840000000000002e-05, + "loss": 0.6582, + "step": 4771 + }, + { + "epoch": 0.267219173479673, + "grad_norm": 1.1608504056930542, + "learning_rate": 2.3845e-05, + "loss": 0.4031, + "step": 4772 + }, + { + "epoch": 0.267275170791802, + "grad_norm": 1.3046009540557861, + "learning_rate": 2.385e-05, + "loss": 0.4721, + "step": 4773 + }, + { + 
"epoch": 0.26733116810393104, + "grad_norm": 1.1998666524887085, + "learning_rate": 2.3855e-05, + "loss": 0.3622, + "step": 4774 + }, + { + "epoch": 0.26738716541606, + "grad_norm": 6.5772223472595215, + "learning_rate": 2.3860000000000002e-05, + "loss": 0.4529, + "step": 4775 + }, + { + "epoch": 0.267443162728189, + "grad_norm": 1.2209452390670776, + "learning_rate": 2.3865000000000003e-05, + "loss": 0.3514, + "step": 4776 + }, + { + "epoch": 0.26749916004031804, + "grad_norm": 1.2021710872650146, + "learning_rate": 2.387e-05, + "loss": 0.4096, + "step": 4777 + }, + { + "epoch": 0.26755515735244706, + "grad_norm": 1.1438333988189697, + "learning_rate": 2.3875e-05, + "loss": 0.3843, + "step": 4778 + }, + { + "epoch": 0.2676111546645761, + "grad_norm": 1.3314814567565918, + "learning_rate": 2.3880000000000002e-05, + "loss": 0.4809, + "step": 4779 + }, + { + "epoch": 0.2676671519767051, + "grad_norm": 1.1786727905273438, + "learning_rate": 2.3885000000000003e-05, + "loss": 0.3968, + "step": 4780 + }, + { + "epoch": 0.2677231492888341, + "grad_norm": 1.0819017887115479, + "learning_rate": 2.389e-05, + "loss": 0.3484, + "step": 4781 + }, + { + "epoch": 0.26777914660096314, + "grad_norm": 1.17559015750885, + "learning_rate": 2.3895e-05, + "loss": 0.4232, + "step": 4782 + }, + { + "epoch": 0.26783514391309216, + "grad_norm": 1.4684512615203857, + "learning_rate": 2.39e-05, + "loss": 0.4547, + "step": 4783 + }, + { + "epoch": 0.2678911412252212, + "grad_norm": 1.2338345050811768, + "learning_rate": 2.3905000000000002e-05, + "loss": 0.4311, + "step": 4784 + }, + { + "epoch": 0.2679471385373502, + "grad_norm": 1.3646914958953857, + "learning_rate": 2.3910000000000003e-05, + "loss": 0.5086, + "step": 4785 + }, + { + "epoch": 0.2680031358494792, + "grad_norm": 1.2001941204071045, + "learning_rate": 2.3915e-05, + "loss": 0.385, + "step": 4786 + }, + { + "epoch": 0.26805913316160823, + "grad_norm": 1.1692379713058472, + "learning_rate": 2.392e-05, + "loss": 0.6629, + "step": 
4787 + }, + { + "epoch": 0.26811513047373725, + "grad_norm": 1.214898943901062, + "learning_rate": 2.3925e-05, + "loss": 0.415, + "step": 4788 + }, + { + "epoch": 0.26817112778586627, + "grad_norm": 1.2813019752502441, + "learning_rate": 2.3930000000000003e-05, + "loss": 0.4171, + "step": 4789 + }, + { + "epoch": 0.2682271250979953, + "grad_norm": 1.1769672632217407, + "learning_rate": 2.3935e-05, + "loss": 0.3184, + "step": 4790 + }, + { + "epoch": 0.2682831224101243, + "grad_norm": 1.01447594165802, + "learning_rate": 2.394e-05, + "loss": 0.3295, + "step": 4791 + }, + { + "epoch": 0.2683391197222533, + "grad_norm": 1.1792190074920654, + "learning_rate": 2.3945000000000002e-05, + "loss": 0.3586, + "step": 4792 + }, + { + "epoch": 0.26839511703438235, + "grad_norm": 1.2017855644226074, + "learning_rate": 2.395e-05, + "loss": 0.3741, + "step": 4793 + }, + { + "epoch": 0.26845111434651137, + "grad_norm": 1.1356403827667236, + "learning_rate": 2.3955000000000004e-05, + "loss": 0.5755, + "step": 4794 + }, + { + "epoch": 0.2685071116586404, + "grad_norm": 1.1810275316238403, + "learning_rate": 2.396e-05, + "loss": 0.5044, + "step": 4795 + }, + { + "epoch": 0.2685631089707694, + "grad_norm": 1.492426872253418, + "learning_rate": 2.3965000000000002e-05, + "loss": 0.6112, + "step": 4796 + }, + { + "epoch": 0.2686191062828984, + "grad_norm": 1.587020993232727, + "learning_rate": 2.397e-05, + "loss": 0.4583, + "step": 4797 + }, + { + "epoch": 0.26867510359502744, + "grad_norm": 1.438361644744873, + "learning_rate": 2.3975e-05, + "loss": 0.4967, + "step": 4798 + }, + { + "epoch": 0.26873110090715646, + "grad_norm": 1.1004462242126465, + "learning_rate": 2.398e-05, + "loss": 0.5319, + "step": 4799 + }, + { + "epoch": 0.2687870982192855, + "grad_norm": 1.253659963607788, + "learning_rate": 2.3985e-05, + "loss": 0.4341, + "step": 4800 + }, + { + "epoch": 0.2688430955314145, + "grad_norm": 1.118528962135315, + "learning_rate": 2.3990000000000002e-05, + "loss": 0.3381, + "step": 
4801 + }, + { + "epoch": 0.2688990928435435, + "grad_norm": 1.5777441263198853, + "learning_rate": 2.3995e-05, + "loss": 0.3385, + "step": 4802 + }, + { + "epoch": 0.26895509015567254, + "grad_norm": 1.1764894723892212, + "learning_rate": 2.4e-05, + "loss": 0.4373, + "step": 4803 + }, + { + "epoch": 0.26901108746780156, + "grad_norm": 1.0975189208984375, + "learning_rate": 2.4005e-05, + "loss": 0.3497, + "step": 4804 + }, + { + "epoch": 0.2690670847799306, + "grad_norm": 1.1311278343200684, + "learning_rate": 2.4010000000000002e-05, + "loss": 0.3729, + "step": 4805 + }, + { + "epoch": 0.2691230820920596, + "grad_norm": 1.2189937829971313, + "learning_rate": 2.4015000000000003e-05, + "loss": 0.5284, + "step": 4806 + }, + { + "epoch": 0.2691790794041886, + "grad_norm": 2.796288251876831, + "learning_rate": 2.402e-05, + "loss": 0.3669, + "step": 4807 + }, + { + "epoch": 0.26923507671631763, + "grad_norm": 1.243399977684021, + "learning_rate": 2.4025e-05, + "loss": 0.4037, + "step": 4808 + }, + { + "epoch": 0.26929107402844665, + "grad_norm": 1.410094976425171, + "learning_rate": 2.4030000000000002e-05, + "loss": 0.5426, + "step": 4809 + }, + { + "epoch": 0.26934707134057567, + "grad_norm": 1.1329196691513062, + "learning_rate": 2.4035000000000003e-05, + "loss": 0.3994, + "step": 4810 + }, + { + "epoch": 0.2694030686527047, + "grad_norm": 1.3636788129806519, + "learning_rate": 2.404e-05, + "loss": 0.4161, + "step": 4811 + }, + { + "epoch": 0.2694590659648337, + "grad_norm": 1.0871763229370117, + "learning_rate": 2.4045e-05, + "loss": 0.3582, + "step": 4812 + }, + { + "epoch": 0.26951506327696273, + "grad_norm": 1.1364787817001343, + "learning_rate": 2.4050000000000002e-05, + "loss": 0.4485, + "step": 4813 + }, + { + "epoch": 0.26957106058909175, + "grad_norm": 1.3407330513000488, + "learning_rate": 2.4055000000000003e-05, + "loss": 0.4873, + "step": 4814 + }, + { + "epoch": 0.26962705790122077, + "grad_norm": 1.1668158769607544, + "learning_rate": 
2.4060000000000003e-05, + "loss": 0.6243, + "step": 4815 + }, + { + "epoch": 0.26968305521334973, + "grad_norm": 1.3728643655776978, + "learning_rate": 2.4065e-05, + "loss": 0.4324, + "step": 4816 + }, + { + "epoch": 0.26973905252547875, + "grad_norm": 1.2123281955718994, + "learning_rate": 2.407e-05, + "loss": 0.4673, + "step": 4817 + }, + { + "epoch": 0.26979504983760777, + "grad_norm": 1.2647933959960938, + "learning_rate": 2.4075e-05, + "loss": 0.355, + "step": 4818 + }, + { + "epoch": 0.2698510471497368, + "grad_norm": 1.1391644477844238, + "learning_rate": 2.408e-05, + "loss": 0.4842, + "step": 4819 + }, + { + "epoch": 0.2699070444618658, + "grad_norm": 1.1071332693099976, + "learning_rate": 2.4085e-05, + "loss": 0.3956, + "step": 4820 + }, + { + "epoch": 0.2699630417739948, + "grad_norm": 1.0850937366485596, + "learning_rate": 2.409e-05, + "loss": 0.4921, + "step": 4821 + }, + { + "epoch": 0.27001903908612385, + "grad_norm": 1.157414197921753, + "learning_rate": 2.4095000000000002e-05, + "loss": 0.4969, + "step": 4822 + }, + { + "epoch": 0.27007503639825287, + "grad_norm": 1.525781273841858, + "learning_rate": 2.41e-05, + "loss": 0.4886, + "step": 4823 + }, + { + "epoch": 0.2701310337103819, + "grad_norm": 1.0770370960235596, + "learning_rate": 2.4105e-05, + "loss": 0.4066, + "step": 4824 + }, + { + "epoch": 0.2701870310225109, + "grad_norm": 1.1031999588012695, + "learning_rate": 2.411e-05, + "loss": 0.3887, + "step": 4825 + }, + { + "epoch": 0.2702430283346399, + "grad_norm": 1.0829763412475586, + "learning_rate": 2.4115000000000002e-05, + "loss": 0.3964, + "step": 4826 + }, + { + "epoch": 0.27029902564676894, + "grad_norm": 1.165955901145935, + "learning_rate": 2.412e-05, + "loss": 0.3823, + "step": 4827 + }, + { + "epoch": 0.27035502295889796, + "grad_norm": 0.955703854560852, + "learning_rate": 2.4125e-05, + "loss": 0.3086, + "step": 4828 + }, + { + "epoch": 0.270411020271027, + "grad_norm": 1.2926037311553955, + "learning_rate": 2.413e-05, + "loss": 
0.4862, + "step": 4829 + }, + { + "epoch": 0.270467017583156, + "grad_norm": 1.4200623035430908, + "learning_rate": 2.4135000000000002e-05, + "loss": 0.591, + "step": 4830 + }, + { + "epoch": 0.270523014895285, + "grad_norm": 1.1878716945648193, + "learning_rate": 2.4140000000000003e-05, + "loss": 0.3831, + "step": 4831 + }, + { + "epoch": 0.27057901220741404, + "grad_norm": 1.1649898290634155, + "learning_rate": 2.4145e-05, + "loss": 0.3933, + "step": 4832 + }, + { + "epoch": 0.27063500951954306, + "grad_norm": 1.396077275276184, + "learning_rate": 2.415e-05, + "loss": 0.3464, + "step": 4833 + }, + { + "epoch": 0.2706910068316721, + "grad_norm": 1.1895679235458374, + "learning_rate": 2.4154999999999998e-05, + "loss": 0.4139, + "step": 4834 + }, + { + "epoch": 0.2707470041438011, + "grad_norm": 1.3763668537139893, + "learning_rate": 2.4160000000000002e-05, + "loss": 0.451, + "step": 4835 + }, + { + "epoch": 0.2708030014559301, + "grad_norm": 1.2594642639160156, + "learning_rate": 2.4165e-05, + "loss": 0.3893, + "step": 4836 + }, + { + "epoch": 0.27085899876805913, + "grad_norm": 1.2071425914764404, + "learning_rate": 2.417e-05, + "loss": 0.4495, + "step": 4837 + }, + { + "epoch": 0.27091499608018815, + "grad_norm": 1.3346881866455078, + "learning_rate": 2.4175e-05, + "loss": 0.5326, + "step": 4838 + }, + { + "epoch": 0.27097099339231717, + "grad_norm": 1.264471173286438, + "learning_rate": 2.418e-05, + "loss": 0.5254, + "step": 4839 + }, + { + "epoch": 0.2710269907044462, + "grad_norm": 1.154289960861206, + "learning_rate": 2.4185000000000003e-05, + "loss": 0.4498, + "step": 4840 + }, + { + "epoch": 0.2710829880165752, + "grad_norm": 1.2578608989715576, + "learning_rate": 2.419e-05, + "loss": 0.3984, + "step": 4841 + }, + { + "epoch": 0.27113898532870423, + "grad_norm": 1.193702220916748, + "learning_rate": 2.4195e-05, + "loss": 0.4636, + "step": 4842 + }, + { + "epoch": 0.27119498264083325, + "grad_norm": 6.047183990478516, + "learning_rate": 
2.4200000000000002e-05, + "loss": 0.4836, + "step": 4843 + }, + { + "epoch": 0.27125097995296227, + "grad_norm": 1.906936764717102, + "learning_rate": 2.4205e-05, + "loss": 0.4673, + "step": 4844 + }, + { + "epoch": 0.2713069772650913, + "grad_norm": 1.150715708732605, + "learning_rate": 2.4210000000000004e-05, + "loss": 0.5078, + "step": 4845 + }, + { + "epoch": 0.2713629745772203, + "grad_norm": 1.1989617347717285, + "learning_rate": 2.4215e-05, + "loss": 0.479, + "step": 4846 + }, + { + "epoch": 0.2714189718893493, + "grad_norm": 1.1006189584732056, + "learning_rate": 2.4220000000000002e-05, + "loss": 0.4624, + "step": 4847 + }, + { + "epoch": 0.27147496920147834, + "grad_norm": 1.0710850954055786, + "learning_rate": 2.4225e-05, + "loss": 0.3288, + "step": 4848 + }, + { + "epoch": 0.27153096651360736, + "grad_norm": 1.242953896522522, + "learning_rate": 2.423e-05, + "loss": 0.4696, + "step": 4849 + }, + { + "epoch": 0.2715869638257364, + "grad_norm": 1.2073276042938232, + "learning_rate": 2.4235e-05, + "loss": 0.4357, + "step": 4850 + }, + { + "epoch": 0.2716429611378654, + "grad_norm": 1.162598967552185, + "learning_rate": 2.4240000000000002e-05, + "loss": 0.4673, + "step": 4851 + }, + { + "epoch": 0.2716989584499944, + "grad_norm": 1.40058171749115, + "learning_rate": 2.4245000000000002e-05, + "loss": 0.5045, + "step": 4852 + }, + { + "epoch": 0.27175495576212344, + "grad_norm": 1.334394931793213, + "learning_rate": 2.425e-05, + "loss": 0.4338, + "step": 4853 + }, + { + "epoch": 0.27181095307425246, + "grad_norm": 1.1808335781097412, + "learning_rate": 2.4255e-05, + "loss": 0.5973, + "step": 4854 + }, + { + "epoch": 0.2718669503863815, + "grad_norm": 1.288703441619873, + "learning_rate": 2.426e-05, + "loss": 0.4282, + "step": 4855 + }, + { + "epoch": 0.2719229476985105, + "grad_norm": 1.2336734533309937, + "learning_rate": 2.4265000000000002e-05, + "loss": 0.6002, + "step": 4856 + }, + { + "epoch": 0.2719789450106395, + "grad_norm": 1.362411618232727, + 
"learning_rate": 2.427e-05, + "loss": 0.4054, + "step": 4857 + }, + { + "epoch": 0.2720349423227685, + "grad_norm": 1.4240525960922241, + "learning_rate": 2.4275e-05, + "loss": 0.5253, + "step": 4858 + }, + { + "epoch": 0.2720909396348975, + "grad_norm": 1.1944591999053955, + "learning_rate": 2.428e-05, + "loss": 0.481, + "step": 4859 + }, + { + "epoch": 0.2721469369470265, + "grad_norm": 1.2200754880905151, + "learning_rate": 2.4285000000000002e-05, + "loss": 0.281, + "step": 4860 + }, + { + "epoch": 0.27220293425915554, + "grad_norm": 1.2856251001358032, + "learning_rate": 2.4290000000000003e-05, + "loss": 0.3931, + "step": 4861 + }, + { + "epoch": 0.27225893157128456, + "grad_norm": 1.1528245210647583, + "learning_rate": 2.4295e-05, + "loss": 0.4851, + "step": 4862 + }, + { + "epoch": 0.2723149288834136, + "grad_norm": 1.2538505792617798, + "learning_rate": 2.43e-05, + "loss": 0.4604, + "step": 4863 + }, + { + "epoch": 0.2723709261955426, + "grad_norm": 1.1997997760772705, + "learning_rate": 2.4305e-05, + "loss": 0.4507, + "step": 4864 + }, + { + "epoch": 0.2724269235076716, + "grad_norm": 1.1665483713150024, + "learning_rate": 2.4310000000000003e-05, + "loss": 0.3933, + "step": 4865 + }, + { + "epoch": 0.27248292081980063, + "grad_norm": 1.152035117149353, + "learning_rate": 2.4315e-05, + "loss": 0.5699, + "step": 4866 + }, + { + "epoch": 0.27253891813192965, + "grad_norm": 1.3405358791351318, + "learning_rate": 2.432e-05, + "loss": 0.525, + "step": 4867 + }, + { + "epoch": 0.27259491544405867, + "grad_norm": 1.1431670188903809, + "learning_rate": 2.4325000000000002e-05, + "loss": 0.3511, + "step": 4868 + }, + { + "epoch": 0.2726509127561877, + "grad_norm": 1.0083286762237549, + "learning_rate": 2.433e-05, + "loss": 0.3615, + "step": 4869 + }, + { + "epoch": 0.2727069100683167, + "grad_norm": 1.4361419677734375, + "learning_rate": 2.4335000000000003e-05, + "loss": 0.3746, + "step": 4870 + }, + { + "epoch": 0.27276290738044573, + "grad_norm": 1.2798997163772583, 
+ "learning_rate": 2.434e-05, + "loss": 0.5562, + "step": 4871 + }, + { + "epoch": 0.27281890469257475, + "grad_norm": 1.5994699001312256, + "learning_rate": 2.4345e-05, + "loss": 0.4758, + "step": 4872 + }, + { + "epoch": 0.27287490200470377, + "grad_norm": 1.4659181833267212, + "learning_rate": 2.435e-05, + "loss": 0.4897, + "step": 4873 + }, + { + "epoch": 0.2729308993168328, + "grad_norm": 1.0324087142944336, + "learning_rate": 2.4355e-05, + "loss": 0.3914, + "step": 4874 + }, + { + "epoch": 0.2729868966289618, + "grad_norm": 1.11372709274292, + "learning_rate": 2.4360000000000004e-05, + "loss": 0.4483, + "step": 4875 + }, + { + "epoch": 0.2730428939410908, + "grad_norm": 1.0878872871398926, + "learning_rate": 2.4365e-05, + "loss": 0.4028, + "step": 4876 + }, + { + "epoch": 0.27309889125321984, + "grad_norm": 1.1511904001235962, + "learning_rate": 2.4370000000000002e-05, + "loss": 0.2844, + "step": 4877 + }, + { + "epoch": 0.27315488856534886, + "grad_norm": 1.0558775663375854, + "learning_rate": 2.4375e-05, + "loss": 0.4916, + "step": 4878 + }, + { + "epoch": 0.2732108858774779, + "grad_norm": 1.0329701900482178, + "learning_rate": 2.438e-05, + "loss": 0.3883, + "step": 4879 + }, + { + "epoch": 0.2732668831896069, + "grad_norm": 1.5741260051727295, + "learning_rate": 2.4385e-05, + "loss": 0.468, + "step": 4880 + }, + { + "epoch": 0.2733228805017359, + "grad_norm": 1.1136219501495361, + "learning_rate": 2.4390000000000002e-05, + "loss": 0.3881, + "step": 4881 + }, + { + "epoch": 0.27337887781386494, + "grad_norm": 1.2174041271209717, + "learning_rate": 2.4395000000000003e-05, + "loss": 0.4176, + "step": 4882 + }, + { + "epoch": 0.27343487512599396, + "grad_norm": 1.137325644493103, + "learning_rate": 2.44e-05, + "loss": 0.4695, + "step": 4883 + }, + { + "epoch": 0.273490872438123, + "grad_norm": 1.1314067840576172, + "learning_rate": 2.4405e-05, + "loss": 0.3367, + "step": 4884 + }, + { + "epoch": 0.273546869750252, + "grad_norm": 1.1522043943405151, + 
"learning_rate": 2.4410000000000002e-05, + "loss": 0.4068, + "step": 4885 + }, + { + "epoch": 0.273602867062381, + "grad_norm": 1.210898995399475, + "learning_rate": 2.4415000000000003e-05, + "loss": 0.5382, + "step": 4886 + }, + { + "epoch": 0.27365886437451004, + "grad_norm": 1.389197826385498, + "learning_rate": 2.442e-05, + "loss": 0.5014, + "step": 4887 + }, + { + "epoch": 0.27371486168663905, + "grad_norm": 1.416251540184021, + "learning_rate": 2.4425e-05, + "loss": 0.5832, + "step": 4888 + }, + { + "epoch": 0.2737708589987681, + "grad_norm": 1.4145478010177612, + "learning_rate": 2.443e-05, + "loss": 0.5117, + "step": 4889 + }, + { + "epoch": 0.2738268563108971, + "grad_norm": 1.226279854774475, + "learning_rate": 2.4435000000000002e-05, + "loss": 0.4941, + "step": 4890 + }, + { + "epoch": 0.2738828536230261, + "grad_norm": 0.9975038766860962, + "learning_rate": 2.4440000000000003e-05, + "loss": 0.3055, + "step": 4891 + }, + { + "epoch": 0.27393885093515513, + "grad_norm": 1.38507080078125, + "learning_rate": 2.4445e-05, + "loss": 0.4014, + "step": 4892 + }, + { + "epoch": 0.27399484824728415, + "grad_norm": 1.3080692291259766, + "learning_rate": 2.445e-05, + "loss": 0.502, + "step": 4893 + }, + { + "epoch": 0.27405084555941317, + "grad_norm": 1.0307468175888062, + "learning_rate": 2.4455e-05, + "loss": 0.3938, + "step": 4894 + }, + { + "epoch": 0.2741068428715422, + "grad_norm": 1.2235243320465088, + "learning_rate": 2.4460000000000003e-05, + "loss": 0.3463, + "step": 4895 + }, + { + "epoch": 0.2741628401836712, + "grad_norm": 1.427044153213501, + "learning_rate": 2.4465e-05, + "loss": 0.5066, + "step": 4896 + }, + { + "epoch": 0.2742188374958002, + "grad_norm": 1.1579972505569458, + "learning_rate": 2.447e-05, + "loss": 0.4708, + "step": 4897 + }, + { + "epoch": 0.27427483480792925, + "grad_norm": 1.1244254112243652, + "learning_rate": 2.4475000000000002e-05, + "loss": 0.3776, + "step": 4898 + }, + { + "epoch": 0.2743308321200582, + "grad_norm": 
1.1676359176635742, + "learning_rate": 2.448e-05, + "loss": 0.4447, + "step": 4899 + }, + { + "epoch": 0.27438682943218723, + "grad_norm": 1.186279058456421, + "learning_rate": 2.4485000000000004e-05, + "loss": 0.5561, + "step": 4900 + }, + { + "epoch": 0.27444282674431625, + "grad_norm": 1.1873775720596313, + "learning_rate": 2.449e-05, + "loss": 0.4251, + "step": 4901 + }, + { + "epoch": 0.27449882405644527, + "grad_norm": 1.4521654844284058, + "learning_rate": 2.4495000000000002e-05, + "loss": 0.5958, + "step": 4902 + }, + { + "epoch": 0.2745548213685743, + "grad_norm": 1.1759424209594727, + "learning_rate": 2.45e-05, + "loss": 0.3257, + "step": 4903 + }, + { + "epoch": 0.2746108186807033, + "grad_norm": 1.4622273445129395, + "learning_rate": 2.4505e-05, + "loss": 0.4231, + "step": 4904 + }, + { + "epoch": 0.2746668159928323, + "grad_norm": 1.1844336986541748, + "learning_rate": 2.451e-05, + "loss": 0.465, + "step": 4905 + }, + { + "epoch": 0.27472281330496134, + "grad_norm": 1.2939742803573608, + "learning_rate": 2.4515e-05, + "loss": 0.4654, + "step": 4906 + }, + { + "epoch": 0.27477881061709036, + "grad_norm": 1.2372941970825195, + "learning_rate": 2.4520000000000002e-05, + "loss": 0.4132, + "step": 4907 + }, + { + "epoch": 0.2748348079292194, + "grad_norm": 1.1188563108444214, + "learning_rate": 2.4525e-05, + "loss": 0.37, + "step": 4908 + }, + { + "epoch": 0.2748908052413484, + "grad_norm": 1.455541729927063, + "learning_rate": 2.453e-05, + "loss": 0.4644, + "step": 4909 + }, + { + "epoch": 0.2749468025534774, + "grad_norm": 1.6214908361434937, + "learning_rate": 2.4535e-05, + "loss": 0.4694, + "step": 4910 + }, + { + "epoch": 0.27500279986560644, + "grad_norm": 1.17264986038208, + "learning_rate": 2.4540000000000002e-05, + "loss": 0.3957, + "step": 4911 + }, + { + "epoch": 0.27505879717773546, + "grad_norm": 1.0448565483093262, + "learning_rate": 2.4545000000000003e-05, + "loss": 0.3585, + "step": 4912 + }, + { + "epoch": 0.2751147944898645, + "grad_norm": 
1.3260798454284668, + "learning_rate": 2.455e-05, + "loss": 0.4594, + "step": 4913 + }, + { + "epoch": 0.2751707918019935, + "grad_norm": 1.7412939071655273, + "learning_rate": 2.4555e-05, + "loss": 0.4749, + "step": 4914 + }, + { + "epoch": 0.2752267891141225, + "grad_norm": 1.4049450159072876, + "learning_rate": 2.4560000000000002e-05, + "loss": 0.4712, + "step": 4915 + }, + { + "epoch": 0.27528278642625154, + "grad_norm": 1.2056244611740112, + "learning_rate": 2.4565000000000003e-05, + "loss": 0.4464, + "step": 4916 + }, + { + "epoch": 0.27533878373838055, + "grad_norm": 1.6052457094192505, + "learning_rate": 2.457e-05, + "loss": 0.4002, + "step": 4917 + }, + { + "epoch": 0.2753947810505096, + "grad_norm": 1.1523957252502441, + "learning_rate": 2.4575e-05, + "loss": 0.425, + "step": 4918 + }, + { + "epoch": 0.2754507783626386, + "grad_norm": 1.3647980690002441, + "learning_rate": 2.4580000000000002e-05, + "loss": 0.536, + "step": 4919 + }, + { + "epoch": 0.2755067756747676, + "grad_norm": 1.340335726737976, + "learning_rate": 2.4585000000000003e-05, + "loss": 0.4708, + "step": 4920 + }, + { + "epoch": 0.27556277298689663, + "grad_norm": 1.038291096687317, + "learning_rate": 2.4590000000000003e-05, + "loss": 0.3694, + "step": 4921 + }, + { + "epoch": 0.27561877029902565, + "grad_norm": 1.2275313138961792, + "learning_rate": 2.4595e-05, + "loss": 0.5274, + "step": 4922 + }, + { + "epoch": 0.27567476761115467, + "grad_norm": 1.0066707134246826, + "learning_rate": 2.46e-05, + "loss": 0.332, + "step": 4923 + }, + { + "epoch": 0.2757307649232837, + "grad_norm": 1.0508103370666504, + "learning_rate": 2.4605e-05, + "loss": 0.4163, + "step": 4924 + }, + { + "epoch": 0.2757867622354127, + "grad_norm": 1.1964588165283203, + "learning_rate": 2.4610000000000003e-05, + "loss": 0.419, + "step": 4925 + }, + { + "epoch": 0.2758427595475417, + "grad_norm": 1.2009811401367188, + "learning_rate": 2.4615e-05, + "loss": 0.4048, + "step": 4926 + }, + { + "epoch": 0.27589875685967075, 
+ "grad_norm": 1.2661802768707275, + "learning_rate": 2.462e-05, + "loss": 0.3793, + "step": 4927 + }, + { + "epoch": 0.27595475417179977, + "grad_norm": 1.1443170309066772, + "learning_rate": 2.4625000000000002e-05, + "loss": 0.375, + "step": 4928 + }, + { + "epoch": 0.2760107514839288, + "grad_norm": 1.1995105743408203, + "learning_rate": 2.463e-05, + "loss": 0.3884, + "step": 4929 + }, + { + "epoch": 0.2760667487960578, + "grad_norm": 1.284255862236023, + "learning_rate": 2.4635000000000004e-05, + "loss": 0.3677, + "step": 4930 + }, + { + "epoch": 0.2761227461081868, + "grad_norm": 1.3781988620758057, + "learning_rate": 2.464e-05, + "loss": 0.459, + "step": 4931 + }, + { + "epoch": 0.27617874342031584, + "grad_norm": 1.1450660228729248, + "learning_rate": 2.4645000000000002e-05, + "loss": 0.3256, + "step": 4932 + }, + { + "epoch": 0.27623474073244486, + "grad_norm": 1.0806472301483154, + "learning_rate": 2.465e-05, + "loss": 0.4452, + "step": 4933 + }, + { + "epoch": 0.2762907380445739, + "grad_norm": 1.2479525804519653, + "learning_rate": 2.4655e-05, + "loss": 0.3918, + "step": 4934 + }, + { + "epoch": 0.2763467353567029, + "grad_norm": 1.154831051826477, + "learning_rate": 2.466e-05, + "loss": 0.3139, + "step": 4935 + }, + { + "epoch": 0.2764027326688319, + "grad_norm": 1.1486061811447144, + "learning_rate": 2.4665000000000002e-05, + "loss": 0.3987, + "step": 4936 + }, + { + "epoch": 0.27645872998096094, + "grad_norm": 1.382432222366333, + "learning_rate": 2.4670000000000003e-05, + "loss": 0.3491, + "step": 4937 + }, + { + "epoch": 0.27651472729308996, + "grad_norm": 1.035082221031189, + "learning_rate": 2.4675e-05, + "loss": 0.3382, + "step": 4938 + }, + { + "epoch": 0.276570724605219, + "grad_norm": 1.1332824230194092, + "learning_rate": 2.468e-05, + "loss": 0.3603, + "step": 4939 + }, + { + "epoch": 0.27662672191734794, + "grad_norm": 1.4208630323410034, + "learning_rate": 2.4685e-05, + "loss": 0.4386, + "step": 4940 + }, + { + "epoch": 0.27668271922947696, 
+ "grad_norm": 1.26140296459198, + "learning_rate": 2.4690000000000002e-05, + "loss": 0.4126, + "step": 4941 + }, + { + "epoch": 0.276738716541606, + "grad_norm": 1.047723412513733, + "learning_rate": 2.4695e-05, + "loss": 0.3948, + "step": 4942 + }, + { + "epoch": 0.276794713853735, + "grad_norm": 1.1719717979431152, + "learning_rate": 2.47e-05, + "loss": 0.5086, + "step": 4943 + }, + { + "epoch": 0.276850711165864, + "grad_norm": 1.5609759092330933, + "learning_rate": 2.4705e-05, + "loss": 0.6026, + "step": 4944 + }, + { + "epoch": 0.27690670847799304, + "grad_norm": 1.3194032907485962, + "learning_rate": 2.471e-05, + "loss": 0.3067, + "step": 4945 + }, + { + "epoch": 0.27696270579012205, + "grad_norm": 1.4805982112884521, + "learning_rate": 2.4715000000000003e-05, + "loss": 0.5675, + "step": 4946 + }, + { + "epoch": 0.2770187031022511, + "grad_norm": 1.563145399093628, + "learning_rate": 2.472e-05, + "loss": 0.3775, + "step": 4947 + }, + { + "epoch": 0.2770747004143801, + "grad_norm": 1.1520642042160034, + "learning_rate": 2.4725e-05, + "loss": 0.4203, + "step": 4948 + }, + { + "epoch": 0.2771306977265091, + "grad_norm": 1.2293246984481812, + "learning_rate": 2.473e-05, + "loss": 0.4264, + "step": 4949 + }, + { + "epoch": 0.27718669503863813, + "grad_norm": 1.1341722011566162, + "learning_rate": 2.4735e-05, + "loss": 0.3552, + "step": 4950 + }, + { + "epoch": 0.27724269235076715, + "grad_norm": 1.4974896907806396, + "learning_rate": 2.4740000000000004e-05, + "loss": 0.4868, + "step": 4951 + }, + { + "epoch": 0.27729868966289617, + "grad_norm": 1.3844859600067139, + "learning_rate": 2.4745e-05, + "loss": 0.4703, + "step": 4952 + }, + { + "epoch": 0.2773546869750252, + "grad_norm": 1.133491039276123, + "learning_rate": 2.4750000000000002e-05, + "loss": 0.3953, + "step": 4953 + }, + { + "epoch": 0.2774106842871542, + "grad_norm": 1.1602778434753418, + "learning_rate": 2.4755e-05, + "loss": 0.4219, + "step": 4954 + }, + { + "epoch": 0.2774666815992832, + 
"grad_norm": 1.3232240676879883, + "learning_rate": 2.476e-05, + "loss": 0.3785, + "step": 4955 + }, + { + "epoch": 0.27752267891141225, + "grad_norm": 1.514744520187378, + "learning_rate": 2.4765e-05, + "loss": 0.4758, + "step": 4956 + }, + { + "epoch": 0.27757867622354127, + "grad_norm": 1.1780812740325928, + "learning_rate": 2.4770000000000002e-05, + "loss": 0.3897, + "step": 4957 + }, + { + "epoch": 0.2776346735356703, + "grad_norm": 1.2148231267929077, + "learning_rate": 2.4775000000000003e-05, + "loss": 0.3999, + "step": 4958 + }, + { + "epoch": 0.2776906708477993, + "grad_norm": 1.3397095203399658, + "learning_rate": 2.478e-05, + "loss": 0.3621, + "step": 4959 + }, + { + "epoch": 0.2777466681599283, + "grad_norm": 1.153542160987854, + "learning_rate": 2.4785e-05, + "loss": 0.3358, + "step": 4960 + }, + { + "epoch": 0.27780266547205734, + "grad_norm": 1.0039702653884888, + "learning_rate": 2.479e-05, + "loss": 0.3771, + "step": 4961 + }, + { + "epoch": 0.27785866278418636, + "grad_norm": 1.048505425453186, + "learning_rate": 2.4795000000000002e-05, + "loss": 0.4062, + "step": 4962 + }, + { + "epoch": 0.2779146600963154, + "grad_norm": 1.4633502960205078, + "learning_rate": 2.48e-05, + "loss": 0.4179, + "step": 4963 + }, + { + "epoch": 0.2779706574084444, + "grad_norm": 1.3561614751815796, + "learning_rate": 2.4805e-05, + "loss": 0.395, + "step": 4964 + }, + { + "epoch": 0.2780266547205734, + "grad_norm": 1.1100013256072998, + "learning_rate": 2.481e-05, + "loss": 0.4721, + "step": 4965 + }, + { + "epoch": 0.27808265203270244, + "grad_norm": 1.2962788343429565, + "learning_rate": 2.4815000000000002e-05, + "loss": 0.5376, + "step": 4966 + }, + { + "epoch": 0.27813864934483146, + "grad_norm": 1.1644928455352783, + "learning_rate": 2.4820000000000003e-05, + "loss": 0.4759, + "step": 4967 + }, + { + "epoch": 0.2781946466569605, + "grad_norm": 1.102250337600708, + "learning_rate": 2.4825e-05, + "loss": 0.358, + "step": 4968 + }, + { + "epoch": 0.2782506439690895, + 
"grad_norm": 0.9479377269744873, + "learning_rate": 2.483e-05, + "loss": 0.3354, + "step": 4969 + }, + { + "epoch": 0.2783066412812185, + "grad_norm": 1.2795408964157104, + "learning_rate": 2.4835e-05, + "loss": 0.3787, + "step": 4970 + }, + { + "epoch": 0.27836263859334753, + "grad_norm": 1.070845365524292, + "learning_rate": 2.4840000000000003e-05, + "loss": 0.4005, + "step": 4971 + }, + { + "epoch": 0.27841863590547655, + "grad_norm": 1.3072528839111328, + "learning_rate": 2.4845e-05, + "loss": 0.5137, + "step": 4972 + }, + { + "epoch": 0.27847463321760557, + "grad_norm": 1.3732484579086304, + "learning_rate": 2.485e-05, + "loss": 0.5127, + "step": 4973 + }, + { + "epoch": 0.2785306305297346, + "grad_norm": 1.2259336709976196, + "learning_rate": 2.4855000000000002e-05, + "loss": 0.4656, + "step": 4974 + }, + { + "epoch": 0.2785866278418636, + "grad_norm": 1.290574073791504, + "learning_rate": 2.486e-05, + "loss": 0.6261, + "step": 4975 + }, + { + "epoch": 0.27864262515399263, + "grad_norm": 1.3530707359313965, + "learning_rate": 2.4865000000000003e-05, + "loss": 0.649, + "step": 4976 + }, + { + "epoch": 0.27869862246612165, + "grad_norm": 1.1110451221466064, + "learning_rate": 2.487e-05, + "loss": 0.4133, + "step": 4977 + }, + { + "epoch": 0.27875461977825067, + "grad_norm": 1.2195521593093872, + "learning_rate": 2.4875e-05, + "loss": 0.4227, + "step": 4978 + }, + { + "epoch": 0.2788106170903797, + "grad_norm": 1.1598562002182007, + "learning_rate": 2.488e-05, + "loss": 0.4384, + "step": 4979 + }, + { + "epoch": 0.2788666144025087, + "grad_norm": 1.4165732860565186, + "learning_rate": 2.4885e-05, + "loss": 0.4252, + "step": 4980 + }, + { + "epoch": 0.2789226117146377, + "grad_norm": 1.760197639465332, + "learning_rate": 2.489e-05, + "loss": 0.3759, + "step": 4981 + }, + { + "epoch": 0.2789786090267667, + "grad_norm": 1.1311376094818115, + "learning_rate": 2.4895e-05, + "loss": 0.37, + "step": 4982 + }, + { + "epoch": 0.2790346063388957, + "grad_norm": 
1.2808183431625366, + "learning_rate": 2.4900000000000002e-05, + "loss": 0.4915, + "step": 4983 + }, + { + "epoch": 0.2790906036510247, + "grad_norm": 1.4143792390823364, + "learning_rate": 2.4905e-05, + "loss": 0.5387, + "step": 4984 + }, + { + "epoch": 0.27914660096315375, + "grad_norm": 1.1529659032821655, + "learning_rate": 2.491e-05, + "loss": 0.4424, + "step": 4985 + }, + { + "epoch": 0.27920259827528277, + "grad_norm": 1.3599112033843994, + "learning_rate": 2.4915e-05, + "loss": 0.3906, + "step": 4986 + }, + { + "epoch": 0.2792585955874118, + "grad_norm": 1.1522306203842163, + "learning_rate": 2.4920000000000002e-05, + "loss": 0.448, + "step": 4987 + }, + { + "epoch": 0.2793145928995408, + "grad_norm": 1.0313642024993896, + "learning_rate": 2.4925000000000003e-05, + "loss": 0.4001, + "step": 4988 + }, + { + "epoch": 0.2793705902116698, + "grad_norm": 1.1960699558258057, + "learning_rate": 2.493e-05, + "loss": 0.3986, + "step": 4989 + }, + { + "epoch": 0.27942658752379884, + "grad_norm": 1.370259404182434, + "learning_rate": 2.4935e-05, + "loss": 0.4686, + "step": 4990 + }, + { + "epoch": 0.27948258483592786, + "grad_norm": 1.4210662841796875, + "learning_rate": 2.4940000000000002e-05, + "loss": 0.4857, + "step": 4991 + }, + { + "epoch": 0.2795385821480569, + "grad_norm": 1.4025802612304688, + "learning_rate": 2.4945000000000003e-05, + "loss": 0.4234, + "step": 4992 + }, + { + "epoch": 0.2795945794601859, + "grad_norm": 2.0384538173675537, + "learning_rate": 2.495e-05, + "loss": 0.4057, + "step": 4993 + }, + { + "epoch": 0.2796505767723149, + "grad_norm": 1.112663745880127, + "learning_rate": 2.4955e-05, + "loss": 0.3578, + "step": 4994 + }, + { + "epoch": 0.27970657408444394, + "grad_norm": 1.6824556589126587, + "learning_rate": 2.496e-05, + "loss": 0.5093, + "step": 4995 + }, + { + "epoch": 0.27976257139657296, + "grad_norm": 1.216307282447815, + "learning_rate": 2.4965000000000002e-05, + "loss": 0.3959, + "step": 4996 + }, + { + "epoch": 0.279818568708702, 
+ "grad_norm": 1.1118022203445435, + "learning_rate": 2.4970000000000003e-05, + "loss": 0.4064, + "step": 4997 + }, + { + "epoch": 0.279874566020831, + "grad_norm": 1.4133844375610352, + "learning_rate": 2.4975e-05, + "loss": 0.4684, + "step": 4998 + }, + { + "epoch": 0.27993056333296, + "grad_norm": 1.2977321147918701, + "learning_rate": 2.498e-05, + "loss": 0.4066, + "step": 4999 + }, + { + "epoch": 0.27998656064508903, + "grad_norm": 1.1011120080947876, + "learning_rate": 2.4985e-05, + "loss": 0.3026, + "step": 5000 + }, + { + "epoch": 0.28004255795721805, + "grad_norm": 1.4148776531219482, + "learning_rate": 2.4990000000000003e-05, + "loss": 0.5396, + "step": 5001 + }, + { + "epoch": 0.28009855526934707, + "grad_norm": 1.0133522748947144, + "learning_rate": 2.4995e-05, + "loss": 0.3985, + "step": 5002 + }, + { + "epoch": 0.2801545525814761, + "grad_norm": 1.296737790107727, + "learning_rate": 2.5e-05, + "loss": 0.4786, + "step": 5003 + }, + { + "epoch": 0.2802105498936051, + "grad_norm": 1.4818477630615234, + "learning_rate": 2.5005000000000002e-05, + "loss": 0.5541, + "step": 5004 + }, + { + "epoch": 0.28026654720573413, + "grad_norm": 1.2512661218643188, + "learning_rate": 2.501e-05, + "loss": 0.4498, + "step": 5005 + }, + { + "epoch": 0.28032254451786315, + "grad_norm": 1.0812656879425049, + "learning_rate": 2.5015e-05, + "loss": 0.3263, + "step": 5006 + }, + { + "epoch": 0.28037854182999217, + "grad_norm": 1.1355310678482056, + "learning_rate": 2.5019999999999998e-05, + "loss": 0.4299, + "step": 5007 + }, + { + "epoch": 0.2804345391421212, + "grad_norm": 1.1510392427444458, + "learning_rate": 2.5025e-05, + "loss": 0.4134, + "step": 5008 + }, + { + "epoch": 0.2804905364542502, + "grad_norm": 1.1566981077194214, + "learning_rate": 2.5030000000000003e-05, + "loss": 0.3365, + "step": 5009 + }, + { + "epoch": 0.2805465337663792, + "grad_norm": 1.1639187335968018, + "learning_rate": 2.5035000000000003e-05, + "loss": 0.4559, + "step": 5010 + }, + { + "epoch": 
0.28060253107850824, + "grad_norm": 1.1393630504608154, + "learning_rate": 2.504e-05, + "loss": 0.3888, + "step": 5011 + }, + { + "epoch": 0.28065852839063726, + "grad_norm": 1.2431306838989258, + "learning_rate": 2.5045e-05, + "loss": 0.5282, + "step": 5012 + }, + { + "epoch": 0.2807145257027663, + "grad_norm": 1.3750158548355103, + "learning_rate": 2.5050000000000002e-05, + "loss": 0.6099, + "step": 5013 + }, + { + "epoch": 0.2807705230148953, + "grad_norm": 1.3600974082946777, + "learning_rate": 2.5055e-05, + "loss": 0.3914, + "step": 5014 + }, + { + "epoch": 0.2808265203270243, + "grad_norm": 1.1310267448425293, + "learning_rate": 2.506e-05, + "loss": 0.3774, + "step": 5015 + }, + { + "epoch": 0.28088251763915334, + "grad_norm": 1.2618962526321411, + "learning_rate": 2.5064999999999998e-05, + "loss": 0.4592, + "step": 5016 + }, + { + "epoch": 0.28093851495128236, + "grad_norm": 1.211580514907837, + "learning_rate": 2.507e-05, + "loss": 0.4236, + "step": 5017 + }, + { + "epoch": 0.2809945122634114, + "grad_norm": 1.16752028465271, + "learning_rate": 2.5075e-05, + "loss": 0.3997, + "step": 5018 + }, + { + "epoch": 0.2810505095755404, + "grad_norm": 1.2811846733093262, + "learning_rate": 2.5080000000000004e-05, + "loss": 0.4685, + "step": 5019 + }, + { + "epoch": 0.2811065068876694, + "grad_norm": 1.071851372718811, + "learning_rate": 2.5085000000000005e-05, + "loss": 0.3922, + "step": 5020 + }, + { + "epoch": 0.28116250419979844, + "grad_norm": 1.2484806776046753, + "learning_rate": 2.5090000000000002e-05, + "loss": 0.4412, + "step": 5021 + }, + { + "epoch": 0.28121850151192745, + "grad_norm": 1.1056876182556152, + "learning_rate": 2.5095000000000003e-05, + "loss": 0.4058, + "step": 5022 + }, + { + "epoch": 0.2812744988240564, + "grad_norm": 1.1852781772613525, + "learning_rate": 2.51e-05, + "loss": 0.4035, + "step": 5023 + }, + { + "epoch": 0.28133049613618544, + "grad_norm": 1.5772156715393066, + "learning_rate": 2.5105e-05, + "loss": 0.4305, + "step": 5024 + 
}, + { + "epoch": 0.28138649344831446, + "grad_norm": 1.1575450897216797, + "learning_rate": 2.5110000000000002e-05, + "loss": 0.4063, + "step": 5025 + }, + { + "epoch": 0.2814424907604435, + "grad_norm": 1.2748372554779053, + "learning_rate": 2.5115e-05, + "loss": 0.4925, + "step": 5026 + }, + { + "epoch": 0.2814984880725725, + "grad_norm": 1.3930341005325317, + "learning_rate": 2.512e-05, + "loss": 0.441, + "step": 5027 + }, + { + "epoch": 0.2815544853847015, + "grad_norm": 1.5599757432937622, + "learning_rate": 2.5124999999999997e-05, + "loss": 0.4644, + "step": 5028 + }, + { + "epoch": 0.28161048269683053, + "grad_norm": 1.35392427444458, + "learning_rate": 2.5130000000000005e-05, + "loss": 0.5732, + "step": 5029 + }, + { + "epoch": 0.28166648000895955, + "grad_norm": 2.7221648693084717, + "learning_rate": 2.5135000000000002e-05, + "loss": 0.3834, + "step": 5030 + }, + { + "epoch": 0.28172247732108857, + "grad_norm": 1.168181300163269, + "learning_rate": 2.5140000000000003e-05, + "loss": 0.4182, + "step": 5031 + }, + { + "epoch": 0.2817784746332176, + "grad_norm": 1.1578096151351929, + "learning_rate": 2.5145e-05, + "loss": 0.3981, + "step": 5032 + }, + { + "epoch": 0.2818344719453466, + "grad_norm": 1.066641926765442, + "learning_rate": 2.515e-05, + "loss": 0.4481, + "step": 5033 + }, + { + "epoch": 0.28189046925747563, + "grad_norm": 1.20148766040802, + "learning_rate": 2.5155000000000002e-05, + "loss": 0.4609, + "step": 5034 + }, + { + "epoch": 0.28194646656960465, + "grad_norm": 1.1593502759933472, + "learning_rate": 2.516e-05, + "loss": 0.5019, + "step": 5035 + }, + { + "epoch": 0.28200246388173367, + "grad_norm": 1.2151565551757812, + "learning_rate": 2.5165e-05, + "loss": 0.4255, + "step": 5036 + }, + { + "epoch": 0.2820584611938627, + "grad_norm": 1.204663872718811, + "learning_rate": 2.5169999999999998e-05, + "loss": 0.4624, + "step": 5037 + }, + { + "epoch": 0.2821144585059917, + "grad_norm": 1.0738803148269653, + "learning_rate": 2.5175e-05, + 
"loss": 0.4004, + "step": 5038 + }, + { + "epoch": 0.2821704558181207, + "grad_norm": 1.1140012741088867, + "learning_rate": 2.5180000000000003e-05, + "loss": 0.3801, + "step": 5039 + }, + { + "epoch": 0.28222645313024974, + "grad_norm": 1.0218623876571655, + "learning_rate": 2.5185000000000004e-05, + "loss": 0.3478, + "step": 5040 + }, + { + "epoch": 0.28228245044237876, + "grad_norm": 1.285437822341919, + "learning_rate": 2.519e-05, + "loss": 0.4935, + "step": 5041 + }, + { + "epoch": 0.2823384477545078, + "grad_norm": 1.3076173067092896, + "learning_rate": 2.5195000000000002e-05, + "loss": 0.3867, + "step": 5042 + }, + { + "epoch": 0.2823944450666368, + "grad_norm": 1.273854374885559, + "learning_rate": 2.5200000000000003e-05, + "loss": 0.4336, + "step": 5043 + }, + { + "epoch": 0.2824504423787658, + "grad_norm": 1.2694003582000732, + "learning_rate": 2.5205e-05, + "loss": 0.4235, + "step": 5044 + }, + { + "epoch": 0.28250643969089484, + "grad_norm": 1.1363548040390015, + "learning_rate": 2.521e-05, + "loss": 0.3504, + "step": 5045 + }, + { + "epoch": 0.28256243700302386, + "grad_norm": 1.2498666048049927, + "learning_rate": 2.5214999999999998e-05, + "loss": 0.4245, + "step": 5046 + }, + { + "epoch": 0.2826184343151529, + "grad_norm": 1.1381645202636719, + "learning_rate": 2.522e-05, + "loss": 0.3956, + "step": 5047 + }, + { + "epoch": 0.2826744316272819, + "grad_norm": 1.3356531858444214, + "learning_rate": 2.5225e-05, + "loss": 0.4546, + "step": 5048 + }, + { + "epoch": 0.2827304289394109, + "grad_norm": 1.0721495151519775, + "learning_rate": 2.5230000000000004e-05, + "loss": 0.5557, + "step": 5049 + }, + { + "epoch": 0.28278642625153994, + "grad_norm": 1.2902302742004395, + "learning_rate": 2.5235e-05, + "loss": 0.5123, + "step": 5050 + }, + { + "epoch": 0.28284242356366895, + "grad_norm": 1.2670661211013794, + "learning_rate": 2.5240000000000002e-05, + "loss": 0.4056, + "step": 5051 + }, + { + "epoch": 0.282898420875798, + "grad_norm": 1.1348469257354736, + 
"learning_rate": 2.5245000000000003e-05, + "loss": 0.4311, + "step": 5052 + }, + { + "epoch": 0.282954418187927, + "grad_norm": 1.2487995624542236, + "learning_rate": 2.525e-05, + "loss": 0.4668, + "step": 5053 + }, + { + "epoch": 0.283010415500056, + "grad_norm": 1.5302445888519287, + "learning_rate": 2.5255e-05, + "loss": 0.4978, + "step": 5054 + }, + { + "epoch": 0.28306641281218503, + "grad_norm": 1.219663143157959, + "learning_rate": 2.526e-05, + "loss": 0.3641, + "step": 5055 + }, + { + "epoch": 0.28312241012431405, + "grad_norm": 1.0319263935089111, + "learning_rate": 2.5265e-05, + "loss": 0.3338, + "step": 5056 + }, + { + "epoch": 0.28317840743644307, + "grad_norm": 1.1401755809783936, + "learning_rate": 2.527e-05, + "loss": 0.3796, + "step": 5057 + }, + { + "epoch": 0.2832344047485721, + "grad_norm": 1.1127512454986572, + "learning_rate": 2.5274999999999998e-05, + "loss": 0.5296, + "step": 5058 + }, + { + "epoch": 0.2832904020607011, + "grad_norm": 1.3074079751968384, + "learning_rate": 2.5280000000000005e-05, + "loss": 0.3188, + "step": 5059 + }, + { + "epoch": 0.2833463993728301, + "grad_norm": 1.3833627700805664, + "learning_rate": 2.5285000000000003e-05, + "loss": 0.5894, + "step": 5060 + }, + { + "epoch": 0.28340239668495915, + "grad_norm": 1.3308225870132446, + "learning_rate": 2.5290000000000004e-05, + "loss": 0.4272, + "step": 5061 + }, + { + "epoch": 0.28345839399708816, + "grad_norm": 1.2617563009262085, + "learning_rate": 2.5295e-05, + "loss": 0.3797, + "step": 5062 + }, + { + "epoch": 0.2835143913092172, + "grad_norm": 1.2066013813018799, + "learning_rate": 2.5300000000000002e-05, + "loss": 0.4103, + "step": 5063 + }, + { + "epoch": 0.28357038862134615, + "grad_norm": 1.3341902494430542, + "learning_rate": 2.5305000000000003e-05, + "loss": 0.3874, + "step": 5064 + }, + { + "epoch": 0.28362638593347517, + "grad_norm": 1.2941293716430664, + "learning_rate": 2.531e-05, + "loss": 0.3671, + "step": 5065 + }, + { + "epoch": 0.2836823832456042, + 
"grad_norm": 0.9018704891204834, + "learning_rate": 2.5315e-05, + "loss": 0.3604, + "step": 5066 + }, + { + "epoch": 0.2837383805577332, + "grad_norm": 4.461050510406494, + "learning_rate": 2.5319999999999998e-05, + "loss": 0.4973, + "step": 5067 + }, + { + "epoch": 0.2837943778698622, + "grad_norm": 1.8063386678695679, + "learning_rate": 2.5325e-05, + "loss": 0.4332, + "step": 5068 + }, + { + "epoch": 0.28385037518199124, + "grad_norm": 1.1380176544189453, + "learning_rate": 2.5330000000000003e-05, + "loss": 0.3468, + "step": 5069 + }, + { + "epoch": 0.28390637249412026, + "grad_norm": 1.3340381383895874, + "learning_rate": 2.5335000000000004e-05, + "loss": 0.4126, + "step": 5070 + }, + { + "epoch": 0.2839623698062493, + "grad_norm": 1.4871255159378052, + "learning_rate": 2.534e-05, + "loss": 0.5937, + "step": 5071 + }, + { + "epoch": 0.2840183671183783, + "grad_norm": 1.2137213945388794, + "learning_rate": 2.5345000000000002e-05, + "loss": 0.3692, + "step": 5072 + }, + { + "epoch": 0.2840743644305073, + "grad_norm": 1.7437152862548828, + "learning_rate": 2.5350000000000003e-05, + "loss": 0.4339, + "step": 5073 + }, + { + "epoch": 0.28413036174263634, + "grad_norm": 1.2067270278930664, + "learning_rate": 2.5355e-05, + "loss": 0.4506, + "step": 5074 + }, + { + "epoch": 0.28418635905476536, + "grad_norm": 1.5555673837661743, + "learning_rate": 2.536e-05, + "loss": 0.4121, + "step": 5075 + }, + { + "epoch": 0.2842423563668944, + "grad_norm": 1.3813008069992065, + "learning_rate": 2.5365e-05, + "loss": 0.4734, + "step": 5076 + }, + { + "epoch": 0.2842983536790234, + "grad_norm": 1.4264832735061646, + "learning_rate": 2.537e-05, + "loss": 0.4616, + "step": 5077 + }, + { + "epoch": 0.2843543509911524, + "grad_norm": 1.092544674873352, + "learning_rate": 2.5375e-05, + "loss": 0.3485, + "step": 5078 + }, + { + "epoch": 0.28441034830328144, + "grad_norm": 1.1917117834091187, + "learning_rate": 2.5380000000000004e-05, + "loss": 0.5205, + "step": 5079 + }, + { + "epoch": 
0.28446634561541045, + "grad_norm": 1.355815052986145, + "learning_rate": 2.5385000000000002e-05, + "loss": 0.4565, + "step": 5080 + }, + { + "epoch": 0.2845223429275395, + "grad_norm": 1.0870096683502197, + "learning_rate": 2.5390000000000003e-05, + "loss": 0.4144, + "step": 5081 + }, + { + "epoch": 0.2845783402396685, + "grad_norm": 1.094519853591919, + "learning_rate": 2.5395000000000003e-05, + "loss": 0.4035, + "step": 5082 + }, + { + "epoch": 0.2846343375517975, + "grad_norm": 1.3015124797821045, + "learning_rate": 2.54e-05, + "loss": 0.4412, + "step": 5083 + }, + { + "epoch": 0.28469033486392653, + "grad_norm": 1.3095399141311646, + "learning_rate": 2.5405e-05, + "loss": 0.3885, + "step": 5084 + }, + { + "epoch": 0.28474633217605555, + "grad_norm": 1.2276149988174438, + "learning_rate": 2.541e-05, + "loss": 0.4867, + "step": 5085 + }, + { + "epoch": 0.28480232948818457, + "grad_norm": 1.4077141284942627, + "learning_rate": 2.5415e-05, + "loss": 0.4127, + "step": 5086 + }, + { + "epoch": 0.2848583268003136, + "grad_norm": 7.861964702606201, + "learning_rate": 2.542e-05, + "loss": 0.372, + "step": 5087 + }, + { + "epoch": 0.2849143241124426, + "grad_norm": 1.4856375455856323, + "learning_rate": 2.5424999999999998e-05, + "loss": 0.5014, + "step": 5088 + }, + { + "epoch": 0.2849703214245716, + "grad_norm": 1.215444564819336, + "learning_rate": 2.5430000000000002e-05, + "loss": 0.4289, + "step": 5089 + }, + { + "epoch": 0.28502631873670065, + "grad_norm": 1.1757819652557373, + "learning_rate": 2.5435000000000003e-05, + "loss": 0.4394, + "step": 5090 + }, + { + "epoch": 0.28508231604882966, + "grad_norm": 1.2638258934020996, + "learning_rate": 2.5440000000000004e-05, + "loss": 0.4226, + "step": 5091 + }, + { + "epoch": 0.2851383133609587, + "grad_norm": 1.574288010597229, + "learning_rate": 2.5445e-05, + "loss": 0.3677, + "step": 5092 + }, + { + "epoch": 0.2851943106730877, + "grad_norm": 1.1244778633117676, + "learning_rate": 2.5450000000000002e-05, + "loss": 
0.4044, + "step": 5093 + }, + { + "epoch": 0.2852503079852167, + "grad_norm": 1.2923802137374878, + "learning_rate": 2.5455e-05, + "loss": 0.4191, + "step": 5094 + }, + { + "epoch": 0.28530630529734574, + "grad_norm": 1.1234909296035767, + "learning_rate": 2.546e-05, + "loss": 0.5451, + "step": 5095 + }, + { + "epoch": 0.28536230260947476, + "grad_norm": 0.9855756759643555, + "learning_rate": 2.5465e-05, + "loss": 0.4568, + "step": 5096 + }, + { + "epoch": 0.2854182999216038, + "grad_norm": 1.175218105316162, + "learning_rate": 2.547e-05, + "loss": 0.3712, + "step": 5097 + }, + { + "epoch": 0.2854742972337328, + "grad_norm": 0.9445306658744812, + "learning_rate": 2.5475e-05, + "loss": 0.3229, + "step": 5098 + }, + { + "epoch": 0.2855302945458618, + "grad_norm": 1.227358102798462, + "learning_rate": 2.5480000000000003e-05, + "loss": 0.4326, + "step": 5099 + }, + { + "epoch": 0.28558629185799084, + "grad_norm": 1.0580247640609741, + "learning_rate": 2.5485000000000004e-05, + "loss": 0.3549, + "step": 5100 + }, + { + "epoch": 0.28564228917011986, + "grad_norm": 1.212801218032837, + "learning_rate": 2.549e-05, + "loss": 0.4072, + "step": 5101 + }, + { + "epoch": 0.2856982864822489, + "grad_norm": 1.2784329652786255, + "learning_rate": 2.5495000000000002e-05, + "loss": 0.4814, + "step": 5102 + }, + { + "epoch": 0.2857542837943779, + "grad_norm": 1.2336822748184204, + "learning_rate": 2.5500000000000003e-05, + "loss": 0.4123, + "step": 5103 + }, + { + "epoch": 0.2858102811065069, + "grad_norm": 1.3024848699569702, + "learning_rate": 2.5505e-05, + "loss": 0.3914, + "step": 5104 + }, + { + "epoch": 0.28586627841863593, + "grad_norm": 1.5754663944244385, + "learning_rate": 2.551e-05, + "loss": 0.5698, + "step": 5105 + }, + { + "epoch": 0.2859222757307649, + "grad_norm": 1.086853265762329, + "learning_rate": 2.5515e-05, + "loss": 0.4066, + "step": 5106 + }, + { + "epoch": 0.2859782730428939, + "grad_norm": 1.2721822261810303, + "learning_rate": 2.552e-05, + "loss": 0.4683, + 
"step": 5107 + }, + { + "epoch": 0.28603427035502293, + "grad_norm": 1.3510558605194092, + "learning_rate": 2.5525e-05, + "loss": 0.4937, + "step": 5108 + }, + { + "epoch": 0.28609026766715195, + "grad_norm": 1.0186558961868286, + "learning_rate": 2.5530000000000005e-05, + "loss": 0.3634, + "step": 5109 + }, + { + "epoch": 0.286146264979281, + "grad_norm": 1.2649930715560913, + "learning_rate": 2.5535000000000002e-05, + "loss": 0.4996, + "step": 5110 + }, + { + "epoch": 0.28620226229141, + "grad_norm": 1.2046400308609009, + "learning_rate": 2.5540000000000003e-05, + "loss": 0.4747, + "step": 5111 + }, + { + "epoch": 0.286258259603539, + "grad_norm": 1.1831685304641724, + "learning_rate": 2.5545000000000004e-05, + "loss": 0.4545, + "step": 5112 + }, + { + "epoch": 0.28631425691566803, + "grad_norm": 1.6218205690383911, + "learning_rate": 2.555e-05, + "loss": 0.6066, + "step": 5113 + }, + { + "epoch": 0.28637025422779705, + "grad_norm": 1.156431794166565, + "learning_rate": 2.5555000000000002e-05, + "loss": 0.3627, + "step": 5114 + }, + { + "epoch": 0.28642625153992607, + "grad_norm": 1.2976090908050537, + "learning_rate": 2.556e-05, + "loss": 0.4457, + "step": 5115 + }, + { + "epoch": 0.2864822488520551, + "grad_norm": 1.2460031509399414, + "learning_rate": 2.5565e-05, + "loss": 0.6375, + "step": 5116 + }, + { + "epoch": 0.2865382461641841, + "grad_norm": 1.0107266902923584, + "learning_rate": 2.557e-05, + "loss": 0.3809, + "step": 5117 + }, + { + "epoch": 0.2865942434763131, + "grad_norm": 1.262149691581726, + "learning_rate": 2.5574999999999998e-05, + "loss": 0.3705, + "step": 5118 + }, + { + "epoch": 0.28665024078844215, + "grad_norm": 1.5213390588760376, + "learning_rate": 2.5580000000000002e-05, + "loss": 0.4707, + "step": 5119 + }, + { + "epoch": 0.28670623810057116, + "grad_norm": 1.0707709789276123, + "learning_rate": 2.5585000000000003e-05, + "loss": 0.4222, + "step": 5120 + }, + { + "epoch": 0.2867622354127002, + "grad_norm": 1.4900261163711548, + 
"learning_rate": 2.5590000000000004e-05, + "loss": 0.4853, + "step": 5121 + }, + { + "epoch": 0.2868182327248292, + "grad_norm": 1.1637797355651855, + "learning_rate": 2.5595e-05, + "loss": 0.3548, + "step": 5122 + }, + { + "epoch": 0.2868742300369582, + "grad_norm": 1.3067561388015747, + "learning_rate": 2.5600000000000002e-05, + "loss": 0.5567, + "step": 5123 + }, + { + "epoch": 0.28693022734908724, + "grad_norm": 1.1704126596450806, + "learning_rate": 2.5605e-05, + "loss": 0.4554, + "step": 5124 + }, + { + "epoch": 0.28698622466121626, + "grad_norm": 2.44575834274292, + "learning_rate": 2.561e-05, + "loss": 0.4063, + "step": 5125 + }, + { + "epoch": 0.2870422219733453, + "grad_norm": 1.180340051651001, + "learning_rate": 2.5615e-05, + "loss": 0.3816, + "step": 5126 + }, + { + "epoch": 0.2870982192854743, + "grad_norm": 1.2813947200775146, + "learning_rate": 2.562e-05, + "loss": 0.5276, + "step": 5127 + }, + { + "epoch": 0.2871542165976033, + "grad_norm": 1.2095357179641724, + "learning_rate": 2.5625e-05, + "loss": 0.3904, + "step": 5128 + }, + { + "epoch": 0.28721021390973234, + "grad_norm": 1.0724081993103027, + "learning_rate": 2.5629999999999997e-05, + "loss": 0.4676, + "step": 5129 + }, + { + "epoch": 0.28726621122186136, + "grad_norm": 1.3954498767852783, + "learning_rate": 2.5635000000000004e-05, + "loss": 0.4394, + "step": 5130 + }, + { + "epoch": 0.2873222085339904, + "grad_norm": 1.6469731330871582, + "learning_rate": 2.5640000000000002e-05, + "loss": 0.5405, + "step": 5131 + }, + { + "epoch": 0.2873782058461194, + "grad_norm": 1.2044121026992798, + "learning_rate": 2.5645000000000003e-05, + "loss": 0.4052, + "step": 5132 + }, + { + "epoch": 0.2874342031582484, + "grad_norm": 1.0413527488708496, + "learning_rate": 2.5650000000000003e-05, + "loss": 0.2997, + "step": 5133 + }, + { + "epoch": 0.28749020047037743, + "grad_norm": 1.354501485824585, + "learning_rate": 2.5655e-05, + "loss": 0.3402, + "step": 5134 + }, + { + "epoch": 0.28754619778250645, + 
"grad_norm": 1.4380000829696655, + "learning_rate": 2.566e-05, + "loss": 0.4822, + "step": 5135 + }, + { + "epoch": 0.28760219509463547, + "grad_norm": 1.2316100597381592, + "learning_rate": 2.5665e-05, + "loss": 0.5041, + "step": 5136 + }, + { + "epoch": 0.2876581924067645, + "grad_norm": 1.3962647914886475, + "learning_rate": 2.567e-05, + "loss": 0.5794, + "step": 5137 + }, + { + "epoch": 0.2877141897188935, + "grad_norm": 1.6493854522705078, + "learning_rate": 2.5675e-05, + "loss": 0.5601, + "step": 5138 + }, + { + "epoch": 0.28777018703102253, + "grad_norm": 1.103853464126587, + "learning_rate": 2.5679999999999998e-05, + "loss": 0.617, + "step": 5139 + }, + { + "epoch": 0.28782618434315155, + "grad_norm": 1.2031441926956177, + "learning_rate": 2.5685000000000002e-05, + "loss": 0.4776, + "step": 5140 + }, + { + "epoch": 0.28788218165528057, + "grad_norm": 1.3442846536636353, + "learning_rate": 2.5690000000000003e-05, + "loss": 0.4194, + "step": 5141 + }, + { + "epoch": 0.2879381789674096, + "grad_norm": 1.297261118888855, + "learning_rate": 2.5695000000000004e-05, + "loss": 0.5212, + "step": 5142 + }, + { + "epoch": 0.2879941762795386, + "grad_norm": 1.2500529289245605, + "learning_rate": 2.57e-05, + "loss": 0.4298, + "step": 5143 + }, + { + "epoch": 0.2880501735916676, + "grad_norm": 1.1209262609481812, + "learning_rate": 2.5705000000000002e-05, + "loss": 0.3535, + "step": 5144 + }, + { + "epoch": 0.28810617090379664, + "grad_norm": 1.3996305465698242, + "learning_rate": 2.571e-05, + "loss": 0.4195, + "step": 5145 + }, + { + "epoch": 0.28816216821592566, + "grad_norm": 1.3168365955352783, + "learning_rate": 2.5715e-05, + "loss": 0.4505, + "step": 5146 + }, + { + "epoch": 0.2882181655280546, + "grad_norm": 1.3115452527999878, + "learning_rate": 2.572e-05, + "loss": 0.4509, + "step": 5147 + }, + { + "epoch": 0.28827416284018365, + "grad_norm": 1.2151601314544678, + "learning_rate": 2.5725e-05, + "loss": 0.5359, + "step": 5148 + }, + { + "epoch": 
0.28833016015231266, + "grad_norm": 1.0965923070907593, + "learning_rate": 2.573e-05, + "loss": 0.4087, + "step": 5149 + }, + { + "epoch": 0.2883861574644417, + "grad_norm": 1.2941659688949585, + "learning_rate": 2.5735000000000003e-05, + "loss": 0.4825, + "step": 5150 + }, + { + "epoch": 0.2884421547765707, + "grad_norm": 1.1040208339691162, + "learning_rate": 2.5740000000000004e-05, + "loss": 0.4537, + "step": 5151 + }, + { + "epoch": 0.2884981520886997, + "grad_norm": 1.198751449584961, + "learning_rate": 2.5745e-05, + "loss": 0.4545, + "step": 5152 + }, + { + "epoch": 0.28855414940082874, + "grad_norm": 1.3784911632537842, + "learning_rate": 2.5750000000000002e-05, + "loss": 0.4048, + "step": 5153 + }, + { + "epoch": 0.28861014671295776, + "grad_norm": 1.2006009817123413, + "learning_rate": 2.5755e-05, + "loss": 0.4406, + "step": 5154 + }, + { + "epoch": 0.2886661440250868, + "grad_norm": 1.1796938180923462, + "learning_rate": 2.576e-05, + "loss": 0.4501, + "step": 5155 + }, + { + "epoch": 0.2887221413372158, + "grad_norm": 1.0965385437011719, + "learning_rate": 2.5765e-05, + "loss": 0.3665, + "step": 5156 + }, + { + "epoch": 0.2887781386493448, + "grad_norm": 1.062350869178772, + "learning_rate": 2.577e-05, + "loss": 0.3916, + "step": 5157 + }, + { + "epoch": 0.28883413596147384, + "grad_norm": 1.24132239818573, + "learning_rate": 2.5775e-05, + "loss": 0.3841, + "step": 5158 + }, + { + "epoch": 0.28889013327360286, + "grad_norm": 1.4969040155410767, + "learning_rate": 2.5779999999999997e-05, + "loss": 0.5066, + "step": 5159 + }, + { + "epoch": 0.2889461305857319, + "grad_norm": 1.4512512683868408, + "learning_rate": 2.5785000000000005e-05, + "loss": 0.4499, + "step": 5160 + }, + { + "epoch": 0.2890021278978609, + "grad_norm": 1.1753685474395752, + "learning_rate": 2.5790000000000002e-05, + "loss": 0.3539, + "step": 5161 + }, + { + "epoch": 0.2890581252099899, + "grad_norm": 1.3325785398483276, + "learning_rate": 2.5795000000000003e-05, + "loss": 0.5696, + 
"step": 5162 + }, + { + "epoch": 0.28911412252211893, + "grad_norm": 1.0597665309906006, + "learning_rate": 2.58e-05, + "loss": 0.3675, + "step": 5163 + }, + { + "epoch": 0.28917011983424795, + "grad_norm": 1.442156195640564, + "learning_rate": 2.5805e-05, + "loss": 0.4321, + "step": 5164 + }, + { + "epoch": 0.28922611714637697, + "grad_norm": 1.0994163751602173, + "learning_rate": 2.5810000000000002e-05, + "loss": 0.3789, + "step": 5165 + }, + { + "epoch": 0.289282114458506, + "grad_norm": 1.2429791688919067, + "learning_rate": 2.5815e-05, + "loss": 0.4146, + "step": 5166 + }, + { + "epoch": 0.289338111770635, + "grad_norm": 1.4033571481704712, + "learning_rate": 2.582e-05, + "loss": 0.4169, + "step": 5167 + }, + { + "epoch": 0.28939410908276403, + "grad_norm": 1.0820399522781372, + "learning_rate": 2.5824999999999998e-05, + "loss": 0.3315, + "step": 5168 + }, + { + "epoch": 0.28945010639489305, + "grad_norm": 1.3229745626449585, + "learning_rate": 2.583e-05, + "loss": 0.4174, + "step": 5169 + }, + { + "epoch": 0.28950610370702207, + "grad_norm": 1.4531428813934326, + "learning_rate": 2.5835000000000003e-05, + "loss": 0.5334, + "step": 5170 + }, + { + "epoch": 0.2895621010191511, + "grad_norm": 1.3618649244308472, + "learning_rate": 2.5840000000000003e-05, + "loss": 0.4907, + "step": 5171 + }, + { + "epoch": 0.2896180983312801, + "grad_norm": 1.25014066696167, + "learning_rate": 2.5845000000000004e-05, + "loss": 0.465, + "step": 5172 + }, + { + "epoch": 0.2896740956434091, + "grad_norm": 1.3573648929595947, + "learning_rate": 2.585e-05, + "loss": 0.4262, + "step": 5173 + }, + { + "epoch": 0.28973009295553814, + "grad_norm": 1.3631477355957031, + "learning_rate": 2.5855000000000002e-05, + "loss": 0.5082, + "step": 5174 + }, + { + "epoch": 0.28978609026766716, + "grad_norm": 1.305877447128296, + "learning_rate": 2.586e-05, + "loss": 0.497, + "step": 5175 + }, + { + "epoch": 0.2898420875797962, + "grad_norm": 1.152976632118225, + "learning_rate": 2.5865e-05, + 
"loss": 0.437, + "step": 5176 + }, + { + "epoch": 0.2898980848919252, + "grad_norm": 1.2623867988586426, + "learning_rate": 2.587e-05, + "loss": 0.4915, + "step": 5177 + }, + { + "epoch": 0.2899540822040542, + "grad_norm": 1.0398708581924438, + "learning_rate": 2.5875e-05, + "loss": 0.435, + "step": 5178 + }, + { + "epoch": 0.29001007951618324, + "grad_norm": 1.1778053045272827, + "learning_rate": 2.588e-05, + "loss": 0.524, + "step": 5179 + }, + { + "epoch": 0.29006607682831226, + "grad_norm": 1.231372356414795, + "learning_rate": 2.5885000000000004e-05, + "loss": 0.4914, + "step": 5180 + }, + { + "epoch": 0.2901220741404413, + "grad_norm": 1.2939479351043701, + "learning_rate": 2.5890000000000005e-05, + "loss": 0.3787, + "step": 5181 + }, + { + "epoch": 0.2901780714525703, + "grad_norm": 1.2962067127227783, + "learning_rate": 2.5895000000000002e-05, + "loss": 0.468, + "step": 5182 + }, + { + "epoch": 0.2902340687646993, + "grad_norm": 1.5894113779067993, + "learning_rate": 2.5900000000000003e-05, + "loss": 0.5801, + "step": 5183 + }, + { + "epoch": 0.29029006607682833, + "grad_norm": 1.2547224760055542, + "learning_rate": 2.5905e-05, + "loss": 0.4094, + "step": 5184 + }, + { + "epoch": 0.29034606338895735, + "grad_norm": 2.165175676345825, + "learning_rate": 2.591e-05, + "loss": 0.4425, + "step": 5185 + }, + { + "epoch": 0.2904020607010864, + "grad_norm": 1.166042685508728, + "learning_rate": 2.5915000000000002e-05, + "loss": 0.376, + "step": 5186 + }, + { + "epoch": 0.2904580580132154, + "grad_norm": 1.2118500471115112, + "learning_rate": 2.592e-05, + "loss": 0.3256, + "step": 5187 + }, + { + "epoch": 0.29051405532534436, + "grad_norm": 1.4399436712265015, + "learning_rate": 2.5925e-05, + "loss": 0.5189, + "step": 5188 + }, + { + "epoch": 0.2905700526374734, + "grad_norm": 1.0340296030044556, + "learning_rate": 2.5929999999999997e-05, + "loss": 0.2661, + "step": 5189 + }, + { + "epoch": 0.2906260499496024, + "grad_norm": 1.224493384361267, + "learning_rate": 
2.5935000000000005e-05, + "loss": 0.4585, + "step": 5190 + }, + { + "epoch": 0.2906820472617314, + "grad_norm": 1.3201141357421875, + "learning_rate": 2.5940000000000002e-05, + "loss": 0.4362, + "step": 5191 + }, + { + "epoch": 0.29073804457386043, + "grad_norm": 1.560363531112671, + "learning_rate": 2.5945000000000003e-05, + "loss": 0.5274, + "step": 5192 + }, + { + "epoch": 0.29079404188598945, + "grad_norm": 1.2800922393798828, + "learning_rate": 2.595e-05, + "loss": 0.3426, + "step": 5193 + }, + { + "epoch": 0.29085003919811847, + "grad_norm": 1.2105079889297485, + "learning_rate": 2.5955e-05, + "loss": 0.362, + "step": 5194 + }, + { + "epoch": 0.2909060365102475, + "grad_norm": 1.2776095867156982, + "learning_rate": 2.5960000000000002e-05, + "loss": 0.3604, + "step": 5195 + }, + { + "epoch": 0.2909620338223765, + "grad_norm": 1.4299017190933228, + "learning_rate": 2.5965e-05, + "loss": 0.4469, + "step": 5196 + }, + { + "epoch": 0.29101803113450553, + "grad_norm": 1.1238104104995728, + "learning_rate": 2.597e-05, + "loss": 0.3655, + "step": 5197 + }, + { + "epoch": 0.29107402844663455, + "grad_norm": 1.1724493503570557, + "learning_rate": 2.5974999999999998e-05, + "loss": 0.4825, + "step": 5198 + }, + { + "epoch": 0.29113002575876357, + "grad_norm": 1.156067967414856, + "learning_rate": 2.598e-05, + "loss": 0.4406, + "step": 5199 + }, + { + "epoch": 0.2911860230708926, + "grad_norm": 1.392245888710022, + "learning_rate": 2.5985000000000003e-05, + "loss": 0.4914, + "step": 5200 + }, + { + "epoch": 0.2912420203830216, + "grad_norm": 1.2607609033584595, + "learning_rate": 2.5990000000000004e-05, + "loss": 0.3389, + "step": 5201 + }, + { + "epoch": 0.2912980176951506, + "grad_norm": 1.2951900959014893, + "learning_rate": 2.5995000000000004e-05, + "loss": 0.5232, + "step": 5202 + }, + { + "epoch": 0.29135401500727964, + "grad_norm": 1.1071255207061768, + "learning_rate": 2.6000000000000002e-05, + "loss": 0.3389, + "step": 5203 + }, + { + "epoch": 
0.29141001231940866, + "grad_norm": 1.5917093753814697, + "learning_rate": 2.6005000000000003e-05, + "loss": 0.4165, + "step": 5204 + }, + { + "epoch": 0.2914660096315377, + "grad_norm": 1.740160346031189, + "learning_rate": 2.601e-05, + "loss": 0.5188, + "step": 5205 + }, + { + "epoch": 0.2915220069436667, + "grad_norm": 1.6251461505889893, + "learning_rate": 2.6015e-05, + "loss": 0.6349, + "step": 5206 + }, + { + "epoch": 0.2915780042557957, + "grad_norm": 1.1782416105270386, + "learning_rate": 2.602e-05, + "loss": 0.4572, + "step": 5207 + }, + { + "epoch": 0.29163400156792474, + "grad_norm": 1.1010757684707642, + "learning_rate": 2.6025e-05, + "loss": 0.3086, + "step": 5208 + }, + { + "epoch": 0.29168999888005376, + "grad_norm": 1.2914388179779053, + "learning_rate": 2.603e-05, + "loss": 0.4165, + "step": 5209 + }, + { + "epoch": 0.2917459961921828, + "grad_norm": 1.0075737237930298, + "learning_rate": 2.6035000000000004e-05, + "loss": 0.3249, + "step": 5210 + }, + { + "epoch": 0.2918019935043118, + "grad_norm": 1.1916017532348633, + "learning_rate": 2.6040000000000005e-05, + "loss": 0.4307, + "step": 5211 + }, + { + "epoch": 0.2918579908164408, + "grad_norm": 1.2970917224884033, + "learning_rate": 2.6045000000000002e-05, + "loss": 0.5413, + "step": 5212 + }, + { + "epoch": 0.29191398812856983, + "grad_norm": 1.0869152545928955, + "learning_rate": 2.6050000000000003e-05, + "loss": 0.3936, + "step": 5213 + }, + { + "epoch": 0.29196998544069885, + "grad_norm": 1.3564529418945312, + "learning_rate": 2.6055e-05, + "loss": 0.3797, + "step": 5214 + }, + { + "epoch": 0.2920259827528279, + "grad_norm": 1.2981070280075073, + "learning_rate": 2.606e-05, + "loss": 0.4802, + "step": 5215 + }, + { + "epoch": 0.2920819800649569, + "grad_norm": 1.3548084497451782, + "learning_rate": 2.6065000000000002e-05, + "loss": 0.3205, + "step": 5216 + }, + { + "epoch": 0.2921379773770859, + "grad_norm": 1.379685878753662, + "learning_rate": 2.607e-05, + "loss": 0.4269, + "step": 5217 + 
}, + { + "epoch": 0.29219397468921493, + "grad_norm": 1.2204341888427734, + "learning_rate": 2.6075e-05, + "loss": 0.4196, + "step": 5218 + }, + { + "epoch": 0.29224997200134395, + "grad_norm": 1.3498417139053345, + "learning_rate": 2.6079999999999998e-05, + "loss": 0.3891, + "step": 5219 + }, + { + "epoch": 0.29230596931347297, + "grad_norm": 1.0870332717895508, + "learning_rate": 2.6085000000000005e-05, + "loss": 0.3441, + "step": 5220 + }, + { + "epoch": 0.292361966625602, + "grad_norm": 1.2560654878616333, + "learning_rate": 2.6090000000000003e-05, + "loss": 0.4986, + "step": 5221 + }, + { + "epoch": 0.292417963937731, + "grad_norm": 1.2160749435424805, + "learning_rate": 2.6095000000000003e-05, + "loss": 0.5128, + "step": 5222 + }, + { + "epoch": 0.29247396124986, + "grad_norm": 1.0488581657409668, + "learning_rate": 2.61e-05, + "loss": 0.334, + "step": 5223 + }, + { + "epoch": 0.29252995856198905, + "grad_norm": 1.336931824684143, + "learning_rate": 2.6105e-05, + "loss": 0.4615, + "step": 5224 + }, + { + "epoch": 0.29258595587411806, + "grad_norm": 1.1971161365509033, + "learning_rate": 2.6110000000000002e-05, + "loss": 0.5235, + "step": 5225 + }, + { + "epoch": 0.2926419531862471, + "grad_norm": 1.2192145586013794, + "learning_rate": 2.6115e-05, + "loss": 0.4185, + "step": 5226 + }, + { + "epoch": 0.2926979504983761, + "grad_norm": 1.3168421983718872, + "learning_rate": 2.612e-05, + "loss": 0.4166, + "step": 5227 + }, + { + "epoch": 0.2927539478105051, + "grad_norm": 1.271488070487976, + "learning_rate": 2.6124999999999998e-05, + "loss": 0.5113, + "step": 5228 + }, + { + "epoch": 0.29280994512263414, + "grad_norm": 1.1534937620162964, + "learning_rate": 2.613e-05, + "loss": 0.4285, + "step": 5229 + }, + { + "epoch": 0.2928659424347631, + "grad_norm": 1.4117984771728516, + "learning_rate": 2.6135000000000003e-05, + "loss": 0.4157, + "step": 5230 + }, + { + "epoch": 0.2929219397468921, + "grad_norm": 1.0014228820800781, + "learning_rate": 
2.6140000000000004e-05, + "loss": 0.3379, + "step": 5231 + }, + { + "epoch": 0.29297793705902114, + "grad_norm": 1.0868574380874634, + "learning_rate": 2.6145e-05, + "loss": 0.4443, + "step": 5232 + }, + { + "epoch": 0.29303393437115016, + "grad_norm": 1.707353949546814, + "learning_rate": 2.6150000000000002e-05, + "loss": 0.4213, + "step": 5233 + }, + { + "epoch": 0.2930899316832792, + "grad_norm": 1.113995909690857, + "learning_rate": 2.6155000000000003e-05, + "loss": 0.2867, + "step": 5234 + }, + { + "epoch": 0.2931459289954082, + "grad_norm": 0.9036622047424316, + "learning_rate": 2.616e-05, + "loss": 0.2777, + "step": 5235 + }, + { + "epoch": 0.2932019263075372, + "grad_norm": 5.723801136016846, + "learning_rate": 2.6165e-05, + "loss": 0.4799, + "step": 5236 + }, + { + "epoch": 0.29325792361966624, + "grad_norm": 1.1289016008377075, + "learning_rate": 2.617e-05, + "loss": 0.3623, + "step": 5237 + }, + { + "epoch": 0.29331392093179526, + "grad_norm": 1.2886630296707153, + "learning_rate": 2.6175e-05, + "loss": 0.3451, + "step": 5238 + }, + { + "epoch": 0.2933699182439243, + "grad_norm": 1.478747844696045, + "learning_rate": 2.618e-05, + "loss": 0.4712, + "step": 5239 + }, + { + "epoch": 0.2934259155560533, + "grad_norm": 1.2470319271087646, + "learning_rate": 2.6185000000000004e-05, + "loss": 0.4373, + "step": 5240 + }, + { + "epoch": 0.2934819128681823, + "grad_norm": 1.5660260915756226, + "learning_rate": 2.6190000000000005e-05, + "loss": 0.5966, + "step": 5241 + }, + { + "epoch": 0.29353791018031133, + "grad_norm": 1.3090187311172485, + "learning_rate": 2.6195000000000002e-05, + "loss": 0.4731, + "step": 5242 + }, + { + "epoch": 0.29359390749244035, + "grad_norm": 1.6193612813949585, + "learning_rate": 2.6200000000000003e-05, + "loss": 0.4805, + "step": 5243 + }, + { + "epoch": 0.2936499048045694, + "grad_norm": 1.3604973554611206, + "learning_rate": 2.6205e-05, + "loss": 0.4926, + "step": 5244 + }, + { + "epoch": 0.2937059021166984, + "grad_norm": 
1.1638587713241577, + "learning_rate": 2.621e-05, + "loss": 0.4017, + "step": 5245 + }, + { + "epoch": 0.2937618994288274, + "grad_norm": 1.0777242183685303, + "learning_rate": 2.6215000000000002e-05, + "loss": 0.371, + "step": 5246 + }, + { + "epoch": 0.29381789674095643, + "grad_norm": 1.2868236303329468, + "learning_rate": 2.622e-05, + "loss": 0.4956, + "step": 5247 + }, + { + "epoch": 0.29387389405308545, + "grad_norm": 1.0613162517547607, + "learning_rate": 2.6225e-05, + "loss": 0.3956, + "step": 5248 + }, + { + "epoch": 0.29392989136521447, + "grad_norm": 1.9690322875976562, + "learning_rate": 2.6229999999999998e-05, + "loss": 0.4669, + "step": 5249 + }, + { + "epoch": 0.2939858886773435, + "grad_norm": 1.1180588006973267, + "learning_rate": 2.6235000000000005e-05, + "loss": 0.4524, + "step": 5250 + }, + { + "epoch": 0.2940418859894725, + "grad_norm": 1.1822772026062012, + "learning_rate": 2.6240000000000003e-05, + "loss": 0.4241, + "step": 5251 + }, + { + "epoch": 0.2940978833016015, + "grad_norm": 1.2001798152923584, + "learning_rate": 2.6245000000000004e-05, + "loss": 0.4325, + "step": 5252 + }, + { + "epoch": 0.29415388061373055, + "grad_norm": 1.5586347579956055, + "learning_rate": 2.625e-05, + "loss": 0.6026, + "step": 5253 + }, + { + "epoch": 0.29420987792585956, + "grad_norm": 1.2429914474487305, + "learning_rate": 2.6255000000000002e-05, + "loss": 0.4933, + "step": 5254 + }, + { + "epoch": 0.2942658752379886, + "grad_norm": 1.1091340780258179, + "learning_rate": 2.6260000000000003e-05, + "loss": 0.416, + "step": 5255 + }, + { + "epoch": 0.2943218725501176, + "grad_norm": 1.7527451515197754, + "learning_rate": 2.6265e-05, + "loss": 0.656, + "step": 5256 + }, + { + "epoch": 0.2943778698622466, + "grad_norm": 1.5141398906707764, + "learning_rate": 2.627e-05, + "loss": 0.6055, + "step": 5257 + }, + { + "epoch": 0.29443386717437564, + "grad_norm": 1.19267737865448, + "learning_rate": 2.6275e-05, + "loss": 0.3927, + "step": 5258 + }, + { + "epoch": 
0.29448986448650466, + "grad_norm": 1.6720216274261475, + "learning_rate": 2.628e-05, + "loss": 0.5716, + "step": 5259 + }, + { + "epoch": 0.2945458617986337, + "grad_norm": 1.1752185821533203, + "learning_rate": 2.6285e-05, + "loss": 0.4485, + "step": 5260 + }, + { + "epoch": 0.2946018591107627, + "grad_norm": 1.1701414585113525, + "learning_rate": 2.6290000000000004e-05, + "loss": 0.4355, + "step": 5261 + }, + { + "epoch": 0.2946578564228917, + "grad_norm": 1.6818978786468506, + "learning_rate": 2.6295e-05, + "loss": 0.5101, + "step": 5262 + }, + { + "epoch": 0.29471385373502074, + "grad_norm": 1.0774511098861694, + "learning_rate": 2.6300000000000002e-05, + "loss": 0.3872, + "step": 5263 + }, + { + "epoch": 0.29476985104714976, + "grad_norm": 1.1634888648986816, + "learning_rate": 2.6305000000000003e-05, + "loss": 0.4338, + "step": 5264 + }, + { + "epoch": 0.2948258483592788, + "grad_norm": 1.1789129972457886, + "learning_rate": 2.631e-05, + "loss": 0.4044, + "step": 5265 + }, + { + "epoch": 0.2948818456714078, + "grad_norm": 1.176689624786377, + "learning_rate": 2.6315e-05, + "loss": 0.4996, + "step": 5266 + }, + { + "epoch": 0.2949378429835368, + "grad_norm": 1.0302584171295166, + "learning_rate": 2.632e-05, + "loss": 0.3189, + "step": 5267 + }, + { + "epoch": 0.29499384029566583, + "grad_norm": 2.2208893299102783, + "learning_rate": 2.6325e-05, + "loss": 0.6289, + "step": 5268 + }, + { + "epoch": 0.29504983760779485, + "grad_norm": 1.3514935970306396, + "learning_rate": 2.633e-05, + "loss": 0.5208, + "step": 5269 + }, + { + "epoch": 0.29510583491992387, + "grad_norm": 1.4031091928482056, + "learning_rate": 2.6334999999999998e-05, + "loss": 0.4056, + "step": 5270 + }, + { + "epoch": 0.29516183223205283, + "grad_norm": 1.3235684633255005, + "learning_rate": 2.6340000000000002e-05, + "loss": 0.4818, + "step": 5271 + }, + { + "epoch": 0.29521782954418185, + "grad_norm": 1.2179088592529297, + "learning_rate": 2.6345000000000003e-05, + "loss": 0.4226, + "step": 
5272 + }, + { + "epoch": 0.2952738268563109, + "grad_norm": 1.4105010032653809, + "learning_rate": 2.6350000000000004e-05, + "loss": 0.5017, + "step": 5273 + }, + { + "epoch": 0.2953298241684399, + "grad_norm": 3.7600419521331787, + "learning_rate": 2.6355e-05, + "loss": 0.4592, + "step": 5274 + }, + { + "epoch": 0.2953858214805689, + "grad_norm": 1.1268954277038574, + "learning_rate": 2.6360000000000002e-05, + "loss": 0.4334, + "step": 5275 + }, + { + "epoch": 0.29544181879269793, + "grad_norm": 1.2404820919036865, + "learning_rate": 2.6365e-05, + "loss": 0.3669, + "step": 5276 + }, + { + "epoch": 0.29549781610482695, + "grad_norm": 1.1679812669754028, + "learning_rate": 2.637e-05, + "loss": 0.478, + "step": 5277 + }, + { + "epoch": 0.29555381341695597, + "grad_norm": 1.0536789894104004, + "learning_rate": 2.6375e-05, + "loss": 0.3539, + "step": 5278 + }, + { + "epoch": 0.295609810729085, + "grad_norm": 1.0712404251098633, + "learning_rate": 2.6379999999999998e-05, + "loss": 0.4798, + "step": 5279 + }, + { + "epoch": 0.295665808041214, + "grad_norm": 1.2229100465774536, + "learning_rate": 2.6385e-05, + "loss": 0.3869, + "step": 5280 + }, + { + "epoch": 0.295721805353343, + "grad_norm": 1.2663319110870361, + "learning_rate": 2.6390000000000003e-05, + "loss": 0.4911, + "step": 5281 + }, + { + "epoch": 0.29577780266547204, + "grad_norm": 1.721091628074646, + "learning_rate": 2.6395000000000004e-05, + "loss": 0.5603, + "step": 5282 + }, + { + "epoch": 0.29583379997760106, + "grad_norm": 1.3844891786575317, + "learning_rate": 2.64e-05, + "loss": 0.4141, + "step": 5283 + }, + { + "epoch": 0.2958897972897301, + "grad_norm": 1.3836339712142944, + "learning_rate": 2.6405000000000002e-05, + "loss": 0.485, + "step": 5284 + }, + { + "epoch": 0.2959457946018591, + "grad_norm": 1.2061278820037842, + "learning_rate": 2.6410000000000003e-05, + "loss": 0.3872, + "step": 5285 + }, + { + "epoch": 0.2960017919139881, + "grad_norm": 1.4442470073699951, + "learning_rate": 2.6415e-05, + 
"loss": 0.3686, + "step": 5286 + }, + { + "epoch": 0.29605778922611714, + "grad_norm": 1.3720992803573608, + "learning_rate": 2.642e-05, + "loss": 0.5564, + "step": 5287 + }, + { + "epoch": 0.29611378653824616, + "grad_norm": 1.1148436069488525, + "learning_rate": 2.6425e-05, + "loss": 0.3856, + "step": 5288 + }, + { + "epoch": 0.2961697838503752, + "grad_norm": 1.1165697574615479, + "learning_rate": 2.643e-05, + "loss": 0.3491, + "step": 5289 + }, + { + "epoch": 0.2962257811625042, + "grad_norm": 1.0927644968032837, + "learning_rate": 2.6435e-05, + "loss": 0.3341, + "step": 5290 + }, + { + "epoch": 0.2962817784746332, + "grad_norm": 1.2208930253982544, + "learning_rate": 2.6440000000000004e-05, + "loss": 0.4179, + "step": 5291 + }, + { + "epoch": 0.29633777578676224, + "grad_norm": 1.1353740692138672, + "learning_rate": 2.6445000000000002e-05, + "loss": 0.3374, + "step": 5292 + }, + { + "epoch": 0.29639377309889126, + "grad_norm": 1.3508774042129517, + "learning_rate": 2.6450000000000003e-05, + "loss": 0.4931, + "step": 5293 + }, + { + "epoch": 0.2964497704110203, + "grad_norm": 1.188261866569519, + "learning_rate": 2.6455000000000003e-05, + "loss": 0.3865, + "step": 5294 + }, + { + "epoch": 0.2965057677231493, + "grad_norm": 1.3219794034957886, + "learning_rate": 2.646e-05, + "loss": 0.4982, + "step": 5295 + }, + { + "epoch": 0.2965617650352783, + "grad_norm": 1.3908417224884033, + "learning_rate": 2.6465e-05, + "loss": 0.5614, + "step": 5296 + }, + { + "epoch": 0.29661776234740733, + "grad_norm": 0.9926604628562927, + "learning_rate": 2.647e-05, + "loss": 0.3857, + "step": 5297 + }, + { + "epoch": 0.29667375965953635, + "grad_norm": 1.7230104207992554, + "learning_rate": 2.6475e-05, + "loss": 0.5869, + "step": 5298 + }, + { + "epoch": 0.29672975697166537, + "grad_norm": 1.4793049097061157, + "learning_rate": 2.648e-05, + "loss": 0.4726, + "step": 5299 + }, + { + "epoch": 0.2967857542837944, + "grad_norm": 1.1072078943252563, + "learning_rate": 
2.6484999999999998e-05, + "loss": 0.3418, + "step": 5300 + }, + { + "epoch": 0.2968417515959234, + "grad_norm": 1.1886978149414062, + "learning_rate": 2.6490000000000002e-05, + "loss": 0.395, + "step": 5301 + }, + { + "epoch": 0.29689774890805243, + "grad_norm": 1.1559021472930908, + "learning_rate": 2.6495000000000003e-05, + "loss": 0.4542, + "step": 5302 + }, + { + "epoch": 0.29695374622018145, + "grad_norm": 1.074407935142517, + "learning_rate": 2.6500000000000004e-05, + "loss": 0.4714, + "step": 5303 + }, + { + "epoch": 0.29700974353231047, + "grad_norm": 1.2388737201690674, + "learning_rate": 2.6505e-05, + "loss": 0.4807, + "step": 5304 + }, + { + "epoch": 0.2970657408444395, + "grad_norm": 1.353883981704712, + "learning_rate": 2.6510000000000002e-05, + "loss": 0.372, + "step": 5305 + }, + { + "epoch": 0.2971217381565685, + "grad_norm": 0.8923586010932922, + "learning_rate": 2.6515e-05, + "loss": 0.254, + "step": 5306 + }, + { + "epoch": 0.2971777354686975, + "grad_norm": 1.645772933959961, + "learning_rate": 2.652e-05, + "loss": 0.5328, + "step": 5307 + }, + { + "epoch": 0.29723373278082654, + "grad_norm": 1.2057169675827026, + "learning_rate": 2.6525e-05, + "loss": 0.4626, + "step": 5308 + }, + { + "epoch": 0.29728973009295556, + "grad_norm": 1.567265510559082, + "learning_rate": 2.653e-05, + "loss": 0.6087, + "step": 5309 + }, + { + "epoch": 0.2973457274050846, + "grad_norm": 1.189358115196228, + "learning_rate": 2.6535e-05, + "loss": 0.5019, + "step": 5310 + }, + { + "epoch": 0.2974017247172136, + "grad_norm": 1.2605701684951782, + "learning_rate": 2.6540000000000003e-05, + "loss": 0.4445, + "step": 5311 + }, + { + "epoch": 0.29745772202934256, + "grad_norm": 1.3523656129837036, + "learning_rate": 2.6545000000000004e-05, + "loss": 0.4203, + "step": 5312 + }, + { + "epoch": 0.2975137193414716, + "grad_norm": 1.2546820640563965, + "learning_rate": 2.655e-05, + "loss": 0.3697, + "step": 5313 + }, + { + "epoch": 0.2975697166536006, + "grad_norm": 
1.0982375144958496, + "learning_rate": 2.6555000000000002e-05, + "loss": 0.3868, + "step": 5314 + }, + { + "epoch": 0.2976257139657296, + "grad_norm": 1.1533113718032837, + "learning_rate": 2.6560000000000003e-05, + "loss": 0.4109, + "step": 5315 + }, + { + "epoch": 0.29768171127785864, + "grad_norm": 1.237347960472107, + "learning_rate": 2.6565e-05, + "loss": 0.4632, + "step": 5316 + }, + { + "epoch": 0.29773770858998766, + "grad_norm": 1.1130071878433228, + "learning_rate": 2.657e-05, + "loss": 0.4277, + "step": 5317 + }, + { + "epoch": 0.2977937059021167, + "grad_norm": 1.2339191436767578, + "learning_rate": 2.6575e-05, + "loss": 0.3036, + "step": 5318 + }, + { + "epoch": 0.2978497032142457, + "grad_norm": 1.2246606349945068, + "learning_rate": 2.658e-05, + "loss": 0.4144, + "step": 5319 + }, + { + "epoch": 0.2979057005263747, + "grad_norm": 1.2826143503189087, + "learning_rate": 2.6585e-05, + "loss": 0.5132, + "step": 5320 + }, + { + "epoch": 0.29796169783850374, + "grad_norm": 1.2820706367492676, + "learning_rate": 2.6590000000000005e-05, + "loss": 0.3742, + "step": 5321 + }, + { + "epoch": 0.29801769515063276, + "grad_norm": 1.1958434581756592, + "learning_rate": 2.6595000000000002e-05, + "loss": 0.4186, + "step": 5322 + }, + { + "epoch": 0.2980736924627618, + "grad_norm": 1.3425551652908325, + "learning_rate": 2.6600000000000003e-05, + "loss": 0.3936, + "step": 5323 + }, + { + "epoch": 0.2981296897748908, + "grad_norm": 1.3036679029464722, + "learning_rate": 2.6605000000000004e-05, + "loss": 0.4485, + "step": 5324 + }, + { + "epoch": 0.2981856870870198, + "grad_norm": 1.1589664220809937, + "learning_rate": 2.661e-05, + "loss": 0.384, + "step": 5325 + }, + { + "epoch": 0.29824168439914883, + "grad_norm": 1.0981476306915283, + "learning_rate": 2.6615000000000002e-05, + "loss": 0.3953, + "step": 5326 + }, + { + "epoch": 0.29829768171127785, + "grad_norm": 1.6495417356491089, + "learning_rate": 2.662e-05, + "loss": 0.4254, + "step": 5327 + }, + { + "epoch": 
0.29835367902340687, + "grad_norm": 2.0696768760681152, + "learning_rate": 2.6625e-05, + "loss": 0.4943, + "step": 5328 + }, + { + "epoch": 0.2984096763355359, + "grad_norm": 1.2074174880981445, + "learning_rate": 2.663e-05, + "loss": 0.3937, + "step": 5329 + }, + { + "epoch": 0.2984656736476649, + "grad_norm": 1.1499083042144775, + "learning_rate": 2.6634999999999998e-05, + "loss": 0.3489, + "step": 5330 + }, + { + "epoch": 0.2985216709597939, + "grad_norm": 1.2360188961029053, + "learning_rate": 2.6640000000000002e-05, + "loss": 0.4464, + "step": 5331 + }, + { + "epoch": 0.29857766827192295, + "grad_norm": 1.1654911041259766, + "learning_rate": 2.6645000000000003e-05, + "loss": 0.4109, + "step": 5332 + }, + { + "epoch": 0.29863366558405197, + "grad_norm": 1.2289232015609741, + "learning_rate": 2.6650000000000004e-05, + "loss": 0.3535, + "step": 5333 + }, + { + "epoch": 0.298689662896181, + "grad_norm": 1.474075436592102, + "learning_rate": 2.6655e-05, + "loss": 0.5, + "step": 5334 + }, + { + "epoch": 0.29874566020831, + "grad_norm": 1.3659262657165527, + "learning_rate": 2.6660000000000002e-05, + "loss": 0.5725, + "step": 5335 + }, + { + "epoch": 0.298801657520439, + "grad_norm": 1.4412240982055664, + "learning_rate": 2.6665e-05, + "loss": 0.4145, + "step": 5336 + }, + { + "epoch": 0.29885765483256804, + "grad_norm": 1.2272512912750244, + "learning_rate": 2.667e-05, + "loss": 0.5215, + "step": 5337 + }, + { + "epoch": 0.29891365214469706, + "grad_norm": 1.1624239683151245, + "learning_rate": 2.6675e-05, + "loss": 0.545, + "step": 5338 + }, + { + "epoch": 0.2989696494568261, + "grad_norm": 1.381621241569519, + "learning_rate": 2.668e-05, + "loss": 0.6109, + "step": 5339 + }, + { + "epoch": 0.2990256467689551, + "grad_norm": 1.4536820650100708, + "learning_rate": 2.6685e-05, + "loss": 0.5272, + "step": 5340 + }, + { + "epoch": 0.2990816440810841, + "grad_norm": 0.9184737801551819, + "learning_rate": 2.6690000000000004e-05, + "loss": 0.3607, + "step": 5341 + }, + { 
+ "epoch": 0.29913764139321314, + "grad_norm": 1.2206549644470215, + "learning_rate": 2.6695000000000004e-05, + "loss": 0.4182, + "step": 5342 + }, + { + "epoch": 0.29919363870534216, + "grad_norm": 1.0115301609039307, + "learning_rate": 2.6700000000000002e-05, + "loss": 0.3466, + "step": 5343 + }, + { + "epoch": 0.2992496360174712, + "grad_norm": 1.1144261360168457, + "learning_rate": 2.6705000000000003e-05, + "loss": 0.387, + "step": 5344 + }, + { + "epoch": 0.2993056333296002, + "grad_norm": 1.4079464673995972, + "learning_rate": 2.671e-05, + "loss": 0.6056, + "step": 5345 + }, + { + "epoch": 0.2993616306417292, + "grad_norm": 1.191615104675293, + "learning_rate": 2.6715e-05, + "loss": 0.4494, + "step": 5346 + }, + { + "epoch": 0.29941762795385823, + "grad_norm": 0.9942251443862915, + "learning_rate": 2.672e-05, + "loss": 0.3627, + "step": 5347 + }, + { + "epoch": 0.29947362526598725, + "grad_norm": 1.1981528997421265, + "learning_rate": 2.6725e-05, + "loss": 0.5028, + "step": 5348 + }, + { + "epoch": 0.2995296225781163, + "grad_norm": 1.0284557342529297, + "learning_rate": 2.673e-05, + "loss": 0.4042, + "step": 5349 + }, + { + "epoch": 0.2995856198902453, + "grad_norm": 1.6800273656845093, + "learning_rate": 2.6734999999999997e-05, + "loss": 0.5364, + "step": 5350 + }, + { + "epoch": 0.2996416172023743, + "grad_norm": 1.4248425960540771, + "learning_rate": 2.6740000000000005e-05, + "loss": 0.5269, + "step": 5351 + }, + { + "epoch": 0.29969761451450333, + "grad_norm": 1.2523330450057983, + "learning_rate": 2.6745000000000002e-05, + "loss": 0.5303, + "step": 5352 + }, + { + "epoch": 0.29975361182663235, + "grad_norm": 1.0883934497833252, + "learning_rate": 2.6750000000000003e-05, + "loss": 0.3841, + "step": 5353 + }, + { + "epoch": 0.2998096091387613, + "grad_norm": 1.0700803995132446, + "learning_rate": 2.6755000000000004e-05, + "loss": 0.393, + "step": 5354 + }, + { + "epoch": 0.29986560645089033, + "grad_norm": 1.1892080307006836, + "learning_rate": 2.676e-05, 
+ "loss": 0.299, + "step": 5355 + }, + { + "epoch": 0.29992160376301935, + "grad_norm": 0.9703457355499268, + "learning_rate": 2.6765000000000002e-05, + "loss": 0.3694, + "step": 5356 + }, + { + "epoch": 0.29997760107514837, + "grad_norm": 4.798460483551025, + "learning_rate": 2.677e-05, + "loss": 0.4071, + "step": 5357 + }, + { + "epoch": 0.3000335983872774, + "grad_norm": 1.0188037157058716, + "learning_rate": 2.6775e-05, + "loss": 0.4911, + "step": 5358 + }, + { + "epoch": 0.3000895956994064, + "grad_norm": 1.1308927536010742, + "learning_rate": 2.678e-05, + "loss": 0.4291, + "step": 5359 + }, + { + "epoch": 0.3001455930115354, + "grad_norm": 1.2467466592788696, + "learning_rate": 2.6785e-05, + "loss": 0.4322, + "step": 5360 + }, + { + "epoch": 0.30020159032366445, + "grad_norm": 1.0073515176773071, + "learning_rate": 2.6790000000000003e-05, + "loss": 0.3015, + "step": 5361 + }, + { + "epoch": 0.30025758763579347, + "grad_norm": 1.4517663717269897, + "learning_rate": 2.6795000000000003e-05, + "loss": 0.5408, + "step": 5362 + }, + { + "epoch": 0.3003135849479225, + "grad_norm": 1.2505501508712769, + "learning_rate": 2.6800000000000004e-05, + "loss": 0.4191, + "step": 5363 + }, + { + "epoch": 0.3003695822600515, + "grad_norm": 1.104689359664917, + "learning_rate": 2.6805000000000002e-05, + "loss": 0.3857, + "step": 5364 + }, + { + "epoch": 0.3004255795721805, + "grad_norm": 1.179711103439331, + "learning_rate": 2.6810000000000003e-05, + "loss": 0.5031, + "step": 5365 + }, + { + "epoch": 0.30048157688430954, + "grad_norm": 1.175329327583313, + "learning_rate": 2.6815e-05, + "loss": 0.3959, + "step": 5366 + }, + { + "epoch": 0.30053757419643856, + "grad_norm": 1.5125898122787476, + "learning_rate": 2.682e-05, + "loss": 0.345, + "step": 5367 + }, + { + "epoch": 0.3005935715085676, + "grad_norm": 1.0912848711013794, + "learning_rate": 2.6825e-05, + "loss": 0.4557, + "step": 5368 + }, + { + "epoch": 0.3006495688206966, + "grad_norm": 1.1836665868759155, + 
"learning_rate": 2.683e-05, + "loss": 0.4663, + "step": 5369 + }, + { + "epoch": 0.3007055661328256, + "grad_norm": 1.2546424865722656, + "learning_rate": 2.6835e-05, + "loss": 0.4343, + "step": 5370 + }, + { + "epoch": 0.30076156344495464, + "grad_norm": 1.251499891281128, + "learning_rate": 2.6840000000000004e-05, + "loss": 0.5248, + "step": 5371 + }, + { + "epoch": 0.30081756075708366, + "grad_norm": 1.3481758832931519, + "learning_rate": 2.6845000000000005e-05, + "loss": 0.3643, + "step": 5372 + }, + { + "epoch": 0.3008735580692127, + "grad_norm": 1.3178684711456299, + "learning_rate": 2.6850000000000002e-05, + "loss": 0.4057, + "step": 5373 + }, + { + "epoch": 0.3009295553813417, + "grad_norm": 2.1568729877471924, + "learning_rate": 2.6855000000000003e-05, + "loss": 0.4088, + "step": 5374 + }, + { + "epoch": 0.3009855526934707, + "grad_norm": 1.8466178178787231, + "learning_rate": 2.686e-05, + "loss": 0.5227, + "step": 5375 + }, + { + "epoch": 0.30104155000559973, + "grad_norm": 1.2152745723724365, + "learning_rate": 2.6865e-05, + "loss": 0.5257, + "step": 5376 + }, + { + "epoch": 0.30109754731772875, + "grad_norm": 1.4119421243667603, + "learning_rate": 2.6870000000000002e-05, + "loss": 0.3585, + "step": 5377 + }, + { + "epoch": 0.3011535446298578, + "grad_norm": 1.2167298793792725, + "learning_rate": 2.6875e-05, + "loss": 0.4351, + "step": 5378 + }, + { + "epoch": 0.3012095419419868, + "grad_norm": 1.2719478607177734, + "learning_rate": 2.688e-05, + "loss": 0.4216, + "step": 5379 + }, + { + "epoch": 0.3012655392541158, + "grad_norm": 1.229483962059021, + "learning_rate": 2.6884999999999998e-05, + "loss": 0.4044, + "step": 5380 + }, + { + "epoch": 0.30132153656624483, + "grad_norm": 1.5179332494735718, + "learning_rate": 2.689e-05, + "loss": 0.3877, + "step": 5381 + }, + { + "epoch": 0.30137753387837385, + "grad_norm": 1.4955464601516724, + "learning_rate": 2.6895000000000003e-05, + "loss": 0.5172, + "step": 5382 + }, + { + "epoch": 0.30143353119050287, + 
"grad_norm": 1.1615431308746338, + "learning_rate": 2.6900000000000003e-05, + "loss": 0.443, + "step": 5383 + }, + { + "epoch": 0.3014895285026319, + "grad_norm": 1.117387294769287, + "learning_rate": 2.6905e-05, + "loss": 0.3921, + "step": 5384 + }, + { + "epoch": 0.3015455258147609, + "grad_norm": 1.1309614181518555, + "learning_rate": 2.691e-05, + "loss": 0.4126, + "step": 5385 + }, + { + "epoch": 0.3016015231268899, + "grad_norm": 1.202384352684021, + "learning_rate": 2.6915000000000002e-05, + "loss": 0.4628, + "step": 5386 + }, + { + "epoch": 0.30165752043901894, + "grad_norm": 1.3168935775756836, + "learning_rate": 2.692e-05, + "loss": 0.4685, + "step": 5387 + }, + { + "epoch": 0.30171351775114796, + "grad_norm": 1.3338346481323242, + "learning_rate": 2.6925e-05, + "loss": 0.4951, + "step": 5388 + }, + { + "epoch": 0.301769515063277, + "grad_norm": 1.2364870309829712, + "learning_rate": 2.693e-05, + "loss": 0.3585, + "step": 5389 + }, + { + "epoch": 0.301825512375406, + "grad_norm": 1.512573003768921, + "learning_rate": 2.6935e-05, + "loss": 0.4172, + "step": 5390 + }, + { + "epoch": 0.301881509687535, + "grad_norm": 1.2886953353881836, + "learning_rate": 2.694e-05, + "loss": 0.4066, + "step": 5391 + }, + { + "epoch": 0.30193750699966404, + "grad_norm": 1.155105471611023, + "learning_rate": 2.6945000000000004e-05, + "loss": 0.3305, + "step": 5392 + }, + { + "epoch": 0.30199350431179306, + "grad_norm": 1.0350037813186646, + "learning_rate": 2.6950000000000005e-05, + "loss": 0.4095, + "step": 5393 + }, + { + "epoch": 0.3020495016239221, + "grad_norm": 1.0273566246032715, + "learning_rate": 2.6955000000000002e-05, + "loss": 0.3563, + "step": 5394 + }, + { + "epoch": 0.30210549893605104, + "grad_norm": 1.1574289798736572, + "learning_rate": 2.6960000000000003e-05, + "loss": 0.5187, + "step": 5395 + }, + { + "epoch": 0.30216149624818006, + "grad_norm": 1.1745827198028564, + "learning_rate": 2.6965e-05, + "loss": 0.3549, + "step": 5396 + }, + { + "epoch": 
0.3022174935603091, + "grad_norm": 1.4617410898208618, + "learning_rate": 2.697e-05, + "loss": 0.5529, + "step": 5397 + }, + { + "epoch": 0.3022734908724381, + "grad_norm": 2.2632431983947754, + "learning_rate": 2.6975000000000002e-05, + "loss": 0.5178, + "step": 5398 + }, + { + "epoch": 0.3023294881845671, + "grad_norm": 1.0546752214431763, + "learning_rate": 2.698e-05, + "loss": 0.3277, + "step": 5399 + }, + { + "epoch": 0.30238548549669614, + "grad_norm": 1.3697587251663208, + "learning_rate": 2.6985e-05, + "loss": 0.4477, + "step": 5400 + }, + { + "epoch": 0.30244148280882516, + "grad_norm": 1.1841217279434204, + "learning_rate": 2.6989999999999997e-05, + "loss": 0.6534, + "step": 5401 + }, + { + "epoch": 0.3024974801209542, + "grad_norm": 1.2722182273864746, + "learning_rate": 2.6995000000000005e-05, + "loss": 0.3931, + "step": 5402 + }, + { + "epoch": 0.3025534774330832, + "grad_norm": 1.2848879098892212, + "learning_rate": 2.7000000000000002e-05, + "loss": 0.3851, + "step": 5403 + }, + { + "epoch": 0.3026094747452122, + "grad_norm": 1.2802610397338867, + "learning_rate": 2.7005000000000003e-05, + "loss": 0.5178, + "step": 5404 + }, + { + "epoch": 0.30266547205734123, + "grad_norm": 1.2036280632019043, + "learning_rate": 2.701e-05, + "loss": 0.4296, + "step": 5405 + }, + { + "epoch": 0.30272146936947025, + "grad_norm": 1.3071026802062988, + "learning_rate": 2.7015e-05, + "loss": 0.5502, + "step": 5406 + }, + { + "epoch": 0.30277746668159927, + "grad_norm": 1.4859977960586548, + "learning_rate": 2.7020000000000002e-05, + "loss": 0.5912, + "step": 5407 + }, + { + "epoch": 0.3028334639937283, + "grad_norm": 1.4547327756881714, + "learning_rate": 2.7025e-05, + "loss": 0.4606, + "step": 5408 + }, + { + "epoch": 0.3028894613058573, + "grad_norm": 1.0255621671676636, + "learning_rate": 2.703e-05, + "loss": 0.4704, + "step": 5409 + }, + { + "epoch": 0.30294545861798633, + "grad_norm": 29.907569885253906, + "learning_rate": 2.7034999999999998e-05, + "loss": 0.3666, + 
"step": 5410 + }, + { + "epoch": 0.30300145593011535, + "grad_norm": 1.1806755065917969, + "learning_rate": 2.704e-05, + "loss": 0.443, + "step": 5411 + }, + { + "epoch": 0.30305745324224437, + "grad_norm": 1.1206432580947876, + "learning_rate": 2.7045000000000003e-05, + "loss": 0.3466, + "step": 5412 + }, + { + "epoch": 0.3031134505543734, + "grad_norm": 1.1510664224624634, + "learning_rate": 2.7050000000000004e-05, + "loss": 0.5531, + "step": 5413 + }, + { + "epoch": 0.3031694478665024, + "grad_norm": 1.128298044204712, + "learning_rate": 2.7055e-05, + "loss": 0.4528, + "step": 5414 + }, + { + "epoch": 0.3032254451786314, + "grad_norm": 1.5585412979125977, + "learning_rate": 2.7060000000000002e-05, + "loss": 0.6488, + "step": 5415 + }, + { + "epoch": 0.30328144249076044, + "grad_norm": 1.4108123779296875, + "learning_rate": 2.7065000000000003e-05, + "loss": 0.4687, + "step": 5416 + }, + { + "epoch": 0.30333743980288946, + "grad_norm": 0.9559827446937561, + "learning_rate": 2.707e-05, + "loss": 0.2918, + "step": 5417 + }, + { + "epoch": 0.3033934371150185, + "grad_norm": 1.0924289226531982, + "learning_rate": 2.7075e-05, + "loss": 0.4397, + "step": 5418 + }, + { + "epoch": 0.3034494344271475, + "grad_norm": 1.3185755014419556, + "learning_rate": 2.7079999999999998e-05, + "loss": 0.4814, + "step": 5419 + }, + { + "epoch": 0.3035054317392765, + "grad_norm": 1.189787745475769, + "learning_rate": 2.7085e-05, + "loss": 0.4277, + "step": 5420 + }, + { + "epoch": 0.30356142905140554, + "grad_norm": 1.108835220336914, + "learning_rate": 2.709e-05, + "loss": 0.4128, + "step": 5421 + }, + { + "epoch": 0.30361742636353456, + "grad_norm": 1.2378427982330322, + "learning_rate": 2.7095000000000004e-05, + "loss": 0.3862, + "step": 5422 + }, + { + "epoch": 0.3036734236756636, + "grad_norm": 1.2699310779571533, + "learning_rate": 2.7100000000000005e-05, + "loss": 0.3449, + "step": 5423 + }, + { + "epoch": 0.3037294209877926, + "grad_norm": 1.3861756324768066, + "learning_rate": 
2.7105000000000002e-05, + "loss": 0.5577, + "step": 5424 + }, + { + "epoch": 0.3037854182999216, + "grad_norm": 1.676414966583252, + "learning_rate": 2.7110000000000003e-05, + "loss": 0.4003, + "step": 5425 + }, + { + "epoch": 0.30384141561205064, + "grad_norm": 1.1581931114196777, + "learning_rate": 2.7115e-05, + "loss": 0.4549, + "step": 5426 + }, + { + "epoch": 0.30389741292417966, + "grad_norm": 1.4321802854537964, + "learning_rate": 2.712e-05, + "loss": 0.53, + "step": 5427 + }, + { + "epoch": 0.3039534102363087, + "grad_norm": 1.5010488033294678, + "learning_rate": 2.7125000000000002e-05, + "loss": 0.4593, + "step": 5428 + }, + { + "epoch": 0.3040094075484377, + "grad_norm": 1.455575704574585, + "learning_rate": 2.713e-05, + "loss": 0.4798, + "step": 5429 + }, + { + "epoch": 0.3040654048605667, + "grad_norm": 1.265520691871643, + "learning_rate": 2.7135e-05, + "loss": 0.6806, + "step": 5430 + }, + { + "epoch": 0.30412140217269573, + "grad_norm": 1.3100696802139282, + "learning_rate": 2.7139999999999998e-05, + "loss": 0.6165, + "step": 5431 + }, + { + "epoch": 0.30417739948482475, + "grad_norm": 1.181958556175232, + "learning_rate": 2.7145000000000005e-05, + "loss": 0.4008, + "step": 5432 + }, + { + "epoch": 0.30423339679695377, + "grad_norm": 1.2225289344787598, + "learning_rate": 2.7150000000000003e-05, + "loss": 0.3583, + "step": 5433 + }, + { + "epoch": 0.3042893941090828, + "grad_norm": 1.6007293462753296, + "learning_rate": 2.7155000000000003e-05, + "loss": 0.4714, + "step": 5434 + }, + { + "epoch": 0.3043453914212118, + "grad_norm": 1.3652026653289795, + "learning_rate": 2.716e-05, + "loss": 0.4413, + "step": 5435 + }, + { + "epoch": 0.30440138873334077, + "grad_norm": 1.3229869604110718, + "learning_rate": 2.7165e-05, + "loss": 0.3679, + "step": 5436 + }, + { + "epoch": 0.3044573860454698, + "grad_norm": 1.6404391527175903, + "learning_rate": 2.7170000000000002e-05, + "loss": 0.4618, + "step": 5437 + }, + { + "epoch": 0.3045133833575988, + "grad_norm": 
1.1321698427200317, + "learning_rate": 2.7175e-05, + "loss": 0.4808, + "step": 5438 + }, + { + "epoch": 0.30456938066972783, + "grad_norm": 1.2999236583709717, + "learning_rate": 2.718e-05, + "loss": 0.433, + "step": 5439 + }, + { + "epoch": 0.30462537798185685, + "grad_norm": 1.1787896156311035, + "learning_rate": 2.7184999999999998e-05, + "loss": 0.5162, + "step": 5440 + }, + { + "epoch": 0.30468137529398587, + "grad_norm": 1.1056458950042725, + "learning_rate": 2.719e-05, + "loss": 0.368, + "step": 5441 + }, + { + "epoch": 0.3047373726061149, + "grad_norm": 1.2190624475479126, + "learning_rate": 2.7195000000000003e-05, + "loss": 0.4291, + "step": 5442 + }, + { + "epoch": 0.3047933699182439, + "grad_norm": 1.6064512729644775, + "learning_rate": 2.7200000000000004e-05, + "loss": 0.493, + "step": 5443 + }, + { + "epoch": 0.3048493672303729, + "grad_norm": 1.5153958797454834, + "learning_rate": 2.7205e-05, + "loss": 0.4283, + "step": 5444 + }, + { + "epoch": 0.30490536454250194, + "grad_norm": 1.2916572093963623, + "learning_rate": 2.7210000000000002e-05, + "loss": 0.4106, + "step": 5445 + }, + { + "epoch": 0.30496136185463096, + "grad_norm": 1.1816558837890625, + "learning_rate": 2.7215000000000003e-05, + "loss": 0.3662, + "step": 5446 + }, + { + "epoch": 0.30501735916676, + "grad_norm": 1.0784937143325806, + "learning_rate": 2.722e-05, + "loss": 0.4712, + "step": 5447 + }, + { + "epoch": 0.305073356478889, + "grad_norm": 1.6013766527175903, + "learning_rate": 2.7225e-05, + "loss": 0.5618, + "step": 5448 + }, + { + "epoch": 0.305129353791018, + "grad_norm": 1.3034296035766602, + "learning_rate": 2.723e-05, + "loss": 0.5378, + "step": 5449 + }, + { + "epoch": 0.30518535110314704, + "grad_norm": 1.2664868831634521, + "learning_rate": 2.7235e-05, + "loss": 0.4298, + "step": 5450 + }, + { + "epoch": 0.30524134841527606, + "grad_norm": 1.2620458602905273, + "learning_rate": 2.724e-05, + "loss": 0.3901, + "step": 5451 + }, + { + "epoch": 0.3052973457274051, + 
"grad_norm": 1.152858018875122, + "learning_rate": 2.7245000000000004e-05, + "loss": 0.4303, + "step": 5452 + }, + { + "epoch": 0.3053533430395341, + "grad_norm": 1.1570677757263184, + "learning_rate": 2.725e-05, + "loss": 0.4018, + "step": 5453 + }, + { + "epoch": 0.3054093403516631, + "grad_norm": 1.1236780881881714, + "learning_rate": 2.7255000000000002e-05, + "loss": 0.3916, + "step": 5454 + }, + { + "epoch": 0.30546533766379214, + "grad_norm": 1.2584707736968994, + "learning_rate": 2.7260000000000003e-05, + "loss": 0.4971, + "step": 5455 + }, + { + "epoch": 0.30552133497592115, + "grad_norm": 1.286197304725647, + "learning_rate": 2.7265e-05, + "loss": 0.4123, + "step": 5456 + }, + { + "epoch": 0.3055773322880502, + "grad_norm": 1.1834160089492798, + "learning_rate": 2.727e-05, + "loss": 0.4101, + "step": 5457 + }, + { + "epoch": 0.3056333296001792, + "grad_norm": 0.9636449217796326, + "learning_rate": 2.7275e-05, + "loss": 0.3525, + "step": 5458 + }, + { + "epoch": 0.3056893269123082, + "grad_norm": 1.1810790300369263, + "learning_rate": 2.728e-05, + "loss": 0.4266, + "step": 5459 + }, + { + "epoch": 0.30574532422443723, + "grad_norm": 1.2433044910430908, + "learning_rate": 2.7285e-05, + "loss": 0.4085, + "step": 5460 + }, + { + "epoch": 0.30580132153656625, + "grad_norm": 1.2087873220443726, + "learning_rate": 2.7289999999999998e-05, + "loss": 0.4045, + "step": 5461 + }, + { + "epoch": 0.30585731884869527, + "grad_norm": 1.1909713745117188, + "learning_rate": 2.7295000000000005e-05, + "loss": 0.4767, + "step": 5462 + }, + { + "epoch": 0.3059133161608243, + "grad_norm": 1.2529692649841309, + "learning_rate": 2.7300000000000003e-05, + "loss": 0.3872, + "step": 5463 + }, + { + "epoch": 0.3059693134729533, + "grad_norm": 1.424086093902588, + "learning_rate": 2.7305000000000004e-05, + "loss": 0.5771, + "step": 5464 + }, + { + "epoch": 0.3060253107850823, + "grad_norm": 1.120654821395874, + "learning_rate": 2.731e-05, + "loss": 0.3698, + "step": 5465 + }, + { + 
"epoch": 0.30608130809721135, + "grad_norm": 1.3193612098693848, + "learning_rate": 2.7315000000000002e-05, + "loss": 0.417, + "step": 5466 + }, + { + "epoch": 0.30613730540934037, + "grad_norm": 1.2496871948242188, + "learning_rate": 2.7320000000000003e-05, + "loss": 0.4239, + "step": 5467 + }, + { + "epoch": 0.3061933027214694, + "grad_norm": 1.2125095129013062, + "learning_rate": 2.7325e-05, + "loss": 0.3446, + "step": 5468 + }, + { + "epoch": 0.3062493000335984, + "grad_norm": 1.3269721269607544, + "learning_rate": 2.733e-05, + "loss": 0.4778, + "step": 5469 + }, + { + "epoch": 0.3063052973457274, + "grad_norm": 1.2976957559585571, + "learning_rate": 2.7335e-05, + "loss": 0.4254, + "step": 5470 + }, + { + "epoch": 0.30636129465785644, + "grad_norm": 1.2981770038604736, + "learning_rate": 2.734e-05, + "loss": 0.4417, + "step": 5471 + }, + { + "epoch": 0.30641729196998546, + "grad_norm": 1.0654748678207397, + "learning_rate": 2.7345000000000003e-05, + "loss": 0.3958, + "step": 5472 + }, + { + "epoch": 0.3064732892821145, + "grad_norm": 1.086867094039917, + "learning_rate": 2.7350000000000004e-05, + "loss": 0.4133, + "step": 5473 + }, + { + "epoch": 0.3065292865942435, + "grad_norm": 1.3270715475082397, + "learning_rate": 2.7355e-05, + "loss": 0.5076, + "step": 5474 + }, + { + "epoch": 0.3065852839063725, + "grad_norm": 1.3053569793701172, + "learning_rate": 2.7360000000000002e-05, + "loss": 0.5096, + "step": 5475 + }, + { + "epoch": 0.30664128121850154, + "grad_norm": 1.1925352811813354, + "learning_rate": 2.7365000000000003e-05, + "loss": 0.4149, + "step": 5476 + }, + { + "epoch": 0.30669727853063056, + "grad_norm": 1.3278213739395142, + "learning_rate": 2.737e-05, + "loss": 0.4896, + "step": 5477 + }, + { + "epoch": 0.3067532758427595, + "grad_norm": 1.2239010334014893, + "learning_rate": 2.7375e-05, + "loss": 0.529, + "step": 5478 + }, + { + "epoch": 0.30680927315488854, + "grad_norm": 1.0836764574050903, + "learning_rate": 2.738e-05, + "loss": 0.3727, + 
"step": 5479 + }, + { + "epoch": 0.30686527046701756, + "grad_norm": 1.2639875411987305, + "learning_rate": 2.7385e-05, + "loss": 0.4574, + "step": 5480 + }, + { + "epoch": 0.3069212677791466, + "grad_norm": 1.3477859497070312, + "learning_rate": 2.739e-05, + "loss": 0.5604, + "step": 5481 + }, + { + "epoch": 0.3069772650912756, + "grad_norm": 1.1119790077209473, + "learning_rate": 2.7395000000000005e-05, + "loss": 0.4352, + "step": 5482 + }, + { + "epoch": 0.3070332624034046, + "grad_norm": 1.3080546855926514, + "learning_rate": 2.7400000000000002e-05, + "loss": 0.4564, + "step": 5483 + }, + { + "epoch": 0.30708925971553364, + "grad_norm": 1.1802984476089478, + "learning_rate": 2.7405000000000003e-05, + "loss": 0.4341, + "step": 5484 + }, + { + "epoch": 0.30714525702766265, + "grad_norm": 1.335644006729126, + "learning_rate": 2.7410000000000004e-05, + "loss": 0.6299, + "step": 5485 + }, + { + "epoch": 0.3072012543397917, + "grad_norm": 1.1943070888519287, + "learning_rate": 2.7415e-05, + "loss": 0.4315, + "step": 5486 + }, + { + "epoch": 0.3072572516519207, + "grad_norm": 1.4698803424835205, + "learning_rate": 2.7420000000000002e-05, + "loss": 0.4656, + "step": 5487 + }, + { + "epoch": 0.3073132489640497, + "grad_norm": 1.4018625020980835, + "learning_rate": 2.7425e-05, + "loss": 0.3524, + "step": 5488 + }, + { + "epoch": 0.30736924627617873, + "grad_norm": 1.2805378437042236, + "learning_rate": 2.743e-05, + "loss": 0.4766, + "step": 5489 + }, + { + "epoch": 0.30742524358830775, + "grad_norm": 1.030600905418396, + "learning_rate": 2.7435e-05, + "loss": 0.365, + "step": 5490 + }, + { + "epoch": 0.30748124090043677, + "grad_norm": 1.0351371765136719, + "learning_rate": 2.7439999999999998e-05, + "loss": 0.3058, + "step": 5491 + }, + { + "epoch": 0.3075372382125658, + "grad_norm": 1.1799652576446533, + "learning_rate": 2.7445000000000002e-05, + "loss": 0.4724, + "step": 5492 + }, + { + "epoch": 0.3075932355246948, + "grad_norm": 1.1889554262161255, + "learning_rate": 
2.7450000000000003e-05, + "loss": 0.3925, + "step": 5493 + }, + { + "epoch": 0.3076492328368238, + "grad_norm": 1.3212112188339233, + "learning_rate": 2.7455000000000004e-05, + "loss": 0.4217, + "step": 5494 + }, + { + "epoch": 0.30770523014895285, + "grad_norm": 1.0993441343307495, + "learning_rate": 2.746e-05, + "loss": 0.3788, + "step": 5495 + }, + { + "epoch": 0.30776122746108187, + "grad_norm": 1.5850281715393066, + "learning_rate": 2.7465000000000002e-05, + "loss": 0.4674, + "step": 5496 + }, + { + "epoch": 0.3078172247732109, + "grad_norm": 1.312206506729126, + "learning_rate": 2.7470000000000003e-05, + "loss": 0.4479, + "step": 5497 + }, + { + "epoch": 0.3078732220853399, + "grad_norm": 1.1603442430496216, + "learning_rate": 2.7475e-05, + "loss": 0.4297, + "step": 5498 + }, + { + "epoch": 0.3079292193974689, + "grad_norm": 1.311500072479248, + "learning_rate": 2.748e-05, + "loss": 0.4327, + "step": 5499 + }, + { + "epoch": 0.30798521670959794, + "grad_norm": 1.2915374040603638, + "learning_rate": 2.7485e-05, + "loss": 0.5015, + "step": 5500 + }, + { + "epoch": 0.30804121402172696, + "grad_norm": 1.0879619121551514, + "learning_rate": 2.749e-05, + "loss": 0.3734, + "step": 5501 + }, + { + "epoch": 0.308097211333856, + "grad_norm": 1.2090144157409668, + "learning_rate": 2.7495000000000004e-05, + "loss": 0.4948, + "step": 5502 + }, + { + "epoch": 0.308153208645985, + "grad_norm": 1.6563881635665894, + "learning_rate": 2.7500000000000004e-05, + "loss": 0.4062, + "step": 5503 + }, + { + "epoch": 0.308209205958114, + "grad_norm": 1.4525583982467651, + "learning_rate": 2.7505000000000002e-05, + "loss": 0.3904, + "step": 5504 + }, + { + "epoch": 0.30826520327024304, + "grad_norm": 1.367253303527832, + "learning_rate": 2.7510000000000003e-05, + "loss": 0.3965, + "step": 5505 + }, + { + "epoch": 0.30832120058237206, + "grad_norm": 1.3779171705245972, + "learning_rate": 2.7515000000000003e-05, + "loss": 0.4004, + "step": 5506 + }, + { + "epoch": 0.3083771978945011, + 
"grad_norm": 1.220894694328308, + "learning_rate": 2.752e-05, + "loss": 0.4829, + "step": 5507 + }, + { + "epoch": 0.3084331952066301, + "grad_norm": 1.0637099742889404, + "learning_rate": 2.7525e-05, + "loss": 0.3627, + "step": 5508 + }, + { + "epoch": 0.3084891925187591, + "grad_norm": 1.20713210105896, + "learning_rate": 2.753e-05, + "loss": 0.3328, + "step": 5509 + }, + { + "epoch": 0.30854518983088813, + "grad_norm": 1.1569844484329224, + "learning_rate": 2.7535e-05, + "loss": 0.5242, + "step": 5510 + }, + { + "epoch": 0.30860118714301715, + "grad_norm": 1.340274453163147, + "learning_rate": 2.754e-05, + "loss": 0.4154, + "step": 5511 + }, + { + "epoch": 0.30865718445514617, + "grad_norm": 1.2659398317337036, + "learning_rate": 2.7544999999999998e-05, + "loss": 0.368, + "step": 5512 + }, + { + "epoch": 0.3087131817672752, + "grad_norm": 0.991502046585083, + "learning_rate": 2.7550000000000002e-05, + "loss": 0.3658, + "step": 5513 + }, + { + "epoch": 0.3087691790794042, + "grad_norm": 1.056311845779419, + "learning_rate": 2.7555000000000003e-05, + "loss": 0.4026, + "step": 5514 + }, + { + "epoch": 0.30882517639153323, + "grad_norm": 1.4860235452651978, + "learning_rate": 2.7560000000000004e-05, + "loss": 0.4851, + "step": 5515 + }, + { + "epoch": 0.30888117370366225, + "grad_norm": 1.1402136087417603, + "learning_rate": 2.7565e-05, + "loss": 0.315, + "step": 5516 + }, + { + "epoch": 0.30893717101579127, + "grad_norm": 1.2493200302124023, + "learning_rate": 2.7570000000000002e-05, + "loss": 0.467, + "step": 5517 + }, + { + "epoch": 0.3089931683279203, + "grad_norm": 1.1811987161636353, + "learning_rate": 2.7575e-05, + "loss": 0.3523, + "step": 5518 + }, + { + "epoch": 0.30904916564004925, + "grad_norm": 2.222553253173828, + "learning_rate": 2.758e-05, + "loss": 0.542, + "step": 5519 + }, + { + "epoch": 0.30910516295217827, + "grad_norm": 1.4518640041351318, + "learning_rate": 2.7585e-05, + "loss": 0.4165, + "step": 5520 + }, + { + "epoch": 0.3091611602643073, + 
"grad_norm": 1.4288419485092163, + "learning_rate": 2.759e-05, + "loss": 0.4544, + "step": 5521 + }, + { + "epoch": 0.3092171575764363, + "grad_norm": 1.4100167751312256, + "learning_rate": 2.7595e-05, + "loss": 0.5303, + "step": 5522 + }, + { + "epoch": 0.3092731548885653, + "grad_norm": 1.3606417179107666, + "learning_rate": 2.7600000000000003e-05, + "loss": 0.4296, + "step": 5523 + }, + { + "epoch": 0.30932915220069435, + "grad_norm": 1.1433569192886353, + "learning_rate": 2.7605000000000004e-05, + "loss": 0.389, + "step": 5524 + }, + { + "epoch": 0.30938514951282337, + "grad_norm": 1.2820134162902832, + "learning_rate": 2.761e-05, + "loss": 0.5378, + "step": 5525 + }, + { + "epoch": 0.3094411468249524, + "grad_norm": 1.375495433807373, + "learning_rate": 2.7615000000000002e-05, + "loss": 0.4937, + "step": 5526 + }, + { + "epoch": 0.3094971441370814, + "grad_norm": 1.3101216554641724, + "learning_rate": 2.762e-05, + "loss": 0.4153, + "step": 5527 + }, + { + "epoch": 0.3095531414492104, + "grad_norm": 1.2939258813858032, + "learning_rate": 2.7625e-05, + "loss": 0.4385, + "step": 5528 + }, + { + "epoch": 0.30960913876133944, + "grad_norm": 1.3447812795639038, + "learning_rate": 2.763e-05, + "loss": 0.5507, + "step": 5529 + }, + { + "epoch": 0.30966513607346846, + "grad_norm": 1.0491257905960083, + "learning_rate": 2.7635e-05, + "loss": 0.388, + "step": 5530 + }, + { + "epoch": 0.3097211333855975, + "grad_norm": 1.0880898237228394, + "learning_rate": 2.764e-05, + "loss": 0.4203, + "step": 5531 + }, + { + "epoch": 0.3097771306977265, + "grad_norm": 1.275537371635437, + "learning_rate": 2.7644999999999997e-05, + "loss": 0.4122, + "step": 5532 + }, + { + "epoch": 0.3098331280098555, + "grad_norm": 1.2868367433547974, + "learning_rate": 2.7650000000000005e-05, + "loss": 0.4093, + "step": 5533 + }, + { + "epoch": 0.30988912532198454, + "grad_norm": 1.0064719915390015, + "learning_rate": 2.7655000000000002e-05, + "loss": 0.4153, + "step": 5534 + }, + { + "epoch": 
0.30994512263411356, + "grad_norm": 1.1891820430755615, + "learning_rate": 2.7660000000000003e-05, + "loss": 0.4099, + "step": 5535 + }, + { + "epoch": 0.3100011199462426, + "grad_norm": 1.3737549781799316, + "learning_rate": 2.7665000000000004e-05, + "loss": 0.5673, + "step": 5536 + }, + { + "epoch": 0.3100571172583716, + "grad_norm": 1.3942443132400513, + "learning_rate": 2.767e-05, + "loss": 0.4315, + "step": 5537 + }, + { + "epoch": 0.3101131145705006, + "grad_norm": 1.105025291442871, + "learning_rate": 2.7675000000000002e-05, + "loss": 0.4325, + "step": 5538 + }, + { + "epoch": 0.31016911188262963, + "grad_norm": 1.2726826667785645, + "learning_rate": 2.768e-05, + "loss": 0.419, + "step": 5539 + }, + { + "epoch": 0.31022510919475865, + "grad_norm": 1.1925272941589355, + "learning_rate": 2.7685e-05, + "loss": 0.3958, + "step": 5540 + }, + { + "epoch": 0.31028110650688767, + "grad_norm": 1.500717043876648, + "learning_rate": 2.769e-05, + "loss": 0.3668, + "step": 5541 + }, + { + "epoch": 0.3103371038190167, + "grad_norm": 1.0971508026123047, + "learning_rate": 2.7694999999999998e-05, + "loss": 0.33, + "step": 5542 + }, + { + "epoch": 0.3103931011311457, + "grad_norm": 1.1650553941726685, + "learning_rate": 2.7700000000000002e-05, + "loss": 0.4002, + "step": 5543 + }, + { + "epoch": 0.31044909844327473, + "grad_norm": 1.1508090496063232, + "learning_rate": 2.7705000000000003e-05, + "loss": 0.5138, + "step": 5544 + }, + { + "epoch": 0.31050509575540375, + "grad_norm": 1.3230493068695068, + "learning_rate": 2.7710000000000004e-05, + "loss": 0.5699, + "step": 5545 + }, + { + "epoch": 0.31056109306753277, + "grad_norm": 1.0767821073532104, + "learning_rate": 2.7715e-05, + "loss": 0.4049, + "step": 5546 + }, + { + "epoch": 0.3106170903796618, + "grad_norm": 1.247046947479248, + "learning_rate": 2.7720000000000002e-05, + "loss": 0.4443, + "step": 5547 + }, + { + "epoch": 0.3106730876917908, + "grad_norm": 1.0499744415283203, + "learning_rate": 2.7725e-05, + "loss": 
0.4459, + "step": 5548 + }, + { + "epoch": 0.3107290850039198, + "grad_norm": 1.4015235900878906, + "learning_rate": 2.773e-05, + "loss": 0.5551, + "step": 5549 + }, + { + "epoch": 0.31078508231604884, + "grad_norm": 1.2208417654037476, + "learning_rate": 2.7735e-05, + "loss": 0.4329, + "step": 5550 + }, + { + "epoch": 0.31084107962817786, + "grad_norm": 1.5305595397949219, + "learning_rate": 2.774e-05, + "loss": 0.4818, + "step": 5551 + }, + { + "epoch": 0.3108970769403069, + "grad_norm": 1.173004388809204, + "learning_rate": 2.7745e-05, + "loss": 0.4242, + "step": 5552 + }, + { + "epoch": 0.3109530742524359, + "grad_norm": 1.2876002788543701, + "learning_rate": 2.7750000000000004e-05, + "loss": 0.36, + "step": 5553 + }, + { + "epoch": 0.3110090715645649, + "grad_norm": 1.264246940612793, + "learning_rate": 2.7755000000000004e-05, + "loss": 0.3828, + "step": 5554 + }, + { + "epoch": 0.31106506887669394, + "grad_norm": 1.0596487522125244, + "learning_rate": 2.7760000000000002e-05, + "loss": 0.3565, + "step": 5555 + }, + { + "epoch": 0.31112106618882296, + "grad_norm": 1.1578136682510376, + "learning_rate": 2.7765000000000003e-05, + "loss": 0.4483, + "step": 5556 + }, + { + "epoch": 0.311177063500952, + "grad_norm": 1.1922684907913208, + "learning_rate": 2.777e-05, + "loss": 0.3374, + "step": 5557 + }, + { + "epoch": 0.311233060813081, + "grad_norm": 1.4355252981185913, + "learning_rate": 2.7775e-05, + "loss": 0.5503, + "step": 5558 + }, + { + "epoch": 0.31128905812521, + "grad_norm": 1.160586953163147, + "learning_rate": 2.778e-05, + "loss": 0.3731, + "step": 5559 + }, + { + "epoch": 0.311345055437339, + "grad_norm": 1.2258049249649048, + "learning_rate": 2.7785e-05, + "loss": 0.3638, + "step": 5560 + }, + { + "epoch": 0.311401052749468, + "grad_norm": 1.42200767993927, + "learning_rate": 2.779e-05, + "loss": 0.4598, + "step": 5561 + }, + { + "epoch": 0.311457050061597, + "grad_norm": 1.2269688844680786, + "learning_rate": 2.7794999999999997e-05, + "loss": 0.3328, 
+ "step": 5562 + }, + { + "epoch": 0.31151304737372604, + "grad_norm": 1.2529104948043823, + "learning_rate": 2.7800000000000005e-05, + "loss": 0.5513, + "step": 5563 + }, + { + "epoch": 0.31156904468585506, + "grad_norm": 1.2473069429397583, + "learning_rate": 2.7805000000000002e-05, + "loss": 0.5143, + "step": 5564 + }, + { + "epoch": 0.3116250419979841, + "grad_norm": 1.274416446685791, + "learning_rate": 2.7810000000000003e-05, + "loss": 0.4876, + "step": 5565 + }, + { + "epoch": 0.3116810393101131, + "grad_norm": 1.2429513931274414, + "learning_rate": 2.7815e-05, + "loss": 0.4585, + "step": 5566 + }, + { + "epoch": 0.3117370366222421, + "grad_norm": 1.2814960479736328, + "learning_rate": 2.782e-05, + "loss": 0.4002, + "step": 5567 + }, + { + "epoch": 0.31179303393437113, + "grad_norm": 1.2824534177780151, + "learning_rate": 2.7825000000000002e-05, + "loss": 0.4641, + "step": 5568 + }, + { + "epoch": 0.31184903124650015, + "grad_norm": 1.194035291671753, + "learning_rate": 2.783e-05, + "loss": 0.4814, + "step": 5569 + }, + { + "epoch": 0.31190502855862917, + "grad_norm": 1.5696885585784912, + "learning_rate": 2.7835e-05, + "loss": 0.5196, + "step": 5570 + }, + { + "epoch": 0.3119610258707582, + "grad_norm": 1.2227907180786133, + "learning_rate": 2.7839999999999998e-05, + "loss": 0.3883, + "step": 5571 + }, + { + "epoch": 0.3120170231828872, + "grad_norm": 0.9320754408836365, + "learning_rate": 2.7845e-05, + "loss": 0.3771, + "step": 5572 + }, + { + "epoch": 0.31207302049501623, + "grad_norm": 1.4009031057357788, + "learning_rate": 2.7850000000000003e-05, + "loss": 0.3545, + "step": 5573 + }, + { + "epoch": 0.31212901780714525, + "grad_norm": 1.3426100015640259, + "learning_rate": 2.7855000000000004e-05, + "loss": 0.4465, + "step": 5574 + }, + { + "epoch": 0.31218501511927427, + "grad_norm": 1.4786632061004639, + "learning_rate": 2.7860000000000004e-05, + "loss": 0.4563, + "step": 5575 + }, + { + "epoch": 0.3122410124314033, + "grad_norm": 1.123327612876892, + 
"learning_rate": 2.7865000000000002e-05, + "loss": 0.4662, + "step": 5576 + }, + { + "epoch": 0.3122970097435323, + "grad_norm": 1.3635311126708984, + "learning_rate": 2.7870000000000003e-05, + "loss": 0.5339, + "step": 5577 + }, + { + "epoch": 0.3123530070556613, + "grad_norm": 7.306573390960693, + "learning_rate": 2.7875e-05, + "loss": 0.4075, + "step": 5578 + }, + { + "epoch": 0.31240900436779034, + "grad_norm": 1.197943925857544, + "learning_rate": 2.788e-05, + "loss": 0.456, + "step": 5579 + }, + { + "epoch": 0.31246500167991936, + "grad_norm": 1.428166151046753, + "learning_rate": 2.7885e-05, + "loss": 0.3866, + "step": 5580 + }, + { + "epoch": 0.3125209989920484, + "grad_norm": 1.0873808860778809, + "learning_rate": 2.789e-05, + "loss": 0.3884, + "step": 5581 + }, + { + "epoch": 0.3125769963041774, + "grad_norm": 1.3680000305175781, + "learning_rate": 2.7895e-05, + "loss": 0.4507, + "step": 5582 + }, + { + "epoch": 0.3126329936163064, + "grad_norm": 1.1258691549301147, + "learning_rate": 2.7900000000000004e-05, + "loss": 0.4959, + "step": 5583 + }, + { + "epoch": 0.31268899092843544, + "grad_norm": 1.0404407978057861, + "learning_rate": 2.7905000000000005e-05, + "loss": 0.4571, + "step": 5584 + }, + { + "epoch": 0.31274498824056446, + "grad_norm": 1.3987343311309814, + "learning_rate": 2.7910000000000002e-05, + "loss": 0.4018, + "step": 5585 + }, + { + "epoch": 0.3128009855526935, + "grad_norm": 1.401898980140686, + "learning_rate": 2.7915000000000003e-05, + "loss": 0.3769, + "step": 5586 + }, + { + "epoch": 0.3128569828648225, + "grad_norm": 1.24025559425354, + "learning_rate": 2.792e-05, + "loss": 0.401, + "step": 5587 + }, + { + "epoch": 0.3129129801769515, + "grad_norm": 1.172914743423462, + "learning_rate": 2.7925e-05, + "loss": 0.4351, + "step": 5588 + }, + { + "epoch": 0.31296897748908054, + "grad_norm": 1.3653998374938965, + "learning_rate": 2.7930000000000002e-05, + "loss": 0.4432, + "step": 5589 + }, + { + "epoch": 0.31302497480120955, + 
"grad_norm": 1.4646999835968018, + "learning_rate": 2.7935e-05, + "loss": 0.5552, + "step": 5590 + }, + { + "epoch": 0.3130809721133386, + "grad_norm": 1.4121801853179932, + "learning_rate": 2.794e-05, + "loss": 0.391, + "step": 5591 + }, + { + "epoch": 0.3131369694254676, + "grad_norm": 1.7955048084259033, + "learning_rate": 2.7944999999999998e-05, + "loss": 0.53, + "step": 5592 + }, + { + "epoch": 0.3131929667375966, + "grad_norm": 1.044741153717041, + "learning_rate": 2.7950000000000005e-05, + "loss": 0.401, + "step": 5593 + }, + { + "epoch": 0.31324896404972563, + "grad_norm": 1.3014229536056519, + "learning_rate": 2.7955000000000003e-05, + "loss": 0.4103, + "step": 5594 + }, + { + "epoch": 0.31330496136185465, + "grad_norm": 1.2207649946212769, + "learning_rate": 2.7960000000000003e-05, + "loss": 0.4124, + "step": 5595 + }, + { + "epoch": 0.31336095867398367, + "grad_norm": 1.241531252861023, + "learning_rate": 2.7965e-05, + "loss": 0.3804, + "step": 5596 + }, + { + "epoch": 0.3134169559861127, + "grad_norm": 1.4336752891540527, + "learning_rate": 2.797e-05, + "loss": 0.4088, + "step": 5597 + }, + { + "epoch": 0.3134729532982417, + "grad_norm": 1.685155987739563, + "learning_rate": 2.7975000000000002e-05, + "loss": 0.5316, + "step": 5598 + }, + { + "epoch": 0.3135289506103707, + "grad_norm": 1.1184717416763306, + "learning_rate": 2.798e-05, + "loss": 0.3776, + "step": 5599 + }, + { + "epoch": 0.31358494792249975, + "grad_norm": 1.1784001588821411, + "learning_rate": 2.7985e-05, + "loss": 0.3587, + "step": 5600 + }, + { + "epoch": 0.31364094523462877, + "grad_norm": 1.4661425352096558, + "learning_rate": 2.7989999999999998e-05, + "loss": 0.628, + "step": 5601 + }, + { + "epoch": 0.31369694254675773, + "grad_norm": 1.212763786315918, + "learning_rate": 2.7995e-05, + "loss": 0.3783, + "step": 5602 + }, + { + "epoch": 0.31375293985888675, + "grad_norm": 1.6633150577545166, + "learning_rate": 2.8000000000000003e-05, + "loss": 0.4354, + "step": 5603 + }, + { + 
"epoch": 0.31380893717101577, + "grad_norm": 1.237869381904602, + "learning_rate": 2.8005000000000004e-05, + "loss": 0.4162, + "step": 5604 + }, + { + "epoch": 0.3138649344831448, + "grad_norm": 1.1660290956497192, + "learning_rate": 2.8010000000000005e-05, + "loss": 0.3373, + "step": 5605 + }, + { + "epoch": 0.3139209317952738, + "grad_norm": 1.2090556621551514, + "learning_rate": 2.8015000000000002e-05, + "loss": 0.5203, + "step": 5606 + }, + { + "epoch": 0.3139769291074028, + "grad_norm": 1.4198402166366577, + "learning_rate": 2.8020000000000003e-05, + "loss": 0.4898, + "step": 5607 + }, + { + "epoch": 0.31403292641953184, + "grad_norm": 1.1494313478469849, + "learning_rate": 2.8025e-05, + "loss": 0.4086, + "step": 5608 + }, + { + "epoch": 0.31408892373166086, + "grad_norm": 1.2049280405044556, + "learning_rate": 2.803e-05, + "loss": 0.3833, + "step": 5609 + }, + { + "epoch": 0.3141449210437899, + "grad_norm": 1.3241567611694336, + "learning_rate": 2.8035000000000002e-05, + "loss": 0.5409, + "step": 5610 + }, + { + "epoch": 0.3142009183559189, + "grad_norm": 1.1381441354751587, + "learning_rate": 2.804e-05, + "loss": 0.416, + "step": 5611 + }, + { + "epoch": 0.3142569156680479, + "grad_norm": 1.4359488487243652, + "learning_rate": 2.8045e-05, + "loss": 0.4532, + "step": 5612 + }, + { + "epoch": 0.31431291298017694, + "grad_norm": 1.5256915092468262, + "learning_rate": 2.8050000000000004e-05, + "loss": 0.5876, + "step": 5613 + }, + { + "epoch": 0.31436891029230596, + "grad_norm": 1.111158847808838, + "learning_rate": 2.8055000000000005e-05, + "loss": 0.3589, + "step": 5614 + }, + { + "epoch": 0.314424907604435, + "grad_norm": 1.328284740447998, + "learning_rate": 2.8060000000000002e-05, + "loss": 0.4164, + "step": 5615 + }, + { + "epoch": 0.314480904916564, + "grad_norm": 1.2936842441558838, + "learning_rate": 2.8065000000000003e-05, + "loss": 0.6145, + "step": 5616 + }, + { + "epoch": 0.314536902228693, + "grad_norm": 1.5181560516357422, + "learning_rate": 
2.807e-05, + "loss": 0.4223, + "step": 5617 + }, + { + "epoch": 0.31459289954082204, + "grad_norm": 1.1503472328186035, + "learning_rate": 2.8075e-05, + "loss": 0.4605, + "step": 5618 + }, + { + "epoch": 0.31464889685295105, + "grad_norm": 1.248275637626648, + "learning_rate": 2.8080000000000002e-05, + "loss": 0.3591, + "step": 5619 + }, + { + "epoch": 0.3147048941650801, + "grad_norm": 1.1627516746520996, + "learning_rate": 2.8085e-05, + "loss": 0.3392, + "step": 5620 + }, + { + "epoch": 0.3147608914772091, + "grad_norm": 1.1381075382232666, + "learning_rate": 2.809e-05, + "loss": 0.4387, + "step": 5621 + }, + { + "epoch": 0.3148168887893381, + "grad_norm": 1.2067393064498901, + "learning_rate": 2.8094999999999998e-05, + "loss": 0.4865, + "step": 5622 + }, + { + "epoch": 0.31487288610146713, + "grad_norm": 1.7616690397262573, + "learning_rate": 2.8100000000000005e-05, + "loss": 0.4576, + "step": 5623 + }, + { + "epoch": 0.31492888341359615, + "grad_norm": 1.2296456098556519, + "learning_rate": 2.8105000000000003e-05, + "loss": 0.403, + "step": 5624 + }, + { + "epoch": 0.31498488072572517, + "grad_norm": 1.0594286918640137, + "learning_rate": 2.8110000000000004e-05, + "loss": 0.4521, + "step": 5625 + }, + { + "epoch": 0.3150408780378542, + "grad_norm": 1.0713024139404297, + "learning_rate": 2.8115e-05, + "loss": 0.3054, + "step": 5626 + }, + { + "epoch": 0.3150968753499832, + "grad_norm": 1.0795053243637085, + "learning_rate": 2.8120000000000002e-05, + "loss": 0.3588, + "step": 5627 + }, + { + "epoch": 0.3151528726621122, + "grad_norm": 1.4612751007080078, + "learning_rate": 2.8125000000000003e-05, + "loss": 0.5633, + "step": 5628 + }, + { + "epoch": 0.31520886997424125, + "grad_norm": 1.1460578441619873, + "learning_rate": 2.813e-05, + "loss": 0.3124, + "step": 5629 + }, + { + "epoch": 0.31526486728637027, + "grad_norm": 1.154659628868103, + "learning_rate": 2.8135e-05, + "loss": 0.4332, + "step": 5630 + }, + { + "epoch": 0.3153208645984993, + "grad_norm": 
1.8693031072616577, + "learning_rate": 2.8139999999999998e-05, + "loss": 0.5789, + "step": 5631 + }, + { + "epoch": 0.3153768619106283, + "grad_norm": 1.6017730236053467, + "learning_rate": 2.8145e-05, + "loss": 0.473, + "step": 5632 + }, + { + "epoch": 0.3154328592227573, + "grad_norm": 4.376621246337891, + "learning_rate": 2.815e-05, + "loss": 0.4659, + "step": 5633 + }, + { + "epoch": 0.31548885653488634, + "grad_norm": 1.125675916671753, + "learning_rate": 2.8155000000000004e-05, + "loss": 0.4019, + "step": 5634 + }, + { + "epoch": 0.31554485384701536, + "grad_norm": 1.2513413429260254, + "learning_rate": 2.816e-05, + "loss": 0.4662, + "step": 5635 + }, + { + "epoch": 0.3156008511591444, + "grad_norm": 0.9438375234603882, + "learning_rate": 2.8165000000000002e-05, + "loss": 0.3095, + "step": 5636 + }, + { + "epoch": 0.3156568484712734, + "grad_norm": 1.113909363746643, + "learning_rate": 2.8170000000000003e-05, + "loss": 0.5185, + "step": 5637 + }, + { + "epoch": 0.3157128457834024, + "grad_norm": 1.4468607902526855, + "learning_rate": 2.8175e-05, + "loss": 0.4859, + "step": 5638 + }, + { + "epoch": 0.31576884309553144, + "grad_norm": 1.0653977394104004, + "learning_rate": 2.818e-05, + "loss": 0.3885, + "step": 5639 + }, + { + "epoch": 0.31582484040766046, + "grad_norm": 1.389087438583374, + "learning_rate": 2.8185e-05, + "loss": 0.3641, + "step": 5640 + }, + { + "epoch": 0.3158808377197895, + "grad_norm": 1.0898454189300537, + "learning_rate": 2.819e-05, + "loss": 0.3619, + "step": 5641 + }, + { + "epoch": 0.3159368350319185, + "grad_norm": 1.2313554286956787, + "learning_rate": 2.8195e-05, + "loss": 0.3261, + "step": 5642 + }, + { + "epoch": 0.31599283234404746, + "grad_norm": 1.072247862815857, + "learning_rate": 2.8199999999999998e-05, + "loss": 0.375, + "step": 5643 + }, + { + "epoch": 0.3160488296561765, + "grad_norm": 1.2600675821304321, + "learning_rate": 2.8205000000000005e-05, + "loss": 0.3451, + "step": 5644 + }, + { + "epoch": 0.3161048269683055, + 
"grad_norm": 1.0342261791229248, + "learning_rate": 2.8210000000000003e-05, + "loss": 0.3224, + "step": 5645 + }, + { + "epoch": 0.3161608242804345, + "grad_norm": 1.348740816116333, + "learning_rate": 2.8215000000000003e-05, + "loss": 0.5904, + "step": 5646 + }, + { + "epoch": 0.31621682159256354, + "grad_norm": 1.1729305982589722, + "learning_rate": 2.822e-05, + "loss": 0.3687, + "step": 5647 + }, + { + "epoch": 0.31627281890469255, + "grad_norm": 1.4918018579483032, + "learning_rate": 2.8225e-05, + "loss": 0.5176, + "step": 5648 + }, + { + "epoch": 0.3163288162168216, + "grad_norm": 1.546281337738037, + "learning_rate": 2.8230000000000002e-05, + "loss": 0.5422, + "step": 5649 + }, + { + "epoch": 0.3163848135289506, + "grad_norm": 1.0162818431854248, + "learning_rate": 2.8235e-05, + "loss": 0.4217, + "step": 5650 + }, + { + "epoch": 0.3164408108410796, + "grad_norm": 1.5955443382263184, + "learning_rate": 2.824e-05, + "loss": 0.4193, + "step": 5651 + }, + { + "epoch": 0.31649680815320863, + "grad_norm": 1.1460846662521362, + "learning_rate": 2.8244999999999998e-05, + "loss": 0.3683, + "step": 5652 + }, + { + "epoch": 0.31655280546533765, + "grad_norm": 1.2553859949111938, + "learning_rate": 2.825e-05, + "loss": 0.5352, + "step": 5653 + }, + { + "epoch": 0.31660880277746667, + "grad_norm": 1.315063714981079, + "learning_rate": 2.8255000000000003e-05, + "loss": 0.3756, + "step": 5654 + }, + { + "epoch": 0.3166648000895957, + "grad_norm": 1.206869125366211, + "learning_rate": 2.8260000000000004e-05, + "loss": 0.452, + "step": 5655 + }, + { + "epoch": 0.3167207974017247, + "grad_norm": 1.225957989692688, + "learning_rate": 2.8265e-05, + "loss": 0.539, + "step": 5656 + }, + { + "epoch": 0.3167767947138537, + "grad_norm": 1.1983261108398438, + "learning_rate": 2.8270000000000002e-05, + "loss": 0.4334, + "step": 5657 + }, + { + "epoch": 0.31683279202598275, + "grad_norm": 1.1372549533843994, + "learning_rate": 2.8275000000000003e-05, + "loss": 0.2428, + "step": 5658 + 
}, + { + "epoch": 0.31688878933811176, + "grad_norm": 1.3845407962799072, + "learning_rate": 2.828e-05, + "loss": 0.4571, + "step": 5659 + }, + { + "epoch": 0.3169447866502408, + "grad_norm": 1.270957350730896, + "learning_rate": 2.8285e-05, + "loss": 0.3377, + "step": 5660 + }, + { + "epoch": 0.3170007839623698, + "grad_norm": 1.497532606124878, + "learning_rate": 2.829e-05, + "loss": 0.3983, + "step": 5661 + }, + { + "epoch": 0.3170567812744988, + "grad_norm": 1.2636052370071411, + "learning_rate": 2.8295e-05, + "loss": 0.5258, + "step": 5662 + }, + { + "epoch": 0.31711277858662784, + "grad_norm": 1.3569202423095703, + "learning_rate": 2.83e-05, + "loss": 0.5553, + "step": 5663 + }, + { + "epoch": 0.31716877589875686, + "grad_norm": 1.2245830297470093, + "learning_rate": 2.8305000000000004e-05, + "loss": 0.3707, + "step": 5664 + }, + { + "epoch": 0.3172247732108859, + "grad_norm": 1.4354572296142578, + "learning_rate": 2.8310000000000002e-05, + "loss": 0.4876, + "step": 5665 + }, + { + "epoch": 0.3172807705230149, + "grad_norm": 1.2246136665344238, + "learning_rate": 2.8315000000000002e-05, + "loss": 0.5164, + "step": 5666 + }, + { + "epoch": 0.3173367678351439, + "grad_norm": 1.0793654918670654, + "learning_rate": 2.8320000000000003e-05, + "loss": 0.3274, + "step": 5667 + }, + { + "epoch": 0.31739276514727294, + "grad_norm": 1.2143782377243042, + "learning_rate": 2.8325e-05, + "loss": 0.5446, + "step": 5668 + }, + { + "epoch": 0.31744876245940196, + "grad_norm": 1.2183564901351929, + "learning_rate": 2.833e-05, + "loss": 0.495, + "step": 5669 + }, + { + "epoch": 0.317504759771531, + "grad_norm": 1.2809075117111206, + "learning_rate": 2.8335e-05, + "loss": 0.487, + "step": 5670 + }, + { + "epoch": 0.31756075708366, + "grad_norm": 1.1521151065826416, + "learning_rate": 2.834e-05, + "loss": 0.4489, + "step": 5671 + }, + { + "epoch": 0.317616754395789, + "grad_norm": 1.0466850996017456, + "learning_rate": 2.8345e-05, + "loss": 0.3761, + "step": 5672 + }, + { + 
"epoch": 0.31767275170791803, + "grad_norm": 1.2535468339920044, + "learning_rate": 2.8349999999999998e-05, + "loss": 0.3309, + "step": 5673 + }, + { + "epoch": 0.31772874902004705, + "grad_norm": 1.3420789241790771, + "learning_rate": 2.8355000000000002e-05, + "loss": 0.4376, + "step": 5674 + }, + { + "epoch": 0.31778474633217607, + "grad_norm": 1.7165738344192505, + "learning_rate": 2.8360000000000003e-05, + "loss": 0.4187, + "step": 5675 + }, + { + "epoch": 0.3178407436443051, + "grad_norm": 1.2799228429794312, + "learning_rate": 2.8365000000000004e-05, + "loss": 0.5057, + "step": 5676 + }, + { + "epoch": 0.3178967409564341, + "grad_norm": 1.2187089920043945, + "learning_rate": 2.837e-05, + "loss": 0.4693, + "step": 5677 + }, + { + "epoch": 0.31795273826856313, + "grad_norm": 1.1975899934768677, + "learning_rate": 2.8375000000000002e-05, + "loss": 0.5128, + "step": 5678 + }, + { + "epoch": 0.31800873558069215, + "grad_norm": 1.2510820627212524, + "learning_rate": 2.8380000000000003e-05, + "loss": 0.3987, + "step": 5679 + }, + { + "epoch": 0.31806473289282117, + "grad_norm": 1.257769227027893, + "learning_rate": 2.8385e-05, + "loss": 0.4245, + "step": 5680 + }, + { + "epoch": 0.3181207302049502, + "grad_norm": 1.138027310371399, + "learning_rate": 2.839e-05, + "loss": 0.4206, + "step": 5681 + }, + { + "epoch": 0.3181767275170792, + "grad_norm": 1.360088586807251, + "learning_rate": 2.8395e-05, + "loss": 0.6143, + "step": 5682 + }, + { + "epoch": 0.3182327248292082, + "grad_norm": 1.3086761236190796, + "learning_rate": 2.84e-05, + "loss": 0.4096, + "step": 5683 + }, + { + "epoch": 0.3182887221413372, + "grad_norm": 1.507885217666626, + "learning_rate": 2.8405000000000003e-05, + "loss": 0.4123, + "step": 5684 + }, + { + "epoch": 0.3183447194534662, + "grad_norm": 1.4379240274429321, + "learning_rate": 2.8410000000000004e-05, + "loss": 0.3721, + "step": 5685 + }, + { + "epoch": 0.3184007167655952, + "grad_norm": 1.6149991750717163, + "learning_rate": 2.8415e-05, + 
"loss": 0.4733, + "step": 5686 + }, + { + "epoch": 0.31845671407772425, + "grad_norm": 1.240517497062683, + "learning_rate": 2.8420000000000002e-05, + "loss": 0.4846, + "step": 5687 + }, + { + "epoch": 0.31851271138985326, + "grad_norm": 1.319047451019287, + "learning_rate": 2.8425000000000003e-05, + "loss": 0.3765, + "step": 5688 + }, + { + "epoch": 0.3185687087019823, + "grad_norm": 1.7602545022964478, + "learning_rate": 2.843e-05, + "loss": 0.476, + "step": 5689 + }, + { + "epoch": 0.3186247060141113, + "grad_norm": 1.3483355045318604, + "learning_rate": 2.8435e-05, + "loss": 0.3854, + "step": 5690 + }, + { + "epoch": 0.3186807033262403, + "grad_norm": 1.6905882358551025, + "learning_rate": 2.844e-05, + "loss": 0.5992, + "step": 5691 + }, + { + "epoch": 0.31873670063836934, + "grad_norm": 1.2765854597091675, + "learning_rate": 2.8445e-05, + "loss": 0.4451, + "step": 5692 + }, + { + "epoch": 0.31879269795049836, + "grad_norm": 1.7190817594528198, + "learning_rate": 2.845e-05, + "loss": 0.5721, + "step": 5693 + }, + { + "epoch": 0.3188486952626274, + "grad_norm": 1.3409048318862915, + "learning_rate": 2.8455000000000005e-05, + "loss": 0.6477, + "step": 5694 + }, + { + "epoch": 0.3189046925747564, + "grad_norm": 1.4718717336654663, + "learning_rate": 2.8460000000000002e-05, + "loss": 0.5517, + "step": 5695 + }, + { + "epoch": 0.3189606898868854, + "grad_norm": 1.3900972604751587, + "learning_rate": 2.8465000000000003e-05, + "loss": 0.5744, + "step": 5696 + }, + { + "epoch": 0.31901668719901444, + "grad_norm": 1.1296007633209229, + "learning_rate": 2.8470000000000004e-05, + "loss": 0.3525, + "step": 5697 + }, + { + "epoch": 0.31907268451114346, + "grad_norm": 1.5070109367370605, + "learning_rate": 2.8475e-05, + "loss": 0.6085, + "step": 5698 + }, + { + "epoch": 0.3191286818232725, + "grad_norm": 1.1078076362609863, + "learning_rate": 2.8480000000000002e-05, + "loss": 0.4084, + "step": 5699 + }, + { + "epoch": 0.3191846791354015, + "grad_norm": 1.2499613761901855, + 
"learning_rate": 2.8485e-05, + "loss": 0.3357, + "step": 5700 + }, + { + "epoch": 0.3192406764475305, + "grad_norm": 1.331945538520813, + "learning_rate": 2.849e-05, + "loss": 0.5819, + "step": 5701 + }, + { + "epoch": 0.31929667375965953, + "grad_norm": 1.3197909593582153, + "learning_rate": 2.8495e-05, + "loss": 0.3637, + "step": 5702 + }, + { + "epoch": 0.31935267107178855, + "grad_norm": 1.1270290613174438, + "learning_rate": 2.8499999999999998e-05, + "loss": 0.4476, + "step": 5703 + }, + { + "epoch": 0.31940866838391757, + "grad_norm": 3.274280309677124, + "learning_rate": 2.8505000000000002e-05, + "loss": 0.4659, + "step": 5704 + }, + { + "epoch": 0.3194646656960466, + "grad_norm": 1.2685467004776, + "learning_rate": 2.8510000000000003e-05, + "loss": 0.3911, + "step": 5705 + }, + { + "epoch": 0.3195206630081756, + "grad_norm": 1.19549560546875, + "learning_rate": 2.8515000000000004e-05, + "loss": 0.4517, + "step": 5706 + }, + { + "epoch": 0.31957666032030463, + "grad_norm": 1.320735216140747, + "learning_rate": 2.852e-05, + "loss": 0.4496, + "step": 5707 + }, + { + "epoch": 0.31963265763243365, + "grad_norm": 1.380415678024292, + "learning_rate": 2.8525000000000002e-05, + "loss": 0.4642, + "step": 5708 + }, + { + "epoch": 0.31968865494456267, + "grad_norm": 1.2442623376846313, + "learning_rate": 2.853e-05, + "loss": 0.465, + "step": 5709 + }, + { + "epoch": 0.3197446522566917, + "grad_norm": 1.3137507438659668, + "learning_rate": 2.8535e-05, + "loss": 0.4749, + "step": 5710 + }, + { + "epoch": 0.3198006495688207, + "grad_norm": 1.3102186918258667, + "learning_rate": 2.854e-05, + "loss": 0.4644, + "step": 5711 + }, + { + "epoch": 0.3198566468809497, + "grad_norm": 1.0463544130325317, + "learning_rate": 2.8545e-05, + "loss": 0.3313, + "step": 5712 + }, + { + "epoch": 0.31991264419307874, + "grad_norm": 1.3802485466003418, + "learning_rate": 2.855e-05, + "loss": 0.4341, + "step": 5713 + }, + { + "epoch": 0.31996864150520776, + "grad_norm": 3.6623709201812744, + 
"learning_rate": 2.8555000000000004e-05, + "loss": 0.5099, + "step": 5714 + }, + { + "epoch": 0.3200246388173368, + "grad_norm": 1.4516994953155518, + "learning_rate": 2.8560000000000004e-05, + "loss": 0.4278, + "step": 5715 + }, + { + "epoch": 0.3200806361294658, + "grad_norm": 1.276397466659546, + "learning_rate": 2.8565000000000002e-05, + "loss": 0.4023, + "step": 5716 + }, + { + "epoch": 0.3201366334415948, + "grad_norm": 1.2371009588241577, + "learning_rate": 2.8570000000000003e-05, + "loss": 0.445, + "step": 5717 + }, + { + "epoch": 0.32019263075372384, + "grad_norm": 1.18202543258667, + "learning_rate": 2.8575000000000003e-05, + "loss": 0.5315, + "step": 5718 + }, + { + "epoch": 0.32024862806585286, + "grad_norm": 1.3691107034683228, + "learning_rate": 2.858e-05, + "loss": 0.3928, + "step": 5719 + }, + { + "epoch": 0.3203046253779819, + "grad_norm": 1.5681381225585938, + "learning_rate": 2.8585e-05, + "loss": 0.4085, + "step": 5720 + }, + { + "epoch": 0.3203606226901109, + "grad_norm": 1.030142068862915, + "learning_rate": 2.859e-05, + "loss": 0.3923, + "step": 5721 + }, + { + "epoch": 0.3204166200022399, + "grad_norm": 1.2838459014892578, + "learning_rate": 2.8595e-05, + "loss": 0.5295, + "step": 5722 + }, + { + "epoch": 0.32047261731436893, + "grad_norm": 1.3073571920394897, + "learning_rate": 2.86e-05, + "loss": 0.4917, + "step": 5723 + }, + { + "epoch": 0.32052861462649795, + "grad_norm": 1.3806993961334229, + "learning_rate": 2.8605000000000005e-05, + "loss": 0.5803, + "step": 5724 + }, + { + "epoch": 0.320584611938627, + "grad_norm": 1.4577785730361938, + "learning_rate": 2.8610000000000002e-05, + "loss": 0.4319, + "step": 5725 + }, + { + "epoch": 0.32064060925075594, + "grad_norm": 1.2400891780853271, + "learning_rate": 2.8615000000000003e-05, + "loss": 0.5681, + "step": 5726 + }, + { + "epoch": 0.32069660656288496, + "grad_norm": 1.2736188173294067, + "learning_rate": 2.8620000000000004e-05, + "loss": 0.496, + "step": 5727 + }, + { + "epoch": 
0.320752603875014, + "grad_norm": 1.7031136751174927, + "learning_rate": 2.8625e-05, + "loss": 0.314, + "step": 5728 + }, + { + "epoch": 0.320808601187143, + "grad_norm": 1.4622594118118286, + "learning_rate": 2.8630000000000002e-05, + "loss": 0.4218, + "step": 5729 + }, + { + "epoch": 0.320864598499272, + "grad_norm": 1.282241702079773, + "learning_rate": 2.8635e-05, + "loss": 0.5403, + "step": 5730 + }, + { + "epoch": 0.32092059581140103, + "grad_norm": 1.379214882850647, + "learning_rate": 2.864e-05, + "loss": 0.3999, + "step": 5731 + }, + { + "epoch": 0.32097659312353005, + "grad_norm": 1.029502272605896, + "learning_rate": 2.8645e-05, + "loss": 0.3157, + "step": 5732 + }, + { + "epoch": 0.32103259043565907, + "grad_norm": 1.2138969898223877, + "learning_rate": 2.865e-05, + "loss": 0.4902, + "step": 5733 + }, + { + "epoch": 0.3210885877477881, + "grad_norm": 1.1390117406845093, + "learning_rate": 2.8655000000000003e-05, + "loss": 0.3516, + "step": 5734 + }, + { + "epoch": 0.3211445850599171, + "grad_norm": 1.102679967880249, + "learning_rate": 2.8660000000000003e-05, + "loss": 0.4028, + "step": 5735 + }, + { + "epoch": 0.32120058237204613, + "grad_norm": 1.5025776624679565, + "learning_rate": 2.8665000000000004e-05, + "loss": 0.5073, + "step": 5736 + }, + { + "epoch": 0.32125657968417515, + "grad_norm": 2.1668403148651123, + "learning_rate": 2.867e-05, + "loss": 0.4717, + "step": 5737 + }, + { + "epoch": 0.32131257699630417, + "grad_norm": 1.0500065088272095, + "learning_rate": 2.8675000000000002e-05, + "loss": 0.4076, + "step": 5738 + }, + { + "epoch": 0.3213685743084332, + "grad_norm": 1.29715096950531, + "learning_rate": 2.868e-05, + "loss": 0.5435, + "step": 5739 + }, + { + "epoch": 0.3214245716205622, + "grad_norm": 2.245598316192627, + "learning_rate": 2.8685e-05, + "loss": 0.4601, + "step": 5740 + }, + { + "epoch": 0.3214805689326912, + "grad_norm": 1.4639257192611694, + "learning_rate": 2.869e-05, + "loss": 0.3894, + "step": 5741 + }, + { + "epoch": 
0.32153656624482024, + "grad_norm": 1.1997876167297363, + "learning_rate": 2.8695e-05, + "loss": 0.3474, + "step": 5742 + }, + { + "epoch": 0.32159256355694926, + "grad_norm": 1.290321707725525, + "learning_rate": 2.87e-05, + "loss": 0.4773, + "step": 5743 + }, + { + "epoch": 0.3216485608690783, + "grad_norm": 1.3255780935287476, + "learning_rate": 2.8705000000000004e-05, + "loss": 0.54, + "step": 5744 + }, + { + "epoch": 0.3217045581812073, + "grad_norm": 5.353636264801025, + "learning_rate": 2.8710000000000005e-05, + "loss": 0.4118, + "step": 5745 + }, + { + "epoch": 0.3217605554933363, + "grad_norm": 1.1815154552459717, + "learning_rate": 2.8715000000000002e-05, + "loss": 0.327, + "step": 5746 + }, + { + "epoch": 0.32181655280546534, + "grad_norm": 1.3262768983840942, + "learning_rate": 2.8720000000000003e-05, + "loss": 0.4177, + "step": 5747 + }, + { + "epoch": 0.32187255011759436, + "grad_norm": 1.1953827142715454, + "learning_rate": 2.8725e-05, + "loss": 0.4005, + "step": 5748 + }, + { + "epoch": 0.3219285474297234, + "grad_norm": 1.3174980878829956, + "learning_rate": 2.873e-05, + "loss": 0.4097, + "step": 5749 + }, + { + "epoch": 0.3219845447418524, + "grad_norm": 1.2677258253097534, + "learning_rate": 2.8735000000000002e-05, + "loss": 0.3018, + "step": 5750 + }, + { + "epoch": 0.3220405420539814, + "grad_norm": 1.2160332202911377, + "learning_rate": 2.874e-05, + "loss": 0.5729, + "step": 5751 + }, + { + "epoch": 0.32209653936611043, + "grad_norm": 1.1369692087173462, + "learning_rate": 2.8745e-05, + "loss": 0.387, + "step": 5752 + }, + { + "epoch": 0.32215253667823945, + "grad_norm": 1.3775033950805664, + "learning_rate": 2.8749999999999997e-05, + "loss": 0.5746, + "step": 5753 + }, + { + "epoch": 0.3222085339903685, + "grad_norm": 1.3365129232406616, + "learning_rate": 2.8754999999999998e-05, + "loss": 0.4738, + "step": 5754 + }, + { + "epoch": 0.3222645313024975, + "grad_norm": 1.5569610595703125, + "learning_rate": 2.8760000000000002e-05, + "loss": 
0.4895, + "step": 5755 + }, + { + "epoch": 0.3223205286146265, + "grad_norm": 1.3198693990707397, + "learning_rate": 2.8765000000000003e-05, + "loss": 0.3842, + "step": 5756 + }, + { + "epoch": 0.32237652592675553, + "grad_norm": 1.2834845781326294, + "learning_rate": 2.8770000000000004e-05, + "loss": 0.4862, + "step": 5757 + }, + { + "epoch": 0.32243252323888455, + "grad_norm": 1.2282882928848267, + "learning_rate": 2.8775e-05, + "loss": 0.3435, + "step": 5758 + }, + { + "epoch": 0.32248852055101357, + "grad_norm": 1.120464563369751, + "learning_rate": 2.8780000000000002e-05, + "loss": 0.3643, + "step": 5759 + }, + { + "epoch": 0.3225445178631426, + "grad_norm": 1.117025375366211, + "learning_rate": 2.8785e-05, + "loss": 0.4665, + "step": 5760 + }, + { + "epoch": 0.3226005151752716, + "grad_norm": 1.0791250467300415, + "learning_rate": 2.879e-05, + "loss": 0.3406, + "step": 5761 + }, + { + "epoch": 0.3226565124874006, + "grad_norm": 1.3383996486663818, + "learning_rate": 2.8795e-05, + "loss": 0.3864, + "step": 5762 + }, + { + "epoch": 0.32271250979952965, + "grad_norm": 1.1708077192306519, + "learning_rate": 2.88e-05, + "loss": 0.3926, + "step": 5763 + }, + { + "epoch": 0.32276850711165866, + "grad_norm": 1.342558741569519, + "learning_rate": 2.8805e-05, + "loss": 0.3435, + "step": 5764 + }, + { + "epoch": 0.3228245044237877, + "grad_norm": 1.3170289993286133, + "learning_rate": 2.8810000000000004e-05, + "loss": 0.4149, + "step": 5765 + }, + { + "epoch": 0.3228805017359167, + "grad_norm": 1.3614591360092163, + "learning_rate": 2.8815000000000004e-05, + "loss": 0.3713, + "step": 5766 + }, + { + "epoch": 0.32293649904804567, + "grad_norm": 1.175022840499878, + "learning_rate": 2.8820000000000002e-05, + "loss": 0.4052, + "step": 5767 + }, + { + "epoch": 0.3229924963601747, + "grad_norm": 0.9799365997314453, + "learning_rate": 2.8825000000000003e-05, + "loss": 0.2728, + "step": 5768 + }, + { + "epoch": 0.3230484936723037, + "grad_norm": 1.7954214811325073, + 
"learning_rate": 2.883e-05, + "loss": 0.5044, + "step": 5769 + }, + { + "epoch": 0.3231044909844327, + "grad_norm": 1.3334827423095703, + "learning_rate": 2.8835e-05, + "loss": 0.4983, + "step": 5770 + }, + { + "epoch": 0.32316048829656174, + "grad_norm": 1.1819570064544678, + "learning_rate": 2.8840000000000002e-05, + "loss": 0.4125, + "step": 5771 + }, + { + "epoch": 0.32321648560869076, + "grad_norm": 1.1033072471618652, + "learning_rate": 2.8845e-05, + "loss": 0.3211, + "step": 5772 + }, + { + "epoch": 0.3232724829208198, + "grad_norm": 1.1614561080932617, + "learning_rate": 2.885e-05, + "loss": 0.5159, + "step": 5773 + }, + { + "epoch": 0.3233284802329488, + "grad_norm": 1.4699541330337524, + "learning_rate": 2.8854999999999997e-05, + "loss": 0.6175, + "step": 5774 + }, + { + "epoch": 0.3233844775450778, + "grad_norm": 0.9454047083854675, + "learning_rate": 2.8860000000000005e-05, + "loss": 0.3131, + "step": 5775 + }, + { + "epoch": 0.32344047485720684, + "grad_norm": 1.3530412912368774, + "learning_rate": 2.8865000000000002e-05, + "loss": 0.4906, + "step": 5776 + }, + { + "epoch": 0.32349647216933586, + "grad_norm": 1.1419742107391357, + "learning_rate": 2.8870000000000003e-05, + "loss": 0.4197, + "step": 5777 + }, + { + "epoch": 0.3235524694814649, + "grad_norm": 1.2685377597808838, + "learning_rate": 2.8875e-05, + "loss": 0.4257, + "step": 5778 + }, + { + "epoch": 0.3236084667935939, + "grad_norm": 1.2069966793060303, + "learning_rate": 2.888e-05, + "loss": 0.479, + "step": 5779 + }, + { + "epoch": 0.3236644641057229, + "grad_norm": 1.2854137420654297, + "learning_rate": 2.8885000000000002e-05, + "loss": 0.2771, + "step": 5780 + }, + { + "epoch": 0.32372046141785193, + "grad_norm": 1.4132953882217407, + "learning_rate": 2.889e-05, + "loss": 0.5198, + "step": 5781 + }, + { + "epoch": 0.32377645872998095, + "grad_norm": 1.051746129989624, + "learning_rate": 2.8895e-05, + "loss": 0.3799, + "step": 5782 + }, + { + "epoch": 0.32383245604211, + "grad_norm": 
1.1334714889526367, + "learning_rate": 2.8899999999999998e-05, + "loss": 0.3857, + "step": 5783 + }, + { + "epoch": 0.323888453354239, + "grad_norm": 1.6783074140548706, + "learning_rate": 2.8905e-05, + "loss": 0.5421, + "step": 5784 + }, + { + "epoch": 0.323944450666368, + "grad_norm": 1.5858482122421265, + "learning_rate": 2.8910000000000003e-05, + "loss": 0.5914, + "step": 5785 + }, + { + "epoch": 0.32400044797849703, + "grad_norm": 1.5772172212600708, + "learning_rate": 2.8915000000000004e-05, + "loss": 0.4384, + "step": 5786 + }, + { + "epoch": 0.32405644529062605, + "grad_norm": 1.386919379234314, + "learning_rate": 2.8920000000000004e-05, + "loss": 0.5433, + "step": 5787 + }, + { + "epoch": 0.32411244260275507, + "grad_norm": 1.2350701093673706, + "learning_rate": 2.8925000000000002e-05, + "loss": 0.2706, + "step": 5788 + }, + { + "epoch": 0.3241684399148841, + "grad_norm": 1.2195590734481812, + "learning_rate": 2.8930000000000003e-05, + "loss": 0.4213, + "step": 5789 + }, + { + "epoch": 0.3242244372270131, + "grad_norm": 1.4639583826065063, + "learning_rate": 2.8935e-05, + "loss": 0.5474, + "step": 5790 + }, + { + "epoch": 0.3242804345391421, + "grad_norm": 1.1940836906433105, + "learning_rate": 2.894e-05, + "loss": 0.489, + "step": 5791 + }, + { + "epoch": 0.32433643185127115, + "grad_norm": 1.021995186805725, + "learning_rate": 2.8945e-05, + "loss": 0.3867, + "step": 5792 + }, + { + "epoch": 0.32439242916340016, + "grad_norm": 1.2153525352478027, + "learning_rate": 2.895e-05, + "loss": 0.3555, + "step": 5793 + }, + { + "epoch": 0.3244484264755292, + "grad_norm": 1.187088966369629, + "learning_rate": 2.8955e-05, + "loss": 0.4573, + "step": 5794 + }, + { + "epoch": 0.3245044237876582, + "grad_norm": 6.31917142868042, + "learning_rate": 2.8960000000000004e-05, + "loss": 0.3281, + "step": 5795 + }, + { + "epoch": 0.3245604210997872, + "grad_norm": 1.2404654026031494, + "learning_rate": 2.8965000000000005e-05, + "loss": 0.372, + "step": 5796 + }, + { + 
"epoch": 0.32461641841191624, + "grad_norm": 1.165102243423462, + "learning_rate": 2.8970000000000002e-05, + "loss": 0.3417, + "step": 5797 + }, + { + "epoch": 0.32467241572404526, + "grad_norm": 1.2347089052200317, + "learning_rate": 2.8975000000000003e-05, + "loss": 0.4466, + "step": 5798 + }, + { + "epoch": 0.3247284130361743, + "grad_norm": 1.1590970754623413, + "learning_rate": 2.898e-05, + "loss": 0.4122, + "step": 5799 + }, + { + "epoch": 0.3247844103483033, + "grad_norm": 1.2014491558074951, + "learning_rate": 2.8985e-05, + "loss": 0.5527, + "step": 5800 + }, + { + "epoch": 0.3248404076604323, + "grad_norm": 1.2441595792770386, + "learning_rate": 2.8990000000000002e-05, + "loss": 0.6121, + "step": 5801 + }, + { + "epoch": 0.32489640497256134, + "grad_norm": 0.9242920875549316, + "learning_rate": 2.8995e-05, + "loss": 0.3175, + "step": 5802 + }, + { + "epoch": 0.32495240228469036, + "grad_norm": 1.5692812204360962, + "learning_rate": 2.9e-05, + "loss": 0.6651, + "step": 5803 + }, + { + "epoch": 0.3250083995968194, + "grad_norm": 1.4126709699630737, + "learning_rate": 2.9004999999999998e-05, + "loss": 0.3985, + "step": 5804 + }, + { + "epoch": 0.3250643969089484, + "grad_norm": 1.0389025211334229, + "learning_rate": 2.9010000000000005e-05, + "loss": 0.3518, + "step": 5805 + }, + { + "epoch": 0.3251203942210774, + "grad_norm": 1.4933347702026367, + "learning_rate": 2.9015000000000003e-05, + "loss": 0.4727, + "step": 5806 + }, + { + "epoch": 0.32517639153320643, + "grad_norm": 1.28818941116333, + "learning_rate": 2.9020000000000003e-05, + "loss": 0.4388, + "step": 5807 + }, + { + "epoch": 0.3252323888453354, + "grad_norm": 1.238250494003296, + "learning_rate": 2.9025e-05, + "loss": 0.5254, + "step": 5808 + }, + { + "epoch": 0.3252883861574644, + "grad_norm": 5.895007610321045, + "learning_rate": 2.903e-05, + "loss": 0.3471, + "step": 5809 + }, + { + "epoch": 0.32534438346959343, + "grad_norm": 1.2327629327774048, + "learning_rate": 2.9035000000000002e-05, + 
"loss": 0.3286, + "step": 5810 + }, + { + "epoch": 0.32540038078172245, + "grad_norm": 1.2726235389709473, + "learning_rate": 2.904e-05, + "loss": 0.474, + "step": 5811 + }, + { + "epoch": 0.3254563780938515, + "grad_norm": 1.2369911670684814, + "learning_rate": 2.9045e-05, + "loss": 0.4916, + "step": 5812 + }, + { + "epoch": 0.3255123754059805, + "grad_norm": 1.1697537899017334, + "learning_rate": 2.9049999999999998e-05, + "loss": 0.4745, + "step": 5813 + }, + { + "epoch": 0.3255683727181095, + "grad_norm": 1.4516698122024536, + "learning_rate": 2.9055e-05, + "loss": 0.4536, + "step": 5814 + }, + { + "epoch": 0.32562437003023853, + "grad_norm": 1.222223162651062, + "learning_rate": 2.9060000000000003e-05, + "loss": 0.455, + "step": 5815 + }, + { + "epoch": 0.32568036734236755, + "grad_norm": 1.2108324766159058, + "learning_rate": 2.9065000000000004e-05, + "loss": 0.4331, + "step": 5816 + }, + { + "epoch": 0.32573636465449657, + "grad_norm": 1.1987415552139282, + "learning_rate": 2.907e-05, + "loss": 0.4094, + "step": 5817 + }, + { + "epoch": 0.3257923619666256, + "grad_norm": 1.2771337032318115, + "learning_rate": 2.9075000000000002e-05, + "loss": 0.4081, + "step": 5818 + }, + { + "epoch": 0.3258483592787546, + "grad_norm": 1.3818776607513428, + "learning_rate": 2.9080000000000003e-05, + "loss": 0.5237, + "step": 5819 + }, + { + "epoch": 0.3259043565908836, + "grad_norm": 1.0041522979736328, + "learning_rate": 2.9085e-05, + "loss": 0.4008, + "step": 5820 + }, + { + "epoch": 0.32596035390301265, + "grad_norm": 1.1087275743484497, + "learning_rate": 2.909e-05, + "loss": 0.3765, + "step": 5821 + }, + { + "epoch": 0.32601635121514166, + "grad_norm": 1.0786428451538086, + "learning_rate": 2.9095e-05, + "loss": 0.4555, + "step": 5822 + }, + { + "epoch": 0.3260723485272707, + "grad_norm": 1.2589988708496094, + "learning_rate": 2.91e-05, + "loss": 0.3989, + "step": 5823 + }, + { + "epoch": 0.3261283458393997, + "grad_norm": 1.1303046941757202, + "learning_rate": 
2.9105e-05, + "loss": 0.4542, + "step": 5824 + }, + { + "epoch": 0.3261843431515287, + "grad_norm": 1.238532543182373, + "learning_rate": 2.9110000000000004e-05, + "loss": 0.4887, + "step": 5825 + }, + { + "epoch": 0.32624034046365774, + "grad_norm": 1.1298034191131592, + "learning_rate": 2.9115000000000005e-05, + "loss": 0.3971, + "step": 5826 + }, + { + "epoch": 0.32629633777578676, + "grad_norm": 1.25812828540802, + "learning_rate": 2.9120000000000002e-05, + "loss": 0.3751, + "step": 5827 + }, + { + "epoch": 0.3263523350879158, + "grad_norm": 2.0425925254821777, + "learning_rate": 2.9125000000000003e-05, + "loss": 0.6093, + "step": 5828 + }, + { + "epoch": 0.3264083324000448, + "grad_norm": 1.1871418952941895, + "learning_rate": 2.913e-05, + "loss": 0.3733, + "step": 5829 + }, + { + "epoch": 0.3264643297121738, + "grad_norm": 1.1049633026123047, + "learning_rate": 2.9135e-05, + "loss": 0.3699, + "step": 5830 + }, + { + "epoch": 0.32652032702430284, + "grad_norm": 1.097481369972229, + "learning_rate": 2.9140000000000002e-05, + "loss": 0.3666, + "step": 5831 + }, + { + "epoch": 0.32657632433643186, + "grad_norm": 1.218788743019104, + "learning_rate": 2.9145e-05, + "loss": 0.4946, + "step": 5832 + }, + { + "epoch": 0.3266323216485609, + "grad_norm": 1.073500394821167, + "learning_rate": 2.915e-05, + "loss": 0.4256, + "step": 5833 + }, + { + "epoch": 0.3266883189606899, + "grad_norm": 1.147639513015747, + "learning_rate": 2.9154999999999998e-05, + "loss": 0.4234, + "step": 5834 + }, + { + "epoch": 0.3267443162728189, + "grad_norm": 1.3192564249038696, + "learning_rate": 2.9160000000000005e-05, + "loss": 0.398, + "step": 5835 + }, + { + "epoch": 0.32680031358494793, + "grad_norm": 1.2515380382537842, + "learning_rate": 2.9165000000000003e-05, + "loss": 0.3749, + "step": 5836 + }, + { + "epoch": 0.32685631089707695, + "grad_norm": 1.0096896886825562, + "learning_rate": 2.9170000000000004e-05, + "loss": 0.377, + "step": 5837 + }, + { + "epoch": 0.32691230820920597, + 
"grad_norm": 1.4385930299758911, + "learning_rate": 2.9175e-05, + "loss": 0.42, + "step": 5838 + }, + { + "epoch": 0.326968305521335, + "grad_norm": 1.1319615840911865, + "learning_rate": 2.9180000000000002e-05, + "loss": 0.4285, + "step": 5839 + }, + { + "epoch": 0.327024302833464, + "grad_norm": 1.2909079790115356, + "learning_rate": 2.9185000000000003e-05, + "loss": 0.3371, + "step": 5840 + }, + { + "epoch": 0.32708030014559303, + "grad_norm": 1.3051002025604248, + "learning_rate": 2.919e-05, + "loss": 0.4579, + "step": 5841 + }, + { + "epoch": 0.32713629745772205, + "grad_norm": 1.2702544927597046, + "learning_rate": 2.9195e-05, + "loss": 0.4637, + "step": 5842 + }, + { + "epoch": 0.32719229476985107, + "grad_norm": 1.6683790683746338, + "learning_rate": 2.9199999999999998e-05, + "loss": 0.3962, + "step": 5843 + }, + { + "epoch": 0.3272482920819801, + "grad_norm": 1.2359310388565063, + "learning_rate": 2.9205e-05, + "loss": 0.4835, + "step": 5844 + }, + { + "epoch": 0.3273042893941091, + "grad_norm": 1.5957162380218506, + "learning_rate": 2.9210000000000003e-05, + "loss": 0.529, + "step": 5845 + }, + { + "epoch": 0.3273602867062381, + "grad_norm": 1.5851261615753174, + "learning_rate": 2.9215000000000004e-05, + "loss": 0.6152, + "step": 5846 + }, + { + "epoch": 0.32741628401836714, + "grad_norm": 1.1549354791641235, + "learning_rate": 2.922e-05, + "loss": 0.5002, + "step": 5847 + }, + { + "epoch": 0.32747228133049616, + "grad_norm": 1.7438483238220215, + "learning_rate": 2.9225000000000002e-05, + "loss": 0.5194, + "step": 5848 + }, + { + "epoch": 0.3275282786426252, + "grad_norm": 1.0823826789855957, + "learning_rate": 2.9230000000000003e-05, + "loss": 0.3454, + "step": 5849 + }, + { + "epoch": 0.32758427595475414, + "grad_norm": 1.1117078065872192, + "learning_rate": 2.9235e-05, + "loss": 0.3429, + "step": 5850 + }, + { + "epoch": 0.32764027326688316, + "grad_norm": 1.3410907983779907, + "learning_rate": 2.924e-05, + "loss": 0.4798, + "step": 5851 + }, + { + 
"epoch": 0.3276962705790122, + "grad_norm": 1.2882479429244995, + "learning_rate": 2.9245e-05, + "loss": 0.4065, + "step": 5852 + }, + { + "epoch": 0.3277522678911412, + "grad_norm": 1.0705493688583374, + "learning_rate": 2.925e-05, + "loss": 0.4158, + "step": 5853 + }, + { + "epoch": 0.3278082652032702, + "grad_norm": 1.15053129196167, + "learning_rate": 2.9255e-05, + "loss": 0.44, + "step": 5854 + }, + { + "epoch": 0.32786426251539924, + "grad_norm": 1.1476150751113892, + "learning_rate": 2.9260000000000004e-05, + "loss": 0.4863, + "step": 5855 + }, + { + "epoch": 0.32792025982752826, + "grad_norm": 1.0913232564926147, + "learning_rate": 2.9265000000000002e-05, + "loss": 0.3419, + "step": 5856 + }, + { + "epoch": 0.3279762571396573, + "grad_norm": 1.147476315498352, + "learning_rate": 2.9270000000000003e-05, + "loss": 0.524, + "step": 5857 + }, + { + "epoch": 0.3280322544517863, + "grad_norm": 1.0973833799362183, + "learning_rate": 2.9275000000000003e-05, + "loss": 0.4205, + "step": 5858 + }, + { + "epoch": 0.3280882517639153, + "grad_norm": 1.4630779027938843, + "learning_rate": 2.928e-05, + "loss": 0.5707, + "step": 5859 + }, + { + "epoch": 0.32814424907604434, + "grad_norm": 1.075952172279358, + "learning_rate": 2.9285e-05, + "loss": 0.276, + "step": 5860 + }, + { + "epoch": 0.32820024638817336, + "grad_norm": 1.5002340078353882, + "learning_rate": 2.929e-05, + "loss": 0.4981, + "step": 5861 + }, + { + "epoch": 0.3282562437003024, + "grad_norm": 0.9377226233482361, + "learning_rate": 2.9295e-05, + "loss": 0.3876, + "step": 5862 + }, + { + "epoch": 0.3283122410124314, + "grad_norm": 1.3477425575256348, + "learning_rate": 2.93e-05, + "loss": 0.4883, + "step": 5863 + }, + { + "epoch": 0.3283682383245604, + "grad_norm": 1.6481932401657104, + "learning_rate": 2.9304999999999998e-05, + "loss": 0.5014, + "step": 5864 + }, + { + "epoch": 0.32842423563668943, + "grad_norm": 1.1377789974212646, + "learning_rate": 2.9310000000000006e-05, + "loss": 0.3973, + "step": 5865 
+ }, + { + "epoch": 0.32848023294881845, + "grad_norm": 1.1514617204666138, + "learning_rate": 2.9315000000000003e-05, + "loss": 0.4386, + "step": 5866 + }, + { + "epoch": 0.32853623026094747, + "grad_norm": 1.2926123142242432, + "learning_rate": 2.9320000000000004e-05, + "loss": 0.4478, + "step": 5867 + }, + { + "epoch": 0.3285922275730765, + "grad_norm": 1.2897146940231323, + "learning_rate": 2.9325e-05, + "loss": 0.3578, + "step": 5868 + }, + { + "epoch": 0.3286482248852055, + "grad_norm": 1.2622413635253906, + "learning_rate": 2.9330000000000002e-05, + "loss": 0.4091, + "step": 5869 + }, + { + "epoch": 0.32870422219733453, + "grad_norm": 1.2460603713989258, + "learning_rate": 2.9335000000000003e-05, + "loss": 0.3957, + "step": 5870 + }, + { + "epoch": 0.32876021950946355, + "grad_norm": 1.0655896663665771, + "learning_rate": 2.934e-05, + "loss": 0.4109, + "step": 5871 + }, + { + "epoch": 0.32881621682159257, + "grad_norm": 1.2387356758117676, + "learning_rate": 2.9345e-05, + "loss": 0.3508, + "step": 5872 + }, + { + "epoch": 0.3288722141337216, + "grad_norm": 1.2638161182403564, + "learning_rate": 2.935e-05, + "loss": 0.4697, + "step": 5873 + }, + { + "epoch": 0.3289282114458506, + "grad_norm": 1.4944034814834595, + "learning_rate": 2.9355e-05, + "loss": 0.4519, + "step": 5874 + }, + { + "epoch": 0.3289842087579796, + "grad_norm": 1.0900880098342896, + "learning_rate": 2.9360000000000003e-05, + "loss": 0.4708, + "step": 5875 + }, + { + "epoch": 0.32904020607010864, + "grad_norm": 1.1817508935928345, + "learning_rate": 2.9365000000000004e-05, + "loss": 0.4162, + "step": 5876 + }, + { + "epoch": 0.32909620338223766, + "grad_norm": 1.2182482481002808, + "learning_rate": 2.9370000000000002e-05, + "loss": 0.4709, + "step": 5877 + }, + { + "epoch": 0.3291522006943667, + "grad_norm": 1.1931767463684082, + "learning_rate": 2.9375000000000003e-05, + "loss": 0.3835, + "step": 5878 + }, + { + "epoch": 0.3292081980064957, + "grad_norm": 1.245471715927124, + 
"learning_rate": 2.9380000000000003e-05, + "loss": 0.4692, + "step": 5879 + }, + { + "epoch": 0.3292641953186247, + "grad_norm": 1.108049750328064, + "learning_rate": 2.9385e-05, + "loss": 0.5946, + "step": 5880 + }, + { + "epoch": 0.32932019263075374, + "grad_norm": 1.0938096046447754, + "learning_rate": 2.939e-05, + "loss": 0.3256, + "step": 5881 + }, + { + "epoch": 0.32937618994288276, + "grad_norm": 1.4213799238204956, + "learning_rate": 2.9395e-05, + "loss": 0.3809, + "step": 5882 + }, + { + "epoch": 0.3294321872550118, + "grad_norm": 0.97994464635849, + "learning_rate": 2.94e-05, + "loss": 0.2664, + "step": 5883 + }, + { + "epoch": 0.3294881845671408, + "grad_norm": 1.332228183746338, + "learning_rate": 2.9405e-05, + "loss": 0.4286, + "step": 5884 + }, + { + "epoch": 0.3295441818792698, + "grad_norm": 1.1238287687301636, + "learning_rate": 2.9409999999999998e-05, + "loss": 0.3891, + "step": 5885 + }, + { + "epoch": 0.32960017919139883, + "grad_norm": 1.1983394622802734, + "learning_rate": 2.9415000000000002e-05, + "loss": 0.4452, + "step": 5886 + }, + { + "epoch": 0.32965617650352785, + "grad_norm": 1.1007509231567383, + "learning_rate": 2.9420000000000003e-05, + "loss": 0.3467, + "step": 5887 + }, + { + "epoch": 0.3297121738156569, + "grad_norm": 1.1530849933624268, + "learning_rate": 2.9425000000000004e-05, + "loss": 0.3385, + "step": 5888 + }, + { + "epoch": 0.3297681711277859, + "grad_norm": 1.2139053344726562, + "learning_rate": 2.943e-05, + "loss": 0.3979, + "step": 5889 + }, + { + "epoch": 0.3298241684399149, + "grad_norm": 1.1592355966567993, + "learning_rate": 2.9435000000000002e-05, + "loss": 0.4563, + "step": 5890 + }, + { + "epoch": 0.3298801657520439, + "grad_norm": 1.191756248474121, + "learning_rate": 2.944e-05, + "loss": 0.3815, + "step": 5891 + }, + { + "epoch": 0.3299361630641729, + "grad_norm": 1.5103602409362793, + "learning_rate": 2.9445e-05, + "loss": 0.5304, + "step": 5892 + }, + { + "epoch": 0.3299921603763019, + "grad_norm": 
1.2559441328048706, + "learning_rate": 2.945e-05, + "loss": 0.4914, + "step": 5893 + }, + { + "epoch": 0.33004815768843093, + "grad_norm": 1.2686264514923096, + "learning_rate": 2.9455e-05, + "loss": 0.4153, + "step": 5894 + }, + { + "epoch": 0.33010415500055995, + "grad_norm": 1.3157553672790527, + "learning_rate": 2.946e-05, + "loss": 0.3492, + "step": 5895 + }, + { + "epoch": 0.33016015231268897, + "grad_norm": 1.3160196542739868, + "learning_rate": 2.9465000000000003e-05, + "loss": 0.4295, + "step": 5896 + }, + { + "epoch": 0.330216149624818, + "grad_norm": 1.253178596496582, + "learning_rate": 2.9470000000000004e-05, + "loss": 0.3636, + "step": 5897 + }, + { + "epoch": 0.330272146936947, + "grad_norm": 1.1357029676437378, + "learning_rate": 2.9475e-05, + "loss": 0.3601, + "step": 5898 + }, + { + "epoch": 0.33032814424907603, + "grad_norm": 1.3248504400253296, + "learning_rate": 2.9480000000000002e-05, + "loss": 0.5028, + "step": 5899 + }, + { + "epoch": 0.33038414156120505, + "grad_norm": 0.962448239326477, + "learning_rate": 2.9485000000000003e-05, + "loss": 0.296, + "step": 5900 + }, + { + "epoch": 0.33044013887333407, + "grad_norm": 1.3095908164978027, + "learning_rate": 2.949e-05, + "loss": 0.3847, + "step": 5901 + }, + { + "epoch": 0.3304961361854631, + "grad_norm": 0.9791283011436462, + "learning_rate": 2.9495e-05, + "loss": 0.2507, + "step": 5902 + }, + { + "epoch": 0.3305521334975921, + "grad_norm": 1.371442198753357, + "learning_rate": 2.95e-05, + "loss": 0.3895, + "step": 5903 + }, + { + "epoch": 0.3306081308097211, + "grad_norm": 1.227273941040039, + "learning_rate": 2.9505e-05, + "loss": 0.3704, + "step": 5904 + }, + { + "epoch": 0.33066412812185014, + "grad_norm": 1.2468384504318237, + "learning_rate": 2.951e-05, + "loss": 0.4539, + "step": 5905 + }, + { + "epoch": 0.33072012543397916, + "grad_norm": 1.2714205980300903, + "learning_rate": 2.9515000000000005e-05, + "loss": 0.4572, + "step": 5906 + }, + { + "epoch": 0.3307761227461082, + 
"grad_norm": 2.2266745567321777, + "learning_rate": 2.9520000000000002e-05, + "loss": 0.4953, + "step": 5907 + }, + { + "epoch": 0.3308321200582372, + "grad_norm": 1.3567014932632446, + "learning_rate": 2.9525000000000003e-05, + "loss": 0.5034, + "step": 5908 + }, + { + "epoch": 0.3308881173703662, + "grad_norm": 0.9874616265296936, + "learning_rate": 2.9530000000000004e-05, + "loss": 0.3476, + "step": 5909 + }, + { + "epoch": 0.33094411468249524, + "grad_norm": 1.2961267232894897, + "learning_rate": 2.9535e-05, + "loss": 0.6049, + "step": 5910 + }, + { + "epoch": 0.33100011199462426, + "grad_norm": 1.4374582767486572, + "learning_rate": 2.9540000000000002e-05, + "loss": 0.4624, + "step": 5911 + }, + { + "epoch": 0.3310561093067533, + "grad_norm": 2.4435040950775146, + "learning_rate": 2.9545e-05, + "loss": 0.5334, + "step": 5912 + }, + { + "epoch": 0.3311121066188823, + "grad_norm": 1.7158364057540894, + "learning_rate": 2.955e-05, + "loss": 0.5466, + "step": 5913 + }, + { + "epoch": 0.3311681039310113, + "grad_norm": 1.1626513004302979, + "learning_rate": 2.9555e-05, + "loss": 0.3731, + "step": 5914 + }, + { + "epoch": 0.33122410124314033, + "grad_norm": 1.1088207960128784, + "learning_rate": 2.9559999999999998e-05, + "loss": 0.5113, + "step": 5915 + }, + { + "epoch": 0.33128009855526935, + "grad_norm": 1.0978524684906006, + "learning_rate": 2.9565000000000002e-05, + "loss": 0.3087, + "step": 5916 + }, + { + "epoch": 0.3313360958673984, + "grad_norm": 1.5576510429382324, + "learning_rate": 2.9570000000000003e-05, + "loss": 0.3773, + "step": 5917 + }, + { + "epoch": 0.3313920931795274, + "grad_norm": 1.8396542072296143, + "learning_rate": 2.9575000000000004e-05, + "loss": 0.4837, + "step": 5918 + }, + { + "epoch": 0.3314480904916564, + "grad_norm": 1.2947006225585938, + "learning_rate": 2.958e-05, + "loss": 0.4933, + "step": 5919 + }, + { + "epoch": 0.33150408780378543, + "grad_norm": 1.352721095085144, + "learning_rate": 2.9585000000000002e-05, + "loss": 0.4512, 
+ "step": 5920 + }, + { + "epoch": 0.33156008511591445, + "grad_norm": 1.2786140441894531, + "learning_rate": 2.959e-05, + "loss": 0.5311, + "step": 5921 + }, + { + "epoch": 0.33161608242804347, + "grad_norm": 1.3983354568481445, + "learning_rate": 2.9595e-05, + "loss": 0.4592, + "step": 5922 + }, + { + "epoch": 0.3316720797401725, + "grad_norm": 1.2974272966384888, + "learning_rate": 2.96e-05, + "loss": 0.3691, + "step": 5923 + }, + { + "epoch": 0.3317280770523015, + "grad_norm": 1.0443476438522339, + "learning_rate": 2.9605e-05, + "loss": 0.4561, + "step": 5924 + }, + { + "epoch": 0.3317840743644305, + "grad_norm": 1.1290427446365356, + "learning_rate": 2.961e-05, + "loss": 0.3246, + "step": 5925 + }, + { + "epoch": 0.33184007167655954, + "grad_norm": 1.2534984350204468, + "learning_rate": 2.9615000000000004e-05, + "loss": 0.4785, + "step": 5926 + }, + { + "epoch": 0.33189606898868856, + "grad_norm": 1.167789340019226, + "learning_rate": 2.9620000000000004e-05, + "loss": 0.3557, + "step": 5927 + }, + { + "epoch": 0.3319520663008176, + "grad_norm": 1.1558021306991577, + "learning_rate": 2.9625000000000002e-05, + "loss": 0.4502, + "step": 5928 + }, + { + "epoch": 0.3320080636129466, + "grad_norm": 1.1212310791015625, + "learning_rate": 2.9630000000000003e-05, + "loss": 0.3688, + "step": 5929 + }, + { + "epoch": 0.3320640609250756, + "grad_norm": 1.2709614038467407, + "learning_rate": 2.9635e-05, + "loss": 0.5522, + "step": 5930 + }, + { + "epoch": 0.33212005823720464, + "grad_norm": 1.2580879926681519, + "learning_rate": 2.964e-05, + "loss": 0.3608, + "step": 5931 + }, + { + "epoch": 0.3321760555493336, + "grad_norm": 1.2257146835327148, + "learning_rate": 2.9645e-05, + "loss": 0.3542, + "step": 5932 + }, + { + "epoch": 0.3322320528614626, + "grad_norm": 2.1395881175994873, + "learning_rate": 2.965e-05, + "loss": 0.3611, + "step": 5933 + }, + { + "epoch": 0.33228805017359164, + "grad_norm": 1.2016584873199463, + "learning_rate": 2.9655e-05, + "loss": 0.3914, + 
"step": 5934 + }, + { + "epoch": 0.33234404748572066, + "grad_norm": 1.1124950647354126, + "learning_rate": 2.9659999999999997e-05, + "loss": 0.3443, + "step": 5935 + }, + { + "epoch": 0.3324000447978497, + "grad_norm": 0.9924523234367371, + "learning_rate": 2.9665000000000005e-05, + "loss": 0.343, + "step": 5936 + }, + { + "epoch": 0.3324560421099787, + "grad_norm": 1.3250898122787476, + "learning_rate": 2.9670000000000002e-05, + "loss": 0.5017, + "step": 5937 + }, + { + "epoch": 0.3325120394221077, + "grad_norm": 1.2025245428085327, + "learning_rate": 2.9675000000000003e-05, + "loss": 0.4645, + "step": 5938 + }, + { + "epoch": 0.33256803673423674, + "grad_norm": 1.294891119003296, + "learning_rate": 2.9680000000000004e-05, + "loss": 0.3881, + "step": 5939 + }, + { + "epoch": 0.33262403404636576, + "grad_norm": 1.1268936395645142, + "learning_rate": 2.9685e-05, + "loss": 0.3423, + "step": 5940 + }, + { + "epoch": 0.3326800313584948, + "grad_norm": 1.1726906299591064, + "learning_rate": 2.9690000000000002e-05, + "loss": 0.4491, + "step": 5941 + }, + { + "epoch": 0.3327360286706238, + "grad_norm": 1.1992634534835815, + "learning_rate": 2.9695e-05, + "loss": 0.527, + "step": 5942 + }, + { + "epoch": 0.3327920259827528, + "grad_norm": 1.3620399236679077, + "learning_rate": 2.97e-05, + "loss": 0.4762, + "step": 5943 + }, + { + "epoch": 0.33284802329488183, + "grad_norm": 1.216871738433838, + "learning_rate": 2.9705e-05, + "loss": 0.468, + "step": 5944 + }, + { + "epoch": 0.33290402060701085, + "grad_norm": 1.324371099472046, + "learning_rate": 2.971e-05, + "loss": 0.3604, + "step": 5945 + }, + { + "epoch": 0.3329600179191399, + "grad_norm": 1.2822390794754028, + "learning_rate": 2.9715000000000003e-05, + "loss": 0.6002, + "step": 5946 + }, + { + "epoch": 0.3330160152312689, + "grad_norm": 1.1421071290969849, + "learning_rate": 2.9720000000000003e-05, + "loss": 0.3899, + "step": 5947 + }, + { + "epoch": 0.3330720125433979, + "grad_norm": 1.2771941423416138, + 
"learning_rate": 2.9725000000000004e-05, + "loss": 0.4653, + "step": 5948 + }, + { + "epoch": 0.33312800985552693, + "grad_norm": 1.1364119052886963, + "learning_rate": 2.973e-05, + "loss": 0.4113, + "step": 5949 + }, + { + "epoch": 0.33318400716765595, + "grad_norm": 1.1866883039474487, + "learning_rate": 2.9735000000000002e-05, + "loss": 0.3082, + "step": 5950 + }, + { + "epoch": 0.33324000447978497, + "grad_norm": 1.2553757429122925, + "learning_rate": 2.974e-05, + "loss": 0.4513, + "step": 5951 + }, + { + "epoch": 0.333296001791914, + "grad_norm": 1.2461782693862915, + "learning_rate": 2.9745e-05, + "loss": 0.4556, + "step": 5952 + }, + { + "epoch": 0.333351999104043, + "grad_norm": 1.210750699043274, + "learning_rate": 2.975e-05, + "loss": 0.4, + "step": 5953 + }, + { + "epoch": 0.333407996416172, + "grad_norm": 1.0719366073608398, + "learning_rate": 2.9755e-05, + "loss": 0.315, + "step": 5954 + }, + { + "epoch": 0.33346399372830104, + "grad_norm": 2.0919089317321777, + "learning_rate": 2.976e-05, + "loss": 0.4499, + "step": 5955 + }, + { + "epoch": 0.33351999104043006, + "grad_norm": 1.1930122375488281, + "learning_rate": 2.9765000000000004e-05, + "loss": 0.4203, + "step": 5956 + }, + { + "epoch": 0.3335759883525591, + "grad_norm": 1.2995959520339966, + "learning_rate": 2.9770000000000005e-05, + "loss": 0.4431, + "step": 5957 + }, + { + "epoch": 0.3336319856646881, + "grad_norm": 1.2213457822799683, + "learning_rate": 2.9775000000000002e-05, + "loss": 0.4014, + "step": 5958 + }, + { + "epoch": 0.3336879829768171, + "grad_norm": 1.2256245613098145, + "learning_rate": 2.9780000000000003e-05, + "loss": 0.3807, + "step": 5959 + }, + { + "epoch": 0.33374398028894614, + "grad_norm": 1.2050431966781616, + "learning_rate": 2.9785e-05, + "loss": 0.4637, + "step": 5960 + }, + { + "epoch": 0.33379997760107516, + "grad_norm": 1.3042926788330078, + "learning_rate": 2.979e-05, + "loss": 0.4021, + "step": 5961 + }, + { + "epoch": 0.3338559749132042, + "grad_norm": 
1.3158693313598633, + "learning_rate": 2.9795000000000002e-05, + "loss": 0.3686, + "step": 5962 + }, + { + "epoch": 0.3339119722253332, + "grad_norm": 1.1798471212387085, + "learning_rate": 2.98e-05, + "loss": 0.5074, + "step": 5963 + }, + { + "epoch": 0.3339679695374622, + "grad_norm": 4.180051326751709, + "learning_rate": 2.9805e-05, + "loss": 0.4549, + "step": 5964 + }, + { + "epoch": 0.33402396684959124, + "grad_norm": 1.3571497201919556, + "learning_rate": 2.9809999999999997e-05, + "loss": 0.5164, + "step": 5965 + }, + { + "epoch": 0.33407996416172026, + "grad_norm": 1.260483980178833, + "learning_rate": 2.9815000000000005e-05, + "loss": 0.4295, + "step": 5966 + }, + { + "epoch": 0.3341359614738493, + "grad_norm": 4.337609767913818, + "learning_rate": 2.9820000000000002e-05, + "loss": 0.5291, + "step": 5967 + }, + { + "epoch": 0.3341919587859783, + "grad_norm": 1.2313674688339233, + "learning_rate": 2.9825000000000003e-05, + "loss": 0.3771, + "step": 5968 + }, + { + "epoch": 0.3342479560981073, + "grad_norm": 1.0947515964508057, + "learning_rate": 2.9830000000000004e-05, + "loss": 0.3706, + "step": 5969 + }, + { + "epoch": 0.33430395341023633, + "grad_norm": 1.1112008094787598, + "learning_rate": 2.9835e-05, + "loss": 0.3836, + "step": 5970 + }, + { + "epoch": 0.33435995072236535, + "grad_norm": 1.6684881448745728, + "learning_rate": 2.9840000000000002e-05, + "loss": 0.777, + "step": 5971 + }, + { + "epoch": 0.33441594803449437, + "grad_norm": 1.222672939300537, + "learning_rate": 2.9845e-05, + "loss": 0.3733, + "step": 5972 + }, + { + "epoch": 0.3344719453466234, + "grad_norm": 1.3642115592956543, + "learning_rate": 2.985e-05, + "loss": 0.4429, + "step": 5973 + }, + { + "epoch": 0.33452794265875235, + "grad_norm": 1.3494887351989746, + "learning_rate": 2.9855e-05, + "loss": 0.5372, + "step": 5974 + }, + { + "epoch": 0.33458393997088137, + "grad_norm": 1.0940035581588745, + "learning_rate": 2.986e-05, + "loss": 0.3721, + "step": 5975 + }, + { + "epoch": 
0.3346399372830104, + "grad_norm": 1.239007830619812, + "learning_rate": 2.9865000000000003e-05, + "loss": 0.4012, + "step": 5976 + }, + { + "epoch": 0.3346959345951394, + "grad_norm": 1.5419095754623413, + "learning_rate": 2.9870000000000004e-05, + "loss": 0.4955, + "step": 5977 + }, + { + "epoch": 0.33475193190726843, + "grad_norm": 1.0845848321914673, + "learning_rate": 2.9875000000000004e-05, + "loss": 0.4384, + "step": 5978 + }, + { + "epoch": 0.33480792921939745, + "grad_norm": 1.198300838470459, + "learning_rate": 2.9880000000000002e-05, + "loss": 0.4088, + "step": 5979 + }, + { + "epoch": 0.33486392653152647, + "grad_norm": 1.2984102964401245, + "learning_rate": 2.9885000000000003e-05, + "loss": 0.5256, + "step": 5980 + }, + { + "epoch": 0.3349199238436555, + "grad_norm": 1.0907014608383179, + "learning_rate": 2.989e-05, + "loss": 0.317, + "step": 5981 + }, + { + "epoch": 0.3349759211557845, + "grad_norm": 1.1750301122665405, + "learning_rate": 2.9895e-05, + "loss": 0.3831, + "step": 5982 + }, + { + "epoch": 0.3350319184679135, + "grad_norm": 0.9335674047470093, + "learning_rate": 2.9900000000000002e-05, + "loss": 0.3426, + "step": 5983 + }, + { + "epoch": 0.33508791578004254, + "grad_norm": 1.2949601411819458, + "learning_rate": 2.9905e-05, + "loss": 0.5453, + "step": 5984 + }, + { + "epoch": 0.33514391309217156, + "grad_norm": 1.1657267808914185, + "learning_rate": 2.991e-05, + "loss": 0.4275, + "step": 5985 + }, + { + "epoch": 0.3351999104043006, + "grad_norm": 0.9324960708618164, + "learning_rate": 2.9915000000000004e-05, + "loss": 0.247, + "step": 5986 + }, + { + "epoch": 0.3352559077164296, + "grad_norm": 1.3305065631866455, + "learning_rate": 2.9920000000000005e-05, + "loss": 0.3834, + "step": 5987 + }, + { + "epoch": 0.3353119050285586, + "grad_norm": 1.611104965209961, + "learning_rate": 2.9925000000000002e-05, + "loss": 0.4356, + "step": 5988 + }, + { + "epoch": 0.33536790234068764, + "grad_norm": 1.1590913534164429, + "learning_rate": 
2.9930000000000003e-05, + "loss": 0.4541, + "step": 5989 + }, + { + "epoch": 0.33542389965281666, + "grad_norm": 1.1663968563079834, + "learning_rate": 2.9935e-05, + "loss": 0.3898, + "step": 5990 + }, + { + "epoch": 0.3354798969649457, + "grad_norm": 0.9980918169021606, + "learning_rate": 2.994e-05, + "loss": 0.4104, + "step": 5991 + }, + { + "epoch": 0.3355358942770747, + "grad_norm": 1.218346118927002, + "learning_rate": 2.9945000000000002e-05, + "loss": 0.3951, + "step": 5992 + }, + { + "epoch": 0.3355918915892037, + "grad_norm": 1.4852898120880127, + "learning_rate": 2.995e-05, + "loss": 0.5311, + "step": 5993 + }, + { + "epoch": 0.33564788890133274, + "grad_norm": 1.5939909219741821, + "learning_rate": 2.9955e-05, + "loss": 0.5085, + "step": 5994 + }, + { + "epoch": 0.33570388621346176, + "grad_norm": 1.1286392211914062, + "learning_rate": 2.9959999999999998e-05, + "loss": 0.3972, + "step": 5995 + }, + { + "epoch": 0.3357598835255908, + "grad_norm": 1.3640192747116089, + "learning_rate": 2.9965000000000005e-05, + "loss": 0.6368, + "step": 5996 + }, + { + "epoch": 0.3358158808377198, + "grad_norm": 1.1870054006576538, + "learning_rate": 2.9970000000000003e-05, + "loss": 0.3638, + "step": 5997 + }, + { + "epoch": 0.3358718781498488, + "grad_norm": 1.1932722330093384, + "learning_rate": 2.9975000000000004e-05, + "loss": 0.3375, + "step": 5998 + }, + { + "epoch": 0.33592787546197783, + "grad_norm": 1.1448731422424316, + "learning_rate": 2.998e-05, + "loss": 0.4382, + "step": 5999 + }, + { + "epoch": 0.33598387277410685, + "grad_norm": 1.493430256843567, + "learning_rate": 2.9985000000000002e-05, + "loss": 0.3791, + "step": 6000 + }, + { + "epoch": 0.33603987008623587, + "grad_norm": 1.4207919836044312, + "learning_rate": 2.9990000000000003e-05, + "loss": 0.4886, + "step": 6001 + }, + { + "epoch": 0.3360958673983649, + "grad_norm": 1.2372599840164185, + "learning_rate": 2.9995e-05, + "loss": 0.3462, + "step": 6002 + }, + { + "epoch": 0.3361518647104939, + 
"grad_norm": 1.1469321250915527, + "learning_rate": 3e-05, + "loss": 0.2863, + "step": 6003 + }, + { + "epoch": 0.3362078620226229, + "grad_norm": 1.5369642972946167, + "learning_rate": 3.0004999999999998e-05, + "loss": 0.42, + "step": 6004 + }, + { + "epoch": 0.33626385933475195, + "grad_norm": 1.1388967037200928, + "learning_rate": 3.001e-05, + "loss": 0.3926, + "step": 6005 + }, + { + "epoch": 0.33631985664688097, + "grad_norm": 1.1151459217071533, + "learning_rate": 3.0015e-05, + "loss": 0.32, + "step": 6006 + }, + { + "epoch": 0.33637585395901, + "grad_norm": 1.7051790952682495, + "learning_rate": 3.0020000000000004e-05, + "loss": 0.3744, + "step": 6007 + }, + { + "epoch": 0.336431851271139, + "grad_norm": 1.3285667896270752, + "learning_rate": 3.0025000000000005e-05, + "loss": 0.4068, + "step": 6008 + }, + { + "epoch": 0.336487848583268, + "grad_norm": 1.3671685457229614, + "learning_rate": 3.0030000000000002e-05, + "loss": 0.476, + "step": 6009 + }, + { + "epoch": 0.33654384589539704, + "grad_norm": 1.2600650787353516, + "learning_rate": 3.0035000000000003e-05, + "loss": 0.4219, + "step": 6010 + }, + { + "epoch": 0.33659984320752606, + "grad_norm": 1.5153084993362427, + "learning_rate": 3.004e-05, + "loss": 0.5961, + "step": 6011 + }, + { + "epoch": 0.3366558405196551, + "grad_norm": 1.4338983297348022, + "learning_rate": 3.0045e-05, + "loss": 0.4875, + "step": 6012 + }, + { + "epoch": 0.3367118378317841, + "grad_norm": 1.2349830865859985, + "learning_rate": 3.0050000000000002e-05, + "loss": 0.3986, + "step": 6013 + }, + { + "epoch": 0.3367678351439131, + "grad_norm": 1.2845345735549927, + "learning_rate": 3.0055e-05, + "loss": 0.3558, + "step": 6014 + }, + { + "epoch": 0.3368238324560421, + "grad_norm": 1.5571088790893555, + "learning_rate": 3.006e-05, + "loss": 0.3869, + "step": 6015 + }, + { + "epoch": 0.3368798297681711, + "grad_norm": 1.8339508771896362, + "learning_rate": 3.0064999999999998e-05, + "loss": 0.3918, + "step": 6016 + }, + { + "epoch": 
0.3369358270803001, + "grad_norm": 1.1436814069747925, + "learning_rate": 3.0070000000000005e-05, + "loss": 0.327, + "step": 6017 + }, + { + "epoch": 0.33699182439242914, + "grad_norm": 1.402848720550537, + "learning_rate": 3.0075000000000003e-05, + "loss": 0.4811, + "step": 6018 + }, + { + "epoch": 0.33704782170455816, + "grad_norm": 1.5328079462051392, + "learning_rate": 3.0080000000000003e-05, + "loss": 0.4196, + "step": 6019 + }, + { + "epoch": 0.3371038190166872, + "grad_norm": 1.229766607284546, + "learning_rate": 3.0085e-05, + "loss": 0.4686, + "step": 6020 + }, + { + "epoch": 0.3371598163288162, + "grad_norm": 1.4152299165725708, + "learning_rate": 3.009e-05, + "loss": 0.3973, + "step": 6021 + }, + { + "epoch": 0.3372158136409452, + "grad_norm": 1.3779040575027466, + "learning_rate": 3.0095000000000002e-05, + "loss": 0.5503, + "step": 6022 + }, + { + "epoch": 0.33727181095307424, + "grad_norm": 1.532416820526123, + "learning_rate": 3.01e-05, + "loss": 0.4722, + "step": 6023 + }, + { + "epoch": 0.33732780826520326, + "grad_norm": 1.2227298021316528, + "learning_rate": 3.0105e-05, + "loss": 0.3752, + "step": 6024 + }, + { + "epoch": 0.3373838055773323, + "grad_norm": 1.1379321813583374, + "learning_rate": 3.0109999999999998e-05, + "loss": 0.5022, + "step": 6025 + }, + { + "epoch": 0.3374398028894613, + "grad_norm": 1.117943525314331, + "learning_rate": 3.0115e-05, + "loss": 0.4367, + "step": 6026 + }, + { + "epoch": 0.3374958002015903, + "grad_norm": 1.1439199447631836, + "learning_rate": 3.0120000000000003e-05, + "loss": 0.372, + "step": 6027 + }, + { + "epoch": 0.33755179751371933, + "grad_norm": 1.347422480583191, + "learning_rate": 3.0125000000000004e-05, + "loss": 0.5321, + "step": 6028 + }, + { + "epoch": 0.33760779482584835, + "grad_norm": 1.2958378791809082, + "learning_rate": 3.013e-05, + "loss": 0.6347, + "step": 6029 + }, + { + "epoch": 0.33766379213797737, + "grad_norm": 1.301550269126892, + "learning_rate": 3.0135000000000002e-05, + "loss": 
0.3741, + "step": 6030 + }, + { + "epoch": 0.3377197894501064, + "grad_norm": 1.3525784015655518, + "learning_rate": 3.0140000000000003e-05, + "loss": 0.3619, + "step": 6031 + }, + { + "epoch": 0.3377757867622354, + "grad_norm": 1.347469449043274, + "learning_rate": 3.0145e-05, + "loss": 0.4806, + "step": 6032 + }, + { + "epoch": 0.3378317840743644, + "grad_norm": 1.193375825881958, + "learning_rate": 3.015e-05, + "loss": 0.4192, + "step": 6033 + }, + { + "epoch": 0.33788778138649345, + "grad_norm": 1.3086562156677246, + "learning_rate": 3.0155e-05, + "loss": 0.3881, + "step": 6034 + }, + { + "epoch": 0.33794377869862247, + "grad_norm": 1.2944972515106201, + "learning_rate": 3.016e-05, + "loss": 0.4319, + "step": 6035 + }, + { + "epoch": 0.3379997760107515, + "grad_norm": 1.7294611930847168, + "learning_rate": 3.0165e-05, + "loss": 0.4066, + "step": 6036 + }, + { + "epoch": 0.3380557733228805, + "grad_norm": 1.4674289226531982, + "learning_rate": 3.0170000000000004e-05, + "loss": 0.4316, + "step": 6037 + }, + { + "epoch": 0.3381117706350095, + "grad_norm": 1.7740062475204468, + "learning_rate": 3.0175e-05, + "loss": 0.402, + "step": 6038 + }, + { + "epoch": 0.33816776794713854, + "grad_norm": 1.2527861595153809, + "learning_rate": 3.0180000000000002e-05, + "loss": 0.3912, + "step": 6039 + }, + { + "epoch": 0.33822376525926756, + "grad_norm": 1.3782498836517334, + "learning_rate": 3.0185000000000003e-05, + "loss": 0.5425, + "step": 6040 + }, + { + "epoch": 0.3382797625713966, + "grad_norm": 1.044850468635559, + "learning_rate": 3.019e-05, + "loss": 0.4195, + "step": 6041 + }, + { + "epoch": 0.3383357598835256, + "grad_norm": 1.184483528137207, + "learning_rate": 3.0195e-05, + "loss": 0.3694, + "step": 6042 + }, + { + "epoch": 0.3383917571956546, + "grad_norm": 5.612485885620117, + "learning_rate": 3.02e-05, + "loss": 0.6494, + "step": 6043 + }, + { + "epoch": 0.33844775450778364, + "grad_norm": 1.1050502061843872, + "learning_rate": 3.0205e-05, + "loss": 0.4542, + 
"step": 6044 + }, + { + "epoch": 0.33850375181991266, + "grad_norm": 1.2551862001419067, + "learning_rate": 3.021e-05, + "loss": 0.4103, + "step": 6045 + }, + { + "epoch": 0.3385597491320417, + "grad_norm": 1.382351040840149, + "learning_rate": 3.0214999999999998e-05, + "loss": 0.4621, + "step": 6046 + }, + { + "epoch": 0.3386157464441707, + "grad_norm": 0.9998094439506531, + "learning_rate": 3.0220000000000005e-05, + "loss": 0.3793, + "step": 6047 + }, + { + "epoch": 0.3386717437562997, + "grad_norm": 1.1794143915176392, + "learning_rate": 3.0225000000000003e-05, + "loss": 0.3553, + "step": 6048 + }, + { + "epoch": 0.33872774106842873, + "grad_norm": 1.4121116399765015, + "learning_rate": 3.0230000000000004e-05, + "loss": 0.3912, + "step": 6049 + }, + { + "epoch": 0.33878373838055775, + "grad_norm": 1.1807981729507446, + "learning_rate": 3.0235e-05, + "loss": 0.4023, + "step": 6050 + }, + { + "epoch": 0.33883973569268677, + "grad_norm": 1.160921573638916, + "learning_rate": 3.0240000000000002e-05, + "loss": 0.3538, + "step": 6051 + }, + { + "epoch": 0.3388957330048158, + "grad_norm": 1.1883474588394165, + "learning_rate": 3.0245000000000003e-05, + "loss": 0.3711, + "step": 6052 + }, + { + "epoch": 0.3389517303169448, + "grad_norm": 1.2117105722427368, + "learning_rate": 3.025e-05, + "loss": 0.3819, + "step": 6053 + }, + { + "epoch": 0.33900772762907383, + "grad_norm": 1.452558159828186, + "learning_rate": 3.0255e-05, + "loss": 0.4038, + "step": 6054 + }, + { + "epoch": 0.33906372494120285, + "grad_norm": 1.2886942625045776, + "learning_rate": 3.0259999999999998e-05, + "loss": 0.5197, + "step": 6055 + }, + { + "epoch": 0.33911972225333187, + "grad_norm": 1.525838017463684, + "learning_rate": 3.0265e-05, + "loss": 0.4201, + "step": 6056 + }, + { + "epoch": 0.33917571956546083, + "grad_norm": 1.502649188041687, + "learning_rate": 3.0270000000000003e-05, + "loss": 0.5808, + "step": 6057 + }, + { + "epoch": 0.33923171687758985, + "grad_norm": 1.1106334924697876, + 
"learning_rate": 3.0275000000000004e-05, + "loss": 0.4563, + "step": 6058 + }, + { + "epoch": 0.33928771418971887, + "grad_norm": 1.3896900415420532, + "learning_rate": 3.028e-05, + "loss": 0.4421, + "step": 6059 + }, + { + "epoch": 0.3393437115018479, + "grad_norm": 1.3931236267089844, + "learning_rate": 3.0285000000000002e-05, + "loss": 0.3937, + "step": 6060 + }, + { + "epoch": 0.3393997088139769, + "grad_norm": 1.1702464818954468, + "learning_rate": 3.0290000000000003e-05, + "loss": 0.4756, + "step": 6061 + }, + { + "epoch": 0.3394557061261059, + "grad_norm": 1.3623554706573486, + "learning_rate": 3.0295e-05, + "loss": 0.4031, + "step": 6062 + }, + { + "epoch": 0.33951170343823495, + "grad_norm": 1.37270188331604, + "learning_rate": 3.03e-05, + "loss": 0.503, + "step": 6063 + }, + { + "epoch": 0.33956770075036397, + "grad_norm": 1.3576231002807617, + "learning_rate": 3.0305e-05, + "loss": 0.5454, + "step": 6064 + }, + { + "epoch": 0.339623698062493, + "grad_norm": 1.1892614364624023, + "learning_rate": 3.031e-05, + "loss": 0.4471, + "step": 6065 + }, + { + "epoch": 0.339679695374622, + "grad_norm": 1.1943871974945068, + "learning_rate": 3.0315e-05, + "loss": 0.5917, + "step": 6066 + }, + { + "epoch": 0.339735692686751, + "grad_norm": 1.1706382036209106, + "learning_rate": 3.0320000000000004e-05, + "loss": 0.5538, + "step": 6067 + }, + { + "epoch": 0.33979168999888004, + "grad_norm": 1.2243510484695435, + "learning_rate": 3.0325000000000002e-05, + "loss": 0.2955, + "step": 6068 + }, + { + "epoch": 0.33984768731100906, + "grad_norm": 1.0546960830688477, + "learning_rate": 3.0330000000000003e-05, + "loss": 0.3759, + "step": 6069 + }, + { + "epoch": 0.3399036846231381, + "grad_norm": 1.1734250783920288, + "learning_rate": 3.0335000000000003e-05, + "loss": 0.3993, + "step": 6070 + }, + { + "epoch": 0.3399596819352671, + "grad_norm": 1.121383547782898, + "learning_rate": 3.034e-05, + "loss": 0.4006, + "step": 6071 + }, + { + "epoch": 0.3400156792473961, + 
"grad_norm": 1.3144326210021973, + "learning_rate": 3.0345e-05, + "loss": 0.4595, + "step": 6072 + }, + { + "epoch": 0.34007167655952514, + "grad_norm": 1.1103761196136475, + "learning_rate": 3.035e-05, + "loss": 0.3824, + "step": 6073 + }, + { + "epoch": 0.34012767387165416, + "grad_norm": 1.0838195085525513, + "learning_rate": 3.0355e-05, + "loss": 0.344, + "step": 6074 + }, + { + "epoch": 0.3401836711837832, + "grad_norm": 1.0776463747024536, + "learning_rate": 3.036e-05, + "loss": 0.3973, + "step": 6075 + }, + { + "epoch": 0.3402396684959122, + "grad_norm": 1.2838622331619263, + "learning_rate": 3.0364999999999998e-05, + "loss": 0.5617, + "step": 6076 + }, + { + "epoch": 0.3402956658080412, + "grad_norm": 1.385256290435791, + "learning_rate": 3.0370000000000006e-05, + "loss": 0.7115, + "step": 6077 + }, + { + "epoch": 0.34035166312017023, + "grad_norm": 1.182480812072754, + "learning_rate": 3.0375000000000003e-05, + "loss": 0.4803, + "step": 6078 + }, + { + "epoch": 0.34040766043229925, + "grad_norm": 1.7291700839996338, + "learning_rate": 3.0380000000000004e-05, + "loss": 0.5289, + "step": 6079 + }, + { + "epoch": 0.34046365774442827, + "grad_norm": 1.4423400163650513, + "learning_rate": 3.0385e-05, + "loss": 0.521, + "step": 6080 + }, + { + "epoch": 0.3405196550565573, + "grad_norm": 2.5962367057800293, + "learning_rate": 3.0390000000000002e-05, + "loss": 0.5251, + "step": 6081 + }, + { + "epoch": 0.3405756523686863, + "grad_norm": 1.3955833911895752, + "learning_rate": 3.0395000000000003e-05, + "loss": 0.4798, + "step": 6082 + }, + { + "epoch": 0.34063164968081533, + "grad_norm": 1.157979130744934, + "learning_rate": 3.04e-05, + "loss": 0.3445, + "step": 6083 + }, + { + "epoch": 0.34068764699294435, + "grad_norm": 1.4732646942138672, + "learning_rate": 3.0405e-05, + "loss": 0.6209, + "step": 6084 + }, + { + "epoch": 0.34074364430507337, + "grad_norm": 1.9608231782913208, + "learning_rate": 3.041e-05, + "loss": 0.5513, + "step": 6085 + }, + { + "epoch": 
0.3407996416172024, + "grad_norm": 1.2772008180618286, + "learning_rate": 3.0415e-05, + "loss": 0.4158, + "step": 6086 + }, + { + "epoch": 0.3408556389293314, + "grad_norm": 1.2368332147598267, + "learning_rate": 3.0420000000000004e-05, + "loss": 0.5146, + "step": 6087 + }, + { + "epoch": 0.3409116362414604, + "grad_norm": 1.4972248077392578, + "learning_rate": 3.0425000000000004e-05, + "loss": 0.5596, + "step": 6088 + }, + { + "epoch": 0.34096763355358944, + "grad_norm": 1.2302608489990234, + "learning_rate": 3.0430000000000002e-05, + "loss": 0.3766, + "step": 6089 + }, + { + "epoch": 0.34102363086571846, + "grad_norm": 1.5643738508224487, + "learning_rate": 3.0435000000000003e-05, + "loss": 0.3371, + "step": 6090 + }, + { + "epoch": 0.3410796281778475, + "grad_norm": 1.244739294052124, + "learning_rate": 3.0440000000000003e-05, + "loss": 0.4079, + "step": 6091 + }, + { + "epoch": 0.3411356254899765, + "grad_norm": 1.2407468557357788, + "learning_rate": 3.0445e-05, + "loss": 0.4041, + "step": 6092 + }, + { + "epoch": 0.3411916228021055, + "grad_norm": 1.6655876636505127, + "learning_rate": 3.045e-05, + "loss": 0.3608, + "step": 6093 + }, + { + "epoch": 0.34124762011423454, + "grad_norm": 1.256253957748413, + "learning_rate": 3.0455e-05, + "loss": 0.4899, + "step": 6094 + }, + { + "epoch": 0.34130361742636356, + "grad_norm": 1.1335060596466064, + "learning_rate": 3.046e-05, + "loss": 0.4612, + "step": 6095 + }, + { + "epoch": 0.3413596147384926, + "grad_norm": 1.1727112531661987, + "learning_rate": 3.0465e-05, + "loss": 0.3644, + "step": 6096 + }, + { + "epoch": 0.3414156120506216, + "grad_norm": 1.1319737434387207, + "learning_rate": 3.0470000000000005e-05, + "loss": 0.3832, + "step": 6097 + }, + { + "epoch": 0.34147160936275056, + "grad_norm": 1.1234557628631592, + "learning_rate": 3.0475000000000002e-05, + "loss": 0.3799, + "step": 6098 + }, + { + "epoch": 0.3415276066748796, + "grad_norm": 1.4665559530258179, + "learning_rate": 3.0480000000000003e-05, + "loss": 
0.5543, + "step": 6099 + }, + { + "epoch": 0.3415836039870086, + "grad_norm": 1.3194949626922607, + "learning_rate": 3.0485000000000004e-05, + "loss": 0.4193, + "step": 6100 + }, + { + "epoch": 0.3416396012991376, + "grad_norm": 1.2255374193191528, + "learning_rate": 3.049e-05, + "loss": 0.4217, + "step": 6101 + }, + { + "epoch": 0.34169559861126664, + "grad_norm": 1.2952872514724731, + "learning_rate": 3.0495000000000002e-05, + "loss": 0.4849, + "step": 6102 + }, + { + "epoch": 0.34175159592339566, + "grad_norm": 1.278414011001587, + "learning_rate": 3.05e-05, + "loss": 0.5239, + "step": 6103 + }, + { + "epoch": 0.3418075932355247, + "grad_norm": 1.3654206991195679, + "learning_rate": 3.0505e-05, + "loss": 0.3876, + "step": 6104 + }, + { + "epoch": 0.3418635905476537, + "grad_norm": 1.132808804512024, + "learning_rate": 3.051e-05, + "loss": 0.3932, + "step": 6105 + }, + { + "epoch": 0.3419195878597827, + "grad_norm": 1.2543237209320068, + "learning_rate": 3.0515e-05, + "loss": 0.4998, + "step": 6106 + }, + { + "epoch": 0.34197558517191173, + "grad_norm": 1.3329524993896484, + "learning_rate": 3.0520000000000006e-05, + "loss": 0.4027, + "step": 6107 + }, + { + "epoch": 0.34203158248404075, + "grad_norm": 1.3539350032806396, + "learning_rate": 3.0525e-05, + "loss": 0.5102, + "step": 6108 + }, + { + "epoch": 0.34208757979616977, + "grad_norm": 1.3504351377487183, + "learning_rate": 3.053e-05, + "loss": 0.3668, + "step": 6109 + }, + { + "epoch": 0.3421435771082988, + "grad_norm": 1.2522965669631958, + "learning_rate": 3.0535000000000005e-05, + "loss": 0.4975, + "step": 6110 + }, + { + "epoch": 0.3421995744204278, + "grad_norm": 1.115391492843628, + "learning_rate": 3.054e-05, + "loss": 0.3742, + "step": 6111 + }, + { + "epoch": 0.34225557173255683, + "grad_norm": 1.205297827720642, + "learning_rate": 3.0545e-05, + "loss": 0.3266, + "step": 6112 + }, + { + "epoch": 0.34231156904468585, + "grad_norm": 1.2326608896255493, + "learning_rate": 3.0550000000000004e-05, + 
"loss": 0.3953, + "step": 6113 + }, + { + "epoch": 0.34236756635681487, + "grad_norm": 1.3323166370391846, + "learning_rate": 3.0555e-05, + "loss": 0.4736, + "step": 6114 + }, + { + "epoch": 0.3424235636689439, + "grad_norm": 1.5239416360855103, + "learning_rate": 3.056e-05, + "loss": 0.4383, + "step": 6115 + }, + { + "epoch": 0.3424795609810729, + "grad_norm": 1.2085906267166138, + "learning_rate": 3.0564999999999996e-05, + "loss": 0.3913, + "step": 6116 + }, + { + "epoch": 0.3425355582932019, + "grad_norm": 1.346198320388794, + "learning_rate": 3.057000000000001e-05, + "loss": 0.3611, + "step": 6117 + }, + { + "epoch": 0.34259155560533094, + "grad_norm": 1.3381444215774536, + "learning_rate": 3.0575000000000005e-05, + "loss": 0.3948, + "step": 6118 + }, + { + "epoch": 0.34264755291745996, + "grad_norm": 1.4198280572891235, + "learning_rate": 3.058e-05, + "loss": 0.5018, + "step": 6119 + }, + { + "epoch": 0.342703550229589, + "grad_norm": 1.2343220710754395, + "learning_rate": 3.0585e-05, + "loss": 0.4075, + "step": 6120 + }, + { + "epoch": 0.342759547541718, + "grad_norm": 1.4251677989959717, + "learning_rate": 3.0590000000000004e-05, + "loss": 0.3703, + "step": 6121 + }, + { + "epoch": 0.342815544853847, + "grad_norm": 1.425933837890625, + "learning_rate": 3.0595e-05, + "loss": 0.5023, + "step": 6122 + }, + { + "epoch": 0.34287154216597604, + "grad_norm": 1.3435081243515015, + "learning_rate": 3.06e-05, + "loss": 0.422, + "step": 6123 + }, + { + "epoch": 0.34292753947810506, + "grad_norm": 1.4446288347244263, + "learning_rate": 3.0605e-05, + "loss": 0.5023, + "step": 6124 + }, + { + "epoch": 0.3429835367902341, + "grad_norm": 1.1390122175216675, + "learning_rate": 3.061e-05, + "loss": 0.363, + "step": 6125 + }, + { + "epoch": 0.3430395341023631, + "grad_norm": 1.2695108652114868, + "learning_rate": 3.0615e-05, + "loss": 0.4795, + "step": 6126 + }, + { + "epoch": 0.3430955314144921, + "grad_norm": 1.1526217460632324, + "learning_rate": 3.062e-05, + "loss": 
0.4332, + "step": 6127 + }, + { + "epoch": 0.34315152872662114, + "grad_norm": 1.4268572330474854, + "learning_rate": 3.0625000000000006e-05, + "loss": 0.4968, + "step": 6128 + }, + { + "epoch": 0.34320752603875015, + "grad_norm": 1.3458688259124756, + "learning_rate": 3.063e-05, + "loss": 0.4948, + "step": 6129 + }, + { + "epoch": 0.3432635233508792, + "grad_norm": 1.4268699884414673, + "learning_rate": 3.0635e-05, + "loss": 0.5031, + "step": 6130 + }, + { + "epoch": 0.3433195206630082, + "grad_norm": 1.0886454582214355, + "learning_rate": 3.0640000000000005e-05, + "loss": 0.3386, + "step": 6131 + }, + { + "epoch": 0.3433755179751372, + "grad_norm": 1.8940868377685547, + "learning_rate": 3.0645e-05, + "loss": 0.4986, + "step": 6132 + }, + { + "epoch": 0.34343151528726623, + "grad_norm": 1.2075221538543701, + "learning_rate": 3.065e-05, + "loss": 0.3753, + "step": 6133 + }, + { + "epoch": 0.34348751259939525, + "grad_norm": 1.0235775709152222, + "learning_rate": 3.0655e-05, + "loss": 0.3847, + "step": 6134 + }, + { + "epoch": 0.34354350991152427, + "grad_norm": 1.2436648607254028, + "learning_rate": 3.066e-05, + "loss": 0.5126, + "step": 6135 + }, + { + "epoch": 0.3435995072236533, + "grad_norm": 9.105645179748535, + "learning_rate": 3.0665e-05, + "loss": 0.4121, + "step": 6136 + }, + { + "epoch": 0.3436555045357823, + "grad_norm": 1.3691940307617188, + "learning_rate": 3.0669999999999996e-05, + "loss": 0.4754, + "step": 6137 + }, + { + "epoch": 0.3437115018479113, + "grad_norm": 1.1193677186965942, + "learning_rate": 3.067500000000001e-05, + "loss": 0.3744, + "step": 6138 + }, + { + "epoch": 0.3437674991600403, + "grad_norm": 1.0840986967086792, + "learning_rate": 3.0680000000000004e-05, + "loss": 0.4402, + "step": 6139 + }, + { + "epoch": 0.3438234964721693, + "grad_norm": 1.5513265132904053, + "learning_rate": 3.0685e-05, + "loss": 0.3624, + "step": 6140 + }, + { + "epoch": 0.34387949378429833, + "grad_norm": 1.279994010925293, + "learning_rate": 3.069e-05, + 
"loss": 0.4096, + "step": 6141 + }, + { + "epoch": 0.34393549109642735, + "grad_norm": 1.1125075817108154, + "learning_rate": 3.0695000000000003e-05, + "loss": 0.4031, + "step": 6142 + }, + { + "epoch": 0.34399148840855637, + "grad_norm": 1.304520606994629, + "learning_rate": 3.07e-05, + "loss": 0.367, + "step": 6143 + }, + { + "epoch": 0.3440474857206854, + "grad_norm": 1.2914345264434814, + "learning_rate": 3.0705e-05, + "loss": 0.4538, + "step": 6144 + }, + { + "epoch": 0.3441034830328144, + "grad_norm": 1.1114362478256226, + "learning_rate": 3.071e-05, + "loss": 0.3721, + "step": 6145 + }, + { + "epoch": 0.3441594803449434, + "grad_norm": 1.2352871894836426, + "learning_rate": 3.0715e-05, + "loss": 0.4383, + "step": 6146 + }, + { + "epoch": 0.34421547765707244, + "grad_norm": 1.0656942129135132, + "learning_rate": 3.072e-05, + "loss": 0.4117, + "step": 6147 + }, + { + "epoch": 0.34427147496920146, + "grad_norm": 1.1222028732299805, + "learning_rate": 3.0725e-05, + "loss": 0.3683, + "step": 6148 + }, + { + "epoch": 0.3443274722813305, + "grad_norm": 1.280902624130249, + "learning_rate": 3.0730000000000006e-05, + "loss": 0.5762, + "step": 6149 + }, + { + "epoch": 0.3443834695934595, + "grad_norm": 1.1570007801055908, + "learning_rate": 3.0735e-05, + "loss": 0.366, + "step": 6150 + }, + { + "epoch": 0.3444394669055885, + "grad_norm": 1.252055048942566, + "learning_rate": 3.074e-05, + "loss": 0.3943, + "step": 6151 + }, + { + "epoch": 0.34449546421771754, + "grad_norm": 1.2108405828475952, + "learning_rate": 3.0745000000000005e-05, + "loss": 0.3738, + "step": 6152 + }, + { + "epoch": 0.34455146152984656, + "grad_norm": 1.0659451484680176, + "learning_rate": 3.075e-05, + "loss": 0.4407, + "step": 6153 + }, + { + "epoch": 0.3446074588419756, + "grad_norm": 1.2742671966552734, + "learning_rate": 3.0755e-05, + "loss": 0.409, + "step": 6154 + }, + { + "epoch": 0.3446634561541046, + "grad_norm": 1.1289572715759277, + "learning_rate": 3.076e-05, + "loss": 0.4658, + 
"step": 6155 + }, + { + "epoch": 0.3447194534662336, + "grad_norm": 1.2440177202224731, + "learning_rate": 3.0765e-05, + "loss": 0.4304, + "step": 6156 + }, + { + "epoch": 0.34477545077836264, + "grad_norm": 1.1720653772354126, + "learning_rate": 3.077e-05, + "loss": 0.4095, + "step": 6157 + }, + { + "epoch": 0.34483144809049165, + "grad_norm": 7.197729110717773, + "learning_rate": 3.0775e-05, + "loss": 0.5063, + "step": 6158 + }, + { + "epoch": 0.3448874454026207, + "grad_norm": 1.2122024297714233, + "learning_rate": 3.078e-05, + "loss": 0.4376, + "step": 6159 + }, + { + "epoch": 0.3449434427147497, + "grad_norm": 1.476108193397522, + "learning_rate": 3.0785000000000004e-05, + "loss": 0.5088, + "step": 6160 + }, + { + "epoch": 0.3449994400268787, + "grad_norm": 1.1375744342803955, + "learning_rate": 3.079e-05, + "loss": 0.4141, + "step": 6161 + }, + { + "epoch": 0.34505543733900773, + "grad_norm": 1.2847084999084473, + "learning_rate": 3.0795e-05, + "loss": 0.458, + "step": 6162 + }, + { + "epoch": 0.34511143465113675, + "grad_norm": 1.247577428817749, + "learning_rate": 3.08e-05, + "loss": 0.4325, + "step": 6163 + }, + { + "epoch": 0.34516743196326577, + "grad_norm": 1.1049706935882568, + "learning_rate": 3.0805e-05, + "loss": 0.4173, + "step": 6164 + }, + { + "epoch": 0.3452234292753948, + "grad_norm": 1.5211737155914307, + "learning_rate": 3.081e-05, + "loss": 0.6595, + "step": 6165 + }, + { + "epoch": 0.3452794265875238, + "grad_norm": 1.4209784269332886, + "learning_rate": 3.0815e-05, + "loss": 0.461, + "step": 6166 + }, + { + "epoch": 0.3453354238996528, + "grad_norm": 1.1894135475158691, + "learning_rate": 3.082e-05, + "loss": 0.6291, + "step": 6167 + }, + { + "epoch": 0.34539142121178185, + "grad_norm": 1.2749963998794556, + "learning_rate": 3.0825000000000004e-05, + "loss": 0.4693, + "step": 6168 + }, + { + "epoch": 0.34544741852391087, + "grad_norm": 1.2753534317016602, + "learning_rate": 3.083e-05, + "loss": 0.4812, + "step": 6169 + }, + { + "epoch": 
0.3455034158360399, + "grad_norm": 1.4958391189575195, + "learning_rate": 3.0835000000000005e-05, + "loss": 0.4399, + "step": 6170 + }, + { + "epoch": 0.3455594131481689, + "grad_norm": 1.2740752696990967, + "learning_rate": 3.084e-05, + "loss": 0.3982, + "step": 6171 + }, + { + "epoch": 0.3456154104602979, + "grad_norm": 1.0687915086746216, + "learning_rate": 3.0845e-05, + "loss": 0.3227, + "step": 6172 + }, + { + "epoch": 0.34567140777242694, + "grad_norm": 1.1849191188812256, + "learning_rate": 3.0850000000000004e-05, + "loss": 0.3809, + "step": 6173 + }, + { + "epoch": 0.34572740508455596, + "grad_norm": 1.2271841764450073, + "learning_rate": 3.0855e-05, + "loss": 0.4786, + "step": 6174 + }, + { + "epoch": 0.345783402396685, + "grad_norm": 1.063286304473877, + "learning_rate": 3.086e-05, + "loss": 0.3983, + "step": 6175 + }, + { + "epoch": 0.345839399708814, + "grad_norm": 0.9903095364570618, + "learning_rate": 3.0865e-05, + "loss": 0.3459, + "step": 6176 + }, + { + "epoch": 0.345895397020943, + "grad_norm": 1.322891354560852, + "learning_rate": 3.087e-05, + "loss": 0.4494, + "step": 6177 + }, + { + "epoch": 0.34595139433307204, + "grad_norm": 1.1799235343933105, + "learning_rate": 3.0875000000000005e-05, + "loss": 0.3814, + "step": 6178 + }, + { + "epoch": 0.34600739164520106, + "grad_norm": 1.1407090425491333, + "learning_rate": 3.088e-05, + "loss": 0.409, + "step": 6179 + }, + { + "epoch": 0.3460633889573301, + "grad_norm": 1.314505934715271, + "learning_rate": 3.0885e-05, + "loss": 0.412, + "step": 6180 + }, + { + "epoch": 0.34611938626945904, + "grad_norm": 1.1176612377166748, + "learning_rate": 3.0890000000000004e-05, + "loss": 0.4532, + "step": 6181 + }, + { + "epoch": 0.34617538358158806, + "grad_norm": 1.3744513988494873, + "learning_rate": 3.0895e-05, + "loss": 0.5124, + "step": 6182 + }, + { + "epoch": 0.3462313808937171, + "grad_norm": 1.1842596530914307, + "learning_rate": 3.09e-05, + "loss": 0.451, + "step": 6183 + }, + { + "epoch": 
0.3462873782058461, + "grad_norm": 1.231649398803711, + "learning_rate": 3.0905e-05, + "loss": 0.3933, + "step": 6184 + }, + { + "epoch": 0.3463433755179751, + "grad_norm": 1.2014415264129639, + "learning_rate": 3.091e-05, + "loss": 0.3315, + "step": 6185 + }, + { + "epoch": 0.34639937283010414, + "grad_norm": 5.304755687713623, + "learning_rate": 3.0915e-05, + "loss": 0.4169, + "step": 6186 + }, + { + "epoch": 0.34645537014223315, + "grad_norm": 1.3249987363815308, + "learning_rate": 3.092e-05, + "loss": 0.4412, + "step": 6187 + }, + { + "epoch": 0.3465113674543622, + "grad_norm": 3.466057062149048, + "learning_rate": 3.0925000000000006e-05, + "loss": 0.4771, + "step": 6188 + }, + { + "epoch": 0.3465673647664912, + "grad_norm": 1.0502848625183105, + "learning_rate": 3.0930000000000004e-05, + "loss": 0.3134, + "step": 6189 + }, + { + "epoch": 0.3466233620786202, + "grad_norm": 1.3575304746627808, + "learning_rate": 3.0935e-05, + "loss": 0.4014, + "step": 6190 + }, + { + "epoch": 0.34667935939074923, + "grad_norm": 1.1503523588180542, + "learning_rate": 3.0940000000000005e-05, + "loss": 0.4059, + "step": 6191 + }, + { + "epoch": 0.34673535670287825, + "grad_norm": 1.267883062362671, + "learning_rate": 3.0945e-05, + "loss": 0.5432, + "step": 6192 + }, + { + "epoch": 0.34679135401500727, + "grad_norm": 2.9755451679229736, + "learning_rate": 3.095e-05, + "loss": 0.3337, + "step": 6193 + }, + { + "epoch": 0.3468473513271363, + "grad_norm": 1.181565523147583, + "learning_rate": 3.0955e-05, + "loss": 0.3977, + "step": 6194 + }, + { + "epoch": 0.3469033486392653, + "grad_norm": 1.145254373550415, + "learning_rate": 3.096e-05, + "loss": 0.4545, + "step": 6195 + }, + { + "epoch": 0.3469593459513943, + "grad_norm": 1.4250879287719727, + "learning_rate": 3.0965e-05, + "loss": 0.484, + "step": 6196 + }, + { + "epoch": 0.34701534326352335, + "grad_norm": 1.1087721586227417, + "learning_rate": 3.0969999999999997e-05, + "loss": 0.458, + "step": 6197 + }, + { + "epoch": 
0.34707134057565237, + "grad_norm": 1.4464209079742432, + "learning_rate": 3.0975e-05, + "loss": 0.5284, + "step": 6198 + }, + { + "epoch": 0.3471273378877814, + "grad_norm": 1.2910740375518799, + "learning_rate": 3.0980000000000005e-05, + "loss": 0.3903, + "step": 6199 + }, + { + "epoch": 0.3471833351999104, + "grad_norm": 1.1853680610656738, + "learning_rate": 3.0985e-05, + "loss": 0.3896, + "step": 6200 + }, + { + "epoch": 0.3472393325120394, + "grad_norm": 1.0837509632110596, + "learning_rate": 3.099e-05, + "loss": 0.427, + "step": 6201 + }, + { + "epoch": 0.34729532982416844, + "grad_norm": 1.038487434387207, + "learning_rate": 3.0995000000000004e-05, + "loss": 0.4323, + "step": 6202 + }, + { + "epoch": 0.34735132713629746, + "grad_norm": 1.403473138809204, + "learning_rate": 3.1e-05, + "loss": 0.4597, + "step": 6203 + }, + { + "epoch": 0.3474073244484265, + "grad_norm": 1.3064563274383545, + "learning_rate": 3.1005e-05, + "loss": 0.5115, + "step": 6204 + }, + { + "epoch": 0.3474633217605555, + "grad_norm": 1.1536040306091309, + "learning_rate": 3.101e-05, + "loss": 0.3336, + "step": 6205 + }, + { + "epoch": 0.3475193190726845, + "grad_norm": 1.3112775087356567, + "learning_rate": 3.1015e-05, + "loss": 0.4123, + "step": 6206 + }, + { + "epoch": 0.34757531638481354, + "grad_norm": 1.2330294847488403, + "learning_rate": 3.102e-05, + "loss": 0.3784, + "step": 6207 + }, + { + "epoch": 0.34763131369694256, + "grad_norm": 1.2423793077468872, + "learning_rate": 3.1025e-05, + "loss": 0.4646, + "step": 6208 + }, + { + "epoch": 0.3476873110090716, + "grad_norm": 1.0161997079849243, + "learning_rate": 3.1030000000000006e-05, + "loss": 0.3805, + "step": 6209 + }, + { + "epoch": 0.3477433083212006, + "grad_norm": 1.2694252729415894, + "learning_rate": 3.1035000000000004e-05, + "loss": 0.385, + "step": 6210 + }, + { + "epoch": 0.3477993056333296, + "grad_norm": 1.343778133392334, + "learning_rate": 3.104e-05, + "loss": 0.4388, + "step": 6211 + }, + { + "epoch": 
0.34785530294545863, + "grad_norm": 1.2166775465011597, + "learning_rate": 3.1045000000000005e-05, + "loss": 0.3247, + "step": 6212 + }, + { + "epoch": 0.34791130025758765, + "grad_norm": 1.2452661991119385, + "learning_rate": 3.105e-05, + "loss": 0.339, + "step": 6213 + }, + { + "epoch": 0.34796729756971667, + "grad_norm": 1.1289180517196655, + "learning_rate": 3.1055e-05, + "loss": 0.4189, + "step": 6214 + }, + { + "epoch": 0.3480232948818457, + "grad_norm": 0.9198927879333496, + "learning_rate": 3.106e-05, + "loss": 0.361, + "step": 6215 + }, + { + "epoch": 0.3480792921939747, + "grad_norm": 1.1886038780212402, + "learning_rate": 3.1065e-05, + "loss": 0.5015, + "step": 6216 + }, + { + "epoch": 0.34813528950610373, + "grad_norm": 1.1647417545318604, + "learning_rate": 3.107e-05, + "loss": 0.3268, + "step": 6217 + }, + { + "epoch": 0.34819128681823275, + "grad_norm": 1.2943861484527588, + "learning_rate": 3.1075e-05, + "loss": 0.3943, + "step": 6218 + }, + { + "epoch": 0.34824728413036177, + "grad_norm": 1.125791072845459, + "learning_rate": 3.108e-05, + "loss": 0.3445, + "step": 6219 + }, + { + "epoch": 0.3483032814424908, + "grad_norm": 1.4532657861709595, + "learning_rate": 3.1085000000000005e-05, + "loss": 0.4646, + "step": 6220 + }, + { + "epoch": 0.3483592787546198, + "grad_norm": 1.043227195739746, + "learning_rate": 3.109e-05, + "loss": 0.3693, + "step": 6221 + }, + { + "epoch": 0.34841527606674877, + "grad_norm": 1.2873167991638184, + "learning_rate": 3.1095e-05, + "loss": 0.4276, + "step": 6222 + }, + { + "epoch": 0.3484712733788778, + "grad_norm": 1.4242085218429565, + "learning_rate": 3.1100000000000004e-05, + "loss": 0.5273, + "step": 6223 + }, + { + "epoch": 0.3485272706910068, + "grad_norm": 1.1898281574249268, + "learning_rate": 3.1105e-05, + "loss": 0.4503, + "step": 6224 + }, + { + "epoch": 0.3485832680031358, + "grad_norm": 1.1694384813308716, + "learning_rate": 3.111e-05, + "loss": 0.3438, + "step": 6225 + }, + { + "epoch": 0.34863926531526485, 
+ "grad_norm": 1.185025930404663, + "learning_rate": 3.1115e-05, + "loss": 0.337, + "step": 6226 + }, + { + "epoch": 0.34869526262739386, + "grad_norm": 1.0493113994598389, + "learning_rate": 3.112e-05, + "loss": 0.3695, + "step": 6227 + }, + { + "epoch": 0.3487512599395229, + "grad_norm": 1.441767692565918, + "learning_rate": 3.1125000000000004e-05, + "loss": 0.3575, + "step": 6228 + }, + { + "epoch": 0.3488072572516519, + "grad_norm": 1.06027090549469, + "learning_rate": 3.113e-05, + "loss": 0.2929, + "step": 6229 + }, + { + "epoch": 0.3488632545637809, + "grad_norm": 1.1034297943115234, + "learning_rate": 3.1135000000000006e-05, + "loss": 0.4501, + "step": 6230 + }, + { + "epoch": 0.34891925187590994, + "grad_norm": 1.1654653549194336, + "learning_rate": 3.1140000000000003e-05, + "loss": 0.4183, + "step": 6231 + }, + { + "epoch": 0.34897524918803896, + "grad_norm": 1.605057954788208, + "learning_rate": 3.1145e-05, + "loss": 0.4588, + "step": 6232 + }, + { + "epoch": 0.349031246500168, + "grad_norm": 1.2390538454055786, + "learning_rate": 3.115e-05, + "loss": 0.4808, + "step": 6233 + }, + { + "epoch": 0.349087243812297, + "grad_norm": 1.6129049062728882, + "learning_rate": 3.1155e-05, + "loss": 0.6286, + "step": 6234 + }, + { + "epoch": 0.349143241124426, + "grad_norm": 1.207960605621338, + "learning_rate": 3.116e-05, + "loss": 0.5001, + "step": 6235 + }, + { + "epoch": 0.34919923843655504, + "grad_norm": 1.1692736148834229, + "learning_rate": 3.1165e-05, + "loss": 0.3719, + "step": 6236 + }, + { + "epoch": 0.34925523574868406, + "grad_norm": 1.7736616134643555, + "learning_rate": 3.117e-05, + "loss": 0.609, + "step": 6237 + }, + { + "epoch": 0.3493112330608131, + "grad_norm": 1.7082831859588623, + "learning_rate": 3.1175000000000006e-05, + "loss": 0.4509, + "step": 6238 + }, + { + "epoch": 0.3493672303729421, + "grad_norm": 2.39262056350708, + "learning_rate": 3.118e-05, + "loss": 0.6077, + "step": 6239 + }, + { + "epoch": 0.3494232276850711, + "grad_norm": 
1.184689998626709, + "learning_rate": 3.1185e-05, + "loss": 0.4831, + "step": 6240 + }, + { + "epoch": 0.34947922499720013, + "grad_norm": 1.4418725967407227, + "learning_rate": 3.1190000000000005e-05, + "loss": 0.5638, + "step": 6241 + }, + { + "epoch": 0.34953522230932915, + "grad_norm": 1.3676230907440186, + "learning_rate": 3.1195e-05, + "loss": 0.5351, + "step": 6242 + }, + { + "epoch": 0.34959121962145817, + "grad_norm": 1.1918115615844727, + "learning_rate": 3.12e-05, + "loss": 0.488, + "step": 6243 + }, + { + "epoch": 0.3496472169335872, + "grad_norm": 1.1994301080703735, + "learning_rate": 3.1205000000000004e-05, + "loss": 0.4618, + "step": 6244 + }, + { + "epoch": 0.3497032142457162, + "grad_norm": 1.205239176750183, + "learning_rate": 3.121e-05, + "loss": 0.4535, + "step": 6245 + }, + { + "epoch": 0.34975921155784523, + "grad_norm": 1.0689464807510376, + "learning_rate": 3.1215e-05, + "loss": 0.4064, + "step": 6246 + }, + { + "epoch": 0.34981520886997425, + "grad_norm": 1.5116850137710571, + "learning_rate": 3.122e-05, + "loss": 0.5589, + "step": 6247 + }, + { + "epoch": 0.34987120618210327, + "grad_norm": 1.1299906969070435, + "learning_rate": 3.122500000000001e-05, + "loss": 0.361, + "step": 6248 + }, + { + "epoch": 0.3499272034942323, + "grad_norm": 1.441096544265747, + "learning_rate": 3.1230000000000004e-05, + "loss": 0.488, + "step": 6249 + }, + { + "epoch": 0.3499832008063613, + "grad_norm": 1.3029266595840454, + "learning_rate": 3.1235e-05, + "loss": 0.4782, + "step": 6250 + }, + { + "epoch": 0.3500391981184903, + "grad_norm": 1.3709391355514526, + "learning_rate": 3.1240000000000006e-05, + "loss": 0.5139, + "step": 6251 + }, + { + "epoch": 0.35009519543061934, + "grad_norm": 1.1097272634506226, + "learning_rate": 3.1245e-05, + "loss": 0.4393, + "step": 6252 + }, + { + "epoch": 0.35015119274274836, + "grad_norm": 1.2350056171417236, + "learning_rate": 3.125e-05, + "loss": 0.4682, + "step": 6253 + }, + { + "epoch": 0.3502071900548774, + 
"grad_norm": 1.2020292282104492, + "learning_rate": 3.1255e-05, + "loss": 0.4677, + "step": 6254 + }, + { + "epoch": 0.3502631873670064, + "grad_norm": 1.249942421913147, + "learning_rate": 3.126e-05, + "loss": 0.3179, + "step": 6255 + }, + { + "epoch": 0.3503191846791354, + "grad_norm": 1.1817785501480103, + "learning_rate": 3.1265e-05, + "loss": 0.4654, + "step": 6256 + }, + { + "epoch": 0.35037518199126444, + "grad_norm": 1.414600133895874, + "learning_rate": 3.127e-05, + "loss": 0.5809, + "step": 6257 + }, + { + "epoch": 0.35043117930339346, + "grad_norm": 1.3949699401855469, + "learning_rate": 3.1275e-05, + "loss": 0.3887, + "step": 6258 + }, + { + "epoch": 0.3504871766155225, + "grad_norm": 1.4005590677261353, + "learning_rate": 3.1280000000000005e-05, + "loss": 0.4218, + "step": 6259 + }, + { + "epoch": 0.3505431739276515, + "grad_norm": 1.2349687814712524, + "learning_rate": 3.1285e-05, + "loss": 0.4686, + "step": 6260 + }, + { + "epoch": 0.3505991712397805, + "grad_norm": 1.2598515748977661, + "learning_rate": 3.129e-05, + "loss": 0.3772, + "step": 6261 + }, + { + "epoch": 0.35065516855190954, + "grad_norm": 1.1200759410858154, + "learning_rate": 3.1295000000000004e-05, + "loss": 0.3523, + "step": 6262 + }, + { + "epoch": 0.3507111658640385, + "grad_norm": 1.159464955329895, + "learning_rate": 3.13e-05, + "loss": 0.3585, + "step": 6263 + }, + { + "epoch": 0.3507671631761675, + "grad_norm": 1.3332213163375854, + "learning_rate": 3.1305e-05, + "loss": 0.4859, + "step": 6264 + }, + { + "epoch": 0.35082316048829654, + "grad_norm": 0.9965168833732605, + "learning_rate": 3.1310000000000003e-05, + "loss": 0.3704, + "step": 6265 + }, + { + "epoch": 0.35087915780042556, + "grad_norm": 1.3566290140151978, + "learning_rate": 3.1315e-05, + "loss": 0.5692, + "step": 6266 + }, + { + "epoch": 0.3509351551125546, + "grad_norm": 1.304772138595581, + "learning_rate": 3.132e-05, + "loss": 0.3989, + "step": 6267 + }, + { + "epoch": 0.3509911524246836, + "grad_norm": 
1.2333190441131592, + "learning_rate": 3.1324999999999996e-05, + "loss": 0.429, + "step": 6268 + }, + { + "epoch": 0.3510471497368126, + "grad_norm": 1.3603678941726685, + "learning_rate": 3.133000000000001e-05, + "loss": 0.3178, + "step": 6269 + }, + { + "epoch": 0.35110314704894163, + "grad_norm": 1.4109737873077393, + "learning_rate": 3.1335000000000004e-05, + "loss": 0.6681, + "step": 6270 + }, + { + "epoch": 0.35115914436107065, + "grad_norm": 1.0344195365905762, + "learning_rate": 3.134e-05, + "loss": 0.2755, + "step": 6271 + }, + { + "epoch": 0.35121514167319967, + "grad_norm": 1.2682287693023682, + "learning_rate": 3.1345e-05, + "loss": 0.5373, + "step": 6272 + }, + { + "epoch": 0.3512711389853287, + "grad_norm": 1.3705300092697144, + "learning_rate": 3.135e-05, + "loss": 0.497, + "step": 6273 + }, + { + "epoch": 0.3513271362974577, + "grad_norm": 1.1386823654174805, + "learning_rate": 3.1355e-05, + "loss": 0.4038, + "step": 6274 + }, + { + "epoch": 0.35138313360958673, + "grad_norm": 1.5180588960647583, + "learning_rate": 3.136e-05, + "loss": 0.4209, + "step": 6275 + }, + { + "epoch": 0.35143913092171575, + "grad_norm": 1.1405197381973267, + "learning_rate": 3.1365e-05, + "loss": 0.3937, + "step": 6276 + }, + { + "epoch": 0.35149512823384477, + "grad_norm": 1.2759274244308472, + "learning_rate": 3.137e-05, + "loss": 0.4227, + "step": 6277 + }, + { + "epoch": 0.3515511255459738, + "grad_norm": 1.3097624778747559, + "learning_rate": 3.1375e-05, + "loss": 0.3797, + "step": 6278 + }, + { + "epoch": 0.3516071228581028, + "grad_norm": 1.594214677810669, + "learning_rate": 3.138e-05, + "loss": 0.3762, + "step": 6279 + }, + { + "epoch": 0.3516631201702318, + "grad_norm": 1.5521080493927002, + "learning_rate": 3.1385000000000005e-05, + "loss": 0.4929, + "step": 6280 + }, + { + "epoch": 0.35171911748236084, + "grad_norm": 1.3778748512268066, + "learning_rate": 3.139e-05, + "loss": 0.4782, + "step": 6281 + }, + { + "epoch": 0.35177511479448986, + "grad_norm": 
1.325209617614746, + "learning_rate": 3.1395e-05, + "loss": 0.3956, + "step": 6282 + }, + { + "epoch": 0.3518311121066189, + "grad_norm": 1.1175185441970825, + "learning_rate": 3.1400000000000004e-05, + "loss": 0.3477, + "step": 6283 + }, + { + "epoch": 0.3518871094187479, + "grad_norm": 1.3731361627578735, + "learning_rate": 3.1405e-05, + "loss": 0.3739, + "step": 6284 + }, + { + "epoch": 0.3519431067308769, + "grad_norm": 1.2805825471878052, + "learning_rate": 3.141e-05, + "loss": 0.4728, + "step": 6285 + }, + { + "epoch": 0.35199910404300594, + "grad_norm": 1.4022307395935059, + "learning_rate": 3.1415e-05, + "loss": 0.5399, + "step": 6286 + }, + { + "epoch": 0.35205510135513496, + "grad_norm": 1.5376461744308472, + "learning_rate": 3.142e-05, + "loss": 0.4116, + "step": 6287 + }, + { + "epoch": 0.352111098667264, + "grad_norm": 1.169945240020752, + "learning_rate": 3.1425e-05, + "loss": 0.4146, + "step": 6288 + }, + { + "epoch": 0.352167095979393, + "grad_norm": 1.2208759784698486, + "learning_rate": 3.143e-05, + "loss": 0.3352, + "step": 6289 + }, + { + "epoch": 0.352223093291522, + "grad_norm": 1.0417909622192383, + "learning_rate": 3.1435000000000007e-05, + "loss": 0.3359, + "step": 6290 + }, + { + "epoch": 0.35227909060365103, + "grad_norm": 1.1003497838974, + "learning_rate": 3.1440000000000004e-05, + "loss": 0.4339, + "step": 6291 + }, + { + "epoch": 0.35233508791578005, + "grad_norm": 1.071714162826538, + "learning_rate": 3.1445e-05, + "loss": 0.4731, + "step": 6292 + }, + { + "epoch": 0.3523910852279091, + "grad_norm": 1.254522681236267, + "learning_rate": 3.145e-05, + "loss": 0.4213, + "step": 6293 + }, + { + "epoch": 0.3524470825400381, + "grad_norm": 1.2709230184555054, + "learning_rate": 3.1455e-05, + "loss": 0.4648, + "step": 6294 + }, + { + "epoch": 0.3525030798521671, + "grad_norm": 1.3310257196426392, + "learning_rate": 3.146e-05, + "loss": 0.4866, + "step": 6295 + }, + { + "epoch": 0.35255907716429613, + "grad_norm": 1.053220272064209, + 
"learning_rate": 3.1465e-05, + "loss": 0.3499, + "step": 6296 + }, + { + "epoch": 0.35261507447642515, + "grad_norm": 1.2098735570907593, + "learning_rate": 3.147e-05, + "loss": 0.4933, + "step": 6297 + }, + { + "epoch": 0.35267107178855417, + "grad_norm": 1.2267911434173584, + "learning_rate": 3.1475e-05, + "loss": 0.4664, + "step": 6298 + }, + { + "epoch": 0.3527270691006832, + "grad_norm": 1.0496174097061157, + "learning_rate": 3.1480000000000004e-05, + "loss": 0.3439, + "step": 6299 + }, + { + "epoch": 0.3527830664128122, + "grad_norm": 1.759591817855835, + "learning_rate": 3.1485e-05, + "loss": 0.3957, + "step": 6300 + }, + { + "epoch": 0.3528390637249412, + "grad_norm": 1.3334206342697144, + "learning_rate": 3.1490000000000005e-05, + "loss": 0.5113, + "step": 6301 + }, + { + "epoch": 0.35289506103707025, + "grad_norm": 0.9650449156761169, + "learning_rate": 3.1495e-05, + "loss": 0.3999, + "step": 6302 + }, + { + "epoch": 0.35295105834919926, + "grad_norm": 1.84669029712677, + "learning_rate": 3.15e-05, + "loss": 0.4424, + "step": 6303 + }, + { + "epoch": 0.3530070556613283, + "grad_norm": 1.1730831861495972, + "learning_rate": 3.1505000000000004e-05, + "loss": 0.5259, + "step": 6304 + }, + { + "epoch": 0.35306305297345725, + "grad_norm": 1.1850773096084595, + "learning_rate": 3.151e-05, + "loss": 0.4617, + "step": 6305 + }, + { + "epoch": 0.35311905028558627, + "grad_norm": 1.1357048749923706, + "learning_rate": 3.1515e-05, + "loss": 0.4219, + "step": 6306 + }, + { + "epoch": 0.3531750475977153, + "grad_norm": 1.2091214656829834, + "learning_rate": 3.1519999999999996e-05, + "loss": 0.397, + "step": 6307 + }, + { + "epoch": 0.3532310449098443, + "grad_norm": 1.161665678024292, + "learning_rate": 3.1525e-05, + "loss": 0.539, + "step": 6308 + }, + { + "epoch": 0.3532870422219733, + "grad_norm": 1.1879373788833618, + "learning_rate": 3.1530000000000005e-05, + "loss": 0.4209, + "step": 6309 + }, + { + "epoch": 0.35334303953410234, + "grad_norm": 
1.4861319065093994, + "learning_rate": 3.1535e-05, + "loss": 0.4973, + "step": 6310 + }, + { + "epoch": 0.35339903684623136, + "grad_norm": 1.3899377584457397, + "learning_rate": 3.154e-05, + "loss": 0.4838, + "step": 6311 + }, + { + "epoch": 0.3534550341583604, + "grad_norm": 1.3385703563690186, + "learning_rate": 3.1545000000000004e-05, + "loss": 0.5009, + "step": 6312 + }, + { + "epoch": 0.3535110314704894, + "grad_norm": 1.191323161125183, + "learning_rate": 3.155e-05, + "loss": 0.4146, + "step": 6313 + }, + { + "epoch": 0.3535670287826184, + "grad_norm": 1.427211880683899, + "learning_rate": 3.1555e-05, + "loss": 0.4842, + "step": 6314 + }, + { + "epoch": 0.35362302609474744, + "grad_norm": 1.081580638885498, + "learning_rate": 3.156e-05, + "loss": 0.4151, + "step": 6315 + }, + { + "epoch": 0.35367902340687646, + "grad_norm": 1.265127182006836, + "learning_rate": 3.1565e-05, + "loss": 0.3041, + "step": 6316 + }, + { + "epoch": 0.3537350207190055, + "grad_norm": 1.2927141189575195, + "learning_rate": 3.157e-05, + "loss": 0.5042, + "step": 6317 + }, + { + "epoch": 0.3537910180311345, + "grad_norm": 1.1545687913894653, + "learning_rate": 3.1575e-05, + "loss": 0.386, + "step": 6318 + }, + { + "epoch": 0.3538470153432635, + "grad_norm": 1.1653871536254883, + "learning_rate": 3.1580000000000006e-05, + "loss": 0.3908, + "step": 6319 + }, + { + "epoch": 0.35390301265539253, + "grad_norm": 1.319900393486023, + "learning_rate": 3.1585e-05, + "loss": 0.4542, + "step": 6320 + }, + { + "epoch": 0.35395900996752155, + "grad_norm": 0.9685698747634888, + "learning_rate": 3.159e-05, + "loss": 0.3191, + "step": 6321 + }, + { + "epoch": 0.3540150072796506, + "grad_norm": 1.3467005491256714, + "learning_rate": 3.1595000000000005e-05, + "loss": 0.4291, + "step": 6322 + }, + { + "epoch": 0.3540710045917796, + "grad_norm": 2.2519562244415283, + "learning_rate": 3.16e-05, + "loss": 0.4011, + "step": 6323 + }, + { + "epoch": 0.3541270019039086, + "grad_norm": 1.185945987701416, + 
"learning_rate": 3.1605e-05, + "loss": 0.3492, + "step": 6324 + }, + { + "epoch": 0.35418299921603763, + "grad_norm": 1.2187808752059937, + "learning_rate": 3.1610000000000004e-05, + "loss": 0.4182, + "step": 6325 + }, + { + "epoch": 0.35423899652816665, + "grad_norm": 1.1608519554138184, + "learning_rate": 3.1615e-05, + "loss": 0.4505, + "step": 6326 + }, + { + "epoch": 0.35429499384029567, + "grad_norm": 1.1894186735153198, + "learning_rate": 3.162e-05, + "loss": 0.5263, + "step": 6327 + }, + { + "epoch": 0.3543509911524247, + "grad_norm": 1.2153644561767578, + "learning_rate": 3.1624999999999996e-05, + "loss": 0.4624, + "step": 6328 + }, + { + "epoch": 0.3544069884645537, + "grad_norm": 1.2270259857177734, + "learning_rate": 3.163000000000001e-05, + "loss": 0.455, + "step": 6329 + }, + { + "epoch": 0.3544629857766827, + "grad_norm": 0.9415087103843689, + "learning_rate": 3.1635000000000005e-05, + "loss": 0.2646, + "step": 6330 + }, + { + "epoch": 0.35451898308881175, + "grad_norm": 1.7275992631912231, + "learning_rate": 3.164e-05, + "loss": 0.4851, + "step": 6331 + }, + { + "epoch": 0.35457498040094076, + "grad_norm": 1.3599194288253784, + "learning_rate": 3.1645e-05, + "loss": 0.43, + "step": 6332 + }, + { + "epoch": 0.3546309777130698, + "grad_norm": 1.7129243612289429, + "learning_rate": 3.1650000000000004e-05, + "loss": 0.4948, + "step": 6333 + }, + { + "epoch": 0.3546869750251988, + "grad_norm": 1.1714155673980713, + "learning_rate": 3.1655e-05, + "loss": 0.3515, + "step": 6334 + }, + { + "epoch": 0.3547429723373278, + "grad_norm": 1.2481791973114014, + "learning_rate": 3.166e-05, + "loss": 0.4746, + "step": 6335 + }, + { + "epoch": 0.35479896964945684, + "grad_norm": 1.1287357807159424, + "learning_rate": 3.1665e-05, + "loss": 0.4145, + "step": 6336 + }, + { + "epoch": 0.35485496696158586, + "grad_norm": 1.4542763233184814, + "learning_rate": 3.167e-05, + "loss": 0.3924, + "step": 6337 + }, + { + "epoch": 0.3549109642737149, + "grad_norm": 
1.4236118793487549, + "learning_rate": 3.1675e-05, + "loss": 0.4936, + "step": 6338 + }, + { + "epoch": 0.3549669615858439, + "grad_norm": 1.5000228881835938, + "learning_rate": 3.168e-05, + "loss": 0.6088, + "step": 6339 + }, + { + "epoch": 0.3550229588979729, + "grad_norm": 1.1576695442199707, + "learning_rate": 3.1685000000000006e-05, + "loss": 0.4053, + "step": 6340 + }, + { + "epoch": 0.35507895621010194, + "grad_norm": 1.2773702144622803, + "learning_rate": 3.169e-05, + "loss": 0.4207, + "step": 6341 + }, + { + "epoch": 0.35513495352223096, + "grad_norm": 1.1120816469192505, + "learning_rate": 3.1695e-05, + "loss": 0.3528, + "step": 6342 + }, + { + "epoch": 0.35519095083436, + "grad_norm": 1.0784268379211426, + "learning_rate": 3.1700000000000005e-05, + "loss": 0.3445, + "step": 6343 + }, + { + "epoch": 0.355246948146489, + "grad_norm": 1.5653364658355713, + "learning_rate": 3.1705e-05, + "loss": 0.5955, + "step": 6344 + }, + { + "epoch": 0.355302945458618, + "grad_norm": 1.2151880264282227, + "learning_rate": 3.171e-05, + "loss": 0.4883, + "step": 6345 + }, + { + "epoch": 0.355358942770747, + "grad_norm": 1.3910470008850098, + "learning_rate": 3.1715e-05, + "loss": 0.542, + "step": 6346 + }, + { + "epoch": 0.355414940082876, + "grad_norm": 1.0868487358093262, + "learning_rate": 3.172e-05, + "loss": 0.3592, + "step": 6347 + }, + { + "epoch": 0.355470937395005, + "grad_norm": 1.9859716892242432, + "learning_rate": 3.1725e-05, + "loss": 0.3893, + "step": 6348 + }, + { + "epoch": 0.35552693470713403, + "grad_norm": 1.0183755159378052, + "learning_rate": 3.173e-05, + "loss": 0.401, + "step": 6349 + }, + { + "epoch": 0.35558293201926305, + "grad_norm": 1.2667468786239624, + "learning_rate": 3.1735e-05, + "loss": 0.434, + "step": 6350 + }, + { + "epoch": 0.3556389293313921, + "grad_norm": 1.2418057918548584, + "learning_rate": 3.1740000000000004e-05, + "loss": 0.58, + "step": 6351 + }, + { + "epoch": 0.3556949266435211, + "grad_norm": 1.1524723768234253, + 
"learning_rate": 3.1745e-05, + "loss": 0.4516, + "step": 6352 + }, + { + "epoch": 0.3557509239556501, + "grad_norm": 1.9870307445526123, + "learning_rate": 3.175e-05, + "loss": 0.457, + "step": 6353 + }, + { + "epoch": 0.35580692126777913, + "grad_norm": 1.1945549249649048, + "learning_rate": 3.1755000000000003e-05, + "loss": 0.6001, + "step": 6354 + }, + { + "epoch": 0.35586291857990815, + "grad_norm": 1.2876054048538208, + "learning_rate": 3.176e-05, + "loss": 0.444, + "step": 6355 + }, + { + "epoch": 0.35591891589203717, + "grad_norm": 1.264521837234497, + "learning_rate": 3.1765e-05, + "loss": 0.3927, + "step": 6356 + }, + { + "epoch": 0.3559749132041662, + "grad_norm": 1.2872791290283203, + "learning_rate": 3.177e-05, + "loss": 0.3936, + "step": 6357 + }, + { + "epoch": 0.3560309105162952, + "grad_norm": 1.345687747001648, + "learning_rate": 3.1775e-05, + "loss": 0.4547, + "step": 6358 + }, + { + "epoch": 0.3560869078284242, + "grad_norm": 1.096648097038269, + "learning_rate": 3.1780000000000004e-05, + "loss": 0.3369, + "step": 6359 + }, + { + "epoch": 0.35614290514055325, + "grad_norm": 1.0514757633209229, + "learning_rate": 3.1785e-05, + "loss": 0.3599, + "step": 6360 + }, + { + "epoch": 0.35619890245268226, + "grad_norm": 1.674454927444458, + "learning_rate": 3.1790000000000006e-05, + "loss": 0.6327, + "step": 6361 + }, + { + "epoch": 0.3562548997648113, + "grad_norm": 1.044646143913269, + "learning_rate": 3.1795e-05, + "loss": 0.3736, + "step": 6362 + }, + { + "epoch": 0.3563108970769403, + "grad_norm": 1.229859709739685, + "learning_rate": 3.18e-05, + "loss": 0.3778, + "step": 6363 + }, + { + "epoch": 0.3563668943890693, + "grad_norm": 1.2639544010162354, + "learning_rate": 3.1805000000000005e-05, + "loss": 0.496, + "step": 6364 + }, + { + "epoch": 0.35642289170119834, + "grad_norm": 1.1227244138717651, + "learning_rate": 3.181e-05, + "loss": 0.3985, + "step": 6365 + }, + { + "epoch": 0.35647888901332736, + "grad_norm": 1.4106838703155518, + 
"learning_rate": 3.1815e-05, + "loss": 0.4, + "step": 6366 + }, + { + "epoch": 0.3565348863254564, + "grad_norm": 1.2340143918991089, + "learning_rate": 3.182e-05, + "loss": 0.4061, + "step": 6367 + }, + { + "epoch": 0.3565908836375854, + "grad_norm": 1.538635492324829, + "learning_rate": 3.1825e-05, + "loss": 0.5551, + "step": 6368 + }, + { + "epoch": 0.3566468809497144, + "grad_norm": 1.2566502094268799, + "learning_rate": 3.1830000000000005e-05, + "loss": 0.5342, + "step": 6369 + }, + { + "epoch": 0.35670287826184344, + "grad_norm": 1.134588599205017, + "learning_rate": 3.1835e-05, + "loss": 0.5413, + "step": 6370 + }, + { + "epoch": 0.35675887557397246, + "grad_norm": 1.3127809762954712, + "learning_rate": 3.184e-05, + "loss": 0.3591, + "step": 6371 + }, + { + "epoch": 0.3568148728861015, + "grad_norm": 1.045646071434021, + "learning_rate": 3.1845000000000004e-05, + "loss": 0.4485, + "step": 6372 + }, + { + "epoch": 0.3568708701982305, + "grad_norm": 1.19120454788208, + "learning_rate": 3.185e-05, + "loss": 0.4151, + "step": 6373 + }, + { + "epoch": 0.3569268675103595, + "grad_norm": 1.1111185550689697, + "learning_rate": 3.1855e-05, + "loss": 0.5048, + "step": 6374 + }, + { + "epoch": 0.35698286482248853, + "grad_norm": 1.6205518245697021, + "learning_rate": 3.186e-05, + "loss": 0.4923, + "step": 6375 + }, + { + "epoch": 0.35703886213461755, + "grad_norm": 1.3513946533203125, + "learning_rate": 3.1865e-05, + "loss": 0.4694, + "step": 6376 + }, + { + "epoch": 0.35709485944674657, + "grad_norm": 1.2300530672073364, + "learning_rate": 3.187e-05, + "loss": 0.4352, + "step": 6377 + }, + { + "epoch": 0.3571508567588756, + "grad_norm": 1.2493767738342285, + "learning_rate": 3.1875e-05, + "loss": 0.5207, + "step": 6378 + }, + { + "epoch": 0.3572068540710046, + "grad_norm": 1.2247203588485718, + "learning_rate": 3.188e-05, + "loss": 0.32, + "step": 6379 + }, + { + "epoch": 0.35726285138313363, + "grad_norm": 1.0390961170196533, + "learning_rate": 
3.1885000000000004e-05, + "loss": 0.3408, + "step": 6380 + }, + { + "epoch": 0.35731884869526265, + "grad_norm": 3.3146679401397705, + "learning_rate": 3.189e-05, + "loss": 0.4566, + "step": 6381 + }, + { + "epoch": 0.35737484600739167, + "grad_norm": 1.019629955291748, + "learning_rate": 3.1895000000000005e-05, + "loss": 0.3753, + "step": 6382 + }, + { + "epoch": 0.3574308433195207, + "grad_norm": 1.3227936029434204, + "learning_rate": 3.19e-05, + "loss": 0.6134, + "step": 6383 + }, + { + "epoch": 0.3574868406316497, + "grad_norm": 1.313201904296875, + "learning_rate": 3.1905e-05, + "loss": 0.4308, + "step": 6384 + }, + { + "epoch": 0.3575428379437787, + "grad_norm": 1.2985275983810425, + "learning_rate": 3.191e-05, + "loss": 0.4359, + "step": 6385 + }, + { + "epoch": 0.35759883525590774, + "grad_norm": 1.136992335319519, + "learning_rate": 3.1915e-05, + "loss": 0.4, + "step": 6386 + }, + { + "epoch": 0.3576548325680367, + "grad_norm": 1.094826102256775, + "learning_rate": 3.192e-05, + "loss": 0.3882, + "step": 6387 + }, + { + "epoch": 0.3577108298801657, + "grad_norm": 1.4285268783569336, + "learning_rate": 3.1925e-05, + "loss": 0.4504, + "step": 6388 + }, + { + "epoch": 0.35776682719229475, + "grad_norm": 1.1870620250701904, + "learning_rate": 3.193e-05, + "loss": 0.5015, + "step": 6389 + }, + { + "epoch": 0.35782282450442376, + "grad_norm": 1.3874123096466064, + "learning_rate": 3.1935000000000005e-05, + "loss": 0.6506, + "step": 6390 + }, + { + "epoch": 0.3578788218165528, + "grad_norm": 1.2326061725616455, + "learning_rate": 3.194e-05, + "loss": 0.3939, + "step": 6391 + }, + { + "epoch": 0.3579348191286818, + "grad_norm": 1.4296495914459229, + "learning_rate": 3.1945e-05, + "loss": 0.6558, + "step": 6392 + }, + { + "epoch": 0.3579908164408108, + "grad_norm": 1.5314677953720093, + "learning_rate": 3.1950000000000004e-05, + "loss": 0.3221, + "step": 6393 + }, + { + "epoch": 0.35804681375293984, + "grad_norm": 1.0734069347381592, + "learning_rate": 3.1955e-05, + 
"loss": 0.3728, + "step": 6394 + }, + { + "epoch": 0.35810281106506886, + "grad_norm": 1.264939308166504, + "learning_rate": 3.196e-05, + "loss": 0.4224, + "step": 6395 + }, + { + "epoch": 0.3581588083771979, + "grad_norm": 1.4076383113861084, + "learning_rate": 3.1965e-05, + "loss": 0.4076, + "step": 6396 + }, + { + "epoch": 0.3582148056893269, + "grad_norm": 1.0512858629226685, + "learning_rate": 3.197e-05, + "loss": 0.4198, + "step": 6397 + }, + { + "epoch": 0.3582708030014559, + "grad_norm": 1.4213348627090454, + "learning_rate": 3.1975e-05, + "loss": 0.3753, + "step": 6398 + }, + { + "epoch": 0.35832680031358494, + "grad_norm": 1.445689082145691, + "learning_rate": 3.198e-05, + "loss": 0.4111, + "step": 6399 + }, + { + "epoch": 0.35838279762571396, + "grad_norm": 1.4471769332885742, + "learning_rate": 3.1985000000000006e-05, + "loss": 0.3986, + "step": 6400 + }, + { + "epoch": 0.358438794937843, + "grad_norm": 1.280672311782837, + "learning_rate": 3.1990000000000004e-05, + "loss": 0.4805, + "step": 6401 + }, + { + "epoch": 0.358494792249972, + "grad_norm": 1.2855925559997559, + "learning_rate": 3.1995e-05, + "loss": 0.4193, + "step": 6402 + }, + { + "epoch": 0.358550789562101, + "grad_norm": 1.4518502950668335, + "learning_rate": 3.2000000000000005e-05, + "loss": 0.4552, + "step": 6403 + }, + { + "epoch": 0.35860678687423003, + "grad_norm": 1.4236962795257568, + "learning_rate": 3.2005e-05, + "loss": 0.5373, + "step": 6404 + }, + { + "epoch": 0.35866278418635905, + "grad_norm": 1.2707767486572266, + "learning_rate": 3.201e-05, + "loss": 0.3918, + "step": 6405 + }, + { + "epoch": 0.35871878149848807, + "grad_norm": 1.2630311250686646, + "learning_rate": 3.2015e-05, + "loss": 0.5197, + "step": 6406 + }, + { + "epoch": 0.3587747788106171, + "grad_norm": 1.2726080417633057, + "learning_rate": 3.202e-05, + "loss": 0.3782, + "step": 6407 + }, + { + "epoch": 0.3588307761227461, + "grad_norm": 1.275527000427246, + "learning_rate": 3.2025e-05, + "loss": 0.4598, + 
"step": 6408 + }, + { + "epoch": 0.35888677343487513, + "grad_norm": 1.2320438623428345, + "learning_rate": 3.2029999999999997e-05, + "loss": 0.4221, + "step": 6409 + }, + { + "epoch": 0.35894277074700415, + "grad_norm": 1.4411370754241943, + "learning_rate": 3.2035e-05, + "loss": 0.5037, + "step": 6410 + }, + { + "epoch": 0.35899876805913317, + "grad_norm": 1.2561625242233276, + "learning_rate": 3.2040000000000005e-05, + "loss": 0.3273, + "step": 6411 + }, + { + "epoch": 0.3590547653712622, + "grad_norm": 1.2090235948562622, + "learning_rate": 3.2045e-05, + "loss": 0.4458, + "step": 6412 + }, + { + "epoch": 0.3591107626833912, + "grad_norm": 1.248092532157898, + "learning_rate": 3.205e-05, + "loss": 0.4784, + "step": 6413 + }, + { + "epoch": 0.3591667599955202, + "grad_norm": 1.1258567571640015, + "learning_rate": 3.2055000000000004e-05, + "loss": 0.4477, + "step": 6414 + }, + { + "epoch": 0.35922275730764924, + "grad_norm": 1.2157437801361084, + "learning_rate": 3.206e-05, + "loss": 0.4207, + "step": 6415 + }, + { + "epoch": 0.35927875461977826, + "grad_norm": 1.399810791015625, + "learning_rate": 3.2065e-05, + "loss": 0.3596, + "step": 6416 + }, + { + "epoch": 0.3593347519319073, + "grad_norm": 1.2439607381820679, + "learning_rate": 3.207e-05, + "loss": 0.467, + "step": 6417 + }, + { + "epoch": 0.3593907492440363, + "grad_norm": 1.3221555948257446, + "learning_rate": 3.2075e-05, + "loss": 0.4603, + "step": 6418 + }, + { + "epoch": 0.3594467465561653, + "grad_norm": 1.1428906917572021, + "learning_rate": 3.208e-05, + "loss": 0.3442, + "step": 6419 + }, + { + "epoch": 0.35950274386829434, + "grad_norm": 1.3556814193725586, + "learning_rate": 3.2085e-05, + "loss": 0.4593, + "step": 6420 + }, + { + "epoch": 0.35955874118042336, + "grad_norm": 1.1192015409469604, + "learning_rate": 3.2090000000000006e-05, + "loss": 0.3109, + "step": 6421 + }, + { + "epoch": 0.3596147384925524, + "grad_norm": 1.3313668966293335, + "learning_rate": 3.2095000000000004e-05, + "loss": 
0.5065, + "step": 6422 + }, + { + "epoch": 0.3596707358046814, + "grad_norm": 1.2944973707199097, + "learning_rate": 3.21e-05, + "loss": 0.4034, + "step": 6423 + }, + { + "epoch": 0.3597267331168104, + "grad_norm": 1.1574398279190063, + "learning_rate": 3.2105e-05, + "loss": 0.3418, + "step": 6424 + }, + { + "epoch": 0.35978273042893943, + "grad_norm": 1.1419357061386108, + "learning_rate": 3.211e-05, + "loss": 0.349, + "step": 6425 + }, + { + "epoch": 0.35983872774106845, + "grad_norm": 1.5139812231063843, + "learning_rate": 3.2115e-05, + "loss": 0.3879, + "step": 6426 + }, + { + "epoch": 0.3598947250531975, + "grad_norm": 1.2404354810714722, + "learning_rate": 3.212e-05, + "loss": 0.4619, + "step": 6427 + }, + { + "epoch": 0.3599507223653265, + "grad_norm": 1.1889538764953613, + "learning_rate": 3.2125e-05, + "loss": 0.4278, + "step": 6428 + }, + { + "epoch": 0.36000671967745546, + "grad_norm": 1.2059705257415771, + "learning_rate": 3.213e-05, + "loss": 0.4227, + "step": 6429 + }, + { + "epoch": 0.3600627169895845, + "grad_norm": 1.1609004735946655, + "learning_rate": 3.2135e-05, + "loss": 0.404, + "step": 6430 + }, + { + "epoch": 0.3601187143017135, + "grad_norm": 1.007348656654358, + "learning_rate": 3.214e-05, + "loss": 0.4094, + "step": 6431 + }, + { + "epoch": 0.3601747116138425, + "grad_norm": 1.18724524974823, + "learning_rate": 3.2145000000000005e-05, + "loss": 0.4617, + "step": 6432 + }, + { + "epoch": 0.36023070892597153, + "grad_norm": 1.2899388074874878, + "learning_rate": 3.215e-05, + "loss": 0.5475, + "step": 6433 + }, + { + "epoch": 0.36028670623810055, + "grad_norm": 1.316030740737915, + "learning_rate": 3.2155e-05, + "loss": 0.5201, + "step": 6434 + }, + { + "epoch": 0.36034270355022957, + "grad_norm": 1.1159597635269165, + "learning_rate": 3.2160000000000004e-05, + "loss": 0.4133, + "step": 6435 + }, + { + "epoch": 0.3603987008623586, + "grad_norm": 1.3339399099349976, + "learning_rate": 3.2165e-05, + "loss": 0.4596, + "step": 6436 + }, + { + 
"epoch": 0.3604546981744876, + "grad_norm": 1.3317457437515259, + "learning_rate": 3.217e-05, + "loss": 0.4083, + "step": 6437 + }, + { + "epoch": 0.36051069548661663, + "grad_norm": 1.309380054473877, + "learning_rate": 3.2175e-05, + "loss": 0.4438, + "step": 6438 + }, + { + "epoch": 0.36056669279874565, + "grad_norm": 1.1014667749404907, + "learning_rate": 3.218e-05, + "loss": 0.5102, + "step": 6439 + }, + { + "epoch": 0.36062269011087467, + "grad_norm": 1.2829416990280151, + "learning_rate": 3.2185000000000004e-05, + "loss": 0.4968, + "step": 6440 + }, + { + "epoch": 0.3606786874230037, + "grad_norm": 1.0472525358200073, + "learning_rate": 3.219e-05, + "loss": 0.3369, + "step": 6441 + }, + { + "epoch": 0.3607346847351327, + "grad_norm": 1.2059707641601562, + "learning_rate": 3.2195000000000006e-05, + "loss": 0.4293, + "step": 6442 + }, + { + "epoch": 0.3607906820472617, + "grad_norm": 1.0743045806884766, + "learning_rate": 3.2200000000000003e-05, + "loss": 0.2988, + "step": 6443 + }, + { + "epoch": 0.36084667935939074, + "grad_norm": 1.1963176727294922, + "learning_rate": 3.2205e-05, + "loss": 0.4514, + "step": 6444 + }, + { + "epoch": 0.36090267667151976, + "grad_norm": 1.4290324449539185, + "learning_rate": 3.221e-05, + "loss": 0.4848, + "step": 6445 + }, + { + "epoch": 0.3609586739836488, + "grad_norm": 1.4431935548782349, + "learning_rate": 3.2215e-05, + "loss": 0.4874, + "step": 6446 + }, + { + "epoch": 0.3610146712957778, + "grad_norm": 1.5518471002578735, + "learning_rate": 3.222e-05, + "loss": 0.6615, + "step": 6447 + }, + { + "epoch": 0.3610706686079068, + "grad_norm": 1.2346594333648682, + "learning_rate": 3.2225e-05, + "loss": 0.4037, + "step": 6448 + }, + { + "epoch": 0.36112666592003584, + "grad_norm": 1.3797889947891235, + "learning_rate": 3.223e-05, + "loss": 0.3868, + "step": 6449 + }, + { + "epoch": 0.36118266323216486, + "grad_norm": 1.1214135885238647, + "learning_rate": 3.2235000000000006e-05, + "loss": 0.3973, + "step": 6450 + }, + { + 
"epoch": 0.3612386605442939, + "grad_norm": 1.1265441179275513, + "learning_rate": 3.224e-05, + "loss": 0.3838, + "step": 6451 + }, + { + "epoch": 0.3612946578564229, + "grad_norm": 1.2349735498428345, + "learning_rate": 3.2245e-05, + "loss": 0.388, + "step": 6452 + }, + { + "epoch": 0.3613506551685519, + "grad_norm": 1.1909857988357544, + "learning_rate": 3.2250000000000005e-05, + "loss": 0.4089, + "step": 6453 + }, + { + "epoch": 0.36140665248068093, + "grad_norm": 1.2351858615875244, + "learning_rate": 3.2255e-05, + "loss": 0.4081, + "step": 6454 + }, + { + "epoch": 0.36146264979280995, + "grad_norm": 1.1014418601989746, + "learning_rate": 3.226e-05, + "loss": 0.3513, + "step": 6455 + }, + { + "epoch": 0.361518647104939, + "grad_norm": 1.296622395515442, + "learning_rate": 3.2265000000000004e-05, + "loss": 0.4217, + "step": 6456 + }, + { + "epoch": 0.361574644417068, + "grad_norm": 1.2340731620788574, + "learning_rate": 3.227e-05, + "loss": 0.4694, + "step": 6457 + }, + { + "epoch": 0.361630641729197, + "grad_norm": 1.2117183208465576, + "learning_rate": 3.2275e-05, + "loss": 0.3697, + "step": 6458 + }, + { + "epoch": 0.36168663904132603, + "grad_norm": 1.2185648679733276, + "learning_rate": 3.2279999999999996e-05, + "loss": 0.4544, + "step": 6459 + }, + { + "epoch": 0.36174263635345505, + "grad_norm": 1.2371495962142944, + "learning_rate": 3.228500000000001e-05, + "loss": 0.4588, + "step": 6460 + }, + { + "epoch": 0.36179863366558407, + "grad_norm": 1.1846920251846313, + "learning_rate": 3.2290000000000004e-05, + "loss": 0.4448, + "step": 6461 + }, + { + "epoch": 0.3618546309777131, + "grad_norm": 1.308303713798523, + "learning_rate": 3.2295e-05, + "loss": 0.4576, + "step": 6462 + }, + { + "epoch": 0.3619106282898421, + "grad_norm": 1.1310920715332031, + "learning_rate": 3.2300000000000006e-05, + "loss": 0.4626, + "step": 6463 + }, + { + "epoch": 0.3619666256019711, + "grad_norm": 1.205828070640564, + "learning_rate": 3.2305e-05, + "loss": 0.4259, + "step": 
6464 + }, + { + "epoch": 0.36202262291410015, + "grad_norm": 1.215994954109192, + "learning_rate": 3.231e-05, + "loss": 0.3851, + "step": 6465 + }, + { + "epoch": 0.36207862022622916, + "grad_norm": 1.2289583683013916, + "learning_rate": 3.2315e-05, + "loss": 0.4878, + "step": 6466 + }, + { + "epoch": 0.3621346175383582, + "grad_norm": 1.1727473735809326, + "learning_rate": 3.232e-05, + "loss": 0.3297, + "step": 6467 + }, + { + "epoch": 0.3621906148504872, + "grad_norm": 1.1282618045806885, + "learning_rate": 3.2325e-05, + "loss": 0.402, + "step": 6468 + }, + { + "epoch": 0.3622466121626162, + "grad_norm": 1.4074574708938599, + "learning_rate": 3.233e-05, + "loss": 0.4783, + "step": 6469 + }, + { + "epoch": 0.3623026094747452, + "grad_norm": 2.0473852157592773, + "learning_rate": 3.2335e-05, + "loss": 0.5243, + "step": 6470 + }, + { + "epoch": 0.3623586067868742, + "grad_norm": 1.3587144613265991, + "learning_rate": 3.2340000000000005e-05, + "loss": 0.385, + "step": 6471 + }, + { + "epoch": 0.3624146040990032, + "grad_norm": 1.1086276769638062, + "learning_rate": 3.2345e-05, + "loss": 0.3856, + "step": 6472 + }, + { + "epoch": 0.36247060141113224, + "grad_norm": 1.2624282836914062, + "learning_rate": 3.235e-05, + "loss": 0.5057, + "step": 6473 + }, + { + "epoch": 0.36252659872326126, + "grad_norm": 1.1324260234832764, + "learning_rate": 3.2355000000000004e-05, + "loss": 0.4098, + "step": 6474 + }, + { + "epoch": 0.3625825960353903, + "grad_norm": 1.260992407798767, + "learning_rate": 3.236e-05, + "loss": 0.3632, + "step": 6475 + }, + { + "epoch": 0.3626385933475193, + "grad_norm": 1.2799453735351562, + "learning_rate": 3.2365e-05, + "loss": 0.3703, + "step": 6476 + }, + { + "epoch": 0.3626945906596483, + "grad_norm": 1.233601689338684, + "learning_rate": 3.2370000000000003e-05, + "loss": 0.5573, + "step": 6477 + }, + { + "epoch": 0.36275058797177734, + "grad_norm": 1.1402902603149414, + "learning_rate": 3.2375e-05, + "loss": 0.3808, + "step": 6478 + }, + { + 
"epoch": 0.36280658528390636, + "grad_norm": 1.103020429611206, + "learning_rate": 3.238e-05, + "loss": 0.386, + "step": 6479 + }, + { + "epoch": 0.3628625825960354, + "grad_norm": 1.1356475353240967, + "learning_rate": 3.2385e-05, + "loss": 0.406, + "step": 6480 + }, + { + "epoch": 0.3629185799081644, + "grad_norm": 1.1693097352981567, + "learning_rate": 3.239000000000001e-05, + "loss": 0.4771, + "step": 6481 + }, + { + "epoch": 0.3629745772202934, + "grad_norm": 1.4075266122817993, + "learning_rate": 3.2395000000000004e-05, + "loss": 0.5045, + "step": 6482 + }, + { + "epoch": 0.36303057453242243, + "grad_norm": 1.3106882572174072, + "learning_rate": 3.24e-05, + "loss": 0.4642, + "step": 6483 + }, + { + "epoch": 0.36308657184455145, + "grad_norm": 1.2524840831756592, + "learning_rate": 3.2405e-05, + "loss": 0.4903, + "step": 6484 + }, + { + "epoch": 0.3631425691566805, + "grad_norm": 1.4604068994522095, + "learning_rate": 3.241e-05, + "loss": 0.4731, + "step": 6485 + }, + { + "epoch": 0.3631985664688095, + "grad_norm": 1.555456280708313, + "learning_rate": 3.2415e-05, + "loss": 0.4913, + "step": 6486 + }, + { + "epoch": 0.3632545637809385, + "grad_norm": 1.2731359004974365, + "learning_rate": 3.242e-05, + "loss": 0.3683, + "step": 6487 + }, + { + "epoch": 0.36331056109306753, + "grad_norm": 1.37197744846344, + "learning_rate": 3.2425e-05, + "loss": 0.418, + "step": 6488 + }, + { + "epoch": 0.36336655840519655, + "grad_norm": 1.0760390758514404, + "learning_rate": 3.243e-05, + "loss": 0.364, + "step": 6489 + }, + { + "epoch": 0.36342255571732557, + "grad_norm": 1.191428542137146, + "learning_rate": 3.2435000000000004e-05, + "loss": 0.4739, + "step": 6490 + }, + { + "epoch": 0.3634785530294546, + "grad_norm": 2.9587643146514893, + "learning_rate": 3.244e-05, + "loss": 0.4657, + "step": 6491 + }, + { + "epoch": 0.3635345503415836, + "grad_norm": 1.2364811897277832, + "learning_rate": 3.2445000000000005e-05, + "loss": 0.53, + "step": 6492 + }, + { + "epoch": 
0.3635905476537126, + "grad_norm": 1.3529722690582275, + "learning_rate": 3.245e-05, + "loss": 0.4389, + "step": 6493 + }, + { + "epoch": 0.36364654496584164, + "grad_norm": 1.2198383808135986, + "learning_rate": 3.2455e-05, + "loss": 0.4862, + "step": 6494 + }, + { + "epoch": 0.36370254227797066, + "grad_norm": 1.2174612283706665, + "learning_rate": 3.2460000000000004e-05, + "loss": 0.4445, + "step": 6495 + }, + { + "epoch": 0.3637585395900997, + "grad_norm": 0.9913917183876038, + "learning_rate": 3.2465e-05, + "loss": 0.4996, + "step": 6496 + }, + { + "epoch": 0.3638145369022287, + "grad_norm": 1.5098800659179688, + "learning_rate": 3.247e-05, + "loss": 0.393, + "step": 6497 + }, + { + "epoch": 0.3638705342143577, + "grad_norm": 1.3465111255645752, + "learning_rate": 3.2474999999999997e-05, + "loss": 0.4528, + "step": 6498 + }, + { + "epoch": 0.36392653152648674, + "grad_norm": 1.342793345451355, + "learning_rate": 3.248e-05, + "loss": 0.5098, + "step": 6499 + }, + { + "epoch": 0.36398252883861576, + "grad_norm": 1.5947413444519043, + "learning_rate": 3.2485000000000005e-05, + "loss": 0.4775, + "step": 6500 + }, + { + "epoch": 0.3640385261507448, + "grad_norm": 1.0939754247665405, + "learning_rate": 3.249e-05, + "loss": 0.3968, + "step": 6501 + }, + { + "epoch": 0.3640945234628738, + "grad_norm": 1.2496519088745117, + "learning_rate": 3.2495000000000007e-05, + "loss": 0.4816, + "step": 6502 + }, + { + "epoch": 0.3641505207750028, + "grad_norm": 1.2200695276260376, + "learning_rate": 3.2500000000000004e-05, + "loss": 0.449, + "step": 6503 + }, + { + "epoch": 0.36420651808713184, + "grad_norm": 1.184432029724121, + "learning_rate": 3.2505e-05, + "loss": 0.4768, + "step": 6504 + }, + { + "epoch": 0.36426251539926086, + "grad_norm": 1.0775400400161743, + "learning_rate": 3.251e-05, + "loss": 0.3957, + "step": 6505 + }, + { + "epoch": 0.3643185127113899, + "grad_norm": 1.1742106676101685, + "learning_rate": 3.2515e-05, + "loss": 0.4073, + "step": 6506 + }, + { + 
"epoch": 0.3643745100235189, + "grad_norm": 1.1494220495224, + "learning_rate": 3.252e-05, + "loss": 0.4473, + "step": 6507 + }, + { + "epoch": 0.3644305073356479, + "grad_norm": 1.1055777072906494, + "learning_rate": 3.2525e-05, + "loss": 0.3384, + "step": 6508 + }, + { + "epoch": 0.36448650464777693, + "grad_norm": 1.275314211845398, + "learning_rate": 3.253e-05, + "loss": 0.4473, + "step": 6509 + }, + { + "epoch": 0.36454250195990595, + "grad_norm": 1.1592401266098022, + "learning_rate": 3.2535e-05, + "loss": 0.4372, + "step": 6510 + }, + { + "epoch": 0.3645984992720349, + "grad_norm": 1.2728155851364136, + "learning_rate": 3.2540000000000004e-05, + "loss": 0.4949, + "step": 6511 + }, + { + "epoch": 0.36465449658416393, + "grad_norm": 1.1379468441009521, + "learning_rate": 3.2545e-05, + "loss": 0.4844, + "step": 6512 + }, + { + "epoch": 0.36471049389629295, + "grad_norm": 1.2028402090072632, + "learning_rate": 3.2550000000000005e-05, + "loss": 0.4344, + "step": 6513 + }, + { + "epoch": 0.364766491208422, + "grad_norm": 1.402000069618225, + "learning_rate": 3.2555e-05, + "loss": 0.3926, + "step": 6514 + }, + { + "epoch": 0.364822488520551, + "grad_norm": 1.4037359952926636, + "learning_rate": 3.256e-05, + "loss": 0.2992, + "step": 6515 + }, + { + "epoch": 0.36487848583268, + "grad_norm": 1.2170379161834717, + "learning_rate": 3.2565000000000004e-05, + "loss": 0.5853, + "step": 6516 + }, + { + "epoch": 0.36493448314480903, + "grad_norm": 1.0671933889389038, + "learning_rate": 3.257e-05, + "loss": 0.4087, + "step": 6517 + }, + { + "epoch": 0.36499048045693805, + "grad_norm": 1.2431203126907349, + "learning_rate": 3.2575e-05, + "loss": 0.4131, + "step": 6518 + }, + { + "epoch": 0.36504647776906707, + "grad_norm": 1.300535798072815, + "learning_rate": 3.2579999999999996e-05, + "loss": 0.3265, + "step": 6519 + }, + { + "epoch": 0.3651024750811961, + "grad_norm": 1.442633867263794, + "learning_rate": 3.2585e-05, + "loss": 0.4488, + "step": 6520 + }, + { + "epoch": 
0.3651584723933251, + "grad_norm": 1.062958002090454, + "learning_rate": 3.2590000000000005e-05, + "loss": 0.2811, + "step": 6521 + }, + { + "epoch": 0.3652144697054541, + "grad_norm": 1.154906153678894, + "learning_rate": 3.2595e-05, + "loss": 0.3751, + "step": 6522 + }, + { + "epoch": 0.36527046701758314, + "grad_norm": 1.2711113691329956, + "learning_rate": 3.26e-05, + "loss": 0.5827, + "step": 6523 + }, + { + "epoch": 0.36532646432971216, + "grad_norm": 1.325225591659546, + "learning_rate": 3.2605000000000004e-05, + "loss": 0.319, + "step": 6524 + }, + { + "epoch": 0.3653824616418412, + "grad_norm": 1.3151487112045288, + "learning_rate": 3.261e-05, + "loss": 0.4581, + "step": 6525 + }, + { + "epoch": 0.3654384589539702, + "grad_norm": 1.2263975143432617, + "learning_rate": 3.2615e-05, + "loss": 0.4808, + "step": 6526 + }, + { + "epoch": 0.3654944562660992, + "grad_norm": 1.1650111675262451, + "learning_rate": 3.262e-05, + "loss": 0.3016, + "step": 6527 + }, + { + "epoch": 0.36555045357822824, + "grad_norm": 1.2386382818222046, + "learning_rate": 3.2625e-05, + "loss": 0.4782, + "step": 6528 + }, + { + "epoch": 0.36560645089035726, + "grad_norm": 1.1024527549743652, + "learning_rate": 3.263e-05, + "loss": 0.3295, + "step": 6529 + }, + { + "epoch": 0.3656624482024863, + "grad_norm": 1.4803990125656128, + "learning_rate": 3.2635e-05, + "loss": 0.5028, + "step": 6530 + }, + { + "epoch": 0.3657184455146153, + "grad_norm": 1.300776720046997, + "learning_rate": 3.2640000000000006e-05, + "loss": 0.5235, + "step": 6531 + }, + { + "epoch": 0.3657744428267443, + "grad_norm": 1.1828978061676025, + "learning_rate": 3.2645e-05, + "loss": 0.3642, + "step": 6532 + }, + { + "epoch": 0.36583044013887334, + "grad_norm": 1.1352730989456177, + "learning_rate": 3.265e-05, + "loss": 0.4256, + "step": 6533 + }, + { + "epoch": 0.36588643745100236, + "grad_norm": 1.3790315389633179, + "learning_rate": 3.2655000000000005e-05, + "loss": 0.4952, + "step": 6534 + }, + { + "epoch": 
0.3659424347631314, + "grad_norm": 1.1742215156555176, + "learning_rate": 3.266e-05, + "loss": 0.4102, + "step": 6535 + }, + { + "epoch": 0.3659984320752604, + "grad_norm": 1.2382011413574219, + "learning_rate": 3.2665e-05, + "loss": 0.4639, + "step": 6536 + }, + { + "epoch": 0.3660544293873894, + "grad_norm": 1.2191896438598633, + "learning_rate": 3.267e-05, + "loss": 0.3776, + "step": 6537 + }, + { + "epoch": 0.36611042669951843, + "grad_norm": 1.1040349006652832, + "learning_rate": 3.2675e-05, + "loss": 0.4356, + "step": 6538 + }, + { + "epoch": 0.36616642401164745, + "grad_norm": 1.3455814123153687, + "learning_rate": 3.268e-05, + "loss": 0.4472, + "step": 6539 + }, + { + "epoch": 0.36622242132377647, + "grad_norm": 1.3614933490753174, + "learning_rate": 3.2684999999999996e-05, + "loss": 0.6831, + "step": 6540 + }, + { + "epoch": 0.3662784186359055, + "grad_norm": 1.1957603693008423, + "learning_rate": 3.269000000000001e-05, + "loss": 0.5281, + "step": 6541 + }, + { + "epoch": 0.3663344159480345, + "grad_norm": 1.596968173980713, + "learning_rate": 3.2695000000000005e-05, + "loss": 0.4646, + "step": 6542 + }, + { + "epoch": 0.3663904132601635, + "grad_norm": 1.2392882108688354, + "learning_rate": 3.27e-05, + "loss": 0.5475, + "step": 6543 + }, + { + "epoch": 0.36644641057229255, + "grad_norm": 1.2253211736679077, + "learning_rate": 3.2705e-05, + "loss": 0.3998, + "step": 6544 + }, + { + "epoch": 0.36650240788442157, + "grad_norm": 1.2598199844360352, + "learning_rate": 3.2710000000000004e-05, + "loss": 0.3963, + "step": 6545 + }, + { + "epoch": 0.3665584051965506, + "grad_norm": 1.208227276802063, + "learning_rate": 3.2715e-05, + "loss": 0.5363, + "step": 6546 + }, + { + "epoch": 0.3666144025086796, + "grad_norm": 1.0527280569076538, + "learning_rate": 3.272e-05, + "loss": 0.3906, + "step": 6547 + }, + { + "epoch": 0.3666703998208086, + "grad_norm": 1.2148113250732422, + "learning_rate": 3.2725e-05, + "loss": 0.4611, + "step": 6548 + }, + { + "epoch": 
0.36672639713293764, + "grad_norm": 1.420346736907959, + "learning_rate": 3.273e-05, + "loss": 0.5132, + "step": 6549 + }, + { + "epoch": 0.36678239444506666, + "grad_norm": 1.3327460289001465, + "learning_rate": 3.2735e-05, + "loss": 0.3674, + "step": 6550 + }, + { + "epoch": 0.3668383917571957, + "grad_norm": 1.1744295358657837, + "learning_rate": 3.274e-05, + "loss": 0.3614, + "step": 6551 + }, + { + "epoch": 0.3668943890693247, + "grad_norm": 1.3436514139175415, + "learning_rate": 3.2745000000000006e-05, + "loss": 0.3978, + "step": 6552 + }, + { + "epoch": 0.36695038638145366, + "grad_norm": 1.2072014808654785, + "learning_rate": 3.275e-05, + "loss": 0.3727, + "step": 6553 + }, + { + "epoch": 0.3670063836935827, + "grad_norm": 1.1398029327392578, + "learning_rate": 3.2755e-05, + "loss": 0.4489, + "step": 6554 + }, + { + "epoch": 0.3670623810057117, + "grad_norm": 1.4085496664047241, + "learning_rate": 3.2760000000000005e-05, + "loss": 0.694, + "step": 6555 + }, + { + "epoch": 0.3671183783178407, + "grad_norm": 1.164756417274475, + "learning_rate": 3.2765e-05, + "loss": 0.4144, + "step": 6556 + }, + { + "epoch": 0.36717437562996974, + "grad_norm": 1.3206162452697754, + "learning_rate": 3.277e-05, + "loss": 0.4299, + "step": 6557 + }, + { + "epoch": 0.36723037294209876, + "grad_norm": 1.059890866279602, + "learning_rate": 3.2775e-05, + "loss": 0.3972, + "step": 6558 + }, + { + "epoch": 0.3672863702542278, + "grad_norm": 1.2624162435531616, + "learning_rate": 3.278e-05, + "loss": 0.4341, + "step": 6559 + }, + { + "epoch": 0.3673423675663568, + "grad_norm": 1.3863948583602905, + "learning_rate": 3.2785e-05, + "loss": 0.5112, + "step": 6560 + }, + { + "epoch": 0.3673983648784858, + "grad_norm": 1.219224214553833, + "learning_rate": 3.279e-05, + "loss": 0.4561, + "step": 6561 + }, + { + "epoch": 0.36745436219061484, + "grad_norm": 1.2431224584579468, + "learning_rate": 3.2795e-05, + "loss": 0.4304, + "step": 6562 + }, + { + "epoch": 0.36751035950274386, + 
"grad_norm": 1.2224704027175903, + "learning_rate": 3.2800000000000004e-05, + "loss": 0.3912, + "step": 6563 + }, + { + "epoch": 0.3675663568148729, + "grad_norm": 2.1497962474823, + "learning_rate": 3.2805e-05, + "loss": 0.668, + "step": 6564 + }, + { + "epoch": 0.3676223541270019, + "grad_norm": 1.4057832956314087, + "learning_rate": 3.281e-05, + "loss": 0.7726, + "step": 6565 + }, + { + "epoch": 0.3676783514391309, + "grad_norm": 1.4491353034973145, + "learning_rate": 3.2815000000000003e-05, + "loss": 0.4135, + "step": 6566 + }, + { + "epoch": 0.36773434875125993, + "grad_norm": 1.0267125368118286, + "learning_rate": 3.282e-05, + "loss": 0.3927, + "step": 6567 + }, + { + "epoch": 0.36779034606338895, + "grad_norm": 1.2104021310806274, + "learning_rate": 3.2825e-05, + "loss": 0.3914, + "step": 6568 + }, + { + "epoch": 0.36784634337551797, + "grad_norm": 1.4252570867538452, + "learning_rate": 3.283e-05, + "loss": 0.4011, + "step": 6569 + }, + { + "epoch": 0.367902340687647, + "grad_norm": 1.1146646738052368, + "learning_rate": 3.2835e-05, + "loss": 0.5136, + "step": 6570 + }, + { + "epoch": 0.367958337999776, + "grad_norm": 1.1819242238998413, + "learning_rate": 3.2840000000000004e-05, + "loss": 0.3871, + "step": 6571 + }, + { + "epoch": 0.368014335311905, + "grad_norm": 1.1665692329406738, + "learning_rate": 3.2845e-05, + "loss": 0.3557, + "step": 6572 + }, + { + "epoch": 0.36807033262403405, + "grad_norm": 1.575913906097412, + "learning_rate": 3.2850000000000006e-05, + "loss": 0.4912, + "step": 6573 + }, + { + "epoch": 0.36812632993616307, + "grad_norm": 1.8078118562698364, + "learning_rate": 3.2855e-05, + "loss": 0.3965, + "step": 6574 + }, + { + "epoch": 0.3681823272482921, + "grad_norm": 1.1450138092041016, + "learning_rate": 3.286e-05, + "loss": 0.319, + "step": 6575 + }, + { + "epoch": 0.3682383245604211, + "grad_norm": 1.0713926553726196, + "learning_rate": 3.2865000000000005e-05, + "loss": 0.3741, + "step": 6576 + }, + { + "epoch": 0.3682943218725501, + 
"grad_norm": 1.2838785648345947, + "learning_rate": 3.287e-05, + "loss": 0.3915, + "step": 6577 + }, + { + "epoch": 0.36835031918467914, + "grad_norm": 1.1639436483383179, + "learning_rate": 3.2875e-05, + "loss": 0.4957, + "step": 6578 + }, + { + "epoch": 0.36840631649680816, + "grad_norm": 1.3116776943206787, + "learning_rate": 3.288e-05, + "loss": 0.4058, + "step": 6579 + }, + { + "epoch": 0.3684623138089372, + "grad_norm": 1.3612570762634277, + "learning_rate": 3.2885e-05, + "loss": 0.4285, + "step": 6580 + }, + { + "epoch": 0.3685183111210662, + "grad_norm": 1.1726627349853516, + "learning_rate": 3.2890000000000005e-05, + "loss": 0.3682, + "step": 6581 + }, + { + "epoch": 0.3685743084331952, + "grad_norm": 1.2552701234817505, + "learning_rate": 3.2895e-05, + "loss": 0.404, + "step": 6582 + }, + { + "epoch": 0.36863030574532424, + "grad_norm": 1.3821288347244263, + "learning_rate": 3.29e-05, + "loss": 0.5341, + "step": 6583 + }, + { + "epoch": 0.36868630305745326, + "grad_norm": 1.3465137481689453, + "learning_rate": 3.2905000000000004e-05, + "loss": 0.5916, + "step": 6584 + }, + { + "epoch": 0.3687423003695823, + "grad_norm": 1.141660451889038, + "learning_rate": 3.291e-05, + "loss": 0.4408, + "step": 6585 + }, + { + "epoch": 0.3687982976817113, + "grad_norm": 1.1541905403137207, + "learning_rate": 3.2915e-05, + "loss": 0.3921, + "step": 6586 + }, + { + "epoch": 0.3688542949938403, + "grad_norm": 1.2907469272613525, + "learning_rate": 3.292e-05, + "loss": 0.51, + "step": 6587 + }, + { + "epoch": 0.36891029230596933, + "grad_norm": 1.2633461952209473, + "learning_rate": 3.2925e-05, + "loss": 0.4051, + "step": 6588 + }, + { + "epoch": 0.36896628961809835, + "grad_norm": 1.078071117401123, + "learning_rate": 3.293e-05, + "loss": 0.3231, + "step": 6589 + }, + { + "epoch": 0.3690222869302274, + "grad_norm": 1.319933295249939, + "learning_rate": 3.2935e-05, + "loss": 0.4022, + "step": 6590 + }, + { + "epoch": 0.3690782842423564, + "grad_norm": 1.2563138008117676, + 
"learning_rate": 3.2940000000000006e-05, + "loss": 0.4468, + "step": 6591 + }, + { + "epoch": 0.3691342815544854, + "grad_norm": 1.109421730041504, + "learning_rate": 3.2945000000000004e-05, + "loss": 0.3579, + "step": 6592 + }, + { + "epoch": 0.36919027886661443, + "grad_norm": 1.5637937784194946, + "learning_rate": 3.295e-05, + "loss": 0.5781, + "step": 6593 + }, + { + "epoch": 0.3692462761787434, + "grad_norm": 1.3978664875030518, + "learning_rate": 3.2955000000000006e-05, + "loss": 0.4671, + "step": 6594 + }, + { + "epoch": 0.3693022734908724, + "grad_norm": 1.0309531688690186, + "learning_rate": 3.296e-05, + "loss": 0.3874, + "step": 6595 + }, + { + "epoch": 0.36935827080300143, + "grad_norm": 1.4158213138580322, + "learning_rate": 3.2965e-05, + "loss": 0.3398, + "step": 6596 + }, + { + "epoch": 0.36941426811513045, + "grad_norm": 1.2440460920333862, + "learning_rate": 3.297e-05, + "loss": 0.4682, + "step": 6597 + }, + { + "epoch": 0.36947026542725947, + "grad_norm": 1.157631516456604, + "learning_rate": 3.2975e-05, + "loss": 0.363, + "step": 6598 + }, + { + "epoch": 0.3695262627393885, + "grad_norm": 1.1247055530548096, + "learning_rate": 3.298e-05, + "loss": 0.4482, + "step": 6599 + }, + { + "epoch": 0.3695822600515175, + "grad_norm": 1.1412831544876099, + "learning_rate": 3.2985e-05, + "loss": 0.3781, + "step": 6600 + }, + { + "epoch": 0.3696382573636465, + "grad_norm": 1.1583365201950073, + "learning_rate": 3.299e-05, + "loss": 0.3765, + "step": 6601 + }, + { + "epoch": 0.36969425467577555, + "grad_norm": 1.2251805067062378, + "learning_rate": 3.2995000000000005e-05, + "loss": 0.4022, + "step": 6602 + }, + { + "epoch": 0.36975025198790457, + "grad_norm": 1.2362018823623657, + "learning_rate": 3.3e-05, + "loss": 0.5551, + "step": 6603 + }, + { + "epoch": 0.3698062493000336, + "grad_norm": 1.3730159997940063, + "learning_rate": 3.3005e-05, + "loss": 0.4925, + "step": 6604 + }, + { + "epoch": 0.3698622466121626, + "grad_norm": 1.365923285484314, + 
"learning_rate": 3.3010000000000004e-05, + "loss": 0.4339, + "step": 6605 + }, + { + "epoch": 0.3699182439242916, + "grad_norm": 1.1870639324188232, + "learning_rate": 3.3015e-05, + "loss": 0.3789, + "step": 6606 + }, + { + "epoch": 0.36997424123642064, + "grad_norm": 4.544864177703857, + "learning_rate": 3.302e-05, + "loss": 0.5686, + "step": 6607 + }, + { + "epoch": 0.37003023854854966, + "grad_norm": 1.421617865562439, + "learning_rate": 3.3025e-05, + "loss": 0.5282, + "step": 6608 + }, + { + "epoch": 0.3700862358606787, + "grad_norm": 1.3793761730194092, + "learning_rate": 3.303e-05, + "loss": 0.4925, + "step": 6609 + }, + { + "epoch": 0.3701422331728077, + "grad_norm": 1.3852094411849976, + "learning_rate": 3.3035e-05, + "loss": 0.4386, + "step": 6610 + }, + { + "epoch": 0.3701982304849367, + "grad_norm": 1.2681360244750977, + "learning_rate": 3.304e-05, + "loss": 0.3768, + "step": 6611 + }, + { + "epoch": 0.37025422779706574, + "grad_norm": 1.3386889696121216, + "learning_rate": 3.3045000000000006e-05, + "loss": 0.4566, + "step": 6612 + }, + { + "epoch": 0.37031022510919476, + "grad_norm": 1.2781789302825928, + "learning_rate": 3.3050000000000004e-05, + "loss": 0.4486, + "step": 6613 + }, + { + "epoch": 0.3703662224213238, + "grad_norm": 1.4412894248962402, + "learning_rate": 3.3055e-05, + "loss": 0.4828, + "step": 6614 + }, + { + "epoch": 0.3704222197334528, + "grad_norm": 1.2869528532028198, + "learning_rate": 3.3060000000000005e-05, + "loss": 0.4226, + "step": 6615 + }, + { + "epoch": 0.3704782170455818, + "grad_norm": 1.3932974338531494, + "learning_rate": 3.3065e-05, + "loss": 0.4495, + "step": 6616 + }, + { + "epoch": 0.37053421435771083, + "grad_norm": 1.136078119277954, + "learning_rate": 3.307e-05, + "loss": 0.4614, + "step": 6617 + }, + { + "epoch": 0.37059021166983985, + "grad_norm": 1.113147497177124, + "learning_rate": 3.3075e-05, + "loss": 0.3735, + "step": 6618 + }, + { + "epoch": 0.37064620898196887, + "grad_norm": 1.1008528470993042, + 
"learning_rate": 3.308e-05, + "loss": 0.4377, + "step": 6619 + }, + { + "epoch": 0.3707022062940979, + "grad_norm": 1.2893664836883545, + "learning_rate": 3.3085e-05, + "loss": 0.4508, + "step": 6620 + }, + { + "epoch": 0.3707582036062269, + "grad_norm": 1.1024919748306274, + "learning_rate": 3.309e-05, + "loss": 0.3329, + "step": 6621 + }, + { + "epoch": 0.37081420091835593, + "grad_norm": 1.320804476737976, + "learning_rate": 3.3095e-05, + "loss": 0.4158, + "step": 6622 + }, + { + "epoch": 0.37087019823048495, + "grad_norm": 1.3588051795959473, + "learning_rate": 3.3100000000000005e-05, + "loss": 0.5241, + "step": 6623 + }, + { + "epoch": 0.37092619554261397, + "grad_norm": 1.2875909805297852, + "learning_rate": 3.3105e-05, + "loss": 0.3818, + "step": 6624 + }, + { + "epoch": 0.370982192854743, + "grad_norm": 1.308329463005066, + "learning_rate": 3.311e-05, + "loss": 0.5868, + "step": 6625 + }, + { + "epoch": 0.371038190166872, + "grad_norm": 1.737322449684143, + "learning_rate": 3.3115000000000004e-05, + "loss": 0.4847, + "step": 6626 + }, + { + "epoch": 0.371094187479001, + "grad_norm": 4.365012168884277, + "learning_rate": 3.312e-05, + "loss": 0.4465, + "step": 6627 + }, + { + "epoch": 0.37115018479113004, + "grad_norm": 1.3691514730453491, + "learning_rate": 3.3125e-05, + "loss": 0.4063, + "step": 6628 + }, + { + "epoch": 0.37120618210325906, + "grad_norm": 1.4230196475982666, + "learning_rate": 3.313e-05, + "loss": 0.461, + "step": 6629 + }, + { + "epoch": 0.3712621794153881, + "grad_norm": 1.1951203346252441, + "learning_rate": 3.3135e-05, + "loss": 0.4172, + "step": 6630 + }, + { + "epoch": 0.3713181767275171, + "grad_norm": 1.3649755716323853, + "learning_rate": 3.314e-05, + "loss": 0.3652, + "step": 6631 + }, + { + "epoch": 0.3713741740396461, + "grad_norm": 1.3521397113800049, + "learning_rate": 3.3145e-05, + "loss": 0.4778, + "step": 6632 + }, + { + "epoch": 0.37143017135177514, + "grad_norm": 1.3003733158111572, + "learning_rate": 
3.3150000000000006e-05, + "loss": 0.3717, + "step": 6633 + }, + { + "epoch": 0.37148616866390416, + "grad_norm": 1.3330631256103516, + "learning_rate": 3.3155000000000004e-05, + "loss": 0.4405, + "step": 6634 + }, + { + "epoch": 0.3715421659760331, + "grad_norm": 1.1360352039337158, + "learning_rate": 3.316e-05, + "loss": 0.4608, + "step": 6635 + }, + { + "epoch": 0.37159816328816214, + "grad_norm": 1.2262935638427734, + "learning_rate": 3.3165e-05, + "loss": 0.4187, + "step": 6636 + }, + { + "epoch": 0.37165416060029116, + "grad_norm": 1.1865484714508057, + "learning_rate": 3.317e-05, + "loss": 0.403, + "step": 6637 + }, + { + "epoch": 0.3717101579124202, + "grad_norm": 1.1020766496658325, + "learning_rate": 3.3175e-05, + "loss": 0.4161, + "step": 6638 + }, + { + "epoch": 0.3717661552245492, + "grad_norm": 1.1455082893371582, + "learning_rate": 3.318e-05, + "loss": 0.4625, + "step": 6639 + }, + { + "epoch": 0.3718221525366782, + "grad_norm": 1.2724734544754028, + "learning_rate": 3.3185e-05, + "loss": 0.4234, + "step": 6640 + }, + { + "epoch": 0.37187814984880724, + "grad_norm": 1.2132536172866821, + "learning_rate": 3.319e-05, + "loss": 0.3902, + "step": 6641 + }, + { + "epoch": 0.37193414716093626, + "grad_norm": 1.353208303451538, + "learning_rate": 3.3195e-05, + "loss": 0.5316, + "step": 6642 + }, + { + "epoch": 0.3719901444730653, + "grad_norm": 1.3749722242355347, + "learning_rate": 3.32e-05, + "loss": 0.3778, + "step": 6643 + }, + { + "epoch": 0.3720461417851943, + "grad_norm": 1.0999027490615845, + "learning_rate": 3.3205000000000005e-05, + "loss": 0.3242, + "step": 6644 + }, + { + "epoch": 0.3721021390973233, + "grad_norm": 1.3657257556915283, + "learning_rate": 3.321e-05, + "loss": 0.5076, + "step": 6645 + }, + { + "epoch": 0.37215813640945233, + "grad_norm": 1.0309100151062012, + "learning_rate": 3.3215e-05, + "loss": 0.3803, + "step": 6646 + }, + { + "epoch": 0.37221413372158135, + "grad_norm": 1.1874897480010986, + "learning_rate": 
3.3220000000000004e-05, + "loss": 0.4817, + "step": 6647 + }, + { + "epoch": 0.37227013103371037, + "grad_norm": 1.455005407333374, + "learning_rate": 3.3225e-05, + "loss": 0.4271, + "step": 6648 + }, + { + "epoch": 0.3723261283458394, + "grad_norm": 1.4285304546356201, + "learning_rate": 3.323e-05, + "loss": 0.3734, + "step": 6649 + }, + { + "epoch": 0.3723821256579684, + "grad_norm": 1.0313078165054321, + "learning_rate": 3.3235e-05, + "loss": 0.3734, + "step": 6650 + }, + { + "epoch": 0.37243812297009743, + "grad_norm": 0.9433008432388306, + "learning_rate": 3.324e-05, + "loss": 0.318, + "step": 6651 + }, + { + "epoch": 0.37249412028222645, + "grad_norm": 1.1286393404006958, + "learning_rate": 3.3245000000000004e-05, + "loss": 0.459, + "step": 6652 + }, + { + "epoch": 0.37255011759435547, + "grad_norm": 0.9914267659187317, + "learning_rate": 3.325e-05, + "loss": 0.3964, + "step": 6653 + }, + { + "epoch": 0.3726061149064845, + "grad_norm": 1.1322031021118164, + "learning_rate": 3.3255000000000006e-05, + "loss": 0.5777, + "step": 6654 + }, + { + "epoch": 0.3726621122186135, + "grad_norm": 1.4263185262680054, + "learning_rate": 3.3260000000000003e-05, + "loss": 0.5518, + "step": 6655 + }, + { + "epoch": 0.3727181095307425, + "grad_norm": 1.1430028676986694, + "learning_rate": 3.3265e-05, + "loss": 0.4088, + "step": 6656 + }, + { + "epoch": 0.37277410684287154, + "grad_norm": 1.332008957862854, + "learning_rate": 3.327e-05, + "loss": 0.4516, + "step": 6657 + }, + { + "epoch": 0.37283010415500056, + "grad_norm": 1.0811599493026733, + "learning_rate": 3.3275e-05, + "loss": 0.3934, + "step": 6658 + }, + { + "epoch": 0.3728861014671296, + "grad_norm": 1.345110297203064, + "learning_rate": 3.328e-05, + "loss": 0.4315, + "step": 6659 + }, + { + "epoch": 0.3729420987792586, + "grad_norm": 1.412089467048645, + "learning_rate": 3.3285e-05, + "loss": 0.5837, + "step": 6660 + }, + { + "epoch": 0.3729980960913876, + "grad_norm": 1.514289379119873, + "learning_rate": 3.329e-05, 
+ "loss": 0.4549, + "step": 6661 + }, + { + "epoch": 0.37305409340351664, + "grad_norm": 1.1381101608276367, + "learning_rate": 3.3295000000000006e-05, + "loss": 0.3819, + "step": 6662 + }, + { + "epoch": 0.37311009071564566, + "grad_norm": 1.1717133522033691, + "learning_rate": 3.33e-05, + "loss": 0.4161, + "step": 6663 + }, + { + "epoch": 0.3731660880277747, + "grad_norm": 1.2617908716201782, + "learning_rate": 3.3305e-05, + "loss": 0.4284, + "step": 6664 + }, + { + "epoch": 0.3732220853399037, + "grad_norm": 1.1256204843521118, + "learning_rate": 3.3310000000000005e-05, + "loss": 0.3324, + "step": 6665 + }, + { + "epoch": 0.3732780826520327, + "grad_norm": 1.3104017972946167, + "learning_rate": 3.3315e-05, + "loss": 0.603, + "step": 6666 + }, + { + "epoch": 0.37333407996416174, + "grad_norm": 1.2038390636444092, + "learning_rate": 3.332e-05, + "loss": 0.5616, + "step": 6667 + }, + { + "epoch": 0.37339007727629075, + "grad_norm": 1.3213260173797607, + "learning_rate": 3.3325000000000004e-05, + "loss": 0.4087, + "step": 6668 + }, + { + "epoch": 0.3734460745884198, + "grad_norm": 1.4307289123535156, + "learning_rate": 3.333e-05, + "loss": 0.5434, + "step": 6669 + }, + { + "epoch": 0.3735020719005488, + "grad_norm": 1.0907399654388428, + "learning_rate": 3.3335e-05, + "loss": 0.3285, + "step": 6670 + }, + { + "epoch": 0.3735580692126778, + "grad_norm": 1.2606312036514282, + "learning_rate": 3.3339999999999996e-05, + "loss": 0.3946, + "step": 6671 + }, + { + "epoch": 0.37361406652480683, + "grad_norm": 1.20966637134552, + "learning_rate": 3.334500000000001e-05, + "loss": 0.3584, + "step": 6672 + }, + { + "epoch": 0.37367006383693585, + "grad_norm": 1.0477896928787231, + "learning_rate": 3.3350000000000004e-05, + "loss": 0.3188, + "step": 6673 + }, + { + "epoch": 0.37372606114906487, + "grad_norm": 1.3895022869110107, + "learning_rate": 3.3355e-05, + "loss": 0.6125, + "step": 6674 + }, + { + "epoch": 0.3737820584611939, + "grad_norm": 1.870864748954773, + 
"learning_rate": 3.336e-05, + "loss": 0.4728, + "step": 6675 + }, + { + "epoch": 0.3738380557733229, + "grad_norm": 1.2265498638153076, + "learning_rate": 3.3365e-05, + "loss": 0.3698, + "step": 6676 + }, + { + "epoch": 0.37389405308545187, + "grad_norm": 1.2869940996170044, + "learning_rate": 3.337e-05, + "loss": 0.5283, + "step": 6677 + }, + { + "epoch": 0.3739500503975809, + "grad_norm": 1.4047949314117432, + "learning_rate": 3.3375e-05, + "loss": 0.3931, + "step": 6678 + }, + { + "epoch": 0.3740060477097099, + "grad_norm": 1.167543888092041, + "learning_rate": 3.338e-05, + "loss": 0.4388, + "step": 6679 + }, + { + "epoch": 0.37406204502183893, + "grad_norm": 1.3333889245986938, + "learning_rate": 3.3385e-05, + "loss": 0.3845, + "step": 6680 + }, + { + "epoch": 0.37411804233396795, + "grad_norm": 1.2822749614715576, + "learning_rate": 3.339e-05, + "loss": 0.4201, + "step": 6681 + }, + { + "epoch": 0.37417403964609697, + "grad_norm": 1.1371928453445435, + "learning_rate": 3.3395e-05, + "loss": 0.3391, + "step": 6682 + }, + { + "epoch": 0.374230036958226, + "grad_norm": 1.3467376232147217, + "learning_rate": 3.3400000000000005e-05, + "loss": 0.4459, + "step": 6683 + }, + { + "epoch": 0.374286034270355, + "grad_norm": 1.1777925491333008, + "learning_rate": 3.3405e-05, + "loss": 0.4282, + "step": 6684 + }, + { + "epoch": 0.374342031582484, + "grad_norm": 1.192771077156067, + "learning_rate": 3.341e-05, + "loss": 0.4234, + "step": 6685 + }, + { + "epoch": 0.37439802889461304, + "grad_norm": 1.1964887380599976, + "learning_rate": 3.3415000000000004e-05, + "loss": 0.4521, + "step": 6686 + }, + { + "epoch": 0.37445402620674206, + "grad_norm": 1.1602240800857544, + "learning_rate": 3.342e-05, + "loss": 0.5186, + "step": 6687 + }, + { + "epoch": 0.3745100235188711, + "grad_norm": 1.2414394617080688, + "learning_rate": 3.3425e-05, + "loss": 0.3399, + "step": 6688 + }, + { + "epoch": 0.3745660208310001, + "grad_norm": 1.127994418144226, + "learning_rate": 
3.3430000000000003e-05, + "loss": 0.5001, + "step": 6689 + }, + { + "epoch": 0.3746220181431291, + "grad_norm": 1.3012959957122803, + "learning_rate": 3.3435e-05, + "loss": 0.4527, + "step": 6690 + }, + { + "epoch": 0.37467801545525814, + "grad_norm": 1.074070692062378, + "learning_rate": 3.344e-05, + "loss": 0.401, + "step": 6691 + }, + { + "epoch": 0.37473401276738716, + "grad_norm": 1.3016616106033325, + "learning_rate": 3.3445e-05, + "loss": 0.4612, + "step": 6692 + }, + { + "epoch": 0.3747900100795162, + "grad_norm": 1.3945709466934204, + "learning_rate": 3.345000000000001e-05, + "loss": 0.4831, + "step": 6693 + }, + { + "epoch": 0.3748460073916452, + "grad_norm": 1.1574516296386719, + "learning_rate": 3.3455000000000004e-05, + "loss": 0.5866, + "step": 6694 + }, + { + "epoch": 0.3749020047037742, + "grad_norm": 1.5415083169937134, + "learning_rate": 3.346e-05, + "loss": 0.4111, + "step": 6695 + }, + { + "epoch": 0.37495800201590324, + "grad_norm": 1.3542542457580566, + "learning_rate": 3.3465e-05, + "loss": 0.4482, + "step": 6696 + }, + { + "epoch": 0.37501399932803225, + "grad_norm": 1.5849406719207764, + "learning_rate": 3.347e-05, + "loss": 0.3569, + "step": 6697 + }, + { + "epoch": 0.3750699966401613, + "grad_norm": 1.2463269233703613, + "learning_rate": 3.3475e-05, + "loss": 0.4923, + "step": 6698 + }, + { + "epoch": 0.3751259939522903, + "grad_norm": 1.2878328561782837, + "learning_rate": 3.348e-05, + "loss": 0.4517, + "step": 6699 + }, + { + "epoch": 0.3751819912644193, + "grad_norm": 1.3001095056533813, + "learning_rate": 3.3485e-05, + "loss": 0.4187, + "step": 6700 + }, + { + "epoch": 0.37523798857654833, + "grad_norm": 1.1260403394699097, + "learning_rate": 3.349e-05, + "loss": 0.3583, + "step": 6701 + }, + { + "epoch": 0.37529398588867735, + "grad_norm": 1.2898516654968262, + "learning_rate": 3.3495000000000004e-05, + "loss": 0.4044, + "step": 6702 + }, + { + "epoch": 0.37534998320080637, + "grad_norm": 1.0880471467971802, + "learning_rate": 
3.35e-05, + "loss": 0.3948, + "step": 6703 + }, + { + "epoch": 0.3754059805129354, + "grad_norm": 0.983112096786499, + "learning_rate": 3.3505000000000005e-05, + "loss": 0.3603, + "step": 6704 + }, + { + "epoch": 0.3754619778250644, + "grad_norm": 1.2240639925003052, + "learning_rate": 3.351e-05, + "loss": 0.4636, + "step": 6705 + }, + { + "epoch": 0.3755179751371934, + "grad_norm": 1.7151166200637817, + "learning_rate": 3.3515e-05, + "loss": 0.5167, + "step": 6706 + }, + { + "epoch": 0.37557397244932245, + "grad_norm": 1.0267263650894165, + "learning_rate": 3.3520000000000004e-05, + "loss": 0.3929, + "step": 6707 + }, + { + "epoch": 0.37562996976145147, + "grad_norm": 2.0249452590942383, + "learning_rate": 3.3525e-05, + "loss": 0.4115, + "step": 6708 + }, + { + "epoch": 0.3756859670735805, + "grad_norm": 1.2309340238571167, + "learning_rate": 3.353e-05, + "loss": 0.4488, + "step": 6709 + }, + { + "epoch": 0.3757419643857095, + "grad_norm": 1.1514673233032227, + "learning_rate": 3.3534999999999997e-05, + "loss": 0.3519, + "step": 6710 + }, + { + "epoch": 0.3757979616978385, + "grad_norm": 0.9589496850967407, + "learning_rate": 3.354e-05, + "loss": 0.4595, + "step": 6711 + }, + { + "epoch": 0.37585395900996754, + "grad_norm": 1.345346450805664, + "learning_rate": 3.3545000000000005e-05, + "loss": 0.4561, + "step": 6712 + }, + { + "epoch": 0.37590995632209656, + "grad_norm": 1.1441172361373901, + "learning_rate": 3.355e-05, + "loss": 0.3547, + "step": 6713 + }, + { + "epoch": 0.3759659536342256, + "grad_norm": 1.4294555187225342, + "learning_rate": 3.3555e-05, + "loss": 0.4995, + "step": 6714 + }, + { + "epoch": 0.3760219509463546, + "grad_norm": 1.6530765295028687, + "learning_rate": 3.3560000000000004e-05, + "loss": 0.4534, + "step": 6715 + }, + { + "epoch": 0.3760779482584836, + "grad_norm": 1.140238642692566, + "learning_rate": 3.3565e-05, + "loss": 0.4516, + "step": 6716 + }, + { + "epoch": 0.37613394557061264, + "grad_norm": 1.2830810546875, + "learning_rate": 
3.357e-05, + "loss": 0.4988, + "step": 6717 + }, + { + "epoch": 0.3761899428827416, + "grad_norm": 1.5989134311676025, + "learning_rate": 3.3575e-05, + "loss": 0.4608, + "step": 6718 + }, + { + "epoch": 0.3762459401948706, + "grad_norm": 1.3424721956253052, + "learning_rate": 3.358e-05, + "loss": 0.6089, + "step": 6719 + }, + { + "epoch": 0.37630193750699964, + "grad_norm": 1.6010934114456177, + "learning_rate": 3.3585e-05, + "loss": 0.6814, + "step": 6720 + }, + { + "epoch": 0.37635793481912866, + "grad_norm": 1.0401952266693115, + "learning_rate": 3.359e-05, + "loss": 0.3748, + "step": 6721 + }, + { + "epoch": 0.3764139321312577, + "grad_norm": 1.324708104133606, + "learning_rate": 3.3595000000000006e-05, + "loss": 0.3935, + "step": 6722 + }, + { + "epoch": 0.3764699294433867, + "grad_norm": 1.4354658126831055, + "learning_rate": 3.3600000000000004e-05, + "loss": 0.4073, + "step": 6723 + }, + { + "epoch": 0.3765259267555157, + "grad_norm": 1.1871925592422485, + "learning_rate": 3.3605e-05, + "loss": 0.3696, + "step": 6724 + }, + { + "epoch": 0.37658192406764474, + "grad_norm": 1.34526789188385, + "learning_rate": 3.3610000000000005e-05, + "loss": 0.4582, + "step": 6725 + }, + { + "epoch": 0.37663792137977375, + "grad_norm": 1.047061562538147, + "learning_rate": 3.3615e-05, + "loss": 0.401, + "step": 6726 + }, + { + "epoch": 0.3766939186919028, + "grad_norm": 1.6513768434524536, + "learning_rate": 3.362e-05, + "loss": 0.4484, + "step": 6727 + }, + { + "epoch": 0.3767499160040318, + "grad_norm": 1.3245466947555542, + "learning_rate": 3.3625000000000004e-05, + "loss": 0.4729, + "step": 6728 + }, + { + "epoch": 0.3768059133161608, + "grad_norm": 2.3662190437316895, + "learning_rate": 3.363e-05, + "loss": 0.4553, + "step": 6729 + }, + { + "epoch": 0.37686191062828983, + "grad_norm": 1.32633638381958, + "learning_rate": 3.3635e-05, + "loss": 0.4385, + "step": 6730 + }, + { + "epoch": 0.37691790794041885, + "grad_norm": 1.370506763458252, + "learning_rate": 
3.3639999999999996e-05, + "loss": 0.5247, + "step": 6731 + }, + { + "epoch": 0.37697390525254787, + "grad_norm": 1.3929905891418457, + "learning_rate": 3.364500000000001e-05, + "loss": 0.5001, + "step": 6732 + }, + { + "epoch": 0.3770299025646769, + "grad_norm": 1.440767526626587, + "learning_rate": 3.3650000000000005e-05, + "loss": 0.5486, + "step": 6733 + }, + { + "epoch": 0.3770858998768059, + "grad_norm": 1.2727113962173462, + "learning_rate": 3.3655e-05, + "loss": 0.5041, + "step": 6734 + }, + { + "epoch": 0.3771418971889349, + "grad_norm": 1.1316595077514648, + "learning_rate": 3.366e-05, + "loss": 0.4268, + "step": 6735 + }, + { + "epoch": 0.37719789450106395, + "grad_norm": 1.1441484689712524, + "learning_rate": 3.3665000000000004e-05, + "loss": 0.3752, + "step": 6736 + }, + { + "epoch": 0.37725389181319297, + "grad_norm": 1.4344395399093628, + "learning_rate": 3.367e-05, + "loss": 0.4142, + "step": 6737 + }, + { + "epoch": 0.377309889125322, + "grad_norm": 1.3655970096588135, + "learning_rate": 3.3675e-05, + "loss": 0.3492, + "step": 6738 + }, + { + "epoch": 0.377365886437451, + "grad_norm": 1.1607861518859863, + "learning_rate": 3.368e-05, + "loss": 0.451, + "step": 6739 + }, + { + "epoch": 0.37742188374958, + "grad_norm": 3.041842460632324, + "learning_rate": 3.3685e-05, + "loss": 0.5925, + "step": 6740 + }, + { + "epoch": 0.37747788106170904, + "grad_norm": 1.3775370121002197, + "learning_rate": 3.369e-05, + "loss": 0.3979, + "step": 6741 + }, + { + "epoch": 0.37753387837383806, + "grad_norm": 1.4027009010314941, + "learning_rate": 3.3695e-05, + "loss": 0.459, + "step": 6742 + }, + { + "epoch": 0.3775898756859671, + "grad_norm": 1.8847804069519043, + "learning_rate": 3.3700000000000006e-05, + "loss": 0.6248, + "step": 6743 + }, + { + "epoch": 0.3776458729980961, + "grad_norm": 1.2024933099746704, + "learning_rate": 3.3705000000000003e-05, + "loss": 0.4764, + "step": 6744 + }, + { + "epoch": 0.3777018703102251, + "grad_norm": 1.3369094133377075, + 
"learning_rate": 3.371e-05, + "loss": 0.4192, + "step": 6745 + }, + { + "epoch": 0.37775786762235414, + "grad_norm": 1.276689887046814, + "learning_rate": 3.3715000000000005e-05, + "loss": 0.4924, + "step": 6746 + }, + { + "epoch": 0.37781386493448316, + "grad_norm": 1.388626217842102, + "learning_rate": 3.372e-05, + "loss": 0.4662, + "step": 6747 + }, + { + "epoch": 0.3778698622466122, + "grad_norm": 1.1112879514694214, + "learning_rate": 3.3725e-05, + "loss": 0.3071, + "step": 6748 + }, + { + "epoch": 0.3779258595587412, + "grad_norm": 1.2572911977767944, + "learning_rate": 3.373e-05, + "loss": 0.4241, + "step": 6749 + }, + { + "epoch": 0.3779818568708702, + "grad_norm": 2.8464155197143555, + "learning_rate": 3.3735e-05, + "loss": 0.5338, + "step": 6750 + }, + { + "epoch": 0.37803785418299923, + "grad_norm": 1.3102375268936157, + "learning_rate": 3.374e-05, + "loss": 0.3824, + "step": 6751 + }, + { + "epoch": 0.37809385149512825, + "grad_norm": 1.555628776550293, + "learning_rate": 3.3745e-05, + "loss": 0.5413, + "step": 6752 + }, + { + "epoch": 0.37814984880725727, + "grad_norm": 1.146290898323059, + "learning_rate": 3.375000000000001e-05, + "loss": 0.3548, + "step": 6753 + }, + { + "epoch": 0.3782058461193863, + "grad_norm": 1.0477023124694824, + "learning_rate": 3.3755000000000005e-05, + "loss": 0.3244, + "step": 6754 + }, + { + "epoch": 0.3782618434315153, + "grad_norm": 1.1944644451141357, + "learning_rate": 3.376e-05, + "loss": 0.4431, + "step": 6755 + }, + { + "epoch": 0.37831784074364433, + "grad_norm": 1.236586332321167, + "learning_rate": 3.3765e-05, + "loss": 0.4362, + "step": 6756 + }, + { + "epoch": 0.37837383805577335, + "grad_norm": 13.946683883666992, + "learning_rate": 3.3770000000000004e-05, + "loss": 0.399, + "step": 6757 + }, + { + "epoch": 0.37842983536790237, + "grad_norm": 1.2594490051269531, + "learning_rate": 3.3775e-05, + "loss": 0.3616, + "step": 6758 + }, + { + "epoch": 0.37848583268003133, + "grad_norm": 1.4111195802688599, + 
"learning_rate": 3.378e-05, + "loss": 0.4966, + "step": 6759 + }, + { + "epoch": 0.37854182999216035, + "grad_norm": 1.3201584815979004, + "learning_rate": 3.3785e-05, + "loss": 0.386, + "step": 6760 + }, + { + "epoch": 0.37859782730428937, + "grad_norm": 1.3126658201217651, + "learning_rate": 3.379e-05, + "loss": 0.506, + "step": 6761 + }, + { + "epoch": 0.3786538246164184, + "grad_norm": 1.1481921672821045, + "learning_rate": 3.3795e-05, + "loss": 0.3953, + "step": 6762 + }, + { + "epoch": 0.3787098219285474, + "grad_norm": 1.41203773021698, + "learning_rate": 3.38e-05, + "loss": 0.3521, + "step": 6763 + }, + { + "epoch": 0.3787658192406764, + "grad_norm": 1.215990424156189, + "learning_rate": 3.3805000000000006e-05, + "loss": 0.4788, + "step": 6764 + }, + { + "epoch": 0.37882181655280545, + "grad_norm": 1.1901772022247314, + "learning_rate": 3.381e-05, + "loss": 0.3711, + "step": 6765 + }, + { + "epoch": 0.37887781386493447, + "grad_norm": 1.2417351007461548, + "learning_rate": 3.3815e-05, + "loss": 0.4276, + "step": 6766 + }, + { + "epoch": 0.3789338111770635, + "grad_norm": 1.1263600587844849, + "learning_rate": 3.3820000000000005e-05, + "loss": 0.4496, + "step": 6767 + }, + { + "epoch": 0.3789898084891925, + "grad_norm": 1.247110366821289, + "learning_rate": 3.3825e-05, + "loss": 0.4089, + "step": 6768 + }, + { + "epoch": 0.3790458058013215, + "grad_norm": 1.2428250312805176, + "learning_rate": 3.383e-05, + "loss": 0.4079, + "step": 6769 + }, + { + "epoch": 0.37910180311345054, + "grad_norm": 1.5003178119659424, + "learning_rate": 3.3835e-05, + "loss": 0.5265, + "step": 6770 + }, + { + "epoch": 0.37915780042557956, + "grad_norm": 1.2733747959136963, + "learning_rate": 3.384e-05, + "loss": 0.4329, + "step": 6771 + }, + { + "epoch": 0.3792137977377086, + "grad_norm": 1.390162706375122, + "learning_rate": 3.3845e-05, + "loss": 0.4958, + "step": 6772 + }, + { + "epoch": 0.3792697950498376, + "grad_norm": 1.2606457471847534, + "learning_rate": 3.385e-05, + "loss": 
0.4001, + "step": 6773 + }, + { + "epoch": 0.3793257923619666, + "grad_norm": 1.286514401435852, + "learning_rate": 3.3855e-05, + "loss": 0.3534, + "step": 6774 + }, + { + "epoch": 0.37938178967409564, + "grad_norm": 1.1558682918548584, + "learning_rate": 3.3860000000000004e-05, + "loss": 0.4396, + "step": 6775 + }, + { + "epoch": 0.37943778698622466, + "grad_norm": 1.3537018299102783, + "learning_rate": 3.3865e-05, + "loss": 0.4584, + "step": 6776 + }, + { + "epoch": 0.3794937842983537, + "grad_norm": 1.4790765047073364, + "learning_rate": 3.387e-05, + "loss": 0.5101, + "step": 6777 + }, + { + "epoch": 0.3795497816104827, + "grad_norm": 1.381075382232666, + "learning_rate": 3.3875000000000003e-05, + "loss": 0.4182, + "step": 6778 + }, + { + "epoch": 0.3796057789226117, + "grad_norm": 1.909953236579895, + "learning_rate": 3.388e-05, + "loss": 0.4372, + "step": 6779 + }, + { + "epoch": 0.37966177623474073, + "grad_norm": 1.7746893167495728, + "learning_rate": 3.3885e-05, + "loss": 0.4639, + "step": 6780 + }, + { + "epoch": 0.37971777354686975, + "grad_norm": 1.4533138275146484, + "learning_rate": 3.389e-05, + "loss": 0.4153, + "step": 6781 + }, + { + "epoch": 0.37977377085899877, + "grad_norm": 1.1535168886184692, + "learning_rate": 3.3895e-05, + "loss": 0.3526, + "step": 6782 + }, + { + "epoch": 0.3798297681711278, + "grad_norm": 1.2879915237426758, + "learning_rate": 3.3900000000000004e-05, + "loss": 0.3643, + "step": 6783 + }, + { + "epoch": 0.3798857654832568, + "grad_norm": 1.2191060781478882, + "learning_rate": 3.3905e-05, + "loss": 0.5161, + "step": 6784 + }, + { + "epoch": 0.37994176279538583, + "grad_norm": 1.0810246467590332, + "learning_rate": 3.3910000000000006e-05, + "loss": 0.4164, + "step": 6785 + }, + { + "epoch": 0.37999776010751485, + "grad_norm": 1.4238454103469849, + "learning_rate": 3.3915e-05, + "loss": 0.426, + "step": 6786 + }, + { + "epoch": 0.38005375741964387, + "grad_norm": 1.1587119102478027, + "learning_rate": 3.392e-05, + "loss": 
0.3992, + "step": 6787 + }, + { + "epoch": 0.3801097547317729, + "grad_norm": 1.2623403072357178, + "learning_rate": 3.3925e-05, + "loss": 0.4614, + "step": 6788 + }, + { + "epoch": 0.3801657520439019, + "grad_norm": 1.398996114730835, + "learning_rate": 3.393e-05, + "loss": 0.4433, + "step": 6789 + }, + { + "epoch": 0.3802217493560309, + "grad_norm": 1.2736495733261108, + "learning_rate": 3.3935e-05, + "loss": 0.3597, + "step": 6790 + }, + { + "epoch": 0.38027774666815994, + "grad_norm": 1.165753960609436, + "learning_rate": 3.394e-05, + "loss": 0.3583, + "step": 6791 + }, + { + "epoch": 0.38033374398028896, + "grad_norm": 1.2651578187942505, + "learning_rate": 3.3945e-05, + "loss": 0.4531, + "step": 6792 + }, + { + "epoch": 0.380389741292418, + "grad_norm": 1.4925222396850586, + "learning_rate": 3.3950000000000005e-05, + "loss": 0.4597, + "step": 6793 + }, + { + "epoch": 0.380445738604547, + "grad_norm": 1.1772981882095337, + "learning_rate": 3.3955e-05, + "loss": 0.4551, + "step": 6794 + }, + { + "epoch": 0.380501735916676, + "grad_norm": 1.0072182416915894, + "learning_rate": 3.396e-05, + "loss": 0.3487, + "step": 6795 + }, + { + "epoch": 0.38055773322880504, + "grad_norm": 1.4475287199020386, + "learning_rate": 3.3965000000000004e-05, + "loss": 0.5036, + "step": 6796 + }, + { + "epoch": 0.38061373054093406, + "grad_norm": 1.2004691362380981, + "learning_rate": 3.397e-05, + "loss": 0.4327, + "step": 6797 + }, + { + "epoch": 0.3806697278530631, + "grad_norm": 1.0906919240951538, + "learning_rate": 3.3975e-05, + "loss": 0.3438, + "step": 6798 + }, + { + "epoch": 0.3807257251651921, + "grad_norm": 1.304298996925354, + "learning_rate": 3.398e-05, + "loss": 0.4037, + "step": 6799 + }, + { + "epoch": 0.3807817224773211, + "grad_norm": 1.767535924911499, + "learning_rate": 3.3985e-05, + "loss": 0.5008, + "step": 6800 + }, + { + "epoch": 0.3808377197894501, + "grad_norm": 1.1377581357955933, + "learning_rate": 3.399e-05, + "loss": 0.3894, + "step": 6801 + }, + { + 
"epoch": 0.3808937171015791, + "grad_norm": 1.1364119052886963, + "learning_rate": 3.3995e-05, + "loss": 0.433, + "step": 6802 + }, + { + "epoch": 0.3809497144137081, + "grad_norm": 1.2736361026763916, + "learning_rate": 3.4000000000000007e-05, + "loss": 0.4333, + "step": 6803 + }, + { + "epoch": 0.38100571172583714, + "grad_norm": 1.2721407413482666, + "learning_rate": 3.4005000000000004e-05, + "loss": 0.3837, + "step": 6804 + }, + { + "epoch": 0.38106170903796616, + "grad_norm": 0.9903769493103027, + "learning_rate": 3.401e-05, + "loss": 0.3657, + "step": 6805 + }, + { + "epoch": 0.3811177063500952, + "grad_norm": 1.243751883506775, + "learning_rate": 3.4015000000000006e-05, + "loss": 0.415, + "step": 6806 + }, + { + "epoch": 0.3811737036622242, + "grad_norm": 1.1863563060760498, + "learning_rate": 3.402e-05, + "loss": 0.527, + "step": 6807 + }, + { + "epoch": 0.3812297009743532, + "grad_norm": 1.2471858263015747, + "learning_rate": 3.4025e-05, + "loss": 0.5276, + "step": 6808 + }, + { + "epoch": 0.38128569828648223, + "grad_norm": 1.4560558795928955, + "learning_rate": 3.403e-05, + "loss": 0.3881, + "step": 6809 + }, + { + "epoch": 0.38134169559861125, + "grad_norm": 1.310857892036438, + "learning_rate": 3.4035e-05, + "loss": 0.6719, + "step": 6810 + }, + { + "epoch": 0.38139769291074027, + "grad_norm": 1.2389583587646484, + "learning_rate": 3.404e-05, + "loss": 0.4477, + "step": 6811 + }, + { + "epoch": 0.3814536902228693, + "grad_norm": 1.1277942657470703, + "learning_rate": 3.4045e-05, + "loss": 0.3727, + "step": 6812 + }, + { + "epoch": 0.3815096875349983, + "grad_norm": 1.286651611328125, + "learning_rate": 3.405e-05, + "loss": 0.3979, + "step": 6813 + }, + { + "epoch": 0.38156568484712733, + "grad_norm": 1.2730367183685303, + "learning_rate": 3.4055000000000005e-05, + "loss": 0.3411, + "step": 6814 + }, + { + "epoch": 0.38162168215925635, + "grad_norm": 1.1979669332504272, + "learning_rate": 3.406e-05, + "loss": 0.3594, + "step": 6815 + }, + { + "epoch": 
0.38167767947138537, + "grad_norm": 1.4920653104782104, + "learning_rate": 3.4065e-05, + "loss": 0.4879, + "step": 6816 + }, + { + "epoch": 0.3817336767835144, + "grad_norm": 1.131361722946167, + "learning_rate": 3.4070000000000004e-05, + "loss": 0.3472, + "step": 6817 + }, + { + "epoch": 0.3817896740956434, + "grad_norm": 1.2309305667877197, + "learning_rate": 3.4075e-05, + "loss": 0.3836, + "step": 6818 + }, + { + "epoch": 0.3818456714077724, + "grad_norm": 1.2341068983078003, + "learning_rate": 3.408e-05, + "loss": 0.5457, + "step": 6819 + }, + { + "epoch": 0.38190166871990144, + "grad_norm": 1.188979983329773, + "learning_rate": 3.4085e-05, + "loss": 0.4289, + "step": 6820 + }, + { + "epoch": 0.38195766603203046, + "grad_norm": 1.1993087530136108, + "learning_rate": 3.409e-05, + "loss": 0.5416, + "step": 6821 + }, + { + "epoch": 0.3820136633441595, + "grad_norm": 0.96775883436203, + "learning_rate": 3.4095e-05, + "loss": 0.426, + "step": 6822 + }, + { + "epoch": 0.3820696606562885, + "grad_norm": 2.017106056213379, + "learning_rate": 3.41e-05, + "loss": 0.5721, + "step": 6823 + }, + { + "epoch": 0.3821256579684175, + "grad_norm": 1.203216791152954, + "learning_rate": 3.4105000000000006e-05, + "loss": 0.3714, + "step": 6824 + }, + { + "epoch": 0.38218165528054654, + "grad_norm": 1.0981453657150269, + "learning_rate": 3.4110000000000004e-05, + "loss": 0.3762, + "step": 6825 + }, + { + "epoch": 0.38223765259267556, + "grad_norm": 1.0832266807556152, + "learning_rate": 3.4115e-05, + "loss": 0.4779, + "step": 6826 + }, + { + "epoch": 0.3822936499048046, + "grad_norm": 1.519089698791504, + "learning_rate": 3.412e-05, + "loss": 0.6131, + "step": 6827 + }, + { + "epoch": 0.3823496472169336, + "grad_norm": 1.205888271331787, + "learning_rate": 3.4125e-05, + "loss": 0.4936, + "step": 6828 + }, + { + "epoch": 0.3824056445290626, + "grad_norm": 1.3022352457046509, + "learning_rate": 3.413e-05, + "loss": 0.4341, + "step": 6829 + }, + { + "epoch": 0.38246164184119164, + 
"grad_norm": 1.164675235748291, + "learning_rate": 3.4135e-05, + "loss": 0.4531, + "step": 6830 + }, + { + "epoch": 0.38251763915332065, + "grad_norm": 1.1891977787017822, + "learning_rate": 3.414e-05, + "loss": 0.531, + "step": 6831 + }, + { + "epoch": 0.3825736364654497, + "grad_norm": 1.1762791872024536, + "learning_rate": 3.4145e-05, + "loss": 0.4263, + "step": 6832 + }, + { + "epoch": 0.3826296337775787, + "grad_norm": 1.4778733253479004, + "learning_rate": 3.415e-05, + "loss": 0.733, + "step": 6833 + }, + { + "epoch": 0.3826856310897077, + "grad_norm": 1.2638355493545532, + "learning_rate": 3.4155e-05, + "loss": 0.404, + "step": 6834 + }, + { + "epoch": 0.38274162840183673, + "grad_norm": 1.3806214332580566, + "learning_rate": 3.4160000000000005e-05, + "loss": 0.537, + "step": 6835 + }, + { + "epoch": 0.38279762571396575, + "grad_norm": 1.2141627073287964, + "learning_rate": 3.4165e-05, + "loss": 0.457, + "step": 6836 + }, + { + "epoch": 0.38285362302609477, + "grad_norm": 1.233931303024292, + "learning_rate": 3.417e-05, + "loss": 0.4319, + "step": 6837 + }, + { + "epoch": 0.3829096203382238, + "grad_norm": 1.4215152263641357, + "learning_rate": 3.4175000000000004e-05, + "loss": 0.5703, + "step": 6838 + }, + { + "epoch": 0.3829656176503528, + "grad_norm": 1.1874788999557495, + "learning_rate": 3.418e-05, + "loss": 0.5177, + "step": 6839 + }, + { + "epoch": 0.3830216149624818, + "grad_norm": 1.167649269104004, + "learning_rate": 3.4185e-05, + "loss": 0.3902, + "step": 6840 + }, + { + "epoch": 0.38307761227461085, + "grad_norm": 1.2854527235031128, + "learning_rate": 3.419e-05, + "loss": 0.4082, + "step": 6841 + }, + { + "epoch": 0.3831336095867398, + "grad_norm": 1.080438256263733, + "learning_rate": 3.4195e-05, + "loss": 0.4204, + "step": 6842 + }, + { + "epoch": 0.38318960689886883, + "grad_norm": 1.3386930227279663, + "learning_rate": 3.4200000000000005e-05, + "loss": 0.5462, + "step": 6843 + }, + { + "epoch": 0.38324560421099785, + "grad_norm": 
1.4374494552612305, + "learning_rate": 3.4205e-05, + "loss": 0.4368, + "step": 6844 + }, + { + "epoch": 0.38330160152312687, + "grad_norm": 1.7632851600646973, + "learning_rate": 3.4210000000000006e-05, + "loss": 0.3249, + "step": 6845 + }, + { + "epoch": 0.3833575988352559, + "grad_norm": 1.5548715591430664, + "learning_rate": 3.4215000000000004e-05, + "loss": 0.6289, + "step": 6846 + }, + { + "epoch": 0.3834135961473849, + "grad_norm": 1.3940459489822388, + "learning_rate": 3.422e-05, + "loss": 0.4671, + "step": 6847 + }, + { + "epoch": 0.3834695934595139, + "grad_norm": 1.5930670499801636, + "learning_rate": 3.4225e-05, + "loss": 0.6257, + "step": 6848 + }, + { + "epoch": 0.38352559077164294, + "grad_norm": 1.5224014520645142, + "learning_rate": 3.423e-05, + "loss": 0.4133, + "step": 6849 + }, + { + "epoch": 0.38358158808377196, + "grad_norm": 1.4070916175842285, + "learning_rate": 3.4235e-05, + "loss": 0.3598, + "step": 6850 + }, + { + "epoch": 0.383637585395901, + "grad_norm": 1.2294327020645142, + "learning_rate": 3.424e-05, + "loss": 0.4053, + "step": 6851 + }, + { + "epoch": 0.38369358270803, + "grad_norm": 1.1390637159347534, + "learning_rate": 3.4245e-05, + "loss": 0.4106, + "step": 6852 + }, + { + "epoch": 0.383749580020159, + "grad_norm": 1.3385306596755981, + "learning_rate": 3.4250000000000006e-05, + "loss": 0.4459, + "step": 6853 + }, + { + "epoch": 0.38380557733228804, + "grad_norm": 1.1420754194259644, + "learning_rate": 3.4255e-05, + "loss": 0.3797, + "step": 6854 + }, + { + "epoch": 0.38386157464441706, + "grad_norm": 1.1515679359436035, + "learning_rate": 3.426e-05, + "loss": 0.3679, + "step": 6855 + }, + { + "epoch": 0.3839175719565461, + "grad_norm": 1.1968247890472412, + "learning_rate": 3.4265000000000005e-05, + "loss": 0.4082, + "step": 6856 + }, + { + "epoch": 0.3839735692686751, + "grad_norm": 1.1582374572753906, + "learning_rate": 3.427e-05, + "loss": 0.3743, + "step": 6857 + }, + { + "epoch": 0.3840295665808041, + "grad_norm": 
1.2968791723251343, + "learning_rate": 3.4275e-05, + "loss": 0.4489, + "step": 6858 + }, + { + "epoch": 0.38408556389293314, + "grad_norm": 1.2227622270584106, + "learning_rate": 3.4280000000000004e-05, + "loss": 0.4424, + "step": 6859 + }, + { + "epoch": 0.38414156120506215, + "grad_norm": 1.312872290611267, + "learning_rate": 3.4285e-05, + "loss": 0.4034, + "step": 6860 + }, + { + "epoch": 0.3841975585171912, + "grad_norm": 1.2574517726898193, + "learning_rate": 3.429e-05, + "loss": 0.4268, + "step": 6861 + }, + { + "epoch": 0.3842535558293202, + "grad_norm": 1.277334451675415, + "learning_rate": 3.4294999999999996e-05, + "loss": 0.4557, + "step": 6862 + }, + { + "epoch": 0.3843095531414492, + "grad_norm": 1.556626558303833, + "learning_rate": 3.430000000000001e-05, + "loss": 0.851, + "step": 6863 + }, + { + "epoch": 0.38436555045357823, + "grad_norm": 2.4134163856506348, + "learning_rate": 3.4305000000000004e-05, + "loss": 0.49, + "step": 6864 + }, + { + "epoch": 0.38442154776570725, + "grad_norm": 1.1647013425827026, + "learning_rate": 3.431e-05, + "loss": 0.6369, + "step": 6865 + }, + { + "epoch": 0.38447754507783627, + "grad_norm": 1.2150636911392212, + "learning_rate": 3.4315000000000006e-05, + "loss": 0.5115, + "step": 6866 + }, + { + "epoch": 0.3845335423899653, + "grad_norm": 1.3191980123519897, + "learning_rate": 3.4320000000000003e-05, + "loss": 0.428, + "step": 6867 + }, + { + "epoch": 0.3845895397020943, + "grad_norm": 1.2424001693725586, + "learning_rate": 3.4325e-05, + "loss": 0.4363, + "step": 6868 + }, + { + "epoch": 0.3846455370142233, + "grad_norm": 1.0711196660995483, + "learning_rate": 3.433e-05, + "loss": 0.45, + "step": 6869 + }, + { + "epoch": 0.38470153432635235, + "grad_norm": 1.1021277904510498, + "learning_rate": 3.4335e-05, + "loss": 0.3696, + "step": 6870 + }, + { + "epoch": 0.38475753163848136, + "grad_norm": 1.745692491531372, + "learning_rate": 3.434e-05, + "loss": 0.4261, + "step": 6871 + }, + { + "epoch": 0.3848135289506104, + 
"grad_norm": 1.2101738452911377, + "learning_rate": 3.4345e-05, + "loss": 0.4441, + "step": 6872 + }, + { + "epoch": 0.3848695262627394, + "grad_norm": 1.2984495162963867, + "learning_rate": 3.435e-05, + "loss": 0.6471, + "step": 6873 + }, + { + "epoch": 0.3849255235748684, + "grad_norm": 1.0604979991912842, + "learning_rate": 3.4355000000000006e-05, + "loss": 0.4003, + "step": 6874 + }, + { + "epoch": 0.38498152088699744, + "grad_norm": 1.4221092462539673, + "learning_rate": 3.436e-05, + "loss": 0.4043, + "step": 6875 + }, + { + "epoch": 0.38503751819912646, + "grad_norm": 1.2192177772521973, + "learning_rate": 3.4365e-05, + "loss": 0.4698, + "step": 6876 + }, + { + "epoch": 0.3850935155112555, + "grad_norm": 1.405113697052002, + "learning_rate": 3.4370000000000005e-05, + "loss": 0.4693, + "step": 6877 + }, + { + "epoch": 0.3851495128233845, + "grad_norm": 1.3599504232406616, + "learning_rate": 3.4375e-05, + "loss": 0.4721, + "step": 6878 + }, + { + "epoch": 0.3852055101355135, + "grad_norm": 2.1586360931396484, + "learning_rate": 3.438e-05, + "loss": 0.4071, + "step": 6879 + }, + { + "epoch": 0.38526150744764254, + "grad_norm": 1.1666386127471924, + "learning_rate": 3.4385000000000004e-05, + "loss": 0.4566, + "step": 6880 + }, + { + "epoch": 0.38531750475977156, + "grad_norm": 1.0913177728652954, + "learning_rate": 3.439e-05, + "loss": 0.3272, + "step": 6881 + }, + { + "epoch": 0.3853735020719006, + "grad_norm": 1.3614965677261353, + "learning_rate": 3.4395e-05, + "loss": 0.3895, + "step": 6882 + }, + { + "epoch": 0.38542949938402954, + "grad_norm": 1.186721920967102, + "learning_rate": 3.4399999999999996e-05, + "loss": 0.364, + "step": 6883 + }, + { + "epoch": 0.38548549669615856, + "grad_norm": 1.7788968086242676, + "learning_rate": 3.440500000000001e-05, + "loss": 0.8067, + "step": 6884 + }, + { + "epoch": 0.3855414940082876, + "grad_norm": 1.1849788427352905, + "learning_rate": 3.4410000000000004e-05, + "loss": 0.4643, + "step": 6885 + }, + { + "epoch": 
0.3855974913204166, + "grad_norm": 1.4890176057815552, + "learning_rate": 3.4415e-05, + "loss": 0.4669, + "step": 6886 + }, + { + "epoch": 0.3856534886325456, + "grad_norm": 1.48883056640625, + "learning_rate": 3.442e-05, + "loss": 0.3893, + "step": 6887 + }, + { + "epoch": 0.38570948594467463, + "grad_norm": 1.2005877494812012, + "learning_rate": 3.4425e-05, + "loss": 0.5971, + "step": 6888 + }, + { + "epoch": 0.38576548325680365, + "grad_norm": 1.5162949562072754, + "learning_rate": 3.443e-05, + "loss": 0.4271, + "step": 6889 + }, + { + "epoch": 0.3858214805689327, + "grad_norm": 1.1552103757858276, + "learning_rate": 3.4435e-05, + "loss": 0.37, + "step": 6890 + }, + { + "epoch": 0.3858774778810617, + "grad_norm": 1.5426958799362183, + "learning_rate": 3.444e-05, + "loss": 0.4643, + "step": 6891 + }, + { + "epoch": 0.3859334751931907, + "grad_norm": 2.051938056945801, + "learning_rate": 3.4445e-05, + "loss": 0.4913, + "step": 6892 + }, + { + "epoch": 0.38598947250531973, + "grad_norm": 1.3490667343139648, + "learning_rate": 3.445e-05, + "loss": 0.4701, + "step": 6893 + }, + { + "epoch": 0.38604546981744875, + "grad_norm": 1.3706315755844116, + "learning_rate": 3.4455e-05, + "loss": 0.5357, + "step": 6894 + }, + { + "epoch": 0.38610146712957777, + "grad_norm": 1.1113626956939697, + "learning_rate": 3.4460000000000005e-05, + "loss": 0.355, + "step": 6895 + }, + { + "epoch": 0.3861574644417068, + "grad_norm": 3.260390281677246, + "learning_rate": 3.4465e-05, + "loss": 0.47, + "step": 6896 + }, + { + "epoch": 0.3862134617538358, + "grad_norm": 1.1313581466674805, + "learning_rate": 3.447e-05, + "loss": 0.3574, + "step": 6897 + }, + { + "epoch": 0.3862694590659648, + "grad_norm": 1.3877431154251099, + "learning_rate": 3.4475000000000005e-05, + "loss": 0.5123, + "step": 6898 + }, + { + "epoch": 0.38632545637809385, + "grad_norm": 1.3723994493484497, + "learning_rate": 3.448e-05, + "loss": 0.4964, + "step": 6899 + }, + { + "epoch": 0.38638145369022286, + "grad_norm": 
1.228934407234192, + "learning_rate": 3.4485e-05, + "loss": 0.4702, + "step": 6900 + }, + { + "epoch": 0.3864374510023519, + "grad_norm": 1.1545768976211548, + "learning_rate": 3.449e-05, + "loss": 0.4698, + "step": 6901 + }, + { + "epoch": 0.3864934483144809, + "grad_norm": 1.2166193723678589, + "learning_rate": 3.4495e-05, + "loss": 0.4343, + "step": 6902 + }, + { + "epoch": 0.3865494456266099, + "grad_norm": 1.4506065845489502, + "learning_rate": 3.45e-05, + "loss": 0.4393, + "step": 6903 + }, + { + "epoch": 0.38660544293873894, + "grad_norm": 1.083709716796875, + "learning_rate": 3.4505e-05, + "loss": 0.3242, + "step": 6904 + }, + { + "epoch": 0.38666144025086796, + "grad_norm": 1.1156466007232666, + "learning_rate": 3.451000000000001e-05, + "loss": 0.397, + "step": 6905 + }, + { + "epoch": 0.386717437562997, + "grad_norm": 1.5354053974151611, + "learning_rate": 3.4515000000000004e-05, + "loss": 0.5582, + "step": 6906 + }, + { + "epoch": 0.386773434875126, + "grad_norm": 1.1571729183197021, + "learning_rate": 3.452e-05, + "loss": 0.3811, + "step": 6907 + }, + { + "epoch": 0.386829432187255, + "grad_norm": 1.1743639707565308, + "learning_rate": 3.4525e-05, + "loss": 0.5123, + "step": 6908 + }, + { + "epoch": 0.38688542949938404, + "grad_norm": 1.0858750343322754, + "learning_rate": 3.453e-05, + "loss": 0.4123, + "step": 6909 + }, + { + "epoch": 0.38694142681151306, + "grad_norm": 1.1950575113296509, + "learning_rate": 3.4535e-05, + "loss": 0.4931, + "step": 6910 + }, + { + "epoch": 0.3869974241236421, + "grad_norm": 1.3804479837417603, + "learning_rate": 3.454e-05, + "loss": 0.4097, + "step": 6911 + }, + { + "epoch": 0.3870534214357711, + "grad_norm": 1.3190300464630127, + "learning_rate": 3.4545e-05, + "loss": 0.3901, + "step": 6912 + }, + { + "epoch": 0.3871094187479001, + "grad_norm": 1.4819968938827515, + "learning_rate": 3.455e-05, + "loss": 0.499, + "step": 6913 + }, + { + "epoch": 0.38716541606002913, + "grad_norm": 1.4721925258636475, + "learning_rate": 
3.4555000000000004e-05, + "loss": 0.4597, + "step": 6914 + }, + { + "epoch": 0.38722141337215815, + "grad_norm": 1.1170094013214111, + "learning_rate": 3.456e-05, + "loss": 0.5047, + "step": 6915 + }, + { + "epoch": 0.38727741068428717, + "grad_norm": 1.3028054237365723, + "learning_rate": 3.4565000000000005e-05, + "loss": 0.4317, + "step": 6916 + }, + { + "epoch": 0.3873334079964162, + "grad_norm": 1.096895456314087, + "learning_rate": 3.457e-05, + "loss": 0.3691, + "step": 6917 + }, + { + "epoch": 0.3873894053085452, + "grad_norm": 1.234584927558899, + "learning_rate": 3.4575e-05, + "loss": 0.4658, + "step": 6918 + }, + { + "epoch": 0.38744540262067423, + "grad_norm": 1.1240357160568237, + "learning_rate": 3.4580000000000004e-05, + "loss": 0.3645, + "step": 6919 + }, + { + "epoch": 0.38750139993280325, + "grad_norm": 2.325113296508789, + "learning_rate": 3.4585e-05, + "loss": 0.3758, + "step": 6920 + }, + { + "epoch": 0.38755739724493227, + "grad_norm": 1.31596839427948, + "learning_rate": 3.459e-05, + "loss": 0.5102, + "step": 6921 + }, + { + "epoch": 0.3876133945570613, + "grad_norm": 1.3582637310028076, + "learning_rate": 3.4594999999999997e-05, + "loss": 0.4485, + "step": 6922 + }, + { + "epoch": 0.3876693918691903, + "grad_norm": 1.4770441055297852, + "learning_rate": 3.46e-05, + "loss": 0.4524, + "step": 6923 + }, + { + "epoch": 0.3877253891813193, + "grad_norm": 1.3404148817062378, + "learning_rate": 3.4605000000000005e-05, + "loss": 0.5851, + "step": 6924 + }, + { + "epoch": 0.3877813864934483, + "grad_norm": 1.0689105987548828, + "learning_rate": 3.461e-05, + "loss": 0.3645, + "step": 6925 + }, + { + "epoch": 0.3878373838055773, + "grad_norm": 1.2504135370254517, + "learning_rate": 3.4615e-05, + "loss": 0.4709, + "step": 6926 + }, + { + "epoch": 0.3878933811177063, + "grad_norm": 1.2252495288848877, + "learning_rate": 3.4620000000000004e-05, + "loss": 0.4627, + "step": 6927 + }, + { + "epoch": 0.38794937842983535, + "grad_norm": 1.0927207469940186, + 
"learning_rate": 3.4625e-05, + "loss": 0.388, + "step": 6928 + }, + { + "epoch": 0.38800537574196436, + "grad_norm": 1.1858336925506592, + "learning_rate": 3.463e-05, + "loss": 0.4278, + "step": 6929 + }, + { + "epoch": 0.3880613730540934, + "grad_norm": 1.2361561059951782, + "learning_rate": 3.4635e-05, + "loss": 0.3855, + "step": 6930 + }, + { + "epoch": 0.3881173703662224, + "grad_norm": 1.2411030530929565, + "learning_rate": 3.464e-05, + "loss": 0.4048, + "step": 6931 + }, + { + "epoch": 0.3881733676783514, + "grad_norm": 1.1442571878433228, + "learning_rate": 3.4645e-05, + "loss": 0.4149, + "step": 6932 + }, + { + "epoch": 0.38822936499048044, + "grad_norm": 1.3089278936386108, + "learning_rate": 3.465e-05, + "loss": 0.5776, + "step": 6933 + }, + { + "epoch": 0.38828536230260946, + "grad_norm": 1.1244102716445923, + "learning_rate": 3.4655000000000006e-05, + "loss": 0.3307, + "step": 6934 + }, + { + "epoch": 0.3883413596147385, + "grad_norm": 1.507246732711792, + "learning_rate": 3.4660000000000004e-05, + "loss": 0.5329, + "step": 6935 + }, + { + "epoch": 0.3883973569268675, + "grad_norm": 4.239734649658203, + "learning_rate": 3.4665e-05, + "loss": 0.4608, + "step": 6936 + }, + { + "epoch": 0.3884533542389965, + "grad_norm": 1.2836531400680542, + "learning_rate": 3.4670000000000005e-05, + "loss": 0.3798, + "step": 6937 + }, + { + "epoch": 0.38850935155112554, + "grad_norm": 1.5413106679916382, + "learning_rate": 3.4675e-05, + "loss": 0.5689, + "step": 6938 + }, + { + "epoch": 0.38856534886325456, + "grad_norm": 1.3127338886260986, + "learning_rate": 3.468e-05, + "loss": 0.4873, + "step": 6939 + }, + { + "epoch": 0.3886213461753836, + "grad_norm": 1.2186760902404785, + "learning_rate": 3.4685000000000004e-05, + "loss": 0.3925, + "step": 6940 + }, + { + "epoch": 0.3886773434875126, + "grad_norm": 1.1795878410339355, + "learning_rate": 3.469e-05, + "loss": 0.3932, + "step": 6941 + }, + { + "epoch": 0.3887333407996416, + "grad_norm": 1.3185460567474365, + 
"learning_rate": 3.4695e-05, + "loss": 0.3922, + "step": 6942 + }, + { + "epoch": 0.38878933811177063, + "grad_norm": 1.2021520137786865, + "learning_rate": 3.4699999999999996e-05, + "loss": 0.3763, + "step": 6943 + }, + { + "epoch": 0.38884533542389965, + "grad_norm": 1.2987655401229858, + "learning_rate": 3.470500000000001e-05, + "loss": 0.5663, + "step": 6944 + }, + { + "epoch": 0.38890133273602867, + "grad_norm": 1.2102904319763184, + "learning_rate": 3.4710000000000005e-05, + "loss": 0.4477, + "step": 6945 + }, + { + "epoch": 0.3889573300481577, + "grad_norm": 1.288582444190979, + "learning_rate": 3.4715e-05, + "loss": 0.3822, + "step": 6946 + }, + { + "epoch": 0.3890133273602867, + "grad_norm": 1.1250205039978027, + "learning_rate": 3.472e-05, + "loss": 0.435, + "step": 6947 + }, + { + "epoch": 0.38906932467241573, + "grad_norm": 1.4243067502975464, + "learning_rate": 3.4725000000000004e-05, + "loss": 0.5506, + "step": 6948 + }, + { + "epoch": 0.38912532198454475, + "grad_norm": 1.173213243484497, + "learning_rate": 3.473e-05, + "loss": 0.4241, + "step": 6949 + }, + { + "epoch": 0.38918131929667377, + "grad_norm": 1.061355471611023, + "learning_rate": 3.4735e-05, + "loss": 0.3489, + "step": 6950 + }, + { + "epoch": 0.3892373166088028, + "grad_norm": 1.2567192316055298, + "learning_rate": 3.474e-05, + "loss": 0.5298, + "step": 6951 + }, + { + "epoch": 0.3892933139209318, + "grad_norm": 1.2359381914138794, + "learning_rate": 3.4745e-05, + "loss": 0.3426, + "step": 6952 + }, + { + "epoch": 0.3893493112330608, + "grad_norm": 1.3463321924209595, + "learning_rate": 3.475e-05, + "loss": 0.412, + "step": 6953 + }, + { + "epoch": 0.38940530854518984, + "grad_norm": 1.8644763231277466, + "learning_rate": 3.4755e-05, + "loss": 0.5142, + "step": 6954 + }, + { + "epoch": 0.38946130585731886, + "grad_norm": 0.9919539093971252, + "learning_rate": 3.4760000000000006e-05, + "loss": 0.4033, + "step": 6955 + }, + { + "epoch": 0.3895173031694479, + "grad_norm": 
1.1118013858795166, + "learning_rate": 3.4765000000000003e-05, + "loss": 0.4297, + "step": 6956 + }, + { + "epoch": 0.3895733004815769, + "grad_norm": 1.2252516746520996, + "learning_rate": 3.477e-05, + "loss": 0.457, + "step": 6957 + }, + { + "epoch": 0.3896292977937059, + "grad_norm": 1.1286903619766235, + "learning_rate": 3.4775000000000005e-05, + "loss": 0.3852, + "step": 6958 + }, + { + "epoch": 0.38968529510583494, + "grad_norm": 1.4658342599868774, + "learning_rate": 3.478e-05, + "loss": 0.3999, + "step": 6959 + }, + { + "epoch": 0.38974129241796396, + "grad_norm": 1.2423683404922485, + "learning_rate": 3.4785e-05, + "loss": 0.51, + "step": 6960 + }, + { + "epoch": 0.389797289730093, + "grad_norm": 1.2950979471206665, + "learning_rate": 3.479e-05, + "loss": 0.3804, + "step": 6961 + }, + { + "epoch": 0.389853287042222, + "grad_norm": 1.4664896726608276, + "learning_rate": 3.4795e-05, + "loss": 0.3658, + "step": 6962 + }, + { + "epoch": 0.389909284354351, + "grad_norm": 1.451002836227417, + "learning_rate": 3.48e-05, + "loss": 0.5165, + "step": 6963 + }, + { + "epoch": 0.38996528166648003, + "grad_norm": 1.3125041723251343, + "learning_rate": 3.4805e-05, + "loss": 0.4174, + "step": 6964 + }, + { + "epoch": 0.39002127897860905, + "grad_norm": 1.2604563236236572, + "learning_rate": 3.481e-05, + "loss": 0.3917, + "step": 6965 + }, + { + "epoch": 0.390077276290738, + "grad_norm": 1.142025113105774, + "learning_rate": 3.4815000000000005e-05, + "loss": 0.3222, + "step": 6966 + }, + { + "epoch": 0.39013327360286704, + "grad_norm": 1.1667081117630005, + "learning_rate": 3.482e-05, + "loss": 0.4721, + "step": 6967 + }, + { + "epoch": 0.39018927091499606, + "grad_norm": 1.1558924913406372, + "learning_rate": 3.4825e-05, + "loss": 0.3383, + "step": 6968 + }, + { + "epoch": 0.3902452682271251, + "grad_norm": 1.4228202104568481, + "learning_rate": 3.4830000000000004e-05, + "loss": 0.4312, + "step": 6969 + }, + { + "epoch": 0.3903012655392541, + "grad_norm": 
1.2872172594070435, + "learning_rate": 3.4835e-05, + "loss": 0.4468, + "step": 6970 + }, + { + "epoch": 0.3903572628513831, + "grad_norm": 1.1830133199691772, + "learning_rate": 3.484e-05, + "loss": 0.44, + "step": 6971 + }, + { + "epoch": 0.39041326016351213, + "grad_norm": 3.1067752838134766, + "learning_rate": 3.4845e-05, + "loss": 0.3564, + "step": 6972 + }, + { + "epoch": 0.39046925747564115, + "grad_norm": 1.09810209274292, + "learning_rate": 3.485e-05, + "loss": 0.4307, + "step": 6973 + }, + { + "epoch": 0.39052525478777017, + "grad_norm": 1.3250813484191895, + "learning_rate": 3.4855000000000004e-05, + "loss": 0.3495, + "step": 6974 + }, + { + "epoch": 0.3905812520998992, + "grad_norm": 1.537216067314148, + "learning_rate": 3.486e-05, + "loss": 0.6629, + "step": 6975 + }, + { + "epoch": 0.3906372494120282, + "grad_norm": 1.3258821964263916, + "learning_rate": 3.4865000000000006e-05, + "loss": 0.4447, + "step": 6976 + }, + { + "epoch": 0.39069324672415723, + "grad_norm": 1.4364511966705322, + "learning_rate": 3.487e-05, + "loss": 0.535, + "step": 6977 + }, + { + "epoch": 0.39074924403628625, + "grad_norm": 1.1638134717941284, + "learning_rate": 3.4875e-05, + "loss": 0.4004, + "step": 6978 + }, + { + "epoch": 0.39080524134841527, + "grad_norm": 1.2769513130187988, + "learning_rate": 3.4880000000000005e-05, + "loss": 0.6045, + "step": 6979 + }, + { + "epoch": 0.3908612386605443, + "grad_norm": 1.2911280393600464, + "learning_rate": 3.4885e-05, + "loss": 0.4701, + "step": 6980 + }, + { + "epoch": 0.3909172359726733, + "grad_norm": 1.2202502489089966, + "learning_rate": 3.489e-05, + "loss": 0.4383, + "step": 6981 + }, + { + "epoch": 0.3909732332848023, + "grad_norm": 1.4804140329360962, + "learning_rate": 3.4895e-05, + "loss": 0.381, + "step": 6982 + }, + { + "epoch": 0.39102923059693134, + "grad_norm": 1.1945704221725464, + "learning_rate": 3.49e-05, + "loss": 0.4915, + "step": 6983 + }, + { + "epoch": 0.39108522790906036, + "grad_norm": 1.3579177856445312, + 
"learning_rate": 3.4905000000000005e-05, + "loss": 0.5499, + "step": 6984 + }, + { + "epoch": 0.3911412252211894, + "grad_norm": 1.2986173629760742, + "learning_rate": 3.491e-05, + "loss": 0.4338, + "step": 6985 + }, + { + "epoch": 0.3911972225333184, + "grad_norm": 1.363681674003601, + "learning_rate": 3.4915e-05, + "loss": 0.6142, + "step": 6986 + }, + { + "epoch": 0.3912532198454474, + "grad_norm": 1.0938702821731567, + "learning_rate": 3.4920000000000004e-05, + "loss": 0.3773, + "step": 6987 + }, + { + "epoch": 0.39130921715757644, + "grad_norm": 1.4995903968811035, + "learning_rate": 3.4925e-05, + "loss": 0.5526, + "step": 6988 + }, + { + "epoch": 0.39136521446970546, + "grad_norm": 1.4191012382507324, + "learning_rate": 3.493e-05, + "loss": 0.4109, + "step": 6989 + }, + { + "epoch": 0.3914212117818345, + "grad_norm": 1.1995792388916016, + "learning_rate": 3.4935000000000003e-05, + "loss": 0.4719, + "step": 6990 + }, + { + "epoch": 0.3914772090939635, + "grad_norm": 1.0581377744674683, + "learning_rate": 3.494e-05, + "loss": 0.3769, + "step": 6991 + }, + { + "epoch": 0.3915332064060925, + "grad_norm": 1.1604219675064087, + "learning_rate": 3.4945e-05, + "loss": 0.3578, + "step": 6992 + }, + { + "epoch": 0.39158920371822153, + "grad_norm": 1.4558964967727661, + "learning_rate": 3.495e-05, + "loss": 0.672, + "step": 6993 + }, + { + "epoch": 0.39164520103035055, + "grad_norm": 1.0141115188598633, + "learning_rate": 3.495500000000001e-05, + "loss": 0.363, + "step": 6994 + }, + { + "epoch": 0.3917011983424796, + "grad_norm": 1.1952109336853027, + "learning_rate": 3.4960000000000004e-05, + "loss": 0.4501, + "step": 6995 + }, + { + "epoch": 0.3917571956546086, + "grad_norm": 1.6617878675460815, + "learning_rate": 3.4965e-05, + "loss": 0.4786, + "step": 6996 + }, + { + "epoch": 0.3918131929667376, + "grad_norm": 1.493242621421814, + "learning_rate": 3.4970000000000006e-05, + "loss": 0.4174, + "step": 6997 + }, + { + "epoch": 0.39186919027886663, + "grad_norm": 
15.541422843933105, + "learning_rate": 3.4975e-05, + "loss": 0.4603, + "step": 6998 + }, + { + "epoch": 0.39192518759099565, + "grad_norm": 1.217512607574463, + "learning_rate": 3.498e-05, + "loss": 0.4167, + "step": 6999 + }, + { + "epoch": 0.39198118490312467, + "grad_norm": 1.1900042295455933, + "learning_rate": 3.4985e-05, + "loss": 0.453, + "step": 7000 + }, + { + "epoch": 0.3920371822152537, + "grad_norm": 1.1864182949066162, + "learning_rate": 3.499e-05, + "loss": 0.3501, + "step": 7001 + }, + { + "epoch": 0.3920931795273827, + "grad_norm": 1.2844698429107666, + "learning_rate": 3.4995e-05, + "loss": 0.4631, + "step": 7002 + }, + { + "epoch": 0.3921491768395117, + "grad_norm": 1.172251582145691, + "learning_rate": 3.5e-05, + "loss": 0.4115, + "step": 7003 + }, + { + "epoch": 0.39220517415164075, + "grad_norm": 1.2889574766159058, + "learning_rate": 3.5005e-05, + "loss": 0.4025, + "step": 7004 + }, + { + "epoch": 0.39226117146376976, + "grad_norm": 1.1393121480941772, + "learning_rate": 3.5010000000000005e-05, + "loss": 0.4025, + "step": 7005 + }, + { + "epoch": 0.3923171687758988, + "grad_norm": 1.583275556564331, + "learning_rate": 3.5015e-05, + "loss": 0.6657, + "step": 7006 + }, + { + "epoch": 0.39237316608802775, + "grad_norm": 1.3567055463790894, + "learning_rate": 3.502e-05, + "loss": 0.5812, + "step": 7007 + }, + { + "epoch": 0.39242916340015677, + "grad_norm": 1.3116973638534546, + "learning_rate": 3.5025000000000004e-05, + "loss": 0.527, + "step": 7008 + }, + { + "epoch": 0.3924851607122858, + "grad_norm": 1.4356811046600342, + "learning_rate": 3.503e-05, + "loss": 0.4215, + "step": 7009 + }, + { + "epoch": 0.3925411580244148, + "grad_norm": 1.354910969734192, + "learning_rate": 3.5035e-05, + "loss": 0.3817, + "step": 7010 + }, + { + "epoch": 0.3925971553365438, + "grad_norm": 1.2192323207855225, + "learning_rate": 3.504e-05, + "loss": 0.3654, + "step": 7011 + }, + { + "epoch": 0.39265315264867284, + "grad_norm": 1.248289942741394, + 
"learning_rate": 3.5045e-05, + "loss": 0.4381, + "step": 7012 + }, + { + "epoch": 0.39270914996080186, + "grad_norm": 1.6048190593719482, + "learning_rate": 3.505e-05, + "loss": 0.6216, + "step": 7013 + }, + { + "epoch": 0.3927651472729309, + "grad_norm": 1.28898024559021, + "learning_rate": 3.5055e-05, + "loss": 0.5206, + "step": 7014 + }, + { + "epoch": 0.3928211445850599, + "grad_norm": 1.372096300125122, + "learning_rate": 3.5060000000000007e-05, + "loss": 0.5168, + "step": 7015 + }, + { + "epoch": 0.3928771418971889, + "grad_norm": 1.2836592197418213, + "learning_rate": 3.5065000000000004e-05, + "loss": 0.3966, + "step": 7016 + }, + { + "epoch": 0.39293313920931794, + "grad_norm": 1.5730881690979004, + "learning_rate": 3.507e-05, + "loss": 0.4068, + "step": 7017 + }, + { + "epoch": 0.39298913652144696, + "grad_norm": 1.4202806949615479, + "learning_rate": 3.5075000000000006e-05, + "loss": 0.5073, + "step": 7018 + }, + { + "epoch": 0.393045133833576, + "grad_norm": 1.3695437908172607, + "learning_rate": 3.508e-05, + "loss": 0.4575, + "step": 7019 + }, + { + "epoch": 0.393101131145705, + "grad_norm": 1.159608006477356, + "learning_rate": 3.5085e-05, + "loss": 0.4209, + "step": 7020 + }, + { + "epoch": 0.393157128457834, + "grad_norm": 1.1785948276519775, + "learning_rate": 3.509e-05, + "loss": 0.3444, + "step": 7021 + }, + { + "epoch": 0.39321312576996303, + "grad_norm": 1.4187601804733276, + "learning_rate": 3.5095e-05, + "loss": 0.3834, + "step": 7022 + }, + { + "epoch": 0.39326912308209205, + "grad_norm": 1.3515288829803467, + "learning_rate": 3.51e-05, + "loss": 0.6947, + "step": 7023 + }, + { + "epoch": 0.3933251203942211, + "grad_norm": 1.3403400182724, + "learning_rate": 3.5105e-05, + "loss": 0.4919, + "step": 7024 + }, + { + "epoch": 0.3933811177063501, + "grad_norm": 1.4202855825424194, + "learning_rate": 3.511e-05, + "loss": 0.3962, + "step": 7025 + }, + { + "epoch": 0.3934371150184791, + "grad_norm": 1.132371187210083, + "learning_rate": 
3.5115000000000005e-05, + "loss": 0.3486, + "step": 7026 + }, + { + "epoch": 0.39349311233060813, + "grad_norm": 1.3173556327819824, + "learning_rate": 3.512e-05, + "loss": 0.4557, + "step": 7027 + }, + { + "epoch": 0.39354910964273715, + "grad_norm": 1.1719353199005127, + "learning_rate": 3.5125e-05, + "loss": 0.3156, + "step": 7028 + }, + { + "epoch": 0.39360510695486617, + "grad_norm": 1.105429768562317, + "learning_rate": 3.5130000000000004e-05, + "loss": 0.3959, + "step": 7029 + }, + { + "epoch": 0.3936611042669952, + "grad_norm": 1.223471999168396, + "learning_rate": 3.5135e-05, + "loss": 0.4871, + "step": 7030 + }, + { + "epoch": 0.3937171015791242, + "grad_norm": 1.2651889324188232, + "learning_rate": 3.514e-05, + "loss": 0.419, + "step": 7031 + }, + { + "epoch": 0.3937730988912532, + "grad_norm": 1.2285356521606445, + "learning_rate": 3.5145e-05, + "loss": 0.4938, + "step": 7032 + }, + { + "epoch": 0.39382909620338225, + "grad_norm": 1.3484148979187012, + "learning_rate": 3.515e-05, + "loss": 0.5286, + "step": 7033 + }, + { + "epoch": 0.39388509351551126, + "grad_norm": 1.1736124753952026, + "learning_rate": 3.5155e-05, + "loss": 0.3927, + "step": 7034 + }, + { + "epoch": 0.3939410908276403, + "grad_norm": 1.4963797330856323, + "learning_rate": 3.516e-05, + "loss": 0.5774, + "step": 7035 + }, + { + "epoch": 0.3939970881397693, + "grad_norm": 1.4173800945281982, + "learning_rate": 3.5165000000000006e-05, + "loss": 0.6018, + "step": 7036 + }, + { + "epoch": 0.3940530854518983, + "grad_norm": 1.1178412437438965, + "learning_rate": 3.5170000000000004e-05, + "loss": 0.5061, + "step": 7037 + }, + { + "epoch": 0.39410908276402734, + "grad_norm": 1.119616985321045, + "learning_rate": 3.5175e-05, + "loss": 0.3985, + "step": 7038 + }, + { + "epoch": 0.39416508007615636, + "grad_norm": 1.2967212200164795, + "learning_rate": 3.518e-05, + "loss": 0.4044, + "step": 7039 + }, + { + "epoch": 0.3942210773882854, + "grad_norm": 1.1283607482910156, + "learning_rate": 
3.5185e-05, + "loss": 0.3502, + "step": 7040 + }, + { + "epoch": 0.3942770747004144, + "grad_norm": 1.2243571281433105, + "learning_rate": 3.519e-05, + "loss": 0.474, + "step": 7041 + }, + { + "epoch": 0.3943330720125434, + "grad_norm": 1.5098562240600586, + "learning_rate": 3.5195e-05, + "loss": 0.4933, + "step": 7042 + }, + { + "epoch": 0.39438906932467244, + "grad_norm": 1.3509429693222046, + "learning_rate": 3.52e-05, + "loss": 0.4049, + "step": 7043 + }, + { + "epoch": 0.39444506663680146, + "grad_norm": 1.1621174812316895, + "learning_rate": 3.5205e-05, + "loss": 0.374, + "step": 7044 + }, + { + "epoch": 0.3945010639489305, + "grad_norm": 1.142918586730957, + "learning_rate": 3.5210000000000003e-05, + "loss": 0.3954, + "step": 7045 + }, + { + "epoch": 0.3945570612610595, + "grad_norm": 1.1373687982559204, + "learning_rate": 3.5215e-05, + "loss": 0.325, + "step": 7046 + }, + { + "epoch": 0.3946130585731885, + "grad_norm": 1.0039230585098267, + "learning_rate": 3.5220000000000005e-05, + "loss": 0.408, + "step": 7047 + }, + { + "epoch": 0.39466905588531753, + "grad_norm": 1.5211772918701172, + "learning_rate": 3.5225e-05, + "loss": 0.5512, + "step": 7048 + }, + { + "epoch": 0.3947250531974465, + "grad_norm": 1.3901233673095703, + "learning_rate": 3.523e-05, + "loss": 0.571, + "step": 7049 + }, + { + "epoch": 0.3947810505095755, + "grad_norm": 2.409402370452881, + "learning_rate": 3.5235000000000004e-05, + "loss": 0.3791, + "step": 7050 + }, + { + "epoch": 0.39483704782170453, + "grad_norm": 1.1406456232070923, + "learning_rate": 3.524e-05, + "loss": 0.4096, + "step": 7051 + }, + { + "epoch": 0.39489304513383355, + "grad_norm": 1.1457037925720215, + "learning_rate": 3.5245e-05, + "loss": 0.376, + "step": 7052 + }, + { + "epoch": 0.3949490424459626, + "grad_norm": 1.4743554592132568, + "learning_rate": 3.525e-05, + "loss": 0.5204, + "step": 7053 + }, + { + "epoch": 0.3950050397580916, + "grad_norm": 1.1844478845596313, + "learning_rate": 3.5255e-05, + "loss": 
0.4048, + "step": 7054 + }, + { + "epoch": 0.3950610370702206, + "grad_norm": 1.2880264520645142, + "learning_rate": 3.5260000000000005e-05, + "loss": 0.4198, + "step": 7055 + }, + { + "epoch": 0.39511703438234963, + "grad_norm": 1.3818327188491821, + "learning_rate": 3.5265e-05, + "loss": 0.5185, + "step": 7056 + }, + { + "epoch": 0.39517303169447865, + "grad_norm": 1.1847329139709473, + "learning_rate": 3.5270000000000006e-05, + "loss": 0.5541, + "step": 7057 + }, + { + "epoch": 0.39522902900660767, + "grad_norm": 1.0822230577468872, + "learning_rate": 3.5275000000000004e-05, + "loss": 0.3495, + "step": 7058 + }, + { + "epoch": 0.3952850263187367, + "grad_norm": 5.522929668426514, + "learning_rate": 3.528e-05, + "loss": 0.3949, + "step": 7059 + }, + { + "epoch": 0.3953410236308657, + "grad_norm": 1.1144723892211914, + "learning_rate": 3.5285e-05, + "loss": 0.3869, + "step": 7060 + }, + { + "epoch": 0.3953970209429947, + "grad_norm": 1.2210427522659302, + "learning_rate": 3.529e-05, + "loss": 0.3968, + "step": 7061 + }, + { + "epoch": 0.39545301825512374, + "grad_norm": 1.253044605255127, + "learning_rate": 3.5295e-05, + "loss": 0.396, + "step": 7062 + }, + { + "epoch": 0.39550901556725276, + "grad_norm": 1.2485785484313965, + "learning_rate": 3.53e-05, + "loss": 0.3947, + "step": 7063 + }, + { + "epoch": 0.3955650128793818, + "grad_norm": 1.1033822298049927, + "learning_rate": 3.5305e-05, + "loss": 0.3477, + "step": 7064 + }, + { + "epoch": 0.3956210101915108, + "grad_norm": 1.2442518472671509, + "learning_rate": 3.5310000000000006e-05, + "loss": 0.5305, + "step": 7065 + }, + { + "epoch": 0.3956770075036398, + "grad_norm": 1.0721766948699951, + "learning_rate": 3.5315e-05, + "loss": 0.3685, + "step": 7066 + }, + { + "epoch": 0.39573300481576884, + "grad_norm": 1.1637009382247925, + "learning_rate": 3.532e-05, + "loss": 0.5006, + "step": 7067 + }, + { + "epoch": 0.39578900212789786, + "grad_norm": 1.4650561809539795, + "learning_rate": 3.5325000000000005e-05, + 
"loss": 0.4508, + "step": 7068 + }, + { + "epoch": 0.3958449994400269, + "grad_norm": 1.0735374689102173, + "learning_rate": 3.533e-05, + "loss": 0.4061, + "step": 7069 + }, + { + "epoch": 0.3959009967521559, + "grad_norm": 1.3330549001693726, + "learning_rate": 3.5335e-05, + "loss": 0.4362, + "step": 7070 + }, + { + "epoch": 0.3959569940642849, + "grad_norm": 1.250771403312683, + "learning_rate": 3.5340000000000004e-05, + "loss": 0.5554, + "step": 7071 + }, + { + "epoch": 0.39601299137641394, + "grad_norm": 1.3783133029937744, + "learning_rate": 3.5345e-05, + "loss": 0.51, + "step": 7072 + }, + { + "epoch": 0.39606898868854296, + "grad_norm": 1.0835919380187988, + "learning_rate": 3.535e-05, + "loss": 0.445, + "step": 7073 + }, + { + "epoch": 0.396124986000672, + "grad_norm": 1.3178541660308838, + "learning_rate": 3.5354999999999996e-05, + "loss": 0.4205, + "step": 7074 + }, + { + "epoch": 0.396180983312801, + "grad_norm": 1.3751288652420044, + "learning_rate": 3.536000000000001e-05, + "loss": 0.4258, + "step": 7075 + }, + { + "epoch": 0.39623698062493, + "grad_norm": 1.0545991659164429, + "learning_rate": 3.5365000000000004e-05, + "loss": 0.3546, + "step": 7076 + }, + { + "epoch": 0.39629297793705903, + "grad_norm": 1.3124701976776123, + "learning_rate": 3.537e-05, + "loss": 0.3171, + "step": 7077 + }, + { + "epoch": 0.39634897524918805, + "grad_norm": 1.2867190837860107, + "learning_rate": 3.5375e-05, + "loss": 0.4597, + "step": 7078 + }, + { + "epoch": 0.39640497256131707, + "grad_norm": 1.1991990804672241, + "learning_rate": 3.5380000000000003e-05, + "loss": 0.4953, + "step": 7079 + }, + { + "epoch": 0.3964609698734461, + "grad_norm": 1.238351821899414, + "learning_rate": 3.5385e-05, + "loss": 0.4385, + "step": 7080 + }, + { + "epoch": 0.3965169671855751, + "grad_norm": 1.2495702505111694, + "learning_rate": 3.539e-05, + "loss": 0.4106, + "step": 7081 + }, + { + "epoch": 0.39657296449770413, + "grad_norm": 3.731426477432251, + "learning_rate": 3.5395e-05, + 
"loss": 0.4307, + "step": 7082 + }, + { + "epoch": 0.39662896180983315, + "grad_norm": 1.1573306322097778, + "learning_rate": 3.54e-05, + "loss": 0.3633, + "step": 7083 + }, + { + "epoch": 0.39668495912196217, + "grad_norm": 1.0320273637771606, + "learning_rate": 3.5405e-05, + "loss": 0.3824, + "step": 7084 + }, + { + "epoch": 0.3967409564340912, + "grad_norm": 1.2094013690948486, + "learning_rate": 3.541e-05, + "loss": 0.3714, + "step": 7085 + }, + { + "epoch": 0.3967969537462202, + "grad_norm": 1.1214096546173096, + "learning_rate": 3.5415000000000006e-05, + "loss": 0.4957, + "step": 7086 + }, + { + "epoch": 0.3968529510583492, + "grad_norm": 1.2384463548660278, + "learning_rate": 3.542e-05, + "loss": 0.4128, + "step": 7087 + }, + { + "epoch": 0.39690894837047824, + "grad_norm": 1.5187432765960693, + "learning_rate": 3.5425e-05, + "loss": 0.5801, + "step": 7088 + }, + { + "epoch": 0.39696494568260726, + "grad_norm": 1.1081868410110474, + "learning_rate": 3.5430000000000005e-05, + "loss": 0.4494, + "step": 7089 + }, + { + "epoch": 0.3970209429947362, + "grad_norm": 1.179426670074463, + "learning_rate": 3.5435e-05, + "loss": 0.4237, + "step": 7090 + }, + { + "epoch": 0.39707694030686524, + "grad_norm": 1.200624942779541, + "learning_rate": 3.544e-05, + "loss": 0.4315, + "step": 7091 + }, + { + "epoch": 0.39713293761899426, + "grad_norm": 1.3659522533416748, + "learning_rate": 3.5445000000000004e-05, + "loss": 0.5273, + "step": 7092 + }, + { + "epoch": 0.3971889349311233, + "grad_norm": 1.3462220430374146, + "learning_rate": 3.545e-05, + "loss": 0.3893, + "step": 7093 + }, + { + "epoch": 0.3972449322432523, + "grad_norm": 1.2855170965194702, + "learning_rate": 3.5455e-05, + "loss": 0.3949, + "step": 7094 + }, + { + "epoch": 0.3973009295553813, + "grad_norm": 1.4168190956115723, + "learning_rate": 3.546e-05, + "loss": 0.4308, + "step": 7095 + }, + { + "epoch": 0.39735692686751034, + "grad_norm": 1.4847972393035889, + "learning_rate": 3.546500000000001e-05, + "loss": 
0.4734, + "step": 7096 + }, + { + "epoch": 0.39741292417963936, + "grad_norm": 1.2195695638656616, + "learning_rate": 3.5470000000000004e-05, + "loss": 0.3766, + "step": 7097 + }, + { + "epoch": 0.3974689214917684, + "grad_norm": 1.1849185228347778, + "learning_rate": 3.5475e-05, + "loss": 0.3926, + "step": 7098 + }, + { + "epoch": 0.3975249188038974, + "grad_norm": 1.1611347198486328, + "learning_rate": 3.548e-05, + "loss": 0.3221, + "step": 7099 + }, + { + "epoch": 0.3975809161160264, + "grad_norm": 1.236920952796936, + "learning_rate": 3.5485e-05, + "loss": 0.5203, + "step": 7100 + }, + { + "epoch": 0.39763691342815544, + "grad_norm": 1.2539856433868408, + "learning_rate": 3.549e-05, + "loss": 0.4427, + "step": 7101 + }, + { + "epoch": 0.39769291074028446, + "grad_norm": 1.3313424587249756, + "learning_rate": 3.5495e-05, + "loss": 0.517, + "step": 7102 + }, + { + "epoch": 0.3977489080524135, + "grad_norm": 1.177718162536621, + "learning_rate": 3.55e-05, + "loss": 0.5253, + "step": 7103 + }, + { + "epoch": 0.3978049053645425, + "grad_norm": 1.362850546836853, + "learning_rate": 3.5505e-05, + "loss": 0.421, + "step": 7104 + }, + { + "epoch": 0.3978609026766715, + "grad_norm": 1.25775146484375, + "learning_rate": 3.5510000000000004e-05, + "loss": 0.5547, + "step": 7105 + }, + { + "epoch": 0.39791689998880053, + "grad_norm": 1.2604835033416748, + "learning_rate": 3.5515e-05, + "loss": 0.4058, + "step": 7106 + }, + { + "epoch": 0.39797289730092955, + "grad_norm": 1.0059744119644165, + "learning_rate": 3.5520000000000006e-05, + "loss": 0.3841, + "step": 7107 + }, + { + "epoch": 0.39802889461305857, + "grad_norm": 1.1106350421905518, + "learning_rate": 3.5525e-05, + "loss": 0.3949, + "step": 7108 + }, + { + "epoch": 0.3980848919251876, + "grad_norm": 1.8010390996932983, + "learning_rate": 3.553e-05, + "loss": 0.7235, + "step": 7109 + }, + { + "epoch": 0.3981408892373166, + "grad_norm": 1.3277437686920166, + "learning_rate": 3.5535000000000005e-05, + "loss": 0.4169, + 
"step": 7110 + }, + { + "epoch": 0.39819688654944563, + "grad_norm": 1.2066757678985596, + "learning_rate": 3.554e-05, + "loss": 0.4722, + "step": 7111 + }, + { + "epoch": 0.39825288386157465, + "grad_norm": 1.8611897230148315, + "learning_rate": 3.5545e-05, + "loss": 0.452, + "step": 7112 + }, + { + "epoch": 0.39830888117370367, + "grad_norm": 1.1620473861694336, + "learning_rate": 3.555e-05, + "loss": 0.498, + "step": 7113 + }, + { + "epoch": 0.3983648784858327, + "grad_norm": 1.1954268217086792, + "learning_rate": 3.5555e-05, + "loss": 0.4771, + "step": 7114 + }, + { + "epoch": 0.3984208757979617, + "grad_norm": 1.1766629219055176, + "learning_rate": 3.5560000000000005e-05, + "loss": 0.3488, + "step": 7115 + }, + { + "epoch": 0.3984768731100907, + "grad_norm": 1.1939324140548706, + "learning_rate": 3.5565e-05, + "loss": 0.4501, + "step": 7116 + }, + { + "epoch": 0.39853287042221974, + "grad_norm": 0.9701012372970581, + "learning_rate": 3.557e-05, + "loss": 0.389, + "step": 7117 + }, + { + "epoch": 0.39858886773434876, + "grad_norm": 1.6646636724472046, + "learning_rate": 3.5575000000000004e-05, + "loss": 0.5372, + "step": 7118 + }, + { + "epoch": 0.3986448650464778, + "grad_norm": 1.4646655321121216, + "learning_rate": 3.558e-05, + "loss": 0.3593, + "step": 7119 + }, + { + "epoch": 0.3987008623586068, + "grad_norm": 1.3005801439285278, + "learning_rate": 3.5585e-05, + "loss": 0.5581, + "step": 7120 + }, + { + "epoch": 0.3987568596707358, + "grad_norm": 1.3204537630081177, + "learning_rate": 3.559e-05, + "loss": 0.3271, + "step": 7121 + }, + { + "epoch": 0.39881285698286484, + "grad_norm": 1.1972370147705078, + "learning_rate": 3.5595e-05, + "loss": 0.6395, + "step": 7122 + }, + { + "epoch": 0.39886885429499386, + "grad_norm": 1.33364999294281, + "learning_rate": 3.56e-05, + "loss": 0.4278, + "step": 7123 + }, + { + "epoch": 0.3989248516071229, + "grad_norm": 1.370723009109497, + "learning_rate": 3.5605e-05, + "loss": 0.4126, + "step": 7124 + }, + { + "epoch": 
0.3989808489192519, + "grad_norm": 1.1895666122436523, + "learning_rate": 3.5610000000000006e-05, + "loss": 0.584, + "step": 7125 + }, + { + "epoch": 0.3990368462313809, + "grad_norm": 1.1378017663955688, + "learning_rate": 3.5615000000000004e-05, + "loss": 0.3344, + "step": 7126 + }, + { + "epoch": 0.39909284354350993, + "grad_norm": 1.3302061557769775, + "learning_rate": 3.562e-05, + "loss": 0.3386, + "step": 7127 + }, + { + "epoch": 0.39914884085563895, + "grad_norm": 1.2411571741104126, + "learning_rate": 3.5625000000000005e-05, + "loss": 0.4365, + "step": 7128 + }, + { + "epoch": 0.399204838167768, + "grad_norm": 1.3164284229278564, + "learning_rate": 3.563e-05, + "loss": 0.4375, + "step": 7129 + }, + { + "epoch": 0.399260835479897, + "grad_norm": 1.1458563804626465, + "learning_rate": 3.5635e-05, + "loss": 0.3704, + "step": 7130 + }, + { + "epoch": 0.39931683279202596, + "grad_norm": 1.1261699199676514, + "learning_rate": 3.5640000000000004e-05, + "loss": 0.3955, + "step": 7131 + }, + { + "epoch": 0.399372830104155, + "grad_norm": 1.163615345954895, + "learning_rate": 3.5645e-05, + "loss": 0.3663, + "step": 7132 + }, + { + "epoch": 0.399428827416284, + "grad_norm": 1.8783775568008423, + "learning_rate": 3.565e-05, + "loss": 0.5242, + "step": 7133 + }, + { + "epoch": 0.399484824728413, + "grad_norm": 1.1717891693115234, + "learning_rate": 3.5654999999999997e-05, + "loss": 0.4164, + "step": 7134 + }, + { + "epoch": 0.39954082204054203, + "grad_norm": 1.0024054050445557, + "learning_rate": 3.566e-05, + "loss": 0.4352, + "step": 7135 + }, + { + "epoch": 0.39959681935267105, + "grad_norm": 1.2847247123718262, + "learning_rate": 3.5665000000000005e-05, + "loss": 0.4291, + "step": 7136 + }, + { + "epoch": 0.39965281666480007, + "grad_norm": 1.2100944519042969, + "learning_rate": 3.567e-05, + "loss": 0.4851, + "step": 7137 + }, + { + "epoch": 0.3997088139769291, + "grad_norm": 1.32924222946167, + "learning_rate": 3.5675e-05, + "loss": 0.5232, + "step": 7138 + }, + { 
+ "epoch": 0.3997648112890581, + "grad_norm": 1.2437349557876587, + "learning_rate": 3.5680000000000004e-05, + "loss": 0.4545, + "step": 7139 + }, + { + "epoch": 0.3998208086011871, + "grad_norm": 1.652258038520813, + "learning_rate": 3.5685e-05, + "loss": 0.6215, + "step": 7140 + }, + { + "epoch": 0.39987680591331615, + "grad_norm": 2.5210049152374268, + "learning_rate": 3.569e-05, + "loss": 0.5923, + "step": 7141 + }, + { + "epoch": 0.39993280322544517, + "grad_norm": 1.4418772459030151, + "learning_rate": 3.5695e-05, + "loss": 0.4272, + "step": 7142 + }, + { + "epoch": 0.3999888005375742, + "grad_norm": 1.1753559112548828, + "learning_rate": 3.57e-05, + "loss": 0.4517, + "step": 7143 + }, + { + "epoch": 0.4000447978497032, + "grad_norm": 1.2177468538284302, + "learning_rate": 3.5705e-05, + "loss": 0.3785, + "step": 7144 + }, + { + "epoch": 0.4001007951618322, + "grad_norm": 1.270458698272705, + "learning_rate": 3.571e-05, + "loss": 0.5715, + "step": 7145 + }, + { + "epoch": 0.40015679247396124, + "grad_norm": 1.3269884586334229, + "learning_rate": 3.5715000000000006e-05, + "loss": 0.402, + "step": 7146 + }, + { + "epoch": 0.40021278978609026, + "grad_norm": 1.286996603012085, + "learning_rate": 3.5720000000000004e-05, + "loss": 0.4246, + "step": 7147 + }, + { + "epoch": 0.4002687870982193, + "grad_norm": 1.4019207954406738, + "learning_rate": 3.5725e-05, + "loss": 0.3634, + "step": 7148 + }, + { + "epoch": 0.4003247844103483, + "grad_norm": 1.0722886323928833, + "learning_rate": 3.5730000000000005e-05, + "loss": 0.3802, + "step": 7149 + }, + { + "epoch": 0.4003807817224773, + "grad_norm": 1.2400652170181274, + "learning_rate": 3.5735e-05, + "loss": 0.3321, + "step": 7150 + }, + { + "epoch": 0.40043677903460634, + "grad_norm": 1.1441702842712402, + "learning_rate": 3.574e-05, + "loss": 0.4854, + "step": 7151 + }, + { + "epoch": 0.40049277634673536, + "grad_norm": 1.040071964263916, + "learning_rate": 3.5745e-05, + "loss": 0.3512, + "step": 7152 + }, + { + 
"epoch": 0.4005487736588644, + "grad_norm": 1.3732068538665771, + "learning_rate": 3.575e-05, + "loss": 0.3955, + "step": 7153 + }, + { + "epoch": 0.4006047709709934, + "grad_norm": 1.2217940092086792, + "learning_rate": 3.5755e-05, + "loss": 0.5224, + "step": 7154 + }, + { + "epoch": 0.4006607682831224, + "grad_norm": 1.4754819869995117, + "learning_rate": 3.5759999999999996e-05, + "loss": 0.5832, + "step": 7155 + }, + { + "epoch": 0.40071676559525143, + "grad_norm": 1.1872299909591675, + "learning_rate": 3.576500000000001e-05, + "loss": 0.4427, + "step": 7156 + }, + { + "epoch": 0.40077276290738045, + "grad_norm": 1.497488021850586, + "learning_rate": 3.5770000000000005e-05, + "loss": 0.4887, + "step": 7157 + }, + { + "epoch": 0.4008287602195095, + "grad_norm": 1.1495366096496582, + "learning_rate": 3.5775e-05, + "loss": 0.4121, + "step": 7158 + }, + { + "epoch": 0.4008847575316385, + "grad_norm": 1.237281084060669, + "learning_rate": 3.578e-05, + "loss": 0.4101, + "step": 7159 + }, + { + "epoch": 0.4009407548437675, + "grad_norm": 1.3852341175079346, + "learning_rate": 3.5785000000000004e-05, + "loss": 0.4432, + "step": 7160 + }, + { + "epoch": 0.40099675215589653, + "grad_norm": 1.3871299028396606, + "learning_rate": 3.579e-05, + "loss": 0.4908, + "step": 7161 + }, + { + "epoch": 0.40105274946802555, + "grad_norm": 1.0768440961837769, + "learning_rate": 3.5795e-05, + "loss": 0.3331, + "step": 7162 + }, + { + "epoch": 0.40110874678015457, + "grad_norm": 1.0352228879928589, + "learning_rate": 3.58e-05, + "loss": 0.4007, + "step": 7163 + }, + { + "epoch": 0.4011647440922836, + "grad_norm": 1.4716922044754028, + "learning_rate": 3.5805e-05, + "loss": 0.5157, + "step": 7164 + }, + { + "epoch": 0.4012207414044126, + "grad_norm": 1.0265246629714966, + "learning_rate": 3.581e-05, + "loss": 0.3717, + "step": 7165 + }, + { + "epoch": 0.4012767387165416, + "grad_norm": 1.1104140281677246, + "learning_rate": 3.5815e-05, + "loss": 0.3902, + "step": 7166 + }, + { + "epoch": 
0.40133273602867064, + "grad_norm": 1.0913019180297852, + "learning_rate": 3.5820000000000006e-05, + "loss": 0.4035, + "step": 7167 + }, + { + "epoch": 0.40138873334079966, + "grad_norm": 1.4303982257843018, + "learning_rate": 3.5825000000000003e-05, + "loss": 0.4168, + "step": 7168 + }, + { + "epoch": 0.4014447306529287, + "grad_norm": 3.429379463195801, + "learning_rate": 3.583e-05, + "loss": 0.7091, + "step": 7169 + }, + { + "epoch": 0.4015007279650577, + "grad_norm": 1.3319716453552246, + "learning_rate": 3.5835000000000005e-05, + "loss": 0.421, + "step": 7170 + }, + { + "epoch": 0.4015567252771867, + "grad_norm": 1.5306086540222168, + "learning_rate": 3.584e-05, + "loss": 0.4384, + "step": 7171 + }, + { + "epoch": 0.40161272258931574, + "grad_norm": 1.2060859203338623, + "learning_rate": 3.5845e-05, + "loss": 0.4859, + "step": 7172 + }, + { + "epoch": 0.4016687199014447, + "grad_norm": 1.3723657131195068, + "learning_rate": 3.585e-05, + "loss": 0.3922, + "step": 7173 + }, + { + "epoch": 0.4017247172135737, + "grad_norm": 1.451207160949707, + "learning_rate": 3.5855e-05, + "loss": 0.468, + "step": 7174 + }, + { + "epoch": 0.40178071452570274, + "grad_norm": 1.2338693141937256, + "learning_rate": 3.586e-05, + "loss": 0.5271, + "step": 7175 + }, + { + "epoch": 0.40183671183783176, + "grad_norm": 1.2146055698394775, + "learning_rate": 3.5865e-05, + "loss": 0.4202, + "step": 7176 + }, + { + "epoch": 0.4018927091499608, + "grad_norm": 1.243673324584961, + "learning_rate": 3.587e-05, + "loss": 0.4702, + "step": 7177 + }, + { + "epoch": 0.4019487064620898, + "grad_norm": 1.3768872022628784, + "learning_rate": 3.5875000000000005e-05, + "loss": 0.4021, + "step": 7178 + }, + { + "epoch": 0.4020047037742188, + "grad_norm": 1.3724054098129272, + "learning_rate": 3.588e-05, + "loss": 0.4104, + "step": 7179 + }, + { + "epoch": 0.40206070108634784, + "grad_norm": 1.1506850719451904, + "learning_rate": 3.5885e-05, + "loss": 0.344, + "step": 7180 + }, + { + "epoch": 
0.40211669839847686, + "grad_norm": 1.2319324016571045, + "learning_rate": 3.5890000000000004e-05, + "loss": 0.4383, + "step": 7181 + }, + { + "epoch": 0.4021726957106059, + "grad_norm": 1.4333292245864868, + "learning_rate": 3.5895e-05, + "loss": 0.5512, + "step": 7182 + }, + { + "epoch": 0.4022286930227349, + "grad_norm": 1.2146166563034058, + "learning_rate": 3.59e-05, + "loss": 0.5041, + "step": 7183 + }, + { + "epoch": 0.4022846903348639, + "grad_norm": 1.3595854043960571, + "learning_rate": 3.5905e-05, + "loss": 0.4679, + "step": 7184 + }, + { + "epoch": 0.40234068764699293, + "grad_norm": 1.2343111038208008, + "learning_rate": 3.591e-05, + "loss": 0.3349, + "step": 7185 + }, + { + "epoch": 0.40239668495912195, + "grad_norm": 1.4902734756469727, + "learning_rate": 3.5915000000000004e-05, + "loss": 0.6286, + "step": 7186 + }, + { + "epoch": 0.40245268227125097, + "grad_norm": 1.4743362665176392, + "learning_rate": 3.592e-05, + "loss": 0.4208, + "step": 7187 + }, + { + "epoch": 0.40250867958338, + "grad_norm": 1.2193795442581177, + "learning_rate": 3.5925000000000006e-05, + "loss": 0.5368, + "step": 7188 + }, + { + "epoch": 0.402564676895509, + "grad_norm": 1.2408791780471802, + "learning_rate": 3.593e-05, + "loss": 0.4042, + "step": 7189 + }, + { + "epoch": 0.40262067420763803, + "grad_norm": 1.4323499202728271, + "learning_rate": 3.5935e-05, + "loss": 0.4446, + "step": 7190 + }, + { + "epoch": 0.40267667151976705, + "grad_norm": 1.3486984968185425, + "learning_rate": 3.594e-05, + "loss": 0.4092, + "step": 7191 + }, + { + "epoch": 0.40273266883189607, + "grad_norm": 1.1780016422271729, + "learning_rate": 3.5945e-05, + "loss": 0.5441, + "step": 7192 + }, + { + "epoch": 0.4027886661440251, + "grad_norm": 1.5800676345825195, + "learning_rate": 3.595e-05, + "loss": 0.7775, + "step": 7193 + }, + { + "epoch": 0.4028446634561541, + "grad_norm": 1.375052809715271, + "learning_rate": 3.5955e-05, + "loss": 0.4215, + "step": 7194 + }, + { + "epoch": 0.4029006607682831, + 
"grad_norm": 1.192807674407959, + "learning_rate": 3.596e-05, + "loss": 0.4601, + "step": 7195 + }, + { + "epoch": 0.40295665808041214, + "grad_norm": 1.272210717201233, + "learning_rate": 3.5965000000000005e-05, + "loss": 0.4006, + "step": 7196 + }, + { + "epoch": 0.40301265539254116, + "grad_norm": 1.3375262022018433, + "learning_rate": 3.597e-05, + "loss": 0.5038, + "step": 7197 + }, + { + "epoch": 0.4030686527046702, + "grad_norm": 1.3216993808746338, + "learning_rate": 3.5975e-05, + "loss": 0.4402, + "step": 7198 + }, + { + "epoch": 0.4031246500167992, + "grad_norm": 1.2181096076965332, + "learning_rate": 3.5980000000000004e-05, + "loss": 0.355, + "step": 7199 + }, + { + "epoch": 0.4031806473289282, + "grad_norm": 1.183577060699463, + "learning_rate": 3.5985e-05, + "loss": 0.471, + "step": 7200 + }, + { + "epoch": 0.40323664464105724, + "grad_norm": 1.8316634893417358, + "learning_rate": 3.599e-05, + "loss": 0.4463, + "step": 7201 + }, + { + "epoch": 0.40329264195318626, + "grad_norm": 1.3434016704559326, + "learning_rate": 3.5995000000000004e-05, + "loss": 0.5099, + "step": 7202 + }, + { + "epoch": 0.4033486392653153, + "grad_norm": 1.1872845888137817, + "learning_rate": 3.6e-05, + "loss": 0.3647, + "step": 7203 + }, + { + "epoch": 0.4034046365774443, + "grad_norm": 1.775923728942871, + "learning_rate": 3.6005e-05, + "loss": 0.4471, + "step": 7204 + }, + { + "epoch": 0.4034606338895733, + "grad_norm": 1.5498169660568237, + "learning_rate": 3.601e-05, + "loss": 0.4645, + "step": 7205 + }, + { + "epoch": 0.40351663120170234, + "grad_norm": 1.1243666410446167, + "learning_rate": 3.601500000000001e-05, + "loss": 0.4595, + "step": 7206 + }, + { + "epoch": 0.40357262851383136, + "grad_norm": 1.136042833328247, + "learning_rate": 3.6020000000000004e-05, + "loss": 0.5057, + "step": 7207 + }, + { + "epoch": 0.4036286258259604, + "grad_norm": 1.2992314100265503, + "learning_rate": 3.6025e-05, + "loss": 0.4082, + "step": 7208 + }, + { + "epoch": 0.4036846231380894, + 
"grad_norm": 1.7175463438034058, + "learning_rate": 3.6030000000000006e-05, + "loss": 0.4009, + "step": 7209 + }, + { + "epoch": 0.4037406204502184, + "grad_norm": 1.1723322868347168, + "learning_rate": 3.6035e-05, + "loss": 0.3874, + "step": 7210 + }, + { + "epoch": 0.40379661776234743, + "grad_norm": 1.2163728475570679, + "learning_rate": 3.604e-05, + "loss": 0.4348, + "step": 7211 + }, + { + "epoch": 0.40385261507447645, + "grad_norm": 0.9793835282325745, + "learning_rate": 3.6045e-05, + "loss": 0.3209, + "step": 7212 + }, + { + "epoch": 0.40390861238660547, + "grad_norm": 1.1643357276916504, + "learning_rate": 3.605e-05, + "loss": 0.4987, + "step": 7213 + }, + { + "epoch": 0.40396460969873443, + "grad_norm": 1.122285008430481, + "learning_rate": 3.6055e-05, + "loss": 0.4352, + "step": 7214 + }, + { + "epoch": 0.40402060701086345, + "grad_norm": 1.0734235048294067, + "learning_rate": 3.606e-05, + "loss": 0.5808, + "step": 7215 + }, + { + "epoch": 0.40407660432299247, + "grad_norm": 1.249040961265564, + "learning_rate": 3.6065e-05, + "loss": 0.6445, + "step": 7216 + }, + { + "epoch": 0.4041326016351215, + "grad_norm": 1.110620379447937, + "learning_rate": 3.6070000000000005e-05, + "loss": 0.3976, + "step": 7217 + }, + { + "epoch": 0.4041885989472505, + "grad_norm": 1.151766300201416, + "learning_rate": 3.6075e-05, + "loss": 0.35, + "step": 7218 + }, + { + "epoch": 0.40424459625937953, + "grad_norm": 1.2607028484344482, + "learning_rate": 3.608e-05, + "loss": 0.4196, + "step": 7219 + }, + { + "epoch": 0.40430059357150855, + "grad_norm": 1.0967851877212524, + "learning_rate": 3.6085000000000004e-05, + "loss": 0.4373, + "step": 7220 + }, + { + "epoch": 0.40435659088363757, + "grad_norm": 1.0691070556640625, + "learning_rate": 3.609e-05, + "loss": 0.3334, + "step": 7221 + }, + { + "epoch": 0.4044125881957666, + "grad_norm": 1.171049952507019, + "learning_rate": 3.6095e-05, + "loss": 0.5588, + "step": 7222 + }, + { + "epoch": 0.4044685855078956, + "grad_norm": 
1.1470245122909546, + "learning_rate": 3.61e-05, + "loss": 0.4066, + "step": 7223 + }, + { + "epoch": 0.4045245828200246, + "grad_norm": 1.4739209413528442, + "learning_rate": 3.6105e-05, + "loss": 0.5064, + "step": 7224 + }, + { + "epoch": 0.40458058013215364, + "grad_norm": 1.940086007118225, + "learning_rate": 3.611e-05, + "loss": 0.4543, + "step": 7225 + }, + { + "epoch": 0.40463657744428266, + "grad_norm": 1.4099403619766235, + "learning_rate": 3.6115e-05, + "loss": 0.4356, + "step": 7226 + }, + { + "epoch": 0.4046925747564117, + "grad_norm": 1.4548096656799316, + "learning_rate": 3.6120000000000007e-05, + "loss": 0.5859, + "step": 7227 + }, + { + "epoch": 0.4047485720685407, + "grad_norm": 1.3317774534225464, + "learning_rate": 3.6125000000000004e-05, + "loss": 0.4717, + "step": 7228 + }, + { + "epoch": 0.4048045693806697, + "grad_norm": 1.229290246963501, + "learning_rate": 3.613e-05, + "loss": 0.4747, + "step": 7229 + }, + { + "epoch": 0.40486056669279874, + "grad_norm": 1.174689769744873, + "learning_rate": 3.6135000000000006e-05, + "loss": 0.483, + "step": 7230 + }, + { + "epoch": 0.40491656400492776, + "grad_norm": 1.3518184423446655, + "learning_rate": 3.614e-05, + "loss": 0.4043, + "step": 7231 + }, + { + "epoch": 0.4049725613170568, + "grad_norm": 1.2385170459747314, + "learning_rate": 3.6145e-05, + "loss": 0.4015, + "step": 7232 + }, + { + "epoch": 0.4050285586291858, + "grad_norm": 1.234592080116272, + "learning_rate": 3.615e-05, + "loss": 0.4218, + "step": 7233 + }, + { + "epoch": 0.4050845559413148, + "grad_norm": 1.2679672241210938, + "learning_rate": 3.6155e-05, + "loss": 0.4764, + "step": 7234 + }, + { + "epoch": 0.40514055325344384, + "grad_norm": 1.4722155332565308, + "learning_rate": 3.616e-05, + "loss": 0.4824, + "step": 7235 + }, + { + "epoch": 0.40519655056557285, + "grad_norm": 0.9777321219444275, + "learning_rate": 3.6165000000000004e-05, + "loss": 0.3926, + "step": 7236 + }, + { + "epoch": 0.4052525478777019, + "grad_norm": 
1.7210427522659302, + "learning_rate": 3.617e-05, + "loss": 0.5682, + "step": 7237 + }, + { + "epoch": 0.4053085451898309, + "grad_norm": 1.0800691843032837, + "learning_rate": 3.6175000000000005e-05, + "loss": 0.4105, + "step": 7238 + }, + { + "epoch": 0.4053645425019599, + "grad_norm": 1.1063060760498047, + "learning_rate": 3.618e-05, + "loss": 0.392, + "step": 7239 + }, + { + "epoch": 0.40542053981408893, + "grad_norm": 1.069801688194275, + "learning_rate": 3.6185e-05, + "loss": 0.3287, + "step": 7240 + }, + { + "epoch": 0.40547653712621795, + "grad_norm": 1.3083385229110718, + "learning_rate": 3.6190000000000004e-05, + "loss": 0.495, + "step": 7241 + }, + { + "epoch": 0.40553253443834697, + "grad_norm": 1.2802073955535889, + "learning_rate": 3.6195e-05, + "loss": 0.4546, + "step": 7242 + }, + { + "epoch": 0.405588531750476, + "grad_norm": 1.2145832777023315, + "learning_rate": 3.62e-05, + "loss": 0.4236, + "step": 7243 + }, + { + "epoch": 0.405644529062605, + "grad_norm": 1.1301569938659668, + "learning_rate": 3.6205e-05, + "loss": 0.4901, + "step": 7244 + }, + { + "epoch": 0.405700526374734, + "grad_norm": 1.6309202909469604, + "learning_rate": 3.621e-05, + "loss": 0.6519, + "step": 7245 + }, + { + "epoch": 0.40575652368686305, + "grad_norm": 1.1026517152786255, + "learning_rate": 3.6215000000000005e-05, + "loss": 0.4012, + "step": 7246 + }, + { + "epoch": 0.40581252099899207, + "grad_norm": 1.5104278326034546, + "learning_rate": 3.622e-05, + "loss": 0.4446, + "step": 7247 + }, + { + "epoch": 0.4058685183111211, + "grad_norm": 1.4630019664764404, + "learning_rate": 3.6225000000000006e-05, + "loss": 0.4647, + "step": 7248 + }, + { + "epoch": 0.4059245156232501, + "grad_norm": 1.742742657661438, + "learning_rate": 3.6230000000000004e-05, + "loss": 0.4791, + "step": 7249 + }, + { + "epoch": 0.4059805129353791, + "grad_norm": 1.2679790258407593, + "learning_rate": 3.6235e-05, + "loss": 0.4973, + "step": 7250 + }, + { + "epoch": 0.40603651024750814, + "grad_norm": 
1.2643063068389893, + "learning_rate": 3.624e-05, + "loss": 0.5072, + "step": 7251 + }, + { + "epoch": 0.40609250755963716, + "grad_norm": 1.1733382940292358, + "learning_rate": 3.6245e-05, + "loss": 0.3236, + "step": 7252 + }, + { + "epoch": 0.4061485048717662, + "grad_norm": 1.5944677591323853, + "learning_rate": 3.625e-05, + "loss": 0.445, + "step": 7253 + }, + { + "epoch": 0.4062045021838952, + "grad_norm": 1.2801021337509155, + "learning_rate": 3.6255e-05, + "loss": 0.5094, + "step": 7254 + }, + { + "epoch": 0.40626049949602416, + "grad_norm": 1.1235573291778564, + "learning_rate": 3.626e-05, + "loss": 0.5119, + "step": 7255 + }, + { + "epoch": 0.4063164968081532, + "grad_norm": 1.1305443048477173, + "learning_rate": 3.6265e-05, + "loss": 0.4362, + "step": 7256 + }, + { + "epoch": 0.4063724941202822, + "grad_norm": 1.908765196800232, + "learning_rate": 3.6270000000000003e-05, + "loss": 0.6886, + "step": 7257 + }, + { + "epoch": 0.4064284914324112, + "grad_norm": 1.2548452615737915, + "learning_rate": 3.6275e-05, + "loss": 0.401, + "step": 7258 + }, + { + "epoch": 0.40648448874454024, + "grad_norm": 1.400665044784546, + "learning_rate": 3.6280000000000005e-05, + "loss": 0.4434, + "step": 7259 + }, + { + "epoch": 0.40654048605666926, + "grad_norm": 1.52109694480896, + "learning_rate": 3.6285e-05, + "loss": 0.525, + "step": 7260 + }, + { + "epoch": 0.4065964833687983, + "grad_norm": 1.3359694480895996, + "learning_rate": 3.629e-05, + "loss": 0.4099, + "step": 7261 + }, + { + "epoch": 0.4066524806809273, + "grad_norm": 1.321385383605957, + "learning_rate": 3.6295000000000004e-05, + "loss": 0.4814, + "step": 7262 + }, + { + "epoch": 0.4067084779930563, + "grad_norm": 1.3202811479568481, + "learning_rate": 3.63e-05, + "loss": 0.3902, + "step": 7263 + }, + { + "epoch": 0.40676447530518534, + "grad_norm": 1.1171973943710327, + "learning_rate": 3.6305e-05, + "loss": 0.3772, + "step": 7264 + }, + { + "epoch": 0.40682047261731435, + "grad_norm": 1.2634572982788086, + 
"learning_rate": 3.6309999999999996e-05, + "loss": 0.4029, + "step": 7265 + }, + { + "epoch": 0.4068764699294434, + "grad_norm": 1.2522668838500977, + "learning_rate": 3.6315e-05, + "loss": 0.3694, + "step": 7266 + }, + { + "epoch": 0.4069324672415724, + "grad_norm": 1.335279107093811, + "learning_rate": 3.6320000000000005e-05, + "loss": 0.5055, + "step": 7267 + }, + { + "epoch": 0.4069884645537014, + "grad_norm": 1.239645004272461, + "learning_rate": 3.6325e-05, + "loss": 0.3645, + "step": 7268 + }, + { + "epoch": 0.40704446186583043, + "grad_norm": 1.522817611694336, + "learning_rate": 3.6330000000000006e-05, + "loss": 0.4993, + "step": 7269 + }, + { + "epoch": 0.40710045917795945, + "grad_norm": 1.3160094022750854, + "learning_rate": 3.6335000000000004e-05, + "loss": 0.5277, + "step": 7270 + }, + { + "epoch": 0.40715645649008847, + "grad_norm": 1.2404543161392212, + "learning_rate": 3.634e-05, + "loss": 0.5226, + "step": 7271 + }, + { + "epoch": 0.4072124538022175, + "grad_norm": 1.3096650838851929, + "learning_rate": 3.6345e-05, + "loss": 0.4599, + "step": 7272 + }, + { + "epoch": 0.4072684511143465, + "grad_norm": 1.3579174280166626, + "learning_rate": 3.635e-05, + "loss": 0.5192, + "step": 7273 + }, + { + "epoch": 0.4073244484264755, + "grad_norm": 1.1769434213638306, + "learning_rate": 3.6355e-05, + "loss": 0.3975, + "step": 7274 + }, + { + "epoch": 0.40738044573860455, + "grad_norm": 1.0503480434417725, + "learning_rate": 3.636e-05, + "loss": 0.3213, + "step": 7275 + }, + { + "epoch": 0.40743644305073357, + "grad_norm": 1.0357081890106201, + "learning_rate": 3.6365e-05, + "loss": 0.3951, + "step": 7276 + }, + { + "epoch": 0.4074924403628626, + "grad_norm": 1.1854957342147827, + "learning_rate": 3.6370000000000006e-05, + "loss": 0.3392, + "step": 7277 + }, + { + "epoch": 0.4075484376749916, + "grad_norm": 1.3929799795150757, + "learning_rate": 3.6375e-05, + "loss": 0.437, + "step": 7278 + }, + { + "epoch": 0.4076044349871206, + "grad_norm": 
1.2759002447128296, + "learning_rate": 3.638e-05, + "loss": 0.5598, + "step": 7279 + }, + { + "epoch": 0.40766043229924964, + "grad_norm": 1.2573679685592651, + "learning_rate": 3.6385000000000005e-05, + "loss": 0.3609, + "step": 7280 + }, + { + "epoch": 0.40771642961137866, + "grad_norm": 1.2415151596069336, + "learning_rate": 3.639e-05, + "loss": 0.4047, + "step": 7281 + }, + { + "epoch": 0.4077724269235077, + "grad_norm": 1.0531760454177856, + "learning_rate": 3.6395e-05, + "loss": 0.4488, + "step": 7282 + }, + { + "epoch": 0.4078284242356367, + "grad_norm": 1.5336650609970093, + "learning_rate": 3.6400000000000004e-05, + "loss": 0.4572, + "step": 7283 + }, + { + "epoch": 0.4078844215477657, + "grad_norm": 1.1323792934417725, + "learning_rate": 3.6405e-05, + "loss": 0.4861, + "step": 7284 + }, + { + "epoch": 0.40794041885989474, + "grad_norm": 1.2384064197540283, + "learning_rate": 3.641e-05, + "loss": 0.468, + "step": 7285 + }, + { + "epoch": 0.40799641617202376, + "grad_norm": 1.3712652921676636, + "learning_rate": 3.6414999999999996e-05, + "loss": 0.3852, + "step": 7286 + }, + { + "epoch": 0.4080524134841528, + "grad_norm": 1.0916869640350342, + "learning_rate": 3.642000000000001e-05, + "loss": 0.3724, + "step": 7287 + }, + { + "epoch": 0.4081084107962818, + "grad_norm": 1.1505745649337769, + "learning_rate": 3.6425000000000004e-05, + "loss": 0.3085, + "step": 7288 + }, + { + "epoch": 0.4081644081084108, + "grad_norm": 1.4061627388000488, + "learning_rate": 3.643e-05, + "loss": 0.4416, + "step": 7289 + }, + { + "epoch": 0.40822040542053983, + "grad_norm": 1.1372989416122437, + "learning_rate": 3.6435e-05, + "loss": 0.4334, + "step": 7290 + }, + { + "epoch": 0.40827640273266885, + "grad_norm": 1.311160683631897, + "learning_rate": 3.6440000000000003e-05, + "loss": 0.417, + "step": 7291 + }, + { + "epoch": 0.40833240004479787, + "grad_norm": 1.776376485824585, + "learning_rate": 3.6445e-05, + "loss": 0.4312, + "step": 7292 + }, + { + "epoch": 
0.4083883973569269, + "grad_norm": 1.5318913459777832, + "learning_rate": 3.645e-05, + "loss": 0.4828, + "step": 7293 + }, + { + "epoch": 0.4084443946690559, + "grad_norm": 1.1611205339431763, + "learning_rate": 3.6455e-05, + "loss": 0.3558, + "step": 7294 + }, + { + "epoch": 0.40850039198118493, + "grad_norm": 1.2419720888137817, + "learning_rate": 3.646e-05, + "loss": 0.5373, + "step": 7295 + }, + { + "epoch": 0.40855638929331395, + "grad_norm": 1.2146140336990356, + "learning_rate": 3.6465e-05, + "loss": 0.4384, + "step": 7296 + }, + { + "epoch": 0.4086123866054429, + "grad_norm": 1.3515194654464722, + "learning_rate": 3.647e-05, + "loss": 0.4263, + "step": 7297 + }, + { + "epoch": 0.40866838391757193, + "grad_norm": 1.2659810781478882, + "learning_rate": 3.6475000000000006e-05, + "loss": 0.4124, + "step": 7298 + }, + { + "epoch": 0.40872438122970095, + "grad_norm": 1.2609952688217163, + "learning_rate": 3.648e-05, + "loss": 0.4871, + "step": 7299 + }, + { + "epoch": 0.40878037854182997, + "grad_norm": 1.183382272720337, + "learning_rate": 3.6485e-05, + "loss": 0.3869, + "step": 7300 + }, + { + "epoch": 0.408836375853959, + "grad_norm": 1.1628130674362183, + "learning_rate": 3.6490000000000005e-05, + "loss": 0.4279, + "step": 7301 + }, + { + "epoch": 0.408892373166088, + "grad_norm": 1.176312804222107, + "learning_rate": 3.6495e-05, + "loss": 0.3122, + "step": 7302 + }, + { + "epoch": 0.408948370478217, + "grad_norm": 1.3133997917175293, + "learning_rate": 3.65e-05, + "loss": 0.5109, + "step": 7303 + }, + { + "epoch": 0.40900436779034605, + "grad_norm": 1.2249528169631958, + "learning_rate": 3.6505e-05, + "loss": 0.4877, + "step": 7304 + }, + { + "epoch": 0.40906036510247507, + "grad_norm": 1.3445569276809692, + "learning_rate": 3.651e-05, + "loss": 0.5041, + "step": 7305 + }, + { + "epoch": 0.4091163624146041, + "grad_norm": 1.1757526397705078, + "learning_rate": 3.6515e-05, + "loss": 0.4316, + "step": 7306 + }, + { + "epoch": 0.4091723597267331, + "grad_norm": 
1.3491877317428589, + "learning_rate": 3.652e-05, + "loss": 0.493, + "step": 7307 + }, + { + "epoch": 0.4092283570388621, + "grad_norm": 1.0845398902893066, + "learning_rate": 3.652500000000001e-05, + "loss": 0.525, + "step": 7308 + }, + { + "epoch": 0.40928435435099114, + "grad_norm": 1.3383963108062744, + "learning_rate": 3.6530000000000004e-05, + "loss": 0.575, + "step": 7309 + }, + { + "epoch": 0.40934035166312016, + "grad_norm": 1.0717452764511108, + "learning_rate": 3.6535e-05, + "loss": 0.4511, + "step": 7310 + }, + { + "epoch": 0.4093963489752492, + "grad_norm": 1.304875373840332, + "learning_rate": 3.654e-05, + "loss": 0.4678, + "step": 7311 + }, + { + "epoch": 0.4094523462873782, + "grad_norm": 1.4814187288284302, + "learning_rate": 3.6545e-05, + "loss": 0.477, + "step": 7312 + }, + { + "epoch": 0.4095083435995072, + "grad_norm": 1.102741003036499, + "learning_rate": 3.655e-05, + "loss": 0.4875, + "step": 7313 + }, + { + "epoch": 0.40956434091163624, + "grad_norm": 1.2406634092330933, + "learning_rate": 3.6555e-05, + "loss": 0.4823, + "step": 7314 + }, + { + "epoch": 0.40962033822376526, + "grad_norm": 1.372307300567627, + "learning_rate": 3.656e-05, + "loss": 0.347, + "step": 7315 + }, + { + "epoch": 0.4096763355358943, + "grad_norm": 1.330670714378357, + "learning_rate": 3.6565e-05, + "loss": 0.5762, + "step": 7316 + }, + { + "epoch": 0.4097323328480233, + "grad_norm": 1.1185837984085083, + "learning_rate": 3.6570000000000004e-05, + "loss": 0.4308, + "step": 7317 + }, + { + "epoch": 0.4097883301601523, + "grad_norm": 1.2783232927322388, + "learning_rate": 3.6575e-05, + "loss": 0.4272, + "step": 7318 + }, + { + "epoch": 0.40984432747228133, + "grad_norm": 1.0761449337005615, + "learning_rate": 3.6580000000000006e-05, + "loss": 0.3062, + "step": 7319 + }, + { + "epoch": 0.40990032478441035, + "grad_norm": 1.0321117639541626, + "learning_rate": 3.6585e-05, + "loss": 0.3292, + "step": 7320 + }, + { + "epoch": 0.40995632209653937, + "grad_norm": 
1.1968425512313843, + "learning_rate": 3.659e-05, + "loss": 0.4818, + "step": 7321 + }, + { + "epoch": 0.4100123194086684, + "grad_norm": 1.2820180654525757, + "learning_rate": 3.6595000000000005e-05, + "loss": 0.4734, + "step": 7322 + }, + { + "epoch": 0.4100683167207974, + "grad_norm": 4.161149024963379, + "learning_rate": 3.66e-05, + "loss": 0.5211, + "step": 7323 + }, + { + "epoch": 0.41012431403292643, + "grad_norm": 1.2682455778121948, + "learning_rate": 3.6605e-05, + "loss": 0.4286, + "step": 7324 + }, + { + "epoch": 0.41018031134505545, + "grad_norm": 1.7455681562423706, + "learning_rate": 3.661e-05, + "loss": 0.6589, + "step": 7325 + }, + { + "epoch": 0.41023630865718447, + "grad_norm": 1.2011220455169678, + "learning_rate": 3.6615e-05, + "loss": 0.3598, + "step": 7326 + }, + { + "epoch": 0.4102923059693135, + "grad_norm": 1.5919878482818604, + "learning_rate": 3.6620000000000005e-05, + "loss": 0.4307, + "step": 7327 + }, + { + "epoch": 0.4103483032814425, + "grad_norm": 1.0798871517181396, + "learning_rate": 3.6625e-05, + "loss": 0.3439, + "step": 7328 + }, + { + "epoch": 0.4104043005935715, + "grad_norm": 1.3776450157165527, + "learning_rate": 3.663e-05, + "loss": 0.4705, + "step": 7329 + }, + { + "epoch": 0.41046029790570054, + "grad_norm": 1.2709369659423828, + "learning_rate": 3.6635000000000004e-05, + "loss": 0.3923, + "step": 7330 + }, + { + "epoch": 0.41051629521782956, + "grad_norm": 1.1078665256500244, + "learning_rate": 3.664e-05, + "loss": 0.4569, + "step": 7331 + }, + { + "epoch": 0.4105722925299586, + "grad_norm": 1.228965401649475, + "learning_rate": 3.6645e-05, + "loss": 0.4598, + "step": 7332 + }, + { + "epoch": 0.4106282898420876, + "grad_norm": 1.068197250366211, + "learning_rate": 3.665e-05, + "loss": 0.3491, + "step": 7333 + }, + { + "epoch": 0.4106842871542166, + "grad_norm": 1.3216149806976318, + "learning_rate": 3.6655e-05, + "loss": 0.6087, + "step": 7334 + }, + { + "epoch": 0.41074028446634564, + "grad_norm": 1.2426646947860718, + 
"learning_rate": 3.666e-05, + "loss": 0.3737, + "step": 7335 + }, + { + "epoch": 0.41079628177847466, + "grad_norm": 1.1281229257583618, + "learning_rate": 3.6665e-05, + "loss": 0.4539, + "step": 7336 + }, + { + "epoch": 0.4108522790906037, + "grad_norm": 1.2939091920852661, + "learning_rate": 3.6670000000000006e-05, + "loss": 0.4575, + "step": 7337 + }, + { + "epoch": 0.41090827640273264, + "grad_norm": 1.2367515563964844, + "learning_rate": 3.6675000000000004e-05, + "loss": 0.4127, + "step": 7338 + }, + { + "epoch": 0.41096427371486166, + "grad_norm": 1.260865569114685, + "learning_rate": 3.668e-05, + "loss": 0.6281, + "step": 7339 + }, + { + "epoch": 0.4110202710269907, + "grad_norm": 1.2586859464645386, + "learning_rate": 3.6685000000000005e-05, + "loss": 0.4123, + "step": 7340 + }, + { + "epoch": 0.4110762683391197, + "grad_norm": 1.1080645322799683, + "learning_rate": 3.669e-05, + "loss": 0.4533, + "step": 7341 + }, + { + "epoch": 0.4111322656512487, + "grad_norm": 1.2502843141555786, + "learning_rate": 3.6695e-05, + "loss": 0.4435, + "step": 7342 + }, + { + "epoch": 0.41118826296337774, + "grad_norm": 1.20815908908844, + "learning_rate": 3.6700000000000004e-05, + "loss": 0.4348, + "step": 7343 + }, + { + "epoch": 0.41124426027550676, + "grad_norm": 1.056998610496521, + "learning_rate": 3.6705e-05, + "loss": 0.3169, + "step": 7344 + }, + { + "epoch": 0.4113002575876358, + "grad_norm": 1.3029078245162964, + "learning_rate": 3.671e-05, + "loss": 0.5681, + "step": 7345 + }, + { + "epoch": 0.4113562548997648, + "grad_norm": 1.2896829843521118, + "learning_rate": 3.6714999999999997e-05, + "loss": 0.7173, + "step": 7346 + }, + { + "epoch": 0.4114122522118938, + "grad_norm": 1.2883481979370117, + "learning_rate": 3.672000000000001e-05, + "loss": 0.4045, + "step": 7347 + }, + { + "epoch": 0.41146824952402283, + "grad_norm": 1.1023024320602417, + "learning_rate": 3.6725000000000005e-05, + "loss": 0.3875, + "step": 7348 + }, + { + "epoch": 0.41152424683615185, + 
"grad_norm": 1.1380733251571655, + "learning_rate": 3.673e-05, + "loss": 0.3352, + "step": 7349 + }, + { + "epoch": 0.41158024414828087, + "grad_norm": 1.166361689567566, + "learning_rate": 3.6735e-05, + "loss": 0.5477, + "step": 7350 + }, + { + "epoch": 0.4116362414604099, + "grad_norm": 1.4538735151290894, + "learning_rate": 3.6740000000000004e-05, + "loss": 0.4344, + "step": 7351 + }, + { + "epoch": 0.4116922387725389, + "grad_norm": 1.8776609897613525, + "learning_rate": 3.6745e-05, + "loss": 0.4424, + "step": 7352 + }, + { + "epoch": 0.41174823608466793, + "grad_norm": 1.1175944805145264, + "learning_rate": 3.675e-05, + "loss": 0.4074, + "step": 7353 + }, + { + "epoch": 0.41180423339679695, + "grad_norm": 2.575025796890259, + "learning_rate": 3.6755e-05, + "loss": 0.4766, + "step": 7354 + }, + { + "epoch": 0.41186023070892597, + "grad_norm": 1.4082902669906616, + "learning_rate": 3.676e-05, + "loss": 0.5161, + "step": 7355 + }, + { + "epoch": 0.411916228021055, + "grad_norm": 1.2325143814086914, + "learning_rate": 3.6765e-05, + "loss": 0.4787, + "step": 7356 + }, + { + "epoch": 0.411972225333184, + "grad_norm": 1.1196098327636719, + "learning_rate": 3.677e-05, + "loss": 0.3825, + "step": 7357 + }, + { + "epoch": 0.412028222645313, + "grad_norm": 1.7802287340164185, + "learning_rate": 3.6775000000000006e-05, + "loss": 0.4514, + "step": 7358 + }, + { + "epoch": 0.41208421995744204, + "grad_norm": 1.1851742267608643, + "learning_rate": 3.6780000000000004e-05, + "loss": 0.3664, + "step": 7359 + }, + { + "epoch": 0.41214021726957106, + "grad_norm": 1.0434215068817139, + "learning_rate": 3.6785e-05, + "loss": 0.3649, + "step": 7360 + }, + { + "epoch": 0.4121962145817001, + "grad_norm": 1.520704746246338, + "learning_rate": 3.6790000000000005e-05, + "loss": 0.5598, + "step": 7361 + }, + { + "epoch": 0.4122522118938291, + "grad_norm": 1.5654765367507935, + "learning_rate": 3.6795e-05, + "loss": 0.4388, + "step": 7362 + }, + { + "epoch": 0.4123082092059581, + 
"grad_norm": 1.2206003665924072, + "learning_rate": 3.68e-05, + "loss": 0.5292, + "step": 7363 + }, + { + "epoch": 0.41236420651808714, + "grad_norm": 1.1813868284225464, + "learning_rate": 3.6805e-05, + "loss": 0.4441, + "step": 7364 + }, + { + "epoch": 0.41242020383021616, + "grad_norm": 1.269207239151001, + "learning_rate": 3.681e-05, + "loss": 0.3842, + "step": 7365 + }, + { + "epoch": 0.4124762011423452, + "grad_norm": 1.1340774297714233, + "learning_rate": 3.6815e-05, + "loss": 0.3907, + "step": 7366 + }, + { + "epoch": 0.4125321984544742, + "grad_norm": 1.211429476737976, + "learning_rate": 3.682e-05, + "loss": 0.4474, + "step": 7367 + }, + { + "epoch": 0.4125881957666032, + "grad_norm": 1.2279529571533203, + "learning_rate": 3.6825e-05, + "loss": 0.6188, + "step": 7368 + }, + { + "epoch": 0.41264419307873224, + "grad_norm": 1.2425464391708374, + "learning_rate": 3.6830000000000005e-05, + "loss": 0.3226, + "step": 7369 + }, + { + "epoch": 0.41270019039086125, + "grad_norm": 1.0841212272644043, + "learning_rate": 3.6835e-05, + "loss": 0.3205, + "step": 7370 + }, + { + "epoch": 0.4127561877029903, + "grad_norm": 1.2456731796264648, + "learning_rate": 3.684e-05, + "loss": 0.4568, + "step": 7371 + }, + { + "epoch": 0.4128121850151193, + "grad_norm": 1.2336543798446655, + "learning_rate": 3.6845000000000004e-05, + "loss": 0.4919, + "step": 7372 + }, + { + "epoch": 0.4128681823272483, + "grad_norm": 1.2150521278381348, + "learning_rate": 3.685e-05, + "loss": 0.3569, + "step": 7373 + }, + { + "epoch": 0.41292417963937733, + "grad_norm": 1.4870706796646118, + "learning_rate": 3.6855e-05, + "loss": 0.4903, + "step": 7374 + }, + { + "epoch": 0.41298017695150635, + "grad_norm": 1.4366742372512817, + "learning_rate": 3.686e-05, + "loss": 0.5873, + "step": 7375 + }, + { + "epoch": 0.41303617426363537, + "grad_norm": 1.1463624238967896, + "learning_rate": 3.6865e-05, + "loss": 0.3585, + "step": 7376 + }, + { + "epoch": 0.4130921715757644, + "grad_norm": 
1.1975924968719482, + "learning_rate": 3.6870000000000004e-05, + "loss": 0.4505, + "step": 7377 + }, + { + "epoch": 0.4131481688878934, + "grad_norm": 1.291324496269226, + "learning_rate": 3.6875e-05, + "loss": 0.3405, + "step": 7378 + }, + { + "epoch": 0.41320416620002237, + "grad_norm": 1.1325887441635132, + "learning_rate": 3.6880000000000006e-05, + "loss": 0.4735, + "step": 7379 + }, + { + "epoch": 0.4132601635121514, + "grad_norm": 1.3903597593307495, + "learning_rate": 3.6885000000000003e-05, + "loss": 0.4485, + "step": 7380 + }, + { + "epoch": 0.4133161608242804, + "grad_norm": 1.4702013731002808, + "learning_rate": 3.689e-05, + "loss": 0.4377, + "step": 7381 + }, + { + "epoch": 0.41337215813640943, + "grad_norm": 1.2567180395126343, + "learning_rate": 3.6895000000000005e-05, + "loss": 0.347, + "step": 7382 + }, + { + "epoch": 0.41342815544853845, + "grad_norm": 1.202655553817749, + "learning_rate": 3.69e-05, + "loss": 0.3415, + "step": 7383 + }, + { + "epoch": 0.41348415276066747, + "grad_norm": 1.4150012731552124, + "learning_rate": 3.6905e-05, + "loss": 0.4606, + "step": 7384 + }, + { + "epoch": 0.4135401500727965, + "grad_norm": 1.4006381034851074, + "learning_rate": 3.691e-05, + "loss": 0.6619, + "step": 7385 + }, + { + "epoch": 0.4135961473849255, + "grad_norm": 1.134314775466919, + "learning_rate": 3.6915e-05, + "loss": 0.3623, + "step": 7386 + }, + { + "epoch": 0.4136521446970545, + "grad_norm": 1.419512152671814, + "learning_rate": 3.692e-05, + "loss": 0.3783, + "step": 7387 + }, + { + "epoch": 0.41370814200918354, + "grad_norm": 1.394559383392334, + "learning_rate": 3.6925e-05, + "loss": 0.3843, + "step": 7388 + }, + { + "epoch": 0.41376413932131256, + "grad_norm": 1.3517988920211792, + "learning_rate": 3.693e-05, + "loss": 0.5111, + "step": 7389 + }, + { + "epoch": 0.4138201366334416, + "grad_norm": 1.3542639017105103, + "learning_rate": 3.6935000000000005e-05, + "loss": 0.4957, + "step": 7390 + }, + { + "epoch": 0.4138761339455706, + "grad_norm": 
1.3230736255645752, + "learning_rate": 3.694e-05, + "loss": 0.3711, + "step": 7391 + }, + { + "epoch": 0.4139321312576996, + "grad_norm": 1.388952374458313, + "learning_rate": 3.6945e-05, + "loss": 0.3725, + "step": 7392 + }, + { + "epoch": 0.41398812856982864, + "grad_norm": 1.247883677482605, + "learning_rate": 3.6950000000000004e-05, + "loss": 0.4009, + "step": 7393 + }, + { + "epoch": 0.41404412588195766, + "grad_norm": 1.362732172012329, + "learning_rate": 3.6955e-05, + "loss": 0.4289, + "step": 7394 + }, + { + "epoch": 0.4141001231940867, + "grad_norm": 1.1690155267715454, + "learning_rate": 3.696e-05, + "loss": 0.4162, + "step": 7395 + }, + { + "epoch": 0.4141561205062157, + "grad_norm": 1.3039852380752563, + "learning_rate": 3.6965e-05, + "loss": 0.624, + "step": 7396 + }, + { + "epoch": 0.4142121178183447, + "grad_norm": 1.0486277341842651, + "learning_rate": 3.697e-05, + "loss": 0.4451, + "step": 7397 + }, + { + "epoch": 0.41426811513047374, + "grad_norm": 1.4914668798446655, + "learning_rate": 3.6975000000000004e-05, + "loss": 0.4872, + "step": 7398 + }, + { + "epoch": 0.41432411244260275, + "grad_norm": 1.2971842288970947, + "learning_rate": 3.698e-05, + "loss": 0.4778, + "step": 7399 + }, + { + "epoch": 0.4143801097547318, + "grad_norm": 1.277298927307129, + "learning_rate": 3.6985000000000006e-05, + "loss": 0.4799, + "step": 7400 + }, + { + "epoch": 0.4144361070668608, + "grad_norm": 1.1096839904785156, + "learning_rate": 3.699e-05, + "loss": 0.3887, + "step": 7401 + }, + { + "epoch": 0.4144921043789898, + "grad_norm": 1.6496092081069946, + "learning_rate": 3.6995e-05, + "loss": 0.3864, + "step": 7402 + }, + { + "epoch": 0.41454810169111883, + "grad_norm": 1.2638951539993286, + "learning_rate": 3.7e-05, + "loss": 0.4364, + "step": 7403 + }, + { + "epoch": 0.41460409900324785, + "grad_norm": 1.452673077583313, + "learning_rate": 3.7005e-05, + "loss": 0.4836, + "step": 7404 + }, + { + "epoch": 0.41466009631537687, + "grad_norm": 1.1603468656539917, + 
"learning_rate": 3.701e-05, + "loss": 0.4391, + "step": 7405 + }, + { + "epoch": 0.4147160936275059, + "grad_norm": 1.2490837574005127, + "learning_rate": 3.7015e-05, + "loss": 0.377, + "step": 7406 + }, + { + "epoch": 0.4147720909396349, + "grad_norm": 1.301253080368042, + "learning_rate": 3.702e-05, + "loss": 0.4884, + "step": 7407 + }, + { + "epoch": 0.4148280882517639, + "grad_norm": 1.1982135772705078, + "learning_rate": 3.7025000000000005e-05, + "loss": 0.4737, + "step": 7408 + }, + { + "epoch": 0.41488408556389295, + "grad_norm": 1.099167823791504, + "learning_rate": 3.703e-05, + "loss": 0.3405, + "step": 7409 + }, + { + "epoch": 0.41494008287602197, + "grad_norm": 1.174721121788025, + "learning_rate": 3.7035e-05, + "loss": 0.3617, + "step": 7410 + }, + { + "epoch": 0.414996080188151, + "grad_norm": 1.277965784072876, + "learning_rate": 3.7040000000000005e-05, + "loss": 0.4473, + "step": 7411 + }, + { + "epoch": 0.41505207750028, + "grad_norm": 1.1440922021865845, + "learning_rate": 3.7045e-05, + "loss": 0.3614, + "step": 7412 + }, + { + "epoch": 0.415108074812409, + "grad_norm": 1.3988312482833862, + "learning_rate": 3.705e-05, + "loss": 0.5083, + "step": 7413 + }, + { + "epoch": 0.41516407212453804, + "grad_norm": 1.0867133140563965, + "learning_rate": 3.7055000000000004e-05, + "loss": 0.3373, + "step": 7414 + }, + { + "epoch": 0.41522006943666706, + "grad_norm": 1.3099242448806763, + "learning_rate": 3.706e-05, + "loss": 0.758, + "step": 7415 + }, + { + "epoch": 0.4152760667487961, + "grad_norm": 1.2499079704284668, + "learning_rate": 3.7065e-05, + "loss": 0.4199, + "step": 7416 + }, + { + "epoch": 0.4153320640609251, + "grad_norm": 1.0558418035507202, + "learning_rate": 3.707e-05, + "loss": 0.392, + "step": 7417 + }, + { + "epoch": 0.4153880613730541, + "grad_norm": 1.1931686401367188, + "learning_rate": 3.707500000000001e-05, + "loss": 0.3167, + "step": 7418 + }, + { + "epoch": 0.41544405868518314, + "grad_norm": 1.4347862005233765, + "learning_rate": 
3.7080000000000004e-05, + "loss": 0.4799, + "step": 7419 + }, + { + "epoch": 0.41550005599731216, + "grad_norm": 1.5621248483657837, + "learning_rate": 3.7085e-05, + "loss": 0.4901, + "step": 7420 + }, + { + "epoch": 0.4155560533094411, + "grad_norm": 1.4398621320724487, + "learning_rate": 3.7090000000000006e-05, + "loss": 0.4738, + "step": 7421 + }, + { + "epoch": 0.41561205062157014, + "grad_norm": 2.3422293663024902, + "learning_rate": 3.7095e-05, + "loss": 0.5476, + "step": 7422 + }, + { + "epoch": 0.41566804793369916, + "grad_norm": 1.1191027164459229, + "learning_rate": 3.71e-05, + "loss": 0.3922, + "step": 7423 + }, + { + "epoch": 0.4157240452458282, + "grad_norm": 1.2718764543533325, + "learning_rate": 3.7105e-05, + "loss": 0.4628, + "step": 7424 + }, + { + "epoch": 0.4157800425579572, + "grad_norm": 1.2488576173782349, + "learning_rate": 3.711e-05, + "loss": 0.3959, + "step": 7425 + }, + { + "epoch": 0.4158360398700862, + "grad_norm": 1.1072102785110474, + "learning_rate": 3.7115e-05, + "loss": 0.4925, + "step": 7426 + }, + { + "epoch": 0.41589203718221524, + "grad_norm": 1.272878885269165, + "learning_rate": 3.712e-05, + "loss": 0.3726, + "step": 7427 + }, + { + "epoch": 0.41594803449434425, + "grad_norm": 1.1713122129440308, + "learning_rate": 3.7125e-05, + "loss": 0.3643, + "step": 7428 + }, + { + "epoch": 0.4160040318064733, + "grad_norm": 1.1248369216918945, + "learning_rate": 3.7130000000000005e-05, + "loss": 0.2857, + "step": 7429 + }, + { + "epoch": 0.4160600291186023, + "grad_norm": 1.3147884607315063, + "learning_rate": 3.7135e-05, + "loss": 0.4309, + "step": 7430 + }, + { + "epoch": 0.4161160264307313, + "grad_norm": 1.1655421257019043, + "learning_rate": 3.714e-05, + "loss": 0.454, + "step": 7431 + }, + { + "epoch": 0.41617202374286033, + "grad_norm": 1.1496081352233887, + "learning_rate": 3.7145000000000004e-05, + "loss": 0.3775, + "step": 7432 + }, + { + "epoch": 0.41622802105498935, + "grad_norm": 1.1635730266571045, + "learning_rate": 
3.715e-05, + "loss": 0.3961, + "step": 7433 + }, + { + "epoch": 0.41628401836711837, + "grad_norm": 1.2568323612213135, + "learning_rate": 3.7155e-05, + "loss": 0.5334, + "step": 7434 + }, + { + "epoch": 0.4163400156792474, + "grad_norm": 1.2780178785324097, + "learning_rate": 3.716e-05, + "loss": 0.4151, + "step": 7435 + }, + { + "epoch": 0.4163960129913764, + "grad_norm": 1.2436951398849487, + "learning_rate": 3.7165e-05, + "loss": 0.3052, + "step": 7436 + }, + { + "epoch": 0.4164520103035054, + "grad_norm": 1.131870150566101, + "learning_rate": 3.717e-05, + "loss": 0.3319, + "step": 7437 + }, + { + "epoch": 0.41650800761563445, + "grad_norm": 1.9683313369750977, + "learning_rate": 3.7175e-05, + "loss": 0.4952, + "step": 7438 + }, + { + "epoch": 0.41656400492776346, + "grad_norm": 1.0045701265335083, + "learning_rate": 3.7180000000000007e-05, + "loss": 0.3867, + "step": 7439 + }, + { + "epoch": 0.4166200022398925, + "grad_norm": 1.1301097869873047, + "learning_rate": 3.7185000000000004e-05, + "loss": 0.4603, + "step": 7440 + }, + { + "epoch": 0.4166759995520215, + "grad_norm": 1.4950412511825562, + "learning_rate": 3.719e-05, + "loss": 0.7327, + "step": 7441 + }, + { + "epoch": 0.4167319968641505, + "grad_norm": 1.2363497018814087, + "learning_rate": 3.7195e-05, + "loss": 0.4815, + "step": 7442 + }, + { + "epoch": 0.41678799417627954, + "grad_norm": 1.0679094791412354, + "learning_rate": 3.72e-05, + "loss": 0.3766, + "step": 7443 + }, + { + "epoch": 0.41684399148840856, + "grad_norm": 1.2219816446304321, + "learning_rate": 3.7205e-05, + "loss": 0.4915, + "step": 7444 + }, + { + "epoch": 0.4168999888005376, + "grad_norm": 1.3547922372817993, + "learning_rate": 3.721e-05, + "loss": 0.4618, + "step": 7445 + }, + { + "epoch": 0.4169559861126666, + "grad_norm": 1.3519762754440308, + "learning_rate": 3.7215e-05, + "loss": 0.5428, + "step": 7446 + }, + { + "epoch": 0.4170119834247956, + "grad_norm": 1.3374686241149902, + "learning_rate": 3.722e-05, + "loss": 0.4427, + 
"step": 7447 + }, + { + "epoch": 0.41706798073692464, + "grad_norm": 1.1669195890426636, + "learning_rate": 3.7225000000000004e-05, + "loss": 0.4255, + "step": 7448 + }, + { + "epoch": 0.41712397804905366, + "grad_norm": 1.2421183586120605, + "learning_rate": 3.723e-05, + "loss": 0.3645, + "step": 7449 + }, + { + "epoch": 0.4171799753611827, + "grad_norm": 1.3754910230636597, + "learning_rate": 3.7235000000000005e-05, + "loss": 0.5434, + "step": 7450 + }, + { + "epoch": 0.4172359726733117, + "grad_norm": 1.1193515062332153, + "learning_rate": 3.724e-05, + "loss": 0.4259, + "step": 7451 + }, + { + "epoch": 0.4172919699854407, + "grad_norm": 1.3255189657211304, + "learning_rate": 3.7245e-05, + "loss": 0.4716, + "step": 7452 + }, + { + "epoch": 0.41734796729756973, + "grad_norm": 1.2888439893722534, + "learning_rate": 3.7250000000000004e-05, + "loss": 0.4525, + "step": 7453 + }, + { + "epoch": 0.41740396460969875, + "grad_norm": 1.7321168184280396, + "learning_rate": 3.7255e-05, + "loss": 0.4392, + "step": 7454 + }, + { + "epoch": 0.41745996192182777, + "grad_norm": 1.167161226272583, + "learning_rate": 3.726e-05, + "loss": 0.3948, + "step": 7455 + }, + { + "epoch": 0.4175159592339568, + "grad_norm": 1.6288481950759888, + "learning_rate": 3.7265e-05, + "loss": 0.4082, + "step": 7456 + }, + { + "epoch": 0.4175719565460858, + "grad_norm": 1.244638442993164, + "learning_rate": 3.727e-05, + "loss": 0.4842, + "step": 7457 + }, + { + "epoch": 0.41762795385821483, + "grad_norm": 1.3217008113861084, + "learning_rate": 3.7275000000000005e-05, + "loss": 0.4173, + "step": 7458 + }, + { + "epoch": 0.41768395117034385, + "grad_norm": 1.2548723220825195, + "learning_rate": 3.728e-05, + "loss": 0.4547, + "step": 7459 + }, + { + "epoch": 0.41773994848247287, + "grad_norm": 1.2116191387176514, + "learning_rate": 3.7285000000000006e-05, + "loss": 0.337, + "step": 7460 + }, + { + "epoch": 0.4177959457946019, + "grad_norm": 1.359483242034912, + "learning_rate": 3.7290000000000004e-05, + 
"loss": 0.4193, + "step": 7461 + }, + { + "epoch": 0.41785194310673085, + "grad_norm": 1.196447491645813, + "learning_rate": 3.7295e-05, + "loss": 0.48, + "step": 7462 + }, + { + "epoch": 0.41790794041885987, + "grad_norm": 1.2658195495605469, + "learning_rate": 3.73e-05, + "loss": 0.5135, + "step": 7463 + }, + { + "epoch": 0.4179639377309889, + "grad_norm": 1.276366949081421, + "learning_rate": 3.7305e-05, + "loss": 0.4656, + "step": 7464 + }, + { + "epoch": 0.4180199350431179, + "grad_norm": 1.5904545783996582, + "learning_rate": 3.731e-05, + "loss": 0.4994, + "step": 7465 + }, + { + "epoch": 0.4180759323552469, + "grad_norm": 1.1834349632263184, + "learning_rate": 3.7315e-05, + "loss": 0.3434, + "step": 7466 + }, + { + "epoch": 0.41813192966737595, + "grad_norm": 1.1932505369186401, + "learning_rate": 3.732e-05, + "loss": 0.4881, + "step": 7467 + }, + { + "epoch": 0.41818792697950496, + "grad_norm": 1.2890348434448242, + "learning_rate": 3.7325000000000006e-05, + "loss": 0.4165, + "step": 7468 + }, + { + "epoch": 0.418243924291634, + "grad_norm": 1.2739930152893066, + "learning_rate": 3.7330000000000003e-05, + "loss": 0.4243, + "step": 7469 + }, + { + "epoch": 0.418299921603763, + "grad_norm": 1.0256266593933105, + "learning_rate": 3.7335e-05, + "loss": 0.3585, + "step": 7470 + }, + { + "epoch": 0.418355918915892, + "grad_norm": 1.309674859046936, + "learning_rate": 3.7340000000000005e-05, + "loss": 0.4201, + "step": 7471 + }, + { + "epoch": 0.41841191622802104, + "grad_norm": 1.2147879600524902, + "learning_rate": 3.7345e-05, + "loss": 0.3943, + "step": 7472 + }, + { + "epoch": 0.41846791354015006, + "grad_norm": 1.3038194179534912, + "learning_rate": 3.735e-05, + "loss": 0.3817, + "step": 7473 + }, + { + "epoch": 0.4185239108522791, + "grad_norm": 1.4162400960922241, + "learning_rate": 3.7355000000000004e-05, + "loss": 0.462, + "step": 7474 + }, + { + "epoch": 0.4185799081644081, + "grad_norm": 1.294368028640747, + "learning_rate": 3.736e-05, + "loss": 0.4256, 
+ "step": 7475 + }, + { + "epoch": 0.4186359054765371, + "grad_norm": 1.3236455917358398, + "learning_rate": 3.7365e-05, + "loss": 0.3296, + "step": 7476 + }, + { + "epoch": 0.41869190278866614, + "grad_norm": 1.3321163654327393, + "learning_rate": 3.7369999999999996e-05, + "loss": 0.4964, + "step": 7477 + }, + { + "epoch": 0.41874790010079516, + "grad_norm": 1.361327886581421, + "learning_rate": 3.737500000000001e-05, + "loss": 0.4242, + "step": 7478 + }, + { + "epoch": 0.4188038974129242, + "grad_norm": 1.1381531953811646, + "learning_rate": 3.7380000000000005e-05, + "loss": 0.3938, + "step": 7479 + }, + { + "epoch": 0.4188598947250532, + "grad_norm": 1.175363302230835, + "learning_rate": 3.7385e-05, + "loss": 0.3535, + "step": 7480 + }, + { + "epoch": 0.4189158920371822, + "grad_norm": 1.2960225343704224, + "learning_rate": 3.739e-05, + "loss": 0.4894, + "step": 7481 + }, + { + "epoch": 0.41897188934931123, + "grad_norm": 1.4263495206832886, + "learning_rate": 3.7395000000000004e-05, + "loss": 0.5933, + "step": 7482 + }, + { + "epoch": 0.41902788666144025, + "grad_norm": 1.2767161130905151, + "learning_rate": 3.74e-05, + "loss": 0.4764, + "step": 7483 + }, + { + "epoch": 0.41908388397356927, + "grad_norm": 1.178892970085144, + "learning_rate": 3.7405e-05, + "loss": 0.4257, + "step": 7484 + }, + { + "epoch": 0.4191398812856983, + "grad_norm": 1.3218621015548706, + "learning_rate": 3.741e-05, + "loss": 0.4084, + "step": 7485 + }, + { + "epoch": 0.4191958785978273, + "grad_norm": 1.385350227355957, + "learning_rate": 3.7415e-05, + "loss": 0.4606, + "step": 7486 + }, + { + "epoch": 0.41925187590995633, + "grad_norm": 1.311037302017212, + "learning_rate": 3.742e-05, + "loss": 0.4947, + "step": 7487 + }, + { + "epoch": 0.41930787322208535, + "grad_norm": 1.311902403831482, + "learning_rate": 3.7425e-05, + "loss": 0.4066, + "step": 7488 + }, + { + "epoch": 0.41936387053421437, + "grad_norm": 1.1940802335739136, + "learning_rate": 3.7430000000000006e-05, + "loss": 
0.3556, + "step": 7489 + }, + { + "epoch": 0.4194198678463434, + "grad_norm": 1.77219557762146, + "learning_rate": 3.7435e-05, + "loss": 0.4119, + "step": 7490 + }, + { + "epoch": 0.4194758651584724, + "grad_norm": 1.2206567525863647, + "learning_rate": 3.744e-05, + "loss": 0.4377, + "step": 7491 + }, + { + "epoch": 0.4195318624706014, + "grad_norm": 1.3089338541030884, + "learning_rate": 3.7445000000000005e-05, + "loss": 0.5447, + "step": 7492 + }, + { + "epoch": 0.41958785978273044, + "grad_norm": 1.2232393026351929, + "learning_rate": 3.745e-05, + "loss": 0.3733, + "step": 7493 + }, + { + "epoch": 0.41964385709485946, + "grad_norm": 0.9914332032203674, + "learning_rate": 3.7455e-05, + "loss": 0.4314, + "step": 7494 + }, + { + "epoch": 0.4196998544069885, + "grad_norm": 1.6882667541503906, + "learning_rate": 3.7460000000000004e-05, + "loss": 0.5703, + "step": 7495 + }, + { + "epoch": 0.4197558517191175, + "grad_norm": 1.0649563074111938, + "learning_rate": 3.7465e-05, + "loss": 0.4024, + "step": 7496 + }, + { + "epoch": 0.4198118490312465, + "grad_norm": 1.0853193998336792, + "learning_rate": 3.747e-05, + "loss": 0.3227, + "step": 7497 + }, + { + "epoch": 0.41986784634337554, + "grad_norm": 1.2998113632202148, + "learning_rate": 3.7475e-05, + "loss": 0.4718, + "step": 7498 + }, + { + "epoch": 0.41992384365550456, + "grad_norm": 1.1960525512695312, + "learning_rate": 3.748000000000001e-05, + "loss": 0.4201, + "step": 7499 + }, + { + "epoch": 0.4199798409676336, + "grad_norm": 1.2740609645843506, + "learning_rate": 3.7485000000000004e-05, + "loss": 0.459, + "step": 7500 + }, + { + "epoch": 0.4200358382797626, + "grad_norm": 1.428773045539856, + "learning_rate": 3.749e-05, + "loss": 0.5769, + "step": 7501 + }, + { + "epoch": 0.4200918355918916, + "grad_norm": 1.283211350440979, + "learning_rate": 3.7495e-05, + "loss": 0.3808, + "step": 7502 + }, + { + "epoch": 0.4201478329040206, + "grad_norm": 1.177197813987732, + "learning_rate": 3.7500000000000003e-05, + "loss": 
0.4485, + "step": 7503 + }, + { + "epoch": 0.4202038302161496, + "grad_norm": 1.2770239114761353, + "learning_rate": 3.7505e-05, + "loss": 0.4191, + "step": 7504 + }, + { + "epoch": 0.4202598275282786, + "grad_norm": 1.404529333114624, + "learning_rate": 3.751e-05, + "loss": 0.4719, + "step": 7505 + }, + { + "epoch": 0.42031582484040764, + "grad_norm": 1.4240243434906006, + "learning_rate": 3.7515e-05, + "loss": 0.7294, + "step": 7506 + }, + { + "epoch": 0.42037182215253666, + "grad_norm": 1.0826585292816162, + "learning_rate": 3.752e-05, + "loss": 0.339, + "step": 7507 + }, + { + "epoch": 0.4204278194646657, + "grad_norm": 1.2525144815444946, + "learning_rate": 3.7525e-05, + "loss": 0.4028, + "step": 7508 + }, + { + "epoch": 0.4204838167767947, + "grad_norm": 1.1017965078353882, + "learning_rate": 3.753e-05, + "loss": 0.4051, + "step": 7509 + }, + { + "epoch": 0.4205398140889237, + "grad_norm": 1.0927131175994873, + "learning_rate": 3.7535000000000006e-05, + "loss": 0.3306, + "step": 7510 + }, + { + "epoch": 0.42059581140105273, + "grad_norm": 1.099252462387085, + "learning_rate": 3.754e-05, + "loss": 0.3469, + "step": 7511 + }, + { + "epoch": 0.42065180871318175, + "grad_norm": 1.1844866275787354, + "learning_rate": 3.7545e-05, + "loss": 0.3762, + "step": 7512 + }, + { + "epoch": 0.42070780602531077, + "grad_norm": 1.3952338695526123, + "learning_rate": 3.7550000000000005e-05, + "loss": 0.4501, + "step": 7513 + }, + { + "epoch": 0.4207638033374398, + "grad_norm": 1.1330647468566895, + "learning_rate": 3.7555e-05, + "loss": 0.4405, + "step": 7514 + }, + { + "epoch": 0.4208198006495688, + "grad_norm": 1.39003324508667, + "learning_rate": 3.756e-05, + "loss": 0.5793, + "step": 7515 + }, + { + "epoch": 0.42087579796169783, + "grad_norm": 1.1815036535263062, + "learning_rate": 3.7565e-05, + "loss": 0.5452, + "step": 7516 + }, + { + "epoch": 0.42093179527382685, + "grad_norm": 1.0811365842819214, + "learning_rate": 3.757e-05, + "loss": 0.396, + "step": 7517 + }, + { + 
"epoch": 0.42098779258595587, + "grad_norm": 1.2157245874404907, + "learning_rate": 3.7575e-05, + "loss": 0.3333, + "step": 7518 + }, + { + "epoch": 0.4210437898980849, + "grad_norm": 1.5133262872695923, + "learning_rate": 3.758e-05, + "loss": 0.5342, + "step": 7519 + }, + { + "epoch": 0.4210997872102139, + "grad_norm": 1.161711573600769, + "learning_rate": 3.758500000000001e-05, + "loss": 0.373, + "step": 7520 + }, + { + "epoch": 0.4211557845223429, + "grad_norm": 1.3511238098144531, + "learning_rate": 3.7590000000000004e-05, + "loss": 0.3456, + "step": 7521 + }, + { + "epoch": 0.42121178183447194, + "grad_norm": 1.7794889211654663, + "learning_rate": 3.7595e-05, + "loss": 0.503, + "step": 7522 + }, + { + "epoch": 0.42126777914660096, + "grad_norm": 1.5192196369171143, + "learning_rate": 3.76e-05, + "loss": 0.4161, + "step": 7523 + }, + { + "epoch": 0.42132377645873, + "grad_norm": 1.1321442127227783, + "learning_rate": 3.7605e-05, + "loss": 0.3586, + "step": 7524 + }, + { + "epoch": 0.421379773770859, + "grad_norm": 1.3278142213821411, + "learning_rate": 3.761e-05, + "loss": 0.4832, + "step": 7525 + }, + { + "epoch": 0.421435771082988, + "grad_norm": 1.2671815156936646, + "learning_rate": 3.7615e-05, + "loss": 0.4192, + "step": 7526 + }, + { + "epoch": 0.42149176839511704, + "grad_norm": 1.416520595550537, + "learning_rate": 3.762e-05, + "loss": 0.4862, + "step": 7527 + }, + { + "epoch": 0.42154776570724606, + "grad_norm": 1.217038631439209, + "learning_rate": 3.7625e-05, + "loss": 0.4498, + "step": 7528 + }, + { + "epoch": 0.4216037630193751, + "grad_norm": 2.6948342323303223, + "learning_rate": 3.7630000000000004e-05, + "loss": 0.6703, + "step": 7529 + }, + { + "epoch": 0.4216597603315041, + "grad_norm": 1.1527397632598877, + "learning_rate": 3.7635e-05, + "loss": 0.4959, + "step": 7530 + }, + { + "epoch": 0.4217157576436331, + "grad_norm": 1.5450860261917114, + "learning_rate": 3.7640000000000006e-05, + "loss": 0.4654, + "step": 7531 + }, + { + "epoch": 
0.42177175495576213, + "grad_norm": 1.394510269165039, + "learning_rate": 3.7645e-05, + "loss": 0.5322, + "step": 7532 + }, + { + "epoch": 0.42182775226789115, + "grad_norm": 1.3331291675567627, + "learning_rate": 3.765e-05, + "loss": 0.4591, + "step": 7533 + }, + { + "epoch": 0.4218837495800202, + "grad_norm": 1.3717551231384277, + "learning_rate": 3.7655000000000005e-05, + "loss": 0.5099, + "step": 7534 + }, + { + "epoch": 0.4219397468921492, + "grad_norm": 1.1739580631256104, + "learning_rate": 3.766e-05, + "loss": 0.3854, + "step": 7535 + }, + { + "epoch": 0.4219957442042782, + "grad_norm": 1.1824437379837036, + "learning_rate": 3.7665e-05, + "loss": 0.4561, + "step": 7536 + }, + { + "epoch": 0.42205174151640723, + "grad_norm": 1.332931637763977, + "learning_rate": 3.767e-05, + "loss": 0.3535, + "step": 7537 + }, + { + "epoch": 0.42210773882853625, + "grad_norm": 1.2487376928329468, + "learning_rate": 3.7675e-05, + "loss": 0.4024, + "step": 7538 + }, + { + "epoch": 0.42216373614066527, + "grad_norm": 1.5085293054580688, + "learning_rate": 3.7680000000000005e-05, + "loss": 0.3954, + "step": 7539 + }, + { + "epoch": 0.4222197334527943, + "grad_norm": 1.2047690153121948, + "learning_rate": 3.7685e-05, + "loss": 0.517, + "step": 7540 + }, + { + "epoch": 0.4222757307649233, + "grad_norm": 1.1977018117904663, + "learning_rate": 3.769e-05, + "loss": 0.3594, + "step": 7541 + }, + { + "epoch": 0.4223317280770523, + "grad_norm": 1.2525365352630615, + "learning_rate": 3.7695000000000004e-05, + "loss": 0.5402, + "step": 7542 + }, + { + "epoch": 0.42238772538918135, + "grad_norm": 4.599066734313965, + "learning_rate": 3.77e-05, + "loss": 0.3296, + "step": 7543 + }, + { + "epoch": 0.42244372270131036, + "grad_norm": 1.1863445043563843, + "learning_rate": 3.7705e-05, + "loss": 0.3454, + "step": 7544 + }, + { + "epoch": 0.42249972001343933, + "grad_norm": 1.478772521018982, + "learning_rate": 3.771e-05, + "loss": 0.5987, + "step": 7545 + }, + { + "epoch": 0.42255571732556835, 
+ "grad_norm": 1.3632503747940063, + "learning_rate": 3.7715e-05, + "loss": 0.4575, + "step": 7546 + }, + { + "epoch": 0.42261171463769737, + "grad_norm": 1.4847928285598755, + "learning_rate": 3.772e-05, + "loss": 0.3567, + "step": 7547 + }, + { + "epoch": 0.4226677119498264, + "grad_norm": 1.3456907272338867, + "learning_rate": 3.7725e-05, + "loss": 0.542, + "step": 7548 + }, + { + "epoch": 0.4227237092619554, + "grad_norm": 1.2731678485870361, + "learning_rate": 3.7730000000000006e-05, + "loss": 0.4282, + "step": 7549 + }, + { + "epoch": 0.4227797065740844, + "grad_norm": 4.894685745239258, + "learning_rate": 3.7735000000000004e-05, + "loss": 0.6533, + "step": 7550 + }, + { + "epoch": 0.42283570388621344, + "grad_norm": 1.2923585176467896, + "learning_rate": 3.774e-05, + "loss": 0.6542, + "step": 7551 + }, + { + "epoch": 0.42289170119834246, + "grad_norm": 1.7255858182907104, + "learning_rate": 3.7745000000000005e-05, + "loss": 0.5819, + "step": 7552 + }, + { + "epoch": 0.4229476985104715, + "grad_norm": 1.4177074432373047, + "learning_rate": 3.775e-05, + "loss": 0.5332, + "step": 7553 + }, + { + "epoch": 0.4230036958226005, + "grad_norm": 1.2697304487228394, + "learning_rate": 3.7755e-05, + "loss": 0.48, + "step": 7554 + }, + { + "epoch": 0.4230596931347295, + "grad_norm": 1.4944721460342407, + "learning_rate": 3.776e-05, + "loss": 0.6667, + "step": 7555 + }, + { + "epoch": 0.42311569044685854, + "grad_norm": 1.0259102582931519, + "learning_rate": 3.7765e-05, + "loss": 0.3976, + "step": 7556 + }, + { + "epoch": 0.42317168775898756, + "grad_norm": 1.1391164064407349, + "learning_rate": 3.777e-05, + "loss": 0.526, + "step": 7557 + }, + { + "epoch": 0.4232276850711166, + "grad_norm": 1.4234325885772705, + "learning_rate": 3.7775e-05, + "loss": 0.5159, + "step": 7558 + }, + { + "epoch": 0.4232836823832456, + "grad_norm": 1.290900468826294, + "learning_rate": 3.778000000000001e-05, + "loss": 0.4166, + "step": 7559 + }, + { + "epoch": 0.4233396796953746, + 
"grad_norm": 1.2310255765914917, + "learning_rate": 3.7785000000000005e-05, + "loss": 0.4356, + "step": 7560 + }, + { + "epoch": 0.42339567700750363, + "grad_norm": 1.2476540803909302, + "learning_rate": 3.779e-05, + "loss": 0.3219, + "step": 7561 + }, + { + "epoch": 0.42345167431963265, + "grad_norm": 1.3199870586395264, + "learning_rate": 3.7795e-05, + "loss": 0.487, + "step": 7562 + }, + { + "epoch": 0.4235076716317617, + "grad_norm": 1.2570263147354126, + "learning_rate": 3.7800000000000004e-05, + "loss": 0.4006, + "step": 7563 + }, + { + "epoch": 0.4235636689438907, + "grad_norm": 1.365960955619812, + "learning_rate": 3.7805e-05, + "loss": 0.4384, + "step": 7564 + }, + { + "epoch": 0.4236196662560197, + "grad_norm": 1.5694210529327393, + "learning_rate": 3.781e-05, + "loss": 0.528, + "step": 7565 + }, + { + "epoch": 0.42367566356814873, + "grad_norm": 1.0389361381530762, + "learning_rate": 3.7815e-05, + "loss": 0.3886, + "step": 7566 + }, + { + "epoch": 0.42373166088027775, + "grad_norm": Infinity, + "learning_rate": 3.7815e-05, + "loss": 0.4799, + "step": 7567 + }, + { + "epoch": 0.42378765819240677, + "grad_norm": 1.392661690711975, + "learning_rate": 3.782e-05, + "loss": 0.4247, + "step": 7568 + }, + { + "epoch": 0.4238436555045358, + "grad_norm": 1.1096845865249634, + "learning_rate": 3.7825e-05, + "loss": 0.354, + "step": 7569 + }, + { + "epoch": 0.4238996528166648, + "grad_norm": 1.1711372137069702, + "learning_rate": 3.783e-05, + "loss": 0.4522, + "step": 7570 + }, + { + "epoch": 0.4239556501287938, + "grad_norm": 1.2791259288787842, + "learning_rate": 3.7835000000000006e-05, + "loss": 0.3417, + "step": 7571 + }, + { + "epoch": 0.42401164744092285, + "grad_norm": 1.1541465520858765, + "learning_rate": 3.7840000000000004e-05, + "loss": 0.4978, + "step": 7572 + }, + { + "epoch": 0.42406764475305186, + "grad_norm": 1.3474169969558716, + "learning_rate": 3.7845e-05, + "loss": 0.4274, + "step": 7573 + }, + { + "epoch": 0.4241236420651809, + "grad_norm": 
1.2171330451965332, + "learning_rate": 3.7850000000000005e-05, + "loss": 0.4049, + "step": 7574 + }, + { + "epoch": 0.4241796393773099, + "grad_norm": 1.2262377738952637, + "learning_rate": 3.7855e-05, + "loss": 0.3458, + "step": 7575 + }, + { + "epoch": 0.4242356366894389, + "grad_norm": 2.331810474395752, + "learning_rate": 3.786e-05, + "loss": 0.4114, + "step": 7576 + }, + { + "epoch": 0.42429163400156794, + "grad_norm": 1.209799885749817, + "learning_rate": 3.7865e-05, + "loss": 0.4726, + "step": 7577 + }, + { + "epoch": 0.42434763131369696, + "grad_norm": 1.1870396137237549, + "learning_rate": 3.787e-05, + "loss": 0.4475, + "step": 7578 + }, + { + "epoch": 0.424403628625826, + "grad_norm": 1.3213865756988525, + "learning_rate": 3.7875e-05, + "loss": 0.4684, + "step": 7579 + }, + { + "epoch": 0.424459625937955, + "grad_norm": 1.2779390811920166, + "learning_rate": 3.788e-05, + "loss": 0.4014, + "step": 7580 + }, + { + "epoch": 0.424515623250084, + "grad_norm": 1.1260337829589844, + "learning_rate": 3.7885e-05, + "loss": 0.4165, + "step": 7581 + }, + { + "epoch": 0.42457162056221304, + "grad_norm": 1.3278111219406128, + "learning_rate": 3.7890000000000005e-05, + "loss": 0.4884, + "step": 7582 + }, + { + "epoch": 0.42462761787434206, + "grad_norm": 1.2361208200454712, + "learning_rate": 3.7895e-05, + "loss": 0.4114, + "step": 7583 + }, + { + "epoch": 0.4246836151864711, + "grad_norm": 1.1926347017288208, + "learning_rate": 3.79e-05, + "loss": 0.4686, + "step": 7584 + }, + { + "epoch": 0.4247396124986001, + "grad_norm": 1.308377981185913, + "learning_rate": 3.7905000000000004e-05, + "loss": 0.5798, + "step": 7585 + }, + { + "epoch": 0.42479560981072906, + "grad_norm": 1.328100562095642, + "learning_rate": 3.791e-05, + "loss": 0.4104, + "step": 7586 + }, + { + "epoch": 0.4248516071228581, + "grad_norm": 1.2501837015151978, + "learning_rate": 3.7915e-05, + "loss": 0.3796, + "step": 7587 + }, + { + "epoch": 0.4249076044349871, + "grad_norm": 1.293765664100647, + 
"learning_rate": 3.792e-05, + "loss": 0.3548, + "step": 7588 + }, + { + "epoch": 0.4249636017471161, + "grad_norm": 1.2599366903305054, + "learning_rate": 3.7925e-05, + "loss": 0.3834, + "step": 7589 + }, + { + "epoch": 0.42501959905924513, + "grad_norm": 1.2977622747421265, + "learning_rate": 3.7930000000000004e-05, + "loss": 0.5509, + "step": 7590 + }, + { + "epoch": 0.42507559637137415, + "grad_norm": 1.1092506647109985, + "learning_rate": 3.7935e-05, + "loss": 0.4144, + "step": 7591 + }, + { + "epoch": 0.4251315936835032, + "grad_norm": 1.2002649307250977, + "learning_rate": 3.7940000000000006e-05, + "loss": 0.3521, + "step": 7592 + }, + { + "epoch": 0.4251875909956322, + "grad_norm": 1.210794448852539, + "learning_rate": 3.7945000000000003e-05, + "loss": 0.3235, + "step": 7593 + }, + { + "epoch": 0.4252435883077612, + "grad_norm": 1.4287084341049194, + "learning_rate": 3.795e-05, + "loss": 0.4229, + "step": 7594 + }, + { + "epoch": 0.42529958561989023, + "grad_norm": 1.0724073648452759, + "learning_rate": 3.7955e-05, + "loss": 0.3281, + "step": 7595 + }, + { + "epoch": 0.42535558293201925, + "grad_norm": 1.2447272539138794, + "learning_rate": 3.796e-05, + "loss": 0.4698, + "step": 7596 + }, + { + "epoch": 0.42541158024414827, + "grad_norm": 1.1794023513793945, + "learning_rate": 3.7965e-05, + "loss": 0.4844, + "step": 7597 + }, + { + "epoch": 0.4254675775562773, + "grad_norm": 1.178626298904419, + "learning_rate": 3.797e-05, + "loss": 0.4751, + "step": 7598 + }, + { + "epoch": 0.4255235748684063, + "grad_norm": 1.1559756994247437, + "learning_rate": 3.7975e-05, + "loss": 0.3661, + "step": 7599 + }, + { + "epoch": 0.4255795721805353, + "grad_norm": 1.593385934829712, + "learning_rate": 3.7980000000000006e-05, + "loss": 0.6293, + "step": 7600 + }, + { + "epoch": 0.42563556949266435, + "grad_norm": 1.361419439315796, + "learning_rate": 3.7985e-05, + "loss": 0.5309, + "step": 7601 + }, + { + "epoch": 0.42569156680479336, + "grad_norm": 1.2820345163345337, + 
"learning_rate": 3.799e-05, + "loss": 0.5431, + "step": 7602 + }, + { + "epoch": 0.4257475641169224, + "grad_norm": 1.487322449684143, + "learning_rate": 3.7995000000000005e-05, + "loss": 0.548, + "step": 7603 + }, + { + "epoch": 0.4258035614290514, + "grad_norm": 1.1518892049789429, + "learning_rate": 3.8e-05, + "loss": 0.5031, + "step": 7604 + }, + { + "epoch": 0.4258595587411804, + "grad_norm": 1.4109389781951904, + "learning_rate": 3.8005e-05, + "loss": 0.5499, + "step": 7605 + }, + { + "epoch": 0.42591555605330944, + "grad_norm": 1.2256593704223633, + "learning_rate": 3.8010000000000004e-05, + "loss": 0.4154, + "step": 7606 + }, + { + "epoch": 0.42597155336543846, + "grad_norm": 1.0937228202819824, + "learning_rate": 3.8015e-05, + "loss": 0.3778, + "step": 7607 + }, + { + "epoch": 0.4260275506775675, + "grad_norm": 1.2796554565429688, + "learning_rate": 3.802e-05, + "loss": 0.4817, + "step": 7608 + }, + { + "epoch": 0.4260835479896965, + "grad_norm": 1.2420028448104858, + "learning_rate": 3.8025e-05, + "loss": 0.4238, + "step": 7609 + }, + { + "epoch": 0.4261395453018255, + "grad_norm": 1.066815733909607, + "learning_rate": 3.803000000000001e-05, + "loss": 0.4001, + "step": 7610 + }, + { + "epoch": 0.42619554261395454, + "grad_norm": 1.2149802446365356, + "learning_rate": 3.8035000000000004e-05, + "loss": 0.3767, + "step": 7611 + }, + { + "epoch": 0.42625153992608356, + "grad_norm": 1.2878392934799194, + "learning_rate": 3.804e-05, + "loss": 0.419, + "step": 7612 + }, + { + "epoch": 0.4263075372382126, + "grad_norm": 1.2985421419143677, + "learning_rate": 3.8045000000000006e-05, + "loss": 0.447, + "step": 7613 + }, + { + "epoch": 0.4263635345503416, + "grad_norm": 1.4540281295776367, + "learning_rate": 3.805e-05, + "loss": 0.3851, + "step": 7614 + }, + { + "epoch": 0.4264195318624706, + "grad_norm": 1.2379271984100342, + "learning_rate": 3.8055e-05, + "loss": 0.4092, + "step": 7615 + }, + { + "epoch": 0.42647552917459963, + "grad_norm": 1.2478660345077515, + 
"learning_rate": 3.806e-05, + "loss": 0.3936, + "step": 7616 + }, + { + "epoch": 0.42653152648672865, + "grad_norm": 1.6623990535736084, + "learning_rate": 3.8065e-05, + "loss": 0.5192, + "step": 7617 + }, + { + "epoch": 0.42658752379885767, + "grad_norm": 1.3103492259979248, + "learning_rate": 3.807e-05, + "loss": 0.4512, + "step": 7618 + }, + { + "epoch": 0.4266435211109867, + "grad_norm": 1.294898509979248, + "learning_rate": 3.8075e-05, + "loss": 0.4323, + "step": 7619 + }, + { + "epoch": 0.4266995184231157, + "grad_norm": 1.214297890663147, + "learning_rate": 3.808e-05, + "loss": 0.3929, + "step": 7620 + }, + { + "epoch": 0.42675551573524473, + "grad_norm": 1.2719229459762573, + "learning_rate": 3.8085000000000006e-05, + "loss": 0.4642, + "step": 7621 + }, + { + "epoch": 0.42681151304737375, + "grad_norm": 1.2533601522445679, + "learning_rate": 3.809e-05, + "loss": 0.3769, + "step": 7622 + }, + { + "epoch": 0.42686751035950277, + "grad_norm": 1.5504531860351562, + "learning_rate": 3.8095e-05, + "loss": 0.4822, + "step": 7623 + }, + { + "epoch": 0.4269235076716318, + "grad_norm": 1.2235283851623535, + "learning_rate": 3.8100000000000005e-05, + "loss": 0.3915, + "step": 7624 + }, + { + "epoch": 0.4269795049837608, + "grad_norm": 1.3170377016067505, + "learning_rate": 3.8105e-05, + "loss": 0.4689, + "step": 7625 + }, + { + "epoch": 0.4270355022958898, + "grad_norm": 1.1815876960754395, + "learning_rate": 3.811e-05, + "loss": 0.4741, + "step": 7626 + }, + { + "epoch": 0.4270914996080188, + "grad_norm": 1.4471092224121094, + "learning_rate": 3.8115000000000004e-05, + "loss": 0.5063, + "step": 7627 + }, + { + "epoch": 0.4271474969201478, + "grad_norm": 1.2353376150131226, + "learning_rate": 3.812e-05, + "loss": 0.4274, + "step": 7628 + }, + { + "epoch": 0.4272034942322768, + "grad_norm": 1.41990065574646, + "learning_rate": 3.8125e-05, + "loss": 0.4, + "step": 7629 + }, + { + "epoch": 0.42725949154440584, + "grad_norm": 1.7524484395980835, + "learning_rate": 
3.8129999999999996e-05, + "loss": 0.4488, + "step": 7630 + }, + { + "epoch": 0.42731548885653486, + "grad_norm": 1.1383867263793945, + "learning_rate": 3.813500000000001e-05, + "loss": 0.5019, + "step": 7631 + }, + { + "epoch": 0.4273714861686639, + "grad_norm": 1.3128741979599, + "learning_rate": 3.8140000000000004e-05, + "loss": 0.3711, + "step": 7632 + }, + { + "epoch": 0.4274274834807929, + "grad_norm": 1.2062790393829346, + "learning_rate": 3.8145e-05, + "loss": 0.4014, + "step": 7633 + }, + { + "epoch": 0.4274834807929219, + "grad_norm": 1.2138209342956543, + "learning_rate": 3.8150000000000006e-05, + "loss": 0.4114, + "step": 7634 + }, + { + "epoch": 0.42753947810505094, + "grad_norm": 1.4575783014297485, + "learning_rate": 3.8155e-05, + "loss": 0.6132, + "step": 7635 + }, + { + "epoch": 0.42759547541717996, + "grad_norm": 1.1320710182189941, + "learning_rate": 3.816e-05, + "loss": 0.3843, + "step": 7636 + }, + { + "epoch": 0.427651472729309, + "grad_norm": 1.2448700666427612, + "learning_rate": 3.8165e-05, + "loss": 0.5245, + "step": 7637 + }, + { + "epoch": 0.427707470041438, + "grad_norm": 0.9886170029640198, + "learning_rate": 3.817e-05, + "loss": 0.3921, + "step": 7638 + }, + { + "epoch": 0.427763467353567, + "grad_norm": 1.1423417329788208, + "learning_rate": 3.8175e-05, + "loss": 0.3704, + "step": 7639 + }, + { + "epoch": 0.42781946466569604, + "grad_norm": 1.307464838027954, + "learning_rate": 3.818e-05, + "loss": 0.4182, + "step": 7640 + }, + { + "epoch": 0.42787546197782506, + "grad_norm": 1.1473870277404785, + "learning_rate": 3.8185e-05, + "loss": 0.4207, + "step": 7641 + }, + { + "epoch": 0.4279314592899541, + "grad_norm": 1.1849149465560913, + "learning_rate": 3.8190000000000005e-05, + "loss": 0.526, + "step": 7642 + }, + { + "epoch": 0.4279874566020831, + "grad_norm": 1.3337353467941284, + "learning_rate": 3.8195e-05, + "loss": 0.3972, + "step": 7643 + }, + { + "epoch": 0.4280434539142121, + "grad_norm": 1.0717594623565674, + "learning_rate": 
3.82e-05, + "loss": 0.3909, + "step": 7644 + }, + { + "epoch": 0.42809945122634113, + "grad_norm": 1.5796343088150024, + "learning_rate": 3.8205000000000004e-05, + "loss": 0.6941, + "step": 7645 + }, + { + "epoch": 0.42815544853847015, + "grad_norm": 1.196513295173645, + "learning_rate": 3.821e-05, + "loss": 0.4088, + "step": 7646 + }, + { + "epoch": 0.42821144585059917, + "grad_norm": 1.0890085697174072, + "learning_rate": 3.8215e-05, + "loss": 0.4185, + "step": 7647 + }, + { + "epoch": 0.4282674431627282, + "grad_norm": 1.166717767715454, + "learning_rate": 3.822e-05, + "loss": 0.4052, + "step": 7648 + }, + { + "epoch": 0.4283234404748572, + "grad_norm": 1.1860620975494385, + "learning_rate": 3.8225e-05, + "loss": 0.5055, + "step": 7649 + }, + { + "epoch": 0.42837943778698623, + "grad_norm": 1.2802859544754028, + "learning_rate": 3.823e-05, + "loss": 0.3698, + "step": 7650 + }, + { + "epoch": 0.42843543509911525, + "grad_norm": 1.1276755332946777, + "learning_rate": 3.8235e-05, + "loss": 0.4438, + "step": 7651 + }, + { + "epoch": 0.42849143241124427, + "grad_norm": 1.3856489658355713, + "learning_rate": 3.8240000000000007e-05, + "loss": 0.5302, + "step": 7652 + }, + { + "epoch": 0.4285474297233733, + "grad_norm": 1.2472093105316162, + "learning_rate": 3.8245000000000004e-05, + "loss": 0.3277, + "step": 7653 + }, + { + "epoch": 0.4286034270355023, + "grad_norm": 1.3688454627990723, + "learning_rate": 3.825e-05, + "loss": 0.3755, + "step": 7654 + }, + { + "epoch": 0.4286594243476313, + "grad_norm": 1.4699273109436035, + "learning_rate": 3.8255e-05, + "loss": 0.4998, + "step": 7655 + }, + { + "epoch": 0.42871542165976034, + "grad_norm": 1.7625372409820557, + "learning_rate": 3.826e-05, + "loss": 0.6153, + "step": 7656 + }, + { + "epoch": 0.42877141897188936, + "grad_norm": 1.4056943655014038, + "learning_rate": 3.8265e-05, + "loss": 0.4559, + "step": 7657 + }, + { + "epoch": 0.4288274162840184, + "grad_norm": 1.6552726030349731, + "learning_rate": 3.827e-05, + 
"loss": 0.4928, + "step": 7658 + }, + { + "epoch": 0.4288834135961474, + "grad_norm": 1.1531238555908203, + "learning_rate": 3.8275e-05, + "loss": 0.3823, + "step": 7659 + }, + { + "epoch": 0.4289394109082764, + "grad_norm": 1.2474634647369385, + "learning_rate": 3.828e-05, + "loss": 0.4445, + "step": 7660 + }, + { + "epoch": 0.42899540822040544, + "grad_norm": 1.1210706233978271, + "learning_rate": 3.8285000000000004e-05, + "loss": 0.3965, + "step": 7661 + }, + { + "epoch": 0.42905140553253446, + "grad_norm": 1.180922508239746, + "learning_rate": 3.829e-05, + "loss": 0.3521, + "step": 7662 + }, + { + "epoch": 0.4291074028446635, + "grad_norm": 0.9694157838821411, + "learning_rate": 3.8295000000000005e-05, + "loss": 0.4278, + "step": 7663 + }, + { + "epoch": 0.4291634001567925, + "grad_norm": 1.468010663986206, + "learning_rate": 3.83e-05, + "loss": 0.5533, + "step": 7664 + }, + { + "epoch": 0.4292193974689215, + "grad_norm": 1.3029483556747437, + "learning_rate": 3.8305e-05, + "loss": 0.3681, + "step": 7665 + }, + { + "epoch": 0.42927539478105053, + "grad_norm": 1.5082733631134033, + "learning_rate": 3.8310000000000004e-05, + "loss": 0.4196, + "step": 7666 + }, + { + "epoch": 0.42933139209317955, + "grad_norm": 1.4620344638824463, + "learning_rate": 3.8315e-05, + "loss": 0.4516, + "step": 7667 + }, + { + "epoch": 0.4293873894053086, + "grad_norm": 1.3122128248214722, + "learning_rate": 3.832e-05, + "loss": 0.5609, + "step": 7668 + }, + { + "epoch": 0.42944338671743754, + "grad_norm": 1.352057695388794, + "learning_rate": 3.8324999999999996e-05, + "loss": 0.7109, + "step": 7669 + }, + { + "epoch": 0.42949938402956656, + "grad_norm": 1.085706353187561, + "learning_rate": 3.833e-05, + "loss": 0.3828, + "step": 7670 + }, + { + "epoch": 0.4295553813416956, + "grad_norm": 1.1122759580612183, + "learning_rate": 3.8335000000000005e-05, + "loss": 0.3223, + "step": 7671 + }, + { + "epoch": 0.4296113786538246, + "grad_norm": 1.310568928718567, + "learning_rate": 3.834e-05, + 
"loss": 0.3985, + "step": 7672 + }, + { + "epoch": 0.4296673759659536, + "grad_norm": 1.6231038570404053, + "learning_rate": 3.8345000000000006e-05, + "loss": 0.531, + "step": 7673 + }, + { + "epoch": 0.42972337327808263, + "grad_norm": 1.6763800382614136, + "learning_rate": 3.8350000000000004e-05, + "loss": 0.4758, + "step": 7674 + }, + { + "epoch": 0.42977937059021165, + "grad_norm": 1.440268874168396, + "learning_rate": 3.8355e-05, + "loss": 0.5195, + "step": 7675 + }, + { + "epoch": 0.42983536790234067, + "grad_norm": 1.206406593322754, + "learning_rate": 3.836e-05, + "loss": 0.3597, + "step": 7676 + }, + { + "epoch": 0.4298913652144697, + "grad_norm": 1.423871636390686, + "learning_rate": 3.8365e-05, + "loss": 0.3996, + "step": 7677 + }, + { + "epoch": 0.4299473625265987, + "grad_norm": 1.1362015008926392, + "learning_rate": 3.837e-05, + "loss": 0.3242, + "step": 7678 + }, + { + "epoch": 0.43000335983872773, + "grad_norm": 1.2650737762451172, + "learning_rate": 3.8375e-05, + "loss": 0.3938, + "step": 7679 + }, + { + "epoch": 0.43005935715085675, + "grad_norm": 1.2767328023910522, + "learning_rate": 3.838e-05, + "loss": 0.3629, + "step": 7680 + }, + { + "epoch": 0.43011535446298577, + "grad_norm": 1.2091460227966309, + "learning_rate": 3.8385000000000006e-05, + "loss": 0.3916, + "step": 7681 + }, + { + "epoch": 0.4301713517751148, + "grad_norm": 1.4345455169677734, + "learning_rate": 3.8390000000000003e-05, + "loss": 0.4058, + "step": 7682 + }, + { + "epoch": 0.4302273490872438, + "grad_norm": 1.095362663269043, + "learning_rate": 3.8395e-05, + "loss": 0.4278, + "step": 7683 + }, + { + "epoch": 0.4302833463993728, + "grad_norm": 1.3193087577819824, + "learning_rate": 3.8400000000000005e-05, + "loss": 0.3639, + "step": 7684 + }, + { + "epoch": 0.43033934371150184, + "grad_norm": 1.5311903953552246, + "learning_rate": 3.8405e-05, + "loss": 0.4734, + "step": 7685 + }, + { + "epoch": 0.43039534102363086, + "grad_norm": 1.3788670301437378, + "learning_rate": 
3.841e-05, + "loss": 0.4635, + "step": 7686 + }, + { + "epoch": 0.4304513383357599, + "grad_norm": 1.1755338907241821, + "learning_rate": 3.8415000000000004e-05, + "loss": 0.3556, + "step": 7687 + }, + { + "epoch": 0.4305073356478889, + "grad_norm": 1.3992962837219238, + "learning_rate": 3.842e-05, + "loss": 0.4594, + "step": 7688 + }, + { + "epoch": 0.4305633329600179, + "grad_norm": 1.4017950296401978, + "learning_rate": 3.8425e-05, + "loss": 0.4544, + "step": 7689 + }, + { + "epoch": 0.43061933027214694, + "grad_norm": 1.3413540124893188, + "learning_rate": 3.8429999999999996e-05, + "loss": 0.4211, + "step": 7690 + }, + { + "epoch": 0.43067532758427596, + "grad_norm": 1.6711502075195312, + "learning_rate": 3.843500000000001e-05, + "loss": 0.4686, + "step": 7691 + }, + { + "epoch": 0.430731324896405, + "grad_norm": 1.2665237188339233, + "learning_rate": 3.8440000000000005e-05, + "loss": 0.3576, + "step": 7692 + }, + { + "epoch": 0.430787322208534, + "grad_norm": 1.1450090408325195, + "learning_rate": 3.8445e-05, + "loss": 0.304, + "step": 7693 + }, + { + "epoch": 0.430843319520663, + "grad_norm": 1.2788779735565186, + "learning_rate": 3.845e-05, + "loss": 0.5303, + "step": 7694 + }, + { + "epoch": 0.43089931683279203, + "grad_norm": 1.3919241428375244, + "learning_rate": 3.8455000000000004e-05, + "loss": 0.5688, + "step": 7695 + }, + { + "epoch": 0.43095531414492105, + "grad_norm": 1.1815086603164673, + "learning_rate": 3.846e-05, + "loss": 0.4524, + "step": 7696 + }, + { + "epoch": 0.4310113114570501, + "grad_norm": 1.1565638780593872, + "learning_rate": 3.8465e-05, + "loss": 0.4198, + "step": 7697 + }, + { + "epoch": 0.4310673087691791, + "grad_norm": 1.2445493936538696, + "learning_rate": 3.847e-05, + "loss": 0.6976, + "step": 7698 + }, + { + "epoch": 0.4311233060813081, + "grad_norm": 1.1842223405838013, + "learning_rate": 3.8475e-05, + "loss": 0.4367, + "step": 7699 + }, + { + "epoch": 0.43117930339343713, + "grad_norm": 1.1668330430984497, + 
"learning_rate": 3.848e-05, + "loss": 0.386, + "step": 7700 + }, + { + "epoch": 0.43123530070556615, + "grad_norm": 1.2867923974990845, + "learning_rate": 3.8485e-05, + "loss": 0.4306, + "step": 7701 + }, + { + "epoch": 0.43129129801769517, + "grad_norm": 1.5501301288604736, + "learning_rate": 3.8490000000000006e-05, + "loss": 0.4631, + "step": 7702 + }, + { + "epoch": 0.4313472953298242, + "grad_norm": 1.4100300073623657, + "learning_rate": 3.8495e-05, + "loss": 0.5668, + "step": 7703 + }, + { + "epoch": 0.4314032926419532, + "grad_norm": 1.12372887134552, + "learning_rate": 3.85e-05, + "loss": 0.3659, + "step": 7704 + }, + { + "epoch": 0.4314592899540822, + "grad_norm": 1.3030691146850586, + "learning_rate": 3.8505000000000005e-05, + "loss": 0.353, + "step": 7705 + }, + { + "epoch": 0.43151528726621124, + "grad_norm": 1.2100201845169067, + "learning_rate": 3.851e-05, + "loss": 0.3685, + "step": 7706 + }, + { + "epoch": 0.43157128457834026, + "grad_norm": 1.2769105434417725, + "learning_rate": 3.8515e-05, + "loss": 0.4037, + "step": 7707 + }, + { + "epoch": 0.4316272818904693, + "grad_norm": 1.2264678478240967, + "learning_rate": 3.8520000000000004e-05, + "loss": 0.4961, + "step": 7708 + }, + { + "epoch": 0.4316832792025983, + "grad_norm": 1.1368542909622192, + "learning_rate": 3.8525e-05, + "loss": 0.5397, + "step": 7709 + }, + { + "epoch": 0.43173927651472727, + "grad_norm": 1.2379624843597412, + "learning_rate": 3.853e-05, + "loss": 0.4158, + "step": 7710 + }, + { + "epoch": 0.4317952738268563, + "grad_norm": 1.4977108240127563, + "learning_rate": 3.8535e-05, + "loss": 0.4232, + "step": 7711 + }, + { + "epoch": 0.4318512711389853, + "grad_norm": 1.2705695629119873, + "learning_rate": 3.854000000000001e-05, + "loss": 0.3418, + "step": 7712 + }, + { + "epoch": 0.4319072684511143, + "grad_norm": 1.5933589935302734, + "learning_rate": 3.8545000000000004e-05, + "loss": 0.4735, + "step": 7713 + }, + { + "epoch": 0.43196326576324334, + "grad_norm": 1.3652877807617188, 
+ "learning_rate": 3.855e-05, + "loss": 0.463, + "step": 7714 + }, + { + "epoch": 0.43201926307537236, + "grad_norm": 1.5132670402526855, + "learning_rate": 3.8555e-05, + "loss": 0.4214, + "step": 7715 + }, + { + "epoch": 0.4320752603875014, + "grad_norm": 1.5290735960006714, + "learning_rate": 3.8560000000000004e-05, + "loss": 0.4887, + "step": 7716 + }, + { + "epoch": 0.4321312576996304, + "grad_norm": 1.0978847742080688, + "learning_rate": 3.8565e-05, + "loss": 0.318, + "step": 7717 + }, + { + "epoch": 0.4321872550117594, + "grad_norm": 1.5986554622650146, + "learning_rate": 3.857e-05, + "loss": 0.3287, + "step": 7718 + }, + { + "epoch": 0.43224325232388844, + "grad_norm": 1.3401552438735962, + "learning_rate": 3.8575e-05, + "loss": 0.4553, + "step": 7719 + }, + { + "epoch": 0.43229924963601746, + "grad_norm": 1.586480975151062, + "learning_rate": 3.858e-05, + "loss": 0.5544, + "step": 7720 + }, + { + "epoch": 0.4323552469481465, + "grad_norm": 1.1275185346603394, + "learning_rate": 3.8585000000000004e-05, + "loss": 0.3601, + "step": 7721 + }, + { + "epoch": 0.4324112442602755, + "grad_norm": 1.3213917016983032, + "learning_rate": 3.859e-05, + "loss": 0.4303, + "step": 7722 + }, + { + "epoch": 0.4324672415724045, + "grad_norm": 1.2904232740402222, + "learning_rate": 3.8595000000000006e-05, + "loss": 0.4344, + "step": 7723 + }, + { + "epoch": 0.43252323888453353, + "grad_norm": 1.3175349235534668, + "learning_rate": 3.86e-05, + "loss": 0.492, + "step": 7724 + }, + { + "epoch": 0.43257923619666255, + "grad_norm": 1.0625821352005005, + "learning_rate": 3.8605e-05, + "loss": 0.3455, + "step": 7725 + }, + { + "epoch": 0.4326352335087916, + "grad_norm": 1.3834384679794312, + "learning_rate": 3.8610000000000005e-05, + "loss": 0.395, + "step": 7726 + }, + { + "epoch": 0.4326912308209206, + "grad_norm": 1.3488636016845703, + "learning_rate": 3.8615e-05, + "loss": 0.4571, + "step": 7727 + }, + { + "epoch": 0.4327472281330496, + "grad_norm": 1.7581381797790527, + 
"learning_rate": 3.862e-05, + "loss": 0.4489, + "step": 7728 + }, + { + "epoch": 0.43280322544517863, + "grad_norm": 1.182015299797058, + "learning_rate": 3.8625e-05, + "loss": 0.2721, + "step": 7729 + }, + { + "epoch": 0.43285922275730765, + "grad_norm": 1.2194050550460815, + "learning_rate": 3.863e-05, + "loss": 0.4309, + "step": 7730 + }, + { + "epoch": 0.43291522006943667, + "grad_norm": 1.1201328039169312, + "learning_rate": 3.8635000000000005e-05, + "loss": 0.3698, + "step": 7731 + }, + { + "epoch": 0.4329712173815657, + "grad_norm": 1.129077672958374, + "learning_rate": 3.864e-05, + "loss": 0.4306, + "step": 7732 + }, + { + "epoch": 0.4330272146936947, + "grad_norm": 1.4392932653427124, + "learning_rate": 3.8645e-05, + "loss": 0.4341, + "step": 7733 + }, + { + "epoch": 0.4330832120058237, + "grad_norm": 1.3381160497665405, + "learning_rate": 3.8650000000000004e-05, + "loss": 0.3719, + "step": 7734 + }, + { + "epoch": 0.43313920931795274, + "grad_norm": 1.306389570236206, + "learning_rate": 3.8655e-05, + "loss": 0.6106, + "step": 7735 + }, + { + "epoch": 0.43319520663008176, + "grad_norm": 1.388902187347412, + "learning_rate": 3.866e-05, + "loss": 0.8112, + "step": 7736 + }, + { + "epoch": 0.4332512039422108, + "grad_norm": 1.2088311910629272, + "learning_rate": 3.8665e-05, + "loss": 0.5203, + "step": 7737 + }, + { + "epoch": 0.4333072012543398, + "grad_norm": 1.1478239297866821, + "learning_rate": 3.867e-05, + "loss": 0.4369, + "step": 7738 + }, + { + "epoch": 0.4333631985664688, + "grad_norm": 1.2467337846755981, + "learning_rate": 3.8675e-05, + "loss": 0.5301, + "step": 7739 + }, + { + "epoch": 0.43341919587859784, + "grad_norm": 1.1686300039291382, + "learning_rate": 3.868e-05, + "loss": 0.395, + "step": 7740 + }, + { + "epoch": 0.43347519319072686, + "grad_norm": 1.4347566366195679, + "learning_rate": 3.8685000000000007e-05, + "loss": 0.4974, + "step": 7741 + }, + { + "epoch": 0.4335311905028559, + "grad_norm": 1.3105829954147339, + "learning_rate": 
3.8690000000000004e-05, + "loss": 0.433, + "step": 7742 + }, + { + "epoch": 0.4335871878149849, + "grad_norm": 1.149224042892456, + "learning_rate": 3.8695e-05, + "loss": 0.5755, + "step": 7743 + }, + { + "epoch": 0.4336431851271139, + "grad_norm": 1.3344298601150513, + "learning_rate": 3.8700000000000006e-05, + "loss": 0.5189, + "step": 7744 + }, + { + "epoch": 0.43369918243924294, + "grad_norm": 1.017473816871643, + "learning_rate": 3.8705e-05, + "loss": 0.3963, + "step": 7745 + }, + { + "epoch": 0.43375517975137196, + "grad_norm": 1.1155141592025757, + "learning_rate": 3.871e-05, + "loss": 0.389, + "step": 7746 + }, + { + "epoch": 0.433811177063501, + "grad_norm": 1.467545986175537, + "learning_rate": 3.8715000000000005e-05, + "loss": 0.4094, + "step": 7747 + }, + { + "epoch": 0.43386717437563, + "grad_norm": 1.255529522895813, + "learning_rate": 3.872e-05, + "loss": 0.4239, + "step": 7748 + }, + { + "epoch": 0.433923171687759, + "grad_norm": 1.2360903024673462, + "learning_rate": 3.8725e-05, + "loss": 0.4332, + "step": 7749 + }, + { + "epoch": 0.43397916899988803, + "grad_norm": 1.3143188953399658, + "learning_rate": 3.873e-05, + "loss": 0.4957, + "step": 7750 + }, + { + "epoch": 0.434035166312017, + "grad_norm": 1.3152927160263062, + "learning_rate": 3.873500000000001e-05, + "loss": 0.4215, + "step": 7751 + }, + { + "epoch": 0.434091163624146, + "grad_norm": 1.1224337816238403, + "learning_rate": 3.8740000000000005e-05, + "loss": 0.3767, + "step": 7752 + }, + { + "epoch": 0.43414716093627503, + "grad_norm": 1.1857683658599854, + "learning_rate": 3.8745e-05, + "loss": 0.4179, + "step": 7753 + }, + { + "epoch": 0.43420315824840405, + "grad_norm": 1.1691921949386597, + "learning_rate": 3.875e-05, + "loss": 0.3126, + "step": 7754 + }, + { + "epoch": 0.43425915556053307, + "grad_norm": 1.1144546270370483, + "learning_rate": 3.8755000000000004e-05, + "loss": 0.4183, + "step": 7755 + }, + { + "epoch": 0.4343151528726621, + "grad_norm": 1.328009009361267, + 
"learning_rate": 3.876e-05, + "loss": 0.5218, + "step": 7756 + }, + { + "epoch": 0.4343711501847911, + "grad_norm": 1.1573644876480103, + "learning_rate": 3.8765e-05, + "loss": 0.3924, + "step": 7757 + }, + { + "epoch": 0.43442714749692013, + "grad_norm": 1.4506826400756836, + "learning_rate": 3.877e-05, + "loss": 0.5673, + "step": 7758 + }, + { + "epoch": 0.43448314480904915, + "grad_norm": 1.173905611038208, + "learning_rate": 3.8775e-05, + "loss": 0.4158, + "step": 7759 + }, + { + "epoch": 0.43453914212117817, + "grad_norm": 1.3109742403030396, + "learning_rate": 3.878e-05, + "loss": 0.3942, + "step": 7760 + }, + { + "epoch": 0.4345951394333072, + "grad_norm": 1.3459542989730835, + "learning_rate": 3.8785e-05, + "loss": 0.4062, + "step": 7761 + }, + { + "epoch": 0.4346511367454362, + "grad_norm": 1.3858788013458252, + "learning_rate": 3.8790000000000006e-05, + "loss": 0.4552, + "step": 7762 + }, + { + "epoch": 0.4347071340575652, + "grad_norm": 1.2540960311889648, + "learning_rate": 3.8795000000000004e-05, + "loss": 0.5135, + "step": 7763 + }, + { + "epoch": 0.43476313136969424, + "grad_norm": 1.2480825185775757, + "learning_rate": 3.88e-05, + "loss": 0.5709, + "step": 7764 + }, + { + "epoch": 0.43481912868182326, + "grad_norm": 1.3746126890182495, + "learning_rate": 3.8805000000000005e-05, + "loss": 0.349, + "step": 7765 + }, + { + "epoch": 0.4348751259939523, + "grad_norm": 1.1591088771820068, + "learning_rate": 3.881e-05, + "loss": 0.3539, + "step": 7766 + }, + { + "epoch": 0.4349311233060813, + "grad_norm": 1.2983176708221436, + "learning_rate": 3.8815e-05, + "loss": 0.5045, + "step": 7767 + }, + { + "epoch": 0.4349871206182103, + "grad_norm": 1.2469282150268555, + "learning_rate": 3.882e-05, + "loss": 0.3675, + "step": 7768 + }, + { + "epoch": 0.43504311793033934, + "grad_norm": 1.2342485189437866, + "learning_rate": 3.8825e-05, + "loss": 0.4768, + "step": 7769 + }, + { + "epoch": 0.43509911524246836, + "grad_norm": 1.05726957321167, + "learning_rate": 
3.883e-05, + "loss": 0.3183, + "step": 7770 + }, + { + "epoch": 0.4351551125545974, + "grad_norm": 1.0779683589935303, + "learning_rate": 3.8835e-05, + "loss": 0.3627, + "step": 7771 + }, + { + "epoch": 0.4352111098667264, + "grad_norm": 1.2601139545440674, + "learning_rate": 3.884e-05, + "loss": 0.3713, + "step": 7772 + }, + { + "epoch": 0.4352671071788554, + "grad_norm": 1.1393673419952393, + "learning_rate": 3.8845000000000005e-05, + "loss": 0.3637, + "step": 7773 + }, + { + "epoch": 0.43532310449098444, + "grad_norm": 1.3326117992401123, + "learning_rate": 3.885e-05, + "loss": 0.311, + "step": 7774 + }, + { + "epoch": 0.43537910180311346, + "grad_norm": 1.242078423500061, + "learning_rate": 3.8855e-05, + "loss": 0.4623, + "step": 7775 + }, + { + "epoch": 0.4354350991152425, + "grad_norm": 1.4016425609588623, + "learning_rate": 3.8860000000000004e-05, + "loss": 0.3817, + "step": 7776 + }, + { + "epoch": 0.4354910964273715, + "grad_norm": 1.1242419481277466, + "learning_rate": 3.8865e-05, + "loss": 0.3917, + "step": 7777 + }, + { + "epoch": 0.4355470937395005, + "grad_norm": 1.3407025337219238, + "learning_rate": 3.887e-05, + "loss": 0.3847, + "step": 7778 + }, + { + "epoch": 0.43560309105162953, + "grad_norm": 1.1474888324737549, + "learning_rate": 3.8875e-05, + "loss": 0.3273, + "step": 7779 + }, + { + "epoch": 0.43565908836375855, + "grad_norm": 1.417089819908142, + "learning_rate": 3.888e-05, + "loss": 0.5501, + "step": 7780 + }, + { + "epoch": 0.43571508567588757, + "grad_norm": 1.3149428367614746, + "learning_rate": 3.8885e-05, + "loss": 0.6506, + "step": 7781 + }, + { + "epoch": 0.4357710829880166, + "grad_norm": 1.202264666557312, + "learning_rate": 3.889e-05, + "loss": 0.5165, + "step": 7782 + }, + { + "epoch": 0.4358270803001456, + "grad_norm": 1.2116482257843018, + "learning_rate": 3.8895000000000006e-05, + "loss": 0.4614, + "step": 7783 + }, + { + "epoch": 0.4358830776122746, + "grad_norm": 1.240391492843628, + "learning_rate": 3.8900000000000004e-05, 
+ "loss": 0.4139, + "step": 7784 + }, + { + "epoch": 0.43593907492440365, + "grad_norm": 1.3046212196350098, + "learning_rate": 3.8905e-05, + "loss": 0.3925, + "step": 7785 + }, + { + "epoch": 0.43599507223653267, + "grad_norm": 1.100469708442688, + "learning_rate": 3.8910000000000005e-05, + "loss": 0.452, + "step": 7786 + }, + { + "epoch": 0.4360510695486617, + "grad_norm": 1.89679753780365, + "learning_rate": 3.8915e-05, + "loss": 0.685, + "step": 7787 + }, + { + "epoch": 0.4361070668607907, + "grad_norm": 1.1426291465759277, + "learning_rate": 3.892e-05, + "loss": 0.4713, + "step": 7788 + }, + { + "epoch": 0.4361630641729197, + "grad_norm": 1.0942741632461548, + "learning_rate": 3.8925e-05, + "loss": 0.363, + "step": 7789 + }, + { + "epoch": 0.43621906148504874, + "grad_norm": 1.3065084218978882, + "learning_rate": 3.893e-05, + "loss": 0.435, + "step": 7790 + }, + { + "epoch": 0.43627505879717776, + "grad_norm": 1.1707557439804077, + "learning_rate": 3.8935e-05, + "loss": 0.476, + "step": 7791 + }, + { + "epoch": 0.4363310561093068, + "grad_norm": 1.4024662971496582, + "learning_rate": 3.894e-05, + "loss": 0.4503, + "step": 7792 + }, + { + "epoch": 0.43638705342143574, + "grad_norm": 1.111179232597351, + "learning_rate": 3.8945e-05, + "loss": 0.4734, + "step": 7793 + }, + { + "epoch": 0.43644305073356476, + "grad_norm": 1.2108148336410522, + "learning_rate": 3.8950000000000005e-05, + "loss": 0.4128, + "step": 7794 + }, + { + "epoch": 0.4364990480456938, + "grad_norm": 1.275070071220398, + "learning_rate": 3.8955e-05, + "loss": 0.3925, + "step": 7795 + }, + { + "epoch": 0.4365550453578228, + "grad_norm": 1.3724416494369507, + "learning_rate": 3.896e-05, + "loss": 0.4165, + "step": 7796 + }, + { + "epoch": 0.4366110426699518, + "grad_norm": 1.3684978485107422, + "learning_rate": 3.8965000000000004e-05, + "loss": 0.4433, + "step": 7797 + }, + { + "epoch": 0.43666703998208084, + "grad_norm": 1.1701958179473877, + "learning_rate": 3.897e-05, + "loss": 0.4411, + 
"step": 7798 + }, + { + "epoch": 0.43672303729420986, + "grad_norm": 1.102452278137207, + "learning_rate": 3.8975e-05, + "loss": 0.392, + "step": 7799 + }, + { + "epoch": 0.4367790346063389, + "grad_norm": 1.1435978412628174, + "learning_rate": 3.898e-05, + "loss": 0.3787, + "step": 7800 + }, + { + "epoch": 0.4368350319184679, + "grad_norm": 1.4434677362442017, + "learning_rate": 3.8985e-05, + "loss": 0.5624, + "step": 7801 + }, + { + "epoch": 0.4368910292305969, + "grad_norm": 1.0614144802093506, + "learning_rate": 3.8990000000000004e-05, + "loss": 0.3437, + "step": 7802 + }, + { + "epoch": 0.43694702654272594, + "grad_norm": 1.2822693586349487, + "learning_rate": 3.8995e-05, + "loss": 0.4794, + "step": 7803 + }, + { + "epoch": 0.43700302385485496, + "grad_norm": 1.189255714416504, + "learning_rate": 3.9000000000000006e-05, + "loss": 0.3455, + "step": 7804 + }, + { + "epoch": 0.437059021166984, + "grad_norm": 1.1027987003326416, + "learning_rate": 3.9005000000000003e-05, + "loss": 0.5032, + "step": 7805 + }, + { + "epoch": 0.437115018479113, + "grad_norm": 1.2029386758804321, + "learning_rate": 3.901e-05, + "loss": 0.3261, + "step": 7806 + }, + { + "epoch": 0.437171015791242, + "grad_norm": 1.1270817518234253, + "learning_rate": 3.9015e-05, + "loss": 0.393, + "step": 7807 + }, + { + "epoch": 0.43722701310337103, + "grad_norm": 1.2903618812561035, + "learning_rate": 3.902e-05, + "loss": 0.3469, + "step": 7808 + }, + { + "epoch": 0.43728301041550005, + "grad_norm": 1.3289403915405273, + "learning_rate": 3.9025e-05, + "loss": 0.4608, + "step": 7809 + }, + { + "epoch": 0.43733900772762907, + "grad_norm": 1.1192606687545776, + "learning_rate": 3.903e-05, + "loss": 0.4093, + "step": 7810 + }, + { + "epoch": 0.4373950050397581, + "grad_norm": 1.3133131265640259, + "learning_rate": 3.9035e-05, + "loss": 0.3993, + "step": 7811 + }, + { + "epoch": 0.4374510023518871, + "grad_norm": 1.2583736181259155, + "learning_rate": 3.9040000000000006e-05, + "loss": 0.4404, + "step": 
7812 + }, + { + "epoch": 0.4375069996640161, + "grad_norm": 0.9704046845436096, + "learning_rate": 3.9045e-05, + "loss": 0.3489, + "step": 7813 + }, + { + "epoch": 0.43756299697614515, + "grad_norm": 1.8509451150894165, + "learning_rate": 3.905e-05, + "loss": 0.4609, + "step": 7814 + }, + { + "epoch": 0.43761899428827417, + "grad_norm": 1.298453688621521, + "learning_rate": 3.9055000000000005e-05, + "loss": 0.4883, + "step": 7815 + }, + { + "epoch": 0.4376749916004032, + "grad_norm": 1.572828769683838, + "learning_rate": 3.906e-05, + "loss": 0.665, + "step": 7816 + }, + { + "epoch": 0.4377309889125322, + "grad_norm": 1.1312446594238281, + "learning_rate": 3.9065e-05, + "loss": 0.368, + "step": 7817 + }, + { + "epoch": 0.4377869862246612, + "grad_norm": 1.2823508977890015, + "learning_rate": 3.9070000000000004e-05, + "loss": 0.4562, + "step": 7818 + }, + { + "epoch": 0.43784298353679024, + "grad_norm": 1.3399925231933594, + "learning_rate": 3.9075e-05, + "loss": 0.4222, + "step": 7819 + }, + { + "epoch": 0.43789898084891926, + "grad_norm": 1.2265499830245972, + "learning_rate": 3.908e-05, + "loss": 0.3734, + "step": 7820 + }, + { + "epoch": 0.4379549781610483, + "grad_norm": 1.4076920747756958, + "learning_rate": 3.9085e-05, + "loss": 0.3937, + "step": 7821 + }, + { + "epoch": 0.4380109754731773, + "grad_norm": 1.2663395404815674, + "learning_rate": 3.909000000000001e-05, + "loss": 0.3566, + "step": 7822 + }, + { + "epoch": 0.4380669727853063, + "grad_norm": 1.4407652616500854, + "learning_rate": 3.9095000000000004e-05, + "loss": 0.4386, + "step": 7823 + }, + { + "epoch": 0.43812297009743534, + "grad_norm": 1.1476681232452393, + "learning_rate": 3.91e-05, + "loss": 0.3476, + "step": 7824 + }, + { + "epoch": 0.43817896740956436, + "grad_norm": 1.0374912023544312, + "learning_rate": 3.9105000000000006e-05, + "loss": 0.3338, + "step": 7825 + }, + { + "epoch": 0.4382349647216934, + "grad_norm": 1.0831481218338013, + "learning_rate": 3.911e-05, + "loss": 0.4214, + 
"step": 7826 + }, + { + "epoch": 0.4382909620338224, + "grad_norm": 1.1083035469055176, + "learning_rate": 3.9115e-05, + "loss": 0.318, + "step": 7827 + }, + { + "epoch": 0.4383469593459514, + "grad_norm": 1.2883092164993286, + "learning_rate": 3.912e-05, + "loss": 0.4401, + "step": 7828 + }, + { + "epoch": 0.43840295665808043, + "grad_norm": 1.313269019126892, + "learning_rate": 3.9125e-05, + "loss": 0.4767, + "step": 7829 + }, + { + "epoch": 0.43845895397020945, + "grad_norm": 1.169537901878357, + "learning_rate": 3.913e-05, + "loss": 0.3928, + "step": 7830 + }, + { + "epoch": 0.43851495128233847, + "grad_norm": 1.2053309679031372, + "learning_rate": 3.9135e-05, + "loss": 0.4535, + "step": 7831 + }, + { + "epoch": 0.4385709485944675, + "grad_norm": 1.3004990816116333, + "learning_rate": 3.914e-05, + "loss": 0.4291, + "step": 7832 + }, + { + "epoch": 0.4386269459065965, + "grad_norm": 1.3423177003860474, + "learning_rate": 3.9145000000000006e-05, + "loss": 0.3814, + "step": 7833 + }, + { + "epoch": 0.4386829432187255, + "grad_norm": 1.397086262702942, + "learning_rate": 3.915e-05, + "loss": 0.423, + "step": 7834 + }, + { + "epoch": 0.4387389405308545, + "grad_norm": 1.1477322578430176, + "learning_rate": 3.9155e-05, + "loss": 0.451, + "step": 7835 + }, + { + "epoch": 0.4387949378429835, + "grad_norm": 1.1389315128326416, + "learning_rate": 3.9160000000000005e-05, + "loss": 0.4046, + "step": 7836 + }, + { + "epoch": 0.43885093515511253, + "grad_norm": 1.3055751323699951, + "learning_rate": 3.9165e-05, + "loss": 0.4471, + "step": 7837 + }, + { + "epoch": 0.43890693246724155, + "grad_norm": 1.1227158308029175, + "learning_rate": 3.917e-05, + "loss": 0.3757, + "step": 7838 + }, + { + "epoch": 0.43896292977937057, + "grad_norm": 1.2383346557617188, + "learning_rate": 3.9175000000000004e-05, + "loss": 0.4505, + "step": 7839 + }, + { + "epoch": 0.4390189270914996, + "grad_norm": 1.3493623733520508, + "learning_rate": 3.918e-05, + "loss": 0.5136, + "step": 7840 + }, + { + 
"epoch": 0.4390749244036286, + "grad_norm": 1.250669240951538, + "learning_rate": 3.9185e-05, + "loss": 0.2715, + "step": 7841 + }, + { + "epoch": 0.4391309217157576, + "grad_norm": 1.2727452516555786, + "learning_rate": 3.919e-05, + "loss": 0.4297, + "step": 7842 + }, + { + "epoch": 0.43918691902788665, + "grad_norm": 1.1866668462753296, + "learning_rate": 3.919500000000001e-05, + "loss": 0.3604, + "step": 7843 + }, + { + "epoch": 0.43924291634001567, + "grad_norm": 1.1657792329788208, + "learning_rate": 3.9200000000000004e-05, + "loss": 0.3871, + "step": 7844 + }, + { + "epoch": 0.4392989136521447, + "grad_norm": 1.3171136379241943, + "learning_rate": 3.9205e-05, + "loss": 0.4378, + "step": 7845 + }, + { + "epoch": 0.4393549109642737, + "grad_norm": 1.1899813413619995, + "learning_rate": 3.921e-05, + "loss": 0.3927, + "step": 7846 + }, + { + "epoch": 0.4394109082764027, + "grad_norm": 1.1432589292526245, + "learning_rate": 3.9215e-05, + "loss": 0.3734, + "step": 7847 + }, + { + "epoch": 0.43946690558853174, + "grad_norm": 1.1378377676010132, + "learning_rate": 3.922e-05, + "loss": 0.3443, + "step": 7848 + }, + { + "epoch": 0.43952290290066076, + "grad_norm": 1.4463274478912354, + "learning_rate": 3.9225e-05, + "loss": 0.5706, + "step": 7849 + }, + { + "epoch": 0.4395789002127898, + "grad_norm": 1.6434000730514526, + "learning_rate": 3.923e-05, + "loss": 0.5424, + "step": 7850 + }, + { + "epoch": 0.4396348975249188, + "grad_norm": 1.0726593732833862, + "learning_rate": 3.9235e-05, + "loss": 0.3867, + "step": 7851 + }, + { + "epoch": 0.4396908948370478, + "grad_norm": 1.233354926109314, + "learning_rate": 3.9240000000000004e-05, + "loss": 0.4477, + "step": 7852 + }, + { + "epoch": 0.43974689214917684, + "grad_norm": 1.4947172403335571, + "learning_rate": 3.9245e-05, + "loss": 0.3989, + "step": 7853 + }, + { + "epoch": 0.43980288946130586, + "grad_norm": 1.1785705089569092, + "learning_rate": 3.9250000000000005e-05, + "loss": 0.343, + "step": 7854 + }, + { + 
"epoch": 0.4398588867734349, + "grad_norm": 1.3784432411193848, + "learning_rate": 3.9255e-05, + "loss": 0.4166, + "step": 7855 + }, + { + "epoch": 0.4399148840855639, + "grad_norm": 1.8765945434570312, + "learning_rate": 3.926e-05, + "loss": 0.4711, + "step": 7856 + }, + { + "epoch": 0.4399708813976929, + "grad_norm": 1.1780389547348022, + "learning_rate": 3.9265000000000004e-05, + "loss": 0.437, + "step": 7857 + }, + { + "epoch": 0.44002687870982193, + "grad_norm": 1.1090939044952393, + "learning_rate": 3.927e-05, + "loss": 0.3735, + "step": 7858 + }, + { + "epoch": 0.44008287602195095, + "grad_norm": 1.3783758878707886, + "learning_rate": 3.9275e-05, + "loss": 0.521, + "step": 7859 + }, + { + "epoch": 0.44013887333407997, + "grad_norm": 1.1683554649353027, + "learning_rate": 3.9280000000000003e-05, + "loss": 0.4162, + "step": 7860 + }, + { + "epoch": 0.440194870646209, + "grad_norm": 1.0943423509597778, + "learning_rate": 3.9285e-05, + "loss": 0.5115, + "step": 7861 + }, + { + "epoch": 0.440250867958338, + "grad_norm": 1.2970889806747437, + "learning_rate": 3.9290000000000005e-05, + "loss": 0.5817, + "step": 7862 + }, + { + "epoch": 0.44030686527046703, + "grad_norm": 1.1418607234954834, + "learning_rate": 3.9295e-05, + "loss": 0.3782, + "step": 7863 + }, + { + "epoch": 0.44036286258259605, + "grad_norm": 1.0735609531402588, + "learning_rate": 3.9300000000000007e-05, + "loss": 0.3981, + "step": 7864 + }, + { + "epoch": 0.44041885989472507, + "grad_norm": 1.6328279972076416, + "learning_rate": 3.9305000000000004e-05, + "loss": 0.4539, + "step": 7865 + }, + { + "epoch": 0.4404748572068541, + "grad_norm": 1.295376181602478, + "learning_rate": 3.931e-05, + "loss": 0.46, + "step": 7866 + }, + { + "epoch": 0.4405308545189831, + "grad_norm": 1.2095136642456055, + "learning_rate": 3.9315e-05, + "loss": 0.3505, + "step": 7867 + }, + { + "epoch": 0.4405868518311121, + "grad_norm": 1.2632331848144531, + "learning_rate": 3.932e-05, + "loss": 0.5245, + "step": 7868 + }, + { 
+ "epoch": 0.44064284914324114, + "grad_norm": 1.3711323738098145, + "learning_rate": 3.9325e-05, + "loss": 0.4808, + "step": 7869 + }, + { + "epoch": 0.44069884645537016, + "grad_norm": 1.4462391138076782, + "learning_rate": 3.933e-05, + "loss": 0.4866, + "step": 7870 + }, + { + "epoch": 0.4407548437674992, + "grad_norm": 1.1605569124221802, + "learning_rate": 3.9335e-05, + "loss": 0.4799, + "step": 7871 + }, + { + "epoch": 0.4408108410796282, + "grad_norm": 1.3611857891082764, + "learning_rate": 3.9340000000000006e-05, + "loss": 0.314, + "step": 7872 + }, + { + "epoch": 0.4408668383917572, + "grad_norm": 1.3676567077636719, + "learning_rate": 3.9345000000000004e-05, + "loss": 0.4342, + "step": 7873 + }, + { + "epoch": 0.44092283570388624, + "grad_norm": 1.602771282196045, + "learning_rate": 3.935e-05, + "loss": 0.5213, + "step": 7874 + }, + { + "epoch": 0.4409788330160152, + "grad_norm": 1.1253604888916016, + "learning_rate": 3.9355000000000005e-05, + "loss": 0.3175, + "step": 7875 + }, + { + "epoch": 0.4410348303281442, + "grad_norm": 1.1032730340957642, + "learning_rate": 3.936e-05, + "loss": 0.3832, + "step": 7876 + }, + { + "epoch": 0.44109082764027324, + "grad_norm": 1.2083414793014526, + "learning_rate": 3.9365e-05, + "loss": 0.321, + "step": 7877 + }, + { + "epoch": 0.44114682495240226, + "grad_norm": 1.108339548110962, + "learning_rate": 3.9370000000000004e-05, + "loss": 0.3668, + "step": 7878 + }, + { + "epoch": 0.4412028222645313, + "grad_norm": 1.3832827806472778, + "learning_rate": 3.9375e-05, + "loss": 0.3733, + "step": 7879 + }, + { + "epoch": 0.4412588195766603, + "grad_norm": 1.0996882915496826, + "learning_rate": 3.938e-05, + "loss": 0.365, + "step": 7880 + }, + { + "epoch": 0.4413148168887893, + "grad_norm": 1.3138083219528198, + "learning_rate": 3.9384999999999996e-05, + "loss": 0.46, + "step": 7881 + }, + { + "epoch": 0.44137081420091834, + "grad_norm": 2.8446733951568604, + "learning_rate": 3.939e-05, + "loss": 0.4353, + "step": 7882 + }, + { 
+ "epoch": 0.44142681151304736, + "grad_norm": 1.1881431341171265, + "learning_rate": 3.9395000000000005e-05, + "loss": 0.4411, + "step": 7883 + }, + { + "epoch": 0.4414828088251764, + "grad_norm": 1.2280796766281128, + "learning_rate": 3.94e-05, + "loss": 0.568, + "step": 7884 + }, + { + "epoch": 0.4415388061373054, + "grad_norm": 1.5344942808151245, + "learning_rate": 3.9405e-05, + "loss": 0.381, + "step": 7885 + }, + { + "epoch": 0.4415948034494344, + "grad_norm": 1.349732756614685, + "learning_rate": 3.9410000000000004e-05, + "loss": 0.4975, + "step": 7886 + }, + { + "epoch": 0.44165080076156343, + "grad_norm": 1.2575490474700928, + "learning_rate": 3.9415e-05, + "loss": 0.4531, + "step": 7887 + }, + { + "epoch": 0.44170679807369245, + "grad_norm": 1.1180615425109863, + "learning_rate": 3.942e-05, + "loss": 0.4269, + "step": 7888 + }, + { + "epoch": 0.44176279538582147, + "grad_norm": 1.3497393131256104, + "learning_rate": 3.9425e-05, + "loss": 0.5359, + "step": 7889 + }, + { + "epoch": 0.4418187926979505, + "grad_norm": 1.1241097450256348, + "learning_rate": 3.943e-05, + "loss": 0.463, + "step": 7890 + }, + { + "epoch": 0.4418747900100795, + "grad_norm": 1.443230390548706, + "learning_rate": 3.9435e-05, + "loss": 0.6691, + "step": 7891 + }, + { + "epoch": 0.44193078732220853, + "grad_norm": 1.1160078048706055, + "learning_rate": 3.944e-05, + "loss": 0.3997, + "step": 7892 + }, + { + "epoch": 0.44198678463433755, + "grad_norm": 1.2423415184020996, + "learning_rate": 3.9445000000000006e-05, + "loss": 0.4094, + "step": 7893 + }, + { + "epoch": 0.44204278194646657, + "grad_norm": 1.3356132507324219, + "learning_rate": 3.9450000000000003e-05, + "loss": 0.4605, + "step": 7894 + }, + { + "epoch": 0.4420987792585956, + "grad_norm": 1.4842458963394165, + "learning_rate": 3.9455e-05, + "loss": 0.4993, + "step": 7895 + }, + { + "epoch": 0.4421547765707246, + "grad_norm": 1.5475319623947144, + "learning_rate": 3.9460000000000005e-05, + "loss": 0.3421, + "step": 7896 + }, 
+ { + "epoch": 0.4422107738828536, + "grad_norm": 1.338922381401062, + "learning_rate": 3.9465e-05, + "loss": 0.4591, + "step": 7897 + }, + { + "epoch": 0.44226677119498264, + "grad_norm": 1.264709234237671, + "learning_rate": 3.947e-05, + "loss": 0.5858, + "step": 7898 + }, + { + "epoch": 0.44232276850711166, + "grad_norm": 1.154921293258667, + "learning_rate": 3.9475000000000004e-05, + "loss": 0.4254, + "step": 7899 + }, + { + "epoch": 0.4423787658192407, + "grad_norm": 1.3502569198608398, + "learning_rate": 3.948e-05, + "loss": 0.4737, + "step": 7900 + }, + { + "epoch": 0.4424347631313697, + "grad_norm": 1.159049153327942, + "learning_rate": 3.9485e-05, + "loss": 0.4563, + "step": 7901 + }, + { + "epoch": 0.4424907604434987, + "grad_norm": 1.2016639709472656, + "learning_rate": 3.9489999999999996e-05, + "loss": 0.5038, + "step": 7902 + }, + { + "epoch": 0.44254675775562774, + "grad_norm": 1.406581997871399, + "learning_rate": 3.949500000000001e-05, + "loss": 0.5906, + "step": 7903 + }, + { + "epoch": 0.44260275506775676, + "grad_norm": 1.1897046566009521, + "learning_rate": 3.9500000000000005e-05, + "loss": 0.4028, + "step": 7904 + }, + { + "epoch": 0.4426587523798858, + "grad_norm": 1.1853468418121338, + "learning_rate": 3.9505e-05, + "loss": 0.3767, + "step": 7905 + }, + { + "epoch": 0.4427147496920148, + "grad_norm": 1.3921821117401123, + "learning_rate": 3.951e-05, + "loss": 0.4843, + "step": 7906 + }, + { + "epoch": 0.4427707470041438, + "grad_norm": 1.3302898406982422, + "learning_rate": 3.9515000000000004e-05, + "loss": 0.4561, + "step": 7907 + }, + { + "epoch": 0.44282674431627284, + "grad_norm": 1.3814650774002075, + "learning_rate": 3.952e-05, + "loss": 0.4598, + "step": 7908 + }, + { + "epoch": 0.44288274162840185, + "grad_norm": 1.281061053276062, + "learning_rate": 3.9525e-05, + "loss": 0.4592, + "step": 7909 + }, + { + "epoch": 0.4429387389405309, + "grad_norm": 1.0565319061279297, + "learning_rate": 3.953e-05, + "loss": 0.3522, + "step": 7910 + }, 
+ { + "epoch": 0.4429947362526599, + "grad_norm": 1.2696032524108887, + "learning_rate": 3.9535e-05, + "loss": 0.4059, + "step": 7911 + }, + { + "epoch": 0.4430507335647889, + "grad_norm": 1.0670174360275269, + "learning_rate": 3.954e-05, + "loss": 0.3732, + "step": 7912 + }, + { + "epoch": 0.44310673087691793, + "grad_norm": 1.371474027633667, + "learning_rate": 3.9545e-05, + "loss": 0.4391, + "step": 7913 + }, + { + "epoch": 0.44316272818904695, + "grad_norm": 1.1317119598388672, + "learning_rate": 3.9550000000000006e-05, + "loss": 0.3645, + "step": 7914 + }, + { + "epoch": 0.44321872550117597, + "grad_norm": 1.0873996019363403, + "learning_rate": 3.9555e-05, + "loss": 0.4618, + "step": 7915 + }, + { + "epoch": 0.443274722813305, + "grad_norm": 1.397986888885498, + "learning_rate": 3.956e-05, + "loss": 0.539, + "step": 7916 + }, + { + "epoch": 0.44333072012543395, + "grad_norm": 1.3856488466262817, + "learning_rate": 3.9565000000000005e-05, + "loss": 0.6002, + "step": 7917 + }, + { + "epoch": 0.44338671743756297, + "grad_norm": 1.1605008840560913, + "learning_rate": 3.957e-05, + "loss": 0.3676, + "step": 7918 + }, + { + "epoch": 0.443442714749692, + "grad_norm": 1.2629355192184448, + "learning_rate": 3.9575e-05, + "loss": 0.3796, + "step": 7919 + }, + { + "epoch": 0.443498712061821, + "grad_norm": 1.28914213180542, + "learning_rate": 3.958e-05, + "loss": 0.4858, + "step": 7920 + }, + { + "epoch": 0.44355470937395003, + "grad_norm": 1.1475467681884766, + "learning_rate": 3.9585e-05, + "loss": 0.3757, + "step": 7921 + }, + { + "epoch": 0.44361070668607905, + "grad_norm": 1.3344305753707886, + "learning_rate": 3.959e-05, + "loss": 0.5062, + "step": 7922 + }, + { + "epoch": 0.44366670399820807, + "grad_norm": 1.1459726095199585, + "learning_rate": 3.9595e-05, + "loss": 0.3602, + "step": 7923 + }, + { + "epoch": 0.4437227013103371, + "grad_norm": 1.5019686222076416, + "learning_rate": 3.960000000000001e-05, + "loss": 0.4824, + "step": 7924 + }, + { + "epoch": 
0.4437786986224661, + "grad_norm": 1.6637812852859497, + "learning_rate": 3.9605000000000005e-05, + "loss": 0.5459, + "step": 7925 + }, + { + "epoch": 0.4438346959345951, + "grad_norm": 1.2453186511993408, + "learning_rate": 3.961e-05, + "loss": 0.3356, + "step": 7926 + }, + { + "epoch": 0.44389069324672414, + "grad_norm": 1.4240748882293701, + "learning_rate": 3.9615e-05, + "loss": 0.3513, + "step": 7927 + }, + { + "epoch": 0.44394669055885316, + "grad_norm": 1.5543899536132812, + "learning_rate": 3.9620000000000004e-05, + "loss": 0.4535, + "step": 7928 + }, + { + "epoch": 0.4440026878709822, + "grad_norm": 1.4476829767227173, + "learning_rate": 3.9625e-05, + "loss": 0.4958, + "step": 7929 + }, + { + "epoch": 0.4440586851831112, + "grad_norm": 1.2234768867492676, + "learning_rate": 3.963e-05, + "loss": 0.3737, + "step": 7930 + }, + { + "epoch": 0.4441146824952402, + "grad_norm": 1.4079090356826782, + "learning_rate": 3.9635e-05, + "loss": 0.4697, + "step": 7931 + }, + { + "epoch": 0.44417067980736924, + "grad_norm": 1.09403657913208, + "learning_rate": 3.964e-05, + "loss": 0.3721, + "step": 7932 + }, + { + "epoch": 0.44422667711949826, + "grad_norm": 1.1749703884124756, + "learning_rate": 3.9645000000000004e-05, + "loss": 0.3767, + "step": 7933 + }, + { + "epoch": 0.4442826744316273, + "grad_norm": 1.252769112586975, + "learning_rate": 3.965e-05, + "loss": 0.3794, + "step": 7934 + }, + { + "epoch": 0.4443386717437563, + "grad_norm": 1.4599592685699463, + "learning_rate": 3.9655000000000006e-05, + "loss": 0.394, + "step": 7935 + }, + { + "epoch": 0.4443946690558853, + "grad_norm": 1.324910044670105, + "learning_rate": 3.966e-05, + "loss": 0.4233, + "step": 7936 + }, + { + "epoch": 0.44445066636801434, + "grad_norm": 1.1552929878234863, + "learning_rate": 3.9665e-05, + "loss": 0.482, + "step": 7937 + }, + { + "epoch": 0.44450666368014335, + "grad_norm": 1.4286192655563354, + "learning_rate": 3.9670000000000005e-05, + "loss": 0.3971, + "step": 7938 + }, + { + 
"epoch": 0.4445626609922724, + "grad_norm": 1.4314066171646118, + "learning_rate": 3.9675e-05, + "loss": 0.5845, + "step": 7939 + }, + { + "epoch": 0.4446186583044014, + "grad_norm": 1.1849724054336548, + "learning_rate": 3.968e-05, + "loss": 0.3866, + "step": 7940 + }, + { + "epoch": 0.4446746556165304, + "grad_norm": 1.1440964937210083, + "learning_rate": 3.9685e-05, + "loss": 0.5633, + "step": 7941 + }, + { + "epoch": 0.44473065292865943, + "grad_norm": 1.0037082433700562, + "learning_rate": 3.969e-05, + "loss": 0.3313, + "step": 7942 + }, + { + "epoch": 0.44478665024078845, + "grad_norm": 1.0694080591201782, + "learning_rate": 3.9695000000000005e-05, + "loss": 0.4466, + "step": 7943 + }, + { + "epoch": 0.44484264755291747, + "grad_norm": 1.5107334852218628, + "learning_rate": 3.97e-05, + "loss": 0.5122, + "step": 7944 + }, + { + "epoch": 0.4448986448650465, + "grad_norm": 1.397767424583435, + "learning_rate": 3.9705e-05, + "loss": 0.4112, + "step": 7945 + }, + { + "epoch": 0.4449546421771755, + "grad_norm": 1.3833537101745605, + "learning_rate": 3.9710000000000004e-05, + "loss": 0.3841, + "step": 7946 + }, + { + "epoch": 0.4450106394893045, + "grad_norm": 1.301817536354065, + "learning_rate": 3.9715e-05, + "loss": 0.5478, + "step": 7947 + }, + { + "epoch": 0.44506663680143355, + "grad_norm": 1.5289353132247925, + "learning_rate": 3.972e-05, + "loss": 0.4269, + "step": 7948 + }, + { + "epoch": 0.44512263411356257, + "grad_norm": 1.3700004816055298, + "learning_rate": 3.9725e-05, + "loss": 0.5458, + "step": 7949 + }, + { + "epoch": 0.4451786314256916, + "grad_norm": 1.2713686227798462, + "learning_rate": 3.973e-05, + "loss": 0.4822, + "step": 7950 + }, + { + "epoch": 0.4452346287378206, + "grad_norm": 1.1312288045883179, + "learning_rate": 3.9735e-05, + "loss": 0.3152, + "step": 7951 + }, + { + "epoch": 0.4452906260499496, + "grad_norm": 1.3867393732070923, + "learning_rate": 3.974e-05, + "loss": 0.3717, + "step": 7952 + }, + { + "epoch": 0.44534662336207864, + 
"grad_norm": 1.2697700262069702, + "learning_rate": 3.9745000000000007e-05, + "loss": 0.659, + "step": 7953 + }, + { + "epoch": 0.44540262067420766, + "grad_norm": 1.2105894088745117, + "learning_rate": 3.9750000000000004e-05, + "loss": 0.4468, + "step": 7954 + }, + { + "epoch": 0.4454586179863367, + "grad_norm": 1.2118644714355469, + "learning_rate": 3.9755e-05, + "loss": 0.3919, + "step": 7955 + }, + { + "epoch": 0.4455146152984657, + "grad_norm": 1.1268454790115356, + "learning_rate": 3.9760000000000006e-05, + "loss": 0.3192, + "step": 7956 + }, + { + "epoch": 0.4455706126105947, + "grad_norm": 1.1013085842132568, + "learning_rate": 3.9765e-05, + "loss": 0.3714, + "step": 7957 + }, + { + "epoch": 0.4456266099227237, + "grad_norm": 1.3739217519760132, + "learning_rate": 3.977e-05, + "loss": 0.4496, + "step": 7958 + }, + { + "epoch": 0.4456826072348527, + "grad_norm": 0.9961612224578857, + "learning_rate": 3.9775e-05, + "loss": 0.3866, + "step": 7959 + }, + { + "epoch": 0.4457386045469817, + "grad_norm": 1.3658822774887085, + "learning_rate": 3.978e-05, + "loss": 0.4917, + "step": 7960 + }, + { + "epoch": 0.44579460185911074, + "grad_norm": 1.3278608322143555, + "learning_rate": 3.9785e-05, + "loss": 0.3447, + "step": 7961 + }, + { + "epoch": 0.44585059917123976, + "grad_norm": 1.2886418104171753, + "learning_rate": 3.979e-05, + "loss": 0.4889, + "step": 7962 + }, + { + "epoch": 0.4459065964833688, + "grad_norm": 1.2320998907089233, + "learning_rate": 3.979500000000001e-05, + "loss": 0.4547, + "step": 7963 + }, + { + "epoch": 0.4459625937954978, + "grad_norm": 1.3106369972229004, + "learning_rate": 3.9800000000000005e-05, + "loss": 0.4884, + "step": 7964 + }, + { + "epoch": 0.4460185911076268, + "grad_norm": 1.3171268701553345, + "learning_rate": 3.9805e-05, + "loss": 0.2768, + "step": 7965 + }, + { + "epoch": 0.44607458841975584, + "grad_norm": 1.265672206878662, + "learning_rate": 3.981e-05, + "loss": 0.5514, + "step": 7966 + }, + { + "epoch": 
0.44613058573188485, + "grad_norm": 1.2978721857070923, + "learning_rate": 3.9815000000000004e-05, + "loss": 0.4264, + "step": 7967 + }, + { + "epoch": 0.4461865830440139, + "grad_norm": 1.543387770652771, + "learning_rate": 3.982e-05, + "loss": 0.5642, + "step": 7968 + }, + { + "epoch": 0.4462425803561429, + "grad_norm": 1.3579707145690918, + "learning_rate": 3.9825e-05, + "loss": 0.3669, + "step": 7969 + }, + { + "epoch": 0.4462985776682719, + "grad_norm": 1.495080828666687, + "learning_rate": 3.983e-05, + "loss": 0.4153, + "step": 7970 + }, + { + "epoch": 0.44635457498040093, + "grad_norm": 1.2408615350723267, + "learning_rate": 3.9835e-05, + "loss": 0.4271, + "step": 7971 + }, + { + "epoch": 0.44641057229252995, + "grad_norm": 1.266782283782959, + "learning_rate": 3.984e-05, + "loss": 0.3797, + "step": 7972 + }, + { + "epoch": 0.44646656960465897, + "grad_norm": 1.8094933032989502, + "learning_rate": 3.9845e-05, + "loss": 0.582, + "step": 7973 + }, + { + "epoch": 0.446522566916788, + "grad_norm": 1.3468682765960693, + "learning_rate": 3.9850000000000006e-05, + "loss": 0.473, + "step": 7974 + }, + { + "epoch": 0.446578564228917, + "grad_norm": 1.3554326295852661, + "learning_rate": 3.9855000000000004e-05, + "loss": 0.5442, + "step": 7975 + }, + { + "epoch": 0.446634561541046, + "grad_norm": 1.2305324077606201, + "learning_rate": 3.986e-05, + "loss": 0.4173, + "step": 7976 + }, + { + "epoch": 0.44669055885317505, + "grad_norm": 0.9923909902572632, + "learning_rate": 3.9865000000000005e-05, + "loss": 0.3104, + "step": 7977 + }, + { + "epoch": 0.44674655616530407, + "grad_norm": 1.1111879348754883, + "learning_rate": 3.987e-05, + "loss": 0.4633, + "step": 7978 + }, + { + "epoch": 0.4468025534774331, + "grad_norm": 1.341869592666626, + "learning_rate": 3.9875e-05, + "loss": 0.3594, + "step": 7979 + }, + { + "epoch": 0.4468585507895621, + "grad_norm": 1.3253010511398315, + "learning_rate": 3.988e-05, + "loss": 0.5988, + "step": 7980 + }, + { + "epoch": 
0.4469145481016911, + "grad_norm": 1.386620044708252, + "learning_rate": 3.9885e-05, + "loss": 0.4883, + "step": 7981 + }, + { + "epoch": 0.44697054541382014, + "grad_norm": 1.2293016910552979, + "learning_rate": 3.989e-05, + "loss": 0.4162, + "step": 7982 + }, + { + "epoch": 0.44702654272594916, + "grad_norm": 1.0457274913787842, + "learning_rate": 3.9895000000000003e-05, + "loss": 0.3897, + "step": 7983 + }, + { + "epoch": 0.4470825400380782, + "grad_norm": 1.4893736839294434, + "learning_rate": 3.99e-05, + "loss": 0.4356, + "step": 7984 + }, + { + "epoch": 0.4471385373502072, + "grad_norm": 1.1966415643692017, + "learning_rate": 3.9905000000000005e-05, + "loss": 0.3838, + "step": 7985 + }, + { + "epoch": 0.4471945346623362, + "grad_norm": 1.4571969509124756, + "learning_rate": 3.991e-05, + "loss": 0.5493, + "step": 7986 + }, + { + "epoch": 0.44725053197446524, + "grad_norm": 1.3030469417572021, + "learning_rate": 3.9915e-05, + "loss": 0.6022, + "step": 7987 + }, + { + "epoch": 0.44730652928659426, + "grad_norm": 1.2906101942062378, + "learning_rate": 3.9920000000000004e-05, + "loss": 0.6794, + "step": 7988 + }, + { + "epoch": 0.4473625265987233, + "grad_norm": 1.244630217552185, + "learning_rate": 3.9925e-05, + "loss": 0.3808, + "step": 7989 + }, + { + "epoch": 0.4474185239108523, + "grad_norm": 1.330586314201355, + "learning_rate": 3.993e-05, + "loss": 0.5, + "step": 7990 + }, + { + "epoch": 0.4474745212229813, + "grad_norm": 1.4680848121643066, + "learning_rate": 3.9935e-05, + "loss": 0.4274, + "step": 7991 + }, + { + "epoch": 0.44753051853511033, + "grad_norm": 1.2660781145095825, + "learning_rate": 3.994e-05, + "loss": 0.4114, + "step": 7992 + }, + { + "epoch": 0.44758651584723935, + "grad_norm": 1.614398717880249, + "learning_rate": 3.9945000000000005e-05, + "loss": 0.5062, + "step": 7993 + }, + { + "epoch": 0.44764251315936837, + "grad_norm": 1.0764151811599731, + "learning_rate": 3.995e-05, + "loss": 0.3746, + "step": 7994 + }, + { + "epoch": 
0.4476985104714974, + "grad_norm": 1.3683723211288452, + "learning_rate": 3.9955000000000006e-05, + "loss": 0.5567, + "step": 7995 + }, + { + "epoch": 0.4477545077836264, + "grad_norm": 1.0107738971710205, + "learning_rate": 3.9960000000000004e-05, + "loss": 0.3768, + "step": 7996 + }, + { + "epoch": 0.44781050509575543, + "grad_norm": 1.3045953512191772, + "learning_rate": 3.9965e-05, + "loss": 0.5264, + "step": 7997 + }, + { + "epoch": 0.44786650240788445, + "grad_norm": 1.2756482362747192, + "learning_rate": 3.9970000000000005e-05, + "loss": 0.4646, + "step": 7998 + }, + { + "epoch": 0.4479224997200134, + "grad_norm": 1.2149758338928223, + "learning_rate": 3.9975e-05, + "loss": 0.4615, + "step": 7999 + }, + { + "epoch": 0.44797849703214243, + "grad_norm": 1.2461477518081665, + "learning_rate": 3.998e-05, + "loss": 0.4821, + "step": 8000 + }, + { + "epoch": 0.44803449434427145, + "grad_norm": 1.3961334228515625, + "learning_rate": 3.9985e-05, + "loss": 0.4495, + "step": 8001 + }, + { + "epoch": 0.44809049165640047, + "grad_norm": 1.2145496606826782, + "learning_rate": 3.999e-05, + "loss": 0.3813, + "step": 8002 + }, + { + "epoch": 0.4481464889685295, + "grad_norm": 1.1947134733200073, + "learning_rate": 3.9995000000000006e-05, + "loss": 0.4029, + "step": 8003 + }, + { + "epoch": 0.4482024862806585, + "grad_norm": 1.1587896347045898, + "learning_rate": 4e-05, + "loss": 0.4203, + "step": 8004 + }, + { + "epoch": 0.4482584835927875, + "grad_norm": 1.4524353742599487, + "learning_rate": 4.0005e-05, + "loss": 0.5276, + "step": 8005 + }, + { + "epoch": 0.44831448090491655, + "grad_norm": 1.0707982778549194, + "learning_rate": 4.0010000000000005e-05, + "loss": 0.3752, + "step": 8006 + }, + { + "epoch": 0.44837047821704556, + "grad_norm": 1.0258558988571167, + "learning_rate": 4.0015e-05, + "loss": 0.3766, + "step": 8007 + }, + { + "epoch": 0.4484264755291746, + "grad_norm": 1.292534589767456, + "learning_rate": 4.002e-05, + "loss": 0.4509, + "step": 8008 + }, + { + 
"epoch": 0.4484824728413036, + "grad_norm": 1.4285045862197876, + "learning_rate": 4.0025000000000004e-05, + "loss": 0.4806, + "step": 8009 + }, + { + "epoch": 0.4485384701534326, + "grad_norm": 1.1258915662765503, + "learning_rate": 4.003e-05, + "loss": 0.3962, + "step": 8010 + }, + { + "epoch": 0.44859446746556164, + "grad_norm": 1.215669870376587, + "learning_rate": 4.0035e-05, + "loss": 0.5454, + "step": 8011 + }, + { + "epoch": 0.44865046477769066, + "grad_norm": 1.133365273475647, + "learning_rate": 4.004e-05, + "loss": 0.4184, + "step": 8012 + }, + { + "epoch": 0.4487064620898197, + "grad_norm": 1.6083720922470093, + "learning_rate": 4.0045e-05, + "loss": 0.4896, + "step": 8013 + }, + { + "epoch": 0.4487624594019487, + "grad_norm": 1.1899293661117554, + "learning_rate": 4.0050000000000004e-05, + "loss": 0.4205, + "step": 8014 + }, + { + "epoch": 0.4488184567140777, + "grad_norm": 1.310274600982666, + "learning_rate": 4.0055e-05, + "loss": 0.4943, + "step": 8015 + }, + { + "epoch": 0.44887445402620674, + "grad_norm": 1.2298719882965088, + "learning_rate": 4.0060000000000006e-05, + "loss": 0.3656, + "step": 8016 + }, + { + "epoch": 0.44893045133833576, + "grad_norm": 1.258176565170288, + "learning_rate": 4.0065000000000003e-05, + "loss": 0.3348, + "step": 8017 + }, + { + "epoch": 0.4489864486504648, + "grad_norm": 1.1516302824020386, + "learning_rate": 4.007e-05, + "loss": 0.5148, + "step": 8018 + }, + { + "epoch": 0.4490424459625938, + "grad_norm": 1.500391960144043, + "learning_rate": 4.0075e-05, + "loss": 0.4206, + "step": 8019 + }, + { + "epoch": 0.4490984432747228, + "grad_norm": 1.087298035621643, + "learning_rate": 4.008e-05, + "loss": 0.3701, + "step": 8020 + }, + { + "epoch": 0.44915444058685183, + "grad_norm": 1.2705739736557007, + "learning_rate": 4.0085e-05, + "loss": 0.4524, + "step": 8021 + }, + { + "epoch": 0.44921043789898085, + "grad_norm": 1.171284794807434, + "learning_rate": 4.009e-05, + "loss": 0.3866, + "step": 8022 + }, + { + "epoch": 
0.44926643521110987, + "grad_norm": 1.157090425491333, + "learning_rate": 4.0095e-05, + "loss": 0.4116, + "step": 8023 + }, + { + "epoch": 0.4493224325232389, + "grad_norm": 1.1388421058654785, + "learning_rate": 4.0100000000000006e-05, + "loss": 0.4194, + "step": 8024 + }, + { + "epoch": 0.4493784298353679, + "grad_norm": 1.231940746307373, + "learning_rate": 4.0105e-05, + "loss": 0.3605, + "step": 8025 + }, + { + "epoch": 0.44943442714749693, + "grad_norm": 1.280762791633606, + "learning_rate": 4.011e-05, + "loss": 0.5177, + "step": 8026 + }, + { + "epoch": 0.44949042445962595, + "grad_norm": 1.1017980575561523, + "learning_rate": 4.0115000000000005e-05, + "loss": 0.3673, + "step": 8027 + }, + { + "epoch": 0.44954642177175497, + "grad_norm": 1.2504467964172363, + "learning_rate": 4.012e-05, + "loss": 0.512, + "step": 8028 + }, + { + "epoch": 0.449602419083884, + "grad_norm": 1.2567552328109741, + "learning_rate": 4.0125e-05, + "loss": 0.3979, + "step": 8029 + }, + { + "epoch": 0.449658416396013, + "grad_norm": 1.2547028064727783, + "learning_rate": 4.0130000000000004e-05, + "loss": 0.429, + "step": 8030 + }, + { + "epoch": 0.449714413708142, + "grad_norm": 1.2034847736358643, + "learning_rate": 4.0135e-05, + "loss": 0.4543, + "step": 8031 + }, + { + "epoch": 0.44977041102027104, + "grad_norm": 1.3718706369400024, + "learning_rate": 4.014e-05, + "loss": 0.4695, + "step": 8032 + }, + { + "epoch": 0.44982640833240006, + "grad_norm": 1.1253634691238403, + "learning_rate": 4.0144999999999996e-05, + "loss": 0.4362, + "step": 8033 + }, + { + "epoch": 0.4498824056445291, + "grad_norm": 1.0990970134735107, + "learning_rate": 4.015000000000001e-05, + "loss": 0.5066, + "step": 8034 + }, + { + "epoch": 0.4499384029566581, + "grad_norm": 1.5450721979141235, + "learning_rate": 4.0155000000000004e-05, + "loss": 0.4642, + "step": 8035 + }, + { + "epoch": 0.4499944002687871, + "grad_norm": 1.2929738759994507, + "learning_rate": 4.016e-05, + "loss": 0.5573, + "step": 8036 + }, + { 
+ "epoch": 0.45005039758091614, + "grad_norm": 1.633539080619812, + "learning_rate": 4.0165000000000006e-05, + "loss": 0.4439, + "step": 8037 + }, + { + "epoch": 0.45010639489304516, + "grad_norm": 1.2135896682739258, + "learning_rate": 4.017e-05, + "loss": 0.3989, + "step": 8038 + }, + { + "epoch": 0.4501623922051742, + "grad_norm": 1.394967794418335, + "learning_rate": 4.0175e-05, + "loss": 0.4351, + "step": 8039 + }, + { + "epoch": 0.4502183895173032, + "grad_norm": 1.177072525024414, + "learning_rate": 4.018e-05, + "loss": 0.4132, + "step": 8040 + }, + { + "epoch": 0.45027438682943216, + "grad_norm": 1.293129563331604, + "learning_rate": 4.0185e-05, + "loss": 0.3174, + "step": 8041 + }, + { + "epoch": 0.4503303841415612, + "grad_norm": 1.3859515190124512, + "learning_rate": 4.019e-05, + "loss": 0.4402, + "step": 8042 + }, + { + "epoch": 0.4503863814536902, + "grad_norm": 1.4350817203521729, + "learning_rate": 4.0195e-05, + "loss": 0.4825, + "step": 8043 + }, + { + "epoch": 0.4504423787658192, + "grad_norm": 1.1298954486846924, + "learning_rate": 4.02e-05, + "loss": 0.4487, + "step": 8044 + }, + { + "epoch": 0.45049837607794824, + "grad_norm": 1.2463808059692383, + "learning_rate": 4.0205000000000006e-05, + "loss": 0.3645, + "step": 8045 + }, + { + "epoch": 0.45055437339007726, + "grad_norm": 1.2132930755615234, + "learning_rate": 4.021e-05, + "loss": 0.3821, + "step": 8046 + }, + { + "epoch": 0.4506103707022063, + "grad_norm": 1.4917255640029907, + "learning_rate": 4.0215e-05, + "loss": 0.5085, + "step": 8047 + }, + { + "epoch": 0.4506663680143353, + "grad_norm": 1.0278327465057373, + "learning_rate": 4.0220000000000005e-05, + "loss": 0.4188, + "step": 8048 + }, + { + "epoch": 0.4507223653264643, + "grad_norm": 1.4655405282974243, + "learning_rate": 4.0225e-05, + "loss": 0.5233, + "step": 8049 + }, + { + "epoch": 0.45077836263859333, + "grad_norm": 1.2491194009780884, + "learning_rate": 4.023e-05, + "loss": 0.4767, + "step": 8050 + }, + { + "epoch": 
0.45083435995072235, + "grad_norm": 1.330393671989441, + "learning_rate": 4.0235000000000004e-05, + "loss": 0.4438, + "step": 8051 + }, + { + "epoch": 0.45089035726285137, + "grad_norm": 1.231075406074524, + "learning_rate": 4.024e-05, + "loss": 0.4787, + "step": 8052 + }, + { + "epoch": 0.4509463545749804, + "grad_norm": 1.1820240020751953, + "learning_rate": 4.0245e-05, + "loss": 0.4129, + "step": 8053 + }, + { + "epoch": 0.4510023518871094, + "grad_norm": 1.0467157363891602, + "learning_rate": 4.025e-05, + "loss": 0.3787, + "step": 8054 + }, + { + "epoch": 0.45105834919923843, + "grad_norm": 1.2204055786132812, + "learning_rate": 4.025500000000001e-05, + "loss": 0.3706, + "step": 8055 + }, + { + "epoch": 0.45111434651136745, + "grad_norm": 1.5015779733657837, + "learning_rate": 4.0260000000000004e-05, + "loss": 0.4293, + "step": 8056 + }, + { + "epoch": 0.45117034382349647, + "grad_norm": 1.1499462127685547, + "learning_rate": 4.0265e-05, + "loss": 0.4028, + "step": 8057 + }, + { + "epoch": 0.4512263411356255, + "grad_norm": 1.630139708518982, + "learning_rate": 4.027e-05, + "loss": 0.4948, + "step": 8058 + }, + { + "epoch": 0.4512823384477545, + "grad_norm": 1.1332985162734985, + "learning_rate": 4.0275e-05, + "loss": 0.3303, + "step": 8059 + }, + { + "epoch": 0.4513383357598835, + "grad_norm": 1.9683868885040283, + "learning_rate": 4.028e-05, + "loss": 0.4903, + "step": 8060 + }, + { + "epoch": 0.45139433307201254, + "grad_norm": 1.0710465908050537, + "learning_rate": 4.0285e-05, + "loss": 0.3492, + "step": 8061 + }, + { + "epoch": 0.45145033038414156, + "grad_norm": 1.514875888824463, + "learning_rate": 4.029e-05, + "loss": 0.4255, + "step": 8062 + }, + { + "epoch": 0.4515063276962706, + "grad_norm": 1.4187906980514526, + "learning_rate": 4.0295e-05, + "loss": 0.4634, + "step": 8063 + }, + { + "epoch": 0.4515623250083996, + "grad_norm": 1.4659744501113892, + "learning_rate": 4.0300000000000004e-05, + "loss": 0.4806, + "step": 8064 + }, + { + "epoch": 
0.4516183223205286, + "grad_norm": 1.1748239994049072, + "learning_rate": 4.0305e-05, + "loss": 0.3964, + "step": 8065 + }, + { + "epoch": 0.45167431963265764, + "grad_norm": 1.0754516124725342, + "learning_rate": 4.0310000000000005e-05, + "loss": 0.394, + "step": 8066 + }, + { + "epoch": 0.45173031694478666, + "grad_norm": 1.1976890563964844, + "learning_rate": 4.0315e-05, + "loss": 0.3775, + "step": 8067 + }, + { + "epoch": 0.4517863142569157, + "grad_norm": 1.5069286823272705, + "learning_rate": 4.032e-05, + "loss": 0.6658, + "step": 8068 + }, + { + "epoch": 0.4518423115690447, + "grad_norm": 1.6875029802322388, + "learning_rate": 4.0325000000000004e-05, + "loss": 0.4667, + "step": 8069 + }, + { + "epoch": 0.4518983088811737, + "grad_norm": 1.1968203783035278, + "learning_rate": 4.033e-05, + "loss": 0.4361, + "step": 8070 + }, + { + "epoch": 0.45195430619330274, + "grad_norm": 1.481873869895935, + "learning_rate": 4.0335e-05, + "loss": 0.4308, + "step": 8071 + }, + { + "epoch": 0.45201030350543175, + "grad_norm": 1.190264344215393, + "learning_rate": 4.034e-05, + "loss": 0.4504, + "step": 8072 + }, + { + "epoch": 0.4520663008175608, + "grad_norm": 1.1881425380706787, + "learning_rate": 4.0345e-05, + "loss": 0.4565, + "step": 8073 + }, + { + "epoch": 0.4521222981296898, + "grad_norm": 1.15525484085083, + "learning_rate": 4.0350000000000005e-05, + "loss": 0.3779, + "step": 8074 + }, + { + "epoch": 0.4521782954418188, + "grad_norm": 1.2618846893310547, + "learning_rate": 4.0355e-05, + "loss": 0.4439, + "step": 8075 + }, + { + "epoch": 0.45223429275394783, + "grad_norm": 1.3922920227050781, + "learning_rate": 4.0360000000000007e-05, + "loss": 0.4628, + "step": 8076 + }, + { + "epoch": 0.45229029006607685, + "grad_norm": 1.2423697710037231, + "learning_rate": 4.0365000000000004e-05, + "loss": 0.4043, + "step": 8077 + }, + { + "epoch": 0.45234628737820587, + "grad_norm": 1.279617190361023, + "learning_rate": 4.037e-05, + "loss": 0.5224, + "step": 8078 + }, + { + 
"epoch": 0.4524022846903349, + "grad_norm": 1.2444432973861694, + "learning_rate": 4.0375e-05, + "loss": 0.3951, + "step": 8079 + }, + { + "epoch": 0.4524582820024639, + "grad_norm": 1.1708821058273315, + "learning_rate": 4.038e-05, + "loss": 0.3757, + "step": 8080 + }, + { + "epoch": 0.4525142793145929, + "grad_norm": 1.2084872722625732, + "learning_rate": 4.0385e-05, + "loss": 0.3436, + "step": 8081 + }, + { + "epoch": 0.4525702766267219, + "grad_norm": 1.4237982034683228, + "learning_rate": 4.039e-05, + "loss": 0.4698, + "step": 8082 + }, + { + "epoch": 0.4526262739388509, + "grad_norm": 1.3786941766738892, + "learning_rate": 4.0395e-05, + "loss": 0.3887, + "step": 8083 + }, + { + "epoch": 0.45268227125097993, + "grad_norm": 1.1412067413330078, + "learning_rate": 4.0400000000000006e-05, + "loss": 0.4617, + "step": 8084 + }, + { + "epoch": 0.45273826856310895, + "grad_norm": 1.231732964515686, + "learning_rate": 4.0405000000000004e-05, + "loss": 0.3878, + "step": 8085 + }, + { + "epoch": 0.45279426587523797, + "grad_norm": 1.242849588394165, + "learning_rate": 4.041e-05, + "loss": 0.4939, + "step": 8086 + }, + { + "epoch": 0.452850263187367, + "grad_norm": 1.2185568809509277, + "learning_rate": 4.0415000000000005e-05, + "loss": 0.3916, + "step": 8087 + }, + { + "epoch": 0.452906260499496, + "grad_norm": 1.1475306749343872, + "learning_rate": 4.042e-05, + "loss": 0.4227, + "step": 8088 + }, + { + "epoch": 0.452962257811625, + "grad_norm": 1.4434412717819214, + "learning_rate": 4.0425e-05, + "loss": 0.489, + "step": 8089 + }, + { + "epoch": 0.45301825512375404, + "grad_norm": 1.2608963251113892, + "learning_rate": 4.0430000000000004e-05, + "loss": 0.4135, + "step": 8090 + }, + { + "epoch": 0.45307425243588306, + "grad_norm": 1.2011988162994385, + "learning_rate": 4.0435e-05, + "loss": 0.4414, + "step": 8091 + }, + { + "epoch": 0.4531302497480121, + "grad_norm": 1.0195295810699463, + "learning_rate": 4.044e-05, + "loss": 0.4124, + "step": 8092 + }, + { + "epoch": 
0.4531862470601411, + "grad_norm": 1.2460380792617798, + "learning_rate": 4.0444999999999996e-05, + "loss": 0.408, + "step": 8093 + }, + { + "epoch": 0.4532422443722701, + "grad_norm": 1.3952407836914062, + "learning_rate": 4.045000000000001e-05, + "loss": 0.5317, + "step": 8094 + }, + { + "epoch": 0.45329824168439914, + "grad_norm": 1.2081043720245361, + "learning_rate": 4.0455000000000005e-05, + "loss": 0.37, + "step": 8095 + }, + { + "epoch": 0.45335423899652816, + "grad_norm": 1.1570297479629517, + "learning_rate": 4.046e-05, + "loss": 0.365, + "step": 8096 + }, + { + "epoch": 0.4534102363086572, + "grad_norm": 1.3316103219985962, + "learning_rate": 4.0465e-05, + "loss": 0.4904, + "step": 8097 + }, + { + "epoch": 0.4534662336207862, + "grad_norm": 1.4848228693008423, + "learning_rate": 4.0470000000000004e-05, + "loss": 0.4557, + "step": 8098 + }, + { + "epoch": 0.4535222309329152, + "grad_norm": 1.5339934825897217, + "learning_rate": 4.0475e-05, + "loss": 0.4291, + "step": 8099 + }, + { + "epoch": 0.45357822824504423, + "grad_norm": 1.0706136226654053, + "learning_rate": 4.048e-05, + "loss": 0.3559, + "step": 8100 + }, + { + "epoch": 0.45363422555717325, + "grad_norm": 1.1460086107254028, + "learning_rate": 4.0485e-05, + "loss": 0.3852, + "step": 8101 + }, + { + "epoch": 0.4536902228693023, + "grad_norm": 1.1236015558242798, + "learning_rate": 4.049e-05, + "loss": 0.4457, + "step": 8102 + }, + { + "epoch": 0.4537462201814313, + "grad_norm": 1.4457634687423706, + "learning_rate": 4.0495e-05, + "loss": 0.5892, + "step": 8103 + }, + { + "epoch": 0.4538022174935603, + "grad_norm": 1.413439393043518, + "learning_rate": 4.05e-05, + "loss": 0.555, + "step": 8104 + }, + { + "epoch": 0.45385821480568933, + "grad_norm": 1.2305419445037842, + "learning_rate": 4.0505000000000006e-05, + "loss": 0.3831, + "step": 8105 + }, + { + "epoch": 0.45391421211781835, + "grad_norm": 1.3862773180007935, + "learning_rate": 4.0510000000000003e-05, + "loss": 0.4541, + "step": 8106 + }, + 
{ + "epoch": 0.45397020942994737, + "grad_norm": 1.3567650318145752, + "learning_rate": 4.0515e-05, + "loss": 0.5618, + "step": 8107 + }, + { + "epoch": 0.4540262067420764, + "grad_norm": 1.232162356376648, + "learning_rate": 4.0520000000000005e-05, + "loss": 0.3998, + "step": 8108 + }, + { + "epoch": 0.4540822040542054, + "grad_norm": 1.2350515127182007, + "learning_rate": 4.0525e-05, + "loss": 0.4445, + "step": 8109 + }, + { + "epoch": 0.4541382013663344, + "grad_norm": 1.6478445529937744, + "learning_rate": 4.053e-05, + "loss": 0.4283, + "step": 8110 + }, + { + "epoch": 0.45419419867846345, + "grad_norm": 1.372441053390503, + "learning_rate": 4.0535000000000004e-05, + "loss": 0.3788, + "step": 8111 + }, + { + "epoch": 0.45425019599059246, + "grad_norm": 1.1326618194580078, + "learning_rate": 4.054e-05, + "loss": 0.4098, + "step": 8112 + }, + { + "epoch": 0.4543061933027215, + "grad_norm": 1.2759462594985962, + "learning_rate": 4.0545e-05, + "loss": 0.4116, + "step": 8113 + }, + { + "epoch": 0.4543621906148505, + "grad_norm": 1.4068135023117065, + "learning_rate": 4.055e-05, + "loss": 0.3877, + "step": 8114 + }, + { + "epoch": 0.4544181879269795, + "grad_norm": 1.5296299457550049, + "learning_rate": 4.055500000000001e-05, + "loss": 0.5493, + "step": 8115 + }, + { + "epoch": 0.45447418523910854, + "grad_norm": 1.0567477941513062, + "learning_rate": 4.0560000000000005e-05, + "loss": 0.3859, + "step": 8116 + }, + { + "epoch": 0.45453018255123756, + "grad_norm": 1.291696310043335, + "learning_rate": 4.0565e-05, + "loss": 0.4156, + "step": 8117 + }, + { + "epoch": 0.4545861798633666, + "grad_norm": 1.5526171922683716, + "learning_rate": 4.057e-05, + "loss": 0.5647, + "step": 8118 + }, + { + "epoch": 0.4546421771754956, + "grad_norm": 1.2019410133361816, + "learning_rate": 4.0575000000000004e-05, + "loss": 0.3844, + "step": 8119 + }, + { + "epoch": 0.4546981744876246, + "grad_norm": 1.3499386310577393, + "learning_rate": 4.058e-05, + "loss": 0.5272, + "step": 8120 + }, 
+ { + "epoch": 0.45475417179975364, + "grad_norm": 1.597776174545288, + "learning_rate": 4.0585e-05, + "loss": 0.5078, + "step": 8121 + }, + { + "epoch": 0.45481016911188266, + "grad_norm": 1.053828477859497, + "learning_rate": 4.059e-05, + "loss": 0.282, + "step": 8122 + }, + { + "epoch": 0.4548661664240116, + "grad_norm": 1.2141889333724976, + "learning_rate": 4.0595e-05, + "loss": 0.3871, + "step": 8123 + }, + { + "epoch": 0.45492216373614064, + "grad_norm": 1.2278751134872437, + "learning_rate": 4.0600000000000004e-05, + "loss": 0.371, + "step": 8124 + }, + { + "epoch": 0.45497816104826966, + "grad_norm": 1.1336435079574585, + "learning_rate": 4.0605e-05, + "loss": 0.3757, + "step": 8125 + }, + { + "epoch": 0.4550341583603987, + "grad_norm": 1.213625431060791, + "learning_rate": 4.0610000000000006e-05, + "loss": 0.5468, + "step": 8126 + }, + { + "epoch": 0.4550901556725277, + "grad_norm": 1.0659953355789185, + "learning_rate": 4.0615e-05, + "loss": 0.3782, + "step": 8127 + }, + { + "epoch": 0.4551461529846567, + "grad_norm": 1.1252938508987427, + "learning_rate": 4.062e-05, + "loss": 0.3667, + "step": 8128 + }, + { + "epoch": 0.45520215029678573, + "grad_norm": 1.2927285432815552, + "learning_rate": 4.0625000000000005e-05, + "loss": 0.4203, + "step": 8129 + }, + { + "epoch": 0.45525814760891475, + "grad_norm": 1.2861311435699463, + "learning_rate": 4.063e-05, + "loss": 0.4138, + "step": 8130 + }, + { + "epoch": 0.4553141449210438, + "grad_norm": 1.390527606010437, + "learning_rate": 4.0635e-05, + "loss": 0.4645, + "step": 8131 + }, + { + "epoch": 0.4553701422331728, + "grad_norm": 1.4087966680526733, + "learning_rate": 4.064e-05, + "loss": 0.4163, + "step": 8132 + }, + { + "epoch": 0.4554261395453018, + "grad_norm": 1.4138132333755493, + "learning_rate": 4.0645e-05, + "loss": 0.4528, + "step": 8133 + }, + { + "epoch": 0.45548213685743083, + "grad_norm": 1.3514633178710938, + "learning_rate": 4.065e-05, + "loss": 0.3211, + "step": 8134 + }, + { + "epoch": 
0.45553813416955985, + "grad_norm": 1.0520461797714233, + "learning_rate": 4.0655e-05, + "loss": 0.4123, + "step": 8135 + }, + { + "epoch": 0.45559413148168887, + "grad_norm": 1.2497740983963013, + "learning_rate": 4.066e-05, + "loss": 0.39, + "step": 8136 + }, + { + "epoch": 0.4556501287938179, + "grad_norm": 1.0242879390716553, + "learning_rate": 4.0665000000000005e-05, + "loss": 0.3915, + "step": 8137 + }, + { + "epoch": 0.4557061261059469, + "grad_norm": 1.367636799812317, + "learning_rate": 4.067e-05, + "loss": 0.362, + "step": 8138 + }, + { + "epoch": 0.4557621234180759, + "grad_norm": 1.2019015550613403, + "learning_rate": 4.0675e-05, + "loss": 0.5287, + "step": 8139 + }, + { + "epoch": 0.45581812073020495, + "grad_norm": 1.3564034700393677, + "learning_rate": 4.0680000000000004e-05, + "loss": 0.5151, + "step": 8140 + }, + { + "epoch": 0.45587411804233396, + "grad_norm": 1.424517035484314, + "learning_rate": 4.0685e-05, + "loss": 0.5017, + "step": 8141 + }, + { + "epoch": 0.455930115354463, + "grad_norm": 1.372143030166626, + "learning_rate": 4.069e-05, + "loss": 0.3535, + "step": 8142 + }, + { + "epoch": 0.455986112666592, + "grad_norm": 1.584250569343567, + "learning_rate": 4.0695e-05, + "loss": 0.5072, + "step": 8143 + }, + { + "epoch": 0.456042109978721, + "grad_norm": 1.085153579711914, + "learning_rate": 4.07e-05, + "loss": 0.4053, + "step": 8144 + }, + { + "epoch": 0.45609810729085004, + "grad_norm": 1.2137624025344849, + "learning_rate": 4.0705000000000004e-05, + "loss": 0.4806, + "step": 8145 + }, + { + "epoch": 0.45615410460297906, + "grad_norm": 1.4045476913452148, + "learning_rate": 4.071e-05, + "loss": 0.6196, + "step": 8146 + }, + { + "epoch": 0.4562101019151081, + "grad_norm": 1.3317464590072632, + "learning_rate": 4.0715000000000006e-05, + "loss": 0.4997, + "step": 8147 + }, + { + "epoch": 0.4562660992272371, + "grad_norm": 1.1357710361480713, + "learning_rate": 4.072e-05, + "loss": 0.3664, + "step": 8148 + }, + { + "epoch": 
0.4563220965393661, + "grad_norm": 1.2137690782546997, + "learning_rate": 4.0725e-05, + "loss": 0.3473, + "step": 8149 + }, + { + "epoch": 0.45637809385149514, + "grad_norm": 1.4994457960128784, + "learning_rate": 4.0730000000000005e-05, + "loss": 0.5448, + "step": 8150 + }, + { + "epoch": 0.45643409116362416, + "grad_norm": 1.363540768623352, + "learning_rate": 4.0735e-05, + "loss": 0.4171, + "step": 8151 + }, + { + "epoch": 0.4564900884757532, + "grad_norm": 1.1198484897613525, + "learning_rate": 4.074e-05, + "loss": 0.3726, + "step": 8152 + }, + { + "epoch": 0.4565460857878822, + "grad_norm": 1.3675659894943237, + "learning_rate": 4.0745e-05, + "loss": 0.5798, + "step": 8153 + }, + { + "epoch": 0.4566020831000112, + "grad_norm": 1.2910454273223877, + "learning_rate": 4.075e-05, + "loss": 0.3636, + "step": 8154 + }, + { + "epoch": 0.45665808041214023, + "grad_norm": 1.163226842880249, + "learning_rate": 4.0755000000000005e-05, + "loss": 0.3997, + "step": 8155 + }, + { + "epoch": 0.45671407772426925, + "grad_norm": 1.3678209781646729, + "learning_rate": 4.076e-05, + "loss": 0.5217, + "step": 8156 + }, + { + "epoch": 0.45677007503639827, + "grad_norm": 1.0722938776016235, + "learning_rate": 4.0765e-05, + "loss": 0.3597, + "step": 8157 + }, + { + "epoch": 0.4568260723485273, + "grad_norm": 1.2889726161956787, + "learning_rate": 4.0770000000000004e-05, + "loss": 0.5934, + "step": 8158 + }, + { + "epoch": 0.4568820696606563, + "grad_norm": 1.3214764595031738, + "learning_rate": 4.0775e-05, + "loss": 0.432, + "step": 8159 + }, + { + "epoch": 0.45693806697278533, + "grad_norm": 1.1166839599609375, + "learning_rate": 4.078e-05, + "loss": 0.3357, + "step": 8160 + }, + { + "epoch": 0.45699406428491435, + "grad_norm": 1.1353657245635986, + "learning_rate": 4.0785e-05, + "loss": 0.4701, + "step": 8161 + }, + { + "epoch": 0.45705006159704337, + "grad_norm": 1.2369425296783447, + "learning_rate": 4.079e-05, + "loss": 0.3867, + "step": 8162 + }, + { + "epoch": 
0.4571060589091724, + "grad_norm": 1.2651628255844116, + "learning_rate": 4.0795e-05, + "loss": 0.4669, + "step": 8163 + }, + { + "epoch": 0.4571620562213014, + "grad_norm": 1.2256214618682861, + "learning_rate": 4.08e-05, + "loss": 0.3618, + "step": 8164 + }, + { + "epoch": 0.45721805353343037, + "grad_norm": 3.2736332416534424, + "learning_rate": 4.0805000000000007e-05, + "loss": 0.4127, + "step": 8165 + }, + { + "epoch": 0.4572740508455594, + "grad_norm": 1.2997982501983643, + "learning_rate": 4.0810000000000004e-05, + "loss": 0.4168, + "step": 8166 + }, + { + "epoch": 0.4573300481576884, + "grad_norm": 1.2261103391647339, + "learning_rate": 4.0815e-05, + "loss": 0.6351, + "step": 8167 + }, + { + "epoch": 0.4573860454698174, + "grad_norm": 1.2697564363479614, + "learning_rate": 4.0820000000000006e-05, + "loss": 0.4485, + "step": 8168 + }, + { + "epoch": 0.45744204278194645, + "grad_norm": 1.2801098823547363, + "learning_rate": 4.0825e-05, + "loss": 0.4397, + "step": 8169 + }, + { + "epoch": 0.45749804009407546, + "grad_norm": 1.3548836708068848, + "learning_rate": 4.083e-05, + "loss": 0.4038, + "step": 8170 + }, + { + "epoch": 0.4575540374062045, + "grad_norm": 1.4707496166229248, + "learning_rate": 4.0835e-05, + "loss": 0.4418, + "step": 8171 + }, + { + "epoch": 0.4576100347183335, + "grad_norm": 1.1236820220947266, + "learning_rate": 4.084e-05, + "loss": 0.3652, + "step": 8172 + }, + { + "epoch": 0.4576660320304625, + "grad_norm": 1.1483045816421509, + "learning_rate": 4.0845e-05, + "loss": 0.3515, + "step": 8173 + }, + { + "epoch": 0.45772202934259154, + "grad_norm": 1.2674108743667603, + "learning_rate": 4.085e-05, + "loss": 0.5687, + "step": 8174 + }, + { + "epoch": 0.45777802665472056, + "grad_norm": 1.2336115837097168, + "learning_rate": 4.0855e-05, + "loss": 0.4017, + "step": 8175 + }, + { + "epoch": 0.4578340239668496, + "grad_norm": 1.3384089469909668, + "learning_rate": 4.0860000000000005e-05, + "loss": 0.4603, + "step": 8176 + }, + { + "epoch": 
0.4578900212789786, + "grad_norm": 1.5389137268066406, + "learning_rate": 4.0865e-05, + "loss": 0.468, + "step": 8177 + }, + { + "epoch": 0.4579460185911076, + "grad_norm": 1.3750388622283936, + "learning_rate": 4.087e-05, + "loss": 0.4232, + "step": 8178 + }, + { + "epoch": 0.45800201590323664, + "grad_norm": 1.1116269826889038, + "learning_rate": 4.0875000000000004e-05, + "loss": 0.3124, + "step": 8179 + }, + { + "epoch": 0.45805801321536566, + "grad_norm": 1.4280734062194824, + "learning_rate": 4.088e-05, + "loss": 0.4786, + "step": 8180 + }, + { + "epoch": 0.4581140105274947, + "grad_norm": 1.4413557052612305, + "learning_rate": 4.0885e-05, + "loss": 0.4382, + "step": 8181 + }, + { + "epoch": 0.4581700078396237, + "grad_norm": 7.288043022155762, + "learning_rate": 4.089e-05, + "loss": 0.5532, + "step": 8182 + }, + { + "epoch": 0.4582260051517527, + "grad_norm": 1.6042112112045288, + "learning_rate": 4.0895e-05, + "loss": 0.6748, + "step": 8183 + }, + { + "epoch": 0.45828200246388173, + "grad_norm": 1.3809505701065063, + "learning_rate": 4.09e-05, + "loss": 0.3865, + "step": 8184 + }, + { + "epoch": 0.45833799977601075, + "grad_norm": 1.2860163450241089, + "learning_rate": 4.0905e-05, + "loss": 0.3698, + "step": 8185 + }, + { + "epoch": 0.45839399708813977, + "grad_norm": 1.2628626823425293, + "learning_rate": 4.0910000000000006e-05, + "loss": 0.6904, + "step": 8186 + }, + { + "epoch": 0.4584499944002688, + "grad_norm": 1.2822555303573608, + "learning_rate": 4.0915000000000004e-05, + "loss": 0.3674, + "step": 8187 + }, + { + "epoch": 0.4585059917123978, + "grad_norm": 1.1814857721328735, + "learning_rate": 4.092e-05, + "loss": 0.4019, + "step": 8188 + }, + { + "epoch": 0.45856198902452683, + "grad_norm": 1.335870623588562, + "learning_rate": 4.0925000000000005e-05, + "loss": 0.4589, + "step": 8189 + }, + { + "epoch": 0.45861798633665585, + "grad_norm": 1.0942025184631348, + "learning_rate": 4.093e-05, + "loss": 0.4158, + "step": 8190 + }, + { + "epoch": 
0.45867398364878487, + "grad_norm": 1.0841032266616821, + "learning_rate": 4.0935e-05, + "loss": 0.2678, + "step": 8191 + }, + { + "epoch": 0.4587299809609139, + "grad_norm": 1.435125708580017, + "learning_rate": 4.094e-05, + "loss": 0.4795, + "step": 8192 + }, + { + "epoch": 0.4587859782730429, + "grad_norm": 1.2814698219299316, + "learning_rate": 4.0945e-05, + "loss": 0.2863, + "step": 8193 + }, + { + "epoch": 0.4588419755851719, + "grad_norm": 1.0510350465774536, + "learning_rate": 4.095e-05, + "loss": 0.4812, + "step": 8194 + }, + { + "epoch": 0.45889797289730094, + "grad_norm": 1.5333740711212158, + "learning_rate": 4.0955000000000003e-05, + "loss": 0.4996, + "step": 8195 + }, + { + "epoch": 0.45895397020942996, + "grad_norm": 1.2973487377166748, + "learning_rate": 4.096e-05, + "loss": 0.3833, + "step": 8196 + }, + { + "epoch": 0.459009967521559, + "grad_norm": 1.12303626537323, + "learning_rate": 4.0965000000000005e-05, + "loss": 0.3749, + "step": 8197 + }, + { + "epoch": 0.459065964833688, + "grad_norm": 1.536573052406311, + "learning_rate": 4.097e-05, + "loss": 0.3857, + "step": 8198 + }, + { + "epoch": 0.459121962145817, + "grad_norm": 1.2383390665054321, + "learning_rate": 4.0975e-05, + "loss": 0.5466, + "step": 8199 + }, + { + "epoch": 0.45917795945794604, + "grad_norm": 1.5300266742706299, + "learning_rate": 4.0980000000000004e-05, + "loss": 0.4991, + "step": 8200 + }, + { + "epoch": 0.45923395677007506, + "grad_norm": 1.4450125694274902, + "learning_rate": 4.0985e-05, + "loss": 0.493, + "step": 8201 + }, + { + "epoch": 0.4592899540822041, + "grad_norm": 1.259688377380371, + "learning_rate": 4.099e-05, + "loss": 0.3361, + "step": 8202 + }, + { + "epoch": 0.4593459513943331, + "grad_norm": 1.6072922945022583, + "learning_rate": 4.0995e-05, + "loss": 0.368, + "step": 8203 + }, + { + "epoch": 0.4594019487064621, + "grad_norm": 1.303582787513733, + "learning_rate": 4.1e-05, + "loss": 0.4323, + "step": 8204 + }, + { + "epoch": 0.45945794601859113, + 
"grad_norm": 1.08994460105896, + "learning_rate": 4.1005000000000005e-05, + "loss": 0.3298, + "step": 8205 + }, + { + "epoch": 0.4595139433307201, + "grad_norm": 1.1609015464782715, + "learning_rate": 4.101e-05, + "loss": 0.3274, + "step": 8206 + }, + { + "epoch": 0.4595699406428491, + "grad_norm": 1.1675680875778198, + "learning_rate": 4.1015000000000006e-05, + "loss": 0.3734, + "step": 8207 + }, + { + "epoch": 0.45962593795497814, + "grad_norm": 1.1078368425369263, + "learning_rate": 4.1020000000000004e-05, + "loss": 0.2963, + "step": 8208 + }, + { + "epoch": 0.45968193526710716, + "grad_norm": 1.1644442081451416, + "learning_rate": 4.1025e-05, + "loss": 0.4045, + "step": 8209 + }, + { + "epoch": 0.4597379325792362, + "grad_norm": 1.2316521406173706, + "learning_rate": 4.103e-05, + "loss": 0.3405, + "step": 8210 + }, + { + "epoch": 0.4597939298913652, + "grad_norm": 1.2362315654754639, + "learning_rate": 4.1035e-05, + "loss": 0.4789, + "step": 8211 + }, + { + "epoch": 0.4598499272034942, + "grad_norm": 1.1099541187286377, + "learning_rate": 4.104e-05, + "loss": 0.3682, + "step": 8212 + }, + { + "epoch": 0.45990592451562323, + "grad_norm": 1.1598701477050781, + "learning_rate": 4.1045e-05, + "loss": 0.4685, + "step": 8213 + }, + { + "epoch": 0.45996192182775225, + "grad_norm": 1.2803281545639038, + "learning_rate": 4.105e-05, + "loss": 0.4272, + "step": 8214 + }, + { + "epoch": 0.46001791913988127, + "grad_norm": 1.3603127002716064, + "learning_rate": 4.1055000000000006e-05, + "loss": 0.4071, + "step": 8215 + }, + { + "epoch": 0.4600739164520103, + "grad_norm": 1.4698951244354248, + "learning_rate": 4.106e-05, + "loss": 0.5065, + "step": 8216 + }, + { + "epoch": 0.4601299137641393, + "grad_norm": 1.216223120689392, + "learning_rate": 4.1065e-05, + "loss": 0.42, + "step": 8217 + }, + { + "epoch": 0.46018591107626833, + "grad_norm": 1.1401433944702148, + "learning_rate": 4.1070000000000005e-05, + "loss": 0.3659, + "step": 8218 + }, + { + "epoch": 
0.46024190838839735, + "grad_norm": 1.258061408996582, + "learning_rate": 4.1075e-05, + "loss": 0.3915, + "step": 8219 + }, + { + "epoch": 0.46029790570052637, + "grad_norm": 1.2542272806167603, + "learning_rate": 4.108e-05, + "loss": 0.3817, + "step": 8220 + }, + { + "epoch": 0.4603539030126554, + "grad_norm": 1.1371816396713257, + "learning_rate": 4.1085000000000004e-05, + "loss": 0.3568, + "step": 8221 + }, + { + "epoch": 0.4604099003247844, + "grad_norm": 1.3532615900039673, + "learning_rate": 4.109e-05, + "loss": 0.5167, + "step": 8222 + }, + { + "epoch": 0.4604658976369134, + "grad_norm": 1.1306337118148804, + "learning_rate": 4.1095e-05, + "loss": 0.3148, + "step": 8223 + }, + { + "epoch": 0.46052189494904244, + "grad_norm": 1.223226547241211, + "learning_rate": 4.11e-05, + "loss": 0.4107, + "step": 8224 + }, + { + "epoch": 0.46057789226117146, + "grad_norm": 1.410191297531128, + "learning_rate": 4.110500000000001e-05, + "loss": 0.398, + "step": 8225 + }, + { + "epoch": 0.4606338895733005, + "grad_norm": 1.2490471601486206, + "learning_rate": 4.1110000000000005e-05, + "loss": 0.4474, + "step": 8226 + }, + { + "epoch": 0.4606898868854295, + "grad_norm": 1.339874267578125, + "learning_rate": 4.1115e-05, + "loss": 0.4839, + "step": 8227 + }, + { + "epoch": 0.4607458841975585, + "grad_norm": 1.209472417831421, + "learning_rate": 4.1120000000000006e-05, + "loss": 0.3883, + "step": 8228 + }, + { + "epoch": 0.46080188150968754, + "grad_norm": 1.391992211341858, + "learning_rate": 4.1125000000000004e-05, + "loss": 0.4707, + "step": 8229 + }, + { + "epoch": 0.46085787882181656, + "grad_norm": 1.5319911241531372, + "learning_rate": 4.113e-05, + "loss": 0.529, + "step": 8230 + }, + { + "epoch": 0.4609138761339456, + "grad_norm": 1.330243468284607, + "learning_rate": 4.1135e-05, + "loss": 0.6138, + "step": 8231 + }, + { + "epoch": 0.4609698734460746, + "grad_norm": 1.2333359718322754, + "learning_rate": 4.114e-05, + "loss": 0.4012, + "step": 8232 + }, + { + "epoch": 
0.4610258707582036, + "grad_norm": 1.2454456090927124, + "learning_rate": 4.1145e-05, + "loss": 0.3726, + "step": 8233 + }, + { + "epoch": 0.46108186807033263, + "grad_norm": 1.2721551656723022, + "learning_rate": 4.115e-05, + "loss": 0.485, + "step": 8234 + }, + { + "epoch": 0.46113786538246165, + "grad_norm": 1.3370745182037354, + "learning_rate": 4.1155e-05, + "loss": 0.4843, + "step": 8235 + }, + { + "epoch": 0.4611938626945907, + "grad_norm": 1.1030439138412476, + "learning_rate": 4.1160000000000006e-05, + "loss": 0.4659, + "step": 8236 + }, + { + "epoch": 0.4612498600067197, + "grad_norm": 1.1367319822311401, + "learning_rate": 4.1165e-05, + "loss": 0.3827, + "step": 8237 + }, + { + "epoch": 0.4613058573188487, + "grad_norm": 1.2777172327041626, + "learning_rate": 4.117e-05, + "loss": 0.4422, + "step": 8238 + }, + { + "epoch": 0.46136185463097773, + "grad_norm": 1.0879982709884644, + "learning_rate": 4.1175000000000005e-05, + "loss": 0.4092, + "step": 8239 + }, + { + "epoch": 0.46141785194310675, + "grad_norm": 1.1981239318847656, + "learning_rate": 4.118e-05, + "loss": 0.3226, + "step": 8240 + }, + { + "epoch": 0.46147384925523577, + "grad_norm": 1.2820683717727661, + "learning_rate": 4.1185e-05, + "loss": 0.3804, + "step": 8241 + }, + { + "epoch": 0.4615298465673648, + "grad_norm": 1.2135425806045532, + "learning_rate": 4.1190000000000004e-05, + "loss": 0.4471, + "step": 8242 + }, + { + "epoch": 0.4615858438794938, + "grad_norm": 1.178632378578186, + "learning_rate": 4.1195e-05, + "loss": 0.3996, + "step": 8243 + }, + { + "epoch": 0.4616418411916228, + "grad_norm": 1.5495091676712036, + "learning_rate": 4.12e-05, + "loss": 0.4909, + "step": 8244 + }, + { + "epoch": 0.46169783850375185, + "grad_norm": 1.735106348991394, + "learning_rate": 4.1205e-05, + "loss": 0.559, + "step": 8245 + }, + { + "epoch": 0.46175383581588086, + "grad_norm": 1.3498954772949219, + "learning_rate": 4.121000000000001e-05, + "loss": 0.456, + "step": 8246 + }, + { + "epoch": 
0.46180983312800983, + "grad_norm": 1.32374906539917, + "learning_rate": 4.1215000000000004e-05, + "loss": 0.4899, + "step": 8247 + }, + { + "epoch": 0.46186583044013885, + "grad_norm": 1.8373581171035767, + "learning_rate": 4.122e-05, + "loss": 0.5818, + "step": 8248 + }, + { + "epoch": 0.46192182775226787, + "grad_norm": 1.378580093383789, + "learning_rate": 4.1225e-05, + "loss": 0.4885, + "step": 8249 + }, + { + "epoch": 0.4619778250643969, + "grad_norm": 1.1680793762207031, + "learning_rate": 4.123e-05, + "loss": 0.3525, + "step": 8250 + }, + { + "epoch": 0.4620338223765259, + "grad_norm": 1.231508731842041, + "learning_rate": 4.1235e-05, + "loss": 0.4798, + "step": 8251 + }, + { + "epoch": 0.4620898196886549, + "grad_norm": 1.0578612089157104, + "learning_rate": 4.124e-05, + "loss": 0.4774, + "step": 8252 + }, + { + "epoch": 0.46214581700078394, + "grad_norm": 1.1454347372055054, + "learning_rate": 4.1245e-05, + "loss": 0.5126, + "step": 8253 + }, + { + "epoch": 0.46220181431291296, + "grad_norm": 1.1849184036254883, + "learning_rate": 4.125e-05, + "loss": 0.4887, + "step": 8254 + }, + { + "epoch": 0.462257811625042, + "grad_norm": 1.3541444540023804, + "learning_rate": 4.1255e-05, + "loss": 0.444, + "step": 8255 + }, + { + "epoch": 0.462313808937171, + "grad_norm": 1.303864598274231, + "learning_rate": 4.126e-05, + "loss": 0.4418, + "step": 8256 + }, + { + "epoch": 0.4623698062493, + "grad_norm": 1.2061721086502075, + "learning_rate": 4.1265000000000006e-05, + "loss": 0.3507, + "step": 8257 + }, + { + "epoch": 0.46242580356142904, + "grad_norm": 1.4810192584991455, + "learning_rate": 4.127e-05, + "loss": 0.4216, + "step": 8258 + }, + { + "epoch": 0.46248180087355806, + "grad_norm": 1.2376545667648315, + "learning_rate": 4.1275e-05, + "loss": 0.3474, + "step": 8259 + }, + { + "epoch": 0.4625377981856871, + "grad_norm": 1.3552175760269165, + "learning_rate": 4.1280000000000005e-05, + "loss": 0.5276, + "step": 8260 + }, + { + "epoch": 0.4625937954978161, + 
"grad_norm": 1.53526771068573, + "learning_rate": 4.1285e-05, + "loss": 0.5101, + "step": 8261 + }, + { + "epoch": 0.4626497928099451, + "grad_norm": 1.2176669836044312, + "learning_rate": 4.129e-05, + "loss": 0.3766, + "step": 8262 + }, + { + "epoch": 0.46270579012207413, + "grad_norm": 1.3207225799560547, + "learning_rate": 4.1295000000000004e-05, + "loss": 0.3536, + "step": 8263 + }, + { + "epoch": 0.46276178743420315, + "grad_norm": 1.5815467834472656, + "learning_rate": 4.13e-05, + "loss": 0.3933, + "step": 8264 + }, + { + "epoch": 0.4628177847463322, + "grad_norm": 1.1484071016311646, + "learning_rate": 4.1305e-05, + "loss": 0.3941, + "step": 8265 + }, + { + "epoch": 0.4628737820584612, + "grad_norm": 1.2536993026733398, + "learning_rate": 4.131e-05, + "loss": 0.3645, + "step": 8266 + }, + { + "epoch": 0.4629297793705902, + "grad_norm": 1.1876859664916992, + "learning_rate": 4.131500000000001e-05, + "loss": 0.5404, + "step": 8267 + }, + { + "epoch": 0.46298577668271923, + "grad_norm": 1.2260714769363403, + "learning_rate": 4.1320000000000004e-05, + "loss": 0.5652, + "step": 8268 + }, + { + "epoch": 0.46304177399484825, + "grad_norm": 1.2548279762268066, + "learning_rate": 4.1325e-05, + "loss": 0.4476, + "step": 8269 + }, + { + "epoch": 0.46309777130697727, + "grad_norm": 1.3838043212890625, + "learning_rate": 4.133e-05, + "loss": 0.3975, + "step": 8270 + }, + { + "epoch": 0.4631537686191063, + "grad_norm": 1.2394155263900757, + "learning_rate": 4.1335e-05, + "loss": 0.4295, + "step": 8271 + }, + { + "epoch": 0.4632097659312353, + "grad_norm": 1.3314915895462036, + "learning_rate": 4.134e-05, + "loss": 0.5559, + "step": 8272 + }, + { + "epoch": 0.4632657632433643, + "grad_norm": 1.9199459552764893, + "learning_rate": 4.1345e-05, + "loss": 0.5407, + "step": 8273 + }, + { + "epoch": 0.46332176055549334, + "grad_norm": 1.1639423370361328, + "learning_rate": 4.135e-05, + "loss": 0.4417, + "step": 8274 + }, + { + "epoch": 0.46337775786762236, + "grad_norm": 
1.4173465967178345, + "learning_rate": 4.1355e-05, + "loss": 0.3688, + "step": 8275 + }, + { + "epoch": 0.4634337551797514, + "grad_norm": 1.2867845296859741, + "learning_rate": 4.1360000000000004e-05, + "loss": 0.387, + "step": 8276 + }, + { + "epoch": 0.4634897524918804, + "grad_norm": 1.2569302320480347, + "learning_rate": 4.1365e-05, + "loss": 0.4712, + "step": 8277 + }, + { + "epoch": 0.4635457498040094, + "grad_norm": 1.2919987440109253, + "learning_rate": 4.1370000000000005e-05, + "loss": 0.5301, + "step": 8278 + }, + { + "epoch": 0.46360174711613844, + "grad_norm": 1.2371081113815308, + "learning_rate": 4.1375e-05, + "loss": 0.4337, + "step": 8279 + }, + { + "epoch": 0.46365774442826746, + "grad_norm": 1.2740482091903687, + "learning_rate": 4.138e-05, + "loss": 0.5003, + "step": 8280 + }, + { + "epoch": 0.4637137417403965, + "grad_norm": 1.199916124343872, + "learning_rate": 4.1385000000000004e-05, + "loss": 0.4129, + "step": 8281 + }, + { + "epoch": 0.4637697390525255, + "grad_norm": 1.294663667678833, + "learning_rate": 4.139e-05, + "loss": 0.4201, + "step": 8282 + }, + { + "epoch": 0.4638257363646545, + "grad_norm": 1.6683567762374878, + "learning_rate": 4.1395e-05, + "loss": 0.4427, + "step": 8283 + }, + { + "epoch": 0.46388173367678354, + "grad_norm": 1.3331955671310425, + "learning_rate": 4.14e-05, + "loss": 0.3179, + "step": 8284 + }, + { + "epoch": 0.46393773098891256, + "grad_norm": 1.1765532493591309, + "learning_rate": 4.1405e-05, + "loss": 0.2859, + "step": 8285 + }, + { + "epoch": 0.4639937283010416, + "grad_norm": 1.1770352125167847, + "learning_rate": 4.1410000000000005e-05, + "loss": 0.4553, + "step": 8286 + }, + { + "epoch": 0.4640497256131706, + "grad_norm": 1.4300110340118408, + "learning_rate": 4.1415e-05, + "loss": 0.3925, + "step": 8287 + }, + { + "epoch": 0.4641057229252996, + "grad_norm": 1.2035119533538818, + "learning_rate": 4.142000000000001e-05, + "loss": 0.4124, + "step": 8288 + }, + { + "epoch": 0.4641617202374286, + 
"grad_norm": 1.1478502750396729, + "learning_rate": 4.1425000000000004e-05, + "loss": 0.483, + "step": 8289 + }, + { + "epoch": 0.4642177175495576, + "grad_norm": 1.1492865085601807, + "learning_rate": 4.143e-05, + "loss": 0.3764, + "step": 8290 + }, + { + "epoch": 0.4642737148616866, + "grad_norm": 1.1740813255310059, + "learning_rate": 4.1435e-05, + "loss": 0.4237, + "step": 8291 + }, + { + "epoch": 0.46432971217381563, + "grad_norm": 1.120821237564087, + "learning_rate": 4.144e-05, + "loss": 0.3322, + "step": 8292 + }, + { + "epoch": 0.46438570948594465, + "grad_norm": 1.0857949256896973, + "learning_rate": 4.1445e-05, + "loss": 0.3412, + "step": 8293 + }, + { + "epoch": 0.4644417067980737, + "grad_norm": 1.2305532693862915, + "learning_rate": 4.145e-05, + "loss": 0.4732, + "step": 8294 + }, + { + "epoch": 0.4644977041102027, + "grad_norm": 3.610482692718506, + "learning_rate": 4.1455e-05, + "loss": 0.5287, + "step": 8295 + }, + { + "epoch": 0.4645537014223317, + "grad_norm": 1.2439639568328857, + "learning_rate": 4.1460000000000006e-05, + "loss": 0.4171, + "step": 8296 + }, + { + "epoch": 0.46460969873446073, + "grad_norm": 1.2058327198028564, + "learning_rate": 4.1465000000000004e-05, + "loss": 0.4941, + "step": 8297 + }, + { + "epoch": 0.46466569604658975, + "grad_norm": 1.315577507019043, + "learning_rate": 4.147e-05, + "loss": 0.3526, + "step": 8298 + }, + { + "epoch": 0.46472169335871877, + "grad_norm": 1.451165795326233, + "learning_rate": 4.1475000000000005e-05, + "loss": 0.4023, + "step": 8299 + }, + { + "epoch": 0.4647776906708478, + "grad_norm": 1.1746854782104492, + "learning_rate": 4.148e-05, + "loss": 0.3519, + "step": 8300 + }, + { + "epoch": 0.4648336879829768, + "grad_norm": 1.617592692375183, + "learning_rate": 4.1485e-05, + "loss": 0.429, + "step": 8301 + }, + { + "epoch": 0.4648896852951058, + "grad_norm": 1.423981785774231, + "learning_rate": 4.1490000000000004e-05, + "loss": 0.5623, + "step": 8302 + }, + { + "epoch": 0.46494568260723484, + 
"grad_norm": 1.2950692176818848, + "learning_rate": 4.1495e-05, + "loss": 0.4532, + "step": 8303 + }, + { + "epoch": 0.46500167991936386, + "grad_norm": 1.4684441089630127, + "learning_rate": 4.15e-05, + "loss": 0.5641, + "step": 8304 + }, + { + "epoch": 0.4650576772314929, + "grad_norm": 1.3021410703659058, + "learning_rate": 4.1504999999999996e-05, + "loss": 0.4803, + "step": 8305 + }, + { + "epoch": 0.4651136745436219, + "grad_norm": 1.3266184329986572, + "learning_rate": 4.151000000000001e-05, + "loss": 0.5234, + "step": 8306 + }, + { + "epoch": 0.4651696718557509, + "grad_norm": 1.1590931415557861, + "learning_rate": 4.1515000000000005e-05, + "loss": 0.3411, + "step": 8307 + }, + { + "epoch": 0.46522566916787994, + "grad_norm": 1.4431616067886353, + "learning_rate": 4.152e-05, + "loss": 0.4403, + "step": 8308 + }, + { + "epoch": 0.46528166648000896, + "grad_norm": 1.0642368793487549, + "learning_rate": 4.1525e-05, + "loss": 0.3634, + "step": 8309 + }, + { + "epoch": 0.465337663792138, + "grad_norm": 1.254712462425232, + "learning_rate": 4.1530000000000004e-05, + "loss": 0.4495, + "step": 8310 + }, + { + "epoch": 0.465393661104267, + "grad_norm": 1.1482433080673218, + "learning_rate": 4.1535e-05, + "loss": 0.3812, + "step": 8311 + }, + { + "epoch": 0.465449658416396, + "grad_norm": 1.107596755027771, + "learning_rate": 4.154e-05, + "loss": 0.4688, + "step": 8312 + }, + { + "epoch": 0.46550565572852504, + "grad_norm": 1.2421313524246216, + "learning_rate": 4.1545e-05, + "loss": 0.3898, + "step": 8313 + }, + { + "epoch": 0.46556165304065406, + "grad_norm": 1.170276165008545, + "learning_rate": 4.155e-05, + "loss": 0.3996, + "step": 8314 + }, + { + "epoch": 0.4656176503527831, + "grad_norm": 1.4561901092529297, + "learning_rate": 4.1555e-05, + "loss": 0.4181, + "step": 8315 + }, + { + "epoch": 0.4656736476649121, + "grad_norm": 1.160518765449524, + "learning_rate": 4.156e-05, + "loss": 0.524, + "step": 8316 + }, + { + "epoch": 0.4657296449770411, + "grad_norm": 
1.5591087341308594, + "learning_rate": 4.1565000000000006e-05, + "loss": 0.5717, + "step": 8317 + }, + { + "epoch": 0.46578564228917013, + "grad_norm": 1.2539479732513428, + "learning_rate": 4.1570000000000003e-05, + "loss": 0.5269, + "step": 8318 + }, + { + "epoch": 0.46584163960129915, + "grad_norm": 1.2426033020019531, + "learning_rate": 4.1575e-05, + "loss": 0.5566, + "step": 8319 + }, + { + "epoch": 0.46589763691342817, + "grad_norm": 1.1913050413131714, + "learning_rate": 4.1580000000000005e-05, + "loss": 0.3839, + "step": 8320 + }, + { + "epoch": 0.4659536342255572, + "grad_norm": 1.2415966987609863, + "learning_rate": 4.1585e-05, + "loss": 0.3982, + "step": 8321 + }, + { + "epoch": 0.4660096315376862, + "grad_norm": 1.1320737600326538, + "learning_rate": 4.159e-05, + "loss": 0.4148, + "step": 8322 + }, + { + "epoch": 0.46606562884981523, + "grad_norm": 1.401557445526123, + "learning_rate": 4.1595e-05, + "loss": 0.4431, + "step": 8323 + }, + { + "epoch": 0.46612162616194425, + "grad_norm": 1.2837218046188354, + "learning_rate": 4.16e-05, + "loss": 0.4153, + "step": 8324 + }, + { + "epoch": 0.46617762347407327, + "grad_norm": 1.4516154527664185, + "learning_rate": 4.1605e-05, + "loss": 0.493, + "step": 8325 + }, + { + "epoch": 0.4662336207862023, + "grad_norm": 1.219534993171692, + "learning_rate": 4.161e-05, + "loss": 0.4909, + "step": 8326 + }, + { + "epoch": 0.4662896180983313, + "grad_norm": 1.3851169347763062, + "learning_rate": 4.161500000000001e-05, + "loss": 0.3984, + "step": 8327 + }, + { + "epoch": 0.4663456154104603, + "grad_norm": 1.273726224899292, + "learning_rate": 4.1620000000000005e-05, + "loss": 0.3899, + "step": 8328 + }, + { + "epoch": 0.46640161272258934, + "grad_norm": 2.3882343769073486, + "learning_rate": 4.1625e-05, + "loss": 0.6668, + "step": 8329 + }, + { + "epoch": 0.4664576100347183, + "grad_norm": 1.1493287086486816, + "learning_rate": 4.163e-05, + "loss": 0.3776, + "step": 8330 + }, + { + "epoch": 0.4665136073468473, + 
"grad_norm": 1.4332467317581177, + "learning_rate": 4.1635000000000004e-05, + "loss": 0.4647, + "step": 8331 + }, + { + "epoch": 0.46656960465897634, + "grad_norm": 1.2122044563293457, + "learning_rate": 4.164e-05, + "loss": 0.3503, + "step": 8332 + }, + { + "epoch": 0.46662560197110536, + "grad_norm": 1.2885552644729614, + "learning_rate": 4.1645e-05, + "loss": 0.587, + "step": 8333 + }, + { + "epoch": 0.4666815992832344, + "grad_norm": 1.2309215068817139, + "learning_rate": 4.165e-05, + "loss": 0.5463, + "step": 8334 + }, + { + "epoch": 0.4667375965953634, + "grad_norm": 1.1431134939193726, + "learning_rate": 4.1655e-05, + "loss": 0.3593, + "step": 8335 + }, + { + "epoch": 0.4667935939074924, + "grad_norm": 1.6323133707046509, + "learning_rate": 4.1660000000000004e-05, + "loss": 0.4827, + "step": 8336 + }, + { + "epoch": 0.46684959121962144, + "grad_norm": 1.3194406032562256, + "learning_rate": 4.1665e-05, + "loss": 0.4625, + "step": 8337 + }, + { + "epoch": 0.46690558853175046, + "grad_norm": 1.180116891860962, + "learning_rate": 4.1670000000000006e-05, + "loss": 0.4089, + "step": 8338 + }, + { + "epoch": 0.4669615858438795, + "grad_norm": 1.1480462551116943, + "learning_rate": 4.1675e-05, + "loss": 0.426, + "step": 8339 + }, + { + "epoch": 0.4670175831560085, + "grad_norm": 1.3161413669586182, + "learning_rate": 4.168e-05, + "loss": 0.4495, + "step": 8340 + }, + { + "epoch": 0.4670735804681375, + "grad_norm": 1.3676503896713257, + "learning_rate": 4.1685000000000005e-05, + "loss": 0.4018, + "step": 8341 + }, + { + "epoch": 0.46712957778026654, + "grad_norm": 1.2224594354629517, + "learning_rate": 4.169e-05, + "loss": 0.3976, + "step": 8342 + }, + { + "epoch": 0.46718557509239556, + "grad_norm": 1.1920114755630493, + "learning_rate": 4.1695e-05, + "loss": 0.3581, + "step": 8343 + }, + { + "epoch": 0.4672415724045246, + "grad_norm": 1.3474382162094116, + "learning_rate": 4.17e-05, + "loss": 0.4556, + "step": 8344 + }, + { + "epoch": 0.4672975697166536, + 
"grad_norm": 1.1781539916992188, + "learning_rate": 4.1705e-05, + "loss": 0.4035, + "step": 8345 + }, + { + "epoch": 0.4673535670287826, + "grad_norm": 1.3382846117019653, + "learning_rate": 4.1710000000000006e-05, + "loss": 0.4421, + "step": 8346 + }, + { + "epoch": 0.46740956434091163, + "grad_norm": 1.2219523191452026, + "learning_rate": 4.1715e-05, + "loss": 0.342, + "step": 8347 + }, + { + "epoch": 0.46746556165304065, + "grad_norm": 1.2026203870773315, + "learning_rate": 4.172e-05, + "loss": 0.3207, + "step": 8348 + }, + { + "epoch": 0.46752155896516967, + "grad_norm": 1.270452857017517, + "learning_rate": 4.1725000000000005e-05, + "loss": 0.3315, + "step": 8349 + }, + { + "epoch": 0.4675775562772987, + "grad_norm": 1.188292145729065, + "learning_rate": 4.173e-05, + "loss": 0.4851, + "step": 8350 + }, + { + "epoch": 0.4676335535894277, + "grad_norm": 1.2668474912643433, + "learning_rate": 4.1735e-05, + "loss": 0.5031, + "step": 8351 + }, + { + "epoch": 0.4676895509015567, + "grad_norm": 1.1694830656051636, + "learning_rate": 4.1740000000000004e-05, + "loss": 0.399, + "step": 8352 + }, + { + "epoch": 0.46774554821368575, + "grad_norm": Infinity, + "learning_rate": 4.1740000000000004e-05, + "loss": 0.3961, + "step": 8353 + }, + { + "epoch": 0.46780154552581477, + "grad_norm": 1.2740497589111328, + "learning_rate": 4.1745e-05, + "loss": 0.3757, + "step": 8354 + }, + { + "epoch": 0.4678575428379438, + "grad_norm": 1.4216548204421997, + "learning_rate": 4.175e-05, + "loss": 0.5351, + "step": 8355 + }, + { + "epoch": 0.4679135401500728, + "grad_norm": 1.3175214529037476, + "learning_rate": 4.1755e-05, + "loss": 0.4135, + "step": 8356 + }, + { + "epoch": 0.4679695374622018, + "grad_norm": 1.3715314865112305, + "learning_rate": 4.176000000000001e-05, + "loss": 0.4682, + "step": 8357 + }, + { + "epoch": 0.46802553477433084, + "grad_norm": 1.3775523900985718, + "learning_rate": 4.1765000000000004e-05, + "loss": 0.3678, + "step": 8358 + }, + { + "epoch": 
0.46808153208645986, + "grad_norm": 0.9641080498695374, + "learning_rate": 4.177e-05, + "loss": 0.352, + "step": 8359 + }, + { + "epoch": 0.4681375293985889, + "grad_norm": 1.3386561870574951, + "learning_rate": 4.1775000000000006e-05, + "loss": 0.3755, + "step": 8360 + }, + { + "epoch": 0.4681935267107179, + "grad_norm": 1.8422759771347046, + "learning_rate": 4.178e-05, + "loss": 0.4397, + "step": 8361 + }, + { + "epoch": 0.4682495240228469, + "grad_norm": 1.231330394744873, + "learning_rate": 4.1785e-05, + "loss": 0.4097, + "step": 8362 + }, + { + "epoch": 0.46830552133497594, + "grad_norm": 1.1578826904296875, + "learning_rate": 4.179e-05, + "loss": 0.4185, + "step": 8363 + }, + { + "epoch": 0.46836151864710496, + "grad_norm": 1.117853045463562, + "learning_rate": 4.1795e-05, + "loss": 0.4176, + "step": 8364 + }, + { + "epoch": 0.468417515959234, + "grad_norm": 1.170587420463562, + "learning_rate": 4.18e-05, + "loss": 0.3724, + "step": 8365 + }, + { + "epoch": 0.468473513271363, + "grad_norm": 1.0577501058578491, + "learning_rate": 4.1805e-05, + "loss": 0.4205, + "step": 8366 + }, + { + "epoch": 0.468529510583492, + "grad_norm": 1.3998960256576538, + "learning_rate": 4.181000000000001e-05, + "loss": 0.5887, + "step": 8367 + }, + { + "epoch": 0.46858550789562103, + "grad_norm": 1.2609167098999023, + "learning_rate": 4.1815000000000005e-05, + "loss": 0.4566, + "step": 8368 + }, + { + "epoch": 0.46864150520775005, + "grad_norm": 1.1134520769119263, + "learning_rate": 4.182e-05, + "loss": 0.4511, + "step": 8369 + }, + { + "epoch": 0.4686975025198791, + "grad_norm": 1.513018250465393, + "learning_rate": 4.1825e-05, + "loss": 0.4166, + "step": 8370 + }, + { + "epoch": 0.46875349983200804, + "grad_norm": 1.2183899879455566, + "learning_rate": 4.1830000000000004e-05, + "loss": 0.3214, + "step": 8371 + }, + { + "epoch": 0.46880949714413706, + "grad_norm": 1.2080955505371094, + "learning_rate": 4.1835e-05, + "loss": 0.4435, + "step": 8372 + }, + { + "epoch": 
0.4688654944562661, + "grad_norm": 1.2426809072494507, + "learning_rate": 4.184e-05, + "loss": 0.5445, + "step": 8373 + }, + { + "epoch": 0.4689214917683951, + "grad_norm": 1.1984511613845825, + "learning_rate": 4.1845000000000003e-05, + "loss": 0.3413, + "step": 8374 + }, + { + "epoch": 0.4689774890805241, + "grad_norm": 1.206897497177124, + "learning_rate": 4.185e-05, + "loss": 0.6036, + "step": 8375 + }, + { + "epoch": 0.46903348639265313, + "grad_norm": 1.1841862201690674, + "learning_rate": 4.1855e-05, + "loss": 0.3707, + "step": 8376 + }, + { + "epoch": 0.46908948370478215, + "grad_norm": 1.1985225677490234, + "learning_rate": 4.186e-05, + "loss": 0.4071, + "step": 8377 + }, + { + "epoch": 0.46914548101691117, + "grad_norm": 1.1437454223632812, + "learning_rate": 4.1865000000000007e-05, + "loss": 0.3683, + "step": 8378 + }, + { + "epoch": 0.4692014783290402, + "grad_norm": 1.1463754177093506, + "learning_rate": 4.1870000000000004e-05, + "loss": 0.3746, + "step": 8379 + }, + { + "epoch": 0.4692574756411692, + "grad_norm": 1.1942380666732788, + "learning_rate": 4.1875e-05, + "loss": 0.3742, + "step": 8380 + }, + { + "epoch": 0.4693134729532982, + "grad_norm": 1.176209807395935, + "learning_rate": 4.1880000000000006e-05, + "loss": 0.4211, + "step": 8381 + }, + { + "epoch": 0.46936947026542725, + "grad_norm": 1.2539499998092651, + "learning_rate": 4.1885e-05, + "loss": 0.4638, + "step": 8382 + }, + { + "epoch": 0.46942546757755627, + "grad_norm": 1.2557951211929321, + "learning_rate": 4.189e-05, + "loss": 0.3983, + "step": 8383 + }, + { + "epoch": 0.4694814648896853, + "grad_norm": 1.3948652744293213, + "learning_rate": 4.1895e-05, + "loss": 0.4609, + "step": 8384 + }, + { + "epoch": 0.4695374622018143, + "grad_norm": 1.2727895975112915, + "learning_rate": 4.19e-05, + "loss": 0.3754, + "step": 8385 + }, + { + "epoch": 0.4695934595139433, + "grad_norm": 1.0155041217803955, + "learning_rate": 4.1905e-05, + "loss": 0.3234, + "step": 8386 + }, + { + "epoch": 
0.46964945682607234, + "grad_norm": 1.5105594396591187, + "learning_rate": 4.191e-05, + "loss": 0.4672, + "step": 8387 + }, + { + "epoch": 0.46970545413820136, + "grad_norm": 1.1224207878112793, + "learning_rate": 4.1915e-05, + "loss": 0.483, + "step": 8388 + }, + { + "epoch": 0.4697614514503304, + "grad_norm": 1.2138935327529907, + "learning_rate": 4.1920000000000005e-05, + "loss": 0.3697, + "step": 8389 + }, + { + "epoch": 0.4698174487624594, + "grad_norm": 1.4979029893875122, + "learning_rate": 4.1925e-05, + "loss": 0.6758, + "step": 8390 + }, + { + "epoch": 0.4698734460745884, + "grad_norm": 1.3279343843460083, + "learning_rate": 4.193e-05, + "loss": 0.3267, + "step": 8391 + }, + { + "epoch": 0.46992944338671744, + "grad_norm": 1.2012410163879395, + "learning_rate": 4.1935000000000004e-05, + "loss": 0.3242, + "step": 8392 + }, + { + "epoch": 0.46998544069884646, + "grad_norm": 1.4319946765899658, + "learning_rate": 4.194e-05, + "loss": 0.601, + "step": 8393 + }, + { + "epoch": 0.4700414380109755, + "grad_norm": 1.10093355178833, + "learning_rate": 4.1945e-05, + "loss": 0.4318, + "step": 8394 + }, + { + "epoch": 0.4700974353231045, + "grad_norm": 1.4722977876663208, + "learning_rate": 4.195e-05, + "loss": 0.5919, + "step": 8395 + }, + { + "epoch": 0.4701534326352335, + "grad_norm": 1.5779893398284912, + "learning_rate": 4.1955e-05, + "loss": 0.4911, + "step": 8396 + }, + { + "epoch": 0.47020942994736253, + "grad_norm": 1.1077783107757568, + "learning_rate": 4.196e-05, + "loss": 0.4134, + "step": 8397 + }, + { + "epoch": 0.47026542725949155, + "grad_norm": 1.2830719947814941, + "learning_rate": 4.1965e-05, + "loss": 0.4634, + "step": 8398 + }, + { + "epoch": 0.47032142457162057, + "grad_norm": 1.5814772844314575, + "learning_rate": 4.1970000000000006e-05, + "loss": 0.4829, + "step": 8399 + }, + { + "epoch": 0.4703774218837496, + "grad_norm": 1.3765275478363037, + "learning_rate": 4.1975000000000004e-05, + "loss": 0.4915, + "step": 8400 + }, + { + "epoch": 
0.4704334191958786, + "grad_norm": 1.1571701765060425, + "learning_rate": 4.198e-05, + "loss": 0.3825, + "step": 8401 + }, + { + "epoch": 0.47048941650800763, + "grad_norm": 1.8681832551956177, + "learning_rate": 4.1985000000000005e-05, + "loss": 0.7184, + "step": 8402 + }, + { + "epoch": 0.47054541382013665, + "grad_norm": 1.151965618133545, + "learning_rate": 4.199e-05, + "loss": 0.3928, + "step": 8403 + }, + { + "epoch": 0.47060141113226567, + "grad_norm": 1.2336509227752686, + "learning_rate": 4.1995e-05, + "loss": 0.4089, + "step": 8404 + }, + { + "epoch": 0.4706574084443947, + "grad_norm": 1.2102365493774414, + "learning_rate": 4.2e-05, + "loss": 0.4185, + "step": 8405 + }, + { + "epoch": 0.4707134057565237, + "grad_norm": 1.2155449390411377, + "learning_rate": 4.2005e-05, + "loss": 0.4994, + "step": 8406 + }, + { + "epoch": 0.4707694030686527, + "grad_norm": 1.1627260446548462, + "learning_rate": 4.201e-05, + "loss": 0.4654, + "step": 8407 + }, + { + "epoch": 0.47082540038078174, + "grad_norm": 1.1251270771026611, + "learning_rate": 4.2015000000000003e-05, + "loss": 0.4778, + "step": 8408 + }, + { + "epoch": 0.47088139769291076, + "grad_norm": 1.2458096742630005, + "learning_rate": 4.202e-05, + "loss": 0.3236, + "step": 8409 + }, + { + "epoch": 0.4709373950050398, + "grad_norm": 1.3230431079864502, + "learning_rate": 4.2025000000000005e-05, + "loss": 0.456, + "step": 8410 + }, + { + "epoch": 0.4709933923171688, + "grad_norm": 1.3141199350357056, + "learning_rate": 4.203e-05, + "loss": 0.5702, + "step": 8411 + }, + { + "epoch": 0.4710493896292978, + "grad_norm": 1.612363338470459, + "learning_rate": 4.2035e-05, + "loss": 0.5587, + "step": 8412 + }, + { + "epoch": 0.4711053869414268, + "grad_norm": 1.1775351762771606, + "learning_rate": 4.2040000000000004e-05, + "loss": 0.4286, + "step": 8413 + }, + { + "epoch": 0.4711613842535558, + "grad_norm": 1.340772032737732, + "learning_rate": 4.2045e-05, + "loss": 0.476, + "step": 8414 + }, + { + "epoch": 
0.4712173815656848, + "grad_norm": 1.1946804523468018, + "learning_rate": 4.205e-05, + "loss": 0.5355, + "step": 8415 + }, + { + "epoch": 0.47127337887781384, + "grad_norm": 1.1417064666748047, + "learning_rate": 4.2055e-05, + "loss": 0.4496, + "step": 8416 + }, + { + "epoch": 0.47132937618994286, + "grad_norm": 1.1817623376846313, + "learning_rate": 4.206e-05, + "loss": 0.4119, + "step": 8417 + }, + { + "epoch": 0.4713853735020719, + "grad_norm": 1.3147894144058228, + "learning_rate": 4.2065000000000005e-05, + "loss": 0.5118, + "step": 8418 + }, + { + "epoch": 0.4714413708142009, + "grad_norm": 1.122429370880127, + "learning_rate": 4.207e-05, + "loss": 0.3612, + "step": 8419 + }, + { + "epoch": 0.4714973681263299, + "grad_norm": 1.4243937730789185, + "learning_rate": 4.2075000000000006e-05, + "loss": 0.5584, + "step": 8420 + }, + { + "epoch": 0.47155336543845894, + "grad_norm": 1.4853757619857788, + "learning_rate": 4.2080000000000004e-05, + "loss": 0.3971, + "step": 8421 + }, + { + "epoch": 0.47160936275058796, + "grad_norm": 1.1240071058273315, + "learning_rate": 4.2085e-05, + "loss": 0.4124, + "step": 8422 + }, + { + "epoch": 0.471665360062717, + "grad_norm": 1.3261979818344116, + "learning_rate": 4.209e-05, + "loss": 0.445, + "step": 8423 + }, + { + "epoch": 0.471721357374846, + "grad_norm": 1.1619199514389038, + "learning_rate": 4.2095e-05, + "loss": 0.3402, + "step": 8424 + }, + { + "epoch": 0.471777354686975, + "grad_norm": 1.2499545812606812, + "learning_rate": 4.21e-05, + "loss": 0.3863, + "step": 8425 + }, + { + "epoch": 0.47183335199910403, + "grad_norm": 1.130560040473938, + "learning_rate": 4.2105e-05, + "loss": 0.3989, + "step": 8426 + }, + { + "epoch": 0.47188934931123305, + "grad_norm": 1.137787938117981, + "learning_rate": 4.211e-05, + "loss": 0.3587, + "step": 8427 + }, + { + "epoch": 0.47194534662336207, + "grad_norm": 1.1024013757705688, + "learning_rate": 4.2115000000000006e-05, + "loss": 0.416, + "step": 8428 + }, + { + "epoch": 
0.4720013439354911, + "grad_norm": 1.1284549236297607, + "learning_rate": 4.212e-05, + "loss": 0.4004, + "step": 8429 + }, + { + "epoch": 0.4720573412476201, + "grad_norm": 1.240843415260315, + "learning_rate": 4.2125e-05, + "loss": 0.4798, + "step": 8430 + }, + { + "epoch": 0.47211333855974913, + "grad_norm": 1.1080268621444702, + "learning_rate": 4.2130000000000005e-05, + "loss": 0.389, + "step": 8431 + }, + { + "epoch": 0.47216933587187815, + "grad_norm": 1.2600219249725342, + "learning_rate": 4.2135e-05, + "loss": 0.4573, + "step": 8432 + }, + { + "epoch": 0.47222533318400717, + "grad_norm": 1.1538052558898926, + "learning_rate": 4.214e-05, + "loss": 0.305, + "step": 8433 + }, + { + "epoch": 0.4722813304961362, + "grad_norm": 1.1136164665222168, + "learning_rate": 4.2145000000000004e-05, + "loss": 0.4308, + "step": 8434 + }, + { + "epoch": 0.4723373278082652, + "grad_norm": 1.4552466869354248, + "learning_rate": 4.215e-05, + "loss": 0.491, + "step": 8435 + }, + { + "epoch": 0.4723933251203942, + "grad_norm": 1.4417535066604614, + "learning_rate": 4.2155e-05, + "loss": 0.4447, + "step": 8436 + }, + { + "epoch": 0.47244932243252324, + "grad_norm": 1.243757724761963, + "learning_rate": 4.2159999999999996e-05, + "loss": 0.3696, + "step": 8437 + }, + { + "epoch": 0.47250531974465226, + "grad_norm": 1.1235696077346802, + "learning_rate": 4.216500000000001e-05, + "loss": 0.3932, + "step": 8438 + }, + { + "epoch": 0.4725613170567813, + "grad_norm": 1.2974927425384521, + "learning_rate": 4.2170000000000005e-05, + "loss": 0.4324, + "step": 8439 + }, + { + "epoch": 0.4726173143689103, + "grad_norm": 1.5832685232162476, + "learning_rate": 4.2175e-05, + "loss": 0.5752, + "step": 8440 + }, + { + "epoch": 0.4726733116810393, + "grad_norm": 1.3697322607040405, + "learning_rate": 4.2180000000000006e-05, + "loss": 0.3736, + "step": 8441 + }, + { + "epoch": 0.47272930899316834, + "grad_norm": 1.3663676977157593, + "learning_rate": 4.2185000000000004e-05, + "loss": 0.5753, + 
"step": 8442 + }, + { + "epoch": 0.47278530630529736, + "grad_norm": 1.1377853155136108, + "learning_rate": 4.219e-05, + "loss": 0.4741, + "step": 8443 + }, + { + "epoch": 0.4728413036174264, + "grad_norm": 1.0805935859680176, + "learning_rate": 4.2195e-05, + "loss": 0.4755, + "step": 8444 + }, + { + "epoch": 0.4728973009295554, + "grad_norm": 1.3271127939224243, + "learning_rate": 4.22e-05, + "loss": 0.4224, + "step": 8445 + }, + { + "epoch": 0.4729532982416844, + "grad_norm": 1.1402291059494019, + "learning_rate": 4.2205e-05, + "loss": 0.6048, + "step": 8446 + }, + { + "epoch": 0.47300929555381344, + "grad_norm": 1.3297299146652222, + "learning_rate": 4.221e-05, + "loss": 0.4885, + "step": 8447 + }, + { + "epoch": 0.47306529286594245, + "grad_norm": 1.0995362997055054, + "learning_rate": 4.2215e-05, + "loss": 0.4252, + "step": 8448 + }, + { + "epoch": 0.4731212901780715, + "grad_norm": 1.1222656965255737, + "learning_rate": 4.2220000000000006e-05, + "loss": 0.3197, + "step": 8449 + }, + { + "epoch": 0.4731772874902005, + "grad_norm": 1.27969229221344, + "learning_rate": 4.2225e-05, + "loss": 0.4638, + "step": 8450 + }, + { + "epoch": 0.4732332848023295, + "grad_norm": 1.1901013851165771, + "learning_rate": 4.223e-05, + "loss": 0.4299, + "step": 8451 + }, + { + "epoch": 0.47328928211445853, + "grad_norm": 1.2603455781936646, + "learning_rate": 4.2235000000000005e-05, + "loss": 0.3355, + "step": 8452 + }, + { + "epoch": 0.47334527942658755, + "grad_norm": 1.155975341796875, + "learning_rate": 4.224e-05, + "loss": 0.3084, + "step": 8453 + }, + { + "epoch": 0.4734012767387165, + "grad_norm": 1.3510417938232422, + "learning_rate": 4.2245e-05, + "loss": 0.5243, + "step": 8454 + }, + { + "epoch": 0.47345727405084553, + "grad_norm": 1.082627773284912, + "learning_rate": 4.2250000000000004e-05, + "loss": 0.3634, + "step": 8455 + }, + { + "epoch": 0.47351327136297455, + "grad_norm": 1.5972204208374023, + "learning_rate": 4.2255e-05, + "loss": 0.4188, + "step": 8456 + }, + 
{ + "epoch": 0.47356926867510357, + "grad_norm": 1.4694019556045532, + "learning_rate": 4.226e-05, + "loss": 0.3224, + "step": 8457 + }, + { + "epoch": 0.4736252659872326, + "grad_norm": 1.2422003746032715, + "learning_rate": 4.2265e-05, + "loss": 0.5619, + "step": 8458 + }, + { + "epoch": 0.4736812632993616, + "grad_norm": 1.2314832210540771, + "learning_rate": 4.227000000000001e-05, + "loss": 0.3839, + "step": 8459 + }, + { + "epoch": 0.47373726061149063, + "grad_norm": 1.3565618991851807, + "learning_rate": 4.2275000000000004e-05, + "loss": 0.4406, + "step": 8460 + }, + { + "epoch": 0.47379325792361965, + "grad_norm": 3.4537088871002197, + "learning_rate": 4.228e-05, + "loss": 0.5471, + "step": 8461 + }, + { + "epoch": 0.47384925523574867, + "grad_norm": 1.3622307777404785, + "learning_rate": 4.2285e-05, + "loss": 0.4902, + "step": 8462 + }, + { + "epoch": 0.4739052525478777, + "grad_norm": 1.2124581336975098, + "learning_rate": 4.229e-05, + "loss": 0.4515, + "step": 8463 + }, + { + "epoch": 0.4739612498600067, + "grad_norm": 1.5086095333099365, + "learning_rate": 4.2295e-05, + "loss": 0.4574, + "step": 8464 + }, + { + "epoch": 0.4740172471721357, + "grad_norm": 1.3772732019424438, + "learning_rate": 4.23e-05, + "loss": 0.4398, + "step": 8465 + }, + { + "epoch": 0.47407324448426474, + "grad_norm": 1.1363698244094849, + "learning_rate": 4.2305e-05, + "loss": 0.4022, + "step": 8466 + }, + { + "epoch": 0.47412924179639376, + "grad_norm": 1.329436182975769, + "learning_rate": 4.231e-05, + "loss": 0.4389, + "step": 8467 + }, + { + "epoch": 0.4741852391085228, + "grad_norm": 1.2213752269744873, + "learning_rate": 4.2315000000000004e-05, + "loss": 0.3593, + "step": 8468 + }, + { + "epoch": 0.4742412364206518, + "grad_norm": 1.3433914184570312, + "learning_rate": 4.232e-05, + "loss": 0.381, + "step": 8469 + }, + { + "epoch": 0.4742972337327808, + "grad_norm": 1.1535258293151855, + "learning_rate": 4.2325000000000006e-05, + "loss": 0.3721, + "step": 8470 + }, + { + 
"epoch": 0.47435323104490984, + "grad_norm": 1.349773645401001, + "learning_rate": 4.233e-05, + "loss": 0.4592, + "step": 8471 + }, + { + "epoch": 0.47440922835703886, + "grad_norm": 1.6597414016723633, + "learning_rate": 4.2335e-05, + "loss": 0.4148, + "step": 8472 + }, + { + "epoch": 0.4744652256691679, + "grad_norm": 1.2057273387908936, + "learning_rate": 4.2340000000000005e-05, + "loss": 0.3378, + "step": 8473 + }, + { + "epoch": 0.4745212229812969, + "grad_norm": 1.6307529211044312, + "learning_rate": 4.2345e-05, + "loss": 0.421, + "step": 8474 + }, + { + "epoch": 0.4745772202934259, + "grad_norm": 1.352099895477295, + "learning_rate": 4.235e-05, + "loss": 0.3547, + "step": 8475 + }, + { + "epoch": 0.47463321760555494, + "grad_norm": 1.4897671937942505, + "learning_rate": 4.2355000000000004e-05, + "loss": 0.4558, + "step": 8476 + }, + { + "epoch": 0.47468921491768395, + "grad_norm": 1.261129379272461, + "learning_rate": 4.236e-05, + "loss": 0.42, + "step": 8477 + }, + { + "epoch": 0.474745212229813, + "grad_norm": 1.3802372217178345, + "learning_rate": 4.2365000000000005e-05, + "loss": 0.3786, + "step": 8478 + }, + { + "epoch": 0.474801209541942, + "grad_norm": 1.2289320230484009, + "learning_rate": 4.237e-05, + "loss": 0.4587, + "step": 8479 + }, + { + "epoch": 0.474857206854071, + "grad_norm": 1.220415711402893, + "learning_rate": 4.237500000000001e-05, + "loss": 0.4528, + "step": 8480 + }, + { + "epoch": 0.47491320416620003, + "grad_norm": 1.6037752628326416, + "learning_rate": 4.2380000000000004e-05, + "loss": 0.4565, + "step": 8481 + }, + { + "epoch": 0.47496920147832905, + "grad_norm": 1.3115330934524536, + "learning_rate": 4.2385e-05, + "loss": 0.4804, + "step": 8482 + }, + { + "epoch": 0.47502519879045807, + "grad_norm": 1.4712803363800049, + "learning_rate": 4.239e-05, + "loss": 0.3827, + "step": 8483 + }, + { + "epoch": 0.4750811961025871, + "grad_norm": 1.428632378578186, + "learning_rate": 4.2395e-05, + "loss": 0.5751, + "step": 8484 + }, + { + 
"epoch": 0.4751371934147161, + "grad_norm": 1.1504849195480347, + "learning_rate": 4.24e-05, + "loss": 0.513, + "step": 8485 + }, + { + "epoch": 0.4751931907268451, + "grad_norm": 1.1852437257766724, + "learning_rate": 4.2405e-05, + "loss": 0.5621, + "step": 8486 + }, + { + "epoch": 0.47524918803897415, + "grad_norm": 1.389432430267334, + "learning_rate": 4.241e-05, + "loss": 0.264, + "step": 8487 + }, + { + "epoch": 0.47530518535110317, + "grad_norm": 1.338192105293274, + "learning_rate": 4.2415000000000006e-05, + "loss": 0.4091, + "step": 8488 + }, + { + "epoch": 0.4753611826632322, + "grad_norm": 1.1868879795074463, + "learning_rate": 4.2420000000000004e-05, + "loss": 0.4872, + "step": 8489 + }, + { + "epoch": 0.4754171799753612, + "grad_norm": 1.2049229145050049, + "learning_rate": 4.2425e-05, + "loss": 0.3504, + "step": 8490 + }, + { + "epoch": 0.4754731772874902, + "grad_norm": 1.1247048377990723, + "learning_rate": 4.2430000000000005e-05, + "loss": 0.3739, + "step": 8491 + }, + { + "epoch": 0.47552917459961924, + "grad_norm": 1.4636249542236328, + "learning_rate": 4.2435e-05, + "loss": 0.4197, + "step": 8492 + }, + { + "epoch": 0.47558517191174826, + "grad_norm": 1.7594317197799683, + "learning_rate": 4.244e-05, + "loss": 0.4573, + "step": 8493 + }, + { + "epoch": 0.4756411692238773, + "grad_norm": 1.2627742290496826, + "learning_rate": 4.2445000000000004e-05, + "loss": 0.4462, + "step": 8494 + }, + { + "epoch": 0.47569716653600624, + "grad_norm": 1.0841120481491089, + "learning_rate": 4.245e-05, + "loss": 0.3393, + "step": 8495 + }, + { + "epoch": 0.47575316384813526, + "grad_norm": 1.362037181854248, + "learning_rate": 4.2455e-05, + "loss": 0.6053, + "step": 8496 + }, + { + "epoch": 0.4758091611602643, + "grad_norm": 1.2184008359909058, + "learning_rate": 4.246e-05, + "loss": 0.4431, + "step": 8497 + }, + { + "epoch": 0.4758651584723933, + "grad_norm": 1.2519798278808594, + "learning_rate": 4.246500000000001e-05, + "loss": 0.4529, + "step": 8498 + }, + { + 
"epoch": 0.4759211557845223, + "grad_norm": 1.2431145906448364, + "learning_rate": 4.2470000000000005e-05, + "loss": 0.366, + "step": 8499 + }, + { + "epoch": 0.47597715309665134, + "grad_norm": 1.226942539215088, + "learning_rate": 4.2475e-05, + "loss": 0.5221, + "step": 8500 + }, + { + "epoch": 0.47603315040878036, + "grad_norm": 1.2117550373077393, + "learning_rate": 4.248e-05, + "loss": 0.5224, + "step": 8501 + }, + { + "epoch": 0.4760891477209094, + "grad_norm": 1.5471678972244263, + "learning_rate": 4.2485000000000004e-05, + "loss": 0.5953, + "step": 8502 + }, + { + "epoch": 0.4761451450330384, + "grad_norm": 1.3753056526184082, + "learning_rate": 4.249e-05, + "loss": 0.5152, + "step": 8503 + }, + { + "epoch": 0.4762011423451674, + "grad_norm": 1.514878511428833, + "learning_rate": 4.2495e-05, + "loss": 0.7663, + "step": 8504 + }, + { + "epoch": 0.47625713965729644, + "grad_norm": 1.3028452396392822, + "learning_rate": 4.25e-05, + "loss": 0.4354, + "step": 8505 + }, + { + "epoch": 0.47631313696942545, + "grad_norm": 1.2990000247955322, + "learning_rate": 4.2505e-05, + "loss": 0.3929, + "step": 8506 + }, + { + "epoch": 0.4763691342815545, + "grad_norm": 1.1618504524230957, + "learning_rate": 4.251e-05, + "loss": 0.3683, + "step": 8507 + }, + { + "epoch": 0.4764251315936835, + "grad_norm": 1.2800405025482178, + "learning_rate": 4.2515e-05, + "loss": 0.4431, + "step": 8508 + }, + { + "epoch": 0.4764811289058125, + "grad_norm": 1.145199179649353, + "learning_rate": 4.2520000000000006e-05, + "loss": 0.4034, + "step": 8509 + }, + { + "epoch": 0.47653712621794153, + "grad_norm": 1.1259734630584717, + "learning_rate": 4.2525000000000004e-05, + "loss": 0.453, + "step": 8510 + }, + { + "epoch": 0.47659312353007055, + "grad_norm": 1.203216791152954, + "learning_rate": 4.253e-05, + "loss": 0.4499, + "step": 8511 + }, + { + "epoch": 0.47664912084219957, + "grad_norm": 1.1637547016143799, + "learning_rate": 4.2535000000000005e-05, + "loss": 0.415, + "step": 8512 + }, + { + 
"epoch": 0.4767051181543286, + "grad_norm": 1.1568801403045654, + "learning_rate": 4.254e-05, + "loss": 0.4233, + "step": 8513 + }, + { + "epoch": 0.4767611154664576, + "grad_norm": 1.6813509464263916, + "learning_rate": 4.2545e-05, + "loss": 0.5052, + "step": 8514 + }, + { + "epoch": 0.4768171127785866, + "grad_norm": 1.1524626016616821, + "learning_rate": 4.2550000000000004e-05, + "loss": 0.4701, + "step": 8515 + }, + { + "epoch": 0.47687311009071565, + "grad_norm": 1.2931450605392456, + "learning_rate": 4.2555e-05, + "loss": 0.3449, + "step": 8516 + }, + { + "epoch": 0.47692910740284467, + "grad_norm": 1.2546114921569824, + "learning_rate": 4.256e-05, + "loss": 0.41, + "step": 8517 + }, + { + "epoch": 0.4769851047149737, + "grad_norm": 1.379087209701538, + "learning_rate": 4.2564999999999997e-05, + "loss": 0.5385, + "step": 8518 + }, + { + "epoch": 0.4770411020271027, + "grad_norm": 1.0247600078582764, + "learning_rate": 4.257000000000001e-05, + "loss": 0.3114, + "step": 8519 + }, + { + "epoch": 0.4770970993392317, + "grad_norm": 1.1872286796569824, + "learning_rate": 4.2575000000000005e-05, + "loss": 0.3339, + "step": 8520 + }, + { + "epoch": 0.47715309665136074, + "grad_norm": 1.1792237758636475, + "learning_rate": 4.258e-05, + "loss": 0.6009, + "step": 8521 + }, + { + "epoch": 0.47720909396348976, + "grad_norm": 1.1308717727661133, + "learning_rate": 4.2585e-05, + "loss": 0.4187, + "step": 8522 + }, + { + "epoch": 0.4772650912756188, + "grad_norm": 1.3926209211349487, + "learning_rate": 4.2590000000000004e-05, + "loss": 0.4622, + "step": 8523 + }, + { + "epoch": 0.4773210885877478, + "grad_norm": 1.264739990234375, + "learning_rate": 4.2595e-05, + "loss": 0.3727, + "step": 8524 + }, + { + "epoch": 0.4773770858998768, + "grad_norm": 1.2509269714355469, + "learning_rate": 4.26e-05, + "loss": 0.5148, + "step": 8525 + }, + { + "epoch": 0.47743308321200584, + "grad_norm": 1.2540090084075928, + "learning_rate": 4.2605e-05, + "loss": 0.4493, + "step": 8526 + }, + { 
+ "epoch": 0.47748908052413486, + "grad_norm": 1.2280350923538208, + "learning_rate": 4.261e-05, + "loss": 0.3851, + "step": 8527 + }, + { + "epoch": 0.4775450778362639, + "grad_norm": 1.5690923929214478, + "learning_rate": 4.2615e-05, + "loss": 0.3389, + "step": 8528 + }, + { + "epoch": 0.4776010751483929, + "grad_norm": 1.2065706253051758, + "learning_rate": 4.262e-05, + "loss": 0.4382, + "step": 8529 + }, + { + "epoch": 0.4776570724605219, + "grad_norm": 1.1458523273468018, + "learning_rate": 4.2625000000000006e-05, + "loss": 0.3564, + "step": 8530 + }, + { + "epoch": 0.47771306977265093, + "grad_norm": 1.0513176918029785, + "learning_rate": 4.2630000000000004e-05, + "loss": 0.3677, + "step": 8531 + }, + { + "epoch": 0.47776906708477995, + "grad_norm": 1.4468929767608643, + "learning_rate": 4.2635e-05, + "loss": 0.5722, + "step": 8532 + }, + { + "epoch": 0.47782506439690897, + "grad_norm": 1.1844818592071533, + "learning_rate": 4.2640000000000005e-05, + "loss": 0.3671, + "step": 8533 + }, + { + "epoch": 0.477881061709038, + "grad_norm": 1.572331428527832, + "learning_rate": 4.2645e-05, + "loss": 0.4609, + "step": 8534 + }, + { + "epoch": 0.477937059021167, + "grad_norm": 1.3428393602371216, + "learning_rate": 4.265e-05, + "loss": 0.5407, + "step": 8535 + }, + { + "epoch": 0.47799305633329603, + "grad_norm": 1.227597713470459, + "learning_rate": 4.2655e-05, + "loss": 0.4428, + "step": 8536 + }, + { + "epoch": 0.478049053645425, + "grad_norm": 1.580487608909607, + "learning_rate": 4.266e-05, + "loss": 0.3856, + "step": 8537 + }, + { + "epoch": 0.478105050957554, + "grad_norm": 1.3850281238555908, + "learning_rate": 4.2665e-05, + "loss": 0.4733, + "step": 8538 + }, + { + "epoch": 0.47816104826968303, + "grad_norm": 1.440145492553711, + "learning_rate": 4.267e-05, + "loss": 0.655, + "step": 8539 + }, + { + "epoch": 0.47821704558181205, + "grad_norm": 1.1501185894012451, + "learning_rate": 4.2675e-05, + "loss": 0.4654, + "step": 8540 + }, + { + "epoch": 
0.47827304289394107, + "grad_norm": 1.074999213218689, + "learning_rate": 4.2680000000000005e-05, + "loss": 0.3818, + "step": 8541 + }, + { + "epoch": 0.4783290402060701, + "grad_norm": 1.189826488494873, + "learning_rate": 4.2685e-05, + "loss": 0.4428, + "step": 8542 + }, + { + "epoch": 0.4783850375181991, + "grad_norm": 1.0231224298477173, + "learning_rate": 4.269e-05, + "loss": 0.3145, + "step": 8543 + }, + { + "epoch": 0.4784410348303281, + "grad_norm": 1.1621557474136353, + "learning_rate": 4.2695000000000004e-05, + "loss": 0.3708, + "step": 8544 + }, + { + "epoch": 0.47849703214245715, + "grad_norm": 1.0861577987670898, + "learning_rate": 4.27e-05, + "loss": 0.444, + "step": 8545 + }, + { + "epoch": 0.47855302945458617, + "grad_norm": 1.2717492580413818, + "learning_rate": 4.2705e-05, + "loss": 0.413, + "step": 8546 + }, + { + "epoch": 0.4786090267667152, + "grad_norm": 1.286747694015503, + "learning_rate": 4.271e-05, + "loss": 0.4528, + "step": 8547 + }, + { + "epoch": 0.4786650240788442, + "grad_norm": 1.2454153299331665, + "learning_rate": 4.2715e-05, + "loss": 0.4464, + "step": 8548 + }, + { + "epoch": 0.4787210213909732, + "grad_norm": 1.143328070640564, + "learning_rate": 4.2720000000000004e-05, + "loss": 0.3162, + "step": 8549 + }, + { + "epoch": 0.47877701870310224, + "grad_norm": 1.269060492515564, + "learning_rate": 4.2725e-05, + "loss": 0.6328, + "step": 8550 + }, + { + "epoch": 0.47883301601523126, + "grad_norm": 1.2941027879714966, + "learning_rate": 4.2730000000000006e-05, + "loss": 0.4825, + "step": 8551 + }, + { + "epoch": 0.4788890133273603, + "grad_norm": 1.3494073152542114, + "learning_rate": 4.2735e-05, + "loss": 0.5508, + "step": 8552 + }, + { + "epoch": 0.4789450106394893, + "grad_norm": 1.3683594465255737, + "learning_rate": 4.274e-05, + "loss": 0.3838, + "step": 8553 + }, + { + "epoch": 0.4790010079516183, + "grad_norm": 1.2412092685699463, + "learning_rate": 4.2745000000000005e-05, + "loss": 0.401, + "step": 8554 + }, + { + "epoch": 
0.47905700526374734, + "grad_norm": 1.2897952795028687, + "learning_rate": 4.275e-05, + "loss": 0.4075, + "step": 8555 + }, + { + "epoch": 0.47911300257587636, + "grad_norm": 1.2257930040359497, + "learning_rate": 4.2755e-05, + "loss": 0.4128, + "step": 8556 + }, + { + "epoch": 0.4791689998880054, + "grad_norm": 1.193520188331604, + "learning_rate": 4.276e-05, + "loss": 0.3751, + "step": 8557 + }, + { + "epoch": 0.4792249972001344, + "grad_norm": 1.2806997299194336, + "learning_rate": 4.2765e-05, + "loss": 0.3365, + "step": 8558 + }, + { + "epoch": 0.4792809945122634, + "grad_norm": 1.281146764755249, + "learning_rate": 4.2770000000000006e-05, + "loss": 0.4771, + "step": 8559 + }, + { + "epoch": 0.47933699182439243, + "grad_norm": 1.3099502325057983, + "learning_rate": 4.2775e-05, + "loss": 0.3065, + "step": 8560 + }, + { + "epoch": 0.47939298913652145, + "grad_norm": 1.3549703359603882, + "learning_rate": 4.278e-05, + "loss": 0.3679, + "step": 8561 + }, + { + "epoch": 0.47944898644865047, + "grad_norm": 1.138702154159546, + "learning_rate": 4.2785000000000005e-05, + "loss": 0.3429, + "step": 8562 + }, + { + "epoch": 0.4795049837607795, + "grad_norm": 1.219333529472351, + "learning_rate": 4.279e-05, + "loss": 0.2814, + "step": 8563 + }, + { + "epoch": 0.4795609810729085, + "grad_norm": 1.2598845958709717, + "learning_rate": 4.2795e-05, + "loss": 0.35, + "step": 8564 + }, + { + "epoch": 0.47961697838503753, + "grad_norm": 1.1545138359069824, + "learning_rate": 4.2800000000000004e-05, + "loss": 0.4768, + "step": 8565 + }, + { + "epoch": 0.47967297569716655, + "grad_norm": 1.149704933166504, + "learning_rate": 4.2805e-05, + "loss": 0.5022, + "step": 8566 + }, + { + "epoch": 0.47972897300929557, + "grad_norm": 1.1559488773345947, + "learning_rate": 4.281e-05, + "loss": 0.368, + "step": 8567 + }, + { + "epoch": 0.4797849703214246, + "grad_norm": 1.2474576234817505, + "learning_rate": 4.2815e-05, + "loss": 0.3818, + "step": 8568 + }, + { + "epoch": 0.4798409676335536, + 
"grad_norm": 1.2686368227005005, + "learning_rate": 4.282000000000001e-05, + "loss": 0.3357, + "step": 8569 + }, + { + "epoch": 0.4798969649456826, + "grad_norm": 1.383123755455017, + "learning_rate": 4.2825000000000004e-05, + "loss": 0.4867, + "step": 8570 + }, + { + "epoch": 0.47995296225781164, + "grad_norm": 1.1608155965805054, + "learning_rate": 4.283e-05, + "loss": 0.4371, + "step": 8571 + }, + { + "epoch": 0.48000895956994066, + "grad_norm": 1.2059683799743652, + "learning_rate": 4.2835000000000006e-05, + "loss": 0.4577, + "step": 8572 + }, + { + "epoch": 0.4800649568820697, + "grad_norm": 1.4019975662231445, + "learning_rate": 4.284e-05, + "loss": 0.38, + "step": 8573 + }, + { + "epoch": 0.4801209541941987, + "grad_norm": 1.38151216506958, + "learning_rate": 4.2845e-05, + "loss": 0.5864, + "step": 8574 + }, + { + "epoch": 0.4801769515063277, + "grad_norm": 1.21138334274292, + "learning_rate": 4.285e-05, + "loss": 0.4912, + "step": 8575 + }, + { + "epoch": 0.48023294881845674, + "grad_norm": 1.4562879800796509, + "learning_rate": 4.2855e-05, + "loss": 0.427, + "step": 8576 + }, + { + "epoch": 0.48028894613058576, + "grad_norm": 1.2790573835372925, + "learning_rate": 4.286e-05, + "loss": 0.4197, + "step": 8577 + }, + { + "epoch": 0.4803449434427147, + "grad_norm": 1.2116519212722778, + "learning_rate": 4.2865e-05, + "loss": 0.3213, + "step": 8578 + }, + { + "epoch": 0.48040094075484374, + "grad_norm": 1.5755831003189087, + "learning_rate": 4.287000000000001e-05, + "loss": 0.5585, + "step": 8579 + }, + { + "epoch": 0.48045693806697276, + "grad_norm": 1.244770884513855, + "learning_rate": 4.2875000000000005e-05, + "loss": 0.4356, + "step": 8580 + }, + { + "epoch": 0.4805129353791018, + "grad_norm": 1.1025171279907227, + "learning_rate": 4.288e-05, + "loss": 0.4699, + "step": 8581 + }, + { + "epoch": 0.4805689326912308, + "grad_norm": 1.2222015857696533, + "learning_rate": 4.2885e-05, + "loss": 0.3861, + "step": 8582 + }, + { + "epoch": 0.4806249300033598, + 
"grad_norm": 1.071301817893982, + "learning_rate": 4.2890000000000004e-05, + "loss": 0.3008, + "step": 8583 + }, + { + "epoch": 0.48068092731548884, + "grad_norm": 1.2025933265686035, + "learning_rate": 4.2895e-05, + "loss": 0.3804, + "step": 8584 + }, + { + "epoch": 0.48073692462761786, + "grad_norm": 1.428401231765747, + "learning_rate": 4.29e-05, + "loss": 0.4237, + "step": 8585 + }, + { + "epoch": 0.4807929219397469, + "grad_norm": 1.6277501583099365, + "learning_rate": 4.2905000000000003e-05, + "loss": 0.4789, + "step": 8586 + }, + { + "epoch": 0.4808489192518759, + "grad_norm": 1.3520350456237793, + "learning_rate": 4.291e-05, + "loss": 0.5518, + "step": 8587 + }, + { + "epoch": 0.4809049165640049, + "grad_norm": 1.2806624174118042, + "learning_rate": 4.2915e-05, + "loss": 0.522, + "step": 8588 + }, + { + "epoch": 0.48096091387613393, + "grad_norm": 1.2821451425552368, + "learning_rate": 4.292e-05, + "loss": 0.4721, + "step": 8589 + }, + { + "epoch": 0.48101691118826295, + "grad_norm": 1.2855767011642456, + "learning_rate": 4.2925000000000007e-05, + "loss": 0.3422, + "step": 8590 + }, + { + "epoch": 0.48107290850039197, + "grad_norm": 1.1831409931182861, + "learning_rate": 4.2930000000000004e-05, + "loss": 0.4138, + "step": 8591 + }, + { + "epoch": 0.481128905812521, + "grad_norm": 1.2454614639282227, + "learning_rate": 4.2935e-05, + "loss": 0.4348, + "step": 8592 + }, + { + "epoch": 0.48118490312465, + "grad_norm": 1.504920482635498, + "learning_rate": 4.2940000000000006e-05, + "loss": 0.5542, + "step": 8593 + }, + { + "epoch": 0.48124090043677903, + "grad_norm": 1.6143132448196411, + "learning_rate": 4.2945e-05, + "loss": 0.4956, + "step": 8594 + }, + { + "epoch": 0.48129689774890805, + "grad_norm": 1.2793389558792114, + "learning_rate": 4.295e-05, + "loss": 0.385, + "step": 8595 + }, + { + "epoch": 0.48135289506103707, + "grad_norm": 1.141887903213501, + "learning_rate": 4.2955e-05, + "loss": 0.3376, + "step": 8596 + }, + { + "epoch": 0.4814088923731661, + 
"grad_norm": 1.3110154867172241, + "learning_rate": 4.296e-05, + "loss": 0.4257, + "step": 8597 + }, + { + "epoch": 0.4814648896852951, + "grad_norm": 1.4544636011123657, + "learning_rate": 4.2965e-05, + "loss": 0.4742, + "step": 8598 + }, + { + "epoch": 0.4815208869974241, + "grad_norm": 1.5760170221328735, + "learning_rate": 4.2970000000000004e-05, + "loss": 0.5291, + "step": 8599 + }, + { + "epoch": 0.48157688430955314, + "grad_norm": 1.3123674392700195, + "learning_rate": 4.2975e-05, + "loss": 0.4305, + "step": 8600 + }, + { + "epoch": 0.48163288162168216, + "grad_norm": 1.5963950157165527, + "learning_rate": 4.2980000000000005e-05, + "loss": 0.5198, + "step": 8601 + }, + { + "epoch": 0.4816888789338112, + "grad_norm": 1.2458404302597046, + "learning_rate": 4.2985e-05, + "loss": 0.492, + "step": 8602 + }, + { + "epoch": 0.4817448762459402, + "grad_norm": 1.2439574003219604, + "learning_rate": 4.299e-05, + "loss": 0.5217, + "step": 8603 + }, + { + "epoch": 0.4818008735580692, + "grad_norm": 1.4142569303512573, + "learning_rate": 4.2995000000000004e-05, + "loss": 0.3955, + "step": 8604 + }, + { + "epoch": 0.48185687087019824, + "grad_norm": 1.3641350269317627, + "learning_rate": 4.3e-05, + "loss": 0.5114, + "step": 8605 + }, + { + "epoch": 0.48191286818232726, + "grad_norm": 1.1383672952651978, + "learning_rate": 4.3005e-05, + "loss": 0.411, + "step": 8606 + }, + { + "epoch": 0.4819688654944563, + "grad_norm": 1.6961710453033447, + "learning_rate": 4.301e-05, + "loss": 0.4136, + "step": 8607 + }, + { + "epoch": 0.4820248628065853, + "grad_norm": 1.1815787553787231, + "learning_rate": 4.3015e-05, + "loss": 0.3027, + "step": 8608 + }, + { + "epoch": 0.4820808601187143, + "grad_norm": 1.0456570386886597, + "learning_rate": 4.3020000000000005e-05, + "loss": 0.378, + "step": 8609 + }, + { + "epoch": 0.48213685743084334, + "grad_norm": 1.0018742084503174, + "learning_rate": 4.3025e-05, + "loss": 0.384, + "step": 8610 + }, + { + "epoch": 0.48219285474297235, + 
"grad_norm": 1.1864042282104492, + "learning_rate": 4.3030000000000006e-05, + "loss": 0.528, + "step": 8611 + }, + { + "epoch": 0.4822488520551014, + "grad_norm": 1.4641215801239014, + "learning_rate": 4.3035000000000004e-05, + "loss": 0.5839, + "step": 8612 + }, + { + "epoch": 0.4823048493672304, + "grad_norm": 1.232445478439331, + "learning_rate": 4.304e-05, + "loss": 0.449, + "step": 8613 + }, + { + "epoch": 0.4823608466793594, + "grad_norm": 1.4176139831542969, + "learning_rate": 4.3045e-05, + "loss": 0.4188, + "step": 8614 + }, + { + "epoch": 0.48241684399148843, + "grad_norm": 2.108597993850708, + "learning_rate": 4.305e-05, + "loss": 0.3727, + "step": 8615 + }, + { + "epoch": 0.48247284130361745, + "grad_norm": 1.249324083328247, + "learning_rate": 4.3055e-05, + "loss": 0.4655, + "step": 8616 + }, + { + "epoch": 0.48252883861574647, + "grad_norm": 1.4487379789352417, + "learning_rate": 4.306e-05, + "loss": 0.4904, + "step": 8617 + }, + { + "epoch": 0.4825848359278755, + "grad_norm": 1.620957612991333, + "learning_rate": 4.3065e-05, + "loss": 0.3909, + "step": 8618 + }, + { + "epoch": 0.48264083324000445, + "grad_norm": 1.402370810508728, + "learning_rate": 4.3070000000000006e-05, + "loss": 0.6593, + "step": 8619 + }, + { + "epoch": 0.48269683055213347, + "grad_norm": 1.1624068021774292, + "learning_rate": 4.3075000000000003e-05, + "loss": 0.4366, + "step": 8620 + }, + { + "epoch": 0.4827528278642625, + "grad_norm": 1.351397156715393, + "learning_rate": 4.308e-05, + "loss": 0.5857, + "step": 8621 + }, + { + "epoch": 0.4828088251763915, + "grad_norm": 1.030238151550293, + "learning_rate": 4.3085000000000005e-05, + "loss": 0.3954, + "step": 8622 + }, + { + "epoch": 0.48286482248852053, + "grad_norm": 1.4337857961654663, + "learning_rate": 4.309e-05, + "loss": 0.4261, + "step": 8623 + }, + { + "epoch": 0.48292081980064955, + "grad_norm": 1.1636183261871338, + "learning_rate": 4.3095e-05, + "loss": 0.2829, + "step": 8624 + }, + { + "epoch": 0.48297681711277857, + 
"grad_norm": 1.1158586740493774, + "learning_rate": 4.3100000000000004e-05, + "loss": 0.4353, + "step": 8625 + }, + { + "epoch": 0.4830328144249076, + "grad_norm": 2.748267889022827, + "learning_rate": 4.3105e-05, + "loss": 0.4636, + "step": 8626 + }, + { + "epoch": 0.4830888117370366, + "grad_norm": 1.0934284925460815, + "learning_rate": 4.311e-05, + "loss": 0.3526, + "step": 8627 + }, + { + "epoch": 0.4831448090491656, + "grad_norm": 1.2375462055206299, + "learning_rate": 4.3115e-05, + "loss": 0.444, + "step": 8628 + }, + { + "epoch": 0.48320080636129464, + "grad_norm": 1.3731330633163452, + "learning_rate": 4.312000000000001e-05, + "loss": 0.4732, + "step": 8629 + }, + { + "epoch": 0.48325680367342366, + "grad_norm": 1.1914840936660767, + "learning_rate": 4.3125000000000005e-05, + "loss": 0.3496, + "step": 8630 + }, + { + "epoch": 0.4833128009855527, + "grad_norm": 1.2739779949188232, + "learning_rate": 4.313e-05, + "loss": 0.6161, + "step": 8631 + }, + { + "epoch": 0.4833687982976817, + "grad_norm": 1.5741862058639526, + "learning_rate": 4.3135000000000006e-05, + "loss": 0.6004, + "step": 8632 + }, + { + "epoch": 0.4834247956098107, + "grad_norm": 1.4872034788131714, + "learning_rate": 4.3140000000000004e-05, + "loss": 0.6158, + "step": 8633 + }, + { + "epoch": 0.48348079292193974, + "grad_norm": 2.9754796028137207, + "learning_rate": 4.3145e-05, + "loss": 0.4436, + "step": 8634 + }, + { + "epoch": 0.48353679023406876, + "grad_norm": 1.236561894416809, + "learning_rate": 4.315e-05, + "loss": 0.5231, + "step": 8635 + }, + { + "epoch": 0.4835927875461978, + "grad_norm": 1.2106773853302002, + "learning_rate": 4.3155e-05, + "loss": 0.3766, + "step": 8636 + }, + { + "epoch": 0.4836487848583268, + "grad_norm": 1.407659649848938, + "learning_rate": 4.316e-05, + "loss": 0.5709, + "step": 8637 + }, + { + "epoch": 0.4837047821704558, + "grad_norm": 1.1345996856689453, + "learning_rate": 4.3165e-05, + "loss": 0.363, + "step": 8638 + }, + { + "epoch": 0.48376077948258484, 
+ "grad_norm": 1.2164907455444336, + "learning_rate": 4.317e-05, + "loss": 0.509, + "step": 8639 + }, + { + "epoch": 0.48381677679471385, + "grad_norm": 1.0939404964447021, + "learning_rate": 4.3175000000000006e-05, + "loss": 0.468, + "step": 8640 + }, + { + "epoch": 0.4838727741068429, + "grad_norm": 1.355599045753479, + "learning_rate": 4.318e-05, + "loss": 0.4164, + "step": 8641 + }, + { + "epoch": 0.4839287714189719, + "grad_norm": 1.1362967491149902, + "learning_rate": 4.3185e-05, + "loss": 0.3244, + "step": 8642 + }, + { + "epoch": 0.4839847687311009, + "grad_norm": 1.142841100692749, + "learning_rate": 4.3190000000000005e-05, + "loss": 0.439, + "step": 8643 + }, + { + "epoch": 0.48404076604322993, + "grad_norm": 1.155008316040039, + "learning_rate": 4.3195e-05, + "loss": 0.5317, + "step": 8644 + }, + { + "epoch": 0.48409676335535895, + "grad_norm": 1.1879433393478394, + "learning_rate": 4.32e-05, + "loss": 0.3743, + "step": 8645 + }, + { + "epoch": 0.48415276066748797, + "grad_norm": 1.4973292350769043, + "learning_rate": 4.3205000000000004e-05, + "loss": 0.544, + "step": 8646 + }, + { + "epoch": 0.484208757979617, + "grad_norm": 1.404468059539795, + "learning_rate": 4.321e-05, + "loss": 0.4304, + "step": 8647 + }, + { + "epoch": 0.484264755291746, + "grad_norm": 1.1524723768234253, + "learning_rate": 4.3215e-05, + "loss": 0.3708, + "step": 8648 + }, + { + "epoch": 0.484320752603875, + "grad_norm": 1.2423940896987915, + "learning_rate": 4.3219999999999996e-05, + "loss": 0.509, + "step": 8649 + }, + { + "epoch": 0.48437674991600405, + "grad_norm": 1.4346377849578857, + "learning_rate": 4.322500000000001e-05, + "loss": 0.3965, + "step": 8650 + }, + { + "epoch": 0.48443274722813306, + "grad_norm": 1.4229187965393066, + "learning_rate": 4.3230000000000005e-05, + "loss": 0.4362, + "step": 8651 + }, + { + "epoch": 0.4844887445402621, + "grad_norm": 1.2607970237731934, + "learning_rate": 4.3235e-05, + "loss": 0.4363, + "step": 8652 + }, + { + "epoch": 
0.4845447418523911, + "grad_norm": 1.0456748008728027, + "learning_rate": 4.324e-05, + "loss": 0.379, + "step": 8653 + }, + { + "epoch": 0.4846007391645201, + "grad_norm": 1.325932502746582, + "learning_rate": 4.3245000000000004e-05, + "loss": 0.4285, + "step": 8654 + }, + { + "epoch": 0.48465673647664914, + "grad_norm": 1.3755015134811401, + "learning_rate": 4.325e-05, + "loss": 0.3669, + "step": 8655 + }, + { + "epoch": 0.48471273378877816, + "grad_norm": 1.0719605684280396, + "learning_rate": 4.3255e-05, + "loss": 0.3082, + "step": 8656 + }, + { + "epoch": 0.4847687311009072, + "grad_norm": 1.1531341075897217, + "learning_rate": 4.326e-05, + "loss": 0.4137, + "step": 8657 + }, + { + "epoch": 0.4848247284130362, + "grad_norm": 1.3665847778320312, + "learning_rate": 4.3265e-05, + "loss": 0.4001, + "step": 8658 + }, + { + "epoch": 0.4848807257251652, + "grad_norm": 1.3036551475524902, + "learning_rate": 4.327e-05, + "loss": 0.4852, + "step": 8659 + }, + { + "epoch": 0.48493672303729424, + "grad_norm": 1.1637921333312988, + "learning_rate": 4.3275e-05, + "loss": 0.426, + "step": 8660 + }, + { + "epoch": 0.4849927203494232, + "grad_norm": 1.3472542762756348, + "learning_rate": 4.3280000000000006e-05, + "loss": 0.4741, + "step": 8661 + }, + { + "epoch": 0.4850487176615522, + "grad_norm": 1.2393300533294678, + "learning_rate": 4.3285e-05, + "loss": 0.317, + "step": 8662 + }, + { + "epoch": 0.48510471497368124, + "grad_norm": 1.2900378704071045, + "learning_rate": 4.329e-05, + "loss": 0.4287, + "step": 8663 + }, + { + "epoch": 0.48516071228581026, + "grad_norm": 1.5040082931518555, + "learning_rate": 4.3295000000000005e-05, + "loss": 0.4619, + "step": 8664 + }, + { + "epoch": 0.4852167095979393, + "grad_norm": 1.1562973260879517, + "learning_rate": 4.33e-05, + "loss": 0.4546, + "step": 8665 + }, + { + "epoch": 0.4852727069100683, + "grad_norm": 1.3024035692214966, + "learning_rate": 4.3305e-05, + "loss": 0.3741, + "step": 8666 + }, + { + "epoch": 0.4853287042221973, + 
"grad_norm": 2.070420742034912, + "learning_rate": 4.3310000000000004e-05, + "loss": 0.4455, + "step": 8667 + }, + { + "epoch": 0.48538470153432633, + "grad_norm": 1.1950358152389526, + "learning_rate": 4.3315e-05, + "loss": 0.4381, + "step": 8668 + }, + { + "epoch": 0.48544069884645535, + "grad_norm": 1.0591496229171753, + "learning_rate": 4.332e-05, + "loss": 0.3745, + "step": 8669 + }, + { + "epoch": 0.4854966961585844, + "grad_norm": 1.0986610651016235, + "learning_rate": 4.3325e-05, + "loss": 0.4661, + "step": 8670 + }, + { + "epoch": 0.4855526934707134, + "grad_norm": 1.3725085258483887, + "learning_rate": 4.333000000000001e-05, + "loss": 0.5387, + "step": 8671 + }, + { + "epoch": 0.4856086907828424, + "grad_norm": 1.3331035375595093, + "learning_rate": 4.3335000000000004e-05, + "loss": 0.445, + "step": 8672 + }, + { + "epoch": 0.48566468809497143, + "grad_norm": 1.120967984199524, + "learning_rate": 4.334e-05, + "loss": 0.3554, + "step": 8673 + }, + { + "epoch": 0.48572068540710045, + "grad_norm": 1.117554783821106, + "learning_rate": 4.3345e-05, + "loss": 0.3826, + "step": 8674 + }, + { + "epoch": 0.48577668271922947, + "grad_norm": 1.3135879039764404, + "learning_rate": 4.335e-05, + "loss": 0.4429, + "step": 8675 + }, + { + "epoch": 0.4858326800313585, + "grad_norm": 1.6566495895385742, + "learning_rate": 4.3355e-05, + "loss": 0.3585, + "step": 8676 + }, + { + "epoch": 0.4858886773434875, + "grad_norm": 1.137291431427002, + "learning_rate": 4.336e-05, + "loss": 0.3587, + "step": 8677 + }, + { + "epoch": 0.4859446746556165, + "grad_norm": 7.002615928649902, + "learning_rate": 4.3365e-05, + "loss": 0.3849, + "step": 8678 + }, + { + "epoch": 0.48600067196774555, + "grad_norm": 1.2987697124481201, + "learning_rate": 4.337e-05, + "loss": 0.5691, + "step": 8679 + }, + { + "epoch": 0.48605666927987456, + "grad_norm": 1.201171636581421, + "learning_rate": 4.3375000000000004e-05, + "loss": 0.3965, + "step": 8680 + }, + { + "epoch": 0.4861126665920036, + 
"grad_norm": 1.2575432062149048, + "learning_rate": 4.338e-05, + "loss": 0.4293, + "step": 8681 + }, + { + "epoch": 0.4861686639041326, + "grad_norm": 1.2194340229034424, + "learning_rate": 4.3385000000000006e-05, + "loss": 0.4773, + "step": 8682 + }, + { + "epoch": 0.4862246612162616, + "grad_norm": 1.40916907787323, + "learning_rate": 4.339e-05, + "loss": 0.6009, + "step": 8683 + }, + { + "epoch": 0.48628065852839064, + "grad_norm": 1.3053512573242188, + "learning_rate": 4.3395e-05, + "loss": 0.4454, + "step": 8684 + }, + { + "epoch": 0.48633665584051966, + "grad_norm": 1.3352710008621216, + "learning_rate": 4.3400000000000005e-05, + "loss": 0.422, + "step": 8685 + }, + { + "epoch": 0.4863926531526487, + "grad_norm": 1.613464117050171, + "learning_rate": 4.3405e-05, + "loss": 0.461, + "step": 8686 + }, + { + "epoch": 0.4864486504647777, + "grad_norm": 1.0982309579849243, + "learning_rate": 4.341e-05, + "loss": 0.3205, + "step": 8687 + }, + { + "epoch": 0.4865046477769067, + "grad_norm": 1.5111366510391235, + "learning_rate": 4.3415e-05, + "loss": 0.4938, + "step": 8688 + }, + { + "epoch": 0.48656064508903574, + "grad_norm": 1.0738887786865234, + "learning_rate": 4.342e-05, + "loss": 0.4375, + "step": 8689 + }, + { + "epoch": 0.48661664240116476, + "grad_norm": 1.1899917125701904, + "learning_rate": 4.3425000000000005e-05, + "loss": 0.3337, + "step": 8690 + }, + { + "epoch": 0.4866726397132938, + "grad_norm": 1.386510968208313, + "learning_rate": 4.343e-05, + "loss": 0.4249, + "step": 8691 + }, + { + "epoch": 0.4867286370254228, + "grad_norm": 1.2535392045974731, + "learning_rate": 4.343500000000001e-05, + "loss": 0.4321, + "step": 8692 + }, + { + "epoch": 0.4867846343375518, + "grad_norm": 1.122766137123108, + "learning_rate": 4.3440000000000004e-05, + "loss": 0.4399, + "step": 8693 + }, + { + "epoch": 0.48684063164968083, + "grad_norm": 1.240119218826294, + "learning_rate": 4.3445e-05, + "loss": 0.533, + "step": 8694 + }, + { + "epoch": 0.48689662896180985, + 
"grad_norm": 1.1166800260543823, + "learning_rate": 4.345e-05, + "loss": 0.4065, + "step": 8695 + }, + { + "epoch": 0.48695262627393887, + "grad_norm": 1.2940329313278198, + "learning_rate": 4.3455e-05, + "loss": 0.4668, + "step": 8696 + }, + { + "epoch": 0.4870086235860679, + "grad_norm": 1.1920511722564697, + "learning_rate": 4.346e-05, + "loss": 0.4026, + "step": 8697 + }, + { + "epoch": 0.4870646208981969, + "grad_norm": 1.1582311391830444, + "learning_rate": 4.3465e-05, + "loss": 0.5435, + "step": 8698 + }, + { + "epoch": 0.48712061821032593, + "grad_norm": 1.4122322797775269, + "learning_rate": 4.347e-05, + "loss": 0.5706, + "step": 8699 + }, + { + "epoch": 0.48717661552245495, + "grad_norm": 1.1645272970199585, + "learning_rate": 4.3475000000000006e-05, + "loss": 0.4621, + "step": 8700 + }, + { + "epoch": 0.48723261283458397, + "grad_norm": 1.3535151481628418, + "learning_rate": 4.3480000000000004e-05, + "loss": 0.39, + "step": 8701 + }, + { + "epoch": 0.48728861014671293, + "grad_norm": 1.1694250106811523, + "learning_rate": 4.3485e-05, + "loss": 0.4385, + "step": 8702 + }, + { + "epoch": 0.48734460745884195, + "grad_norm": 1.1841671466827393, + "learning_rate": 4.3490000000000005e-05, + "loss": 0.5778, + "step": 8703 + }, + { + "epoch": 0.48740060477097097, + "grad_norm": 1.0413399934768677, + "learning_rate": 4.3495e-05, + "loss": 0.4254, + "step": 8704 + }, + { + "epoch": 0.4874566020831, + "grad_norm": 1.1953142881393433, + "learning_rate": 4.35e-05, + "loss": 0.4206, + "step": 8705 + }, + { + "epoch": 0.487512599395229, + "grad_norm": 1.3137667179107666, + "learning_rate": 4.3505000000000004e-05, + "loss": 0.4972, + "step": 8706 + }, + { + "epoch": 0.487568596707358, + "grad_norm": 1.1413288116455078, + "learning_rate": 4.351e-05, + "loss": 0.3851, + "step": 8707 + }, + { + "epoch": 0.48762459401948705, + "grad_norm": 1.2208783626556396, + "learning_rate": 4.3515e-05, + "loss": 0.4572, + "step": 8708 + }, + { + "epoch": 0.48768059133161606, + 
"grad_norm": 1.3009867668151855, + "learning_rate": 4.352e-05, + "loss": 0.441, + "step": 8709 + }, + { + "epoch": 0.4877365886437451, + "grad_norm": 1.4197602272033691, + "learning_rate": 4.352500000000001e-05, + "loss": 0.4225, + "step": 8710 + }, + { + "epoch": 0.4877925859558741, + "grad_norm": 1.5292143821716309, + "learning_rate": 4.3530000000000005e-05, + "loss": 0.4581, + "step": 8711 + }, + { + "epoch": 0.4878485832680031, + "grad_norm": 1.1709518432617188, + "learning_rate": 4.3535e-05, + "loss": 0.308, + "step": 8712 + }, + { + "epoch": 0.48790458058013214, + "grad_norm": 1.5590686798095703, + "learning_rate": 4.354e-05, + "loss": 0.351, + "step": 8713 + }, + { + "epoch": 0.48796057789226116, + "grad_norm": 1.4348865747451782, + "learning_rate": 4.3545000000000004e-05, + "loss": 0.5042, + "step": 8714 + }, + { + "epoch": 0.4880165752043902, + "grad_norm": 1.520550012588501, + "learning_rate": 4.355e-05, + "loss": 0.4064, + "step": 8715 + }, + { + "epoch": 0.4880725725165192, + "grad_norm": 1.2306742668151855, + "learning_rate": 4.3555e-05, + "loss": 0.4183, + "step": 8716 + }, + { + "epoch": 0.4881285698286482, + "grad_norm": 1.1449452638626099, + "learning_rate": 4.356e-05, + "loss": 0.4367, + "step": 8717 + }, + { + "epoch": 0.48818456714077724, + "grad_norm": 1.4547386169433594, + "learning_rate": 4.3565e-05, + "loss": 0.3432, + "step": 8718 + }, + { + "epoch": 0.48824056445290626, + "grad_norm": 1.3006969690322876, + "learning_rate": 4.357e-05, + "loss": 0.442, + "step": 8719 + }, + { + "epoch": 0.4882965617650353, + "grad_norm": 1.424604058265686, + "learning_rate": 4.3575e-05, + "loss": 0.6239, + "step": 8720 + }, + { + "epoch": 0.4883525590771643, + "grad_norm": 1.3461222648620605, + "learning_rate": 4.3580000000000006e-05, + "loss": 0.3196, + "step": 8721 + }, + { + "epoch": 0.4884085563892933, + "grad_norm": 1.2990061044692993, + "learning_rate": 4.3585000000000004e-05, + "loss": 0.4009, + "step": 8722 + }, + { + "epoch": 0.48846455370142233, + 
"grad_norm": 1.274626612663269, + "learning_rate": 4.359e-05, + "loss": 0.4871, + "step": 8723 + }, + { + "epoch": 0.48852055101355135, + "grad_norm": 1.3779020309448242, + "learning_rate": 4.3595000000000005e-05, + "loss": 0.4717, + "step": 8724 + }, + { + "epoch": 0.48857654832568037, + "grad_norm": 1.209932804107666, + "learning_rate": 4.36e-05, + "loss": 0.4969, + "step": 8725 + }, + { + "epoch": 0.4886325456378094, + "grad_norm": 1.4006141424179077, + "learning_rate": 4.3605e-05, + "loss": 0.5306, + "step": 8726 + }, + { + "epoch": 0.4886885429499384, + "grad_norm": 1.6883563995361328, + "learning_rate": 4.361e-05, + "loss": 0.4729, + "step": 8727 + }, + { + "epoch": 0.48874454026206743, + "grad_norm": 1.2552179098129272, + "learning_rate": 4.3615e-05, + "loss": 0.5425, + "step": 8728 + }, + { + "epoch": 0.48880053757419645, + "grad_norm": 1.6334960460662842, + "learning_rate": 4.362e-05, + "loss": 0.3721, + "step": 8729 + }, + { + "epoch": 0.48885653488632547, + "grad_norm": 1.4374152421951294, + "learning_rate": 4.3625e-05, + "loss": 0.3604, + "step": 8730 + }, + { + "epoch": 0.4889125321984545, + "grad_norm": 1.3681845664978027, + "learning_rate": 4.363000000000001e-05, + "loss": 0.4152, + "step": 8731 + }, + { + "epoch": 0.4889685295105835, + "grad_norm": 1.2419909238815308, + "learning_rate": 4.3635000000000005e-05, + "loss": 0.4418, + "step": 8732 + }, + { + "epoch": 0.4890245268227125, + "grad_norm": 1.1012789011001587, + "learning_rate": 4.364e-05, + "loss": 0.3571, + "step": 8733 + }, + { + "epoch": 0.48908052413484154, + "grad_norm": 1.400465965270996, + "learning_rate": 4.3645e-05, + "loss": 0.4592, + "step": 8734 + }, + { + "epoch": 0.48913652144697056, + "grad_norm": 1.4451253414154053, + "learning_rate": 4.3650000000000004e-05, + "loss": 0.5828, + "step": 8735 + }, + { + "epoch": 0.4891925187590996, + "grad_norm": 1.1183686256408691, + "learning_rate": 4.3655e-05, + "loss": 0.3976, + "step": 8736 + }, + { + "epoch": 0.4892485160712286, + 
"grad_norm": 1.1781045198440552, + "learning_rate": 4.366e-05, + "loss": 0.3226, + "step": 8737 + }, + { + "epoch": 0.4893045133833576, + "grad_norm": 1.074252963066101, + "learning_rate": 4.3665e-05, + "loss": 0.428, + "step": 8738 + }, + { + "epoch": 0.48936051069548664, + "grad_norm": 1.3820005655288696, + "learning_rate": 4.367e-05, + "loss": 0.4251, + "step": 8739 + }, + { + "epoch": 0.48941650800761566, + "grad_norm": 1.3468071222305298, + "learning_rate": 4.3675000000000005e-05, + "loss": 0.4603, + "step": 8740 + }, + { + "epoch": 0.4894725053197447, + "grad_norm": 1.370620846748352, + "learning_rate": 4.368e-05, + "loss": 0.4452, + "step": 8741 + }, + { + "epoch": 0.4895285026318737, + "grad_norm": 1.138685703277588, + "learning_rate": 4.3685000000000006e-05, + "loss": 0.419, + "step": 8742 + }, + { + "epoch": 0.48958449994400266, + "grad_norm": 1.067796230316162, + "learning_rate": 4.3690000000000004e-05, + "loss": 0.4796, + "step": 8743 + }, + { + "epoch": 0.4896404972561317, + "grad_norm": 1.22769033908844, + "learning_rate": 4.3695e-05, + "loss": 0.3606, + "step": 8744 + }, + { + "epoch": 0.4896964945682607, + "grad_norm": 1.3409802913665771, + "learning_rate": 4.3700000000000005e-05, + "loss": 0.3688, + "step": 8745 + }, + { + "epoch": 0.4897524918803897, + "grad_norm": 1.0355149507522583, + "learning_rate": 4.3705e-05, + "loss": 0.4116, + "step": 8746 + }, + { + "epoch": 0.48980848919251874, + "grad_norm": 1.2071590423583984, + "learning_rate": 4.371e-05, + "loss": 0.3694, + "step": 8747 + }, + { + "epoch": 0.48986448650464776, + "grad_norm": 1.3578791618347168, + "learning_rate": 4.3715e-05, + "loss": 0.5592, + "step": 8748 + }, + { + "epoch": 0.4899204838167768, + "grad_norm": 1.3043264150619507, + "learning_rate": 4.372e-05, + "loss": 0.4845, + "step": 8749 + }, + { + "epoch": 0.4899764811289058, + "grad_norm": 1.3237848281860352, + "learning_rate": 4.3725000000000006e-05, + "loss": 0.4548, + "step": 8750 + }, + { + "epoch": 0.4900324784410348, + 
"grad_norm": 1.137434959411621, + "learning_rate": 4.373e-05, + "loss": 0.4235, + "step": 8751 + }, + { + "epoch": 0.49008847575316383, + "grad_norm": 1.2952182292938232, + "learning_rate": 4.3735e-05, + "loss": 0.4302, + "step": 8752 + }, + { + "epoch": 0.49014447306529285, + "grad_norm": 1.2217761278152466, + "learning_rate": 4.3740000000000005e-05, + "loss": 0.4094, + "step": 8753 + }, + { + "epoch": 0.49020047037742187, + "grad_norm": 1.3580907583236694, + "learning_rate": 4.3745e-05, + "loss": 0.4641, + "step": 8754 + }, + { + "epoch": 0.4902564676895509, + "grad_norm": 1.3834844827651978, + "learning_rate": 4.375e-05, + "loss": 0.5332, + "step": 8755 + }, + { + "epoch": 0.4903124650016799, + "grad_norm": 1.376662254333496, + "learning_rate": 4.3755000000000004e-05, + "loss": 0.4227, + "step": 8756 + }, + { + "epoch": 0.49036846231380893, + "grad_norm": 1.4165754318237305, + "learning_rate": 4.376e-05, + "loss": 0.5348, + "step": 8757 + }, + { + "epoch": 0.49042445962593795, + "grad_norm": 0.9695160388946533, + "learning_rate": 4.3765e-05, + "loss": 0.294, + "step": 8758 + }, + { + "epoch": 0.49048045693806697, + "grad_norm": 1.2843620777130127, + "learning_rate": 4.377e-05, + "loss": 0.5706, + "step": 8759 + }, + { + "epoch": 0.490536454250196, + "grad_norm": 1.5324783325195312, + "learning_rate": 4.3775e-05, + "loss": 0.4855, + "step": 8760 + }, + { + "epoch": 0.490592451562325, + "grad_norm": 1.4297372102737427, + "learning_rate": 4.3780000000000004e-05, + "loss": 0.458, + "step": 8761 + }, + { + "epoch": 0.490648448874454, + "grad_norm": 1.6255004405975342, + "learning_rate": 4.3785e-05, + "loss": 0.4558, + "step": 8762 + }, + { + "epoch": 0.49070444618658304, + "grad_norm": 1.2813847064971924, + "learning_rate": 4.3790000000000006e-05, + "loss": 0.3361, + "step": 8763 + }, + { + "epoch": 0.49076044349871206, + "grad_norm": 1.2937064170837402, + "learning_rate": 4.3795e-05, + "loss": 0.3665, + "step": 8764 + }, + { + "epoch": 0.4908164408108411, + 
"grad_norm": 1.2819708585739136, + "learning_rate": 4.38e-05, + "loss": 0.3996, + "step": 8765 + }, + { + "epoch": 0.4908724381229701, + "grad_norm": 1.1545991897583008, + "learning_rate": 4.3805000000000005e-05, + "loss": 0.3478, + "step": 8766 + }, + { + "epoch": 0.4909284354350991, + "grad_norm": 1.3991823196411133, + "learning_rate": 4.381e-05, + "loss": 0.4191, + "step": 8767 + }, + { + "epoch": 0.49098443274722814, + "grad_norm": 1.263278603553772, + "learning_rate": 4.3815e-05, + "loss": 0.384, + "step": 8768 + }, + { + "epoch": 0.49104043005935716, + "grad_norm": 1.6084866523742676, + "learning_rate": 4.382e-05, + "loss": 0.353, + "step": 8769 + }, + { + "epoch": 0.4910964273714862, + "grad_norm": 1.383324146270752, + "learning_rate": 4.3825e-05, + "loss": 0.4827, + "step": 8770 + }, + { + "epoch": 0.4911524246836152, + "grad_norm": 1.567399501800537, + "learning_rate": 4.3830000000000006e-05, + "loss": 0.4992, + "step": 8771 + }, + { + "epoch": 0.4912084219957442, + "grad_norm": 1.2193971872329712, + "learning_rate": 4.3835e-05, + "loss": 0.382, + "step": 8772 + }, + { + "epoch": 0.49126441930787323, + "grad_norm": 1.2334585189819336, + "learning_rate": 4.384e-05, + "loss": 0.3591, + "step": 8773 + }, + { + "epoch": 0.49132041662000225, + "grad_norm": 1.1027566194534302, + "learning_rate": 4.3845000000000005e-05, + "loss": 0.3656, + "step": 8774 + }, + { + "epoch": 0.4913764139321313, + "grad_norm": 1.3041020631790161, + "learning_rate": 4.385e-05, + "loss": 0.4085, + "step": 8775 + }, + { + "epoch": 0.4914324112442603, + "grad_norm": 1.1698496341705322, + "learning_rate": 4.3855e-05, + "loss": 0.3652, + "step": 8776 + }, + { + "epoch": 0.4914884085563893, + "grad_norm": 1.3277337551116943, + "learning_rate": 4.3860000000000004e-05, + "loss": 0.4438, + "step": 8777 + }, + { + "epoch": 0.49154440586851833, + "grad_norm": 1.3195639848709106, + "learning_rate": 4.3865e-05, + "loss": 0.4391, + "step": 8778 + }, + { + "epoch": 0.49160040318064735, + 
"grad_norm": 1.4638186693191528, + "learning_rate": 4.387e-05, + "loss": 0.449, + "step": 8779 + }, + { + "epoch": 0.49165640049277637, + "grad_norm": 1.458646535873413, + "learning_rate": 4.3875e-05, + "loss": 0.3749, + "step": 8780 + }, + { + "epoch": 0.4917123978049054, + "grad_norm": 1.220596194267273, + "learning_rate": 4.388000000000001e-05, + "loss": 0.4252, + "step": 8781 + }, + { + "epoch": 0.4917683951170344, + "grad_norm": 1.456165075302124, + "learning_rate": 4.3885000000000004e-05, + "loss": 0.4328, + "step": 8782 + }, + { + "epoch": 0.4918243924291634, + "grad_norm": 2.3613314628601074, + "learning_rate": 4.389e-05, + "loss": 0.6209, + "step": 8783 + }, + { + "epoch": 0.49188038974129245, + "grad_norm": 1.0482722520828247, + "learning_rate": 4.3895000000000006e-05, + "loss": 0.4775, + "step": 8784 + }, + { + "epoch": 0.4919363870534214, + "grad_norm": 1.2302495241165161, + "learning_rate": 4.39e-05, + "loss": 0.4245, + "step": 8785 + }, + { + "epoch": 0.49199238436555043, + "grad_norm": 1.2513916492462158, + "learning_rate": 4.3905e-05, + "loss": 0.4248, + "step": 8786 + }, + { + "epoch": 0.49204838167767945, + "grad_norm": 1.495124340057373, + "learning_rate": 4.391e-05, + "loss": 0.481, + "step": 8787 + }, + { + "epoch": 0.49210437898980847, + "grad_norm": 1.1340218782424927, + "learning_rate": 4.3915e-05, + "loss": 0.4743, + "step": 8788 + }, + { + "epoch": 0.4921603763019375, + "grad_norm": 1.4609558582305908, + "learning_rate": 4.392e-05, + "loss": 0.4757, + "step": 8789 + }, + { + "epoch": 0.4922163736140665, + "grad_norm": 1.3570151329040527, + "learning_rate": 4.3925e-05, + "loss": 0.485, + "step": 8790 + }, + { + "epoch": 0.4922723709261955, + "grad_norm": 1.118835687637329, + "learning_rate": 4.393e-05, + "loss": 0.376, + "step": 8791 + }, + { + "epoch": 0.49232836823832454, + "grad_norm": 1.5721051692962646, + "learning_rate": 4.3935000000000005e-05, + "loss": 0.5853, + "step": 8792 + }, + { + "epoch": 0.49238436555045356, + "grad_norm": 
1.169965147972107, + "learning_rate": 4.394e-05, + "loss": 0.4243, + "step": 8793 + }, + { + "epoch": 0.4924403628625826, + "grad_norm": 1.2264182567596436, + "learning_rate": 4.3945e-05, + "loss": 0.4102, + "step": 8794 + }, + { + "epoch": 0.4924963601747116, + "grad_norm": 0.9702182412147522, + "learning_rate": 4.3950000000000004e-05, + "loss": 0.3123, + "step": 8795 + }, + { + "epoch": 0.4925523574868406, + "grad_norm": 1.12941575050354, + "learning_rate": 4.3955e-05, + "loss": 0.3943, + "step": 8796 + }, + { + "epoch": 0.49260835479896964, + "grad_norm": 1.5503352880477905, + "learning_rate": 4.396e-05, + "loss": 0.4887, + "step": 8797 + }, + { + "epoch": 0.49266435211109866, + "grad_norm": 1.2890199422836304, + "learning_rate": 4.3965000000000003e-05, + "loss": 0.42, + "step": 8798 + }, + { + "epoch": 0.4927203494232277, + "grad_norm": 1.1840254068374634, + "learning_rate": 4.397e-05, + "loss": 0.3709, + "step": 8799 + }, + { + "epoch": 0.4927763467353567, + "grad_norm": 1.4131531715393066, + "learning_rate": 4.3975e-05, + "loss": 0.6136, + "step": 8800 + }, + { + "epoch": 0.4928323440474857, + "grad_norm": 1.085906744003296, + "learning_rate": 4.398e-05, + "loss": 0.3824, + "step": 8801 + }, + { + "epoch": 0.49288834135961473, + "grad_norm": 1.2747128009796143, + "learning_rate": 4.398500000000001e-05, + "loss": 0.3949, + "step": 8802 + }, + { + "epoch": 0.49294433867174375, + "grad_norm": 1.0494940280914307, + "learning_rate": 4.3990000000000004e-05, + "loss": 0.3932, + "step": 8803 + }, + { + "epoch": 0.4930003359838728, + "grad_norm": 1.1495386362075806, + "learning_rate": 4.3995e-05, + "loss": 0.3146, + "step": 8804 + }, + { + "epoch": 0.4930563332960018, + "grad_norm": 1.4327843189239502, + "learning_rate": 4.4000000000000006e-05, + "loss": 0.4102, + "step": 8805 + }, + { + "epoch": 0.4931123306081308, + "grad_norm": 1.2018320560455322, + "learning_rate": 4.4005e-05, + "loss": 0.3846, + "step": 8806 + }, + { + "epoch": 0.49316832792025983, + "grad_norm": 
1.2860138416290283, + "learning_rate": 4.401e-05, + "loss": 0.4058, + "step": 8807 + }, + { + "epoch": 0.49322432523238885, + "grad_norm": 1.254707932472229, + "learning_rate": 4.4015e-05, + "loss": 0.3821, + "step": 8808 + }, + { + "epoch": 0.49328032254451787, + "grad_norm": 1.2617658376693726, + "learning_rate": 4.402e-05, + "loss": 0.4053, + "step": 8809 + }, + { + "epoch": 0.4933363198566469, + "grad_norm": 1.1106071472167969, + "learning_rate": 4.4025e-05, + "loss": 0.319, + "step": 8810 + }, + { + "epoch": 0.4933923171687759, + "grad_norm": 1.0793672800064087, + "learning_rate": 4.4030000000000004e-05, + "loss": 0.3452, + "step": 8811 + }, + { + "epoch": 0.4934483144809049, + "grad_norm": 1.3354262113571167, + "learning_rate": 4.4035e-05, + "loss": 0.4187, + "step": 8812 + }, + { + "epoch": 0.49350431179303395, + "grad_norm": 1.1656129360198975, + "learning_rate": 4.4040000000000005e-05, + "loss": 0.4096, + "step": 8813 + }, + { + "epoch": 0.49356030910516296, + "grad_norm": 1.3050516843795776, + "learning_rate": 4.4045e-05, + "loss": 0.402, + "step": 8814 + }, + { + "epoch": 0.493616306417292, + "grad_norm": 1.3620717525482178, + "learning_rate": 4.405e-05, + "loss": 0.4883, + "step": 8815 + }, + { + "epoch": 0.493672303729421, + "grad_norm": 1.1378008127212524, + "learning_rate": 4.4055000000000004e-05, + "loss": 0.3795, + "step": 8816 + }, + { + "epoch": 0.49372830104155, + "grad_norm": 1.1346876621246338, + "learning_rate": 4.406e-05, + "loss": 0.3449, + "step": 8817 + }, + { + "epoch": 0.49378429835367904, + "grad_norm": 1.2142455577850342, + "learning_rate": 4.4065e-05, + "loss": 0.4761, + "step": 8818 + }, + { + "epoch": 0.49384029566580806, + "grad_norm": 1.2473500967025757, + "learning_rate": 4.407e-05, + "loss": 0.5042, + "step": 8819 + }, + { + "epoch": 0.4938962929779371, + "grad_norm": 1.159977912902832, + "learning_rate": 4.4075e-05, + "loss": 0.3856, + "step": 8820 + }, + { + "epoch": 0.4939522902900661, + "grad_norm": 1.3137444257736206, + 
"learning_rate": 4.4080000000000005e-05, + "loss": 0.4382, + "step": 8821 + }, + { + "epoch": 0.4940082876021951, + "grad_norm": 1.673619270324707, + "learning_rate": 4.4085e-05, + "loss": 0.5157, + "step": 8822 + }, + { + "epoch": 0.49406428491432414, + "grad_norm": 1.3758738040924072, + "learning_rate": 4.4090000000000006e-05, + "loss": 0.5904, + "step": 8823 + }, + { + "epoch": 0.49412028222645316, + "grad_norm": 1.015759825706482, + "learning_rate": 4.4095000000000004e-05, + "loss": 0.331, + "step": 8824 + }, + { + "epoch": 0.4941762795385822, + "grad_norm": 1.161116600036621, + "learning_rate": 4.41e-05, + "loss": 0.2972, + "step": 8825 + }, + { + "epoch": 0.49423227685071114, + "grad_norm": 1.3757725954055786, + "learning_rate": 4.4105e-05, + "loss": 0.3803, + "step": 8826 + }, + { + "epoch": 0.49428827416284016, + "grad_norm": 1.5430757999420166, + "learning_rate": 4.411e-05, + "loss": 0.6873, + "step": 8827 + }, + { + "epoch": 0.4943442714749692, + "grad_norm": 1.497576117515564, + "learning_rate": 4.4115e-05, + "loss": 0.5016, + "step": 8828 + }, + { + "epoch": 0.4944002687870982, + "grad_norm": 1.2487151622772217, + "learning_rate": 4.412e-05, + "loss": 0.4236, + "step": 8829 + }, + { + "epoch": 0.4944562660992272, + "grad_norm": 1.313583254814148, + "learning_rate": 4.4125e-05, + "loss": 0.6077, + "step": 8830 + }, + { + "epoch": 0.49451226341135623, + "grad_norm": 1.2831478118896484, + "learning_rate": 4.4130000000000006e-05, + "loss": 0.4209, + "step": 8831 + }, + { + "epoch": 0.49456826072348525, + "grad_norm": 1.4325164556503296, + "learning_rate": 4.4135000000000003e-05, + "loss": 0.44, + "step": 8832 + }, + { + "epoch": 0.4946242580356143, + "grad_norm": 1.1198872327804565, + "learning_rate": 4.414e-05, + "loss": 0.451, + "step": 8833 + }, + { + "epoch": 0.4946802553477433, + "grad_norm": 1.1292073726654053, + "learning_rate": 4.4145000000000005e-05, + "loss": 0.4119, + "step": 8834 + }, + { + "epoch": 0.4947362526598723, + "grad_norm": 
1.7890433073043823, + "learning_rate": 4.415e-05, + "loss": 0.3996, + "step": 8835 + }, + { + "epoch": 0.49479224997200133, + "grad_norm": 1.0275719165802002, + "learning_rate": 4.4155e-05, + "loss": 0.3757, + "step": 8836 + }, + { + "epoch": 0.49484824728413035, + "grad_norm": 1.1006819009780884, + "learning_rate": 4.4160000000000004e-05, + "loss": 0.4615, + "step": 8837 + }, + { + "epoch": 0.49490424459625937, + "grad_norm": 1.203513503074646, + "learning_rate": 4.4165e-05, + "loss": 0.3168, + "step": 8838 + }, + { + "epoch": 0.4949602419083884, + "grad_norm": 1.2412500381469727, + "learning_rate": 4.417e-05, + "loss": 0.4583, + "step": 8839 + }, + { + "epoch": 0.4950162392205174, + "grad_norm": 1.6406822204589844, + "learning_rate": 4.4174999999999996e-05, + "loss": 0.5096, + "step": 8840 + }, + { + "epoch": 0.4950722365326464, + "grad_norm": 1.2065621614456177, + "learning_rate": 4.418000000000001e-05, + "loss": 0.4388, + "step": 8841 + }, + { + "epoch": 0.49512823384477544, + "grad_norm": 1.75081205368042, + "learning_rate": 4.4185000000000005e-05, + "loss": 0.528, + "step": 8842 + }, + { + "epoch": 0.49518423115690446, + "grad_norm": 1.1402934789657593, + "learning_rate": 4.419e-05, + "loss": 0.389, + "step": 8843 + }, + { + "epoch": 0.4952402284690335, + "grad_norm": 1.4472622871398926, + "learning_rate": 4.4195000000000006e-05, + "loss": 0.526, + "step": 8844 + }, + { + "epoch": 0.4952962257811625, + "grad_norm": 1.3543546199798584, + "learning_rate": 4.4200000000000004e-05, + "loss": 0.4009, + "step": 8845 + }, + { + "epoch": 0.4953522230932915, + "grad_norm": 1.2927943468093872, + "learning_rate": 4.4205e-05, + "loss": 0.4673, + "step": 8846 + }, + { + "epoch": 0.49540822040542054, + "grad_norm": 1.1858797073364258, + "learning_rate": 4.421e-05, + "loss": 0.4821, + "step": 8847 + }, + { + "epoch": 0.49546421771754956, + "grad_norm": 1.1288942098617554, + "learning_rate": 4.4215e-05, + "loss": 0.3303, + "step": 8848 + }, + { + "epoch": 0.4955202150296786, 
+ "grad_norm": 1.1077618598937988, + "learning_rate": 4.422e-05, + "loss": 0.3461, + "step": 8849 + }, + { + "epoch": 0.4955762123418076, + "grad_norm": 0.9944883584976196, + "learning_rate": 4.4225e-05, + "loss": 0.3039, + "step": 8850 + }, + { + "epoch": 0.4956322096539366, + "grad_norm": 1.2238086462020874, + "learning_rate": 4.423e-05, + "loss": 0.4426, + "step": 8851 + }, + { + "epoch": 0.49568820696606564, + "grad_norm": 1.333704948425293, + "learning_rate": 4.4235000000000006e-05, + "loss": 0.5154, + "step": 8852 + }, + { + "epoch": 0.49574420427819466, + "grad_norm": 1.2379481792449951, + "learning_rate": 4.424e-05, + "loss": 0.4862, + "step": 8853 + }, + { + "epoch": 0.4958002015903237, + "grad_norm": 1.1883821487426758, + "learning_rate": 4.4245e-05, + "loss": 0.3883, + "step": 8854 + }, + { + "epoch": 0.4958561989024527, + "grad_norm": 1.4488531351089478, + "learning_rate": 4.4250000000000005e-05, + "loss": 0.4037, + "step": 8855 + }, + { + "epoch": 0.4959121962145817, + "grad_norm": 1.6411842107772827, + "learning_rate": 4.4255e-05, + "loss": 0.484, + "step": 8856 + }, + { + "epoch": 0.49596819352671073, + "grad_norm": 1.5303682088851929, + "learning_rate": 4.426e-05, + "loss": 0.5324, + "step": 8857 + }, + { + "epoch": 0.49602419083883975, + "grad_norm": 1.2763642072677612, + "learning_rate": 4.4265000000000004e-05, + "loss": 0.4157, + "step": 8858 + }, + { + "epoch": 0.49608018815096877, + "grad_norm": 1.1704176664352417, + "learning_rate": 4.427e-05, + "loss": 0.4128, + "step": 8859 + }, + { + "epoch": 0.4961361854630978, + "grad_norm": 1.387826681137085, + "learning_rate": 4.4275e-05, + "loss": 0.5283, + "step": 8860 + }, + { + "epoch": 0.4961921827752268, + "grad_norm": 1.2293310165405273, + "learning_rate": 4.428e-05, + "loss": 0.548, + "step": 8861 + }, + { + "epoch": 0.49624818008735583, + "grad_norm": 1.2959572076797485, + "learning_rate": 4.428500000000001e-05, + "loss": 0.4113, + "step": 8862 + }, + { + "epoch": 0.49630417739948485, + 
"grad_norm": 1.3615657091140747, + "learning_rate": 4.4290000000000005e-05, + "loss": 0.4515, + "step": 8863 + }, + { + "epoch": 0.49636017471161387, + "grad_norm": 1.145766019821167, + "learning_rate": 4.4295e-05, + "loss": 0.383, + "step": 8864 + }, + { + "epoch": 0.4964161720237429, + "grad_norm": 1.296778678894043, + "learning_rate": 4.43e-05, + "loss": 0.4408, + "step": 8865 + }, + { + "epoch": 0.4964721693358719, + "grad_norm": 1.1632485389709473, + "learning_rate": 4.4305000000000004e-05, + "loss": 0.4587, + "step": 8866 + }, + { + "epoch": 0.49652816664800087, + "grad_norm": 1.2473266124725342, + "learning_rate": 4.431e-05, + "loss": 0.4444, + "step": 8867 + }, + { + "epoch": 0.4965841639601299, + "grad_norm": 1.2199045419692993, + "learning_rate": 4.4315e-05, + "loss": 0.3768, + "step": 8868 + }, + { + "epoch": 0.4966401612722589, + "grad_norm": 1.3166041374206543, + "learning_rate": 4.432e-05, + "loss": 0.4984, + "step": 8869 + }, + { + "epoch": 0.4966961585843879, + "grad_norm": 1.2682785987854004, + "learning_rate": 4.4325e-05, + "loss": 0.461, + "step": 8870 + }, + { + "epoch": 0.49675215589651694, + "grad_norm": 1.5442527532577515, + "learning_rate": 4.4330000000000004e-05, + "loss": 0.44, + "step": 8871 + }, + { + "epoch": 0.49680815320864596, + "grad_norm": 1.106680154800415, + "learning_rate": 4.4335e-05, + "loss": 0.3382, + "step": 8872 + }, + { + "epoch": 0.496864150520775, + "grad_norm": 1.2025519609451294, + "learning_rate": 4.4340000000000006e-05, + "loss": 0.4117, + "step": 8873 + }, + { + "epoch": 0.496920147832904, + "grad_norm": 1.3306968212127686, + "learning_rate": 4.4345e-05, + "loss": 0.4045, + "step": 8874 + }, + { + "epoch": 0.496976145145033, + "grad_norm": 1.4084997177124023, + "learning_rate": 4.435e-05, + "loss": 0.7849, + "step": 8875 + }, + { + "epoch": 0.49703214245716204, + "grad_norm": 1.4674460887908936, + "learning_rate": 4.4355000000000005e-05, + "loss": 0.4156, + "step": 8876 + }, + { + "epoch": 0.49708813976929106, + 
"grad_norm": 1.2347136735916138, + "learning_rate": 4.436e-05, + "loss": 0.3831, + "step": 8877 + }, + { + "epoch": 0.4971441370814201, + "grad_norm": 1.287600040435791, + "learning_rate": 4.4365e-05, + "loss": 0.3978, + "step": 8878 + }, + { + "epoch": 0.4972001343935491, + "grad_norm": 1.413364052772522, + "learning_rate": 4.4370000000000004e-05, + "loss": 0.3869, + "step": 8879 + }, + { + "epoch": 0.4972561317056781, + "grad_norm": 1.6367324590682983, + "learning_rate": 4.4375e-05, + "loss": 0.4517, + "step": 8880 + }, + { + "epoch": 0.49731212901780714, + "grad_norm": 1.4247015714645386, + "learning_rate": 4.438e-05, + "loss": 0.4257, + "step": 8881 + }, + { + "epoch": 0.49736812632993616, + "grad_norm": 1.250550627708435, + "learning_rate": 4.4385e-05, + "loss": 0.5311, + "step": 8882 + }, + { + "epoch": 0.4974241236420652, + "grad_norm": 1.1516586542129517, + "learning_rate": 4.439000000000001e-05, + "loss": 0.469, + "step": 8883 + }, + { + "epoch": 0.4974801209541942, + "grad_norm": 1.2464271783828735, + "learning_rate": 4.4395000000000004e-05, + "loss": 0.5226, + "step": 8884 + }, + { + "epoch": 0.4975361182663232, + "grad_norm": 1.1471192836761475, + "learning_rate": 4.44e-05, + "loss": 0.4076, + "step": 8885 + }, + { + "epoch": 0.49759211557845223, + "grad_norm": 1.119562029838562, + "learning_rate": 4.4405e-05, + "loss": 0.4719, + "step": 8886 + }, + { + "epoch": 0.49764811289058125, + "grad_norm": 1.217142105102539, + "learning_rate": 4.4410000000000003e-05, + "loss": 0.3851, + "step": 8887 + }, + { + "epoch": 0.49770411020271027, + "grad_norm": 1.3142110109329224, + "learning_rate": 4.4415e-05, + "loss": 0.5367, + "step": 8888 + }, + { + "epoch": 0.4977601075148393, + "grad_norm": 1.2821145057678223, + "learning_rate": 4.442e-05, + "loss": 0.3896, + "step": 8889 + }, + { + "epoch": 0.4978161048269683, + "grad_norm": 1.1195389032363892, + "learning_rate": 4.4425e-05, + "loss": 0.3382, + "step": 8890 + }, + { + "epoch": 0.49787210213909733, + 
"grad_norm": 1.240980863571167, + "learning_rate": 4.443e-05, + "loss": 0.5295, + "step": 8891 + }, + { + "epoch": 0.49792809945122635, + "grad_norm": 1.1518019437789917, + "learning_rate": 4.4435000000000004e-05, + "loss": 0.3763, + "step": 8892 + }, + { + "epoch": 0.49798409676335537, + "grad_norm": 1.1410083770751953, + "learning_rate": 4.444e-05, + "loss": 0.4042, + "step": 8893 + }, + { + "epoch": 0.4980400940754844, + "grad_norm": 1.1585884094238281, + "learning_rate": 4.4445000000000006e-05, + "loss": 0.4042, + "step": 8894 + }, + { + "epoch": 0.4980960913876134, + "grad_norm": 1.2835559844970703, + "learning_rate": 4.445e-05, + "loss": 0.5014, + "step": 8895 + }, + { + "epoch": 0.4981520886997424, + "grad_norm": 1.4558626413345337, + "learning_rate": 4.4455e-05, + "loss": 0.4909, + "step": 8896 + }, + { + "epoch": 0.49820808601187144, + "grad_norm": 1.0214385986328125, + "learning_rate": 4.4460000000000005e-05, + "loss": 0.3897, + "step": 8897 + }, + { + "epoch": 0.49826408332400046, + "grad_norm": 1.1316633224487305, + "learning_rate": 4.4465e-05, + "loss": 0.4244, + "step": 8898 + }, + { + "epoch": 0.4983200806361295, + "grad_norm": 1.2841393947601318, + "learning_rate": 4.447e-05, + "loss": 0.3288, + "step": 8899 + }, + { + "epoch": 0.4983760779482585, + "grad_norm": 1.366647720336914, + "learning_rate": 4.4475e-05, + "loss": 0.4425, + "step": 8900 + }, + { + "epoch": 0.4984320752603875, + "grad_norm": 1.5029202699661255, + "learning_rate": 4.448e-05, + "loss": 0.3958, + "step": 8901 + }, + { + "epoch": 0.49848807257251654, + "grad_norm": 1.1440002918243408, + "learning_rate": 4.4485000000000005e-05, + "loss": 0.4056, + "step": 8902 + }, + { + "epoch": 0.49854406988464556, + "grad_norm": 1.1052082777023315, + "learning_rate": 4.449e-05, + "loss": 0.4626, + "step": 8903 + }, + { + "epoch": 0.4986000671967746, + "grad_norm": 1.1949622631072998, + "learning_rate": 4.4495e-05, + "loss": 0.3967, + "step": 8904 + }, + { + "epoch": 0.4986560645089036, + 
"grad_norm": 1.3871986865997314, + "learning_rate": 4.4500000000000004e-05, + "loss": 0.4062, + "step": 8905 + }, + { + "epoch": 0.4987120618210326, + "grad_norm": 1.0678956508636475, + "learning_rate": 4.4505e-05, + "loss": 0.3173, + "step": 8906 + }, + { + "epoch": 0.49876805913316163, + "grad_norm": 1.2354152202606201, + "learning_rate": 4.451e-05, + "loss": 0.426, + "step": 8907 + }, + { + "epoch": 0.49882405644529065, + "grad_norm": 1.5488899946212769, + "learning_rate": 4.4515e-05, + "loss": 0.5341, + "step": 8908 + }, + { + "epoch": 0.4988800537574196, + "grad_norm": 1.7664490938186646, + "learning_rate": 4.452e-05, + "loss": 0.4924, + "step": 8909 + }, + { + "epoch": 0.49893605106954864, + "grad_norm": 1.1872198581695557, + "learning_rate": 4.4525e-05, + "loss": 0.4802, + "step": 8910 + }, + { + "epoch": 0.49899204838167766, + "grad_norm": 1.6726036071777344, + "learning_rate": 4.453e-05, + "loss": 0.7142, + "step": 8911 + }, + { + "epoch": 0.4990480456938067, + "grad_norm": 1.430335521697998, + "learning_rate": 4.4535000000000006e-05, + "loss": 0.4979, + "step": 8912 + }, + { + "epoch": 0.4991040430059357, + "grad_norm": 1.2542810440063477, + "learning_rate": 4.4540000000000004e-05, + "loss": 0.4055, + "step": 8913 + }, + { + "epoch": 0.4991600403180647, + "grad_norm": 1.1580100059509277, + "learning_rate": 4.4545e-05, + "loss": 0.4304, + "step": 8914 + }, + { + "epoch": 0.49921603763019373, + "grad_norm": 1.0954418182373047, + "learning_rate": 4.4550000000000005e-05, + "loss": 0.3711, + "step": 8915 + }, + { + "epoch": 0.49927203494232275, + "grad_norm": 1.2141430377960205, + "learning_rate": 4.4555e-05, + "loss": 0.3588, + "step": 8916 + }, + { + "epoch": 0.49932803225445177, + "grad_norm": 1.110926628112793, + "learning_rate": 4.456e-05, + "loss": 0.3741, + "step": 8917 + }, + { + "epoch": 0.4993840295665808, + "grad_norm": 1.374337077140808, + "learning_rate": 4.4565000000000004e-05, + "loss": 0.4762, + "step": 8918 + }, + { + "epoch": 
0.4994400268787098, + "grad_norm": 1.0902456045150757, + "learning_rate": 4.457e-05, + "loss": 0.4118, + "step": 8919 + }, + { + "epoch": 0.4994960241908388, + "grad_norm": 1.2432687282562256, + "learning_rate": 4.4575e-05, + "loss": 0.4993, + "step": 8920 + }, + { + "epoch": 0.49955202150296785, + "grad_norm": 1.3850815296173096, + "learning_rate": 4.458e-05, + "loss": 0.5616, + "step": 8921 + }, + { + "epoch": 0.49960801881509687, + "grad_norm": 1.1992584466934204, + "learning_rate": 4.458500000000001e-05, + "loss": 0.4051, + "step": 8922 + }, + { + "epoch": 0.4996640161272259, + "grad_norm": 1.339229941368103, + "learning_rate": 4.4590000000000005e-05, + "loss": 0.5669, + "step": 8923 + }, + { + "epoch": 0.4997200134393549, + "grad_norm": 1.2795366048812866, + "learning_rate": 4.4595e-05, + "loss": 0.5839, + "step": 8924 + }, + { + "epoch": 0.4997760107514839, + "grad_norm": 1.505853295326233, + "learning_rate": 4.46e-05, + "loss": 0.4764, + "step": 8925 + }, + { + "epoch": 0.49983200806361294, + "grad_norm": 1.1352922916412354, + "learning_rate": 4.4605000000000004e-05, + "loss": 0.4109, + "step": 8926 + }, + { + "epoch": 0.49988800537574196, + "grad_norm": 1.429019570350647, + "learning_rate": 4.461e-05, + "loss": 0.5747, + "step": 8927 + }, + { + "epoch": 0.499944002687871, + "grad_norm": 1.4581761360168457, + "learning_rate": 4.4615e-05, + "loss": 0.4963, + "step": 8928 + }, + { + "epoch": 0.5, + "grad_norm": 1.2715257406234741, + "learning_rate": 4.462e-05, + "loss": 0.3885, + "step": 8929 + }, + { + "epoch": 0.500055997312129, + "grad_norm": 1.158057689666748, + "learning_rate": 4.4625e-05, + "loss": 0.368, + "step": 8930 + }, + { + "epoch": 0.500111994624258, + "grad_norm": 1.2566872835159302, + "learning_rate": 4.463e-05, + "loss": 0.4259, + "step": 8931 + }, + { + "epoch": 0.5001679919363871, + "grad_norm": 1.050191044807434, + "learning_rate": 4.4635e-05, + "loss": 0.3771, + "step": 8932 + }, + { + "epoch": 0.5002239892485161, + "grad_norm": 
1.2480908632278442, + "learning_rate": 4.4640000000000006e-05, + "loss": 0.5151, + "step": 8933 + }, + { + "epoch": 0.5002799865606451, + "grad_norm": 1.6038448810577393, + "learning_rate": 4.4645000000000004e-05, + "loss": 0.4815, + "step": 8934 + }, + { + "epoch": 0.5003359838727741, + "grad_norm": 1.3310796022415161, + "learning_rate": 4.465e-05, + "loss": 0.4337, + "step": 8935 + }, + { + "epoch": 0.5003919811849031, + "grad_norm": 1.0335931777954102, + "learning_rate": 4.4655000000000005e-05, + "loss": 0.3021, + "step": 8936 + }, + { + "epoch": 0.5004479784970322, + "grad_norm": 1.6768550872802734, + "learning_rate": 4.466e-05, + "loss": 0.5165, + "step": 8937 + }, + { + "epoch": 0.5005039758091612, + "grad_norm": 1.3291367292404175, + "learning_rate": 4.4665e-05, + "loss": 0.3469, + "step": 8938 + }, + { + "epoch": 0.5005599731212902, + "grad_norm": 1.7011210918426514, + "learning_rate": 4.467e-05, + "loss": 0.4439, + "step": 8939 + }, + { + "epoch": 0.5006159704334192, + "grad_norm": 1.314420461654663, + "learning_rate": 4.4675e-05, + "loss": 0.4314, + "step": 8940 + }, + { + "epoch": 0.5006719677455482, + "grad_norm": 1.4272866249084473, + "learning_rate": 4.468e-05, + "loss": 0.6066, + "step": 8941 + }, + { + "epoch": 0.5007279650576772, + "grad_norm": 1.32636559009552, + "learning_rate": 4.4685e-05, + "loss": 0.391, + "step": 8942 + }, + { + "epoch": 0.5007839623698063, + "grad_norm": 1.1752004623413086, + "learning_rate": 4.469e-05, + "loss": 0.4544, + "step": 8943 + }, + { + "epoch": 0.5008399596819353, + "grad_norm": 1.1247951984405518, + "learning_rate": 4.4695000000000005e-05, + "loss": 0.3804, + "step": 8944 + }, + { + "epoch": 0.5008959569940643, + "grad_norm": 1.4395594596862793, + "learning_rate": 4.47e-05, + "loss": 0.3673, + "step": 8945 + }, + { + "epoch": 0.5009519543061933, + "grad_norm": 1.283260464668274, + "learning_rate": 4.4705e-05, + "loss": 0.4168, + "step": 8946 + }, + { + "epoch": 0.5010079516183223, + "grad_norm": 
1.0718876123428345, + "learning_rate": 4.4710000000000004e-05, + "loss": 0.4354, + "step": 8947 + }, + { + "epoch": 0.5010639489304514, + "grad_norm": 1.2211148738861084, + "learning_rate": 4.4715e-05, + "loss": 0.3799, + "step": 8948 + }, + { + "epoch": 0.5011199462425804, + "grad_norm": 1.3038597106933594, + "learning_rate": 4.472e-05, + "loss": 0.3742, + "step": 8949 + }, + { + "epoch": 0.5011759435547094, + "grad_norm": 1.9161030054092407, + "learning_rate": 4.4725e-05, + "loss": 0.7407, + "step": 8950 + }, + { + "epoch": 0.5012319408668384, + "grad_norm": 1.223834753036499, + "learning_rate": 4.473e-05, + "loss": 0.3828, + "step": 8951 + }, + { + "epoch": 0.5012879381789674, + "grad_norm": 1.2141587734222412, + "learning_rate": 4.4735000000000005e-05, + "loss": 0.3771, + "step": 8952 + }, + { + "epoch": 0.5013439354910965, + "grad_norm": 1.4120736122131348, + "learning_rate": 4.474e-05, + "loss": 0.5211, + "step": 8953 + }, + { + "epoch": 0.5013999328032255, + "grad_norm": 1.2435225248336792, + "learning_rate": 4.4745000000000006e-05, + "loss": 0.3964, + "step": 8954 + }, + { + "epoch": 0.5014559301153545, + "grad_norm": 1.2978291511535645, + "learning_rate": 4.4750000000000004e-05, + "loss": 0.3875, + "step": 8955 + }, + { + "epoch": 0.5015119274274835, + "grad_norm": 1.3040884733200073, + "learning_rate": 4.4755e-05, + "loss": 0.3999, + "step": 8956 + }, + { + "epoch": 0.5015679247396125, + "grad_norm": 1.2925035953521729, + "learning_rate": 4.4760000000000005e-05, + "loss": 0.3641, + "step": 8957 + }, + { + "epoch": 0.5016239220517416, + "grad_norm": 1.1881881952285767, + "learning_rate": 4.4765e-05, + "loss": 0.398, + "step": 8958 + }, + { + "epoch": 0.5016799193638706, + "grad_norm": 1.1945446729660034, + "learning_rate": 4.477e-05, + "loss": 0.4379, + "step": 8959 + }, + { + "epoch": 0.5017359166759996, + "grad_norm": 1.4754998683929443, + "learning_rate": 4.4775e-05, + "loss": 0.4611, + "step": 8960 + }, + { + "epoch": 0.5017919139881286, + "grad_norm": 
1.242138385772705, + "learning_rate": 4.478e-05, + "loss": 0.344, + "step": 8961 + }, + { + "epoch": 0.5018479113002576, + "grad_norm": 1.179626226425171, + "learning_rate": 4.4785000000000006e-05, + "loss": 0.4507, + "step": 8962 + }, + { + "epoch": 0.5019039086123867, + "grad_norm": 1.34430992603302, + "learning_rate": 4.479e-05, + "loss": 0.4302, + "step": 8963 + }, + { + "epoch": 0.5019599059245157, + "grad_norm": 1.2207221984863281, + "learning_rate": 4.4795e-05, + "loss": 0.4214, + "step": 8964 + }, + { + "epoch": 0.5020159032366447, + "grad_norm": 1.1519558429718018, + "learning_rate": 4.4800000000000005e-05, + "loss": 0.3989, + "step": 8965 + }, + { + "epoch": 0.5020719005487737, + "grad_norm": 1.4847782850265503, + "learning_rate": 4.4805e-05, + "loss": 0.6974, + "step": 8966 + }, + { + "epoch": 0.5021278978609027, + "grad_norm": 1.3812850713729858, + "learning_rate": 4.481e-05, + "loss": 0.5247, + "step": 8967 + }, + { + "epoch": 0.5021838951730317, + "grad_norm": 1.2806353569030762, + "learning_rate": 4.4815000000000004e-05, + "loss": 0.4348, + "step": 8968 + }, + { + "epoch": 0.5022398924851608, + "grad_norm": 1.4079915285110474, + "learning_rate": 4.482e-05, + "loss": 0.4269, + "step": 8969 + }, + { + "epoch": 0.5022958897972898, + "grad_norm": 1.0850051641464233, + "learning_rate": 4.4825e-05, + "loss": 0.336, + "step": 8970 + }, + { + "epoch": 0.5023518871094187, + "grad_norm": 1.2497671842575073, + "learning_rate": 4.483e-05, + "loss": 0.4074, + "step": 8971 + }, + { + "epoch": 0.5024078844215477, + "grad_norm": 1.6500200033187866, + "learning_rate": 4.483500000000001e-05, + "loss": 0.4611, + "step": 8972 + }, + { + "epoch": 0.5024638817336767, + "grad_norm": 1.4457597732543945, + "learning_rate": 4.4840000000000004e-05, + "loss": 0.8361, + "step": 8973 + }, + { + "epoch": 0.5025198790458058, + "grad_norm": 1.1702678203582764, + "learning_rate": 4.4845e-05, + "loss": 0.3945, + "step": 8974 + }, + { + "epoch": 0.5025758763579348, + "grad_norm": 
1.2733867168426514, + "learning_rate": 4.4850000000000006e-05, + "loss": 0.5797, + "step": 8975 + }, + { + "epoch": 0.5026318736700638, + "grad_norm": 1.2133017778396606, + "learning_rate": 4.4855e-05, + "loss": 0.3941, + "step": 8976 + }, + { + "epoch": 0.5026878709821928, + "grad_norm": 1.2491768598556519, + "learning_rate": 4.486e-05, + "loss": 0.4425, + "step": 8977 + }, + { + "epoch": 0.5027438682943218, + "grad_norm": 1.3059080839157104, + "learning_rate": 4.4865e-05, + "loss": 0.473, + "step": 8978 + }, + { + "epoch": 0.5027998656064508, + "grad_norm": 1.359402060508728, + "learning_rate": 4.487e-05, + "loss": 0.5198, + "step": 8979 + }, + { + "epoch": 0.5028558629185799, + "grad_norm": 1.8723503351211548, + "learning_rate": 4.4875e-05, + "loss": 0.3733, + "step": 8980 + }, + { + "epoch": 0.5029118602307089, + "grad_norm": 1.2968909740447998, + "learning_rate": 4.488e-05, + "loss": 0.4656, + "step": 8981 + }, + { + "epoch": 0.5029678575428379, + "grad_norm": 1.528132677078247, + "learning_rate": 4.488500000000001e-05, + "loss": 0.5171, + "step": 8982 + }, + { + "epoch": 0.5030238548549669, + "grad_norm": 1.212577223777771, + "learning_rate": 4.4890000000000006e-05, + "loss": 0.3623, + "step": 8983 + }, + { + "epoch": 0.5030798521670959, + "grad_norm": 1.7409963607788086, + "learning_rate": 4.4895e-05, + "loss": 0.3292, + "step": 8984 + }, + { + "epoch": 0.503135849479225, + "grad_norm": 1.1049139499664307, + "learning_rate": 4.49e-05, + "loss": 0.358, + "step": 8985 + }, + { + "epoch": 0.503191846791354, + "grad_norm": 1.239724040031433, + "learning_rate": 4.4905000000000005e-05, + "loss": 0.3727, + "step": 8986 + }, + { + "epoch": 0.503247844103483, + "grad_norm": 1.2785840034484863, + "learning_rate": 4.491e-05, + "loss": 0.4344, + "step": 8987 + }, + { + "epoch": 0.503303841415612, + "grad_norm": 1.2609670162200928, + "learning_rate": 4.4915e-05, + "loss": 0.483, + "step": 8988 + }, + { + "epoch": 0.503359838727741, + "grad_norm": 1.1001750230789185, + 
"learning_rate": 4.4920000000000004e-05, + "loss": 0.3377, + "step": 8989 + }, + { + "epoch": 0.5034158360398701, + "grad_norm": 1.1395809650421143, + "learning_rate": 4.4925e-05, + "loss": 0.4317, + "step": 8990 + }, + { + "epoch": 0.5034718333519991, + "grad_norm": 1.4517383575439453, + "learning_rate": 4.493e-05, + "loss": 0.399, + "step": 8991 + }, + { + "epoch": 0.5035278306641281, + "grad_norm": 1.731385350227356, + "learning_rate": 4.4935e-05, + "loss": 0.6224, + "step": 8992 + }, + { + "epoch": 0.5035838279762571, + "grad_norm": 1.2057623863220215, + "learning_rate": 4.494000000000001e-05, + "loss": 0.478, + "step": 8993 + }, + { + "epoch": 0.5036398252883861, + "grad_norm": 2.52968692779541, + "learning_rate": 4.4945000000000004e-05, + "loss": 0.5506, + "step": 8994 + }, + { + "epoch": 0.5036958226005152, + "grad_norm": 1.3260557651519775, + "learning_rate": 4.495e-05, + "loss": 0.4464, + "step": 8995 + }, + { + "epoch": 0.5037518199126442, + "grad_norm": 1.177143931388855, + "learning_rate": 4.4955000000000006e-05, + "loss": 0.405, + "step": 8996 + }, + { + "epoch": 0.5038078172247732, + "grad_norm": 1.2711893320083618, + "learning_rate": 4.496e-05, + "loss": 0.4228, + "step": 8997 + }, + { + "epoch": 0.5038638145369022, + "grad_norm": 1.2666378021240234, + "learning_rate": 4.4965e-05, + "loss": 0.4483, + "step": 8998 + }, + { + "epoch": 0.5039198118490312, + "grad_norm": 1.1410467624664307, + "learning_rate": 4.497e-05, + "loss": 0.3531, + "step": 8999 + }, + { + "epoch": 0.5039758091611602, + "grad_norm": 1.304840087890625, + "learning_rate": 4.4975e-05, + "loss": 0.4781, + "step": 9000 + }, + { + "epoch": 0.5040318064732893, + "grad_norm": 1.2505258321762085, + "learning_rate": 4.498e-05, + "loss": 0.4548, + "step": 9001 + }, + { + "epoch": 0.5040878037854183, + "grad_norm": 1.2904996871948242, + "learning_rate": 4.4985000000000004e-05, + "loss": 0.4078, + "step": 9002 + }, + { + "epoch": 0.5041438010975473, + "grad_norm": 1.3471643924713135, + 
"learning_rate": 4.499e-05, + "loss": 0.6741, + "step": 9003 + }, + { + "epoch": 0.5041997984096763, + "grad_norm": 1.1887874603271484, + "learning_rate": 4.4995000000000005e-05, + "loss": 0.4686, + "step": 9004 + }, + { + "epoch": 0.5042557957218053, + "grad_norm": 1.4074841737747192, + "learning_rate": 4.5e-05, + "loss": 0.6854, + "step": 9005 + }, + { + "epoch": 0.5043117930339344, + "grad_norm": 1.458162546157837, + "learning_rate": 4.5005e-05, + "loss": 0.4305, + "step": 9006 + }, + { + "epoch": 0.5043677903460634, + "grad_norm": 1.4055298566818237, + "learning_rate": 4.5010000000000004e-05, + "loss": 0.3209, + "step": 9007 + }, + { + "epoch": 0.5044237876581924, + "grad_norm": 1.2616369724273682, + "learning_rate": 4.5015e-05, + "loss": 0.4925, + "step": 9008 + }, + { + "epoch": 0.5044797849703214, + "grad_norm": 1.2956013679504395, + "learning_rate": 4.502e-05, + "loss": 0.4253, + "step": 9009 + }, + { + "epoch": 0.5045357822824504, + "grad_norm": 1.3505840301513672, + "learning_rate": 4.5025000000000003e-05, + "loss": 0.5589, + "step": 9010 + }, + { + "epoch": 0.5045917795945795, + "grad_norm": 1.4064513444900513, + "learning_rate": 4.503e-05, + "loss": 0.5392, + "step": 9011 + }, + { + "epoch": 0.5046477769067085, + "grad_norm": 2.8027055263519287, + "learning_rate": 4.5035e-05, + "loss": 0.5942, + "step": 9012 + }, + { + "epoch": 0.5047037742188375, + "grad_norm": 1.185011863708496, + "learning_rate": 4.504e-05, + "loss": 0.4601, + "step": 9013 + }, + { + "epoch": 0.5047597715309665, + "grad_norm": 1.240644931793213, + "learning_rate": 4.504500000000001e-05, + "loss": 0.4146, + "step": 9014 + }, + { + "epoch": 0.5048157688430955, + "grad_norm": 1.5560582876205444, + "learning_rate": 4.5050000000000004e-05, + "loss": 0.4718, + "step": 9015 + }, + { + "epoch": 0.5048717661552246, + "grad_norm": 1.1165698766708374, + "learning_rate": 4.5055e-05, + "loss": 0.423, + "step": 9016 + }, + { + "epoch": 0.5049277634673536, + "grad_norm": 1.8999931812286377, + 
"learning_rate": 4.506e-05, + "loss": 0.3922, + "step": 9017 + }, + { + "epoch": 0.5049837607794826, + "grad_norm": 1.4183663129806519, + "learning_rate": 4.5065e-05, + "loss": 0.4596, + "step": 9018 + }, + { + "epoch": 0.5050397580916116, + "grad_norm": 1.1907249689102173, + "learning_rate": 4.507e-05, + "loss": 0.3696, + "step": 9019 + }, + { + "epoch": 0.5050957554037406, + "grad_norm": 1.2172504663467407, + "learning_rate": 4.5075e-05, + "loss": 0.4613, + "step": 9020 + }, + { + "epoch": 0.5051517527158697, + "grad_norm": 1.169787883758545, + "learning_rate": 4.508e-05, + "loss": 0.364, + "step": 9021 + }, + { + "epoch": 0.5052077500279987, + "grad_norm": 1.4569367170333862, + "learning_rate": 4.5085e-05, + "loss": 0.4964, + "step": 9022 + }, + { + "epoch": 0.5052637473401277, + "grad_norm": 1.2050981521606445, + "learning_rate": 4.5090000000000004e-05, + "loss": 0.3919, + "step": 9023 + }, + { + "epoch": 0.5053197446522567, + "grad_norm": 1.0716274976730347, + "learning_rate": 4.5095e-05, + "loss": 0.3925, + "step": 9024 + }, + { + "epoch": 0.5053757419643857, + "grad_norm": 1.2640608549118042, + "learning_rate": 4.5100000000000005e-05, + "loss": 0.397, + "step": 9025 + }, + { + "epoch": 0.5054317392765147, + "grad_norm": 1.3346258401870728, + "learning_rate": 4.5105e-05, + "loss": 0.3929, + "step": 9026 + }, + { + "epoch": 0.5054877365886438, + "grad_norm": 1.1225422620773315, + "learning_rate": 4.511e-05, + "loss": 0.3526, + "step": 9027 + }, + { + "epoch": 0.5055437339007728, + "grad_norm": 1.5367369651794434, + "learning_rate": 4.5115000000000004e-05, + "loss": 0.4391, + "step": 9028 + }, + { + "epoch": 0.5055997312129018, + "grad_norm": 1.249927282333374, + "learning_rate": 4.512e-05, + "loss": 0.4161, + "step": 9029 + }, + { + "epoch": 0.5056557285250308, + "grad_norm": 1.2556788921356201, + "learning_rate": 4.5125e-05, + "loss": 0.3548, + "step": 9030 + }, + { + "epoch": 0.5057117258371598, + "grad_norm": 1.3210538625717163, + "learning_rate": 
4.513e-05, + "loss": 0.5145, + "step": 9031 + }, + { + "epoch": 0.5057677231492889, + "grad_norm": 1.4485116004943848, + "learning_rate": 4.5135e-05, + "loss": 0.4993, + "step": 9032 + }, + { + "epoch": 0.5058237204614179, + "grad_norm": 1.3486223220825195, + "learning_rate": 4.5140000000000005e-05, + "loss": 0.4924, + "step": 9033 + }, + { + "epoch": 0.5058797177735469, + "grad_norm": 1.189575433731079, + "learning_rate": 4.5145e-05, + "loss": 0.5028, + "step": 9034 + }, + { + "epoch": 0.5059357150856759, + "grad_norm": 1.2943909168243408, + "learning_rate": 4.5150000000000006e-05, + "loss": 0.4446, + "step": 9035 + }, + { + "epoch": 0.5059917123978049, + "grad_norm": 1.1260215044021606, + "learning_rate": 4.5155000000000004e-05, + "loss": 0.426, + "step": 9036 + }, + { + "epoch": 0.506047709709934, + "grad_norm": 1.3268543481826782, + "learning_rate": 4.516e-05, + "loss": 0.6069, + "step": 9037 + }, + { + "epoch": 0.506103707022063, + "grad_norm": 1.2931846380233765, + "learning_rate": 4.5165e-05, + "loss": 0.4658, + "step": 9038 + }, + { + "epoch": 0.506159704334192, + "grad_norm": 1.1556150913238525, + "learning_rate": 4.517e-05, + "loss": 0.3782, + "step": 9039 + }, + { + "epoch": 0.506215701646321, + "grad_norm": 1.1783028841018677, + "learning_rate": 4.5175e-05, + "loss": 0.3293, + "step": 9040 + }, + { + "epoch": 0.50627169895845, + "grad_norm": 1.607237696647644, + "learning_rate": 4.518e-05, + "loss": 0.4597, + "step": 9041 + }, + { + "epoch": 0.506327696270579, + "grad_norm": 1.459184169769287, + "learning_rate": 4.5185e-05, + "loss": 0.4745, + "step": 9042 + }, + { + "epoch": 0.5063836935827081, + "grad_norm": 1.2248507738113403, + "learning_rate": 4.5190000000000006e-05, + "loss": 0.5046, + "step": 9043 + }, + { + "epoch": 0.5064396908948371, + "grad_norm": 1.4197899103164673, + "learning_rate": 4.5195000000000004e-05, + "loss": 0.5367, + "step": 9044 + }, + { + "epoch": 0.5064956882069661, + "grad_norm": 1.3539178371429443, + "learning_rate": 
4.52e-05, + "loss": 0.4222, + "step": 9045 + }, + { + "epoch": 0.5065516855190951, + "grad_norm": 1.159487247467041, + "learning_rate": 4.5205000000000005e-05, + "loss": 0.4317, + "step": 9046 + }, + { + "epoch": 0.5066076828312241, + "grad_norm": 1.3368951082229614, + "learning_rate": 4.521e-05, + "loss": 0.449, + "step": 9047 + }, + { + "epoch": 0.5066636801433532, + "grad_norm": 1.304384469985962, + "learning_rate": 4.5215e-05, + "loss": 0.7044, + "step": 9048 + }, + { + "epoch": 0.5067196774554822, + "grad_norm": 1.4194291830062866, + "learning_rate": 4.5220000000000004e-05, + "loss": 0.4021, + "step": 9049 + }, + { + "epoch": 0.5067756747676112, + "grad_norm": 1.290582537651062, + "learning_rate": 4.5225e-05, + "loss": 0.5375, + "step": 9050 + }, + { + "epoch": 0.5068316720797402, + "grad_norm": 2.5563111305236816, + "learning_rate": 4.523e-05, + "loss": 0.368, + "step": 9051 + }, + { + "epoch": 0.5068876693918692, + "grad_norm": 1.0181171894073486, + "learning_rate": 4.5234999999999996e-05, + "loss": 0.3027, + "step": 9052 + }, + { + "epoch": 0.5069436667039983, + "grad_norm": 1.1910600662231445, + "learning_rate": 4.524000000000001e-05, + "loss": 0.3645, + "step": 9053 + }, + { + "epoch": 0.5069996640161272, + "grad_norm": 1.234055995941162, + "learning_rate": 4.5245000000000005e-05, + "loss": 0.4281, + "step": 9054 + }, + { + "epoch": 0.5070556613282562, + "grad_norm": 1.3286703824996948, + "learning_rate": 4.525e-05, + "loss": 0.4125, + "step": 9055 + }, + { + "epoch": 0.5071116586403852, + "grad_norm": 1.4298676252365112, + "learning_rate": 4.5255000000000006e-05, + "loss": 0.4882, + "step": 9056 + }, + { + "epoch": 0.5071676559525142, + "grad_norm": 1.1724324226379395, + "learning_rate": 4.5260000000000004e-05, + "loss": 0.3975, + "step": 9057 + }, + { + "epoch": 0.5072236532646432, + "grad_norm": 1.190553903579712, + "learning_rate": 4.5265e-05, + "loss": 0.4498, + "step": 9058 + }, + { + "epoch": 0.5072796505767723, + "grad_norm": 1.2644656896591187, + 
"learning_rate": 4.527e-05, + "loss": 0.4226, + "step": 9059 + }, + { + "epoch": 0.5073356478889013, + "grad_norm": 1.5083469152450562, + "learning_rate": 4.5275e-05, + "loss": 0.3173, + "step": 9060 + }, + { + "epoch": 0.5073916452010303, + "grad_norm": 1.3417754173278809, + "learning_rate": 4.528e-05, + "loss": 0.3594, + "step": 9061 + }, + { + "epoch": 0.5074476425131593, + "grad_norm": 1.2471858263015747, + "learning_rate": 4.5285e-05, + "loss": 0.3968, + "step": 9062 + }, + { + "epoch": 0.5075036398252883, + "grad_norm": 1.3608206510543823, + "learning_rate": 4.529e-05, + "loss": 0.4765, + "step": 9063 + }, + { + "epoch": 0.5075596371374174, + "grad_norm": 1.4405231475830078, + "learning_rate": 4.5295000000000006e-05, + "loss": 0.5328, + "step": 9064 + }, + { + "epoch": 0.5076156344495464, + "grad_norm": 1.1082167625427246, + "learning_rate": 4.53e-05, + "loss": 0.3742, + "step": 9065 + }, + { + "epoch": 0.5076716317616754, + "grad_norm": 1.232535481452942, + "learning_rate": 4.5305e-05, + "loss": 0.3588, + "step": 9066 + }, + { + "epoch": 0.5077276290738044, + "grad_norm": 1.1461641788482666, + "learning_rate": 4.5310000000000005e-05, + "loss": 0.4133, + "step": 9067 + }, + { + "epoch": 0.5077836263859334, + "grad_norm": 1.3012218475341797, + "learning_rate": 4.5315e-05, + "loss": 0.5193, + "step": 9068 + }, + { + "epoch": 0.5078396236980625, + "grad_norm": 1.2144492864608765, + "learning_rate": 4.532e-05, + "loss": 0.3894, + "step": 9069 + }, + { + "epoch": 0.5078956210101915, + "grad_norm": 1.208936333656311, + "learning_rate": 4.5325000000000004e-05, + "loss": 0.39, + "step": 9070 + }, + { + "epoch": 0.5079516183223205, + "grad_norm": 1.1332958936691284, + "learning_rate": 4.533e-05, + "loss": 0.3271, + "step": 9071 + }, + { + "epoch": 0.5080076156344495, + "grad_norm": 1.1151103973388672, + "learning_rate": 4.5335e-05, + "loss": 0.3278, + "step": 9072 + }, + { + "epoch": 0.5080636129465785, + "grad_norm": 1.5224456787109375, + "learning_rate": 4.534e-05, 
+ "loss": 0.485, + "step": 9073 + }, + { + "epoch": 0.5081196102587076, + "grad_norm": 1.3664674758911133, + "learning_rate": 4.534500000000001e-05, + "loss": 0.5187, + "step": 9074 + }, + { + "epoch": 0.5081756075708366, + "grad_norm": 1.2341505289077759, + "learning_rate": 4.5350000000000005e-05, + "loss": 0.3897, + "step": 9075 + }, + { + "epoch": 0.5082316048829656, + "grad_norm": 1.1455440521240234, + "learning_rate": 4.5355e-05, + "loss": 0.3157, + "step": 9076 + }, + { + "epoch": 0.5082876021950946, + "grad_norm": 1.1693487167358398, + "learning_rate": 4.536e-05, + "loss": 0.376, + "step": 9077 + }, + { + "epoch": 0.5083435995072236, + "grad_norm": 1.324777364730835, + "learning_rate": 4.5365000000000004e-05, + "loss": 0.5683, + "step": 9078 + }, + { + "epoch": 0.5083995968193527, + "grad_norm": 1.2605270147323608, + "learning_rate": 4.537e-05, + "loss": 0.3379, + "step": 9079 + }, + { + "epoch": 0.5084555941314817, + "grad_norm": 1.3456287384033203, + "learning_rate": 4.5375e-05, + "loss": 0.554, + "step": 9080 + }, + { + "epoch": 0.5085115914436107, + "grad_norm": 1.2584619522094727, + "learning_rate": 4.538e-05, + "loss": 0.5242, + "step": 9081 + }, + { + "epoch": 0.5085675887557397, + "grad_norm": 1.4179725646972656, + "learning_rate": 4.5385e-05, + "loss": 0.4579, + "step": 9082 + }, + { + "epoch": 0.5086235860678687, + "grad_norm": 1.4745380878448486, + "learning_rate": 4.5390000000000004e-05, + "loss": 0.428, + "step": 9083 + }, + { + "epoch": 0.5086795833799977, + "grad_norm": 1.2324879169464111, + "learning_rate": 4.5395e-05, + "loss": 0.376, + "step": 9084 + }, + { + "epoch": 0.5087355806921268, + "grad_norm": 1.1317589282989502, + "learning_rate": 4.5400000000000006e-05, + "loss": 0.2786, + "step": 9085 + }, + { + "epoch": 0.5087915780042558, + "grad_norm": 1.1965456008911133, + "learning_rate": 4.5405e-05, + "loss": 0.3678, + "step": 9086 + }, + { + "epoch": 0.5088475753163848, + "grad_norm": 1.6246026754379272, + "learning_rate": 4.541e-05, + 
"loss": 0.4555, + "step": 9087 + }, + { + "epoch": 0.5089035726285138, + "grad_norm": 1.401979684829712, + "learning_rate": 4.5415000000000005e-05, + "loss": 0.5249, + "step": 9088 + }, + { + "epoch": 0.5089595699406428, + "grad_norm": 1.5277830362319946, + "learning_rate": 4.542e-05, + "loss": 0.3628, + "step": 9089 + }, + { + "epoch": 0.5090155672527719, + "grad_norm": 1.3548916578292847, + "learning_rate": 4.5425e-05, + "loss": 0.4446, + "step": 9090 + }, + { + "epoch": 0.5090715645649009, + "grad_norm": 1.2804852724075317, + "learning_rate": 4.543e-05, + "loss": 0.4077, + "step": 9091 + }, + { + "epoch": 0.5091275618770299, + "grad_norm": 1.2474457025527954, + "learning_rate": 4.5435e-05, + "loss": 0.3732, + "step": 9092 + }, + { + "epoch": 0.5091835591891589, + "grad_norm": 1.3139522075653076, + "learning_rate": 4.5440000000000005e-05, + "loss": 0.468, + "step": 9093 + }, + { + "epoch": 0.5092395565012879, + "grad_norm": 1.2315027713775635, + "learning_rate": 4.5445e-05, + "loss": 0.488, + "step": 9094 + }, + { + "epoch": 0.509295553813417, + "grad_norm": 1.3135638236999512, + "learning_rate": 4.545000000000001e-05, + "loss": 0.4894, + "step": 9095 + }, + { + "epoch": 0.509351551125546, + "grad_norm": 1.1322346925735474, + "learning_rate": 4.5455000000000004e-05, + "loss": 0.3885, + "step": 9096 + }, + { + "epoch": 0.509407548437675, + "grad_norm": 1.29420804977417, + "learning_rate": 4.546e-05, + "loss": 0.3649, + "step": 9097 + }, + { + "epoch": 0.509463545749804, + "grad_norm": 1.3108958005905151, + "learning_rate": 4.5465e-05, + "loss": 0.4254, + "step": 9098 + }, + { + "epoch": 0.509519543061933, + "grad_norm": 1.3444880247116089, + "learning_rate": 4.5470000000000003e-05, + "loss": 0.4003, + "step": 9099 + }, + { + "epoch": 0.509575540374062, + "grad_norm": 1.1340411901474, + "learning_rate": 4.5475e-05, + "loss": 0.3966, + "step": 9100 + }, + { + "epoch": 0.5096315376861911, + "grad_norm": 1.425068974494934, + "learning_rate": 4.548e-05, + "loss": 
0.4551, + "step": 9101 + }, + { + "epoch": 0.5096875349983201, + "grad_norm": 1.381592869758606, + "learning_rate": 4.5485e-05, + "loss": 0.3084, + "step": 9102 + }, + { + "epoch": 0.5097435323104491, + "grad_norm": 1.239937663078308, + "learning_rate": 4.549000000000001e-05, + "loss": 0.4293, + "step": 9103 + }, + { + "epoch": 0.5097995296225781, + "grad_norm": 1.8166515827178955, + "learning_rate": 4.5495000000000004e-05, + "loss": 0.593, + "step": 9104 + }, + { + "epoch": 0.5098555269347071, + "grad_norm": 1.406846046447754, + "learning_rate": 4.55e-05, + "loss": 0.515, + "step": 9105 + }, + { + "epoch": 0.5099115242468362, + "grad_norm": 1.3348393440246582, + "learning_rate": 4.5505000000000006e-05, + "loss": 0.4368, + "step": 9106 + }, + { + "epoch": 0.5099675215589652, + "grad_norm": 1.1971629858016968, + "learning_rate": 4.551e-05, + "loss": 0.5018, + "step": 9107 + }, + { + "epoch": 0.5100235188710942, + "grad_norm": 1.2922112941741943, + "learning_rate": 4.5515e-05, + "loss": 0.4869, + "step": 9108 + }, + { + "epoch": 0.5100795161832232, + "grad_norm": 4.104691982269287, + "learning_rate": 4.5520000000000005e-05, + "loss": 0.4964, + "step": 9109 + }, + { + "epoch": 0.5101355134953522, + "grad_norm": 1.2255407571792603, + "learning_rate": 4.5525e-05, + "loss": 0.4425, + "step": 9110 + }, + { + "epoch": 0.5101915108074813, + "grad_norm": 1.3091654777526855, + "learning_rate": 4.553e-05, + "loss": 0.419, + "step": 9111 + }, + { + "epoch": 0.5102475081196103, + "grad_norm": 1.6149533987045288, + "learning_rate": 4.5535e-05, + "loss": 0.4657, + "step": 9112 + }, + { + "epoch": 0.5103035054317393, + "grad_norm": 1.4710536003112793, + "learning_rate": 4.554000000000001e-05, + "loss": 0.4854, + "step": 9113 + }, + { + "epoch": 0.5103595027438683, + "grad_norm": 1.2672430276870728, + "learning_rate": 4.5545000000000005e-05, + "loss": 0.5837, + "step": 9114 + }, + { + "epoch": 0.5104155000559973, + "grad_norm": 1.4874082803726196, + "learning_rate": 4.555e-05, + 
"loss": 0.5358, + "step": 9115 + }, + { + "epoch": 0.5104714973681264, + "grad_norm": 1.204516887664795, + "learning_rate": 4.5555e-05, + "loss": 0.4116, + "step": 9116 + }, + { + "epoch": 0.5105274946802554, + "grad_norm": 1.1308927536010742, + "learning_rate": 4.5560000000000004e-05, + "loss": 0.3743, + "step": 9117 + }, + { + "epoch": 0.5105834919923844, + "grad_norm": 1.3372341394424438, + "learning_rate": 4.5565e-05, + "loss": 0.5025, + "step": 9118 + }, + { + "epoch": 0.5106394893045134, + "grad_norm": 1.2648262977600098, + "learning_rate": 4.557e-05, + "loss": 0.432, + "step": 9119 + }, + { + "epoch": 0.5106954866166424, + "grad_norm": 1.1696102619171143, + "learning_rate": 4.5575e-05, + "loss": 0.4272, + "step": 9120 + }, + { + "epoch": 0.5107514839287715, + "grad_norm": 1.4026856422424316, + "learning_rate": 4.558e-05, + "loss": 0.3541, + "step": 9121 + }, + { + "epoch": 0.5108074812409005, + "grad_norm": 1.13295316696167, + "learning_rate": 4.5585e-05, + "loss": 0.3299, + "step": 9122 + }, + { + "epoch": 0.5108634785530295, + "grad_norm": 1.5681631565093994, + "learning_rate": 4.559e-05, + "loss": 0.3446, + "step": 9123 + }, + { + "epoch": 0.5109194758651585, + "grad_norm": 1.1067805290222168, + "learning_rate": 4.5595000000000006e-05, + "loss": 0.3119, + "step": 9124 + }, + { + "epoch": 0.5109754731772875, + "grad_norm": 1.3130533695220947, + "learning_rate": 4.5600000000000004e-05, + "loss": 0.4757, + "step": 9125 + }, + { + "epoch": 0.5110314704894166, + "grad_norm": 1.240734577178955, + "learning_rate": 4.5605e-05, + "loss": 0.389, + "step": 9126 + }, + { + "epoch": 0.5110874678015456, + "grad_norm": 1.3343544006347656, + "learning_rate": 4.5610000000000005e-05, + "loss": 0.4339, + "step": 9127 + }, + { + "epoch": 0.5111434651136746, + "grad_norm": 1.2861734628677368, + "learning_rate": 4.5615e-05, + "loss": 0.4505, + "step": 9128 + }, + { + "epoch": 0.5111994624258036, + "grad_norm": 1.5055768489837646, + "learning_rate": 4.562e-05, + "loss": 0.588, 
+ "step": 9129 + }, + { + "epoch": 0.5112554597379326, + "grad_norm": 1.5687438249588013, + "learning_rate": 4.5625e-05, + "loss": 0.4528, + "step": 9130 + }, + { + "epoch": 0.5113114570500616, + "grad_norm": 1.7964882850646973, + "learning_rate": 4.563e-05, + "loss": 0.5269, + "step": 9131 + }, + { + "epoch": 0.5113674543621907, + "grad_norm": 1.2458393573760986, + "learning_rate": 4.5635e-05, + "loss": 0.5181, + "step": 9132 + }, + { + "epoch": 0.5114234516743197, + "grad_norm": 1.0184763669967651, + "learning_rate": 4.564e-05, + "loss": 0.4279, + "step": 9133 + }, + { + "epoch": 0.5114794489864487, + "grad_norm": 1.3518426418304443, + "learning_rate": 4.564500000000001e-05, + "loss": 0.3765, + "step": 9134 + }, + { + "epoch": 0.5115354462985777, + "grad_norm": 1.2622166872024536, + "learning_rate": 4.5650000000000005e-05, + "loss": 0.4849, + "step": 9135 + }, + { + "epoch": 0.5115914436107066, + "grad_norm": 1.3343026638031006, + "learning_rate": 4.5655e-05, + "loss": 0.4758, + "step": 9136 + }, + { + "epoch": 0.5116474409228357, + "grad_norm": 1.2146726846694946, + "learning_rate": 4.566e-05, + "loss": 0.4958, + "step": 9137 + }, + { + "epoch": 0.5117034382349647, + "grad_norm": 1.320116400718689, + "learning_rate": 4.5665000000000004e-05, + "loss": 0.4295, + "step": 9138 + }, + { + "epoch": 0.5117594355470937, + "grad_norm": 1.2639986276626587, + "learning_rate": 4.567e-05, + "loss": 0.3921, + "step": 9139 + }, + { + "epoch": 0.5118154328592227, + "grad_norm": 1.1365134716033936, + "learning_rate": 4.5675e-05, + "loss": 0.4634, + "step": 9140 + }, + { + "epoch": 0.5118714301713517, + "grad_norm": 1.3811694383621216, + "learning_rate": 4.568e-05, + "loss": 0.4476, + "step": 9141 + }, + { + "epoch": 0.5119274274834807, + "grad_norm": 1.1554290056228638, + "learning_rate": 4.5685e-05, + "loss": 0.3517, + "step": 9142 + }, + { + "epoch": 0.5119834247956098, + "grad_norm": 1.1563440561294556, + "learning_rate": 4.569e-05, + "loss": 0.4315, + "step": 9143 + }, + { + 
"epoch": 0.5120394221077388, + "grad_norm": 1.0936301946640015, + "learning_rate": 4.5695e-05, + "loss": 0.4105, + "step": 9144 + }, + { + "epoch": 0.5120954194198678, + "grad_norm": 1.7846345901489258, + "learning_rate": 4.5700000000000006e-05, + "loss": 0.4825, + "step": 9145 + }, + { + "epoch": 0.5121514167319968, + "grad_norm": 1.1873970031738281, + "learning_rate": 4.5705000000000004e-05, + "loss": 0.4168, + "step": 9146 + }, + { + "epoch": 0.5122074140441258, + "grad_norm": 1.2181586027145386, + "learning_rate": 4.571e-05, + "loss": 0.4442, + "step": 9147 + }, + { + "epoch": 0.5122634113562549, + "grad_norm": 1.1474303007125854, + "learning_rate": 4.5715000000000005e-05, + "loss": 0.449, + "step": 9148 + }, + { + "epoch": 0.5123194086683839, + "grad_norm": 1.3512732982635498, + "learning_rate": 4.572e-05, + "loss": 0.4581, + "step": 9149 + }, + { + "epoch": 0.5123754059805129, + "grad_norm": 1.2675834894180298, + "learning_rate": 4.5725e-05, + "loss": 0.3983, + "step": 9150 + }, + { + "epoch": 0.5124314032926419, + "grad_norm": 1.3232731819152832, + "learning_rate": 4.573e-05, + "loss": 0.4474, + "step": 9151 + }, + { + "epoch": 0.5124874006047709, + "grad_norm": 1.4368561506271362, + "learning_rate": 4.5735e-05, + "loss": 0.391, + "step": 9152 + }, + { + "epoch": 0.5125433979169, + "grad_norm": 1.2953909635543823, + "learning_rate": 4.574e-05, + "loss": 0.4717, + "step": 9153 + }, + { + "epoch": 0.512599395229029, + "grad_norm": 2.3071467876434326, + "learning_rate": 4.5745e-05, + "loss": 0.408, + "step": 9154 + }, + { + "epoch": 0.512655392541158, + "grad_norm": 1.2890418767929077, + "learning_rate": 4.575e-05, + "loss": 0.3286, + "step": 9155 + }, + { + "epoch": 0.512711389853287, + "grad_norm": 1.3090392351150513, + "learning_rate": 4.5755000000000005e-05, + "loss": 0.5139, + "step": 9156 + }, + { + "epoch": 0.512767387165416, + "grad_norm": 1.228469967842102, + "learning_rate": 4.576e-05, + "loss": 0.4131, + "step": 9157 + }, + { + "epoch": 
0.512823384477545, + "grad_norm": 1.266729474067688, + "learning_rate": 4.5765e-05, + "loss": 0.4143, + "step": 9158 + }, + { + "epoch": 0.5128793817896741, + "grad_norm": 1.4858880043029785, + "learning_rate": 4.5770000000000004e-05, + "loss": 0.4721, + "step": 9159 + }, + { + "epoch": 0.5129353791018031, + "grad_norm": 1.380627155303955, + "learning_rate": 4.5775e-05, + "loss": 0.6853, + "step": 9160 + }, + { + "epoch": 0.5129913764139321, + "grad_norm": 1.2912437915802002, + "learning_rate": 4.578e-05, + "loss": 0.3152, + "step": 9161 + }, + { + "epoch": 0.5130473737260611, + "grad_norm": 1.0574392080307007, + "learning_rate": 4.5785e-05, + "loss": 0.3956, + "step": 9162 + }, + { + "epoch": 0.5131033710381901, + "grad_norm": 0.939217746257782, + "learning_rate": 4.579e-05, + "loss": 0.2672, + "step": 9163 + }, + { + "epoch": 0.5131593683503192, + "grad_norm": 1.147523045539856, + "learning_rate": 4.5795000000000005e-05, + "loss": 0.3168, + "step": 9164 + }, + { + "epoch": 0.5132153656624482, + "grad_norm": 1.3794147968292236, + "learning_rate": 4.58e-05, + "loss": 0.657, + "step": 9165 + }, + { + "epoch": 0.5132713629745772, + "grad_norm": 1.8935555219650269, + "learning_rate": 4.5805000000000006e-05, + "loss": 0.5039, + "step": 9166 + }, + { + "epoch": 0.5133273602867062, + "grad_norm": 1.233540654182434, + "learning_rate": 4.5810000000000004e-05, + "loss": 0.4168, + "step": 9167 + }, + { + "epoch": 0.5133833575988352, + "grad_norm": 1.2867122888565063, + "learning_rate": 4.5815e-05, + "loss": 0.4104, + "step": 9168 + }, + { + "epoch": 0.5134393549109643, + "grad_norm": 1.1694250106811523, + "learning_rate": 4.5820000000000005e-05, + "loss": 0.425, + "step": 9169 + }, + { + "epoch": 0.5134953522230933, + "grad_norm": 1.2046486139297485, + "learning_rate": 4.5825e-05, + "loss": 0.4514, + "step": 9170 + }, + { + "epoch": 0.5135513495352223, + "grad_norm": 1.1943657398223877, + "learning_rate": 4.583e-05, + "loss": 0.617, + "step": 9171 + }, + { + "epoch": 
0.5136073468473513, + "grad_norm": 1.2283899784088135, + "learning_rate": 4.5835e-05, + "loss": 0.4312, + "step": 9172 + }, + { + "epoch": 0.5136633441594803, + "grad_norm": 1.3342396020889282, + "learning_rate": 4.584e-05, + "loss": 0.4931, + "step": 9173 + }, + { + "epoch": 0.5137193414716094, + "grad_norm": 1.4554526805877686, + "learning_rate": 4.5845000000000006e-05, + "loss": 0.4853, + "step": 9174 + }, + { + "epoch": 0.5137753387837384, + "grad_norm": 1.2992995977401733, + "learning_rate": 4.585e-05, + "loss": 0.4486, + "step": 9175 + }, + { + "epoch": 0.5138313360958674, + "grad_norm": 1.2688758373260498, + "learning_rate": 4.5855e-05, + "loss": 0.4145, + "step": 9176 + }, + { + "epoch": 0.5138873334079964, + "grad_norm": 1.1902157068252563, + "learning_rate": 4.5860000000000005e-05, + "loss": 0.3951, + "step": 9177 + }, + { + "epoch": 0.5139433307201254, + "grad_norm": 1.1584275960922241, + "learning_rate": 4.5865e-05, + "loss": 0.3811, + "step": 9178 + }, + { + "epoch": 0.5139993280322545, + "grad_norm": 1.4463282823562622, + "learning_rate": 4.587e-05, + "loss": 0.4477, + "step": 9179 + }, + { + "epoch": 0.5140553253443835, + "grad_norm": 1.225193738937378, + "learning_rate": 4.5875000000000004e-05, + "loss": 0.5085, + "step": 9180 + }, + { + "epoch": 0.5141113226565125, + "grad_norm": 1.3751776218414307, + "learning_rate": 4.588e-05, + "loss": 0.4581, + "step": 9181 + }, + { + "epoch": 0.5141673199686415, + "grad_norm": 1.1857109069824219, + "learning_rate": 4.5885e-05, + "loss": 0.4816, + "step": 9182 + }, + { + "epoch": 0.5142233172807705, + "grad_norm": 1.116721510887146, + "learning_rate": 4.589e-05, + "loss": 0.418, + "step": 9183 + }, + { + "epoch": 0.5142793145928996, + "grad_norm": 1.302652359008789, + "learning_rate": 4.589500000000001e-05, + "loss": 0.3992, + "step": 9184 + }, + { + "epoch": 0.5143353119050286, + "grad_norm": 1.2706457376480103, + "learning_rate": 4.5900000000000004e-05, + "loss": 0.4317, + "step": 9185 + }, + { + "epoch": 
0.5143913092171576, + "grad_norm": 1.1368987560272217, + "learning_rate": 4.5905e-05, + "loss": 0.5304, + "step": 9186 + }, + { + "epoch": 0.5144473065292866, + "grad_norm": 1.1194905042648315, + "learning_rate": 4.5910000000000006e-05, + "loss": 0.4262, + "step": 9187 + }, + { + "epoch": 0.5145033038414156, + "grad_norm": 1.4616315364837646, + "learning_rate": 4.5915000000000003e-05, + "loss": 0.412, + "step": 9188 + }, + { + "epoch": 0.5145593011535446, + "grad_norm": 1.2584593296051025, + "learning_rate": 4.592e-05, + "loss": 0.4486, + "step": 9189 + }, + { + "epoch": 0.5146152984656737, + "grad_norm": 1.3513503074645996, + "learning_rate": 4.5925e-05, + "loss": 0.4367, + "step": 9190 + }, + { + "epoch": 0.5146712957778027, + "grad_norm": 1.3395848274230957, + "learning_rate": 4.593e-05, + "loss": 0.4674, + "step": 9191 + }, + { + "epoch": 0.5147272930899317, + "grad_norm": 1.45074462890625, + "learning_rate": 4.5935e-05, + "loss": 0.5375, + "step": 9192 + }, + { + "epoch": 0.5147832904020607, + "grad_norm": 1.2556960582733154, + "learning_rate": 4.594e-05, + "loss": 0.4159, + "step": 9193 + }, + { + "epoch": 0.5148392877141897, + "grad_norm": 1.1413027048110962, + "learning_rate": 4.5945e-05, + "loss": 0.3966, + "step": 9194 + }, + { + "epoch": 0.5148952850263188, + "grad_norm": 1.5372223854064941, + "learning_rate": 4.5950000000000006e-05, + "loss": 0.3643, + "step": 9195 + }, + { + "epoch": 0.5149512823384478, + "grad_norm": 1.4177385568618774, + "learning_rate": 4.5955e-05, + "loss": 0.48, + "step": 9196 + }, + { + "epoch": 0.5150072796505768, + "grad_norm": 1.2636547088623047, + "learning_rate": 4.596e-05, + "loss": 0.4348, + "step": 9197 + }, + { + "epoch": 0.5150632769627058, + "grad_norm": 1.465863823890686, + "learning_rate": 4.5965000000000005e-05, + "loss": 0.4749, + "step": 9198 + }, + { + "epoch": 0.5151192742748348, + "grad_norm": 1.1423953771591187, + "learning_rate": 4.597e-05, + "loss": 0.3845, + "step": 9199 + }, + { + "epoch": 
0.5151752715869639, + "grad_norm": 1.100070834159851, + "learning_rate": 4.5975e-05, + "loss": 0.4243, + "step": 9200 + }, + { + "epoch": 0.5152312688990929, + "grad_norm": 1.6481834650039673, + "learning_rate": 4.5980000000000004e-05, + "loss": 0.5836, + "step": 9201 + }, + { + "epoch": 0.5152872662112219, + "grad_norm": 1.323297142982483, + "learning_rate": 4.5985e-05, + "loss": 0.4682, + "step": 9202 + }, + { + "epoch": 0.5153432635233509, + "grad_norm": 1.2022228240966797, + "learning_rate": 4.599e-05, + "loss": 0.4252, + "step": 9203 + }, + { + "epoch": 0.5153992608354799, + "grad_norm": 1.5649847984313965, + "learning_rate": 4.5995e-05, + "loss": 0.6934, + "step": 9204 + }, + { + "epoch": 0.515455258147609, + "grad_norm": 1.4646174907684326, + "learning_rate": 4.600000000000001e-05, + "loss": 0.4992, + "step": 9205 + }, + { + "epoch": 0.515511255459738, + "grad_norm": 1.2009707689285278, + "learning_rate": 4.6005000000000004e-05, + "loss": 0.4129, + "step": 9206 + }, + { + "epoch": 0.515567252771867, + "grad_norm": 1.3114912509918213, + "learning_rate": 4.601e-05, + "loss": 0.3581, + "step": 9207 + }, + { + "epoch": 0.515623250083996, + "grad_norm": 1.2285393476486206, + "learning_rate": 4.6015000000000006e-05, + "loss": 0.4393, + "step": 9208 + }, + { + "epoch": 0.515679247396125, + "grad_norm": 1.554861307144165, + "learning_rate": 4.602e-05, + "loss": 0.4743, + "step": 9209 + }, + { + "epoch": 0.515735244708254, + "grad_norm": 1.0693328380584717, + "learning_rate": 4.6025e-05, + "loss": 0.3656, + "step": 9210 + }, + { + "epoch": 0.5157912420203831, + "grad_norm": 1.2261241674423218, + "learning_rate": 4.603e-05, + "loss": 0.4464, + "step": 9211 + }, + { + "epoch": 0.5158472393325121, + "grad_norm": 1.3556733131408691, + "learning_rate": 4.6035e-05, + "loss": 0.4532, + "step": 9212 + }, + { + "epoch": 0.5159032366446411, + "grad_norm": 1.05940580368042, + "learning_rate": 4.604e-05, + "loss": 0.3792, + "step": 9213 + }, + { + "epoch": 0.5159592339567701, + 
"grad_norm": 0.9899119138717651, + "learning_rate": 4.6045000000000004e-05, + "loss": 0.362, + "step": 9214 + }, + { + "epoch": 0.5160152312688991, + "grad_norm": 1.0863126516342163, + "learning_rate": 4.605e-05, + "loss": 0.4113, + "step": 9215 + }, + { + "epoch": 0.5160712285810282, + "grad_norm": 1.3411493301391602, + "learning_rate": 4.6055000000000005e-05, + "loss": 0.4646, + "step": 9216 + }, + { + "epoch": 0.5161272258931572, + "grad_norm": 1.2432260513305664, + "learning_rate": 4.606e-05, + "loss": 0.4235, + "step": 9217 + }, + { + "epoch": 0.5161832232052862, + "grad_norm": 1.2353248596191406, + "learning_rate": 4.6065e-05, + "loss": 0.3575, + "step": 9218 + }, + { + "epoch": 0.5162392205174151, + "grad_norm": 1.151160717010498, + "learning_rate": 4.6070000000000004e-05, + "loss": 0.3813, + "step": 9219 + }, + { + "epoch": 0.5162952178295441, + "grad_norm": 1.0681962966918945, + "learning_rate": 4.6075e-05, + "loss": 0.2898, + "step": 9220 + }, + { + "epoch": 0.5163512151416731, + "grad_norm": 1.4751173257827759, + "learning_rate": 4.608e-05, + "loss": 0.611, + "step": 9221 + }, + { + "epoch": 0.5164072124538022, + "grad_norm": 1.1293771266937256, + "learning_rate": 4.6085000000000003e-05, + "loss": 0.4167, + "step": 9222 + }, + { + "epoch": 0.5164632097659312, + "grad_norm": 1.486940622329712, + "learning_rate": 4.609e-05, + "loss": 0.5161, + "step": 9223 + }, + { + "epoch": 0.5165192070780602, + "grad_norm": 1.2248575687408447, + "learning_rate": 4.6095000000000005e-05, + "loss": 0.5836, + "step": 9224 + }, + { + "epoch": 0.5165752043901892, + "grad_norm": 1.1783866882324219, + "learning_rate": 4.61e-05, + "loss": 0.3845, + "step": 9225 + }, + { + "epoch": 0.5166312017023182, + "grad_norm": 1.5090482234954834, + "learning_rate": 4.610500000000001e-05, + "loss": 0.397, + "step": 9226 + }, + { + "epoch": 0.5166871990144473, + "grad_norm": 1.2617485523223877, + "learning_rate": 4.6110000000000004e-05, + "loss": 0.3639, + "step": 9227 + }, + { + "epoch": 
0.5167431963265763, + "grad_norm": 1.2481554746627808, + "learning_rate": 4.6115e-05, + "loss": 0.4247, + "step": 9228 + }, + { + "epoch": 0.5167991936387053, + "grad_norm": 1.1832367181777954, + "learning_rate": 4.612e-05, + "loss": 0.3609, + "step": 9229 + }, + { + "epoch": 0.5168551909508343, + "grad_norm": 1.161326289176941, + "learning_rate": 4.6125e-05, + "loss": 0.3136, + "step": 9230 + }, + { + "epoch": 0.5169111882629633, + "grad_norm": 1.1889429092407227, + "learning_rate": 4.613e-05, + "loss": 0.4367, + "step": 9231 + }, + { + "epoch": 0.5169671855750924, + "grad_norm": 1.656028389930725, + "learning_rate": 4.6135e-05, + "loss": 0.5329, + "step": 9232 + }, + { + "epoch": 0.5170231828872214, + "grad_norm": 1.3792628049850464, + "learning_rate": 4.614e-05, + "loss": 0.4479, + "step": 9233 + }, + { + "epoch": 0.5170791801993504, + "grad_norm": 1.2553712129592896, + "learning_rate": 4.6145000000000006e-05, + "loss": 0.4374, + "step": 9234 + }, + { + "epoch": 0.5171351775114794, + "grad_norm": 1.0009866952896118, + "learning_rate": 4.6150000000000004e-05, + "loss": 0.3785, + "step": 9235 + }, + { + "epoch": 0.5171911748236084, + "grad_norm": 1.2896463871002197, + "learning_rate": 4.6155e-05, + "loss": 0.4246, + "step": 9236 + }, + { + "epoch": 0.5172471721357375, + "grad_norm": 1.2183150053024292, + "learning_rate": 4.6160000000000005e-05, + "loss": 0.3809, + "step": 9237 + }, + { + "epoch": 0.5173031694478665, + "grad_norm": 1.196364402770996, + "learning_rate": 4.6165e-05, + "loss": 0.4453, + "step": 9238 + }, + { + "epoch": 0.5173591667599955, + "grad_norm": 1.162322759628296, + "learning_rate": 4.617e-05, + "loss": 0.4103, + "step": 9239 + }, + { + "epoch": 0.5174151640721245, + "grad_norm": 1.1721992492675781, + "learning_rate": 4.6175000000000004e-05, + "loss": 0.4286, + "step": 9240 + }, + { + "epoch": 0.5174711613842535, + "grad_norm": 1.5863608121871948, + "learning_rate": 4.618e-05, + "loss": 0.4548, + "step": 9241 + }, + { + "epoch": 
0.5175271586963825, + "grad_norm": 1.2901023626327515, + "learning_rate": 4.6185e-05, + "loss": 0.5696, + "step": 9242 + }, + { + "epoch": 0.5175831560085116, + "grad_norm": 1.0869500637054443, + "learning_rate": 4.619e-05, + "loss": 0.4111, + "step": 9243 + }, + { + "epoch": 0.5176391533206406, + "grad_norm": 1.1780967712402344, + "learning_rate": 4.619500000000001e-05, + "loss": 0.3407, + "step": 9244 + }, + { + "epoch": 0.5176951506327696, + "grad_norm": 1.2058454751968384, + "learning_rate": 4.6200000000000005e-05, + "loss": 0.4369, + "step": 9245 + }, + { + "epoch": 0.5177511479448986, + "grad_norm": 1.329429268836975, + "learning_rate": 4.6205e-05, + "loss": 0.5259, + "step": 9246 + }, + { + "epoch": 0.5178071452570276, + "grad_norm": 1.1401983499526978, + "learning_rate": 4.6210000000000006e-05, + "loss": 0.4004, + "step": 9247 + }, + { + "epoch": 0.5178631425691567, + "grad_norm": 1.2211241722106934, + "learning_rate": 4.6215000000000004e-05, + "loss": 0.4571, + "step": 9248 + }, + { + "epoch": 0.5179191398812857, + "grad_norm": 1.2139710187911987, + "learning_rate": 4.622e-05, + "loss": 0.3738, + "step": 9249 + }, + { + "epoch": 0.5179751371934147, + "grad_norm": 1.2400494813919067, + "learning_rate": 4.6225e-05, + "loss": 0.4665, + "step": 9250 + }, + { + "epoch": 0.5180311345055437, + "grad_norm": 1.2360652685165405, + "learning_rate": 4.623e-05, + "loss": 0.329, + "step": 9251 + }, + { + "epoch": 0.5180871318176727, + "grad_norm": 1.4026408195495605, + "learning_rate": 4.6235e-05, + "loss": 0.5076, + "step": 9252 + }, + { + "epoch": 0.5181431291298018, + "grad_norm": 1.1283280849456787, + "learning_rate": 4.624e-05, + "loss": 0.3409, + "step": 9253 + }, + { + "epoch": 0.5181991264419308, + "grad_norm": 1.180201768875122, + "learning_rate": 4.6245e-05, + "loss": 0.3412, + "step": 9254 + }, + { + "epoch": 0.5182551237540598, + "grad_norm": 1.263129711151123, + "learning_rate": 4.6250000000000006e-05, + "loss": 0.3948, + "step": 9255 + }, + { + "epoch": 
0.5183111210661888, + "grad_norm": 2.041626453399658, + "learning_rate": 4.6255000000000004e-05, + "loss": 0.4851, + "step": 9256 + }, + { + "epoch": 0.5183671183783178, + "grad_norm": 1.2479692697525024, + "learning_rate": 4.626e-05, + "loss": 0.3156, + "step": 9257 + }, + { + "epoch": 0.5184231156904469, + "grad_norm": 1.620681881904602, + "learning_rate": 4.6265000000000005e-05, + "loss": 0.418, + "step": 9258 + }, + { + "epoch": 0.5184791130025759, + "grad_norm": 1.504738688468933, + "learning_rate": 4.627e-05, + "loss": 0.4166, + "step": 9259 + }, + { + "epoch": 0.5185351103147049, + "grad_norm": 1.296774983406067, + "learning_rate": 4.6275e-05, + "loss": 0.4553, + "step": 9260 + }, + { + "epoch": 0.5185911076268339, + "grad_norm": 1.2768787145614624, + "learning_rate": 4.6280000000000004e-05, + "loss": 0.4789, + "step": 9261 + }, + { + "epoch": 0.5186471049389629, + "grad_norm": 1.1409201622009277, + "learning_rate": 4.6285e-05, + "loss": 0.4079, + "step": 9262 + }, + { + "epoch": 0.518703102251092, + "grad_norm": 1.0574982166290283, + "learning_rate": 4.629e-05, + "loss": 0.3735, + "step": 9263 + }, + { + "epoch": 0.518759099563221, + "grad_norm": 1.2961485385894775, + "learning_rate": 4.6294999999999996e-05, + "loss": 0.3911, + "step": 9264 + }, + { + "epoch": 0.51881509687535, + "grad_norm": 1.4814493656158447, + "learning_rate": 4.630000000000001e-05, + "loss": 0.3772, + "step": 9265 + }, + { + "epoch": 0.518871094187479, + "grad_norm": 1.281013011932373, + "learning_rate": 4.6305000000000005e-05, + "loss": 0.5341, + "step": 9266 + }, + { + "epoch": 0.518927091499608, + "grad_norm": 1.3805668354034424, + "learning_rate": 4.631e-05, + "loss": 0.4929, + "step": 9267 + }, + { + "epoch": 0.518983088811737, + "grad_norm": 1.4640450477600098, + "learning_rate": 4.6315e-05, + "loss": 0.4276, + "step": 9268 + }, + { + "epoch": 0.5190390861238661, + "grad_norm": 1.4518136978149414, + "learning_rate": 4.6320000000000004e-05, + "loss": 0.5306, + "step": 9269 + }, + 
{ + "epoch": 0.5190950834359951, + "grad_norm": 1.4797710180282593, + "learning_rate": 4.6325e-05, + "loss": 0.3544, + "step": 9270 + }, + { + "epoch": 0.5191510807481241, + "grad_norm": 1.166329026222229, + "learning_rate": 4.633e-05, + "loss": 0.3905, + "step": 9271 + }, + { + "epoch": 0.5192070780602531, + "grad_norm": 1.1806344985961914, + "learning_rate": 4.6335e-05, + "loss": 0.4603, + "step": 9272 + }, + { + "epoch": 0.5192630753723821, + "grad_norm": 1.1213579177856445, + "learning_rate": 4.634e-05, + "loss": 0.3816, + "step": 9273 + }, + { + "epoch": 0.5193190726845112, + "grad_norm": 1.24141263961792, + "learning_rate": 4.6345e-05, + "loss": 0.4185, + "step": 9274 + }, + { + "epoch": 0.5193750699966402, + "grad_norm": 1.2993707656860352, + "learning_rate": 4.635e-05, + "loss": 0.3636, + "step": 9275 + }, + { + "epoch": 0.5194310673087692, + "grad_norm": 1.1595044136047363, + "learning_rate": 4.6355000000000006e-05, + "loss": 0.3716, + "step": 9276 + }, + { + "epoch": 0.5194870646208982, + "grad_norm": 1.4520783424377441, + "learning_rate": 4.636e-05, + "loss": 0.4154, + "step": 9277 + }, + { + "epoch": 0.5195430619330272, + "grad_norm": 1.390235185623169, + "learning_rate": 4.6365e-05, + "loss": 0.5374, + "step": 9278 + }, + { + "epoch": 0.5195990592451563, + "grad_norm": 1.1509873867034912, + "learning_rate": 4.6370000000000005e-05, + "loss": 0.3998, + "step": 9279 + }, + { + "epoch": 0.5196550565572853, + "grad_norm": 1.089005947113037, + "learning_rate": 4.6375e-05, + "loss": 0.3992, + "step": 9280 + }, + { + "epoch": 0.5197110538694143, + "grad_norm": 1.2871884107589722, + "learning_rate": 4.638e-05, + "loss": 0.4491, + "step": 9281 + }, + { + "epoch": 0.5197670511815433, + "grad_norm": 1.0660498142242432, + "learning_rate": 4.6385000000000004e-05, + "loss": 0.3495, + "step": 9282 + }, + { + "epoch": 0.5198230484936723, + "grad_norm": 1.4733692407608032, + "learning_rate": 4.639e-05, + "loss": 0.388, + "step": 9283 + }, + { + "epoch": 
0.5198790458058014, + "grad_norm": 1.3952951431274414, + "learning_rate": 4.6395e-05, + "loss": 0.3995, + "step": 9284 + }, + { + "epoch": 0.5199350431179304, + "grad_norm": 1.2849518060684204, + "learning_rate": 4.64e-05, + "loss": 0.3945, + "step": 9285 + }, + { + "epoch": 0.5199910404300594, + "grad_norm": 1.2646955251693726, + "learning_rate": 4.640500000000001e-05, + "loss": 0.4605, + "step": 9286 + }, + { + "epoch": 0.5200470377421884, + "grad_norm": 1.6200830936431885, + "learning_rate": 4.6410000000000005e-05, + "loss": 0.5205, + "step": 9287 + }, + { + "epoch": 0.5201030350543174, + "grad_norm": 1.5152814388275146, + "learning_rate": 4.6415e-05, + "loss": 0.53, + "step": 9288 + }, + { + "epoch": 0.5201590323664464, + "grad_norm": 1.5220857858657837, + "learning_rate": 4.642e-05, + "loss": 0.6232, + "step": 9289 + }, + { + "epoch": 0.5202150296785755, + "grad_norm": 1.34884512424469, + "learning_rate": 4.6425000000000004e-05, + "loss": 0.4985, + "step": 9290 + }, + { + "epoch": 0.5202710269907045, + "grad_norm": 1.3182399272918701, + "learning_rate": 4.643e-05, + "loss": 0.5307, + "step": 9291 + }, + { + "epoch": 0.5203270243028335, + "grad_norm": 1.4727940559387207, + "learning_rate": 4.6435e-05, + "loss": 0.6653, + "step": 9292 + }, + { + "epoch": 0.5203830216149625, + "grad_norm": 1.233260989189148, + "learning_rate": 4.644e-05, + "loss": 0.3941, + "step": 9293 + }, + { + "epoch": 0.5204390189270915, + "grad_norm": 1.5735726356506348, + "learning_rate": 4.6445e-05, + "loss": 0.5239, + "step": 9294 + }, + { + "epoch": 0.5204950162392206, + "grad_norm": 1.2462122440338135, + "learning_rate": 4.6450000000000004e-05, + "loss": 0.4223, + "step": 9295 + }, + { + "epoch": 0.5205510135513496, + "grad_norm": 1.1779415607452393, + "learning_rate": 4.6455e-05, + "loss": 0.3389, + "step": 9296 + }, + { + "epoch": 0.5206070108634786, + "grad_norm": 6.715813636779785, + "learning_rate": 4.6460000000000006e-05, + "loss": 0.5612, + "step": 9297 + }, + { + "epoch": 
0.5206630081756076, + "grad_norm": 1.1489617824554443, + "learning_rate": 4.6465e-05, + "loss": 0.3704, + "step": 9298 + }, + { + "epoch": 0.5207190054877366, + "grad_norm": 1.129090666770935, + "learning_rate": 4.647e-05, + "loss": 0.4061, + "step": 9299 + }, + { + "epoch": 0.5207750027998657, + "grad_norm": 1.7248512506484985, + "learning_rate": 4.6475000000000005e-05, + "loss": 0.4675, + "step": 9300 + }, + { + "epoch": 0.5208310001119947, + "grad_norm": 1.4207942485809326, + "learning_rate": 4.648e-05, + "loss": 0.5119, + "step": 9301 + }, + { + "epoch": 0.5208869974241236, + "grad_norm": 1.1348084211349487, + "learning_rate": 4.6485e-05, + "loss": 0.3737, + "step": 9302 + }, + { + "epoch": 0.5209429947362526, + "grad_norm": 1.1250228881835938, + "learning_rate": 4.649e-05, + "loss": 0.4405, + "step": 9303 + }, + { + "epoch": 0.5209989920483816, + "grad_norm": 1.207688808441162, + "learning_rate": 4.6495e-05, + "loss": 0.3961, + "step": 9304 + }, + { + "epoch": 0.5210549893605106, + "grad_norm": 1.2484468221664429, + "learning_rate": 4.6500000000000005e-05, + "loss": 0.5452, + "step": 9305 + }, + { + "epoch": 0.5211109866726397, + "grad_norm": 1.426182508468628, + "learning_rate": 4.6505e-05, + "loss": 0.4543, + "step": 9306 + }, + { + "epoch": 0.5211669839847687, + "grad_norm": 1.3050537109375, + "learning_rate": 4.651e-05, + "loss": 0.3999, + "step": 9307 + }, + { + "epoch": 0.5212229812968977, + "grad_norm": 1.513106346130371, + "learning_rate": 4.6515000000000004e-05, + "loss": 0.5753, + "step": 9308 + }, + { + "epoch": 0.5212789786090267, + "grad_norm": 1.4483026266098022, + "learning_rate": 4.652e-05, + "loss": 0.4234, + "step": 9309 + }, + { + "epoch": 0.5213349759211557, + "grad_norm": 1.6815348863601685, + "learning_rate": 4.6525e-05, + "loss": 0.5076, + "step": 9310 + }, + { + "epoch": 0.5213909732332848, + "grad_norm": 3.2403481006622314, + "learning_rate": 4.6530000000000003e-05, + "loss": 0.4415, + "step": 9311 + }, + { + "epoch": 
0.5214469705454138, + "grad_norm": 0.9420170187950134, + "learning_rate": 4.6535e-05, + "loss": 0.4042, + "step": 9312 + }, + { + "epoch": 0.5215029678575428, + "grad_norm": 1.3481045961380005, + "learning_rate": 4.654e-05, + "loss": 0.513, + "step": 9313 + }, + { + "epoch": 0.5215589651696718, + "grad_norm": 1.4042633771896362, + "learning_rate": 4.6545e-05, + "loss": 0.4236, + "step": 9314 + }, + { + "epoch": 0.5216149624818008, + "grad_norm": 1.3333860635757446, + "learning_rate": 4.655000000000001e-05, + "loss": 0.3404, + "step": 9315 + }, + { + "epoch": 0.5216709597939299, + "grad_norm": 1.5369305610656738, + "learning_rate": 4.6555000000000004e-05, + "loss": 0.5177, + "step": 9316 + }, + { + "epoch": 0.5217269571060589, + "grad_norm": 1.1305073499679565, + "learning_rate": 4.656e-05, + "loss": 0.3775, + "step": 9317 + }, + { + "epoch": 0.5217829544181879, + "grad_norm": 1.3339060544967651, + "learning_rate": 4.6565000000000006e-05, + "loss": 0.4506, + "step": 9318 + }, + { + "epoch": 0.5218389517303169, + "grad_norm": 1.486646294593811, + "learning_rate": 4.657e-05, + "loss": 0.4526, + "step": 9319 + }, + { + "epoch": 0.5218949490424459, + "grad_norm": 2.0488927364349365, + "learning_rate": 4.6575e-05, + "loss": 0.3688, + "step": 9320 + }, + { + "epoch": 0.521950946354575, + "grad_norm": 1.4488757848739624, + "learning_rate": 4.6580000000000005e-05, + "loss": 0.5925, + "step": 9321 + }, + { + "epoch": 0.522006943666704, + "grad_norm": 1.2774953842163086, + "learning_rate": 4.6585e-05, + "loss": 0.4723, + "step": 9322 + }, + { + "epoch": 0.522062940978833, + "grad_norm": 1.2316725254058838, + "learning_rate": 4.659e-05, + "loss": 0.4808, + "step": 9323 + }, + { + "epoch": 0.522118938290962, + "grad_norm": 1.5660383701324463, + "learning_rate": 4.6595e-05, + "loss": 0.5014, + "step": 9324 + }, + { + "epoch": 0.522174935603091, + "grad_norm": 1.5444111824035645, + "learning_rate": 4.660000000000001e-05, + "loss": 0.5312, + "step": 9325 + }, + { + "epoch": 
0.52223093291522, + "grad_norm": 1.4226216077804565, + "learning_rate": 4.6605000000000005e-05, + "loss": 0.5107, + "step": 9326 + }, + { + "epoch": 0.5222869302273491, + "grad_norm": 1.1815792322158813, + "learning_rate": 4.661e-05, + "loss": 0.349, + "step": 9327 + }, + { + "epoch": 0.5223429275394781, + "grad_norm": 1.321407437324524, + "learning_rate": 4.6615e-05, + "loss": 0.5086, + "step": 9328 + }, + { + "epoch": 0.5223989248516071, + "grad_norm": 1.2084922790527344, + "learning_rate": 4.6620000000000004e-05, + "loss": 0.3924, + "step": 9329 + }, + { + "epoch": 0.5224549221637361, + "grad_norm": 1.177322506904602, + "learning_rate": 4.6625e-05, + "loss": 0.3934, + "step": 9330 + }, + { + "epoch": 0.5225109194758651, + "grad_norm": 1.176064133644104, + "learning_rate": 4.663e-05, + "loss": 0.4001, + "step": 9331 + }, + { + "epoch": 0.5225669167879942, + "grad_norm": 1.2922840118408203, + "learning_rate": 4.6635e-05, + "loss": 0.4604, + "step": 9332 + }, + { + "epoch": 0.5226229141001232, + "grad_norm": 1.0318429470062256, + "learning_rate": 4.664e-05, + "loss": 0.415, + "step": 9333 + }, + { + "epoch": 0.5226789114122522, + "grad_norm": 1.2632776498794556, + "learning_rate": 4.6645e-05, + "loss": 0.4401, + "step": 9334 + }, + { + "epoch": 0.5227349087243812, + "grad_norm": 1.361064076423645, + "learning_rate": 4.665e-05, + "loss": 0.4092, + "step": 9335 + }, + { + "epoch": 0.5227909060365102, + "grad_norm": 1.4519658088684082, + "learning_rate": 4.6655000000000006e-05, + "loss": 0.6166, + "step": 9336 + }, + { + "epoch": 0.5228469033486393, + "grad_norm": 1.2925876379013062, + "learning_rate": 4.6660000000000004e-05, + "loss": 0.4137, + "step": 9337 + }, + { + "epoch": 0.5229029006607683, + "grad_norm": 1.1427985429763794, + "learning_rate": 4.6665e-05, + "loss": 0.3611, + "step": 9338 + }, + { + "epoch": 0.5229588979728973, + "grad_norm": 1.4485900402069092, + "learning_rate": 4.6670000000000005e-05, + "loss": 0.3126, + "step": 9339 + }, + { + "epoch": 
0.5230148952850263, + "grad_norm": 1.188767671585083, + "learning_rate": 4.6675e-05, + "loss": 0.3825, + "step": 9340 + }, + { + "epoch": 0.5230708925971553, + "grad_norm": 1.5355230569839478, + "learning_rate": 4.668e-05, + "loss": 0.3954, + "step": 9341 + }, + { + "epoch": 0.5231268899092844, + "grad_norm": 1.3338741064071655, + "learning_rate": 4.6685e-05, + "loss": 0.4825, + "step": 9342 + }, + { + "epoch": 0.5231828872214134, + "grad_norm": 1.626522183418274, + "learning_rate": 4.669e-05, + "loss": 0.5366, + "step": 9343 + }, + { + "epoch": 0.5232388845335424, + "grad_norm": 1.5628392696380615, + "learning_rate": 4.6695e-05, + "loss": 0.5842, + "step": 9344 + }, + { + "epoch": 0.5232948818456714, + "grad_norm": 1.1358919143676758, + "learning_rate": 4.6700000000000003e-05, + "loss": 0.4047, + "step": 9345 + }, + { + "epoch": 0.5233508791578004, + "grad_norm": 1.2265276908874512, + "learning_rate": 4.670500000000001e-05, + "loss": 0.4204, + "step": 9346 + }, + { + "epoch": 0.5234068764699294, + "grad_norm": 1.4317282438278198, + "learning_rate": 4.6710000000000005e-05, + "loss": 0.4173, + "step": 9347 + }, + { + "epoch": 0.5234628737820585, + "grad_norm": 1.3782039880752563, + "learning_rate": 4.6715e-05, + "loss": 0.3343, + "step": 9348 + }, + { + "epoch": 0.5235188710941875, + "grad_norm": 1.3642542362213135, + "learning_rate": 4.672e-05, + "loss": 0.4608, + "step": 9349 + }, + { + "epoch": 0.5235748684063165, + "grad_norm": 1.1251015663146973, + "learning_rate": 4.6725000000000004e-05, + "loss": 0.3629, + "step": 9350 + }, + { + "epoch": 0.5236308657184455, + "grad_norm": 1.2117772102355957, + "learning_rate": 4.673e-05, + "loss": 0.4681, + "step": 9351 + }, + { + "epoch": 0.5236868630305745, + "grad_norm": 1.146168828010559, + "learning_rate": 4.6735e-05, + "loss": 0.4403, + "step": 9352 + }, + { + "epoch": 0.5237428603427036, + "grad_norm": 1.1908000707626343, + "learning_rate": 4.674e-05, + "loss": 0.3886, + "step": 9353 + }, + { + "epoch": 
0.5237988576548326, + "grad_norm": 1.557708740234375, + "learning_rate": 4.6745e-05, + "loss": 0.513, + "step": 9354 + }, + { + "epoch": 0.5238548549669616, + "grad_norm": 1.2665703296661377, + "learning_rate": 4.6750000000000005e-05, + "loss": 0.4626, + "step": 9355 + }, + { + "epoch": 0.5239108522790906, + "grad_norm": 1.1932997703552246, + "learning_rate": 4.6755e-05, + "loss": 0.4521, + "step": 9356 + }, + { + "epoch": 0.5239668495912196, + "grad_norm": 1.125240445137024, + "learning_rate": 4.6760000000000006e-05, + "loss": 0.3721, + "step": 9357 + }, + { + "epoch": 0.5240228469033487, + "grad_norm": 1.2674050331115723, + "learning_rate": 4.6765000000000004e-05, + "loss": 0.4335, + "step": 9358 + }, + { + "epoch": 0.5240788442154777, + "grad_norm": 1.3403898477554321, + "learning_rate": 4.677e-05, + "loss": 0.4734, + "step": 9359 + }, + { + "epoch": 0.5241348415276067, + "grad_norm": 1.3714861869812012, + "learning_rate": 4.6775000000000005e-05, + "loss": 0.424, + "step": 9360 + }, + { + "epoch": 0.5241908388397357, + "grad_norm": 1.10027277469635, + "learning_rate": 4.678e-05, + "loss": 0.3875, + "step": 9361 + }, + { + "epoch": 0.5242468361518647, + "grad_norm": 1.2574408054351807, + "learning_rate": 4.6785e-05, + "loss": 0.6071, + "step": 9362 + }, + { + "epoch": 0.5243028334639938, + "grad_norm": 1.5307801961898804, + "learning_rate": 4.679e-05, + "loss": 0.4204, + "step": 9363 + }, + { + "epoch": 0.5243588307761228, + "grad_norm": 1.489932656288147, + "learning_rate": 4.6795e-05, + "loss": 0.5373, + "step": 9364 + }, + { + "epoch": 0.5244148280882518, + "grad_norm": 1.3600836992263794, + "learning_rate": 4.6800000000000006e-05, + "loss": 0.4462, + "step": 9365 + }, + { + "epoch": 0.5244708254003808, + "grad_norm": 1.307382345199585, + "learning_rate": 4.6805e-05, + "loss": 0.3492, + "step": 9366 + }, + { + "epoch": 0.5245268227125098, + "grad_norm": 1.6123884916305542, + "learning_rate": 4.681e-05, + "loss": 0.5212, + "step": 9367 + }, + { + "epoch": 
0.5245828200246389, + "grad_norm": 1.2430503368377686, + "learning_rate": 4.6815000000000005e-05, + "loss": 0.4878, + "step": 9368 + }, + { + "epoch": 0.5246388173367679, + "grad_norm": 1.4058754444122314, + "learning_rate": 4.682e-05, + "loss": 0.4161, + "step": 9369 + }, + { + "epoch": 0.5246948146488969, + "grad_norm": 1.9148458242416382, + "learning_rate": 4.6825e-05, + "loss": 0.6418, + "step": 9370 + }, + { + "epoch": 0.5247508119610259, + "grad_norm": 1.1850706338882446, + "learning_rate": 4.6830000000000004e-05, + "loss": 0.4079, + "step": 9371 + }, + { + "epoch": 0.5248068092731549, + "grad_norm": 1.6921254396438599, + "learning_rate": 4.6835e-05, + "loss": 0.4794, + "step": 9372 + }, + { + "epoch": 0.524862806585284, + "grad_norm": 1.3645633459091187, + "learning_rate": 4.684e-05, + "loss": 0.3625, + "step": 9373 + }, + { + "epoch": 0.524918803897413, + "grad_norm": 1.3538650274276733, + "learning_rate": 4.6845e-05, + "loss": 0.4021, + "step": 9374 + }, + { + "epoch": 0.524974801209542, + "grad_norm": 1.3050851821899414, + "learning_rate": 4.685000000000001e-05, + "loss": 0.5553, + "step": 9375 + }, + { + "epoch": 0.525030798521671, + "grad_norm": 1.1158578395843506, + "learning_rate": 4.6855000000000005e-05, + "loss": 0.3695, + "step": 9376 + }, + { + "epoch": 0.5250867958338, + "grad_norm": 1.3095901012420654, + "learning_rate": 4.686e-05, + "loss": 0.5141, + "step": 9377 + }, + { + "epoch": 0.525142793145929, + "grad_norm": 1.19870126247406, + "learning_rate": 4.6865000000000006e-05, + "loss": 0.3386, + "step": 9378 + }, + { + "epoch": 0.5251987904580581, + "grad_norm": 1.4308778047561646, + "learning_rate": 4.6870000000000004e-05, + "loss": 0.4569, + "step": 9379 + }, + { + "epoch": 0.5252547877701871, + "grad_norm": 1.3367048501968384, + "learning_rate": 4.6875e-05, + "loss": 0.4415, + "step": 9380 + }, + { + "epoch": 0.5253107850823161, + "grad_norm": 1.186528205871582, + "learning_rate": 4.688e-05, + "loss": 0.5143, + "step": 9381 + }, + { + 
"epoch": 0.5253667823944451, + "grad_norm": 1.1099333763122559, + "learning_rate": 4.6885e-05, + "loss": 0.3806, + "step": 9382 + }, + { + "epoch": 0.5254227797065741, + "grad_norm": 1.295485019683838, + "learning_rate": 4.689e-05, + "loss": 0.4786, + "step": 9383 + }, + { + "epoch": 0.525478777018703, + "grad_norm": 1.3310902118682861, + "learning_rate": 4.6895e-05, + "loss": 0.4158, + "step": 9384 + }, + { + "epoch": 0.5255347743308321, + "grad_norm": 1.4873652458190918, + "learning_rate": 4.69e-05, + "loss": 0.4764, + "step": 9385 + }, + { + "epoch": 0.5255907716429611, + "grad_norm": 19.411251068115234, + "learning_rate": 4.6905000000000006e-05, + "loss": 0.5014, + "step": 9386 + }, + { + "epoch": 0.5256467689550901, + "grad_norm": 1.3132100105285645, + "learning_rate": 4.691e-05, + "loss": 0.6241, + "step": 9387 + }, + { + "epoch": 0.5257027662672191, + "grad_norm": 1.4633852243423462, + "learning_rate": 4.6915e-05, + "loss": 0.4219, + "step": 9388 + }, + { + "epoch": 0.5257587635793481, + "grad_norm": 1.215549111366272, + "learning_rate": 4.6920000000000005e-05, + "loss": 0.3689, + "step": 9389 + }, + { + "epoch": 0.5258147608914772, + "grad_norm": 1.663164496421814, + "learning_rate": 4.6925e-05, + "loss": 0.5895, + "step": 9390 + }, + { + "epoch": 0.5258707582036062, + "grad_norm": 1.1354742050170898, + "learning_rate": 4.693e-05, + "loss": 0.4579, + "step": 9391 + }, + { + "epoch": 0.5259267555157352, + "grad_norm": 1.1233223676681519, + "learning_rate": 4.6935000000000004e-05, + "loss": 0.5082, + "step": 9392 + }, + { + "epoch": 0.5259827528278642, + "grad_norm": 1.3693325519561768, + "learning_rate": 4.694e-05, + "loss": 0.4704, + "step": 9393 + }, + { + "epoch": 0.5260387501399932, + "grad_norm": 1.2200404405593872, + "learning_rate": 4.6945e-05, + "loss": 0.4775, + "step": 9394 + }, + { + "epoch": 0.5260947474521223, + "grad_norm": 1.23910653591156, + "learning_rate": 4.695e-05, + "loss": 0.4767, + "step": 9395 + }, + { + "epoch": 0.5261507447642513, + 
"grad_norm": 1.3311809301376343, + "learning_rate": 4.695500000000001e-05, + "loss": 0.2818, + "step": 9396 + }, + { + "epoch": 0.5262067420763803, + "grad_norm": 1.1943596601486206, + "learning_rate": 4.6960000000000004e-05, + "loss": 0.4084, + "step": 9397 + }, + { + "epoch": 0.5262627393885093, + "grad_norm": 1.1603953838348389, + "learning_rate": 4.6965e-05, + "loss": 0.3852, + "step": 9398 + }, + { + "epoch": 0.5263187367006383, + "grad_norm": 1.0466657876968384, + "learning_rate": 4.6970000000000006e-05, + "loss": 0.4014, + "step": 9399 + }, + { + "epoch": 0.5263747340127674, + "grad_norm": 1.563252568244934, + "learning_rate": 4.6975000000000003e-05, + "loss": 0.4187, + "step": 9400 + }, + { + "epoch": 0.5264307313248964, + "grad_norm": 1.361717700958252, + "learning_rate": 4.698e-05, + "loss": 0.454, + "step": 9401 + }, + { + "epoch": 0.5264867286370254, + "grad_norm": 1.0306333303451538, + "learning_rate": 4.6985e-05, + "loss": 0.3177, + "step": 9402 + }, + { + "epoch": 0.5265427259491544, + "grad_norm": 1.068896770477295, + "learning_rate": 4.699e-05, + "loss": 0.3457, + "step": 9403 + }, + { + "epoch": 0.5265987232612834, + "grad_norm": 1.3894723653793335, + "learning_rate": 4.6995e-05, + "loss": 0.5277, + "step": 9404 + }, + { + "epoch": 0.5266547205734124, + "grad_norm": 1.188368320465088, + "learning_rate": 4.7e-05, + "loss": 0.3323, + "step": 9405 + }, + { + "epoch": 0.5267107178855415, + "grad_norm": 1.2241084575653076, + "learning_rate": 4.7005e-05, + "loss": 0.4512, + "step": 9406 + }, + { + "epoch": 0.5267667151976705, + "grad_norm": 1.2160418033599854, + "learning_rate": 4.7010000000000006e-05, + "loss": 0.4451, + "step": 9407 + }, + { + "epoch": 0.5268227125097995, + "grad_norm": 1.332726001739502, + "learning_rate": 4.7015e-05, + "loss": 0.4397, + "step": 9408 + }, + { + "epoch": 0.5268787098219285, + "grad_norm": 1.3292813301086426, + "learning_rate": 4.702e-05, + "loss": 0.5394, + "step": 9409 + }, + { + "epoch": 0.5269347071340575, + 
"grad_norm": 1.5254113674163818, + "learning_rate": 4.7025000000000005e-05, + "loss": 0.4445, + "step": 9410 + }, + { + "epoch": 0.5269907044461866, + "grad_norm": 1.443175196647644, + "learning_rate": 4.703e-05, + "loss": 0.4907, + "step": 9411 + }, + { + "epoch": 0.5270467017583156, + "grad_norm": 1.3154882192611694, + "learning_rate": 4.7035e-05, + "loss": 0.3779, + "step": 9412 + }, + { + "epoch": 0.5271026990704446, + "grad_norm": 1.2727384567260742, + "learning_rate": 4.7040000000000004e-05, + "loss": 0.4134, + "step": 9413 + }, + { + "epoch": 0.5271586963825736, + "grad_norm": 1.4846564531326294, + "learning_rate": 4.7045e-05, + "loss": 0.4969, + "step": 9414 + }, + { + "epoch": 0.5272146936947026, + "grad_norm": 1.115434169769287, + "learning_rate": 4.705e-05, + "loss": 0.2774, + "step": 9415 + }, + { + "epoch": 0.5272706910068317, + "grad_norm": 1.1634632349014282, + "learning_rate": 4.7055e-05, + "loss": 0.3853, + "step": 9416 + }, + { + "epoch": 0.5273266883189607, + "grad_norm": 1.208876132965088, + "learning_rate": 4.706000000000001e-05, + "loss": 0.5363, + "step": 9417 + }, + { + "epoch": 0.5273826856310897, + "grad_norm": 1.4385795593261719, + "learning_rate": 4.7065000000000004e-05, + "loss": 0.4161, + "step": 9418 + }, + { + "epoch": 0.5274386829432187, + "grad_norm": 1.210424542427063, + "learning_rate": 4.707e-05, + "loss": 0.4713, + "step": 9419 + }, + { + "epoch": 0.5274946802553477, + "grad_norm": 1.1139848232269287, + "learning_rate": 4.7075e-05, + "loss": 0.451, + "step": 9420 + }, + { + "epoch": 0.5275506775674768, + "grad_norm": 1.3247337341308594, + "learning_rate": 4.708e-05, + "loss": 0.4168, + "step": 9421 + }, + { + "epoch": 0.5276066748796058, + "grad_norm": 1.511054515838623, + "learning_rate": 4.7085e-05, + "loss": 0.4623, + "step": 9422 + }, + { + "epoch": 0.5276626721917348, + "grad_norm": 2.5482265949249268, + "learning_rate": 4.709e-05, + "loss": 0.5358, + "step": 9423 + }, + { + "epoch": 0.5277186695038638, + "grad_norm": 
1.2414195537567139, + "learning_rate": 4.7095e-05, + "loss": 0.3446, + "step": 9424 + }, + { + "epoch": 0.5277746668159928, + "grad_norm": 1.1679651737213135, + "learning_rate": 4.71e-05, + "loss": 0.4133, + "step": 9425 + }, + { + "epoch": 0.5278306641281219, + "grad_norm": 1.3512089252471924, + "learning_rate": 4.7105000000000004e-05, + "loss": 0.5281, + "step": 9426 + }, + { + "epoch": 0.5278866614402509, + "grad_norm": 1.1578283309936523, + "learning_rate": 4.711e-05, + "loss": 0.3743, + "step": 9427 + }, + { + "epoch": 0.5279426587523799, + "grad_norm": 1.0728938579559326, + "learning_rate": 4.7115000000000005e-05, + "loss": 0.3666, + "step": 9428 + }, + { + "epoch": 0.5279986560645089, + "grad_norm": 1.27793288230896, + "learning_rate": 4.712e-05, + "loss": 0.4113, + "step": 9429 + }, + { + "epoch": 0.5280546533766379, + "grad_norm": 1.2008014917373657, + "learning_rate": 4.7125e-05, + "loss": 0.458, + "step": 9430 + }, + { + "epoch": 0.528110650688767, + "grad_norm": 1.2136887311935425, + "learning_rate": 4.7130000000000004e-05, + "loss": 0.382, + "step": 9431 + }, + { + "epoch": 0.528166648000896, + "grad_norm": 1.2503955364227295, + "learning_rate": 4.7135e-05, + "loss": 0.4685, + "step": 9432 + }, + { + "epoch": 0.528222645313025, + "grad_norm": 1.1596254110336304, + "learning_rate": 4.714e-05, + "loss": 0.4247, + "step": 9433 + }, + { + "epoch": 0.528278642625154, + "grad_norm": 1.1022257804870605, + "learning_rate": 4.7145000000000003e-05, + "loss": 0.379, + "step": 9434 + }, + { + "epoch": 0.528334639937283, + "grad_norm": 1.4563370943069458, + "learning_rate": 4.715e-05, + "loss": 0.4772, + "step": 9435 + }, + { + "epoch": 0.528390637249412, + "grad_norm": 1.2413662672042847, + "learning_rate": 4.7155000000000005e-05, + "loss": 0.3936, + "step": 9436 + }, + { + "epoch": 0.5284466345615411, + "grad_norm": 1.4025589227676392, + "learning_rate": 4.716e-05, + "loss": 0.4219, + "step": 9437 + }, + { + "epoch": 0.5285026318736701, + "grad_norm": 
1.4004168510437012, + "learning_rate": 4.716500000000001e-05, + "loss": 0.4298, + "step": 9438 + }, + { + "epoch": 0.5285586291857991, + "grad_norm": 1.2061214447021484, + "learning_rate": 4.7170000000000004e-05, + "loss": 0.4344, + "step": 9439 + }, + { + "epoch": 0.5286146264979281, + "grad_norm": 1.204013705253601, + "learning_rate": 4.7175e-05, + "loss": 0.389, + "step": 9440 + }, + { + "epoch": 0.5286706238100571, + "grad_norm": 1.3896828889846802, + "learning_rate": 4.718e-05, + "loss": 0.4706, + "step": 9441 + }, + { + "epoch": 0.5287266211221862, + "grad_norm": 1.1460342407226562, + "learning_rate": 4.7185e-05, + "loss": 0.5139, + "step": 9442 + }, + { + "epoch": 0.5287826184343152, + "grad_norm": 1.0229713916778564, + "learning_rate": 4.719e-05, + "loss": 0.353, + "step": 9443 + }, + { + "epoch": 0.5288386157464442, + "grad_norm": 1.3112131357192993, + "learning_rate": 4.7195e-05, + "loss": 0.6277, + "step": 9444 + }, + { + "epoch": 0.5288946130585732, + "grad_norm": 1.111806869506836, + "learning_rate": 4.72e-05, + "loss": 0.3495, + "step": 9445 + }, + { + "epoch": 0.5289506103707022, + "grad_norm": 1.380745768547058, + "learning_rate": 4.7205000000000006e-05, + "loss": 0.443, + "step": 9446 + }, + { + "epoch": 0.5290066076828313, + "grad_norm": 1.5349440574645996, + "learning_rate": 4.7210000000000004e-05, + "loss": 0.7024, + "step": 9447 + }, + { + "epoch": 0.5290626049949603, + "grad_norm": 1.9456593990325928, + "learning_rate": 4.7215e-05, + "loss": 0.6035, + "step": 9448 + }, + { + "epoch": 0.5291186023070893, + "grad_norm": 1.6643338203430176, + "learning_rate": 4.7220000000000005e-05, + "loss": 0.6041, + "step": 9449 + }, + { + "epoch": 0.5291745996192183, + "grad_norm": 1.0799925327301025, + "learning_rate": 4.7225e-05, + "loss": 0.4175, + "step": 9450 + }, + { + "epoch": 0.5292305969313473, + "grad_norm": 1.3846999406814575, + "learning_rate": 4.723e-05, + "loss": 0.4605, + "step": 9451 + }, + { + "epoch": 0.5292865942434763, + "grad_norm": 
30.342243194580078, + "learning_rate": 4.7235000000000004e-05, + "loss": 0.4169, + "step": 9452 + }, + { + "epoch": 0.5293425915556054, + "grad_norm": 1.0062071084976196, + "learning_rate": 4.724e-05, + "loss": 0.286, + "step": 9453 + }, + { + "epoch": 0.5293985888677344, + "grad_norm": 1.2299600839614868, + "learning_rate": 4.7245e-05, + "loss": 0.6064, + "step": 9454 + }, + { + "epoch": 0.5294545861798634, + "grad_norm": 1.2189158201217651, + "learning_rate": 4.7249999999999997e-05, + "loss": 0.4084, + "step": 9455 + }, + { + "epoch": 0.5295105834919924, + "grad_norm": 1.4640798568725586, + "learning_rate": 4.725500000000001e-05, + "loss": 0.4591, + "step": 9456 + }, + { + "epoch": 0.5295665808041214, + "grad_norm": 1.0949474573135376, + "learning_rate": 4.7260000000000005e-05, + "loss": 0.3065, + "step": 9457 + }, + { + "epoch": 0.5296225781162505, + "grad_norm": 1.0800518989562988, + "learning_rate": 4.7265e-05, + "loss": 0.3289, + "step": 9458 + }, + { + "epoch": 0.5296785754283795, + "grad_norm": 1.3044289350509644, + "learning_rate": 4.7270000000000007e-05, + "loss": 0.4441, + "step": 9459 + }, + { + "epoch": 0.5297345727405085, + "grad_norm": 1.2596040964126587, + "learning_rate": 4.7275000000000004e-05, + "loss": 0.5076, + "step": 9460 + }, + { + "epoch": 0.5297905700526375, + "grad_norm": 1.1207771301269531, + "learning_rate": 4.728e-05, + "loss": 0.4076, + "step": 9461 + }, + { + "epoch": 0.5298465673647665, + "grad_norm": 1.3882770538330078, + "learning_rate": 4.7285e-05, + "loss": 0.4243, + "step": 9462 + }, + { + "epoch": 0.5299025646768956, + "grad_norm": 2.043513298034668, + "learning_rate": 4.729e-05, + "loss": 0.5781, + "step": 9463 + }, + { + "epoch": 0.5299585619890246, + "grad_norm": 1.4725967645645142, + "learning_rate": 4.7295e-05, + "loss": 0.6341, + "step": 9464 + }, + { + "epoch": 0.5300145593011536, + "grad_norm": 1.2623803615570068, + "learning_rate": 4.73e-05, + "loss": 0.4125, + "step": 9465 + }, + { + "epoch": 0.5300705566132826, + 
"grad_norm": 1.5083774328231812, + "learning_rate": 4.7305e-05, + "loss": 0.5131, + "step": 9466 + }, + { + "epoch": 0.5301265539254115, + "grad_norm": 1.3852721452713013, + "learning_rate": 4.7310000000000006e-05, + "loss": 0.4347, + "step": 9467 + }, + { + "epoch": 0.5301825512375405, + "grad_norm": 1.2226790189743042, + "learning_rate": 4.7315000000000004e-05, + "loss": 0.4741, + "step": 9468 + }, + { + "epoch": 0.5302385485496696, + "grad_norm": 1.1415860652923584, + "learning_rate": 4.732e-05, + "loss": 0.4221, + "step": 9469 + }, + { + "epoch": 0.5302945458617986, + "grad_norm": 1.0505965948104858, + "learning_rate": 4.7325000000000005e-05, + "loss": 0.3267, + "step": 9470 + }, + { + "epoch": 0.5303505431739276, + "grad_norm": 1.618748664855957, + "learning_rate": 4.733e-05, + "loss": 0.4949, + "step": 9471 + }, + { + "epoch": 0.5304065404860566, + "grad_norm": 1.3280977010726929, + "learning_rate": 4.7335e-05, + "loss": 0.3981, + "step": 9472 + }, + { + "epoch": 0.5304625377981856, + "grad_norm": 1.3729336261749268, + "learning_rate": 4.7340000000000004e-05, + "loss": 0.4779, + "step": 9473 + }, + { + "epoch": 0.5305185351103147, + "grad_norm": 1.0747077465057373, + "learning_rate": 4.7345e-05, + "loss": 0.4052, + "step": 9474 + }, + { + "epoch": 0.5305745324224437, + "grad_norm": 1.391895055770874, + "learning_rate": 4.735e-05, + "loss": 0.4142, + "step": 9475 + }, + { + "epoch": 0.5306305297345727, + "grad_norm": 1.2232345342636108, + "learning_rate": 4.7355e-05, + "loss": 0.382, + "step": 9476 + }, + { + "epoch": 0.5306865270467017, + "grad_norm": 1.3728983402252197, + "learning_rate": 4.736000000000001e-05, + "loss": 0.6502, + "step": 9477 + }, + { + "epoch": 0.5307425243588307, + "grad_norm": 1.1662178039550781, + "learning_rate": 4.7365000000000005e-05, + "loss": 0.4822, + "step": 9478 + }, + { + "epoch": 0.5307985216709598, + "grad_norm": 1.171358346939087, + "learning_rate": 4.737e-05, + "loss": 0.4309, + "step": 9479 + }, + { + "epoch": 
0.5308545189830888, + "grad_norm": 5.396470069885254, + "learning_rate": 4.7375e-05, + "loss": 0.5371, + "step": 9480 + }, + { + "epoch": 0.5309105162952178, + "grad_norm": 1.3976140022277832, + "learning_rate": 4.7380000000000004e-05, + "loss": 0.3117, + "step": 9481 + }, + { + "epoch": 0.5309665136073468, + "grad_norm": 1.4935336112976074, + "learning_rate": 4.7385e-05, + "loss": 0.4498, + "step": 9482 + }, + { + "epoch": 0.5310225109194758, + "grad_norm": 1.3390616178512573, + "learning_rate": 4.739e-05, + "loss": 0.4706, + "step": 9483 + }, + { + "epoch": 0.5310785082316049, + "grad_norm": 1.7145739793777466, + "learning_rate": 4.7395e-05, + "loss": 0.4191, + "step": 9484 + }, + { + "epoch": 0.5311345055437339, + "grad_norm": 1.1888912916183472, + "learning_rate": 4.74e-05, + "loss": 0.3363, + "step": 9485 + }, + { + "epoch": 0.5311905028558629, + "grad_norm": 1.2296078205108643, + "learning_rate": 4.7405000000000004e-05, + "loss": 0.3789, + "step": 9486 + }, + { + "epoch": 0.5312465001679919, + "grad_norm": 1.4090124368667603, + "learning_rate": 4.741e-05, + "loss": 0.5327, + "step": 9487 + }, + { + "epoch": 0.5313024974801209, + "grad_norm": 1.6135203838348389, + "learning_rate": 4.7415000000000006e-05, + "loss": 0.5505, + "step": 9488 + }, + { + "epoch": 0.53135849479225, + "grad_norm": 1.1662812232971191, + "learning_rate": 4.742e-05, + "loss": 0.3032, + "step": 9489 + }, + { + "epoch": 0.531414492104379, + "grad_norm": 1.2016109228134155, + "learning_rate": 4.7425e-05, + "loss": 0.3916, + "step": 9490 + }, + { + "epoch": 0.531470489416508, + "grad_norm": 1.1974090337753296, + "learning_rate": 4.7430000000000005e-05, + "loss": 0.4503, + "step": 9491 + }, + { + "epoch": 0.531526486728637, + "grad_norm": 1.194698691368103, + "learning_rate": 4.7435e-05, + "loss": 0.3947, + "step": 9492 + }, + { + "epoch": 0.531582484040766, + "grad_norm": 1.080213189125061, + "learning_rate": 4.744e-05, + "loss": 0.4138, + "step": 9493 + }, + { + "epoch": 0.531638481352895, + 
"grad_norm": 1.5487487316131592, + "learning_rate": 4.7445e-05, + "loss": 0.398, + "step": 9494 + }, + { + "epoch": 0.5316944786650241, + "grad_norm": 1.090031623840332, + "learning_rate": 4.745e-05, + "loss": 0.3756, + "step": 9495 + }, + { + "epoch": 0.5317504759771531, + "grad_norm": 1.1094419956207275, + "learning_rate": 4.7455000000000006e-05, + "loss": 0.4849, + "step": 9496 + }, + { + "epoch": 0.5318064732892821, + "grad_norm": 1.4495092630386353, + "learning_rate": 4.746e-05, + "loss": 0.3952, + "step": 9497 + }, + { + "epoch": 0.5318624706014111, + "grad_norm": 1.535077452659607, + "learning_rate": 4.746500000000001e-05, + "loss": 0.429, + "step": 9498 + }, + { + "epoch": 0.5319184679135401, + "grad_norm": 1.8276748657226562, + "learning_rate": 4.7470000000000005e-05, + "loss": 0.4638, + "step": 9499 + }, + { + "epoch": 0.5319744652256692, + "grad_norm": 1.2453430891036987, + "learning_rate": 4.7475e-05, + "loss": 0.3779, + "step": 9500 + }, + { + "epoch": 0.5320304625377982, + "grad_norm": 1.2442799806594849, + "learning_rate": 4.748e-05, + "loss": 0.6226, + "step": 9501 + }, + { + "epoch": 0.5320864598499272, + "grad_norm": 1.3208845853805542, + "learning_rate": 4.7485000000000004e-05, + "loss": 0.3726, + "step": 9502 + }, + { + "epoch": 0.5321424571620562, + "grad_norm": 1.1866168975830078, + "learning_rate": 4.749e-05, + "loss": 0.3711, + "step": 9503 + }, + { + "epoch": 0.5321984544741852, + "grad_norm": 1.3956489562988281, + "learning_rate": 4.7495e-05, + "loss": 0.5071, + "step": 9504 + }, + { + "epoch": 0.5322544517863143, + "grad_norm": 1.4432622194290161, + "learning_rate": 4.75e-05, + "loss": 0.5588, + "step": 9505 + }, + { + "epoch": 0.5323104490984433, + "grad_norm": 1.4026999473571777, + "learning_rate": 4.7505e-05, + "loss": 0.4134, + "step": 9506 + }, + { + "epoch": 0.5323664464105723, + "grad_norm": 1.4475175142288208, + "learning_rate": 4.7510000000000004e-05, + "loss": 0.4246, + "step": 9507 + }, + { + "epoch": 0.5324224437227013, + 
"grad_norm": 1.3349000215530396, + "learning_rate": 4.7515e-05, + "loss": 0.2891, + "step": 9508 + }, + { + "epoch": 0.5324784410348303, + "grad_norm": 1.7620033025741577, + "learning_rate": 4.7520000000000006e-05, + "loss": 0.4458, + "step": 9509 + }, + { + "epoch": 0.5325344383469593, + "grad_norm": 1.14523184299469, + "learning_rate": 4.7525e-05, + "loss": 0.3744, + "step": 9510 + }, + { + "epoch": 0.5325904356590884, + "grad_norm": 1.699660301208496, + "learning_rate": 4.753e-05, + "loss": 0.5277, + "step": 9511 + }, + { + "epoch": 0.5326464329712174, + "grad_norm": 1.4605993032455444, + "learning_rate": 4.7535000000000005e-05, + "loss": 0.4717, + "step": 9512 + }, + { + "epoch": 0.5327024302833464, + "grad_norm": 1.1820509433746338, + "learning_rate": 4.754e-05, + "loss": 0.4485, + "step": 9513 + }, + { + "epoch": 0.5327584275954754, + "grad_norm": 1.1635390520095825, + "learning_rate": 4.7545e-05, + "loss": 0.3693, + "step": 9514 + }, + { + "epoch": 0.5328144249076044, + "grad_norm": 1.3283125162124634, + "learning_rate": 4.755e-05, + "loss": 0.3971, + "step": 9515 + }, + { + "epoch": 0.5328704222197335, + "grad_norm": 1.3234190940856934, + "learning_rate": 4.7555e-05, + "loss": 0.5261, + "step": 9516 + }, + { + "epoch": 0.5329264195318625, + "grad_norm": 1.2321648597717285, + "learning_rate": 4.7560000000000005e-05, + "loss": 0.5013, + "step": 9517 + }, + { + "epoch": 0.5329824168439915, + "grad_norm": 1.5043165683746338, + "learning_rate": 4.7565e-05, + "loss": 0.3885, + "step": 9518 + }, + { + "epoch": 0.5330384141561205, + "grad_norm": 1.2735203504562378, + "learning_rate": 4.757e-05, + "loss": 0.5187, + "step": 9519 + }, + { + "epoch": 0.5330944114682495, + "grad_norm": 1.0651447772979736, + "learning_rate": 4.7575000000000004e-05, + "loss": 0.3281, + "step": 9520 + }, + { + "epoch": 0.5331504087803786, + "grad_norm": 1.2711256742477417, + "learning_rate": 4.758e-05, + "loss": 0.365, + "step": 9521 + }, + { + "epoch": 0.5332064060925076, + "grad_norm": 
1.3065346479415894, + "learning_rate": 4.7585e-05, + "loss": 0.5437, + "step": 9522 + }, + { + "epoch": 0.5332624034046366, + "grad_norm": 1.5553252696990967, + "learning_rate": 4.7590000000000003e-05, + "loss": 0.5435, + "step": 9523 + }, + { + "epoch": 0.5333184007167656, + "grad_norm": 1.0811947584152222, + "learning_rate": 4.7595e-05, + "loss": 0.3668, + "step": 9524 + }, + { + "epoch": 0.5333743980288946, + "grad_norm": 1.2615207433700562, + "learning_rate": 4.76e-05, + "loss": 0.4445, + "step": 9525 + }, + { + "epoch": 0.5334303953410237, + "grad_norm": 1.282523512840271, + "learning_rate": 4.7605e-05, + "loss": 0.3629, + "step": 9526 + }, + { + "epoch": 0.5334863926531527, + "grad_norm": 1.3136628866195679, + "learning_rate": 4.761000000000001e-05, + "loss": 0.4487, + "step": 9527 + }, + { + "epoch": 0.5335423899652817, + "grad_norm": 1.2010968923568726, + "learning_rate": 4.7615000000000004e-05, + "loss": 0.3197, + "step": 9528 + }, + { + "epoch": 0.5335983872774107, + "grad_norm": 1.1536890268325806, + "learning_rate": 4.762e-05, + "loss": 0.4651, + "step": 9529 + }, + { + "epoch": 0.5336543845895397, + "grad_norm": 1.1152513027191162, + "learning_rate": 4.7625000000000006e-05, + "loss": 0.4346, + "step": 9530 + }, + { + "epoch": 0.5337103819016688, + "grad_norm": 1.399315595626831, + "learning_rate": 4.763e-05, + "loss": 0.4695, + "step": 9531 + }, + { + "epoch": 0.5337663792137978, + "grad_norm": 1.2495238780975342, + "learning_rate": 4.7635e-05, + "loss": 0.352, + "step": 9532 + }, + { + "epoch": 0.5338223765259268, + "grad_norm": 1.1976304054260254, + "learning_rate": 4.7640000000000005e-05, + "loss": 0.3945, + "step": 9533 + }, + { + "epoch": 0.5338783738380558, + "grad_norm": 1.330531120300293, + "learning_rate": 4.7645e-05, + "loss": 0.4243, + "step": 9534 + }, + { + "epoch": 0.5339343711501848, + "grad_norm": 1.3263925313949585, + "learning_rate": 4.765e-05, + "loss": 0.3887, + "step": 9535 + }, + { + "epoch": 0.5339903684623138, + "grad_norm": 
1.2777055501937866, + "learning_rate": 4.7655e-05, + "loss": 0.3793, + "step": 9536 + }, + { + "epoch": 0.5340463657744429, + "grad_norm": 1.4256744384765625, + "learning_rate": 4.766000000000001e-05, + "loss": 0.5506, + "step": 9537 + }, + { + "epoch": 0.5341023630865719, + "grad_norm": 1.3626021146774292, + "learning_rate": 4.7665000000000005e-05, + "loss": 0.4123, + "step": 9538 + }, + { + "epoch": 0.5341583603987009, + "grad_norm": 1.2359644174575806, + "learning_rate": 4.767e-05, + "loss": 0.4575, + "step": 9539 + }, + { + "epoch": 0.5342143577108299, + "grad_norm": 1.4144980907440186, + "learning_rate": 4.7675e-05, + "loss": 0.4501, + "step": 9540 + }, + { + "epoch": 0.5342703550229589, + "grad_norm": 1.4422568082809448, + "learning_rate": 4.7680000000000004e-05, + "loss": 0.581, + "step": 9541 + }, + { + "epoch": 0.534326352335088, + "grad_norm": Infinity, + "learning_rate": 4.7680000000000004e-05, + "loss": 0.4907, + "step": 9542 + }, + { + "epoch": 0.534382349647217, + "grad_norm": 1.2412611246109009, + "learning_rate": 4.7685e-05, + "loss": 0.4355, + "step": 9543 + }, + { + "epoch": 0.534438346959346, + "grad_norm": 1.3127073049545288, + "learning_rate": 4.769e-05, + "loss": 0.4543, + "step": 9544 + }, + { + "epoch": 0.534494344271475, + "grad_norm": 1.9356169700622559, + "learning_rate": 4.7695e-05, + "loss": 0.5243, + "step": 9545 + }, + { + "epoch": 0.534550341583604, + "grad_norm": 1.1254539489746094, + "learning_rate": 4.77e-05, + "loss": 0.4137, + "step": 9546 + }, + { + "epoch": 0.5346063388957331, + "grad_norm": 1.1302624940872192, + "learning_rate": 4.7705e-05, + "loss": 0.4327, + "step": 9547 + }, + { + "epoch": 0.5346623362078621, + "grad_norm": 1.411635160446167, + "learning_rate": 4.771e-05, + "loss": 0.5023, + "step": 9548 + }, + { + "epoch": 0.5347183335199911, + "grad_norm": 1.284804344177246, + "learning_rate": 4.7715000000000006e-05, + "loss": 0.4215, + "step": 9549 + }, + { + "epoch": 0.53477433083212, + "grad_norm": 1.2880300283432007, 
+ "learning_rate": 4.7720000000000004e-05, + "loss": 0.4165, + "step": 9550 + }, + { + "epoch": 0.534830328144249, + "grad_norm": 1.2266498804092407, + "learning_rate": 4.7725e-05, + "loss": 0.3902, + "step": 9551 + }, + { + "epoch": 0.534886325456378, + "grad_norm": 1.2976826429367065, + "learning_rate": 4.7730000000000005e-05, + "loss": 0.4641, + "step": 9552 + }, + { + "epoch": 0.5349423227685071, + "grad_norm": 1.2469403743743896, + "learning_rate": 4.7735e-05, + "loss": 0.3762, + "step": 9553 + }, + { + "epoch": 0.5349983200806361, + "grad_norm": 1.1859091520309448, + "learning_rate": 4.774e-05, + "loss": 0.4301, + "step": 9554 + }, + { + "epoch": 0.5350543173927651, + "grad_norm": 1.3102047443389893, + "learning_rate": 4.7745e-05, + "loss": 0.3257, + "step": 9555 + }, + { + "epoch": 0.5351103147048941, + "grad_norm": 1.27182936668396, + "learning_rate": 4.775e-05, + "loss": 0.3905, + "step": 9556 + }, + { + "epoch": 0.5351663120170231, + "grad_norm": 1.1418534517288208, + "learning_rate": 4.7755e-05, + "loss": 0.4676, + "step": 9557 + }, + { + "epoch": 0.5352223093291522, + "grad_norm": 1.368029236793518, + "learning_rate": 4.7760000000000004e-05, + "loss": 0.4567, + "step": 9558 + }, + { + "epoch": 0.5352783066412812, + "grad_norm": 1.408414602279663, + "learning_rate": 4.7765e-05, + "loss": 0.4692, + "step": 9559 + }, + { + "epoch": 0.5353343039534102, + "grad_norm": 1.2978746891021729, + "learning_rate": 4.7770000000000005e-05, + "loss": 0.418, + "step": 9560 + }, + { + "epoch": 0.5353903012655392, + "grad_norm": 1.1471731662750244, + "learning_rate": 4.7775e-05, + "loss": 0.4687, + "step": 9561 + }, + { + "epoch": 0.5354462985776682, + "grad_norm": 1.217063307762146, + "learning_rate": 4.778e-05, + "loss": 0.3974, + "step": 9562 + }, + { + "epoch": 0.5355022958897973, + "grad_norm": 1.1473416090011597, + "learning_rate": 4.7785000000000004e-05, + "loss": 0.4221, + "step": 9563 + }, + { + "epoch": 0.5355582932019263, + "grad_norm": 1.3304030895233154, + 
"learning_rate": 4.779e-05, + "loss": 0.4207, + "step": 9564 + }, + { + "epoch": 0.5356142905140553, + "grad_norm": 1.2993205785751343, + "learning_rate": 4.7795e-05, + "loss": 0.3556, + "step": 9565 + }, + { + "epoch": 0.5356702878261843, + "grad_norm": 1.4037895202636719, + "learning_rate": 4.78e-05, + "loss": 0.456, + "step": 9566 + }, + { + "epoch": 0.5357262851383133, + "grad_norm": 1.8493205308914185, + "learning_rate": 4.7805e-05, + "loss": 0.3586, + "step": 9567 + }, + { + "epoch": 0.5357822824504423, + "grad_norm": 1.2599310874938965, + "learning_rate": 4.7810000000000005e-05, + "loss": 0.4284, + "step": 9568 + }, + { + "epoch": 0.5358382797625714, + "grad_norm": 1.3864649534225464, + "learning_rate": 4.7815e-05, + "loss": 0.4289, + "step": 9569 + }, + { + "epoch": 0.5358942770747004, + "grad_norm": 1.3962305784225464, + "learning_rate": 4.7820000000000006e-05, + "loss": 0.4402, + "step": 9570 + }, + { + "epoch": 0.5359502743868294, + "grad_norm": 1.3587095737457275, + "learning_rate": 4.7825000000000004e-05, + "loss": 0.6073, + "step": 9571 + }, + { + "epoch": 0.5360062716989584, + "grad_norm": 1.2020891904830933, + "learning_rate": 4.783e-05, + "loss": 0.3073, + "step": 9572 + }, + { + "epoch": 0.5360622690110874, + "grad_norm": 1.3456652164459229, + "learning_rate": 4.7835000000000005e-05, + "loss": 0.5029, + "step": 9573 + }, + { + "epoch": 0.5361182663232165, + "grad_norm": 1.320943832397461, + "learning_rate": 4.784e-05, + "loss": 0.5947, + "step": 9574 + }, + { + "epoch": 0.5361742636353455, + "grad_norm": 1.4631785154342651, + "learning_rate": 4.7845e-05, + "loss": 0.388, + "step": 9575 + }, + { + "epoch": 0.5362302609474745, + "grad_norm": 1.2012794017791748, + "learning_rate": 4.785e-05, + "loss": 0.4939, + "step": 9576 + }, + { + "epoch": 0.5362862582596035, + "grad_norm": 1.5282450914382935, + "learning_rate": 4.7855e-05, + "loss": 0.4669, + "step": 9577 + }, + { + "epoch": 0.5363422555717325, + "grad_norm": 1.3773553371429443, + 
"learning_rate": 4.7860000000000006e-05, + "loss": 0.5134, + "step": 9578 + }, + { + "epoch": 0.5363982528838616, + "grad_norm": 1.3201590776443481, + "learning_rate": 4.7865e-05, + "loss": 0.3516, + "step": 9579 + }, + { + "epoch": 0.5364542501959906, + "grad_norm": 1.338982343673706, + "learning_rate": 4.787e-05, + "loss": 0.504, + "step": 9580 + }, + { + "epoch": 0.5365102475081196, + "grad_norm": 0.941386342048645, + "learning_rate": 4.7875000000000005e-05, + "loss": 0.2867, + "step": 9581 + }, + { + "epoch": 0.5365662448202486, + "grad_norm": 1.6042498350143433, + "learning_rate": 4.788e-05, + "loss": 0.4469, + "step": 9582 + }, + { + "epoch": 0.5366222421323776, + "grad_norm": 1.460040807723999, + "learning_rate": 4.7885e-05, + "loss": 0.4622, + "step": 9583 + }, + { + "epoch": 0.5366782394445067, + "grad_norm": 1.2942413091659546, + "learning_rate": 4.7890000000000004e-05, + "loss": 0.3773, + "step": 9584 + }, + { + "epoch": 0.5367342367566357, + "grad_norm": 1.3352640867233276, + "learning_rate": 4.7895e-05, + "loss": 0.4208, + "step": 9585 + }, + { + "epoch": 0.5367902340687647, + "grad_norm": 1.1913381814956665, + "learning_rate": 4.79e-05, + "loss": 0.4136, + "step": 9586 + }, + { + "epoch": 0.5368462313808937, + "grad_norm": 1.2104109525680542, + "learning_rate": 4.7905e-05, + "loss": 0.3568, + "step": 9587 + }, + { + "epoch": 0.5369022286930227, + "grad_norm": 0.90604168176651, + "learning_rate": 4.791000000000001e-05, + "loss": 0.3124, + "step": 9588 + }, + { + "epoch": 0.5369582260051518, + "grad_norm": 1.3224369287490845, + "learning_rate": 4.7915000000000005e-05, + "loss": 0.4234, + "step": 9589 + }, + { + "epoch": 0.5370142233172808, + "grad_norm": 1.2113516330718994, + "learning_rate": 4.792e-05, + "loss": 0.4572, + "step": 9590 + }, + { + "epoch": 0.5370702206294098, + "grad_norm": 1.168519139289856, + "learning_rate": 4.7925000000000006e-05, + "loss": 0.4055, + "step": 9591 + }, + { + "epoch": 0.5371262179415388, + "grad_norm": 
1.1018953323364258, + "learning_rate": 4.7930000000000004e-05, + "loss": 0.4418, + "step": 9592 + }, + { + "epoch": 0.5371822152536678, + "grad_norm": 1.2094017267227173, + "learning_rate": 4.7935e-05, + "loss": 0.3992, + "step": 9593 + }, + { + "epoch": 0.5372382125657968, + "grad_norm": 1.0364420413970947, + "learning_rate": 4.794e-05, + "loss": 0.3968, + "step": 9594 + }, + { + "epoch": 0.5372942098779259, + "grad_norm": 1.2871984243392944, + "learning_rate": 4.7945e-05, + "loss": 0.4747, + "step": 9595 + }, + { + "epoch": 0.5373502071900549, + "grad_norm": 1.1016348600387573, + "learning_rate": 4.795e-05, + "loss": 0.3754, + "step": 9596 + }, + { + "epoch": 0.5374062045021839, + "grad_norm": 1.1769936084747314, + "learning_rate": 4.7955e-05, + "loss": 0.3593, + "step": 9597 + }, + { + "epoch": 0.5374622018143129, + "grad_norm": 1.468197226524353, + "learning_rate": 4.796e-05, + "loss": 0.4162, + "step": 9598 + }, + { + "epoch": 0.5375181991264419, + "grad_norm": 1.1717861890792847, + "learning_rate": 4.7965000000000006e-05, + "loss": 0.3874, + "step": 9599 + }, + { + "epoch": 0.537574196438571, + "grad_norm": 1.3909096717834473, + "learning_rate": 4.797e-05, + "loss": 0.4744, + "step": 9600 + }, + { + "epoch": 0.5376301937507, + "grad_norm": 1.302345871925354, + "learning_rate": 4.7975e-05, + "loss": 0.5447, + "step": 9601 + }, + { + "epoch": 0.537686191062829, + "grad_norm": 1.271485447883606, + "learning_rate": 4.7980000000000005e-05, + "loss": 0.3747, + "step": 9602 + }, + { + "epoch": 0.537742188374958, + "grad_norm": 1.1500025987625122, + "learning_rate": 4.7985e-05, + "loss": 0.3789, + "step": 9603 + }, + { + "epoch": 0.537798185687087, + "grad_norm": 1.4302514791488647, + "learning_rate": 4.799e-05, + "loss": 0.5329, + "step": 9604 + }, + { + "epoch": 0.5378541829992161, + "grad_norm": 1.191972017288208, + "learning_rate": 4.7995000000000004e-05, + "loss": 0.4231, + "step": 9605 + }, + { + "epoch": 0.5379101803113451, + "grad_norm": 1.1573739051818848, + 
"learning_rate": 4.8e-05, + "loss": 0.3268, + "step": 9606 + }, + { + "epoch": 0.5379661776234741, + "grad_norm": 1.227290391921997, + "learning_rate": 4.8005e-05, + "loss": 0.456, + "step": 9607 + }, + { + "epoch": 0.5380221749356031, + "grad_norm": 8.732412338256836, + "learning_rate": 4.801e-05, + "loss": 0.4961, + "step": 9608 + }, + { + "epoch": 0.5380781722477321, + "grad_norm": 1.2821083068847656, + "learning_rate": 4.801500000000001e-05, + "loss": 0.5198, + "step": 9609 + }, + { + "epoch": 0.5381341695598612, + "grad_norm": 1.3780461549758911, + "learning_rate": 4.8020000000000004e-05, + "loss": 0.4525, + "step": 9610 + }, + { + "epoch": 0.5381901668719902, + "grad_norm": 1.4324729442596436, + "learning_rate": 4.8025e-05, + "loss": 0.5719, + "step": 9611 + }, + { + "epoch": 0.5382461641841192, + "grad_norm": 1.385636568069458, + "learning_rate": 4.8030000000000006e-05, + "loss": 0.5452, + "step": 9612 + }, + { + "epoch": 0.5383021614962482, + "grad_norm": 1.4018421173095703, + "learning_rate": 4.8035000000000003e-05, + "loss": 0.4338, + "step": 9613 + }, + { + "epoch": 0.5383581588083772, + "grad_norm": 1.3203582763671875, + "learning_rate": 4.804e-05, + "loss": 0.3865, + "step": 9614 + }, + { + "epoch": 0.5384141561205062, + "grad_norm": 1.5596752166748047, + "learning_rate": 4.8045e-05, + "loss": 0.5408, + "step": 9615 + }, + { + "epoch": 0.5384701534326353, + "grad_norm": 1.4226065874099731, + "learning_rate": 4.805e-05, + "loss": 0.4455, + "step": 9616 + }, + { + "epoch": 0.5385261507447643, + "grad_norm": 1.3288135528564453, + "learning_rate": 4.8055e-05, + "loss": 0.4446, + "step": 9617 + }, + { + "epoch": 0.5385821480568933, + "grad_norm": 1.5903502702713013, + "learning_rate": 4.8060000000000004e-05, + "loss": 0.4474, + "step": 9618 + }, + { + "epoch": 0.5386381453690223, + "grad_norm": 1.1605653762817383, + "learning_rate": 4.8065e-05, + "loss": 0.4431, + "step": 9619 + }, + { + "epoch": 0.5386941426811513, + "grad_norm": 1.0023702383041382, + 
"learning_rate": 4.8070000000000006e-05, + "loss": 0.4153, + "step": 9620 + }, + { + "epoch": 0.5387501399932804, + "grad_norm": 1.7927113771438599, + "learning_rate": 4.8075e-05, + "loss": 0.5195, + "step": 9621 + }, + { + "epoch": 0.5388061373054094, + "grad_norm": 1.159888744354248, + "learning_rate": 4.808e-05, + "loss": 0.4383, + "step": 9622 + }, + { + "epoch": 0.5388621346175384, + "grad_norm": 1.092410922050476, + "learning_rate": 4.8085000000000005e-05, + "loss": 0.4009, + "step": 9623 + }, + { + "epoch": 0.5389181319296674, + "grad_norm": 1.4227818250656128, + "learning_rate": 4.809e-05, + "loss": 0.5175, + "step": 9624 + }, + { + "epoch": 0.5389741292417964, + "grad_norm": 1.3793169260025024, + "learning_rate": 4.8095e-05, + "loss": 0.5603, + "step": 9625 + }, + { + "epoch": 0.5390301265539255, + "grad_norm": 1.1705825328826904, + "learning_rate": 4.8100000000000004e-05, + "loss": 0.4039, + "step": 9626 + }, + { + "epoch": 0.5390861238660545, + "grad_norm": 1.0983772277832031, + "learning_rate": 4.8105e-05, + "loss": 0.4904, + "step": 9627 + }, + { + "epoch": 0.5391421211781835, + "grad_norm": 1.1396361589431763, + "learning_rate": 4.8110000000000005e-05, + "loss": 0.5152, + "step": 9628 + }, + { + "epoch": 0.5391981184903125, + "grad_norm": 1.3853065967559814, + "learning_rate": 4.8115e-05, + "loss": 0.3574, + "step": 9629 + }, + { + "epoch": 0.5392541158024415, + "grad_norm": 1.1867951154708862, + "learning_rate": 4.812000000000001e-05, + "loss": 0.3441, + "step": 9630 + }, + { + "epoch": 0.5393101131145706, + "grad_norm": 1.5658626556396484, + "learning_rate": 4.8125000000000004e-05, + "loss": 0.596, + "step": 9631 + }, + { + "epoch": 0.5393661104266995, + "grad_norm": 1.3408464193344116, + "learning_rate": 4.813e-05, + "loss": 0.3396, + "step": 9632 + }, + { + "epoch": 0.5394221077388285, + "grad_norm": 1.2120468616485596, + "learning_rate": 4.8135e-05, + "loss": 0.447, + "step": 9633 + }, + { + "epoch": 0.5394781050509575, + "grad_norm": 
1.206899642944336, + "learning_rate": 4.814e-05, + "loss": 0.4764, + "step": 9634 + }, + { + "epoch": 0.5395341023630865, + "grad_norm": 1.2194875478744507, + "learning_rate": 4.8145e-05, + "loss": 0.4007, + "step": 9635 + }, + { + "epoch": 0.5395900996752155, + "grad_norm": 1.3562273979187012, + "learning_rate": 4.815e-05, + "loss": 0.5112, + "step": 9636 + }, + { + "epoch": 0.5396460969873446, + "grad_norm": 4.3270063400268555, + "learning_rate": 4.8155e-05, + "loss": 0.3975, + "step": 9637 + }, + { + "epoch": 0.5397020942994736, + "grad_norm": 1.493651270866394, + "learning_rate": 4.816e-05, + "loss": 0.3844, + "step": 9638 + }, + { + "epoch": 0.5397580916116026, + "grad_norm": 1.3843554258346558, + "learning_rate": 4.8165000000000004e-05, + "loss": 0.418, + "step": 9639 + }, + { + "epoch": 0.5398140889237316, + "grad_norm": 1.1690887212753296, + "learning_rate": 4.817e-05, + "loss": 0.3975, + "step": 9640 + }, + { + "epoch": 0.5398700862358606, + "grad_norm": 1.0352396965026855, + "learning_rate": 4.8175000000000005e-05, + "loss": 0.3196, + "step": 9641 + }, + { + "epoch": 0.5399260835479897, + "grad_norm": 1.4064724445343018, + "learning_rate": 4.818e-05, + "loss": 0.3739, + "step": 9642 + }, + { + "epoch": 0.5399820808601187, + "grad_norm": 1.2507612705230713, + "learning_rate": 4.8185e-05, + "loss": 0.3976, + "step": 9643 + }, + { + "epoch": 0.5400380781722477, + "grad_norm": 1.3313459157943726, + "learning_rate": 4.8190000000000004e-05, + "loss": 0.469, + "step": 9644 + }, + { + "epoch": 0.5400940754843767, + "grad_norm": 1.3563543558120728, + "learning_rate": 4.8195e-05, + "loss": 0.4279, + "step": 9645 + }, + { + "epoch": 0.5401500727965057, + "grad_norm": 1.2791963815689087, + "learning_rate": 4.82e-05, + "loss": 0.4305, + "step": 9646 + }, + { + "epoch": 0.5402060701086348, + "grad_norm": 1.1706186532974243, + "learning_rate": 4.8205000000000003e-05, + "loss": 0.3308, + "step": 9647 + }, + { + "epoch": 0.5402620674207638, + "grad_norm": 
1.3401410579681396, + "learning_rate": 4.821e-05, + "loss": 0.4504, + "step": 9648 + }, + { + "epoch": 0.5403180647328928, + "grad_norm": 1.125596523284912, + "learning_rate": 4.8215000000000005e-05, + "loss": 0.4313, + "step": 9649 + }, + { + "epoch": 0.5403740620450218, + "grad_norm": 1.2922368049621582, + "learning_rate": 4.822e-05, + "loss": 0.446, + "step": 9650 + }, + { + "epoch": 0.5404300593571508, + "grad_norm": 1.2992316484451294, + "learning_rate": 4.822500000000001e-05, + "loss": 0.4018, + "step": 9651 + }, + { + "epoch": 0.5404860566692798, + "grad_norm": 1.452916145324707, + "learning_rate": 4.8230000000000004e-05, + "loss": 0.4483, + "step": 9652 + }, + { + "epoch": 0.5405420539814089, + "grad_norm": 1.273618221282959, + "learning_rate": 4.8235e-05, + "loss": 0.4828, + "step": 9653 + }, + { + "epoch": 0.5405980512935379, + "grad_norm": 1.3946022987365723, + "learning_rate": 4.824e-05, + "loss": 0.4167, + "step": 9654 + }, + { + "epoch": 0.5406540486056669, + "grad_norm": 1.1924132108688354, + "learning_rate": 4.8245e-05, + "loss": 0.4438, + "step": 9655 + }, + { + "epoch": 0.5407100459177959, + "grad_norm": 1.2493716478347778, + "learning_rate": 4.825e-05, + "loss": 0.3493, + "step": 9656 + }, + { + "epoch": 0.5407660432299249, + "grad_norm": 1.4506077766418457, + "learning_rate": 4.8255e-05, + "loss": 0.4415, + "step": 9657 + }, + { + "epoch": 0.540822040542054, + "grad_norm": 1.2077593803405762, + "learning_rate": 4.826e-05, + "loss": 0.3707, + "step": 9658 + }, + { + "epoch": 0.540878037854183, + "grad_norm": 1.5090138912200928, + "learning_rate": 4.8265000000000006e-05, + "loss": 0.4473, + "step": 9659 + }, + { + "epoch": 0.540934035166312, + "grad_norm": 1.2844105958938599, + "learning_rate": 4.8270000000000004e-05, + "loss": 0.5055, + "step": 9660 + }, + { + "epoch": 0.540990032478441, + "grad_norm": 1.228994607925415, + "learning_rate": 4.8275e-05, + "loss": 0.4315, + "step": 9661 + }, + { + "epoch": 0.54104602979057, + "grad_norm": 
1.5682957172393799, + "learning_rate": 4.8280000000000005e-05, + "loss": 0.4486, + "step": 9662 + }, + { + "epoch": 0.5411020271026991, + "grad_norm": 1.6749250888824463, + "learning_rate": 4.8285e-05, + "loss": 0.5563, + "step": 9663 + }, + { + "epoch": 0.5411580244148281, + "grad_norm": 1.1559009552001953, + "learning_rate": 4.829e-05, + "loss": 0.4678, + "step": 9664 + }, + { + "epoch": 0.5412140217269571, + "grad_norm": 1.2349425554275513, + "learning_rate": 4.8295000000000004e-05, + "loss": 0.3568, + "step": 9665 + }, + { + "epoch": 0.5412700190390861, + "grad_norm": 1.2909871339797974, + "learning_rate": 4.83e-05, + "loss": 0.4112, + "step": 9666 + }, + { + "epoch": 0.5413260163512151, + "grad_norm": 1.3625513315200806, + "learning_rate": 4.8305e-05, + "loss": 0.335, + "step": 9667 + }, + { + "epoch": 0.5413820136633442, + "grad_norm": 1.8493263721466064, + "learning_rate": 4.8309999999999997e-05, + "loss": 0.5126, + "step": 9668 + }, + { + "epoch": 0.5414380109754732, + "grad_norm": 1.6567726135253906, + "learning_rate": 4.831500000000001e-05, + "loss": 0.486, + "step": 9669 + }, + { + "epoch": 0.5414940082876022, + "grad_norm": 1.2510757446289062, + "learning_rate": 4.8320000000000005e-05, + "loss": 0.362, + "step": 9670 + }, + { + "epoch": 0.5415500055997312, + "grad_norm": 1.2585492134094238, + "learning_rate": 4.8325e-05, + "loss": 0.4178, + "step": 9671 + }, + { + "epoch": 0.5416060029118602, + "grad_norm": 1.2923367023468018, + "learning_rate": 4.833e-05, + "loss": 0.4974, + "step": 9672 + }, + { + "epoch": 0.5416620002239892, + "grad_norm": 1.4480977058410645, + "learning_rate": 4.8335000000000004e-05, + "loss": 0.4164, + "step": 9673 + }, + { + "epoch": 0.5417179975361183, + "grad_norm": 1.1718289852142334, + "learning_rate": 4.834e-05, + "loss": 0.3666, + "step": 9674 + }, + { + "epoch": 0.5417739948482473, + "grad_norm": 1.2877403497695923, + "learning_rate": 4.8345e-05, + "loss": 0.4637, + "step": 9675 + }, + { + "epoch": 0.5418299921603763, + 
"grad_norm": 1.0981522798538208, + "learning_rate": 4.835e-05, + "loss": 0.3544, + "step": 9676 + }, + { + "epoch": 0.5418859894725053, + "grad_norm": 1.439459204673767, + "learning_rate": 4.8355e-05, + "loss": 0.4665, + "step": 9677 + }, + { + "epoch": 0.5419419867846343, + "grad_norm": 1.380591869354248, + "learning_rate": 4.836e-05, + "loss": 0.3931, + "step": 9678 + }, + { + "epoch": 0.5419979840967634, + "grad_norm": 1.3844058513641357, + "learning_rate": 4.8365e-05, + "loss": 0.3871, + "step": 9679 + }, + { + "epoch": 0.5420539814088924, + "grad_norm": 1.4423999786376953, + "learning_rate": 4.8370000000000006e-05, + "loss": 0.5286, + "step": 9680 + }, + { + "epoch": 0.5421099787210214, + "grad_norm": 1.132737159729004, + "learning_rate": 4.8375000000000004e-05, + "loss": 0.4016, + "step": 9681 + }, + { + "epoch": 0.5421659760331504, + "grad_norm": 1.215610384941101, + "learning_rate": 4.838e-05, + "loss": 0.3603, + "step": 9682 + }, + { + "epoch": 0.5422219733452794, + "grad_norm": 1.3401881456375122, + "learning_rate": 4.8385000000000005e-05, + "loss": 0.5025, + "step": 9683 + }, + { + "epoch": 0.5422779706574085, + "grad_norm": 1.0976073741912842, + "learning_rate": 4.839e-05, + "loss": 0.3805, + "step": 9684 + }, + { + "epoch": 0.5423339679695375, + "grad_norm": 1.5351399183273315, + "learning_rate": 4.8395e-05, + "loss": 0.5663, + "step": 9685 + }, + { + "epoch": 0.5423899652816665, + "grad_norm": 1.4356317520141602, + "learning_rate": 4.8400000000000004e-05, + "loss": 0.4673, + "step": 9686 + }, + { + "epoch": 0.5424459625937955, + "grad_norm": 1.343070387840271, + "learning_rate": 4.8405e-05, + "loss": 0.3795, + "step": 9687 + }, + { + "epoch": 0.5425019599059245, + "grad_norm": 1.2418947219848633, + "learning_rate": 4.841e-05, + "loss": 0.3363, + "step": 9688 + }, + { + "epoch": 0.5425579572180536, + "grad_norm": 1.2552831172943115, + "learning_rate": 4.8415e-05, + "loss": 0.4219, + "step": 9689 + }, + { + "epoch": 0.5426139545301826, + "grad_norm": 
1.0641114711761475, + "learning_rate": 4.842000000000001e-05, + "loss": 0.425, + "step": 9690 + }, + { + "epoch": 0.5426699518423116, + "grad_norm": 1.29921293258667, + "learning_rate": 4.8425000000000005e-05, + "loss": 0.4904, + "step": 9691 + }, + { + "epoch": 0.5427259491544406, + "grad_norm": 1.0799882411956787, + "learning_rate": 4.843e-05, + "loss": 0.4693, + "step": 9692 + }, + { + "epoch": 0.5427819464665696, + "grad_norm": 1.4437817335128784, + "learning_rate": 4.8435e-05, + "loss": 0.5803, + "step": 9693 + }, + { + "epoch": 0.5428379437786987, + "grad_norm": 1.2713543176651, + "learning_rate": 4.8440000000000004e-05, + "loss": 0.3694, + "step": 9694 + }, + { + "epoch": 0.5428939410908277, + "grad_norm": 1.7143093347549438, + "learning_rate": 4.8445e-05, + "loss": 0.6177, + "step": 9695 + }, + { + "epoch": 0.5429499384029567, + "grad_norm": 1.1647511720657349, + "learning_rate": 4.845e-05, + "loss": 0.3676, + "step": 9696 + }, + { + "epoch": 0.5430059357150857, + "grad_norm": 1.4182149171829224, + "learning_rate": 4.8455e-05, + "loss": 0.3801, + "step": 9697 + }, + { + "epoch": 0.5430619330272147, + "grad_norm": 1.255975604057312, + "learning_rate": 4.846e-05, + "loss": 0.3259, + "step": 9698 + }, + { + "epoch": 0.5431179303393437, + "grad_norm": 1.003980040550232, + "learning_rate": 4.8465000000000004e-05, + "loss": 0.3206, + "step": 9699 + }, + { + "epoch": 0.5431739276514728, + "grad_norm": 1.301956057548523, + "learning_rate": 4.847e-05, + "loss": 0.368, + "step": 9700 + }, + { + "epoch": 0.5432299249636018, + "grad_norm": 1.4601362943649292, + "learning_rate": 4.8475000000000006e-05, + "loss": 0.4239, + "step": 9701 + }, + { + "epoch": 0.5432859222757308, + "grad_norm": 8.5525541305542, + "learning_rate": 4.8480000000000003e-05, + "loss": 0.4612, + "step": 9702 + }, + { + "epoch": 0.5433419195878598, + "grad_norm": 1.5122413635253906, + "learning_rate": 4.8485e-05, + "loss": 0.6119, + "step": 9703 + }, + { + "epoch": 0.5433979168999888, + "grad_norm": 
1.0617645978927612, + "learning_rate": 4.8490000000000005e-05, + "loss": 0.345, + "step": 9704 + }, + { + "epoch": 0.5434539142121179, + "grad_norm": 1.254473328590393, + "learning_rate": 4.8495e-05, + "loss": 0.6635, + "step": 9705 + }, + { + "epoch": 0.5435099115242469, + "grad_norm": 0.9798499345779419, + "learning_rate": 4.85e-05, + "loss": 0.3501, + "step": 9706 + }, + { + "epoch": 0.5435659088363759, + "grad_norm": 1.405495285987854, + "learning_rate": 4.8505e-05, + "loss": 0.4159, + "step": 9707 + }, + { + "epoch": 0.5436219061485049, + "grad_norm": 1.1959142684936523, + "learning_rate": 4.851e-05, + "loss": 0.5036, + "step": 9708 + }, + { + "epoch": 0.5436779034606339, + "grad_norm": 1.6853902339935303, + "learning_rate": 4.8515000000000006e-05, + "loss": 0.4186, + "step": 9709 + }, + { + "epoch": 0.543733900772763, + "grad_norm": 1.2730882167816162, + "learning_rate": 4.852e-05, + "loss": 0.419, + "step": 9710 + }, + { + "epoch": 0.543789898084892, + "grad_norm": 1.2109779119491577, + "learning_rate": 4.8525e-05, + "loss": 0.4926, + "step": 9711 + }, + { + "epoch": 0.543845895397021, + "grad_norm": 1.1662299633026123, + "learning_rate": 4.8530000000000005e-05, + "loss": 0.4484, + "step": 9712 + }, + { + "epoch": 0.54390189270915, + "grad_norm": 1.0187838077545166, + "learning_rate": 4.8535e-05, + "loss": 0.306, + "step": 9713 + }, + { + "epoch": 0.543957890021279, + "grad_norm": 1.3637886047363281, + "learning_rate": 4.854e-05, + "loss": 0.4984, + "step": 9714 + }, + { + "epoch": 0.5440138873334079, + "grad_norm": 1.4112552404403687, + "learning_rate": 4.8545000000000004e-05, + "loss": 0.4013, + "step": 9715 + }, + { + "epoch": 0.544069884645537, + "grad_norm": 1.2921721935272217, + "learning_rate": 4.855e-05, + "loss": 0.512, + "step": 9716 + }, + { + "epoch": 0.544125881957666, + "grad_norm": 1.2291465997695923, + "learning_rate": 4.8555e-05, + "loss": 0.4627, + "step": 9717 + }, + { + "epoch": 0.544181879269795, + "grad_norm": 1.3939390182495117, + 
"learning_rate": 4.856e-05, + "loss": 0.3999, + "step": 9718 + }, + { + "epoch": 0.544237876581924, + "grad_norm": 1.3417540788650513, + "learning_rate": 4.856500000000001e-05, + "loss": 0.4685, + "step": 9719 + }, + { + "epoch": 0.544293873894053, + "grad_norm": 1.4190226793289185, + "learning_rate": 4.8570000000000004e-05, + "loss": 0.348, + "step": 9720 + }, + { + "epoch": 0.5443498712061821, + "grad_norm": 1.4172230958938599, + "learning_rate": 4.8575e-05, + "loss": 0.6203, + "step": 9721 + }, + { + "epoch": 0.5444058685183111, + "grad_norm": 1.4824928045272827, + "learning_rate": 4.8580000000000006e-05, + "loss": 0.4865, + "step": 9722 + }, + { + "epoch": 0.5444618658304401, + "grad_norm": 1.2636237144470215, + "learning_rate": 4.8585e-05, + "loss": 0.4615, + "step": 9723 + }, + { + "epoch": 0.5445178631425691, + "grad_norm": 1.3195079565048218, + "learning_rate": 4.859e-05, + "loss": 0.4728, + "step": 9724 + }, + { + "epoch": 0.5445738604546981, + "grad_norm": 1.4156670570373535, + "learning_rate": 4.8595000000000005e-05, + "loss": 0.5529, + "step": 9725 + }, + { + "epoch": 0.5446298577668272, + "grad_norm": 1.441025733947754, + "learning_rate": 4.86e-05, + "loss": 0.518, + "step": 9726 + }, + { + "epoch": 0.5446858550789562, + "grad_norm": 1.2869678735733032, + "learning_rate": 4.8605e-05, + "loss": 0.3476, + "step": 9727 + }, + { + "epoch": 0.5447418523910852, + "grad_norm": 1.0636709928512573, + "learning_rate": 4.861e-05, + "loss": 0.4203, + "step": 9728 + }, + { + "epoch": 0.5447978497032142, + "grad_norm": 1.2076289653778076, + "learning_rate": 4.861500000000001e-05, + "loss": 0.4452, + "step": 9729 + }, + { + "epoch": 0.5448538470153432, + "grad_norm": 1.218843698501587, + "learning_rate": 4.8620000000000005e-05, + "loss": 0.3975, + "step": 9730 + }, + { + "epoch": 0.5449098443274722, + "grad_norm": 2.140641927719116, + "learning_rate": 4.8625e-05, + "loss": 0.4852, + "step": 9731 + }, + { + "epoch": 0.5449658416396013, + "grad_norm": 
1.2406911849975586, + "learning_rate": 4.863e-05, + "loss": 0.4207, + "step": 9732 + }, + { + "epoch": 0.5450218389517303, + "grad_norm": 1.2015538215637207, + "learning_rate": 4.8635000000000004e-05, + "loss": 0.4469, + "step": 9733 + }, + { + "epoch": 0.5450778362638593, + "grad_norm": 1.1803382635116577, + "learning_rate": 4.864e-05, + "loss": 0.4405, + "step": 9734 + }, + { + "epoch": 0.5451338335759883, + "grad_norm": 1.1898819208145142, + "learning_rate": 4.8645e-05, + "loss": 0.4249, + "step": 9735 + }, + { + "epoch": 0.5451898308881173, + "grad_norm": 1.2134382724761963, + "learning_rate": 4.8650000000000003e-05, + "loss": 0.4567, + "step": 9736 + }, + { + "epoch": 0.5452458282002464, + "grad_norm": 1.4385006427764893, + "learning_rate": 4.8655e-05, + "loss": 0.4381, + "step": 9737 + }, + { + "epoch": 0.5453018255123754, + "grad_norm": 1.1139192581176758, + "learning_rate": 4.866e-05, + "loss": 0.326, + "step": 9738 + }, + { + "epoch": 0.5453578228245044, + "grad_norm": 1.3936790227890015, + "learning_rate": 4.8665e-05, + "loss": 0.4393, + "step": 9739 + }, + { + "epoch": 0.5454138201366334, + "grad_norm": 1.3649449348449707, + "learning_rate": 4.867000000000001e-05, + "loss": 0.5342, + "step": 9740 + }, + { + "epoch": 0.5454698174487624, + "grad_norm": 1.022350549697876, + "learning_rate": 4.8675000000000004e-05, + "loss": 0.3274, + "step": 9741 + }, + { + "epoch": 0.5455258147608915, + "grad_norm": 1.327319860458374, + "learning_rate": 4.868e-05, + "loss": 0.4178, + "step": 9742 + }, + { + "epoch": 0.5455818120730205, + "grad_norm": 1.623950719833374, + "learning_rate": 4.8685000000000006e-05, + "loss": 0.4655, + "step": 9743 + }, + { + "epoch": 0.5456378093851495, + "grad_norm": 1.4667751789093018, + "learning_rate": 4.869e-05, + "loss": 0.4312, + "step": 9744 + }, + { + "epoch": 0.5456938066972785, + "grad_norm": 1.2036206722259521, + "learning_rate": 4.8695e-05, + "loss": 0.4892, + "step": 9745 + }, + { + "epoch": 0.5457498040094075, + "grad_norm": 
1.1602802276611328, + "learning_rate": 4.87e-05, + "loss": 0.4343, + "step": 9746 + }, + { + "epoch": 0.5458058013215366, + "grad_norm": 1.2215708494186401, + "learning_rate": 4.8705e-05, + "loss": 0.3786, + "step": 9747 + }, + { + "epoch": 0.5458617986336656, + "grad_norm": 1.2116730213165283, + "learning_rate": 4.871e-05, + "loss": 0.4915, + "step": 9748 + }, + { + "epoch": 0.5459177959457946, + "grad_norm": 1.3704102039337158, + "learning_rate": 4.8715000000000004e-05, + "loss": 0.4883, + "step": 9749 + }, + { + "epoch": 0.5459737932579236, + "grad_norm": 1.4366365671157837, + "learning_rate": 4.872000000000001e-05, + "loss": 0.437, + "step": 9750 + }, + { + "epoch": 0.5460297905700526, + "grad_norm": 1.5466474294662476, + "learning_rate": 4.8725000000000005e-05, + "loss": 0.503, + "step": 9751 + }, + { + "epoch": 0.5460857878821817, + "grad_norm": 1.1597400903701782, + "learning_rate": 4.873e-05, + "loss": 0.426, + "step": 9752 + }, + { + "epoch": 0.5461417851943107, + "grad_norm": 1.275761604309082, + "learning_rate": 4.8735e-05, + "loss": 0.386, + "step": 9753 + }, + { + "epoch": 0.5461977825064397, + "grad_norm": 1.024357795715332, + "learning_rate": 4.8740000000000004e-05, + "loss": 0.4879, + "step": 9754 + }, + { + "epoch": 0.5462537798185687, + "grad_norm": 1.1839563846588135, + "learning_rate": 4.8745e-05, + "loss": 0.3731, + "step": 9755 + }, + { + "epoch": 0.5463097771306977, + "grad_norm": 1.2781145572662354, + "learning_rate": 4.875e-05, + "loss": 0.3335, + "step": 9756 + }, + { + "epoch": 0.5463657744428267, + "grad_norm": 1.391566514968872, + "learning_rate": 4.8755e-05, + "loss": 0.5612, + "step": 9757 + }, + { + "epoch": 0.5464217717549558, + "grad_norm": 1.3660463094711304, + "learning_rate": 4.876e-05, + "loss": 0.4597, + "step": 9758 + }, + { + "epoch": 0.5464777690670848, + "grad_norm": 1.1681660413742065, + "learning_rate": 4.8765e-05, + "loss": 0.435, + "step": 9759 + }, + { + "epoch": 0.5465337663792138, + "grad_norm": 1.3454241752624512, 
+ "learning_rate": 4.877e-05, + "loss": 0.4118, + "step": 9760 + }, + { + "epoch": 0.5465897636913428, + "grad_norm": 1.1808377504348755, + "learning_rate": 4.8775000000000007e-05, + "loss": 0.3623, + "step": 9761 + }, + { + "epoch": 0.5466457610034718, + "grad_norm": 1.3442800045013428, + "learning_rate": 4.8780000000000004e-05, + "loss": 0.5379, + "step": 9762 + }, + { + "epoch": 0.5467017583156009, + "grad_norm": 1.2783876657485962, + "learning_rate": 4.8785e-05, + "loss": 0.4813, + "step": 9763 + }, + { + "epoch": 0.5467577556277299, + "grad_norm": 1.2709224224090576, + "learning_rate": 4.8790000000000006e-05, + "loss": 0.3285, + "step": 9764 + }, + { + "epoch": 0.5468137529398589, + "grad_norm": 1.3587431907653809, + "learning_rate": 4.8795e-05, + "loss": 0.3749, + "step": 9765 + }, + { + "epoch": 0.5468697502519879, + "grad_norm": 1.3329561948776245, + "learning_rate": 4.88e-05, + "loss": 0.4517, + "step": 9766 + }, + { + "epoch": 0.5469257475641169, + "grad_norm": 1.3308213949203491, + "learning_rate": 4.8805e-05, + "loss": 0.4159, + "step": 9767 + }, + { + "epoch": 0.546981744876246, + "grad_norm": 1.2997395992279053, + "learning_rate": 4.881e-05, + "loss": 0.3801, + "step": 9768 + }, + { + "epoch": 0.547037742188375, + "grad_norm": 1.3082472085952759, + "learning_rate": 4.8815e-05, + "loss": 0.5065, + "step": 9769 + }, + { + "epoch": 0.547093739500504, + "grad_norm": 1.2000705003738403, + "learning_rate": 4.8820000000000004e-05, + "loss": 0.3443, + "step": 9770 + }, + { + "epoch": 0.547149736812633, + "grad_norm": 1.4347835779190063, + "learning_rate": 4.8825e-05, + "loss": 0.5783, + "step": 9771 + }, + { + "epoch": 0.547205734124762, + "grad_norm": 1.2902127504348755, + "learning_rate": 4.8830000000000005e-05, + "loss": 0.3725, + "step": 9772 + }, + { + "epoch": 0.547261731436891, + "grad_norm": 1.6311655044555664, + "learning_rate": 4.8835e-05, + "loss": 0.4099, + "step": 9773 + }, + { + "epoch": 0.5473177287490201, + "grad_norm": 1.20345938205719, + 
"learning_rate": 4.884e-05, + "loss": 0.3987, + "step": 9774 + }, + { + "epoch": 0.5473737260611491, + "grad_norm": 1.1928012371063232, + "learning_rate": 4.8845000000000004e-05, + "loss": 0.4513, + "step": 9775 + }, + { + "epoch": 0.5474297233732781, + "grad_norm": 1.2258514165878296, + "learning_rate": 4.885e-05, + "loss": 0.4534, + "step": 9776 + }, + { + "epoch": 0.5474857206854071, + "grad_norm": 1.28345787525177, + "learning_rate": 4.8855e-05, + "loss": 0.429, + "step": 9777 + }, + { + "epoch": 0.5475417179975361, + "grad_norm": 1.2173876762390137, + "learning_rate": 4.886e-05, + "loss": 0.3636, + "step": 9778 + }, + { + "epoch": 0.5475977153096652, + "grad_norm": 1.2504884004592896, + "learning_rate": 4.8865e-05, + "loss": 0.55, + "step": 9779 + }, + { + "epoch": 0.5476537126217942, + "grad_norm": 1.0849627256393433, + "learning_rate": 4.8870000000000005e-05, + "loss": 0.4151, + "step": 9780 + }, + { + "epoch": 0.5477097099339232, + "grad_norm": 1.2154240608215332, + "learning_rate": 4.8875e-05, + "loss": 0.4034, + "step": 9781 + }, + { + "epoch": 0.5477657072460522, + "grad_norm": 1.1576330661773682, + "learning_rate": 4.8880000000000006e-05, + "loss": 0.4272, + "step": 9782 + }, + { + "epoch": 0.5478217045581812, + "grad_norm": 1.195458173751831, + "learning_rate": 4.8885000000000004e-05, + "loss": 0.4196, + "step": 9783 + }, + { + "epoch": 0.5478777018703103, + "grad_norm": 1.3338898420333862, + "learning_rate": 4.889e-05, + "loss": 0.5115, + "step": 9784 + }, + { + "epoch": 0.5479336991824393, + "grad_norm": 1.3685988187789917, + "learning_rate": 4.8895e-05, + "loss": 0.3735, + "step": 9785 + }, + { + "epoch": 0.5479896964945683, + "grad_norm": 0.9911099076271057, + "learning_rate": 4.89e-05, + "loss": 0.2798, + "step": 9786 + }, + { + "epoch": 0.5480456938066973, + "grad_norm": 1.5359314680099487, + "learning_rate": 4.8905e-05, + "loss": 0.5281, + "step": 9787 + }, + { + "epoch": 0.5481016911188263, + "grad_norm": 1.1968533992767334, + "learning_rate": 
4.891e-05, + "loss": 0.4735, + "step": 9788 + }, + { + "epoch": 0.5481576884309554, + "grad_norm": 1.3393092155456543, + "learning_rate": 4.8915e-05, + "loss": 0.4754, + "step": 9789 + }, + { + "epoch": 0.5482136857430844, + "grad_norm": 1.2218310832977295, + "learning_rate": 4.8920000000000006e-05, + "loss": 0.3819, + "step": 9790 + }, + { + "epoch": 0.5482696830552134, + "grad_norm": 1.3259683847427368, + "learning_rate": 4.8925e-05, + "loss": 0.4729, + "step": 9791 + }, + { + "epoch": 0.5483256803673424, + "grad_norm": 1.296262264251709, + "learning_rate": 4.893e-05, + "loss": 0.4812, + "step": 9792 + }, + { + "epoch": 0.5483816776794714, + "grad_norm": 1.2933176755905151, + "learning_rate": 4.8935000000000005e-05, + "loss": 0.3868, + "step": 9793 + }, + { + "epoch": 0.5484376749916005, + "grad_norm": 1.1595845222473145, + "learning_rate": 4.894e-05, + "loss": 0.3071, + "step": 9794 + }, + { + "epoch": 0.5484936723037295, + "grad_norm": 1.2668019533157349, + "learning_rate": 4.8945e-05, + "loss": 0.487, + "step": 9795 + }, + { + "epoch": 0.5485496696158585, + "grad_norm": 1.3164705038070679, + "learning_rate": 4.8950000000000004e-05, + "loss": 0.4585, + "step": 9796 + }, + { + "epoch": 0.5486056669279875, + "grad_norm": 1.343824028968811, + "learning_rate": 4.8955e-05, + "loss": 0.4897, + "step": 9797 + }, + { + "epoch": 0.5486616642401164, + "grad_norm": 1.3883174657821655, + "learning_rate": 4.896e-05, + "loss": 0.4444, + "step": 9798 + }, + { + "epoch": 0.5487176615522454, + "grad_norm": 1.1468616724014282, + "learning_rate": 4.8965e-05, + "loss": 0.4213, + "step": 9799 + }, + { + "epoch": 0.5487736588643745, + "grad_norm": 1.2404942512512207, + "learning_rate": 4.897000000000001e-05, + "loss": 0.3937, + "step": 9800 + }, + { + "epoch": 0.5488296561765035, + "grad_norm": 1.168466567993164, + "learning_rate": 4.8975000000000005e-05, + "loss": 0.3887, + "step": 9801 + }, + { + "epoch": 0.5488856534886325, + "grad_norm": 1.2331757545471191, + "learning_rate": 
4.898e-05, + "loss": 0.4094, + "step": 9802 + }, + { + "epoch": 0.5489416508007615, + "grad_norm": 1.3100107908248901, + "learning_rate": 4.8985000000000006e-05, + "loss": 0.515, + "step": 9803 + }, + { + "epoch": 0.5489976481128905, + "grad_norm": 1.0910251140594482, + "learning_rate": 4.8990000000000004e-05, + "loss": 0.4082, + "step": 9804 + }, + { + "epoch": 0.5490536454250196, + "grad_norm": 1.0836122035980225, + "learning_rate": 4.8995e-05, + "loss": 0.3982, + "step": 9805 + }, + { + "epoch": 0.5491096427371486, + "grad_norm": 1.4018274545669556, + "learning_rate": 4.9e-05, + "loss": 0.413, + "step": 9806 + }, + { + "epoch": 0.5491656400492776, + "grad_norm": 1.7344642877578735, + "learning_rate": 4.9005e-05, + "loss": 0.4465, + "step": 9807 + }, + { + "epoch": 0.5492216373614066, + "grad_norm": 1.191381812095642, + "learning_rate": 4.901e-05, + "loss": 0.4133, + "step": 9808 + }, + { + "epoch": 0.5492776346735356, + "grad_norm": 1.0895473957061768, + "learning_rate": 4.9015e-05, + "loss": 0.3796, + "step": 9809 + }, + { + "epoch": 0.5493336319856646, + "grad_norm": 1.3561818599700928, + "learning_rate": 4.902e-05, + "loss": 0.4408, + "step": 9810 + }, + { + "epoch": 0.5493896292977937, + "grad_norm": 1.2005116939544678, + "learning_rate": 4.9025000000000006e-05, + "loss": 0.3827, + "step": 9811 + }, + { + "epoch": 0.5494456266099227, + "grad_norm": 1.3890430927276611, + "learning_rate": 4.903e-05, + "loss": 0.4115, + "step": 9812 + }, + { + "epoch": 0.5495016239220517, + "grad_norm": 1.304787039756775, + "learning_rate": 4.9035e-05, + "loss": 0.3443, + "step": 9813 + }, + { + "epoch": 0.5495576212341807, + "grad_norm": 1.6234766244888306, + "learning_rate": 4.9040000000000005e-05, + "loss": 0.5334, + "step": 9814 + }, + { + "epoch": 0.5496136185463097, + "grad_norm": 1.1996712684631348, + "learning_rate": 4.9045e-05, + "loss": 0.487, + "step": 9815 + }, + { + "epoch": 0.5496696158584388, + "grad_norm": 1.5660183429718018, + "learning_rate": 4.905e-05, + 
"loss": 0.6293, + "step": 9816 + }, + { + "epoch": 0.5497256131705678, + "grad_norm": 1.3654955625534058, + "learning_rate": 4.9055000000000004e-05, + "loss": 0.4961, + "step": 9817 + }, + { + "epoch": 0.5497816104826968, + "grad_norm": 1.6731728315353394, + "learning_rate": 4.906e-05, + "loss": 0.6572, + "step": 9818 + }, + { + "epoch": 0.5498376077948258, + "grad_norm": 1.17780601978302, + "learning_rate": 4.9065e-05, + "loss": 0.4288, + "step": 9819 + }, + { + "epoch": 0.5498936051069548, + "grad_norm": 1.3803881406784058, + "learning_rate": 4.907e-05, + "loss": 0.4575, + "step": 9820 + }, + { + "epoch": 0.5499496024190839, + "grad_norm": 1.2742030620574951, + "learning_rate": 4.907500000000001e-05, + "loss": 0.3687, + "step": 9821 + }, + { + "epoch": 0.5500055997312129, + "grad_norm": 1.4195208549499512, + "learning_rate": 4.9080000000000004e-05, + "loss": 0.4378, + "step": 9822 + }, + { + "epoch": 0.5500615970433419, + "grad_norm": 1.21417236328125, + "learning_rate": 4.9085e-05, + "loss": 0.5454, + "step": 9823 + }, + { + "epoch": 0.5501175943554709, + "grad_norm": 1.2277599573135376, + "learning_rate": 4.9090000000000006e-05, + "loss": 0.3335, + "step": 9824 + }, + { + "epoch": 0.5501735916675999, + "grad_norm": 1.2298928499221802, + "learning_rate": 4.9095000000000003e-05, + "loss": 0.5312, + "step": 9825 + }, + { + "epoch": 0.550229588979729, + "grad_norm": 1.0673483610153198, + "learning_rate": 4.91e-05, + "loss": 0.3275, + "step": 9826 + }, + { + "epoch": 0.550285586291858, + "grad_norm": 1.1780321598052979, + "learning_rate": 4.9105e-05, + "loss": 0.437, + "step": 9827 + }, + { + "epoch": 0.550341583603987, + "grad_norm": 1.2712862491607666, + "learning_rate": 4.911e-05, + "loss": 0.3853, + "step": 9828 + }, + { + "epoch": 0.550397580916116, + "grad_norm": 1.6301132440567017, + "learning_rate": 4.9115e-05, + "loss": 0.6661, + "step": 9829 + }, + { + "epoch": 0.550453578228245, + "grad_norm": 1.2521926164627075, + "learning_rate": 4.9120000000000004e-05, 
+ "loss": 0.3093, + "step": 9830 + }, + { + "epoch": 0.550509575540374, + "grad_norm": 1.507238507270813, + "learning_rate": 4.9125e-05, + "loss": 0.4127, + "step": 9831 + }, + { + "epoch": 0.5505655728525031, + "grad_norm": 1.2705938816070557, + "learning_rate": 4.9130000000000006e-05, + "loss": 0.5905, + "step": 9832 + }, + { + "epoch": 0.5506215701646321, + "grad_norm": 1.9501150846481323, + "learning_rate": 4.9135e-05, + "loss": 0.5766, + "step": 9833 + }, + { + "epoch": 0.5506775674767611, + "grad_norm": 1.173202633857727, + "learning_rate": 4.914e-05, + "loss": 0.341, + "step": 9834 + }, + { + "epoch": 0.5507335647888901, + "grad_norm": 1.2615176439285278, + "learning_rate": 4.9145000000000005e-05, + "loss": 0.4548, + "step": 9835 + }, + { + "epoch": 0.5507895621010191, + "grad_norm": 1.2580984830856323, + "learning_rate": 4.915e-05, + "loss": 0.4681, + "step": 9836 + }, + { + "epoch": 0.5508455594131482, + "grad_norm": 1.2937867641448975, + "learning_rate": 4.9155e-05, + "loss": 0.3072, + "step": 9837 + }, + { + "epoch": 0.5509015567252772, + "grad_norm": 1.0775372982025146, + "learning_rate": 4.9160000000000004e-05, + "loss": 0.3315, + "step": 9838 + }, + { + "epoch": 0.5509575540374062, + "grad_norm": 1.1573628187179565, + "learning_rate": 4.9165e-05, + "loss": 0.4074, + "step": 9839 + }, + { + "epoch": 0.5510135513495352, + "grad_norm": 1.6001033782958984, + "learning_rate": 4.9170000000000005e-05, + "loss": 0.4437, + "step": 9840 + }, + { + "epoch": 0.5510695486616642, + "grad_norm": 1.460305094718933, + "learning_rate": 4.9175e-05, + "loss": 0.5827, + "step": 9841 + }, + { + "epoch": 0.5511255459737933, + "grad_norm": 1.392538070678711, + "learning_rate": 4.918000000000001e-05, + "loss": 0.4461, + "step": 9842 + }, + { + "epoch": 0.5511815432859223, + "grad_norm": 1.1869699954986572, + "learning_rate": 4.9185000000000004e-05, + "loss": 0.3958, + "step": 9843 + }, + { + "epoch": 0.5512375405980513, + "grad_norm": 1.3342792987823486, + "learning_rate": 
4.919e-05, + "loss": 0.4169, + "step": 9844 + }, + { + "epoch": 0.5512935379101803, + "grad_norm": 5.5547990798950195, + "learning_rate": 4.9195e-05, + "loss": 0.546, + "step": 9845 + }, + { + "epoch": 0.5513495352223093, + "grad_norm": 1.1892287731170654, + "learning_rate": 4.92e-05, + "loss": 0.3339, + "step": 9846 + }, + { + "epoch": 0.5514055325344384, + "grad_norm": 1.1834392547607422, + "learning_rate": 4.9205e-05, + "loss": 0.4276, + "step": 9847 + }, + { + "epoch": 0.5514615298465674, + "grad_norm": 1.444663166999817, + "learning_rate": 4.921e-05, + "loss": 0.3811, + "step": 9848 + }, + { + "epoch": 0.5515175271586964, + "grad_norm": 1.4277019500732422, + "learning_rate": 4.9215e-05, + "loss": 0.3793, + "step": 9849 + }, + { + "epoch": 0.5515735244708254, + "grad_norm": 1.271371841430664, + "learning_rate": 4.9220000000000006e-05, + "loss": 0.5175, + "step": 9850 + }, + { + "epoch": 0.5516295217829544, + "grad_norm": 1.8749375343322754, + "learning_rate": 4.9225000000000004e-05, + "loss": 0.5032, + "step": 9851 + }, + { + "epoch": 0.5516855190950835, + "grad_norm": 1.6567984819412231, + "learning_rate": 4.923e-05, + "loss": 0.5108, + "step": 9852 + }, + { + "epoch": 0.5517415164072125, + "grad_norm": 1.0445003509521484, + "learning_rate": 4.9235000000000005e-05, + "loss": 0.4001, + "step": 9853 + }, + { + "epoch": 0.5517975137193415, + "grad_norm": 1.4645371437072754, + "learning_rate": 4.924e-05, + "loss": 0.4019, + "step": 9854 + }, + { + "epoch": 0.5518535110314705, + "grad_norm": 1.3834123611450195, + "learning_rate": 4.9245e-05, + "loss": 0.5157, + "step": 9855 + }, + { + "epoch": 0.5519095083435995, + "grad_norm": 1.0995264053344727, + "learning_rate": 4.9250000000000004e-05, + "loss": 0.3924, + "step": 9856 + }, + { + "epoch": 0.5519655056557285, + "grad_norm": 1.1953632831573486, + "learning_rate": 4.9255e-05, + "loss": 0.5091, + "step": 9857 + }, + { + "epoch": 0.5520215029678576, + "grad_norm": 1.246602177619934, + "learning_rate": 4.926e-05, + 
"loss": 0.5428, + "step": 9858 + }, + { + "epoch": 0.5520775002799866, + "grad_norm": 1.1450551748275757, + "learning_rate": 4.9265e-05, + "loss": 0.3966, + "step": 9859 + }, + { + "epoch": 0.5521334975921156, + "grad_norm": 1.177097201347351, + "learning_rate": 4.927000000000001e-05, + "loss": 0.3795, + "step": 9860 + }, + { + "epoch": 0.5521894949042446, + "grad_norm": 1.2279947996139526, + "learning_rate": 4.9275000000000005e-05, + "loss": 0.4562, + "step": 9861 + }, + { + "epoch": 0.5522454922163736, + "grad_norm": 1.361910104751587, + "learning_rate": 4.928e-05, + "loss": 0.5464, + "step": 9862 + }, + { + "epoch": 0.5523014895285027, + "grad_norm": 1.1686800718307495, + "learning_rate": 4.928500000000001e-05, + "loss": 0.4628, + "step": 9863 + }, + { + "epoch": 0.5523574868406317, + "grad_norm": 1.1429228782653809, + "learning_rate": 4.9290000000000004e-05, + "loss": 0.3713, + "step": 9864 + }, + { + "epoch": 0.5524134841527607, + "grad_norm": 1.4145402908325195, + "learning_rate": 4.9295e-05, + "loss": 0.3984, + "step": 9865 + }, + { + "epoch": 0.5524694814648897, + "grad_norm": 1.3317267894744873, + "learning_rate": 4.93e-05, + "loss": 0.5272, + "step": 9866 + }, + { + "epoch": 0.5525254787770187, + "grad_norm": 1.6362569332122803, + "learning_rate": 4.9305e-05, + "loss": 0.5404, + "step": 9867 + }, + { + "epoch": 0.5525814760891478, + "grad_norm": 1.2255516052246094, + "learning_rate": 4.931e-05, + "loss": 0.38, + "step": 9868 + }, + { + "epoch": 0.5526374734012768, + "grad_norm": 1.2675533294677734, + "learning_rate": 4.9315e-05, + "loss": 0.4144, + "step": 9869 + }, + { + "epoch": 0.5526934707134058, + "grad_norm": 48.612266540527344, + "learning_rate": 4.932e-05, + "loss": 0.3633, + "step": 9870 + }, + { + "epoch": 0.5527494680255348, + "grad_norm": 1.4007257223129272, + "learning_rate": 4.9325000000000006e-05, + "loss": 0.3649, + "step": 9871 + }, + { + "epoch": 0.5528054653376638, + "grad_norm": 1.2883875370025635, + "learning_rate": 
4.9330000000000004e-05, + "loss": 0.455, + "step": 9872 + }, + { + "epoch": 0.5528614626497929, + "grad_norm": 1.2115495204925537, + "learning_rate": 4.9335e-05, + "loss": 0.4835, + "step": 9873 + }, + { + "epoch": 0.5529174599619219, + "grad_norm": 1.1614922285079956, + "learning_rate": 4.9340000000000005e-05, + "loss": 0.6316, + "step": 9874 + }, + { + "epoch": 0.5529734572740509, + "grad_norm": 1.5327253341674805, + "learning_rate": 4.9345e-05, + "loss": 0.5755, + "step": 9875 + }, + { + "epoch": 0.5530294545861799, + "grad_norm": 1.2586109638214111, + "learning_rate": 4.935e-05, + "loss": 0.5452, + "step": 9876 + }, + { + "epoch": 0.5530854518983089, + "grad_norm": 1.5047032833099365, + "learning_rate": 4.9355000000000004e-05, + "loss": 0.5365, + "step": 9877 + }, + { + "epoch": 0.553141449210438, + "grad_norm": 1.508359670639038, + "learning_rate": 4.936e-05, + "loss": 0.5892, + "step": 9878 + }, + { + "epoch": 0.553197446522567, + "grad_norm": 1.305232286453247, + "learning_rate": 4.9365e-05, + "loss": 0.5255, + "step": 9879 + }, + { + "epoch": 0.5532534438346959, + "grad_norm": 1.352198839187622, + "learning_rate": 4.937e-05, + "loss": 0.4257, + "step": 9880 + }, + { + "epoch": 0.5533094411468249, + "grad_norm": 1.2626460790634155, + "learning_rate": 4.937500000000001e-05, + "loss": 0.4752, + "step": 9881 + }, + { + "epoch": 0.5533654384589539, + "grad_norm": 1.0746902227401733, + "learning_rate": 4.9380000000000005e-05, + "loss": 0.3612, + "step": 9882 + }, + { + "epoch": 0.5534214357710829, + "grad_norm": 1.0874592065811157, + "learning_rate": 4.9385e-05, + "loss": 0.4713, + "step": 9883 + }, + { + "epoch": 0.553477433083212, + "grad_norm": 1.600517988204956, + "learning_rate": 4.939e-05, + "loss": 0.3904, + "step": 9884 + }, + { + "epoch": 0.553533430395341, + "grad_norm": 1.0637706518173218, + "learning_rate": 4.9395000000000004e-05, + "loss": 0.3279, + "step": 9885 + }, + { + "epoch": 0.55358942770747, + "grad_norm": 1.3719364404678345, + 
"learning_rate": 4.94e-05, + "loss": 0.5074, + "step": 9886 + }, + { + "epoch": 0.553645425019599, + "grad_norm": 1.278235912322998, + "learning_rate": 4.9405e-05, + "loss": 0.4989, + "step": 9887 + }, + { + "epoch": 0.553701422331728, + "grad_norm": 1.3625400066375732, + "learning_rate": 4.941e-05, + "loss": 0.4649, + "step": 9888 + }, + { + "epoch": 0.553757419643857, + "grad_norm": 1.0363482236862183, + "learning_rate": 4.9415e-05, + "loss": 0.416, + "step": 9889 + }, + { + "epoch": 0.5538134169559861, + "grad_norm": 1.31887948513031, + "learning_rate": 4.942e-05, + "loss": 0.4591, + "step": 9890 + }, + { + "epoch": 0.5538694142681151, + "grad_norm": 1.1346615552902222, + "learning_rate": 4.9425e-05, + "loss": 0.3147, + "step": 9891 + }, + { + "epoch": 0.5539254115802441, + "grad_norm": 1.951660394668579, + "learning_rate": 4.9430000000000006e-05, + "loss": 0.4978, + "step": 9892 + }, + { + "epoch": 0.5539814088923731, + "grad_norm": 1.2742087841033936, + "learning_rate": 4.9435000000000004e-05, + "loss": 0.4216, + "step": 9893 + }, + { + "epoch": 0.5540374062045021, + "grad_norm": 1.3981987237930298, + "learning_rate": 4.944e-05, + "loss": 0.3698, + "step": 9894 + }, + { + "epoch": 0.5540934035166312, + "grad_norm": 1.213977336883545, + "learning_rate": 4.9445000000000005e-05, + "loss": 0.4999, + "step": 9895 + }, + { + "epoch": 0.5541494008287602, + "grad_norm": 1.315979242324829, + "learning_rate": 4.945e-05, + "loss": 0.5324, + "step": 9896 + }, + { + "epoch": 0.5542053981408892, + "grad_norm": 1.1904572248458862, + "learning_rate": 4.9455e-05, + "loss": 0.3628, + "step": 9897 + }, + { + "epoch": 0.5542613954530182, + "grad_norm": 1.1681935787200928, + "learning_rate": 4.946e-05, + "loss": 0.3761, + "step": 9898 + }, + { + "epoch": 0.5543173927651472, + "grad_norm": 1.256194829940796, + "learning_rate": 4.9465e-05, + "loss": 0.4615, + "step": 9899 + }, + { + "epoch": 0.5543733900772763, + "grad_norm": 1.124131202697754, + "learning_rate": 4.947e-05, + 
"loss": 0.5317, + "step": 9900 + }, + { + "epoch": 0.5544293873894053, + "grad_norm": 1.7268904447555542, + "learning_rate": 4.9475e-05, + "loss": 0.4717, + "step": 9901 + }, + { + "epoch": 0.5544853847015343, + "grad_norm": 1.1848986148834229, + "learning_rate": 4.948000000000001e-05, + "loss": 0.4333, + "step": 9902 + }, + { + "epoch": 0.5545413820136633, + "grad_norm": 6.534971714019775, + "learning_rate": 4.9485000000000005e-05, + "loss": 0.3291, + "step": 9903 + }, + { + "epoch": 0.5545973793257923, + "grad_norm": 1.3286279439926147, + "learning_rate": 4.949e-05, + "loss": 0.4083, + "step": 9904 + }, + { + "epoch": 0.5546533766379214, + "grad_norm": 1.2005430459976196, + "learning_rate": 4.9495e-05, + "loss": 0.4521, + "step": 9905 + }, + { + "epoch": 0.5547093739500504, + "grad_norm": 1.1113883256912231, + "learning_rate": 4.9500000000000004e-05, + "loss": 0.4342, + "step": 9906 + }, + { + "epoch": 0.5547653712621794, + "grad_norm": 1.3328896760940552, + "learning_rate": 4.9505e-05, + "loss": 0.4356, + "step": 9907 + }, + { + "epoch": 0.5548213685743084, + "grad_norm": 1.3005143404006958, + "learning_rate": 4.951e-05, + "loss": 0.5701, + "step": 9908 + }, + { + "epoch": 0.5548773658864374, + "grad_norm": 1.0802335739135742, + "learning_rate": 4.9515e-05, + "loss": 0.4, + "step": 9909 + }, + { + "epoch": 0.5549333631985665, + "grad_norm": 1.181265950202942, + "learning_rate": 4.952e-05, + "loss": 0.4187, + "step": 9910 + }, + { + "epoch": 0.5549893605106955, + "grad_norm": 1.1380640268325806, + "learning_rate": 4.9525000000000004e-05, + "loss": 0.345, + "step": 9911 + }, + { + "epoch": 0.5550453578228245, + "grad_norm": 1.468192458152771, + "learning_rate": 4.953e-05, + "loss": 0.5454, + "step": 9912 + }, + { + "epoch": 0.5551013551349535, + "grad_norm": 1.2509827613830566, + "learning_rate": 4.9535000000000006e-05, + "loss": 0.3683, + "step": 9913 + }, + { + "epoch": 0.5551573524470825, + "grad_norm": 1.237339735031128, + "learning_rate": 
4.9540000000000003e-05, + "loss": 0.4945, + "step": 9914 + }, + { + "epoch": 0.5552133497592115, + "grad_norm": 1.458763837814331, + "learning_rate": 4.9545e-05, + "loss": 0.4614, + "step": 9915 + }, + { + "epoch": 0.5552693470713406, + "grad_norm": 1.4923385381698608, + "learning_rate": 4.9550000000000005e-05, + "loss": 0.4802, + "step": 9916 + }, + { + "epoch": 0.5553253443834696, + "grad_norm": 1.1870142221450806, + "learning_rate": 4.9555e-05, + "loss": 0.3874, + "step": 9917 + }, + { + "epoch": 0.5553813416955986, + "grad_norm": 1.1183218955993652, + "learning_rate": 4.956e-05, + "loss": 0.4729, + "step": 9918 + }, + { + "epoch": 0.5554373390077276, + "grad_norm": 1.5191528797149658, + "learning_rate": 4.9565e-05, + "loss": 0.4362, + "step": 9919 + }, + { + "epoch": 0.5554933363198566, + "grad_norm": 1.2282307147979736, + "learning_rate": 4.957e-05, + "loss": 0.4814, + "step": 9920 + }, + { + "epoch": 0.5555493336319857, + "grad_norm": 1.3444799184799194, + "learning_rate": 4.9575000000000006e-05, + "loss": 0.4141, + "step": 9921 + }, + { + "epoch": 0.5556053309441147, + "grad_norm": 1.3083947896957397, + "learning_rate": 4.958e-05, + "loss": 0.3594, + "step": 9922 + }, + { + "epoch": 0.5556613282562437, + "grad_norm": 1.306146502494812, + "learning_rate": 4.9585e-05, + "loss": 0.3494, + "step": 9923 + }, + { + "epoch": 0.5557173255683727, + "grad_norm": 1.2219531536102295, + "learning_rate": 4.9590000000000005e-05, + "loss": 0.3582, + "step": 9924 + }, + { + "epoch": 0.5557733228805017, + "grad_norm": 1.2251049280166626, + "learning_rate": 4.9595e-05, + "loss": 0.3454, + "step": 9925 + }, + { + "epoch": 0.5558293201926308, + "grad_norm": 1.1786918640136719, + "learning_rate": 4.96e-05, + "loss": 0.3274, + "step": 9926 + }, + { + "epoch": 0.5558853175047598, + "grad_norm": 1.2297167778015137, + "learning_rate": 4.9605000000000004e-05, + "loss": 0.415, + "step": 9927 + }, + { + "epoch": 0.5559413148168888, + "grad_norm": 1.1262352466583252, + "learning_rate": 
4.961e-05, + "loss": 0.3766, + "step": 9928 + }, + { + "epoch": 0.5559973121290178, + "grad_norm": 1.4815571308135986, + "learning_rate": 4.9615e-05, + "loss": 0.4644, + "step": 9929 + }, + { + "epoch": 0.5560533094411468, + "grad_norm": 1.1427712440490723, + "learning_rate": 4.962e-05, + "loss": 0.3341, + "step": 9930 + }, + { + "epoch": 0.5561093067532759, + "grad_norm": 1.142407774925232, + "learning_rate": 4.962500000000001e-05, + "loss": 0.3281, + "step": 9931 + }, + { + "epoch": 0.5561653040654049, + "grad_norm": 1.2612287998199463, + "learning_rate": 4.9630000000000004e-05, + "loss": 0.5076, + "step": 9932 + }, + { + "epoch": 0.5562213013775339, + "grad_norm": 1.4052684307098389, + "learning_rate": 4.9635e-05, + "loss": 0.3742, + "step": 9933 + }, + { + "epoch": 0.5562772986896629, + "grad_norm": 1.068899393081665, + "learning_rate": 4.9640000000000006e-05, + "loss": 0.3504, + "step": 9934 + }, + { + "epoch": 0.5563332960017919, + "grad_norm": 1.4374948740005493, + "learning_rate": 4.9645e-05, + "loss": 0.7201, + "step": 9935 + }, + { + "epoch": 0.556389293313921, + "grad_norm": 1.3756743669509888, + "learning_rate": 4.965e-05, + "loss": 0.4793, + "step": 9936 + }, + { + "epoch": 0.55644529062605, + "grad_norm": 1.3166941404342651, + "learning_rate": 4.9655000000000005e-05, + "loss": 0.3156, + "step": 9937 + }, + { + "epoch": 0.556501287938179, + "grad_norm": 1.1218551397323608, + "learning_rate": 4.966e-05, + "loss": 0.4509, + "step": 9938 + }, + { + "epoch": 0.556557285250308, + "grad_norm": 1.1736061573028564, + "learning_rate": 4.9665e-05, + "loss": 0.3869, + "step": 9939 + }, + { + "epoch": 0.556613282562437, + "grad_norm": 1.6972935199737549, + "learning_rate": 4.967e-05, + "loss": 0.5223, + "step": 9940 + }, + { + "epoch": 0.556669279874566, + "grad_norm": 1.475999116897583, + "learning_rate": 4.967500000000001e-05, + "loss": 0.5346, + "step": 9941 + }, + { + "epoch": 0.5567252771866951, + "grad_norm": 1.411203145980835, + "learning_rate": 
4.9680000000000005e-05, + "loss": 0.5327, + "step": 9942 + }, + { + "epoch": 0.5567812744988241, + "grad_norm": 1.2205110788345337, + "learning_rate": 4.9685e-05, + "loss": 0.4637, + "step": 9943 + }, + { + "epoch": 0.5568372718109531, + "grad_norm": 1.2715861797332764, + "learning_rate": 4.969e-05, + "loss": 0.5965, + "step": 9944 + }, + { + "epoch": 0.5568932691230821, + "grad_norm": 1.3103289604187012, + "learning_rate": 4.9695000000000004e-05, + "loss": 0.4397, + "step": 9945 + }, + { + "epoch": 0.5569492664352111, + "grad_norm": 1.4818700551986694, + "learning_rate": 4.97e-05, + "loss": 0.562, + "step": 9946 + }, + { + "epoch": 0.5570052637473402, + "grad_norm": 1.9872112274169922, + "learning_rate": 4.9705e-05, + "loss": 0.4474, + "step": 9947 + }, + { + "epoch": 0.5570612610594692, + "grad_norm": 1.3563531637191772, + "learning_rate": 4.9710000000000003e-05, + "loss": 0.3684, + "step": 9948 + }, + { + "epoch": 0.5571172583715982, + "grad_norm": 1.163193702697754, + "learning_rate": 4.9715e-05, + "loss": 0.3716, + "step": 9949 + }, + { + "epoch": 0.5571732556837272, + "grad_norm": 0.9077993631362915, + "learning_rate": 4.972e-05, + "loss": 0.3329, + "step": 9950 + }, + { + "epoch": 0.5572292529958562, + "grad_norm": 1.0345548391342163, + "learning_rate": 4.9725e-05, + "loss": 0.3246, + "step": 9951 + }, + { + "epoch": 0.5572852503079853, + "grad_norm": 1.3332639932632446, + "learning_rate": 4.973000000000001e-05, + "loss": 0.3712, + "step": 9952 + }, + { + "epoch": 0.5573412476201143, + "grad_norm": 1.148186445236206, + "learning_rate": 4.9735000000000004e-05, + "loss": 0.3677, + "step": 9953 + }, + { + "epoch": 0.5573972449322433, + "grad_norm": 1.2874257564544678, + "learning_rate": 4.974e-05, + "loss": 0.4196, + "step": 9954 + }, + { + "epoch": 0.5574532422443723, + "grad_norm": 1.3812764883041382, + "learning_rate": 4.9745000000000006e-05, + "loss": 0.505, + "step": 9955 + }, + { + "epoch": 0.5575092395565013, + "grad_norm": 1.211657166481018, + 
"learning_rate": 4.975e-05, + "loss": 0.3548, + "step": 9956 + }, + { + "epoch": 0.5575652368686304, + "grad_norm": 2.642127513885498, + "learning_rate": 4.9755e-05, + "loss": 0.4774, + "step": 9957 + }, + { + "epoch": 0.5576212341807594, + "grad_norm": 1.1937086582183838, + "learning_rate": 4.976e-05, + "loss": 0.3494, + "step": 9958 + }, + { + "epoch": 0.5576772314928884, + "grad_norm": 1.290429711341858, + "learning_rate": 4.9765e-05, + "loss": 0.4232, + "step": 9959 + }, + { + "epoch": 0.5577332288050174, + "grad_norm": 1.9520561695098877, + "learning_rate": 4.977e-05, + "loss": 0.5164, + "step": 9960 + }, + { + "epoch": 0.5577892261171464, + "grad_norm": 1.2326164245605469, + "learning_rate": 4.9775000000000004e-05, + "loss": 0.4576, + "step": 9961 + }, + { + "epoch": 0.5578452234292754, + "grad_norm": 1.1097726821899414, + "learning_rate": 4.978e-05, + "loss": 0.4239, + "step": 9962 + }, + { + "epoch": 0.5579012207414044, + "grad_norm": 1.5771677494049072, + "learning_rate": 4.9785000000000005e-05, + "loss": 0.3854, + "step": 9963 + }, + { + "epoch": 0.5579572180535334, + "grad_norm": 1.1490870714187622, + "learning_rate": 4.979e-05, + "loss": 0.3786, + "step": 9964 + }, + { + "epoch": 0.5580132153656624, + "grad_norm": 1.0461891889572144, + "learning_rate": 4.9795e-05, + "loss": 0.3223, + "step": 9965 + }, + { + "epoch": 0.5580692126777914, + "grad_norm": 1.3650190830230713, + "learning_rate": 4.9800000000000004e-05, + "loss": 0.492, + "step": 9966 + }, + { + "epoch": 0.5581252099899204, + "grad_norm": 1.690585970878601, + "learning_rate": 4.9805e-05, + "loss": 0.5052, + "step": 9967 + }, + { + "epoch": 0.5581812073020495, + "grad_norm": 1.3042556047439575, + "learning_rate": 4.981e-05, + "loss": 0.4262, + "step": 9968 + }, + { + "epoch": 0.5582372046141785, + "grad_norm": 1.3416763544082642, + "learning_rate": 4.9815e-05, + "loss": 0.4576, + "step": 9969 + }, + { + "epoch": 0.5582932019263075, + "grad_norm": 1.3321274518966675, + "learning_rate": 4.982e-05, 
+ "loss": 0.4248, + "step": 9970 + }, + { + "epoch": 0.5583491992384365, + "grad_norm": 1.4489244222640991, + "learning_rate": 4.9825000000000005e-05, + "loss": 0.5264, + "step": 9971 + }, + { + "epoch": 0.5584051965505655, + "grad_norm": 1.2635912895202637, + "learning_rate": 4.983e-05, + "loss": 0.4988, + "step": 9972 + }, + { + "epoch": 0.5584611938626945, + "grad_norm": 1.3435423374176025, + "learning_rate": 4.9835000000000007e-05, + "loss": 0.5256, + "step": 9973 + }, + { + "epoch": 0.5585171911748236, + "grad_norm": 1.4019769430160522, + "learning_rate": 4.9840000000000004e-05, + "loss": 0.4462, + "step": 9974 + }, + { + "epoch": 0.5585731884869526, + "grad_norm": 1.345577597618103, + "learning_rate": 4.9845e-05, + "loss": 0.3777, + "step": 9975 + }, + { + "epoch": 0.5586291857990816, + "grad_norm": 1.290582299232483, + "learning_rate": 4.9850000000000006e-05, + "loss": 0.3645, + "step": 9976 + }, + { + "epoch": 0.5586851831112106, + "grad_norm": 1.3481178283691406, + "learning_rate": 4.9855e-05, + "loss": 0.5196, + "step": 9977 + }, + { + "epoch": 0.5587411804233396, + "grad_norm": 1.1358520984649658, + "learning_rate": 4.986e-05, + "loss": 0.383, + "step": 9978 + }, + { + "epoch": 0.5587971777354687, + "grad_norm": 1.4544248580932617, + "learning_rate": 4.9865e-05, + "loss": 0.6048, + "step": 9979 + }, + { + "epoch": 0.5588531750475977, + "grad_norm": 1.416058897972107, + "learning_rate": 4.987e-05, + "loss": 0.4876, + "step": 9980 + }, + { + "epoch": 0.5589091723597267, + "grad_norm": 1.3810850381851196, + "learning_rate": 4.9875000000000006e-05, + "loss": 0.4473, + "step": 9981 + }, + { + "epoch": 0.5589651696718557, + "grad_norm": 1.3577407598495483, + "learning_rate": 4.9880000000000004e-05, + "loss": 0.5043, + "step": 9982 + }, + { + "epoch": 0.5590211669839847, + "grad_norm": 1.1616915464401245, + "learning_rate": 4.9885e-05, + "loss": 0.4671, + "step": 9983 + }, + { + "epoch": 0.5590771642961138, + "grad_norm": 1.6595820188522339, + "learning_rate": 
4.9890000000000005e-05, + "loss": 0.6291, + "step": 9984 + }, + { + "epoch": 0.5591331616082428, + "grad_norm": 1.1977049112319946, + "learning_rate": 4.9895e-05, + "loss": 0.4237, + "step": 9985 + }, + { + "epoch": 0.5591891589203718, + "grad_norm": 1.2889515161514282, + "learning_rate": 4.99e-05, + "loss": 0.4081, + "step": 9986 + }, + { + "epoch": 0.5592451562325008, + "grad_norm": 1.3410112857818604, + "learning_rate": 4.9905000000000004e-05, + "loss": 0.414, + "step": 9987 + }, + { + "epoch": 0.5593011535446298, + "grad_norm": 1.8319560289382935, + "learning_rate": 4.991e-05, + "loss": 0.5875, + "step": 9988 + }, + { + "epoch": 0.5593571508567589, + "grad_norm": 1.1550171375274658, + "learning_rate": 4.9915e-05, + "loss": 0.3828, + "step": 9989 + }, + { + "epoch": 0.5594131481688879, + "grad_norm": 1.2857604026794434, + "learning_rate": 4.992e-05, + "loss": 0.4118, + "step": 9990 + }, + { + "epoch": 0.5594691454810169, + "grad_norm": 1.6735023260116577, + "learning_rate": 4.992500000000001e-05, + "loss": 0.5153, + "step": 9991 + }, + { + "epoch": 0.5595251427931459, + "grad_norm": 1.284501075744629, + "learning_rate": 4.9930000000000005e-05, + "loss": 0.436, + "step": 9992 + }, + { + "epoch": 0.5595811401052749, + "grad_norm": 1.1906112432479858, + "learning_rate": 4.9935e-05, + "loss": 0.4321, + "step": 9993 + }, + { + "epoch": 0.559637137417404, + "grad_norm": 1.2857753038406372, + "learning_rate": 4.9940000000000006e-05, + "loss": 0.3267, + "step": 9994 + }, + { + "epoch": 0.559693134729533, + "grad_norm": 1.9837729930877686, + "learning_rate": 4.9945000000000004e-05, + "loss": 0.4377, + "step": 9995 + }, + { + "epoch": 0.559749132041662, + "grad_norm": 1.4722764492034912, + "learning_rate": 4.995e-05, + "loss": 0.5115, + "step": 9996 + }, + { + "epoch": 0.559805129353791, + "grad_norm": 1.1309281587600708, + "learning_rate": 4.9955e-05, + "loss": 0.4, + "step": 9997 + }, + { + "epoch": 0.55986112666592, + "grad_norm": 1.302437663078308, + "learning_rate": 
4.996e-05, + "loss": 0.4132, + "step": 9998 + }, + { + "epoch": 0.559917123978049, + "grad_norm": 1.5880764722824097, + "learning_rate": 4.9965e-05, + "loss": 0.6104, + "step": 9999 + }, + { + "epoch": 0.5599731212901781, + "grad_norm": 1.1306363344192505, + "learning_rate": 4.997e-05, + "loss": 0.3663, + "step": 10000 + }, + { + "epoch": 0.5600291186023071, + "grad_norm": 1.2174371480941772, + "learning_rate": 4.9975e-05, + "loss": 0.337, + "step": 10001 + }, + { + "epoch": 0.5600851159144361, + "grad_norm": 1.394925832748413, + "learning_rate": 4.9980000000000006e-05, + "loss": 0.6143, + "step": 10002 + }, + { + "epoch": 0.5601411132265651, + "grad_norm": 1.1916224956512451, + "learning_rate": 4.9985e-05, + "loss": 0.3698, + "step": 10003 + }, + { + "epoch": 0.5601971105386941, + "grad_norm": 1.4514687061309814, + "learning_rate": 4.999e-05, + "loss": 0.4462, + "step": 10004 + }, + { + "epoch": 0.5602531078508232, + "grad_norm": 1.3292044401168823, + "learning_rate": 4.9995000000000005e-05, + "loss": 0.4409, + "step": 10005 + }, + { + "epoch": 0.5603091051629522, + "grad_norm": 1.328014850616455, + "learning_rate": 5e-05, + "loss": 0.4157, + "step": 10006 + }, + { + "epoch": 0.5603651024750812, + "grad_norm": 1.4254311323165894, + "learning_rate": 5.0005e-05, + "loss": 0.5557, + "step": 10007 + }, + { + "epoch": 0.5604210997872102, + "grad_norm": 1.393754482269287, + "learning_rate": 5.0010000000000004e-05, + "loss": 0.5511, + "step": 10008 + }, + { + "epoch": 0.5604770970993392, + "grad_norm": 1.2166305780410767, + "learning_rate": 5.0015e-05, + "loss": 0.4034, + "step": 10009 + }, + { + "epoch": 0.5605330944114683, + "grad_norm": 1.2587909698486328, + "learning_rate": 5.002e-05, + "loss": 0.4431, + "step": 10010 + }, + { + "epoch": 0.5605890917235973, + "grad_norm": 1.1939144134521484, + "learning_rate": 5.0025e-05, + "loss": 0.4778, + "step": 10011 + }, + { + "epoch": 0.5606450890357263, + "grad_norm": 1.3817065954208374, + "learning_rate": 5.003e-05, + 
"loss": 0.487, + "step": 10012 + }, + { + "epoch": 0.5607010863478553, + "grad_norm": 1.108214020729065, + "learning_rate": 5.0035e-05, + "loss": 0.456, + "step": 10013 + }, + { + "epoch": 0.5607570836599843, + "grad_norm": 1.1789519786834717, + "learning_rate": 5.0039999999999995e-05, + "loss": 0.4451, + "step": 10014 + }, + { + "epoch": 0.5608130809721134, + "grad_norm": 1.1562879085540771, + "learning_rate": 5.0045e-05, + "loss": 0.3838, + "step": 10015 + }, + { + "epoch": 0.5608690782842424, + "grad_norm": 1.337884783744812, + "learning_rate": 5.005e-05, + "loss": 0.6035, + "step": 10016 + }, + { + "epoch": 0.5609250755963714, + "grad_norm": 1.4621641635894775, + "learning_rate": 5.005500000000001e-05, + "loss": 0.4793, + "step": 10017 + }, + { + "epoch": 0.5609810729085004, + "grad_norm": 1.657607913017273, + "learning_rate": 5.0060000000000005e-05, + "loss": 0.3936, + "step": 10018 + }, + { + "epoch": 0.5610370702206294, + "grad_norm": 1.2850358486175537, + "learning_rate": 5.006500000000001e-05, + "loss": 0.3666, + "step": 10019 + }, + { + "epoch": 0.5610930675327584, + "grad_norm": 1.2727241516113281, + "learning_rate": 5.007000000000001e-05, + "loss": 0.5122, + "step": 10020 + }, + { + "epoch": 0.5611490648448875, + "grad_norm": 1.912054181098938, + "learning_rate": 5.0075000000000004e-05, + "loss": 0.3845, + "step": 10021 + }, + { + "epoch": 0.5612050621570165, + "grad_norm": 1.3623539209365845, + "learning_rate": 5.008e-05, + "loss": 0.4237, + "step": 10022 + }, + { + "epoch": 0.5612610594691455, + "grad_norm": 1.307369589805603, + "learning_rate": 5.0085000000000006e-05, + "loss": 0.523, + "step": 10023 + }, + { + "epoch": 0.5613170567812745, + "grad_norm": 1.2519283294677734, + "learning_rate": 5.009e-05, + "loss": 0.5827, + "step": 10024 + }, + { + "epoch": 0.5613730540934035, + "grad_norm": 1.2211045026779175, + "learning_rate": 5.0095e-05, + "loss": 0.5273, + "step": 10025 + }, + { + "epoch": 0.5614290514055326, + "grad_norm": 1.12700355052948, + 
"learning_rate": 5.0100000000000005e-05, + "loss": 0.3769, + "step": 10026 + }, + { + "epoch": 0.5614850487176616, + "grad_norm": 1.1931533813476562, + "learning_rate": 5.0105e-05, + "loss": 0.4218, + "step": 10027 + }, + { + "epoch": 0.5615410460297906, + "grad_norm": 1.2497025728225708, + "learning_rate": 5.011e-05, + "loss": 0.3069, + "step": 10028 + }, + { + "epoch": 0.5615970433419196, + "grad_norm": 1.37238609790802, + "learning_rate": 5.0115000000000004e-05, + "loss": 0.4876, + "step": 10029 + }, + { + "epoch": 0.5616530406540486, + "grad_norm": 1.170186996459961, + "learning_rate": 5.012e-05, + "loss": 0.3934, + "step": 10030 + }, + { + "epoch": 0.5617090379661777, + "grad_norm": 1.1139869689941406, + "learning_rate": 5.0125e-05, + "loss": 0.3468, + "step": 10031 + }, + { + "epoch": 0.5617650352783067, + "grad_norm": 1.4631116390228271, + "learning_rate": 5.0129999999999996e-05, + "loss": 0.6199, + "step": 10032 + }, + { + "epoch": 0.5618210325904357, + "grad_norm": 1.2753750085830688, + "learning_rate": 5.0135e-05, + "loss": 0.4501, + "step": 10033 + }, + { + "epoch": 0.5618770299025647, + "grad_norm": 1.632766604423523, + "learning_rate": 5.014e-05, + "loss": 0.4904, + "step": 10034 + }, + { + "epoch": 0.5619330272146937, + "grad_norm": 1.181941270828247, + "learning_rate": 5.0144999999999995e-05, + "loss": 0.4378, + "step": 10035 + }, + { + "epoch": 0.5619890245268228, + "grad_norm": 1.1032267808914185, + "learning_rate": 5.015e-05, + "loss": 0.3056, + "step": 10036 + }, + { + "epoch": 0.5620450218389518, + "grad_norm": 1.1124083995819092, + "learning_rate": 5.015500000000001e-05, + "loss": 0.4144, + "step": 10037 + }, + { + "epoch": 0.5621010191510808, + "grad_norm": 1.645929217338562, + "learning_rate": 5.016000000000001e-05, + "loss": 0.4506, + "step": 10038 + }, + { + "epoch": 0.5621570164632098, + "grad_norm": 1.1254245042800903, + "learning_rate": 5.0165000000000005e-05, + "loss": 0.4107, + "step": 10039 + }, + { + "epoch": 0.5622130137753388, + 
"grad_norm": 1.5527163743972778, + "learning_rate": 5.017000000000001e-05, + "loss": 0.5131, + "step": 10040 + }, + { + "epoch": 0.5622690110874679, + "grad_norm": 1.2213736772537231, + "learning_rate": 5.017500000000001e-05, + "loss": 0.3638, + "step": 10041 + }, + { + "epoch": 0.5623250083995969, + "grad_norm": 1.2467491626739502, + "learning_rate": 5.0180000000000004e-05, + "loss": 0.4783, + "step": 10042 + }, + { + "epoch": 0.5623810057117259, + "grad_norm": 1.4593853950500488, + "learning_rate": 5.0185e-05, + "loss": 0.525, + "step": 10043 + }, + { + "epoch": 0.5624370030238549, + "grad_norm": 1.1642142534255981, + "learning_rate": 5.0190000000000006e-05, + "loss": 0.4755, + "step": 10044 + }, + { + "epoch": 0.5624930003359839, + "grad_norm": 1.3840899467468262, + "learning_rate": 5.0195e-05, + "loss": 0.4413, + "step": 10045 + }, + { + "epoch": 0.5625489976481128, + "grad_norm": 1.1163687705993652, + "learning_rate": 5.02e-05, + "loss": 0.3448, + "step": 10046 + }, + { + "epoch": 0.5626049949602419, + "grad_norm": 1.3278331756591797, + "learning_rate": 5.0205000000000005e-05, + "loss": 0.397, + "step": 10047 + }, + { + "epoch": 0.5626609922723709, + "grad_norm": 1.5409855842590332, + "learning_rate": 5.021e-05, + "loss": 0.4767, + "step": 10048 + }, + { + "epoch": 0.5627169895844999, + "grad_norm": 1.2589179277420044, + "learning_rate": 5.0215e-05, + "loss": 0.486, + "step": 10049 + }, + { + "epoch": 0.5627729868966289, + "grad_norm": 1.3704774379730225, + "learning_rate": 5.0220000000000004e-05, + "loss": 0.514, + "step": 10050 + }, + { + "epoch": 0.5628289842087579, + "grad_norm": 1.185131311416626, + "learning_rate": 5.0225e-05, + "loss": 0.5717, + "step": 10051 + }, + { + "epoch": 0.562884981520887, + "grad_norm": 1.5050828456878662, + "learning_rate": 5.023e-05, + "loss": 0.4966, + "step": 10052 + }, + { + "epoch": 0.562940978833016, + "grad_norm": 1.078010082244873, + "learning_rate": 5.0234999999999996e-05, + "loss": 0.3227, + "step": 10053 + }, + { + 
"epoch": 0.562996976145145, + "grad_norm": 1.184657335281372, + "learning_rate": 5.024e-05, + "loss": 0.3177, + "step": 10054 + }, + { + "epoch": 0.563052973457274, + "grad_norm": 1.322494626045227, + "learning_rate": 5.0245e-05, + "loss": 0.4325, + "step": 10055 + }, + { + "epoch": 0.563108970769403, + "grad_norm": 1.3813997507095337, + "learning_rate": 5.0249999999999995e-05, + "loss": 0.586, + "step": 10056 + }, + { + "epoch": 0.563164968081532, + "grad_norm": 1.1439642906188965, + "learning_rate": 5.0255000000000006e-05, + "loss": 0.4306, + "step": 10057 + }, + { + "epoch": 0.5632209653936611, + "grad_norm": 1.2778465747833252, + "learning_rate": 5.026000000000001e-05, + "loss": 0.5442, + "step": 10058 + }, + { + "epoch": 0.5632769627057901, + "grad_norm": 1.2710590362548828, + "learning_rate": 5.026500000000001e-05, + "loss": 0.5092, + "step": 10059 + }, + { + "epoch": 0.5633329600179191, + "grad_norm": 1.255285382270813, + "learning_rate": 5.0270000000000005e-05, + "loss": 0.4325, + "step": 10060 + }, + { + "epoch": 0.5633889573300481, + "grad_norm": 1.280313491821289, + "learning_rate": 5.0275e-05, + "loss": 0.4029, + "step": 10061 + }, + { + "epoch": 0.5634449546421771, + "grad_norm": 1.267838478088379, + "learning_rate": 5.0280000000000006e-05, + "loss": 0.4354, + "step": 10062 + }, + { + "epoch": 0.5635009519543062, + "grad_norm": 1.5349767208099365, + "learning_rate": 5.0285000000000004e-05, + "loss": 0.3912, + "step": 10063 + }, + { + "epoch": 0.5635569492664352, + "grad_norm": 1.314571738243103, + "learning_rate": 5.029e-05, + "loss": 0.4128, + "step": 10064 + }, + { + "epoch": 0.5636129465785642, + "grad_norm": 1.4526143074035645, + "learning_rate": 5.0295000000000006e-05, + "loss": 0.51, + "step": 10065 + }, + { + "epoch": 0.5636689438906932, + "grad_norm": 1.4880921840667725, + "learning_rate": 5.03e-05, + "loss": 0.5507, + "step": 10066 + }, + { + "epoch": 0.5637249412028222, + "grad_norm": 1.6427276134490967, + "learning_rate": 5.0305e-05, + 
"loss": 0.5428, + "step": 10067 + }, + { + "epoch": 0.5637809385149513, + "grad_norm": 1.3415950536727905, + "learning_rate": 5.0310000000000005e-05, + "loss": 0.4392, + "step": 10068 + }, + { + "epoch": 0.5638369358270803, + "grad_norm": 1.6944310665130615, + "learning_rate": 5.0315e-05, + "loss": 0.2834, + "step": 10069 + }, + { + "epoch": 0.5638929331392093, + "grad_norm": 1.3977125883102417, + "learning_rate": 5.032e-05, + "loss": 0.4614, + "step": 10070 + }, + { + "epoch": 0.5639489304513383, + "grad_norm": 1.3059650659561157, + "learning_rate": 5.0325e-05, + "loss": 0.4491, + "step": 10071 + }, + { + "epoch": 0.5640049277634673, + "grad_norm": 1.402970790863037, + "learning_rate": 5.033e-05, + "loss": 0.622, + "step": 10072 + }, + { + "epoch": 0.5640609250755964, + "grad_norm": 1.445788860321045, + "learning_rate": 5.0335e-05, + "loss": 0.558, + "step": 10073 + }, + { + "epoch": 0.5641169223877254, + "grad_norm": 1.7032382488250732, + "learning_rate": 5.0339999999999996e-05, + "loss": 0.443, + "step": 10074 + }, + { + "epoch": 0.5641729196998544, + "grad_norm": 1.2501990795135498, + "learning_rate": 5.0345e-05, + "loss": 0.4593, + "step": 10075 + }, + { + "epoch": 0.5642289170119834, + "grad_norm": 1.5817010402679443, + "learning_rate": 5.035e-05, + "loss": 0.4985, + "step": 10076 + }, + { + "epoch": 0.5642849143241124, + "grad_norm": 1.1269781589508057, + "learning_rate": 5.035500000000001e-05, + "loss": 0.4437, + "step": 10077 + }, + { + "epoch": 0.5643409116362414, + "grad_norm": 1.16768479347229, + "learning_rate": 5.0360000000000006e-05, + "loss": 0.4073, + "step": 10078 + }, + { + "epoch": 0.5643969089483705, + "grad_norm": 1.4160252809524536, + "learning_rate": 5.036500000000001e-05, + "loss": 0.4502, + "step": 10079 + }, + { + "epoch": 0.5644529062604995, + "grad_norm": 1.240113615989685, + "learning_rate": 5.037000000000001e-05, + "loss": 0.4357, + "step": 10080 + }, + { + "epoch": 0.5645089035726285, + "grad_norm": 1.3157459497451782, + 
"learning_rate": 5.0375000000000005e-05, + "loss": 0.4175, + "step": 10081 + }, + { + "epoch": 0.5645649008847575, + "grad_norm": 1.5507278442382812, + "learning_rate": 5.038e-05, + "loss": 0.3846, + "step": 10082 + }, + { + "epoch": 0.5646208981968865, + "grad_norm": 1.2351633310317993, + "learning_rate": 5.0385000000000006e-05, + "loss": 0.4688, + "step": 10083 + }, + { + "epoch": 0.5646768955090156, + "grad_norm": 1.5217021703720093, + "learning_rate": 5.0390000000000004e-05, + "loss": 0.424, + "step": 10084 + }, + { + "epoch": 0.5647328928211446, + "grad_norm": 1.1799594163894653, + "learning_rate": 5.0395e-05, + "loss": 0.3173, + "step": 10085 + }, + { + "epoch": 0.5647888901332736, + "grad_norm": 1.218217372894287, + "learning_rate": 5.0400000000000005e-05, + "loss": 0.5611, + "step": 10086 + }, + { + "epoch": 0.5648448874454026, + "grad_norm": 1.0080746412277222, + "learning_rate": 5.0405e-05, + "loss": 0.3904, + "step": 10087 + }, + { + "epoch": 0.5649008847575316, + "grad_norm": 1.2529783248901367, + "learning_rate": 5.041e-05, + "loss": 0.3862, + "step": 10088 + }, + { + "epoch": 0.5649568820696607, + "grad_norm": 1.1760334968566895, + "learning_rate": 5.0415000000000004e-05, + "loss": 0.4188, + "step": 10089 + }, + { + "epoch": 0.5650128793817897, + "grad_norm": 1.174148440361023, + "learning_rate": 5.042e-05, + "loss": 0.456, + "step": 10090 + }, + { + "epoch": 0.5650688766939187, + "grad_norm": 2.8847005367279053, + "learning_rate": 5.0425e-05, + "loss": 0.4242, + "step": 10091 + }, + { + "epoch": 0.5651248740060477, + "grad_norm": 1.396600365638733, + "learning_rate": 5.0429999999999997e-05, + "loss": 0.3525, + "step": 10092 + }, + { + "epoch": 0.5651808713181767, + "grad_norm": 2.1680355072021484, + "learning_rate": 5.0435e-05, + "loss": 0.4377, + "step": 10093 + }, + { + "epoch": 0.5652368686303058, + "grad_norm": 1.2360775470733643, + "learning_rate": 5.044e-05, + "loss": 0.4383, + "step": 10094 + }, + { + "epoch": 0.5652928659424348, + 
"grad_norm": 1.3207876682281494, + "learning_rate": 5.0444999999999996e-05, + "loss": 0.4131, + "step": 10095 + }, + { + "epoch": 0.5653488632545638, + "grad_norm": 1.5699901580810547, + "learning_rate": 5.045e-05, + "loss": 0.3841, + "step": 10096 + }, + { + "epoch": 0.5654048605666928, + "grad_norm": 1.070851445198059, + "learning_rate": 5.045500000000001e-05, + "loss": 0.2873, + "step": 10097 + }, + { + "epoch": 0.5654608578788218, + "grad_norm": 1.6041005849838257, + "learning_rate": 5.046000000000001e-05, + "loss": 0.457, + "step": 10098 + }, + { + "epoch": 0.5655168551909509, + "grad_norm": 1.3358922004699707, + "learning_rate": 5.0465000000000006e-05, + "loss": 0.395, + "step": 10099 + }, + { + "epoch": 0.5655728525030799, + "grad_norm": 1.224945306777954, + "learning_rate": 5.047e-05, + "loss": 0.5468, + "step": 10100 + }, + { + "epoch": 0.5656288498152089, + "grad_norm": 1.5300770998001099, + "learning_rate": 5.047500000000001e-05, + "loss": 0.4921, + "step": 10101 + }, + { + "epoch": 0.5656848471273379, + "grad_norm": 1.1021238565444946, + "learning_rate": 5.0480000000000005e-05, + "loss": 0.456, + "step": 10102 + }, + { + "epoch": 0.5657408444394669, + "grad_norm": 1.3550925254821777, + "learning_rate": 5.0485e-05, + "loss": 0.5868, + "step": 10103 + }, + { + "epoch": 0.565796841751596, + "grad_norm": 1.0332305431365967, + "learning_rate": 5.0490000000000006e-05, + "loss": 0.3685, + "step": 10104 + }, + { + "epoch": 0.565852839063725, + "grad_norm": 1.2964366674423218, + "learning_rate": 5.0495000000000004e-05, + "loss": 0.621, + "step": 10105 + }, + { + "epoch": 0.565908836375854, + "grad_norm": 1.293464183807373, + "learning_rate": 5.05e-05, + "loss": 0.4659, + "step": 10106 + }, + { + "epoch": 0.565964833687983, + "grad_norm": 1.5895806550979614, + "learning_rate": 5.0505000000000005e-05, + "loss": 0.454, + "step": 10107 + }, + { + "epoch": 0.566020831000112, + "grad_norm": 1.4051517248153687, + "learning_rate": 5.051e-05, + "loss": 0.3849, + "step": 
10108 + }, + { + "epoch": 0.566076828312241, + "grad_norm": 1.385178565979004, + "learning_rate": 5.0515e-05, + "loss": 0.4506, + "step": 10109 + }, + { + "epoch": 0.5661328256243701, + "grad_norm": 1.2554985284805298, + "learning_rate": 5.052e-05, + "loss": 0.524, + "step": 10110 + }, + { + "epoch": 0.5661888229364991, + "grad_norm": 1.3433587551116943, + "learning_rate": 5.0525e-05, + "loss": 0.4234, + "step": 10111 + }, + { + "epoch": 0.5662448202486281, + "grad_norm": 1.5164456367492676, + "learning_rate": 5.053e-05, + "loss": 0.481, + "step": 10112 + }, + { + "epoch": 0.5663008175607571, + "grad_norm": 1.239671230316162, + "learning_rate": 5.0534999999999996e-05, + "loss": 0.4495, + "step": 10113 + }, + { + "epoch": 0.5663568148728861, + "grad_norm": 1.2597676515579224, + "learning_rate": 5.054e-05, + "loss": 0.4129, + "step": 10114 + }, + { + "epoch": 0.5664128121850152, + "grad_norm": 1.2329849004745483, + "learning_rate": 5.0545e-05, + "loss": 0.4358, + "step": 10115 + }, + { + "epoch": 0.5664688094971442, + "grad_norm": 1.415442943572998, + "learning_rate": 5.0549999999999995e-05, + "loss": 0.4814, + "step": 10116 + }, + { + "epoch": 0.5665248068092732, + "grad_norm": 1.3459875583648682, + "learning_rate": 5.0555000000000006e-05, + "loss": 0.4023, + "step": 10117 + }, + { + "epoch": 0.5665808041214022, + "grad_norm": 2.1763296127319336, + "learning_rate": 5.056000000000001e-05, + "loss": 0.447, + "step": 10118 + }, + { + "epoch": 0.5666368014335312, + "grad_norm": 1.1652395725250244, + "learning_rate": 5.056500000000001e-05, + "loss": 0.385, + "step": 10119 + }, + { + "epoch": 0.5666927987456603, + "grad_norm": 1.3457837104797363, + "learning_rate": 5.0570000000000005e-05, + "loss": 0.3948, + "step": 10120 + }, + { + "epoch": 0.5667487960577893, + "grad_norm": 1.684307336807251, + "learning_rate": 5.0575e-05, + "loss": 0.5563, + "step": 10121 + }, + { + "epoch": 0.5668047933699183, + "grad_norm": 1.1551640033721924, + "learning_rate": 
5.058000000000001e-05, + "loss": 0.528, + "step": 10122 + }, + { + "epoch": 0.5668607906820473, + "grad_norm": 1.3200267553329468, + "learning_rate": 5.0585000000000004e-05, + "loss": 0.4706, + "step": 10123 + }, + { + "epoch": 0.5669167879941763, + "grad_norm": 1.070975661277771, + "learning_rate": 5.059e-05, + "loss": 0.3755, + "step": 10124 + }, + { + "epoch": 0.5669727853063053, + "grad_norm": 1.593381404876709, + "learning_rate": 5.0595000000000006e-05, + "loss": 0.3762, + "step": 10125 + }, + { + "epoch": 0.5670287826184344, + "grad_norm": 1.2779661417007446, + "learning_rate": 5.0600000000000003e-05, + "loss": 0.4005, + "step": 10126 + }, + { + "epoch": 0.5670847799305634, + "grad_norm": 1.187401533126831, + "learning_rate": 5.0605e-05, + "loss": 0.4789, + "step": 10127 + }, + { + "epoch": 0.5671407772426923, + "grad_norm": 1.4982678890228271, + "learning_rate": 5.0610000000000005e-05, + "loss": 0.5009, + "step": 10128 + }, + { + "epoch": 0.5671967745548213, + "grad_norm": 1.1863610744476318, + "learning_rate": 5.0615e-05, + "loss": 0.4679, + "step": 10129 + }, + { + "epoch": 0.5672527718669503, + "grad_norm": 1.1557377576828003, + "learning_rate": 5.062e-05, + "loss": 0.4048, + "step": 10130 + }, + { + "epoch": 0.5673087691790794, + "grad_norm": 1.194615125656128, + "learning_rate": 5.0625e-05, + "loss": 0.4318, + "step": 10131 + }, + { + "epoch": 0.5673647664912084, + "grad_norm": 1.3305182456970215, + "learning_rate": 5.063e-05, + "loss": 0.483, + "step": 10132 + }, + { + "epoch": 0.5674207638033374, + "grad_norm": 1.4567062854766846, + "learning_rate": 5.0635e-05, + "loss": 0.4314, + "step": 10133 + }, + { + "epoch": 0.5674767611154664, + "grad_norm": 1.1840482950210571, + "learning_rate": 5.0639999999999996e-05, + "loss": 0.352, + "step": 10134 + }, + { + "epoch": 0.5675327584275954, + "grad_norm": 1.8856470584869385, + "learning_rate": 5.0645e-05, + "loss": 0.4333, + "step": 10135 + }, + { + "epoch": 0.5675887557397244, + "grad_norm": 
1.2895816564559937, + "learning_rate": 5.065e-05, + "loss": 0.4328, + "step": 10136 + }, + { + "epoch": 0.5676447530518535, + "grad_norm": 1.1949578523635864, + "learning_rate": 5.0654999999999995e-05, + "loss": 0.505, + "step": 10137 + }, + { + "epoch": 0.5677007503639825, + "grad_norm": 1.247236728668213, + "learning_rate": 5.0660000000000006e-05, + "loss": 0.4934, + "step": 10138 + }, + { + "epoch": 0.5677567476761115, + "grad_norm": 1.4603780508041382, + "learning_rate": 5.0665000000000004e-05, + "loss": 0.6489, + "step": 10139 + }, + { + "epoch": 0.5678127449882405, + "grad_norm": 1.3208463191986084, + "learning_rate": 5.067000000000001e-05, + "loss": 0.4261, + "step": 10140 + }, + { + "epoch": 0.5678687423003695, + "grad_norm": 1.293473482131958, + "learning_rate": 5.0675000000000005e-05, + "loss": 0.3721, + "step": 10141 + }, + { + "epoch": 0.5679247396124986, + "grad_norm": 1.4508566856384277, + "learning_rate": 5.068e-05, + "loss": 0.3791, + "step": 10142 + }, + { + "epoch": 0.5679807369246276, + "grad_norm": 1.4139388799667358, + "learning_rate": 5.068500000000001e-05, + "loss": 0.4025, + "step": 10143 + }, + { + "epoch": 0.5680367342367566, + "grad_norm": 1.1509840488433838, + "learning_rate": 5.0690000000000004e-05, + "loss": 0.3744, + "step": 10144 + }, + { + "epoch": 0.5680927315488856, + "grad_norm": 1.2031747102737427, + "learning_rate": 5.0695e-05, + "loss": 0.3816, + "step": 10145 + }, + { + "epoch": 0.5681487288610146, + "grad_norm": 1.9181218147277832, + "learning_rate": 5.0700000000000006e-05, + "loss": 0.4964, + "step": 10146 + }, + { + "epoch": 0.5682047261731437, + "grad_norm": 1.152400016784668, + "learning_rate": 5.0705e-05, + "loss": 0.3611, + "step": 10147 + }, + { + "epoch": 0.5682607234852727, + "grad_norm": 1.409926414489746, + "learning_rate": 5.071e-05, + "loss": 0.3901, + "step": 10148 + }, + { + "epoch": 0.5683167207974017, + "grad_norm": 1.3198037147521973, + "learning_rate": 5.0715e-05, + "loss": 0.4538, + "step": 10149 + }, + { 
+ "epoch": 0.5683727181095307, + "grad_norm": 1.2766097784042358, + "learning_rate": 5.072e-05, + "loss": 0.4074, + "step": 10150 + }, + { + "epoch": 0.5684287154216597, + "grad_norm": 1.1689236164093018, + "learning_rate": 5.0725e-05, + "loss": 0.3714, + "step": 10151 + }, + { + "epoch": 0.5684847127337888, + "grad_norm": 1.399337887763977, + "learning_rate": 5.073e-05, + "loss": 0.4977, + "step": 10152 + }, + { + "epoch": 0.5685407100459178, + "grad_norm": 1.2349555492401123, + "learning_rate": 5.0735e-05, + "loss": 0.5333, + "step": 10153 + }, + { + "epoch": 0.5685967073580468, + "grad_norm": 1.2456928491592407, + "learning_rate": 5.074e-05, + "loss": 0.4529, + "step": 10154 + }, + { + "epoch": 0.5686527046701758, + "grad_norm": 1.2561348676681519, + "learning_rate": 5.0744999999999996e-05, + "loss": 0.449, + "step": 10155 + }, + { + "epoch": 0.5687087019823048, + "grad_norm": 1.1609960794448853, + "learning_rate": 5.075e-05, + "loss": 0.3185, + "step": 10156 + }, + { + "epoch": 0.5687646992944339, + "grad_norm": 1.2437316179275513, + "learning_rate": 5.0755e-05, + "loss": 0.4063, + "step": 10157 + }, + { + "epoch": 0.5688206966065629, + "grad_norm": 1.2899188995361328, + "learning_rate": 5.076000000000001e-05, + "loss": 0.4687, + "step": 10158 + }, + { + "epoch": 0.5688766939186919, + "grad_norm": 1.1509284973144531, + "learning_rate": 5.0765000000000006e-05, + "loss": 0.3083, + "step": 10159 + }, + { + "epoch": 0.5689326912308209, + "grad_norm": 1.4670699834823608, + "learning_rate": 5.0770000000000003e-05, + "loss": 0.5398, + "step": 10160 + }, + { + "epoch": 0.5689886885429499, + "grad_norm": 1.5057661533355713, + "learning_rate": 5.077500000000001e-05, + "loss": 0.45, + "step": 10161 + }, + { + "epoch": 0.569044685855079, + "grad_norm": 1.4503322839736938, + "learning_rate": 5.0780000000000005e-05, + "loss": 0.5332, + "step": 10162 + }, + { + "epoch": 0.569100683167208, + "grad_norm": 1.3776382207870483, + "learning_rate": 5.0785e-05, + "loss": 0.5604, + 
"step": 10163 + }, + { + "epoch": 0.569156680479337, + "grad_norm": 1.2540619373321533, + "learning_rate": 5.079000000000001e-05, + "loss": 0.4003, + "step": 10164 + }, + { + "epoch": 0.569212677791466, + "grad_norm": 1.4435511827468872, + "learning_rate": 5.0795000000000004e-05, + "loss": 0.6547, + "step": 10165 + }, + { + "epoch": 0.569268675103595, + "grad_norm": 1.2559958696365356, + "learning_rate": 5.08e-05, + "loss": 0.4572, + "step": 10166 + }, + { + "epoch": 0.569324672415724, + "grad_norm": 1.4132412672042847, + "learning_rate": 5.0805000000000006e-05, + "loss": 0.5495, + "step": 10167 + }, + { + "epoch": 0.5693806697278531, + "grad_norm": 1.234095811843872, + "learning_rate": 5.081e-05, + "loss": 0.3255, + "step": 10168 + }, + { + "epoch": 0.5694366670399821, + "grad_norm": 1.3978824615478516, + "learning_rate": 5.0815e-05, + "loss": 0.4344, + "step": 10169 + }, + { + "epoch": 0.5694926643521111, + "grad_norm": 1.2753530740737915, + "learning_rate": 5.082e-05, + "loss": 0.3858, + "step": 10170 + }, + { + "epoch": 0.5695486616642401, + "grad_norm": 1.3017603158950806, + "learning_rate": 5.0825e-05, + "loss": 0.4449, + "step": 10171 + }, + { + "epoch": 0.5696046589763691, + "grad_norm": 1.1952513456344604, + "learning_rate": 5.083e-05, + "loss": 0.4657, + "step": 10172 + }, + { + "epoch": 0.5696606562884982, + "grad_norm": 1.2655614614486694, + "learning_rate": 5.0835e-05, + "loss": 0.3996, + "step": 10173 + }, + { + "epoch": 0.5697166536006272, + "grad_norm": 1.0660436153411865, + "learning_rate": 5.084e-05, + "loss": 0.3162, + "step": 10174 + }, + { + "epoch": 0.5697726509127562, + "grad_norm": 1.3102269172668457, + "learning_rate": 5.0845e-05, + "loss": 0.4801, + "step": 10175 + }, + { + "epoch": 0.5698286482248852, + "grad_norm": 1.2505639791488647, + "learning_rate": 5.0849999999999996e-05, + "loss": 0.4468, + "step": 10176 + }, + { + "epoch": 0.5698846455370142, + "grad_norm": 1.0204392671585083, + "learning_rate": 5.0855e-05, + "loss": 0.3442, + 
"step": 10177 + }, + { + "epoch": 0.5699406428491433, + "grad_norm": 1.2225295305252075, + "learning_rate": 5.0860000000000004e-05, + "loss": 0.4091, + "step": 10178 + }, + { + "epoch": 0.5699966401612723, + "grad_norm": 1.420210361480713, + "learning_rate": 5.086500000000001e-05, + "loss": 0.5708, + "step": 10179 + }, + { + "epoch": 0.5700526374734013, + "grad_norm": 1.2864094972610474, + "learning_rate": 5.0870000000000006e-05, + "loss": 0.4386, + "step": 10180 + }, + { + "epoch": 0.5701086347855303, + "grad_norm": 1.4768576622009277, + "learning_rate": 5.0875e-05, + "loss": 0.3914, + "step": 10181 + }, + { + "epoch": 0.5701646320976593, + "grad_norm": 1.3639792203903198, + "learning_rate": 5.088000000000001e-05, + "loss": 0.4886, + "step": 10182 + }, + { + "epoch": 0.5702206294097883, + "grad_norm": 1.3538386821746826, + "learning_rate": 5.0885000000000005e-05, + "loss": 0.3957, + "step": 10183 + }, + { + "epoch": 0.5702766267219174, + "grad_norm": 1.321562647819519, + "learning_rate": 5.089e-05, + "loss": 0.471, + "step": 10184 + }, + { + "epoch": 0.5703326240340464, + "grad_norm": 1.2572190761566162, + "learning_rate": 5.0895000000000007e-05, + "loss": 0.4959, + "step": 10185 + }, + { + "epoch": 0.5703886213461754, + "grad_norm": 1.2916969060897827, + "learning_rate": 5.0900000000000004e-05, + "loss": 0.4295, + "step": 10186 + }, + { + "epoch": 0.5704446186583044, + "grad_norm": 1.5530734062194824, + "learning_rate": 5.0905e-05, + "loss": 0.5665, + "step": 10187 + }, + { + "epoch": 0.5705006159704334, + "grad_norm": 1.0694494247436523, + "learning_rate": 5.091e-05, + "loss": 0.3661, + "step": 10188 + }, + { + "epoch": 0.5705566132825625, + "grad_norm": 1.4759570360183716, + "learning_rate": 5.0915e-05, + "loss": 0.4288, + "step": 10189 + }, + { + "epoch": 0.5706126105946915, + "grad_norm": 1.5343525409698486, + "learning_rate": 5.092e-05, + "loss": 0.4077, + "step": 10190 + }, + { + "epoch": 0.5706686079068205, + "grad_norm": 1.1712658405303955, + 
"learning_rate": 5.0925e-05, + "loss": 0.3963, + "step": 10191 + }, + { + "epoch": 0.5707246052189495, + "grad_norm": 1.7039088010787964, + "learning_rate": 5.093e-05, + "loss": 0.4555, + "step": 10192 + }, + { + "epoch": 0.5707806025310785, + "grad_norm": 1.1426856517791748, + "learning_rate": 5.0935e-05, + "loss": 0.3363, + "step": 10193 + }, + { + "epoch": 0.5708365998432076, + "grad_norm": 1.233491063117981, + "learning_rate": 5.094e-05, + "loss": 0.437, + "step": 10194 + }, + { + "epoch": 0.5708925971553366, + "grad_norm": 1.1578457355499268, + "learning_rate": 5.0945e-05, + "loss": 0.4331, + "step": 10195 + }, + { + "epoch": 0.5709485944674656, + "grad_norm": 1.2121589183807373, + "learning_rate": 5.095e-05, + "loss": 0.4047, + "step": 10196 + }, + { + "epoch": 0.5710045917795946, + "grad_norm": 1.410102128982544, + "learning_rate": 5.0954999999999996e-05, + "loss": 0.4156, + "step": 10197 + }, + { + "epoch": 0.5710605890917236, + "grad_norm": 1.3294005393981934, + "learning_rate": 5.096000000000001e-05, + "loss": 0.4113, + "step": 10198 + }, + { + "epoch": 0.5711165864038527, + "grad_norm": 1.3565648794174194, + "learning_rate": 5.0965000000000004e-05, + "loss": 0.4551, + "step": 10199 + }, + { + "epoch": 0.5711725837159817, + "grad_norm": 1.9718737602233887, + "learning_rate": 5.097000000000001e-05, + "loss": 0.5973, + "step": 10200 + }, + { + "epoch": 0.5712285810281107, + "grad_norm": 1.5698515176773071, + "learning_rate": 5.0975000000000006e-05, + "loss": 0.4464, + "step": 10201 + }, + { + "epoch": 0.5712845783402397, + "grad_norm": 1.5377118587493896, + "learning_rate": 5.098e-05, + "loss": 0.4791, + "step": 10202 + }, + { + "epoch": 0.5713405756523687, + "grad_norm": 1.1959205865859985, + "learning_rate": 5.098500000000001e-05, + "loss": 0.4156, + "step": 10203 + }, + { + "epoch": 0.5713965729644978, + "grad_norm": 1.2330917119979858, + "learning_rate": 5.0990000000000005e-05, + "loss": 0.4184, + "step": 10204 + }, + { + "epoch": 0.5714525702766268, + 
"grad_norm": 1.1605263948440552, + "learning_rate": 5.0995e-05, + "loss": 0.3128, + "step": 10205 + }, + { + "epoch": 0.5715085675887558, + "grad_norm": 1.1472564935684204, + "learning_rate": 5.1000000000000006e-05, + "loss": 0.3788, + "step": 10206 + }, + { + "epoch": 0.5715645649008848, + "grad_norm": 1.4026507139205933, + "learning_rate": 5.1005000000000004e-05, + "loss": 0.4531, + "step": 10207 + }, + { + "epoch": 0.5716205622130138, + "grad_norm": 1.314774513244629, + "learning_rate": 5.101e-05, + "loss": 0.3528, + "step": 10208 + }, + { + "epoch": 0.5716765595251428, + "grad_norm": 1.4226630926132202, + "learning_rate": 5.1015e-05, + "loss": 0.4123, + "step": 10209 + }, + { + "epoch": 0.5717325568372719, + "grad_norm": 1.463738203048706, + "learning_rate": 5.102e-05, + "loss": 0.6458, + "step": 10210 + }, + { + "epoch": 0.5717885541494008, + "grad_norm": 1.4039021730422974, + "learning_rate": 5.1025e-05, + "loss": 0.3781, + "step": 10211 + }, + { + "epoch": 0.5718445514615298, + "grad_norm": 1.6951605081558228, + "learning_rate": 5.103e-05, + "loss": 0.5027, + "step": 10212 + }, + { + "epoch": 0.5719005487736588, + "grad_norm": 1.3380426168441772, + "learning_rate": 5.1035e-05, + "loss": 0.5473, + "step": 10213 + }, + { + "epoch": 0.5719565460857878, + "grad_norm": 1.4729443788528442, + "learning_rate": 5.104e-05, + "loss": 0.3846, + "step": 10214 + }, + { + "epoch": 0.5720125433979169, + "grad_norm": 1.4423785209655762, + "learning_rate": 5.1045e-05, + "loss": 0.7526, + "step": 10215 + }, + { + "epoch": 0.5720685407100459, + "grad_norm": 1.164225459098816, + "learning_rate": 5.105e-05, + "loss": 0.328, + "step": 10216 + }, + { + "epoch": 0.5721245380221749, + "grad_norm": 1.191494107246399, + "learning_rate": 5.1055e-05, + "loss": 0.3963, + "step": 10217 + }, + { + "epoch": 0.5721805353343039, + "grad_norm": 1.256538987159729, + "learning_rate": 5.106000000000001e-05, + "loss": 0.4338, + "step": 10218 + }, + { + "epoch": 0.5722365326464329, + "grad_norm": 
1.5546156167984009, + "learning_rate": 5.1065000000000007e-05, + "loss": 0.4119, + "step": 10219 + }, + { + "epoch": 0.572292529958562, + "grad_norm": 1.3575255870819092, + "learning_rate": 5.1070000000000004e-05, + "loss": 0.3489, + "step": 10220 + }, + { + "epoch": 0.572348527270691, + "grad_norm": 1.6723190546035767, + "learning_rate": 5.107500000000001e-05, + "loss": 0.3096, + "step": 10221 + }, + { + "epoch": 0.57240452458282, + "grad_norm": 1.1107463836669922, + "learning_rate": 5.1080000000000006e-05, + "loss": 0.3975, + "step": 10222 + }, + { + "epoch": 0.572460521894949, + "grad_norm": 1.8103303909301758, + "learning_rate": 5.1085e-05, + "loss": 0.36, + "step": 10223 + }, + { + "epoch": 0.572516519207078, + "grad_norm": 2.5603103637695312, + "learning_rate": 5.109000000000001e-05, + "loss": 0.4458, + "step": 10224 + }, + { + "epoch": 0.572572516519207, + "grad_norm": 1.2526216506958008, + "learning_rate": 5.1095000000000005e-05, + "loss": 0.3847, + "step": 10225 + }, + { + "epoch": 0.5726285138313361, + "grad_norm": 1.8950400352478027, + "learning_rate": 5.11e-05, + "loss": 0.5301, + "step": 10226 + }, + { + "epoch": 0.5726845111434651, + "grad_norm": 1.1724538803100586, + "learning_rate": 5.1105000000000006e-05, + "loss": 0.4366, + "step": 10227 + }, + { + "epoch": 0.5727405084555941, + "grad_norm": 1.6718838214874268, + "learning_rate": 5.1110000000000004e-05, + "loss": 0.4654, + "step": 10228 + }, + { + "epoch": 0.5727965057677231, + "grad_norm": 1.3077808618545532, + "learning_rate": 5.1115e-05, + "loss": 0.4893, + "step": 10229 + }, + { + "epoch": 0.5728525030798521, + "grad_norm": 1.3792833089828491, + "learning_rate": 5.112e-05, + "loss": 0.4925, + "step": 10230 + }, + { + "epoch": 0.5729085003919812, + "grad_norm": 1.2939683198928833, + "learning_rate": 5.1125e-05, + "loss": 0.4321, + "step": 10231 + }, + { + "epoch": 0.5729644977041102, + "grad_norm": 1.9967490434646606, + "learning_rate": 5.113e-05, + "loss": 0.3745, + "step": 10232 + }, + { + 
"epoch": 0.5730204950162392, + "grad_norm": 1.1713602542877197, + "learning_rate": 5.1135e-05, + "loss": 0.4535, + "step": 10233 + }, + { + "epoch": 0.5730764923283682, + "grad_norm": 1.2153222560882568, + "learning_rate": 5.114e-05, + "loss": 0.3062, + "step": 10234 + }, + { + "epoch": 0.5731324896404972, + "grad_norm": 1.45026695728302, + "learning_rate": 5.1145e-05, + "loss": 0.4027, + "step": 10235 + }, + { + "epoch": 0.5731884869526263, + "grad_norm": 1.146943211555481, + "learning_rate": 5.1149999999999996e-05, + "loss": 0.3594, + "step": 10236 + }, + { + "epoch": 0.5732444842647553, + "grad_norm": 1.2367851734161377, + "learning_rate": 5.1155e-05, + "loss": 0.4114, + "step": 10237 + }, + { + "epoch": 0.5733004815768843, + "grad_norm": 1.1932320594787598, + "learning_rate": 5.1160000000000005e-05, + "loss": 0.4816, + "step": 10238 + }, + { + "epoch": 0.5733564788890133, + "grad_norm": 1.2560538053512573, + "learning_rate": 5.116500000000001e-05, + "loss": 0.4513, + "step": 10239 + }, + { + "epoch": 0.5734124762011423, + "grad_norm": 1.418629765510559, + "learning_rate": 5.1170000000000006e-05, + "loss": 0.4694, + "step": 10240 + }, + { + "epoch": 0.5734684735132713, + "grad_norm": 1.2285903692245483, + "learning_rate": 5.1175000000000004e-05, + "loss": 0.4627, + "step": 10241 + }, + { + "epoch": 0.5735244708254004, + "grad_norm": 1.182493805885315, + "learning_rate": 5.118000000000001e-05, + "loss": 0.4864, + "step": 10242 + }, + { + "epoch": 0.5735804681375294, + "grad_norm": 1.2190669775009155, + "learning_rate": 5.1185000000000005e-05, + "loss": 0.3992, + "step": 10243 + }, + { + "epoch": 0.5736364654496584, + "grad_norm": 1.2626402378082275, + "learning_rate": 5.119e-05, + "loss": 0.6578, + "step": 10244 + }, + { + "epoch": 0.5736924627617874, + "grad_norm": 1.4835706949234009, + "learning_rate": 5.119500000000001e-05, + "loss": 0.3841, + "step": 10245 + }, + { + "epoch": 0.5737484600739164, + "grad_norm": 1.3277225494384766, + "learning_rate": 
5.1200000000000004e-05, + "loss": 0.4006, + "step": 10246 + }, + { + "epoch": 0.5738044573860455, + "grad_norm": 1.3944958448410034, + "learning_rate": 5.1205e-05, + "loss": 0.56, + "step": 10247 + }, + { + "epoch": 0.5738604546981745, + "grad_norm": 1.4049177169799805, + "learning_rate": 5.121e-05, + "loss": 0.7031, + "step": 10248 + }, + { + "epoch": 0.5739164520103035, + "grad_norm": 1.1918838024139404, + "learning_rate": 5.1215000000000003e-05, + "loss": 0.4446, + "step": 10249 + }, + { + "epoch": 0.5739724493224325, + "grad_norm": 1.4402596950531006, + "learning_rate": 5.122e-05, + "loss": 0.5317, + "step": 10250 + }, + { + "epoch": 0.5740284466345615, + "grad_norm": 1.4664949178695679, + "learning_rate": 5.1225e-05, + "loss": 0.4501, + "step": 10251 + }, + { + "epoch": 0.5740844439466906, + "grad_norm": 1.4004675149917603, + "learning_rate": 5.123e-05, + "loss": 0.4477, + "step": 10252 + }, + { + "epoch": 0.5741404412588196, + "grad_norm": 1.4063783884048462, + "learning_rate": 5.1235e-05, + "loss": 0.4045, + "step": 10253 + }, + { + "epoch": 0.5741964385709486, + "grad_norm": 1.291314959526062, + "learning_rate": 5.124e-05, + "loss": 0.3632, + "step": 10254 + }, + { + "epoch": 0.5742524358830776, + "grad_norm": 1.3557640314102173, + "learning_rate": 5.1245e-05, + "loss": 0.5423, + "step": 10255 + }, + { + "epoch": 0.5743084331952066, + "grad_norm": 1.3525190353393555, + "learning_rate": 5.125e-05, + "loss": 0.3807, + "step": 10256 + }, + { + "epoch": 0.5743644305073357, + "grad_norm": 1.233000636100769, + "learning_rate": 5.1254999999999996e-05, + "loss": 0.3606, + "step": 10257 + }, + { + "epoch": 0.5744204278194647, + "grad_norm": 2.2913849353790283, + "learning_rate": 5.1259999999999994e-05, + "loss": 0.5721, + "step": 10258 + }, + { + "epoch": 0.5744764251315937, + "grad_norm": 1.2874081134796143, + "learning_rate": 5.1265000000000005e-05, + "loss": 0.4782, + "step": 10259 + }, + { + "epoch": 0.5745324224437227, + "grad_norm": 1.4316374063491821, + 
"learning_rate": 5.127000000000001e-05, + "loss": 0.4022, + "step": 10260 + }, + { + "epoch": 0.5745884197558517, + "grad_norm": 1.219549298286438, + "learning_rate": 5.1275000000000006e-05, + "loss": 0.5247, + "step": 10261 + }, + { + "epoch": 0.5746444170679808, + "grad_norm": 1.2923873662948608, + "learning_rate": 5.1280000000000004e-05, + "loss": 0.3801, + "step": 10262 + }, + { + "epoch": 0.5747004143801098, + "grad_norm": 1.3383703231811523, + "learning_rate": 5.128500000000001e-05, + "loss": 0.5152, + "step": 10263 + }, + { + "epoch": 0.5747564116922388, + "grad_norm": 1.1318565607070923, + "learning_rate": 5.1290000000000005e-05, + "loss": 0.4195, + "step": 10264 + }, + { + "epoch": 0.5748124090043678, + "grad_norm": 1.4272615909576416, + "learning_rate": 5.1295e-05, + "loss": 0.45, + "step": 10265 + }, + { + "epoch": 0.5748684063164968, + "grad_norm": 1.6429742574691772, + "learning_rate": 5.130000000000001e-05, + "loss": 0.5195, + "step": 10266 + }, + { + "epoch": 0.5749244036286258, + "grad_norm": 1.1789437532424927, + "learning_rate": 5.1305000000000004e-05, + "loss": 0.4308, + "step": 10267 + }, + { + "epoch": 0.5749804009407549, + "grad_norm": 1.2067891359329224, + "learning_rate": 5.131e-05, + "loss": 0.4, + "step": 10268 + }, + { + "epoch": 0.5750363982528839, + "grad_norm": 1.212969183921814, + "learning_rate": 5.1315e-05, + "loss": 0.5542, + "step": 10269 + }, + { + "epoch": 0.5750923955650129, + "grad_norm": 1.1853617429733276, + "learning_rate": 5.132e-05, + "loss": 0.4596, + "step": 10270 + }, + { + "epoch": 0.5751483928771419, + "grad_norm": 1.5046513080596924, + "learning_rate": 5.1325e-05, + "loss": 0.459, + "step": 10271 + }, + { + "epoch": 0.5752043901892709, + "grad_norm": 1.5734779834747314, + "learning_rate": 5.133e-05, + "loss": 0.5446, + "step": 10272 + }, + { + "epoch": 0.5752603875014, + "grad_norm": 1.4467272758483887, + "learning_rate": 5.1335e-05, + "loss": 0.517, + "step": 10273 + }, + { + "epoch": 0.575316384813529, + 
"grad_norm": 1.5649198293685913, + "learning_rate": 5.134e-05, + "loss": 0.547, + "step": 10274 + }, + { + "epoch": 0.575372382125658, + "grad_norm": 1.5195143222808838, + "learning_rate": 5.1345e-05, + "loss": 0.5048, + "step": 10275 + }, + { + "epoch": 0.575428379437787, + "grad_norm": 1.2118948698043823, + "learning_rate": 5.135e-05, + "loss": 0.4739, + "step": 10276 + }, + { + "epoch": 0.575484376749916, + "grad_norm": 1.3803354501724243, + "learning_rate": 5.1355e-05, + "loss": 0.4965, + "step": 10277 + }, + { + "epoch": 0.5755403740620451, + "grad_norm": 1.300803303718567, + "learning_rate": 5.1359999999999996e-05, + "loss": 0.4283, + "step": 10278 + }, + { + "epoch": 0.5755963713741741, + "grad_norm": 1.058893084526062, + "learning_rate": 5.136500000000001e-05, + "loss": 0.322, + "step": 10279 + }, + { + "epoch": 0.5756523686863031, + "grad_norm": 1.2869795560836792, + "learning_rate": 5.1370000000000005e-05, + "loss": 0.3932, + "step": 10280 + }, + { + "epoch": 0.5757083659984321, + "grad_norm": 1.677501916885376, + "learning_rate": 5.137500000000001e-05, + "loss": 0.4229, + "step": 10281 + }, + { + "epoch": 0.5757643633105611, + "grad_norm": 1.824832558631897, + "learning_rate": 5.1380000000000006e-05, + "loss": 0.4507, + "step": 10282 + }, + { + "epoch": 0.5758203606226902, + "grad_norm": 1.1936434507369995, + "learning_rate": 5.1385000000000004e-05, + "loss": 0.3963, + "step": 10283 + }, + { + "epoch": 0.5758763579348192, + "grad_norm": 1.6029094457626343, + "learning_rate": 5.139000000000001e-05, + "loss": 0.5803, + "step": 10284 + }, + { + "epoch": 0.5759323552469482, + "grad_norm": 1.257612943649292, + "learning_rate": 5.1395000000000005e-05, + "loss": 0.3776, + "step": 10285 + }, + { + "epoch": 0.5759883525590772, + "grad_norm": 1.2295781373977661, + "learning_rate": 5.14e-05, + "loss": 0.3462, + "step": 10286 + }, + { + "epoch": 0.5760443498712062, + "grad_norm": 1.4837368726730347, + "learning_rate": 5.1405e-05, + "loss": 0.3868, + "step": 10287 + 
}, + { + "epoch": 0.5761003471833352, + "grad_norm": 1.1890358924865723, + "learning_rate": 5.1410000000000004e-05, + "loss": 0.5082, + "step": 10288 + }, + { + "epoch": 0.5761563444954643, + "grad_norm": 1.2450222969055176, + "learning_rate": 5.1415e-05, + "loss": 0.4077, + "step": 10289 + }, + { + "epoch": 0.5762123418075933, + "grad_norm": 1.1931641101837158, + "learning_rate": 5.142e-05, + "loss": 0.3834, + "step": 10290 + }, + { + "epoch": 0.5762683391197223, + "grad_norm": 1.5010100603103638, + "learning_rate": 5.1425e-05, + "loss": 0.4207, + "step": 10291 + }, + { + "epoch": 0.5763243364318513, + "grad_norm": 1.242691159248352, + "learning_rate": 5.143e-05, + "loss": 0.2911, + "step": 10292 + }, + { + "epoch": 0.5763803337439803, + "grad_norm": 1.0755046606063843, + "learning_rate": 5.1435e-05, + "loss": 0.398, + "step": 10293 + }, + { + "epoch": 0.5764363310561093, + "grad_norm": 1.1281802654266357, + "learning_rate": 5.144e-05, + "loss": 0.4705, + "step": 10294 + }, + { + "epoch": 0.5764923283682383, + "grad_norm": 1.2325894832611084, + "learning_rate": 5.1445e-05, + "loss": 0.401, + "step": 10295 + }, + { + "epoch": 0.5765483256803673, + "grad_norm": 1.1909339427947998, + "learning_rate": 5.145e-05, + "loss": 0.3308, + "step": 10296 + }, + { + "epoch": 0.5766043229924963, + "grad_norm": 1.4792735576629639, + "learning_rate": 5.1454999999999994e-05, + "loss": 0.4901, + "step": 10297 + }, + { + "epoch": 0.5766603203046253, + "grad_norm": 1.283048152923584, + "learning_rate": 5.146e-05, + "loss": 0.3903, + "step": 10298 + }, + { + "epoch": 0.5767163176167543, + "grad_norm": 1.468291997909546, + "learning_rate": 5.146500000000001e-05, + "loss": 0.4658, + "step": 10299 + }, + { + "epoch": 0.5767723149288834, + "grad_norm": 1.227444052696228, + "learning_rate": 5.147000000000001e-05, + "loss": 0.3662, + "step": 10300 + }, + { + "epoch": 0.5768283122410124, + "grad_norm": 1.3208351135253906, + "learning_rate": 5.1475000000000004e-05, + "loss": 0.3798, + "step": 
10301 + }, + { + "epoch": 0.5768843095531414, + "grad_norm": 1.2385649681091309, + "learning_rate": 5.148000000000001e-05, + "loss": 0.3872, + "step": 10302 + }, + { + "epoch": 0.5769403068652704, + "grad_norm": 1.1699784994125366, + "learning_rate": 5.1485000000000006e-05, + "loss": 0.3847, + "step": 10303 + }, + { + "epoch": 0.5769963041773994, + "grad_norm": 1.215610146522522, + "learning_rate": 5.149e-05, + "loss": 0.4676, + "step": 10304 + }, + { + "epoch": 0.5770523014895285, + "grad_norm": 1.3170974254608154, + "learning_rate": 5.149500000000001e-05, + "loss": 0.5343, + "step": 10305 + }, + { + "epoch": 0.5771082988016575, + "grad_norm": 1.1466550827026367, + "learning_rate": 5.1500000000000005e-05, + "loss": 0.4225, + "step": 10306 + }, + { + "epoch": 0.5771642961137865, + "grad_norm": 1.6524522304534912, + "learning_rate": 5.1505e-05, + "loss": 0.52, + "step": 10307 + }, + { + "epoch": 0.5772202934259155, + "grad_norm": 1.1020444631576538, + "learning_rate": 5.151e-05, + "loss": 0.3842, + "step": 10308 + }, + { + "epoch": 0.5772762907380445, + "grad_norm": 1.213584065437317, + "learning_rate": 5.1515000000000004e-05, + "loss": 0.4118, + "step": 10309 + }, + { + "epoch": 0.5773322880501736, + "grad_norm": 1.2105871438980103, + "learning_rate": 5.152e-05, + "loss": 0.4503, + "step": 10310 + }, + { + "epoch": 0.5773882853623026, + "grad_norm": 1.268365740776062, + "learning_rate": 5.1525e-05, + "loss": 0.3572, + "step": 10311 + }, + { + "epoch": 0.5774442826744316, + "grad_norm": 1.1743181943893433, + "learning_rate": 5.153e-05, + "loss": 0.3623, + "step": 10312 + }, + { + "epoch": 0.5775002799865606, + "grad_norm": 1.4468861818313599, + "learning_rate": 5.1535e-05, + "loss": 0.6882, + "step": 10313 + }, + { + "epoch": 0.5775562772986896, + "grad_norm": 1.0792425870895386, + "learning_rate": 5.154e-05, + "loss": 0.4079, + "step": 10314 + }, + { + "epoch": 0.5776122746108187, + "grad_norm": 1.282312035560608, + "learning_rate": 5.1545e-05, + "loss": 0.3473, + 
"step": 10315 + }, + { + "epoch": 0.5776682719229477, + "grad_norm": 1.4069294929504395, + "learning_rate": 5.155e-05, + "loss": 0.4258, + "step": 10316 + }, + { + "epoch": 0.5777242692350767, + "grad_norm": 1.3479154109954834, + "learning_rate": 5.1555e-05, + "loss": 0.4756, + "step": 10317 + }, + { + "epoch": 0.5777802665472057, + "grad_norm": 1.4789682626724243, + "learning_rate": 5.1559999999999994e-05, + "loss": 0.5154, + "step": 10318 + }, + { + "epoch": 0.5778362638593347, + "grad_norm": 1.3756661415100098, + "learning_rate": 5.1565000000000005e-05, + "loss": 0.5582, + "step": 10319 + }, + { + "epoch": 0.5778922611714638, + "grad_norm": 1.2096366882324219, + "learning_rate": 5.157000000000001e-05, + "loss": 0.4084, + "step": 10320 + }, + { + "epoch": 0.5779482584835928, + "grad_norm": 1.2413229942321777, + "learning_rate": 5.157500000000001e-05, + "loss": 0.3693, + "step": 10321 + }, + { + "epoch": 0.5780042557957218, + "grad_norm": 1.2954576015472412, + "learning_rate": 5.1580000000000004e-05, + "loss": 0.4466, + "step": 10322 + }, + { + "epoch": 0.5780602531078508, + "grad_norm": 1.0908390283584595, + "learning_rate": 5.158500000000001e-05, + "loss": 0.4197, + "step": 10323 + }, + { + "epoch": 0.5781162504199798, + "grad_norm": 1.415282130241394, + "learning_rate": 5.1590000000000006e-05, + "loss": 0.429, + "step": 10324 + }, + { + "epoch": 0.5781722477321088, + "grad_norm": 1.2666020393371582, + "learning_rate": 5.1595e-05, + "loss": 0.503, + "step": 10325 + }, + { + "epoch": 0.5782282450442379, + "grad_norm": 1.303097128868103, + "learning_rate": 5.16e-05, + "loss": 0.398, + "step": 10326 + }, + { + "epoch": 0.5782842423563669, + "grad_norm": 1.124499797821045, + "learning_rate": 5.1605000000000005e-05, + "loss": 0.324, + "step": 10327 + }, + { + "epoch": 0.5783402396684959, + "grad_norm": 1.4660717248916626, + "learning_rate": 5.161e-05, + "loss": 0.3562, + "step": 10328 + }, + { + "epoch": 0.5783962369806249, + "grad_norm": 1.2222203016281128, + 
"learning_rate": 5.1615e-05, + "loss": 0.3489, + "step": 10329 + }, + { + "epoch": 0.5784522342927539, + "grad_norm": 2.0427260398864746, + "learning_rate": 5.1620000000000004e-05, + "loss": 0.339, + "step": 10330 + }, + { + "epoch": 0.578508231604883, + "grad_norm": 1.3806908130645752, + "learning_rate": 5.1625e-05, + "loss": 0.4112, + "step": 10331 + }, + { + "epoch": 0.578564228917012, + "grad_norm": 2.235337495803833, + "learning_rate": 5.163e-05, + "loss": 0.4836, + "step": 10332 + }, + { + "epoch": 0.578620226229141, + "grad_norm": 1.1346064805984497, + "learning_rate": 5.1635e-05, + "loss": 0.3712, + "step": 10333 + }, + { + "epoch": 0.57867622354127, + "grad_norm": 1.5988128185272217, + "learning_rate": 5.164e-05, + "loss": 0.4664, + "step": 10334 + }, + { + "epoch": 0.578732220853399, + "grad_norm": 1.2711799144744873, + "learning_rate": 5.1645e-05, + "loss": 0.3467, + "step": 10335 + }, + { + "epoch": 0.5787882181655281, + "grad_norm": 1.3669320344924927, + "learning_rate": 5.1649999999999995e-05, + "loss": 0.4762, + "step": 10336 + }, + { + "epoch": 0.5788442154776571, + "grad_norm": 1.696080207824707, + "learning_rate": 5.1655e-05, + "loss": 0.5719, + "step": 10337 + }, + { + "epoch": 0.5789002127897861, + "grad_norm": 1.1864807605743408, + "learning_rate": 5.166e-05, + "loss": 0.7109, + "step": 10338 + }, + { + "epoch": 0.5789562101019151, + "grad_norm": 1.3186880350112915, + "learning_rate": 5.166500000000001e-05, + "loss": 0.3441, + "step": 10339 + }, + { + "epoch": 0.5790122074140441, + "grad_norm": 1.121558666229248, + "learning_rate": 5.1670000000000005e-05, + "loss": 0.3171, + "step": 10340 + }, + { + "epoch": 0.5790682047261732, + "grad_norm": 1.4229801893234253, + "learning_rate": 5.167500000000001e-05, + "loss": 0.6164, + "step": 10341 + }, + { + "epoch": 0.5791242020383022, + "grad_norm": 1.4089537858963013, + "learning_rate": 5.168000000000001e-05, + "loss": 0.5846, + "step": 10342 + }, + { + "epoch": 0.5791801993504312, + "grad_norm": 
1.085536003112793, + "learning_rate": 5.1685000000000004e-05, + "loss": 0.3138, + "step": 10343 + }, + { + "epoch": 0.5792361966625602, + "grad_norm": 1.33896803855896, + "learning_rate": 5.169000000000001e-05, + "loss": 0.4165, + "step": 10344 + }, + { + "epoch": 0.5792921939746892, + "grad_norm": 1.3204853534698486, + "learning_rate": 5.1695000000000006e-05, + "loss": 0.4337, + "step": 10345 + }, + { + "epoch": 0.5793481912868182, + "grad_norm": 1.2813446521759033, + "learning_rate": 5.17e-05, + "loss": 0.4423, + "step": 10346 + }, + { + "epoch": 0.5794041885989473, + "grad_norm": 1.2630864381790161, + "learning_rate": 5.1705e-05, + "loss": 0.3796, + "step": 10347 + }, + { + "epoch": 0.5794601859110763, + "grad_norm": 1.107636570930481, + "learning_rate": 5.1710000000000005e-05, + "loss": 0.343, + "step": 10348 + }, + { + "epoch": 0.5795161832232053, + "grad_norm": 1.1181697845458984, + "learning_rate": 5.1715e-05, + "loss": 0.4129, + "step": 10349 + }, + { + "epoch": 0.5795721805353343, + "grad_norm": 1.3029804229736328, + "learning_rate": 5.172e-05, + "loss": 0.4137, + "step": 10350 + }, + { + "epoch": 0.5796281778474633, + "grad_norm": 1.280713677406311, + "learning_rate": 5.1725000000000004e-05, + "loss": 0.4269, + "step": 10351 + }, + { + "epoch": 0.5796841751595924, + "grad_norm": 1.6649599075317383, + "learning_rate": 5.173e-05, + "loss": 0.5059, + "step": 10352 + }, + { + "epoch": 0.5797401724717214, + "grad_norm": 1.3355119228363037, + "learning_rate": 5.1735e-05, + "loss": 0.4417, + "step": 10353 + }, + { + "epoch": 0.5797961697838504, + "grad_norm": 1.4218158721923828, + "learning_rate": 5.174e-05, + "loss": 0.4339, + "step": 10354 + }, + { + "epoch": 0.5798521670959794, + "grad_norm": 1.2330162525177002, + "learning_rate": 5.1745e-05, + "loss": 0.4606, + "step": 10355 + }, + { + "epoch": 0.5799081644081084, + "grad_norm": 1.3094184398651123, + "learning_rate": 5.175e-05, + "loss": 0.4366, + "step": 10356 + }, + { + "epoch": 0.5799641617202375, + 
"grad_norm": 1.5101141929626465, + "learning_rate": 5.1754999999999995e-05, + "loss": 0.5264, + "step": 10357 + }, + { + "epoch": 0.5800201590323665, + "grad_norm": 1.387771725654602, + "learning_rate": 5.176e-05, + "loss": 0.5416, + "step": 10358 + }, + { + "epoch": 0.5800761563444955, + "grad_norm": 1.2103217840194702, + "learning_rate": 5.176500000000001e-05, + "loss": 0.5158, + "step": 10359 + }, + { + "epoch": 0.5801321536566245, + "grad_norm": 1.405078411102295, + "learning_rate": 5.177000000000001e-05, + "loss": 0.4028, + "step": 10360 + }, + { + "epoch": 0.5801881509687535, + "grad_norm": 2.192035675048828, + "learning_rate": 5.1775000000000005e-05, + "loss": 0.4246, + "step": 10361 + }, + { + "epoch": 0.5802441482808826, + "grad_norm": 1.244006872177124, + "learning_rate": 5.178000000000001e-05, + "loss": 0.3975, + "step": 10362 + }, + { + "epoch": 0.5803001455930116, + "grad_norm": 1.2485039234161377, + "learning_rate": 5.1785000000000006e-05, + "loss": 0.4321, + "step": 10363 + }, + { + "epoch": 0.5803561429051406, + "grad_norm": 1.3139088153839111, + "learning_rate": 5.1790000000000004e-05, + "loss": 0.4581, + "step": 10364 + }, + { + "epoch": 0.5804121402172696, + "grad_norm": 1.3031911849975586, + "learning_rate": 5.1795e-05, + "loss": 0.5174, + "step": 10365 + }, + { + "epoch": 0.5804681375293986, + "grad_norm": 1.196021556854248, + "learning_rate": 5.1800000000000005e-05, + "loss": 0.3223, + "step": 10366 + }, + { + "epoch": 0.5805241348415277, + "grad_norm": 1.153767466545105, + "learning_rate": 5.1805e-05, + "loss": 0.411, + "step": 10367 + }, + { + "epoch": 0.5805801321536567, + "grad_norm": 1.1274921894073486, + "learning_rate": 5.181e-05, + "loss": 0.3668, + "step": 10368 + }, + { + "epoch": 0.5806361294657857, + "grad_norm": 1.3947771787643433, + "learning_rate": 5.1815000000000005e-05, + "loss": 0.325, + "step": 10369 + }, + { + "epoch": 0.5806921267779147, + "grad_norm": 1.2500982284545898, + "learning_rate": 5.182e-05, + "loss": 0.4032, + 
"step": 10370 + }, + { + "epoch": 0.5807481240900437, + "grad_norm": 1.1656206846237183, + "learning_rate": 5.1825e-05, + "loss": 0.4406, + "step": 10371 + }, + { + "epoch": 0.5808041214021727, + "grad_norm": 1.7991421222686768, + "learning_rate": 5.1830000000000004e-05, + "loss": 0.5553, + "step": 10372 + }, + { + "epoch": 0.5808601187143018, + "grad_norm": 1.2823164463043213, + "learning_rate": 5.1835e-05, + "loss": 0.3647, + "step": 10373 + }, + { + "epoch": 0.5809161160264308, + "grad_norm": 1.5040351152420044, + "learning_rate": 5.184e-05, + "loss": 0.5526, + "step": 10374 + }, + { + "epoch": 0.5809721133385598, + "grad_norm": 1.8107975721359253, + "learning_rate": 5.1844999999999996e-05, + "loss": 0.6746, + "step": 10375 + }, + { + "epoch": 0.5810281106506887, + "grad_norm": 1.2524305582046509, + "learning_rate": 5.185e-05, + "loss": 0.3989, + "step": 10376 + }, + { + "epoch": 0.5810841079628177, + "grad_norm": 1.2219644784927368, + "learning_rate": 5.1855e-05, + "loss": 0.4442, + "step": 10377 + }, + { + "epoch": 0.5811401052749467, + "grad_norm": 1.2759735584259033, + "learning_rate": 5.1859999999999995e-05, + "loss": 0.3981, + "step": 10378 + }, + { + "epoch": 0.5811961025870758, + "grad_norm": 1.234581708908081, + "learning_rate": 5.1865000000000006e-05, + "loss": 0.4084, + "step": 10379 + }, + { + "epoch": 0.5812520998992048, + "grad_norm": 1.4750521183013916, + "learning_rate": 5.187000000000001e-05, + "loss": 0.501, + "step": 10380 + }, + { + "epoch": 0.5813080972113338, + "grad_norm": 1.4189141988754272, + "learning_rate": 5.187500000000001e-05, + "loss": 0.4196, + "step": 10381 + }, + { + "epoch": 0.5813640945234628, + "grad_norm": 1.362120509147644, + "learning_rate": 5.1880000000000005e-05, + "loss": 0.4634, + "step": 10382 + }, + { + "epoch": 0.5814200918355918, + "grad_norm": 1.1404772996902466, + "learning_rate": 5.188500000000001e-05, + "loss": 0.3291, + "step": 10383 + }, + { + "epoch": 0.5814760891477209, + "grad_norm": 1.1161601543426514, + 
"learning_rate": 5.1890000000000006e-05, + "loss": 0.4301, + "step": 10384 + }, + { + "epoch": 0.5815320864598499, + "grad_norm": 1.398680329322815, + "learning_rate": 5.1895000000000004e-05, + "loss": 0.5729, + "step": 10385 + }, + { + "epoch": 0.5815880837719789, + "grad_norm": 1.1788133382797241, + "learning_rate": 5.19e-05, + "loss": 0.4033, + "step": 10386 + }, + { + "epoch": 0.5816440810841079, + "grad_norm": 1.075253963470459, + "learning_rate": 5.1905000000000005e-05, + "loss": 0.4779, + "step": 10387 + }, + { + "epoch": 0.5817000783962369, + "grad_norm": 1.4342814683914185, + "learning_rate": 5.191e-05, + "loss": 0.56, + "step": 10388 + }, + { + "epoch": 0.581756075708366, + "grad_norm": 1.4962137937545776, + "learning_rate": 5.1915e-05, + "loss": 0.4512, + "step": 10389 + }, + { + "epoch": 0.581812073020495, + "grad_norm": 1.5346430540084839, + "learning_rate": 5.1920000000000004e-05, + "loss": 0.4255, + "step": 10390 + }, + { + "epoch": 0.581868070332624, + "grad_norm": 1.1646647453308105, + "learning_rate": 5.1925e-05, + "loss": 0.3365, + "step": 10391 + }, + { + "epoch": 0.581924067644753, + "grad_norm": 1.2096935510635376, + "learning_rate": 5.193e-05, + "loss": 0.3587, + "step": 10392 + }, + { + "epoch": 0.581980064956882, + "grad_norm": 1.6582748889923096, + "learning_rate": 5.1935e-05, + "loss": 0.4877, + "step": 10393 + }, + { + "epoch": 0.5820360622690111, + "grad_norm": 1.4083611965179443, + "learning_rate": 5.194e-05, + "loss": 0.474, + "step": 10394 + }, + { + "epoch": 0.5820920595811401, + "grad_norm": 1.2445719242095947, + "learning_rate": 5.1945e-05, + "loss": 0.4816, + "step": 10395 + }, + { + "epoch": 0.5821480568932691, + "grad_norm": 20.22435760498047, + "learning_rate": 5.1949999999999996e-05, + "loss": 0.5547, + "step": 10396 + }, + { + "epoch": 0.5822040542053981, + "grad_norm": 1.334487795829773, + "learning_rate": 5.1955e-05, + "loss": 0.3963, + "step": 10397 + }, + { + "epoch": 0.5822600515175271, + "grad_norm": 1.431045413017273, 
+ "learning_rate": 5.196e-05, + "loss": 0.4289, + "step": 10398 + }, + { + "epoch": 0.5823160488296562, + "grad_norm": 1.3006852865219116, + "learning_rate": 5.1964999999999995e-05, + "loss": 0.4283, + "step": 10399 + }, + { + "epoch": 0.5823720461417852, + "grad_norm": 1.5506912469863892, + "learning_rate": 5.1970000000000006e-05, + "loss": 0.4408, + "step": 10400 + }, + { + "epoch": 0.5824280434539142, + "grad_norm": 1.452589988708496, + "learning_rate": 5.197500000000001e-05, + "loss": 0.513, + "step": 10401 + }, + { + "epoch": 0.5824840407660432, + "grad_norm": 1.1917253732681274, + "learning_rate": 5.198000000000001e-05, + "loss": 0.3804, + "step": 10402 + }, + { + "epoch": 0.5825400380781722, + "grad_norm": 1.2255934476852417, + "learning_rate": 5.1985000000000005e-05, + "loss": 0.4037, + "step": 10403 + }, + { + "epoch": 0.5825960353903012, + "grad_norm": 1.1572686433792114, + "learning_rate": 5.199000000000001e-05, + "loss": 0.3719, + "step": 10404 + }, + { + "epoch": 0.5826520327024303, + "grad_norm": 1.2482918500900269, + "learning_rate": 5.1995000000000006e-05, + "loss": 0.4869, + "step": 10405 + }, + { + "epoch": 0.5827080300145593, + "grad_norm": 1.2625491619110107, + "learning_rate": 5.2000000000000004e-05, + "loss": 0.5219, + "step": 10406 + }, + { + "epoch": 0.5827640273266883, + "grad_norm": 1.3208270072937012, + "learning_rate": 5.2005e-05, + "loss": 0.3986, + "step": 10407 + }, + { + "epoch": 0.5828200246388173, + "grad_norm": 1.1687214374542236, + "learning_rate": 5.2010000000000005e-05, + "loss": 0.2842, + "step": 10408 + }, + { + "epoch": 0.5828760219509463, + "grad_norm": 1.534670114517212, + "learning_rate": 5.2015e-05, + "loss": 0.6339, + "step": 10409 + }, + { + "epoch": 0.5829320192630754, + "grad_norm": 1.1783336400985718, + "learning_rate": 5.202e-05, + "loss": 0.3793, + "step": 10410 + }, + { + "epoch": 0.5829880165752044, + "grad_norm": 1.3272316455841064, + "learning_rate": 5.2025000000000004e-05, + "loss": 0.345, + "step": 10411 + 
}, + { + "epoch": 0.5830440138873334, + "grad_norm": 1.3698999881744385, + "learning_rate": 5.203e-05, + "loss": 0.5388, + "step": 10412 + }, + { + "epoch": 0.5831000111994624, + "grad_norm": 1.1794499158859253, + "learning_rate": 5.2035e-05, + "loss": 0.3658, + "step": 10413 + }, + { + "epoch": 0.5831560085115914, + "grad_norm": 1.287438154220581, + "learning_rate": 5.204e-05, + "loss": 0.5242, + "step": 10414 + }, + { + "epoch": 0.5832120058237205, + "grad_norm": 1.367053508758545, + "learning_rate": 5.2045e-05, + "loss": 0.5462, + "step": 10415 + }, + { + "epoch": 0.5832680031358495, + "grad_norm": 1.1334296464920044, + "learning_rate": 5.205e-05, + "loss": 0.345, + "step": 10416 + }, + { + "epoch": 0.5833240004479785, + "grad_norm": 1.7370926141738892, + "learning_rate": 5.2054999999999995e-05, + "loss": 0.4888, + "step": 10417 + }, + { + "epoch": 0.5833799977601075, + "grad_norm": 1.3780725002288818, + "learning_rate": 5.206e-05, + "loss": 0.5872, + "step": 10418 + }, + { + "epoch": 0.5834359950722365, + "grad_norm": 1.4774549007415771, + "learning_rate": 5.2065e-05, + "loss": 0.5877, + "step": 10419 + }, + { + "epoch": 0.5834919923843656, + "grad_norm": 1.231362223625183, + "learning_rate": 5.207000000000001e-05, + "loss": 0.3213, + "step": 10420 + }, + { + "epoch": 0.5835479896964946, + "grad_norm": 1.2752066850662231, + "learning_rate": 5.2075000000000005e-05, + "loss": 0.3385, + "step": 10421 + }, + { + "epoch": 0.5836039870086236, + "grad_norm": 1.5953947305679321, + "learning_rate": 5.208000000000001e-05, + "loss": 0.4162, + "step": 10422 + }, + { + "epoch": 0.5836599843207526, + "grad_norm": 1.1868245601654053, + "learning_rate": 5.208500000000001e-05, + "loss": 0.5124, + "step": 10423 + }, + { + "epoch": 0.5837159816328816, + "grad_norm": 1.6609355211257935, + "learning_rate": 5.2090000000000004e-05, + "loss": 0.4391, + "step": 10424 + }, + { + "epoch": 0.5837719789450107, + "grad_norm": 1.9022648334503174, + "learning_rate": 5.2095e-05, + "loss": 
0.4716, + "step": 10425 + }, + { + "epoch": 0.5838279762571397, + "grad_norm": 1.5023819208145142, + "learning_rate": 5.2100000000000006e-05, + "loss": 0.6172, + "step": 10426 + }, + { + "epoch": 0.5838839735692687, + "grad_norm": 1.3575319051742554, + "learning_rate": 5.2105000000000003e-05, + "loss": 0.455, + "step": 10427 + }, + { + "epoch": 0.5839399708813977, + "grad_norm": 1.2734533548355103, + "learning_rate": 5.211e-05, + "loss": 0.4149, + "step": 10428 + }, + { + "epoch": 0.5839959681935267, + "grad_norm": 1.1898829936981201, + "learning_rate": 5.2115000000000005e-05, + "loss": 0.4806, + "step": 10429 + }, + { + "epoch": 0.5840519655056557, + "grad_norm": 1.3889342546463013, + "learning_rate": 5.212e-05, + "loss": 0.5, + "step": 10430 + }, + { + "epoch": 0.5841079628177848, + "grad_norm": 1.3933897018432617, + "learning_rate": 5.2125e-05, + "loss": 0.3554, + "step": 10431 + }, + { + "epoch": 0.5841639601299138, + "grad_norm": 1.0557312965393066, + "learning_rate": 5.2130000000000004e-05, + "loss": 0.3702, + "step": 10432 + }, + { + "epoch": 0.5842199574420428, + "grad_norm": 1.0459210872650146, + "learning_rate": 5.2135e-05, + "loss": 0.3555, + "step": 10433 + }, + { + "epoch": 0.5842759547541718, + "grad_norm": 1.7518179416656494, + "learning_rate": 5.214e-05, + "loss": 0.4487, + "step": 10434 + }, + { + "epoch": 0.5843319520663008, + "grad_norm": 1.1503556966781616, + "learning_rate": 5.2144999999999996e-05, + "loss": 0.4245, + "step": 10435 + }, + { + "epoch": 0.5843879493784299, + "grad_norm": 1.4215766191482544, + "learning_rate": 5.215e-05, + "loss": 0.441, + "step": 10436 + }, + { + "epoch": 0.5844439466905589, + "grad_norm": 1.3397252559661865, + "learning_rate": 5.2155e-05, + "loss": 0.3169, + "step": 10437 + }, + { + "epoch": 0.5844999440026879, + "grad_norm": 1.2923110723495483, + "learning_rate": 5.2159999999999995e-05, + "loss": 0.4483, + "step": 10438 + }, + { + "epoch": 0.5845559413148169, + "grad_norm": 1.4886518716812134, + 
"learning_rate": 5.2165e-05, + "loss": 0.4953, + "step": 10439 + }, + { + "epoch": 0.5846119386269459, + "grad_norm": 1.6069964170455933, + "learning_rate": 5.217000000000001e-05, + "loss": 0.5019, + "step": 10440 + }, + { + "epoch": 0.584667935939075, + "grad_norm": 1.211219072341919, + "learning_rate": 5.217500000000001e-05, + "loss": 0.5428, + "step": 10441 + }, + { + "epoch": 0.584723933251204, + "grad_norm": 1.6830753087997437, + "learning_rate": 5.2180000000000005e-05, + "loss": 0.5872, + "step": 10442 + }, + { + "epoch": 0.584779930563333, + "grad_norm": 1.4909956455230713, + "learning_rate": 5.218500000000001e-05, + "loss": 0.566, + "step": 10443 + }, + { + "epoch": 0.584835927875462, + "grad_norm": 1.2251861095428467, + "learning_rate": 5.219000000000001e-05, + "loss": 0.4313, + "step": 10444 + }, + { + "epoch": 0.584891925187591, + "grad_norm": 1.5191506147384644, + "learning_rate": 5.2195000000000004e-05, + "loss": 0.6065, + "step": 10445 + }, + { + "epoch": 0.58494792249972, + "grad_norm": 1.486066460609436, + "learning_rate": 5.22e-05, + "loss": 0.4421, + "step": 10446 + }, + { + "epoch": 0.5850039198118491, + "grad_norm": 1.4842216968536377, + "learning_rate": 5.2205000000000006e-05, + "loss": 0.4484, + "step": 10447 + }, + { + "epoch": 0.5850599171239781, + "grad_norm": 1.1850941181182861, + "learning_rate": 5.221e-05, + "loss": 0.3796, + "step": 10448 + }, + { + "epoch": 0.5851159144361071, + "grad_norm": 1.4743479490280151, + "learning_rate": 5.2215e-05, + "loss": 0.3642, + "step": 10449 + }, + { + "epoch": 0.5851719117482361, + "grad_norm": 1.1929072141647339, + "learning_rate": 5.2220000000000005e-05, + "loss": 0.3456, + "step": 10450 + }, + { + "epoch": 0.5852279090603651, + "grad_norm": 1.238207459449768, + "learning_rate": 5.2225e-05, + "loss": 0.3727, + "step": 10451 + }, + { + "epoch": 0.5852839063724942, + "grad_norm": 1.2077758312225342, + "learning_rate": 5.223e-05, + "loss": 0.3962, + "step": 10452 + }, + { + "epoch": 0.5853399036846232, 
+ "grad_norm": 1.3768528699874878, + "learning_rate": 5.2235000000000004e-05, + "loss": 0.4577, + "step": 10453 + }, + { + "epoch": 0.5853959009967522, + "grad_norm": 1.1925586462020874, + "learning_rate": 5.224e-05, + "loss": 0.54, + "step": 10454 + }, + { + "epoch": 0.5854518983088812, + "grad_norm": 1.4597384929656982, + "learning_rate": 5.2245e-05, + "loss": 0.3021, + "step": 10455 + }, + { + "epoch": 0.5855078956210102, + "grad_norm": 1.1666860580444336, + "learning_rate": 5.2249999999999996e-05, + "loss": 0.3434, + "step": 10456 + }, + { + "epoch": 0.5855638929331393, + "grad_norm": 1.5159988403320312, + "learning_rate": 5.2255e-05, + "loss": 0.5155, + "step": 10457 + }, + { + "epoch": 0.5856198902452683, + "grad_norm": 1.2205889225006104, + "learning_rate": 5.226e-05, + "loss": 0.3108, + "step": 10458 + }, + { + "epoch": 0.5856758875573972, + "grad_norm": 1.1630786657333374, + "learning_rate": 5.2264999999999995e-05, + "loss": 0.4019, + "step": 10459 + }, + { + "epoch": 0.5857318848695262, + "grad_norm": 1.2360212802886963, + "learning_rate": 5.2270000000000006e-05, + "loss": 0.4528, + "step": 10460 + }, + { + "epoch": 0.5857878821816552, + "grad_norm": 1.3985111713409424, + "learning_rate": 5.227500000000001e-05, + "loss": 0.4072, + "step": 10461 + }, + { + "epoch": 0.5858438794937842, + "grad_norm": 1.2422573566436768, + "learning_rate": 5.228000000000001e-05, + "loss": 0.4397, + "step": 10462 + }, + { + "epoch": 0.5858998768059133, + "grad_norm": 1.134757161140442, + "learning_rate": 5.2285000000000005e-05, + "loss": 0.3392, + "step": 10463 + }, + { + "epoch": 0.5859558741180423, + "grad_norm": 1.3189239501953125, + "learning_rate": 5.229e-05, + "loss": 0.6016, + "step": 10464 + }, + { + "epoch": 0.5860118714301713, + "grad_norm": 2.8348681926727295, + "learning_rate": 5.229500000000001e-05, + "loss": 0.3658, + "step": 10465 + }, + { + "epoch": 0.5860678687423003, + "grad_norm": 1.1461681127548218, + "learning_rate": 5.2300000000000004e-05, + "loss": 
0.5019, + "step": 10466 + }, + { + "epoch": 0.5861238660544293, + "grad_norm": 1.1149210929870605, + "learning_rate": 5.2305e-05, + "loss": 0.4242, + "step": 10467 + }, + { + "epoch": 0.5861798633665584, + "grad_norm": 1.132940649986267, + "learning_rate": 5.2310000000000006e-05, + "loss": 0.4808, + "step": 10468 + }, + { + "epoch": 0.5862358606786874, + "grad_norm": 1.4029748439788818, + "learning_rate": 5.2315e-05, + "loss": 0.4697, + "step": 10469 + }, + { + "epoch": 0.5862918579908164, + "grad_norm": 1.6982300281524658, + "learning_rate": 5.232e-05, + "loss": 0.3914, + "step": 10470 + }, + { + "epoch": 0.5863478553029454, + "grad_norm": 1.6048387289047241, + "learning_rate": 5.2325000000000005e-05, + "loss": 0.4111, + "step": 10471 + }, + { + "epoch": 0.5864038526150744, + "grad_norm": 1.2392228841781616, + "learning_rate": 5.233e-05, + "loss": 0.4016, + "step": 10472 + }, + { + "epoch": 0.5864598499272035, + "grad_norm": 1.248370885848999, + "learning_rate": 5.2335e-05, + "loss": 0.4372, + "step": 10473 + }, + { + "epoch": 0.5865158472393325, + "grad_norm": 1.248731255531311, + "learning_rate": 5.234e-05, + "loss": 0.5049, + "step": 10474 + }, + { + "epoch": 0.5865718445514615, + "grad_norm": 1.577825665473938, + "learning_rate": 5.2345e-05, + "loss": 0.5232, + "step": 10475 + }, + { + "epoch": 0.5866278418635905, + "grad_norm": 15.490825653076172, + "learning_rate": 5.235e-05, + "loss": 0.6373, + "step": 10476 + }, + { + "epoch": 0.5866838391757195, + "grad_norm": 1.3385379314422607, + "learning_rate": 5.2354999999999996e-05, + "loss": 0.405, + "step": 10477 + }, + { + "epoch": 0.5867398364878486, + "grad_norm": 1.0995076894760132, + "learning_rate": 5.236e-05, + "loss": 0.3462, + "step": 10478 + }, + { + "epoch": 0.5867958337999776, + "grad_norm": 1.135644793510437, + "learning_rate": 5.2365e-05, + "loss": 0.4616, + "step": 10479 + }, + { + "epoch": 0.5868518311121066, + "grad_norm": 1.2388736009597778, + "learning_rate": 5.237000000000001e-05, + "loss": 
0.3648, + "step": 10480 + }, + { + "epoch": 0.5869078284242356, + "grad_norm": 1.3318978548049927, + "learning_rate": 5.2375000000000006e-05, + "loss": 0.5455, + "step": 10481 + }, + { + "epoch": 0.5869638257363646, + "grad_norm": 1.180772066116333, + "learning_rate": 5.238000000000001e-05, + "loss": 0.4188, + "step": 10482 + }, + { + "epoch": 0.5870198230484936, + "grad_norm": 1.0729402303695679, + "learning_rate": 5.238500000000001e-05, + "loss": 0.395, + "step": 10483 + }, + { + "epoch": 0.5870758203606227, + "grad_norm": 1.390486717224121, + "learning_rate": 5.2390000000000005e-05, + "loss": 0.48, + "step": 10484 + }, + { + "epoch": 0.5871318176727517, + "grad_norm": 1.126437783241272, + "learning_rate": 5.2395e-05, + "loss": 0.3945, + "step": 10485 + }, + { + "epoch": 0.5871878149848807, + "grad_norm": 1.2294518947601318, + "learning_rate": 5.2400000000000007e-05, + "loss": 0.3782, + "step": 10486 + }, + { + "epoch": 0.5872438122970097, + "grad_norm": 1.1480926275253296, + "learning_rate": 5.2405000000000004e-05, + "loss": 0.3972, + "step": 10487 + }, + { + "epoch": 0.5872998096091387, + "grad_norm": 1.2376410961151123, + "learning_rate": 5.241e-05, + "loss": 0.5238, + "step": 10488 + }, + { + "epoch": 0.5873558069212678, + "grad_norm": 1.3651033639907837, + "learning_rate": 5.2415000000000006e-05, + "loss": 0.4656, + "step": 10489 + }, + { + "epoch": 0.5874118042333968, + "grad_norm": 1.3575665950775146, + "learning_rate": 5.242e-05, + "loss": 0.4168, + "step": 10490 + }, + { + "epoch": 0.5874678015455258, + "grad_norm": 1.1683986186981201, + "learning_rate": 5.2425e-05, + "loss": 0.4651, + "step": 10491 + }, + { + "epoch": 0.5875237988576548, + "grad_norm": 1.036370038986206, + "learning_rate": 5.2430000000000005e-05, + "loss": 0.2864, + "step": 10492 + }, + { + "epoch": 0.5875797961697838, + "grad_norm": 1.2672920227050781, + "learning_rate": 5.2435e-05, + "loss": 0.4514, + "step": 10493 + }, + { + "epoch": 0.5876357934819129, + "grad_norm": 
1.1965370178222656, + "learning_rate": 5.244e-05, + "loss": 0.3968, + "step": 10494 + }, + { + "epoch": 0.5876917907940419, + "grad_norm": 1.398776888847351, + "learning_rate": 5.2445e-05, + "loss": 0.4031, + "step": 10495 + }, + { + "epoch": 0.5877477881061709, + "grad_norm": 1.222611904144287, + "learning_rate": 5.245e-05, + "loss": 0.3937, + "step": 10496 + }, + { + "epoch": 0.5878037854182999, + "grad_norm": 2.0282087326049805, + "learning_rate": 5.2455e-05, + "loss": 0.5691, + "step": 10497 + }, + { + "epoch": 0.5878597827304289, + "grad_norm": 1.4633281230926514, + "learning_rate": 5.2459999999999996e-05, + "loss": 0.3477, + "step": 10498 + }, + { + "epoch": 0.587915780042558, + "grad_norm": 1.387283205986023, + "learning_rate": 5.2465e-05, + "loss": 0.4913, + "step": 10499 + }, + { + "epoch": 0.587971777354687, + "grad_norm": 1.2462290525436401, + "learning_rate": 5.247000000000001e-05, + "loss": 0.4113, + "step": 10500 + }, + { + "epoch": 0.588027774666816, + "grad_norm": 1.4916491508483887, + "learning_rate": 5.247500000000001e-05, + "loss": 0.3696, + "step": 10501 + }, + { + "epoch": 0.588083771978945, + "grad_norm": 6.327125072479248, + "learning_rate": 5.2480000000000006e-05, + "loss": 0.4427, + "step": 10502 + }, + { + "epoch": 0.588139769291074, + "grad_norm": 1.3494855165481567, + "learning_rate": 5.2485e-05, + "loss": 0.3723, + "step": 10503 + }, + { + "epoch": 0.588195766603203, + "grad_norm": 1.153686285018921, + "learning_rate": 5.249000000000001e-05, + "loss": 0.4094, + "step": 10504 + }, + { + "epoch": 0.5882517639153321, + "grad_norm": 1.3393079042434692, + "learning_rate": 5.2495000000000005e-05, + "loss": 0.4726, + "step": 10505 + }, + { + "epoch": 0.5883077612274611, + "grad_norm": 1.1729991436004639, + "learning_rate": 5.25e-05, + "loss": 0.3605, + "step": 10506 + }, + { + "epoch": 0.5883637585395901, + "grad_norm": 1.1829841136932373, + "learning_rate": 5.2505000000000006e-05, + "loss": 0.3994, + "step": 10507 + }, + { + "epoch": 
0.5884197558517191, + "grad_norm": 1.4506311416625977, + "learning_rate": 5.2510000000000004e-05, + "loss": 0.6831, + "step": 10508 + }, + { + "epoch": 0.5884757531638481, + "grad_norm": 1.20071280002594, + "learning_rate": 5.2515e-05, + "loss": 0.3928, + "step": 10509 + }, + { + "epoch": 0.5885317504759772, + "grad_norm": 1.2628368139266968, + "learning_rate": 5.2520000000000005e-05, + "loss": 0.4455, + "step": 10510 + }, + { + "epoch": 0.5885877477881062, + "grad_norm": 1.1907291412353516, + "learning_rate": 5.2525e-05, + "loss": 0.4332, + "step": 10511 + }, + { + "epoch": 0.5886437451002352, + "grad_norm": 1.2729754447937012, + "learning_rate": 5.253e-05, + "loss": 0.315, + "step": 10512 + }, + { + "epoch": 0.5886997424123642, + "grad_norm": 1.0806745290756226, + "learning_rate": 5.2535e-05, + "loss": 0.3771, + "step": 10513 + }, + { + "epoch": 0.5887557397244932, + "grad_norm": 1.6687424182891846, + "learning_rate": 5.254e-05, + "loss": 0.5379, + "step": 10514 + }, + { + "epoch": 0.5888117370366223, + "grad_norm": 1.374032974243164, + "learning_rate": 5.2545e-05, + "loss": 0.4884, + "step": 10515 + }, + { + "epoch": 0.5888677343487513, + "grad_norm": 1.7508906126022339, + "learning_rate": 5.255e-05, + "loss": 0.3592, + "step": 10516 + }, + { + "epoch": 0.5889237316608803, + "grad_norm": 1.1985833644866943, + "learning_rate": 5.2555e-05, + "loss": 0.4139, + "step": 10517 + }, + { + "epoch": 0.5889797289730093, + "grad_norm": 1.2333550453186035, + "learning_rate": 5.256e-05, + "loss": 0.3817, + "step": 10518 + }, + { + "epoch": 0.5890357262851383, + "grad_norm": 1.2691351175308228, + "learning_rate": 5.2564999999999996e-05, + "loss": 0.4359, + "step": 10519 + }, + { + "epoch": 0.5890917235972674, + "grad_norm": 1.2514894008636475, + "learning_rate": 5.257e-05, + "loss": 0.5062, + "step": 10520 + }, + { + "epoch": 0.5891477209093964, + "grad_norm": 1.4081145524978638, + "learning_rate": 5.257500000000001e-05, + "loss": 0.3867, + "step": 10521 + }, + { + "epoch": 
0.5892037182215254, + "grad_norm": 1.2950271368026733, + "learning_rate": 5.258000000000001e-05, + "loss": 0.4013, + "step": 10522 + }, + { + "epoch": 0.5892597155336544, + "grad_norm": 1.096846342086792, + "learning_rate": 5.2585000000000006e-05, + "loss": 0.4453, + "step": 10523 + }, + { + "epoch": 0.5893157128457834, + "grad_norm": 1.0097601413726807, + "learning_rate": 5.259e-05, + "loss": 0.4251, + "step": 10524 + }, + { + "epoch": 0.5893717101579125, + "grad_norm": 1.2172775268554688, + "learning_rate": 5.259500000000001e-05, + "loss": 0.3619, + "step": 10525 + }, + { + "epoch": 0.5894277074700415, + "grad_norm": 1.1469647884368896, + "learning_rate": 5.2600000000000005e-05, + "loss": 0.4751, + "step": 10526 + }, + { + "epoch": 0.5894837047821705, + "grad_norm": 1.3321958780288696, + "learning_rate": 5.2605e-05, + "loss": 0.3585, + "step": 10527 + }, + { + "epoch": 0.5895397020942995, + "grad_norm": 1.2122538089752197, + "learning_rate": 5.2610000000000006e-05, + "loss": 0.3275, + "step": 10528 + }, + { + "epoch": 0.5895956994064285, + "grad_norm": 1.414595603942871, + "learning_rate": 5.2615000000000004e-05, + "loss": 0.5268, + "step": 10529 + }, + { + "epoch": 0.5896516967185575, + "grad_norm": 1.2470372915267944, + "learning_rate": 5.262e-05, + "loss": 0.3476, + "step": 10530 + }, + { + "epoch": 0.5897076940306866, + "grad_norm": 1.3111385107040405, + "learning_rate": 5.2625000000000005e-05, + "loss": 0.4256, + "step": 10531 + }, + { + "epoch": 0.5897636913428156, + "grad_norm": 1.2466546297073364, + "learning_rate": 5.263e-05, + "loss": 0.3991, + "step": 10532 + }, + { + "epoch": 0.5898196886549446, + "grad_norm": 1.1770704984664917, + "learning_rate": 5.2635e-05, + "loss": 0.2895, + "step": 10533 + }, + { + "epoch": 0.5898756859670736, + "grad_norm": 1.14566171169281, + "learning_rate": 5.264e-05, + "loss": 0.5139, + "step": 10534 + }, + { + "epoch": 0.5899316832792026, + "grad_norm": 1.2309952974319458, + "learning_rate": 5.2645e-05, + "loss": 0.4874, + 
"step": 10535 + }, + { + "epoch": 0.5899876805913317, + "grad_norm": 1.2991855144500732, + "learning_rate": 5.265e-05, + "loss": 0.4646, + "step": 10536 + }, + { + "epoch": 0.5900436779034607, + "grad_norm": 1.1817588806152344, + "learning_rate": 5.2654999999999996e-05, + "loss": 0.3928, + "step": 10537 + }, + { + "epoch": 0.5900996752155897, + "grad_norm": 1.4595263004302979, + "learning_rate": 5.266e-05, + "loss": 0.4957, + "step": 10538 + }, + { + "epoch": 0.5901556725277187, + "grad_norm": 1.3468631505966187, + "learning_rate": 5.2665e-05, + "loss": 0.4032, + "step": 10539 + }, + { + "epoch": 0.5902116698398477, + "grad_norm": 1.106804370880127, + "learning_rate": 5.2669999999999995e-05, + "loss": 0.495, + "step": 10540 + }, + { + "epoch": 0.5902676671519768, + "grad_norm": 1.1846035718917847, + "learning_rate": 5.2675000000000006e-05, + "loss": 0.3707, + "step": 10541 + }, + { + "epoch": 0.5903236644641057, + "grad_norm": 1.2655447721481323, + "learning_rate": 5.2680000000000004e-05, + "loss": 0.4262, + "step": 10542 + }, + { + "epoch": 0.5903796617762347, + "grad_norm": 1.2150630950927734, + "learning_rate": 5.268500000000001e-05, + "loss": 0.5081, + "step": 10543 + }, + { + "epoch": 0.5904356590883637, + "grad_norm": 1.4314662218093872, + "learning_rate": 5.2690000000000005e-05, + "loss": 0.4474, + "step": 10544 + }, + { + "epoch": 0.5904916564004927, + "grad_norm": 2.045747756958008, + "learning_rate": 5.2695e-05, + "loss": 0.4762, + "step": 10545 + }, + { + "epoch": 0.5905476537126217, + "grad_norm": 1.1559627056121826, + "learning_rate": 5.270000000000001e-05, + "loss": 0.3582, + "step": 10546 + }, + { + "epoch": 0.5906036510247508, + "grad_norm": 1.184889793395996, + "learning_rate": 5.2705000000000004e-05, + "loss": 0.3376, + "step": 10547 + }, + { + "epoch": 0.5906596483368798, + "grad_norm": 1.159448504447937, + "learning_rate": 5.271e-05, + "loss": 0.3158, + "step": 10548 + }, + { + "epoch": 0.5907156456490088, + "grad_norm": 1.1259042024612427, + 
"learning_rate": 5.2715000000000006e-05, + "loss": 0.3088, + "step": 10549 + }, + { + "epoch": 0.5907716429611378, + "grad_norm": 1.158154010772705, + "learning_rate": 5.2720000000000003e-05, + "loss": 0.3598, + "step": 10550 + }, + { + "epoch": 0.5908276402732668, + "grad_norm": 1.3170642852783203, + "learning_rate": 5.2725e-05, + "loss": 0.3795, + "step": 10551 + }, + { + "epoch": 0.5908836375853959, + "grad_norm": 1.4639804363250732, + "learning_rate": 5.273e-05, + "loss": 0.3567, + "step": 10552 + }, + { + "epoch": 0.5909396348975249, + "grad_norm": 1.2510403394699097, + "learning_rate": 5.2735e-05, + "loss": 0.4231, + "step": 10553 + }, + { + "epoch": 0.5909956322096539, + "grad_norm": 1.335352897644043, + "learning_rate": 5.274e-05, + "loss": 0.6167, + "step": 10554 + }, + { + "epoch": 0.5910516295217829, + "grad_norm": 1.1949379444122314, + "learning_rate": 5.2745e-05, + "loss": 0.3584, + "step": 10555 + }, + { + "epoch": 0.5911076268339119, + "grad_norm": 1.1833736896514893, + "learning_rate": 5.275e-05, + "loss": 0.3027, + "step": 10556 + }, + { + "epoch": 0.591163624146041, + "grad_norm": 1.127034306526184, + "learning_rate": 5.2755e-05, + "loss": 0.4002, + "step": 10557 + }, + { + "epoch": 0.59121962145817, + "grad_norm": 1.4355443716049194, + "learning_rate": 5.2759999999999996e-05, + "loss": 0.5975, + "step": 10558 + }, + { + "epoch": 0.591275618770299, + "grad_norm": 1.3591961860656738, + "learning_rate": 5.2765e-05, + "loss": 0.3527, + "step": 10559 + }, + { + "epoch": 0.591331616082428, + "grad_norm": 1.2719610929489136, + "learning_rate": 5.277e-05, + "loss": 0.4086, + "step": 10560 + }, + { + "epoch": 0.591387613394557, + "grad_norm": 1.282096266746521, + "learning_rate": 5.277500000000001e-05, + "loss": 0.3531, + "step": 10561 + }, + { + "epoch": 0.591443610706686, + "grad_norm": 2.146393060684204, + "learning_rate": 5.2780000000000006e-05, + "loss": 0.5905, + "step": 10562 + }, + { + "epoch": 0.5914996080188151, + "grad_norm": 
1.8344500064849854, + "learning_rate": 5.2785000000000004e-05, + "loss": 0.4916, + "step": 10563 + }, + { + "epoch": 0.5915556053309441, + "grad_norm": 1.2841085195541382, + "learning_rate": 5.279000000000001e-05, + "loss": 0.4154, + "step": 10564 + }, + { + "epoch": 0.5916116026430731, + "grad_norm": 1.4600125551223755, + "learning_rate": 5.2795000000000005e-05, + "loss": 0.5258, + "step": 10565 + }, + { + "epoch": 0.5916675999552021, + "grad_norm": 1.319330096244812, + "learning_rate": 5.28e-05, + "loss": 0.4115, + "step": 10566 + }, + { + "epoch": 0.5917235972673311, + "grad_norm": 1.253957748413086, + "learning_rate": 5.280500000000001e-05, + "loss": 0.4111, + "step": 10567 + }, + { + "epoch": 0.5917795945794602, + "grad_norm": 2.324519634246826, + "learning_rate": 5.2810000000000004e-05, + "loss": 0.496, + "step": 10568 + }, + { + "epoch": 0.5918355918915892, + "grad_norm": 1.4242885112762451, + "learning_rate": 5.2815e-05, + "loss": 0.4094, + "step": 10569 + }, + { + "epoch": 0.5918915892037182, + "grad_norm": 1.340866208076477, + "learning_rate": 5.2820000000000006e-05, + "loss": 0.3748, + "step": 10570 + }, + { + "epoch": 0.5919475865158472, + "grad_norm": 1.527490496635437, + "learning_rate": 5.2825e-05, + "loss": 0.5074, + "step": 10571 + }, + { + "epoch": 0.5920035838279762, + "grad_norm": 1.5297170877456665, + "learning_rate": 5.283e-05, + "loss": 0.4071, + "step": 10572 + }, + { + "epoch": 0.5920595811401053, + "grad_norm": 1.1640164852142334, + "learning_rate": 5.2835e-05, + "loss": 0.5012, + "step": 10573 + }, + { + "epoch": 0.5921155784522343, + "grad_norm": 1.3273276090621948, + "learning_rate": 5.284e-05, + "loss": 0.3736, + "step": 10574 + }, + { + "epoch": 0.5921715757643633, + "grad_norm": 1.29637610912323, + "learning_rate": 5.2845e-05, + "loss": 0.4081, + "step": 10575 + }, + { + "epoch": 0.5922275730764923, + "grad_norm": 1.5078063011169434, + "learning_rate": 5.285e-05, + "loss": 0.8388, + "step": 10576 + }, + { + "epoch": 
0.5922835703886213, + "grad_norm": 1.3077116012573242, + "learning_rate": 5.2855e-05, + "loss": 0.5383, + "step": 10577 + }, + { + "epoch": 0.5923395677007504, + "grad_norm": 1.2726671695709229, + "learning_rate": 5.286e-05, + "loss": 0.4237, + "step": 10578 + }, + { + "epoch": 0.5923955650128794, + "grad_norm": 1.197482943534851, + "learning_rate": 5.2864999999999996e-05, + "loss": 0.352, + "step": 10579 + }, + { + "epoch": 0.5924515623250084, + "grad_norm": 1.6226192712783813, + "learning_rate": 5.287e-05, + "loss": 0.6134, + "step": 10580 + }, + { + "epoch": 0.5925075596371374, + "grad_norm": 1.1241241693496704, + "learning_rate": 5.2875000000000005e-05, + "loss": 0.5402, + "step": 10581 + }, + { + "epoch": 0.5925635569492664, + "grad_norm": 1.4471862316131592, + "learning_rate": 5.288000000000001e-05, + "loss": 0.4068, + "step": 10582 + }, + { + "epoch": 0.5926195542613955, + "grad_norm": 2.0013887882232666, + "learning_rate": 5.2885000000000006e-05, + "loss": 0.4744, + "step": 10583 + }, + { + "epoch": 0.5926755515735245, + "grad_norm": 1.4567906856536865, + "learning_rate": 5.2890000000000004e-05, + "loss": 0.5836, + "step": 10584 + }, + { + "epoch": 0.5927315488856535, + "grad_norm": 1.3054038286209106, + "learning_rate": 5.289500000000001e-05, + "loss": 0.4638, + "step": 10585 + }, + { + "epoch": 0.5927875461977825, + "grad_norm": 1.1921995878219604, + "learning_rate": 5.2900000000000005e-05, + "loss": 0.3657, + "step": 10586 + }, + { + "epoch": 0.5928435435099115, + "grad_norm": 1.283695101737976, + "learning_rate": 5.2905e-05, + "loss": 0.4191, + "step": 10587 + }, + { + "epoch": 0.5928995408220405, + "grad_norm": 1.2014490365982056, + "learning_rate": 5.291000000000001e-05, + "loss": 0.3812, + "step": 10588 + }, + { + "epoch": 0.5929555381341696, + "grad_norm": 1.3544204235076904, + "learning_rate": 5.2915000000000004e-05, + "loss": 0.3469, + "step": 10589 + }, + { + "epoch": 0.5930115354462986, + "grad_norm": 1.3510054349899292, + "learning_rate": 
5.292e-05, + "loss": 0.442, + "step": 10590 + }, + { + "epoch": 0.5930675327584276, + "grad_norm": 1.387406349182129, + "learning_rate": 5.2925000000000006e-05, + "loss": 0.5564, + "step": 10591 + }, + { + "epoch": 0.5931235300705566, + "grad_norm": 1.2237368822097778, + "learning_rate": 5.293e-05, + "loss": 0.4988, + "step": 10592 + }, + { + "epoch": 0.5931795273826856, + "grad_norm": 1.5499701499938965, + "learning_rate": 5.2935e-05, + "loss": 0.4317, + "step": 10593 + }, + { + "epoch": 0.5932355246948147, + "grad_norm": 6.83128547668457, + "learning_rate": 5.294e-05, + "loss": 0.4112, + "step": 10594 + }, + { + "epoch": 0.5932915220069437, + "grad_norm": 1.2354567050933838, + "learning_rate": 5.2945e-05, + "loss": 0.4185, + "step": 10595 + }, + { + "epoch": 0.5933475193190727, + "grad_norm": 1.5912731885910034, + "learning_rate": 5.295e-05, + "loss": 0.4071, + "step": 10596 + }, + { + "epoch": 0.5934035166312017, + "grad_norm": 1.1895127296447754, + "learning_rate": 5.2955e-05, + "loss": 0.4107, + "step": 10597 + }, + { + "epoch": 0.5934595139433307, + "grad_norm": 1.3144574165344238, + "learning_rate": 5.296e-05, + "loss": 0.3937, + "step": 10598 + }, + { + "epoch": 0.5935155112554598, + "grad_norm": 1.428004503250122, + "learning_rate": 5.2965e-05, + "loss": 0.4642, + "step": 10599 + }, + { + "epoch": 0.5935715085675888, + "grad_norm": 1.0706285238265991, + "learning_rate": 5.2969999999999996e-05, + "loss": 0.4732, + "step": 10600 + }, + { + "epoch": 0.5936275058797178, + "grad_norm": 1.2697863578796387, + "learning_rate": 5.297500000000001e-05, + "loss": 0.4415, + "step": 10601 + }, + { + "epoch": 0.5936835031918468, + "grad_norm": 1.2962846755981445, + "learning_rate": 5.2980000000000004e-05, + "loss": 0.4408, + "step": 10602 + }, + { + "epoch": 0.5937395005039758, + "grad_norm": 1.3874752521514893, + "learning_rate": 5.298500000000001e-05, + "loss": 0.395, + "step": 10603 + }, + { + "epoch": 0.5937954978161049, + "grad_norm": 1.258508563041687, + 
"learning_rate": 5.2990000000000006e-05, + "loss": 0.4574, + "step": 10604 + }, + { + "epoch": 0.5938514951282339, + "grad_norm": 1.343687891960144, + "learning_rate": 5.2995e-05, + "loss": 0.4034, + "step": 10605 + }, + { + "epoch": 0.5939074924403629, + "grad_norm": 1.2250497341156006, + "learning_rate": 5.300000000000001e-05, + "loss": 0.3901, + "step": 10606 + }, + { + "epoch": 0.5939634897524919, + "grad_norm": 0.9959533214569092, + "learning_rate": 5.3005000000000005e-05, + "loss": 0.2805, + "step": 10607 + }, + { + "epoch": 0.5940194870646209, + "grad_norm": 1.1464002132415771, + "learning_rate": 5.301e-05, + "loss": 0.3531, + "step": 10608 + }, + { + "epoch": 0.59407548437675, + "grad_norm": 1.2639087438583374, + "learning_rate": 5.3015000000000007e-05, + "loss": 0.3548, + "step": 10609 + }, + { + "epoch": 0.594131481688879, + "grad_norm": 1.5469439029693604, + "learning_rate": 5.3020000000000004e-05, + "loss": 0.6987, + "step": 10610 + }, + { + "epoch": 0.594187479001008, + "grad_norm": 1.2237581014633179, + "learning_rate": 5.3025e-05, + "loss": 0.4156, + "step": 10611 + }, + { + "epoch": 0.594243476313137, + "grad_norm": 1.4172309637069702, + "learning_rate": 5.303e-05, + "loss": 0.5293, + "step": 10612 + }, + { + "epoch": 0.594299473625266, + "grad_norm": 1.323311686515808, + "learning_rate": 5.3035e-05, + "loss": 0.3676, + "step": 10613 + }, + { + "epoch": 0.594355470937395, + "grad_norm": 1.3635905981063843, + "learning_rate": 5.304e-05, + "loss": 0.4182, + "step": 10614 + }, + { + "epoch": 0.5944114682495241, + "grad_norm": 1.357318639755249, + "learning_rate": 5.3045e-05, + "loss": 0.4527, + "step": 10615 + }, + { + "epoch": 0.5944674655616531, + "grad_norm": 1.4004288911819458, + "learning_rate": 5.305e-05, + "loss": 0.4716, + "step": 10616 + }, + { + "epoch": 0.5945234628737821, + "grad_norm": 1.1210845708847046, + "learning_rate": 5.3055e-05, + "loss": 0.3992, + "step": 10617 + }, + { + "epoch": 0.5945794601859111, + "grad_norm": 
1.0251448154449463, + "learning_rate": 5.306e-05, + "loss": 0.354, + "step": 10618 + }, + { + "epoch": 0.5946354574980401, + "grad_norm": 1.2607853412628174, + "learning_rate": 5.3065e-05, + "loss": 0.453, + "step": 10619 + }, + { + "epoch": 0.5946914548101692, + "grad_norm": 1.6649878025054932, + "learning_rate": 5.307e-05, + "loss": 0.5101, + "step": 10620 + }, + { + "epoch": 0.5947474521222982, + "grad_norm": 1.504562497138977, + "learning_rate": 5.307500000000001e-05, + "loss": 0.5634, + "step": 10621 + }, + { + "epoch": 0.5948034494344272, + "grad_norm": 1.3259096145629883, + "learning_rate": 5.308000000000001e-05, + "loss": 0.4427, + "step": 10622 + }, + { + "epoch": 0.5948594467465562, + "grad_norm": 1.3511524200439453, + "learning_rate": 5.3085000000000004e-05, + "loss": 0.475, + "step": 10623 + }, + { + "epoch": 0.5949154440586851, + "grad_norm": 1.1541205644607544, + "learning_rate": 5.309000000000001e-05, + "loss": 0.4037, + "step": 10624 + }, + { + "epoch": 0.5949714413708141, + "grad_norm": 1.1316460371017456, + "learning_rate": 5.3095000000000006e-05, + "loss": 0.3553, + "step": 10625 + }, + { + "epoch": 0.5950274386829432, + "grad_norm": 1.2613017559051514, + "learning_rate": 5.31e-05, + "loss": 0.4714, + "step": 10626 + }, + { + "epoch": 0.5950834359950722, + "grad_norm": 1.2521265745162964, + "learning_rate": 5.310500000000001e-05, + "loss": 0.4846, + "step": 10627 + }, + { + "epoch": 0.5951394333072012, + "grad_norm": 1.2765040397644043, + "learning_rate": 5.3110000000000005e-05, + "loss": 0.4363, + "step": 10628 + }, + { + "epoch": 0.5951954306193302, + "grad_norm": 1.2572119235992432, + "learning_rate": 5.3115e-05, + "loss": 0.4709, + "step": 10629 + }, + { + "epoch": 0.5952514279314592, + "grad_norm": 1.1742247343063354, + "learning_rate": 5.3120000000000006e-05, + "loss": 0.4415, + "step": 10630 + }, + { + "epoch": 0.5953074252435883, + "grad_norm": 1.5842454433441162, + "learning_rate": 5.3125000000000004e-05, + "loss": 0.4154, + "step": 
10631 + }, + { + "epoch": 0.5953634225557173, + "grad_norm": 1.1859489679336548, + "learning_rate": 5.313e-05, + "loss": 0.3858, + "step": 10632 + }, + { + "epoch": 0.5954194198678463, + "grad_norm": 1.2519792318344116, + "learning_rate": 5.3135e-05, + "loss": 0.368, + "step": 10633 + }, + { + "epoch": 0.5954754171799753, + "grad_norm": 1.3703736066818237, + "learning_rate": 5.314e-05, + "loss": 0.758, + "step": 10634 + }, + { + "epoch": 0.5955314144921043, + "grad_norm": 1.2715849876403809, + "learning_rate": 5.3145e-05, + "loss": 0.4004, + "step": 10635 + }, + { + "epoch": 0.5955874118042334, + "grad_norm": 1.769635558128357, + "learning_rate": 5.315e-05, + "loss": 0.4241, + "step": 10636 + }, + { + "epoch": 0.5956434091163624, + "grad_norm": 1.0612351894378662, + "learning_rate": 5.3155e-05, + "loss": 0.3624, + "step": 10637 + }, + { + "epoch": 0.5956994064284914, + "grad_norm": 1.1915764808654785, + "learning_rate": 5.316e-05, + "loss": 0.4014, + "step": 10638 + }, + { + "epoch": 0.5957554037406204, + "grad_norm": 2.9453630447387695, + "learning_rate": 5.3165e-05, + "loss": 0.4261, + "step": 10639 + }, + { + "epoch": 0.5958114010527494, + "grad_norm": 1.4035197496414185, + "learning_rate": 5.317e-05, + "loss": 0.4261, + "step": 10640 + }, + { + "epoch": 0.5958673983648785, + "grad_norm": 1.150656819343567, + "learning_rate": 5.3175e-05, + "loss": 0.4096, + "step": 10641 + }, + { + "epoch": 0.5959233956770075, + "grad_norm": 1.2750046253204346, + "learning_rate": 5.318000000000001e-05, + "loss": 0.4525, + "step": 10642 + }, + { + "epoch": 0.5959793929891365, + "grad_norm": 1.4290735721588135, + "learning_rate": 5.318500000000001e-05, + "loss": 0.5292, + "step": 10643 + }, + { + "epoch": 0.5960353903012655, + "grad_norm": 1.266943335533142, + "learning_rate": 5.3190000000000004e-05, + "loss": 0.3955, + "step": 10644 + }, + { + "epoch": 0.5960913876133945, + "grad_norm": 1.522802472114563, + "learning_rate": 5.319500000000001e-05, + "loss": 0.7771, + "step": 10645 
+ }, + { + "epoch": 0.5961473849255235, + "grad_norm": 1.3386750221252441, + "learning_rate": 5.3200000000000006e-05, + "loss": 0.4152, + "step": 10646 + }, + { + "epoch": 0.5962033822376526, + "grad_norm": 1.1313281059265137, + "learning_rate": 5.3205e-05, + "loss": 0.4505, + "step": 10647 + }, + { + "epoch": 0.5962593795497816, + "grad_norm": 1.1104485988616943, + "learning_rate": 5.321000000000001e-05, + "loss": 0.4002, + "step": 10648 + }, + { + "epoch": 0.5963153768619106, + "grad_norm": 1.2520898580551147, + "learning_rate": 5.3215000000000005e-05, + "loss": 0.3537, + "step": 10649 + }, + { + "epoch": 0.5963713741740396, + "grad_norm": 1.1591063737869263, + "learning_rate": 5.322e-05, + "loss": 0.4567, + "step": 10650 + }, + { + "epoch": 0.5964273714861686, + "grad_norm": 1.056723713874817, + "learning_rate": 5.3225e-05, + "loss": 0.3858, + "step": 10651 + }, + { + "epoch": 0.5964833687982977, + "grad_norm": 1.5650540590286255, + "learning_rate": 5.3230000000000004e-05, + "loss": 0.4986, + "step": 10652 + }, + { + "epoch": 0.5965393661104267, + "grad_norm": 1.3419244289398193, + "learning_rate": 5.3235e-05, + "loss": 0.4247, + "step": 10653 + }, + { + "epoch": 0.5965953634225557, + "grad_norm": 1.10451078414917, + "learning_rate": 5.324e-05, + "loss": 0.4241, + "step": 10654 + }, + { + "epoch": 0.5966513607346847, + "grad_norm": 1.2173242568969727, + "learning_rate": 5.3245e-05, + "loss": 0.5483, + "step": 10655 + }, + { + "epoch": 0.5967073580468137, + "grad_norm": 1.4182380437850952, + "learning_rate": 5.325e-05, + "loss": 0.4532, + "step": 10656 + }, + { + "epoch": 0.5967633553589428, + "grad_norm": 1.1876804828643799, + "learning_rate": 5.3255e-05, + "loss": 0.3835, + "step": 10657 + }, + { + "epoch": 0.5968193526710718, + "grad_norm": 1.3040655851364136, + "learning_rate": 5.326e-05, + "loss": 0.559, + "step": 10658 + }, + { + "epoch": 0.5968753499832008, + "grad_norm": 1.5240356922149658, + "learning_rate": 5.3265e-05, + "loss": 0.456, + "step": 10659 + 
}, + { + "epoch": 0.5969313472953298, + "grad_norm": 1.3377245664596558, + "learning_rate": 5.3269999999999996e-05, + "loss": 0.424, + "step": 10660 + }, + { + "epoch": 0.5969873446074588, + "grad_norm": 1.2126078605651855, + "learning_rate": 5.3274999999999994e-05, + "loss": 0.5067, + "step": 10661 + }, + { + "epoch": 0.5970433419195879, + "grad_norm": 1.19891357421875, + "learning_rate": 5.3280000000000005e-05, + "loss": 0.3772, + "step": 10662 + }, + { + "epoch": 0.5970993392317169, + "grad_norm": 1.408841609954834, + "learning_rate": 5.328500000000001e-05, + "loss": 0.4489, + "step": 10663 + }, + { + "epoch": 0.5971553365438459, + "grad_norm": 1.191271185874939, + "learning_rate": 5.3290000000000006e-05, + "loss": 0.375, + "step": 10664 + }, + { + "epoch": 0.5972113338559749, + "grad_norm": 1.279780387878418, + "learning_rate": 5.3295000000000004e-05, + "loss": 0.38, + "step": 10665 + }, + { + "epoch": 0.5972673311681039, + "grad_norm": 1.4989320039749146, + "learning_rate": 5.330000000000001e-05, + "loss": 0.583, + "step": 10666 + }, + { + "epoch": 0.597323328480233, + "grad_norm": 1.1831014156341553, + "learning_rate": 5.3305000000000005e-05, + "loss": 0.3763, + "step": 10667 + }, + { + "epoch": 0.597379325792362, + "grad_norm": 1.2059293985366821, + "learning_rate": 5.331e-05, + "loss": 0.4694, + "step": 10668 + }, + { + "epoch": 0.597435323104491, + "grad_norm": 1.4759252071380615, + "learning_rate": 5.331500000000001e-05, + "loss": 0.5221, + "step": 10669 + }, + { + "epoch": 0.59749132041662, + "grad_norm": 2.0236058235168457, + "learning_rate": 5.3320000000000004e-05, + "loss": 0.9048, + "step": 10670 + }, + { + "epoch": 0.597547317728749, + "grad_norm": 1.2076748609542847, + "learning_rate": 5.3325e-05, + "loss": 0.3366, + "step": 10671 + }, + { + "epoch": 0.597603315040878, + "grad_norm": 3.8239355087280273, + "learning_rate": 5.333e-05, + "loss": 0.3727, + "step": 10672 + }, + { + "epoch": 0.5976593123530071, + "grad_norm": 1.216123104095459, + 
"learning_rate": 5.3335000000000003e-05, + "loss": 0.4195, + "step": 10673 + }, + { + "epoch": 0.5977153096651361, + "grad_norm": 1.2644193172454834, + "learning_rate": 5.334e-05, + "loss": 0.4543, + "step": 10674 + }, + { + "epoch": 0.5977713069772651, + "grad_norm": 1.345656394958496, + "learning_rate": 5.3345e-05, + "loss": 0.3471, + "step": 10675 + }, + { + "epoch": 0.5978273042893941, + "grad_norm": 1.1759001016616821, + "learning_rate": 5.335e-05, + "loss": 0.4872, + "step": 10676 + }, + { + "epoch": 0.5978833016015231, + "grad_norm": 1.7768365144729614, + "learning_rate": 5.3355e-05, + "loss": 0.5289, + "step": 10677 + }, + { + "epoch": 0.5979392989136522, + "grad_norm": 1.2655423879623413, + "learning_rate": 5.336e-05, + "loss": 0.3933, + "step": 10678 + }, + { + "epoch": 0.5979952962257812, + "grad_norm": 1.3385236263275146, + "learning_rate": 5.3365e-05, + "loss": 0.4937, + "step": 10679 + }, + { + "epoch": 0.5980512935379102, + "grad_norm": 1.1813132762908936, + "learning_rate": 5.337e-05, + "loss": 0.3674, + "step": 10680 + }, + { + "epoch": 0.5981072908500392, + "grad_norm": 1.2993154525756836, + "learning_rate": 5.3374999999999996e-05, + "loss": 0.4463, + "step": 10681 + }, + { + "epoch": 0.5981632881621682, + "grad_norm": 1.3039528131484985, + "learning_rate": 5.338000000000001e-05, + "loss": 0.4434, + "step": 10682 + }, + { + "epoch": 0.5982192854742973, + "grad_norm": 1.5547949075698853, + "learning_rate": 5.3385000000000005e-05, + "loss": 0.4416, + "step": 10683 + }, + { + "epoch": 0.5982752827864263, + "grad_norm": 1.3940852880477905, + "learning_rate": 5.339000000000001e-05, + "loss": 0.4646, + "step": 10684 + }, + { + "epoch": 0.5983312800985553, + "grad_norm": 1.3054916858673096, + "learning_rate": 5.3395000000000006e-05, + "loss": 0.3423, + "step": 10685 + }, + { + "epoch": 0.5983872774106843, + "grad_norm": 1.3913575410842896, + "learning_rate": 5.3400000000000004e-05, + "loss": 0.4526, + "step": 10686 + }, + { + "epoch": 0.5984432747228133, 
+ "grad_norm": 1.3938846588134766, + "learning_rate": 5.340500000000001e-05, + "loss": 0.4867, + "step": 10687 + }, + { + "epoch": 0.5984992720349424, + "grad_norm": 1.3865050077438354, + "learning_rate": 5.3410000000000005e-05, + "loss": 0.5223, + "step": 10688 + }, + { + "epoch": 0.5985552693470714, + "grad_norm": 1.180454969406128, + "learning_rate": 5.3415e-05, + "loss": 0.3896, + "step": 10689 + }, + { + "epoch": 0.5986112666592004, + "grad_norm": 1.3324973583221436, + "learning_rate": 5.342e-05, + "loss": 0.4556, + "step": 10690 + }, + { + "epoch": 0.5986672639713294, + "grad_norm": 1.3338416814804077, + "learning_rate": 5.3425000000000004e-05, + "loss": 0.3878, + "step": 10691 + }, + { + "epoch": 0.5987232612834584, + "grad_norm": 1.3270137310028076, + "learning_rate": 5.343e-05, + "loss": 0.478, + "step": 10692 + }, + { + "epoch": 0.5987792585955874, + "grad_norm": 1.1505717039108276, + "learning_rate": 5.3435e-05, + "loss": 0.389, + "step": 10693 + }, + { + "epoch": 0.5988352559077165, + "grad_norm": 1.434771180152893, + "learning_rate": 5.344e-05, + "loss": 0.3462, + "step": 10694 + }, + { + "epoch": 0.5988912532198455, + "grad_norm": 1.1641889810562134, + "learning_rate": 5.3445e-05, + "loss": 0.424, + "step": 10695 + }, + { + "epoch": 0.5989472505319745, + "grad_norm": 1.2255793809890747, + "learning_rate": 5.345e-05, + "loss": 0.5723, + "step": 10696 + }, + { + "epoch": 0.5990032478441035, + "grad_norm": 1.0824426412582397, + "learning_rate": 5.3455e-05, + "loss": 0.4253, + "step": 10697 + }, + { + "epoch": 0.5990592451562325, + "grad_norm": 1.2578072547912598, + "learning_rate": 5.346e-05, + "loss": 0.5385, + "step": 10698 + }, + { + "epoch": 0.5991152424683616, + "grad_norm": 1.221637487411499, + "learning_rate": 5.3465e-05, + "loss": 0.4711, + "step": 10699 + }, + { + "epoch": 0.5991712397804906, + "grad_norm": 1.1506383419036865, + "learning_rate": 5.3469999999999995e-05, + "loss": 0.3996, + "step": 10700 + }, + { + "epoch": 0.5992272370926196, + 
"grad_norm": 1.373428463935852, + "learning_rate": 5.3475e-05, + "loss": 0.3844, + "step": 10701 + }, + { + "epoch": 0.5992832344047486, + "grad_norm": 1.593705415725708, + "learning_rate": 5.348000000000001e-05, + "loss": 0.3559, + "step": 10702 + }, + { + "epoch": 0.5993392317168776, + "grad_norm": 1.0618690252304077, + "learning_rate": 5.348500000000001e-05, + "loss": 0.4307, + "step": 10703 + }, + { + "epoch": 0.5993952290290067, + "grad_norm": 1.1788705587387085, + "learning_rate": 5.3490000000000005e-05, + "loss": 0.4331, + "step": 10704 + }, + { + "epoch": 0.5994512263411357, + "grad_norm": 1.2288025617599487, + "learning_rate": 5.349500000000001e-05, + "loss": 0.3987, + "step": 10705 + }, + { + "epoch": 0.5995072236532647, + "grad_norm": 1.4503600597381592, + "learning_rate": 5.3500000000000006e-05, + "loss": 0.4142, + "step": 10706 + }, + { + "epoch": 0.5995632209653936, + "grad_norm": 1.3712904453277588, + "learning_rate": 5.3505000000000004e-05, + "loss": 0.5967, + "step": 10707 + }, + { + "epoch": 0.5996192182775226, + "grad_norm": 1.2276537418365479, + "learning_rate": 5.351000000000001e-05, + "loss": 0.3684, + "step": 10708 + }, + { + "epoch": 0.5996752155896516, + "grad_norm": 1.0762742757797241, + "learning_rate": 5.3515000000000005e-05, + "loss": 0.3283, + "step": 10709 + }, + { + "epoch": 0.5997312129017807, + "grad_norm": 1.382184624671936, + "learning_rate": 5.352e-05, + "loss": 0.6539, + "step": 10710 + }, + { + "epoch": 0.5997872102139097, + "grad_norm": 1.044752836227417, + "learning_rate": 5.3525e-05, + "loss": 0.3505, + "step": 10711 + }, + { + "epoch": 0.5998432075260387, + "grad_norm": 1.081892490386963, + "learning_rate": 5.3530000000000004e-05, + "loss": 0.4271, + "step": 10712 + }, + { + "epoch": 0.5998992048381677, + "grad_norm": 1.3416136503219604, + "learning_rate": 5.3535e-05, + "loss": 0.3771, + "step": 10713 + }, + { + "epoch": 0.5999552021502967, + "grad_norm": 1.536210060119629, + "learning_rate": 5.354e-05, + "loss": 0.6779, + 
"step": 10714 + }, + { + "epoch": 0.6000111994624258, + "grad_norm": 1.3057374954223633, + "learning_rate": 5.3545e-05, + "loss": 0.4476, + "step": 10715 + }, + { + "epoch": 0.6000671967745548, + "grad_norm": 1.132282018661499, + "learning_rate": 5.355e-05, + "loss": 0.3702, + "step": 10716 + }, + { + "epoch": 0.6001231940866838, + "grad_norm": 1.3011412620544434, + "learning_rate": 5.3555e-05, + "loss": 0.4967, + "step": 10717 + }, + { + "epoch": 0.6001791913988128, + "grad_norm": 1.2960222959518433, + "learning_rate": 5.356e-05, + "loss": 0.3776, + "step": 10718 + }, + { + "epoch": 0.6002351887109418, + "grad_norm": 1.1946500539779663, + "learning_rate": 5.3565e-05, + "loss": 0.3625, + "step": 10719 + }, + { + "epoch": 0.6002911860230709, + "grad_norm": 1.1649937629699707, + "learning_rate": 5.357e-05, + "loss": 0.4072, + "step": 10720 + }, + { + "epoch": 0.6003471833351999, + "grad_norm": 1.5056583881378174, + "learning_rate": 5.3574999999999994e-05, + "loss": 0.6115, + "step": 10721 + }, + { + "epoch": 0.6004031806473289, + "grad_norm": 1.3560359477996826, + "learning_rate": 5.3580000000000005e-05, + "loss": 0.4659, + "step": 10722 + }, + { + "epoch": 0.6004591779594579, + "grad_norm": 1.4856892824172974, + "learning_rate": 5.358500000000001e-05, + "loss": 0.5304, + "step": 10723 + }, + { + "epoch": 0.6005151752715869, + "grad_norm": 1.1554310321807861, + "learning_rate": 5.359000000000001e-05, + "loss": 0.4317, + "step": 10724 + }, + { + "epoch": 0.600571172583716, + "grad_norm": 1.2974077463150024, + "learning_rate": 5.3595000000000004e-05, + "loss": 0.4683, + "step": 10725 + }, + { + "epoch": 0.600627169895845, + "grad_norm": 1.1984494924545288, + "learning_rate": 5.360000000000001e-05, + "loss": 0.3942, + "step": 10726 + }, + { + "epoch": 0.600683167207974, + "grad_norm": 1.267159342765808, + "learning_rate": 5.3605000000000006e-05, + "loss": 0.3985, + "step": 10727 + }, + { + "epoch": 0.600739164520103, + "grad_norm": 1.2933393716812134, + "learning_rate": 
5.3610000000000003e-05, + "loss": 0.521, + "step": 10728 + }, + { + "epoch": 0.600795161832232, + "grad_norm": 1.145164132118225, + "learning_rate": 5.3615e-05, + "loss": 0.3908, + "step": 10729 + }, + { + "epoch": 0.600851159144361, + "grad_norm": 1.5170440673828125, + "learning_rate": 5.3620000000000005e-05, + "loss": 0.4942, + "step": 10730 + }, + { + "epoch": 0.6009071564564901, + "grad_norm": 1.2621502876281738, + "learning_rate": 5.3625e-05, + "loss": 0.445, + "step": 10731 + }, + { + "epoch": 0.6009631537686191, + "grad_norm": 1.3321058750152588, + "learning_rate": 5.363e-05, + "loss": 0.4805, + "step": 10732 + }, + { + "epoch": 0.6010191510807481, + "grad_norm": 3.6692733764648438, + "learning_rate": 5.3635000000000004e-05, + "loss": 0.4295, + "step": 10733 + }, + { + "epoch": 0.6010751483928771, + "grad_norm": 1.2317042350769043, + "learning_rate": 5.364e-05, + "loss": 0.3777, + "step": 10734 + }, + { + "epoch": 0.6011311457050061, + "grad_norm": 1.2057650089263916, + "learning_rate": 5.3645e-05, + "loss": 0.3517, + "step": 10735 + }, + { + "epoch": 0.6011871430171352, + "grad_norm": 1.4327137470245361, + "learning_rate": 5.365e-05, + "loss": 0.4454, + "step": 10736 + }, + { + "epoch": 0.6012431403292642, + "grad_norm": 1.1435508728027344, + "learning_rate": 5.3655e-05, + "loss": 0.5331, + "step": 10737 + }, + { + "epoch": 0.6012991376413932, + "grad_norm": 1.0889784097671509, + "learning_rate": 5.366e-05, + "loss": 0.4379, + "step": 10738 + }, + { + "epoch": 0.6013551349535222, + "grad_norm": 1.335314393043518, + "learning_rate": 5.3664999999999995e-05, + "loss": 0.4332, + "step": 10739 + }, + { + "epoch": 0.6014111322656512, + "grad_norm": 1.3137105703353882, + "learning_rate": 5.367e-05, + "loss": 0.4548, + "step": 10740 + }, + { + "epoch": 0.6014671295777803, + "grad_norm": 1.2829149961471558, + "learning_rate": 5.3675e-05, + "loss": 0.4708, + "step": 10741 + }, + { + "epoch": 0.6015231268899093, + "grad_norm": 1.2563443183898926, + "learning_rate": 
5.368000000000001e-05, + "loss": 0.5871, + "step": 10742 + }, + { + "epoch": 0.6015791242020383, + "grad_norm": 1.365319013595581, + "learning_rate": 5.3685000000000005e-05, + "loss": 0.3273, + "step": 10743 + }, + { + "epoch": 0.6016351215141673, + "grad_norm": 1.3159781694412231, + "learning_rate": 5.369000000000001e-05, + "loss": 0.4736, + "step": 10744 + }, + { + "epoch": 0.6016911188262963, + "grad_norm": 1.2725735902786255, + "learning_rate": 5.369500000000001e-05, + "loss": 0.4324, + "step": 10745 + }, + { + "epoch": 0.6017471161384254, + "grad_norm": 1.2729079723358154, + "learning_rate": 5.3700000000000004e-05, + "loss": 0.5507, + "step": 10746 + }, + { + "epoch": 0.6018031134505544, + "grad_norm": 1.3266091346740723, + "learning_rate": 5.370500000000001e-05, + "loss": 0.3879, + "step": 10747 + }, + { + "epoch": 0.6018591107626834, + "grad_norm": 1.4228885173797607, + "learning_rate": 5.3710000000000006e-05, + "loss": 0.4267, + "step": 10748 + }, + { + "epoch": 0.6019151080748124, + "grad_norm": 1.3786468505859375, + "learning_rate": 5.3715e-05, + "loss": 0.6096, + "step": 10749 + }, + { + "epoch": 0.6019711053869414, + "grad_norm": 1.2821747064590454, + "learning_rate": 5.372e-05, + "loss": 0.3711, + "step": 10750 + }, + { + "epoch": 0.6020271026990704, + "grad_norm": 1.3442124128341675, + "learning_rate": 5.3725000000000005e-05, + "loss": 0.5343, + "step": 10751 + }, + { + "epoch": 0.6020831000111995, + "grad_norm": 1.2129201889038086, + "learning_rate": 5.373e-05, + "loss": 0.3621, + "step": 10752 + }, + { + "epoch": 0.6021390973233285, + "grad_norm": 1.8313417434692383, + "learning_rate": 5.3735e-05, + "loss": 0.3928, + "step": 10753 + }, + { + "epoch": 0.6021950946354575, + "grad_norm": 1.3189783096313477, + "learning_rate": 5.3740000000000004e-05, + "loss": 0.4618, + "step": 10754 + }, + { + "epoch": 0.6022510919475865, + "grad_norm": 1.542063593864441, + "learning_rate": 5.3745e-05, + "loss": 0.44, + "step": 10755 + }, + { + "epoch": 
0.6023070892597155, + "grad_norm": 1.1043028831481934, + "learning_rate": 5.375e-05, + "loss": 0.4144, + "step": 10756 + }, + { + "epoch": 0.6023630865718446, + "grad_norm": 1.2992949485778809, + "learning_rate": 5.3755e-05, + "loss": 0.5384, + "step": 10757 + }, + { + "epoch": 0.6024190838839736, + "grad_norm": 1.1901628971099854, + "learning_rate": 5.376e-05, + "loss": 0.3763, + "step": 10758 + }, + { + "epoch": 0.6024750811961026, + "grad_norm": 1.3112820386886597, + "learning_rate": 5.3765e-05, + "loss": 0.4026, + "step": 10759 + }, + { + "epoch": 0.6025310785082316, + "grad_norm": 1.519964575767517, + "learning_rate": 5.3769999999999995e-05, + "loss": 0.5919, + "step": 10760 + }, + { + "epoch": 0.6025870758203606, + "grad_norm": 1.2642871141433716, + "learning_rate": 5.3775e-05, + "loss": 0.5022, + "step": 10761 + }, + { + "epoch": 0.6026430731324897, + "grad_norm": 1.2007747888565063, + "learning_rate": 5.378e-05, + "loss": 0.5792, + "step": 10762 + }, + { + "epoch": 0.6026990704446187, + "grad_norm": 1.247137427330017, + "learning_rate": 5.378500000000001e-05, + "loss": 0.3743, + "step": 10763 + }, + { + "epoch": 0.6027550677567477, + "grad_norm": 1.2913554906845093, + "learning_rate": 5.3790000000000005e-05, + "loss": 0.4759, + "step": 10764 + }, + { + "epoch": 0.6028110650688767, + "grad_norm": 1.1013917922973633, + "learning_rate": 5.379500000000001e-05, + "loss": 0.4122, + "step": 10765 + }, + { + "epoch": 0.6028670623810057, + "grad_norm": 2.8226101398468018, + "learning_rate": 5.380000000000001e-05, + "loss": 0.3754, + "step": 10766 + }, + { + "epoch": 0.6029230596931348, + "grad_norm": 1.3585474491119385, + "learning_rate": 5.3805000000000004e-05, + "loss": 0.3925, + "step": 10767 + }, + { + "epoch": 0.6029790570052638, + "grad_norm": 0.9887531995773315, + "learning_rate": 5.381e-05, + "loss": 0.2893, + "step": 10768 + }, + { + "epoch": 0.6030350543173928, + "grad_norm": 1.1428364515304565, + "learning_rate": 5.3815000000000006e-05, + "loss": 0.3991, 
+ "step": 10769 + }, + { + "epoch": 0.6030910516295218, + "grad_norm": 1.4529662132263184, + "learning_rate": 5.382e-05, + "loss": 0.4707, + "step": 10770 + }, + { + "epoch": 0.6031470489416508, + "grad_norm": 1.2549299001693726, + "learning_rate": 5.3825e-05, + "loss": 0.3181, + "step": 10771 + }, + { + "epoch": 0.6032030462537799, + "grad_norm": 1.3555113077163696, + "learning_rate": 5.3830000000000005e-05, + "loss": 0.4483, + "step": 10772 + }, + { + "epoch": 0.6032590435659089, + "grad_norm": 1.4491767883300781, + "learning_rate": 5.3835e-05, + "loss": 0.6002, + "step": 10773 + }, + { + "epoch": 0.6033150408780379, + "grad_norm": 1.3413119316101074, + "learning_rate": 5.384e-05, + "loss": 0.4649, + "step": 10774 + }, + { + "epoch": 0.6033710381901669, + "grad_norm": 1.6401668787002563, + "learning_rate": 5.3845000000000004e-05, + "loss": 0.3622, + "step": 10775 + }, + { + "epoch": 0.6034270355022959, + "grad_norm": 1.498465657234192, + "learning_rate": 5.385e-05, + "loss": 0.5619, + "step": 10776 + }, + { + "epoch": 0.603483032814425, + "grad_norm": 1.2065180540084839, + "learning_rate": 5.3855e-05, + "loss": 0.5187, + "step": 10777 + }, + { + "epoch": 0.603539030126554, + "grad_norm": 1.4181182384490967, + "learning_rate": 5.386e-05, + "loss": 0.4366, + "step": 10778 + }, + { + "epoch": 0.603595027438683, + "grad_norm": 1.4053266048431396, + "learning_rate": 5.3865e-05, + "loss": 0.5172, + "step": 10779 + }, + { + "epoch": 0.603651024750812, + "grad_norm": 1.3527300357818604, + "learning_rate": 5.387e-05, + "loss": 0.5256, + "step": 10780 + }, + { + "epoch": 0.603707022062941, + "grad_norm": 1.6070576906204224, + "learning_rate": 5.3874999999999995e-05, + "loss": 0.5341, + "step": 10781 + }, + { + "epoch": 0.60376301937507, + "grad_norm": 1.2043564319610596, + "learning_rate": 5.388e-05, + "loss": 0.338, + "step": 10782 + }, + { + "epoch": 0.6038190166871991, + "grad_norm": 1.2934094667434692, + "learning_rate": 5.388500000000001e-05, + "loss": 0.3572, + 
"step": 10783 + }, + { + "epoch": 0.6038750139993281, + "grad_norm": 1.2597966194152832, + "learning_rate": 5.389000000000001e-05, + "loss": 0.4674, + "step": 10784 + }, + { + "epoch": 0.6039310113114571, + "grad_norm": 1.358716607093811, + "learning_rate": 5.3895000000000005e-05, + "loss": 0.4766, + "step": 10785 + }, + { + "epoch": 0.6039870086235861, + "grad_norm": 1.2400068044662476, + "learning_rate": 5.390000000000001e-05, + "loss": 0.4309, + "step": 10786 + }, + { + "epoch": 0.6040430059357151, + "grad_norm": 1.2446389198303223, + "learning_rate": 5.3905000000000007e-05, + "loss": 0.4155, + "step": 10787 + }, + { + "epoch": 0.6040990032478442, + "grad_norm": 1.3253979682922363, + "learning_rate": 5.3910000000000004e-05, + "loss": 0.4481, + "step": 10788 + }, + { + "epoch": 0.6041550005599732, + "grad_norm": 1.2350094318389893, + "learning_rate": 5.3915e-05, + "loss": 0.3623, + "step": 10789 + }, + { + "epoch": 0.6042109978721021, + "grad_norm": 1.4209998846054077, + "learning_rate": 5.3920000000000006e-05, + "loss": 0.4583, + "step": 10790 + }, + { + "epoch": 0.6042669951842311, + "grad_norm": 1.4345864057540894, + "learning_rate": 5.3925e-05, + "loss": 0.4316, + "step": 10791 + }, + { + "epoch": 0.6043229924963601, + "grad_norm": 1.2540873289108276, + "learning_rate": 5.393e-05, + "loss": 0.4195, + "step": 10792 + }, + { + "epoch": 0.6043789898084891, + "grad_norm": 1.4047833681106567, + "learning_rate": 5.3935000000000005e-05, + "loss": 0.4012, + "step": 10793 + }, + { + "epoch": 0.6044349871206182, + "grad_norm": 1.1563358306884766, + "learning_rate": 5.394e-05, + "loss": 0.3616, + "step": 10794 + }, + { + "epoch": 0.6044909844327472, + "grad_norm": 1.5559574365615845, + "learning_rate": 5.3945e-05, + "loss": 0.4022, + "step": 10795 + }, + { + "epoch": 0.6045469817448762, + "grad_norm": 1.235590934753418, + "learning_rate": 5.3950000000000004e-05, + "loss": 0.4277, + "step": 10796 + }, + { + "epoch": 0.6046029790570052, + "grad_norm": 1.1067087650299072, 
+ "learning_rate": 5.3955e-05, + "loss": 0.4182, + "step": 10797 + }, + { + "epoch": 0.6046589763691342, + "grad_norm": 1.5357977151870728, + "learning_rate": 5.396e-05, + "loss": 0.4248, + "step": 10798 + }, + { + "epoch": 0.6047149736812633, + "grad_norm": 1.1177622079849243, + "learning_rate": 5.3964999999999996e-05, + "loss": 0.4271, + "step": 10799 + }, + { + "epoch": 0.6047709709933923, + "grad_norm": 1.2507884502410889, + "learning_rate": 5.397e-05, + "loss": 0.4074, + "step": 10800 + }, + { + "epoch": 0.6048269683055213, + "grad_norm": 1.195744276046753, + "learning_rate": 5.3975e-05, + "loss": 0.3231, + "step": 10801 + }, + { + "epoch": 0.6048829656176503, + "grad_norm": 1.3030754327774048, + "learning_rate": 5.3979999999999995e-05, + "loss": 0.5291, + "step": 10802 + }, + { + "epoch": 0.6049389629297793, + "grad_norm": 1.6797661781311035, + "learning_rate": 5.3985000000000006e-05, + "loss": 0.4854, + "step": 10803 + }, + { + "epoch": 0.6049949602419084, + "grad_norm": 1.1389864683151245, + "learning_rate": 5.399000000000001e-05, + "loss": 0.4245, + "step": 10804 + }, + { + "epoch": 0.6050509575540374, + "grad_norm": 4.239365577697754, + "learning_rate": 5.399500000000001e-05, + "loss": 0.4988, + "step": 10805 + }, + { + "epoch": 0.6051069548661664, + "grad_norm": 1.2595585584640503, + "learning_rate": 5.4000000000000005e-05, + "loss": 0.4289, + "step": 10806 + }, + { + "epoch": 0.6051629521782954, + "grad_norm": 1.268800973892212, + "learning_rate": 5.400500000000001e-05, + "loss": 0.4455, + "step": 10807 + }, + { + "epoch": 0.6052189494904244, + "grad_norm": 1.621047019958496, + "learning_rate": 5.4010000000000006e-05, + "loss": 0.4633, + "step": 10808 + }, + { + "epoch": 0.6052749468025534, + "grad_norm": 1.2500159740447998, + "learning_rate": 5.4015000000000004e-05, + "loss": 0.4274, + "step": 10809 + }, + { + "epoch": 0.6053309441146825, + "grad_norm": 1.5942906141281128, + "learning_rate": 5.402e-05, + "loss": 0.4211, + "step": 10810 + }, + { + 
"epoch": 0.6053869414268115, + "grad_norm": 1.32368803024292, + "learning_rate": 5.4025000000000005e-05, + "loss": 0.5392, + "step": 10811 + }, + { + "epoch": 0.6054429387389405, + "grad_norm": 1.339520812034607, + "learning_rate": 5.403e-05, + "loss": 0.4374, + "step": 10812 + }, + { + "epoch": 0.6054989360510695, + "grad_norm": 2.1490044593811035, + "learning_rate": 5.4035e-05, + "loss": 0.4251, + "step": 10813 + }, + { + "epoch": 0.6055549333631985, + "grad_norm": 1.3826438188552856, + "learning_rate": 5.4040000000000004e-05, + "loss": 0.4717, + "step": 10814 + }, + { + "epoch": 0.6056109306753276, + "grad_norm": 1.2756162881851196, + "learning_rate": 5.4045e-05, + "loss": 0.3501, + "step": 10815 + }, + { + "epoch": 0.6056669279874566, + "grad_norm": 1.0150901079177856, + "learning_rate": 5.405e-05, + "loss": 0.366, + "step": 10816 + }, + { + "epoch": 0.6057229252995856, + "grad_norm": 1.6545933485031128, + "learning_rate": 5.4055e-05, + "loss": 0.4843, + "step": 10817 + }, + { + "epoch": 0.6057789226117146, + "grad_norm": 1.1055387258529663, + "learning_rate": 5.406e-05, + "loss": 0.38, + "step": 10818 + }, + { + "epoch": 0.6058349199238436, + "grad_norm": 1.436523199081421, + "learning_rate": 5.4065e-05, + "loss": 0.4003, + "step": 10819 + }, + { + "epoch": 0.6058909172359727, + "grad_norm": 1.28406822681427, + "learning_rate": 5.4069999999999996e-05, + "loss": 0.5015, + "step": 10820 + }, + { + "epoch": 0.6059469145481017, + "grad_norm": 1.147472620010376, + "learning_rate": 5.4075e-05, + "loss": 0.2851, + "step": 10821 + }, + { + "epoch": 0.6060029118602307, + "grad_norm": 1.0317665338516235, + "learning_rate": 5.408e-05, + "loss": 0.3906, + "step": 10822 + }, + { + "epoch": 0.6060589091723597, + "grad_norm": 1.2612948417663574, + "learning_rate": 5.408500000000001e-05, + "loss": 0.429, + "step": 10823 + }, + { + "epoch": 0.6061149064844887, + "grad_norm": 1.498570203781128, + "learning_rate": 5.4090000000000006e-05, + "loss": 0.535, + "step": 10824 + }, + { 
+ "epoch": 0.6061709037966178, + "grad_norm": 1.2321866750717163, + "learning_rate": 5.409500000000001e-05, + "loss": 0.4573, + "step": 10825 + }, + { + "epoch": 0.6062269011087468, + "grad_norm": 1.2394964694976807, + "learning_rate": 5.410000000000001e-05, + "loss": 0.4547, + "step": 10826 + }, + { + "epoch": 0.6062828984208758, + "grad_norm": 1.8866934776306152, + "learning_rate": 5.4105000000000005e-05, + "loss": 0.4295, + "step": 10827 + }, + { + "epoch": 0.6063388957330048, + "grad_norm": 1.3016738891601562, + "learning_rate": 5.411e-05, + "loss": 0.418, + "step": 10828 + }, + { + "epoch": 0.6063948930451338, + "grad_norm": 1.2495415210723877, + "learning_rate": 5.4115000000000006e-05, + "loss": 0.3469, + "step": 10829 + }, + { + "epoch": 0.6064508903572629, + "grad_norm": 1.394277811050415, + "learning_rate": 5.4120000000000004e-05, + "loss": 0.4012, + "step": 10830 + }, + { + "epoch": 0.6065068876693919, + "grad_norm": 1.276740550994873, + "learning_rate": 5.4125e-05, + "loss": 0.4102, + "step": 10831 + }, + { + "epoch": 0.6065628849815209, + "grad_norm": 1.152902364730835, + "learning_rate": 5.4130000000000005e-05, + "loss": 0.4301, + "step": 10832 + }, + { + "epoch": 0.6066188822936499, + "grad_norm": 1.3067655563354492, + "learning_rate": 5.4135e-05, + "loss": 0.5145, + "step": 10833 + }, + { + "epoch": 0.6066748796057789, + "grad_norm": 1.089138388633728, + "learning_rate": 5.414e-05, + "loss": 0.371, + "step": 10834 + }, + { + "epoch": 0.606730876917908, + "grad_norm": 1.266743779182434, + "learning_rate": 5.4145000000000004e-05, + "loss": 0.4955, + "step": 10835 + }, + { + "epoch": 0.606786874230037, + "grad_norm": 1.3165127038955688, + "learning_rate": 5.415e-05, + "loss": 0.4544, + "step": 10836 + }, + { + "epoch": 0.606842871542166, + "grad_norm": 1.4284101724624634, + "learning_rate": 5.4155e-05, + "loss": 0.3966, + "step": 10837 + }, + { + "epoch": 0.606898868854295, + "grad_norm": 1.1028484106063843, + "learning_rate": 5.4159999999999996e-05, + 
"loss": 0.3794, + "step": 10838 + }, + { + "epoch": 0.606954866166424, + "grad_norm": 1.2298623323440552, + "learning_rate": 5.4165e-05, + "loss": 0.424, + "step": 10839 + }, + { + "epoch": 0.607010863478553, + "grad_norm": 1.338499903678894, + "learning_rate": 5.417e-05, + "loss": 0.4441, + "step": 10840 + }, + { + "epoch": 0.6070668607906821, + "grad_norm": 1.2862958908081055, + "learning_rate": 5.4174999999999995e-05, + "loss": 0.4455, + "step": 10841 + }, + { + "epoch": 0.6071228581028111, + "grad_norm": 1.2241559028625488, + "learning_rate": 5.418e-05, + "loss": 0.421, + "step": 10842 + }, + { + "epoch": 0.6071788554149401, + "grad_norm": 1.0324537754058838, + "learning_rate": 5.418500000000001e-05, + "loss": 0.3749, + "step": 10843 + }, + { + "epoch": 0.6072348527270691, + "grad_norm": 1.3654017448425293, + "learning_rate": 5.419000000000001e-05, + "loss": 0.3271, + "step": 10844 + }, + { + "epoch": 0.6072908500391981, + "grad_norm": 1.1245365142822266, + "learning_rate": 5.4195000000000005e-05, + "loss": 0.3942, + "step": 10845 + }, + { + "epoch": 0.6073468473513272, + "grad_norm": 1.579898715019226, + "learning_rate": 5.420000000000001e-05, + "loss": 0.4232, + "step": 10846 + }, + { + "epoch": 0.6074028446634562, + "grad_norm": 1.23211669921875, + "learning_rate": 5.420500000000001e-05, + "loss": 0.4372, + "step": 10847 + }, + { + "epoch": 0.6074588419755852, + "grad_norm": 1.2611342668533325, + "learning_rate": 5.4210000000000004e-05, + "loss": 0.5056, + "step": 10848 + }, + { + "epoch": 0.6075148392877142, + "grad_norm": 1.1216042041778564, + "learning_rate": 5.4215e-05, + "loss": 0.3703, + "step": 10849 + }, + { + "epoch": 0.6075708365998432, + "grad_norm": 1.1612191200256348, + "learning_rate": 5.4220000000000006e-05, + "loss": 0.4331, + "step": 10850 + }, + { + "epoch": 0.6076268339119723, + "grad_norm": 1.5577188730239868, + "learning_rate": 5.4225000000000003e-05, + "loss": 0.6184, + "step": 10851 + }, + { + "epoch": 0.6076828312241013, + 
"grad_norm": 1.3442915678024292, + "learning_rate": 5.423e-05, + "loss": 0.3291, + "step": 10852 + }, + { + "epoch": 0.6077388285362303, + "grad_norm": 1.355039358139038, + "learning_rate": 5.4235000000000005e-05, + "loss": 0.6839, + "step": 10853 + }, + { + "epoch": 0.6077948258483593, + "grad_norm": 1.3512647151947021, + "learning_rate": 5.424e-05, + "loss": 0.4379, + "step": 10854 + }, + { + "epoch": 0.6078508231604883, + "grad_norm": 1.8527106046676636, + "learning_rate": 5.4245e-05, + "loss": 0.3735, + "step": 10855 + }, + { + "epoch": 0.6079068204726173, + "grad_norm": 1.3146089315414429, + "learning_rate": 5.4250000000000004e-05, + "loss": 0.3954, + "step": 10856 + }, + { + "epoch": 0.6079628177847464, + "grad_norm": 1.2463274002075195, + "learning_rate": 5.4255e-05, + "loss": 0.4262, + "step": 10857 + }, + { + "epoch": 0.6080188150968754, + "grad_norm": 1.1774721145629883, + "learning_rate": 5.426e-05, + "loss": 0.3186, + "step": 10858 + }, + { + "epoch": 0.6080748124090044, + "grad_norm": 1.1638809442520142, + "learning_rate": 5.4264999999999996e-05, + "loss": 0.3631, + "step": 10859 + }, + { + "epoch": 0.6081308097211334, + "grad_norm": 1.337945580482483, + "learning_rate": 5.427e-05, + "loss": 0.3337, + "step": 10860 + }, + { + "epoch": 0.6081868070332624, + "grad_norm": 1.6546392440795898, + "learning_rate": 5.4275e-05, + "loss": 0.5638, + "step": 10861 + }, + { + "epoch": 0.6082428043453915, + "grad_norm": 1.39755117893219, + "learning_rate": 5.4279999999999995e-05, + "loss": 0.4977, + "step": 10862 + }, + { + "epoch": 0.6082988016575205, + "grad_norm": 1.1643980741500854, + "learning_rate": 5.4285000000000006e-05, + "loss": 0.4908, + "step": 10863 + }, + { + "epoch": 0.6083547989696495, + "grad_norm": 1.3000119924545288, + "learning_rate": 5.429000000000001e-05, + "loss": 0.5467, + "step": 10864 + }, + { + "epoch": 0.6084107962817785, + "grad_norm": 1.3411471843719482, + "learning_rate": 5.429500000000001e-05, + "loss": 0.6696, + "step": 10865 + }, + 
{ + "epoch": 0.6084667935939075, + "grad_norm": 1.2692595720291138, + "learning_rate": 5.4300000000000005e-05, + "loss": 0.4051, + "step": 10866 + }, + { + "epoch": 0.6085227909060366, + "grad_norm": 1.143664836883545, + "learning_rate": 5.4305e-05, + "loss": 0.3524, + "step": 10867 + }, + { + "epoch": 0.6085787882181656, + "grad_norm": 1.2592768669128418, + "learning_rate": 5.431000000000001e-05, + "loss": 0.468, + "step": 10868 + }, + { + "epoch": 0.6086347855302946, + "grad_norm": 4.257694721221924, + "learning_rate": 5.4315000000000004e-05, + "loss": 0.375, + "step": 10869 + }, + { + "epoch": 0.6086907828424236, + "grad_norm": 1.3914134502410889, + "learning_rate": 5.432e-05, + "loss": 0.4355, + "step": 10870 + }, + { + "epoch": 0.6087467801545526, + "grad_norm": 1.4659074544906616, + "learning_rate": 5.4325000000000006e-05, + "loss": 0.6274, + "step": 10871 + }, + { + "epoch": 0.6088027774666815, + "grad_norm": 1.2778185606002808, + "learning_rate": 5.433e-05, + "loss": 0.5666, + "step": 10872 + }, + { + "epoch": 0.6088587747788106, + "grad_norm": 1.189117193222046, + "learning_rate": 5.4335e-05, + "loss": 0.4109, + "step": 10873 + }, + { + "epoch": 0.6089147720909396, + "grad_norm": 1.122721791267395, + "learning_rate": 5.4340000000000005e-05, + "loss": 0.4035, + "step": 10874 + }, + { + "epoch": 0.6089707694030686, + "grad_norm": 1.4179353713989258, + "learning_rate": 5.4345e-05, + "loss": 0.5543, + "step": 10875 + }, + { + "epoch": 0.6090267667151976, + "grad_norm": 1.0920861959457397, + "learning_rate": 5.435e-05, + "loss": 0.4486, + "step": 10876 + }, + { + "epoch": 0.6090827640273266, + "grad_norm": 1.174911618232727, + "learning_rate": 5.4355e-05, + "loss": 0.4765, + "step": 10877 + }, + { + "epoch": 0.6091387613394557, + "grad_norm": 1.4525388479232788, + "learning_rate": 5.436e-05, + "loss": 0.4524, + "step": 10878 + }, + { + "epoch": 0.6091947586515847, + "grad_norm": 1.2018752098083496, + "learning_rate": 5.4365e-05, + "loss": 0.3196, + "step": 
10879 + }, + { + "epoch": 0.6092507559637137, + "grad_norm": 1.4582374095916748, + "learning_rate": 5.4369999999999996e-05, + "loss": 0.4108, + "step": 10880 + }, + { + "epoch": 0.6093067532758427, + "grad_norm": 1.1760350465774536, + "learning_rate": 5.4375e-05, + "loss": 0.3301, + "step": 10881 + }, + { + "epoch": 0.6093627505879717, + "grad_norm": 1.2863696813583374, + "learning_rate": 5.438e-05, + "loss": 0.4477, + "step": 10882 + }, + { + "epoch": 0.6094187479001008, + "grad_norm": 1.2450844049453735, + "learning_rate": 5.4384999999999995e-05, + "loss": 0.4449, + "step": 10883 + }, + { + "epoch": 0.6094747452122298, + "grad_norm": 1.1178971529006958, + "learning_rate": 5.4390000000000006e-05, + "loss": 0.3883, + "step": 10884 + }, + { + "epoch": 0.6095307425243588, + "grad_norm": 1.490931749343872, + "learning_rate": 5.439500000000001e-05, + "loss": 0.3687, + "step": 10885 + }, + { + "epoch": 0.6095867398364878, + "grad_norm": 1.1268008947372437, + "learning_rate": 5.440000000000001e-05, + "loss": 0.4334, + "step": 10886 + }, + { + "epoch": 0.6096427371486168, + "grad_norm": 1.184987187385559, + "learning_rate": 5.4405000000000005e-05, + "loss": 0.4247, + "step": 10887 + }, + { + "epoch": 0.6096987344607459, + "grad_norm": 1.1118158102035522, + "learning_rate": 5.441e-05, + "loss": 0.3663, + "step": 10888 + }, + { + "epoch": 0.6097547317728749, + "grad_norm": 1.3129841089248657, + "learning_rate": 5.441500000000001e-05, + "loss": 0.2936, + "step": 10889 + }, + { + "epoch": 0.6098107290850039, + "grad_norm": 1.2024022340774536, + "learning_rate": 5.4420000000000004e-05, + "loss": 0.3951, + "step": 10890 + }, + { + "epoch": 0.6098667263971329, + "grad_norm": 1.322929859161377, + "learning_rate": 5.4425e-05, + "loss": 0.3611, + "step": 10891 + }, + { + "epoch": 0.6099227237092619, + "grad_norm": 1.272114872932434, + "learning_rate": 5.4430000000000006e-05, + "loss": 0.4719, + "step": 10892 + }, + { + "epoch": 0.609978721021391, + "grad_norm": 1.081798791885376, + 
"learning_rate": 5.4435e-05, + "loss": 0.4308, + "step": 10893 + }, + { + "epoch": 0.61003471833352, + "grad_norm": 1.277859091758728, + "learning_rate": 5.444e-05, + "loss": 0.4554, + "step": 10894 + }, + { + "epoch": 0.610090715645649, + "grad_norm": 1.3497674465179443, + "learning_rate": 5.4445000000000005e-05, + "loss": 0.3705, + "step": 10895 + }, + { + "epoch": 0.610146712957778, + "grad_norm": 1.611820936203003, + "learning_rate": 5.445e-05, + "loss": 0.457, + "step": 10896 + }, + { + "epoch": 0.610202710269907, + "grad_norm": 1.240739107131958, + "learning_rate": 5.4455e-05, + "loss": 0.4434, + "step": 10897 + }, + { + "epoch": 0.610258707582036, + "grad_norm": 1.3244433403015137, + "learning_rate": 5.446e-05, + "loss": 0.3844, + "step": 10898 + }, + { + "epoch": 0.6103147048941651, + "grad_norm": 1.235936164855957, + "learning_rate": 5.4465e-05, + "loss": 0.363, + "step": 10899 + }, + { + "epoch": 0.6103707022062941, + "grad_norm": 1.3539587259292603, + "learning_rate": 5.447e-05, + "loss": 0.4094, + "step": 10900 + }, + { + "epoch": 0.6104266995184231, + "grad_norm": 1.5547099113464355, + "learning_rate": 5.4474999999999996e-05, + "loss": 0.5054, + "step": 10901 + }, + { + "epoch": 0.6104826968305521, + "grad_norm": 1.3546676635742188, + "learning_rate": 5.448e-05, + "loss": 0.5038, + "step": 10902 + }, + { + "epoch": 0.6105386941426811, + "grad_norm": 1.2518770694732666, + "learning_rate": 5.4485e-05, + "loss": 0.3849, + "step": 10903 + }, + { + "epoch": 0.6105946914548102, + "grad_norm": 1.5321272611618042, + "learning_rate": 5.449000000000001e-05, + "loss": 0.4972, + "step": 10904 + }, + { + "epoch": 0.6106506887669392, + "grad_norm": 1.3916819095611572, + "learning_rate": 5.4495000000000006e-05, + "loss": 0.4346, + "step": 10905 + }, + { + "epoch": 0.6107066860790682, + "grad_norm": 1.4135522842407227, + "learning_rate": 5.45e-05, + "loss": 0.5123, + "step": 10906 + }, + { + "epoch": 0.6107626833911972, + "grad_norm": 1.1799620389938354, + 
"learning_rate": 5.450500000000001e-05, + "loss": 0.4236, + "step": 10907 + }, + { + "epoch": 0.6108186807033262, + "grad_norm": 1.6834977865219116, + "learning_rate": 5.4510000000000005e-05, + "loss": 0.4363, + "step": 10908 + }, + { + "epoch": 0.6108746780154553, + "grad_norm": 1.3735376596450806, + "learning_rate": 5.4515e-05, + "loss": 0.5093, + "step": 10909 + }, + { + "epoch": 0.6109306753275843, + "grad_norm": 1.518600583076477, + "learning_rate": 5.4520000000000007e-05, + "loss": 0.5383, + "step": 10910 + }, + { + "epoch": 0.6109866726397133, + "grad_norm": 1.2262818813323975, + "learning_rate": 5.4525000000000004e-05, + "loss": 0.4242, + "step": 10911 + }, + { + "epoch": 0.6110426699518423, + "grad_norm": 1.3563529253005981, + "learning_rate": 5.453e-05, + "loss": 0.5936, + "step": 10912 + }, + { + "epoch": 0.6110986672639713, + "grad_norm": 1.1710436344146729, + "learning_rate": 5.4535000000000006e-05, + "loss": 0.4508, + "step": 10913 + }, + { + "epoch": 0.6111546645761003, + "grad_norm": 1.396704077720642, + "learning_rate": 5.454e-05, + "loss": 0.4649, + "step": 10914 + }, + { + "epoch": 0.6112106618882294, + "grad_norm": 1.7950878143310547, + "learning_rate": 5.4545e-05, + "loss": 0.6378, + "step": 10915 + }, + { + "epoch": 0.6112666592003584, + "grad_norm": 1.2352250814437866, + "learning_rate": 5.455e-05, + "loss": 0.4524, + "step": 10916 + }, + { + "epoch": 0.6113226565124874, + "grad_norm": 1.079251766204834, + "learning_rate": 5.4555e-05, + "loss": 0.3759, + "step": 10917 + }, + { + "epoch": 0.6113786538246164, + "grad_norm": 1.436574101448059, + "learning_rate": 5.456e-05, + "loss": 0.3481, + "step": 10918 + }, + { + "epoch": 0.6114346511367454, + "grad_norm": 1.346972942352295, + "learning_rate": 5.4565e-05, + "loss": 0.4239, + "step": 10919 + }, + { + "epoch": 0.6114906484488745, + "grad_norm": 1.3086992502212524, + "learning_rate": 5.457e-05, + "loss": 0.4323, + "step": 10920 + }, + { + "epoch": 0.6115466457610035, + "grad_norm": 
1.2626492977142334, + "learning_rate": 5.4575e-05, + "loss": 0.4108, + "step": 10921 + }, + { + "epoch": 0.6116026430731325, + "grad_norm": 1.3344029188156128, + "learning_rate": 5.4579999999999996e-05, + "loss": 0.4914, + "step": 10922 + }, + { + "epoch": 0.6116586403852615, + "grad_norm": 1.2221949100494385, + "learning_rate": 5.4585e-05, + "loss": 0.3818, + "step": 10923 + }, + { + "epoch": 0.6117146376973905, + "grad_norm": 1.3945127725601196, + "learning_rate": 5.459000000000001e-05, + "loss": 0.4025, + "step": 10924 + }, + { + "epoch": 0.6117706350095196, + "grad_norm": 1.5424026250839233, + "learning_rate": 5.459500000000001e-05, + "loss": 0.5179, + "step": 10925 + }, + { + "epoch": 0.6118266323216486, + "grad_norm": 1.215567708015442, + "learning_rate": 5.4600000000000006e-05, + "loss": 0.4808, + "step": 10926 + }, + { + "epoch": 0.6118826296337776, + "grad_norm": 1.0344998836517334, + "learning_rate": 5.4605e-05, + "loss": 0.4033, + "step": 10927 + }, + { + "epoch": 0.6119386269459066, + "grad_norm": 1.3918633460998535, + "learning_rate": 5.461000000000001e-05, + "loss": 0.4244, + "step": 10928 + }, + { + "epoch": 0.6119946242580356, + "grad_norm": 1.1625525951385498, + "learning_rate": 5.4615000000000005e-05, + "loss": 0.4361, + "step": 10929 + }, + { + "epoch": 0.6120506215701647, + "grad_norm": 1.3637739419937134, + "learning_rate": 5.462e-05, + "loss": 0.4096, + "step": 10930 + }, + { + "epoch": 0.6121066188822937, + "grad_norm": 0.9587275385856628, + "learning_rate": 5.4625000000000006e-05, + "loss": 0.335, + "step": 10931 + }, + { + "epoch": 0.6121626161944227, + "grad_norm": 1.3482089042663574, + "learning_rate": 5.4630000000000004e-05, + "loss": 0.4462, + "step": 10932 + }, + { + "epoch": 0.6122186135065517, + "grad_norm": 1.388649344444275, + "learning_rate": 5.4635e-05, + "loss": 0.354, + "step": 10933 + }, + { + "epoch": 0.6122746108186807, + "grad_norm": 1.1378109455108643, + "learning_rate": 5.4640000000000005e-05, + "loss": 0.316, + "step": 
10934 + }, + { + "epoch": 0.6123306081308098, + "grad_norm": 1.0495285987854004, + "learning_rate": 5.4645e-05, + "loss": 0.3512, + "step": 10935 + }, + { + "epoch": 0.6123866054429388, + "grad_norm": 2.3947298526763916, + "learning_rate": 5.465e-05, + "loss": 0.4391, + "step": 10936 + }, + { + "epoch": 0.6124426027550678, + "grad_norm": 1.2747169733047485, + "learning_rate": 5.4655e-05, + "loss": 0.4272, + "step": 10937 + }, + { + "epoch": 0.6124986000671968, + "grad_norm": 1.2688775062561035, + "learning_rate": 5.466e-05, + "loss": 0.4662, + "step": 10938 + }, + { + "epoch": 0.6125545973793258, + "grad_norm": 1.249367594718933, + "learning_rate": 5.4665e-05, + "loss": 0.3896, + "step": 10939 + }, + { + "epoch": 0.6126105946914548, + "grad_norm": 1.2491068840026855, + "learning_rate": 5.467e-05, + "loss": 0.3809, + "step": 10940 + }, + { + "epoch": 0.6126665920035839, + "grad_norm": 1.1681910753250122, + "learning_rate": 5.4675e-05, + "loss": 0.3824, + "step": 10941 + }, + { + "epoch": 0.6127225893157129, + "grad_norm": 1.31464684009552, + "learning_rate": 5.468e-05, + "loss": 0.4716, + "step": 10942 + }, + { + "epoch": 0.6127785866278419, + "grad_norm": 1.1723101139068604, + "learning_rate": 5.4684999999999996e-05, + "loss": 0.4744, + "step": 10943 + }, + { + "epoch": 0.6128345839399709, + "grad_norm": 1.363568663597107, + "learning_rate": 5.469000000000001e-05, + "loss": 0.4266, + "step": 10944 + }, + { + "epoch": 0.6128905812520999, + "grad_norm": 1.425294280052185, + "learning_rate": 5.4695000000000004e-05, + "loss": 0.6472, + "step": 10945 + }, + { + "epoch": 0.612946578564229, + "grad_norm": 1.3231925964355469, + "learning_rate": 5.470000000000001e-05, + "loss": 0.372, + "step": 10946 + }, + { + "epoch": 0.613002575876358, + "grad_norm": 1.4727872610092163, + "learning_rate": 5.4705000000000006e-05, + "loss": 0.3979, + "step": 10947 + }, + { + "epoch": 0.613058573188487, + "grad_norm": 1.184914469718933, + "learning_rate": 5.471e-05, + "loss": 0.4542, + 
"step": 10948 + }, + { + "epoch": 0.613114570500616, + "grad_norm": 1.3102973699569702, + "learning_rate": 5.471500000000001e-05, + "loss": 0.4403, + "step": 10949 + }, + { + "epoch": 0.613170567812745, + "grad_norm": 1.3212430477142334, + "learning_rate": 5.4720000000000005e-05, + "loss": 0.4557, + "step": 10950 + }, + { + "epoch": 0.6132265651248741, + "grad_norm": 1.1247193813323975, + "learning_rate": 5.4725e-05, + "loss": 0.3428, + "step": 10951 + }, + { + "epoch": 0.6132825624370031, + "grad_norm": 2.253873825073242, + "learning_rate": 5.4730000000000006e-05, + "loss": 0.547, + "step": 10952 + }, + { + "epoch": 0.6133385597491321, + "grad_norm": 1.2817833423614502, + "learning_rate": 5.4735000000000004e-05, + "loss": 0.4066, + "step": 10953 + }, + { + "epoch": 0.6133945570612611, + "grad_norm": 1.4005568027496338, + "learning_rate": 5.474e-05, + "loss": 0.3571, + "step": 10954 + }, + { + "epoch": 0.61345055437339, + "grad_norm": 1.2430723905563354, + "learning_rate": 5.4745e-05, + "loss": 0.4512, + "step": 10955 + }, + { + "epoch": 0.613506551685519, + "grad_norm": 1.6329760551452637, + "learning_rate": 5.475e-05, + "loss": 0.7016, + "step": 10956 + }, + { + "epoch": 0.6135625489976481, + "grad_norm": 1.369162917137146, + "learning_rate": 5.4755e-05, + "loss": 0.4148, + "step": 10957 + }, + { + "epoch": 0.6136185463097771, + "grad_norm": 1.1662013530731201, + "learning_rate": 5.476e-05, + "loss": 0.4242, + "step": 10958 + }, + { + "epoch": 0.6136745436219061, + "grad_norm": 1.0382499694824219, + "learning_rate": 5.4765e-05, + "loss": 0.3375, + "step": 10959 + }, + { + "epoch": 0.6137305409340351, + "grad_norm": 1.6717263460159302, + "learning_rate": 5.477e-05, + "loss": 0.4685, + "step": 10960 + }, + { + "epoch": 0.6137865382461641, + "grad_norm": 1.4468532800674438, + "learning_rate": 5.4774999999999996e-05, + "loss": 0.4467, + "step": 10961 + }, + { + "epoch": 0.6138425355582932, + "grad_norm": 1.1400389671325684, + "learning_rate": 5.478e-05, + "loss": 
0.4137, + "step": 10962 + }, + { + "epoch": 0.6138985328704222, + "grad_norm": 1.5730295181274414, + "learning_rate": 5.4785e-05, + "loss": 0.3277, + "step": 10963 + }, + { + "epoch": 0.6139545301825512, + "grad_norm": 1.0826725959777832, + "learning_rate": 5.479000000000001e-05, + "loss": 0.3517, + "step": 10964 + }, + { + "epoch": 0.6140105274946802, + "grad_norm": 1.3723716735839844, + "learning_rate": 5.4795000000000006e-05, + "loss": 0.4235, + "step": 10965 + }, + { + "epoch": 0.6140665248068092, + "grad_norm": 1.3577009439468384, + "learning_rate": 5.4800000000000004e-05, + "loss": 0.5371, + "step": 10966 + }, + { + "epoch": 0.6141225221189383, + "grad_norm": 1.2552025318145752, + "learning_rate": 5.480500000000001e-05, + "loss": 0.4135, + "step": 10967 + }, + { + "epoch": 0.6141785194310673, + "grad_norm": 1.306849479675293, + "learning_rate": 5.4810000000000005e-05, + "loss": 0.5673, + "step": 10968 + }, + { + "epoch": 0.6142345167431963, + "grad_norm": 1.1524357795715332, + "learning_rate": 5.4815e-05, + "loss": 0.359, + "step": 10969 + }, + { + "epoch": 0.6142905140553253, + "grad_norm": 1.3059027194976807, + "learning_rate": 5.482000000000001e-05, + "loss": 0.4656, + "step": 10970 + }, + { + "epoch": 0.6143465113674543, + "grad_norm": 1.4936234951019287, + "learning_rate": 5.4825000000000004e-05, + "loss": 0.4255, + "step": 10971 + }, + { + "epoch": 0.6144025086795833, + "grad_norm": 1.2601217031478882, + "learning_rate": 5.483e-05, + "loss": 0.6143, + "step": 10972 + }, + { + "epoch": 0.6144585059917124, + "grad_norm": 1.4112039804458618, + "learning_rate": 5.4835000000000006e-05, + "loss": 0.5809, + "step": 10973 + }, + { + "epoch": 0.6145145033038414, + "grad_norm": 1.4576923847198486, + "learning_rate": 5.4840000000000003e-05, + "loss": 0.3353, + "step": 10974 + }, + { + "epoch": 0.6145705006159704, + "grad_norm": 1.2597851753234863, + "learning_rate": 5.4845e-05, + "loss": 0.5477, + "step": 10975 + }, + { + "epoch": 0.6146264979280994, + 
"grad_norm": 1.2168810367584229, + "learning_rate": 5.485e-05, + "loss": 0.4293, + "step": 10976 + }, + { + "epoch": 0.6146824952402284, + "grad_norm": 1.051815152168274, + "learning_rate": 5.4855e-05, + "loss": 0.4024, + "step": 10977 + }, + { + "epoch": 0.6147384925523575, + "grad_norm": 1.2898094654083252, + "learning_rate": 5.486e-05, + "loss": 0.4484, + "step": 10978 + }, + { + "epoch": 0.6147944898644865, + "grad_norm": 1.4077908992767334, + "learning_rate": 5.4865e-05, + "loss": 0.4834, + "step": 10979 + }, + { + "epoch": 0.6148504871766155, + "grad_norm": 1.5623785257339478, + "learning_rate": 5.487e-05, + "loss": 0.625, + "step": 10980 + }, + { + "epoch": 0.6149064844887445, + "grad_norm": 1.3912885189056396, + "learning_rate": 5.4875e-05, + "loss": 0.5837, + "step": 10981 + }, + { + "epoch": 0.6149624818008735, + "grad_norm": 1.4288856983184814, + "learning_rate": 5.4879999999999996e-05, + "loss": 0.4109, + "step": 10982 + }, + { + "epoch": 0.6150184791130026, + "grad_norm": 1.4826418161392212, + "learning_rate": 5.4885e-05, + "loss": 0.5164, + "step": 10983 + }, + { + "epoch": 0.6150744764251316, + "grad_norm": 1.431363821029663, + "learning_rate": 5.4890000000000005e-05, + "loss": 0.4435, + "step": 10984 + }, + { + "epoch": 0.6151304737372606, + "grad_norm": 1.2103575468063354, + "learning_rate": 5.489500000000001e-05, + "loss": 0.3564, + "step": 10985 + }, + { + "epoch": 0.6151864710493896, + "grad_norm": 1.0259290933609009, + "learning_rate": 5.4900000000000006e-05, + "loss": 0.3939, + "step": 10986 + }, + { + "epoch": 0.6152424683615186, + "grad_norm": 1.5478663444519043, + "learning_rate": 5.4905000000000004e-05, + "loss": 0.4409, + "step": 10987 + }, + { + "epoch": 0.6152984656736477, + "grad_norm": 1.2921854257583618, + "learning_rate": 5.491000000000001e-05, + "loss": 0.4146, + "step": 10988 + }, + { + "epoch": 0.6153544629857767, + "grad_norm": 1.2508203983306885, + "learning_rate": 5.4915000000000005e-05, + "loss": 0.4168, + "step": 10989 + }, 
+ { + "epoch": 0.6154104602979057, + "grad_norm": 1.3484240770339966, + "learning_rate": 5.492e-05, + "loss": 0.4035, + "step": 10990 + }, + { + "epoch": 0.6154664576100347, + "grad_norm": 1.207398533821106, + "learning_rate": 5.492500000000001e-05, + "loss": 0.3209, + "step": 10991 + }, + { + "epoch": 0.6155224549221637, + "grad_norm": 2.1470868587493896, + "learning_rate": 5.4930000000000004e-05, + "loss": 0.3565, + "step": 10992 + }, + { + "epoch": 0.6155784522342928, + "grad_norm": 1.2360011339187622, + "learning_rate": 5.4935e-05, + "loss": 0.3835, + "step": 10993 + }, + { + "epoch": 0.6156344495464218, + "grad_norm": 1.2802917957305908, + "learning_rate": 5.4940000000000006e-05, + "loss": 0.3776, + "step": 10994 + }, + { + "epoch": 0.6156904468585508, + "grad_norm": 1.2238057851791382, + "learning_rate": 5.4945e-05, + "loss": 0.4526, + "step": 10995 + }, + { + "epoch": 0.6157464441706798, + "grad_norm": 1.3067924976348877, + "learning_rate": 5.495e-05, + "loss": 0.4185, + "step": 10996 + }, + { + "epoch": 0.6158024414828088, + "grad_norm": 1.451428771018982, + "learning_rate": 5.4955e-05, + "loss": 0.6839, + "step": 10997 + }, + { + "epoch": 0.6158584387949378, + "grad_norm": 3.082087516784668, + "learning_rate": 5.496e-05, + "loss": 0.4933, + "step": 10998 + }, + { + "epoch": 0.6159144361070669, + "grad_norm": 1.1497201919555664, + "learning_rate": 5.4965e-05, + "loss": 0.5335, + "step": 10999 + }, + { + "epoch": 0.6159704334191959, + "grad_norm": 1.3007779121398926, + "learning_rate": 5.497e-05, + "loss": 0.4934, + "step": 11000 + }, + { + "epoch": 0.6160264307313249, + "grad_norm": 1.517802357673645, + "learning_rate": 5.4975e-05, + "loss": 0.622, + "step": 11001 + }, + { + "epoch": 0.6160824280434539, + "grad_norm": 1.2755626440048218, + "learning_rate": 5.498e-05, + "loss": 0.3995, + "step": 11002 + }, + { + "epoch": 0.6161384253555829, + "grad_norm": 1.2727468013763428, + "learning_rate": 5.4984999999999996e-05, + "loss": 0.3699, + "step": 11003 + }, + 
{ + "epoch": 0.616194422667712, + "grad_norm": 1.5111430883407593, + "learning_rate": 5.499000000000001e-05, + "loss": 0.4512, + "step": 11004 + }, + { + "epoch": 0.616250419979841, + "grad_norm": 1.1231077909469604, + "learning_rate": 5.4995000000000005e-05, + "loss": 0.3624, + "step": 11005 + }, + { + "epoch": 0.61630641729197, + "grad_norm": 1.5678112506866455, + "learning_rate": 5.500000000000001e-05, + "loss": 0.4726, + "step": 11006 + }, + { + "epoch": 0.616362414604099, + "grad_norm": 1.218222737312317, + "learning_rate": 5.5005000000000006e-05, + "loss": 0.4233, + "step": 11007 + }, + { + "epoch": 0.616418411916228, + "grad_norm": 1.5231025218963623, + "learning_rate": 5.5010000000000004e-05, + "loss": 0.5874, + "step": 11008 + }, + { + "epoch": 0.6164744092283571, + "grad_norm": 1.3632187843322754, + "learning_rate": 5.501500000000001e-05, + "loss": 0.5486, + "step": 11009 + }, + { + "epoch": 0.6165304065404861, + "grad_norm": 1.3238469362258911, + "learning_rate": 5.5020000000000005e-05, + "loss": 0.5328, + "step": 11010 + }, + { + "epoch": 0.6165864038526151, + "grad_norm": 1.674149751663208, + "learning_rate": 5.5025e-05, + "loss": 0.4168, + "step": 11011 + }, + { + "epoch": 0.6166424011647441, + "grad_norm": 1.2645148038864136, + "learning_rate": 5.503000000000001e-05, + "loss": 0.5078, + "step": 11012 + }, + { + "epoch": 0.6166983984768731, + "grad_norm": 1.3339440822601318, + "learning_rate": 5.5035000000000004e-05, + "loss": 0.4358, + "step": 11013 + }, + { + "epoch": 0.6167543957890022, + "grad_norm": 1.254732608795166, + "learning_rate": 5.504e-05, + "loss": 0.5489, + "step": 11014 + }, + { + "epoch": 0.6168103931011312, + "grad_norm": 1.3222465515136719, + "learning_rate": 5.5045e-05, + "loss": 0.6669, + "step": 11015 + }, + { + "epoch": 0.6168663904132602, + "grad_norm": 1.307663083076477, + "learning_rate": 5.505e-05, + "loss": 0.3796, + "step": 11016 + }, + { + "epoch": 0.6169223877253892, + "grad_norm": 1.1509522199630737, + "learning_rate": 
5.5055e-05, + "loss": 0.3864, + "step": 11017 + }, + { + "epoch": 0.6169783850375182, + "grad_norm": 1.4544485807418823, + "learning_rate": 5.506e-05, + "loss": 0.4906, + "step": 11018 + }, + { + "epoch": 0.6170343823496472, + "grad_norm": 1.219419002532959, + "learning_rate": 5.5065e-05, + "loss": 0.493, + "step": 11019 + }, + { + "epoch": 0.6170903796617763, + "grad_norm": 1.4182944297790527, + "learning_rate": 5.507e-05, + "loss": 0.4782, + "step": 11020 + }, + { + "epoch": 0.6171463769739053, + "grad_norm": 1.3153084516525269, + "learning_rate": 5.5075e-05, + "loss": 0.368, + "step": 11021 + }, + { + "epoch": 0.6172023742860343, + "grad_norm": 1.4359521865844727, + "learning_rate": 5.508e-05, + "loss": 0.6702, + "step": 11022 + }, + { + "epoch": 0.6172583715981633, + "grad_norm": 1.212326169013977, + "learning_rate": 5.5085e-05, + "loss": 0.4304, + "step": 11023 + }, + { + "epoch": 0.6173143689102923, + "grad_norm": 1.1759984493255615, + "learning_rate": 5.5089999999999996e-05, + "loss": 0.4048, + "step": 11024 + }, + { + "epoch": 0.6173703662224214, + "grad_norm": 1.2775005102157593, + "learning_rate": 5.509500000000001e-05, + "loss": 0.3695, + "step": 11025 + }, + { + "epoch": 0.6174263635345504, + "grad_norm": 1.2537641525268555, + "learning_rate": 5.5100000000000004e-05, + "loss": 0.3624, + "step": 11026 + }, + { + "epoch": 0.6174823608466794, + "grad_norm": 1.345799446105957, + "learning_rate": 5.510500000000001e-05, + "loss": 0.6128, + "step": 11027 + }, + { + "epoch": 0.6175383581588084, + "grad_norm": 1.2790555953979492, + "learning_rate": 5.5110000000000006e-05, + "loss": 0.4091, + "step": 11028 + }, + { + "epoch": 0.6175943554709374, + "grad_norm": 1.3208950757980347, + "learning_rate": 5.5115000000000003e-05, + "loss": 0.4776, + "step": 11029 + }, + { + "epoch": 0.6176503527830665, + "grad_norm": 1.0869702100753784, + "learning_rate": 5.512000000000001e-05, + "loss": 0.3377, + "step": 11030 + }, + { + "epoch": 0.6177063500951955, + "grad_norm": 
1.5070682764053345, + "learning_rate": 5.5125000000000005e-05, + "loss": 0.5009, + "step": 11031 + }, + { + "epoch": 0.6177623474073245, + "grad_norm": 1.4577410221099854, + "learning_rate": 5.513e-05, + "loss": 0.5479, + "step": 11032 + }, + { + "epoch": 0.6178183447194535, + "grad_norm": 1.3744590282440186, + "learning_rate": 5.5135000000000007e-05, + "loss": 0.3575, + "step": 11033 + }, + { + "epoch": 0.6178743420315825, + "grad_norm": 1.0250931978225708, + "learning_rate": 5.5140000000000004e-05, + "loss": 0.3366, + "step": 11034 + }, + { + "epoch": 0.6179303393437116, + "grad_norm": 1.3300193548202515, + "learning_rate": 5.5145e-05, + "loss": 0.4172, + "step": 11035 + }, + { + "epoch": 0.6179863366558406, + "grad_norm": 1.2005608081817627, + "learning_rate": 5.515e-05, + "loss": 0.3565, + "step": 11036 + }, + { + "epoch": 0.6180423339679696, + "grad_norm": 1.2163310050964355, + "learning_rate": 5.5155e-05, + "loss": 0.3138, + "step": 11037 + }, + { + "epoch": 0.6180983312800985, + "grad_norm": 1.210322618484497, + "learning_rate": 5.516e-05, + "loss": 0.4677, + "step": 11038 + }, + { + "epoch": 0.6181543285922275, + "grad_norm": 1.21651029586792, + "learning_rate": 5.5165e-05, + "loss": 0.5215, + "step": 11039 + }, + { + "epoch": 0.6182103259043565, + "grad_norm": 1.3309255838394165, + "learning_rate": 5.517e-05, + "loss": 0.4064, + "step": 11040 + }, + { + "epoch": 0.6182663232164856, + "grad_norm": 1.3202449083328247, + "learning_rate": 5.5175e-05, + "loss": 0.433, + "step": 11041 + }, + { + "epoch": 0.6183223205286146, + "grad_norm": 1.4714305400848389, + "learning_rate": 5.518e-05, + "loss": 0.4863, + "step": 11042 + }, + { + "epoch": 0.6183783178407436, + "grad_norm": 1.1451250314712524, + "learning_rate": 5.5185e-05, + "loss": 0.4105, + "step": 11043 + }, + { + "epoch": 0.6184343151528726, + "grad_norm": 1.2788653373718262, + "learning_rate": 5.519e-05, + "loss": 0.4258, + "step": 11044 + }, + { + "epoch": 0.6184903124650016, + "grad_norm": 
1.2494900226593018, + "learning_rate": 5.519500000000001e-05, + "loss": 0.3796, + "step": 11045 + }, + { + "epoch": 0.6185463097771307, + "grad_norm": 1.2078883647918701, + "learning_rate": 5.520000000000001e-05, + "loss": 0.4384, + "step": 11046 + }, + { + "epoch": 0.6186023070892597, + "grad_norm": 1.1643199920654297, + "learning_rate": 5.5205000000000004e-05, + "loss": 0.3847, + "step": 11047 + }, + { + "epoch": 0.6186583044013887, + "grad_norm": 2.068957805633545, + "learning_rate": 5.521000000000001e-05, + "loss": 0.4046, + "step": 11048 + }, + { + "epoch": 0.6187143017135177, + "grad_norm": 1.1836789846420288, + "learning_rate": 5.5215000000000006e-05, + "loss": 0.3422, + "step": 11049 + }, + { + "epoch": 0.6187702990256467, + "grad_norm": 1.6435412168502808, + "learning_rate": 5.522e-05, + "loss": 0.4077, + "step": 11050 + }, + { + "epoch": 0.6188262963377757, + "grad_norm": 1.2401477098464966, + "learning_rate": 5.522500000000001e-05, + "loss": 0.4519, + "step": 11051 + }, + { + "epoch": 0.6188822936499048, + "grad_norm": 1.4588103294372559, + "learning_rate": 5.5230000000000005e-05, + "loss": 0.5337, + "step": 11052 + }, + { + "epoch": 0.6189382909620338, + "grad_norm": 1.5246227979660034, + "learning_rate": 5.5235e-05, + "loss": 0.5767, + "step": 11053 + }, + { + "epoch": 0.6189942882741628, + "grad_norm": 3.2909200191497803, + "learning_rate": 5.524e-05, + "loss": 0.3592, + "step": 11054 + }, + { + "epoch": 0.6190502855862918, + "grad_norm": 1.2052184343338013, + "learning_rate": 5.5245000000000004e-05, + "loss": 0.4188, + "step": 11055 + }, + { + "epoch": 0.6191062828984208, + "grad_norm": 1.6031838655471802, + "learning_rate": 5.525e-05, + "loss": 0.4697, + "step": 11056 + }, + { + "epoch": 0.6191622802105499, + "grad_norm": 1.0913581848144531, + "learning_rate": 5.5255e-05, + "loss": 0.3663, + "step": 11057 + }, + { + "epoch": 0.6192182775226789, + "grad_norm": 1.2913140058517456, + "learning_rate": 5.526e-05, + "loss": 0.4313, + "step": 11058 + }, + 
{ + "epoch": 0.6192742748348079, + "grad_norm": 1.3415793180465698, + "learning_rate": 5.5265e-05, + "loss": 0.4306, + "step": 11059 + }, + { + "epoch": 0.6193302721469369, + "grad_norm": 1.519203782081604, + "learning_rate": 5.527e-05, + "loss": 0.4767, + "step": 11060 + }, + { + "epoch": 0.6193862694590659, + "grad_norm": 1.2701588869094849, + "learning_rate": 5.5275e-05, + "loss": 0.5083, + "step": 11061 + }, + { + "epoch": 0.619442266771195, + "grad_norm": 1.0991445779800415, + "learning_rate": 5.528e-05, + "loss": 0.347, + "step": 11062 + }, + { + "epoch": 0.619498264083324, + "grad_norm": 1.1720919609069824, + "learning_rate": 5.5285e-05, + "loss": 0.3684, + "step": 11063 + }, + { + "epoch": 0.619554261395453, + "grad_norm": 1.3075461387634277, + "learning_rate": 5.5289999999999994e-05, + "loss": 0.4524, + "step": 11064 + }, + { + "epoch": 0.619610258707582, + "grad_norm": 1.2739973068237305, + "learning_rate": 5.5295000000000005e-05, + "loss": 0.5399, + "step": 11065 + }, + { + "epoch": 0.619666256019711, + "grad_norm": 1.3019740581512451, + "learning_rate": 5.530000000000001e-05, + "loss": 0.4455, + "step": 11066 + }, + { + "epoch": 0.6197222533318401, + "grad_norm": 1.2433137893676758, + "learning_rate": 5.530500000000001e-05, + "loss": 0.3155, + "step": 11067 + }, + { + "epoch": 0.6197782506439691, + "grad_norm": 1.2786258459091187, + "learning_rate": 5.5310000000000004e-05, + "loss": 0.5279, + "step": 11068 + }, + { + "epoch": 0.6198342479560981, + "grad_norm": 1.3097648620605469, + "learning_rate": 5.531500000000001e-05, + "loss": 0.5644, + "step": 11069 + }, + { + "epoch": 0.6198902452682271, + "grad_norm": 1.406480312347412, + "learning_rate": 5.5320000000000006e-05, + "loss": 0.5045, + "step": 11070 + }, + { + "epoch": 0.6199462425803561, + "grad_norm": 1.22561776638031, + "learning_rate": 5.5325e-05, + "loss": 0.3946, + "step": 11071 + }, + { + "epoch": 0.6200022398924852, + "grad_norm": 1.4362882375717163, + "learning_rate": 5.533000000000001e-05, 
+ "loss": 0.4382, + "step": 11072 + }, + { + "epoch": 0.6200582372046142, + "grad_norm": 1.520639181137085, + "learning_rate": 5.5335000000000005e-05, + "loss": 0.4684, + "step": 11073 + }, + { + "epoch": 0.6201142345167432, + "grad_norm": 1.1990320682525635, + "learning_rate": 5.534e-05, + "loss": 0.4649, + "step": 11074 + }, + { + "epoch": 0.6201702318288722, + "grad_norm": 1.2321856021881104, + "learning_rate": 5.5345e-05, + "loss": 0.3917, + "step": 11075 + }, + { + "epoch": 0.6202262291410012, + "grad_norm": 1.2517001628875732, + "learning_rate": 5.5350000000000004e-05, + "loss": 0.3432, + "step": 11076 + }, + { + "epoch": 0.6202822264531302, + "grad_norm": 1.116336703300476, + "learning_rate": 5.5355e-05, + "loss": 0.414, + "step": 11077 + }, + { + "epoch": 0.6203382237652593, + "grad_norm": 1.4558522701263428, + "learning_rate": 5.536e-05, + "loss": 0.3346, + "step": 11078 + }, + { + "epoch": 0.6203942210773883, + "grad_norm": 1.334283709526062, + "learning_rate": 5.5365e-05, + "loss": 0.4531, + "step": 11079 + }, + { + "epoch": 0.6204502183895173, + "grad_norm": 1.297268033027649, + "learning_rate": 5.537e-05, + "loss": 0.5036, + "step": 11080 + }, + { + "epoch": 0.6205062157016463, + "grad_norm": 1.2266566753387451, + "learning_rate": 5.5375e-05, + "loss": 0.4021, + "step": 11081 + }, + { + "epoch": 0.6205622130137753, + "grad_norm": 1.2731362581253052, + "learning_rate": 5.538e-05, + "loss": 0.4218, + "step": 11082 + }, + { + "epoch": 0.6206182103259044, + "grad_norm": 1.3133450746536255, + "learning_rate": 5.5385e-05, + "loss": 0.4175, + "step": 11083 + }, + { + "epoch": 0.6206742076380334, + "grad_norm": 1.390400767326355, + "learning_rate": 5.5389999999999997e-05, + "loss": 0.3764, + "step": 11084 + }, + { + "epoch": 0.6207302049501624, + "grad_norm": 1.1527926921844482, + "learning_rate": 5.539500000000001e-05, + "loss": 0.3754, + "step": 11085 + }, + { + "epoch": 0.6207862022622914, + "grad_norm": 1.342327356338501, + "learning_rate": 
5.5400000000000005e-05, + "loss": 0.5999, + "step": 11086 + }, + { + "epoch": 0.6208421995744204, + "grad_norm": 1.1754281520843506, + "learning_rate": 5.540500000000001e-05, + "loss": 0.4429, + "step": 11087 + }, + { + "epoch": 0.6208981968865495, + "grad_norm": 1.2577422857284546, + "learning_rate": 5.5410000000000007e-05, + "loss": 0.4208, + "step": 11088 + }, + { + "epoch": 0.6209541941986785, + "grad_norm": 1.3542053699493408, + "learning_rate": 5.5415000000000004e-05, + "loss": 0.4091, + "step": 11089 + }, + { + "epoch": 0.6210101915108075, + "grad_norm": 1.1279963254928589, + "learning_rate": 5.542000000000001e-05, + "loss": 0.3579, + "step": 11090 + }, + { + "epoch": 0.6210661888229365, + "grad_norm": 0.9857292175292969, + "learning_rate": 5.5425000000000006e-05, + "loss": 0.2692, + "step": 11091 + }, + { + "epoch": 0.6211221861350655, + "grad_norm": 1.6026922464370728, + "learning_rate": 5.543e-05, + "loss": 0.5099, + "step": 11092 + }, + { + "epoch": 0.6211781834471946, + "grad_norm": 1.8746087551116943, + "learning_rate": 5.5435e-05, + "loss": 0.5977, + "step": 11093 + }, + { + "epoch": 0.6212341807593236, + "grad_norm": 1.1242713928222656, + "learning_rate": 5.5440000000000005e-05, + "loss": 0.3337, + "step": 11094 + }, + { + "epoch": 0.6212901780714526, + "grad_norm": 1.4334536790847778, + "learning_rate": 5.5445e-05, + "loss": 0.491, + "step": 11095 + }, + { + "epoch": 0.6213461753835816, + "grad_norm": 1.4200242757797241, + "learning_rate": 5.545e-05, + "loss": 0.4305, + "step": 11096 + }, + { + "epoch": 0.6214021726957106, + "grad_norm": 1.2854182720184326, + "learning_rate": 5.5455000000000004e-05, + "loss": 0.5226, + "step": 11097 + }, + { + "epoch": 0.6214581700078396, + "grad_norm": 1.1912659406661987, + "learning_rate": 5.546e-05, + "loss": 0.4192, + "step": 11098 + }, + { + "epoch": 0.6215141673199687, + "grad_norm": 1.1298226118087769, + "learning_rate": 5.5465e-05, + "loss": 0.3647, + "step": 11099 + }, + { + "epoch": 0.6215701646320977, + 
"grad_norm": 1.2152447700500488, + "learning_rate": 5.547e-05, + "loss": 0.311, + "step": 11100 + }, + { + "epoch": 0.6216261619442267, + "grad_norm": 1.1264703273773193, + "learning_rate": 5.5475e-05, + "loss": 0.3419, + "step": 11101 + }, + { + "epoch": 0.6216821592563557, + "grad_norm": 1.2177457809448242, + "learning_rate": 5.548e-05, + "loss": 0.3906, + "step": 11102 + }, + { + "epoch": 0.6217381565684847, + "grad_norm": 1.399300217628479, + "learning_rate": 5.5484999999999995e-05, + "loss": 0.4839, + "step": 11103 + }, + { + "epoch": 0.6217941538806138, + "grad_norm": 1.3462135791778564, + "learning_rate": 5.549e-05, + "loss": 0.4196, + "step": 11104 + }, + { + "epoch": 0.6218501511927428, + "grad_norm": 1.5765787363052368, + "learning_rate": 5.549500000000001e-05, + "loss": 0.5721, + "step": 11105 + }, + { + "epoch": 0.6219061485048718, + "grad_norm": 1.2746860980987549, + "learning_rate": 5.550000000000001e-05, + "loss": 0.4462, + "step": 11106 + }, + { + "epoch": 0.6219621458170008, + "grad_norm": 1.1315209865570068, + "learning_rate": 5.5505000000000005e-05, + "loss": 0.396, + "step": 11107 + }, + { + "epoch": 0.6220181431291298, + "grad_norm": 1.1072025299072266, + "learning_rate": 5.551000000000001e-05, + "loss": 0.2609, + "step": 11108 + }, + { + "epoch": 0.6220741404412589, + "grad_norm": 1.4301235675811768, + "learning_rate": 5.5515000000000006e-05, + "loss": 0.4324, + "step": 11109 + }, + { + "epoch": 0.6221301377533879, + "grad_norm": 1.4740022420883179, + "learning_rate": 5.5520000000000004e-05, + "loss": 0.5201, + "step": 11110 + }, + { + "epoch": 0.6221861350655169, + "grad_norm": 1.1610413789749146, + "learning_rate": 5.552500000000001e-05, + "loss": 0.3384, + "step": 11111 + }, + { + "epoch": 0.6222421323776459, + "grad_norm": 1.0692731142044067, + "learning_rate": 5.5530000000000005e-05, + "loss": 0.36, + "step": 11112 + }, + { + "epoch": 0.6222981296897749, + "grad_norm": 1.2527985572814941, + "learning_rate": 5.5535e-05, + "loss": 0.5219, + 
"step": 11113 + }, + { + "epoch": 0.622354127001904, + "grad_norm": 1.2130718231201172, + "learning_rate": 5.554e-05, + "loss": 0.418, + "step": 11114 + }, + { + "epoch": 0.622410124314033, + "grad_norm": 1.4109265804290771, + "learning_rate": 5.5545000000000004e-05, + "loss": 0.4951, + "step": 11115 + }, + { + "epoch": 0.622466121626162, + "grad_norm": 1.3276960849761963, + "learning_rate": 5.555e-05, + "loss": 0.4697, + "step": 11116 + }, + { + "epoch": 0.622522118938291, + "grad_norm": 1.4295916557312012, + "learning_rate": 5.5555e-05, + "loss": 0.4085, + "step": 11117 + }, + { + "epoch": 0.62257811625042, + "grad_norm": 1.5808247327804565, + "learning_rate": 5.556e-05, + "loss": 0.4237, + "step": 11118 + }, + { + "epoch": 0.622634113562549, + "grad_norm": 1.3355120420455933, + "learning_rate": 5.5565e-05, + "loss": 0.5678, + "step": 11119 + }, + { + "epoch": 0.622690110874678, + "grad_norm": 1.452845811843872, + "learning_rate": 5.557e-05, + "loss": 0.5304, + "step": 11120 + }, + { + "epoch": 0.622746108186807, + "grad_norm": 1.1395128965377808, + "learning_rate": 5.5575e-05, + "loss": 0.3455, + "step": 11121 + }, + { + "epoch": 0.622802105498936, + "grad_norm": 1.3583852052688599, + "learning_rate": 5.558e-05, + "loss": 0.3351, + "step": 11122 + }, + { + "epoch": 0.622858102811065, + "grad_norm": 1.1834583282470703, + "learning_rate": 5.5585e-05, + "loss": 0.4016, + "step": 11123 + }, + { + "epoch": 0.622914100123194, + "grad_norm": 1.3968842029571533, + "learning_rate": 5.5589999999999995e-05, + "loss": 0.3693, + "step": 11124 + }, + { + "epoch": 0.6229700974353231, + "grad_norm": 1.4644670486450195, + "learning_rate": 5.5595000000000006e-05, + "loss": 0.5647, + "step": 11125 + }, + { + "epoch": 0.6230260947474521, + "grad_norm": 1.2715789079666138, + "learning_rate": 5.560000000000001e-05, + "loss": 0.3577, + "step": 11126 + }, + { + "epoch": 0.6230820920595811, + "grad_norm": 3.06221342086792, + "learning_rate": 5.560500000000001e-05, + "loss": 0.5486, + 
"step": 11127 + }, + { + "epoch": 0.6231380893717101, + "grad_norm": 1.3651798963546753, + "learning_rate": 5.5610000000000005e-05, + "loss": 0.4616, + "step": 11128 + }, + { + "epoch": 0.6231940866838391, + "grad_norm": 1.209020972251892, + "learning_rate": 5.561500000000001e-05, + "loss": 0.3837, + "step": 11129 + }, + { + "epoch": 0.6232500839959682, + "grad_norm": 1.3096704483032227, + "learning_rate": 5.5620000000000006e-05, + "loss": 0.4342, + "step": 11130 + }, + { + "epoch": 0.6233060813080972, + "grad_norm": 1.4271564483642578, + "learning_rate": 5.5625000000000004e-05, + "loss": 0.466, + "step": 11131 + }, + { + "epoch": 0.6233620786202262, + "grad_norm": 1.1934137344360352, + "learning_rate": 5.563e-05, + "loss": 0.4586, + "step": 11132 + }, + { + "epoch": 0.6234180759323552, + "grad_norm": 1.1513158082962036, + "learning_rate": 5.5635000000000005e-05, + "loss": 0.341, + "step": 11133 + }, + { + "epoch": 0.6234740732444842, + "grad_norm": 1.2885565757751465, + "learning_rate": 5.564e-05, + "loss": 0.4821, + "step": 11134 + }, + { + "epoch": 0.6235300705566132, + "grad_norm": 1.2884690761566162, + "learning_rate": 5.5645e-05, + "loss": 0.4927, + "step": 11135 + }, + { + "epoch": 0.6235860678687423, + "grad_norm": 1.2023165225982666, + "learning_rate": 5.5650000000000004e-05, + "loss": 0.3787, + "step": 11136 + }, + { + "epoch": 0.6236420651808713, + "grad_norm": 1.1008411645889282, + "learning_rate": 5.5655e-05, + "loss": 0.4443, + "step": 11137 + }, + { + "epoch": 0.6236980624930003, + "grad_norm": 1.222356915473938, + "learning_rate": 5.566e-05, + "loss": 0.4743, + "step": 11138 + }, + { + "epoch": 0.6237540598051293, + "grad_norm": 1.2594475746154785, + "learning_rate": 5.5665e-05, + "loss": 0.5381, + "step": 11139 + }, + { + "epoch": 0.6238100571172583, + "grad_norm": 1.3761041164398193, + "learning_rate": 5.567e-05, + "loss": 0.542, + "step": 11140 + }, + { + "epoch": 0.6238660544293874, + "grad_norm": 1.3204220533370972, + "learning_rate": 
5.5675e-05, + "loss": 0.47, + "step": 11141 + }, + { + "epoch": 0.6239220517415164, + "grad_norm": 1.188469648361206, + "learning_rate": 5.5679999999999995e-05, + "loss": 0.3275, + "step": 11142 + }, + { + "epoch": 0.6239780490536454, + "grad_norm": 1.4052780866622925, + "learning_rate": 5.5685e-05, + "loss": 0.4375, + "step": 11143 + }, + { + "epoch": 0.6240340463657744, + "grad_norm": 1.1389778852462769, + "learning_rate": 5.569e-05, + "loss": 0.336, + "step": 11144 + }, + { + "epoch": 0.6240900436779034, + "grad_norm": 1.3461494445800781, + "learning_rate": 5.5694999999999994e-05, + "loss": 0.4599, + "step": 11145 + }, + { + "epoch": 0.6241460409900325, + "grad_norm": 1.2822725772857666, + "learning_rate": 5.5700000000000005e-05, + "loss": 0.4538, + "step": 11146 + }, + { + "epoch": 0.6242020383021615, + "grad_norm": 1.4448376893997192, + "learning_rate": 5.570500000000001e-05, + "loss": 0.4361, + "step": 11147 + }, + { + "epoch": 0.6242580356142905, + "grad_norm": 1.0926368236541748, + "learning_rate": 5.571000000000001e-05, + "loss": 0.3762, + "step": 11148 + }, + { + "epoch": 0.6243140329264195, + "grad_norm": 1.2508419752120972, + "learning_rate": 5.5715000000000004e-05, + "loss": 0.2762, + "step": 11149 + }, + { + "epoch": 0.6243700302385485, + "grad_norm": 1.2052196264266968, + "learning_rate": 5.572000000000001e-05, + "loss": 0.4175, + "step": 11150 + }, + { + "epoch": 0.6244260275506776, + "grad_norm": 1.2460700273513794, + "learning_rate": 5.5725000000000006e-05, + "loss": 0.4135, + "step": 11151 + }, + { + "epoch": 0.6244820248628066, + "grad_norm": 1.1380853652954102, + "learning_rate": 5.5730000000000003e-05, + "loss": 0.4291, + "step": 11152 + }, + { + "epoch": 0.6245380221749356, + "grad_norm": 1.1354602575302124, + "learning_rate": 5.5735e-05, + "loss": 0.3984, + "step": 11153 + }, + { + "epoch": 0.6245940194870646, + "grad_norm": 1.2212356328964233, + "learning_rate": 5.5740000000000005e-05, + "loss": 0.3591, + "step": 11154 + }, + { + "epoch": 
0.6246500167991936, + "grad_norm": 1.7149409055709839, + "learning_rate": 5.5745e-05, + "loss": 0.4434, + "step": 11155 + }, + { + "epoch": 0.6247060141113226, + "grad_norm": 1.4858187437057495, + "learning_rate": 5.575e-05, + "loss": 0.5377, + "step": 11156 + }, + { + "epoch": 0.6247620114234517, + "grad_norm": 1.2911696434020996, + "learning_rate": 5.5755000000000004e-05, + "loss": 0.4101, + "step": 11157 + }, + { + "epoch": 0.6248180087355807, + "grad_norm": 1.328184962272644, + "learning_rate": 5.576e-05, + "loss": 0.3503, + "step": 11158 + }, + { + "epoch": 0.6248740060477097, + "grad_norm": 1.7264142036437988, + "learning_rate": 5.5765e-05, + "loss": 0.4688, + "step": 11159 + }, + { + "epoch": 0.6249300033598387, + "grad_norm": 1.4469679594039917, + "learning_rate": 5.577e-05, + "loss": 0.528, + "step": 11160 + }, + { + "epoch": 0.6249860006719677, + "grad_norm": 1.1961560249328613, + "learning_rate": 5.5775e-05, + "loss": 0.2944, + "step": 11161 + }, + { + "epoch": 0.6250419979840968, + "grad_norm": 1.1152257919311523, + "learning_rate": 5.578e-05, + "loss": 0.3623, + "step": 11162 + }, + { + "epoch": 0.6250979952962258, + "grad_norm": 1.3937879800796509, + "learning_rate": 5.5784999999999995e-05, + "loss": 0.3723, + "step": 11163 + }, + { + "epoch": 0.6251539926083548, + "grad_norm": 1.5608205795288086, + "learning_rate": 5.579e-05, + "loss": 0.5245, + "step": 11164 + }, + { + "epoch": 0.6252099899204838, + "grad_norm": 1.3844794034957886, + "learning_rate": 5.5795e-05, + "loss": 0.4392, + "step": 11165 + }, + { + "epoch": 0.6252659872326128, + "grad_norm": 1.4845026731491089, + "learning_rate": 5.580000000000001e-05, + "loss": 0.5709, + "step": 11166 + }, + { + "epoch": 0.6253219845447419, + "grad_norm": 1.2369791269302368, + "learning_rate": 5.5805000000000005e-05, + "loss": 0.4965, + "step": 11167 + }, + { + "epoch": 0.6253779818568709, + "grad_norm": 1.115130066871643, + "learning_rate": 5.581000000000001e-05, + "loss": 0.4414, + "step": 11168 + }, + { 
+ "epoch": 0.6254339791689999, + "grad_norm": 1.6319810152053833, + "learning_rate": 5.581500000000001e-05, + "loss": 0.5116, + "step": 11169 + }, + { + "epoch": 0.6254899764811289, + "grad_norm": 1.1978349685668945, + "learning_rate": 5.5820000000000004e-05, + "loss": 0.4042, + "step": 11170 + }, + { + "epoch": 0.6255459737932579, + "grad_norm": 1.140435814857483, + "learning_rate": 5.582500000000001e-05, + "loss": 0.4591, + "step": 11171 + }, + { + "epoch": 0.625601971105387, + "grad_norm": 1.1942261457443237, + "learning_rate": 5.5830000000000006e-05, + "loss": 0.5054, + "step": 11172 + }, + { + "epoch": 0.625657968417516, + "grad_norm": 1.1937553882598877, + "learning_rate": 5.5835e-05, + "loss": 0.4353, + "step": 11173 + }, + { + "epoch": 0.625713965729645, + "grad_norm": 1.2794886827468872, + "learning_rate": 5.584e-05, + "loss": 0.2802, + "step": 11174 + }, + { + "epoch": 0.625769963041774, + "grad_norm": 1.1961913108825684, + "learning_rate": 5.5845000000000005e-05, + "loss": 0.464, + "step": 11175 + }, + { + "epoch": 0.625825960353903, + "grad_norm": 1.4396162033081055, + "learning_rate": 5.585e-05, + "loss": 0.5098, + "step": 11176 + }, + { + "epoch": 0.625881957666032, + "grad_norm": 1.1948219537734985, + "learning_rate": 5.5855e-05, + "loss": 0.431, + "step": 11177 + }, + { + "epoch": 0.6259379549781611, + "grad_norm": 2.665365695953369, + "learning_rate": 5.5860000000000004e-05, + "loss": 0.421, + "step": 11178 + }, + { + "epoch": 0.6259939522902901, + "grad_norm": 1.2600346803665161, + "learning_rate": 5.5865e-05, + "loss": 0.4265, + "step": 11179 + }, + { + "epoch": 0.6260499496024191, + "grad_norm": 1.7787073850631714, + "learning_rate": 5.587e-05, + "loss": 0.5158, + "step": 11180 + }, + { + "epoch": 0.6261059469145481, + "grad_norm": 1.2258793115615845, + "learning_rate": 5.5875e-05, + "loss": 0.5191, + "step": 11181 + }, + { + "epoch": 0.6261619442266771, + "grad_norm": 1.2070649862289429, + "learning_rate": 5.588e-05, + "loss": 0.451, + "step": 
11182 + }, + { + "epoch": 0.6262179415388062, + "grad_norm": 4.887661457061768, + "learning_rate": 5.5885e-05, + "loss": 0.516, + "step": 11183 + }, + { + "epoch": 0.6262739388509352, + "grad_norm": 1.2355186939239502, + "learning_rate": 5.5889999999999995e-05, + "loss": 0.4068, + "step": 11184 + }, + { + "epoch": 0.6263299361630642, + "grad_norm": 1.3576050996780396, + "learning_rate": 5.5895e-05, + "loss": 0.6321, + "step": 11185 + }, + { + "epoch": 0.6263859334751932, + "grad_norm": 1.405781865119934, + "learning_rate": 5.590000000000001e-05, + "loss": 0.4014, + "step": 11186 + }, + { + "epoch": 0.6264419307873222, + "grad_norm": 1.2649202346801758, + "learning_rate": 5.590500000000001e-05, + "loss": 0.3719, + "step": 11187 + }, + { + "epoch": 0.6264979280994513, + "grad_norm": 1.101420521736145, + "learning_rate": 5.5910000000000005e-05, + "loss": 0.3204, + "step": 11188 + }, + { + "epoch": 0.6265539254115803, + "grad_norm": 1.2335408926010132, + "learning_rate": 5.591500000000001e-05, + "loss": 0.4149, + "step": 11189 + }, + { + "epoch": 0.6266099227237093, + "grad_norm": 1.3392070531845093, + "learning_rate": 5.592000000000001e-05, + "loss": 0.5609, + "step": 11190 + }, + { + "epoch": 0.6266659200358383, + "grad_norm": 1.339861512184143, + "learning_rate": 5.5925000000000004e-05, + "loss": 0.4918, + "step": 11191 + }, + { + "epoch": 0.6267219173479673, + "grad_norm": 1.6837819814682007, + "learning_rate": 5.593e-05, + "loss": 0.5612, + "step": 11192 + }, + { + "epoch": 0.6267779146600964, + "grad_norm": 1.3606007099151611, + "learning_rate": 5.5935000000000006e-05, + "loss": 0.4434, + "step": 11193 + }, + { + "epoch": 0.6268339119722254, + "grad_norm": 1.2954528331756592, + "learning_rate": 5.594e-05, + "loss": 0.5468, + "step": 11194 + }, + { + "epoch": 0.6268899092843544, + "grad_norm": 6.007063865661621, + "learning_rate": 5.5945e-05, + "loss": 0.4323, + "step": 11195 + }, + { + "epoch": 0.6269459065964834, + "grad_norm": 1.3410000801086426, + 
"learning_rate": 5.5950000000000005e-05, + "loss": 0.3654, + "step": 11196 + }, + { + "epoch": 0.6270019039086124, + "grad_norm": 1.3862791061401367, + "learning_rate": 5.5955e-05, + "loss": 0.515, + "step": 11197 + }, + { + "epoch": 0.6270579012207415, + "grad_norm": 1.2850056886672974, + "learning_rate": 5.596e-05, + "loss": 0.4258, + "step": 11198 + }, + { + "epoch": 0.6271138985328705, + "grad_norm": 1.225555658340454, + "learning_rate": 5.5965000000000004e-05, + "loss": 0.4058, + "step": 11199 + }, + { + "epoch": 0.6271698958449995, + "grad_norm": 1.2777115106582642, + "learning_rate": 5.597e-05, + "loss": 0.3095, + "step": 11200 + }, + { + "epoch": 0.6272258931571285, + "grad_norm": 1.643985629081726, + "learning_rate": 5.5975e-05, + "loss": 0.4018, + "step": 11201 + }, + { + "epoch": 0.6272818904692575, + "grad_norm": 1.3758684396743774, + "learning_rate": 5.5979999999999996e-05, + "loss": 0.5092, + "step": 11202 + }, + { + "epoch": 0.6273378877813864, + "grad_norm": 1.4489269256591797, + "learning_rate": 5.5985e-05, + "loss": 0.4493, + "step": 11203 + }, + { + "epoch": 0.6273938850935155, + "grad_norm": 1.2936818599700928, + "learning_rate": 5.599e-05, + "loss": 0.4872, + "step": 11204 + }, + { + "epoch": 0.6274498824056445, + "grad_norm": 1.2241450548171997, + "learning_rate": 5.5994999999999995e-05, + "loss": 0.3612, + "step": 11205 + }, + { + "epoch": 0.6275058797177735, + "grad_norm": 1.1195197105407715, + "learning_rate": 5.6000000000000006e-05, + "loss": 0.3864, + "step": 11206 + }, + { + "epoch": 0.6275618770299025, + "grad_norm": 1.2647725343704224, + "learning_rate": 5.600500000000001e-05, + "loss": 0.4808, + "step": 11207 + }, + { + "epoch": 0.6276178743420315, + "grad_norm": 1.250089406967163, + "learning_rate": 5.601000000000001e-05, + "loss": 0.402, + "step": 11208 + }, + { + "epoch": 0.6276738716541606, + "grad_norm": 1.9865802526474, + "learning_rate": 5.6015000000000005e-05, + "loss": 0.4525, + "step": 11209 + }, + { + "epoch": 
0.6277298689662896, + "grad_norm": 1.143970251083374, + "learning_rate": 5.602000000000001e-05, + "loss": 0.3206, + "step": 11210 + }, + { + "epoch": 0.6277858662784186, + "grad_norm": 1.2424830198287964, + "learning_rate": 5.6025000000000007e-05, + "loss": 0.4686, + "step": 11211 + }, + { + "epoch": 0.6278418635905476, + "grad_norm": 1.3840773105621338, + "learning_rate": 5.6030000000000004e-05, + "loss": 0.3806, + "step": 11212 + }, + { + "epoch": 0.6278978609026766, + "grad_norm": 1.2964428663253784, + "learning_rate": 5.6035e-05, + "loss": 0.4664, + "step": 11213 + }, + { + "epoch": 0.6279538582148056, + "grad_norm": 1.2423175573349, + "learning_rate": 5.6040000000000006e-05, + "loss": 0.3617, + "step": 11214 + }, + { + "epoch": 0.6280098555269347, + "grad_norm": 1.3496135473251343, + "learning_rate": 5.6045e-05, + "loss": 0.5255, + "step": 11215 + }, + { + "epoch": 0.6280658528390637, + "grad_norm": 1.2265090942382812, + "learning_rate": 5.605e-05, + "loss": 0.4631, + "step": 11216 + }, + { + "epoch": 0.6281218501511927, + "grad_norm": 1.2823275327682495, + "learning_rate": 5.6055000000000005e-05, + "loss": 0.3997, + "step": 11217 + }, + { + "epoch": 0.6281778474633217, + "grad_norm": 1.5288941860198975, + "learning_rate": 5.606e-05, + "loss": 0.5032, + "step": 11218 + }, + { + "epoch": 0.6282338447754507, + "grad_norm": 1.2193326950073242, + "learning_rate": 5.6065e-05, + "loss": 0.401, + "step": 11219 + }, + { + "epoch": 0.6282898420875798, + "grad_norm": 1.213442087173462, + "learning_rate": 5.6070000000000004e-05, + "loss": 0.4321, + "step": 11220 + }, + { + "epoch": 0.6283458393997088, + "grad_norm": 1.3099799156188965, + "learning_rate": 5.6075e-05, + "loss": 0.411, + "step": 11221 + }, + { + "epoch": 0.6284018367118378, + "grad_norm": 1.056294322013855, + "learning_rate": 5.608e-05, + "loss": 0.2766, + "step": 11222 + }, + { + "epoch": 0.6284578340239668, + "grad_norm": 1.1698724031448364, + "learning_rate": 5.6084999999999996e-05, + "loss": 0.413, + 
"step": 11223 + }, + { + "epoch": 0.6285138313360958, + "grad_norm": 1.145461916923523, + "learning_rate": 5.609e-05, + "loss": 0.2676, + "step": 11224 + }, + { + "epoch": 0.6285698286482249, + "grad_norm": 1.2429014444351196, + "learning_rate": 5.6095e-05, + "loss": 0.4648, + "step": 11225 + }, + { + "epoch": 0.6286258259603539, + "grad_norm": 1.3581485748291016, + "learning_rate": 5.610000000000001e-05, + "loss": 0.4733, + "step": 11226 + }, + { + "epoch": 0.6286818232724829, + "grad_norm": 1.0891610383987427, + "learning_rate": 5.6105000000000006e-05, + "loss": 0.3796, + "step": 11227 + }, + { + "epoch": 0.6287378205846119, + "grad_norm": 1.4651587009429932, + "learning_rate": 5.611000000000001e-05, + "loss": 0.5484, + "step": 11228 + }, + { + "epoch": 0.6287938178967409, + "grad_norm": 1.1366677284240723, + "learning_rate": 5.611500000000001e-05, + "loss": 0.3699, + "step": 11229 + }, + { + "epoch": 0.62884981520887, + "grad_norm": 1.1697146892547607, + "learning_rate": 5.6120000000000005e-05, + "loss": 0.4184, + "step": 11230 + }, + { + "epoch": 0.628905812520999, + "grad_norm": 1.4018419981002808, + "learning_rate": 5.6125e-05, + "loss": 0.5213, + "step": 11231 + }, + { + "epoch": 0.628961809833128, + "grad_norm": 1.3317739963531494, + "learning_rate": 5.6130000000000006e-05, + "loss": 0.4804, + "step": 11232 + }, + { + "epoch": 0.629017807145257, + "grad_norm": 1.0773656368255615, + "learning_rate": 5.6135000000000004e-05, + "loss": 0.3926, + "step": 11233 + }, + { + "epoch": 0.629073804457386, + "grad_norm": 2.1188793182373047, + "learning_rate": 5.614e-05, + "loss": 0.3593, + "step": 11234 + }, + { + "epoch": 0.629129801769515, + "grad_norm": 1.1686667203903198, + "learning_rate": 5.6145000000000005e-05, + "loss": 0.3245, + "step": 11235 + }, + { + "epoch": 0.6291857990816441, + "grad_norm": 1.5293480157852173, + "learning_rate": 5.615e-05, + "loss": 0.501, + "step": 11236 + }, + { + "epoch": 0.6292417963937731, + "grad_norm": 1.2358907461166382, + 
"learning_rate": 5.6155e-05, + "loss": 0.3587, + "step": 11237 + }, + { + "epoch": 0.6292977937059021, + "grad_norm": 6.005584239959717, + "learning_rate": 5.6160000000000004e-05, + "loss": 0.4272, + "step": 11238 + }, + { + "epoch": 0.6293537910180311, + "grad_norm": 1.945157527923584, + "learning_rate": 5.6165e-05, + "loss": 0.5091, + "step": 11239 + }, + { + "epoch": 0.6294097883301601, + "grad_norm": 1.199906349182129, + "learning_rate": 5.617e-05, + "loss": 0.4088, + "step": 11240 + }, + { + "epoch": 0.6294657856422892, + "grad_norm": 1.1609842777252197, + "learning_rate": 5.6175e-05, + "loss": 0.37, + "step": 11241 + }, + { + "epoch": 0.6295217829544182, + "grad_norm": 1.303733229637146, + "learning_rate": 5.618e-05, + "loss": 0.3684, + "step": 11242 + }, + { + "epoch": 0.6295777802665472, + "grad_norm": 1.3500808477401733, + "learning_rate": 5.6185e-05, + "loss": 0.49, + "step": 11243 + }, + { + "epoch": 0.6296337775786762, + "grad_norm": 1.1155012845993042, + "learning_rate": 5.6189999999999996e-05, + "loss": 0.3952, + "step": 11244 + }, + { + "epoch": 0.6296897748908052, + "grad_norm": 1.3555607795715332, + "learning_rate": 5.6195e-05, + "loss": 0.4157, + "step": 11245 + }, + { + "epoch": 0.6297457722029343, + "grad_norm": 1.3759735822677612, + "learning_rate": 5.620000000000001e-05, + "loss": 0.3974, + "step": 11246 + }, + { + "epoch": 0.6298017695150633, + "grad_norm": 1.5575779676437378, + "learning_rate": 5.620500000000001e-05, + "loss": 0.3911, + "step": 11247 + }, + { + "epoch": 0.6298577668271923, + "grad_norm": 1.4618136882781982, + "learning_rate": 5.6210000000000006e-05, + "loss": 0.4637, + "step": 11248 + }, + { + "epoch": 0.6299137641393213, + "grad_norm": 1.4149360656738281, + "learning_rate": 5.621500000000001e-05, + "loss": 0.4352, + "step": 11249 + }, + { + "epoch": 0.6299697614514503, + "grad_norm": 1.1021887063980103, + "learning_rate": 5.622000000000001e-05, + "loss": 0.2918, + "step": 11250 + }, + { + "epoch": 0.6300257587635794, + 
"grad_norm": 1.0999575853347778, + "learning_rate": 5.6225000000000005e-05, + "loss": 0.4268, + "step": 11251 + }, + { + "epoch": 0.6300817560757084, + "grad_norm": 1.0819861888885498, + "learning_rate": 5.623e-05, + "loss": 0.5071, + "step": 11252 + }, + { + "epoch": 0.6301377533878374, + "grad_norm": 1.3975441455841064, + "learning_rate": 5.6235000000000006e-05, + "loss": 0.4135, + "step": 11253 + }, + { + "epoch": 0.6301937506999664, + "grad_norm": 1.3753687143325806, + "learning_rate": 5.6240000000000004e-05, + "loss": 0.3901, + "step": 11254 + }, + { + "epoch": 0.6302497480120954, + "grad_norm": 1.2578248977661133, + "learning_rate": 5.6245e-05, + "loss": 0.4152, + "step": 11255 + }, + { + "epoch": 0.6303057453242245, + "grad_norm": 1.2870309352874756, + "learning_rate": 5.6250000000000005e-05, + "loss": 0.4242, + "step": 11256 + }, + { + "epoch": 0.6303617426363535, + "grad_norm": 1.084806203842163, + "learning_rate": 5.6255e-05, + "loss": 0.4124, + "step": 11257 + }, + { + "epoch": 0.6304177399484825, + "grad_norm": 1.4478613138198853, + "learning_rate": 5.626e-05, + "loss": 0.3683, + "step": 11258 + }, + { + "epoch": 0.6304737372606115, + "grad_norm": 1.4411780834197998, + "learning_rate": 5.6265000000000004e-05, + "loss": 0.5716, + "step": 11259 + }, + { + "epoch": 0.6305297345727405, + "grad_norm": 1.5875511169433594, + "learning_rate": 5.627e-05, + "loss": 0.4225, + "step": 11260 + }, + { + "epoch": 0.6305857318848695, + "grad_norm": 1.0330952405929565, + "learning_rate": 5.6275e-05, + "loss": 0.4313, + "step": 11261 + }, + { + "epoch": 0.6306417291969986, + "grad_norm": 1.1702361106872559, + "learning_rate": 5.6279999999999996e-05, + "loss": 0.4511, + "step": 11262 + }, + { + "epoch": 0.6306977265091276, + "grad_norm": 1.1063364744186401, + "learning_rate": 5.6285e-05, + "loss": 0.3643, + "step": 11263 + }, + { + "epoch": 0.6307537238212566, + "grad_norm": 1.007749080657959, + "learning_rate": 5.629e-05, + "loss": 0.3217, + "step": 11264 + }, + { + 
"epoch": 0.6308097211333856, + "grad_norm": 1.3514572381973267, + "learning_rate": 5.6294999999999995e-05, + "loss": 0.4353, + "step": 11265 + }, + { + "epoch": 0.6308657184455146, + "grad_norm": 1.3110345602035522, + "learning_rate": 5.63e-05, + "loss": 0.3841, + "step": 11266 + }, + { + "epoch": 0.6309217157576437, + "grad_norm": 1.281907081604004, + "learning_rate": 5.630500000000001e-05, + "loss": 0.4366, + "step": 11267 + }, + { + "epoch": 0.6309777130697727, + "grad_norm": 1.4426237344741821, + "learning_rate": 5.631000000000001e-05, + "loss": 0.4704, + "step": 11268 + }, + { + "epoch": 0.6310337103819017, + "grad_norm": 1.2236207723617554, + "learning_rate": 5.6315000000000005e-05, + "loss": 0.381, + "step": 11269 + }, + { + "epoch": 0.6310897076940307, + "grad_norm": 1.2905206680297852, + "learning_rate": 5.632e-05, + "loss": 0.4791, + "step": 11270 + }, + { + "epoch": 0.6311457050061597, + "grad_norm": 1.352844476699829, + "learning_rate": 5.632500000000001e-05, + "loss": 0.3825, + "step": 11271 + }, + { + "epoch": 0.6312017023182888, + "grad_norm": 1.462010145187378, + "learning_rate": 5.6330000000000004e-05, + "loss": 0.7624, + "step": 11272 + }, + { + "epoch": 0.6312576996304178, + "grad_norm": 1.4606443643569946, + "learning_rate": 5.6335e-05, + "loss": 0.4573, + "step": 11273 + }, + { + "epoch": 0.6313136969425468, + "grad_norm": 1.1596461534500122, + "learning_rate": 5.6340000000000006e-05, + "loss": 0.4066, + "step": 11274 + }, + { + "epoch": 0.6313696942546758, + "grad_norm": 1.125241994857788, + "learning_rate": 5.6345000000000003e-05, + "loss": 0.3541, + "step": 11275 + }, + { + "epoch": 0.6314256915668048, + "grad_norm": 1.3315197229385376, + "learning_rate": 5.635e-05, + "loss": 0.4487, + "step": 11276 + }, + { + "epoch": 0.6314816888789339, + "grad_norm": 1.1584787368774414, + "learning_rate": 5.6355000000000005e-05, + "loss": 0.2897, + "step": 11277 + }, + { + "epoch": 0.6315376861910629, + "grad_norm": 1.058961272239685, + "learning_rate": 
5.636e-05, + "loss": 0.3676, + "step": 11278 + }, + { + "epoch": 0.6315936835031919, + "grad_norm": 1.2803348302841187, + "learning_rate": 5.6365e-05, + "loss": 0.5269, + "step": 11279 + }, + { + "epoch": 0.6316496808153209, + "grad_norm": 1.4021506309509277, + "learning_rate": 5.637e-05, + "loss": 0.3149, + "step": 11280 + }, + { + "epoch": 0.6317056781274499, + "grad_norm": 1.371085286140442, + "learning_rate": 5.6375e-05, + "loss": 0.5523, + "step": 11281 + }, + { + "epoch": 0.631761675439579, + "grad_norm": 1.7258166074752808, + "learning_rate": 5.638e-05, + "loss": 0.637, + "step": 11282 + }, + { + "epoch": 0.631817672751708, + "grad_norm": 1.157315731048584, + "learning_rate": 5.6384999999999996e-05, + "loss": 0.3429, + "step": 11283 + }, + { + "epoch": 0.631873670063837, + "grad_norm": 1.417036771774292, + "learning_rate": 5.639e-05, + "loss": 0.4629, + "step": 11284 + }, + { + "epoch": 0.631929667375966, + "grad_norm": 1.314966082572937, + "learning_rate": 5.6395e-05, + "loss": 0.5163, + "step": 11285 + }, + { + "epoch": 0.6319856646880949, + "grad_norm": 1.2751572132110596, + "learning_rate": 5.6399999999999995e-05, + "loss": 0.4377, + "step": 11286 + }, + { + "epoch": 0.6320416620002239, + "grad_norm": 1.1246416568756104, + "learning_rate": 5.6405000000000006e-05, + "loss": 0.4409, + "step": 11287 + }, + { + "epoch": 0.632097659312353, + "grad_norm": 1.2444161176681519, + "learning_rate": 5.641000000000001e-05, + "loss": 0.4737, + "step": 11288 + }, + { + "epoch": 0.632153656624482, + "grad_norm": 1.0859520435333252, + "learning_rate": 5.641500000000001e-05, + "loss": 0.4256, + "step": 11289 + }, + { + "epoch": 0.632209653936611, + "grad_norm": 1.3945276737213135, + "learning_rate": 5.6420000000000005e-05, + "loss": 0.3861, + "step": 11290 + }, + { + "epoch": 0.63226565124874, + "grad_norm": 1.2874191999435425, + "learning_rate": 5.6425e-05, + "loss": 0.4776, + "step": 11291 + }, + { + "epoch": 0.632321648560869, + "grad_norm": 1.2917265892028809, + 
"learning_rate": 5.643000000000001e-05, + "loss": 0.4273, + "step": 11292 + }, + { + "epoch": 0.632377645872998, + "grad_norm": 1.1397157907485962, + "learning_rate": 5.6435000000000004e-05, + "loss": 0.3987, + "step": 11293 + }, + { + "epoch": 0.6324336431851271, + "grad_norm": 1.1848894357681274, + "learning_rate": 5.644e-05, + "loss": 0.4868, + "step": 11294 + }, + { + "epoch": 0.6324896404972561, + "grad_norm": 1.2006059885025024, + "learning_rate": 5.6445000000000006e-05, + "loss": 0.4025, + "step": 11295 + }, + { + "epoch": 0.6325456378093851, + "grad_norm": 1.505966067314148, + "learning_rate": 5.645e-05, + "loss": 0.4714, + "step": 11296 + }, + { + "epoch": 0.6326016351215141, + "grad_norm": 1.1986751556396484, + "learning_rate": 5.6455e-05, + "loss": 0.4536, + "step": 11297 + }, + { + "epoch": 0.6326576324336431, + "grad_norm": 1.4272018671035767, + "learning_rate": 5.6460000000000005e-05, + "loss": 0.4774, + "step": 11298 + }, + { + "epoch": 0.6327136297457722, + "grad_norm": 1.498241901397705, + "learning_rate": 5.6465e-05, + "loss": 0.5556, + "step": 11299 + }, + { + "epoch": 0.6327696270579012, + "grad_norm": 6.183231830596924, + "learning_rate": 5.647e-05, + "loss": 0.4907, + "step": 11300 + }, + { + "epoch": 0.6328256243700302, + "grad_norm": 1.3515992164611816, + "learning_rate": 5.6475e-05, + "loss": 0.4121, + "step": 11301 + }, + { + "epoch": 0.6328816216821592, + "grad_norm": 1.2577941417694092, + "learning_rate": 5.648e-05, + "loss": 0.4679, + "step": 11302 + }, + { + "epoch": 0.6329376189942882, + "grad_norm": 1.0943795442581177, + "learning_rate": 5.6485e-05, + "loss": 0.3358, + "step": 11303 + }, + { + "epoch": 0.6329936163064173, + "grad_norm": 1.1649833917617798, + "learning_rate": 5.6489999999999996e-05, + "loss": 0.4517, + "step": 11304 + }, + { + "epoch": 0.6330496136185463, + "grad_norm": 1.3486391305923462, + "learning_rate": 5.6495e-05, + "loss": 0.3764, + "step": 11305 + }, + { + "epoch": 0.6331056109306753, + "grad_norm": 
1.4467990398406982, + "learning_rate": 5.65e-05, + "loss": 0.4745, + "step": 11306 + }, + { + "epoch": 0.6331616082428043, + "grad_norm": 1.2465481758117676, + "learning_rate": 5.650500000000001e-05, + "loss": 0.4743, + "step": 11307 + }, + { + "epoch": 0.6332176055549333, + "grad_norm": 1.2886686325073242, + "learning_rate": 5.6510000000000006e-05, + "loss": 0.4211, + "step": 11308 + }, + { + "epoch": 0.6332736028670624, + "grad_norm": 1.816685676574707, + "learning_rate": 5.6515000000000004e-05, + "loss": 0.4136, + "step": 11309 + }, + { + "epoch": 0.6333296001791914, + "grad_norm": 1.1860828399658203, + "learning_rate": 5.652000000000001e-05, + "loss": 0.4991, + "step": 11310 + }, + { + "epoch": 0.6333855974913204, + "grad_norm": 1.1487269401550293, + "learning_rate": 5.6525000000000005e-05, + "loss": 0.4361, + "step": 11311 + }, + { + "epoch": 0.6334415948034494, + "grad_norm": 1.3433301448822021, + "learning_rate": 5.653e-05, + "loss": 0.3952, + "step": 11312 + }, + { + "epoch": 0.6334975921155784, + "grad_norm": 1.2602077722549438, + "learning_rate": 5.653500000000001e-05, + "loss": 0.5292, + "step": 11313 + }, + { + "epoch": 0.6335535894277075, + "grad_norm": 1.15339994430542, + "learning_rate": 5.6540000000000004e-05, + "loss": 0.3208, + "step": 11314 + }, + { + "epoch": 0.6336095867398365, + "grad_norm": 1.2489097118377686, + "learning_rate": 5.6545e-05, + "loss": 0.3924, + "step": 11315 + }, + { + "epoch": 0.6336655840519655, + "grad_norm": 1.2408370971679688, + "learning_rate": 5.6550000000000006e-05, + "loss": 0.311, + "step": 11316 + }, + { + "epoch": 0.6337215813640945, + "grad_norm": 1.1563353538513184, + "learning_rate": 5.6555e-05, + "loss": 0.3385, + "step": 11317 + }, + { + "epoch": 0.6337775786762235, + "grad_norm": 1.1346092224121094, + "learning_rate": 5.656e-05, + "loss": 0.2971, + "step": 11318 + }, + { + "epoch": 0.6338335759883525, + "grad_norm": 1.3084992170333862, + "learning_rate": 5.6565e-05, + "loss": 0.4301, + "step": 11319 + }, + { 
+ "epoch": 0.6338895733004816, + "grad_norm": 1.445418357849121, + "learning_rate": 5.657e-05, + "loss": 0.4382, + "step": 11320 + }, + { + "epoch": 0.6339455706126106, + "grad_norm": 1.1264489889144897, + "learning_rate": 5.6575e-05, + "loss": 0.447, + "step": 11321 + }, + { + "epoch": 0.6340015679247396, + "grad_norm": 1.0811023712158203, + "learning_rate": 5.658e-05, + "loss": 0.3592, + "step": 11322 + }, + { + "epoch": 0.6340575652368686, + "grad_norm": 1.1096742153167725, + "learning_rate": 5.6585e-05, + "loss": 0.3223, + "step": 11323 + }, + { + "epoch": 0.6341135625489976, + "grad_norm": 1.3272207975387573, + "learning_rate": 5.659e-05, + "loss": 0.4864, + "step": 11324 + }, + { + "epoch": 0.6341695598611267, + "grad_norm": 1.095194697380066, + "learning_rate": 5.6594999999999996e-05, + "loss": 0.3495, + "step": 11325 + }, + { + "epoch": 0.6342255571732557, + "grad_norm": 1.1297070980072021, + "learning_rate": 5.66e-05, + "loss": 0.4301, + "step": 11326 + }, + { + "epoch": 0.6342815544853847, + "grad_norm": 1.390947937965393, + "learning_rate": 5.660500000000001e-05, + "loss": 0.414, + "step": 11327 + }, + { + "epoch": 0.6343375517975137, + "grad_norm": 1.2035391330718994, + "learning_rate": 5.661000000000001e-05, + "loss": 0.4177, + "step": 11328 + }, + { + "epoch": 0.6343935491096427, + "grad_norm": 1.2064096927642822, + "learning_rate": 5.6615000000000006e-05, + "loss": 0.3505, + "step": 11329 + }, + { + "epoch": 0.6344495464217718, + "grad_norm": 1.132487177848816, + "learning_rate": 5.6620000000000003e-05, + "loss": 0.4181, + "step": 11330 + }, + { + "epoch": 0.6345055437339008, + "grad_norm": 1.305894136428833, + "learning_rate": 5.662500000000001e-05, + "loss": 0.4296, + "step": 11331 + }, + { + "epoch": 0.6345615410460298, + "grad_norm": 1.579166054725647, + "learning_rate": 5.6630000000000005e-05, + "loss": 0.4964, + "step": 11332 + }, + { + "epoch": 0.6346175383581588, + "grad_norm": 1.2170345783233643, + "learning_rate": 5.6635e-05, + "loss": 
0.4896, + "step": 11333 + }, + { + "epoch": 0.6346735356702878, + "grad_norm": 1.304287075996399, + "learning_rate": 5.6640000000000007e-05, + "loss": 0.5304, + "step": 11334 + }, + { + "epoch": 0.6347295329824169, + "grad_norm": 1.2192530632019043, + "learning_rate": 5.6645000000000004e-05, + "loss": 0.3684, + "step": 11335 + }, + { + "epoch": 0.6347855302945459, + "grad_norm": 1.2074230909347534, + "learning_rate": 5.665e-05, + "loss": 0.4713, + "step": 11336 + }, + { + "epoch": 0.6348415276066749, + "grad_norm": 1.4461374282836914, + "learning_rate": 5.6655000000000006e-05, + "loss": 0.476, + "step": 11337 + }, + { + "epoch": 0.6348975249188039, + "grad_norm": 1.287718415260315, + "learning_rate": 5.666e-05, + "loss": 0.4544, + "step": 11338 + }, + { + "epoch": 0.6349535222309329, + "grad_norm": 1.1624178886413574, + "learning_rate": 5.6665e-05, + "loss": 0.3842, + "step": 11339 + }, + { + "epoch": 0.635009519543062, + "grad_norm": 1.1182670593261719, + "learning_rate": 5.667e-05, + "loss": 0.4279, + "step": 11340 + }, + { + "epoch": 0.635065516855191, + "grad_norm": 1.5850125551223755, + "learning_rate": 5.6675e-05, + "loss": 0.616, + "step": 11341 + }, + { + "epoch": 0.63512151416732, + "grad_norm": 1.3708568811416626, + "learning_rate": 5.668e-05, + "loss": 0.615, + "step": 11342 + }, + { + "epoch": 0.635177511479449, + "grad_norm": 1.1200449466705322, + "learning_rate": 5.6685e-05, + "loss": 0.35, + "step": 11343 + }, + { + "epoch": 0.635233508791578, + "grad_norm": 1.2273062467575073, + "learning_rate": 5.669e-05, + "loss": 0.3545, + "step": 11344 + }, + { + "epoch": 0.635289506103707, + "grad_norm": 1.147248387336731, + "learning_rate": 5.6695e-05, + "loss": 0.4342, + "step": 11345 + }, + { + "epoch": 0.6353455034158361, + "grad_norm": 1.3605453968048096, + "learning_rate": 5.6699999999999996e-05, + "loss": 0.4196, + "step": 11346 + }, + { + "epoch": 0.6354015007279651, + "grad_norm": 1.3121367692947388, + "learning_rate": 5.670500000000001e-05, + "loss": 
0.4314, + "step": 11347 + }, + { + "epoch": 0.6354574980400941, + "grad_norm": 1.303649663925171, + "learning_rate": 5.6710000000000004e-05, + "loss": 0.4225, + "step": 11348 + }, + { + "epoch": 0.6355134953522231, + "grad_norm": 1.190080165863037, + "learning_rate": 5.671500000000001e-05, + "loss": 0.436, + "step": 11349 + }, + { + "epoch": 0.6355694926643521, + "grad_norm": 1.2896257638931274, + "learning_rate": 5.6720000000000006e-05, + "loss": 0.3563, + "step": 11350 + }, + { + "epoch": 0.6356254899764812, + "grad_norm": 1.3367797136306763, + "learning_rate": 5.6725e-05, + "loss": 0.4411, + "step": 11351 + }, + { + "epoch": 0.6356814872886102, + "grad_norm": 1.5831478834152222, + "learning_rate": 5.673000000000001e-05, + "loss": 0.5588, + "step": 11352 + }, + { + "epoch": 0.6357374846007392, + "grad_norm": 1.2519817352294922, + "learning_rate": 5.6735000000000005e-05, + "loss": 0.4346, + "step": 11353 + }, + { + "epoch": 0.6357934819128682, + "grad_norm": 1.526275634765625, + "learning_rate": 5.674e-05, + "loss": 0.4632, + "step": 11354 + }, + { + "epoch": 0.6358494792249972, + "grad_norm": 1.3396154642105103, + "learning_rate": 5.6745000000000006e-05, + "loss": 0.5087, + "step": 11355 + }, + { + "epoch": 0.6359054765371263, + "grad_norm": 1.5325535535812378, + "learning_rate": 5.6750000000000004e-05, + "loss": 0.4743, + "step": 11356 + }, + { + "epoch": 0.6359614738492553, + "grad_norm": 1.4574859142303467, + "learning_rate": 5.6755e-05, + "loss": 0.4546, + "step": 11357 + }, + { + "epoch": 0.6360174711613843, + "grad_norm": 1.2215218544006348, + "learning_rate": 5.6760000000000005e-05, + "loss": 0.4006, + "step": 11358 + }, + { + "epoch": 0.6360734684735133, + "grad_norm": 1.2708491086959839, + "learning_rate": 5.6765e-05, + "loss": 0.575, + "step": 11359 + }, + { + "epoch": 0.6361294657856423, + "grad_norm": 1.4911993741989136, + "learning_rate": 5.677e-05, + "loss": 0.4185, + "step": 11360 + }, + { + "epoch": 0.6361854630977714, + "grad_norm": 
1.4460312128067017, + "learning_rate": 5.6775e-05, + "loss": 0.5021, + "step": 11361 + }, + { + "epoch": 0.6362414604099004, + "grad_norm": 1.0688856840133667, + "learning_rate": 5.678e-05, + "loss": 0.3547, + "step": 11362 + }, + { + "epoch": 0.6362974577220294, + "grad_norm": 1.122283935546875, + "learning_rate": 5.6785e-05, + "loss": 0.4026, + "step": 11363 + }, + { + "epoch": 0.6363534550341584, + "grad_norm": 1.3614938259124756, + "learning_rate": 5.679e-05, + "loss": 0.5103, + "step": 11364 + }, + { + "epoch": 0.6364094523462874, + "grad_norm": 1.4329949617385864, + "learning_rate": 5.6795e-05, + "loss": 0.405, + "step": 11365 + }, + { + "epoch": 0.6364654496584164, + "grad_norm": 1.2382676601409912, + "learning_rate": 5.68e-05, + "loss": 0.4251, + "step": 11366 + }, + { + "epoch": 0.6365214469705455, + "grad_norm": 1.3758963346481323, + "learning_rate": 5.680500000000001e-05, + "loss": 0.5959, + "step": 11367 + }, + { + "epoch": 0.6365774442826744, + "grad_norm": 1.1326810121536255, + "learning_rate": 5.681000000000001e-05, + "loss": 0.4424, + "step": 11368 + }, + { + "epoch": 0.6366334415948034, + "grad_norm": 1.1721724271774292, + "learning_rate": 5.6815000000000004e-05, + "loss": 0.4848, + "step": 11369 + }, + { + "epoch": 0.6366894389069324, + "grad_norm": 1.471866250038147, + "learning_rate": 5.682000000000001e-05, + "loss": 0.3995, + "step": 11370 + }, + { + "epoch": 0.6367454362190614, + "grad_norm": 1.4628483057022095, + "learning_rate": 5.6825000000000006e-05, + "loss": 0.4036, + "step": 11371 + }, + { + "epoch": 0.6368014335311905, + "grad_norm": 1.1856060028076172, + "learning_rate": 5.683e-05, + "loss": 0.422, + "step": 11372 + }, + { + "epoch": 0.6368574308433195, + "grad_norm": 1.3349860906600952, + "learning_rate": 5.683500000000001e-05, + "loss": 0.349, + "step": 11373 + }, + { + "epoch": 0.6369134281554485, + "grad_norm": 1.2007198333740234, + "learning_rate": 5.6840000000000005e-05, + "loss": 0.3308, + "step": 11374 + }, + { + "epoch": 
0.6369694254675775, + "grad_norm": 1.6263080835342407, + "learning_rate": 5.6845e-05, + "loss": 0.4246, + "step": 11375 + }, + { + "epoch": 0.6370254227797065, + "grad_norm": 1.3804060220718384, + "learning_rate": 5.6850000000000006e-05, + "loss": 0.5547, + "step": 11376 + }, + { + "epoch": 0.6370814200918355, + "grad_norm": 1.197320580482483, + "learning_rate": 5.6855000000000004e-05, + "loss": 0.4101, + "step": 11377 + }, + { + "epoch": 0.6371374174039646, + "grad_norm": 1.2640727758407593, + "learning_rate": 5.686e-05, + "loss": 0.407, + "step": 11378 + }, + { + "epoch": 0.6371934147160936, + "grad_norm": 1.3745468854904175, + "learning_rate": 5.6865e-05, + "loss": 0.4645, + "step": 11379 + }, + { + "epoch": 0.6372494120282226, + "grad_norm": 1.1405552625656128, + "learning_rate": 5.687e-05, + "loss": 0.3982, + "step": 11380 + }, + { + "epoch": 0.6373054093403516, + "grad_norm": 1.1586295366287231, + "learning_rate": 5.6875e-05, + "loss": 0.3587, + "step": 11381 + }, + { + "epoch": 0.6373614066524806, + "grad_norm": 1.2616404294967651, + "learning_rate": 5.688e-05, + "loss": 0.4723, + "step": 11382 + }, + { + "epoch": 0.6374174039646097, + "grad_norm": 1.6011511087417603, + "learning_rate": 5.6885e-05, + "loss": 0.4597, + "step": 11383 + }, + { + "epoch": 0.6374734012767387, + "grad_norm": 1.3609460592269897, + "learning_rate": 5.689e-05, + "loss": 0.4977, + "step": 11384 + }, + { + "epoch": 0.6375293985888677, + "grad_norm": 1.7835408449172974, + "learning_rate": 5.6894999999999997e-05, + "loss": 0.5518, + "step": 11385 + }, + { + "epoch": 0.6375853959009967, + "grad_norm": 1.487677812576294, + "learning_rate": 5.69e-05, + "loss": 0.5093, + "step": 11386 + }, + { + "epoch": 0.6376413932131257, + "grad_norm": 1.3706413507461548, + "learning_rate": 5.6905e-05, + "loss": 0.5073, + "step": 11387 + }, + { + "epoch": 0.6376973905252548, + "grad_norm": 1.1740338802337646, + "learning_rate": 5.691000000000001e-05, + "loss": 0.4306, + "step": 11388 + }, + { + "epoch": 
0.6377533878373838, + "grad_norm": 1.3596729040145874, + "learning_rate": 5.6915000000000006e-05, + "loss": 0.5845, + "step": 11389 + }, + { + "epoch": 0.6378093851495128, + "grad_norm": 1.2662302255630493, + "learning_rate": 5.6920000000000004e-05, + "loss": 0.4194, + "step": 11390 + }, + { + "epoch": 0.6378653824616418, + "grad_norm": 1.2910363674163818, + "learning_rate": 5.692500000000001e-05, + "loss": 0.4155, + "step": 11391 + }, + { + "epoch": 0.6379213797737708, + "grad_norm": 1.5611884593963623, + "learning_rate": 5.6930000000000006e-05, + "loss": 0.4015, + "step": 11392 + }, + { + "epoch": 0.6379773770858999, + "grad_norm": 1.6610698699951172, + "learning_rate": 5.6935e-05, + "loss": 0.4132, + "step": 11393 + }, + { + "epoch": 0.6380333743980289, + "grad_norm": 1.3034590482711792, + "learning_rate": 5.694000000000001e-05, + "loss": 0.4829, + "step": 11394 + }, + { + "epoch": 0.6380893717101579, + "grad_norm": 1.194251537322998, + "learning_rate": 5.6945000000000005e-05, + "loss": 0.3595, + "step": 11395 + }, + { + "epoch": 0.6381453690222869, + "grad_norm": 1.3438389301300049, + "learning_rate": 5.695e-05, + "loss": 0.4172, + "step": 11396 + }, + { + "epoch": 0.6382013663344159, + "grad_norm": 1.3963749408721924, + "learning_rate": 5.6955000000000006e-05, + "loss": 0.6238, + "step": 11397 + }, + { + "epoch": 0.638257363646545, + "grad_norm": 1.1640512943267822, + "learning_rate": 5.6960000000000004e-05, + "loss": 0.4674, + "step": 11398 + }, + { + "epoch": 0.638313360958674, + "grad_norm": 1.4330171346664429, + "learning_rate": 5.6965e-05, + "loss": 0.3955, + "step": 11399 + }, + { + "epoch": 0.638369358270803, + "grad_norm": 1.1675355434417725, + "learning_rate": 5.697e-05, + "loss": 0.3712, + "step": 11400 + }, + { + "epoch": 0.638425355582932, + "grad_norm": 1.525566577911377, + "learning_rate": 5.6975e-05, + "loss": 0.4081, + "step": 11401 + }, + { + "epoch": 0.638481352895061, + "grad_norm": 1.232356309890747, + "learning_rate": 5.698e-05, + "loss": 
0.4272, + "step": 11402 + }, + { + "epoch": 0.63853735020719, + "grad_norm": 1.3237557411193848, + "learning_rate": 5.6985e-05, + "loss": 0.4409, + "step": 11403 + }, + { + "epoch": 0.6385933475193191, + "grad_norm": 1.2331622838974, + "learning_rate": 5.699e-05, + "loss": 0.3428, + "step": 11404 + }, + { + "epoch": 0.6386493448314481, + "grad_norm": 1.3364495038986206, + "learning_rate": 5.6995e-05, + "loss": 0.3592, + "step": 11405 + }, + { + "epoch": 0.6387053421435771, + "grad_norm": 1.2224996089935303, + "learning_rate": 5.6999999999999996e-05, + "loss": 0.462, + "step": 11406 + }, + { + "epoch": 0.6387613394557061, + "grad_norm": 1.1252217292785645, + "learning_rate": 5.7005e-05, + "loss": 0.33, + "step": 11407 + }, + { + "epoch": 0.6388173367678351, + "grad_norm": 1.2200932502746582, + "learning_rate": 5.7010000000000005e-05, + "loss": 0.3552, + "step": 11408 + }, + { + "epoch": 0.6388733340799642, + "grad_norm": 1.2461763620376587, + "learning_rate": 5.701500000000001e-05, + "loss": 0.3404, + "step": 11409 + }, + { + "epoch": 0.6389293313920932, + "grad_norm": 1.7498670816421509, + "learning_rate": 5.7020000000000006e-05, + "loss": 0.576, + "step": 11410 + }, + { + "epoch": 0.6389853287042222, + "grad_norm": 1.3398674726486206, + "learning_rate": 5.7025000000000004e-05, + "loss": 0.4954, + "step": 11411 + }, + { + "epoch": 0.6390413260163512, + "grad_norm": 1.282238245010376, + "learning_rate": 5.703000000000001e-05, + "loss": 0.4809, + "step": 11412 + }, + { + "epoch": 0.6390973233284802, + "grad_norm": 1.2816829681396484, + "learning_rate": 5.7035000000000005e-05, + "loss": 0.3704, + "step": 11413 + }, + { + "epoch": 0.6391533206406093, + "grad_norm": 1.433122992515564, + "learning_rate": 5.704e-05, + "loss": 0.5439, + "step": 11414 + }, + { + "epoch": 0.6392093179527383, + "grad_norm": 3.01277494430542, + "learning_rate": 5.704500000000001e-05, + "loss": 0.4214, + "step": 11415 + }, + { + "epoch": 0.6392653152648673, + "grad_norm": 1.63400137424469, + 
"learning_rate": 5.7050000000000004e-05, + "loss": 0.3825, + "step": 11416 + }, + { + "epoch": 0.6393213125769963, + "grad_norm": 1.2651917934417725, + "learning_rate": 5.7055e-05, + "loss": 0.4381, + "step": 11417 + }, + { + "epoch": 0.6393773098891253, + "grad_norm": 1.2086280584335327, + "learning_rate": 5.706e-05, + "loss": 0.4712, + "step": 11418 + }, + { + "epoch": 0.6394333072012544, + "grad_norm": 1.4258328676223755, + "learning_rate": 5.7065e-05, + "loss": 0.4264, + "step": 11419 + }, + { + "epoch": 0.6394893045133834, + "grad_norm": 1.3295502662658691, + "learning_rate": 5.707e-05, + "loss": 0.4893, + "step": 11420 + }, + { + "epoch": 0.6395453018255124, + "grad_norm": 1.505089521408081, + "learning_rate": 5.7075e-05, + "loss": 0.54, + "step": 11421 + }, + { + "epoch": 0.6396012991376414, + "grad_norm": 1.2171998023986816, + "learning_rate": 5.708e-05, + "loss": 0.5263, + "step": 11422 + }, + { + "epoch": 0.6396572964497704, + "grad_norm": 1.2634024620056152, + "learning_rate": 5.7085e-05, + "loss": 0.5035, + "step": 11423 + }, + { + "epoch": 0.6397132937618994, + "grad_norm": 1.3135502338409424, + "learning_rate": 5.709e-05, + "loss": 0.3592, + "step": 11424 + }, + { + "epoch": 0.6397692910740285, + "grad_norm": 1.3976722955703735, + "learning_rate": 5.7095e-05, + "loss": 0.5523, + "step": 11425 + }, + { + "epoch": 0.6398252883861575, + "grad_norm": 1.4321637153625488, + "learning_rate": 5.71e-05, + "loss": 0.4316, + "step": 11426 + }, + { + "epoch": 0.6398812856982865, + "grad_norm": 1.2029895782470703, + "learning_rate": 5.7104999999999996e-05, + "loss": 0.3667, + "step": 11427 + }, + { + "epoch": 0.6399372830104155, + "grad_norm": 1.2664986848831177, + "learning_rate": 5.711000000000001e-05, + "loss": 0.3627, + "step": 11428 + }, + { + "epoch": 0.6399932803225445, + "grad_norm": 1.3680680990219116, + "learning_rate": 5.7115000000000005e-05, + "loss": 0.3631, + "step": 11429 + }, + { + "epoch": 0.6400492776346736, + "grad_norm": 1.1282374858856201, + 
"learning_rate": 5.712000000000001e-05, + "loss": 0.399, + "step": 11430 + }, + { + "epoch": 0.6401052749468026, + "grad_norm": 1.3647115230560303, + "learning_rate": 5.7125000000000006e-05, + "loss": 0.5315, + "step": 11431 + }, + { + "epoch": 0.6401612722589316, + "grad_norm": 1.1750375032424927, + "learning_rate": 5.7130000000000004e-05, + "loss": 0.3252, + "step": 11432 + }, + { + "epoch": 0.6402172695710606, + "grad_norm": 1.1515816450119019, + "learning_rate": 5.713500000000001e-05, + "loss": 0.3207, + "step": 11433 + }, + { + "epoch": 0.6402732668831896, + "grad_norm": 1.2982523441314697, + "learning_rate": 5.7140000000000005e-05, + "loss": 0.3753, + "step": 11434 + }, + { + "epoch": 0.6403292641953187, + "grad_norm": 1.3154433965682983, + "learning_rate": 5.7145e-05, + "loss": 0.3783, + "step": 11435 + }, + { + "epoch": 0.6403852615074477, + "grad_norm": 1.3863359689712524, + "learning_rate": 5.715000000000001e-05, + "loss": 0.4329, + "step": 11436 + }, + { + "epoch": 0.6404412588195767, + "grad_norm": 1.5635638236999512, + "learning_rate": 5.7155000000000004e-05, + "loss": 0.3972, + "step": 11437 + }, + { + "epoch": 0.6404972561317057, + "grad_norm": 1.3954685926437378, + "learning_rate": 5.716e-05, + "loss": 0.4937, + "step": 11438 + }, + { + "epoch": 0.6405532534438347, + "grad_norm": 1.2547791004180908, + "learning_rate": 5.7165e-05, + "loss": 0.4415, + "step": 11439 + }, + { + "epoch": 0.6406092507559638, + "grad_norm": 1.4040956497192383, + "learning_rate": 5.717e-05, + "loss": 0.4813, + "step": 11440 + }, + { + "epoch": 0.6406652480680928, + "grad_norm": 1.5399640798568726, + "learning_rate": 5.7175e-05, + "loss": 0.3528, + "step": 11441 + }, + { + "epoch": 0.6407212453802218, + "grad_norm": 1.1662284135818481, + "learning_rate": 5.718e-05, + "loss": 0.3048, + "step": 11442 + }, + { + "epoch": 0.6407772426923508, + "grad_norm": 1.4365795850753784, + "learning_rate": 5.7185e-05, + "loss": 0.5575, + "step": 11443 + }, + { + "epoch": 0.6408332400044798, 
+ "grad_norm": 1.2736120223999023, + "learning_rate": 5.719e-05, + "loss": 0.4323, + "step": 11444 + }, + { + "epoch": 0.6408892373166089, + "grad_norm": 1.616166591644287, + "learning_rate": 5.7195e-05, + "loss": 0.4377, + "step": 11445 + }, + { + "epoch": 0.6409452346287379, + "grad_norm": 1.6133275032043457, + "learning_rate": 5.72e-05, + "loss": 0.469, + "step": 11446 + }, + { + "epoch": 0.6410012319408669, + "grad_norm": 1.2576003074645996, + "learning_rate": 5.7205e-05, + "loss": 0.5083, + "step": 11447 + }, + { + "epoch": 0.6410572292529959, + "grad_norm": 1.3431735038757324, + "learning_rate": 5.721000000000001e-05, + "loss": 0.4122, + "step": 11448 + }, + { + "epoch": 0.6411132265651249, + "grad_norm": 1.3998430967330933, + "learning_rate": 5.721500000000001e-05, + "loss": 0.563, + "step": 11449 + }, + { + "epoch": 0.641169223877254, + "grad_norm": 1.4633538722991943, + "learning_rate": 5.7220000000000004e-05, + "loss": 0.4276, + "step": 11450 + }, + { + "epoch": 0.6412252211893829, + "grad_norm": 1.1596487760543823, + "learning_rate": 5.722500000000001e-05, + "loss": 0.3429, + "step": 11451 + }, + { + "epoch": 0.6412812185015119, + "grad_norm": 1.3280785083770752, + "learning_rate": 5.7230000000000006e-05, + "loss": 0.4165, + "step": 11452 + }, + { + "epoch": 0.6413372158136409, + "grad_norm": 1.7387182712554932, + "learning_rate": 5.7235000000000003e-05, + "loss": 0.3599, + "step": 11453 + }, + { + "epoch": 0.6413932131257699, + "grad_norm": 1.4629881381988525, + "learning_rate": 5.724000000000001e-05, + "loss": 0.3769, + "step": 11454 + }, + { + "epoch": 0.6414492104378989, + "grad_norm": 1.3334035873413086, + "learning_rate": 5.7245000000000005e-05, + "loss": 0.335, + "step": 11455 + }, + { + "epoch": 0.641505207750028, + "grad_norm": 1.2458865642547607, + "learning_rate": 5.725e-05, + "loss": 0.43, + "step": 11456 + }, + { + "epoch": 0.641561205062157, + "grad_norm": 1.3170214891433716, + "learning_rate": 5.7255e-05, + "loss": 0.4686, + "step": 11457 
+ }, + { + "epoch": 0.641617202374286, + "grad_norm": 1.4576579332351685, + "learning_rate": 5.7260000000000004e-05, + "loss": 0.4614, + "step": 11458 + }, + { + "epoch": 0.641673199686415, + "grad_norm": 1.4827557802200317, + "learning_rate": 5.7265e-05, + "loss": 0.5667, + "step": 11459 + }, + { + "epoch": 0.641729196998544, + "grad_norm": 1.1222044229507446, + "learning_rate": 5.727e-05, + "loss": 0.507, + "step": 11460 + }, + { + "epoch": 0.641785194310673, + "grad_norm": 1.186562418937683, + "learning_rate": 5.7275e-05, + "loss": 0.3673, + "step": 11461 + }, + { + "epoch": 0.6418411916228021, + "grad_norm": 1.4117891788482666, + "learning_rate": 5.728e-05, + "loss": 0.4402, + "step": 11462 + }, + { + "epoch": 0.6418971889349311, + "grad_norm": 1.2655959129333496, + "learning_rate": 5.7285e-05, + "loss": 0.347, + "step": 11463 + }, + { + "epoch": 0.6419531862470601, + "grad_norm": 1.325129508972168, + "learning_rate": 5.729e-05, + "loss": 0.5245, + "step": 11464 + }, + { + "epoch": 0.6420091835591891, + "grad_norm": 1.2996641397476196, + "learning_rate": 5.7295e-05, + "loss": 0.4196, + "step": 11465 + }, + { + "epoch": 0.6420651808713181, + "grad_norm": 1.9610968828201294, + "learning_rate": 5.73e-05, + "loss": 0.4092, + "step": 11466 + }, + { + "epoch": 0.6421211781834472, + "grad_norm": 1.3521019220352173, + "learning_rate": 5.7304999999999994e-05, + "loss": 0.3975, + "step": 11467 + }, + { + "epoch": 0.6421771754955762, + "grad_norm": 1.2128995656967163, + "learning_rate": 5.7310000000000005e-05, + "loss": 0.3544, + "step": 11468 + }, + { + "epoch": 0.6422331728077052, + "grad_norm": 1.2328879833221436, + "learning_rate": 5.731500000000001e-05, + "loss": 0.595, + "step": 11469 + }, + { + "epoch": 0.6422891701198342, + "grad_norm": 1.2000852823257446, + "learning_rate": 5.732000000000001e-05, + "loss": 0.325, + "step": 11470 + }, + { + "epoch": 0.6423451674319632, + "grad_norm": 1.6674840450286865, + "learning_rate": 5.7325000000000004e-05, + "loss": 0.3719, 
+ "step": 11471 + }, + { + "epoch": 0.6424011647440923, + "grad_norm": 1.1440792083740234, + "learning_rate": 5.733000000000001e-05, + "loss": 0.3789, + "step": 11472 + }, + { + "epoch": 0.6424571620562213, + "grad_norm": 1.2247045040130615, + "learning_rate": 5.7335000000000006e-05, + "loss": 0.428, + "step": 11473 + }, + { + "epoch": 0.6425131593683503, + "grad_norm": 1.2148150205612183, + "learning_rate": 5.734e-05, + "loss": 0.4804, + "step": 11474 + }, + { + "epoch": 0.6425691566804793, + "grad_norm": 1.5058478116989136, + "learning_rate": 5.734500000000001e-05, + "loss": 0.4637, + "step": 11475 + }, + { + "epoch": 0.6426251539926083, + "grad_norm": 1.2872493267059326, + "learning_rate": 5.7350000000000005e-05, + "loss": 0.3831, + "step": 11476 + }, + { + "epoch": 0.6426811513047374, + "grad_norm": 1.0541887283325195, + "learning_rate": 5.7355e-05, + "loss": 0.428, + "step": 11477 + }, + { + "epoch": 0.6427371486168664, + "grad_norm": 1.1574785709381104, + "learning_rate": 5.736e-05, + "loss": 0.313, + "step": 11478 + }, + { + "epoch": 0.6427931459289954, + "grad_norm": 1.2846903800964355, + "learning_rate": 5.7365000000000004e-05, + "loss": 0.4837, + "step": 11479 + }, + { + "epoch": 0.6428491432411244, + "grad_norm": 1.373149037361145, + "learning_rate": 5.737e-05, + "loss": 0.4994, + "step": 11480 + }, + { + "epoch": 0.6429051405532534, + "grad_norm": 1.2504630088806152, + "learning_rate": 5.7375e-05, + "loss": 0.4227, + "step": 11481 + }, + { + "epoch": 0.6429611378653824, + "grad_norm": 1.0435395240783691, + "learning_rate": 5.738e-05, + "loss": 0.3611, + "step": 11482 + }, + { + "epoch": 0.6430171351775115, + "grad_norm": 1.393623948097229, + "learning_rate": 5.7385e-05, + "loss": 0.4542, + "step": 11483 + }, + { + "epoch": 0.6430731324896405, + "grad_norm": 1.033274531364441, + "learning_rate": 5.739e-05, + "loss": 0.2868, + "step": 11484 + }, + { + "epoch": 0.6431291298017695, + "grad_norm": 1.6692862510681152, + "learning_rate": 5.7395e-05, + "loss": 
0.5322, + "step": 11485 + }, + { + "epoch": 0.6431851271138985, + "grad_norm": 1.2050962448120117, + "learning_rate": 5.74e-05, + "loss": 0.3991, + "step": 11486 + }, + { + "epoch": 0.6432411244260275, + "grad_norm": 1.5836702585220337, + "learning_rate": 5.7405e-05, + "loss": 0.3882, + "step": 11487 + }, + { + "epoch": 0.6432971217381566, + "grad_norm": 1.2420437335968018, + "learning_rate": 5.741000000000001e-05, + "loss": 0.3548, + "step": 11488 + }, + { + "epoch": 0.6433531190502856, + "grad_norm": 1.2407389879226685, + "learning_rate": 5.7415000000000005e-05, + "loss": 0.4549, + "step": 11489 + }, + { + "epoch": 0.6434091163624146, + "grad_norm": 1.330662488937378, + "learning_rate": 5.742000000000001e-05, + "loss": 0.5312, + "step": 11490 + }, + { + "epoch": 0.6434651136745436, + "grad_norm": 1.766160249710083, + "learning_rate": 5.742500000000001e-05, + "loss": 0.5731, + "step": 11491 + }, + { + "epoch": 0.6435211109866726, + "grad_norm": 1.2884317636489868, + "learning_rate": 5.7430000000000004e-05, + "loss": 0.5438, + "step": 11492 + }, + { + "epoch": 0.6435771082988017, + "grad_norm": 1.593859314918518, + "learning_rate": 5.743500000000001e-05, + "loss": 0.4792, + "step": 11493 + }, + { + "epoch": 0.6436331056109307, + "grad_norm": 1.1537383794784546, + "learning_rate": 5.7440000000000006e-05, + "loss": 0.3695, + "step": 11494 + }, + { + "epoch": 0.6436891029230597, + "grad_norm": 1.2668381929397583, + "learning_rate": 5.7445e-05, + "loss": 0.3173, + "step": 11495 + }, + { + "epoch": 0.6437451002351887, + "grad_norm": 1.3317357301712036, + "learning_rate": 5.745e-05, + "loss": 0.6053, + "step": 11496 + }, + { + "epoch": 0.6438010975473177, + "grad_norm": 1.1135140657424927, + "learning_rate": 5.7455000000000005e-05, + "loss": 0.3324, + "step": 11497 + }, + { + "epoch": 0.6438570948594468, + "grad_norm": 1.2193281650543213, + "learning_rate": 5.746e-05, + "loss": 0.378, + "step": 11498 + }, + { + "epoch": 0.6439130921715758, + "grad_norm": 
1.1844638586044312, + "learning_rate": 5.7465e-05, + "loss": 0.488, + "step": 11499 + }, + { + "epoch": 0.6439690894837048, + "grad_norm": 1.4308418035507202, + "learning_rate": 5.7470000000000004e-05, + "loss": 0.5116, + "step": 11500 + }, + { + "epoch": 0.6440250867958338, + "grad_norm": 1.4272637367248535, + "learning_rate": 5.7475e-05, + "loss": 0.4804, + "step": 11501 + }, + { + "epoch": 0.6440810841079628, + "grad_norm": 1.3765759468078613, + "learning_rate": 5.748e-05, + "loss": 0.4766, + "step": 11502 + }, + { + "epoch": 0.6441370814200919, + "grad_norm": 1.304327368736267, + "learning_rate": 5.7485e-05, + "loss": 0.5071, + "step": 11503 + }, + { + "epoch": 0.6441930787322209, + "grad_norm": 1.2469215393066406, + "learning_rate": 5.749e-05, + "loss": 0.4449, + "step": 11504 + }, + { + "epoch": 0.6442490760443499, + "grad_norm": 1.2005549669265747, + "learning_rate": 5.7495e-05, + "loss": 0.454, + "step": 11505 + }, + { + "epoch": 0.6443050733564789, + "grad_norm": 1.0989803075790405, + "learning_rate": 5.7499999999999995e-05, + "loss": 0.4284, + "step": 11506 + }, + { + "epoch": 0.6443610706686079, + "grad_norm": 1.1446917057037354, + "learning_rate": 5.7505e-05, + "loss": 0.4705, + "step": 11507 + }, + { + "epoch": 0.644417067980737, + "grad_norm": 1.1719930171966553, + "learning_rate": 5.7509999999999997e-05, + "loss": 0.3878, + "step": 11508 + }, + { + "epoch": 0.644473065292866, + "grad_norm": 1.02593195438385, + "learning_rate": 5.751500000000001e-05, + "loss": 0.2829, + "step": 11509 + }, + { + "epoch": 0.644529062604995, + "grad_norm": 1.3297885656356812, + "learning_rate": 5.7520000000000005e-05, + "loss": 0.611, + "step": 11510 + }, + { + "epoch": 0.644585059917124, + "grad_norm": 1.445859670639038, + "learning_rate": 5.752500000000001e-05, + "loss": 0.4839, + "step": 11511 + }, + { + "epoch": 0.644641057229253, + "grad_norm": 1.259381890296936, + "learning_rate": 5.7530000000000007e-05, + "loss": 0.4401, + "step": 11512 + }, + { + "epoch": 
0.644697054541382, + "grad_norm": 1.2717972993850708, + "learning_rate": 5.7535000000000004e-05, + "loss": 0.4225, + "step": 11513 + }, + { + "epoch": 0.6447530518535111, + "grad_norm": 1.3548849821090698, + "learning_rate": 5.754000000000001e-05, + "loss": 0.3936, + "step": 11514 + }, + { + "epoch": 0.6448090491656401, + "grad_norm": 1.3008915185928345, + "learning_rate": 5.7545000000000006e-05, + "loss": 0.4837, + "step": 11515 + }, + { + "epoch": 0.6448650464777691, + "grad_norm": 1.3620336055755615, + "learning_rate": 5.755e-05, + "loss": 0.8122, + "step": 11516 + }, + { + "epoch": 0.6449210437898981, + "grad_norm": 1.2951780557632446, + "learning_rate": 5.7555e-05, + "loss": 0.4222, + "step": 11517 + }, + { + "epoch": 0.6449770411020271, + "grad_norm": 1.435787558555603, + "learning_rate": 5.7560000000000005e-05, + "loss": 0.4281, + "step": 11518 + }, + { + "epoch": 0.6450330384141562, + "grad_norm": 1.430140495300293, + "learning_rate": 5.7565e-05, + "loss": 0.3756, + "step": 11519 + }, + { + "epoch": 0.6450890357262852, + "grad_norm": 1.2110021114349365, + "learning_rate": 5.757e-05, + "loss": 0.4653, + "step": 11520 + }, + { + "epoch": 0.6451450330384142, + "grad_norm": 2.7886364459991455, + "learning_rate": 5.7575000000000004e-05, + "loss": 0.4481, + "step": 11521 + }, + { + "epoch": 0.6452010303505432, + "grad_norm": 1.1463032960891724, + "learning_rate": 5.758e-05, + "loss": 0.3699, + "step": 11522 + }, + { + "epoch": 0.6452570276626722, + "grad_norm": 1.4110069274902344, + "learning_rate": 5.7585e-05, + "loss": 0.4299, + "step": 11523 + }, + { + "epoch": 0.6453130249748013, + "grad_norm": 1.5350940227508545, + "learning_rate": 5.759e-05, + "loss": 0.4105, + "step": 11524 + }, + { + "epoch": 0.6453690222869303, + "grad_norm": 1.5140190124511719, + "learning_rate": 5.7595e-05, + "loss": 0.7324, + "step": 11525 + }, + { + "epoch": 0.6454250195990593, + "grad_norm": 1.4884791374206543, + "learning_rate": 5.76e-05, + "loss": 0.4558, + "step": 11526 + }, + { 
+ "epoch": 0.6454810169111883, + "grad_norm": 1.242226243019104, + "learning_rate": 5.7604999999999995e-05, + "loss": 0.5391, + "step": 11527 + }, + { + "epoch": 0.6455370142233173, + "grad_norm": 1.2266403436660767, + "learning_rate": 5.761e-05, + "loss": 0.4052, + "step": 11528 + }, + { + "epoch": 0.6455930115354463, + "grad_norm": 1.0919597148895264, + "learning_rate": 5.761500000000001e-05, + "loss": 0.3521, + "step": 11529 + }, + { + "epoch": 0.6456490088475754, + "grad_norm": 1.1650614738464355, + "learning_rate": 5.762000000000001e-05, + "loss": 0.5479, + "step": 11530 + }, + { + "epoch": 0.6457050061597044, + "grad_norm": 1.1579889059066772, + "learning_rate": 5.7625000000000005e-05, + "loss": 0.4455, + "step": 11531 + }, + { + "epoch": 0.6457610034718334, + "grad_norm": 1.603376865386963, + "learning_rate": 5.763000000000001e-05, + "loss": 0.4191, + "step": 11532 + }, + { + "epoch": 0.6458170007839624, + "grad_norm": 1.2230231761932373, + "learning_rate": 5.7635000000000006e-05, + "loss": 0.4944, + "step": 11533 + }, + { + "epoch": 0.6458729980960913, + "grad_norm": 1.1598926782608032, + "learning_rate": 5.7640000000000004e-05, + "loss": 0.374, + "step": 11534 + }, + { + "epoch": 0.6459289954082204, + "grad_norm": 2.9758739471435547, + "learning_rate": 5.7645e-05, + "loss": 0.4522, + "step": 11535 + }, + { + "epoch": 0.6459849927203494, + "grad_norm": 1.1005445718765259, + "learning_rate": 5.7650000000000005e-05, + "loss": 0.3054, + "step": 11536 + }, + { + "epoch": 0.6460409900324784, + "grad_norm": 1.3561359643936157, + "learning_rate": 5.7655e-05, + "loss": 0.4538, + "step": 11537 + }, + { + "epoch": 0.6460969873446074, + "grad_norm": 1.477579951286316, + "learning_rate": 5.766e-05, + "loss": 0.3823, + "step": 11538 + }, + { + "epoch": 0.6461529846567364, + "grad_norm": 1.068372130393982, + "learning_rate": 5.7665000000000004e-05, + "loss": 0.2992, + "step": 11539 + }, + { + "epoch": 0.6462089819688654, + "grad_norm": 1.4765825271606445, + 
"learning_rate": 5.767e-05, + "loss": 0.5325, + "step": 11540 + }, + { + "epoch": 0.6462649792809945, + "grad_norm": 1.3367769718170166, + "learning_rate": 5.7675e-05, + "loss": 0.3413, + "step": 11541 + }, + { + "epoch": 0.6463209765931235, + "grad_norm": 1.3945248126983643, + "learning_rate": 5.7680000000000003e-05, + "loss": 0.4743, + "step": 11542 + }, + { + "epoch": 0.6463769739052525, + "grad_norm": 1.352189540863037, + "learning_rate": 5.7685e-05, + "loss": 0.5408, + "step": 11543 + }, + { + "epoch": 0.6464329712173815, + "grad_norm": 1.473279595375061, + "learning_rate": 5.769e-05, + "loss": 0.5688, + "step": 11544 + }, + { + "epoch": 0.6464889685295105, + "grad_norm": 1.2186814546585083, + "learning_rate": 5.7695e-05, + "loss": 0.3343, + "step": 11545 + }, + { + "epoch": 0.6465449658416396, + "grad_norm": 1.340122103691101, + "learning_rate": 5.77e-05, + "loss": 0.4962, + "step": 11546 + }, + { + "epoch": 0.6466009631537686, + "grad_norm": 1.6914175748825073, + "learning_rate": 5.7705e-05, + "loss": 0.5216, + "step": 11547 + }, + { + "epoch": 0.6466569604658976, + "grad_norm": 1.2420464754104614, + "learning_rate": 5.7709999999999995e-05, + "loss": 0.4441, + "step": 11548 + }, + { + "epoch": 0.6467129577780266, + "grad_norm": 1.2060019969940186, + "learning_rate": 5.7715000000000006e-05, + "loss": 0.3787, + "step": 11549 + }, + { + "epoch": 0.6467689550901556, + "grad_norm": 1.010940432548523, + "learning_rate": 5.772000000000001e-05, + "loss": 0.3713, + "step": 11550 + }, + { + "epoch": 0.6468249524022847, + "grad_norm": 1.2166849374771118, + "learning_rate": 5.772500000000001e-05, + "loss": 0.3891, + "step": 11551 + }, + { + "epoch": 0.6468809497144137, + "grad_norm": 1.2105660438537598, + "learning_rate": 5.7730000000000005e-05, + "loss": 0.2847, + "step": 11552 + }, + { + "epoch": 0.6469369470265427, + "grad_norm": 1.383593201637268, + "learning_rate": 5.773500000000001e-05, + "loss": 0.5505, + "step": 11553 + }, + { + "epoch": 0.6469929443386717, + 
"grad_norm": 1.187862515449524, + "learning_rate": 5.7740000000000006e-05, + "loss": 0.3641, + "step": 11554 + }, + { + "epoch": 0.6470489416508007, + "grad_norm": 1.1482435464859009, + "learning_rate": 5.7745000000000004e-05, + "loss": 0.5302, + "step": 11555 + }, + { + "epoch": 0.6471049389629298, + "grad_norm": 1.2438193559646606, + "learning_rate": 5.775e-05, + "loss": 0.4347, + "step": 11556 + }, + { + "epoch": 0.6471609362750588, + "grad_norm": 1.4243003129959106, + "learning_rate": 5.7755000000000005e-05, + "loss": 0.4023, + "step": 11557 + }, + { + "epoch": 0.6472169335871878, + "grad_norm": 1.2398462295532227, + "learning_rate": 5.776e-05, + "loss": 0.4243, + "step": 11558 + }, + { + "epoch": 0.6472729308993168, + "grad_norm": 1.0839929580688477, + "learning_rate": 5.7765e-05, + "loss": 0.2921, + "step": 11559 + }, + { + "epoch": 0.6473289282114458, + "grad_norm": 1.2982513904571533, + "learning_rate": 5.7770000000000004e-05, + "loss": 0.3389, + "step": 11560 + }, + { + "epoch": 0.6473849255235749, + "grad_norm": 1.5267226696014404, + "learning_rate": 5.7775e-05, + "loss": 0.4404, + "step": 11561 + }, + { + "epoch": 0.6474409228357039, + "grad_norm": 1.4346492290496826, + "learning_rate": 5.778e-05, + "loss": 0.4076, + "step": 11562 + }, + { + "epoch": 0.6474969201478329, + "grad_norm": 1.395491123199463, + "learning_rate": 5.7785e-05, + "loss": 0.4529, + "step": 11563 + }, + { + "epoch": 0.6475529174599619, + "grad_norm": 1.6427137851715088, + "learning_rate": 5.779e-05, + "loss": 0.5369, + "step": 11564 + }, + { + "epoch": 0.6476089147720909, + "grad_norm": 1.3966608047485352, + "learning_rate": 5.7795e-05, + "loss": 0.3765, + "step": 11565 + }, + { + "epoch": 0.64766491208422, + "grad_norm": 1.2554833889007568, + "learning_rate": 5.7799999999999995e-05, + "loss": 0.5485, + "step": 11566 + }, + { + "epoch": 0.647720909396349, + "grad_norm": 1.5577747821807861, + "learning_rate": 5.7805e-05, + "loss": 0.7601, + "step": 11567 + }, + { + "epoch": 
0.647776906708478, + "grad_norm": 1.2530993223190308, + "learning_rate": 5.781e-05, + "loss": 0.3431, + "step": 11568 + }, + { + "epoch": 0.647832904020607, + "grad_norm": 1.155864953994751, + "learning_rate": 5.781500000000001e-05, + "loss": 0.468, + "step": 11569 + }, + { + "epoch": 0.647888901332736, + "grad_norm": 1.0874488353729248, + "learning_rate": 5.7820000000000005e-05, + "loss": 0.4228, + "step": 11570 + }, + { + "epoch": 0.647944898644865, + "grad_norm": 1.3322001695632935, + "learning_rate": 5.782500000000001e-05, + "loss": 0.5208, + "step": 11571 + }, + { + "epoch": 0.6480008959569941, + "grad_norm": 1.3388378620147705, + "learning_rate": 5.783000000000001e-05, + "loss": 0.4283, + "step": 11572 + }, + { + "epoch": 0.6480568932691231, + "grad_norm": 1.1866655349731445, + "learning_rate": 5.7835000000000004e-05, + "loss": 0.3104, + "step": 11573 + }, + { + "epoch": 0.6481128905812521, + "grad_norm": 1.391587495803833, + "learning_rate": 5.784000000000001e-05, + "loss": 0.6146, + "step": 11574 + }, + { + "epoch": 0.6481688878933811, + "grad_norm": 1.344538688659668, + "learning_rate": 5.7845000000000006e-05, + "loss": 0.3789, + "step": 11575 + }, + { + "epoch": 0.6482248852055101, + "grad_norm": 1.2578099966049194, + "learning_rate": 5.7850000000000003e-05, + "loss": 0.5257, + "step": 11576 + }, + { + "epoch": 0.6482808825176392, + "grad_norm": 1.385211706161499, + "learning_rate": 5.7855e-05, + "loss": 0.3098, + "step": 11577 + }, + { + "epoch": 0.6483368798297682, + "grad_norm": 1.5441373586654663, + "learning_rate": 5.7860000000000005e-05, + "loss": 0.5056, + "step": 11578 + }, + { + "epoch": 0.6483928771418972, + "grad_norm": 1.4079526662826538, + "learning_rate": 5.7865e-05, + "loss": 0.3969, + "step": 11579 + }, + { + "epoch": 0.6484488744540262, + "grad_norm": 1.1507244110107422, + "learning_rate": 5.787e-05, + "loss": 0.4326, + "step": 11580 + }, + { + "epoch": 0.6485048717661552, + "grad_norm": 1.1137008666992188, + "learning_rate": 
5.7875000000000004e-05, + "loss": 0.4333, + "step": 11581 + }, + { + "epoch": 0.6485608690782843, + "grad_norm": 1.256856083869934, + "learning_rate": 5.788e-05, + "loss": 0.495, + "step": 11582 + }, + { + "epoch": 0.6486168663904133, + "grad_norm": 1.2716187238693237, + "learning_rate": 5.7885e-05, + "loss": 0.3649, + "step": 11583 + }, + { + "epoch": 0.6486728637025423, + "grad_norm": 1.5647202730178833, + "learning_rate": 5.789e-05, + "loss": 0.5961, + "step": 11584 + }, + { + "epoch": 0.6487288610146713, + "grad_norm": 1.144774317741394, + "learning_rate": 5.7895e-05, + "loss": 0.3296, + "step": 11585 + }, + { + "epoch": 0.6487848583268003, + "grad_norm": 1.1031969785690308, + "learning_rate": 5.79e-05, + "loss": 0.3849, + "step": 11586 + }, + { + "epoch": 0.6488408556389293, + "grad_norm": 1.2824819087982178, + "learning_rate": 5.7904999999999995e-05, + "loss": 0.3177, + "step": 11587 + }, + { + "epoch": 0.6488968529510584, + "grad_norm": 1.555246353149414, + "learning_rate": 5.791e-05, + "loss": 0.4973, + "step": 11588 + }, + { + "epoch": 0.6489528502631874, + "grad_norm": 1.6722393035888672, + "learning_rate": 5.791500000000001e-05, + "loss": 0.4526, + "step": 11589 + }, + { + "epoch": 0.6490088475753164, + "grad_norm": 1.4631997346878052, + "learning_rate": 5.792000000000001e-05, + "loss": 0.4488, + "step": 11590 + }, + { + "epoch": 0.6490648448874454, + "grad_norm": 1.3110997676849365, + "learning_rate": 5.7925000000000005e-05, + "loss": 0.434, + "step": 11591 + }, + { + "epoch": 0.6491208421995744, + "grad_norm": 1.4031312465667725, + "learning_rate": 5.793000000000001e-05, + "loss": 0.3872, + "step": 11592 + }, + { + "epoch": 0.6491768395117035, + "grad_norm": 2.0797858238220215, + "learning_rate": 5.793500000000001e-05, + "loss": 0.3397, + "step": 11593 + }, + { + "epoch": 0.6492328368238325, + "grad_norm": 1.397301435470581, + "learning_rate": 5.7940000000000004e-05, + "loss": 0.4672, + "step": 11594 + }, + { + "epoch": 0.6492888341359615, + 
"grad_norm": 1.1421433687210083, + "learning_rate": 5.7945e-05, + "loss": 0.3781, + "step": 11595 + }, + { + "epoch": 0.6493448314480905, + "grad_norm": 1.278132438659668, + "learning_rate": 5.7950000000000006e-05, + "loss": 0.4038, + "step": 11596 + }, + { + "epoch": 0.6494008287602195, + "grad_norm": 1.68009614944458, + "learning_rate": 5.7955e-05, + "loss": 0.433, + "step": 11597 + }, + { + "epoch": 0.6494568260723486, + "grad_norm": 1.222383737564087, + "learning_rate": 5.796e-05, + "loss": 0.4574, + "step": 11598 + }, + { + "epoch": 0.6495128233844776, + "grad_norm": 1.118330955505371, + "learning_rate": 5.7965000000000005e-05, + "loss": 0.3929, + "step": 11599 + }, + { + "epoch": 0.6495688206966066, + "grad_norm": 1.2085684537887573, + "learning_rate": 5.797e-05, + "loss": 0.5143, + "step": 11600 + }, + { + "epoch": 0.6496248180087356, + "grad_norm": 1.231821894645691, + "learning_rate": 5.7975e-05, + "loss": 0.4428, + "step": 11601 + }, + { + "epoch": 0.6496808153208646, + "grad_norm": 1.723815679550171, + "learning_rate": 5.7980000000000004e-05, + "loss": 0.4408, + "step": 11602 + }, + { + "epoch": 0.6497368126329937, + "grad_norm": 1.2709662914276123, + "learning_rate": 5.7985e-05, + "loss": 0.3881, + "step": 11603 + }, + { + "epoch": 0.6497928099451227, + "grad_norm": 1.2329301834106445, + "learning_rate": 5.799e-05, + "loss": 0.4592, + "step": 11604 + }, + { + "epoch": 0.6498488072572517, + "grad_norm": 1.4668264389038086, + "learning_rate": 5.7994999999999996e-05, + "loss": 0.5234, + "step": 11605 + }, + { + "epoch": 0.6499048045693807, + "grad_norm": 1.1704156398773193, + "learning_rate": 5.8e-05, + "loss": 0.412, + "step": 11606 + }, + { + "epoch": 0.6499608018815097, + "grad_norm": 1.2018678188323975, + "learning_rate": 5.8005e-05, + "loss": 0.4805, + "step": 11607 + }, + { + "epoch": 0.6500167991936388, + "grad_norm": 1.0939933061599731, + "learning_rate": 5.8009999999999995e-05, + "loss": 0.4825, + "step": 11608 + }, + { + "epoch": 
0.6500727965057678, + "grad_norm": 1.0713413953781128, + "learning_rate": 5.8015000000000006e-05, + "loss": 0.3814, + "step": 11609 + }, + { + "epoch": 0.6501287938178968, + "grad_norm": 1.1035113334655762, + "learning_rate": 5.802000000000001e-05, + "loss": 0.4044, + "step": 11610 + }, + { + "epoch": 0.6501847911300258, + "grad_norm": 1.2724945545196533, + "learning_rate": 5.802500000000001e-05, + "loss": 0.4172, + "step": 11611 + }, + { + "epoch": 0.6502407884421548, + "grad_norm": 15.259182929992676, + "learning_rate": 5.8030000000000005e-05, + "loss": 0.5308, + "step": 11612 + }, + { + "epoch": 0.6502967857542838, + "grad_norm": 1.1987518072128296, + "learning_rate": 5.803500000000001e-05, + "loss": 0.5636, + "step": 11613 + }, + { + "epoch": 0.6503527830664129, + "grad_norm": 1.400138258934021, + "learning_rate": 5.804000000000001e-05, + "loss": 0.5606, + "step": 11614 + }, + { + "epoch": 0.6504087803785419, + "grad_norm": 1.3241719007492065, + "learning_rate": 5.8045000000000004e-05, + "loss": 0.4243, + "step": 11615 + }, + { + "epoch": 0.6504647776906708, + "grad_norm": 1.204487919807434, + "learning_rate": 5.805e-05, + "loss": 0.4068, + "step": 11616 + }, + { + "epoch": 0.6505207750027998, + "grad_norm": 1.2731162309646606, + "learning_rate": 5.8055000000000006e-05, + "loss": 0.404, + "step": 11617 + }, + { + "epoch": 0.6505767723149288, + "grad_norm": 1.1496412754058838, + "learning_rate": 5.806e-05, + "loss": 0.413, + "step": 11618 + }, + { + "epoch": 0.6506327696270578, + "grad_norm": 1.4973740577697754, + "learning_rate": 5.8065e-05, + "loss": 0.3583, + "step": 11619 + }, + { + "epoch": 0.6506887669391869, + "grad_norm": 1.2994894981384277, + "learning_rate": 5.8070000000000005e-05, + "loss": 0.3506, + "step": 11620 + }, + { + "epoch": 0.6507447642513159, + "grad_norm": 1.2027066946029663, + "learning_rate": 5.8075e-05, + "loss": 0.4193, + "step": 11621 + }, + { + "epoch": 0.6508007615634449, + "grad_norm": 1.14454984664917, + "learning_rate": 
5.808e-05, + "loss": 0.4292, + "step": 11622 + }, + { + "epoch": 0.6508567588755739, + "grad_norm": 1.2749015092849731, + "learning_rate": 5.8085000000000004e-05, + "loss": 0.3946, + "step": 11623 + }, + { + "epoch": 0.650912756187703, + "grad_norm": 1.0972893238067627, + "learning_rate": 5.809e-05, + "loss": 0.3939, + "step": 11624 + }, + { + "epoch": 0.650968753499832, + "grad_norm": 1.0439338684082031, + "learning_rate": 5.8095e-05, + "loss": 0.4078, + "step": 11625 + }, + { + "epoch": 0.651024750811961, + "grad_norm": 1.4483518600463867, + "learning_rate": 5.8099999999999996e-05, + "loss": 0.3705, + "step": 11626 + }, + { + "epoch": 0.65108074812409, + "grad_norm": 1.3294535875320435, + "learning_rate": 5.8105e-05, + "loss": 0.3375, + "step": 11627 + }, + { + "epoch": 0.651136745436219, + "grad_norm": 1.417929768562317, + "learning_rate": 5.811e-05, + "loss": 0.5627, + "step": 11628 + }, + { + "epoch": 0.651192742748348, + "grad_norm": 1.3771790266036987, + "learning_rate": 5.811500000000001e-05, + "loss": 0.7373, + "step": 11629 + }, + { + "epoch": 0.6512487400604771, + "grad_norm": 1.1689010858535767, + "learning_rate": 5.8120000000000006e-05, + "loss": 0.3993, + "step": 11630 + }, + { + "epoch": 0.6513047373726061, + "grad_norm": 1.2407886981964111, + "learning_rate": 5.812500000000001e-05, + "loss": 0.4528, + "step": 11631 + }, + { + "epoch": 0.6513607346847351, + "grad_norm": 1.2653611898422241, + "learning_rate": 5.813000000000001e-05, + "loss": 0.3802, + "step": 11632 + }, + { + "epoch": 0.6514167319968641, + "grad_norm": 1.2242566347122192, + "learning_rate": 5.8135000000000005e-05, + "loss": 0.4715, + "step": 11633 + }, + { + "epoch": 0.6514727293089931, + "grad_norm": 1.299102783203125, + "learning_rate": 5.814e-05, + "loss": 0.4909, + "step": 11634 + }, + { + "epoch": 0.6515287266211222, + "grad_norm": 1.1339411735534668, + "learning_rate": 5.8145000000000007e-05, + "loss": 0.4823, + "step": 11635 + }, + { + "epoch": 0.6515847239332512, + 
"grad_norm": 1.212303638458252, + "learning_rate": 5.8150000000000004e-05, + "loss": 0.4722, + "step": 11636 + }, + { + "epoch": 0.6516407212453802, + "grad_norm": 1.3442906141281128, + "learning_rate": 5.8155e-05, + "loss": 0.4407, + "step": 11637 + }, + { + "epoch": 0.6516967185575092, + "grad_norm": 5.252134323120117, + "learning_rate": 5.8160000000000006e-05, + "loss": 0.4684, + "step": 11638 + }, + { + "epoch": 0.6517527158696382, + "grad_norm": 1.1816720962524414, + "learning_rate": 5.8165e-05, + "loss": 0.375, + "step": 11639 + }, + { + "epoch": 0.6518087131817673, + "grad_norm": 1.3227847814559937, + "learning_rate": 5.817e-05, + "loss": 0.6434, + "step": 11640 + }, + { + "epoch": 0.6518647104938963, + "grad_norm": 1.3233706951141357, + "learning_rate": 5.8175000000000005e-05, + "loss": 0.6012, + "step": 11641 + }, + { + "epoch": 0.6519207078060253, + "grad_norm": 1.1149864196777344, + "learning_rate": 5.818e-05, + "loss": 0.4763, + "step": 11642 + }, + { + "epoch": 0.6519767051181543, + "grad_norm": 1.2007625102996826, + "learning_rate": 5.8185e-05, + "loss": 0.4474, + "step": 11643 + }, + { + "epoch": 0.6520327024302833, + "grad_norm": 1.3394529819488525, + "learning_rate": 5.819e-05, + "loss": 0.5211, + "step": 11644 + }, + { + "epoch": 0.6520886997424123, + "grad_norm": 1.2217689752578735, + "learning_rate": 5.8195e-05, + "loss": 0.4672, + "step": 11645 + }, + { + "epoch": 0.6521446970545414, + "grad_norm": 1.3266834020614624, + "learning_rate": 5.82e-05, + "loss": 0.4283, + "step": 11646 + }, + { + "epoch": 0.6522006943666704, + "grad_norm": 1.2351205348968506, + "learning_rate": 5.8204999999999996e-05, + "loss": 0.5023, + "step": 11647 + }, + { + "epoch": 0.6522566916787994, + "grad_norm": 1.3728914260864258, + "learning_rate": 5.821e-05, + "loss": 0.4334, + "step": 11648 + }, + { + "epoch": 0.6523126889909284, + "grad_norm": 1.0195527076721191, + "learning_rate": 5.8215e-05, + "loss": 0.3195, + "step": 11649 + }, + { + "epoch": 0.6523686863030574, + 
"grad_norm": 1.3558344841003418, + "learning_rate": 5.822000000000001e-05, + "loss": 0.3579, + "step": 11650 + }, + { + "epoch": 0.6524246836151865, + "grad_norm": 1.289319634437561, + "learning_rate": 5.8225000000000006e-05, + "loss": 0.4538, + "step": 11651 + }, + { + "epoch": 0.6524806809273155, + "grad_norm": 1.212921142578125, + "learning_rate": 5.823000000000001e-05, + "loss": 0.5243, + "step": 11652 + }, + { + "epoch": 0.6525366782394445, + "grad_norm": 1.3482457399368286, + "learning_rate": 5.823500000000001e-05, + "loss": 0.4709, + "step": 11653 + }, + { + "epoch": 0.6525926755515735, + "grad_norm": 1.23513662815094, + "learning_rate": 5.8240000000000005e-05, + "loss": 0.3948, + "step": 11654 + }, + { + "epoch": 0.6526486728637025, + "grad_norm": 1.2374106645584106, + "learning_rate": 5.8245e-05, + "loss": 0.4464, + "step": 11655 + }, + { + "epoch": 0.6527046701758316, + "grad_norm": 1.2403322458267212, + "learning_rate": 5.8250000000000006e-05, + "loss": 0.4029, + "step": 11656 + }, + { + "epoch": 0.6527606674879606, + "grad_norm": 1.2212584018707275, + "learning_rate": 5.8255000000000004e-05, + "loss": 0.3104, + "step": 11657 + }, + { + "epoch": 0.6528166648000896, + "grad_norm": 1.8834686279296875, + "learning_rate": 5.826e-05, + "loss": 0.3212, + "step": 11658 + }, + { + "epoch": 0.6528726621122186, + "grad_norm": 1.3160979747772217, + "learning_rate": 5.8265000000000005e-05, + "loss": 0.4347, + "step": 11659 + }, + { + "epoch": 0.6529286594243476, + "grad_norm": 1.1711444854736328, + "learning_rate": 5.827e-05, + "loss": 0.3997, + "step": 11660 + }, + { + "epoch": 0.6529846567364767, + "grad_norm": 1.2330718040466309, + "learning_rate": 5.8275e-05, + "loss": 0.4108, + "step": 11661 + }, + { + "epoch": 0.6530406540486057, + "grad_norm": 1.1606751680374146, + "learning_rate": 5.8280000000000004e-05, + "loss": 0.3931, + "step": 11662 + }, + { + "epoch": 0.6530966513607347, + "grad_norm": 1.2084978818893433, + "learning_rate": 5.8285e-05, + "loss": 
0.3705, + "step": 11663 + }, + { + "epoch": 0.6531526486728637, + "grad_norm": 1.577621340751648, + "learning_rate": 5.829e-05, + "loss": 0.5828, + "step": 11664 + }, + { + "epoch": 0.6532086459849927, + "grad_norm": 1.4541676044464111, + "learning_rate": 5.8295e-05, + "loss": 0.5594, + "step": 11665 + }, + { + "epoch": 0.6532646432971217, + "grad_norm": 1.2026711702346802, + "learning_rate": 5.83e-05, + "loss": 0.392, + "step": 11666 + }, + { + "epoch": 0.6533206406092508, + "grad_norm": 1.5014997720718384, + "learning_rate": 5.8305e-05, + "loss": 0.4563, + "step": 11667 + }, + { + "epoch": 0.6533766379213798, + "grad_norm": 1.4592264890670776, + "learning_rate": 5.8309999999999996e-05, + "loss": 0.4314, + "step": 11668 + }, + { + "epoch": 0.6534326352335088, + "grad_norm": 1.177983045578003, + "learning_rate": 5.8315e-05, + "loss": 0.4513, + "step": 11669 + }, + { + "epoch": 0.6534886325456378, + "grad_norm": 1.5815566778182983, + "learning_rate": 5.832000000000001e-05, + "loss": 0.4594, + "step": 11670 + }, + { + "epoch": 0.6535446298577668, + "grad_norm": 1.4919463396072388, + "learning_rate": 5.832500000000001e-05, + "loss": 0.5935, + "step": 11671 + }, + { + "epoch": 0.6536006271698959, + "grad_norm": 1.1126666069030762, + "learning_rate": 5.8330000000000006e-05, + "loss": 0.5506, + "step": 11672 + }, + { + "epoch": 0.6536566244820249, + "grad_norm": 1.260085105895996, + "learning_rate": 5.8335e-05, + "loss": 0.5312, + "step": 11673 + }, + { + "epoch": 0.6537126217941539, + "grad_norm": 1.4133983850479126, + "learning_rate": 5.834000000000001e-05, + "loss": 0.459, + "step": 11674 + }, + { + "epoch": 0.6537686191062829, + "grad_norm": 1.1627137660980225, + "learning_rate": 5.8345000000000005e-05, + "loss": 0.4538, + "step": 11675 + }, + { + "epoch": 0.6538246164184119, + "grad_norm": 1.4787089824676514, + "learning_rate": 5.835e-05, + "loss": 0.4798, + "step": 11676 + }, + { + "epoch": 0.653880613730541, + "grad_norm": 1.3909518718719482, + "learning_rate": 
5.8355000000000006e-05, + "loss": 0.4618, + "step": 11677 + }, + { + "epoch": 0.65393661104267, + "grad_norm": 1.359830617904663, + "learning_rate": 5.8360000000000004e-05, + "loss": 0.4567, + "step": 11678 + }, + { + "epoch": 0.653992608354799, + "grad_norm": 1.7665683031082153, + "learning_rate": 5.8365e-05, + "loss": 0.5119, + "step": 11679 + }, + { + "epoch": 0.654048605666928, + "grad_norm": 1.4605189561843872, + "learning_rate": 5.8370000000000005e-05, + "loss": 0.6055, + "step": 11680 + }, + { + "epoch": 0.654104602979057, + "grad_norm": 1.4210529327392578, + "learning_rate": 5.8375e-05, + "loss": 0.3897, + "step": 11681 + }, + { + "epoch": 0.6541606002911861, + "grad_norm": 0.9742177128791809, + "learning_rate": 5.838e-05, + "loss": 0.2776, + "step": 11682 + }, + { + "epoch": 0.6542165976033151, + "grad_norm": 1.2380565404891968, + "learning_rate": 5.8385e-05, + "loss": 0.4494, + "step": 11683 + }, + { + "epoch": 0.6542725949154441, + "grad_norm": 1.3207238912582397, + "learning_rate": 5.839e-05, + "loss": 0.5514, + "step": 11684 + }, + { + "epoch": 0.6543285922275731, + "grad_norm": 1.5492300987243652, + "learning_rate": 5.8395e-05, + "loss": 0.3392, + "step": 11685 + }, + { + "epoch": 0.6543845895397021, + "grad_norm": 1.3901665210723877, + "learning_rate": 5.8399999999999997e-05, + "loss": 0.4178, + "step": 11686 + }, + { + "epoch": 0.6544405868518312, + "grad_norm": 1.3766168355941772, + "learning_rate": 5.8405e-05, + "loss": 0.4283, + "step": 11687 + }, + { + "epoch": 0.6544965841639602, + "grad_norm": 1.496849536895752, + "learning_rate": 5.841e-05, + "loss": 0.5418, + "step": 11688 + }, + { + "epoch": 0.6545525814760892, + "grad_norm": 1.4288110733032227, + "learning_rate": 5.8414999999999996e-05, + "loss": 0.4658, + "step": 11689 + }, + { + "epoch": 0.6546085787882182, + "grad_norm": 1.2487083673477173, + "learning_rate": 5.8420000000000006e-05, + "loss": 0.4451, + "step": 11690 + }, + { + "epoch": 0.6546645761003472, + "grad_norm": 
1.3596370220184326, + "learning_rate": 5.842500000000001e-05, + "loss": 0.3989, + "step": 11691 + }, + { + "epoch": 0.6547205734124762, + "grad_norm": 1.147562026977539, + "learning_rate": 5.843000000000001e-05, + "loss": 0.4082, + "step": 11692 + }, + { + "epoch": 0.6547765707246053, + "grad_norm": 1.269890308380127, + "learning_rate": 5.8435000000000005e-05, + "loss": 0.3645, + "step": 11693 + }, + { + "epoch": 0.6548325680367343, + "grad_norm": 1.1032383441925049, + "learning_rate": 5.844e-05, + "loss": 0.3172, + "step": 11694 + }, + { + "epoch": 0.6548885653488633, + "grad_norm": 1.4504663944244385, + "learning_rate": 5.844500000000001e-05, + "loss": 0.4008, + "step": 11695 + }, + { + "epoch": 0.6549445626609923, + "grad_norm": 1.4644497632980347, + "learning_rate": 5.8450000000000005e-05, + "loss": 0.465, + "step": 11696 + }, + { + "epoch": 0.6550005599731213, + "grad_norm": 1.419569492340088, + "learning_rate": 5.8455e-05, + "loss": 0.4715, + "step": 11697 + }, + { + "epoch": 0.6550565572852504, + "grad_norm": 1.3105918169021606, + "learning_rate": 5.8460000000000006e-05, + "loss": 0.412, + "step": 11698 + }, + { + "epoch": 0.6551125545973793, + "grad_norm": 1.656390905380249, + "learning_rate": 5.8465000000000004e-05, + "loss": 0.6538, + "step": 11699 + }, + { + "epoch": 0.6551685519095083, + "grad_norm": 1.3274263143539429, + "learning_rate": 5.847e-05, + "loss": 0.4924, + "step": 11700 + }, + { + "epoch": 0.6552245492216373, + "grad_norm": 1.4678207635879517, + "learning_rate": 5.8475000000000005e-05, + "loss": 0.6668, + "step": 11701 + }, + { + "epoch": 0.6552805465337663, + "grad_norm": 1.424195647239685, + "learning_rate": 5.848e-05, + "loss": 0.562, + "step": 11702 + }, + { + "epoch": 0.6553365438458953, + "grad_norm": 1.2749199867248535, + "learning_rate": 5.8485e-05, + "loss": 0.4165, + "step": 11703 + }, + { + "epoch": 0.6553925411580244, + "grad_norm": 1.5304670333862305, + "learning_rate": 5.849e-05, + "loss": 0.5117, + "step": 11704 + }, + { + 
"epoch": 0.6554485384701534, + "grad_norm": 1.12393319606781, + "learning_rate": 5.8495e-05, + "loss": 0.4185, + "step": 11705 + }, + { + "epoch": 0.6555045357822824, + "grad_norm": 1.443090796470642, + "learning_rate": 5.85e-05, + "loss": 0.3894, + "step": 11706 + }, + { + "epoch": 0.6555605330944114, + "grad_norm": 1.3098422288894653, + "learning_rate": 5.8504999999999996e-05, + "loss": 0.499, + "step": 11707 + }, + { + "epoch": 0.6556165304065404, + "grad_norm": 1.2031232118606567, + "learning_rate": 5.851e-05, + "loss": 0.3625, + "step": 11708 + }, + { + "epoch": 0.6556725277186695, + "grad_norm": 1.471382737159729, + "learning_rate": 5.8515e-05, + "loss": 0.3973, + "step": 11709 + }, + { + "epoch": 0.6557285250307985, + "grad_norm": 1.3180073499679565, + "learning_rate": 5.852000000000001e-05, + "loss": 0.4428, + "step": 11710 + }, + { + "epoch": 0.6557845223429275, + "grad_norm": 1.5244274139404297, + "learning_rate": 5.8525000000000006e-05, + "loss": 0.5903, + "step": 11711 + }, + { + "epoch": 0.6558405196550565, + "grad_norm": 1.1666597127914429, + "learning_rate": 5.8530000000000004e-05, + "loss": 0.363, + "step": 11712 + }, + { + "epoch": 0.6558965169671855, + "grad_norm": 1.1757006645202637, + "learning_rate": 5.853500000000001e-05, + "loss": 0.41, + "step": 11713 + }, + { + "epoch": 0.6559525142793146, + "grad_norm": 1.4187425374984741, + "learning_rate": 5.8540000000000005e-05, + "loss": 0.3809, + "step": 11714 + }, + { + "epoch": 0.6560085115914436, + "grad_norm": 1.2364821434020996, + "learning_rate": 5.8545e-05, + "loss": 0.3692, + "step": 11715 + }, + { + "epoch": 0.6560645089035726, + "grad_norm": 1.5956910848617554, + "learning_rate": 5.855000000000001e-05, + "loss": 0.4042, + "step": 11716 + }, + { + "epoch": 0.6561205062157016, + "grad_norm": 1.5447666645050049, + "learning_rate": 5.8555000000000004e-05, + "loss": 0.5662, + "step": 11717 + }, + { + "epoch": 0.6561765035278306, + "grad_norm": 1.403235912322998, + "learning_rate": 5.856e-05, + 
"loss": 0.5386, + "step": 11718 + }, + { + "epoch": 0.6562325008399597, + "grad_norm": 1.3280143737792969, + "learning_rate": 5.8565000000000006e-05, + "loss": 0.3795, + "step": 11719 + }, + { + "epoch": 0.6562884981520887, + "grad_norm": 1.3676313161849976, + "learning_rate": 5.857e-05, + "loss": 0.4279, + "step": 11720 + }, + { + "epoch": 0.6563444954642177, + "grad_norm": 1.336786150932312, + "learning_rate": 5.8575e-05, + "loss": 0.4123, + "step": 11721 + }, + { + "epoch": 0.6564004927763467, + "grad_norm": 1.216882348060608, + "learning_rate": 5.858e-05, + "loss": 0.4441, + "step": 11722 + }, + { + "epoch": 0.6564564900884757, + "grad_norm": 12.355265617370605, + "learning_rate": 5.8585e-05, + "loss": 0.4049, + "step": 11723 + }, + { + "epoch": 0.6565124874006047, + "grad_norm": 1.1024394035339355, + "learning_rate": 5.859e-05, + "loss": 0.4176, + "step": 11724 + }, + { + "epoch": 0.6565684847127338, + "grad_norm": 1.2378270626068115, + "learning_rate": 5.8595e-05, + "loss": 0.3146, + "step": 11725 + }, + { + "epoch": 0.6566244820248628, + "grad_norm": 1.1719168424606323, + "learning_rate": 5.86e-05, + "loss": 0.3656, + "step": 11726 + }, + { + "epoch": 0.6566804793369918, + "grad_norm": 1.1853828430175781, + "learning_rate": 5.8605e-05, + "loss": 0.3465, + "step": 11727 + }, + { + "epoch": 0.6567364766491208, + "grad_norm": 1.3977810144424438, + "learning_rate": 5.8609999999999996e-05, + "loss": 0.436, + "step": 11728 + }, + { + "epoch": 0.6567924739612498, + "grad_norm": 1.3636177778244019, + "learning_rate": 5.8615e-05, + "loss": 0.4208, + "step": 11729 + }, + { + "epoch": 0.6568484712733789, + "grad_norm": 1.3164868354797363, + "learning_rate": 5.862000000000001e-05, + "loss": 0.4447, + "step": 11730 + }, + { + "epoch": 0.6569044685855079, + "grad_norm": 1.3972855806350708, + "learning_rate": 5.862500000000001e-05, + "loss": 0.4463, + "step": 11731 + }, + { + "epoch": 0.6569604658976369, + "grad_norm": 1.3859697580337524, + "learning_rate": 
5.8630000000000006e-05, + "loss": 0.5974, + "step": 11732 + }, + { + "epoch": 0.6570164632097659, + "grad_norm": 1.2899953126907349, + "learning_rate": 5.8635000000000004e-05, + "loss": 0.3652, + "step": 11733 + }, + { + "epoch": 0.6570724605218949, + "grad_norm": 1.3799649477005005, + "learning_rate": 5.864000000000001e-05, + "loss": 0.4831, + "step": 11734 + }, + { + "epoch": 0.657128457834024, + "grad_norm": 1.995896339416504, + "learning_rate": 5.8645000000000005e-05, + "loss": 0.5063, + "step": 11735 + }, + { + "epoch": 0.657184455146153, + "grad_norm": 1.4469842910766602, + "learning_rate": 5.865e-05, + "loss": 0.4502, + "step": 11736 + }, + { + "epoch": 0.657240452458282, + "grad_norm": 1.168785572052002, + "learning_rate": 5.865500000000001e-05, + "loss": 0.3616, + "step": 11737 + }, + { + "epoch": 0.657296449770411, + "grad_norm": 1.3240330219268799, + "learning_rate": 5.8660000000000004e-05, + "loss": 0.4383, + "step": 11738 + }, + { + "epoch": 0.65735244708254, + "grad_norm": 1.1602659225463867, + "learning_rate": 5.8665e-05, + "loss": 0.3743, + "step": 11739 + }, + { + "epoch": 0.6574084443946691, + "grad_norm": 1.331207513809204, + "learning_rate": 5.8670000000000006e-05, + "loss": 0.3482, + "step": 11740 + }, + { + "epoch": 0.6574644417067981, + "grad_norm": 1.1242907047271729, + "learning_rate": 5.8675e-05, + "loss": 0.4016, + "step": 11741 + }, + { + "epoch": 0.6575204390189271, + "grad_norm": 1.4794058799743652, + "learning_rate": 5.868e-05, + "loss": 0.6036, + "step": 11742 + }, + { + "epoch": 0.6575764363310561, + "grad_norm": 1.4201059341430664, + "learning_rate": 5.8685e-05, + "loss": 0.5238, + "step": 11743 + }, + { + "epoch": 0.6576324336431851, + "grad_norm": 1.3712326288223267, + "learning_rate": 5.869e-05, + "loss": 0.4391, + "step": 11744 + }, + { + "epoch": 0.6576884309553142, + "grad_norm": 1.6325640678405762, + "learning_rate": 5.8695e-05, + "loss": 0.4827, + "step": 11745 + }, + { + "epoch": 0.6577444282674432, + "grad_norm": 
1.094815969467163, + "learning_rate": 5.87e-05, + "loss": 0.4478, + "step": 11746 + }, + { + "epoch": 0.6578004255795722, + "grad_norm": 1.4926972389221191, + "learning_rate": 5.8705e-05, + "loss": 0.4445, + "step": 11747 + }, + { + "epoch": 0.6578564228917012, + "grad_norm": 1.0443830490112305, + "learning_rate": 5.871e-05, + "loss": 0.3351, + "step": 11748 + }, + { + "epoch": 0.6579124202038302, + "grad_norm": 1.129218339920044, + "learning_rate": 5.8714999999999996e-05, + "loss": 0.3688, + "step": 11749 + }, + { + "epoch": 0.6579684175159592, + "grad_norm": 1.281672477722168, + "learning_rate": 5.872000000000001e-05, + "loss": 0.5401, + "step": 11750 + }, + { + "epoch": 0.6580244148280883, + "grad_norm": 1.2422080039978027, + "learning_rate": 5.8725000000000004e-05, + "loss": 0.3815, + "step": 11751 + }, + { + "epoch": 0.6580804121402173, + "grad_norm": 1.2020775079727173, + "learning_rate": 5.873000000000001e-05, + "loss": 0.376, + "step": 11752 + }, + { + "epoch": 0.6581364094523463, + "grad_norm": 1.1493568420410156, + "learning_rate": 5.8735000000000006e-05, + "loss": 0.4852, + "step": 11753 + }, + { + "epoch": 0.6581924067644753, + "grad_norm": 1.532774806022644, + "learning_rate": 5.8740000000000003e-05, + "loss": 0.3621, + "step": 11754 + }, + { + "epoch": 0.6582484040766043, + "grad_norm": 1.2717229127883911, + "learning_rate": 5.874500000000001e-05, + "loss": 0.4511, + "step": 11755 + }, + { + "epoch": 0.6583044013887334, + "grad_norm": 1.4544858932495117, + "learning_rate": 5.8750000000000005e-05, + "loss": 0.5239, + "step": 11756 + }, + { + "epoch": 0.6583603987008624, + "grad_norm": 1.1929380893707275, + "learning_rate": 5.8755e-05, + "loss": 0.4486, + "step": 11757 + }, + { + "epoch": 0.6584163960129914, + "grad_norm": 1.1866029500961304, + "learning_rate": 5.876000000000001e-05, + "loss": 0.3347, + "step": 11758 + }, + { + "epoch": 0.6584723933251204, + "grad_norm": 1.4016965627670288, + "learning_rate": 5.8765000000000004e-05, + "loss": 0.5889, + 
"step": 11759 + }, + { + "epoch": 0.6585283906372494, + "grad_norm": 1.2263785600662231, + "learning_rate": 5.877e-05, + "loss": 0.3747, + "step": 11760 + }, + { + "epoch": 0.6585843879493785, + "grad_norm": 1.1574546098709106, + "learning_rate": 5.8775000000000006e-05, + "loss": 0.4193, + "step": 11761 + }, + { + "epoch": 0.6586403852615075, + "grad_norm": 1.2843056917190552, + "learning_rate": 5.878e-05, + "loss": 0.4564, + "step": 11762 + }, + { + "epoch": 0.6586963825736365, + "grad_norm": 1.2758581638336182, + "learning_rate": 5.8785e-05, + "loss": 0.3833, + "step": 11763 + }, + { + "epoch": 0.6587523798857655, + "grad_norm": 1.386577844619751, + "learning_rate": 5.879e-05, + "loss": 0.481, + "step": 11764 + }, + { + "epoch": 0.6588083771978945, + "grad_norm": 1.3712282180786133, + "learning_rate": 5.8795e-05, + "loss": 0.5142, + "step": 11765 + }, + { + "epoch": 0.6588643745100236, + "grad_norm": 1.4404501914978027, + "learning_rate": 5.88e-05, + "loss": 0.467, + "step": 11766 + }, + { + "epoch": 0.6589203718221526, + "grad_norm": 1.213527798652649, + "learning_rate": 5.8805e-05, + "loss": 0.3757, + "step": 11767 + }, + { + "epoch": 0.6589763691342816, + "grad_norm": 1.4828687906265259, + "learning_rate": 5.881e-05, + "loss": 0.4814, + "step": 11768 + }, + { + "epoch": 0.6590323664464106, + "grad_norm": 1.2644912004470825, + "learning_rate": 5.8815e-05, + "loss": 0.4162, + "step": 11769 + }, + { + "epoch": 0.6590883637585396, + "grad_norm": 1.4774798154830933, + "learning_rate": 5.8819999999999996e-05, + "loss": 0.6192, + "step": 11770 + }, + { + "epoch": 0.6591443610706686, + "grad_norm": 2.14587140083313, + "learning_rate": 5.882500000000001e-05, + "loss": 0.479, + "step": 11771 + }, + { + "epoch": 0.6592003583827977, + "grad_norm": 1.6047451496124268, + "learning_rate": 5.8830000000000004e-05, + "loss": 0.4926, + "step": 11772 + }, + { + "epoch": 0.6592563556949267, + "grad_norm": 1.5371648073196411, + "learning_rate": 5.883500000000001e-05, + "loss": 
0.3083, + "step": 11773 + }, + { + "epoch": 0.6593123530070557, + "grad_norm": 1.403256893157959, + "learning_rate": 5.8840000000000006e-05, + "loss": 0.5472, + "step": 11774 + }, + { + "epoch": 0.6593683503191847, + "grad_norm": 1.0998344421386719, + "learning_rate": 5.8845e-05, + "loss": 0.3273, + "step": 11775 + }, + { + "epoch": 0.6594243476313137, + "grad_norm": 1.3494468927383423, + "learning_rate": 5.885000000000001e-05, + "loss": 0.706, + "step": 11776 + }, + { + "epoch": 0.6594803449434428, + "grad_norm": 1.1570298671722412, + "learning_rate": 5.8855000000000005e-05, + "loss": 0.3141, + "step": 11777 + }, + { + "epoch": 0.6595363422555718, + "grad_norm": 1.5827627182006836, + "learning_rate": 5.886e-05, + "loss": 0.5053, + "step": 11778 + }, + { + "epoch": 0.6595923395677008, + "grad_norm": 1.2346322536468506, + "learning_rate": 5.8865000000000006e-05, + "loss": 0.3815, + "step": 11779 + }, + { + "epoch": 0.6596483368798298, + "grad_norm": 1.3112822771072388, + "learning_rate": 5.8870000000000004e-05, + "loss": 0.4709, + "step": 11780 + }, + { + "epoch": 0.6597043341919588, + "grad_norm": 1.2978674173355103, + "learning_rate": 5.8875e-05, + "loss": 0.3935, + "step": 11781 + }, + { + "epoch": 0.6597603315040877, + "grad_norm": 1.329642653465271, + "learning_rate": 5.888e-05, + "loss": 0.4252, + "step": 11782 + }, + { + "epoch": 0.6598163288162168, + "grad_norm": 1.0972132682800293, + "learning_rate": 5.8885e-05, + "loss": 0.3462, + "step": 11783 + }, + { + "epoch": 0.6598723261283458, + "grad_norm": 1.3023643493652344, + "learning_rate": 5.889e-05, + "loss": 0.4253, + "step": 11784 + }, + { + "epoch": 0.6599283234404748, + "grad_norm": 1.5814366340637207, + "learning_rate": 5.8895e-05, + "loss": 0.4563, + "step": 11785 + }, + { + "epoch": 0.6599843207526038, + "grad_norm": 1.316964864730835, + "learning_rate": 5.89e-05, + "loss": 0.4273, + "step": 11786 + }, + { + "epoch": 0.6600403180647328, + "grad_norm": 1.2883378267288208, + "learning_rate": 5.8905e-05, 
+ "loss": 0.5418, + "step": 11787 + }, + { + "epoch": 0.6600963153768619, + "grad_norm": 1.5526007413864136, + "learning_rate": 5.891e-05, + "loss": 0.4849, + "step": 11788 + }, + { + "epoch": 0.6601523126889909, + "grad_norm": 12.64220905303955, + "learning_rate": 5.8915e-05, + "loss": 0.4976, + "step": 11789 + }, + { + "epoch": 0.6602083100011199, + "grad_norm": 1.3078986406326294, + "learning_rate": 5.892e-05, + "loss": 0.5009, + "step": 11790 + }, + { + "epoch": 0.6602643073132489, + "grad_norm": 1.2736436128616333, + "learning_rate": 5.892500000000001e-05, + "loss": 0.4243, + "step": 11791 + }, + { + "epoch": 0.6603203046253779, + "grad_norm": 1.6053931713104248, + "learning_rate": 5.893000000000001e-05, + "loss": 0.5212, + "step": 11792 + }, + { + "epoch": 0.660376301937507, + "grad_norm": 1.125852108001709, + "learning_rate": 5.8935000000000004e-05, + "loss": 0.3243, + "step": 11793 + }, + { + "epoch": 0.660432299249636, + "grad_norm": 1.2537152767181396, + "learning_rate": 5.894000000000001e-05, + "loss": 0.3947, + "step": 11794 + }, + { + "epoch": 0.660488296561765, + "grad_norm": 1.3903148174285889, + "learning_rate": 5.8945000000000006e-05, + "loss": 0.3863, + "step": 11795 + }, + { + "epoch": 0.660544293873894, + "grad_norm": 1.2561471462249756, + "learning_rate": 5.895e-05, + "loss": 0.4033, + "step": 11796 + }, + { + "epoch": 0.660600291186023, + "grad_norm": 1.3503178358078003, + "learning_rate": 5.895500000000001e-05, + "loss": 0.5024, + "step": 11797 + }, + { + "epoch": 0.6606562884981521, + "grad_norm": 1.8672709465026855, + "learning_rate": 5.8960000000000005e-05, + "loss": 0.3454, + "step": 11798 + }, + { + "epoch": 0.6607122858102811, + "grad_norm": 1.1492935419082642, + "learning_rate": 5.8965e-05, + "loss": 0.3286, + "step": 11799 + }, + { + "epoch": 0.6607682831224101, + "grad_norm": 1.6859663724899292, + "learning_rate": 5.8970000000000006e-05, + "loss": 0.4703, + "step": 11800 + }, + { + "epoch": 0.6608242804345391, + "grad_norm": 
1.3863939046859741, + "learning_rate": 5.8975000000000004e-05, + "loss": 0.3802, + "step": 11801 + }, + { + "epoch": 0.6608802777466681, + "grad_norm": 1.2479121685028076, + "learning_rate": 5.898e-05, + "loss": 0.428, + "step": 11802 + }, + { + "epoch": 0.6609362750587972, + "grad_norm": 1.3120057582855225, + "learning_rate": 5.8985e-05, + "loss": 0.5067, + "step": 11803 + }, + { + "epoch": 0.6609922723709262, + "grad_norm": 1.1948027610778809, + "learning_rate": 5.899e-05, + "loss": 0.3471, + "step": 11804 + }, + { + "epoch": 0.6610482696830552, + "grad_norm": 1.3826274871826172, + "learning_rate": 5.8995e-05, + "loss": 0.5424, + "step": 11805 + }, + { + "epoch": 0.6611042669951842, + "grad_norm": 1.0666022300720215, + "learning_rate": 5.9e-05, + "loss": 0.3621, + "step": 11806 + }, + { + "epoch": 0.6611602643073132, + "grad_norm": 1.216614007949829, + "learning_rate": 5.9005e-05, + "loss": 0.3731, + "step": 11807 + }, + { + "epoch": 0.6612162616194422, + "grad_norm": 1.5249807834625244, + "learning_rate": 5.901e-05, + "loss": 0.5525, + "step": 11808 + }, + { + "epoch": 0.6612722589315713, + "grad_norm": 1.3568562269210815, + "learning_rate": 5.9014999999999997e-05, + "loss": 0.435, + "step": 11809 + }, + { + "epoch": 0.6613282562437003, + "grad_norm": 1.2439298629760742, + "learning_rate": 5.902e-05, + "loss": 0.423, + "step": 11810 + }, + { + "epoch": 0.6613842535558293, + "grad_norm": 1.246422290802002, + "learning_rate": 5.9025000000000005e-05, + "loss": 0.321, + "step": 11811 + }, + { + "epoch": 0.6614402508679583, + "grad_norm": 1.2262554168701172, + "learning_rate": 5.903000000000001e-05, + "loss": 0.4811, + "step": 11812 + }, + { + "epoch": 0.6614962481800873, + "grad_norm": 1.3477610349655151, + "learning_rate": 5.9035000000000007e-05, + "loss": 0.4632, + "step": 11813 + }, + { + "epoch": 0.6615522454922164, + "grad_norm": 1.1747660636901855, + "learning_rate": 5.9040000000000004e-05, + "loss": 0.374, + "step": 11814 + }, + { + "epoch": 
0.6616082428043454, + "grad_norm": 1.1574561595916748, + "learning_rate": 5.904500000000001e-05, + "loss": 0.317, + "step": 11815 + }, + { + "epoch": 0.6616642401164744, + "grad_norm": 1.3460285663604736, + "learning_rate": 5.9050000000000006e-05, + "loss": 0.5394, + "step": 11816 + }, + { + "epoch": 0.6617202374286034, + "grad_norm": 1.1897482872009277, + "learning_rate": 5.9055e-05, + "loss": 0.353, + "step": 11817 + }, + { + "epoch": 0.6617762347407324, + "grad_norm": 1.5638235807418823, + "learning_rate": 5.906000000000001e-05, + "loss": 0.4266, + "step": 11818 + }, + { + "epoch": 0.6618322320528615, + "grad_norm": 1.418049931526184, + "learning_rate": 5.9065000000000005e-05, + "loss": 0.5869, + "step": 11819 + }, + { + "epoch": 0.6618882293649905, + "grad_norm": 1.3842191696166992, + "learning_rate": 5.907e-05, + "loss": 0.4286, + "step": 11820 + }, + { + "epoch": 0.6619442266771195, + "grad_norm": 1.5180695056915283, + "learning_rate": 5.9075e-05, + "loss": 0.5938, + "step": 11821 + }, + { + "epoch": 0.6620002239892485, + "grad_norm": 1.3481963872909546, + "learning_rate": 5.9080000000000004e-05, + "loss": 0.4343, + "step": 11822 + }, + { + "epoch": 0.6620562213013775, + "grad_norm": 1.3385549783706665, + "learning_rate": 5.9085e-05, + "loss": 0.3347, + "step": 11823 + }, + { + "epoch": 0.6621122186135066, + "grad_norm": 1.5572330951690674, + "learning_rate": 5.909e-05, + "loss": 0.4192, + "step": 11824 + }, + { + "epoch": 0.6621682159256356, + "grad_norm": 1.4639410972595215, + "learning_rate": 5.9095e-05, + "loss": 0.5353, + "step": 11825 + }, + { + "epoch": 0.6622242132377646, + "grad_norm": 1.4320757389068604, + "learning_rate": 5.91e-05, + "loss": 0.473, + "step": 11826 + }, + { + "epoch": 0.6622802105498936, + "grad_norm": 1.3239295482635498, + "learning_rate": 5.9105e-05, + "loss": 0.4496, + "step": 11827 + }, + { + "epoch": 0.6623362078620226, + "grad_norm": 1.1210360527038574, + "learning_rate": 5.911e-05, + "loss": 0.418, + "step": 11828 + }, + { + 
"epoch": 0.6623922051741516, + "grad_norm": 2.27600359916687, + "learning_rate": 5.9115e-05, + "loss": 0.4041, + "step": 11829 + }, + { + "epoch": 0.6624482024862807, + "grad_norm": 1.2792423963546753, + "learning_rate": 5.9119999999999996e-05, + "loss": 0.36, + "step": 11830 + }, + { + "epoch": 0.6625041997984097, + "grad_norm": 1.1539493799209595, + "learning_rate": 5.912500000000001e-05, + "loss": 0.3898, + "step": 11831 + }, + { + "epoch": 0.6625601971105387, + "grad_norm": 1.31495201587677, + "learning_rate": 5.9130000000000005e-05, + "loss": 0.4026, + "step": 11832 + }, + { + "epoch": 0.6626161944226677, + "grad_norm": 1.0842198133468628, + "learning_rate": 5.913500000000001e-05, + "loss": 0.3748, + "step": 11833 + }, + { + "epoch": 0.6626721917347967, + "grad_norm": 1.3249589204788208, + "learning_rate": 5.9140000000000006e-05, + "loss": 0.5073, + "step": 11834 + }, + { + "epoch": 0.6627281890469258, + "grad_norm": 1.3649876117706299, + "learning_rate": 5.9145000000000004e-05, + "loss": 0.43, + "step": 11835 + }, + { + "epoch": 0.6627841863590548, + "grad_norm": 1.2828794717788696, + "learning_rate": 5.915000000000001e-05, + "loss": 0.417, + "step": 11836 + }, + { + "epoch": 0.6628401836711838, + "grad_norm": 1.9186866283416748, + "learning_rate": 5.9155000000000005e-05, + "loss": 0.4914, + "step": 11837 + }, + { + "epoch": 0.6628961809833128, + "grad_norm": 1.1696540117263794, + "learning_rate": 5.916e-05, + "loss": 0.3673, + "step": 11838 + }, + { + "epoch": 0.6629521782954418, + "grad_norm": 1.321453332901001, + "learning_rate": 5.916500000000001e-05, + "loss": 0.4073, + "step": 11839 + }, + { + "epoch": 0.6630081756075709, + "grad_norm": 1.1427853107452393, + "learning_rate": 5.9170000000000004e-05, + "loss": 0.4659, + "step": 11840 + }, + { + "epoch": 0.6630641729196999, + "grad_norm": 1.3836218118667603, + "learning_rate": 5.9175e-05, + "loss": 0.3893, + "step": 11841 + }, + { + "epoch": 0.6631201702318289, + "grad_norm": 1.653493881225586, + 
"learning_rate": 5.918e-05, + "loss": 0.431, + "step": 11842 + }, + { + "epoch": 0.6631761675439579, + "grad_norm": 1.120078444480896, + "learning_rate": 5.9185000000000003e-05, + "loss": 0.3408, + "step": 11843 + }, + { + "epoch": 0.6632321648560869, + "grad_norm": 1.3832314014434814, + "learning_rate": 5.919e-05, + "loss": 0.4999, + "step": 11844 + }, + { + "epoch": 0.663288162168216, + "grad_norm": 1.2807661294937134, + "learning_rate": 5.9195e-05, + "loss": 0.3549, + "step": 11845 + }, + { + "epoch": 0.663344159480345, + "grad_norm": 1.1888970136642456, + "learning_rate": 5.92e-05, + "loss": 0.3814, + "step": 11846 + }, + { + "epoch": 0.663400156792474, + "grad_norm": 1.893478274345398, + "learning_rate": 5.9205e-05, + "loss": 0.4226, + "step": 11847 + }, + { + "epoch": 0.663456154104603, + "grad_norm": 1.1474511623382568, + "learning_rate": 5.921e-05, + "loss": 0.3579, + "step": 11848 + }, + { + "epoch": 0.663512151416732, + "grad_norm": 1.5900508165359497, + "learning_rate": 5.9215e-05, + "loss": 0.6149, + "step": 11849 + }, + { + "epoch": 0.663568148728861, + "grad_norm": 1.3894587755203247, + "learning_rate": 5.922e-05, + "loss": 0.3685, + "step": 11850 + }, + { + "epoch": 0.6636241460409901, + "grad_norm": 1.6272608041763306, + "learning_rate": 5.922500000000001e-05, + "loss": 0.5183, + "step": 11851 + }, + { + "epoch": 0.6636801433531191, + "grad_norm": 1.393485188484192, + "learning_rate": 5.923000000000001e-05, + "loss": 0.5789, + "step": 11852 + }, + { + "epoch": 0.6637361406652481, + "grad_norm": 1.3365730047225952, + "learning_rate": 5.9235000000000005e-05, + "loss": 0.3492, + "step": 11853 + }, + { + "epoch": 0.6637921379773771, + "grad_norm": 1.1424380540847778, + "learning_rate": 5.924000000000001e-05, + "loss": 0.314, + "step": 11854 + }, + { + "epoch": 0.6638481352895061, + "grad_norm": 1.5570404529571533, + "learning_rate": 5.9245000000000006e-05, + "loss": 0.4278, + "step": 11855 + }, + { + "epoch": 0.6639041326016352, + "grad_norm": 
1.0318195819854736, + "learning_rate": 5.9250000000000004e-05, + "loss": 0.3839, + "step": 11856 + }, + { + "epoch": 0.6639601299137642, + "grad_norm": 1.310918927192688, + "learning_rate": 5.925500000000001e-05, + "loss": 0.4464, + "step": 11857 + }, + { + "epoch": 0.6640161272258932, + "grad_norm": 1.6875386238098145, + "learning_rate": 5.9260000000000005e-05, + "loss": 0.557, + "step": 11858 + }, + { + "epoch": 0.6640721245380222, + "grad_norm": 1.0900417566299438, + "learning_rate": 5.9265e-05, + "loss": 0.3614, + "step": 11859 + }, + { + "epoch": 0.6641281218501512, + "grad_norm": 1.054628849029541, + "learning_rate": 5.927e-05, + "loss": 0.3527, + "step": 11860 + }, + { + "epoch": 0.6641841191622803, + "grad_norm": 1.3811562061309814, + "learning_rate": 5.9275000000000004e-05, + "loss": 0.4464, + "step": 11861 + }, + { + "epoch": 0.6642401164744093, + "grad_norm": 1.394370675086975, + "learning_rate": 5.928e-05, + "loss": 0.6263, + "step": 11862 + }, + { + "epoch": 0.6642961137865383, + "grad_norm": 1.579593539237976, + "learning_rate": 5.9285e-05, + "loss": 0.5119, + "step": 11863 + }, + { + "epoch": 0.6643521110986672, + "grad_norm": 1.3653138875961304, + "learning_rate": 5.929e-05, + "loss": 0.4603, + "step": 11864 + }, + { + "epoch": 0.6644081084107962, + "grad_norm": 1.4746373891830444, + "learning_rate": 5.9295e-05, + "loss": 0.4037, + "step": 11865 + }, + { + "epoch": 0.6644641057229252, + "grad_norm": 1.2131935358047485, + "learning_rate": 5.93e-05, + "loss": 0.3738, + "step": 11866 + }, + { + "epoch": 0.6645201030350543, + "grad_norm": 1.1263573169708252, + "learning_rate": 5.9305e-05, + "loss": 0.3515, + "step": 11867 + }, + { + "epoch": 0.6645761003471833, + "grad_norm": 1.3177452087402344, + "learning_rate": 5.931e-05, + "loss": 0.3774, + "step": 11868 + }, + { + "epoch": 0.6646320976593123, + "grad_norm": 1.270390272140503, + "learning_rate": 5.9315e-05, + "loss": 0.4753, + "step": 11869 + }, + { + "epoch": 0.6646880949714413, + "grad_norm": 
1.494642734527588, + "learning_rate": 5.9319999999999994e-05, + "loss": 0.3979, + "step": 11870 + }, + { + "epoch": 0.6647440922835703, + "grad_norm": 1.6839993000030518, + "learning_rate": 5.9325000000000005e-05, + "loss": 0.5764, + "step": 11871 + }, + { + "epoch": 0.6648000895956994, + "grad_norm": 1.3141297101974487, + "learning_rate": 5.933000000000001e-05, + "loss": 0.3978, + "step": 11872 + }, + { + "epoch": 0.6648560869078284, + "grad_norm": 1.3168694972991943, + "learning_rate": 5.933500000000001e-05, + "loss": 0.469, + "step": 11873 + }, + { + "epoch": 0.6649120842199574, + "grad_norm": 1.372299313545227, + "learning_rate": 5.9340000000000004e-05, + "loss": 0.4825, + "step": 11874 + }, + { + "epoch": 0.6649680815320864, + "grad_norm": 2.188124895095825, + "learning_rate": 5.934500000000001e-05, + "loss": 0.5163, + "step": 11875 + }, + { + "epoch": 0.6650240788442154, + "grad_norm": 1.4128270149230957, + "learning_rate": 5.9350000000000006e-05, + "loss": 0.4864, + "step": 11876 + }, + { + "epoch": 0.6650800761563445, + "grad_norm": 1.202697992324829, + "learning_rate": 5.9355000000000003e-05, + "loss": 0.4417, + "step": 11877 + }, + { + "epoch": 0.6651360734684735, + "grad_norm": 1.3172637224197388, + "learning_rate": 5.936000000000001e-05, + "loss": 0.4383, + "step": 11878 + }, + { + "epoch": 0.6651920707806025, + "grad_norm": 1.202485203742981, + "learning_rate": 5.9365000000000005e-05, + "loss": 0.3311, + "step": 11879 + }, + { + "epoch": 0.6652480680927315, + "grad_norm": 1.5256835222244263, + "learning_rate": 5.937e-05, + "loss": 0.3707, + "step": 11880 + }, + { + "epoch": 0.6653040654048605, + "grad_norm": 1.5215630531311035, + "learning_rate": 5.9375e-05, + "loss": 0.4309, + "step": 11881 + }, + { + "epoch": 0.6653600627169896, + "grad_norm": 1.437851071357727, + "learning_rate": 5.9380000000000004e-05, + "loss": 0.3928, + "step": 11882 + }, + { + "epoch": 0.6654160600291186, + "grad_norm": 1.1500213146209717, + "learning_rate": 5.9385e-05, + 
"loss": 0.3762, + "step": 11883 + }, + { + "epoch": 0.6654720573412476, + "grad_norm": 1.196663737297058, + "learning_rate": 5.939e-05, + "loss": 0.3458, + "step": 11884 + }, + { + "epoch": 0.6655280546533766, + "grad_norm": 1.365906000137329, + "learning_rate": 5.9395e-05, + "loss": 0.4867, + "step": 11885 + }, + { + "epoch": 0.6655840519655056, + "grad_norm": 1.3021512031555176, + "learning_rate": 5.94e-05, + "loss": 0.45, + "step": 11886 + }, + { + "epoch": 0.6656400492776346, + "grad_norm": 1.5049903392791748, + "learning_rate": 5.9405e-05, + "loss": 0.4254, + "step": 11887 + }, + { + "epoch": 0.6656960465897637, + "grad_norm": 1.4015634059906006, + "learning_rate": 5.941e-05, + "loss": 0.4781, + "step": 11888 + }, + { + "epoch": 0.6657520439018927, + "grad_norm": 1.026158332824707, + "learning_rate": 5.9415e-05, + "loss": 0.3566, + "step": 11889 + }, + { + "epoch": 0.6658080412140217, + "grad_norm": 1.1147710084915161, + "learning_rate": 5.942e-05, + "loss": 0.4144, + "step": 11890 + }, + { + "epoch": 0.6658640385261507, + "grad_norm": 1.2159868478775024, + "learning_rate": 5.9424999999999994e-05, + "loss": 0.4443, + "step": 11891 + }, + { + "epoch": 0.6659200358382797, + "grad_norm": 1.2928882837295532, + "learning_rate": 5.9430000000000005e-05, + "loss": 0.4125, + "step": 11892 + }, + { + "epoch": 0.6659760331504088, + "grad_norm": 1.3846945762634277, + "learning_rate": 5.943500000000001e-05, + "loss": 0.4197, + "step": 11893 + }, + { + "epoch": 0.6660320304625378, + "grad_norm": 1.3825461864471436, + "learning_rate": 5.944000000000001e-05, + "loss": 0.5739, + "step": 11894 + }, + { + "epoch": 0.6660880277746668, + "grad_norm": 1.2266974449157715, + "learning_rate": 5.9445000000000004e-05, + "loss": 0.375, + "step": 11895 + }, + { + "epoch": 0.6661440250867958, + "grad_norm": 1.3153057098388672, + "learning_rate": 5.945000000000001e-05, + "loss": 0.4659, + "step": 11896 + }, + { + "epoch": 0.6662000223989248, + "grad_norm": 1.5280402898788452, + 
"learning_rate": 5.9455000000000006e-05, + "loss": 0.4301, + "step": 11897 + }, + { + "epoch": 0.6662560197110539, + "grad_norm": 1.302307367324829, + "learning_rate": 5.946e-05, + "loss": 0.5529, + "step": 11898 + }, + { + "epoch": 0.6663120170231829, + "grad_norm": 1.386218547821045, + "learning_rate": 5.9465e-05, + "loss": 0.4544, + "step": 11899 + }, + { + "epoch": 0.6663680143353119, + "grad_norm": 1.0509182214736938, + "learning_rate": 5.9470000000000005e-05, + "loss": 0.3286, + "step": 11900 + }, + { + "epoch": 0.6664240116474409, + "grad_norm": 1.423601508140564, + "learning_rate": 5.9475e-05, + "loss": 0.4119, + "step": 11901 + }, + { + "epoch": 0.6664800089595699, + "grad_norm": 1.2592089176177979, + "learning_rate": 5.948e-05, + "loss": 0.6676, + "step": 11902 + }, + { + "epoch": 0.666536006271699, + "grad_norm": 1.3460267782211304, + "learning_rate": 5.9485000000000004e-05, + "loss": 0.4699, + "step": 11903 + }, + { + "epoch": 0.666592003583828, + "grad_norm": 1.334141731262207, + "learning_rate": 5.949e-05, + "loss": 0.5719, + "step": 11904 + }, + { + "epoch": 0.666648000895957, + "grad_norm": 1.3642629384994507, + "learning_rate": 5.9495e-05, + "loss": 0.5086, + "step": 11905 + }, + { + "epoch": 0.666703998208086, + "grad_norm": 1.1799554824829102, + "learning_rate": 5.95e-05, + "loss": 0.4032, + "step": 11906 + }, + { + "epoch": 0.666759995520215, + "grad_norm": 1.4249459505081177, + "learning_rate": 5.9505e-05, + "loss": 0.4316, + "step": 11907 + }, + { + "epoch": 0.666815992832344, + "grad_norm": 1.4522744417190552, + "learning_rate": 5.951e-05, + "loss": 0.4173, + "step": 11908 + }, + { + "epoch": 0.6668719901444731, + "grad_norm": 1.1182037591934204, + "learning_rate": 5.9514999999999995e-05, + "loss": 0.3313, + "step": 11909 + }, + { + "epoch": 0.6669279874566021, + "grad_norm": 1.2711259126663208, + "learning_rate": 5.952e-05, + "loss": 0.3685, + "step": 11910 + }, + { + "epoch": 0.6669839847687311, + "grad_norm": 5.938239574432373, + 
"learning_rate": 5.9525e-05, + "loss": 0.4984, + "step": 11911 + }, + { + "epoch": 0.6670399820808601, + "grad_norm": 1.4067604541778564, + "learning_rate": 5.953000000000001e-05, + "loss": 0.4658, + "step": 11912 + }, + { + "epoch": 0.6670959793929891, + "grad_norm": 1.7628065347671509, + "learning_rate": 5.9535000000000005e-05, + "loss": 0.3645, + "step": 11913 + }, + { + "epoch": 0.6671519767051182, + "grad_norm": 1.2518030405044556, + "learning_rate": 5.954000000000001e-05, + "loss": 0.4882, + "step": 11914 + }, + { + "epoch": 0.6672079740172472, + "grad_norm": 1.567389965057373, + "learning_rate": 5.954500000000001e-05, + "loss": 0.4946, + "step": 11915 + }, + { + "epoch": 0.6672639713293762, + "grad_norm": 1.301295280456543, + "learning_rate": 5.9550000000000004e-05, + "loss": 0.3501, + "step": 11916 + }, + { + "epoch": 0.6673199686415052, + "grad_norm": 1.160940170288086, + "learning_rate": 5.955500000000001e-05, + "loss": 0.347, + "step": 11917 + }, + { + "epoch": 0.6673759659536342, + "grad_norm": 1.2055591344833374, + "learning_rate": 5.9560000000000006e-05, + "loss": 0.3748, + "step": 11918 + }, + { + "epoch": 0.6674319632657633, + "grad_norm": 1.1648057699203491, + "learning_rate": 5.9565e-05, + "loss": 0.3992, + "step": 11919 + }, + { + "epoch": 0.6674879605778923, + "grad_norm": 1.2689573764801025, + "learning_rate": 5.957e-05, + "loss": 0.3886, + "step": 11920 + }, + { + "epoch": 0.6675439578900213, + "grad_norm": 1.2126526832580566, + "learning_rate": 5.9575000000000005e-05, + "loss": 0.3961, + "step": 11921 + }, + { + "epoch": 0.6675999552021503, + "grad_norm": 1.1302536725997925, + "learning_rate": 5.958e-05, + "loss": 0.3949, + "step": 11922 + }, + { + "epoch": 0.6676559525142793, + "grad_norm": 1.5190685987472534, + "learning_rate": 5.9585e-05, + "loss": 0.63, + "step": 11923 + }, + { + "epoch": 0.6677119498264084, + "grad_norm": 1.2824280261993408, + "learning_rate": 5.9590000000000004e-05, + "loss": 0.434, + "step": 11924 + }, + { + "epoch": 
0.6677679471385374, + "grad_norm": 1.348188042640686, + "learning_rate": 5.9595e-05, + "loss": 0.368, + "step": 11925 + }, + { + "epoch": 0.6678239444506664, + "grad_norm": 1.1867979764938354, + "learning_rate": 5.96e-05, + "loss": 0.4094, + "step": 11926 + }, + { + "epoch": 0.6678799417627954, + "grad_norm": 1.3989386558532715, + "learning_rate": 5.9605e-05, + "loss": 0.5733, + "step": 11927 + }, + { + "epoch": 0.6679359390749244, + "grad_norm": 1.3534163236618042, + "learning_rate": 5.961e-05, + "loss": 0.4177, + "step": 11928 + }, + { + "epoch": 0.6679919363870535, + "grad_norm": 1.5746853351593018, + "learning_rate": 5.9615e-05, + "loss": 0.4174, + "step": 11929 + }, + { + "epoch": 0.6680479336991825, + "grad_norm": 1.2192744016647339, + "learning_rate": 5.9619999999999995e-05, + "loss": 0.4422, + "step": 11930 + }, + { + "epoch": 0.6681039310113115, + "grad_norm": 2.8761403560638428, + "learning_rate": 5.9625e-05, + "loss": 0.5961, + "step": 11931 + }, + { + "epoch": 0.6681599283234405, + "grad_norm": 1.376819372177124, + "learning_rate": 5.963000000000001e-05, + "loss": 0.5999, + "step": 11932 + }, + { + "epoch": 0.6682159256355695, + "grad_norm": 1.293157696723938, + "learning_rate": 5.963500000000001e-05, + "loss": 0.3914, + "step": 11933 + }, + { + "epoch": 0.6682719229476985, + "grad_norm": 1.537792444229126, + "learning_rate": 5.9640000000000005e-05, + "loss": 0.5856, + "step": 11934 + }, + { + "epoch": 0.6683279202598276, + "grad_norm": 2.8495757579803467, + "learning_rate": 5.964500000000001e-05, + "loss": 0.788, + "step": 11935 + }, + { + "epoch": 0.6683839175719566, + "grad_norm": 1.2567057609558105, + "learning_rate": 5.9650000000000007e-05, + "loss": 0.4586, + "step": 11936 + }, + { + "epoch": 0.6684399148840856, + "grad_norm": 1.1390223503112793, + "learning_rate": 5.9655000000000004e-05, + "loss": 0.3774, + "step": 11937 + }, + { + "epoch": 0.6684959121962146, + "grad_norm": 1.2039321660995483, + "learning_rate": 5.966000000000001e-05, + "loss": 
0.3559, + "step": 11938 + }, + { + "epoch": 0.6685519095083436, + "grad_norm": 1.622532606124878, + "learning_rate": 5.9665000000000006e-05, + "loss": 0.3729, + "step": 11939 + }, + { + "epoch": 0.6686079068204727, + "grad_norm": 1.2902191877365112, + "learning_rate": 5.967e-05, + "loss": 0.4087, + "step": 11940 + }, + { + "epoch": 0.6686639041326017, + "grad_norm": 1.5999029874801636, + "learning_rate": 5.9675e-05, + "loss": 0.4653, + "step": 11941 + }, + { + "epoch": 0.6687199014447307, + "grad_norm": 1.2554268836975098, + "learning_rate": 5.9680000000000005e-05, + "loss": 0.4276, + "step": 11942 + }, + { + "epoch": 0.6687758987568597, + "grad_norm": 1.4497487545013428, + "learning_rate": 5.9685e-05, + "loss": 0.4258, + "step": 11943 + }, + { + "epoch": 0.6688318960689887, + "grad_norm": 1.1641937494277954, + "learning_rate": 5.969e-05, + "loss": 0.3416, + "step": 11944 + }, + { + "epoch": 0.6688878933811178, + "grad_norm": 1.5143848657608032, + "learning_rate": 5.9695000000000004e-05, + "loss": 0.5738, + "step": 11945 + }, + { + "epoch": 0.6689438906932468, + "grad_norm": 1.2914464473724365, + "learning_rate": 5.97e-05, + "loss": 0.3678, + "step": 11946 + }, + { + "epoch": 0.6689998880053757, + "grad_norm": 1.261762022972107, + "learning_rate": 5.9705e-05, + "loss": 0.5169, + "step": 11947 + }, + { + "epoch": 0.6690558853175047, + "grad_norm": 1.398228406906128, + "learning_rate": 5.971e-05, + "loss": 0.4794, + "step": 11948 + }, + { + "epoch": 0.6691118826296337, + "grad_norm": 2.0928797721862793, + "learning_rate": 5.9715e-05, + "loss": 0.4342, + "step": 11949 + }, + { + "epoch": 0.6691678799417627, + "grad_norm": 1.274298071861267, + "learning_rate": 5.972e-05, + "loss": 0.3518, + "step": 11950 + }, + { + "epoch": 0.6692238772538918, + "grad_norm": 1.3640156984329224, + "learning_rate": 5.9724999999999995e-05, + "loss": 0.4479, + "step": 11951 + }, + { + "epoch": 0.6692798745660208, + "grad_norm": 1.148178219795227, + "learning_rate": 5.9730000000000006e-05, 
+ "loss": 0.3329, + "step": 11952 + }, + { + "epoch": 0.6693358718781498, + "grad_norm": 1.2451313734054565, + "learning_rate": 5.973500000000001e-05, + "loss": 0.3687, + "step": 11953 + }, + { + "epoch": 0.6693918691902788, + "grad_norm": 1.5776304006576538, + "learning_rate": 5.974000000000001e-05, + "loss": 0.3967, + "step": 11954 + }, + { + "epoch": 0.6694478665024078, + "grad_norm": 1.358848214149475, + "learning_rate": 5.9745000000000005e-05, + "loss": 0.416, + "step": 11955 + }, + { + "epoch": 0.6695038638145369, + "grad_norm": 1.2372437715530396, + "learning_rate": 5.975000000000001e-05, + "loss": 0.4196, + "step": 11956 + }, + { + "epoch": 0.6695598611266659, + "grad_norm": 1.1792287826538086, + "learning_rate": 5.9755000000000006e-05, + "loss": 0.4522, + "step": 11957 + }, + { + "epoch": 0.6696158584387949, + "grad_norm": 1.178227186203003, + "learning_rate": 5.9760000000000004e-05, + "loss": 0.3618, + "step": 11958 + }, + { + "epoch": 0.6696718557509239, + "grad_norm": 1.2957943677902222, + "learning_rate": 5.9765e-05, + "loss": 0.5151, + "step": 11959 + }, + { + "epoch": 0.6697278530630529, + "grad_norm": 1.2186528444290161, + "learning_rate": 5.9770000000000005e-05, + "loss": 0.5495, + "step": 11960 + }, + { + "epoch": 0.669783850375182, + "grad_norm": 1.2566514015197754, + "learning_rate": 5.9775e-05, + "loss": 0.4227, + "step": 11961 + }, + { + "epoch": 0.669839847687311, + "grad_norm": 1.2219884395599365, + "learning_rate": 5.978e-05, + "loss": 0.319, + "step": 11962 + }, + { + "epoch": 0.66989584499944, + "grad_norm": 1.4038097858428955, + "learning_rate": 5.9785000000000004e-05, + "loss": 0.4801, + "step": 11963 + }, + { + "epoch": 0.669951842311569, + "grad_norm": 1.5065792798995972, + "learning_rate": 5.979e-05, + "loss": 0.421, + "step": 11964 + }, + { + "epoch": 0.670007839623698, + "grad_norm": 1.3518203496932983, + "learning_rate": 5.9795e-05, + "loss": 0.3793, + "step": 11965 + }, + { + "epoch": 0.670063836935827, + "grad_norm": 
1.4339953660964966, + "learning_rate": 5.9800000000000003e-05, + "loss": 0.4307, + "step": 11966 + }, + { + "epoch": 0.6701198342479561, + "grad_norm": 1.2827121019363403, + "learning_rate": 5.9805e-05, + "loss": 0.3806, + "step": 11967 + }, + { + "epoch": 0.6701758315600851, + "grad_norm": NaN, + "learning_rate": 5.9805e-05, + "loss": 0.4681, + "step": 11968 + }, + { + "epoch": 0.6702318288722141, + "grad_norm": 1.5028961896896362, + "learning_rate": 5.981e-05, + "loss": 0.4211, + "step": 11969 + }, + { + "epoch": 0.6702878261843431, + "grad_norm": 1.3298301696777344, + "learning_rate": 5.9814999999999996e-05, + "loss": 0.303, + "step": 11970 + }, + { + "epoch": 0.6703438234964721, + "grad_norm": 1.2868156433105469, + "learning_rate": 5.982e-05, + "loss": 0.5618, + "step": 11971 + }, + { + "epoch": 0.6703998208086012, + "grad_norm": 1.4557560682296753, + "learning_rate": 5.9825e-05, + "loss": 0.5228, + "step": 11972 + }, + { + "epoch": 0.6704558181207302, + "grad_norm": 1.225768804550171, + "learning_rate": 5.983000000000001e-05, + "loss": 0.4383, + "step": 11973 + }, + { + "epoch": 0.6705118154328592, + "grad_norm": 1.3572560548782349, + "learning_rate": 5.9835000000000006e-05, + "loss": 0.4428, + "step": 11974 + }, + { + "epoch": 0.6705678127449882, + "grad_norm": 2.91058349609375, + "learning_rate": 5.984000000000001e-05, + "loss": 0.3326, + "step": 11975 + }, + { + "epoch": 0.6706238100571172, + "grad_norm": 1.3695838451385498, + "learning_rate": 5.984500000000001e-05, + "loss": 0.3944, + "step": 11976 + }, + { + "epoch": 0.6706798073692463, + "grad_norm": 1.2205719947814941, + "learning_rate": 5.9850000000000005e-05, + "loss": 0.3951, + "step": 11977 + }, + { + "epoch": 0.6707358046813753, + "grad_norm": 1.076446294784546, + "learning_rate": 5.985500000000001e-05, + "loss": 0.3704, + "step": 11978 + }, + { + "epoch": 0.6707918019935043, + "grad_norm": 1.4320250749588013, + "learning_rate": 5.9860000000000006e-05, + "loss": 0.5801, + "step": 11979 + }, + { + 
"epoch": 0.6708477993056333, + "grad_norm": 1.3300652503967285, + "learning_rate": 5.9865000000000004e-05, + "loss": 0.3682, + "step": 11980 + }, + { + "epoch": 0.6709037966177623, + "grad_norm": 1.949277400970459, + "learning_rate": 5.987e-05, + "loss": 0.4824, + "step": 11981 + }, + { + "epoch": 0.6709597939298914, + "grad_norm": 1.1415592432022095, + "learning_rate": 5.9875000000000005e-05, + "loss": 0.4576, + "step": 11982 + }, + { + "epoch": 0.6710157912420204, + "grad_norm": 1.2905715703964233, + "learning_rate": 5.988e-05, + "loss": 0.425, + "step": 11983 + }, + { + "epoch": 0.6710717885541494, + "grad_norm": 1.4143505096435547, + "learning_rate": 5.9885e-05, + "loss": 0.4279, + "step": 11984 + }, + { + "epoch": 0.6711277858662784, + "grad_norm": 1.546417474746704, + "learning_rate": 5.9890000000000004e-05, + "loss": 0.4881, + "step": 11985 + }, + { + "epoch": 0.6711837831784074, + "grad_norm": 1.5658057928085327, + "learning_rate": 5.9895e-05, + "loss": 0.4668, + "step": 11986 + }, + { + "epoch": 0.6712397804905365, + "grad_norm": 1.1546103954315186, + "learning_rate": 5.99e-05, + "loss": 0.4428, + "step": 11987 + }, + { + "epoch": 0.6712957778026655, + "grad_norm": 1.3035434484481812, + "learning_rate": 5.9905e-05, + "loss": 0.4807, + "step": 11988 + }, + { + "epoch": 0.6713517751147945, + "grad_norm": 1.1266698837280273, + "learning_rate": 5.991e-05, + "loss": 0.4117, + "step": 11989 + }, + { + "epoch": 0.6714077724269235, + "grad_norm": 1.3537970781326294, + "learning_rate": 5.9915e-05, + "loss": 0.4936, + "step": 11990 + }, + { + "epoch": 0.6714637697390525, + "grad_norm": 1.3205552101135254, + "learning_rate": 5.9919999999999996e-05, + "loss": 0.4109, + "step": 11991 + }, + { + "epoch": 0.6715197670511815, + "grad_norm": 1.755386471748352, + "learning_rate": 5.9925e-05, + "loss": 0.5293, + "step": 11992 + }, + { + "epoch": 0.6715757643633106, + "grad_norm": 1.450015902519226, + "learning_rate": 5.993000000000001e-05, + "loss": 0.4599, + "step": 11993 + 
}, + { + "epoch": 0.6716317616754396, + "grad_norm": 1.3256274461746216, + "learning_rate": 5.993500000000001e-05, + "loss": 0.4843, + "step": 11994 + }, + { + "epoch": 0.6716877589875686, + "grad_norm": 1.287933349609375, + "learning_rate": 5.9940000000000005e-05, + "loss": 0.36, + "step": 11995 + }, + { + "epoch": 0.6717437562996976, + "grad_norm": 1.2362840175628662, + "learning_rate": 5.994500000000001e-05, + "loss": 0.5034, + "step": 11996 + }, + { + "epoch": 0.6717997536118266, + "grad_norm": 1.6679575443267822, + "learning_rate": 5.995000000000001e-05, + "loss": 0.4632, + "step": 11997 + }, + { + "epoch": 0.6718557509239557, + "grad_norm": 1.4457893371582031, + "learning_rate": 5.9955000000000004e-05, + "loss": 0.6138, + "step": 11998 + }, + { + "epoch": 0.6719117482360847, + "grad_norm": 1.1589275598526, + "learning_rate": 5.996e-05, + "loss": 0.3804, + "step": 11999 + }, + { + "epoch": 0.6719677455482137, + "grad_norm": 1.1779927015304565, + "learning_rate": 5.9965000000000006e-05, + "loss": 0.4292, + "step": 12000 + }, + { + "epoch": 0.6720237428603427, + "grad_norm": 1.5580350160598755, + "learning_rate": 5.9970000000000004e-05, + "loss": 0.3699, + "step": 12001 + }, + { + "epoch": 0.6720797401724717, + "grad_norm": 1.4929412603378296, + "learning_rate": 5.9975e-05, + "loss": 0.4857, + "step": 12002 + }, + { + "epoch": 0.6721357374846008, + "grad_norm": 1.3599839210510254, + "learning_rate": 5.9980000000000005e-05, + "loss": 0.5874, + "step": 12003 + }, + { + "epoch": 0.6721917347967298, + "grad_norm": 1.2680851221084595, + "learning_rate": 5.9985e-05, + "loss": 0.4742, + "step": 12004 + }, + { + "epoch": 0.6722477321088588, + "grad_norm": 1.1076149940490723, + "learning_rate": 5.999e-05, + "loss": 0.3154, + "step": 12005 + }, + { + "epoch": 0.6723037294209878, + "grad_norm": 1.2600069046020508, + "learning_rate": 5.9995000000000004e-05, + "loss": 0.342, + "step": 12006 + }, + { + "epoch": 0.6723597267331168, + "grad_norm": 1.2424794435501099, + 
"learning_rate": 6e-05, + "loss": 0.4882, + "step": 12007 + }, + { + "epoch": 0.6724157240452459, + "grad_norm": 1.4509917497634888, + "learning_rate": 6.0005e-05, + "loss": 0.3048, + "step": 12008 + }, + { + "epoch": 0.6724717213573749, + "grad_norm": 1.324674367904663, + "learning_rate": 6.0009999999999996e-05, + "loss": 0.395, + "step": 12009 + }, + { + "epoch": 0.6725277186695039, + "grad_norm": 1.259440541267395, + "learning_rate": 6.0015e-05, + "loss": 0.4889, + "step": 12010 + }, + { + "epoch": 0.6725837159816329, + "grad_norm": 1.3536385297775269, + "learning_rate": 6.002e-05, + "loss": 0.498, + "step": 12011 + }, + { + "epoch": 0.6726397132937619, + "grad_norm": 1.317745566368103, + "learning_rate": 6.0024999999999995e-05, + "loss": 0.4705, + "step": 12012 + }, + { + "epoch": 0.672695710605891, + "grad_norm": 1.2811329364776611, + "learning_rate": 6.003e-05, + "loss": 0.4175, + "step": 12013 + }, + { + "epoch": 0.67275170791802, + "grad_norm": 1.3161181211471558, + "learning_rate": 6.003500000000001e-05, + "loss": 0.5964, + "step": 12014 + }, + { + "epoch": 0.672807705230149, + "grad_norm": 1.2640495300292969, + "learning_rate": 6.004000000000001e-05, + "loss": 0.4612, + "step": 12015 + }, + { + "epoch": 0.672863702542278, + "grad_norm": 1.3181904554367065, + "learning_rate": 6.0045000000000005e-05, + "loss": 0.3851, + "step": 12016 + }, + { + "epoch": 0.672919699854407, + "grad_norm": 1.3061329126358032, + "learning_rate": 6.005000000000001e-05, + "loss": 0.5407, + "step": 12017 + }, + { + "epoch": 0.672975697166536, + "grad_norm": 1.2655748128890991, + "learning_rate": 6.005500000000001e-05, + "loss": 0.3327, + "step": 12018 + }, + { + "epoch": 0.6730316944786651, + "grad_norm": 1.140716552734375, + "learning_rate": 6.0060000000000004e-05, + "loss": 0.4395, + "step": 12019 + }, + { + "epoch": 0.6730876917907941, + "grad_norm": 1.422457218170166, + "learning_rate": 6.0065e-05, + "loss": 0.6095, + "step": 12020 + }, + { + "epoch": 0.6731436891029231, + 
"grad_norm": 1.1209276914596558, + "learning_rate": 6.0070000000000006e-05, + "loss": 0.3438, + "step": 12021 + }, + { + "epoch": 0.6731996864150521, + "grad_norm": 1.7475590705871582, + "learning_rate": 6.0075e-05, + "loss": 0.5939, + "step": 12022 + }, + { + "epoch": 0.6732556837271811, + "grad_norm": 1.2284224033355713, + "learning_rate": 6.008e-05, + "loss": 0.3576, + "step": 12023 + }, + { + "epoch": 0.6733116810393102, + "grad_norm": 1.2152773141860962, + "learning_rate": 6.0085000000000005e-05, + "loss": 0.432, + "step": 12024 + }, + { + "epoch": 0.6733676783514392, + "grad_norm": 1.2505340576171875, + "learning_rate": 6.009e-05, + "loss": 0.3708, + "step": 12025 + }, + { + "epoch": 0.6734236756635682, + "grad_norm": 1.2982544898986816, + "learning_rate": 6.0095e-05, + "loss": 0.5278, + "step": 12026 + }, + { + "epoch": 0.6734796729756972, + "grad_norm": 1.1628079414367676, + "learning_rate": 6.0100000000000004e-05, + "loss": 0.3659, + "step": 12027 + }, + { + "epoch": 0.6735356702878262, + "grad_norm": 1.8002924919128418, + "learning_rate": 6.0105e-05, + "loss": 0.8369, + "step": 12028 + }, + { + "epoch": 0.6735916675999553, + "grad_norm": 1.223206877708435, + "learning_rate": 6.011e-05, + "loss": 0.5043, + "step": 12029 + }, + { + "epoch": 0.6736476649120842, + "grad_norm": 1.3584004640579224, + "learning_rate": 6.0114999999999996e-05, + "loss": 0.4154, + "step": 12030 + }, + { + "epoch": 0.6737036622242132, + "grad_norm": 1.5717308521270752, + "learning_rate": 6.012e-05, + "loss": 0.4392, + "step": 12031 + }, + { + "epoch": 0.6737596595363422, + "grad_norm": 1.321874737739563, + "learning_rate": 6.0125e-05, + "loss": 0.3455, + "step": 12032 + }, + { + "epoch": 0.6738156568484712, + "grad_norm": 1.0951192378997803, + "learning_rate": 6.0129999999999995e-05, + "loss": 0.3107, + "step": 12033 + }, + { + "epoch": 0.6738716541606002, + "grad_norm": 1.5254805088043213, + "learning_rate": 6.0135000000000006e-05, + "loss": 0.3985, + "step": 12034 + }, + { + 
"epoch": 0.6739276514727293, + "grad_norm": 1.4667788743972778, + "learning_rate": 6.014000000000001e-05, + "loss": 0.4162, + "step": 12035 + }, + { + "epoch": 0.6739836487848583, + "grad_norm": 1.4369940757751465, + "learning_rate": 6.014500000000001e-05, + "loss": 0.6691, + "step": 12036 + }, + { + "epoch": 0.6740396460969873, + "grad_norm": 1.646856427192688, + "learning_rate": 6.0150000000000005e-05, + "loss": 0.392, + "step": 12037 + }, + { + "epoch": 0.6740956434091163, + "grad_norm": 1.3220666646957397, + "learning_rate": 6.0155e-05, + "loss": 0.4313, + "step": 12038 + }, + { + "epoch": 0.6741516407212453, + "grad_norm": 1.4972738027572632, + "learning_rate": 6.016000000000001e-05, + "loss": 0.492, + "step": 12039 + }, + { + "epoch": 0.6742076380333744, + "grad_norm": 1.316117763519287, + "learning_rate": 6.0165000000000004e-05, + "loss": 0.5395, + "step": 12040 + }, + { + "epoch": 0.6742636353455034, + "grad_norm": 1.4296866655349731, + "learning_rate": 6.017e-05, + "loss": 0.4829, + "step": 12041 + }, + { + "epoch": 0.6743196326576324, + "grad_norm": 1.4425750970840454, + "learning_rate": 6.0175000000000006e-05, + "loss": 0.3922, + "step": 12042 + }, + { + "epoch": 0.6743756299697614, + "grad_norm": 1.1359617710113525, + "learning_rate": 6.018e-05, + "loss": 0.4163, + "step": 12043 + }, + { + "epoch": 0.6744316272818904, + "grad_norm": 1.3443559408187866, + "learning_rate": 6.0185e-05, + "loss": 0.4645, + "step": 12044 + }, + { + "epoch": 0.6744876245940195, + "grad_norm": 1.6530473232269287, + "learning_rate": 6.0190000000000005e-05, + "loss": 0.4853, + "step": 12045 + }, + { + "epoch": 0.6745436219061485, + "grad_norm": 1.2223633527755737, + "learning_rate": 6.0195e-05, + "loss": 0.4603, + "step": 12046 + }, + { + "epoch": 0.6745996192182775, + "grad_norm": 1.292128086090088, + "learning_rate": 6.02e-05, + "loss": 0.4506, + "step": 12047 + }, + { + "epoch": 0.6746556165304065, + "grad_norm": 1.9163436889648438, + "learning_rate": 6.0205e-05, + "loss": 
0.6832, + "step": 12048 + }, + { + "epoch": 0.6747116138425355, + "grad_norm": 1.1957800388336182, + "learning_rate": 6.021e-05, + "loss": 0.327, + "step": 12049 + }, + { + "epoch": 0.6747676111546645, + "grad_norm": 1.2962968349456787, + "learning_rate": 6.0215e-05, + "loss": 0.5793, + "step": 12050 + }, + { + "epoch": 0.6748236084667936, + "grad_norm": 1.2703187465667725, + "learning_rate": 6.0219999999999996e-05, + "loss": 0.3853, + "step": 12051 + }, + { + "epoch": 0.6748796057789226, + "grad_norm": 1.4209612607955933, + "learning_rate": 6.0225e-05, + "loss": 0.5524, + "step": 12052 + }, + { + "epoch": 0.6749356030910516, + "grad_norm": 1.231265902519226, + "learning_rate": 6.023e-05, + "loss": 0.5274, + "step": 12053 + }, + { + "epoch": 0.6749916004031806, + "grad_norm": 1.2571219205856323, + "learning_rate": 6.023500000000001e-05, + "loss": 0.422, + "step": 12054 + }, + { + "epoch": 0.6750475977153096, + "grad_norm": 1.330564022064209, + "learning_rate": 6.0240000000000006e-05, + "loss": 0.4988, + "step": 12055 + }, + { + "epoch": 0.6751035950274387, + "grad_norm": 1.2424752712249756, + "learning_rate": 6.024500000000001e-05, + "loss": 0.5119, + "step": 12056 + }, + { + "epoch": 0.6751595923395677, + "grad_norm": 1.1252355575561523, + "learning_rate": 6.025000000000001e-05, + "loss": 0.3737, + "step": 12057 + }, + { + "epoch": 0.6752155896516967, + "grad_norm": 1.6888601779937744, + "learning_rate": 6.0255000000000005e-05, + "loss": 0.5231, + "step": 12058 + }, + { + "epoch": 0.6752715869638257, + "grad_norm": 1.2207615375518799, + "learning_rate": 6.026e-05, + "loss": 0.3587, + "step": 12059 + }, + { + "epoch": 0.6753275842759547, + "grad_norm": 1.3332748413085938, + "learning_rate": 6.0265000000000007e-05, + "loss": 0.4438, + "step": 12060 + }, + { + "epoch": 0.6753835815880838, + "grad_norm": 1.7057472467422485, + "learning_rate": 6.0270000000000004e-05, + "loss": 0.6373, + "step": 12061 + }, + { + "epoch": 0.6754395789002128, + "grad_norm": 
1.2983251810073853, + "learning_rate": 6.0275e-05, + "loss": 0.5587, + "step": 12062 + }, + { + "epoch": 0.6754955762123418, + "grad_norm": 1.5316787958145142, + "learning_rate": 6.0280000000000006e-05, + "loss": 0.4676, + "step": 12063 + }, + { + "epoch": 0.6755515735244708, + "grad_norm": 1.215063214302063, + "learning_rate": 6.0285e-05, + "loss": 0.4541, + "step": 12064 + }, + { + "epoch": 0.6756075708365998, + "grad_norm": 1.2608975172042847, + "learning_rate": 6.029e-05, + "loss": 0.5397, + "step": 12065 + }, + { + "epoch": 0.6756635681487289, + "grad_norm": 1.618356704711914, + "learning_rate": 6.0295000000000005e-05, + "loss": 0.5113, + "step": 12066 + }, + { + "epoch": 0.6757195654608579, + "grad_norm": 1.4438884258270264, + "learning_rate": 6.03e-05, + "loss": 0.4424, + "step": 12067 + }, + { + "epoch": 0.6757755627729869, + "grad_norm": 1.1770647764205933, + "learning_rate": 6.0305e-05, + "loss": 0.3894, + "step": 12068 + }, + { + "epoch": 0.6758315600851159, + "grad_norm": 1.245314121246338, + "learning_rate": 6.031e-05, + "loss": 0.4122, + "step": 12069 + }, + { + "epoch": 0.6758875573972449, + "grad_norm": 1.1871123313903809, + "learning_rate": 6.0315e-05, + "loss": 0.4421, + "step": 12070 + }, + { + "epoch": 0.675943554709374, + "grad_norm": 1.7059030532836914, + "learning_rate": 6.032e-05, + "loss": 0.4171, + "step": 12071 + }, + { + "epoch": 0.675999552021503, + "grad_norm": 1.3483079671859741, + "learning_rate": 6.0324999999999996e-05, + "loss": 0.4056, + "step": 12072 + }, + { + "epoch": 0.676055549333632, + "grad_norm": 1.364949345588684, + "learning_rate": 6.033e-05, + "loss": 0.3944, + "step": 12073 + }, + { + "epoch": 0.676111546645761, + "grad_norm": 1.3981837034225464, + "learning_rate": 6.033500000000001e-05, + "loss": 0.398, + "step": 12074 + }, + { + "epoch": 0.67616754395789, + "grad_norm": 1.3125524520874023, + "learning_rate": 6.034000000000001e-05, + "loss": 0.4499, + "step": 12075 + }, + { + "epoch": 0.676223541270019, + "grad_norm": 
1.201357364654541, + "learning_rate": 6.0345000000000006e-05, + "loss": 0.4868, + "step": 12076 + }, + { + "epoch": 0.6762795385821481, + "grad_norm": 1.6828097105026245, + "learning_rate": 6.035e-05, + "loss": 0.5972, + "step": 12077 + }, + { + "epoch": 0.6763355358942771, + "grad_norm": 1.5219569206237793, + "learning_rate": 6.035500000000001e-05, + "loss": 0.4371, + "step": 12078 + }, + { + "epoch": 0.6763915332064061, + "grad_norm": 1.4877394437789917, + "learning_rate": 6.0360000000000005e-05, + "loss": 0.4413, + "step": 12079 + }, + { + "epoch": 0.6764475305185351, + "grad_norm": 1.6040446758270264, + "learning_rate": 6.0365e-05, + "loss": 0.4621, + "step": 12080 + }, + { + "epoch": 0.6765035278306641, + "grad_norm": 1.199930191040039, + "learning_rate": 6.0370000000000006e-05, + "loss": 0.4529, + "step": 12081 + }, + { + "epoch": 0.6765595251427932, + "grad_norm": 1.2376455068588257, + "learning_rate": 6.0375000000000004e-05, + "loss": 0.4784, + "step": 12082 + }, + { + "epoch": 0.6766155224549222, + "grad_norm": 1.2274621725082397, + "learning_rate": 6.038e-05, + "loss": 0.3986, + "step": 12083 + }, + { + "epoch": 0.6766715197670512, + "grad_norm": 1.4890775680541992, + "learning_rate": 6.0385000000000005e-05, + "loss": 0.5813, + "step": 12084 + }, + { + "epoch": 0.6767275170791802, + "grad_norm": 1.232393741607666, + "learning_rate": 6.039e-05, + "loss": 0.4071, + "step": 12085 + }, + { + "epoch": 0.6767835143913092, + "grad_norm": 1.3618180751800537, + "learning_rate": 6.0395e-05, + "loss": 0.4631, + "step": 12086 + }, + { + "epoch": 0.6768395117034383, + "grad_norm": 1.5815465450286865, + "learning_rate": 6.04e-05, + "loss": 0.6354, + "step": 12087 + }, + { + "epoch": 0.6768955090155673, + "grad_norm": 1.2645756006240845, + "learning_rate": 6.0405e-05, + "loss": 0.427, + "step": 12088 + }, + { + "epoch": 0.6769515063276963, + "grad_norm": 1.0445808172225952, + "learning_rate": 6.041e-05, + "loss": 0.332, + "step": 12089 + }, + { + "epoch": 
0.6770075036398253, + "grad_norm": 1.3043770790100098, + "learning_rate": 6.0415e-05, + "loss": 0.6544, + "step": 12090 + }, + { + "epoch": 0.6770635009519543, + "grad_norm": 1.4303696155548096, + "learning_rate": 6.042e-05, + "loss": 0.4022, + "step": 12091 + }, + { + "epoch": 0.6771194982640834, + "grad_norm": 1.2758795022964478, + "learning_rate": 6.0425e-05, + "loss": 0.4598, + "step": 12092 + }, + { + "epoch": 0.6771754955762124, + "grad_norm": 1.3888673782348633, + "learning_rate": 6.0429999999999996e-05, + "loss": 0.4979, + "step": 12093 + }, + { + "epoch": 0.6772314928883414, + "grad_norm": 1.3244187831878662, + "learning_rate": 6.043500000000001e-05, + "loss": 0.4616, + "step": 12094 + }, + { + "epoch": 0.6772874902004704, + "grad_norm": 1.1622129678726196, + "learning_rate": 6.044000000000001e-05, + "loss": 0.4378, + "step": 12095 + }, + { + "epoch": 0.6773434875125994, + "grad_norm": 1.3151376247406006, + "learning_rate": 6.044500000000001e-05, + "loss": 0.3957, + "step": 12096 + }, + { + "epoch": 0.6773994848247284, + "grad_norm": 4.065365314483643, + "learning_rate": 6.0450000000000006e-05, + "loss": 0.4165, + "step": 12097 + }, + { + "epoch": 0.6774554821368575, + "grad_norm": 1.9255039691925049, + "learning_rate": 6.0455e-05, + "loss": 0.5765, + "step": 12098 + }, + { + "epoch": 0.6775114794489865, + "grad_norm": 1.1353352069854736, + "learning_rate": 6.046000000000001e-05, + "loss": 0.507, + "step": 12099 + }, + { + "epoch": 0.6775674767611155, + "grad_norm": 1.3141318559646606, + "learning_rate": 6.0465000000000005e-05, + "loss": 0.4241, + "step": 12100 + }, + { + "epoch": 0.6776234740732445, + "grad_norm": 1.657446265220642, + "learning_rate": 6.047e-05, + "loss": 0.4933, + "step": 12101 + }, + { + "epoch": 0.6776794713853735, + "grad_norm": 1.1255770921707153, + "learning_rate": 6.0475000000000006e-05, + "loss": 0.3442, + "step": 12102 + }, + { + "epoch": 0.6777354686975026, + "grad_norm": 1.3445061445236206, + "learning_rate": 
6.0480000000000004e-05, + "loss": 0.4012, + "step": 12103 + }, + { + "epoch": 0.6777914660096316, + "grad_norm": 1.5071191787719727, + "learning_rate": 6.0485e-05, + "loss": 0.431, + "step": 12104 + }, + { + "epoch": 0.6778474633217606, + "grad_norm": 1.302628517150879, + "learning_rate": 6.0490000000000005e-05, + "loss": 0.3635, + "step": 12105 + }, + { + "epoch": 0.6779034606338896, + "grad_norm": 1.3854845762252808, + "learning_rate": 6.0495e-05, + "loss": 0.5189, + "step": 12106 + }, + { + "epoch": 0.6779594579460186, + "grad_norm": 1.2401306629180908, + "learning_rate": 6.05e-05, + "loss": 0.4829, + "step": 12107 + }, + { + "epoch": 0.6780154552581477, + "grad_norm": 1.5215063095092773, + "learning_rate": 6.0505e-05, + "loss": 0.5669, + "step": 12108 + }, + { + "epoch": 0.6780714525702767, + "grad_norm": 1.374367594718933, + "learning_rate": 6.051e-05, + "loss": 0.5544, + "step": 12109 + }, + { + "epoch": 0.6781274498824057, + "grad_norm": 1.4461313486099243, + "learning_rate": 6.0515e-05, + "loss": 0.5488, + "step": 12110 + }, + { + "epoch": 0.6781834471945347, + "grad_norm": 1.4633358716964722, + "learning_rate": 6.0519999999999997e-05, + "loss": 0.594, + "step": 12111 + }, + { + "epoch": 0.6782394445066637, + "grad_norm": 1.371083378791809, + "learning_rate": 6.0525e-05, + "loss": 0.4453, + "step": 12112 + }, + { + "epoch": 0.6782954418187926, + "grad_norm": 1.2095295190811157, + "learning_rate": 6.053e-05, + "loss": 0.34, + "step": 12113 + }, + { + "epoch": 0.6783514391309217, + "grad_norm": 1.2679212093353271, + "learning_rate": 6.053500000000001e-05, + "loss": 0.4288, + "step": 12114 + }, + { + "epoch": 0.6784074364430507, + "grad_norm": 1.5355103015899658, + "learning_rate": 6.0540000000000007e-05, + "loss": 0.5017, + "step": 12115 + }, + { + "epoch": 0.6784634337551797, + "grad_norm": 1.0712206363677979, + "learning_rate": 6.0545000000000004e-05, + "loss": 0.3312, + "step": 12116 + }, + { + "epoch": 0.6785194310673087, + "grad_norm": 
1.1691625118255615, + "learning_rate": 6.055000000000001e-05, + "loss": 0.4556, + "step": 12117 + }, + { + "epoch": 0.6785754283794377, + "grad_norm": 1.2647243738174438, + "learning_rate": 6.0555000000000006e-05, + "loss": 0.3252, + "step": 12118 + }, + { + "epoch": 0.6786314256915668, + "grad_norm": 1.303964614868164, + "learning_rate": 6.056e-05, + "loss": 0.5992, + "step": 12119 + }, + { + "epoch": 0.6786874230036958, + "grad_norm": 1.539599895477295, + "learning_rate": 6.056500000000001e-05, + "loss": 0.5647, + "step": 12120 + }, + { + "epoch": 0.6787434203158248, + "grad_norm": 1.6583836078643799, + "learning_rate": 6.0570000000000005e-05, + "loss": 0.5323, + "step": 12121 + }, + { + "epoch": 0.6787994176279538, + "grad_norm": 1.4732065200805664, + "learning_rate": 6.0575e-05, + "loss": 0.4887, + "step": 12122 + }, + { + "epoch": 0.6788554149400828, + "grad_norm": 1.8859001398086548, + "learning_rate": 6.0580000000000006e-05, + "loss": 0.6046, + "step": 12123 + }, + { + "epoch": 0.6789114122522119, + "grad_norm": 1.1522440910339355, + "learning_rate": 6.0585000000000004e-05, + "loss": 0.385, + "step": 12124 + }, + { + "epoch": 0.6789674095643409, + "grad_norm": 1.0091975927352905, + "learning_rate": 6.059e-05, + "loss": 0.3164, + "step": 12125 + }, + { + "epoch": 0.6790234068764699, + "grad_norm": 1.124294638633728, + "learning_rate": 6.0595000000000005e-05, + "loss": 0.3647, + "step": 12126 + }, + { + "epoch": 0.6790794041885989, + "grad_norm": 1.23323392868042, + "learning_rate": 6.06e-05, + "loss": 0.4144, + "step": 12127 + }, + { + "epoch": 0.6791354015007279, + "grad_norm": 1.3646094799041748, + "learning_rate": 6.0605e-05, + "loss": 0.7289, + "step": 12128 + }, + { + "epoch": 0.679191398812857, + "grad_norm": 1.2699626684188843, + "learning_rate": 6.061e-05, + "loss": 0.429, + "step": 12129 + }, + { + "epoch": 0.679247396124986, + "grad_norm": 1.2049400806427002, + "learning_rate": 6.0615e-05, + "loss": 0.4621, + "step": 12130 + }, + { + "epoch": 
0.679303393437115, + "grad_norm": 1.487858533859253, + "learning_rate": 6.062e-05, + "loss": 0.4598, + "step": 12131 + }, + { + "epoch": 0.679359390749244, + "grad_norm": 1.4971121549606323, + "learning_rate": 6.0624999999999996e-05, + "loss": 0.508, + "step": 12132 + }, + { + "epoch": 0.679415388061373, + "grad_norm": 1.1997946500778198, + "learning_rate": 6.063e-05, + "loss": 0.4751, + "step": 12133 + }, + { + "epoch": 0.679471385373502, + "grad_norm": 1.2344303131103516, + "learning_rate": 6.0635e-05, + "loss": 0.4005, + "step": 12134 + }, + { + "epoch": 0.6795273826856311, + "grad_norm": 1.1131056547164917, + "learning_rate": 6.064000000000001e-05, + "loss": 0.3827, + "step": 12135 + }, + { + "epoch": 0.6795833799977601, + "grad_norm": 1.1705071926116943, + "learning_rate": 6.0645000000000006e-05, + "loss": 0.3733, + "step": 12136 + }, + { + "epoch": 0.6796393773098891, + "grad_norm": 1.2471826076507568, + "learning_rate": 6.0650000000000004e-05, + "loss": 0.4472, + "step": 12137 + }, + { + "epoch": 0.6796953746220181, + "grad_norm": 1.3577433824539185, + "learning_rate": 6.065500000000001e-05, + "loss": 0.4631, + "step": 12138 + }, + { + "epoch": 0.6797513719341471, + "grad_norm": 1.1097514629364014, + "learning_rate": 6.0660000000000005e-05, + "loss": 0.3606, + "step": 12139 + }, + { + "epoch": 0.6798073692462762, + "grad_norm": 1.6002696752548218, + "learning_rate": 6.0665e-05, + "loss": 0.4334, + "step": 12140 + }, + { + "epoch": 0.6798633665584052, + "grad_norm": 1.2866092920303345, + "learning_rate": 6.067000000000001e-05, + "loss": 0.4569, + "step": 12141 + }, + { + "epoch": 0.6799193638705342, + "grad_norm": 1.1357052326202393, + "learning_rate": 6.0675000000000004e-05, + "loss": 0.4529, + "step": 12142 + }, + { + "epoch": 0.6799753611826632, + "grad_norm": 1.271215796470642, + "learning_rate": 6.068e-05, + "loss": 0.3355, + "step": 12143 + }, + { + "epoch": 0.6800313584947922, + "grad_norm": 1.1626436710357666, + "learning_rate": 
6.0685000000000006e-05, + "loss": 0.4175, + "step": 12144 + }, + { + "epoch": 0.6800873558069213, + "grad_norm": 1.1848807334899902, + "learning_rate": 6.069e-05, + "loss": 0.4359, + "step": 12145 + }, + { + "epoch": 0.6801433531190503, + "grad_norm": 1.495087742805481, + "learning_rate": 6.0695e-05, + "loss": 0.4236, + "step": 12146 + }, + { + "epoch": 0.6801993504311793, + "grad_norm": 1.203644037246704, + "learning_rate": 6.07e-05, + "loss": 0.4306, + "step": 12147 + }, + { + "epoch": 0.6802553477433083, + "grad_norm": 1.1470236778259277, + "learning_rate": 6.0705e-05, + "loss": 0.4808, + "step": 12148 + }, + { + "epoch": 0.6803113450554373, + "grad_norm": 1.2754034996032715, + "learning_rate": 6.071e-05, + "loss": 0.496, + "step": 12149 + }, + { + "epoch": 0.6803673423675664, + "grad_norm": 1.3479679822921753, + "learning_rate": 6.0715e-05, + "loss": 0.6041, + "step": 12150 + }, + { + "epoch": 0.6804233396796954, + "grad_norm": 1.2230275869369507, + "learning_rate": 6.072e-05, + "loss": 0.3986, + "step": 12151 + }, + { + "epoch": 0.6804793369918244, + "grad_norm": 1.4950847625732422, + "learning_rate": 6.0725e-05, + "loss": 0.4485, + "step": 12152 + }, + { + "epoch": 0.6805353343039534, + "grad_norm": 1.4482049942016602, + "learning_rate": 6.0729999999999996e-05, + "loss": 0.4365, + "step": 12153 + }, + { + "epoch": 0.6805913316160824, + "grad_norm": 1.2511510848999023, + "learning_rate": 6.0735e-05, + "loss": 0.5161, + "step": 12154 + }, + { + "epoch": 0.6806473289282114, + "grad_norm": 1.4707467555999756, + "learning_rate": 6.074000000000001e-05, + "loss": 0.4518, + "step": 12155 + }, + { + "epoch": 0.6807033262403405, + "grad_norm": 1.0941394567489624, + "learning_rate": 6.074500000000001e-05, + "loss": 0.3309, + "step": 12156 + }, + { + "epoch": 0.6807593235524695, + "grad_norm": 1.1759692430496216, + "learning_rate": 6.0750000000000006e-05, + "loss": 0.3841, + "step": 12157 + }, + { + "epoch": 0.6808153208645985, + "grad_norm": 1.0564043521881104, + 
"learning_rate": 6.0755000000000004e-05, + "loss": 0.4067, + "step": 12158 + }, + { + "epoch": 0.6808713181767275, + "grad_norm": 1.4290287494659424, + "learning_rate": 6.076000000000001e-05, + "loss": 0.4944, + "step": 12159 + }, + { + "epoch": 0.6809273154888565, + "grad_norm": 1.222448468208313, + "learning_rate": 6.0765000000000005e-05, + "loss": 0.3964, + "step": 12160 + }, + { + "epoch": 0.6809833128009856, + "grad_norm": 1.0129196643829346, + "learning_rate": 6.077e-05, + "loss": 0.4556, + "step": 12161 + }, + { + "epoch": 0.6810393101131146, + "grad_norm": 1.4554996490478516, + "learning_rate": 6.077500000000001e-05, + "loss": 0.3385, + "step": 12162 + }, + { + "epoch": 0.6810953074252436, + "grad_norm": 1.183143973350525, + "learning_rate": 6.0780000000000004e-05, + "loss": 0.4086, + "step": 12163 + }, + { + "epoch": 0.6811513047373726, + "grad_norm": 1.2586287260055542, + "learning_rate": 6.0785e-05, + "loss": 0.3804, + "step": 12164 + }, + { + "epoch": 0.6812073020495016, + "grad_norm": 1.4544435739517212, + "learning_rate": 6.0790000000000006e-05, + "loss": 0.3895, + "step": 12165 + }, + { + "epoch": 0.6812632993616307, + "grad_norm": 1.2856827974319458, + "learning_rate": 6.0795e-05, + "loss": 0.4406, + "step": 12166 + }, + { + "epoch": 0.6813192966737597, + "grad_norm": 1.5319968461990356, + "learning_rate": 6.08e-05, + "loss": 0.4159, + "step": 12167 + }, + { + "epoch": 0.6813752939858887, + "grad_norm": 1.0890932083129883, + "learning_rate": 6.0805e-05, + "loss": 0.3229, + "step": 12168 + }, + { + "epoch": 0.6814312912980177, + "grad_norm": 1.1322203874588013, + "learning_rate": 6.081e-05, + "loss": 0.3951, + "step": 12169 + }, + { + "epoch": 0.6814872886101467, + "grad_norm": 1.456730604171753, + "learning_rate": 6.0815e-05, + "loss": 0.459, + "step": 12170 + }, + { + "epoch": 0.6815432859222758, + "grad_norm": 1.2333000898361206, + "learning_rate": 6.082e-05, + "loss": 0.3485, + "step": 12171 + }, + { + "epoch": 0.6815992832344048, + "grad_norm": 
1.2088539600372314, + "learning_rate": 6.0825e-05, + "loss": 0.4149, + "step": 12172 + }, + { + "epoch": 0.6816552805465338, + "grad_norm": 1.328800916671753, + "learning_rate": 6.083e-05, + "loss": 0.5339, + "step": 12173 + }, + { + "epoch": 0.6817112778586628, + "grad_norm": 1.2763824462890625, + "learning_rate": 6.0834999999999996e-05, + "loss": 0.4658, + "step": 12174 + }, + { + "epoch": 0.6817672751707918, + "grad_norm": 1.3026294708251953, + "learning_rate": 6.084000000000001e-05, + "loss": 0.4155, + "step": 12175 + }, + { + "epoch": 0.6818232724829209, + "grad_norm": 1.4607728719711304, + "learning_rate": 6.0845000000000004e-05, + "loss": 0.5532, + "step": 12176 + }, + { + "epoch": 0.6818792697950499, + "grad_norm": 1.214138150215149, + "learning_rate": 6.085000000000001e-05, + "loss": 0.4987, + "step": 12177 + }, + { + "epoch": 0.6819352671071789, + "grad_norm": 1.393033504486084, + "learning_rate": 6.0855000000000006e-05, + "loss": 0.3502, + "step": 12178 + }, + { + "epoch": 0.6819912644193079, + "grad_norm": 1.378004550933838, + "learning_rate": 6.0860000000000003e-05, + "loss": 0.4423, + "step": 12179 + }, + { + "epoch": 0.6820472617314369, + "grad_norm": 1.2123223543167114, + "learning_rate": 6.086500000000001e-05, + "loss": 0.3355, + "step": 12180 + }, + { + "epoch": 0.682103259043566, + "grad_norm": 1.2966917753219604, + "learning_rate": 6.0870000000000005e-05, + "loss": 0.4241, + "step": 12181 + }, + { + "epoch": 0.682159256355695, + "grad_norm": 1.3020904064178467, + "learning_rate": 6.0875e-05, + "loss": 0.4779, + "step": 12182 + }, + { + "epoch": 0.682215253667824, + "grad_norm": 1.3528094291687012, + "learning_rate": 6.088000000000001e-05, + "loss": 0.427, + "step": 12183 + }, + { + "epoch": 0.682271250979953, + "grad_norm": 1.0275715589523315, + "learning_rate": 6.0885000000000004e-05, + "loss": 0.3687, + "step": 12184 + }, + { + "epoch": 0.682327248292082, + "grad_norm": 1.3559116125106812, + "learning_rate": 6.089e-05, + "loss": 0.4377, + 
"step": 12185 + }, + { + "epoch": 0.682383245604211, + "grad_norm": 2.2175986766815186, + "learning_rate": 6.0895e-05, + "loss": 0.4603, + "step": 12186 + }, + { + "epoch": 0.6824392429163401, + "grad_norm": 1.3152580261230469, + "learning_rate": 6.09e-05, + "loss": 0.4857, + "step": 12187 + }, + { + "epoch": 0.6824952402284691, + "grad_norm": 1.3260953426361084, + "learning_rate": 6.0905e-05, + "loss": 0.4299, + "step": 12188 + }, + { + "epoch": 0.6825512375405981, + "grad_norm": 1.5903900861740112, + "learning_rate": 6.091e-05, + "loss": 0.6495, + "step": 12189 + }, + { + "epoch": 0.6826072348527271, + "grad_norm": 1.2022602558135986, + "learning_rate": 6.0915e-05, + "loss": 0.3198, + "step": 12190 + }, + { + "epoch": 0.6826632321648561, + "grad_norm": 1.2887011766433716, + "learning_rate": 6.092e-05, + "loss": 0.409, + "step": 12191 + }, + { + "epoch": 0.6827192294769852, + "grad_norm": 1.371610164642334, + "learning_rate": 6.0925e-05, + "loss": 0.4759, + "step": 12192 + }, + { + "epoch": 0.6827752267891142, + "grad_norm": 1.2783170938491821, + "learning_rate": 6.093e-05, + "loss": 0.4126, + "step": 12193 + }, + { + "epoch": 0.6828312241012432, + "grad_norm": 1.5165284872055054, + "learning_rate": 6.0935e-05, + "loss": 0.5952, + "step": 12194 + }, + { + "epoch": 0.6828872214133721, + "grad_norm": 1.315489411354065, + "learning_rate": 6.094000000000001e-05, + "loss": 0.4228, + "step": 12195 + }, + { + "epoch": 0.6829432187255011, + "grad_norm": 1.4239274263381958, + "learning_rate": 6.094500000000001e-05, + "loss": 0.5519, + "step": 12196 + }, + { + "epoch": 0.6829992160376301, + "grad_norm": 1.2774584293365479, + "learning_rate": 6.0950000000000004e-05, + "loss": 0.3841, + "step": 12197 + }, + { + "epoch": 0.6830552133497592, + "grad_norm": 1.2198424339294434, + "learning_rate": 6.095500000000001e-05, + "loss": 0.4392, + "step": 12198 + }, + { + "epoch": 0.6831112106618882, + "grad_norm": 1.5404493808746338, + "learning_rate": 6.0960000000000006e-05, + "loss": 
0.5669, + "step": 12199 + }, + { + "epoch": 0.6831672079740172, + "grad_norm": 1.1638553142547607, + "learning_rate": 6.0965e-05, + "loss": 0.3687, + "step": 12200 + }, + { + "epoch": 0.6832232052861462, + "grad_norm": 1.316155195236206, + "learning_rate": 6.097000000000001e-05, + "loss": 0.5432, + "step": 12201 + }, + { + "epoch": 0.6832792025982752, + "grad_norm": 1.062378168106079, + "learning_rate": 6.0975000000000005e-05, + "loss": 0.3574, + "step": 12202 + }, + { + "epoch": 0.6833351999104043, + "grad_norm": 1.0928003787994385, + "learning_rate": 6.098e-05, + "loss": 0.3067, + "step": 12203 + }, + { + "epoch": 0.6833911972225333, + "grad_norm": 1.4202882051467896, + "learning_rate": 6.0985000000000006e-05, + "loss": 0.4275, + "step": 12204 + }, + { + "epoch": 0.6834471945346623, + "grad_norm": 1.2420756816864014, + "learning_rate": 6.0990000000000004e-05, + "loss": 0.407, + "step": 12205 + }, + { + "epoch": 0.6835031918467913, + "grad_norm": 1.6382828950881958, + "learning_rate": 6.0995e-05, + "loss": 0.8044, + "step": 12206 + }, + { + "epoch": 0.6835591891589203, + "grad_norm": 1.3947349786758423, + "learning_rate": 6.1e-05, + "loss": 0.4024, + "step": 12207 + }, + { + "epoch": 0.6836151864710494, + "grad_norm": 1.2955342531204224, + "learning_rate": 6.1005e-05, + "loss": 0.5735, + "step": 12208 + }, + { + "epoch": 0.6836711837831784, + "grad_norm": 1.34746515750885, + "learning_rate": 6.101e-05, + "loss": 0.3859, + "step": 12209 + }, + { + "epoch": 0.6837271810953074, + "grad_norm": 1.415920376777649, + "learning_rate": 6.1015e-05, + "loss": 0.5651, + "step": 12210 + }, + { + "epoch": 0.6837831784074364, + "grad_norm": 1.3192853927612305, + "learning_rate": 6.102e-05, + "loss": 0.4725, + "step": 12211 + }, + { + "epoch": 0.6838391757195654, + "grad_norm": 1.2164310216903687, + "learning_rate": 6.1025e-05, + "loss": 0.4347, + "step": 12212 + }, + { + "epoch": 0.6838951730316944, + "grad_norm": 1.2607762813568115, + "learning_rate": 6.103e-05, + "loss": 
0.4457, + "step": 12213 + }, + { + "epoch": 0.6839511703438235, + "grad_norm": 1.4990837574005127, + "learning_rate": 6.1035e-05, + "loss": 0.5186, + "step": 12214 + }, + { + "epoch": 0.6840071676559525, + "grad_norm": 1.424031376838684, + "learning_rate": 6.104000000000001e-05, + "loss": 0.4896, + "step": 12215 + }, + { + "epoch": 0.6840631649680815, + "grad_norm": 1.4935369491577148, + "learning_rate": 6.104500000000001e-05, + "loss": 0.4456, + "step": 12216 + }, + { + "epoch": 0.6841191622802105, + "grad_norm": 1.1517411470413208, + "learning_rate": 6.105e-05, + "loss": 0.4245, + "step": 12217 + }, + { + "epoch": 0.6841751595923395, + "grad_norm": 1.3826349973678589, + "learning_rate": 6.1055e-05, + "loss": 0.5045, + "step": 12218 + }, + { + "epoch": 0.6842311569044686, + "grad_norm": 2.0401902198791504, + "learning_rate": 6.106e-05, + "loss": 0.5802, + "step": 12219 + }, + { + "epoch": 0.6842871542165976, + "grad_norm": 1.386125922203064, + "learning_rate": 6.1065e-05, + "loss": 0.3533, + "step": 12220 + }, + { + "epoch": 0.6843431515287266, + "grad_norm": 1.3331139087677002, + "learning_rate": 6.107000000000001e-05, + "loss": 0.5166, + "step": 12221 + }, + { + "epoch": 0.6843991488408556, + "grad_norm": 1.130191683769226, + "learning_rate": 6.107500000000001e-05, + "loss": 0.4323, + "step": 12222 + }, + { + "epoch": 0.6844551461529846, + "grad_norm": 1.1950807571411133, + "learning_rate": 6.108e-05, + "loss": 0.4142, + "step": 12223 + }, + { + "epoch": 0.6845111434651137, + "grad_norm": 1.4747769832611084, + "learning_rate": 6.1085e-05, + "loss": 0.505, + "step": 12224 + }, + { + "epoch": 0.6845671407772427, + "grad_norm": 2.184282064437866, + "learning_rate": 6.109e-05, + "loss": 0.426, + "step": 12225 + }, + { + "epoch": 0.6846231380893717, + "grad_norm": 1.0928298234939575, + "learning_rate": 6.1095e-05, + "loss": 0.3717, + "step": 12226 + }, + { + "epoch": 0.6846791354015007, + "grad_norm": 1.3561042547225952, + "learning_rate": 6.110000000000001e-05, + 
"loss": 0.4217, + "step": 12227 + }, + { + "epoch": 0.6847351327136297, + "grad_norm": 1.2690937519073486, + "learning_rate": 6.1105e-05, + "loss": 0.5044, + "step": 12228 + }, + { + "epoch": 0.6847911300257588, + "grad_norm": 1.5557212829589844, + "learning_rate": 6.111e-05, + "loss": 0.4117, + "step": 12229 + }, + { + "epoch": 0.6848471273378878, + "grad_norm": 1.4569201469421387, + "learning_rate": 6.1115e-05, + "loss": 0.5277, + "step": 12230 + }, + { + "epoch": 0.6849031246500168, + "grad_norm": 1.5207160711288452, + "learning_rate": 6.112e-05, + "loss": 0.5777, + "step": 12231 + }, + { + "epoch": 0.6849591219621458, + "grad_norm": 1.2555395364761353, + "learning_rate": 6.1125e-05, + "loss": 0.3153, + "step": 12232 + }, + { + "epoch": 0.6850151192742748, + "grad_norm": 1.2674345970153809, + "learning_rate": 6.112999999999999e-05, + "loss": 0.4369, + "step": 12233 + }, + { + "epoch": 0.6850711165864038, + "grad_norm": 1.3939169645309448, + "learning_rate": 6.1135e-05, + "loss": 0.5536, + "step": 12234 + }, + { + "epoch": 0.6851271138985329, + "grad_norm": 1.5421286821365356, + "learning_rate": 6.114000000000001e-05, + "loss": 0.6619, + "step": 12235 + }, + { + "epoch": 0.6851831112106619, + "grad_norm": 1.2183438539505005, + "learning_rate": 6.114500000000001e-05, + "loss": 0.412, + "step": 12236 + }, + { + "epoch": 0.6852391085227909, + "grad_norm": 1.5787529945373535, + "learning_rate": 6.115000000000001e-05, + "loss": 0.4965, + "step": 12237 + }, + { + "epoch": 0.6852951058349199, + "grad_norm": 1.439435362815857, + "learning_rate": 6.1155e-05, + "loss": 0.546, + "step": 12238 + }, + { + "epoch": 0.685351103147049, + "grad_norm": 1.4464318752288818, + "learning_rate": 6.116e-05, + "loss": 0.4975, + "step": 12239 + }, + { + "epoch": 0.685407100459178, + "grad_norm": 1.4630446434020996, + "learning_rate": 6.1165e-05, + "loss": 0.4319, + "step": 12240 + }, + { + "epoch": 0.685463097771307, + "grad_norm": 1.3636311292648315, + "learning_rate": 6.117e-05, + 
"loss": 0.3872, + "step": 12241 + }, + { + "epoch": 0.685519095083436, + "grad_norm": 1.2092517614364624, + "learning_rate": 6.117500000000001e-05, + "loss": 0.4466, + "step": 12242 + }, + { + "epoch": 0.685575092395565, + "grad_norm": 1.2505483627319336, + "learning_rate": 6.118000000000001e-05, + "loss": 0.463, + "step": 12243 + }, + { + "epoch": 0.685631089707694, + "grad_norm": 1.0488157272338867, + "learning_rate": 6.1185e-05, + "loss": 0.4061, + "step": 12244 + }, + { + "epoch": 0.6856870870198231, + "grad_norm": 1.2378517389297485, + "learning_rate": 6.119e-05, + "loss": 0.4247, + "step": 12245 + }, + { + "epoch": 0.6857430843319521, + "grad_norm": 1.3138020038604736, + "learning_rate": 6.1195e-05, + "loss": 0.4243, + "step": 12246 + }, + { + "epoch": 0.6857990816440811, + "grad_norm": 1.1253302097320557, + "learning_rate": 6.12e-05, + "loss": 0.449, + "step": 12247 + }, + { + "epoch": 0.6858550789562101, + "grad_norm": 1.4977877140045166, + "learning_rate": 6.120500000000001e-05, + "loss": 0.5941, + "step": 12248 + }, + { + "epoch": 0.6859110762683391, + "grad_norm": 1.7441450357437134, + "learning_rate": 6.121e-05, + "loss": 0.4848, + "step": 12249 + }, + { + "epoch": 0.6859670735804682, + "grad_norm": 1.2781704664230347, + "learning_rate": 6.1215e-05, + "loss": 0.5405, + "step": 12250 + }, + { + "epoch": 0.6860230708925972, + "grad_norm": 1.3380956649780273, + "learning_rate": 6.122e-05, + "loss": 0.446, + "step": 12251 + }, + { + "epoch": 0.6860790682047262, + "grad_norm": 1.4156084060668945, + "learning_rate": 6.1225e-05, + "loss": 0.6244, + "step": 12252 + }, + { + "epoch": 0.6861350655168552, + "grad_norm": 1.3929370641708374, + "learning_rate": 6.123e-05, + "loss": 0.3996, + "step": 12253 + }, + { + "epoch": 0.6861910628289842, + "grad_norm": 1.4578877687454224, + "learning_rate": 6.123499999999999e-05, + "loss": 0.5644, + "step": 12254 + }, + { + "epoch": 0.6862470601411133, + "grad_norm": 1.2098445892333984, + "learning_rate": 6.124e-05, + "loss": 
0.4117, + "step": 12255 + }, + { + "epoch": 0.6863030574532423, + "grad_norm": 0.9289770722389221, + "learning_rate": 6.124500000000001e-05, + "loss": 0.2874, + "step": 12256 + }, + { + "epoch": 0.6863590547653713, + "grad_norm": 1.2058159112930298, + "learning_rate": 6.125000000000001e-05, + "loss": 0.4046, + "step": 12257 + }, + { + "epoch": 0.6864150520775003, + "grad_norm": 1.042585015296936, + "learning_rate": 6.125500000000001e-05, + "loss": 0.4132, + "step": 12258 + }, + { + "epoch": 0.6864710493896293, + "grad_norm": 1.7090892791748047, + "learning_rate": 6.126e-05, + "loss": 0.4091, + "step": 12259 + }, + { + "epoch": 0.6865270467017583, + "grad_norm": 1.4218131303787231, + "learning_rate": 6.1265e-05, + "loss": 0.4224, + "step": 12260 + }, + { + "epoch": 0.6865830440138874, + "grad_norm": 1.2966798543930054, + "learning_rate": 6.127e-05, + "loss": 0.3678, + "step": 12261 + }, + { + "epoch": 0.6866390413260164, + "grad_norm": 1.028228759765625, + "learning_rate": 6.1275e-05, + "loss": 0.3057, + "step": 12262 + }, + { + "epoch": 0.6866950386381454, + "grad_norm": 1.1287555694580078, + "learning_rate": 6.128000000000001e-05, + "loss": 0.3543, + "step": 12263 + }, + { + "epoch": 0.6867510359502744, + "grad_norm": 1.1865333318710327, + "learning_rate": 6.128500000000001e-05, + "loss": 0.5076, + "step": 12264 + }, + { + "epoch": 0.6868070332624034, + "grad_norm": 1.2991281747817993, + "learning_rate": 6.129e-05, + "loss": 0.3995, + "step": 12265 + }, + { + "epoch": 0.6868630305745325, + "grad_norm": 1.4616847038269043, + "learning_rate": 6.1295e-05, + "loss": 0.4717, + "step": 12266 + }, + { + "epoch": 0.6869190278866615, + "grad_norm": 1.5509908199310303, + "learning_rate": 6.13e-05, + "loss": 0.4697, + "step": 12267 + }, + { + "epoch": 0.6869750251987905, + "grad_norm": 1.568916916847229, + "learning_rate": 6.1305e-05, + "loss": 0.6279, + "step": 12268 + }, + { + "epoch": 0.6870310225109195, + "grad_norm": 1.1783875226974487, + "learning_rate": 6.131e-05, + 
"loss": 0.3793, + "step": 12269 + }, + { + "epoch": 0.6870870198230485, + "grad_norm": 1.2726467847824097, + "learning_rate": 6.1315e-05, + "loss": 0.5254, + "step": 12270 + }, + { + "epoch": 0.6871430171351776, + "grad_norm": 1.444631576538086, + "learning_rate": 6.132e-05, + "loss": 0.4486, + "step": 12271 + }, + { + "epoch": 0.6871990144473066, + "grad_norm": 32.72178649902344, + "learning_rate": 6.1325e-05, + "loss": 0.5434, + "step": 12272 + }, + { + "epoch": 0.6872550117594356, + "grad_norm": 1.4250762462615967, + "learning_rate": 6.133e-05, + "loss": 0.4798, + "step": 12273 + }, + { + "epoch": 0.6873110090715646, + "grad_norm": 1.2289494276046753, + "learning_rate": 6.1335e-05, + "loss": 0.4453, + "step": 12274 + }, + { + "epoch": 0.6873670063836936, + "grad_norm": 1.4113556146621704, + "learning_rate": 6.133999999999999e-05, + "loss": 0.4163, + "step": 12275 + }, + { + "epoch": 0.6874230036958227, + "grad_norm": 1.1996521949768066, + "learning_rate": 6.1345e-05, + "loss": 0.3363, + "step": 12276 + }, + { + "epoch": 0.6874790010079517, + "grad_norm": 1.3179504871368408, + "learning_rate": 6.135000000000001e-05, + "loss": 0.4021, + "step": 12277 + }, + { + "epoch": 0.6875349983200806, + "grad_norm": 1.270889163017273, + "learning_rate": 6.135500000000001e-05, + "loss": 0.4603, + "step": 12278 + }, + { + "epoch": 0.6875909956322096, + "grad_norm": 1.40614652633667, + "learning_rate": 6.136000000000001e-05, + "loss": 0.5777, + "step": 12279 + }, + { + "epoch": 0.6876469929443386, + "grad_norm": 1.3910115957260132, + "learning_rate": 6.1365e-05, + "loss": 0.4963, + "step": 12280 + }, + { + "epoch": 0.6877029902564676, + "grad_norm": 1.5937877893447876, + "learning_rate": 6.137e-05, + "loss": 0.4653, + "step": 12281 + }, + { + "epoch": 0.6877589875685967, + "grad_norm": 1.1973016262054443, + "learning_rate": 6.1375e-05, + "loss": 0.4001, + "step": 12282 + }, + { + "epoch": 0.6878149848807257, + "grad_norm": 1.4405215978622437, + "learning_rate": 6.138e-05, + 
"loss": 0.5029, + "step": 12283 + }, + { + "epoch": 0.6878709821928547, + "grad_norm": 1.4870195388793945, + "learning_rate": 6.138500000000001e-05, + "loss": 0.5531, + "step": 12284 + }, + { + "epoch": 0.6879269795049837, + "grad_norm": 1.3453369140625, + "learning_rate": 6.139000000000001e-05, + "loss": 0.4839, + "step": 12285 + }, + { + "epoch": 0.6879829768171127, + "grad_norm": 1.1019184589385986, + "learning_rate": 6.1395e-05, + "loss": 0.338, + "step": 12286 + }, + { + "epoch": 0.6880389741292418, + "grad_norm": 1.1243878602981567, + "learning_rate": 6.14e-05, + "loss": 0.3211, + "step": 12287 + }, + { + "epoch": 0.6880949714413708, + "grad_norm": 1.2816708087921143, + "learning_rate": 6.1405e-05, + "loss": 0.468, + "step": 12288 + }, + { + "epoch": 0.6881509687534998, + "grad_norm": 1.57627534866333, + "learning_rate": 6.141e-05, + "loss": 0.6315, + "step": 12289 + }, + { + "epoch": 0.6882069660656288, + "grad_norm": 1.2261550426483154, + "learning_rate": 6.1415e-05, + "loss": 0.3907, + "step": 12290 + }, + { + "epoch": 0.6882629633777578, + "grad_norm": 1.2663525342941284, + "learning_rate": 6.142e-05, + "loss": 0.3551, + "step": 12291 + }, + { + "epoch": 0.6883189606898868, + "grad_norm": 1.3640536069869995, + "learning_rate": 6.1425e-05, + "loss": 0.4819, + "step": 12292 + }, + { + "epoch": 0.6883749580020159, + "grad_norm": 1.5169423818588257, + "learning_rate": 6.143e-05, + "loss": 0.4374, + "step": 12293 + }, + { + "epoch": 0.6884309553141449, + "grad_norm": 1.093767523765564, + "learning_rate": 6.1435e-05, + "loss": 0.4196, + "step": 12294 + }, + { + "epoch": 0.6884869526262739, + "grad_norm": 2.810868501663208, + "learning_rate": 6.144e-05, + "loss": 0.3948, + "step": 12295 + }, + { + "epoch": 0.6885429499384029, + "grad_norm": 1.4540945291519165, + "learning_rate": 6.1445e-05, + "loss": 0.4799, + "step": 12296 + }, + { + "epoch": 0.688598947250532, + "grad_norm": 1.3927223682403564, + "learning_rate": 6.145e-05, + "loss": 0.3329, + "step": 12297 + 
}, + { + "epoch": 0.688654944562661, + "grad_norm": 1.3568576574325562, + "learning_rate": 6.1455e-05, + "loss": 0.42, + "step": 12298 + }, + { + "epoch": 0.68871094187479, + "grad_norm": 1.440532922744751, + "learning_rate": 6.146000000000001e-05, + "loss": 0.6115, + "step": 12299 + }, + { + "epoch": 0.688766939186919, + "grad_norm": 1.3961313962936401, + "learning_rate": 6.146500000000001e-05, + "loss": 0.4438, + "step": 12300 + }, + { + "epoch": 0.688822936499048, + "grad_norm": 1.181216835975647, + "learning_rate": 6.147e-05, + "loss": 0.4751, + "step": 12301 + }, + { + "epoch": 0.688878933811177, + "grad_norm": 1.2675639390945435, + "learning_rate": 6.1475e-05, + "loss": 0.3858, + "step": 12302 + }, + { + "epoch": 0.6889349311233061, + "grad_norm": 1.2192527055740356, + "learning_rate": 6.148e-05, + "loss": 0.375, + "step": 12303 + }, + { + "epoch": 0.6889909284354351, + "grad_norm": 1.0635168552398682, + "learning_rate": 6.1485e-05, + "loss": 0.4258, + "step": 12304 + }, + { + "epoch": 0.6890469257475641, + "grad_norm": 1.2519820928573608, + "learning_rate": 6.149000000000001e-05, + "loss": 0.414, + "step": 12305 + }, + { + "epoch": 0.6891029230596931, + "grad_norm": 1.3190950155258179, + "learning_rate": 6.1495e-05, + "loss": 0.386, + "step": 12306 + }, + { + "epoch": 0.6891589203718221, + "grad_norm": 1.805222988128662, + "learning_rate": 6.15e-05, + "loss": 0.5352, + "step": 12307 + }, + { + "epoch": 0.6892149176839512, + "grad_norm": 1.282267451286316, + "learning_rate": 6.1505e-05, + "loss": 0.3987, + "step": 12308 + }, + { + "epoch": 0.6892709149960802, + "grad_norm": 1.27260160446167, + "learning_rate": 6.151e-05, + "loss": 0.5293, + "step": 12309 + }, + { + "epoch": 0.6893269123082092, + "grad_norm": 1.2897119522094727, + "learning_rate": 6.1515e-05, + "loss": 0.3931, + "step": 12310 + }, + { + "epoch": 0.6893829096203382, + "grad_norm": 1.5478432178497314, + "learning_rate": 6.152e-05, + "loss": 0.5605, + "step": 12311 + }, + { + "epoch": 
0.6894389069324672, + "grad_norm": 1.4531878232955933, + "learning_rate": 6.1525e-05, + "loss": 0.4144, + "step": 12312 + }, + { + "epoch": 0.6894949042445963, + "grad_norm": 1.2213512659072876, + "learning_rate": 6.153e-05, + "loss": 0.3277, + "step": 12313 + }, + { + "epoch": 0.6895509015567253, + "grad_norm": 1.3131664991378784, + "learning_rate": 6.1535e-05, + "loss": 0.4082, + "step": 12314 + }, + { + "epoch": 0.6896068988688543, + "grad_norm": 1.3242067098617554, + "learning_rate": 6.154e-05, + "loss": 0.3955, + "step": 12315 + }, + { + "epoch": 0.6896628961809833, + "grad_norm": 1.5959453582763672, + "learning_rate": 6.154500000000001e-05, + "loss": 0.5431, + "step": 12316 + }, + { + "epoch": 0.6897188934931123, + "grad_norm": 1.1596336364746094, + "learning_rate": 6.155e-05, + "loss": 0.5232, + "step": 12317 + }, + { + "epoch": 0.6897748908052413, + "grad_norm": 1.3719199895858765, + "learning_rate": 6.1555e-05, + "loss": 0.5622, + "step": 12318 + }, + { + "epoch": 0.6898308881173704, + "grad_norm": 1.2654657363891602, + "learning_rate": 6.156e-05, + "loss": 0.4272, + "step": 12319 + }, + { + "epoch": 0.6898868854294994, + "grad_norm": 1.224621295928955, + "learning_rate": 6.156500000000001e-05, + "loss": 0.3971, + "step": 12320 + }, + { + "epoch": 0.6899428827416284, + "grad_norm": 1.293892502784729, + "learning_rate": 6.157000000000001e-05, + "loss": 0.5648, + "step": 12321 + }, + { + "epoch": 0.6899988800537574, + "grad_norm": 1.255736231803894, + "learning_rate": 6.1575e-05, + "loss": 0.4034, + "step": 12322 + }, + { + "epoch": 0.6900548773658864, + "grad_norm": 1.3798484802246094, + "learning_rate": 6.158e-05, + "loss": 0.3675, + "step": 12323 + }, + { + "epoch": 0.6901108746780155, + "grad_norm": 1.1086690425872803, + "learning_rate": 6.1585e-05, + "loss": 0.3527, + "step": 12324 + }, + { + "epoch": 0.6901668719901445, + "grad_norm": 1.802785873413086, + "learning_rate": 6.159e-05, + "loss": 0.3634, + "step": 12325 + }, + { + "epoch": 
0.6902228693022735, + "grad_norm": 1.3808271884918213, + "learning_rate": 6.159500000000001e-05, + "loss": 0.4957, + "step": 12326 + }, + { + "epoch": 0.6902788666144025, + "grad_norm": 1.555474877357483, + "learning_rate": 6.16e-05, + "loss": 0.5661, + "step": 12327 + }, + { + "epoch": 0.6903348639265315, + "grad_norm": 7.169306755065918, + "learning_rate": 6.1605e-05, + "loss": 0.4269, + "step": 12328 + }, + { + "epoch": 0.6903908612386606, + "grad_norm": 1.2660406827926636, + "learning_rate": 6.161e-05, + "loss": 0.4906, + "step": 12329 + }, + { + "epoch": 0.6904468585507896, + "grad_norm": 1.4176205396652222, + "learning_rate": 6.1615e-05, + "loss": 0.5224, + "step": 12330 + }, + { + "epoch": 0.6905028558629186, + "grad_norm": 1.3572801351547241, + "learning_rate": 6.162e-05, + "loss": 0.483, + "step": 12331 + }, + { + "epoch": 0.6905588531750476, + "grad_norm": 1.1732454299926758, + "learning_rate": 6.1625e-05, + "loss": 0.3916, + "step": 12332 + }, + { + "epoch": 0.6906148504871766, + "grad_norm": 1.292445182800293, + "learning_rate": 6.163e-05, + "loss": 0.4665, + "step": 12333 + }, + { + "epoch": 0.6906708477993057, + "grad_norm": 1.5300066471099854, + "learning_rate": 6.1635e-05, + "loss": 0.607, + "step": 12334 + }, + { + "epoch": 0.6907268451114347, + "grad_norm": 1.125780701637268, + "learning_rate": 6.164e-05, + "loss": 0.3816, + "step": 12335 + }, + { + "epoch": 0.6907828424235637, + "grad_norm": 1.141910433769226, + "learning_rate": 6.164500000000001e-05, + "loss": 0.4099, + "step": 12336 + }, + { + "epoch": 0.6908388397356927, + "grad_norm": 1.2341314554214478, + "learning_rate": 6.165000000000001e-05, + "loss": 0.4981, + "step": 12337 + }, + { + "epoch": 0.6908948370478217, + "grad_norm": 1.4774041175842285, + "learning_rate": 6.1655e-05, + "loss": 0.5521, + "step": 12338 + }, + { + "epoch": 0.6909508343599507, + "grad_norm": 1.1159309148788452, + "learning_rate": 6.166e-05, + "loss": 0.3608, + "step": 12339 + }, + { + "epoch": 0.6910068316720798, 
+ "grad_norm": 1.3648525476455688, + "learning_rate": 6.1665e-05, + "loss": 0.4344, + "step": 12340 + }, + { + "epoch": 0.6910628289842088, + "grad_norm": 1.148815631866455, + "learning_rate": 6.167000000000001e-05, + "loss": 0.4819, + "step": 12341 + }, + { + "epoch": 0.6911188262963378, + "grad_norm": 1.500679612159729, + "learning_rate": 6.167500000000001e-05, + "loss": 0.6123, + "step": 12342 + }, + { + "epoch": 0.6911748236084668, + "grad_norm": 1.464145541191101, + "learning_rate": 6.168e-05, + "loss": 0.5013, + "step": 12343 + }, + { + "epoch": 0.6912308209205958, + "grad_norm": 1.5020862817764282, + "learning_rate": 6.1685e-05, + "loss": 0.4593, + "step": 12344 + }, + { + "epoch": 0.6912868182327249, + "grad_norm": 1.1385457515716553, + "learning_rate": 6.169e-05, + "loss": 0.387, + "step": 12345 + }, + { + "epoch": 0.6913428155448539, + "grad_norm": 1.4682669639587402, + "learning_rate": 6.1695e-05, + "loss": 0.432, + "step": 12346 + }, + { + "epoch": 0.6913988128569829, + "grad_norm": 1.1639906167984009, + "learning_rate": 6.170000000000001e-05, + "loss": 0.3979, + "step": 12347 + }, + { + "epoch": 0.6914548101691119, + "grad_norm": 2.324922561645508, + "learning_rate": 6.1705e-05, + "loss": 0.4675, + "step": 12348 + }, + { + "epoch": 0.6915108074812409, + "grad_norm": 1.420125126838684, + "learning_rate": 6.171e-05, + "loss": 0.5791, + "step": 12349 + }, + { + "epoch": 0.69156680479337, + "grad_norm": 1.4984275102615356, + "learning_rate": 6.1715e-05, + "loss": 0.5217, + "step": 12350 + }, + { + "epoch": 0.691622802105499, + "grad_norm": 1.2690491676330566, + "learning_rate": 6.172e-05, + "loss": 0.4017, + "step": 12351 + }, + { + "epoch": 0.691678799417628, + "grad_norm": 1.1910409927368164, + "learning_rate": 6.1725e-05, + "loss": 0.3263, + "step": 12352 + }, + { + "epoch": 0.691734796729757, + "grad_norm": 3.2725179195404053, + "learning_rate": 6.173e-05, + "loss": 0.6993, + "step": 12353 + }, + { + "epoch": 0.691790794041886, + "grad_norm": 
1.2835434675216675, + "learning_rate": 6.1735e-05, + "loss": 0.467, + "step": 12354 + }, + { + "epoch": 0.6918467913540151, + "grad_norm": 1.5669128894805908, + "learning_rate": 6.174e-05, + "loss": 0.4122, + "step": 12355 + }, + { + "epoch": 0.6919027886661441, + "grad_norm": 1.442619800567627, + "learning_rate": 6.174500000000001e-05, + "loss": 0.4024, + "step": 12356 + }, + { + "epoch": 0.6919587859782731, + "grad_norm": 1.4724701642990112, + "learning_rate": 6.175000000000001e-05, + "loss": 0.4798, + "step": 12357 + }, + { + "epoch": 0.6920147832904021, + "grad_norm": 1.193954348564148, + "learning_rate": 6.175500000000001e-05, + "loss": 0.3632, + "step": 12358 + }, + { + "epoch": 0.6920707806025311, + "grad_norm": 1.225625991821289, + "learning_rate": 6.176e-05, + "loss": 0.3221, + "step": 12359 + }, + { + "epoch": 0.6921267779146602, + "grad_norm": 1.286931037902832, + "learning_rate": 6.1765e-05, + "loss": 0.417, + "step": 12360 + }, + { + "epoch": 0.6921827752267891, + "grad_norm": 1.1550440788269043, + "learning_rate": 6.177e-05, + "loss": 0.3877, + "step": 12361 + }, + { + "epoch": 0.6922387725389181, + "grad_norm": 1.173991084098816, + "learning_rate": 6.177500000000001e-05, + "loss": 0.4283, + "step": 12362 + }, + { + "epoch": 0.6922947698510471, + "grad_norm": 1.178902268409729, + "learning_rate": 6.178000000000001e-05, + "loss": 0.4929, + "step": 12363 + }, + { + "epoch": 0.6923507671631761, + "grad_norm": 1.4234992265701294, + "learning_rate": 6.1785e-05, + "loss": 0.4473, + "step": 12364 + }, + { + "epoch": 0.6924067644753051, + "grad_norm": 1.3138865232467651, + "learning_rate": 6.179e-05, + "loss": 0.4019, + "step": 12365 + }, + { + "epoch": 0.6924627617874342, + "grad_norm": 1.3836005926132202, + "learning_rate": 6.1795e-05, + "loss": 0.4197, + "step": 12366 + }, + { + "epoch": 0.6925187590995632, + "grad_norm": 1.3653817176818848, + "learning_rate": 6.18e-05, + "loss": 0.422, + "step": 12367 + }, + { + "epoch": 0.6925747564116922, + "grad_norm": 
5.243404865264893, + "learning_rate": 6.1805e-05, + "loss": 0.462, + "step": 12368 + }, + { + "epoch": 0.6926307537238212, + "grad_norm": 1.4292103052139282, + "learning_rate": 6.181e-05, + "loss": 0.5952, + "step": 12369 + }, + { + "epoch": 0.6926867510359502, + "grad_norm": 1.1659621000289917, + "learning_rate": 6.1815e-05, + "loss": 0.4945, + "step": 12370 + }, + { + "epoch": 0.6927427483480793, + "grad_norm": 1.1917778253555298, + "learning_rate": 6.182e-05, + "loss": 0.3642, + "step": 12371 + }, + { + "epoch": 0.6927987456602083, + "grad_norm": 0.9862884283065796, + "learning_rate": 6.1825e-05, + "loss": 0.3606, + "step": 12372 + }, + { + "epoch": 0.6928547429723373, + "grad_norm": 1.9535349607467651, + "learning_rate": 6.183e-05, + "loss": 0.4091, + "step": 12373 + }, + { + "epoch": 0.6929107402844663, + "grad_norm": 1.199156403541565, + "learning_rate": 6.1835e-05, + "loss": 0.3657, + "step": 12374 + }, + { + "epoch": 0.6929667375965953, + "grad_norm": 1.1949660778045654, + "learning_rate": 6.184e-05, + "loss": 0.4283, + "step": 12375 + }, + { + "epoch": 0.6930227349087243, + "grad_norm": 1.4052292108535767, + "learning_rate": 6.184500000000002e-05, + "loss": 0.3788, + "step": 12376 + }, + { + "epoch": 0.6930787322208534, + "grad_norm": 1.4408352375030518, + "learning_rate": 6.185000000000001e-05, + "loss": 0.4129, + "step": 12377 + }, + { + "epoch": 0.6931347295329824, + "grad_norm": 1.9941606521606445, + "learning_rate": 6.185500000000001e-05, + "loss": 0.5218, + "step": 12378 + }, + { + "epoch": 0.6931907268451114, + "grad_norm": 1.2464509010314941, + "learning_rate": 6.186000000000001e-05, + "loss": 0.4011, + "step": 12379 + }, + { + "epoch": 0.6932467241572404, + "grad_norm": 1.2264740467071533, + "learning_rate": 6.1865e-05, + "loss": 0.3415, + "step": 12380 + }, + { + "epoch": 0.6933027214693694, + "grad_norm": 1.244165062904358, + "learning_rate": 6.187e-05, + "loss": 0.4134, + "step": 12381 + }, + { + "epoch": 0.6933587187814985, + "grad_norm": 
1.3536137342453003, + "learning_rate": 6.1875e-05, + "loss": 0.5228, + "step": 12382 + }, + { + "epoch": 0.6934147160936275, + "grad_norm": 1.158705472946167, + "learning_rate": 6.188000000000001e-05, + "loss": 0.3152, + "step": 12383 + }, + { + "epoch": 0.6934707134057565, + "grad_norm": 1.3614567518234253, + "learning_rate": 6.188500000000001e-05, + "loss": 0.4177, + "step": 12384 + }, + { + "epoch": 0.6935267107178855, + "grad_norm": 1.2631393671035767, + "learning_rate": 6.189e-05, + "loss": 0.3932, + "step": 12385 + }, + { + "epoch": 0.6935827080300145, + "grad_norm": 1.2913925647735596, + "learning_rate": 6.1895e-05, + "loss": 0.4944, + "step": 12386 + }, + { + "epoch": 0.6936387053421436, + "grad_norm": 1.3278769254684448, + "learning_rate": 6.19e-05, + "loss": 0.4233, + "step": 12387 + }, + { + "epoch": 0.6936947026542726, + "grad_norm": 1.3033205270767212, + "learning_rate": 6.1905e-05, + "loss": 0.4485, + "step": 12388 + }, + { + "epoch": 0.6937506999664016, + "grad_norm": 1.3875821828842163, + "learning_rate": 6.191e-05, + "loss": 0.5035, + "step": 12389 + }, + { + "epoch": 0.6938066972785306, + "grad_norm": 1.3573859930038452, + "learning_rate": 6.1915e-05, + "loss": 0.4555, + "step": 12390 + }, + { + "epoch": 0.6938626945906596, + "grad_norm": 1.2488489151000977, + "learning_rate": 6.192e-05, + "loss": 0.4228, + "step": 12391 + }, + { + "epoch": 0.6939186919027887, + "grad_norm": 1.43985915184021, + "learning_rate": 6.1925e-05, + "loss": 0.5246, + "step": 12392 + }, + { + "epoch": 0.6939746892149177, + "grad_norm": 1.0981749296188354, + "learning_rate": 6.193e-05, + "loss": 0.377, + "step": 12393 + }, + { + "epoch": 0.6940306865270467, + "grad_norm": 1.3188191652297974, + "learning_rate": 6.1935e-05, + "loss": 0.3934, + "step": 12394 + }, + { + "epoch": 0.6940866838391757, + "grad_norm": 1.3046042919158936, + "learning_rate": 6.193999999999999e-05, + "loss": 0.5209, + "step": 12395 + }, + { + "epoch": 0.6941426811513047, + "grad_norm": 
1.3684697151184082, + "learning_rate": 6.1945e-05, + "loss": 0.529, + "step": 12396 + }, + { + "epoch": 0.6941986784634337, + "grad_norm": 49.712005615234375, + "learning_rate": 6.195e-05, + "loss": 0.5506, + "step": 12397 + }, + { + "epoch": 0.6942546757755628, + "grad_norm": 1.3110227584838867, + "learning_rate": 6.195500000000001e-05, + "loss": 0.4521, + "step": 12398 + }, + { + "epoch": 0.6943106730876918, + "grad_norm": 1.4124492406845093, + "learning_rate": 6.196000000000001e-05, + "loss": 0.6361, + "step": 12399 + }, + { + "epoch": 0.6943666703998208, + "grad_norm": 1.3802233934402466, + "learning_rate": 6.196500000000001e-05, + "loss": 0.5113, + "step": 12400 + }, + { + "epoch": 0.6944226677119498, + "grad_norm": 1.4428071975708008, + "learning_rate": 6.197e-05, + "loss": 0.5623, + "step": 12401 + }, + { + "epoch": 0.6944786650240788, + "grad_norm": 1.30404531955719, + "learning_rate": 6.1975e-05, + "loss": 0.5918, + "step": 12402 + }, + { + "epoch": 0.6945346623362079, + "grad_norm": 1.4471380710601807, + "learning_rate": 6.198e-05, + "loss": 0.4736, + "step": 12403 + }, + { + "epoch": 0.6945906596483369, + "grad_norm": 1.5136996507644653, + "learning_rate": 6.198500000000001e-05, + "loss": 0.5243, + "step": 12404 + }, + { + "epoch": 0.6946466569604659, + "grad_norm": 1.5524667501449585, + "learning_rate": 6.199000000000001e-05, + "loss": 0.4637, + "step": 12405 + }, + { + "epoch": 0.6947026542725949, + "grad_norm": 10.976086616516113, + "learning_rate": 6.1995e-05, + "loss": 0.3784, + "step": 12406 + }, + { + "epoch": 0.6947586515847239, + "grad_norm": 1.4833440780639648, + "learning_rate": 6.2e-05, + "loss": 0.6505, + "step": 12407 + }, + { + "epoch": 0.694814648896853, + "grad_norm": 1.3871524333953857, + "learning_rate": 6.2005e-05, + "loss": 0.4584, + "step": 12408 + }, + { + "epoch": 0.694870646208982, + "grad_norm": 1.3627489805221558, + "learning_rate": 6.201e-05, + "loss": 0.3724, + "step": 12409 + }, + { + "epoch": 0.694926643521111, + 
"grad_norm": 1.328230381011963, + "learning_rate": 6.2015e-05, + "loss": 0.3918, + "step": 12410 + }, + { + "epoch": 0.69498264083324, + "grad_norm": 1.4259014129638672, + "learning_rate": 6.202e-05, + "loss": 0.3789, + "step": 12411 + }, + { + "epoch": 0.695038638145369, + "grad_norm": 1.0605638027191162, + "learning_rate": 6.2025e-05, + "loss": 0.3887, + "step": 12412 + }, + { + "epoch": 0.6950946354574981, + "grad_norm": 1.4853936433792114, + "learning_rate": 6.203e-05, + "loss": 0.5887, + "step": 12413 + }, + { + "epoch": 0.6951506327696271, + "grad_norm": 1.384651780128479, + "learning_rate": 6.2035e-05, + "loss": 0.6094, + "step": 12414 + }, + { + "epoch": 0.6952066300817561, + "grad_norm": 1.2497036457061768, + "learning_rate": 6.204e-05, + "loss": 0.3074, + "step": 12415 + }, + { + "epoch": 0.6952626273938851, + "grad_norm": 1.0918927192687988, + "learning_rate": 6.204499999999999e-05, + "loss": 0.3625, + "step": 12416 + }, + { + "epoch": 0.6953186247060141, + "grad_norm": 1.1047202348709106, + "learning_rate": 6.205e-05, + "loss": 0.3787, + "step": 12417 + }, + { + "epoch": 0.6953746220181432, + "grad_norm": 1.1791718006134033, + "learning_rate": 6.2055e-05, + "loss": 0.4311, + "step": 12418 + }, + { + "epoch": 0.6954306193302722, + "grad_norm": 1.9835759401321411, + "learning_rate": 6.206000000000001e-05, + "loss": 0.5259, + "step": 12419 + }, + { + "epoch": 0.6954866166424012, + "grad_norm": 1.4982942342758179, + "learning_rate": 6.206500000000001e-05, + "loss": 0.4543, + "step": 12420 + }, + { + "epoch": 0.6955426139545302, + "grad_norm": 1.2310798168182373, + "learning_rate": 6.207000000000001e-05, + "loss": 0.4151, + "step": 12421 + }, + { + "epoch": 0.6955986112666592, + "grad_norm": 1.3420618772506714, + "learning_rate": 6.2075e-05, + "loss": 0.4868, + "step": 12422 + }, + { + "epoch": 0.6956546085787882, + "grad_norm": 1.1320631504058838, + "learning_rate": 6.208e-05, + "loss": 0.3478, + "step": 12423 + }, + { + "epoch": 0.6957106058909173, + 
"grad_norm": 1.4887796640396118, + "learning_rate": 6.2085e-05, + "loss": 0.613, + "step": 12424 + }, + { + "epoch": 0.6957666032030463, + "grad_norm": 1.5807126760482788, + "learning_rate": 6.209000000000001e-05, + "loss": 0.4946, + "step": 12425 + }, + { + "epoch": 0.6958226005151753, + "grad_norm": 1.3650822639465332, + "learning_rate": 6.209500000000001e-05, + "loss": 0.4914, + "step": 12426 + }, + { + "epoch": 0.6958785978273043, + "grad_norm": 1.8704893589019775, + "learning_rate": 6.21e-05, + "loss": 0.3987, + "step": 12427 + }, + { + "epoch": 0.6959345951394333, + "grad_norm": 1.4728385210037231, + "learning_rate": 6.2105e-05, + "loss": 0.5204, + "step": 12428 + }, + { + "epoch": 0.6959905924515624, + "grad_norm": 1.3263381719589233, + "learning_rate": 6.211e-05, + "loss": 0.4777, + "step": 12429 + }, + { + "epoch": 0.6960465897636914, + "grad_norm": 1.2977200746536255, + "learning_rate": 6.2115e-05, + "loss": 0.3269, + "step": 12430 + }, + { + "epoch": 0.6961025870758204, + "grad_norm": 1.1716684103012085, + "learning_rate": 6.212e-05, + "loss": 0.4632, + "step": 12431 + }, + { + "epoch": 0.6961585843879494, + "grad_norm": 1.213047981262207, + "learning_rate": 6.2125e-05, + "loss": 0.5351, + "step": 12432 + }, + { + "epoch": 0.6962145817000784, + "grad_norm": 1.2444779872894287, + "learning_rate": 6.213e-05, + "loss": 0.3439, + "step": 12433 + }, + { + "epoch": 0.6962705790122075, + "grad_norm": 1.1619069576263428, + "learning_rate": 6.2135e-05, + "loss": 0.4513, + "step": 12434 + }, + { + "epoch": 0.6963265763243365, + "grad_norm": 1.0866138935089111, + "learning_rate": 6.214e-05, + "loss": 0.3634, + "step": 12435 + }, + { + "epoch": 0.6963825736364655, + "grad_norm": 1.5795514583587646, + "learning_rate": 6.2145e-05, + "loss": 0.6332, + "step": 12436 + }, + { + "epoch": 0.6964385709485945, + "grad_norm": 1.434275507926941, + "learning_rate": 6.215e-05, + "loss": 0.3181, + "step": 12437 + }, + { + "epoch": 0.6964945682607235, + "grad_norm": 
1.4398802518844604, + "learning_rate": 6.2155e-05, + "loss": 0.4559, + "step": 12438 + }, + { + "epoch": 0.6965505655728526, + "grad_norm": 1.1475560665130615, + "learning_rate": 6.216e-05, + "loss": 0.3817, + "step": 12439 + }, + { + "epoch": 0.6966065628849816, + "grad_norm": 1.1969971656799316, + "learning_rate": 6.216500000000001e-05, + "loss": 0.4641, + "step": 12440 + }, + { + "epoch": 0.6966625601971106, + "grad_norm": 1.2024345397949219, + "learning_rate": 6.217000000000001e-05, + "loss": 0.4557, + "step": 12441 + }, + { + "epoch": 0.6967185575092396, + "grad_norm": 1.4936630725860596, + "learning_rate": 6.217500000000001e-05, + "loss": 0.4337, + "step": 12442 + }, + { + "epoch": 0.6967745548213685, + "grad_norm": 1.4987143278121948, + "learning_rate": 6.218e-05, + "loss": 0.4671, + "step": 12443 + }, + { + "epoch": 0.6968305521334975, + "grad_norm": 1.2767128944396973, + "learning_rate": 6.2185e-05, + "loss": 0.4376, + "step": 12444 + }, + { + "epoch": 0.6968865494456266, + "grad_norm": 1.389953851699829, + "learning_rate": 6.219e-05, + "loss": 0.3997, + "step": 12445 + }, + { + "epoch": 0.6969425467577556, + "grad_norm": 1.530099868774414, + "learning_rate": 6.2195e-05, + "loss": 0.3824, + "step": 12446 + }, + { + "epoch": 0.6969985440698846, + "grad_norm": 1.2455320358276367, + "learning_rate": 6.220000000000001e-05, + "loss": 0.5118, + "step": 12447 + }, + { + "epoch": 0.6970545413820136, + "grad_norm": 1.564921259880066, + "learning_rate": 6.2205e-05, + "loss": 0.5814, + "step": 12448 + }, + { + "epoch": 0.6971105386941426, + "grad_norm": 1.4247167110443115, + "learning_rate": 6.221e-05, + "loss": 0.35, + "step": 12449 + }, + { + "epoch": 0.6971665360062717, + "grad_norm": 1.2676891088485718, + "learning_rate": 6.2215e-05, + "loss": 0.2909, + "step": 12450 + }, + { + "epoch": 0.6972225333184007, + "grad_norm": 1.6524531841278076, + "learning_rate": 6.222e-05, + "loss": 0.4867, + "step": 12451 + }, + { + "epoch": 0.6972785306305297, + "grad_norm": 
1.3316035270690918, + "learning_rate": 6.2225e-05, + "loss": 0.383, + "step": 12452 + }, + { + "epoch": 0.6973345279426587, + "grad_norm": 1.362134337425232, + "learning_rate": 6.223e-05, + "loss": 0.4233, + "step": 12453 + }, + { + "epoch": 0.6973905252547877, + "grad_norm": 1.4702951908111572, + "learning_rate": 6.2235e-05, + "loss": 0.4932, + "step": 12454 + }, + { + "epoch": 0.6974465225669167, + "grad_norm": 1.5142104625701904, + "learning_rate": 6.224e-05, + "loss": 0.4368, + "step": 12455 + }, + { + "epoch": 0.6975025198790458, + "grad_norm": 1.3793469667434692, + "learning_rate": 6.2245e-05, + "loss": 0.3769, + "step": 12456 + }, + { + "epoch": 0.6975585171911748, + "grad_norm": 1.3909077644348145, + "learning_rate": 6.225000000000001e-05, + "loss": 0.4614, + "step": 12457 + }, + { + "epoch": 0.6976145145033038, + "grad_norm": 1.0924532413482666, + "learning_rate": 6.2255e-05, + "loss": 0.4464, + "step": 12458 + }, + { + "epoch": 0.6976705118154328, + "grad_norm": 1.3595489263534546, + "learning_rate": 6.226e-05, + "loss": 0.3865, + "step": 12459 + }, + { + "epoch": 0.6977265091275618, + "grad_norm": 1.1982531547546387, + "learning_rate": 6.2265e-05, + "loss": 0.3582, + "step": 12460 + }, + { + "epoch": 0.6977825064396909, + "grad_norm": 1.8145815134048462, + "learning_rate": 6.227000000000001e-05, + "loss": 0.5622, + "step": 12461 + }, + { + "epoch": 0.6978385037518199, + "grad_norm": 1.2581560611724854, + "learning_rate": 6.227500000000001e-05, + "loss": 0.3973, + "step": 12462 + }, + { + "epoch": 0.6978945010639489, + "grad_norm": 1.8066720962524414, + "learning_rate": 6.228000000000001e-05, + "loss": 0.4065, + "step": 12463 + }, + { + "epoch": 0.6979504983760779, + "grad_norm": 1.3089772462844849, + "learning_rate": 6.2285e-05, + "loss": 0.5027, + "step": 12464 + }, + { + "epoch": 0.6980064956882069, + "grad_norm": 1.2001739740371704, + "learning_rate": 6.229e-05, + "loss": 0.4057, + "step": 12465 + }, + { + "epoch": 0.698062493000336, + "grad_norm": 
1.4888426065444946, + "learning_rate": 6.2295e-05, + "loss": 0.5365, + "step": 12466 + }, + { + "epoch": 0.698118490312465, + "grad_norm": 1.1244386434555054, + "learning_rate": 6.23e-05, + "loss": 0.4614, + "step": 12467 + }, + { + "epoch": 0.698174487624594, + "grad_norm": 1.2285563945770264, + "learning_rate": 6.230500000000001e-05, + "loss": 0.4636, + "step": 12468 + }, + { + "epoch": 0.698230484936723, + "grad_norm": 1.3267616033554077, + "learning_rate": 6.231e-05, + "loss": 0.4245, + "step": 12469 + }, + { + "epoch": 0.698286482248852, + "grad_norm": 1.3063656091690063, + "learning_rate": 6.2315e-05, + "loss": 0.3789, + "step": 12470 + }, + { + "epoch": 0.698342479560981, + "grad_norm": 1.3477351665496826, + "learning_rate": 6.232e-05, + "loss": 0.5748, + "step": 12471 + }, + { + "epoch": 0.6983984768731101, + "grad_norm": 1.2163971662521362, + "learning_rate": 6.2325e-05, + "loss": 0.4104, + "step": 12472 + }, + { + "epoch": 0.6984544741852391, + "grad_norm": 1.1639045476913452, + "learning_rate": 6.233e-05, + "loss": 0.4309, + "step": 12473 + }, + { + "epoch": 0.6985104714973681, + "grad_norm": 1.8191872835159302, + "learning_rate": 6.2335e-05, + "loss": 0.478, + "step": 12474 + }, + { + "epoch": 0.6985664688094971, + "grad_norm": 1.1949455738067627, + "learning_rate": 6.234e-05, + "loss": 0.3786, + "step": 12475 + }, + { + "epoch": 0.6986224661216262, + "grad_norm": 1.2190887928009033, + "learning_rate": 6.2345e-05, + "loss": 0.5706, + "step": 12476 + }, + { + "epoch": 0.6986784634337552, + "grad_norm": 1.1594059467315674, + "learning_rate": 6.235000000000001e-05, + "loss": 0.4136, + "step": 12477 + }, + { + "epoch": 0.6987344607458842, + "grad_norm": 1.2436951398849487, + "learning_rate": 6.235500000000001e-05, + "loss": 0.4618, + "step": 12478 + }, + { + "epoch": 0.6987904580580132, + "grad_norm": 1.2693654298782349, + "learning_rate": 6.236e-05, + "loss": 0.4969, + "step": 12479 + }, + { + "epoch": 0.6988464553701422, + "grad_norm": 1.348768949508667, 
+ "learning_rate": 6.2365e-05, + "loss": 0.518, + "step": 12480 + }, + { + "epoch": 0.6989024526822712, + "grad_norm": 1.1391533613204956, + "learning_rate": 6.237e-05, + "loss": 0.4162, + "step": 12481 + }, + { + "epoch": 0.6989584499944003, + "grad_norm": 1.248640537261963, + "learning_rate": 6.237500000000001e-05, + "loss": 0.3715, + "step": 12482 + }, + { + "epoch": 0.6990144473065293, + "grad_norm": 1.3822144269943237, + "learning_rate": 6.238000000000001e-05, + "loss": 0.4397, + "step": 12483 + }, + { + "epoch": 0.6990704446186583, + "grad_norm": 1.16531503200531, + "learning_rate": 6.2385e-05, + "loss": 0.4137, + "step": 12484 + }, + { + "epoch": 0.6991264419307873, + "grad_norm": 1.2543193101882935, + "learning_rate": 6.239e-05, + "loss": 0.5052, + "step": 12485 + }, + { + "epoch": 0.6991824392429163, + "grad_norm": 2.1988441944122314, + "learning_rate": 6.2395e-05, + "loss": 0.6602, + "step": 12486 + }, + { + "epoch": 0.6992384365550454, + "grad_norm": 1.329607367515564, + "learning_rate": 6.24e-05, + "loss": 0.3931, + "step": 12487 + }, + { + "epoch": 0.6992944338671744, + "grad_norm": 1.3688476085662842, + "learning_rate": 6.2405e-05, + "loss": 0.4576, + "step": 12488 + }, + { + "epoch": 0.6993504311793034, + "grad_norm": 1.3245147466659546, + "learning_rate": 6.241000000000001e-05, + "loss": 0.5143, + "step": 12489 + }, + { + "epoch": 0.6994064284914324, + "grad_norm": 1.143288016319275, + "learning_rate": 6.2415e-05, + "loss": 0.5049, + "step": 12490 + }, + { + "epoch": 0.6994624258035614, + "grad_norm": 1.5038130283355713, + "learning_rate": 6.242e-05, + "loss": 0.6721, + "step": 12491 + }, + { + "epoch": 0.6995184231156905, + "grad_norm": 1.218773365020752, + "learning_rate": 6.2425e-05, + "loss": 0.475, + "step": 12492 + }, + { + "epoch": 0.6995744204278195, + "grad_norm": 1.2821155786514282, + "learning_rate": 6.243e-05, + "loss": 0.396, + "step": 12493 + }, + { + "epoch": 0.6996304177399485, + "grad_norm": 1.1763544082641602, + "learning_rate": 
6.2435e-05, + "loss": 0.4884, + "step": 12494 + }, + { + "epoch": 0.6996864150520775, + "grad_norm": 1.1693071126937866, + "learning_rate": 6.244e-05, + "loss": 0.3576, + "step": 12495 + }, + { + "epoch": 0.6997424123642065, + "grad_norm": 1.5130113363265991, + "learning_rate": 6.2445e-05, + "loss": 0.4434, + "step": 12496 + }, + { + "epoch": 0.6997984096763356, + "grad_norm": 1.4724360704421997, + "learning_rate": 6.245000000000001e-05, + "loss": 0.4186, + "step": 12497 + }, + { + "epoch": 0.6998544069884646, + "grad_norm": 1.420949935913086, + "learning_rate": 6.245500000000001e-05, + "loss": 0.4128, + "step": 12498 + }, + { + "epoch": 0.6999104043005936, + "grad_norm": 1.2526235580444336, + "learning_rate": 6.246000000000001e-05, + "loss": 0.3879, + "step": 12499 + }, + { + "epoch": 0.6999664016127226, + "grad_norm": 1.2274563312530518, + "learning_rate": 6.2465e-05, + "loss": 0.3422, + "step": 12500 + }, + { + "epoch": 0.7000223989248516, + "grad_norm": 1.237396240234375, + "learning_rate": 6.247e-05, + "loss": 0.3423, + "step": 12501 + }, + { + "epoch": 0.7000783962369806, + "grad_norm": 1.1403456926345825, + "learning_rate": 6.2475e-05, + "loss": 0.4302, + "step": 12502 + }, + { + "epoch": 0.7001343935491097, + "grad_norm": 1.3087342977523804, + "learning_rate": 6.248000000000001e-05, + "loss": 0.3942, + "step": 12503 + }, + { + "epoch": 0.7001903908612387, + "grad_norm": 1.552927851676941, + "learning_rate": 6.248500000000001e-05, + "loss": 0.6173, + "step": 12504 + }, + { + "epoch": 0.7002463881733677, + "grad_norm": 1.3391687870025635, + "learning_rate": 6.249e-05, + "loss": 0.5829, + "step": 12505 + }, + { + "epoch": 0.7003023854854967, + "grad_norm": 1.5082244873046875, + "learning_rate": 6.2495e-05, + "loss": 0.4968, + "step": 12506 + }, + { + "epoch": 0.7003583827976257, + "grad_norm": 1.4176688194274902, + "learning_rate": 6.25e-05, + "loss": 0.4616, + "step": 12507 + }, + { + "epoch": 0.7004143801097548, + "grad_norm": 1.3099764585494995, + 
"learning_rate": 6.2505e-05, + "loss": 0.437, + "step": 12508 + }, + { + "epoch": 0.7004703774218838, + "grad_norm": 1.3941704034805298, + "learning_rate": 6.251e-05, + "loss": 0.3963, + "step": 12509 + }, + { + "epoch": 0.7005263747340128, + "grad_norm": 1.2004023790359497, + "learning_rate": 6.251500000000001e-05, + "loss": 0.3655, + "step": 12510 + }, + { + "epoch": 0.7005823720461418, + "grad_norm": 1.379607081413269, + "learning_rate": 6.252e-05, + "loss": 0.5685, + "step": 12511 + }, + { + "epoch": 0.7006383693582708, + "grad_norm": 1.3818296194076538, + "learning_rate": 6.2525e-05, + "loss": 0.3594, + "step": 12512 + }, + { + "epoch": 0.7006943666703999, + "grad_norm": 1.1807204484939575, + "learning_rate": 6.253e-05, + "loss": 0.3541, + "step": 12513 + }, + { + "epoch": 0.7007503639825289, + "grad_norm": 1.0599114894866943, + "learning_rate": 6.2535e-05, + "loss": 0.3048, + "step": 12514 + }, + { + "epoch": 0.7008063612946579, + "grad_norm": 1.2263143062591553, + "learning_rate": 6.254e-05, + "loss": 0.4669, + "step": 12515 + }, + { + "epoch": 0.7008623586067869, + "grad_norm": 2.299558639526367, + "learning_rate": 6.254499999999999e-05, + "loss": 0.5666, + "step": 12516 + }, + { + "epoch": 0.7009183559189159, + "grad_norm": 1.2690045833587646, + "learning_rate": 6.255e-05, + "loss": 0.3804, + "step": 12517 + }, + { + "epoch": 0.700974353231045, + "grad_norm": 1.2532697916030884, + "learning_rate": 6.255500000000001e-05, + "loss": 0.498, + "step": 12518 + }, + { + "epoch": 0.701030350543174, + "grad_norm": 1.5119946002960205, + "learning_rate": 6.256000000000001e-05, + "loss": 0.5361, + "step": 12519 + }, + { + "epoch": 0.701086347855303, + "grad_norm": 1.0479772090911865, + "learning_rate": 6.256500000000001e-05, + "loss": 0.3354, + "step": 12520 + }, + { + "epoch": 0.701142345167432, + "grad_norm": 1.426035761833191, + "learning_rate": 6.257e-05, + "loss": 0.329, + "step": 12521 + }, + { + "epoch": 0.701198342479561, + "grad_norm": 1.311549425125122, + 
"learning_rate": 6.2575e-05, + "loss": 0.4817, + "step": 12522 + }, + { + "epoch": 0.70125433979169, + "grad_norm": 1.2965736389160156, + "learning_rate": 6.258e-05, + "loss": 0.6261, + "step": 12523 + }, + { + "epoch": 0.7013103371038191, + "grad_norm": 1.3294415473937988, + "learning_rate": 6.258500000000001e-05, + "loss": 0.4701, + "step": 12524 + }, + { + "epoch": 0.7013663344159481, + "grad_norm": 1.2690424919128418, + "learning_rate": 6.259000000000001e-05, + "loss": 0.3445, + "step": 12525 + }, + { + "epoch": 0.701422331728077, + "grad_norm": 1.2250972986221313, + "learning_rate": 6.2595e-05, + "loss": 0.3364, + "step": 12526 + }, + { + "epoch": 0.701478329040206, + "grad_norm": 1.245665431022644, + "learning_rate": 6.26e-05, + "loss": 0.4073, + "step": 12527 + }, + { + "epoch": 0.701534326352335, + "grad_norm": 2.446558713912964, + "learning_rate": 6.2605e-05, + "loss": 0.4662, + "step": 12528 + }, + { + "epoch": 0.701590323664464, + "grad_norm": 1.2567524909973145, + "learning_rate": 6.261e-05, + "loss": 0.3178, + "step": 12529 + }, + { + "epoch": 0.7016463209765931, + "grad_norm": 1.420933485031128, + "learning_rate": 6.2615e-05, + "loss": 0.387, + "step": 12530 + }, + { + "epoch": 0.7017023182887221, + "grad_norm": 1.4116603136062622, + "learning_rate": 6.262000000000001e-05, + "loss": 0.4641, + "step": 12531 + }, + { + "epoch": 0.7017583156008511, + "grad_norm": 1.1638530492782593, + "learning_rate": 6.2625e-05, + "loss": 0.4309, + "step": 12532 + }, + { + "epoch": 0.7018143129129801, + "grad_norm": 1.426966667175293, + "learning_rate": 6.263e-05, + "loss": 0.4635, + "step": 12533 + }, + { + "epoch": 0.7018703102251092, + "grad_norm": 1.309417963027954, + "learning_rate": 6.2635e-05, + "loss": 0.452, + "step": 12534 + }, + { + "epoch": 0.7019263075372382, + "grad_norm": 1.2854201793670654, + "learning_rate": 6.264e-05, + "loss": 0.4823, + "step": 12535 + }, + { + "epoch": 0.7019823048493672, + "grad_norm": 1.50555419921875, + "learning_rate": 
6.2645e-05, + "loss": 0.4936, + "step": 12536 + }, + { + "epoch": 0.7020383021614962, + "grad_norm": 1.2499947547912598, + "learning_rate": 6.264999999999999e-05, + "loss": 0.4279, + "step": 12537 + }, + { + "epoch": 0.7020942994736252, + "grad_norm": 1.2926799058914185, + "learning_rate": 6.2655e-05, + "loss": 0.2997, + "step": 12538 + }, + { + "epoch": 0.7021502967857542, + "grad_norm": 1.2959471940994263, + "learning_rate": 6.266000000000001e-05, + "loss": 0.3984, + "step": 12539 + }, + { + "epoch": 0.7022062940978833, + "grad_norm": 1.2488656044006348, + "learning_rate": 6.266500000000001e-05, + "loss": 0.4622, + "step": 12540 + }, + { + "epoch": 0.7022622914100123, + "grad_norm": 1.310998797416687, + "learning_rate": 6.267000000000001e-05, + "loss": 0.4812, + "step": 12541 + }, + { + "epoch": 0.7023182887221413, + "grad_norm": 1.280368685722351, + "learning_rate": 6.2675e-05, + "loss": 0.4793, + "step": 12542 + }, + { + "epoch": 0.7023742860342703, + "grad_norm": 1.4046286344528198, + "learning_rate": 6.268e-05, + "loss": 0.3768, + "step": 12543 + }, + { + "epoch": 0.7024302833463993, + "grad_norm": 1.514461636543274, + "learning_rate": 6.2685e-05, + "loss": 0.3981, + "step": 12544 + }, + { + "epoch": 0.7024862806585284, + "grad_norm": 1.2564232349395752, + "learning_rate": 6.269e-05, + "loss": 0.4399, + "step": 12545 + }, + { + "epoch": 0.7025422779706574, + "grad_norm": 1.1517951488494873, + "learning_rate": 6.269500000000001e-05, + "loss": 0.424, + "step": 12546 + }, + { + "epoch": 0.7025982752827864, + "grad_norm": 1.3670238256454468, + "learning_rate": 6.27e-05, + "loss": 0.5629, + "step": 12547 + }, + { + "epoch": 0.7026542725949154, + "grad_norm": 1.3139759302139282, + "learning_rate": 6.2705e-05, + "loss": 0.4085, + "step": 12548 + }, + { + "epoch": 0.7027102699070444, + "grad_norm": 1.2447679042816162, + "learning_rate": 6.271e-05, + "loss": 0.5478, + "step": 12549 + }, + { + "epoch": 0.7027662672191735, + "grad_norm": 1.2819429636001587, + 
"learning_rate": 6.2715e-05, + "loss": 0.3848, + "step": 12550 + }, + { + "epoch": 0.7028222645313025, + "grad_norm": 3.032064437866211, + "learning_rate": 6.272e-05, + "loss": 0.4426, + "step": 12551 + }, + { + "epoch": 0.7028782618434315, + "grad_norm": 1.4964185953140259, + "learning_rate": 6.2725e-05, + "loss": 0.516, + "step": 12552 + }, + { + "epoch": 0.7029342591555605, + "grad_norm": 1.7251659631729126, + "learning_rate": 6.273e-05, + "loss": 0.4446, + "step": 12553 + }, + { + "epoch": 0.7029902564676895, + "grad_norm": 1.5610597133636475, + "learning_rate": 6.2735e-05, + "loss": 0.5465, + "step": 12554 + }, + { + "epoch": 0.7030462537798186, + "grad_norm": 1.3260618448257446, + "learning_rate": 6.274e-05, + "loss": 0.49, + "step": 12555 + }, + { + "epoch": 0.7031022510919476, + "grad_norm": 1.0873066186904907, + "learning_rate": 6.2745e-05, + "loss": 0.3162, + "step": 12556 + }, + { + "epoch": 0.7031582484040766, + "grad_norm": 1.1887412071228027, + "learning_rate": 6.275e-05, + "loss": 0.3976, + "step": 12557 + }, + { + "epoch": 0.7032142457162056, + "grad_norm": 1.362644076347351, + "learning_rate": 6.2755e-05, + "loss": 0.4904, + "step": 12558 + }, + { + "epoch": 0.7032702430283346, + "grad_norm": 1.2175319194793701, + "learning_rate": 6.276e-05, + "loss": 0.4208, + "step": 12559 + }, + { + "epoch": 0.7033262403404636, + "grad_norm": 1.2674423456192017, + "learning_rate": 6.276500000000001e-05, + "loss": 0.3864, + "step": 12560 + }, + { + "epoch": 0.7033822376525927, + "grad_norm": 1.1710255146026611, + "learning_rate": 6.277000000000001e-05, + "loss": 0.4219, + "step": 12561 + }, + { + "epoch": 0.7034382349647217, + "grad_norm": 1.1940264701843262, + "learning_rate": 6.277500000000001e-05, + "loss": 0.3946, + "step": 12562 + }, + { + "epoch": 0.7034942322768507, + "grad_norm": 1.3122938871383667, + "learning_rate": 6.278e-05, + "loss": 0.3898, + "step": 12563 + }, + { + "epoch": 0.7035502295889797, + "grad_norm": 1.266348958015442, + "learning_rate": 
6.2785e-05, + "loss": 0.5555, + "step": 12564 + }, + { + "epoch": 0.7036062269011087, + "grad_norm": 1.386495590209961, + "learning_rate": 6.279e-05, + "loss": 0.4076, + "step": 12565 + }, + { + "epoch": 0.7036622242132378, + "grad_norm": 1.2832168340682983, + "learning_rate": 6.2795e-05, + "loss": 0.4575, + "step": 12566 + }, + { + "epoch": 0.7037182215253668, + "grad_norm": 1.2073702812194824, + "learning_rate": 6.280000000000001e-05, + "loss": 0.3843, + "step": 12567 + }, + { + "epoch": 0.7037742188374958, + "grad_norm": 1.2366464138031006, + "learning_rate": 6.2805e-05, + "loss": 0.4584, + "step": 12568 + }, + { + "epoch": 0.7038302161496248, + "grad_norm": 1.237715482711792, + "learning_rate": 6.281e-05, + "loss": 0.446, + "step": 12569 + }, + { + "epoch": 0.7038862134617538, + "grad_norm": 1.3560377359390259, + "learning_rate": 6.2815e-05, + "loss": 0.376, + "step": 12570 + }, + { + "epoch": 0.7039422107738829, + "grad_norm": 1.4787386655807495, + "learning_rate": 6.282e-05, + "loss": 0.4276, + "step": 12571 + }, + { + "epoch": 0.7039982080860119, + "grad_norm": 1.650567889213562, + "learning_rate": 6.2825e-05, + "loss": 0.5232, + "step": 12572 + }, + { + "epoch": 0.7040542053981409, + "grad_norm": 1.2303379774093628, + "learning_rate": 6.283e-05, + "loss": 0.3249, + "step": 12573 + }, + { + "epoch": 0.7041102027102699, + "grad_norm": 1.3153318166732788, + "learning_rate": 6.2835e-05, + "loss": 0.5754, + "step": 12574 + }, + { + "epoch": 0.7041662000223989, + "grad_norm": 1.261511206626892, + "learning_rate": 6.284e-05, + "loss": 0.3882, + "step": 12575 + }, + { + "epoch": 0.704222197334528, + "grad_norm": 1.7436730861663818, + "learning_rate": 6.2845e-05, + "loss": 0.5699, + "step": 12576 + }, + { + "epoch": 0.704278194646657, + "grad_norm": 1.4731782674789429, + "learning_rate": 6.285e-05, + "loss": 0.4778, + "step": 12577 + }, + { + "epoch": 0.704334191958786, + "grad_norm": 1.2351558208465576, + "learning_rate": 6.285500000000001e-05, + "loss": 0.4492, + 
"step": 12578 + }, + { + "epoch": 0.704390189270915, + "grad_norm": 1.2724463939666748, + "learning_rate": 6.286e-05, + "loss": 0.3817, + "step": 12579 + }, + { + "epoch": 0.704446186583044, + "grad_norm": 1.1775709390640259, + "learning_rate": 6.2865e-05, + "loss": 0.4378, + "step": 12580 + }, + { + "epoch": 0.704502183895173, + "grad_norm": 1.4919737577438354, + "learning_rate": 6.287000000000001e-05, + "loss": 0.4648, + "step": 12581 + }, + { + "epoch": 0.7045581812073021, + "grad_norm": 1.245802402496338, + "learning_rate": 6.287500000000001e-05, + "loss": 0.373, + "step": 12582 + }, + { + "epoch": 0.7046141785194311, + "grad_norm": 1.3589507341384888, + "learning_rate": 6.288000000000001e-05, + "loss": 0.5279, + "step": 12583 + }, + { + "epoch": 0.7046701758315601, + "grad_norm": 1.2483769655227661, + "learning_rate": 6.2885e-05, + "loss": 0.3568, + "step": 12584 + }, + { + "epoch": 0.7047261731436891, + "grad_norm": 1.0464344024658203, + "learning_rate": 6.289e-05, + "loss": 0.4032, + "step": 12585 + }, + { + "epoch": 0.7047821704558181, + "grad_norm": 1.4985077381134033, + "learning_rate": 6.2895e-05, + "loss": 0.4843, + "step": 12586 + }, + { + "epoch": 0.7048381677679472, + "grad_norm": 1.618809461593628, + "learning_rate": 6.29e-05, + "loss": 0.4632, + "step": 12587 + }, + { + "epoch": 0.7048941650800762, + "grad_norm": 1.4173755645751953, + "learning_rate": 6.290500000000001e-05, + "loss": 0.4738, + "step": 12588 + }, + { + "epoch": 0.7049501623922052, + "grad_norm": 1.209714412689209, + "learning_rate": 6.291e-05, + "loss": 0.3928, + "step": 12589 + }, + { + "epoch": 0.7050061597043342, + "grad_norm": 1.331560730934143, + "learning_rate": 6.2915e-05, + "loss": 0.4719, + "step": 12590 + }, + { + "epoch": 0.7050621570164632, + "grad_norm": 1.475605845451355, + "learning_rate": 6.292e-05, + "loss": 0.5298, + "step": 12591 + }, + { + "epoch": 0.7051181543285923, + "grad_norm": 1.38764488697052, + "learning_rate": 6.2925e-05, + "loss": 0.4586, + "step": 
12592 + }, + { + "epoch": 0.7051741516407213, + "grad_norm": 1.2886463403701782, + "learning_rate": 6.293e-05, + "loss": 0.4298, + "step": 12593 + }, + { + "epoch": 0.7052301489528503, + "grad_norm": 1.1384798288345337, + "learning_rate": 6.293499999999999e-05, + "loss": 0.4246, + "step": 12594 + }, + { + "epoch": 0.7052861462649793, + "grad_norm": 1.3754823207855225, + "learning_rate": 6.294e-05, + "loss": 0.4228, + "step": 12595 + }, + { + "epoch": 0.7053421435771083, + "grad_norm": 1.4134275913238525, + "learning_rate": 6.2945e-05, + "loss": 0.597, + "step": 12596 + }, + { + "epoch": 0.7053981408892374, + "grad_norm": 1.2388256788253784, + "learning_rate": 6.295e-05, + "loss": 0.4314, + "step": 12597 + }, + { + "epoch": 0.7054541382013664, + "grad_norm": 1.4481347799301147, + "learning_rate": 6.295500000000001e-05, + "loss": 0.4934, + "step": 12598 + }, + { + "epoch": 0.7055101355134954, + "grad_norm": 1.091850757598877, + "learning_rate": 6.296000000000001e-05, + "loss": 0.3545, + "step": 12599 + }, + { + "epoch": 0.7055661328256244, + "grad_norm": 1.3457605838775635, + "learning_rate": 6.2965e-05, + "loss": 0.5096, + "step": 12600 + }, + { + "epoch": 0.7056221301377534, + "grad_norm": 1.4634277820587158, + "learning_rate": 6.297e-05, + "loss": 0.4189, + "step": 12601 + }, + { + "epoch": 0.7056781274498825, + "grad_norm": 1.5778405666351318, + "learning_rate": 6.297500000000001e-05, + "loss": 0.492, + "step": 12602 + }, + { + "epoch": 0.7057341247620115, + "grad_norm": 1.2554309368133545, + "learning_rate": 6.298000000000001e-05, + "loss": 0.3473, + "step": 12603 + }, + { + "epoch": 0.7057901220741405, + "grad_norm": 1.4002500772476196, + "learning_rate": 6.298500000000001e-05, + "loss": 0.5078, + "step": 12604 + }, + { + "epoch": 0.7058461193862695, + "grad_norm": 1.460457444190979, + "learning_rate": 6.299e-05, + "loss": 0.3477, + "step": 12605 + }, + { + "epoch": 0.7059021166983985, + "grad_norm": 1.3724795579910278, + "learning_rate": 6.2995e-05, + "loss": 
0.4048, + "step": 12606 + }, + { + "epoch": 0.7059581140105275, + "grad_norm": 1.3621898889541626, + "learning_rate": 6.3e-05, + "loss": 0.5058, + "step": 12607 + }, + { + "epoch": 0.7060141113226566, + "grad_norm": 1.2200628519058228, + "learning_rate": 6.3005e-05, + "loss": 0.4472, + "step": 12608 + }, + { + "epoch": 0.7060701086347855, + "grad_norm": 1.37726628780365, + "learning_rate": 6.301000000000001e-05, + "loss": 0.4492, + "step": 12609 + }, + { + "epoch": 0.7061261059469145, + "grad_norm": 1.4450502395629883, + "learning_rate": 6.3015e-05, + "loss": 0.4134, + "step": 12610 + }, + { + "epoch": 0.7061821032590435, + "grad_norm": 1.2746467590332031, + "learning_rate": 6.302e-05, + "loss": 0.4434, + "step": 12611 + }, + { + "epoch": 0.7062381005711725, + "grad_norm": 1.3004050254821777, + "learning_rate": 6.3025e-05, + "loss": 0.4678, + "step": 12612 + }, + { + "epoch": 0.7062940978833016, + "grad_norm": 1.4135793447494507, + "learning_rate": 6.303e-05, + "loss": 0.505, + "step": 12613 + }, + { + "epoch": 0.7063500951954306, + "grad_norm": 1.072414755821228, + "learning_rate": 6.3035e-05, + "loss": 0.3845, + "step": 12614 + }, + { + "epoch": 0.7064060925075596, + "grad_norm": 1.2727646827697754, + "learning_rate": 6.303999999999999e-05, + "loss": 0.5119, + "step": 12615 + }, + { + "epoch": 0.7064620898196886, + "grad_norm": 1.1109548807144165, + "learning_rate": 6.3045e-05, + "loss": 0.3195, + "step": 12616 + }, + { + "epoch": 0.7065180871318176, + "grad_norm": 1.3027209043502808, + "learning_rate": 6.305e-05, + "loss": 0.5456, + "step": 12617 + }, + { + "epoch": 0.7065740844439466, + "grad_norm": 1.3179601430892944, + "learning_rate": 6.305500000000001e-05, + "loss": 0.4321, + "step": 12618 + }, + { + "epoch": 0.7066300817560757, + "grad_norm": 1.138843059539795, + "learning_rate": 6.306000000000001e-05, + "loss": 0.4206, + "step": 12619 + }, + { + "epoch": 0.7066860790682047, + "grad_norm": 1.6393864154815674, + "learning_rate": 6.306500000000001e-05, + 
"loss": 0.6926, + "step": 12620 + }, + { + "epoch": 0.7067420763803337, + "grad_norm": 1.7525949478149414, + "learning_rate": 6.307e-05, + "loss": 0.4402, + "step": 12621 + }, + { + "epoch": 0.7067980736924627, + "grad_norm": 1.4333950281143188, + "learning_rate": 6.3075e-05, + "loss": 0.4057, + "step": 12622 + }, + { + "epoch": 0.7068540710045917, + "grad_norm": 1.2789214849472046, + "learning_rate": 6.308e-05, + "loss": 0.3135, + "step": 12623 + }, + { + "epoch": 0.7069100683167208, + "grad_norm": 1.5111428499221802, + "learning_rate": 6.308500000000001e-05, + "loss": 0.3997, + "step": 12624 + }, + { + "epoch": 0.7069660656288498, + "grad_norm": 1.579714298248291, + "learning_rate": 6.309000000000001e-05, + "loss": 0.4526, + "step": 12625 + }, + { + "epoch": 0.7070220629409788, + "grad_norm": 1.551098346710205, + "learning_rate": 6.3095e-05, + "loss": 0.4695, + "step": 12626 + }, + { + "epoch": 0.7070780602531078, + "grad_norm": 1.2486560344696045, + "learning_rate": 6.31e-05, + "loss": 0.4226, + "step": 12627 + }, + { + "epoch": 0.7071340575652368, + "grad_norm": 1.126120924949646, + "learning_rate": 6.3105e-05, + "loss": 0.3612, + "step": 12628 + }, + { + "epoch": 0.7071900548773659, + "grad_norm": 1.2583171129226685, + "learning_rate": 6.311e-05, + "loss": 0.3793, + "step": 12629 + }, + { + "epoch": 0.7072460521894949, + "grad_norm": 1.390459656715393, + "learning_rate": 6.311500000000001e-05, + "loss": 0.4242, + "step": 12630 + }, + { + "epoch": 0.7073020495016239, + "grad_norm": 1.1530574560165405, + "learning_rate": 6.312e-05, + "loss": 0.4057, + "step": 12631 + }, + { + "epoch": 0.7073580468137529, + "grad_norm": 1.2216347455978394, + "learning_rate": 6.3125e-05, + "loss": 0.3678, + "step": 12632 + }, + { + "epoch": 0.7074140441258819, + "grad_norm": 1.5343226194381714, + "learning_rate": 6.313e-05, + "loss": 0.5646, + "step": 12633 + }, + { + "epoch": 0.707470041438011, + "grad_norm": 1.2741063833236694, + "learning_rate": 6.3135e-05, + "loss": 0.4095, + 
"step": 12634 + }, + { + "epoch": 0.70752603875014, + "grad_norm": 1.3892006874084473, + "learning_rate": 6.314e-05, + "loss": 0.4059, + "step": 12635 + }, + { + "epoch": 0.707582036062269, + "grad_norm": 1.618849754333496, + "learning_rate": 6.314499999999999e-05, + "loss": 0.4831, + "step": 12636 + }, + { + "epoch": 0.707638033374398, + "grad_norm": 1.3106848001480103, + "learning_rate": 6.315e-05, + "loss": 0.4332, + "step": 12637 + }, + { + "epoch": 0.707694030686527, + "grad_norm": 1.281575083732605, + "learning_rate": 6.3155e-05, + "loss": 0.4828, + "step": 12638 + }, + { + "epoch": 0.707750027998656, + "grad_norm": 1.260123610496521, + "learning_rate": 6.316000000000001e-05, + "loss": 0.4523, + "step": 12639 + }, + { + "epoch": 0.7078060253107851, + "grad_norm": 1.2649827003479004, + "learning_rate": 6.316500000000001e-05, + "loss": 0.5126, + "step": 12640 + }, + { + "epoch": 0.7078620226229141, + "grad_norm": 1.6030091047286987, + "learning_rate": 6.317e-05, + "loss": 0.4465, + "step": 12641 + }, + { + "epoch": 0.7079180199350431, + "grad_norm": 1.3819293975830078, + "learning_rate": 6.3175e-05, + "loss": 0.4599, + "step": 12642 + }, + { + "epoch": 0.7079740172471721, + "grad_norm": 1.3676986694335938, + "learning_rate": 6.318e-05, + "loss": 0.4164, + "step": 12643 + }, + { + "epoch": 0.7080300145593011, + "grad_norm": 1.3112050294876099, + "learning_rate": 6.3185e-05, + "loss": 0.4768, + "step": 12644 + }, + { + "epoch": 0.7080860118714302, + "grad_norm": 1.3456107378005981, + "learning_rate": 6.319000000000001e-05, + "loss": 0.4973, + "step": 12645 + }, + { + "epoch": 0.7081420091835592, + "grad_norm": 1.1713309288024902, + "learning_rate": 6.319500000000001e-05, + "loss": 0.3567, + "step": 12646 + }, + { + "epoch": 0.7081980064956882, + "grad_norm": 1.4542640447616577, + "learning_rate": 6.32e-05, + "loss": 0.5801, + "step": 12647 + }, + { + "epoch": 0.7082540038078172, + "grad_norm": 1.784436583518982, + "learning_rate": 6.3205e-05, + "loss": 0.5084, + 
"step": 12648 + }, + { + "epoch": 0.7083100011199462, + "grad_norm": 1.2475916147232056, + "learning_rate": 6.321e-05, + "loss": 0.4811, + "step": 12649 + }, + { + "epoch": 0.7083659984320753, + "grad_norm": 1.2328693866729736, + "learning_rate": 6.3215e-05, + "loss": 0.4592, + "step": 12650 + }, + { + "epoch": 0.7084219957442043, + "grad_norm": 1.145531177520752, + "learning_rate": 6.322000000000001e-05, + "loss": 0.424, + "step": 12651 + }, + { + "epoch": 0.7084779930563333, + "grad_norm": 1.4359947443008423, + "learning_rate": 6.3225e-05, + "loss": 0.4709, + "step": 12652 + }, + { + "epoch": 0.7085339903684623, + "grad_norm": 1.411778211593628, + "learning_rate": 6.323e-05, + "loss": 0.4565, + "step": 12653 + }, + { + "epoch": 0.7085899876805913, + "grad_norm": 1.3843156099319458, + "learning_rate": 6.3235e-05, + "loss": 0.485, + "step": 12654 + }, + { + "epoch": 0.7086459849927204, + "grad_norm": 1.4193692207336426, + "learning_rate": 6.324e-05, + "loss": 0.3932, + "step": 12655 + }, + { + "epoch": 0.7087019823048494, + "grad_norm": 1.2704010009765625, + "learning_rate": 6.3245e-05, + "loss": 0.4651, + "step": 12656 + }, + { + "epoch": 0.7087579796169784, + "grad_norm": 1.1232733726501465, + "learning_rate": 6.324999999999999e-05, + "loss": 0.3719, + "step": 12657 + }, + { + "epoch": 0.7088139769291074, + "grad_norm": 1.3058933019638062, + "learning_rate": 6.3255e-05, + "loss": 0.3575, + "step": 12658 + }, + { + "epoch": 0.7088699742412364, + "grad_norm": 1.5541898012161255, + "learning_rate": 6.326000000000001e-05, + "loss": 0.6517, + "step": 12659 + }, + { + "epoch": 0.7089259715533655, + "grad_norm": 1.325561761856079, + "learning_rate": 6.326500000000001e-05, + "loss": 0.3889, + "step": 12660 + }, + { + "epoch": 0.7089819688654945, + "grad_norm": 1.5662405490875244, + "learning_rate": 6.327000000000001e-05, + "loss": 0.405, + "step": 12661 + }, + { + "epoch": 0.7090379661776235, + "grad_norm": 1.0807271003723145, + "learning_rate": 6.3275e-05, + "loss": 
0.3512, + "step": 12662 + }, + { + "epoch": 0.7090939634897525, + "grad_norm": 1.5273305177688599, + "learning_rate": 6.328e-05, + "loss": 0.6008, + "step": 12663 + }, + { + "epoch": 0.7091499608018815, + "grad_norm": 1.6548724174499512, + "learning_rate": 6.3285e-05, + "loss": 0.5224, + "step": 12664 + }, + { + "epoch": 0.7092059581140105, + "grad_norm": 1.264316201210022, + "learning_rate": 6.329e-05, + "loss": 0.5297, + "step": 12665 + }, + { + "epoch": 0.7092619554261396, + "grad_norm": 1.1656309366226196, + "learning_rate": 6.329500000000001e-05, + "loss": 0.3994, + "step": 12666 + }, + { + "epoch": 0.7093179527382686, + "grad_norm": 2.9174137115478516, + "learning_rate": 6.330000000000001e-05, + "loss": 0.4231, + "step": 12667 + }, + { + "epoch": 0.7093739500503976, + "grad_norm": 1.241214632987976, + "learning_rate": 6.3305e-05, + "loss": 0.4536, + "step": 12668 + }, + { + "epoch": 0.7094299473625266, + "grad_norm": 1.5645548105239868, + "learning_rate": 6.331e-05, + "loss": 0.5548, + "step": 12669 + }, + { + "epoch": 0.7094859446746556, + "grad_norm": 1.4205975532531738, + "learning_rate": 6.3315e-05, + "loss": 0.6061, + "step": 12670 + }, + { + "epoch": 0.7095419419867847, + "grad_norm": 1.774970531463623, + "learning_rate": 6.332e-05, + "loss": 0.4486, + "step": 12671 + }, + { + "epoch": 0.7095979392989137, + "grad_norm": 1.1116825342178345, + "learning_rate": 6.3325e-05, + "loss": 0.3824, + "step": 12672 + }, + { + "epoch": 0.7096539366110427, + "grad_norm": 1.1717495918273926, + "learning_rate": 6.333e-05, + "loss": 0.5045, + "step": 12673 + }, + { + "epoch": 0.7097099339231717, + "grad_norm": 1.1295884847640991, + "learning_rate": 6.3335e-05, + "loss": 0.3551, + "step": 12674 + }, + { + "epoch": 0.7097659312353007, + "grad_norm": 1.3893578052520752, + "learning_rate": 6.334e-05, + "loss": 0.5575, + "step": 12675 + }, + { + "epoch": 0.7098219285474298, + "grad_norm": 1.4093233346939087, + "learning_rate": 6.3345e-05, + "loss": 0.4796, + "step": 12676 + 
}, + { + "epoch": 0.7098779258595588, + "grad_norm": 1.3685671091079712, + "learning_rate": 6.335e-05, + "loss": 0.4079, + "step": 12677 + }, + { + "epoch": 0.7099339231716878, + "grad_norm": 1.190311074256897, + "learning_rate": 6.335499999999999e-05, + "loss": 0.4145, + "step": 12678 + }, + { + "epoch": 0.7099899204838168, + "grad_norm": 1.2564928531646729, + "learning_rate": 6.336e-05, + "loss": 0.4877, + "step": 12679 + }, + { + "epoch": 0.7100459177959458, + "grad_norm": 1.264681100845337, + "learning_rate": 6.336500000000001e-05, + "loss": 0.4781, + "step": 12680 + }, + { + "epoch": 0.7101019151080749, + "grad_norm": 1.2790048122406006, + "learning_rate": 6.337000000000001e-05, + "loss": 0.4392, + "step": 12681 + }, + { + "epoch": 0.7101579124202039, + "grad_norm": 1.1290431022644043, + "learning_rate": 6.337500000000001e-05, + "loss": 0.4491, + "step": 12682 + }, + { + "epoch": 0.7102139097323329, + "grad_norm": 1.465829610824585, + "learning_rate": 6.338e-05, + "loss": 0.6806, + "step": 12683 + }, + { + "epoch": 0.7102699070444619, + "grad_norm": 1.6494287252426147, + "learning_rate": 6.3385e-05, + "loss": 0.486, + "step": 12684 + }, + { + "epoch": 0.7103259043565909, + "grad_norm": 1.357857584953308, + "learning_rate": 6.339e-05, + "loss": 0.4848, + "step": 12685 + }, + { + "epoch": 0.71038190166872, + "grad_norm": 1.3813881874084473, + "learning_rate": 6.3395e-05, + "loss": 0.4979, + "step": 12686 + }, + { + "epoch": 0.710437898980849, + "grad_norm": 1.551567792892456, + "learning_rate": 6.340000000000001e-05, + "loss": 0.3673, + "step": 12687 + }, + { + "epoch": 0.710493896292978, + "grad_norm": 1.2936947345733643, + "learning_rate": 6.340500000000001e-05, + "loss": 0.4462, + "step": 12688 + }, + { + "epoch": 0.710549893605107, + "grad_norm": 1.3695658445358276, + "learning_rate": 6.341e-05, + "loss": 0.4734, + "step": 12689 + }, + { + "epoch": 0.710605890917236, + "grad_norm": 1.2387371063232422, + "learning_rate": 6.3415e-05, + "loss": 0.3892, + 
"step": 12690 + }, + { + "epoch": 0.7106618882293649, + "grad_norm": 1.2142603397369385, + "learning_rate": 6.342e-05, + "loss": 0.4328, + "step": 12691 + }, + { + "epoch": 0.710717885541494, + "grad_norm": 1.2077988386154175, + "learning_rate": 6.3425e-05, + "loss": 0.415, + "step": 12692 + }, + { + "epoch": 0.710773882853623, + "grad_norm": 1.2667608261108398, + "learning_rate": 6.343e-05, + "loss": 0.4881, + "step": 12693 + }, + { + "epoch": 0.710829880165752, + "grad_norm": 1.4840319156646729, + "learning_rate": 6.3435e-05, + "loss": 0.48, + "step": 12694 + }, + { + "epoch": 0.710885877477881, + "grad_norm": 1.4201076030731201, + "learning_rate": 6.344e-05, + "loss": 0.5295, + "step": 12695 + }, + { + "epoch": 0.71094187479001, + "grad_norm": 1.192383885383606, + "learning_rate": 6.3445e-05, + "loss": 0.3654, + "step": 12696 + }, + { + "epoch": 0.710997872102139, + "grad_norm": 1.3051652908325195, + "learning_rate": 6.345e-05, + "loss": 0.4585, + "step": 12697 + }, + { + "epoch": 0.7110538694142681, + "grad_norm": 1.7367881536483765, + "learning_rate": 6.3455e-05, + "loss": 0.4431, + "step": 12698 + }, + { + "epoch": 0.7111098667263971, + "grad_norm": 1.2035497426986694, + "learning_rate": 6.346e-05, + "loss": 0.4178, + "step": 12699 + }, + { + "epoch": 0.7111658640385261, + "grad_norm": 1.2499315738677979, + "learning_rate": 6.3465e-05, + "loss": 0.4182, + "step": 12700 + }, + { + "epoch": 0.7112218613506551, + "grad_norm": 1.4131886959075928, + "learning_rate": 6.347e-05, + "loss": 0.4649, + "step": 12701 + }, + { + "epoch": 0.7112778586627841, + "grad_norm": 1.190421223640442, + "learning_rate": 6.347500000000001e-05, + "loss": 0.4147, + "step": 12702 + }, + { + "epoch": 0.7113338559749132, + "grad_norm": 1.193487286567688, + "learning_rate": 6.348000000000001e-05, + "loss": 0.4261, + "step": 12703 + }, + { + "epoch": 0.7113898532870422, + "grad_norm": 1.1812646389007568, + "learning_rate": 6.3485e-05, + "loss": 0.4381, + "step": 12704 + }, + { + "epoch": 
0.7114458505991712, + "grad_norm": 1.2294243574142456, + "learning_rate": 6.349e-05, + "loss": 0.3583, + "step": 12705 + }, + { + "epoch": 0.7115018479113002, + "grad_norm": 2.127265691757202, + "learning_rate": 6.3495e-05, + "loss": 0.5528, + "step": 12706 + }, + { + "epoch": 0.7115578452234292, + "grad_norm": 1.3275443315505981, + "learning_rate": 6.35e-05, + "loss": 0.3748, + "step": 12707 + }, + { + "epoch": 0.7116138425355583, + "grad_norm": 1.2263245582580566, + "learning_rate": 6.350500000000001e-05, + "loss": 0.4958, + "step": 12708 + }, + { + "epoch": 0.7116698398476873, + "grad_norm": 1.5293418169021606, + "learning_rate": 6.351000000000001e-05, + "loss": 0.3799, + "step": 12709 + }, + { + "epoch": 0.7117258371598163, + "grad_norm": 1.5596578121185303, + "learning_rate": 6.3515e-05, + "loss": 0.4192, + "step": 12710 + }, + { + "epoch": 0.7117818344719453, + "grad_norm": 1.153295874595642, + "learning_rate": 6.352e-05, + "loss": 0.4549, + "step": 12711 + }, + { + "epoch": 0.7118378317840743, + "grad_norm": 1.3420997858047485, + "learning_rate": 6.3525e-05, + "loss": 0.4939, + "step": 12712 + }, + { + "epoch": 0.7118938290962034, + "grad_norm": 1.3926844596862793, + "learning_rate": 6.353e-05, + "loss": 0.3718, + "step": 12713 + }, + { + "epoch": 0.7119498264083324, + "grad_norm": 1.3157209157943726, + "learning_rate": 6.3535e-05, + "loss": 0.5368, + "step": 12714 + }, + { + "epoch": 0.7120058237204614, + "grad_norm": 1.580293893814087, + "learning_rate": 6.354e-05, + "loss": 0.5469, + "step": 12715 + }, + { + "epoch": 0.7120618210325904, + "grad_norm": 1.5111911296844482, + "learning_rate": 6.3545e-05, + "loss": 0.3721, + "step": 12716 + }, + { + "epoch": 0.7121178183447194, + "grad_norm": 1.2916908264160156, + "learning_rate": 6.355e-05, + "loss": 0.3981, + "step": 12717 + }, + { + "epoch": 0.7121738156568485, + "grad_norm": 1.370994210243225, + "learning_rate": 6.3555e-05, + "loss": 0.3765, + "step": 12718 + }, + { + "epoch": 0.7122298129689775, + 
"grad_norm": 1.1901644468307495, + "learning_rate": 6.356000000000001e-05, + "loss": 0.3658, + "step": 12719 + }, + { + "epoch": 0.7122858102811065, + "grad_norm": 1.3271005153656006, + "learning_rate": 6.3565e-05, + "loss": 0.5493, + "step": 12720 + }, + { + "epoch": 0.7123418075932355, + "grad_norm": 1.1498123407363892, + "learning_rate": 6.357e-05, + "loss": 0.4468, + "step": 12721 + }, + { + "epoch": 0.7123978049053645, + "grad_norm": 1.1788270473480225, + "learning_rate": 6.3575e-05, + "loss": 0.3985, + "step": 12722 + }, + { + "epoch": 0.7124538022174935, + "grad_norm": 1.8106350898742676, + "learning_rate": 6.358000000000001e-05, + "loss": 0.6286, + "step": 12723 + }, + { + "epoch": 0.7125097995296226, + "grad_norm": 1.4803918600082397, + "learning_rate": 6.358500000000001e-05, + "loss": 0.4002, + "step": 12724 + }, + { + "epoch": 0.7125657968417516, + "grad_norm": 1.188391923904419, + "learning_rate": 6.359e-05, + "loss": 0.3289, + "step": 12725 + }, + { + "epoch": 0.7126217941538806, + "grad_norm": 1.2638044357299805, + "learning_rate": 6.3595e-05, + "loss": 0.3745, + "step": 12726 + }, + { + "epoch": 0.7126777914660096, + "grad_norm": 1.017531156539917, + "learning_rate": 6.36e-05, + "loss": 0.337, + "step": 12727 + }, + { + "epoch": 0.7127337887781386, + "grad_norm": 1.3558027744293213, + "learning_rate": 6.3605e-05, + "loss": 0.3277, + "step": 12728 + }, + { + "epoch": 0.7127897860902677, + "grad_norm": 1.1347521543502808, + "learning_rate": 6.361000000000001e-05, + "loss": 0.4376, + "step": 12729 + }, + { + "epoch": 0.7128457834023967, + "grad_norm": 1.4346020221710205, + "learning_rate": 6.3615e-05, + "loss": 0.5466, + "step": 12730 + }, + { + "epoch": 0.7129017807145257, + "grad_norm": 1.54099702835083, + "learning_rate": 6.362e-05, + "loss": 0.517, + "step": 12731 + }, + { + "epoch": 0.7129577780266547, + "grad_norm": 1.1725080013275146, + "learning_rate": 6.3625e-05, + "loss": 0.3622, + "step": 12732 + }, + { + "epoch": 0.7130137753387837, + 
"grad_norm": 1.2479534149169922, + "learning_rate": 6.363e-05, + "loss": 0.3899, + "step": 12733 + }, + { + "epoch": 0.7130697726509128, + "grad_norm": 1.2837066650390625, + "learning_rate": 6.3635e-05, + "loss": 0.3856, + "step": 12734 + }, + { + "epoch": 0.7131257699630418, + "grad_norm": 2.2546517848968506, + "learning_rate": 6.364e-05, + "loss": 0.5325, + "step": 12735 + }, + { + "epoch": 0.7131817672751708, + "grad_norm": 1.216312289237976, + "learning_rate": 6.3645e-05, + "loss": 0.4332, + "step": 12736 + }, + { + "epoch": 0.7132377645872998, + "grad_norm": 1.3184088468551636, + "learning_rate": 6.365e-05, + "loss": 0.4355, + "step": 12737 + }, + { + "epoch": 0.7132937618994288, + "grad_norm": 1.1263552904129028, + "learning_rate": 6.3655e-05, + "loss": 0.3072, + "step": 12738 + }, + { + "epoch": 0.7133497592115579, + "grad_norm": 1.165421962738037, + "learning_rate": 6.366000000000001e-05, + "loss": 0.4261, + "step": 12739 + }, + { + "epoch": 0.7134057565236869, + "grad_norm": 1.4257819652557373, + "learning_rate": 6.366500000000001e-05, + "loss": 0.5089, + "step": 12740 + }, + { + "epoch": 0.7134617538358159, + "grad_norm": 1.3485167026519775, + "learning_rate": 6.367e-05, + "loss": 0.4479, + "step": 12741 + }, + { + "epoch": 0.7135177511479449, + "grad_norm": 1.4713619947433472, + "learning_rate": 6.3675e-05, + "loss": 0.4504, + "step": 12742 + }, + { + "epoch": 0.7135737484600739, + "grad_norm": 1.085688591003418, + "learning_rate": 6.368e-05, + "loss": 0.4326, + "step": 12743 + }, + { + "epoch": 0.713629745772203, + "grad_norm": 1.2095509767532349, + "learning_rate": 6.368500000000001e-05, + "loss": 0.3957, + "step": 12744 + }, + { + "epoch": 0.713685743084332, + "grad_norm": 1.5142602920532227, + "learning_rate": 6.369000000000001e-05, + "loss": 0.3968, + "step": 12745 + }, + { + "epoch": 0.713741740396461, + "grad_norm": 1.1609373092651367, + "learning_rate": 6.3695e-05, + "loss": 0.5362, + "step": 12746 + }, + { + "epoch": 0.71379773770859, + 
"grad_norm": 1.6774296760559082, + "learning_rate": 6.37e-05, + "loss": 0.5318, + "step": 12747 + }, + { + "epoch": 0.713853735020719, + "grad_norm": 1.3412060737609863, + "learning_rate": 6.3705e-05, + "loss": 0.3999, + "step": 12748 + }, + { + "epoch": 0.713909732332848, + "grad_norm": 1.1753593683242798, + "learning_rate": 6.371e-05, + "loss": 0.3468, + "step": 12749 + }, + { + "epoch": 0.7139657296449771, + "grad_norm": 1.4292224645614624, + "learning_rate": 6.371500000000001e-05, + "loss": 0.5395, + "step": 12750 + }, + { + "epoch": 0.7140217269571061, + "grad_norm": 1.2235126495361328, + "learning_rate": 6.372e-05, + "loss": 0.3857, + "step": 12751 + }, + { + "epoch": 0.7140777242692351, + "grad_norm": 1.4244022369384766, + "learning_rate": 6.3725e-05, + "loss": 0.4213, + "step": 12752 + }, + { + "epoch": 0.7141337215813641, + "grad_norm": 1.2730166912078857, + "learning_rate": 6.373e-05, + "loss": 0.4058, + "step": 12753 + }, + { + "epoch": 0.7141897188934931, + "grad_norm": 1.4534602165222168, + "learning_rate": 6.3735e-05, + "loss": 0.4449, + "step": 12754 + }, + { + "epoch": 0.7142457162056222, + "grad_norm": 1.2470502853393555, + "learning_rate": 6.374e-05, + "loss": 0.5016, + "step": 12755 + }, + { + "epoch": 0.7143017135177512, + "grad_norm": 1.146397352218628, + "learning_rate": 6.3745e-05, + "loss": 0.3615, + "step": 12756 + }, + { + "epoch": 0.7143577108298802, + "grad_norm": 1.190064549446106, + "learning_rate": 6.375e-05, + "loss": 0.4055, + "step": 12757 + }, + { + "epoch": 0.7144137081420092, + "grad_norm": 1.2497453689575195, + "learning_rate": 6.3755e-05, + "loss": 0.4397, + "step": 12758 + }, + { + "epoch": 0.7144697054541382, + "grad_norm": 2.066960096359253, + "learning_rate": 6.376e-05, + "loss": 0.357, + "step": 12759 + }, + { + "epoch": 0.7145257027662673, + "grad_norm": 1.6281381845474243, + "learning_rate": 6.376500000000001e-05, + "loss": 0.5837, + "step": 12760 + }, + { + "epoch": 0.7145817000783963, + "grad_norm": 
1.5029911994934082, + "learning_rate": 6.377000000000001e-05, + "loss": 0.4694, + "step": 12761 + }, + { + "epoch": 0.7146376973905253, + "grad_norm": 1.3037540912628174, + "learning_rate": 6.3775e-05, + "loss": 0.3326, + "step": 12762 + }, + { + "epoch": 0.7146936947026543, + "grad_norm": 1.1242607831954956, + "learning_rate": 6.378e-05, + "loss": 0.4072, + "step": 12763 + }, + { + "epoch": 0.7147496920147833, + "grad_norm": 1.3720523118972778, + "learning_rate": 6.3785e-05, + "loss": 0.4946, + "step": 12764 + }, + { + "epoch": 0.7148056893269124, + "grad_norm": 1.3609472513198853, + "learning_rate": 6.379000000000001e-05, + "loss": 0.4443, + "step": 12765 + }, + { + "epoch": 0.7148616866390414, + "grad_norm": 1.4723418951034546, + "learning_rate": 6.379500000000001e-05, + "loss": 0.4779, + "step": 12766 + }, + { + "epoch": 0.7149176839511704, + "grad_norm": 1.2363005876541138, + "learning_rate": 6.38e-05, + "loss": 0.3756, + "step": 12767 + }, + { + "epoch": 0.7149736812632994, + "grad_norm": 1.1192009449005127, + "learning_rate": 6.3805e-05, + "loss": 0.4275, + "step": 12768 + }, + { + "epoch": 0.7150296785754284, + "grad_norm": 1.8555961847305298, + "learning_rate": 6.381e-05, + "loss": 0.5432, + "step": 12769 + }, + { + "epoch": 0.7150856758875574, + "grad_norm": 1.238851547241211, + "learning_rate": 6.3815e-05, + "loss": 0.364, + "step": 12770 + }, + { + "epoch": 0.7151416731996865, + "grad_norm": 1.2626633644104004, + "learning_rate": 6.382e-05, + "loss": 0.4722, + "step": 12771 + }, + { + "epoch": 0.7151976705118155, + "grad_norm": 2.2175941467285156, + "learning_rate": 6.3825e-05, + "loss": 0.3666, + "step": 12772 + }, + { + "epoch": 0.7152536678239445, + "grad_norm": 1.3716325759887695, + "learning_rate": 6.383e-05, + "loss": 0.383, + "step": 12773 + }, + { + "epoch": 0.7153096651360734, + "grad_norm": 1.627503752708435, + "learning_rate": 6.3835e-05, + "loss": 0.4719, + "step": 12774 + }, + { + "epoch": 0.7153656624482024, + "grad_norm": 
1.6553013324737549, + "learning_rate": 6.384e-05, + "loss": 0.4163, + "step": 12775 + }, + { + "epoch": 0.7154216597603315, + "grad_norm": 1.3761839866638184, + "learning_rate": 6.3845e-05, + "loss": 0.5103, + "step": 12776 + }, + { + "epoch": 0.7154776570724605, + "grad_norm": 1.385082483291626, + "learning_rate": 6.385e-05, + "loss": 0.5036, + "step": 12777 + }, + { + "epoch": 0.7155336543845895, + "grad_norm": 1.3864158391952515, + "learning_rate": 6.3855e-05, + "loss": 0.5604, + "step": 12778 + }, + { + "epoch": 0.7155896516967185, + "grad_norm": 1.4270308017730713, + "learning_rate": 6.386e-05, + "loss": 0.5372, + "step": 12779 + }, + { + "epoch": 0.7156456490088475, + "grad_norm": 1.5776902437210083, + "learning_rate": 6.386500000000001e-05, + "loss": 0.6883, + "step": 12780 + }, + { + "epoch": 0.7157016463209765, + "grad_norm": 1.3400368690490723, + "learning_rate": 6.387000000000001e-05, + "loss": 0.4332, + "step": 12781 + }, + { + "epoch": 0.7157576436331056, + "grad_norm": 1.2608451843261719, + "learning_rate": 6.387500000000001e-05, + "loss": 0.3786, + "step": 12782 + }, + { + "epoch": 0.7158136409452346, + "grad_norm": 1.2663267850875854, + "learning_rate": 6.388e-05, + "loss": 0.4651, + "step": 12783 + }, + { + "epoch": 0.7158696382573636, + "grad_norm": 1.5927826166152954, + "learning_rate": 6.3885e-05, + "loss": 0.4652, + "step": 12784 + }, + { + "epoch": 0.7159256355694926, + "grad_norm": 1.1670840978622437, + "learning_rate": 6.389e-05, + "loss": 0.4082, + "step": 12785 + }, + { + "epoch": 0.7159816328816216, + "grad_norm": 1.3903788328170776, + "learning_rate": 6.389500000000001e-05, + "loss": 0.5638, + "step": 12786 + }, + { + "epoch": 0.7160376301937507, + "grad_norm": 1.4020166397094727, + "learning_rate": 6.390000000000001e-05, + "loss": 0.4935, + "step": 12787 + }, + { + "epoch": 0.7160936275058797, + "grad_norm": 1.471501111984253, + "learning_rate": 6.3905e-05, + "loss": 0.4349, + "step": 12788 + }, + { + "epoch": 0.7161496248180087, + 
"grad_norm": 1.266884207725525, + "learning_rate": 6.391e-05, + "loss": 0.3633, + "step": 12789 + }, + { + "epoch": 0.7162056221301377, + "grad_norm": 1.3894003629684448, + "learning_rate": 6.3915e-05, + "loss": 0.4857, + "step": 12790 + }, + { + "epoch": 0.7162616194422667, + "grad_norm": 1.3955460786819458, + "learning_rate": 6.392e-05, + "loss": 0.4931, + "step": 12791 + }, + { + "epoch": 0.7163176167543958, + "grad_norm": 1.221116065979004, + "learning_rate": 6.3925e-05, + "loss": 0.3948, + "step": 12792 + }, + { + "epoch": 0.7163736140665248, + "grad_norm": 3.43851900100708, + "learning_rate": 6.393e-05, + "loss": 0.4225, + "step": 12793 + }, + { + "epoch": 0.7164296113786538, + "grad_norm": 1.3714956045150757, + "learning_rate": 6.3935e-05, + "loss": 0.4307, + "step": 12794 + }, + { + "epoch": 0.7164856086907828, + "grad_norm": 1.4571897983551025, + "learning_rate": 6.394e-05, + "loss": 0.4694, + "step": 12795 + }, + { + "epoch": 0.7165416060029118, + "grad_norm": 1.2747544050216675, + "learning_rate": 6.3945e-05, + "loss": 0.4313, + "step": 12796 + }, + { + "epoch": 0.7165976033150409, + "grad_norm": 1.3819502592086792, + "learning_rate": 6.395e-05, + "loss": 0.4512, + "step": 12797 + }, + { + "epoch": 0.7166536006271699, + "grad_norm": 1.1545897722244263, + "learning_rate": 6.3955e-05, + "loss": 0.3672, + "step": 12798 + }, + { + "epoch": 0.7167095979392989, + "grad_norm": 1.3497068881988525, + "learning_rate": 6.396e-05, + "loss": 0.4123, + "step": 12799 + }, + { + "epoch": 0.7167655952514279, + "grad_norm": 1.1514780521392822, + "learning_rate": 6.3965e-05, + "loss": 0.422, + "step": 12800 + }, + { + "epoch": 0.7168215925635569, + "grad_norm": 1.310577154159546, + "learning_rate": 6.397000000000001e-05, + "loss": 0.3955, + "step": 12801 + }, + { + "epoch": 0.716877589875686, + "grad_norm": 1.2854505777359009, + "learning_rate": 6.397500000000001e-05, + "loss": 0.521, + "step": 12802 + }, + { + "epoch": 0.716933587187815, + "grad_norm": 1.0573272705078125, 
+ "learning_rate": 6.398000000000001e-05, + "loss": 0.3111, + "step": 12803 + }, + { + "epoch": 0.716989584499944, + "grad_norm": 12.162367820739746, + "learning_rate": 6.3985e-05, + "loss": 0.6242, + "step": 12804 + }, + { + "epoch": 0.717045581812073, + "grad_norm": 1.1641361713409424, + "learning_rate": 6.399e-05, + "loss": 0.4572, + "step": 12805 + }, + { + "epoch": 0.717101579124202, + "grad_norm": 1.853898525238037, + "learning_rate": 6.3995e-05, + "loss": 0.4984, + "step": 12806 + }, + { + "epoch": 0.717157576436331, + "grad_norm": 1.2919056415557861, + "learning_rate": 6.400000000000001e-05, + "loss": 0.4071, + "step": 12807 + }, + { + "epoch": 0.7172135737484601, + "grad_norm": 1.2785226106643677, + "learning_rate": 6.400500000000001e-05, + "loss": 0.35, + "step": 12808 + }, + { + "epoch": 0.7172695710605891, + "grad_norm": 1.0631603002548218, + "learning_rate": 6.401e-05, + "loss": 0.389, + "step": 12809 + }, + { + "epoch": 0.7173255683727181, + "grad_norm": 1.295305848121643, + "learning_rate": 6.4015e-05, + "loss": 0.4316, + "step": 12810 + }, + { + "epoch": 0.7173815656848471, + "grad_norm": 1.3592380285263062, + "learning_rate": 6.402e-05, + "loss": 0.4869, + "step": 12811 + }, + { + "epoch": 0.7174375629969761, + "grad_norm": 1.337204933166504, + "learning_rate": 6.4025e-05, + "loss": 0.3931, + "step": 12812 + }, + { + "epoch": 0.7174935603091052, + "grad_norm": 1.4174797534942627, + "learning_rate": 6.403e-05, + "loss": 0.3998, + "step": 12813 + }, + { + "epoch": 0.7175495576212342, + "grad_norm": 1.4384292364120483, + "learning_rate": 6.4035e-05, + "loss": 0.442, + "step": 12814 + }, + { + "epoch": 0.7176055549333632, + "grad_norm": 1.1505624055862427, + "learning_rate": 6.404e-05, + "loss": 0.5024, + "step": 12815 + }, + { + "epoch": 0.7176615522454922, + "grad_norm": 1.3162777423858643, + "learning_rate": 6.4045e-05, + "loss": 0.5222, + "step": 12816 + }, + { + "epoch": 0.7177175495576212, + "grad_norm": 1.2605162858963013, + "learning_rate": 
6.405e-05, + "loss": 0.2991, + "step": 12817 + }, + { + "epoch": 0.7177735468697503, + "grad_norm": 1.4509625434875488, + "learning_rate": 6.4055e-05, + "loss": 0.4539, + "step": 12818 + }, + { + "epoch": 0.7178295441818793, + "grad_norm": 1.4207115173339844, + "learning_rate": 6.405999999999999e-05, + "loss": 0.4775, + "step": 12819 + }, + { + "epoch": 0.7178855414940083, + "grad_norm": 1.4416894912719727, + "learning_rate": 6.4065e-05, + "loss": 0.4061, + "step": 12820 + }, + { + "epoch": 0.7179415388061373, + "grad_norm": 1.3681820631027222, + "learning_rate": 6.407e-05, + "loss": 0.5323, + "step": 12821 + }, + { + "epoch": 0.7179975361182663, + "grad_norm": 1.2461649179458618, + "learning_rate": 6.407500000000001e-05, + "loss": 0.4439, + "step": 12822 + }, + { + "epoch": 0.7180535334303954, + "grad_norm": 1.3961012363433838, + "learning_rate": 6.408000000000001e-05, + "loss": 0.5067, + "step": 12823 + }, + { + "epoch": 0.7181095307425244, + "grad_norm": 1.3075144290924072, + "learning_rate": 6.408500000000001e-05, + "loss": 0.35, + "step": 12824 + }, + { + "epoch": 0.7181655280546534, + "grad_norm": 1.199385166168213, + "learning_rate": 6.409e-05, + "loss": 0.5217, + "step": 12825 + }, + { + "epoch": 0.7182215253667824, + "grad_norm": 1.7142282724380493, + "learning_rate": 6.4095e-05, + "loss": 0.5228, + "step": 12826 + }, + { + "epoch": 0.7182775226789114, + "grad_norm": 1.2804813385009766, + "learning_rate": 6.41e-05, + "loss": 0.4752, + "step": 12827 + }, + { + "epoch": 0.7183335199910404, + "grad_norm": 1.6355879306793213, + "learning_rate": 6.410500000000001e-05, + "loss": 0.3663, + "step": 12828 + }, + { + "epoch": 0.7183895173031695, + "grad_norm": 2.1395740509033203, + "learning_rate": 6.411000000000001e-05, + "loss": 0.4131, + "step": 12829 + }, + { + "epoch": 0.7184455146152985, + "grad_norm": 1.2016159296035767, + "learning_rate": 6.4115e-05, + "loss": 0.4172, + "step": 12830 + }, + { + "epoch": 0.7185015119274275, + "grad_norm": 4.299350261688232, + 
"learning_rate": 6.412e-05, + "loss": 0.5124, + "step": 12831 + }, + { + "epoch": 0.7185575092395565, + "grad_norm": 1.3119045495986938, + "learning_rate": 6.4125e-05, + "loss": 0.5367, + "step": 12832 + }, + { + "epoch": 0.7186135065516855, + "grad_norm": 1.2507914304733276, + "learning_rate": 6.413e-05, + "loss": 0.4462, + "step": 12833 + }, + { + "epoch": 0.7186695038638146, + "grad_norm": 1.6498754024505615, + "learning_rate": 6.4135e-05, + "loss": 0.4533, + "step": 12834 + }, + { + "epoch": 0.7187255011759436, + "grad_norm": 1.2412898540496826, + "learning_rate": 6.414e-05, + "loss": 0.4036, + "step": 12835 + }, + { + "epoch": 0.7187814984880726, + "grad_norm": 1.275437593460083, + "learning_rate": 6.4145e-05, + "loss": 0.5085, + "step": 12836 + }, + { + "epoch": 0.7188374958002016, + "grad_norm": 1.2589842081069946, + "learning_rate": 6.415e-05, + "loss": 0.4849, + "step": 12837 + }, + { + "epoch": 0.7188934931123306, + "grad_norm": 1.271705985069275, + "learning_rate": 6.4155e-05, + "loss": 0.5544, + "step": 12838 + }, + { + "epoch": 0.7189494904244597, + "grad_norm": 1.4079164266586304, + "learning_rate": 6.416e-05, + "loss": 0.5601, + "step": 12839 + }, + { + "epoch": 0.7190054877365887, + "grad_norm": 1.3210337162017822, + "learning_rate": 6.4165e-05, + "loss": 0.5112, + "step": 12840 + }, + { + "epoch": 0.7190614850487177, + "grad_norm": 1.2597122192382812, + "learning_rate": 6.417e-05, + "loss": 0.4337, + "step": 12841 + }, + { + "epoch": 0.7191174823608467, + "grad_norm": 1.3668289184570312, + "learning_rate": 6.4175e-05, + "loss": 0.443, + "step": 12842 + }, + { + "epoch": 0.7191734796729757, + "grad_norm": 1.4604369401931763, + "learning_rate": 6.418000000000001e-05, + "loss": 0.4761, + "step": 12843 + }, + { + "epoch": 0.7192294769851048, + "grad_norm": 1.1284401416778564, + "learning_rate": 6.418500000000001e-05, + "loss": 0.3613, + "step": 12844 + }, + { + "epoch": 0.7192854742972338, + "grad_norm": 1.336661458015442, + "learning_rate": 
6.419000000000001e-05, + "loss": 0.3995, + "step": 12845 + }, + { + "epoch": 0.7193414716093628, + "grad_norm": 1.4011123180389404, + "learning_rate": 6.4195e-05, + "loss": 0.4359, + "step": 12846 + }, + { + "epoch": 0.7193974689214918, + "grad_norm": 1.2431459426879883, + "learning_rate": 6.42e-05, + "loss": 0.4012, + "step": 12847 + }, + { + "epoch": 0.7194534662336208, + "grad_norm": 1.1655406951904297, + "learning_rate": 6.4205e-05, + "loss": 0.4497, + "step": 12848 + }, + { + "epoch": 0.7195094635457498, + "grad_norm": 1.292120099067688, + "learning_rate": 6.421e-05, + "loss": 0.5142, + "step": 12849 + }, + { + "epoch": 0.7195654608578789, + "grad_norm": 1.2695152759552002, + "learning_rate": 6.421500000000001e-05, + "loss": 0.4447, + "step": 12850 + }, + { + "epoch": 0.7196214581700079, + "grad_norm": 1.2664965391159058, + "learning_rate": 6.422e-05, + "loss": 0.4946, + "step": 12851 + }, + { + "epoch": 0.7196774554821369, + "grad_norm": 1.2135577201843262, + "learning_rate": 6.4225e-05, + "loss": 0.3936, + "step": 12852 + }, + { + "epoch": 0.7197334527942659, + "grad_norm": 1.182833194732666, + "learning_rate": 6.423e-05, + "loss": 0.3167, + "step": 12853 + }, + { + "epoch": 0.719789450106395, + "grad_norm": 1.412680983543396, + "learning_rate": 6.4235e-05, + "loss": 0.4906, + "step": 12854 + }, + { + "epoch": 0.719845447418524, + "grad_norm": 1.304890751838684, + "learning_rate": 6.424e-05, + "loss": 0.3311, + "step": 12855 + }, + { + "epoch": 0.719901444730653, + "grad_norm": 1.1928796768188477, + "learning_rate": 6.4245e-05, + "loss": 0.4449, + "step": 12856 + }, + { + "epoch": 0.7199574420427819, + "grad_norm": 2.066993236541748, + "learning_rate": 6.425e-05, + "loss": 0.3949, + "step": 12857 + }, + { + "epoch": 0.7200134393549109, + "grad_norm": 1.359500527381897, + "learning_rate": 6.4255e-05, + "loss": 0.4686, + "step": 12858 + }, + { + "epoch": 0.7200694366670399, + "grad_norm": 1.3475192785263062, + "learning_rate": 6.426e-05, + "loss": 0.482, + 
"step": 12859 + }, + { + "epoch": 0.720125433979169, + "grad_norm": 1.0215314626693726, + "learning_rate": 6.426500000000001e-05, + "loss": 0.3383, + "step": 12860 + }, + { + "epoch": 0.720181431291298, + "grad_norm": 1.4091228246688843, + "learning_rate": 6.427e-05, + "loss": 0.4618, + "step": 12861 + }, + { + "epoch": 0.720237428603427, + "grad_norm": 1.5251009464263916, + "learning_rate": 6.4275e-05, + "loss": 0.4077, + "step": 12862 + }, + { + "epoch": 0.720293425915556, + "grad_norm": 1.6396993398666382, + "learning_rate": 6.428e-05, + "loss": 0.5588, + "step": 12863 + }, + { + "epoch": 0.720349423227685, + "grad_norm": 3.3280935287475586, + "learning_rate": 6.428500000000001e-05, + "loss": 0.5493, + "step": 12864 + }, + { + "epoch": 0.720405420539814, + "grad_norm": 1.4691766500473022, + "learning_rate": 6.429000000000001e-05, + "loss": 0.5274, + "step": 12865 + }, + { + "epoch": 0.7204614178519431, + "grad_norm": 1.2549264430999756, + "learning_rate": 6.429500000000001e-05, + "loss": 0.5649, + "step": 12866 + }, + { + "epoch": 0.7205174151640721, + "grad_norm": 1.3790887594223022, + "learning_rate": 6.43e-05, + "loss": 0.4306, + "step": 12867 + }, + { + "epoch": 0.7205734124762011, + "grad_norm": 1.192781925201416, + "learning_rate": 6.4305e-05, + "loss": 0.44, + "step": 12868 + }, + { + "epoch": 0.7206294097883301, + "grad_norm": 1.3899825811386108, + "learning_rate": 6.431e-05, + "loss": 0.4697, + "step": 12869 + }, + { + "epoch": 0.7206854071004591, + "grad_norm": 1.4914478063583374, + "learning_rate": 6.4315e-05, + "loss": 0.3433, + "step": 12870 + }, + { + "epoch": 0.7207414044125882, + "grad_norm": 1.2485928535461426, + "learning_rate": 6.432000000000001e-05, + "loss": 0.3903, + "step": 12871 + }, + { + "epoch": 0.7207974017247172, + "grad_norm": 1.469435691833496, + "learning_rate": 6.4325e-05, + "loss": 0.4088, + "step": 12872 + }, + { + "epoch": 0.7208533990368462, + "grad_norm": 1.2373088598251343, + "learning_rate": 6.433e-05, + "loss": 0.3797, + 
"step": 12873 + }, + { + "epoch": 0.7209093963489752, + "grad_norm": 1.1302622556686401, + "learning_rate": 6.4335e-05, + "loss": 0.4827, + "step": 12874 + }, + { + "epoch": 0.7209653936611042, + "grad_norm": 1.3318990468978882, + "learning_rate": 6.434e-05, + "loss": 0.3497, + "step": 12875 + }, + { + "epoch": 0.7210213909732333, + "grad_norm": 1.1196736097335815, + "learning_rate": 6.4345e-05, + "loss": 0.4114, + "step": 12876 + }, + { + "epoch": 0.7210773882853623, + "grad_norm": 1.4482390880584717, + "learning_rate": 6.435e-05, + "loss": 0.4194, + "step": 12877 + }, + { + "epoch": 0.7211333855974913, + "grad_norm": 1.2796387672424316, + "learning_rate": 6.4355e-05, + "loss": 0.4891, + "step": 12878 + }, + { + "epoch": 0.7211893829096203, + "grad_norm": 1.1463838815689087, + "learning_rate": 6.436e-05, + "loss": 0.4692, + "step": 12879 + }, + { + "epoch": 0.7212453802217493, + "grad_norm": 6.655930995941162, + "learning_rate": 6.436500000000001e-05, + "loss": 0.4129, + "step": 12880 + }, + { + "epoch": 0.7213013775338784, + "grad_norm": 3.030787706375122, + "learning_rate": 6.437000000000001e-05, + "loss": 0.3954, + "step": 12881 + }, + { + "epoch": 0.7213573748460074, + "grad_norm": 1.4530386924743652, + "learning_rate": 6.4375e-05, + "loss": 0.612, + "step": 12882 + }, + { + "epoch": 0.7214133721581364, + "grad_norm": 1.3220127820968628, + "learning_rate": 6.438e-05, + "loss": 0.4964, + "step": 12883 + }, + { + "epoch": 0.7214693694702654, + "grad_norm": 1.7192901372909546, + "learning_rate": 6.4385e-05, + "loss": 0.4847, + "step": 12884 + }, + { + "epoch": 0.7215253667823944, + "grad_norm": 1.3402737379074097, + "learning_rate": 6.439000000000001e-05, + "loss": 0.4283, + "step": 12885 + }, + { + "epoch": 0.7215813640945234, + "grad_norm": 1.1397950649261475, + "learning_rate": 6.439500000000001e-05, + "loss": 0.3931, + "step": 12886 + }, + { + "epoch": 0.7216373614066525, + "grad_norm": 1.5136781930923462, + "learning_rate": 6.440000000000001e-05, + "loss": 
0.581, + "step": 12887 + }, + { + "epoch": 0.7216933587187815, + "grad_norm": 1.2671502828598022, + "learning_rate": 6.4405e-05, + "loss": 0.537, + "step": 12888 + }, + { + "epoch": 0.7217493560309105, + "grad_norm": 1.1760938167572021, + "learning_rate": 6.441e-05, + "loss": 0.4935, + "step": 12889 + }, + { + "epoch": 0.7218053533430395, + "grad_norm": 1.2217832803726196, + "learning_rate": 6.4415e-05, + "loss": 0.3571, + "step": 12890 + }, + { + "epoch": 0.7218613506551685, + "grad_norm": 1.1983466148376465, + "learning_rate": 6.442e-05, + "loss": 0.3651, + "step": 12891 + }, + { + "epoch": 0.7219173479672976, + "grad_norm": 1.4146442413330078, + "learning_rate": 6.442500000000001e-05, + "loss": 0.4483, + "step": 12892 + }, + { + "epoch": 0.7219733452794266, + "grad_norm": 1.398146152496338, + "learning_rate": 6.443e-05, + "loss": 0.6141, + "step": 12893 + }, + { + "epoch": 0.7220293425915556, + "grad_norm": 1.4189565181732178, + "learning_rate": 6.4435e-05, + "loss": 0.3798, + "step": 12894 + }, + { + "epoch": 0.7220853399036846, + "grad_norm": 1.2826626300811768, + "learning_rate": 6.444e-05, + "loss": 0.5178, + "step": 12895 + }, + { + "epoch": 0.7221413372158136, + "grad_norm": 1.3823177814483643, + "learning_rate": 6.4445e-05, + "loss": 0.4631, + "step": 12896 + }, + { + "epoch": 0.7221973345279427, + "grad_norm": 1.345367670059204, + "learning_rate": 6.445e-05, + "loss": 0.437, + "step": 12897 + }, + { + "epoch": 0.7222533318400717, + "grad_norm": 1.2119885683059692, + "learning_rate": 6.4455e-05, + "loss": 0.4164, + "step": 12898 + }, + { + "epoch": 0.7223093291522007, + "grad_norm": 1.1886770725250244, + "learning_rate": 6.446e-05, + "loss": 0.355, + "step": 12899 + }, + { + "epoch": 0.7223653264643297, + "grad_norm": 1.188043475151062, + "learning_rate": 6.4465e-05, + "loss": 0.4286, + "step": 12900 + }, + { + "epoch": 0.7224213237764587, + "grad_norm": 1.4560166597366333, + "learning_rate": 6.447000000000001e-05, + "loss": 0.4311, + "step": 12901 + }, + 
{ + "epoch": 0.7224773210885878, + "grad_norm": 1.4028239250183105, + "learning_rate": 6.447500000000001e-05, + "loss": 0.4554, + "step": 12902 + }, + { + "epoch": 0.7225333184007168, + "grad_norm": 1.1268011331558228, + "learning_rate": 6.448e-05, + "loss": 0.4178, + "step": 12903 + }, + { + "epoch": 0.7225893157128458, + "grad_norm": 1.1890791654586792, + "learning_rate": 6.4485e-05, + "loss": 0.3923, + "step": 12904 + }, + { + "epoch": 0.7226453130249748, + "grad_norm": 1.1738241910934448, + "learning_rate": 6.449e-05, + "loss": 0.3756, + "step": 12905 + }, + { + "epoch": 0.7227013103371038, + "grad_norm": 1.6105811595916748, + "learning_rate": 6.449500000000001e-05, + "loss": 0.3918, + "step": 12906 + }, + { + "epoch": 0.7227573076492328, + "grad_norm": 1.286133050918579, + "learning_rate": 6.450000000000001e-05, + "loss": 0.3927, + "step": 12907 + }, + { + "epoch": 0.7228133049613619, + "grad_norm": 1.4088801145553589, + "learning_rate": 6.4505e-05, + "loss": 0.6302, + "step": 12908 + }, + { + "epoch": 0.7228693022734909, + "grad_norm": 1.468022108078003, + "learning_rate": 6.451e-05, + "loss": 0.4253, + "step": 12909 + }, + { + "epoch": 0.7229252995856199, + "grad_norm": 1.3817226886749268, + "learning_rate": 6.4515e-05, + "loss": 0.5141, + "step": 12910 + }, + { + "epoch": 0.7229812968977489, + "grad_norm": 1.8205647468566895, + "learning_rate": 6.452e-05, + "loss": 0.4257, + "step": 12911 + }, + { + "epoch": 0.723037294209878, + "grad_norm": 1.2383532524108887, + "learning_rate": 6.4525e-05, + "loss": 0.3925, + "step": 12912 + }, + { + "epoch": 0.723093291522007, + "grad_norm": 1.3751835823059082, + "learning_rate": 6.453000000000001e-05, + "loss": 0.5789, + "step": 12913 + }, + { + "epoch": 0.723149288834136, + "grad_norm": 1.4135364294052124, + "learning_rate": 6.4535e-05, + "loss": 0.402, + "step": 12914 + }, + { + "epoch": 0.723205286146265, + "grad_norm": 1.4469807147979736, + "learning_rate": 6.454e-05, + "loss": 0.4442, + "step": 12915 + }, + { + 
"epoch": 0.723261283458394, + "grad_norm": 1.4926486015319824, + "learning_rate": 6.4545e-05, + "loss": 0.5421, + "step": 12916 + }, + { + "epoch": 0.723317280770523, + "grad_norm": 4.189614772796631, + "learning_rate": 6.455e-05, + "loss": 0.4641, + "step": 12917 + }, + { + "epoch": 0.7233732780826521, + "grad_norm": 1.3739595413208008, + "learning_rate": 6.4555e-05, + "loss": 0.3941, + "step": 12918 + }, + { + "epoch": 0.7234292753947811, + "grad_norm": 1.3635294437408447, + "learning_rate": 6.455999999999999e-05, + "loss": 0.5162, + "step": 12919 + }, + { + "epoch": 0.7234852727069101, + "grad_norm": 1.2362215518951416, + "learning_rate": 6.4565e-05, + "loss": 0.3528, + "step": 12920 + }, + { + "epoch": 0.7235412700190391, + "grad_norm": 1.1968994140625, + "learning_rate": 6.457000000000001e-05, + "loss": 0.4326, + "step": 12921 + }, + { + "epoch": 0.7235972673311681, + "grad_norm": 1.122292160987854, + "learning_rate": 6.457500000000001e-05, + "loss": 0.401, + "step": 12922 + }, + { + "epoch": 0.7236532646432972, + "grad_norm": 1.3467565774917603, + "learning_rate": 6.458000000000001e-05, + "loss": 0.4402, + "step": 12923 + }, + { + "epoch": 0.7237092619554262, + "grad_norm": 1.0921175479888916, + "learning_rate": 6.4585e-05, + "loss": 0.392, + "step": 12924 + }, + { + "epoch": 0.7237652592675552, + "grad_norm": 1.3415946960449219, + "learning_rate": 6.459e-05, + "loss": 0.3967, + "step": 12925 + }, + { + "epoch": 0.7238212565796842, + "grad_norm": 2.375540018081665, + "learning_rate": 6.4595e-05, + "loss": 0.4013, + "step": 12926 + }, + { + "epoch": 0.7238772538918132, + "grad_norm": 1.3974196910858154, + "learning_rate": 6.460000000000001e-05, + "loss": 0.5083, + "step": 12927 + }, + { + "epoch": 0.7239332512039423, + "grad_norm": 1.1457161903381348, + "learning_rate": 6.460500000000001e-05, + "loss": 0.351, + "step": 12928 + }, + { + "epoch": 0.7239892485160713, + "grad_norm": 1.063173532485962, + "learning_rate": 6.461e-05, + "loss": 0.3628, + "step": 12929 
+ }, + { + "epoch": 0.7240452458282003, + "grad_norm": 1.3780083656311035, + "learning_rate": 6.4615e-05, + "loss": 0.4839, + "step": 12930 + }, + { + "epoch": 0.7241012431403293, + "grad_norm": 1.3097578287124634, + "learning_rate": 6.462e-05, + "loss": 0.4928, + "step": 12931 + }, + { + "epoch": 0.7241572404524583, + "grad_norm": 1.202901840209961, + "learning_rate": 6.4625e-05, + "loss": 0.4396, + "step": 12932 + }, + { + "epoch": 0.7242132377645873, + "grad_norm": 1.4199342727661133, + "learning_rate": 6.463e-05, + "loss": 0.4244, + "step": 12933 + }, + { + "epoch": 0.7242692350767164, + "grad_norm": 1.2170206308364868, + "learning_rate": 6.463500000000001e-05, + "loss": 0.421, + "step": 12934 + }, + { + "epoch": 0.7243252323888454, + "grad_norm": 1.0460795164108276, + "learning_rate": 6.464e-05, + "loss": 0.3263, + "step": 12935 + }, + { + "epoch": 0.7243812297009744, + "grad_norm": 1.2562450170516968, + "learning_rate": 6.4645e-05, + "loss": 0.3932, + "step": 12936 + }, + { + "epoch": 0.7244372270131034, + "grad_norm": 1.3742763996124268, + "learning_rate": 6.465e-05, + "loss": 0.444, + "step": 12937 + }, + { + "epoch": 0.7244932243252324, + "grad_norm": 1.135968804359436, + "learning_rate": 6.4655e-05, + "loss": 0.3864, + "step": 12938 + }, + { + "epoch": 0.7245492216373614, + "grad_norm": 1.221862554550171, + "learning_rate": 6.466e-05, + "loss": 0.3532, + "step": 12939 + }, + { + "epoch": 0.7246052189494904, + "grad_norm": 1.5764732360839844, + "learning_rate": 6.466499999999999e-05, + "loss": 0.4656, + "step": 12940 + }, + { + "epoch": 0.7246612162616194, + "grad_norm": 1.252284288406372, + "learning_rate": 6.467e-05, + "loss": 0.4488, + "step": 12941 + }, + { + "epoch": 0.7247172135737484, + "grad_norm": 1.1536979675292969, + "learning_rate": 6.467500000000001e-05, + "loss": 0.4659, + "step": 12942 + }, + { + "epoch": 0.7247732108858774, + "grad_norm": 1.3067823648452759, + "learning_rate": 6.468000000000001e-05, + "loss": 0.514, + "step": 12943 + }, + { 
+ "epoch": 0.7248292081980064, + "grad_norm": 1.3132990598678589, + "learning_rate": 6.468500000000001e-05, + "loss": 0.4565, + "step": 12944 + }, + { + "epoch": 0.7248852055101355, + "grad_norm": 1.3057440519332886, + "learning_rate": 6.469e-05, + "loss": 0.5572, + "step": 12945 + }, + { + "epoch": 0.7249412028222645, + "grad_norm": 1.1999316215515137, + "learning_rate": 6.4695e-05, + "loss": 0.375, + "step": 12946 + }, + { + "epoch": 0.7249972001343935, + "grad_norm": 1.336259365081787, + "learning_rate": 6.47e-05, + "loss": 0.5179, + "step": 12947 + }, + { + "epoch": 0.7250531974465225, + "grad_norm": 1.6089376211166382, + "learning_rate": 6.4705e-05, + "loss": 0.4512, + "step": 12948 + }, + { + "epoch": 0.7251091947586515, + "grad_norm": 1.5541924238204956, + "learning_rate": 6.471000000000001e-05, + "loss": 0.5098, + "step": 12949 + }, + { + "epoch": 0.7251651920707806, + "grad_norm": 1.271735429763794, + "learning_rate": 6.4715e-05, + "loss": 0.5747, + "step": 12950 + }, + { + "epoch": 0.7252211893829096, + "grad_norm": 1.406443476676941, + "learning_rate": 6.472e-05, + "loss": 0.3573, + "step": 12951 + }, + { + "epoch": 0.7252771866950386, + "grad_norm": 1.480695366859436, + "learning_rate": 6.4725e-05, + "loss": 0.5508, + "step": 12952 + }, + { + "epoch": 0.7253331840071676, + "grad_norm": 1.1610902547836304, + "learning_rate": 6.473e-05, + "loss": 0.4652, + "step": 12953 + }, + { + "epoch": 0.7253891813192966, + "grad_norm": 1.2197450399398804, + "learning_rate": 6.4735e-05, + "loss": 0.3855, + "step": 12954 + }, + { + "epoch": 0.7254451786314257, + "grad_norm": 1.2905586957931519, + "learning_rate": 6.474000000000001e-05, + "loss": 0.5056, + "step": 12955 + }, + { + "epoch": 0.7255011759435547, + "grad_norm": 1.1793146133422852, + "learning_rate": 6.4745e-05, + "loss": 0.3365, + "step": 12956 + }, + { + "epoch": 0.7255571732556837, + "grad_norm": 1.4936169385910034, + "learning_rate": 6.475e-05, + "loss": 0.4332, + "step": 12957 + }, + { + "epoch": 
0.7256131705678127, + "grad_norm": 1.0383176803588867, + "learning_rate": 6.4755e-05, + "loss": 0.2899, + "step": 12958 + }, + { + "epoch": 0.7256691678799417, + "grad_norm": 1.3028063774108887, + "learning_rate": 6.476e-05, + "loss": 0.4425, + "step": 12959 + }, + { + "epoch": 0.7257251651920708, + "grad_norm": 1.3707900047302246, + "learning_rate": 6.4765e-05, + "loss": 0.4871, + "step": 12960 + }, + { + "epoch": 0.7257811625041998, + "grad_norm": 1.2507411241531372, + "learning_rate": 6.477e-05, + "loss": 0.4744, + "step": 12961 + }, + { + "epoch": 0.7258371598163288, + "grad_norm": 1.4732862710952759, + "learning_rate": 6.4775e-05, + "loss": 0.5197, + "step": 12962 + }, + { + "epoch": 0.7258931571284578, + "grad_norm": 1.2263737916946411, + "learning_rate": 6.478000000000001e-05, + "loss": 0.4142, + "step": 12963 + }, + { + "epoch": 0.7259491544405868, + "grad_norm": 1.123755693435669, + "learning_rate": 6.478500000000001e-05, + "loss": 0.4705, + "step": 12964 + }, + { + "epoch": 0.7260051517527158, + "grad_norm": 1.5642553567886353, + "learning_rate": 6.479000000000001e-05, + "loss": 0.3635, + "step": 12965 + }, + { + "epoch": 0.7260611490648449, + "grad_norm": 3.230133295059204, + "learning_rate": 6.4795e-05, + "loss": 0.5285, + "step": 12966 + }, + { + "epoch": 0.7261171463769739, + "grad_norm": 1.2619363069534302, + "learning_rate": 6.48e-05, + "loss": 0.4288, + "step": 12967 + }, + { + "epoch": 0.7261731436891029, + "grad_norm": 1.3217151165008545, + "learning_rate": 6.4805e-05, + "loss": 0.3963, + "step": 12968 + }, + { + "epoch": 0.7262291410012319, + "grad_norm": 1.244097113609314, + "learning_rate": 6.481e-05, + "loss": 0.4191, + "step": 12969 + }, + { + "epoch": 0.726285138313361, + "grad_norm": 1.190354824066162, + "learning_rate": 6.481500000000001e-05, + "loss": 0.4705, + "step": 12970 + }, + { + "epoch": 0.72634113562549, + "grad_norm": 1.2711552381515503, + "learning_rate": 6.482e-05, + "loss": 0.3871, + "step": 12971 + }, + { + "epoch": 
0.726397132937619, + "grad_norm": 1.2860972881317139, + "learning_rate": 6.4825e-05, + "loss": 0.5483, + "step": 12972 + }, + { + "epoch": 0.726453130249748, + "grad_norm": 1.2053567171096802, + "learning_rate": 6.483e-05, + "loss": 0.3243, + "step": 12973 + }, + { + "epoch": 0.726509127561877, + "grad_norm": 1.2540823221206665, + "learning_rate": 6.4835e-05, + "loss": 0.4828, + "step": 12974 + }, + { + "epoch": 0.726565124874006, + "grad_norm": 1.240599274635315, + "learning_rate": 6.484e-05, + "loss": 0.466, + "step": 12975 + }, + { + "epoch": 0.7266211221861351, + "grad_norm": 1.5479391813278198, + "learning_rate": 6.484500000000001e-05, + "loss": 0.5901, + "step": 12976 + }, + { + "epoch": 0.7266771194982641, + "grad_norm": 1.4428088665008545, + "learning_rate": 6.485e-05, + "loss": 0.4771, + "step": 12977 + }, + { + "epoch": 0.7267331168103931, + "grad_norm": 1.6157454252243042, + "learning_rate": 6.4855e-05, + "loss": 0.5049, + "step": 12978 + }, + { + "epoch": 0.7267891141225221, + "grad_norm": 1.34217369556427, + "learning_rate": 6.486e-05, + "loss": 0.4706, + "step": 12979 + }, + { + "epoch": 0.7268451114346511, + "grad_norm": 1.2550715208053589, + "learning_rate": 6.4865e-05, + "loss": 0.3991, + "step": 12980 + }, + { + "epoch": 0.7269011087467802, + "grad_norm": 1.307175874710083, + "learning_rate": 6.487000000000001e-05, + "loss": 0.5009, + "step": 12981 + }, + { + "epoch": 0.7269571060589092, + "grad_norm": 1.1628906726837158, + "learning_rate": 6.4875e-05, + "loss": 0.3632, + "step": 12982 + }, + { + "epoch": 0.7270131033710382, + "grad_norm": 1.2792617082595825, + "learning_rate": 6.488e-05, + "loss": 0.4168, + "step": 12983 + }, + { + "epoch": 0.7270691006831672, + "grad_norm": 1.4946825504302979, + "learning_rate": 6.488500000000001e-05, + "loss": 0.486, + "step": 12984 + }, + { + "epoch": 0.7271250979952962, + "grad_norm": 1.4570118188858032, + "learning_rate": 6.489000000000001e-05, + "loss": 0.5857, + "step": 12985 + }, + { + "epoch": 
0.7271810953074253, + "grad_norm": 1.5580264329910278, + "learning_rate": 6.489500000000001e-05, + "loss": 0.353, + "step": 12986 + }, + { + "epoch": 0.7272370926195543, + "grad_norm": 1.3818445205688477, + "learning_rate": 6.49e-05, + "loss": 0.3925, + "step": 12987 + }, + { + "epoch": 0.7272930899316833, + "grad_norm": 1.2081820964813232, + "learning_rate": 6.4905e-05, + "loss": 0.4304, + "step": 12988 + }, + { + "epoch": 0.7273490872438123, + "grad_norm": 1.2454428672790527, + "learning_rate": 6.491e-05, + "loss": 0.477, + "step": 12989 + }, + { + "epoch": 0.7274050845559413, + "grad_norm": 1.2278811931610107, + "learning_rate": 6.4915e-05, + "loss": 0.3489, + "step": 12990 + }, + { + "epoch": 0.7274610818680703, + "grad_norm": 1.3734474182128906, + "learning_rate": 6.492000000000001e-05, + "loss": 0.448, + "step": 12991 + }, + { + "epoch": 0.7275170791801994, + "grad_norm": 1.1877390146255493, + "learning_rate": 6.4925e-05, + "loss": 0.3798, + "step": 12992 + }, + { + "epoch": 0.7275730764923284, + "grad_norm": 1.2094286680221558, + "learning_rate": 6.493e-05, + "loss": 0.4001, + "step": 12993 + }, + { + "epoch": 0.7276290738044574, + "grad_norm": 1.2303513288497925, + "learning_rate": 6.4935e-05, + "loss": 0.3729, + "step": 12994 + }, + { + "epoch": 0.7276850711165864, + "grad_norm": 1.6889305114746094, + "learning_rate": 6.494e-05, + "loss": 0.4747, + "step": 12995 + }, + { + "epoch": 0.7277410684287154, + "grad_norm": 1.3299304246902466, + "learning_rate": 6.4945e-05, + "loss": 0.5057, + "step": 12996 + }, + { + "epoch": 0.7277970657408445, + "grad_norm": 1.2950594425201416, + "learning_rate": 6.494999999999999e-05, + "loss": 0.4396, + "step": 12997 + }, + { + "epoch": 0.7278530630529735, + "grad_norm": 1.956496238708496, + "learning_rate": 6.4955e-05, + "loss": 0.5345, + "step": 12998 + }, + { + "epoch": 0.7279090603651025, + "grad_norm": 1.3873339891433716, + "learning_rate": 6.496e-05, + "loss": 0.405, + "step": 12999 + }, + { + "epoch": 
0.7279650576772315, + "grad_norm": 1.2748873233795166, + "learning_rate": 6.4965e-05, + "loss": 0.5211, + "step": 13000 + }, + { + "epoch": 0.7280210549893605, + "grad_norm": 1.6707043647766113, + "learning_rate": 6.497000000000001e-05, + "loss": 0.5523, + "step": 13001 + }, + { + "epoch": 0.7280770523014896, + "grad_norm": 1.3483299016952515, + "learning_rate": 6.497500000000001e-05, + "loss": 0.4982, + "step": 13002 + }, + { + "epoch": 0.7281330496136186, + "grad_norm": 1.3101677894592285, + "learning_rate": 6.498e-05, + "loss": 0.4722, + "step": 13003 + }, + { + "epoch": 0.7281890469257476, + "grad_norm": 1.118424892425537, + "learning_rate": 6.4985e-05, + "loss": 0.3566, + "step": 13004 + }, + { + "epoch": 0.7282450442378766, + "grad_norm": 1.1747689247131348, + "learning_rate": 6.499000000000001e-05, + "loss": 0.4689, + "step": 13005 + }, + { + "epoch": 0.7283010415500056, + "grad_norm": 1.495073676109314, + "learning_rate": 6.499500000000001e-05, + "loss": 0.45, + "step": 13006 + }, + { + "epoch": 0.7283570388621347, + "grad_norm": 1.3751918077468872, + "learning_rate": 6.500000000000001e-05, + "loss": 0.4043, + "step": 13007 + }, + { + "epoch": 0.7284130361742637, + "grad_norm": 1.296090841293335, + "learning_rate": 6.5005e-05, + "loss": 0.4572, + "step": 13008 + }, + { + "epoch": 0.7284690334863927, + "grad_norm": 1.2352558374404907, + "learning_rate": 6.501e-05, + "loss": 0.3579, + "step": 13009 + }, + { + "epoch": 0.7285250307985217, + "grad_norm": 1.1969058513641357, + "learning_rate": 6.5015e-05, + "loss": 0.4123, + "step": 13010 + }, + { + "epoch": 0.7285810281106507, + "grad_norm": 1.451724648475647, + "learning_rate": 6.502e-05, + "loss": 0.5631, + "step": 13011 + }, + { + "epoch": 0.7286370254227797, + "grad_norm": 1.25950288772583, + "learning_rate": 6.502500000000001e-05, + "loss": 0.4782, + "step": 13012 + }, + { + "epoch": 0.7286930227349088, + "grad_norm": 1.3958215713500977, + "learning_rate": 6.503e-05, + "loss": 0.5441, + "step": 13013 + }, 
+ { + "epoch": 0.7287490200470378, + "grad_norm": 1.6094831228256226, + "learning_rate": 6.5035e-05, + "loss": 0.5775, + "step": 13014 + }, + { + "epoch": 0.7288050173591668, + "grad_norm": 1.5766825675964355, + "learning_rate": 6.504e-05, + "loss": 0.5992, + "step": 13015 + }, + { + "epoch": 0.7288610146712958, + "grad_norm": 1.4682761430740356, + "learning_rate": 6.5045e-05, + "loss": 0.5773, + "step": 13016 + }, + { + "epoch": 0.7289170119834248, + "grad_norm": 1.3241996765136719, + "learning_rate": 6.505e-05, + "loss": 0.4616, + "step": 13017 + }, + { + "epoch": 0.7289730092955539, + "grad_norm": 1.1775118112564087, + "learning_rate": 6.505499999999999e-05, + "loss": 0.3069, + "step": 13018 + }, + { + "epoch": 0.7290290066076829, + "grad_norm": 1.3832868337631226, + "learning_rate": 6.506e-05, + "loss": 0.5166, + "step": 13019 + }, + { + "epoch": 0.7290850039198119, + "grad_norm": 1.269210934638977, + "learning_rate": 6.5065e-05, + "loss": 0.426, + "step": 13020 + }, + { + "epoch": 0.7291410012319409, + "grad_norm": 1.6523929834365845, + "learning_rate": 6.507e-05, + "loss": 0.5015, + "step": 13021 + }, + { + "epoch": 0.7291969985440698, + "grad_norm": 1.3177670240402222, + "learning_rate": 6.507500000000001e-05, + "loss": 0.4026, + "step": 13022 + }, + { + "epoch": 0.7292529958561988, + "grad_norm": 1.1591671705245972, + "learning_rate": 6.508000000000001e-05, + "loss": 0.3467, + "step": 13023 + }, + { + "epoch": 0.7293089931683279, + "grad_norm": 1.3172328472137451, + "learning_rate": 6.5085e-05, + "loss": 0.4742, + "step": 13024 + }, + { + "epoch": 0.7293649904804569, + "grad_norm": 1.0297353267669678, + "learning_rate": 6.509e-05, + "loss": 0.3457, + "step": 13025 + }, + { + "epoch": 0.7294209877925859, + "grad_norm": 1.4599758386611938, + "learning_rate": 6.5095e-05, + "loss": 0.5066, + "step": 13026 + }, + { + "epoch": 0.7294769851047149, + "grad_norm": 1.2901194095611572, + "learning_rate": 6.510000000000001e-05, + "loss": 0.4009, + "step": 13027 + }, + 
{ + "epoch": 0.729532982416844, + "grad_norm": 1.7665644884109497, + "learning_rate": 6.510500000000001e-05, + "loss": 0.3786, + "step": 13028 + }, + { + "epoch": 0.729588979728973, + "grad_norm": 1.3884735107421875, + "learning_rate": 6.511e-05, + "loss": 0.4049, + "step": 13029 + }, + { + "epoch": 0.729644977041102, + "grad_norm": 1.4180721044540405, + "learning_rate": 6.5115e-05, + "loss": 0.4338, + "step": 13030 + }, + { + "epoch": 0.729700974353231, + "grad_norm": 1.2886587381362915, + "learning_rate": 6.512e-05, + "loss": 0.4717, + "step": 13031 + }, + { + "epoch": 0.72975697166536, + "grad_norm": 1.4126027822494507, + "learning_rate": 6.5125e-05, + "loss": 0.5387, + "step": 13032 + }, + { + "epoch": 0.729812968977489, + "grad_norm": 1.5100982189178467, + "learning_rate": 6.513000000000001e-05, + "loss": 0.6658, + "step": 13033 + }, + { + "epoch": 0.7298689662896181, + "grad_norm": 5.311023235321045, + "learning_rate": 6.5135e-05, + "loss": 0.4693, + "step": 13034 + }, + { + "epoch": 0.7299249636017471, + "grad_norm": 1.2274329662322998, + "learning_rate": 6.514e-05, + "loss": 0.4763, + "step": 13035 + }, + { + "epoch": 0.7299809609138761, + "grad_norm": 1.3793895244598389, + "learning_rate": 6.5145e-05, + "loss": 0.3569, + "step": 13036 + }, + { + "epoch": 0.7300369582260051, + "grad_norm": 1.3775551319122314, + "learning_rate": 6.515e-05, + "loss": 0.6268, + "step": 13037 + }, + { + "epoch": 0.7300929555381341, + "grad_norm": 1.4914031028747559, + "learning_rate": 6.5155e-05, + "loss": 0.6025, + "step": 13038 + }, + { + "epoch": 0.7301489528502632, + "grad_norm": 1.4676573276519775, + "learning_rate": 6.515999999999999e-05, + "loss": 0.4987, + "step": 13039 + }, + { + "epoch": 0.7302049501623922, + "grad_norm": 1.2466171979904175, + "learning_rate": 6.5165e-05, + "loss": 0.3876, + "step": 13040 + }, + { + "epoch": 0.7302609474745212, + "grad_norm": 1.5200568437576294, + "learning_rate": 6.517e-05, + "loss": 0.4826, + "step": 13041 + }, + { + "epoch": 
0.7303169447866502, + "grad_norm": 1.2348226308822632, + "learning_rate": 6.517500000000001e-05, + "loss": 0.3823, + "step": 13042 + }, + { + "epoch": 0.7303729420987792, + "grad_norm": 1.1773552894592285, + "learning_rate": 6.518000000000001e-05, + "loss": 0.4015, + "step": 13043 + }, + { + "epoch": 0.7304289394109083, + "grad_norm": 1.558884620666504, + "learning_rate": 6.518500000000001e-05, + "loss": 0.4081, + "step": 13044 + }, + { + "epoch": 0.7304849367230373, + "grad_norm": 1.3640022277832031, + "learning_rate": 6.519e-05, + "loss": 0.4403, + "step": 13045 + }, + { + "epoch": 0.7305409340351663, + "grad_norm": 1.130803108215332, + "learning_rate": 6.5195e-05, + "loss": 0.3317, + "step": 13046 + }, + { + "epoch": 0.7305969313472953, + "grad_norm": 1.2308452129364014, + "learning_rate": 6.52e-05, + "loss": 0.4061, + "step": 13047 + }, + { + "epoch": 0.7306529286594243, + "grad_norm": 1.3676975965499878, + "learning_rate": 6.520500000000001e-05, + "loss": 0.4609, + "step": 13048 + }, + { + "epoch": 0.7307089259715533, + "grad_norm": 1.168544054031372, + "learning_rate": 6.521000000000001e-05, + "loss": 0.374, + "step": 13049 + }, + { + "epoch": 0.7307649232836824, + "grad_norm": 1.2402279376983643, + "learning_rate": 6.5215e-05, + "loss": 0.4881, + "step": 13050 + }, + { + "epoch": 0.7308209205958114, + "grad_norm": 1.1424239873886108, + "learning_rate": 6.522e-05, + "loss": 0.3579, + "step": 13051 + }, + { + "epoch": 0.7308769179079404, + "grad_norm": 1.6257461309432983, + "learning_rate": 6.5225e-05, + "loss": 0.5411, + "step": 13052 + }, + { + "epoch": 0.7309329152200694, + "grad_norm": 1.138053059577942, + "learning_rate": 6.523e-05, + "loss": 0.3891, + "step": 13053 + }, + { + "epoch": 0.7309889125321984, + "grad_norm": 1.118177056312561, + "learning_rate": 6.523500000000001e-05, + "loss": 0.3239, + "step": 13054 + }, + { + "epoch": 0.7310449098443275, + "grad_norm": 1.5924105644226074, + "learning_rate": 6.524e-05, + "loss": 0.5878, + "step": 13055 + }, 
+ { + "epoch": 0.7311009071564565, + "grad_norm": 1.3991609811782837, + "learning_rate": 6.5245e-05, + "loss": 0.3695, + "step": 13056 + }, + { + "epoch": 0.7311569044685855, + "grad_norm": 1.188379168510437, + "learning_rate": 6.525e-05, + "loss": 0.3401, + "step": 13057 + }, + { + "epoch": 0.7312129017807145, + "grad_norm": 1.364189863204956, + "learning_rate": 6.5255e-05, + "loss": 0.351, + "step": 13058 + }, + { + "epoch": 0.7312688990928435, + "grad_norm": 1.245988368988037, + "learning_rate": 6.526e-05, + "loss": 0.4417, + "step": 13059 + }, + { + "epoch": 0.7313248964049726, + "grad_norm": 1.2236649990081787, + "learning_rate": 6.526499999999999e-05, + "loss": 0.3758, + "step": 13060 + }, + { + "epoch": 0.7313808937171016, + "grad_norm": 1.0870695114135742, + "learning_rate": 6.527e-05, + "loss": 0.2935, + "step": 13061 + }, + { + "epoch": 0.7314368910292306, + "grad_norm": 1.4593501091003418, + "learning_rate": 6.527500000000001e-05, + "loss": 0.4677, + "step": 13062 + }, + { + "epoch": 0.7314928883413596, + "grad_norm": 1.3815888166427612, + "learning_rate": 6.528000000000001e-05, + "loss": 0.3849, + "step": 13063 + }, + { + "epoch": 0.7315488856534886, + "grad_norm": 1.5010366439819336, + "learning_rate": 6.528500000000001e-05, + "loss": 0.416, + "step": 13064 + }, + { + "epoch": 0.7316048829656177, + "grad_norm": 1.7675964832305908, + "learning_rate": 6.529e-05, + "loss": 0.3649, + "step": 13065 + }, + { + "epoch": 0.7316608802777467, + "grad_norm": 1.5239472389221191, + "learning_rate": 6.5295e-05, + "loss": 0.4546, + "step": 13066 + }, + { + "epoch": 0.7317168775898757, + "grad_norm": 1.4678200483322144, + "learning_rate": 6.53e-05, + "loss": 0.4545, + "step": 13067 + }, + { + "epoch": 0.7317728749020047, + "grad_norm": 1.3831298351287842, + "learning_rate": 6.5305e-05, + "loss": 0.4052, + "step": 13068 + }, + { + "epoch": 0.7318288722141337, + "grad_norm": 1.2077752351760864, + "learning_rate": 6.531000000000001e-05, + "loss": 0.4256, + "step": 13069 
+ }, + { + "epoch": 0.7318848695262627, + "grad_norm": 1.3360260725021362, + "learning_rate": 6.531500000000001e-05, + "loss": 0.4347, + "step": 13070 + }, + { + "epoch": 0.7319408668383918, + "grad_norm": 1.3099371194839478, + "learning_rate": 6.532e-05, + "loss": 0.4831, + "step": 13071 + }, + { + "epoch": 0.7319968641505208, + "grad_norm": 1.2227438688278198, + "learning_rate": 6.5325e-05, + "loss": 0.4765, + "step": 13072 + }, + { + "epoch": 0.7320528614626498, + "grad_norm": 1.2485935688018799, + "learning_rate": 6.533e-05, + "loss": 0.3406, + "step": 13073 + }, + { + "epoch": 0.7321088587747788, + "grad_norm": 1.3343572616577148, + "learning_rate": 6.5335e-05, + "loss": 0.4625, + "step": 13074 + }, + { + "epoch": 0.7321648560869078, + "grad_norm": 1.1251615285873413, + "learning_rate": 6.534e-05, + "loss": 0.3521, + "step": 13075 + }, + { + "epoch": 0.7322208533990369, + "grad_norm": 1.5101428031921387, + "learning_rate": 6.5345e-05, + "loss": 0.5716, + "step": 13076 + }, + { + "epoch": 0.7322768507111659, + "grad_norm": 1.3580188751220703, + "learning_rate": 6.535e-05, + "loss": 0.4463, + "step": 13077 + }, + { + "epoch": 0.7323328480232949, + "grad_norm": 1.3270171880722046, + "learning_rate": 6.5355e-05, + "loss": 0.5816, + "step": 13078 + }, + { + "epoch": 0.7323888453354239, + "grad_norm": 1.1969938278198242, + "learning_rate": 6.536e-05, + "loss": 0.354, + "step": 13079 + }, + { + "epoch": 0.7324448426475529, + "grad_norm": 1.433491587638855, + "learning_rate": 6.5365e-05, + "loss": 0.3844, + "step": 13080 + }, + { + "epoch": 0.732500839959682, + "grad_norm": 1.302200198173523, + "learning_rate": 6.536999999999999e-05, + "loss": 0.4395, + "step": 13081 + }, + { + "epoch": 0.732556837271811, + "grad_norm": 1.4608463048934937, + "learning_rate": 6.5375e-05, + "loss": 0.4483, + "step": 13082 + }, + { + "epoch": 0.73261283458394, + "grad_norm": 1.3989903926849365, + "learning_rate": 6.538000000000001e-05, + "loss": 0.4813, + "step": 13083 + }, + { + 
"epoch": 0.732668831896069, + "grad_norm": 1.2609761953353882, + "learning_rate": 6.538500000000001e-05, + "loss": 0.39, + "step": 13084 + }, + { + "epoch": 0.732724829208198, + "grad_norm": 1.1653172969818115, + "learning_rate": 6.539000000000001e-05, + "loss": 0.4204, + "step": 13085 + }, + { + "epoch": 0.732780826520327, + "grad_norm": 1.3216698169708252, + "learning_rate": 6.5395e-05, + "loss": 0.4472, + "step": 13086 + }, + { + "epoch": 0.7328368238324561, + "grad_norm": 1.3972324132919312, + "learning_rate": 6.54e-05, + "loss": 0.5031, + "step": 13087 + }, + { + "epoch": 0.7328928211445851, + "grad_norm": 1.205294132232666, + "learning_rate": 6.5405e-05, + "loss": 0.4253, + "step": 13088 + }, + { + "epoch": 0.7329488184567141, + "grad_norm": 1.3671629428863525, + "learning_rate": 6.541e-05, + "loss": 0.4748, + "step": 13089 + }, + { + "epoch": 0.7330048157688431, + "grad_norm": 1.2196563482284546, + "learning_rate": 6.541500000000001e-05, + "loss": 0.4939, + "step": 13090 + }, + { + "epoch": 0.7330608130809722, + "grad_norm": 1.2559672594070435, + "learning_rate": 6.542000000000001e-05, + "loss": 0.3385, + "step": 13091 + }, + { + "epoch": 0.7331168103931012, + "grad_norm": 1.2398109436035156, + "learning_rate": 6.5425e-05, + "loss": 0.4241, + "step": 13092 + }, + { + "epoch": 0.7331728077052302, + "grad_norm": 1.8966413736343384, + "learning_rate": 6.543e-05, + "loss": 0.5796, + "step": 13093 + }, + { + "epoch": 0.7332288050173592, + "grad_norm": 1.4565420150756836, + "learning_rate": 6.5435e-05, + "loss": 0.4427, + "step": 13094 + }, + { + "epoch": 0.7332848023294882, + "grad_norm": 1.4604127407073975, + "learning_rate": 6.544e-05, + "loss": 0.5573, + "step": 13095 + }, + { + "epoch": 0.7333407996416172, + "grad_norm": 1.456725001335144, + "learning_rate": 6.5445e-05, + "loss": 0.459, + "step": 13096 + }, + { + "epoch": 0.7333967969537463, + "grad_norm": 1.8771783113479614, + "learning_rate": 6.545e-05, + "loss": 0.4563, + "step": 13097 + }, + { + "epoch": 
0.7334527942658753, + "grad_norm": 1.2887972593307495, + "learning_rate": 6.5455e-05, + "loss": 0.5581, + "step": 13098 + }, + { + "epoch": 0.7335087915780043, + "grad_norm": 1.593860387802124, + "learning_rate": 6.546e-05, + "loss": 0.4394, + "step": 13099 + }, + { + "epoch": 0.7335647888901333, + "grad_norm": 1.411238431930542, + "learning_rate": 6.5465e-05, + "loss": 0.4793, + "step": 13100 + }, + { + "epoch": 0.7336207862022623, + "grad_norm": 1.213128924369812, + "learning_rate": 6.547e-05, + "loss": 0.4316, + "step": 13101 + }, + { + "epoch": 0.7336767835143914, + "grad_norm": 1.4428856372833252, + "learning_rate": 6.5475e-05, + "loss": 0.4737, + "step": 13102 + }, + { + "epoch": 0.7337327808265204, + "grad_norm": 1.2088133096694946, + "learning_rate": 6.548e-05, + "loss": 0.4506, + "step": 13103 + }, + { + "epoch": 0.7337887781386494, + "grad_norm": 1.3256062269210815, + "learning_rate": 6.548500000000001e-05, + "loss": 0.5224, + "step": 13104 + }, + { + "epoch": 0.7338447754507783, + "grad_norm": 1.1555911302566528, + "learning_rate": 6.549000000000001e-05, + "loss": 0.3475, + "step": 13105 + }, + { + "epoch": 0.7339007727629073, + "grad_norm": 1.248432993888855, + "learning_rate": 6.549500000000001e-05, + "loss": 0.3861, + "step": 13106 + }, + { + "epoch": 0.7339567700750363, + "grad_norm": 1.095834732055664, + "learning_rate": 6.55e-05, + "loss": 0.3302, + "step": 13107 + }, + { + "epoch": 0.7340127673871654, + "grad_norm": 0.9473075866699219, + "learning_rate": 6.5505e-05, + "loss": 0.2724, + "step": 13108 + }, + { + "epoch": 0.7340687646992944, + "grad_norm": Infinity, + "learning_rate": 6.5505e-05, + "loss": 0.4543, + "step": 13109 + }, + { + "epoch": 0.7341247620114234, + "grad_norm": 1.1974743604660034, + "learning_rate": 6.551e-05, + "loss": 0.3362, + "step": 13110 + }, + { + "epoch": 0.7341807593235524, + "grad_norm": 1.695594072341919, + "learning_rate": 6.5515e-05, + "loss": 0.4449, + "step": 13111 + }, + { + "epoch": 0.7342367566356814, + 
"grad_norm": 1.3305492401123047, + "learning_rate": 6.552000000000001e-05, + "loss": 0.3226, + "step": 13112 + }, + { + "epoch": 0.7342927539478105, + "grad_norm": 1.1246436834335327, + "learning_rate": 6.552500000000001e-05, + "loss": 0.3647, + "step": 13113 + }, + { + "epoch": 0.7343487512599395, + "grad_norm": 1.2378196716308594, + "learning_rate": 6.553e-05, + "loss": 0.4981, + "step": 13114 + }, + { + "epoch": 0.7344047485720685, + "grad_norm": 2.289897918701172, + "learning_rate": 6.5535e-05, + "loss": 0.5156, + "step": 13115 + }, + { + "epoch": 0.7344607458841975, + "grad_norm": 1.1310378313064575, + "learning_rate": 6.554e-05, + "loss": 0.4595, + "step": 13116 + }, + { + "epoch": 0.7345167431963265, + "grad_norm": 1.167104959487915, + "learning_rate": 6.5545e-05, + "loss": 0.3827, + "step": 13117 + }, + { + "epoch": 0.7345727405084556, + "grad_norm": 1.4493050575256348, + "learning_rate": 6.555e-05, + "loss": 0.4118, + "step": 13118 + }, + { + "epoch": 0.7346287378205846, + "grad_norm": 1.5088547468185425, + "learning_rate": 6.5555e-05, + "loss": 0.4831, + "step": 13119 + }, + { + "epoch": 0.7346847351327136, + "grad_norm": 1.1988805532455444, + "learning_rate": 6.556e-05, + "loss": 0.419, + "step": 13120 + }, + { + "epoch": 0.7347407324448426, + "grad_norm": 1.561507225036621, + "learning_rate": 6.5565e-05, + "loss": 0.5169, + "step": 13121 + }, + { + "epoch": 0.7347967297569716, + "grad_norm": 1.3324174880981445, + "learning_rate": 6.557e-05, + "loss": 0.4567, + "step": 13122 + }, + { + "epoch": 0.7348527270691007, + "grad_norm": 1.2111668586730957, + "learning_rate": 6.557500000000001e-05, + "loss": 0.4117, + "step": 13123 + }, + { + "epoch": 0.7349087243812297, + "grad_norm": 1.8720570802688599, + "learning_rate": 6.558e-05, + "loss": 0.5808, + "step": 13124 + }, + { + "epoch": 0.7349647216933587, + "grad_norm": 1.7627100944519043, + "learning_rate": 6.5585e-05, + "loss": 0.4777, + "step": 13125 + }, + { + "epoch": 0.7350207190054877, + "grad_norm": 
1.185268759727478, + "learning_rate": 6.559e-05, + "loss": 0.3688, + "step": 13126 + }, + { + "epoch": 0.7350767163176167, + "grad_norm": 1.5869851112365723, + "learning_rate": 6.559500000000001e-05, + "loss": 0.5782, + "step": 13127 + }, + { + "epoch": 0.7351327136297457, + "grad_norm": 1.3997811079025269, + "learning_rate": 6.560000000000001e-05, + "loss": 0.3659, + "step": 13128 + }, + { + "epoch": 0.7351887109418748, + "grad_norm": 1.2208737134933472, + "learning_rate": 6.5605e-05, + "loss": 0.4605, + "step": 13129 + }, + { + "epoch": 0.7352447082540038, + "grad_norm": 1.28058922290802, + "learning_rate": 6.561e-05, + "loss": 0.537, + "step": 13130 + }, + { + "epoch": 0.7353007055661328, + "grad_norm": 1.0700160264968872, + "learning_rate": 6.5615e-05, + "loss": 0.4376, + "step": 13131 + }, + { + "epoch": 0.7353567028782618, + "grad_norm": 1.3895633220672607, + "learning_rate": 6.562e-05, + "loss": 0.4032, + "step": 13132 + }, + { + "epoch": 0.7354127001903908, + "grad_norm": 1.2850769758224487, + "learning_rate": 6.562500000000001e-05, + "loss": 0.5439, + "step": 13133 + }, + { + "epoch": 0.7354686975025199, + "grad_norm": 1.3515318632125854, + "learning_rate": 6.563000000000001e-05, + "loss": 0.5074, + "step": 13134 + }, + { + "epoch": 0.7355246948146489, + "grad_norm": 1.4849662780761719, + "learning_rate": 6.5635e-05, + "loss": 0.6059, + "step": 13135 + }, + { + "epoch": 0.7355806921267779, + "grad_norm": 1.2736537456512451, + "learning_rate": 6.564e-05, + "loss": 0.5221, + "step": 13136 + }, + { + "epoch": 0.7356366894389069, + "grad_norm": 1.1367994546890259, + "learning_rate": 6.5645e-05, + "loss": 0.4085, + "step": 13137 + }, + { + "epoch": 0.7356926867510359, + "grad_norm": 1.3408769369125366, + "learning_rate": 6.565e-05, + "loss": 0.4669, + "step": 13138 + }, + { + "epoch": 0.735748684063165, + "grad_norm": 1.2693918943405151, + "learning_rate": 6.5655e-05, + "loss": 0.4661, + "step": 13139 + }, + { + "epoch": 0.735804681375294, + "grad_norm": 
1.6538734436035156, + "learning_rate": 6.566e-05, + "loss": 0.4861, + "step": 13140 + }, + { + "epoch": 0.735860678687423, + "grad_norm": 1.1471478939056396, + "learning_rate": 6.5665e-05, + "loss": 0.4333, + "step": 13141 + }, + { + "epoch": 0.735916675999552, + "grad_norm": 1.5495682954788208, + "learning_rate": 6.567e-05, + "loss": 0.4726, + "step": 13142 + }, + { + "epoch": 0.735972673311681, + "grad_norm": 1.4408315420150757, + "learning_rate": 6.5675e-05, + "loss": 0.5122, + "step": 13143 + }, + { + "epoch": 0.73602867062381, + "grad_norm": 1.115729808807373, + "learning_rate": 6.568000000000001e-05, + "loss": 0.3786, + "step": 13144 + }, + { + "epoch": 0.7360846679359391, + "grad_norm": 1.2041523456573486, + "learning_rate": 6.5685e-05, + "loss": 0.3107, + "step": 13145 + }, + { + "epoch": 0.7361406652480681, + "grad_norm": 1.2265079021453857, + "learning_rate": 6.569e-05, + "loss": 0.3852, + "step": 13146 + }, + { + "epoch": 0.7361966625601971, + "grad_norm": 1.2132033109664917, + "learning_rate": 6.5695e-05, + "loss": 0.475, + "step": 13147 + }, + { + "epoch": 0.7362526598723261, + "grad_norm": 1.082369089126587, + "learning_rate": 6.570000000000001e-05, + "loss": 0.3402, + "step": 13148 + }, + { + "epoch": 0.7363086571844552, + "grad_norm": 1.333266019821167, + "learning_rate": 6.570500000000001e-05, + "loss": 0.4125, + "step": 13149 + }, + { + "epoch": 0.7363646544965842, + "grad_norm": 1.2038555145263672, + "learning_rate": 6.571e-05, + "loss": 0.3986, + "step": 13150 + }, + { + "epoch": 0.7364206518087132, + "grad_norm": 1.3272336721420288, + "learning_rate": 6.5715e-05, + "loss": 0.4539, + "step": 13151 + }, + { + "epoch": 0.7364766491208422, + "grad_norm": 1.3587881326675415, + "learning_rate": 6.572e-05, + "loss": 0.4755, + "step": 13152 + }, + { + "epoch": 0.7365326464329712, + "grad_norm": 1.3513455390930176, + "learning_rate": 6.5725e-05, + "loss": 0.4184, + "step": 13153 + }, + { + "epoch": 0.7365886437451002, + "grad_norm": 1.403626561164856, + 
"learning_rate": 6.573000000000001e-05, + "loss": 0.5434, + "step": 13154 + }, + { + "epoch": 0.7366446410572293, + "grad_norm": 1.3098336458206177, + "learning_rate": 6.5735e-05, + "loss": 0.4434, + "step": 13155 + }, + { + "epoch": 0.7367006383693583, + "grad_norm": 1.129221796989441, + "learning_rate": 6.574e-05, + "loss": 0.3664, + "step": 13156 + }, + { + "epoch": 0.7367566356814873, + "grad_norm": 1.4928340911865234, + "learning_rate": 6.5745e-05, + "loss": 0.4269, + "step": 13157 + }, + { + "epoch": 0.7368126329936163, + "grad_norm": 1.458243727684021, + "learning_rate": 6.575e-05, + "loss": 0.44, + "step": 13158 + }, + { + "epoch": 0.7368686303057453, + "grad_norm": 1.3369060754776, + "learning_rate": 6.5755e-05, + "loss": 0.5347, + "step": 13159 + }, + { + "epoch": 0.7369246276178744, + "grad_norm": 1.359553337097168, + "learning_rate": 6.576e-05, + "loss": 0.3842, + "step": 13160 + }, + { + "epoch": 0.7369806249300034, + "grad_norm": 1.1033194065093994, + "learning_rate": 6.5765e-05, + "loss": 0.3966, + "step": 13161 + }, + { + "epoch": 0.7370366222421324, + "grad_norm": 1.2944777011871338, + "learning_rate": 6.577e-05, + "loss": 0.5503, + "step": 13162 + }, + { + "epoch": 0.7370926195542614, + "grad_norm": 1.1972578763961792, + "learning_rate": 6.5775e-05, + "loss": 0.4032, + "step": 13163 + }, + { + "epoch": 0.7371486168663904, + "grad_norm": 1.368035078048706, + "learning_rate": 6.578000000000001e-05, + "loss": 0.4314, + "step": 13164 + }, + { + "epoch": 0.7372046141785195, + "grad_norm": 1.3839082717895508, + "learning_rate": 6.578500000000001e-05, + "loss": 0.3676, + "step": 13165 + }, + { + "epoch": 0.7372606114906485, + "grad_norm": 1.2854747772216797, + "learning_rate": 6.579e-05, + "loss": 0.4638, + "step": 13166 + }, + { + "epoch": 0.7373166088027775, + "grad_norm": 1.4696745872497559, + "learning_rate": 6.5795e-05, + "loss": 0.4465, + "step": 13167 + }, + { + "epoch": 0.7373726061149065, + "grad_norm": 1.368017315864563, + "learning_rate": 
6.58e-05, + "loss": 0.4276, + "step": 13168 + }, + { + "epoch": 0.7374286034270355, + "grad_norm": 1.272612452507019, + "learning_rate": 6.580500000000001e-05, + "loss": 0.4243, + "step": 13169 + }, + { + "epoch": 0.7374846007391646, + "grad_norm": 1.2280429601669312, + "learning_rate": 6.581000000000001e-05, + "loss": 0.5592, + "step": 13170 + }, + { + "epoch": 0.7375405980512936, + "grad_norm": 1.3454099893569946, + "learning_rate": 6.5815e-05, + "loss": 0.4519, + "step": 13171 + }, + { + "epoch": 0.7375965953634226, + "grad_norm": 1.195507526397705, + "learning_rate": 6.582e-05, + "loss": 0.384, + "step": 13172 + }, + { + "epoch": 0.7376525926755516, + "grad_norm": 0.985457718372345, + "learning_rate": 6.5825e-05, + "loss": 0.3247, + "step": 13173 + }, + { + "epoch": 0.7377085899876806, + "grad_norm": 1.602728247642517, + "learning_rate": 6.583e-05, + "loss": 0.4885, + "step": 13174 + }, + { + "epoch": 0.7377645872998096, + "grad_norm": 1.2869107723236084, + "learning_rate": 6.5835e-05, + "loss": 0.3641, + "step": 13175 + }, + { + "epoch": 0.7378205846119387, + "grad_norm": 1.3212437629699707, + "learning_rate": 6.584e-05, + "loss": 0.4423, + "step": 13176 + }, + { + "epoch": 0.7378765819240677, + "grad_norm": 2.345132350921631, + "learning_rate": 6.5845e-05, + "loss": 0.3675, + "step": 13177 + }, + { + "epoch": 0.7379325792361967, + "grad_norm": 1.5399130582809448, + "learning_rate": 6.585e-05, + "loss": 0.6307, + "step": 13178 + }, + { + "epoch": 0.7379885765483257, + "grad_norm": 1.4886460304260254, + "learning_rate": 6.5855e-05, + "loss": 0.3838, + "step": 13179 + }, + { + "epoch": 0.7380445738604547, + "grad_norm": 1.2662973403930664, + "learning_rate": 6.586e-05, + "loss": 0.4451, + "step": 13180 + }, + { + "epoch": 0.7381005711725838, + "grad_norm": 1.113808035850525, + "learning_rate": 6.5865e-05, + "loss": 0.4122, + "step": 13181 + }, + { + "epoch": 0.7381565684847128, + "grad_norm": 1.5719285011291504, + "learning_rate": 6.587e-05, + "loss": 0.4248, + 
"step": 13182 + }, + { + "epoch": 0.7382125657968418, + "grad_norm": 1.3040238618850708, + "learning_rate": 6.5875e-05, + "loss": 0.4513, + "step": 13183 + }, + { + "epoch": 0.7382685631089708, + "grad_norm": 1.3562145233154297, + "learning_rate": 6.588000000000001e-05, + "loss": 0.4359, + "step": 13184 + }, + { + "epoch": 0.7383245604210998, + "grad_norm": 1.2630589008331299, + "learning_rate": 6.588500000000001e-05, + "loss": 0.4191, + "step": 13185 + }, + { + "epoch": 0.7383805577332289, + "grad_norm": 1.2360169887542725, + "learning_rate": 6.589000000000001e-05, + "loss": 0.4591, + "step": 13186 + }, + { + "epoch": 0.7384365550453578, + "grad_norm": 1.296000361442566, + "learning_rate": 6.5895e-05, + "loss": 0.4327, + "step": 13187 + }, + { + "epoch": 0.7384925523574868, + "grad_norm": 1.523987054824829, + "learning_rate": 6.59e-05, + "loss": 0.456, + "step": 13188 + }, + { + "epoch": 0.7385485496696158, + "grad_norm": 1.7142254114151, + "learning_rate": 6.5905e-05, + "loss": 0.5521, + "step": 13189 + }, + { + "epoch": 0.7386045469817448, + "grad_norm": 1.0339454412460327, + "learning_rate": 6.591000000000001e-05, + "loss": 0.3673, + "step": 13190 + }, + { + "epoch": 0.7386605442938738, + "grad_norm": 1.3455902338027954, + "learning_rate": 6.591500000000001e-05, + "loss": 0.4764, + "step": 13191 + }, + { + "epoch": 0.7387165416060029, + "grad_norm": 1.2935000658035278, + "learning_rate": 6.592e-05, + "loss": 0.4256, + "step": 13192 + }, + { + "epoch": 0.7387725389181319, + "grad_norm": 1.2083535194396973, + "learning_rate": 6.5925e-05, + "loss": 0.3604, + "step": 13193 + }, + { + "epoch": 0.7388285362302609, + "grad_norm": 1.4803552627563477, + "learning_rate": 6.593e-05, + "loss": 0.4738, + "step": 13194 + }, + { + "epoch": 0.7388845335423899, + "grad_norm": 1.3232859373092651, + "learning_rate": 6.5935e-05, + "loss": 0.4574, + "step": 13195 + }, + { + "epoch": 0.7389405308545189, + "grad_norm": 1.2311434745788574, + "learning_rate": 6.594e-05, + "loss": 
0.5001, + "step": 13196 + }, + { + "epoch": 0.738996528166648, + "grad_norm": 1.3587087392807007, + "learning_rate": 6.5945e-05, + "loss": 0.5302, + "step": 13197 + }, + { + "epoch": 0.739052525478777, + "grad_norm": 1.3923612833023071, + "learning_rate": 6.595e-05, + "loss": 0.3617, + "step": 13198 + }, + { + "epoch": 0.739108522790906, + "grad_norm": 1.3061606884002686, + "learning_rate": 6.5955e-05, + "loss": 0.5975, + "step": 13199 + }, + { + "epoch": 0.739164520103035, + "grad_norm": 1.2932555675506592, + "learning_rate": 6.596e-05, + "loss": 0.4979, + "step": 13200 + }, + { + "epoch": 0.739220517415164, + "grad_norm": 2.5977580547332764, + "learning_rate": 6.5965e-05, + "loss": 0.3575, + "step": 13201 + }, + { + "epoch": 0.739276514727293, + "grad_norm": 1.2210155725479126, + "learning_rate": 6.597e-05, + "loss": 0.5616, + "step": 13202 + }, + { + "epoch": 0.7393325120394221, + "grad_norm": 1.3751977682113647, + "learning_rate": 6.5975e-05, + "loss": 0.4162, + "step": 13203 + }, + { + "epoch": 0.7393885093515511, + "grad_norm": 1.316551685333252, + "learning_rate": 6.598e-05, + "loss": 0.5675, + "step": 13204 + }, + { + "epoch": 0.7394445066636801, + "grad_norm": 1.4882951974868774, + "learning_rate": 6.598500000000001e-05, + "loss": 0.5544, + "step": 13205 + }, + { + "epoch": 0.7395005039758091, + "grad_norm": 1.2412019968032837, + "learning_rate": 6.599000000000001e-05, + "loss": 0.3857, + "step": 13206 + }, + { + "epoch": 0.7395565012879382, + "grad_norm": 1.1741154193878174, + "learning_rate": 6.599500000000001e-05, + "loss": 0.4972, + "step": 13207 + }, + { + "epoch": 0.7396124986000672, + "grad_norm": 1.460400938987732, + "learning_rate": 6.6e-05, + "loss": 0.4688, + "step": 13208 + }, + { + "epoch": 0.7396684959121962, + "grad_norm": 1.3373973369598389, + "learning_rate": 6.6005e-05, + "loss": 0.3796, + "step": 13209 + }, + { + "epoch": 0.7397244932243252, + "grad_norm": 1.4087990522384644, + "learning_rate": 6.601e-05, + "loss": 0.7111, + "step": 
13210 + }, + { + "epoch": 0.7397804905364542, + "grad_norm": 1.1778781414031982, + "learning_rate": 6.601500000000001e-05, + "loss": 0.2816, + "step": 13211 + }, + { + "epoch": 0.7398364878485832, + "grad_norm": 1.573437213897705, + "learning_rate": 6.602000000000001e-05, + "loss": 0.5817, + "step": 13212 + }, + { + "epoch": 0.7398924851607123, + "grad_norm": 1.4049535989761353, + "learning_rate": 6.6025e-05, + "loss": 0.3513, + "step": 13213 + }, + { + "epoch": 0.7399484824728413, + "grad_norm": 1.3372673988342285, + "learning_rate": 6.603e-05, + "loss": 0.523, + "step": 13214 + }, + { + "epoch": 0.7400044797849703, + "grad_norm": 1.2721821069717407, + "learning_rate": 6.6035e-05, + "loss": 0.5244, + "step": 13215 + }, + { + "epoch": 0.7400604770970993, + "grad_norm": 1.2308804988861084, + "learning_rate": 6.604e-05, + "loss": 0.3764, + "step": 13216 + }, + { + "epoch": 0.7401164744092283, + "grad_norm": 1.28029465675354, + "learning_rate": 6.6045e-05, + "loss": 0.4245, + "step": 13217 + }, + { + "epoch": 0.7401724717213574, + "grad_norm": 1.2959840297698975, + "learning_rate": 6.605e-05, + "loss": 0.5164, + "step": 13218 + }, + { + "epoch": 0.7402284690334864, + "grad_norm": 1.1691176891326904, + "learning_rate": 6.6055e-05, + "loss": 0.4248, + "step": 13219 + }, + { + "epoch": 0.7402844663456154, + "grad_norm": 1.4101749658584595, + "learning_rate": 6.606e-05, + "loss": 0.4106, + "step": 13220 + }, + { + "epoch": 0.7403404636577444, + "grad_norm": 1.2446681261062622, + "learning_rate": 6.6065e-05, + "loss": 0.5375, + "step": 13221 + }, + { + "epoch": 0.7403964609698734, + "grad_norm": 1.48183274269104, + "learning_rate": 6.607e-05, + "loss": 0.543, + "step": 13222 + }, + { + "epoch": 0.7404524582820025, + "grad_norm": 1.2818819284439087, + "learning_rate": 6.6075e-05, + "loss": 0.3969, + "step": 13223 + }, + { + "epoch": 0.7405084555941315, + "grad_norm": 1.3402365446090698, + "learning_rate": 6.608e-05, + "loss": 0.3758, + "step": 13224 + }, + { + "epoch": 
0.7405644529062605, + "grad_norm": 1.1508859395980835, + "learning_rate": 6.6085e-05, + "loss": 0.4746, + "step": 13225 + }, + { + "epoch": 0.7406204502183895, + "grad_norm": 1.318395733833313, + "learning_rate": 6.609000000000001e-05, + "loss": 0.4383, + "step": 13226 + }, + { + "epoch": 0.7406764475305185, + "grad_norm": 1.2404429912567139, + "learning_rate": 6.609500000000001e-05, + "loss": 0.4597, + "step": 13227 + }, + { + "epoch": 0.7407324448426476, + "grad_norm": 1.0450571775436401, + "learning_rate": 6.610000000000001e-05, + "loss": 0.3557, + "step": 13228 + }, + { + "epoch": 0.7407884421547766, + "grad_norm": 1.453265905380249, + "learning_rate": 6.6105e-05, + "loss": 0.4221, + "step": 13229 + }, + { + "epoch": 0.7408444394669056, + "grad_norm": 1.6890655755996704, + "learning_rate": 6.611e-05, + "loss": 0.4914, + "step": 13230 + }, + { + "epoch": 0.7409004367790346, + "grad_norm": 1.4456104040145874, + "learning_rate": 6.6115e-05, + "loss": 0.343, + "step": 13231 + }, + { + "epoch": 0.7409564340911636, + "grad_norm": 1.3714569807052612, + "learning_rate": 6.612000000000001e-05, + "loss": 0.365, + "step": 13232 + }, + { + "epoch": 0.7410124314032926, + "grad_norm": 1.425466537475586, + "learning_rate": 6.612500000000001e-05, + "loss": 0.431, + "step": 13233 + }, + { + "epoch": 0.7410684287154217, + "grad_norm": 1.1059119701385498, + "learning_rate": 6.613e-05, + "loss": 0.4517, + "step": 13234 + }, + { + "epoch": 0.7411244260275507, + "grad_norm": 1.4817248582839966, + "learning_rate": 6.6135e-05, + "loss": 0.456, + "step": 13235 + }, + { + "epoch": 0.7411804233396797, + "grad_norm": 1.1598478555679321, + "learning_rate": 6.614e-05, + "loss": 0.3313, + "step": 13236 + }, + { + "epoch": 0.7412364206518087, + "grad_norm": 1.3717542886734009, + "learning_rate": 6.6145e-05, + "loss": 0.5052, + "step": 13237 + }, + { + "epoch": 0.7412924179639377, + "grad_norm": 1.4055230617523193, + "learning_rate": 6.615e-05, + "loss": 0.4718, + "step": 13238 + }, + { + 
"epoch": 0.7413484152760668, + "grad_norm": 2.4605820178985596, + "learning_rate": 6.6155e-05, + "loss": 0.4463, + "step": 13239 + }, + { + "epoch": 0.7414044125881958, + "grad_norm": 1.2312734127044678, + "learning_rate": 6.616e-05, + "loss": 0.4432, + "step": 13240 + }, + { + "epoch": 0.7414604099003248, + "grad_norm": 1.5527117252349854, + "learning_rate": 6.6165e-05, + "loss": 0.7414, + "step": 13241 + }, + { + "epoch": 0.7415164072124538, + "grad_norm": 1.4419389963150024, + "learning_rate": 6.617e-05, + "loss": 0.4566, + "step": 13242 + }, + { + "epoch": 0.7415724045245828, + "grad_norm": 1.1105155944824219, + "learning_rate": 6.6175e-05, + "loss": 0.3224, + "step": 13243 + }, + { + "epoch": 0.7416284018367119, + "grad_norm": 1.355978012084961, + "learning_rate": 6.618e-05, + "loss": 0.5827, + "step": 13244 + }, + { + "epoch": 0.7416843991488409, + "grad_norm": 1.4871820211410522, + "learning_rate": 6.6185e-05, + "loss": 0.5677, + "step": 13245 + }, + { + "epoch": 0.7417403964609699, + "grad_norm": 1.2730991840362549, + "learning_rate": 6.619e-05, + "loss": 0.4863, + "step": 13246 + }, + { + "epoch": 0.7417963937730989, + "grad_norm": 6.5206217765808105, + "learning_rate": 6.619500000000001e-05, + "loss": 0.4476, + "step": 13247 + }, + { + "epoch": 0.7418523910852279, + "grad_norm": 1.3759405612945557, + "learning_rate": 6.620000000000001e-05, + "loss": 0.4373, + "step": 13248 + }, + { + "epoch": 0.741908388397357, + "grad_norm": 1.4291212558746338, + "learning_rate": 6.620500000000001e-05, + "loss": 0.4196, + "step": 13249 + }, + { + "epoch": 0.741964385709486, + "grad_norm": 1.3058536052703857, + "learning_rate": 6.621e-05, + "loss": 0.4366, + "step": 13250 + }, + { + "epoch": 0.742020383021615, + "grad_norm": 1.2478581666946411, + "learning_rate": 6.6215e-05, + "loss": 0.3752, + "step": 13251 + }, + { + "epoch": 0.742076380333744, + "grad_norm": 1.4263951778411865, + "learning_rate": 6.622e-05, + "loss": 0.5217, + "step": 13252 + }, + { + "epoch": 
0.742132377645873, + "grad_norm": 1.5010628700256348, + "learning_rate": 6.6225e-05, + "loss": 0.5169, + "step": 13253 + }, + { + "epoch": 0.742188374958002, + "grad_norm": 1.0944013595581055, + "learning_rate": 6.623000000000001e-05, + "loss": 0.3406, + "step": 13254 + }, + { + "epoch": 0.7422443722701311, + "grad_norm": 1.07964289188385, + "learning_rate": 6.6235e-05, + "loss": 0.3515, + "step": 13255 + }, + { + "epoch": 0.7423003695822601, + "grad_norm": 1.139193058013916, + "learning_rate": 6.624e-05, + "loss": 0.4078, + "step": 13256 + }, + { + "epoch": 0.7423563668943891, + "grad_norm": 1.1736921072006226, + "learning_rate": 6.6245e-05, + "loss": 0.359, + "step": 13257 + }, + { + "epoch": 0.7424123642065181, + "grad_norm": 1.059420108795166, + "learning_rate": 6.625e-05, + "loss": 0.4044, + "step": 13258 + }, + { + "epoch": 0.7424683615186471, + "grad_norm": 1.1916396617889404, + "learning_rate": 6.6255e-05, + "loss": 0.3729, + "step": 13259 + }, + { + "epoch": 0.7425243588307762, + "grad_norm": 1.566210150718689, + "learning_rate": 6.626e-05, + "loss": 0.4848, + "step": 13260 + }, + { + "epoch": 0.7425803561429052, + "grad_norm": 1.3561042547225952, + "learning_rate": 6.6265e-05, + "loss": 0.437, + "step": 13261 + }, + { + "epoch": 0.7426363534550342, + "grad_norm": 1.1792902946472168, + "learning_rate": 6.627e-05, + "loss": 0.3632, + "step": 13262 + }, + { + "epoch": 0.7426923507671632, + "grad_norm": 1.3329291343688965, + "learning_rate": 6.6275e-05, + "loss": 0.4487, + "step": 13263 + }, + { + "epoch": 0.7427483480792922, + "grad_norm": 1.4227898120880127, + "learning_rate": 6.628e-05, + "loss": 0.5119, + "step": 13264 + }, + { + "epoch": 0.7428043453914213, + "grad_norm": 1.191625714302063, + "learning_rate": 6.6285e-05, + "loss": 0.4121, + "step": 13265 + }, + { + "epoch": 0.7428603427035503, + "grad_norm": 1.4005130529403687, + "learning_rate": 6.629e-05, + "loss": 0.5107, + "step": 13266 + }, + { + "epoch": 0.7429163400156793, + "grad_norm": 
1.187508225440979, + "learning_rate": 6.6295e-05, + "loss": 0.4321, + "step": 13267 + }, + { + "epoch": 0.7429723373278083, + "grad_norm": 1.2492766380310059, + "learning_rate": 6.630000000000001e-05, + "loss": 0.4224, + "step": 13268 + }, + { + "epoch": 0.7430283346399373, + "grad_norm": 1.3098195791244507, + "learning_rate": 6.630500000000001e-05, + "loss": 0.3745, + "step": 13269 + }, + { + "epoch": 0.7430843319520662, + "grad_norm": 1.9010365009307861, + "learning_rate": 6.631000000000001e-05, + "loss": 0.5458, + "step": 13270 + }, + { + "epoch": 0.7431403292641953, + "grad_norm": 1.2563695907592773, + "learning_rate": 6.6315e-05, + "loss": 0.3183, + "step": 13271 + }, + { + "epoch": 0.7431963265763243, + "grad_norm": 1.824275255203247, + "learning_rate": 6.632e-05, + "loss": 0.5482, + "step": 13272 + }, + { + "epoch": 0.7432523238884533, + "grad_norm": 1.6714365482330322, + "learning_rate": 6.6325e-05, + "loss": 0.392, + "step": 13273 + }, + { + "epoch": 0.7433083212005823, + "grad_norm": 1.7867680788040161, + "learning_rate": 6.633e-05, + "loss": 0.5196, + "step": 13274 + }, + { + "epoch": 0.7433643185127113, + "grad_norm": 1.5641555786132812, + "learning_rate": 6.633500000000001e-05, + "loss": 0.5168, + "step": 13275 + }, + { + "epoch": 0.7434203158248404, + "grad_norm": 1.3071430921554565, + "learning_rate": 6.634e-05, + "loss": 0.6255, + "step": 13276 + }, + { + "epoch": 0.7434763131369694, + "grad_norm": 1.3810094594955444, + "learning_rate": 6.6345e-05, + "loss": 0.5977, + "step": 13277 + }, + { + "epoch": 0.7435323104490984, + "grad_norm": 1.140381932258606, + "learning_rate": 6.635e-05, + "loss": 0.3518, + "step": 13278 + }, + { + "epoch": 0.7435883077612274, + "grad_norm": 1.3058286905288696, + "learning_rate": 6.6355e-05, + "loss": 0.5564, + "step": 13279 + }, + { + "epoch": 0.7436443050733564, + "grad_norm": 1.2812680006027222, + "learning_rate": 6.636e-05, + "loss": 0.391, + "step": 13280 + }, + { + "epoch": 0.7437003023854855, + "grad_norm": 
1.4396165609359741, + "learning_rate": 6.6365e-05, + "loss": 0.4724, + "step": 13281 + }, + { + "epoch": 0.7437562996976145, + "grad_norm": 1.5894254446029663, + "learning_rate": 6.637e-05, + "loss": 0.4654, + "step": 13282 + }, + { + "epoch": 0.7438122970097435, + "grad_norm": 1.2542455196380615, + "learning_rate": 6.6375e-05, + "loss": 0.3891, + "step": 13283 + }, + { + "epoch": 0.7438682943218725, + "grad_norm": 1.3622913360595703, + "learning_rate": 6.638e-05, + "loss": 0.4856, + "step": 13284 + }, + { + "epoch": 0.7439242916340015, + "grad_norm": 1.3187345266342163, + "learning_rate": 6.638500000000001e-05, + "loss": 0.4348, + "step": 13285 + }, + { + "epoch": 0.7439802889461306, + "grad_norm": 1.3023624420166016, + "learning_rate": 6.639e-05, + "loss": 0.548, + "step": 13286 + }, + { + "epoch": 0.7440362862582596, + "grad_norm": 1.7420628070831299, + "learning_rate": 6.6395e-05, + "loss": 0.5877, + "step": 13287 + }, + { + "epoch": 0.7440922835703886, + "grad_norm": 1.394727349281311, + "learning_rate": 6.64e-05, + "loss": 0.4225, + "step": 13288 + }, + { + "epoch": 0.7441482808825176, + "grad_norm": 1.368857741355896, + "learning_rate": 6.640500000000001e-05, + "loss": 0.3973, + "step": 13289 + }, + { + "epoch": 0.7442042781946466, + "grad_norm": 1.4397590160369873, + "learning_rate": 6.641000000000001e-05, + "loss": 0.4232, + "step": 13290 + }, + { + "epoch": 0.7442602755067756, + "grad_norm": 1.462835669517517, + "learning_rate": 6.641500000000001e-05, + "loss": 0.408, + "step": 13291 + }, + { + "epoch": 0.7443162728189047, + "grad_norm": 1.2053711414337158, + "learning_rate": 6.642e-05, + "loss": 0.3666, + "step": 13292 + }, + { + "epoch": 0.7443722701310337, + "grad_norm": 1.3954273462295532, + "learning_rate": 6.6425e-05, + "loss": 0.4062, + "step": 13293 + }, + { + "epoch": 0.7444282674431627, + "grad_norm": 1.2463713884353638, + "learning_rate": 6.643e-05, + "loss": 0.4379, + "step": 13294 + }, + { + "epoch": 0.7444842647552917, + "grad_norm": 
1.265592098236084, + "learning_rate": 6.6435e-05, + "loss": 0.4643, + "step": 13295 + }, + { + "epoch": 0.7445402620674207, + "grad_norm": 1.3765982389450073, + "learning_rate": 6.644000000000001e-05, + "loss": 0.4802, + "step": 13296 + }, + { + "epoch": 0.7445962593795498, + "grad_norm": 1.4257162809371948, + "learning_rate": 6.6445e-05, + "loss": 0.4158, + "step": 13297 + }, + { + "epoch": 0.7446522566916788, + "grad_norm": 1.2696559429168701, + "learning_rate": 6.645e-05, + "loss": 0.4479, + "step": 13298 + }, + { + "epoch": 0.7447082540038078, + "grad_norm": 1.3374682664871216, + "learning_rate": 6.6455e-05, + "loss": 0.3887, + "step": 13299 + }, + { + "epoch": 0.7447642513159368, + "grad_norm": 1.3666870594024658, + "learning_rate": 6.646e-05, + "loss": 0.4426, + "step": 13300 + }, + { + "epoch": 0.7448202486280658, + "grad_norm": 1.2497977018356323, + "learning_rate": 6.6465e-05, + "loss": 0.5023, + "step": 13301 + }, + { + "epoch": 0.7448762459401949, + "grad_norm": 1.2170549631118774, + "learning_rate": 6.647e-05, + "loss": 0.4167, + "step": 13302 + }, + { + "epoch": 0.7449322432523239, + "grad_norm": 1.234202265739441, + "learning_rate": 6.6475e-05, + "loss": 0.4098, + "step": 13303 + }, + { + "epoch": 0.7449882405644529, + "grad_norm": 1.3242830038070679, + "learning_rate": 6.648e-05, + "loss": 0.4357, + "step": 13304 + }, + { + "epoch": 0.7450442378765819, + "grad_norm": 1.7305229902267456, + "learning_rate": 6.648500000000001e-05, + "loss": 0.4228, + "step": 13305 + }, + { + "epoch": 0.7451002351887109, + "grad_norm": 1.6236753463745117, + "learning_rate": 6.649000000000001e-05, + "loss": 0.5207, + "step": 13306 + }, + { + "epoch": 0.74515623250084, + "grad_norm": 1.654104471206665, + "learning_rate": 6.6495e-05, + "loss": 0.5178, + "step": 13307 + }, + { + "epoch": 0.745212229812969, + "grad_norm": 1.4475064277648926, + "learning_rate": 6.65e-05, + "loss": 0.5265, + "step": 13308 + }, + { + "epoch": 0.745268227125098, + "grad_norm": 1.9489260911941528, 
+ "learning_rate": 6.6505e-05, + "loss": 0.4266, + "step": 13309 + }, + { + "epoch": 0.745324224437227, + "grad_norm": 1.3361420631408691, + "learning_rate": 6.651000000000001e-05, + "loss": 0.3858, + "step": 13310 + }, + { + "epoch": 0.745380221749356, + "grad_norm": 1.3716928958892822, + "learning_rate": 6.651500000000001e-05, + "loss": 0.4142, + "step": 13311 + }, + { + "epoch": 0.745436219061485, + "grad_norm": 1.0638846158981323, + "learning_rate": 6.652000000000001e-05, + "loss": 0.3401, + "step": 13312 + }, + { + "epoch": 0.7454922163736141, + "grad_norm": 1.6768540143966675, + "learning_rate": 6.6525e-05, + "loss": 0.4647, + "step": 13313 + }, + { + "epoch": 0.7455482136857431, + "grad_norm": 1.8939396142959595, + "learning_rate": 6.653e-05, + "loss": 0.4594, + "step": 13314 + }, + { + "epoch": 0.7456042109978721, + "grad_norm": 1.3508588075637817, + "learning_rate": 6.6535e-05, + "loss": 0.345, + "step": 13315 + }, + { + "epoch": 0.7456602083100011, + "grad_norm": 1.226591944694519, + "learning_rate": 6.654e-05, + "loss": 0.3969, + "step": 13316 + }, + { + "epoch": 0.7457162056221301, + "grad_norm": 1.1925842761993408, + "learning_rate": 6.654500000000001e-05, + "loss": 0.5444, + "step": 13317 + }, + { + "epoch": 0.7457722029342592, + "grad_norm": 1.6634101867675781, + "learning_rate": 6.655e-05, + "loss": 0.5147, + "step": 13318 + }, + { + "epoch": 0.7458282002463882, + "grad_norm": 1.3468554019927979, + "learning_rate": 6.6555e-05, + "loss": 0.4423, + "step": 13319 + }, + { + "epoch": 0.7458841975585172, + "grad_norm": 1.2828065156936646, + "learning_rate": 6.656e-05, + "loss": 0.3592, + "step": 13320 + }, + { + "epoch": 0.7459401948706462, + "grad_norm": 1.5055537223815918, + "learning_rate": 6.6565e-05, + "loss": 0.445, + "step": 13321 + }, + { + "epoch": 0.7459961921827752, + "grad_norm": 1.1748046875, + "learning_rate": 6.657e-05, + "loss": 0.3662, + "step": 13322 + }, + { + "epoch": 0.7460521894949043, + "grad_norm": 1.1813485622406006, + 
"learning_rate": 6.657499999999999e-05, + "loss": 0.3375, + "step": 13323 + }, + { + "epoch": 0.7461081868070333, + "grad_norm": 1.186800241470337, + "learning_rate": 6.658e-05, + "loss": 0.4368, + "step": 13324 + }, + { + "epoch": 0.7461641841191623, + "grad_norm": 1.5023611783981323, + "learning_rate": 6.658500000000001e-05, + "loss": 0.4546, + "step": 13325 + }, + { + "epoch": 0.7462201814312913, + "grad_norm": 1.4181703329086304, + "learning_rate": 6.659000000000001e-05, + "loss": 0.4254, + "step": 13326 + }, + { + "epoch": 0.7462761787434203, + "grad_norm": 1.2328901290893555, + "learning_rate": 6.659500000000001e-05, + "loss": 0.37, + "step": 13327 + }, + { + "epoch": 0.7463321760555494, + "grad_norm": 2.5455312728881836, + "learning_rate": 6.66e-05, + "loss": 0.5461, + "step": 13328 + }, + { + "epoch": 0.7463881733676784, + "grad_norm": 1.4440556764602661, + "learning_rate": 6.6605e-05, + "loss": 0.5065, + "step": 13329 + }, + { + "epoch": 0.7464441706798074, + "grad_norm": 1.3674674034118652, + "learning_rate": 6.661e-05, + "loss": 0.4871, + "step": 13330 + }, + { + "epoch": 0.7465001679919364, + "grad_norm": 1.402139663696289, + "learning_rate": 6.661500000000001e-05, + "loss": 0.4866, + "step": 13331 + }, + { + "epoch": 0.7465561653040654, + "grad_norm": 1.5644489526748657, + "learning_rate": 6.662000000000001e-05, + "loss": 0.4805, + "step": 13332 + }, + { + "epoch": 0.7466121626161945, + "grad_norm": 1.9279708862304688, + "learning_rate": 6.6625e-05, + "loss": 0.6546, + "step": 13333 + }, + { + "epoch": 0.7466681599283235, + "grad_norm": 1.2155046463012695, + "learning_rate": 6.663e-05, + "loss": 0.3641, + "step": 13334 + }, + { + "epoch": 0.7467241572404525, + "grad_norm": 1.3035589456558228, + "learning_rate": 6.6635e-05, + "loss": 0.3426, + "step": 13335 + }, + { + "epoch": 0.7467801545525815, + "grad_norm": 1.1661807298660278, + "learning_rate": 6.664e-05, + "loss": 0.4449, + "step": 13336 + }, + { + "epoch": 0.7468361518647105, + "grad_norm": 
1.3552536964416504, + "learning_rate": 6.6645e-05, + "loss": 0.3868, + "step": 13337 + }, + { + "epoch": 0.7468921491768395, + "grad_norm": 1.4642640352249146, + "learning_rate": 6.665000000000001e-05, + "loss": 0.3162, + "step": 13338 + }, + { + "epoch": 0.7469481464889686, + "grad_norm": 1.3704559803009033, + "learning_rate": 6.6655e-05, + "loss": 0.5632, + "step": 13339 + }, + { + "epoch": 0.7470041438010976, + "grad_norm": 1.122069239616394, + "learning_rate": 6.666e-05, + "loss": 0.5401, + "step": 13340 + }, + { + "epoch": 0.7470601411132266, + "grad_norm": 1.4768750667572021, + "learning_rate": 6.6665e-05, + "loss": 0.782, + "step": 13341 + }, + { + "epoch": 0.7471161384253556, + "grad_norm": 1.275857925415039, + "learning_rate": 6.667e-05, + "loss": 0.4492, + "step": 13342 + }, + { + "epoch": 0.7471721357374846, + "grad_norm": 1.7961218357086182, + "learning_rate": 6.6675e-05, + "loss": 0.4196, + "step": 13343 + }, + { + "epoch": 0.7472281330496137, + "grad_norm": 1.3244459629058838, + "learning_rate": 6.667999999999999e-05, + "loss": 0.3435, + "step": 13344 + }, + { + "epoch": 0.7472841303617427, + "grad_norm": 1.2896100282669067, + "learning_rate": 6.6685e-05, + "loss": 0.4248, + "step": 13345 + }, + { + "epoch": 0.7473401276738717, + "grad_norm": 1.4216582775115967, + "learning_rate": 6.669000000000001e-05, + "loss": 0.4556, + "step": 13346 + }, + { + "epoch": 0.7473961249860007, + "grad_norm": 1.4091801643371582, + "learning_rate": 6.669500000000001e-05, + "loss": 0.6151, + "step": 13347 + }, + { + "epoch": 0.7474521222981297, + "grad_norm": 1.1799017190933228, + "learning_rate": 6.670000000000001e-05, + "loss": 0.4962, + "step": 13348 + }, + { + "epoch": 0.7475081196102588, + "grad_norm": 1.501054286956787, + "learning_rate": 6.6705e-05, + "loss": 0.6032, + "step": 13349 + }, + { + "epoch": 0.7475641169223878, + "grad_norm": 1.2588993310928345, + "learning_rate": 6.671e-05, + "loss": 0.3253, + "step": 13350 + }, + { + "epoch": 0.7476201142345168, + 
"grad_norm": 1.1493582725524902, + "learning_rate": 6.6715e-05, + "loss": 0.3031, + "step": 13351 + }, + { + "epoch": 0.7476761115466458, + "grad_norm": 1.250685691833496, + "learning_rate": 6.672e-05, + "loss": 0.5526, + "step": 13352 + }, + { + "epoch": 0.7477321088587747, + "grad_norm": 1.6470725536346436, + "learning_rate": 6.672500000000001e-05, + "loss": 0.6043, + "step": 13353 + }, + { + "epoch": 0.7477881061709037, + "grad_norm": 1.3246068954467773, + "learning_rate": 6.673e-05, + "loss": 0.3791, + "step": 13354 + }, + { + "epoch": 0.7478441034830328, + "grad_norm": 1.1623932123184204, + "learning_rate": 6.6735e-05, + "loss": 0.4429, + "step": 13355 + }, + { + "epoch": 0.7479001007951618, + "grad_norm": 1.8283212184906006, + "learning_rate": 6.674e-05, + "loss": 0.7647, + "step": 13356 + }, + { + "epoch": 0.7479560981072908, + "grad_norm": 1.244064211845398, + "learning_rate": 6.6745e-05, + "loss": 0.5447, + "step": 13357 + }, + { + "epoch": 0.7480120954194198, + "grad_norm": 1.3712365627288818, + "learning_rate": 6.675e-05, + "loss": 0.5122, + "step": 13358 + }, + { + "epoch": 0.7480680927315488, + "grad_norm": 1.1770358085632324, + "learning_rate": 6.675500000000001e-05, + "loss": 0.3591, + "step": 13359 + }, + { + "epoch": 0.7481240900436779, + "grad_norm": 1.14165198802948, + "learning_rate": 6.676e-05, + "loss": 0.3627, + "step": 13360 + }, + { + "epoch": 0.7481800873558069, + "grad_norm": 1.3313946723937988, + "learning_rate": 6.6765e-05, + "loss": 0.4086, + "step": 13361 + }, + { + "epoch": 0.7482360846679359, + "grad_norm": 1.3255826234817505, + "learning_rate": 6.677e-05, + "loss": 0.4672, + "step": 13362 + }, + { + "epoch": 0.7482920819800649, + "grad_norm": 1.2458029985427856, + "learning_rate": 6.6775e-05, + "loss": 0.4414, + "step": 13363 + }, + { + "epoch": 0.7483480792921939, + "grad_norm": 1.2485015392303467, + "learning_rate": 6.678e-05, + "loss": 0.4208, + "step": 13364 + }, + { + "epoch": 0.748404076604323, + "grad_norm": 
1.3235641717910767, + "learning_rate": 6.6785e-05, + "loss": 0.5394, + "step": 13365 + }, + { + "epoch": 0.748460073916452, + "grad_norm": 1.4212559461593628, + "learning_rate": 6.679e-05, + "loss": 0.4528, + "step": 13366 + }, + { + "epoch": 0.748516071228581, + "grad_norm": 1.4086921215057373, + "learning_rate": 6.679500000000001e-05, + "loss": 0.524, + "step": 13367 + }, + { + "epoch": 0.74857206854071, + "grad_norm": 0.9677045941352844, + "learning_rate": 6.680000000000001e-05, + "loss": 0.3381, + "step": 13368 + }, + { + "epoch": 0.748628065852839, + "grad_norm": 1.6913973093032837, + "learning_rate": 6.680500000000001e-05, + "loss": 0.4431, + "step": 13369 + }, + { + "epoch": 0.748684063164968, + "grad_norm": 1.1492834091186523, + "learning_rate": 6.681e-05, + "loss": 0.3583, + "step": 13370 + }, + { + "epoch": 0.7487400604770971, + "grad_norm": 1.4888323545455933, + "learning_rate": 6.6815e-05, + "loss": 0.5411, + "step": 13371 + }, + { + "epoch": 0.7487960577892261, + "grad_norm": 1.3886725902557373, + "learning_rate": 6.682e-05, + "loss": 0.6303, + "step": 13372 + }, + { + "epoch": 0.7488520551013551, + "grad_norm": 1.3541219234466553, + "learning_rate": 6.6825e-05, + "loss": 0.348, + "step": 13373 + }, + { + "epoch": 0.7489080524134841, + "grad_norm": 1.4529330730438232, + "learning_rate": 6.683000000000001e-05, + "loss": 0.4437, + "step": 13374 + }, + { + "epoch": 0.7489640497256131, + "grad_norm": 2.124105930328369, + "learning_rate": 6.6835e-05, + "loss": 0.5812, + "step": 13375 + }, + { + "epoch": 0.7490200470377422, + "grad_norm": 1.2463494539260864, + "learning_rate": 6.684e-05, + "loss": 0.4733, + "step": 13376 + }, + { + "epoch": 0.7490760443498712, + "grad_norm": 1.0593266487121582, + "learning_rate": 6.6845e-05, + "loss": 0.3661, + "step": 13377 + }, + { + "epoch": 0.7491320416620002, + "grad_norm": 1.3837532997131348, + "learning_rate": 6.685e-05, + "loss": 0.4814, + "step": 13378 + }, + { + "epoch": 0.7491880389741292, + "grad_norm": 
2.01131010055542, + "learning_rate": 6.6855e-05, + "loss": 0.4869, + "step": 13379 + }, + { + "epoch": 0.7492440362862582, + "grad_norm": 1.162909746170044, + "learning_rate": 6.686000000000001e-05, + "loss": 0.4443, + "step": 13380 + }, + { + "epoch": 0.7493000335983873, + "grad_norm": 1.2567288875579834, + "learning_rate": 6.6865e-05, + "loss": 0.4941, + "step": 13381 + }, + { + "epoch": 0.7493560309105163, + "grad_norm": 1.3693965673446655, + "learning_rate": 6.687e-05, + "loss": 0.5171, + "step": 13382 + }, + { + "epoch": 0.7494120282226453, + "grad_norm": 1.385277509689331, + "learning_rate": 6.6875e-05, + "loss": 0.36, + "step": 13383 + }, + { + "epoch": 0.7494680255347743, + "grad_norm": 1.4657427072525024, + "learning_rate": 6.688e-05, + "loss": 0.3572, + "step": 13384 + }, + { + "epoch": 0.7495240228469033, + "grad_norm": 1.1794122457504272, + "learning_rate": 6.6885e-05, + "loss": 0.4101, + "step": 13385 + }, + { + "epoch": 0.7495800201590324, + "grad_norm": 1.3354967832565308, + "learning_rate": 6.689e-05, + "loss": 0.4576, + "step": 13386 + }, + { + "epoch": 0.7496360174711614, + "grad_norm": 1.1651242971420288, + "learning_rate": 6.6895e-05, + "loss": 0.3619, + "step": 13387 + }, + { + "epoch": 0.7496920147832904, + "grad_norm": 1.4015265703201294, + "learning_rate": 6.690000000000001e-05, + "loss": 0.4246, + "step": 13388 + }, + { + "epoch": 0.7497480120954194, + "grad_norm": 1.1027058362960815, + "learning_rate": 6.690500000000001e-05, + "loss": 0.2979, + "step": 13389 + }, + { + "epoch": 0.7498040094075484, + "grad_norm": 1.2594717741012573, + "learning_rate": 6.691000000000001e-05, + "loss": 0.4395, + "step": 13390 + }, + { + "epoch": 0.7498600067196775, + "grad_norm": 1.2497899532318115, + "learning_rate": 6.6915e-05, + "loss": 0.4734, + "step": 13391 + }, + { + "epoch": 0.7499160040318065, + "grad_norm": 1.5366617441177368, + "learning_rate": 6.692e-05, + "loss": 0.4336, + "step": 13392 + }, + { + "epoch": 0.7499720013439355, + "grad_norm": 
1.5138517618179321, + "learning_rate": 6.6925e-05, + "loss": 0.6008, + "step": 13393 + }, + { + "epoch": 0.7500279986560645, + "grad_norm": 1.1237903833389282, + "learning_rate": 6.693e-05, + "loss": 0.4135, + "step": 13394 + }, + { + "epoch": 0.7500839959681935, + "grad_norm": 1.1958673000335693, + "learning_rate": 6.693500000000001e-05, + "loss": 0.3744, + "step": 13395 + }, + { + "epoch": 0.7501399932803225, + "grad_norm": 1.316476821899414, + "learning_rate": 6.694e-05, + "loss": 0.4043, + "step": 13396 + }, + { + "epoch": 0.7501959905924516, + "grad_norm": 1.3259350061416626, + "learning_rate": 6.6945e-05, + "loss": 0.4395, + "step": 13397 + }, + { + "epoch": 0.7502519879045806, + "grad_norm": 1.565314769744873, + "learning_rate": 6.695e-05, + "loss": 0.4311, + "step": 13398 + }, + { + "epoch": 0.7503079852167096, + "grad_norm": 1.2138341665267944, + "learning_rate": 6.6955e-05, + "loss": 0.3549, + "step": 13399 + }, + { + "epoch": 0.7503639825288386, + "grad_norm": 1.5309661626815796, + "learning_rate": 6.696e-05, + "loss": 0.6146, + "step": 13400 + }, + { + "epoch": 0.7504199798409676, + "grad_norm": 1.2558754682540894, + "learning_rate": 6.6965e-05, + "loss": 0.4875, + "step": 13401 + }, + { + "epoch": 0.7504759771530967, + "grad_norm": 1.4507876634597778, + "learning_rate": 6.697e-05, + "loss": 0.6295, + "step": 13402 + }, + { + "epoch": 0.7505319744652257, + "grad_norm": 1.652795433998108, + "learning_rate": 6.6975e-05, + "loss": 0.3718, + "step": 13403 + }, + { + "epoch": 0.7505879717773547, + "grad_norm": 1.3381702899932861, + "learning_rate": 6.698e-05, + "loss": 0.4106, + "step": 13404 + }, + { + "epoch": 0.7506439690894837, + "grad_norm": 1.7910661697387695, + "learning_rate": 6.6985e-05, + "loss": 0.4129, + "step": 13405 + }, + { + "epoch": 0.7506999664016127, + "grad_norm": 1.3057547807693481, + "learning_rate": 6.699000000000001e-05, + "loss": 0.3802, + "step": 13406 + }, + { + "epoch": 0.7507559637137418, + "grad_norm": 1.3384629487991333, + 
"learning_rate": 6.6995e-05, + "loss": 0.5628, + "step": 13407 + }, + { + "epoch": 0.7508119610258708, + "grad_norm": 1.34722101688385, + "learning_rate": 6.7e-05, + "loss": 0.4655, + "step": 13408 + }, + { + "epoch": 0.7508679583379998, + "grad_norm": 1.3082317113876343, + "learning_rate": 6.700500000000001e-05, + "loss": 0.4847, + "step": 13409 + }, + { + "epoch": 0.7509239556501288, + "grad_norm": 1.2084414958953857, + "learning_rate": 6.701000000000001e-05, + "loss": 0.5166, + "step": 13410 + }, + { + "epoch": 0.7509799529622578, + "grad_norm": 1.056401252746582, + "learning_rate": 6.701500000000001e-05, + "loss": 0.4287, + "step": 13411 + }, + { + "epoch": 0.7510359502743869, + "grad_norm": 1.2451685667037964, + "learning_rate": 6.702e-05, + "loss": 0.3245, + "step": 13412 + }, + { + "epoch": 0.7510919475865159, + "grad_norm": 1.2550543546676636, + "learning_rate": 6.7025e-05, + "loss": 0.415, + "step": 13413 + }, + { + "epoch": 0.7511479448986449, + "grad_norm": 1.3785005807876587, + "learning_rate": 6.703e-05, + "loss": 0.352, + "step": 13414 + }, + { + "epoch": 0.7512039422107739, + "grad_norm": 1.3353605270385742, + "learning_rate": 6.7035e-05, + "loss": 0.445, + "step": 13415 + }, + { + "epoch": 0.7512599395229029, + "grad_norm": 1.0877524614334106, + "learning_rate": 6.704000000000001e-05, + "loss": 0.3922, + "step": 13416 + }, + { + "epoch": 0.751315936835032, + "grad_norm": 1.2942429780960083, + "learning_rate": 6.7045e-05, + "loss": 0.3311, + "step": 13417 + }, + { + "epoch": 0.751371934147161, + "grad_norm": 1.3884506225585938, + "learning_rate": 6.705e-05, + "loss": 0.4471, + "step": 13418 + }, + { + "epoch": 0.75142793145929, + "grad_norm": 1.247389554977417, + "learning_rate": 6.7055e-05, + "loss": 0.3752, + "step": 13419 + }, + { + "epoch": 0.751483928771419, + "grad_norm": 1.2123414278030396, + "learning_rate": 6.706e-05, + "loss": 0.4752, + "step": 13420 + }, + { + "epoch": 0.751539926083548, + "grad_norm": 1.7403959035873413, + 
"learning_rate": 6.7065e-05, + "loss": 0.5192, + "step": 13421 + }, + { + "epoch": 0.751595923395677, + "grad_norm": 1.4864706993103027, + "learning_rate": 6.706999999999999e-05, + "loss": 0.3862, + "step": 13422 + }, + { + "epoch": 0.7516519207078061, + "grad_norm": 1.4361765384674072, + "learning_rate": 6.7075e-05, + "loss": 0.4101, + "step": 13423 + }, + { + "epoch": 0.7517079180199351, + "grad_norm": 1.5567173957824707, + "learning_rate": 6.708e-05, + "loss": 0.5029, + "step": 13424 + }, + { + "epoch": 0.7517639153320641, + "grad_norm": 1.390423059463501, + "learning_rate": 6.7085e-05, + "loss": 0.3613, + "step": 13425 + }, + { + "epoch": 0.7518199126441931, + "grad_norm": 1.4552497863769531, + "learning_rate": 6.709000000000001e-05, + "loss": 0.7664, + "step": 13426 + }, + { + "epoch": 0.7518759099563221, + "grad_norm": 1.305458426475525, + "learning_rate": 6.709500000000001e-05, + "loss": 0.5114, + "step": 13427 + }, + { + "epoch": 0.7519319072684512, + "grad_norm": 1.2647517919540405, + "learning_rate": 6.71e-05, + "loss": 0.3717, + "step": 13428 + }, + { + "epoch": 0.7519879045805802, + "grad_norm": 1.1407110691070557, + "learning_rate": 6.7105e-05, + "loss": 0.3953, + "step": 13429 + }, + { + "epoch": 0.7520439018927092, + "grad_norm": 1.303435206413269, + "learning_rate": 6.711e-05, + "loss": 0.4151, + "step": 13430 + }, + { + "epoch": 0.7520998992048382, + "grad_norm": 1.5761874914169312, + "learning_rate": 6.711500000000001e-05, + "loss": 0.3881, + "step": 13431 + }, + { + "epoch": 0.7521558965169672, + "grad_norm": 1.368018388748169, + "learning_rate": 6.712000000000001e-05, + "loss": 0.3724, + "step": 13432 + }, + { + "epoch": 0.7522118938290963, + "grad_norm": 1.412675142288208, + "learning_rate": 6.7125e-05, + "loss": 0.5171, + "step": 13433 + }, + { + "epoch": 0.7522678911412253, + "grad_norm": 1.142570972442627, + "learning_rate": 6.713e-05, + "loss": 0.3583, + "step": 13434 + }, + { + "epoch": 0.7523238884533542, + "grad_norm": 
1.1345243453979492, + "learning_rate": 6.7135e-05, + "loss": 0.3614, + "step": 13435 + }, + { + "epoch": 0.7523798857654832, + "grad_norm": 1.4500483274459839, + "learning_rate": 6.714e-05, + "loss": 0.4051, + "step": 13436 + }, + { + "epoch": 0.7524358830776122, + "grad_norm": 1.1398577690124512, + "learning_rate": 6.714500000000001e-05, + "loss": 0.3196, + "step": 13437 + }, + { + "epoch": 0.7524918803897412, + "grad_norm": 1.4100042581558228, + "learning_rate": 6.715e-05, + "loss": 0.4972, + "step": 13438 + }, + { + "epoch": 0.7525478777018703, + "grad_norm": 1.1147397756576538, + "learning_rate": 6.7155e-05, + "loss": 0.4089, + "step": 13439 + }, + { + "epoch": 0.7526038750139993, + "grad_norm": 1.1418505907058716, + "learning_rate": 6.716e-05, + "loss": 0.3055, + "step": 13440 + }, + { + "epoch": 0.7526598723261283, + "grad_norm": 1.6036012172698975, + "learning_rate": 6.7165e-05, + "loss": 0.476, + "step": 13441 + }, + { + "epoch": 0.7527158696382573, + "grad_norm": 1.084713101387024, + "learning_rate": 6.717e-05, + "loss": 0.3097, + "step": 13442 + }, + { + "epoch": 0.7527718669503863, + "grad_norm": 1.3821686506271362, + "learning_rate": 6.717499999999999e-05, + "loss": 0.5434, + "step": 13443 + }, + { + "epoch": 0.7528278642625154, + "grad_norm": 1.393151879310608, + "learning_rate": 6.718e-05, + "loss": 0.4664, + "step": 13444 + }, + { + "epoch": 0.7528838615746444, + "grad_norm": 1.1680833101272583, + "learning_rate": 6.7185e-05, + "loss": 0.4461, + "step": 13445 + }, + { + "epoch": 0.7529398588867734, + "grad_norm": 1.2255024909973145, + "learning_rate": 6.719000000000001e-05, + "loss": 0.4013, + "step": 13446 + }, + { + "epoch": 0.7529958561989024, + "grad_norm": 1.1132417917251587, + "learning_rate": 6.719500000000001e-05, + "loss": 0.3616, + "step": 13447 + }, + { + "epoch": 0.7530518535110314, + "grad_norm": 1.598866581916809, + "learning_rate": 6.720000000000001e-05, + "loss": 0.4688, + "step": 13448 + }, + { + "epoch": 0.7531078508231605, + 
"grad_norm": 1.1186206340789795, + "learning_rate": 6.7205e-05, + "loss": 0.3437, + "step": 13449 + }, + { + "epoch": 0.7531638481352895, + "grad_norm": 1.6304622888565063, + "learning_rate": 6.721e-05, + "loss": 0.6735, + "step": 13450 + }, + { + "epoch": 0.7532198454474185, + "grad_norm": 1.4465316534042358, + "learning_rate": 6.7215e-05, + "loss": 0.4097, + "step": 13451 + }, + { + "epoch": 0.7532758427595475, + "grad_norm": 1.664799451828003, + "learning_rate": 6.722000000000001e-05, + "loss": 0.4707, + "step": 13452 + }, + { + "epoch": 0.7533318400716765, + "grad_norm": 1.2107914686203003, + "learning_rate": 6.722500000000001e-05, + "loss": 0.3472, + "step": 13453 + }, + { + "epoch": 0.7533878373838055, + "grad_norm": 1.3441057205200195, + "learning_rate": 6.723e-05, + "loss": 0.4359, + "step": 13454 + }, + { + "epoch": 0.7534438346959346, + "grad_norm": 1.2599084377288818, + "learning_rate": 6.7235e-05, + "loss": 0.4591, + "step": 13455 + }, + { + "epoch": 0.7534998320080636, + "grad_norm": 1.185228705406189, + "learning_rate": 6.724e-05, + "loss": 0.3497, + "step": 13456 + }, + { + "epoch": 0.7535558293201926, + "grad_norm": 1.2232575416564941, + "learning_rate": 6.7245e-05, + "loss": 0.4625, + "step": 13457 + }, + { + "epoch": 0.7536118266323216, + "grad_norm": 1.4489551782608032, + "learning_rate": 6.725000000000001e-05, + "loss": 0.5915, + "step": 13458 + }, + { + "epoch": 0.7536678239444506, + "grad_norm": 1.467835783958435, + "learning_rate": 6.7255e-05, + "loss": 0.6399, + "step": 13459 + }, + { + "epoch": 0.7537238212565797, + "grad_norm": 1.36167311668396, + "learning_rate": 6.726e-05, + "loss": 0.4153, + "step": 13460 + }, + { + "epoch": 0.7537798185687087, + "grad_norm": 1.2901496887207031, + "learning_rate": 6.7265e-05, + "loss": 0.3864, + "step": 13461 + }, + { + "epoch": 0.7538358158808377, + "grad_norm": 1.549117922782898, + "learning_rate": 6.727e-05, + "loss": 0.4436, + "step": 13462 + }, + { + "epoch": 0.7538918131929667, + "grad_norm": 
1.1801596879959106, + "learning_rate": 6.7275e-05, + "loss": 0.3789, + "step": 13463 + }, + { + "epoch": 0.7539478105050957, + "grad_norm": 1.5706390142440796, + "learning_rate": 6.727999999999999e-05, + "loss": 0.3809, + "step": 13464 + }, + { + "epoch": 0.7540038078172248, + "grad_norm": 1.2169467210769653, + "learning_rate": 6.7285e-05, + "loss": 0.4053, + "step": 13465 + }, + { + "epoch": 0.7540598051293538, + "grad_norm": 1.3880095481872559, + "learning_rate": 6.729000000000001e-05, + "loss": 0.5238, + "step": 13466 + }, + { + "epoch": 0.7541158024414828, + "grad_norm": 1.2271162271499634, + "learning_rate": 6.729500000000001e-05, + "loss": 0.386, + "step": 13467 + }, + { + "epoch": 0.7541717997536118, + "grad_norm": 1.463864803314209, + "learning_rate": 6.730000000000001e-05, + "loss": 0.4024, + "step": 13468 + }, + { + "epoch": 0.7542277970657408, + "grad_norm": 1.4349846839904785, + "learning_rate": 6.730500000000001e-05, + "loss": 0.5111, + "step": 13469 + }, + { + "epoch": 0.7542837943778699, + "grad_norm": 1.2246228456497192, + "learning_rate": 6.731e-05, + "loss": 0.3641, + "step": 13470 + }, + { + "epoch": 0.7543397916899989, + "grad_norm": 1.2935744524002075, + "learning_rate": 6.7315e-05, + "loss": 0.5459, + "step": 13471 + }, + { + "epoch": 0.7543957890021279, + "grad_norm": 1.280429482460022, + "learning_rate": 6.732e-05, + "loss": 0.4411, + "step": 13472 + }, + { + "epoch": 0.7544517863142569, + "grad_norm": 1.3651620149612427, + "learning_rate": 6.732500000000001e-05, + "loss": 0.4289, + "step": 13473 + }, + { + "epoch": 0.7545077836263859, + "grad_norm": 1.0948057174682617, + "learning_rate": 6.733000000000001e-05, + "loss": 0.3642, + "step": 13474 + }, + { + "epoch": 0.754563780938515, + "grad_norm": 1.3790465593338013, + "learning_rate": 6.7335e-05, + "loss": 0.4769, + "step": 13475 + }, + { + "epoch": 0.754619778250644, + "grad_norm": 1.553052544593811, + "learning_rate": 6.734e-05, + "loss": 0.5529, + "step": 13476 + }, + { + "epoch": 
0.754675775562773, + "grad_norm": 1.7630292177200317, + "learning_rate": 6.7345e-05, + "loss": 0.6346, + "step": 13477 + }, + { + "epoch": 0.754731772874902, + "grad_norm": 1.2925057411193848, + "learning_rate": 6.735e-05, + "loss": 0.4822, + "step": 13478 + }, + { + "epoch": 0.754787770187031, + "grad_norm": 1.9060975313186646, + "learning_rate": 6.735500000000001e-05, + "loss": 0.4757, + "step": 13479 + }, + { + "epoch": 0.75484376749916, + "grad_norm": 1.1911603212356567, + "learning_rate": 6.736e-05, + "loss": 0.4558, + "step": 13480 + }, + { + "epoch": 0.7548997648112891, + "grad_norm": 1.3044967651367188, + "learning_rate": 6.7365e-05, + "loss": 0.4002, + "step": 13481 + }, + { + "epoch": 0.7549557621234181, + "grad_norm": 1.0621124505996704, + "learning_rate": 6.737e-05, + "loss": 0.3953, + "step": 13482 + }, + { + "epoch": 0.7550117594355471, + "grad_norm": 1.3665282726287842, + "learning_rate": 6.7375e-05, + "loss": 0.4164, + "step": 13483 + }, + { + "epoch": 0.7550677567476761, + "grad_norm": 1.1974210739135742, + "learning_rate": 6.738e-05, + "loss": 0.4412, + "step": 13484 + }, + { + "epoch": 0.7551237540598051, + "grad_norm": 1.2153085470199585, + "learning_rate": 6.738499999999999e-05, + "loss": 0.3828, + "step": 13485 + }, + { + "epoch": 0.7551797513719342, + "grad_norm": 1.622483730316162, + "learning_rate": 6.739e-05, + "loss": 0.4713, + "step": 13486 + }, + { + "epoch": 0.7552357486840632, + "grad_norm": 1.2181439399719238, + "learning_rate": 6.739500000000001e-05, + "loss": 0.4743, + "step": 13487 + }, + { + "epoch": 0.7552917459961922, + "grad_norm": 1.6612908840179443, + "learning_rate": 6.740000000000001e-05, + "loss": 0.3973, + "step": 13488 + }, + { + "epoch": 0.7553477433083212, + "grad_norm": 1.333847165107727, + "learning_rate": 6.740500000000001e-05, + "loss": 0.3932, + "step": 13489 + }, + { + "epoch": 0.7554037406204502, + "grad_norm": 1.3070459365844727, + "learning_rate": 6.741000000000001e-05, + "loss": 0.431, + "step": 13490 + }, + 
{ + "epoch": 0.7554597379325793, + "grad_norm": 1.1635133028030396, + "learning_rate": 6.7415e-05, + "loss": 0.3468, + "step": 13491 + }, + { + "epoch": 0.7555157352447083, + "grad_norm": 1.4309470653533936, + "learning_rate": 6.742e-05, + "loss": 0.4128, + "step": 13492 + }, + { + "epoch": 0.7555717325568373, + "grad_norm": 1.3312102556228638, + "learning_rate": 6.7425e-05, + "loss": 0.4404, + "step": 13493 + }, + { + "epoch": 0.7556277298689663, + "grad_norm": 1.4434672594070435, + "learning_rate": 6.743000000000001e-05, + "loss": 0.5623, + "step": 13494 + }, + { + "epoch": 0.7556837271810953, + "grad_norm": 1.1383618116378784, + "learning_rate": 6.743500000000001e-05, + "loss": 0.3618, + "step": 13495 + }, + { + "epoch": 0.7557397244932244, + "grad_norm": 1.1406939029693604, + "learning_rate": 6.744e-05, + "loss": 0.3949, + "step": 13496 + }, + { + "epoch": 0.7557957218053534, + "grad_norm": 1.3334400653839111, + "learning_rate": 6.7445e-05, + "loss": 0.4681, + "step": 13497 + }, + { + "epoch": 0.7558517191174824, + "grad_norm": 1.2865227460861206, + "learning_rate": 6.745e-05, + "loss": 0.4014, + "step": 13498 + }, + { + "epoch": 0.7559077164296114, + "grad_norm": 1.1794874668121338, + "learning_rate": 6.7455e-05, + "loss": 0.4435, + "step": 13499 + }, + { + "epoch": 0.7559637137417404, + "grad_norm": 1.5550849437713623, + "learning_rate": 6.746e-05, + "loss": 0.6375, + "step": 13500 + }, + { + "epoch": 0.7560197110538694, + "grad_norm": 1.6652644872665405, + "learning_rate": 6.7465e-05, + "loss": 0.5682, + "step": 13501 + }, + { + "epoch": 0.7560757083659985, + "grad_norm": 1.117836594581604, + "learning_rate": 6.747e-05, + "loss": 0.3993, + "step": 13502 + }, + { + "epoch": 0.7561317056781275, + "grad_norm": 1.3415014743804932, + "learning_rate": 6.7475e-05, + "loss": 0.4238, + "step": 13503 + }, + { + "epoch": 0.7561877029902565, + "grad_norm": 1.3984713554382324, + "learning_rate": 6.748e-05, + "loss": 0.4271, + "step": 13504 + }, + { + "epoch": 
0.7562437003023855, + "grad_norm": 1.6960017681121826, + "learning_rate": 6.7485e-05, + "loss": 0.3231, + "step": 13505 + }, + { + "epoch": 0.7562996976145145, + "grad_norm": 1.6180530786514282, + "learning_rate": 6.749e-05, + "loss": 0.477, + "step": 13506 + }, + { + "epoch": 0.7563556949266436, + "grad_norm": 1.4171092510223389, + "learning_rate": 6.7495e-05, + "loss": 0.4831, + "step": 13507 + }, + { + "epoch": 0.7564116922387726, + "grad_norm": 1.1733931303024292, + "learning_rate": 6.750000000000001e-05, + "loss": 0.4208, + "step": 13508 + }, + { + "epoch": 0.7564676895509016, + "grad_norm": 1.4384032487869263, + "learning_rate": 6.750500000000001e-05, + "loss": 0.5576, + "step": 13509 + }, + { + "epoch": 0.7565236868630306, + "grad_norm": 1.3238472938537598, + "learning_rate": 6.751000000000001e-05, + "loss": 0.4113, + "step": 13510 + }, + { + "epoch": 0.7565796841751596, + "grad_norm": 1.176927924156189, + "learning_rate": 6.7515e-05, + "loss": 0.3882, + "step": 13511 + }, + { + "epoch": 0.7566356814872887, + "grad_norm": 1.1933873891830444, + "learning_rate": 6.752e-05, + "loss": 0.496, + "step": 13512 + }, + { + "epoch": 0.7566916787994177, + "grad_norm": 1.1905460357666016, + "learning_rate": 6.7525e-05, + "loss": 0.3585, + "step": 13513 + }, + { + "epoch": 0.7567476761115467, + "grad_norm": 1.154370665550232, + "learning_rate": 6.753e-05, + "loss": 0.3804, + "step": 13514 + }, + { + "epoch": 0.7568036734236757, + "grad_norm": 1.5908572673797607, + "learning_rate": 6.753500000000001e-05, + "loss": 0.5003, + "step": 13515 + }, + { + "epoch": 0.7568596707358047, + "grad_norm": 2.966088056564331, + "learning_rate": 6.754000000000001e-05, + "loss": 0.4471, + "step": 13516 + }, + { + "epoch": 0.7569156680479338, + "grad_norm": 1.2605220079421997, + "learning_rate": 6.7545e-05, + "loss": 0.3171, + "step": 13517 + }, + { + "epoch": 0.7569716653600627, + "grad_norm": 1.172694444656372, + "learning_rate": 6.755e-05, + "loss": 0.4792, + "step": 13518 + }, + { + 
"epoch": 0.7570276626721917, + "grad_norm": 1.4007148742675781, + "learning_rate": 6.7555e-05, + "loss": 0.5274, + "step": 13519 + }, + { + "epoch": 0.7570836599843207, + "grad_norm": 1.4705430269241333, + "learning_rate": 6.756e-05, + "loss": 0.5258, + "step": 13520 + }, + { + "epoch": 0.7571396572964497, + "grad_norm": 1.2992222309112549, + "learning_rate": 6.7565e-05, + "loss": 0.4533, + "step": 13521 + }, + { + "epoch": 0.7571956546085787, + "grad_norm": 1.5344949960708618, + "learning_rate": 6.757e-05, + "loss": 0.5269, + "step": 13522 + }, + { + "epoch": 0.7572516519207078, + "grad_norm": 1.204139232635498, + "learning_rate": 6.7575e-05, + "loss": 0.5333, + "step": 13523 + }, + { + "epoch": 0.7573076492328368, + "grad_norm": 1.3827623128890991, + "learning_rate": 6.758e-05, + "loss": 0.548, + "step": 13524 + }, + { + "epoch": 0.7573636465449658, + "grad_norm": 1.5382299423217773, + "learning_rate": 6.7585e-05, + "loss": 0.5404, + "step": 13525 + }, + { + "epoch": 0.7574196438570948, + "grad_norm": 1.6641407012939453, + "learning_rate": 6.759e-05, + "loss": 0.6407, + "step": 13526 + }, + { + "epoch": 0.7574756411692238, + "grad_norm": 1.459222674369812, + "learning_rate": 6.7595e-05, + "loss": 0.4695, + "step": 13527 + }, + { + "epoch": 0.7575316384813529, + "grad_norm": 1.2386016845703125, + "learning_rate": 6.76e-05, + "loss": 0.5066, + "step": 13528 + }, + { + "epoch": 0.7575876357934819, + "grad_norm": 1.3396717309951782, + "learning_rate": 6.7605e-05, + "loss": 0.3974, + "step": 13529 + }, + { + "epoch": 0.7576436331056109, + "grad_norm": 3.966275691986084, + "learning_rate": 6.761000000000001e-05, + "loss": 0.4284, + "step": 13530 + }, + { + "epoch": 0.7576996304177399, + "grad_norm": 1.2423146963119507, + "learning_rate": 6.761500000000001e-05, + "loss": 0.4615, + "step": 13531 + }, + { + "epoch": 0.7577556277298689, + "grad_norm": 1.0985716581344604, + "learning_rate": 6.762e-05, + "loss": 0.3827, + "step": 13532 + }, + { + "epoch": 0.757811625041998, 
+ "grad_norm": 1.3467158079147339, + "learning_rate": 6.7625e-05, + "loss": 0.4836, + "step": 13533 + }, + { + "epoch": 0.757867622354127, + "grad_norm": 1.2831975221633911, + "learning_rate": 6.763e-05, + "loss": 0.4462, + "step": 13534 + }, + { + "epoch": 0.757923619666256, + "grad_norm": 1.3927476406097412, + "learning_rate": 6.7635e-05, + "loss": 0.4237, + "step": 13535 + }, + { + "epoch": 0.757979616978385, + "grad_norm": 1.3788520097732544, + "learning_rate": 6.764000000000001e-05, + "loss": 0.4037, + "step": 13536 + }, + { + "epoch": 0.758035614290514, + "grad_norm": 1.3232665061950684, + "learning_rate": 6.764500000000001e-05, + "loss": 0.5126, + "step": 13537 + }, + { + "epoch": 0.758091611602643, + "grad_norm": 1.4086594581604004, + "learning_rate": 6.765e-05, + "loss": 0.4718, + "step": 13538 + }, + { + "epoch": 0.7581476089147721, + "grad_norm": 1.207565188407898, + "learning_rate": 6.7655e-05, + "loss": 0.3503, + "step": 13539 + }, + { + "epoch": 0.7582036062269011, + "grad_norm": 1.3686875104904175, + "learning_rate": 6.766e-05, + "loss": 0.3985, + "step": 13540 + }, + { + "epoch": 0.7582596035390301, + "grad_norm": 1.418041706085205, + "learning_rate": 6.7665e-05, + "loss": 0.4116, + "step": 13541 + }, + { + "epoch": 0.7583156008511591, + "grad_norm": 1.3729579448699951, + "learning_rate": 6.767e-05, + "loss": 0.5298, + "step": 13542 + }, + { + "epoch": 0.7583715981632881, + "grad_norm": 1.4995180368423462, + "learning_rate": 6.7675e-05, + "loss": 0.5458, + "step": 13543 + }, + { + "epoch": 0.7584275954754172, + "grad_norm": 1.378974199295044, + "learning_rate": 6.768e-05, + "loss": 0.3962, + "step": 13544 + }, + { + "epoch": 0.7584835927875462, + "grad_norm": 1.2189967632293701, + "learning_rate": 6.7685e-05, + "loss": 0.5196, + "step": 13545 + }, + { + "epoch": 0.7585395900996752, + "grad_norm": 1.2721692323684692, + "learning_rate": 6.769e-05, + "loss": 0.38, + "step": 13546 + }, + { + "epoch": 0.7585955874118042, + "grad_norm": 
1.3236334323883057, + "learning_rate": 6.769500000000001e-05, + "loss": 0.4855, + "step": 13547 + }, + { + "epoch": 0.7586515847239332, + "grad_norm": 1.087719202041626, + "learning_rate": 6.77e-05, + "loss": 0.3678, + "step": 13548 + }, + { + "epoch": 0.7587075820360623, + "grad_norm": 1.3718489408493042, + "learning_rate": 6.7705e-05, + "loss": 0.4305, + "step": 13549 + }, + { + "epoch": 0.7587635793481913, + "grad_norm": 1.2739298343658447, + "learning_rate": 6.771e-05, + "loss": 0.4258, + "step": 13550 + }, + { + "epoch": 0.7588195766603203, + "grad_norm": 1.6008694171905518, + "learning_rate": 6.771500000000001e-05, + "loss": 0.5363, + "step": 13551 + }, + { + "epoch": 0.7588755739724493, + "grad_norm": 1.3075045347213745, + "learning_rate": 6.772000000000001e-05, + "loss": 0.4086, + "step": 13552 + }, + { + "epoch": 0.7589315712845783, + "grad_norm": 1.3609967231750488, + "learning_rate": 6.7725e-05, + "loss": 0.4109, + "step": 13553 + }, + { + "epoch": 0.7589875685967074, + "grad_norm": 1.360772967338562, + "learning_rate": 6.773e-05, + "loss": 0.4327, + "step": 13554 + }, + { + "epoch": 0.7590435659088364, + "grad_norm": 1.3756797313690186, + "learning_rate": 6.7735e-05, + "loss": 0.4837, + "step": 13555 + }, + { + "epoch": 0.7590995632209654, + "grad_norm": 1.610328197479248, + "learning_rate": 6.774e-05, + "loss": 0.6, + "step": 13556 + }, + { + "epoch": 0.7591555605330944, + "grad_norm": 1.6325621604919434, + "learning_rate": 6.774500000000001e-05, + "loss": 0.5773, + "step": 13557 + }, + { + "epoch": 0.7592115578452234, + "grad_norm": 1.2873172760009766, + "learning_rate": 6.775000000000001e-05, + "loss": 0.4575, + "step": 13558 + }, + { + "epoch": 0.7592675551573524, + "grad_norm": 1.4128810167312622, + "learning_rate": 6.7755e-05, + "loss": 0.3474, + "step": 13559 + }, + { + "epoch": 0.7593235524694815, + "grad_norm": 1.3517918586730957, + "learning_rate": 6.776e-05, + "loss": 0.4743, + "step": 13560 + }, + { + "epoch": 0.7593795497816105, + 
"grad_norm": 1.365537166595459, + "learning_rate": 6.7765e-05, + "loss": 0.5071, + "step": 13561 + }, + { + "epoch": 0.7594355470937395, + "grad_norm": 1.3406610488891602, + "learning_rate": 6.777e-05, + "loss": 0.3936, + "step": 13562 + }, + { + "epoch": 0.7594915444058685, + "grad_norm": 1.4251781702041626, + "learning_rate": 6.7775e-05, + "loss": 0.5897, + "step": 13563 + }, + { + "epoch": 0.7595475417179975, + "grad_norm": 1.5039125680923462, + "learning_rate": 6.778e-05, + "loss": 0.5221, + "step": 13564 + }, + { + "epoch": 0.7596035390301266, + "grad_norm": 1.3651732206344604, + "learning_rate": 6.7785e-05, + "loss": 0.4698, + "step": 13565 + }, + { + "epoch": 0.7596595363422556, + "grad_norm": 1.6269944906234741, + "learning_rate": 6.779e-05, + "loss": 0.5133, + "step": 13566 + }, + { + "epoch": 0.7597155336543846, + "grad_norm": 1.255621075630188, + "learning_rate": 6.779500000000001e-05, + "loss": 0.4111, + "step": 13567 + }, + { + "epoch": 0.7597715309665136, + "grad_norm": 1.2345309257507324, + "learning_rate": 6.780000000000001e-05, + "loss": 0.3922, + "step": 13568 + }, + { + "epoch": 0.7598275282786426, + "grad_norm": 1.3083750009536743, + "learning_rate": 6.7805e-05, + "loss": 0.421, + "step": 13569 + }, + { + "epoch": 0.7598835255907717, + "grad_norm": 1.4425714015960693, + "learning_rate": 6.781e-05, + "loss": 0.7085, + "step": 13570 + }, + { + "epoch": 0.7599395229029007, + "grad_norm": 1.5089733600616455, + "learning_rate": 6.7815e-05, + "loss": 0.4205, + "step": 13571 + }, + { + "epoch": 0.7599955202150297, + "grad_norm": 1.9460489749908447, + "learning_rate": 6.782000000000001e-05, + "loss": 0.5807, + "step": 13572 + }, + { + "epoch": 0.7600515175271587, + "grad_norm": 1.3572713136672974, + "learning_rate": 6.782500000000001e-05, + "loss": 0.551, + "step": 13573 + }, + { + "epoch": 0.7601075148392877, + "grad_norm": 1.2764432430267334, + "learning_rate": 6.783e-05, + "loss": 0.5627, + "step": 13574 + }, + { + "epoch": 0.7601635121514168, + 
"grad_norm": 1.253567099571228, + "learning_rate": 6.7835e-05, + "loss": 0.3933, + "step": 13575 + }, + { + "epoch": 0.7602195094635458, + "grad_norm": 1.4845340251922607, + "learning_rate": 6.784e-05, + "loss": 0.4841, + "step": 13576 + }, + { + "epoch": 0.7602755067756748, + "grad_norm": 1.1086807250976562, + "learning_rate": 6.7845e-05, + "loss": 0.3112, + "step": 13577 + }, + { + "epoch": 0.7603315040878038, + "grad_norm": 1.3579661846160889, + "learning_rate": 6.785e-05, + "loss": 0.4462, + "step": 13578 + }, + { + "epoch": 0.7603875013999328, + "grad_norm": 1.42841374874115, + "learning_rate": 6.785500000000001e-05, + "loss": 0.4167, + "step": 13579 + }, + { + "epoch": 0.7604434987120618, + "grad_norm": 1.4315341711044312, + "learning_rate": 6.786e-05, + "loss": 0.4883, + "step": 13580 + }, + { + "epoch": 0.7604994960241909, + "grad_norm": 1.3810499906539917, + "learning_rate": 6.7865e-05, + "loss": 0.5526, + "step": 13581 + }, + { + "epoch": 0.7605554933363199, + "grad_norm": 1.5452520847320557, + "learning_rate": 6.787e-05, + "loss": 0.294, + "step": 13582 + }, + { + "epoch": 0.7606114906484489, + "grad_norm": 1.2834396362304688, + "learning_rate": 6.7875e-05, + "loss": 0.5102, + "step": 13583 + }, + { + "epoch": 0.7606674879605779, + "grad_norm": 1.6737674474716187, + "learning_rate": 6.788e-05, + "loss": 0.6717, + "step": 13584 + }, + { + "epoch": 0.760723485272707, + "grad_norm": 1.3482942581176758, + "learning_rate": 6.7885e-05, + "loss": 0.4505, + "step": 13585 + }, + { + "epoch": 0.760779482584836, + "grad_norm": 1.315356731414795, + "learning_rate": 6.789e-05, + "loss": 0.345, + "step": 13586 + }, + { + "epoch": 0.760835479896965, + "grad_norm": 1.2866826057434082, + "learning_rate": 6.789500000000001e-05, + "loss": 0.3925, + "step": 13587 + }, + { + "epoch": 0.760891477209094, + "grad_norm": 1.3313685655593872, + "learning_rate": 6.790000000000001e-05, + "loss": 0.4995, + "step": 13588 + }, + { + "epoch": 0.760947474521223, + "grad_norm": 
1.3239092826843262, + "learning_rate": 6.790500000000001e-05, + "loss": 0.5177, + "step": 13589 + }, + { + "epoch": 0.761003471833352, + "grad_norm": 1.577412724494934, + "learning_rate": 6.791e-05, + "loss": 0.6218, + "step": 13590 + }, + { + "epoch": 0.7610594691454811, + "grad_norm": 1.3489168882369995, + "learning_rate": 6.7915e-05, + "loss": 0.5704, + "step": 13591 + }, + { + "epoch": 0.7611154664576101, + "grad_norm": 1.1966787576675415, + "learning_rate": 6.792e-05, + "loss": 0.4038, + "step": 13592 + }, + { + "epoch": 0.7611714637697391, + "grad_norm": 1.236101746559143, + "learning_rate": 6.792500000000001e-05, + "loss": 0.392, + "step": 13593 + }, + { + "epoch": 0.7612274610818681, + "grad_norm": 1.2937026023864746, + "learning_rate": 6.793000000000001e-05, + "loss": 0.39, + "step": 13594 + }, + { + "epoch": 0.7612834583939971, + "grad_norm": 1.1674257516860962, + "learning_rate": 6.7935e-05, + "loss": 0.3888, + "step": 13595 + }, + { + "epoch": 0.7613394557061262, + "grad_norm": 1.514306664466858, + "learning_rate": 6.794e-05, + "loss": 0.6035, + "step": 13596 + }, + { + "epoch": 0.7613954530182552, + "grad_norm": 1.2262016534805298, + "learning_rate": 6.7945e-05, + "loss": 0.3332, + "step": 13597 + }, + { + "epoch": 0.7614514503303842, + "grad_norm": 1.750225305557251, + "learning_rate": 6.795e-05, + "loss": 0.4764, + "step": 13598 + }, + { + "epoch": 0.7615074476425132, + "grad_norm": 1.3644682168960571, + "learning_rate": 6.7955e-05, + "loss": 0.5144, + "step": 13599 + }, + { + "epoch": 0.7615634449546422, + "grad_norm": 1.456863522529602, + "learning_rate": 6.796e-05, + "loss": 0.5256, + "step": 13600 + }, + { + "epoch": 0.7616194422667711, + "grad_norm": 0.9988186359405518, + "learning_rate": 6.7965e-05, + "loss": 0.2737, + "step": 13601 + }, + { + "epoch": 0.7616754395789002, + "grad_norm": 1.4243215322494507, + "learning_rate": 6.797e-05, + "loss": 0.4492, + "step": 13602 + }, + { + "epoch": 0.7617314368910292, + "grad_norm": 1.331834316253662, + 
"learning_rate": 6.7975e-05, + "loss": 0.3962, + "step": 13603 + }, + { + "epoch": 0.7617874342031582, + "grad_norm": 1.409032940864563, + "learning_rate": 6.798e-05, + "loss": 0.4875, + "step": 13604 + }, + { + "epoch": 0.7618434315152872, + "grad_norm": 1.417380452156067, + "learning_rate": 6.7985e-05, + "loss": 0.4, + "step": 13605 + }, + { + "epoch": 0.7618994288274162, + "grad_norm": 1.4346762895584106, + "learning_rate": 6.799e-05, + "loss": 0.3644, + "step": 13606 + }, + { + "epoch": 0.7619554261395453, + "grad_norm": 1.5617364645004272, + "learning_rate": 6.7995e-05, + "loss": 0.6251, + "step": 13607 + }, + { + "epoch": 0.7620114234516743, + "grad_norm": 1.664028525352478, + "learning_rate": 6.800000000000001e-05, + "loss": 0.4658, + "step": 13608 + }, + { + "epoch": 0.7620674207638033, + "grad_norm": 1.2303657531738281, + "learning_rate": 6.800500000000001e-05, + "loss": 0.3691, + "step": 13609 + }, + { + "epoch": 0.7621234180759323, + "grad_norm": 1.4195746183395386, + "learning_rate": 6.801000000000001e-05, + "loss": 0.4241, + "step": 13610 + }, + { + "epoch": 0.7621794153880613, + "grad_norm": 1.0718209743499756, + "learning_rate": 6.8015e-05, + "loss": 0.4468, + "step": 13611 + }, + { + "epoch": 0.7622354127001904, + "grad_norm": 1.1220693588256836, + "learning_rate": 6.802e-05, + "loss": 0.3926, + "step": 13612 + }, + { + "epoch": 0.7622914100123194, + "grad_norm": 1.277579426765442, + "learning_rate": 6.8025e-05, + "loss": 0.3934, + "step": 13613 + }, + { + "epoch": 0.7623474073244484, + "grad_norm": 1.3952373266220093, + "learning_rate": 6.803000000000001e-05, + "loss": 0.5871, + "step": 13614 + }, + { + "epoch": 0.7624034046365774, + "grad_norm": 1.304663062095642, + "learning_rate": 6.803500000000001e-05, + "loss": 0.4321, + "step": 13615 + }, + { + "epoch": 0.7624594019487064, + "grad_norm": 1.3302100896835327, + "learning_rate": 6.804e-05, + "loss": 0.4849, + "step": 13616 + }, + { + "epoch": 0.7625153992608354, + "grad_norm": 
1.1536314487457275, + "learning_rate": 6.8045e-05, + "loss": 0.4198, + "step": 13617 + }, + { + "epoch": 0.7625713965729645, + "grad_norm": 1.2717252969741821, + "learning_rate": 6.805e-05, + "loss": 0.4946, + "step": 13618 + }, + { + "epoch": 0.7626273938850935, + "grad_norm": 1.3787095546722412, + "learning_rate": 6.8055e-05, + "loss": 0.5078, + "step": 13619 + }, + { + "epoch": 0.7626833911972225, + "grad_norm": 1.3335047960281372, + "learning_rate": 6.806e-05, + "loss": 0.5409, + "step": 13620 + }, + { + "epoch": 0.7627393885093515, + "grad_norm": 1.2705175876617432, + "learning_rate": 6.8065e-05, + "loss": 0.4512, + "step": 13621 + }, + { + "epoch": 0.7627953858214805, + "grad_norm": 1.4200292825698853, + "learning_rate": 6.807e-05, + "loss": 0.3421, + "step": 13622 + }, + { + "epoch": 0.7628513831336096, + "grad_norm": 1.419069766998291, + "learning_rate": 6.8075e-05, + "loss": 0.4204, + "step": 13623 + }, + { + "epoch": 0.7629073804457386, + "grad_norm": 1.2540982961654663, + "learning_rate": 6.808e-05, + "loss": 0.5944, + "step": 13624 + }, + { + "epoch": 0.7629633777578676, + "grad_norm": 1.294551134109497, + "learning_rate": 6.8085e-05, + "loss": 0.4136, + "step": 13625 + }, + { + "epoch": 0.7630193750699966, + "grad_norm": 1.2086056470870972, + "learning_rate": 6.809e-05, + "loss": 0.4379, + "step": 13626 + }, + { + "epoch": 0.7630753723821256, + "grad_norm": 1.7807788848876953, + "learning_rate": 6.8095e-05, + "loss": 0.5198, + "step": 13627 + }, + { + "epoch": 0.7631313696942547, + "grad_norm": 1.272684931755066, + "learning_rate": 6.81e-05, + "loss": 0.4241, + "step": 13628 + }, + { + "epoch": 0.7631873670063837, + "grad_norm": 1.271483063697815, + "learning_rate": 6.810500000000001e-05, + "loss": 0.3404, + "step": 13629 + }, + { + "epoch": 0.7632433643185127, + "grad_norm": 1.1285747289657593, + "learning_rate": 6.811000000000001e-05, + "loss": 0.4228, + "step": 13630 + }, + { + "epoch": 0.7632993616306417, + "grad_norm": 1.5059736967086792, + 
"learning_rate": 6.811500000000001e-05, + "loss": 0.5092, + "step": 13631 + }, + { + "epoch": 0.7633553589427707, + "grad_norm": 1.2974300384521484, + "learning_rate": 6.812e-05, + "loss": 0.462, + "step": 13632 + }, + { + "epoch": 0.7634113562548998, + "grad_norm": 1.4447135925292969, + "learning_rate": 6.8125e-05, + "loss": 0.3697, + "step": 13633 + }, + { + "epoch": 0.7634673535670288, + "grad_norm": 1.396356225013733, + "learning_rate": 6.813e-05, + "loss": 0.454, + "step": 13634 + }, + { + "epoch": 0.7635233508791578, + "grad_norm": 1.2440048456192017, + "learning_rate": 6.813500000000001e-05, + "loss": 0.4406, + "step": 13635 + }, + { + "epoch": 0.7635793481912868, + "grad_norm": 1.2469977140426636, + "learning_rate": 6.814000000000001e-05, + "loss": 0.4061, + "step": 13636 + }, + { + "epoch": 0.7636353455034158, + "grad_norm": 1.319535255432129, + "learning_rate": 6.8145e-05, + "loss": 0.4064, + "step": 13637 + }, + { + "epoch": 0.7636913428155448, + "grad_norm": 1.4810253381729126, + "learning_rate": 6.815e-05, + "loss": 0.4953, + "step": 13638 + }, + { + "epoch": 0.7637473401276739, + "grad_norm": 1.311285376548767, + "learning_rate": 6.8155e-05, + "loss": 0.395, + "step": 13639 + }, + { + "epoch": 0.7638033374398029, + "grad_norm": 1.3299849033355713, + "learning_rate": 6.816e-05, + "loss": 0.476, + "step": 13640 + }, + { + "epoch": 0.7638593347519319, + "grad_norm": 1.3407683372497559, + "learning_rate": 6.8165e-05, + "loss": 0.386, + "step": 13641 + }, + { + "epoch": 0.7639153320640609, + "grad_norm": 1.4075672626495361, + "learning_rate": 6.817e-05, + "loss": 0.4974, + "step": 13642 + }, + { + "epoch": 0.76397132937619, + "grad_norm": 1.3753952980041504, + "learning_rate": 6.8175e-05, + "loss": 0.4195, + "step": 13643 + }, + { + "epoch": 0.764027326688319, + "grad_norm": 1.2342287302017212, + "learning_rate": 6.818e-05, + "loss": 0.4695, + "step": 13644 + }, + { + "epoch": 0.764083324000448, + "grad_norm": 1.3846280574798584, + "learning_rate": 
6.8185e-05, + "loss": 0.3887, + "step": 13645 + }, + { + "epoch": 0.764139321312577, + "grad_norm": 1.123100996017456, + "learning_rate": 6.819e-05, + "loss": 0.452, + "step": 13646 + }, + { + "epoch": 0.764195318624706, + "grad_norm": 1.3150691986083984, + "learning_rate": 6.8195e-05, + "loss": 0.5618, + "step": 13647 + }, + { + "epoch": 0.764251315936835, + "grad_norm": 1.6028003692626953, + "learning_rate": 6.82e-05, + "loss": 0.5873, + "step": 13648 + }, + { + "epoch": 0.7643073132489641, + "grad_norm": 1.3623460531234741, + "learning_rate": 6.8205e-05, + "loss": 0.4849, + "step": 13649 + }, + { + "epoch": 0.7643633105610931, + "grad_norm": 1.2319543361663818, + "learning_rate": 6.821000000000001e-05, + "loss": 0.4245, + "step": 13650 + }, + { + "epoch": 0.7644193078732221, + "grad_norm": 1.5644632577896118, + "learning_rate": 6.821500000000001e-05, + "loss": 0.6767, + "step": 13651 + }, + { + "epoch": 0.7644753051853511, + "grad_norm": 1.3389331102371216, + "learning_rate": 6.822000000000001e-05, + "loss": 0.4127, + "step": 13652 + }, + { + "epoch": 0.7645313024974801, + "grad_norm": 1.673305630683899, + "learning_rate": 6.8225e-05, + "loss": 0.4908, + "step": 13653 + }, + { + "epoch": 0.7645872998096092, + "grad_norm": 1.2320618629455566, + "learning_rate": 6.823e-05, + "loss": 0.4999, + "step": 13654 + }, + { + "epoch": 0.7646432971217382, + "grad_norm": 1.3539533615112305, + "learning_rate": 6.8235e-05, + "loss": 0.4486, + "step": 13655 + }, + { + "epoch": 0.7646992944338672, + "grad_norm": 1.133371114730835, + "learning_rate": 6.824e-05, + "loss": 0.329, + "step": 13656 + }, + { + "epoch": 0.7647552917459962, + "grad_norm": 1.3948217630386353, + "learning_rate": 6.824500000000001e-05, + "loss": 0.4525, + "step": 13657 + }, + { + "epoch": 0.7648112890581252, + "grad_norm": 1.3602687120437622, + "learning_rate": 6.825e-05, + "loss": 0.4447, + "step": 13658 + }, + { + "epoch": 0.7648672863702543, + "grad_norm": 1.3339569568634033, + "learning_rate": 
6.8255e-05, + "loss": 0.5464, + "step": 13659 + }, + { + "epoch": 0.7649232836823833, + "grad_norm": 1.3197356462478638, + "learning_rate": 6.826e-05, + "loss": 0.3726, + "step": 13660 + }, + { + "epoch": 0.7649792809945123, + "grad_norm": 1.3283066749572754, + "learning_rate": 6.8265e-05, + "loss": 0.5509, + "step": 13661 + }, + { + "epoch": 0.7650352783066413, + "grad_norm": 1.8376636505126953, + "learning_rate": 6.827e-05, + "loss": 0.6377, + "step": 13662 + }, + { + "epoch": 0.7650912756187703, + "grad_norm": 1.1652458906173706, + "learning_rate": 6.8275e-05, + "loss": 0.4488, + "step": 13663 + }, + { + "epoch": 0.7651472729308993, + "grad_norm": 1.3074110746383667, + "learning_rate": 6.828e-05, + "loss": 0.4817, + "step": 13664 + }, + { + "epoch": 0.7652032702430284, + "grad_norm": 1.4740490913391113, + "learning_rate": 6.8285e-05, + "loss": 0.4235, + "step": 13665 + }, + { + "epoch": 0.7652592675551574, + "grad_norm": 1.4304113388061523, + "learning_rate": 6.829e-05, + "loss": 0.5427, + "step": 13666 + }, + { + "epoch": 0.7653152648672864, + "grad_norm": 1.463375210762024, + "learning_rate": 6.8295e-05, + "loss": 0.4777, + "step": 13667 + }, + { + "epoch": 0.7653712621794154, + "grad_norm": 1.1754728555679321, + "learning_rate": 6.83e-05, + "loss": 0.394, + "step": 13668 + }, + { + "epoch": 0.7654272594915444, + "grad_norm": 1.3912222385406494, + "learning_rate": 6.8305e-05, + "loss": 0.5556, + "step": 13669 + }, + { + "epoch": 0.7654832568036735, + "grad_norm": 1.3535871505737305, + "learning_rate": 6.831e-05, + "loss": 0.5625, + "step": 13670 + }, + { + "epoch": 0.7655392541158025, + "grad_norm": 1.1886454820632935, + "learning_rate": 6.831500000000001e-05, + "loss": 0.4734, + "step": 13671 + }, + { + "epoch": 0.7655952514279315, + "grad_norm": 2.200488567352295, + "learning_rate": 6.832000000000001e-05, + "loss": 0.5001, + "step": 13672 + }, + { + "epoch": 0.7656512487400605, + "grad_norm": 1.2043800354003906, + "learning_rate": 6.832500000000001e-05, + 
"loss": 0.4501, + "step": 13673 + }, + { + "epoch": 0.7657072460521895, + "grad_norm": 1.228277325630188, + "learning_rate": 6.833e-05, + "loss": 0.4027, + "step": 13674 + }, + { + "epoch": 0.7657632433643186, + "grad_norm": 1.2629377841949463, + "learning_rate": 6.8335e-05, + "loss": 0.4552, + "step": 13675 + }, + { + "epoch": 0.7658192406764476, + "grad_norm": 1.2324979305267334, + "learning_rate": 6.834e-05, + "loss": 0.3709, + "step": 13676 + }, + { + "epoch": 0.7658752379885766, + "grad_norm": 1.163761854171753, + "learning_rate": 6.8345e-05, + "loss": 0.5439, + "step": 13677 + }, + { + "epoch": 0.7659312353007056, + "grad_norm": 1.2088426351547241, + "learning_rate": 6.835000000000001e-05, + "loss": 0.4621, + "step": 13678 + }, + { + "epoch": 0.7659872326128346, + "grad_norm": 1.2792140245437622, + "learning_rate": 6.8355e-05, + "loss": 0.4059, + "step": 13679 + }, + { + "epoch": 0.7660432299249637, + "grad_norm": 1.3498033285140991, + "learning_rate": 6.836e-05, + "loss": 0.4016, + "step": 13680 + }, + { + "epoch": 0.7660992272370927, + "grad_norm": 1.7404643297195435, + "learning_rate": 6.8365e-05, + "loss": 0.5288, + "step": 13681 + }, + { + "epoch": 0.7661552245492217, + "grad_norm": 1.2052669525146484, + "learning_rate": 6.837e-05, + "loss": 0.4172, + "step": 13682 + }, + { + "epoch": 0.7662112218613506, + "grad_norm": 1.1906694173812866, + "learning_rate": 6.8375e-05, + "loss": 0.3374, + "step": 13683 + }, + { + "epoch": 0.7662672191734796, + "grad_norm": 1.4606155157089233, + "learning_rate": 6.838e-05, + "loss": 0.4478, + "step": 13684 + }, + { + "epoch": 0.7663232164856086, + "grad_norm": 1.5571273565292358, + "learning_rate": 6.8385e-05, + "loss": 0.4997, + "step": 13685 + }, + { + "epoch": 0.7663792137977377, + "grad_norm": 1.4292577505111694, + "learning_rate": 6.839e-05, + "loss": 0.4201, + "step": 13686 + }, + { + "epoch": 0.7664352111098667, + "grad_norm": 1.3577619791030884, + "learning_rate": 6.8395e-05, + "loss": 0.393, + "step": 13687 + }, 
+ { + "epoch": 0.7664912084219957, + "grad_norm": 1.3015681505203247, + "learning_rate": 6.840000000000001e-05, + "loss": 0.3905, + "step": 13688 + }, + { + "epoch": 0.7665472057341247, + "grad_norm": 1.588666558265686, + "learning_rate": 6.8405e-05, + "loss": 0.4982, + "step": 13689 + }, + { + "epoch": 0.7666032030462537, + "grad_norm": 1.0863832235336304, + "learning_rate": 6.841e-05, + "loss": 0.3979, + "step": 13690 + }, + { + "epoch": 0.7666592003583828, + "grad_norm": 1.5667450428009033, + "learning_rate": 6.8415e-05, + "loss": 0.6567, + "step": 13691 + }, + { + "epoch": 0.7667151976705118, + "grad_norm": 1.1819483041763306, + "learning_rate": 6.842000000000001e-05, + "loss": 0.5536, + "step": 13692 + }, + { + "epoch": 0.7667711949826408, + "grad_norm": 1.4729474782943726, + "learning_rate": 6.842500000000001e-05, + "loss": 0.5742, + "step": 13693 + }, + { + "epoch": 0.7668271922947698, + "grad_norm": 1.5023722648620605, + "learning_rate": 6.843000000000001e-05, + "loss": 0.4246, + "step": 13694 + }, + { + "epoch": 0.7668831896068988, + "grad_norm": 1.6079078912734985, + "learning_rate": 6.8435e-05, + "loss": 0.4576, + "step": 13695 + }, + { + "epoch": 0.7669391869190278, + "grad_norm": 1.3467698097229004, + "learning_rate": 6.844e-05, + "loss": 0.4981, + "step": 13696 + }, + { + "epoch": 0.7669951842311569, + "grad_norm": 1.4072273969650269, + "learning_rate": 6.8445e-05, + "loss": 0.3155, + "step": 13697 + }, + { + "epoch": 0.7670511815432859, + "grad_norm": 1.3815951347351074, + "learning_rate": 6.845e-05, + "loss": 0.376, + "step": 13698 + }, + { + "epoch": 0.7671071788554149, + "grad_norm": 1.4208966493606567, + "learning_rate": 6.845500000000001e-05, + "loss": 0.5671, + "step": 13699 + }, + { + "epoch": 0.7671631761675439, + "grad_norm": 1.2100359201431274, + "learning_rate": 6.846e-05, + "loss": 0.4106, + "step": 13700 + }, + { + "epoch": 0.767219173479673, + "grad_norm": 1.3779208660125732, + "learning_rate": 6.8465e-05, + "loss": 0.5216, + "step": 
13701 + }, + { + "epoch": 0.767275170791802, + "grad_norm": 1.2398920059204102, + "learning_rate": 6.847e-05, + "loss": 0.3448, + "step": 13702 + }, + { + "epoch": 0.767331168103931, + "grad_norm": 1.2889721393585205, + "learning_rate": 6.8475e-05, + "loss": 0.4105, + "step": 13703 + }, + { + "epoch": 0.76738716541606, + "grad_norm": 1.327928066253662, + "learning_rate": 6.848e-05, + "loss": 0.5157, + "step": 13704 + }, + { + "epoch": 0.767443162728189, + "grad_norm": 1.3076730966567993, + "learning_rate": 6.8485e-05, + "loss": 0.3998, + "step": 13705 + }, + { + "epoch": 0.767499160040318, + "grad_norm": 1.2470765113830566, + "learning_rate": 6.849e-05, + "loss": 0.3305, + "step": 13706 + }, + { + "epoch": 0.7675551573524471, + "grad_norm": 1.3026156425476074, + "learning_rate": 6.8495e-05, + "loss": 0.4167, + "step": 13707 + }, + { + "epoch": 0.7676111546645761, + "grad_norm": 1.1424435377120972, + "learning_rate": 6.850000000000001e-05, + "loss": 0.4479, + "step": 13708 + }, + { + "epoch": 0.7676671519767051, + "grad_norm": 1.2309578657150269, + "learning_rate": 6.850500000000001e-05, + "loss": 0.4619, + "step": 13709 + }, + { + "epoch": 0.7677231492888341, + "grad_norm": 1.623602032661438, + "learning_rate": 6.851e-05, + "loss": 0.4734, + "step": 13710 + }, + { + "epoch": 0.7677791466009631, + "grad_norm": 1.4080479145050049, + "learning_rate": 6.8515e-05, + "loss": 0.421, + "step": 13711 + }, + { + "epoch": 0.7678351439130922, + "grad_norm": 1.8655375242233276, + "learning_rate": 6.852e-05, + "loss": 0.3954, + "step": 13712 + }, + { + "epoch": 0.7678911412252212, + "grad_norm": 1.8153020143508911, + "learning_rate": 6.852500000000001e-05, + "loss": 0.5158, + "step": 13713 + }, + { + "epoch": 0.7679471385373502, + "grad_norm": 1.4256774187088013, + "learning_rate": 6.853000000000001e-05, + "loss": 0.423, + "step": 13714 + }, + { + "epoch": 0.7680031358494792, + "grad_norm": 1.6449263095855713, + "learning_rate": 6.853500000000001e-05, + "loss": 0.5707, + "step": 
13715 + }, + { + "epoch": 0.7680591331616082, + "grad_norm": 1.398479700088501, + "learning_rate": 6.854e-05, + "loss": 0.4827, + "step": 13716 + }, + { + "epoch": 0.7681151304737373, + "grad_norm": 1.153760313987732, + "learning_rate": 6.8545e-05, + "loss": 0.4057, + "step": 13717 + }, + { + "epoch": 0.7681711277858663, + "grad_norm": 1.3056362867355347, + "learning_rate": 6.855e-05, + "loss": 0.4192, + "step": 13718 + }, + { + "epoch": 0.7682271250979953, + "grad_norm": 1.4091055393218994, + "learning_rate": 6.8555e-05, + "loss": 0.5524, + "step": 13719 + }, + { + "epoch": 0.7682831224101243, + "grad_norm": 1.2711598873138428, + "learning_rate": 6.856000000000001e-05, + "loss": 0.4829, + "step": 13720 + }, + { + "epoch": 0.7683391197222533, + "grad_norm": 1.2869224548339844, + "learning_rate": 6.8565e-05, + "loss": 0.6806, + "step": 13721 + }, + { + "epoch": 0.7683951170343823, + "grad_norm": 1.2931668758392334, + "learning_rate": 6.857e-05, + "loss": 0.4188, + "step": 13722 + }, + { + "epoch": 0.7684511143465114, + "grad_norm": 1.210431456565857, + "learning_rate": 6.8575e-05, + "loss": 0.3685, + "step": 13723 + }, + { + "epoch": 0.7685071116586404, + "grad_norm": 1.2958704233169556, + "learning_rate": 6.858e-05, + "loss": 0.4263, + "step": 13724 + }, + { + "epoch": 0.7685631089707694, + "grad_norm": 1.4035395383834839, + "learning_rate": 6.8585e-05, + "loss": 0.4423, + "step": 13725 + }, + { + "epoch": 0.7686191062828984, + "grad_norm": 1.3132297992706299, + "learning_rate": 6.858999999999999e-05, + "loss": 0.4727, + "step": 13726 + }, + { + "epoch": 0.7686751035950274, + "grad_norm": 1.3928653001785278, + "learning_rate": 6.8595e-05, + "loss": 0.3781, + "step": 13727 + }, + { + "epoch": 0.7687311009071565, + "grad_norm": 1.354992389678955, + "learning_rate": 6.860000000000001e-05, + "loss": 0.4253, + "step": 13728 + }, + { + "epoch": 0.7687870982192855, + "grad_norm": 1.6718236207962036, + "learning_rate": 6.860500000000001e-05, + "loss": 0.6554, + "step": 
13729 + }, + { + "epoch": 0.7688430955314145, + "grad_norm": 1.3217116594314575, + "learning_rate": 6.861000000000001e-05, + "loss": 0.4524, + "step": 13730 + }, + { + "epoch": 0.7688990928435435, + "grad_norm": 1.318081259727478, + "learning_rate": 6.8615e-05, + "loss": 0.4761, + "step": 13731 + }, + { + "epoch": 0.7689550901556725, + "grad_norm": 0.9450418949127197, + "learning_rate": 6.862e-05, + "loss": 0.3411, + "step": 13732 + }, + { + "epoch": 0.7690110874678016, + "grad_norm": 1.217575192451477, + "learning_rate": 6.8625e-05, + "loss": 0.4743, + "step": 13733 + }, + { + "epoch": 0.7690670847799306, + "grad_norm": 1.4697747230529785, + "learning_rate": 6.863000000000001e-05, + "loss": 0.573, + "step": 13734 + }, + { + "epoch": 0.7691230820920596, + "grad_norm": 1.4503259658813477, + "learning_rate": 6.863500000000001e-05, + "loss": 0.4563, + "step": 13735 + }, + { + "epoch": 0.7691790794041886, + "grad_norm": 1.291875958442688, + "learning_rate": 6.864000000000001e-05, + "loss": 0.5397, + "step": 13736 + }, + { + "epoch": 0.7692350767163176, + "grad_norm": 1.412310242652893, + "learning_rate": 6.8645e-05, + "loss": 0.5399, + "step": 13737 + }, + { + "epoch": 0.7692910740284467, + "grad_norm": 1.4714233875274658, + "learning_rate": 6.865e-05, + "loss": 0.6858, + "step": 13738 + }, + { + "epoch": 0.7693470713405757, + "grad_norm": 1.0596532821655273, + "learning_rate": 6.8655e-05, + "loss": 0.2782, + "step": 13739 + }, + { + "epoch": 0.7694030686527047, + "grad_norm": 1.4776426553726196, + "learning_rate": 6.866e-05, + "loss": 0.4573, + "step": 13740 + }, + { + "epoch": 0.7694590659648337, + "grad_norm": 1.279836893081665, + "learning_rate": 6.866500000000001e-05, + "loss": 0.3918, + "step": 13741 + }, + { + "epoch": 0.7695150632769627, + "grad_norm": 1.4288512468338013, + "learning_rate": 6.867e-05, + "loss": 0.3375, + "step": 13742 + }, + { + "epoch": 0.7695710605890917, + "grad_norm": 1.5765928030014038, + "learning_rate": 6.8675e-05, + "loss": 0.4134, + 
"step": 13743 + }, + { + "epoch": 0.7696270579012208, + "grad_norm": 1.4100160598754883, + "learning_rate": 6.868e-05, + "loss": 0.4122, + "step": 13744 + }, + { + "epoch": 0.7696830552133498, + "grad_norm": 1.3476622104644775, + "learning_rate": 6.8685e-05, + "loss": 0.4397, + "step": 13745 + }, + { + "epoch": 0.7697390525254788, + "grad_norm": 1.3707756996154785, + "learning_rate": 6.869e-05, + "loss": 0.4645, + "step": 13746 + }, + { + "epoch": 0.7697950498376078, + "grad_norm": 1.6237640380859375, + "learning_rate": 6.869499999999999e-05, + "loss": 0.5476, + "step": 13747 + }, + { + "epoch": 0.7698510471497368, + "grad_norm": 1.3181239366531372, + "learning_rate": 6.87e-05, + "loss": 0.3942, + "step": 13748 + }, + { + "epoch": 0.7699070444618659, + "grad_norm": 1.353237509727478, + "learning_rate": 6.870500000000001e-05, + "loss": 0.4023, + "step": 13749 + }, + { + "epoch": 0.7699630417739949, + "grad_norm": 1.4873833656311035, + "learning_rate": 6.871000000000001e-05, + "loss": 0.4376, + "step": 13750 + }, + { + "epoch": 0.7700190390861239, + "grad_norm": 1.360853672027588, + "learning_rate": 6.871500000000001e-05, + "loss": 0.4512, + "step": 13751 + }, + { + "epoch": 0.7700750363982529, + "grad_norm": 1.18434476852417, + "learning_rate": 6.872e-05, + "loss": 0.3869, + "step": 13752 + }, + { + "epoch": 0.7701310337103819, + "grad_norm": 1.1423625946044922, + "learning_rate": 6.8725e-05, + "loss": 0.3605, + "step": 13753 + }, + { + "epoch": 0.770187031022511, + "grad_norm": 1.325973391532898, + "learning_rate": 6.873e-05, + "loss": 0.2949, + "step": 13754 + }, + { + "epoch": 0.77024302833464, + "grad_norm": 1.4096765518188477, + "learning_rate": 6.8735e-05, + "loss": 0.3872, + "step": 13755 + }, + { + "epoch": 0.770299025646769, + "grad_norm": 1.3526920080184937, + "learning_rate": 6.874000000000001e-05, + "loss": 0.4204, + "step": 13756 + }, + { + "epoch": 0.770355022958898, + "grad_norm": 1.1253491640090942, + "learning_rate": 6.8745e-05, + "loss": 0.3993, + 
"step": 13757 + }, + { + "epoch": 0.770411020271027, + "grad_norm": 1.1818180084228516, + "learning_rate": 6.875e-05, + "loss": 0.4489, + "step": 13758 + }, + { + "epoch": 0.770467017583156, + "grad_norm": 1.7405974864959717, + "learning_rate": 6.8755e-05, + "loss": 0.5501, + "step": 13759 + }, + { + "epoch": 0.7705230148952851, + "grad_norm": 1.3856478929519653, + "learning_rate": 6.876e-05, + "loss": 0.6071, + "step": 13760 + }, + { + "epoch": 0.7705790122074141, + "grad_norm": 1.6659204959869385, + "learning_rate": 6.8765e-05, + "loss": 0.6156, + "step": 13761 + }, + { + "epoch": 0.7706350095195431, + "grad_norm": 1.35427987575531, + "learning_rate": 6.877000000000001e-05, + "loss": 0.4552, + "step": 13762 + }, + { + "epoch": 0.7706910068316721, + "grad_norm": 1.2575783729553223, + "learning_rate": 6.8775e-05, + "loss": 0.423, + "step": 13763 + }, + { + "epoch": 0.7707470041438012, + "grad_norm": 1.2999074459075928, + "learning_rate": 6.878e-05, + "loss": 0.5113, + "step": 13764 + }, + { + "epoch": 0.7708030014559302, + "grad_norm": 1.3737356662750244, + "learning_rate": 6.8785e-05, + "loss": 0.4653, + "step": 13765 + }, + { + "epoch": 0.7708589987680591, + "grad_norm": 1.4483639001846313, + "learning_rate": 6.879e-05, + "loss": 0.5, + "step": 13766 + }, + { + "epoch": 0.7709149960801881, + "grad_norm": 1.2880616188049316, + "learning_rate": 6.8795e-05, + "loss": 0.3674, + "step": 13767 + }, + { + "epoch": 0.7709709933923171, + "grad_norm": 1.4078574180603027, + "learning_rate": 6.879999999999999e-05, + "loss": 0.5425, + "step": 13768 + }, + { + "epoch": 0.7710269907044461, + "grad_norm": 1.2571057081222534, + "learning_rate": 6.8805e-05, + "loss": 0.3848, + "step": 13769 + }, + { + "epoch": 0.7710829880165752, + "grad_norm": 1.589552640914917, + "learning_rate": 6.881000000000001e-05, + "loss": 0.6412, + "step": 13770 + }, + { + "epoch": 0.7711389853287042, + "grad_norm": 1.9987590312957764, + "learning_rate": 6.881500000000001e-05, + "loss": 0.5261, + "step": 
13771 + }, + { + "epoch": 0.7711949826408332, + "grad_norm": 1.2986613512039185, + "learning_rate": 6.882000000000001e-05, + "loss": 0.4911, + "step": 13772 + }, + { + "epoch": 0.7712509799529622, + "grad_norm": 1.2847800254821777, + "learning_rate": 6.8825e-05, + "loss": 0.4441, + "step": 13773 + }, + { + "epoch": 0.7713069772650912, + "grad_norm": 1.485494613647461, + "learning_rate": 6.883e-05, + "loss": 0.6333, + "step": 13774 + }, + { + "epoch": 0.7713629745772203, + "grad_norm": 1.34844970703125, + "learning_rate": 6.8835e-05, + "loss": 0.3288, + "step": 13775 + }, + { + "epoch": 0.7714189718893493, + "grad_norm": 1.1112004518508911, + "learning_rate": 6.884e-05, + "loss": 0.3564, + "step": 13776 + }, + { + "epoch": 0.7714749692014783, + "grad_norm": 1.1809488534927368, + "learning_rate": 6.884500000000001e-05, + "loss": 0.3969, + "step": 13777 + }, + { + "epoch": 0.7715309665136073, + "grad_norm": 1.8307031393051147, + "learning_rate": 6.885e-05, + "loss": 0.4789, + "step": 13778 + }, + { + "epoch": 0.7715869638257363, + "grad_norm": 1.2297064065933228, + "learning_rate": 6.8855e-05, + "loss": 0.4582, + "step": 13779 + }, + { + "epoch": 0.7716429611378653, + "grad_norm": 1.400884747505188, + "learning_rate": 6.886e-05, + "loss": 0.4653, + "step": 13780 + }, + { + "epoch": 0.7716989584499944, + "grad_norm": 1.321446180343628, + "learning_rate": 6.8865e-05, + "loss": 0.3972, + "step": 13781 + }, + { + "epoch": 0.7717549557621234, + "grad_norm": 1.7086085081100464, + "learning_rate": 6.887e-05, + "loss": 0.6464, + "step": 13782 + }, + { + "epoch": 0.7718109530742524, + "grad_norm": 1.319392204284668, + "learning_rate": 6.887500000000001e-05, + "loss": 0.3759, + "step": 13783 + }, + { + "epoch": 0.7718669503863814, + "grad_norm": 1.6760494709014893, + "learning_rate": 6.888e-05, + "loss": 0.5043, + "step": 13784 + }, + { + "epoch": 0.7719229476985104, + "grad_norm": 1.1474812030792236, + "learning_rate": 6.8885e-05, + "loss": 0.3287, + "step": 13785 + }, + { + 
"epoch": 0.7719789450106395, + "grad_norm": 1.2620575428009033, + "learning_rate": 6.889e-05, + "loss": 0.4179, + "step": 13786 + }, + { + "epoch": 0.7720349423227685, + "grad_norm": 1.1926451921463013, + "learning_rate": 6.8895e-05, + "loss": 0.3944, + "step": 13787 + }, + { + "epoch": 0.7720909396348975, + "grad_norm": 1.2984777688980103, + "learning_rate": 6.89e-05, + "loss": 0.4094, + "step": 13788 + }, + { + "epoch": 0.7721469369470265, + "grad_norm": 1.5784839391708374, + "learning_rate": 6.8905e-05, + "loss": 0.5894, + "step": 13789 + }, + { + "epoch": 0.7722029342591555, + "grad_norm": 1.2304500341415405, + "learning_rate": 6.891e-05, + "loss": 0.3704, + "step": 13790 + }, + { + "epoch": 0.7722589315712846, + "grad_norm": 1.6105252504348755, + "learning_rate": 6.891500000000001e-05, + "loss": 0.5236, + "step": 13791 + }, + { + "epoch": 0.7723149288834136, + "grad_norm": 1.457632303237915, + "learning_rate": 6.892000000000001e-05, + "loss": 0.4524, + "step": 13792 + }, + { + "epoch": 0.7723709261955426, + "grad_norm": 1.252092719078064, + "learning_rate": 6.892500000000001e-05, + "loss": 0.382, + "step": 13793 + }, + { + "epoch": 0.7724269235076716, + "grad_norm": 1.1379681825637817, + "learning_rate": 6.893e-05, + "loss": 0.3817, + "step": 13794 + }, + { + "epoch": 0.7724829208198006, + "grad_norm": 1.3736088275909424, + "learning_rate": 6.8935e-05, + "loss": 0.4011, + "step": 13795 + }, + { + "epoch": 0.7725389181319297, + "grad_norm": 1.4902105331420898, + "learning_rate": 6.894e-05, + "loss": 0.4807, + "step": 13796 + }, + { + "epoch": 0.7725949154440587, + "grad_norm": 1.2289371490478516, + "learning_rate": 6.8945e-05, + "loss": 0.3919, + "step": 13797 + }, + { + "epoch": 0.7726509127561877, + "grad_norm": 1.515181541442871, + "learning_rate": 6.895000000000001e-05, + "loss": 0.4261, + "step": 13798 + }, + { + "epoch": 0.7727069100683167, + "grad_norm": 1.2557302713394165, + "learning_rate": 6.8955e-05, + "loss": 0.518, + "step": 13799 + }, + { + 
"epoch": 0.7727629073804457, + "grad_norm": 1.485304832458496, + "learning_rate": 6.896e-05, + "loss": 0.4222, + "step": 13800 + }, + { + "epoch": 0.7728189046925747, + "grad_norm": 1.253737211227417, + "learning_rate": 6.8965e-05, + "loss": 0.4721, + "step": 13801 + }, + { + "epoch": 0.7728749020047038, + "grad_norm": 1.5144121646881104, + "learning_rate": 6.897e-05, + "loss": 0.4277, + "step": 13802 + }, + { + "epoch": 0.7729308993168328, + "grad_norm": 1.363663673400879, + "learning_rate": 6.8975e-05, + "loss": 0.3993, + "step": 13803 + }, + { + "epoch": 0.7729868966289618, + "grad_norm": 1.209930181503296, + "learning_rate": 6.898e-05, + "loss": 0.3794, + "step": 13804 + }, + { + "epoch": 0.7730428939410908, + "grad_norm": 1.2857134342193604, + "learning_rate": 6.8985e-05, + "loss": 0.4527, + "step": 13805 + }, + { + "epoch": 0.7730988912532198, + "grad_norm": 2.3742313385009766, + "learning_rate": 6.899e-05, + "loss": 0.5023, + "step": 13806 + }, + { + "epoch": 0.7731548885653489, + "grad_norm": 1.337531328201294, + "learning_rate": 6.8995e-05, + "loss": 0.4598, + "step": 13807 + }, + { + "epoch": 0.7732108858774779, + "grad_norm": 1.3381874561309814, + "learning_rate": 6.9e-05, + "loss": 0.452, + "step": 13808 + }, + { + "epoch": 0.7732668831896069, + "grad_norm": 1.3037018775939941, + "learning_rate": 6.900500000000001e-05, + "loss": 0.3577, + "step": 13809 + }, + { + "epoch": 0.7733228805017359, + "grad_norm": 1.2436840534210205, + "learning_rate": 6.901e-05, + "loss": 0.3838, + "step": 13810 + }, + { + "epoch": 0.7733788778138649, + "grad_norm": 1.198750376701355, + "learning_rate": 6.9015e-05, + "loss": 0.3625, + "step": 13811 + }, + { + "epoch": 0.773434875125994, + "grad_norm": 1.8446670770645142, + "learning_rate": 6.902000000000001e-05, + "loss": 0.4405, + "step": 13812 + }, + { + "epoch": 0.773490872438123, + "grad_norm": 1.359084963798523, + "learning_rate": 6.902500000000001e-05, + "loss": 0.4634, + "step": 13813 + }, + { + "epoch": 
0.773546869750252, + "grad_norm": 1.36271333694458, + "learning_rate": 6.903000000000001e-05, + "loss": 0.5752, + "step": 13814 + }, + { + "epoch": 0.773602867062381, + "grad_norm": 1.6614081859588623, + "learning_rate": 6.9035e-05, + "loss": 0.51, + "step": 13815 + }, + { + "epoch": 0.77365886437451, + "grad_norm": 1.3628937005996704, + "learning_rate": 6.904e-05, + "loss": 0.4572, + "step": 13816 + }, + { + "epoch": 0.773714861686639, + "grad_norm": 1.6391777992248535, + "learning_rate": 6.9045e-05, + "loss": 0.6078, + "step": 13817 + }, + { + "epoch": 0.7737708589987681, + "grad_norm": 1.3059842586517334, + "learning_rate": 6.905e-05, + "loss": 0.3356, + "step": 13818 + }, + { + "epoch": 0.7738268563108971, + "grad_norm": 1.2347959280014038, + "learning_rate": 6.905500000000001e-05, + "loss": 0.3631, + "step": 13819 + }, + { + "epoch": 0.7738828536230261, + "grad_norm": 1.3058514595031738, + "learning_rate": 6.906e-05, + "loss": 0.4657, + "step": 13820 + }, + { + "epoch": 0.7739388509351551, + "grad_norm": 1.529215693473816, + "learning_rate": 6.9065e-05, + "loss": 0.4599, + "step": 13821 + }, + { + "epoch": 0.7739948482472842, + "grad_norm": 1.2192203998565674, + "learning_rate": 6.907e-05, + "loss": 0.4931, + "step": 13822 + }, + { + "epoch": 0.7740508455594132, + "grad_norm": 1.276862621307373, + "learning_rate": 6.9075e-05, + "loss": 0.5643, + "step": 13823 + }, + { + "epoch": 0.7741068428715422, + "grad_norm": 1.3896090984344482, + "learning_rate": 6.908e-05, + "loss": 0.4071, + "step": 13824 + }, + { + "epoch": 0.7741628401836712, + "grad_norm": 1.2592624425888062, + "learning_rate": 6.9085e-05, + "loss": 0.4726, + "step": 13825 + }, + { + "epoch": 0.7742188374958002, + "grad_norm": 1.6384234428405762, + "learning_rate": 6.909e-05, + "loss": 0.4709, + "step": 13826 + }, + { + "epoch": 0.7742748348079292, + "grad_norm": 1.4587829113006592, + "learning_rate": 6.9095e-05, + "loss": 0.3398, + "step": 13827 + }, + { + "epoch": 0.7743308321200583, + "grad_norm": 
1.1166255474090576, + "learning_rate": 6.91e-05, + "loss": 0.402, + "step": 13828 + }, + { + "epoch": 0.7743868294321873, + "grad_norm": 1.5124624967575073, + "learning_rate": 6.910500000000001e-05, + "loss": 0.4448, + "step": 13829 + }, + { + "epoch": 0.7744428267443163, + "grad_norm": 1.1770051717758179, + "learning_rate": 6.911000000000001e-05, + "loss": 0.3488, + "step": 13830 + }, + { + "epoch": 0.7744988240564453, + "grad_norm": 1.3130956888198853, + "learning_rate": 6.9115e-05, + "loss": 0.4856, + "step": 13831 + }, + { + "epoch": 0.7745548213685743, + "grad_norm": 1.3943486213684082, + "learning_rate": 6.912e-05, + "loss": 0.4011, + "step": 13832 + }, + { + "epoch": 0.7746108186807034, + "grad_norm": 1.4471852779388428, + "learning_rate": 6.9125e-05, + "loss": 0.3976, + "step": 13833 + }, + { + "epoch": 0.7746668159928324, + "grad_norm": 1.2625128030776978, + "learning_rate": 6.913000000000001e-05, + "loss": 0.4263, + "step": 13834 + }, + { + "epoch": 0.7747228133049614, + "grad_norm": 1.5544123649597168, + "learning_rate": 6.913500000000001e-05, + "loss": 0.4449, + "step": 13835 + }, + { + "epoch": 0.7747788106170904, + "grad_norm": 1.125077247619629, + "learning_rate": 6.914e-05, + "loss": 0.4424, + "step": 13836 + }, + { + "epoch": 0.7748348079292194, + "grad_norm": 1.201973557472229, + "learning_rate": 6.9145e-05, + "loss": 0.4998, + "step": 13837 + }, + { + "epoch": 0.7748908052413485, + "grad_norm": 1.315532922744751, + "learning_rate": 6.915e-05, + "loss": 0.3769, + "step": 13838 + }, + { + "epoch": 0.7749468025534775, + "grad_norm": 2.161365032196045, + "learning_rate": 6.9155e-05, + "loss": 0.4246, + "step": 13839 + }, + { + "epoch": 0.7750027998656065, + "grad_norm": 1.1771349906921387, + "learning_rate": 6.916000000000001e-05, + "loss": 0.4635, + "step": 13840 + }, + { + "epoch": 0.7750587971777355, + "grad_norm": 1.7514574527740479, + "learning_rate": 6.9165e-05, + "loss": 0.5846, + "step": 13841 + }, + { + "epoch": 0.7751147944898645, + 
"grad_norm": 1.6608920097351074, + "learning_rate": 6.917e-05, + "loss": 0.5855, + "step": 13842 + }, + { + "epoch": 0.7751707918019936, + "grad_norm": 1.6512091159820557, + "learning_rate": 6.9175e-05, + "loss": 0.5509, + "step": 13843 + }, + { + "epoch": 0.7752267891141226, + "grad_norm": 1.455511450767517, + "learning_rate": 6.918e-05, + "loss": 0.4465, + "step": 13844 + }, + { + "epoch": 0.7752827864262516, + "grad_norm": 1.0359824895858765, + "learning_rate": 6.9185e-05, + "loss": 0.3096, + "step": 13845 + }, + { + "epoch": 0.7753387837383806, + "grad_norm": 1.3309375047683716, + "learning_rate": 6.918999999999999e-05, + "loss": 0.4897, + "step": 13846 + }, + { + "epoch": 0.7753947810505096, + "grad_norm": 1.2151447534561157, + "learning_rate": 6.9195e-05, + "loss": 0.4117, + "step": 13847 + }, + { + "epoch": 0.7754507783626386, + "grad_norm": 1.316588282585144, + "learning_rate": 6.92e-05, + "loss": 0.3778, + "step": 13848 + }, + { + "epoch": 0.7755067756747676, + "grad_norm": 1.5426836013793945, + "learning_rate": 6.920500000000001e-05, + "loss": 0.5271, + "step": 13849 + }, + { + "epoch": 0.7755627729868966, + "grad_norm": 1.4366511106491089, + "learning_rate": 6.921000000000001e-05, + "loss": 0.4839, + "step": 13850 + }, + { + "epoch": 0.7756187702990256, + "grad_norm": 1.266235113143921, + "learning_rate": 6.921500000000001e-05, + "loss": 0.3653, + "step": 13851 + }, + { + "epoch": 0.7756747676111546, + "grad_norm": 1.1202263832092285, + "learning_rate": 6.922e-05, + "loss": 0.3637, + "step": 13852 + }, + { + "epoch": 0.7757307649232836, + "grad_norm": 1.158772349357605, + "learning_rate": 6.9225e-05, + "loss": 0.3914, + "step": 13853 + }, + { + "epoch": 0.7757867622354127, + "grad_norm": 1.7575604915618896, + "learning_rate": 6.923e-05, + "loss": 0.4897, + "step": 13854 + }, + { + "epoch": 0.7758427595475417, + "grad_norm": 1.211358904838562, + "learning_rate": 6.923500000000001e-05, + "loss": 0.4437, + "step": 13855 + }, + { + "epoch": 
0.7758987568596707, + "grad_norm": 1.5842890739440918, + "learning_rate": 6.924000000000001e-05, + "loss": 0.6314, + "step": 13856 + }, + { + "epoch": 0.7759547541717997, + "grad_norm": 1.2781901359558105, + "learning_rate": 6.9245e-05, + "loss": 0.4226, + "step": 13857 + }, + { + "epoch": 0.7760107514839287, + "grad_norm": 1.6134580373764038, + "learning_rate": 6.925e-05, + "loss": 0.4942, + "step": 13858 + }, + { + "epoch": 0.7760667487960577, + "grad_norm": 1.2906938791275024, + "learning_rate": 6.9255e-05, + "loss": 0.3801, + "step": 13859 + }, + { + "epoch": 0.7761227461081868, + "grad_norm": 1.284714937210083, + "learning_rate": 6.926e-05, + "loss": 0.5016, + "step": 13860 + }, + { + "epoch": 0.7761787434203158, + "grad_norm": 1.2326958179473877, + "learning_rate": 6.926500000000001e-05, + "loss": 0.4594, + "step": 13861 + }, + { + "epoch": 0.7762347407324448, + "grad_norm": 1.385058045387268, + "learning_rate": 6.927e-05, + "loss": 0.4893, + "step": 13862 + }, + { + "epoch": 0.7762907380445738, + "grad_norm": 1.1791129112243652, + "learning_rate": 6.9275e-05, + "loss": 0.4252, + "step": 13863 + }, + { + "epoch": 0.7763467353567028, + "grad_norm": 1.4034109115600586, + "learning_rate": 6.928e-05, + "loss": 0.4095, + "step": 13864 + }, + { + "epoch": 0.7764027326688319, + "grad_norm": 1.4608080387115479, + "learning_rate": 6.9285e-05, + "loss": 0.6999, + "step": 13865 + }, + { + "epoch": 0.7764587299809609, + "grad_norm": 1.1830004453659058, + "learning_rate": 6.929e-05, + "loss": 0.5181, + "step": 13866 + }, + { + "epoch": 0.7765147272930899, + "grad_norm": 1.135304570198059, + "learning_rate": 6.929499999999999e-05, + "loss": 0.4598, + "step": 13867 + }, + { + "epoch": 0.7765707246052189, + "grad_norm": 1.295250415802002, + "learning_rate": 6.93e-05, + "loss": 0.3736, + "step": 13868 + }, + { + "epoch": 0.7766267219173479, + "grad_norm": 1.9412102699279785, + "learning_rate": 6.930500000000001e-05, + "loss": 0.5801, + "step": 13869 + }, + { + "epoch": 
0.776682719229477, + "grad_norm": 1.6821688413619995, + "learning_rate": 6.931000000000001e-05, + "loss": 0.4209, + "step": 13870 + }, + { + "epoch": 0.776738716541606, + "grad_norm": 1.1794018745422363, + "learning_rate": 6.931500000000001e-05, + "loss": 0.3961, + "step": 13871 + }, + { + "epoch": 0.776794713853735, + "grad_norm": 1.363509178161621, + "learning_rate": 6.932000000000001e-05, + "loss": 0.4314, + "step": 13872 + }, + { + "epoch": 0.776850711165864, + "grad_norm": 1.2927194833755493, + "learning_rate": 6.9325e-05, + "loss": 0.3952, + "step": 13873 + }, + { + "epoch": 0.776906708477993, + "grad_norm": 1.4138803482055664, + "learning_rate": 6.933e-05, + "loss": 0.4139, + "step": 13874 + }, + { + "epoch": 0.776962705790122, + "grad_norm": 1.3323150873184204, + "learning_rate": 6.9335e-05, + "loss": 0.4429, + "step": 13875 + }, + { + "epoch": 0.7770187031022511, + "grad_norm": 1.705682635307312, + "learning_rate": 6.934000000000001e-05, + "loss": 0.4408, + "step": 13876 + }, + { + "epoch": 0.7770747004143801, + "grad_norm": 1.4556496143341064, + "learning_rate": 6.934500000000001e-05, + "loss": 0.3778, + "step": 13877 + }, + { + "epoch": 0.7771306977265091, + "grad_norm": 1.2827378511428833, + "learning_rate": 6.935e-05, + "loss": 0.4104, + "step": 13878 + }, + { + "epoch": 0.7771866950386381, + "grad_norm": 1.4275895357131958, + "learning_rate": 6.9355e-05, + "loss": 0.4003, + "step": 13879 + }, + { + "epoch": 0.7772426923507672, + "grad_norm": 1.1352436542510986, + "learning_rate": 6.936e-05, + "loss": 0.3701, + "step": 13880 + }, + { + "epoch": 0.7772986896628962, + "grad_norm": 1.2699083089828491, + "learning_rate": 6.9365e-05, + "loss": 0.4275, + "step": 13881 + }, + { + "epoch": 0.7773546869750252, + "grad_norm": 1.5980900526046753, + "learning_rate": 6.937000000000001e-05, + "loss": 0.3992, + "step": 13882 + }, + { + "epoch": 0.7774106842871542, + "grad_norm": 1.665728211402893, + "learning_rate": 6.9375e-05, + "loss": 0.4682, + "step": 13883 + }, 
+ { + "epoch": 0.7774666815992832, + "grad_norm": 1.3462138175964355, + "learning_rate": 6.938e-05, + "loss": 0.4193, + "step": 13884 + }, + { + "epoch": 0.7775226789114122, + "grad_norm": 0.9974067211151123, + "learning_rate": 6.9385e-05, + "loss": 0.3148, + "step": 13885 + }, + { + "epoch": 0.7775786762235413, + "grad_norm": 1.3148261308670044, + "learning_rate": 6.939e-05, + "loss": 0.4683, + "step": 13886 + }, + { + "epoch": 0.7776346735356703, + "grad_norm": 1.506017804145813, + "learning_rate": 6.9395e-05, + "loss": 0.5044, + "step": 13887 + }, + { + "epoch": 0.7776906708477993, + "grad_norm": 1.2350032329559326, + "learning_rate": 6.939999999999999e-05, + "loss": 0.4377, + "step": 13888 + }, + { + "epoch": 0.7777466681599283, + "grad_norm": 1.1359614133834839, + "learning_rate": 6.9405e-05, + "loss": 0.4154, + "step": 13889 + }, + { + "epoch": 0.7778026654720573, + "grad_norm": 1.454512119293213, + "learning_rate": 6.941000000000001e-05, + "loss": 0.673, + "step": 13890 + }, + { + "epoch": 0.7778586627841864, + "grad_norm": 1.3806427717208862, + "learning_rate": 6.941500000000001e-05, + "loss": 0.4112, + "step": 13891 + }, + { + "epoch": 0.7779146600963154, + "grad_norm": 1.360846757888794, + "learning_rate": 6.942000000000001e-05, + "loss": 0.3939, + "step": 13892 + }, + { + "epoch": 0.7779706574084444, + "grad_norm": 1.238250970840454, + "learning_rate": 6.942500000000001e-05, + "loss": 0.3914, + "step": 13893 + }, + { + "epoch": 0.7780266547205734, + "grad_norm": 4.463929176330566, + "learning_rate": 6.943e-05, + "loss": 0.4305, + "step": 13894 + }, + { + "epoch": 0.7780826520327024, + "grad_norm": 1.5247230529785156, + "learning_rate": 6.9435e-05, + "loss": 0.5265, + "step": 13895 + }, + { + "epoch": 0.7781386493448315, + "grad_norm": 1.4591853618621826, + "learning_rate": 6.944e-05, + "loss": 0.4177, + "step": 13896 + }, + { + "epoch": 0.7781946466569605, + "grad_norm": 1.3938111066818237, + "learning_rate": 6.944500000000001e-05, + "loss": 0.3971, + 
"step": 13897 + }, + { + "epoch": 0.7782506439690895, + "grad_norm": 1.2990190982818604, + "learning_rate": 6.945000000000001e-05, + "loss": 0.4432, + "step": 13898 + }, + { + "epoch": 0.7783066412812185, + "grad_norm": 1.4695848226547241, + "learning_rate": 6.9455e-05, + "loss": 0.4576, + "step": 13899 + }, + { + "epoch": 0.7783626385933475, + "grad_norm": 1.2041194438934326, + "learning_rate": 6.946e-05, + "loss": 0.4222, + "step": 13900 + }, + { + "epoch": 0.7784186359054766, + "grad_norm": 1.2224990129470825, + "learning_rate": 6.9465e-05, + "loss": 0.5282, + "step": 13901 + }, + { + "epoch": 0.7784746332176056, + "grad_norm": 1.1354583501815796, + "learning_rate": 6.947e-05, + "loss": 0.3981, + "step": 13902 + }, + { + "epoch": 0.7785306305297346, + "grad_norm": 1.738417387008667, + "learning_rate": 6.9475e-05, + "loss": 0.6282, + "step": 13903 + }, + { + "epoch": 0.7785866278418636, + "grad_norm": 1.2167282104492188, + "learning_rate": 6.948e-05, + "loss": 0.3819, + "step": 13904 + }, + { + "epoch": 0.7786426251539926, + "grad_norm": 1.361893653869629, + "learning_rate": 6.9485e-05, + "loss": 0.4069, + "step": 13905 + }, + { + "epoch": 0.7786986224661216, + "grad_norm": 1.4380555152893066, + "learning_rate": 6.949e-05, + "loss": 0.4445, + "step": 13906 + }, + { + "epoch": 0.7787546197782507, + "grad_norm": 1.24867582321167, + "learning_rate": 6.9495e-05, + "loss": 0.4167, + "step": 13907 + }, + { + "epoch": 0.7788106170903797, + "grad_norm": 1.3571693897247314, + "learning_rate": 6.95e-05, + "loss": 0.4471, + "step": 13908 + }, + { + "epoch": 0.7788666144025087, + "grad_norm": 1.3829060792922974, + "learning_rate": 6.950499999999999e-05, + "loss": 0.3551, + "step": 13909 + }, + { + "epoch": 0.7789226117146377, + "grad_norm": 1.2228182554244995, + "learning_rate": 6.951e-05, + "loss": 0.5215, + "step": 13910 + }, + { + "epoch": 0.7789786090267667, + "grad_norm": 1.6797186136245728, + "learning_rate": 6.951500000000001e-05, + "loss": 0.4942, + "step": 13911 + 
}, + { + "epoch": 0.7790346063388958, + "grad_norm": 1.561423420906067, + "learning_rate": 6.952000000000001e-05, + "loss": 0.5206, + "step": 13912 + }, + { + "epoch": 0.7790906036510248, + "grad_norm": 1.3722482919692993, + "learning_rate": 6.952500000000001e-05, + "loss": 0.4192, + "step": 13913 + }, + { + "epoch": 0.7791466009631538, + "grad_norm": 1.2754452228546143, + "learning_rate": 6.953000000000001e-05, + "loss": 0.5925, + "step": 13914 + }, + { + "epoch": 0.7792025982752828, + "grad_norm": 1.2323298454284668, + "learning_rate": 6.9535e-05, + "loss": 0.4066, + "step": 13915 + }, + { + "epoch": 0.7792585955874118, + "grad_norm": 1.2997041940689087, + "learning_rate": 6.954e-05, + "loss": 0.4002, + "step": 13916 + }, + { + "epoch": 0.7793145928995409, + "grad_norm": 1.1204379796981812, + "learning_rate": 6.9545e-05, + "loss": 0.3701, + "step": 13917 + }, + { + "epoch": 0.7793705902116699, + "grad_norm": 1.3768155574798584, + "learning_rate": 6.955000000000001e-05, + "loss": 0.3913, + "step": 13918 + }, + { + "epoch": 0.7794265875237989, + "grad_norm": 1.28208327293396, + "learning_rate": 6.955500000000001e-05, + "loss": 0.4146, + "step": 13919 + }, + { + "epoch": 0.7794825848359279, + "grad_norm": 1.2160152196884155, + "learning_rate": 6.956e-05, + "loss": 0.397, + "step": 13920 + }, + { + "epoch": 0.7795385821480569, + "grad_norm": 1.4384526014328003, + "learning_rate": 6.9565e-05, + "loss": 0.5679, + "step": 13921 + }, + { + "epoch": 0.779594579460186, + "grad_norm": 1.3922152519226074, + "learning_rate": 6.957e-05, + "loss": 0.4086, + "step": 13922 + }, + { + "epoch": 0.779650576772315, + "grad_norm": 1.5517241954803467, + "learning_rate": 6.9575e-05, + "loss": 0.4071, + "step": 13923 + }, + { + "epoch": 0.779706574084444, + "grad_norm": 1.2017390727996826, + "learning_rate": 6.958e-05, + "loss": 0.4027, + "step": 13924 + }, + { + "epoch": 0.779762571396573, + "grad_norm": 1.3809328079223633, + "learning_rate": 6.9585e-05, + "loss": 0.4484, + "step": 
13925 + }, + { + "epoch": 0.779818568708702, + "grad_norm": 1.2521413564682007, + "learning_rate": 6.959e-05, + "loss": 0.3402, + "step": 13926 + }, + { + "epoch": 0.779874566020831, + "grad_norm": 1.704215407371521, + "learning_rate": 6.9595e-05, + "loss": 0.4422, + "step": 13927 + }, + { + "epoch": 0.7799305633329601, + "grad_norm": 1.3313181400299072, + "learning_rate": 6.96e-05, + "loss": 0.4553, + "step": 13928 + }, + { + "epoch": 0.7799865606450891, + "grad_norm": 1.4314274787902832, + "learning_rate": 6.9605e-05, + "loss": 0.3606, + "step": 13929 + }, + { + "epoch": 0.7800425579572181, + "grad_norm": 1.0212676525115967, + "learning_rate": 6.961e-05, + "loss": 0.3565, + "step": 13930 + }, + { + "epoch": 0.780098555269347, + "grad_norm": 1.203959345817566, + "learning_rate": 6.9615e-05, + "loss": 0.4249, + "step": 13931 + }, + { + "epoch": 0.780154552581476, + "grad_norm": 1.0592491626739502, + "learning_rate": 6.962e-05, + "loss": 0.423, + "step": 13932 + }, + { + "epoch": 0.780210549893605, + "grad_norm": 1.5868914127349854, + "learning_rate": 6.962500000000001e-05, + "loss": 0.348, + "step": 13933 + }, + { + "epoch": 0.7802665472057341, + "grad_norm": 5.164170742034912, + "learning_rate": 6.963000000000001e-05, + "loss": 0.4902, + "step": 13934 + }, + { + "epoch": 0.7803225445178631, + "grad_norm": 1.4760080575942993, + "learning_rate": 6.9635e-05, + "loss": 0.5056, + "step": 13935 + }, + { + "epoch": 0.7803785418299921, + "grad_norm": 1.3616795539855957, + "learning_rate": 6.964e-05, + "loss": 0.372, + "step": 13936 + }, + { + "epoch": 0.7804345391421211, + "grad_norm": 1.3553009033203125, + "learning_rate": 6.9645e-05, + "loss": 0.4697, + "step": 13937 + }, + { + "epoch": 0.7804905364542502, + "grad_norm": 1.2548644542694092, + "learning_rate": 6.965e-05, + "loss": 0.4852, + "step": 13938 + }, + { + "epoch": 0.7805465337663792, + "grad_norm": 1.1524333953857422, + "learning_rate": 6.965500000000001e-05, + "loss": 0.4164, + "step": 13939 + }, + { + 
"epoch": 0.7806025310785082, + "grad_norm": 1.3469394445419312, + "learning_rate": 6.966000000000001e-05, + "loss": 0.3957, + "step": 13940 + }, + { + "epoch": 0.7806585283906372, + "grad_norm": 1.2270981073379517, + "learning_rate": 6.9665e-05, + "loss": 0.4294, + "step": 13941 + }, + { + "epoch": 0.7807145257027662, + "grad_norm": 1.1579169034957886, + "learning_rate": 6.967e-05, + "loss": 0.3281, + "step": 13942 + }, + { + "epoch": 0.7807705230148952, + "grad_norm": 1.1557217836380005, + "learning_rate": 6.9675e-05, + "loss": 0.3436, + "step": 13943 + }, + { + "epoch": 0.7808265203270243, + "grad_norm": 1.1874650716781616, + "learning_rate": 6.968e-05, + "loss": 0.3193, + "step": 13944 + }, + { + "epoch": 0.7808825176391533, + "grad_norm": 1.2706297636032104, + "learning_rate": 6.9685e-05, + "loss": 0.5212, + "step": 13945 + }, + { + "epoch": 0.7809385149512823, + "grad_norm": 1.314117431640625, + "learning_rate": 6.969e-05, + "loss": 0.4, + "step": 13946 + }, + { + "epoch": 0.7809945122634113, + "grad_norm": 1.414960265159607, + "learning_rate": 6.9695e-05, + "loss": 0.4314, + "step": 13947 + }, + { + "epoch": 0.7810505095755403, + "grad_norm": 1.3312770128250122, + "learning_rate": 6.97e-05, + "loss": 0.3932, + "step": 13948 + }, + { + "epoch": 0.7811065068876694, + "grad_norm": 1.2104490995407104, + "learning_rate": 6.9705e-05, + "loss": 0.371, + "step": 13949 + }, + { + "epoch": 0.7811625041997984, + "grad_norm": 1.268545389175415, + "learning_rate": 6.971000000000001e-05, + "loss": 0.4738, + "step": 13950 + }, + { + "epoch": 0.7812185015119274, + "grad_norm": 1.1525225639343262, + "learning_rate": 6.9715e-05, + "loss": 0.4031, + "step": 13951 + }, + { + "epoch": 0.7812744988240564, + "grad_norm": 1.3071669340133667, + "learning_rate": 6.972e-05, + "loss": 0.401, + "step": 13952 + }, + { + "epoch": 0.7813304961361854, + "grad_norm": 1.194106936454773, + "learning_rate": 6.9725e-05, + "loss": 0.3375, + "step": 13953 + }, + { + "epoch": 0.7813864934483145, + 
"grad_norm": 1.4018372297286987, + "learning_rate": 6.973000000000001e-05, + "loss": 0.4572, + "step": 13954 + }, + { + "epoch": 0.7814424907604435, + "grad_norm": 1.109297275543213, + "learning_rate": 6.973500000000001e-05, + "loss": 0.4132, + "step": 13955 + }, + { + "epoch": 0.7814984880725725, + "grad_norm": 1.2089378833770752, + "learning_rate": 6.974e-05, + "loss": 0.5602, + "step": 13956 + }, + { + "epoch": 0.7815544853847015, + "grad_norm": 1.5532406568527222, + "learning_rate": 6.9745e-05, + "loss": 0.471, + "step": 13957 + }, + { + "epoch": 0.7816104826968305, + "grad_norm": 1.2478727102279663, + "learning_rate": 6.975e-05, + "loss": 0.4428, + "step": 13958 + }, + { + "epoch": 0.7816664800089596, + "grad_norm": 1.18569815158844, + "learning_rate": 6.9755e-05, + "loss": 0.3875, + "step": 13959 + }, + { + "epoch": 0.7817224773210886, + "grad_norm": 1.3408461809158325, + "learning_rate": 6.976000000000001e-05, + "loss": 0.5038, + "step": 13960 + }, + { + "epoch": 0.7817784746332176, + "grad_norm": 1.3232312202453613, + "learning_rate": 6.976500000000001e-05, + "loss": 0.4234, + "step": 13961 + }, + { + "epoch": 0.7818344719453466, + "grad_norm": 1.516841173171997, + "learning_rate": 6.977e-05, + "loss": 0.6285, + "step": 13962 + }, + { + "epoch": 0.7818904692574756, + "grad_norm": 1.223027229309082, + "learning_rate": 6.9775e-05, + "loss": 0.4015, + "step": 13963 + }, + { + "epoch": 0.7819464665696046, + "grad_norm": 1.454139232635498, + "learning_rate": 6.978e-05, + "loss": 0.5237, + "step": 13964 + }, + { + "epoch": 0.7820024638817337, + "grad_norm": 1.4948914051055908, + "learning_rate": 6.9785e-05, + "loss": 0.4537, + "step": 13965 + }, + { + "epoch": 0.7820584611938627, + "grad_norm": 1.4777477979660034, + "learning_rate": 6.979e-05, + "loss": 0.4717, + "step": 13966 + }, + { + "epoch": 0.7821144585059917, + "grad_norm": 1.4510791301727295, + "learning_rate": 6.9795e-05, + "loss": 0.5227, + "step": 13967 + }, + { + "epoch": 0.7821704558181207, + 
"grad_norm": 1.7112095355987549, + "learning_rate": 6.98e-05, + "loss": 0.4873, + "step": 13968 + }, + { + "epoch": 0.7822264531302497, + "grad_norm": 1.3013123273849487, + "learning_rate": 6.9805e-05, + "loss": 0.352, + "step": 13969 + }, + { + "epoch": 0.7822824504423788, + "grad_norm": 1.1731970310211182, + "learning_rate": 6.981000000000001e-05, + "loss": 0.4815, + "step": 13970 + }, + { + "epoch": 0.7823384477545078, + "grad_norm": 1.0791093111038208, + "learning_rate": 6.981500000000001e-05, + "loss": 0.3778, + "step": 13971 + }, + { + "epoch": 0.7823944450666368, + "grad_norm": 1.2413989305496216, + "learning_rate": 6.982e-05, + "loss": 0.3655, + "step": 13972 + }, + { + "epoch": 0.7824504423787658, + "grad_norm": 1.4981213808059692, + "learning_rate": 6.9825e-05, + "loss": 0.3967, + "step": 13973 + }, + { + "epoch": 0.7825064396908948, + "grad_norm": 1.2649314403533936, + "learning_rate": 6.983e-05, + "loss": 0.4468, + "step": 13974 + }, + { + "epoch": 0.7825624370030239, + "grad_norm": 1.3142908811569214, + "learning_rate": 6.983500000000001e-05, + "loss": 0.4962, + "step": 13975 + }, + { + "epoch": 0.7826184343151529, + "grad_norm": 1.1367021799087524, + "learning_rate": 6.984000000000001e-05, + "loss": 0.3033, + "step": 13976 + }, + { + "epoch": 0.7826744316272819, + "grad_norm": 1.2410575151443481, + "learning_rate": 6.9845e-05, + "loss": 0.4894, + "step": 13977 + }, + { + "epoch": 0.7827304289394109, + "grad_norm": 1.424251675605774, + "learning_rate": 6.985e-05, + "loss": 0.365, + "step": 13978 + }, + { + "epoch": 0.7827864262515399, + "grad_norm": 1.9061392545700073, + "learning_rate": 6.9855e-05, + "loss": 0.5065, + "step": 13979 + }, + { + "epoch": 0.782842423563669, + "grad_norm": 1.4891753196716309, + "learning_rate": 6.986e-05, + "loss": 0.4166, + "step": 13980 + }, + { + "epoch": 0.782898420875798, + "grad_norm": 1.5439541339874268, + "learning_rate": 6.9865e-05, + "loss": 0.4611, + "step": 13981 + }, + { + "epoch": 0.782954418187927, + 
"grad_norm": 1.2894482612609863, + "learning_rate": 6.987000000000001e-05, + "loss": 0.4427, + "step": 13982 + }, + { + "epoch": 0.783010415500056, + "grad_norm": 1.278134822845459, + "learning_rate": 6.9875e-05, + "loss": 0.5133, + "step": 13983 + }, + { + "epoch": 0.783066412812185, + "grad_norm": 1.510360598564148, + "learning_rate": 6.988e-05, + "loss": 0.4293, + "step": 13984 + }, + { + "epoch": 0.783122410124314, + "grad_norm": 1.4213038682937622, + "learning_rate": 6.9885e-05, + "loss": 0.4431, + "step": 13985 + }, + { + "epoch": 0.7831784074364431, + "grad_norm": 1.2273017168045044, + "learning_rate": 6.989e-05, + "loss": 0.4495, + "step": 13986 + }, + { + "epoch": 0.7832344047485721, + "grad_norm": 1.3446868658065796, + "learning_rate": 6.9895e-05, + "loss": 0.4086, + "step": 13987 + }, + { + "epoch": 0.7832904020607011, + "grad_norm": 1.4655638933181763, + "learning_rate": 6.99e-05, + "loss": 0.4838, + "step": 13988 + }, + { + "epoch": 0.7833463993728301, + "grad_norm": 1.3987774848937988, + "learning_rate": 6.9905e-05, + "loss": 0.493, + "step": 13989 + }, + { + "epoch": 0.7834023966849591, + "grad_norm": 1.6490641832351685, + "learning_rate": 6.991000000000001e-05, + "loss": 0.5507, + "step": 13990 + }, + { + "epoch": 0.7834583939970882, + "grad_norm": 1.5870490074157715, + "learning_rate": 6.991500000000001e-05, + "loss": 0.5309, + "step": 13991 + }, + { + "epoch": 0.7835143913092172, + "grad_norm": 1.0920445919036865, + "learning_rate": 6.992000000000001e-05, + "loss": 0.4331, + "step": 13992 + }, + { + "epoch": 0.7835703886213462, + "grad_norm": 1.8054163455963135, + "learning_rate": 6.9925e-05, + "loss": 0.3573, + "step": 13993 + }, + { + "epoch": 0.7836263859334752, + "grad_norm": 1.3015691041946411, + "learning_rate": 6.993e-05, + "loss": 0.5052, + "step": 13994 + }, + { + "epoch": 0.7836823832456042, + "grad_norm": 1.448464035987854, + "learning_rate": 6.9935e-05, + "loss": 0.3574, + "step": 13995 + }, + { + "epoch": 0.7837383805577333, + 
"grad_norm": 1.2324483394622803, + "learning_rate": 6.994000000000001e-05, + "loss": 0.4016, + "step": 13996 + }, + { + "epoch": 0.7837943778698623, + "grad_norm": 1.290958046913147, + "learning_rate": 6.994500000000001e-05, + "loss": 0.4793, + "step": 13997 + }, + { + "epoch": 0.7838503751819913, + "grad_norm": 1.1485828161239624, + "learning_rate": 6.995e-05, + "loss": 0.3982, + "step": 13998 + }, + { + "epoch": 0.7839063724941203, + "grad_norm": 1.4403445720672607, + "learning_rate": 6.9955e-05, + "loss": 0.3792, + "step": 13999 + }, + { + "epoch": 0.7839623698062493, + "grad_norm": 1.1863340139389038, + "learning_rate": 6.996e-05, + "loss": 0.4099, + "step": 14000 + }, + { + "epoch": 0.7840183671183784, + "grad_norm": 1.294132113456726, + "learning_rate": 6.9965e-05, + "loss": 0.4635, + "step": 14001 + }, + { + "epoch": 0.7840743644305074, + "grad_norm": 1.403907060623169, + "learning_rate": 6.997e-05, + "loss": 0.4105, + "step": 14002 + }, + { + "epoch": 0.7841303617426364, + "grad_norm": 1.434434413909912, + "learning_rate": 6.997500000000001e-05, + "loss": 0.4805, + "step": 14003 + }, + { + "epoch": 0.7841863590547654, + "grad_norm": 1.1685349941253662, + "learning_rate": 6.998e-05, + "loss": 0.5337, + "step": 14004 + }, + { + "epoch": 0.7842423563668944, + "grad_norm": 2.138056516647339, + "learning_rate": 6.9985e-05, + "loss": 0.4324, + "step": 14005 + }, + { + "epoch": 0.7842983536790235, + "grad_norm": 1.5927270650863647, + "learning_rate": 6.999e-05, + "loss": 0.4646, + "step": 14006 + }, + { + "epoch": 0.7843543509911525, + "grad_norm": 2.062490463256836, + "learning_rate": 6.9995e-05, + "loss": 0.6349, + "step": 14007 + }, + { + "epoch": 0.7844103483032815, + "grad_norm": 1.1932083368301392, + "learning_rate": 7e-05, + "loss": 0.3989, + "step": 14008 + }, + { + "epoch": 0.7844663456154105, + "grad_norm": 1.3754913806915283, + "learning_rate": 7.0005e-05, + "loss": 0.4343, + "step": 14009 + }, + { + "epoch": 0.7845223429275395, + "grad_norm": 
1.5349290370941162, + "learning_rate": 7.001e-05, + "loss": 0.4089, + "step": 14010 + }, + { + "epoch": 0.7845783402396685, + "grad_norm": 1.2927325963974, + "learning_rate": 7.001500000000001e-05, + "loss": 0.5192, + "step": 14011 + }, + { + "epoch": 0.7846343375517976, + "grad_norm": 2.542348861694336, + "learning_rate": 7.002000000000001e-05, + "loss": 0.5494, + "step": 14012 + }, + { + "epoch": 0.7846903348639266, + "grad_norm": 1.211527943611145, + "learning_rate": 7.002500000000001e-05, + "loss": 0.3993, + "step": 14013 + }, + { + "epoch": 0.7847463321760555, + "grad_norm": 1.3985072374343872, + "learning_rate": 7.003e-05, + "loss": 0.4081, + "step": 14014 + }, + { + "epoch": 0.7848023294881845, + "grad_norm": 1.479193091392517, + "learning_rate": 7.0035e-05, + "loss": 0.569, + "step": 14015 + }, + { + "epoch": 0.7848583268003135, + "grad_norm": 1.100593090057373, + "learning_rate": 7.004e-05, + "loss": 0.3533, + "step": 14016 + }, + { + "epoch": 0.7849143241124426, + "grad_norm": 1.3223495483398438, + "learning_rate": 7.004500000000001e-05, + "loss": 0.3749, + "step": 14017 + }, + { + "epoch": 0.7849703214245716, + "grad_norm": 1.162502408027649, + "learning_rate": 7.005000000000001e-05, + "loss": 0.4615, + "step": 14018 + }, + { + "epoch": 0.7850263187367006, + "grad_norm": 1.402768611907959, + "learning_rate": 7.0055e-05, + "loss": 0.5245, + "step": 14019 + }, + { + "epoch": 0.7850823160488296, + "grad_norm": 1.5840893983840942, + "learning_rate": 7.006e-05, + "loss": 0.5788, + "step": 14020 + }, + { + "epoch": 0.7851383133609586, + "grad_norm": 1.3359031677246094, + "learning_rate": 7.0065e-05, + "loss": 0.4687, + "step": 14021 + }, + { + "epoch": 0.7851943106730876, + "grad_norm": 1.5367978811264038, + "learning_rate": 7.007e-05, + "loss": 0.5541, + "step": 14022 + }, + { + "epoch": 0.7852503079852167, + "grad_norm": 1.2812166213989258, + "learning_rate": 7.0075e-05, + "loss": 0.4377, + "step": 14023 + }, + { + "epoch": 0.7853063052973457, + "grad_norm": 
1.3251447677612305, + "learning_rate": 7.008e-05, + "loss": 0.405, + "step": 14024 + }, + { + "epoch": 0.7853623026094747, + "grad_norm": 1.500702142715454, + "learning_rate": 7.0085e-05, + "loss": 0.5783, + "step": 14025 + }, + { + "epoch": 0.7854182999216037, + "grad_norm": 1.5428801774978638, + "learning_rate": 7.009e-05, + "loss": 0.588, + "step": 14026 + }, + { + "epoch": 0.7854742972337327, + "grad_norm": 1.34929358959198, + "learning_rate": 7.0095e-05, + "loss": 0.5225, + "step": 14027 + }, + { + "epoch": 0.7855302945458618, + "grad_norm": 1.4423037767410278, + "learning_rate": 7.01e-05, + "loss": 0.522, + "step": 14028 + }, + { + "epoch": 0.7855862918579908, + "grad_norm": 1.3482623100280762, + "learning_rate": 7.0105e-05, + "loss": 0.5341, + "step": 14029 + }, + { + "epoch": 0.7856422891701198, + "grad_norm": 1.7001186609268188, + "learning_rate": 7.011e-05, + "loss": 0.5013, + "step": 14030 + }, + { + "epoch": 0.7856982864822488, + "grad_norm": 1.2402490377426147, + "learning_rate": 7.0115e-05, + "loss": 0.5603, + "step": 14031 + }, + { + "epoch": 0.7857542837943778, + "grad_norm": 1.2932721376419067, + "learning_rate": 7.012000000000001e-05, + "loss": 0.4235, + "step": 14032 + }, + { + "epoch": 0.7858102811065069, + "grad_norm": 1.446073293685913, + "learning_rate": 7.012500000000001e-05, + "loss": 0.4654, + "step": 14033 + }, + { + "epoch": 0.7858662784186359, + "grad_norm": 1.7081900835037231, + "learning_rate": 7.013000000000001e-05, + "loss": 0.4237, + "step": 14034 + }, + { + "epoch": 0.7859222757307649, + "grad_norm": 1.3166462182998657, + "learning_rate": 7.0135e-05, + "loss": 0.4698, + "step": 14035 + }, + { + "epoch": 0.7859782730428939, + "grad_norm": 1.2614250183105469, + "learning_rate": 7.014e-05, + "loss": 0.4302, + "step": 14036 + }, + { + "epoch": 0.7860342703550229, + "grad_norm": 1.2707799673080444, + "learning_rate": 7.0145e-05, + "loss": 0.4416, + "step": 14037 + }, + { + "epoch": 0.786090267667152, + "grad_norm": 1.3705872297286987, 
+ "learning_rate": 7.015000000000001e-05, + "loss": 0.429, + "step": 14038 + }, + { + "epoch": 0.786146264979281, + "grad_norm": 1.3415343761444092, + "learning_rate": 7.015500000000001e-05, + "loss": 0.5315, + "step": 14039 + }, + { + "epoch": 0.78620226229141, + "grad_norm": 1.2904845476150513, + "learning_rate": 7.016e-05, + "loss": 0.4053, + "step": 14040 + }, + { + "epoch": 0.786258259603539, + "grad_norm": 1.431028127670288, + "learning_rate": 7.0165e-05, + "loss": 0.5252, + "step": 14041 + }, + { + "epoch": 0.786314256915668, + "grad_norm": 9.211777687072754, + "learning_rate": 7.017e-05, + "loss": 0.4757, + "step": 14042 + }, + { + "epoch": 0.786370254227797, + "grad_norm": 1.9496930837631226, + "learning_rate": 7.0175e-05, + "loss": 0.6311, + "step": 14043 + }, + { + "epoch": 0.7864262515399261, + "grad_norm": 1.450932502746582, + "learning_rate": 7.018e-05, + "loss": 0.712, + "step": 14044 + }, + { + "epoch": 0.7864822488520551, + "grad_norm": 1.3744438886642456, + "learning_rate": 7.0185e-05, + "loss": 0.4701, + "step": 14045 + }, + { + "epoch": 0.7865382461641841, + "grad_norm": 1.468982219696045, + "learning_rate": 7.019e-05, + "loss": 0.508, + "step": 14046 + }, + { + "epoch": 0.7865942434763131, + "grad_norm": 1.406665563583374, + "learning_rate": 7.0195e-05, + "loss": 0.4218, + "step": 14047 + }, + { + "epoch": 0.7866502407884421, + "grad_norm": 1.1718058586120605, + "learning_rate": 7.02e-05, + "loss": 0.3639, + "step": 14048 + }, + { + "epoch": 0.7867062381005712, + "grad_norm": 1.7525266408920288, + "learning_rate": 7.0205e-05, + "loss": 0.4124, + "step": 14049 + }, + { + "epoch": 0.7867622354127002, + "grad_norm": 1.2831170558929443, + "learning_rate": 7.021e-05, + "loss": 0.4299, + "step": 14050 + }, + { + "epoch": 0.7868182327248292, + "grad_norm": 1.143329381942749, + "learning_rate": 7.0215e-05, + "loss": 0.386, + "step": 14051 + }, + { + "epoch": 0.7868742300369582, + "grad_norm": 1.5027421712875366, + "learning_rate": 7.022e-05, + "loss": 
0.4731, + "step": 14052 + }, + { + "epoch": 0.7869302273490872, + "grad_norm": 1.4526931047439575, + "learning_rate": 7.022500000000001e-05, + "loss": 0.5402, + "step": 14053 + }, + { + "epoch": 0.7869862246612163, + "grad_norm": 1.1389609575271606, + "learning_rate": 7.023000000000001e-05, + "loss": 0.3371, + "step": 14054 + }, + { + "epoch": 0.7870422219733453, + "grad_norm": 1.5279737710952759, + "learning_rate": 7.023500000000001e-05, + "loss": 0.408, + "step": 14055 + }, + { + "epoch": 0.7870982192854743, + "grad_norm": 1.5773791074752808, + "learning_rate": 7.024e-05, + "loss": 0.5432, + "step": 14056 + }, + { + "epoch": 0.7871542165976033, + "grad_norm": 2.4261014461517334, + "learning_rate": 7.0245e-05, + "loss": 0.471, + "step": 14057 + }, + { + "epoch": 0.7872102139097323, + "grad_norm": 1.2080491781234741, + "learning_rate": 7.025e-05, + "loss": 0.3309, + "step": 14058 + }, + { + "epoch": 0.7872662112218614, + "grad_norm": 1.195871353149414, + "learning_rate": 7.025500000000001e-05, + "loss": 0.4542, + "step": 14059 + }, + { + "epoch": 0.7873222085339904, + "grad_norm": 1.3698365688323975, + "learning_rate": 7.026000000000001e-05, + "loss": 0.5356, + "step": 14060 + }, + { + "epoch": 0.7873782058461194, + "grad_norm": 1.4013510942459106, + "learning_rate": 7.0265e-05, + "loss": 0.5224, + "step": 14061 + }, + { + "epoch": 0.7874342031582484, + "grad_norm": 1.2595869302749634, + "learning_rate": 7.027e-05, + "loss": 0.5053, + "step": 14062 + }, + { + "epoch": 0.7874902004703774, + "grad_norm": 1.0256685018539429, + "learning_rate": 7.0275e-05, + "loss": 0.4091, + "step": 14063 + }, + { + "epoch": 0.7875461977825065, + "grad_norm": 1.2182501554489136, + "learning_rate": 7.028e-05, + "loss": 0.4386, + "step": 14064 + }, + { + "epoch": 0.7876021950946355, + "grad_norm": 1.3985791206359863, + "learning_rate": 7.0285e-05, + "loss": 0.4712, + "step": 14065 + }, + { + "epoch": 0.7876581924067645, + "grad_norm": 1.2584521770477295, + "learning_rate": 7.029e-05, + 
"loss": 0.4653, + "step": 14066 + }, + { + "epoch": 0.7877141897188935, + "grad_norm": 1.3298203945159912, + "learning_rate": 7.0295e-05, + "loss": 0.576, + "step": 14067 + }, + { + "epoch": 0.7877701870310225, + "grad_norm": 1.344087839126587, + "learning_rate": 7.03e-05, + "loss": 0.442, + "step": 14068 + }, + { + "epoch": 0.7878261843431515, + "grad_norm": 1.3564358949661255, + "learning_rate": 7.0305e-05, + "loss": 0.449, + "step": 14069 + }, + { + "epoch": 0.7878821816552806, + "grad_norm": 1.4164124727249146, + "learning_rate": 7.031e-05, + "loss": 0.4238, + "step": 14070 + }, + { + "epoch": 0.7879381789674096, + "grad_norm": 1.408128261566162, + "learning_rate": 7.031500000000001e-05, + "loss": 0.4914, + "step": 14071 + }, + { + "epoch": 0.7879941762795386, + "grad_norm": 1.1966450214385986, + "learning_rate": 7.032e-05, + "loss": 0.3067, + "step": 14072 + }, + { + "epoch": 0.7880501735916676, + "grad_norm": 1.1928439140319824, + "learning_rate": 7.0325e-05, + "loss": 0.3834, + "step": 14073 + }, + { + "epoch": 0.7881061709037966, + "grad_norm": 1.3703701496124268, + "learning_rate": 7.033000000000001e-05, + "loss": 0.5714, + "step": 14074 + }, + { + "epoch": 0.7881621682159257, + "grad_norm": 1.4749839305877686, + "learning_rate": 7.033500000000001e-05, + "loss": 0.4225, + "step": 14075 + }, + { + "epoch": 0.7882181655280547, + "grad_norm": 1.267511010169983, + "learning_rate": 7.034000000000001e-05, + "loss": 0.511, + "step": 14076 + }, + { + "epoch": 0.7882741628401837, + "grad_norm": 1.5715607404708862, + "learning_rate": 7.0345e-05, + "loss": 0.4602, + "step": 14077 + }, + { + "epoch": 0.7883301601523127, + "grad_norm": 1.459873080253601, + "learning_rate": 7.035e-05, + "loss": 0.578, + "step": 14078 + }, + { + "epoch": 0.7883861574644417, + "grad_norm": 1.2043129205703735, + "learning_rate": 7.0355e-05, + "loss": 0.3964, + "step": 14079 + }, + { + "epoch": 0.7884421547765708, + "grad_norm": 1.171628713607788, + "learning_rate": 7.036e-05, + "loss": 
0.4532, + "step": 14080 + }, + { + "epoch": 0.7884981520886998, + "grad_norm": 1.5888665914535522, + "learning_rate": 7.036500000000001e-05, + "loss": 0.4802, + "step": 14081 + }, + { + "epoch": 0.7885541494008288, + "grad_norm": 1.1802852153778076, + "learning_rate": 7.037e-05, + "loss": 0.3877, + "step": 14082 + }, + { + "epoch": 0.7886101467129578, + "grad_norm": 1.3492436408996582, + "learning_rate": 7.0375e-05, + "loss": 0.4503, + "step": 14083 + }, + { + "epoch": 0.7886661440250868, + "grad_norm": 3.586104393005371, + "learning_rate": 7.038e-05, + "loss": 0.3421, + "step": 14084 + }, + { + "epoch": 0.7887221413372159, + "grad_norm": 1.3017324209213257, + "learning_rate": 7.0385e-05, + "loss": 0.367, + "step": 14085 + }, + { + "epoch": 0.7887781386493449, + "grad_norm": 1.4720596075057983, + "learning_rate": 7.039e-05, + "loss": 0.47, + "step": 14086 + }, + { + "epoch": 0.7888341359614739, + "grad_norm": 1.2844620943069458, + "learning_rate": 7.0395e-05, + "loss": 0.3905, + "step": 14087 + }, + { + "epoch": 0.7888901332736029, + "grad_norm": 1.5005204677581787, + "learning_rate": 7.04e-05, + "loss": 0.6698, + "step": 14088 + }, + { + "epoch": 0.7889461305857319, + "grad_norm": 1.1908706426620483, + "learning_rate": 7.0405e-05, + "loss": 0.4621, + "step": 14089 + }, + { + "epoch": 0.789002127897861, + "grad_norm": 1.265588641166687, + "learning_rate": 7.041e-05, + "loss": 0.422, + "step": 14090 + }, + { + "epoch": 0.78905812520999, + "grad_norm": 1.56885826587677, + "learning_rate": 7.041500000000001e-05, + "loss": 0.4852, + "step": 14091 + }, + { + "epoch": 0.789114122522119, + "grad_norm": 1.1783219575881958, + "learning_rate": 7.042000000000001e-05, + "loss": 0.4621, + "step": 14092 + }, + { + "epoch": 0.789170119834248, + "grad_norm": 1.3888453245162964, + "learning_rate": 7.0425e-05, + "loss": 0.3956, + "step": 14093 + }, + { + "epoch": 0.789226117146377, + "grad_norm": 1.3621768951416016, + "learning_rate": 7.043e-05, + "loss": 0.6413, + "step": 14094 + 
}, + { + "epoch": 0.789282114458506, + "grad_norm": 1.1520731449127197, + "learning_rate": 7.043500000000001e-05, + "loss": 0.353, + "step": 14095 + }, + { + "epoch": 0.7893381117706351, + "grad_norm": 1.5501999855041504, + "learning_rate": 7.044000000000001e-05, + "loss": 0.5938, + "step": 14096 + }, + { + "epoch": 0.789394109082764, + "grad_norm": 1.583893895149231, + "learning_rate": 7.044500000000001e-05, + "loss": 0.3894, + "step": 14097 + }, + { + "epoch": 0.789450106394893, + "grad_norm": 1.5419281721115112, + "learning_rate": 7.045e-05, + "loss": 0.6311, + "step": 14098 + }, + { + "epoch": 0.789506103707022, + "grad_norm": 1.2712949514389038, + "learning_rate": 7.0455e-05, + "loss": 0.5394, + "step": 14099 + }, + { + "epoch": 0.789562101019151, + "grad_norm": 1.4368669986724854, + "learning_rate": 7.046e-05, + "loss": 0.4303, + "step": 14100 + }, + { + "epoch": 0.78961809833128, + "grad_norm": 1.144955039024353, + "learning_rate": 7.0465e-05, + "loss": 0.4121, + "step": 14101 + }, + { + "epoch": 0.7896740956434091, + "grad_norm": 7.278684139251709, + "learning_rate": 7.047000000000001e-05, + "loss": 0.3975, + "step": 14102 + }, + { + "epoch": 0.7897300929555381, + "grad_norm": 1.2687106132507324, + "learning_rate": 7.0475e-05, + "loss": 0.3977, + "step": 14103 + }, + { + "epoch": 0.7897860902676671, + "grad_norm": 1.5099725723266602, + "learning_rate": 7.048e-05, + "loss": 0.4205, + "step": 14104 + }, + { + "epoch": 0.7898420875797961, + "grad_norm": 1.4072988033294678, + "learning_rate": 7.0485e-05, + "loss": 0.406, + "step": 14105 + }, + { + "epoch": 0.7898980848919251, + "grad_norm": 1.2100664377212524, + "learning_rate": 7.049e-05, + "loss": 0.318, + "step": 14106 + }, + { + "epoch": 0.7899540822040542, + "grad_norm": 1.2987401485443115, + "learning_rate": 7.0495e-05, + "loss": 0.4051, + "step": 14107 + }, + { + "epoch": 0.7900100795161832, + "grad_norm": 1.3227951526641846, + "learning_rate": 7.05e-05, + "loss": 0.4309, + "step": 14108 + }, + { + 
"epoch": 0.7900660768283122, + "grad_norm": 1.254512071609497, + "learning_rate": 7.0505e-05, + "loss": 0.4531, + "step": 14109 + }, + { + "epoch": 0.7901220741404412, + "grad_norm": 1.2160489559173584, + "learning_rate": 7.051e-05, + "loss": 0.5015, + "step": 14110 + }, + { + "epoch": 0.7901780714525702, + "grad_norm": 1.265352725982666, + "learning_rate": 7.051500000000001e-05, + "loss": 0.3967, + "step": 14111 + }, + { + "epoch": 0.7902340687646993, + "grad_norm": 1.3767404556274414, + "learning_rate": 7.052000000000001e-05, + "loss": 0.4449, + "step": 14112 + }, + { + "epoch": 0.7902900660768283, + "grad_norm": 1.5246636867523193, + "learning_rate": 7.0525e-05, + "loss": 0.5072, + "step": 14113 + }, + { + "epoch": 0.7903460633889573, + "grad_norm": 1.4679063558578491, + "learning_rate": 7.053e-05, + "loss": 0.5503, + "step": 14114 + }, + { + "epoch": 0.7904020607010863, + "grad_norm": 1.1133171319961548, + "learning_rate": 7.0535e-05, + "loss": 0.3749, + "step": 14115 + }, + { + "epoch": 0.7904580580132153, + "grad_norm": 1.1903791427612305, + "learning_rate": 7.054000000000001e-05, + "loss": 0.4349, + "step": 14116 + }, + { + "epoch": 0.7905140553253444, + "grad_norm": 1.286184310913086, + "learning_rate": 7.054500000000001e-05, + "loss": 0.3713, + "step": 14117 + }, + { + "epoch": 0.7905700526374734, + "grad_norm": 1.1068525314331055, + "learning_rate": 7.055000000000001e-05, + "loss": 0.3936, + "step": 14118 + }, + { + "epoch": 0.7906260499496024, + "grad_norm": 1.2849111557006836, + "learning_rate": 7.0555e-05, + "loss": 0.4914, + "step": 14119 + }, + { + "epoch": 0.7906820472617314, + "grad_norm": 1.794370174407959, + "learning_rate": 7.056e-05, + "loss": 0.5837, + "step": 14120 + }, + { + "epoch": 0.7907380445738604, + "grad_norm": 1.2527025938034058, + "learning_rate": 7.0565e-05, + "loss": 0.3589, + "step": 14121 + }, + { + "epoch": 0.7907940418859895, + "grad_norm": 1.3123115301132202, + "learning_rate": 7.057e-05, + "loss": 0.4757, + "step": 14122 + 
}, + { + "epoch": 0.7908500391981185, + "grad_norm": 1.499796986579895, + "learning_rate": 7.057500000000001e-05, + "loss": 0.5368, + "step": 14123 + }, + { + "epoch": 0.7909060365102475, + "grad_norm": 1.1614887714385986, + "learning_rate": 7.058e-05, + "loss": 0.5316, + "step": 14124 + }, + { + "epoch": 0.7909620338223765, + "grad_norm": 1.4116251468658447, + "learning_rate": 7.0585e-05, + "loss": 0.5434, + "step": 14125 + }, + { + "epoch": 0.7910180311345055, + "grad_norm": 1.3411842584609985, + "learning_rate": 7.059e-05, + "loss": 0.407, + "step": 14126 + }, + { + "epoch": 0.7910740284466345, + "grad_norm": 1.2646327018737793, + "learning_rate": 7.0595e-05, + "loss": 0.4557, + "step": 14127 + }, + { + "epoch": 0.7911300257587636, + "grad_norm": 1.333034634590149, + "learning_rate": 7.06e-05, + "loss": 0.3658, + "step": 14128 + }, + { + "epoch": 0.7911860230708926, + "grad_norm": 1.3084169626235962, + "learning_rate": 7.060499999999999e-05, + "loss": 0.4472, + "step": 14129 + }, + { + "epoch": 0.7912420203830216, + "grad_norm": 1.3331811428070068, + "learning_rate": 7.061e-05, + "loss": 0.4248, + "step": 14130 + }, + { + "epoch": 0.7912980176951506, + "grad_norm": 1.2928749322891235, + "learning_rate": 7.061500000000001e-05, + "loss": 0.3972, + "step": 14131 + }, + { + "epoch": 0.7913540150072796, + "grad_norm": 1.325481653213501, + "learning_rate": 7.062000000000001e-05, + "loss": 0.3963, + "step": 14132 + }, + { + "epoch": 0.7914100123194087, + "grad_norm": 1.6766866445541382, + "learning_rate": 7.062500000000001e-05, + "loss": 0.5129, + "step": 14133 + }, + { + "epoch": 0.7914660096315377, + "grad_norm": 1.099776268005371, + "learning_rate": 7.063e-05, + "loss": 0.4562, + "step": 14134 + }, + { + "epoch": 0.7915220069436667, + "grad_norm": 1.5759435892105103, + "learning_rate": 7.0635e-05, + "loss": 0.4211, + "step": 14135 + }, + { + "epoch": 0.7915780042557957, + "grad_norm": 1.4366590976715088, + "learning_rate": 7.064e-05, + "loss": 0.5245, + "step": 
14136 + }, + { + "epoch": 0.7916340015679247, + "grad_norm": 1.5051610469818115, + "learning_rate": 7.064500000000001e-05, + "loss": 0.6648, + "step": 14137 + }, + { + "epoch": 0.7916899988800538, + "grad_norm": 1.4565407037734985, + "learning_rate": 7.065000000000001e-05, + "loss": 0.4574, + "step": 14138 + }, + { + "epoch": 0.7917459961921828, + "grad_norm": 1.1879507303237915, + "learning_rate": 7.065500000000001e-05, + "loss": 0.4324, + "step": 14139 + }, + { + "epoch": 0.7918019935043118, + "grad_norm": 1.3431316614151, + "learning_rate": 7.066e-05, + "loss": 0.5131, + "step": 14140 + }, + { + "epoch": 0.7918579908164408, + "grad_norm": 1.2967579364776611, + "learning_rate": 7.0665e-05, + "loss": 0.4321, + "step": 14141 + }, + { + "epoch": 0.7919139881285698, + "grad_norm": 1.5854921340942383, + "learning_rate": 7.067e-05, + "loss": 0.5023, + "step": 14142 + }, + { + "epoch": 0.7919699854406989, + "grad_norm": 1.1517387628555298, + "learning_rate": 7.0675e-05, + "loss": 0.3641, + "step": 14143 + }, + { + "epoch": 0.7920259827528279, + "grad_norm": 1.1762385368347168, + "learning_rate": 7.068000000000001e-05, + "loss": 0.4396, + "step": 14144 + }, + { + "epoch": 0.7920819800649569, + "grad_norm": 1.2819037437438965, + "learning_rate": 7.0685e-05, + "loss": 0.4133, + "step": 14145 + }, + { + "epoch": 0.7921379773770859, + "grad_norm": 1.4618840217590332, + "learning_rate": 7.069e-05, + "loss": 0.3764, + "step": 14146 + }, + { + "epoch": 0.7921939746892149, + "grad_norm": 1.475507140159607, + "learning_rate": 7.0695e-05, + "loss": 0.4936, + "step": 14147 + }, + { + "epoch": 0.792249972001344, + "grad_norm": 1.2078789472579956, + "learning_rate": 7.07e-05, + "loss": 0.3813, + "step": 14148 + }, + { + "epoch": 0.792305969313473, + "grad_norm": 1.1872726678848267, + "learning_rate": 7.0705e-05, + "loss": 0.3415, + "step": 14149 + }, + { + "epoch": 0.792361966625602, + "grad_norm": 1.275092601776123, + "learning_rate": 7.070999999999999e-05, + "loss": 0.5457, + 
"step": 14150 + }, + { + "epoch": 0.792417963937731, + "grad_norm": 1.3667881488800049, + "learning_rate": 7.0715e-05, + "loss": 0.5445, + "step": 14151 + }, + { + "epoch": 0.79247396124986, + "grad_norm": 1.5850286483764648, + "learning_rate": 7.072000000000001e-05, + "loss": 0.7943, + "step": 14152 + }, + { + "epoch": 0.792529958561989, + "grad_norm": 1.4312115907669067, + "learning_rate": 7.072500000000001e-05, + "loss": 0.4663, + "step": 14153 + }, + { + "epoch": 0.7925859558741181, + "grad_norm": 1.304882526397705, + "learning_rate": 7.073000000000001e-05, + "loss": 0.4723, + "step": 14154 + }, + { + "epoch": 0.7926419531862471, + "grad_norm": 1.721720576286316, + "learning_rate": 7.0735e-05, + "loss": 0.4387, + "step": 14155 + }, + { + "epoch": 0.7926979504983761, + "grad_norm": 1.597560167312622, + "learning_rate": 7.074e-05, + "loss": 0.4141, + "step": 14156 + }, + { + "epoch": 0.7927539478105051, + "grad_norm": 1.3952064514160156, + "learning_rate": 7.0745e-05, + "loss": 0.4149, + "step": 14157 + }, + { + "epoch": 0.7928099451226341, + "grad_norm": 1.1094516515731812, + "learning_rate": 7.075e-05, + "loss": 0.318, + "step": 14158 + }, + { + "epoch": 0.7928659424347632, + "grad_norm": 1.298769235610962, + "learning_rate": 7.075500000000001e-05, + "loss": 0.3279, + "step": 14159 + }, + { + "epoch": 0.7929219397468922, + "grad_norm": 1.2508598566055298, + "learning_rate": 7.076000000000001e-05, + "loss": 0.401, + "step": 14160 + }, + { + "epoch": 0.7929779370590212, + "grad_norm": 1.1670008897781372, + "learning_rate": 7.0765e-05, + "loss": 0.355, + "step": 14161 + }, + { + "epoch": 0.7930339343711502, + "grad_norm": 1.7155007123947144, + "learning_rate": 7.077e-05, + "loss": 0.3922, + "step": 14162 + }, + { + "epoch": 0.7930899316832792, + "grad_norm": 1.4616706371307373, + "learning_rate": 7.0775e-05, + "loss": 0.4847, + "step": 14163 + }, + { + "epoch": 0.7931459289954083, + "grad_norm": 1.3160438537597656, + "learning_rate": 7.078e-05, + "loss": 0.4121, + 
"step": 14164 + }, + { + "epoch": 0.7932019263075373, + "grad_norm": 1.2417327165603638, + "learning_rate": 7.078500000000001e-05, + "loss": 0.5756, + "step": 14165 + }, + { + "epoch": 0.7932579236196663, + "grad_norm": 1.224461555480957, + "learning_rate": 7.079e-05, + "loss": 0.451, + "step": 14166 + }, + { + "epoch": 0.7933139209317953, + "grad_norm": 1.3468503952026367, + "learning_rate": 7.0795e-05, + "loss": 0.473, + "step": 14167 + }, + { + "epoch": 0.7933699182439243, + "grad_norm": 1.8887944221496582, + "learning_rate": 7.08e-05, + "loss": 0.4912, + "step": 14168 + }, + { + "epoch": 0.7934259155560534, + "grad_norm": 1.1624226570129395, + "learning_rate": 7.0805e-05, + "loss": 0.4854, + "step": 14169 + }, + { + "epoch": 0.7934819128681824, + "grad_norm": 1.4265356063842773, + "learning_rate": 7.081e-05, + "loss": 0.4913, + "step": 14170 + }, + { + "epoch": 0.7935379101803114, + "grad_norm": 1.379022240638733, + "learning_rate": 7.081499999999999e-05, + "loss": 0.5303, + "step": 14171 + }, + { + "epoch": 0.7935939074924404, + "grad_norm": 1.1742662191390991, + "learning_rate": 7.082e-05, + "loss": 0.3983, + "step": 14172 + }, + { + "epoch": 0.7936499048045694, + "grad_norm": 1.4051592350006104, + "learning_rate": 7.082500000000001e-05, + "loss": 0.3314, + "step": 14173 + }, + { + "epoch": 0.7937059021166984, + "grad_norm": 1.246253252029419, + "learning_rate": 7.083000000000001e-05, + "loss": 0.3705, + "step": 14174 + }, + { + "epoch": 0.7937618994288275, + "grad_norm": 1.3388421535491943, + "learning_rate": 7.083500000000001e-05, + "loss": 0.5268, + "step": 14175 + }, + { + "epoch": 0.7938178967409565, + "grad_norm": 1.4494574069976807, + "learning_rate": 7.084e-05, + "loss": 0.5604, + "step": 14176 + }, + { + "epoch": 0.7938738940530855, + "grad_norm": 1.3175182342529297, + "learning_rate": 7.0845e-05, + "loss": 0.4475, + "step": 14177 + }, + { + "epoch": 0.7939298913652145, + "grad_norm": 1.9641207456588745, + "learning_rate": 7.085e-05, + "loss": 0.391, 
+ "step": 14178 + }, + { + "epoch": 0.7939858886773434, + "grad_norm": 1.4529749155044556, + "learning_rate": 7.0855e-05, + "loss": 0.3616, + "step": 14179 + }, + { + "epoch": 0.7940418859894725, + "grad_norm": 1.5091392993927002, + "learning_rate": 7.086000000000001e-05, + "loss": 0.6272, + "step": 14180 + }, + { + "epoch": 0.7940978833016015, + "grad_norm": 1.480791687965393, + "learning_rate": 7.0865e-05, + "loss": 0.5521, + "step": 14181 + }, + { + "epoch": 0.7941538806137305, + "grad_norm": 1.2192174196243286, + "learning_rate": 7.087e-05, + "loss": 0.4446, + "step": 14182 + }, + { + "epoch": 0.7942098779258595, + "grad_norm": 1.1119846105575562, + "learning_rate": 7.0875e-05, + "loss": 0.432, + "step": 14183 + }, + { + "epoch": 0.7942658752379885, + "grad_norm": 1.2087913751602173, + "learning_rate": 7.088e-05, + "loss": 0.4761, + "step": 14184 + }, + { + "epoch": 0.7943218725501175, + "grad_norm": 1.1879780292510986, + "learning_rate": 7.0885e-05, + "loss": 0.4461, + "step": 14185 + }, + { + "epoch": 0.7943778698622466, + "grad_norm": 2.624023914337158, + "learning_rate": 7.089000000000001e-05, + "loss": 0.5263, + "step": 14186 + }, + { + "epoch": 0.7944338671743756, + "grad_norm": 1.4163124561309814, + "learning_rate": 7.0895e-05, + "loss": 0.4879, + "step": 14187 + }, + { + "epoch": 0.7944898644865046, + "grad_norm": 1.2269207239151, + "learning_rate": 7.09e-05, + "loss": 0.4756, + "step": 14188 + }, + { + "epoch": 0.7945458617986336, + "grad_norm": 1.2559641599655151, + "learning_rate": 7.0905e-05, + "loss": 0.6148, + "step": 14189 + }, + { + "epoch": 0.7946018591107626, + "grad_norm": 1.4977549314498901, + "learning_rate": 7.091e-05, + "loss": 0.594, + "step": 14190 + }, + { + "epoch": 0.7946578564228917, + "grad_norm": 1.6066761016845703, + "learning_rate": 7.0915e-05, + "loss": 0.3459, + "step": 14191 + }, + { + "epoch": 0.7947138537350207, + "grad_norm": 1.1700197458267212, + "learning_rate": 7.092e-05, + "loss": 0.4241, + "step": 14192 + }, + { + 
"epoch": 0.7947698510471497, + "grad_norm": 1.5863022804260254, + "learning_rate": 7.0925e-05, + "loss": 0.7132, + "step": 14193 + }, + { + "epoch": 0.7948258483592787, + "grad_norm": 1.3558669090270996, + "learning_rate": 7.093000000000001e-05, + "loss": 0.3699, + "step": 14194 + }, + { + "epoch": 0.7948818456714077, + "grad_norm": 1.8490558862686157, + "learning_rate": 7.093500000000001e-05, + "loss": 0.6204, + "step": 14195 + }, + { + "epoch": 0.7949378429835368, + "grad_norm": 1.3741960525512695, + "learning_rate": 7.094000000000001e-05, + "loss": 0.4074, + "step": 14196 + }, + { + "epoch": 0.7949938402956658, + "grad_norm": 1.4264682531356812, + "learning_rate": 7.0945e-05, + "loss": 0.4239, + "step": 14197 + }, + { + "epoch": 0.7950498376077948, + "grad_norm": 1.9793999195098877, + "learning_rate": 7.095e-05, + "loss": 0.4577, + "step": 14198 + }, + { + "epoch": 0.7951058349199238, + "grad_norm": 1.2810734510421753, + "learning_rate": 7.0955e-05, + "loss": 0.4092, + "step": 14199 + }, + { + "epoch": 0.7951618322320528, + "grad_norm": 1.3208116292953491, + "learning_rate": 7.096e-05, + "loss": 0.4284, + "step": 14200 + }, + { + "epoch": 0.7952178295441819, + "grad_norm": 1.63396418094635, + "learning_rate": 7.096500000000001e-05, + "loss": 0.5604, + "step": 14201 + }, + { + "epoch": 0.7952738268563109, + "grad_norm": 1.3054780960083008, + "learning_rate": 7.097e-05, + "loss": 0.4291, + "step": 14202 + }, + { + "epoch": 0.7953298241684399, + "grad_norm": 1.2806179523468018, + "learning_rate": 7.0975e-05, + "loss": 0.4277, + "step": 14203 + }, + { + "epoch": 0.7953858214805689, + "grad_norm": 1.4290262460708618, + "learning_rate": 7.098e-05, + "loss": 0.5616, + "step": 14204 + }, + { + "epoch": 0.7954418187926979, + "grad_norm": 1.2974903583526611, + "learning_rate": 7.0985e-05, + "loss": 0.3513, + "step": 14205 + }, + { + "epoch": 0.795497816104827, + "grad_norm": 1.2196741104125977, + "learning_rate": 7.099e-05, + "loss": 0.4768, + "step": 14206 + }, + { + 
"epoch": 0.795553813416956, + "grad_norm": 2.9551949501037598, + "learning_rate": 7.0995e-05, + "loss": 0.6393, + "step": 14207 + }, + { + "epoch": 0.795609810729085, + "grad_norm": 1.1885533332824707, + "learning_rate": 7.1e-05, + "loss": 0.4344, + "step": 14208 + }, + { + "epoch": 0.795665808041214, + "grad_norm": 1.4281381368637085, + "learning_rate": 7.1005e-05, + "loss": 0.4298, + "step": 14209 + }, + { + "epoch": 0.795721805353343, + "grad_norm": 1.1432418823242188, + "learning_rate": 7.101e-05, + "loss": 0.4367, + "step": 14210 + }, + { + "epoch": 0.795777802665472, + "grad_norm": 1.2060902118682861, + "learning_rate": 7.1015e-05, + "loss": 0.4498, + "step": 14211 + }, + { + "epoch": 0.7958337999776011, + "grad_norm": 1.2184255123138428, + "learning_rate": 7.102000000000001e-05, + "loss": 0.4082, + "step": 14212 + }, + { + "epoch": 0.7958897972897301, + "grad_norm": 1.3108534812927246, + "learning_rate": 7.1025e-05, + "loss": 0.4099, + "step": 14213 + }, + { + "epoch": 0.7959457946018591, + "grad_norm": 1.232239007949829, + "learning_rate": 7.103e-05, + "loss": 0.6333, + "step": 14214 + }, + { + "epoch": 0.7960017919139881, + "grad_norm": 1.3570442199707031, + "learning_rate": 7.103500000000001e-05, + "loss": 0.5427, + "step": 14215 + }, + { + "epoch": 0.7960577892261171, + "grad_norm": 1.299333930015564, + "learning_rate": 7.104000000000001e-05, + "loss": 0.4296, + "step": 14216 + }, + { + "epoch": 0.7961137865382462, + "grad_norm": 1.7934577465057373, + "learning_rate": 7.104500000000001e-05, + "loss": 0.6227, + "step": 14217 + }, + { + "epoch": 0.7961697838503752, + "grad_norm": 1.5457813739776611, + "learning_rate": 7.105e-05, + "loss": 0.4957, + "step": 14218 + }, + { + "epoch": 0.7962257811625042, + "grad_norm": 1.2532612085342407, + "learning_rate": 7.1055e-05, + "loss": 0.3346, + "step": 14219 + }, + { + "epoch": 0.7962817784746332, + "grad_norm": 2.023634910583496, + "learning_rate": 7.106e-05, + "loss": 0.44, + "step": 14220 + }, + { + "epoch": 
0.7963377757867622, + "grad_norm": 6.002133369445801, + "learning_rate": 7.1065e-05, + "loss": 0.4516, + "step": 14221 + }, + { + "epoch": 0.7963937730988913, + "grad_norm": 1.5097990036010742, + "learning_rate": 7.107000000000001e-05, + "loss": 0.5253, + "step": 14222 + }, + { + "epoch": 0.7964497704110203, + "grad_norm": 1.0850706100463867, + "learning_rate": 7.1075e-05, + "loss": 0.3218, + "step": 14223 + }, + { + "epoch": 0.7965057677231493, + "grad_norm": 1.238458514213562, + "learning_rate": 7.108e-05, + "loss": 0.4333, + "step": 14224 + }, + { + "epoch": 0.7965617650352783, + "grad_norm": 1.1130365133285522, + "learning_rate": 7.1085e-05, + "loss": 0.4513, + "step": 14225 + }, + { + "epoch": 0.7966177623474073, + "grad_norm": 1.3274425268173218, + "learning_rate": 7.109e-05, + "loss": 0.4381, + "step": 14226 + }, + { + "epoch": 0.7966737596595364, + "grad_norm": 1.3055750131607056, + "learning_rate": 7.1095e-05, + "loss": 0.4583, + "step": 14227 + }, + { + "epoch": 0.7967297569716654, + "grad_norm": 1.180130958557129, + "learning_rate": 7.11e-05, + "loss": 0.4189, + "step": 14228 + }, + { + "epoch": 0.7967857542837944, + "grad_norm": 1.506287932395935, + "learning_rate": 7.1105e-05, + "loss": 0.5207, + "step": 14229 + }, + { + "epoch": 0.7968417515959234, + "grad_norm": 1.1442244052886963, + "learning_rate": 7.111e-05, + "loss": 0.5126, + "step": 14230 + }, + { + "epoch": 0.7968977489080524, + "grad_norm": 1.3099751472473145, + "learning_rate": 7.1115e-05, + "loss": 0.566, + "step": 14231 + }, + { + "epoch": 0.7969537462201814, + "grad_norm": 1.828741192817688, + "learning_rate": 7.112000000000001e-05, + "loss": 0.3673, + "step": 14232 + }, + { + "epoch": 0.7970097435323105, + "grad_norm": 1.4499508142471313, + "learning_rate": 7.112500000000001e-05, + "loss": 0.5663, + "step": 14233 + }, + { + "epoch": 0.7970657408444395, + "grad_norm": 1.4727813005447388, + "learning_rate": 7.113e-05, + "loss": 0.5223, + "step": 14234 + }, + { + "epoch": 
0.7971217381565685, + "grad_norm": 1.4278827905654907, + "learning_rate": 7.1135e-05, + "loss": 0.5051, + "step": 14235 + }, + { + "epoch": 0.7971777354686975, + "grad_norm": 2.509204387664795, + "learning_rate": 7.114e-05, + "loss": 0.5877, + "step": 14236 + }, + { + "epoch": 0.7972337327808265, + "grad_norm": 1.3308311700820923, + "learning_rate": 7.114500000000001e-05, + "loss": 0.4522, + "step": 14237 + }, + { + "epoch": 0.7972897300929556, + "grad_norm": 1.5752297639846802, + "learning_rate": 7.115000000000001e-05, + "loss": 0.43, + "step": 14238 + }, + { + "epoch": 0.7973457274050846, + "grad_norm": 2.0403895378112793, + "learning_rate": 7.1155e-05, + "loss": 0.5288, + "step": 14239 + }, + { + "epoch": 0.7974017247172136, + "grad_norm": 1.2725965976715088, + "learning_rate": 7.116e-05, + "loss": 0.5799, + "step": 14240 + }, + { + "epoch": 0.7974577220293426, + "grad_norm": 1.2783291339874268, + "learning_rate": 7.1165e-05, + "loss": 0.4713, + "step": 14241 + }, + { + "epoch": 0.7975137193414716, + "grad_norm": 1.2267341613769531, + "learning_rate": 7.117e-05, + "loss": 0.3125, + "step": 14242 + }, + { + "epoch": 0.7975697166536007, + "grad_norm": 1.5969126224517822, + "learning_rate": 7.117500000000001e-05, + "loss": 0.4726, + "step": 14243 + }, + { + "epoch": 0.7976257139657297, + "grad_norm": 1.3979538679122925, + "learning_rate": 7.118e-05, + "loss": 0.3592, + "step": 14244 + }, + { + "epoch": 0.7976817112778587, + "grad_norm": 1.6626211404800415, + "learning_rate": 7.1185e-05, + "loss": 0.4393, + "step": 14245 + }, + { + "epoch": 0.7977377085899877, + "grad_norm": 1.2637313604354858, + "learning_rate": 7.119e-05, + "loss": 0.3928, + "step": 14246 + }, + { + "epoch": 0.7977937059021167, + "grad_norm": 1.3991094827651978, + "learning_rate": 7.1195e-05, + "loss": 0.452, + "step": 14247 + }, + { + "epoch": 0.7978497032142458, + "grad_norm": 1.4132726192474365, + "learning_rate": 7.12e-05, + "loss": 0.4239, + "step": 14248 + }, + { + "epoch": 
0.7979057005263748, + "grad_norm": 1.7055772542953491, + "learning_rate": 7.1205e-05, + "loss": 0.6003, + "step": 14249 + }, + { + "epoch": 0.7979616978385038, + "grad_norm": 1.3924280405044556, + "learning_rate": 7.121e-05, + "loss": 0.5031, + "step": 14250 + }, + { + "epoch": 0.7980176951506328, + "grad_norm": 1.4206241369247437, + "learning_rate": 7.1215e-05, + "loss": 0.4005, + "step": 14251 + }, + { + "epoch": 0.7980736924627618, + "grad_norm": 1.3712095022201538, + "learning_rate": 7.122000000000001e-05, + "loss": 0.4737, + "step": 14252 + }, + { + "epoch": 0.7981296897748908, + "grad_norm": 1.3633575439453125, + "learning_rate": 7.122500000000001e-05, + "loss": 0.6029, + "step": 14253 + }, + { + "epoch": 0.7981856870870199, + "grad_norm": 1.168293833732605, + "learning_rate": 7.123000000000001e-05, + "loss": 0.5213, + "step": 14254 + }, + { + "epoch": 0.7982416843991489, + "grad_norm": 1.3674473762512207, + "learning_rate": 7.1235e-05, + "loss": 0.4504, + "step": 14255 + }, + { + "epoch": 0.7982976817112779, + "grad_norm": 1.1504228115081787, + "learning_rate": 7.124e-05, + "loss": 0.431, + "step": 14256 + }, + { + "epoch": 0.7983536790234069, + "grad_norm": 1.3997844457626343, + "learning_rate": 7.1245e-05, + "loss": 0.5008, + "step": 14257 + }, + { + "epoch": 0.798409676335536, + "grad_norm": 1.2385599613189697, + "learning_rate": 7.125000000000001e-05, + "loss": 0.3936, + "step": 14258 + }, + { + "epoch": 0.798465673647665, + "grad_norm": 1.5361371040344238, + "learning_rate": 7.125500000000001e-05, + "loss": 0.3703, + "step": 14259 + }, + { + "epoch": 0.798521670959794, + "grad_norm": 1.2596443891525269, + "learning_rate": 7.126e-05, + "loss": 0.4015, + "step": 14260 + }, + { + "epoch": 0.798577668271923, + "grad_norm": 1.484503149986267, + "learning_rate": 7.1265e-05, + "loss": 0.4397, + "step": 14261 + }, + { + "epoch": 0.7986336655840519, + "grad_norm": 1.2304725646972656, + "learning_rate": 7.127e-05, + "loss": 0.3993, + "step": 14262 + }, + { + 
"epoch": 0.7986896628961809, + "grad_norm": 1.341511607170105, + "learning_rate": 7.1275e-05, + "loss": 0.4711, + "step": 14263 + }, + { + "epoch": 0.79874566020831, + "grad_norm": 1.4946208000183105, + "learning_rate": 7.128000000000001e-05, + "loss": 0.475, + "step": 14264 + }, + { + "epoch": 0.798801657520439, + "grad_norm": 1.28714919090271, + "learning_rate": 7.1285e-05, + "loss": 0.4342, + "step": 14265 + }, + { + "epoch": 0.798857654832568, + "grad_norm": 1.2437433004379272, + "learning_rate": 7.129e-05, + "loss": 0.5012, + "step": 14266 + }, + { + "epoch": 0.798913652144697, + "grad_norm": 1.260019302368164, + "learning_rate": 7.1295e-05, + "loss": 0.3945, + "step": 14267 + }, + { + "epoch": 0.798969649456826, + "grad_norm": 1.257361888885498, + "learning_rate": 7.13e-05, + "loss": 0.4328, + "step": 14268 + }, + { + "epoch": 0.799025646768955, + "grad_norm": 1.4107636213302612, + "learning_rate": 7.1305e-05, + "loss": 0.5142, + "step": 14269 + }, + { + "epoch": 0.7990816440810841, + "grad_norm": 1.4554567337036133, + "learning_rate": 7.130999999999999e-05, + "loss": 0.4655, + "step": 14270 + }, + { + "epoch": 0.7991376413932131, + "grad_norm": 1.271363615989685, + "learning_rate": 7.1315e-05, + "loss": 0.4211, + "step": 14271 + }, + { + "epoch": 0.7991936387053421, + "grad_norm": 1.533981204032898, + "learning_rate": 7.132e-05, + "loss": 0.3774, + "step": 14272 + }, + { + "epoch": 0.7992496360174711, + "grad_norm": 1.338004231452942, + "learning_rate": 7.132500000000001e-05, + "loss": 0.4101, + "step": 14273 + }, + { + "epoch": 0.7993056333296001, + "grad_norm": 1.4239612817764282, + "learning_rate": 7.133000000000001e-05, + "loss": 0.4688, + "step": 14274 + }, + { + "epoch": 0.7993616306417292, + "grad_norm": 1.2443686723709106, + "learning_rate": 7.133500000000001e-05, + "loss": 0.452, + "step": 14275 + }, + { + "epoch": 0.7994176279538582, + "grad_norm": 2.020554780960083, + "learning_rate": 7.134e-05, + "loss": 0.5314, + "step": 14276 + }, + { + 
"epoch": 0.7994736252659872, + "grad_norm": 1.0896626710891724, + "learning_rate": 7.1345e-05, + "loss": 0.4325, + "step": 14277 + }, + { + "epoch": 0.7995296225781162, + "grad_norm": 1.2125056982040405, + "learning_rate": 7.135e-05, + "loss": 0.3837, + "step": 14278 + }, + { + "epoch": 0.7995856198902452, + "grad_norm": 1.316235065460205, + "learning_rate": 7.135500000000001e-05, + "loss": 0.5725, + "step": 14279 + }, + { + "epoch": 0.7996416172023743, + "grad_norm": 1.1320780515670776, + "learning_rate": 7.136000000000001e-05, + "loss": 0.3444, + "step": 14280 + }, + { + "epoch": 0.7996976145145033, + "grad_norm": 1.5037559270858765, + "learning_rate": 7.1365e-05, + "loss": 0.6402, + "step": 14281 + }, + { + "epoch": 0.7997536118266323, + "grad_norm": 1.3142647743225098, + "learning_rate": 7.137e-05, + "loss": 0.4549, + "step": 14282 + }, + { + "epoch": 0.7998096091387613, + "grad_norm": 1.2375869750976562, + "learning_rate": 7.1375e-05, + "loss": 0.3238, + "step": 14283 + }, + { + "epoch": 0.7998656064508903, + "grad_norm": 1.1469448804855347, + "learning_rate": 7.138e-05, + "loss": 0.418, + "step": 14284 + }, + { + "epoch": 0.7999216037630194, + "grad_norm": 1.3749374151229858, + "learning_rate": 7.138500000000001e-05, + "loss": 0.5568, + "step": 14285 + }, + { + "epoch": 0.7999776010751484, + "grad_norm": 1.19504976272583, + "learning_rate": 7.139e-05, + "loss": 0.3862, + "step": 14286 + }, + { + "epoch": 0.8000335983872774, + "grad_norm": 5.354552745819092, + "learning_rate": 7.1395e-05, + "loss": 0.4797, + "step": 14287 + }, + { + "epoch": 0.8000895956994064, + "grad_norm": 1.2926965951919556, + "learning_rate": 7.14e-05, + "loss": 0.3903, + "step": 14288 + }, + { + "epoch": 0.8001455930115354, + "grad_norm": 1.509556770324707, + "learning_rate": 7.1405e-05, + "loss": 0.4057, + "step": 14289 + }, + { + "epoch": 0.8002015903236644, + "grad_norm": 1.476244330406189, + "learning_rate": 7.141e-05, + "loss": 0.5082, + "step": 14290 + }, + { + "epoch": 
0.8002575876357935, + "grad_norm": 1.268317699432373, + "learning_rate": 7.141499999999999e-05, + "loss": 0.4666, + "step": 14291 + }, + { + "epoch": 0.8003135849479225, + "grad_norm": 1.8188954591751099, + "learning_rate": 7.142e-05, + "loss": 0.4773, + "step": 14292 + }, + { + "epoch": 0.8003695822600515, + "grad_norm": 1.49734628200531, + "learning_rate": 7.142500000000001e-05, + "loss": 0.4786, + "step": 14293 + }, + { + "epoch": 0.8004255795721805, + "grad_norm": 2.688525438308716, + "learning_rate": 7.143000000000001e-05, + "loss": 0.4933, + "step": 14294 + }, + { + "epoch": 0.8004815768843095, + "grad_norm": 1.2296005487442017, + "learning_rate": 7.143500000000001e-05, + "loss": 0.3708, + "step": 14295 + }, + { + "epoch": 0.8005375741964386, + "grad_norm": 1.2410680055618286, + "learning_rate": 7.144000000000001e-05, + "loss": 0.4211, + "step": 14296 + }, + { + "epoch": 0.8005935715085676, + "grad_norm": 1.3349018096923828, + "learning_rate": 7.1445e-05, + "loss": 0.5248, + "step": 14297 + }, + { + "epoch": 0.8006495688206966, + "grad_norm": 1.2120205163955688, + "learning_rate": 7.145e-05, + "loss": 0.4427, + "step": 14298 + }, + { + "epoch": 0.8007055661328256, + "grad_norm": 1.176427960395813, + "learning_rate": 7.1455e-05, + "loss": 0.5057, + "step": 14299 + }, + { + "epoch": 0.8007615634449546, + "grad_norm": 1.1894642114639282, + "learning_rate": 7.146000000000001e-05, + "loss": 0.3401, + "step": 14300 + }, + { + "epoch": 0.8008175607570837, + "grad_norm": 1.3587576150894165, + "learning_rate": 7.146500000000001e-05, + "loss": 0.5311, + "step": 14301 + }, + { + "epoch": 0.8008735580692127, + "grad_norm": 1.4819519519805908, + "learning_rate": 7.147e-05, + "loss": 0.6611, + "step": 14302 + }, + { + "epoch": 0.8009295553813417, + "grad_norm": 1.2229000329971313, + "learning_rate": 7.1475e-05, + "loss": 0.42, + "step": 14303 + }, + { + "epoch": 0.8009855526934707, + "grad_norm": 1.2849124670028687, + "learning_rate": 7.148e-05, + "loss": 0.4447, + "step": 
14304 + }, + { + "epoch": 0.8010415500055997, + "grad_norm": 1.5820670127868652, + "learning_rate": 7.1485e-05, + "loss": 0.5244, + "step": 14305 + }, + { + "epoch": 0.8010975473177288, + "grad_norm": 1.4715663194656372, + "learning_rate": 7.149e-05, + "loss": 0.4639, + "step": 14306 + }, + { + "epoch": 0.8011535446298578, + "grad_norm": 1.313481330871582, + "learning_rate": 7.1495e-05, + "loss": 0.341, + "step": 14307 + }, + { + "epoch": 0.8012095419419868, + "grad_norm": 3.2087135314941406, + "learning_rate": 7.15e-05, + "loss": 0.544, + "step": 14308 + }, + { + "epoch": 0.8012655392541158, + "grad_norm": 1.0600281953811646, + "learning_rate": 7.1505e-05, + "loss": 0.3455, + "step": 14309 + }, + { + "epoch": 0.8013215365662448, + "grad_norm": 1.1321611404418945, + "learning_rate": 7.151e-05, + "loss": 0.4154, + "step": 14310 + }, + { + "epoch": 0.8013775338783738, + "grad_norm": 1.4016467332839966, + "learning_rate": 7.1515e-05, + "loss": 0.4927, + "step": 14311 + }, + { + "epoch": 0.8014335311905029, + "grad_norm": 1.1130542755126953, + "learning_rate": 7.151999999999999e-05, + "loss": 0.3972, + "step": 14312 + }, + { + "epoch": 0.8014895285026319, + "grad_norm": 1.4507575035095215, + "learning_rate": 7.1525e-05, + "loss": 0.399, + "step": 14313 + }, + { + "epoch": 0.8015455258147609, + "grad_norm": 1.2351462841033936, + "learning_rate": 7.153000000000001e-05, + "loss": 0.3804, + "step": 14314 + }, + { + "epoch": 0.8016015231268899, + "grad_norm": 1.241011381149292, + "learning_rate": 7.153500000000001e-05, + "loss": 0.4423, + "step": 14315 + }, + { + "epoch": 0.801657520439019, + "grad_norm": 1.063472032546997, + "learning_rate": 7.154000000000001e-05, + "loss": 0.2882, + "step": 14316 + }, + { + "epoch": 0.801713517751148, + "grad_norm": 1.2055721282958984, + "learning_rate": 7.154500000000001e-05, + "loss": 0.4239, + "step": 14317 + }, + { + "epoch": 0.801769515063277, + "grad_norm": 1.5081279277801514, + "learning_rate": 7.155e-05, + "loss": 0.413, + "step": 
14318 + }, + { + "epoch": 0.801825512375406, + "grad_norm": 1.3589757680892944, + "learning_rate": 7.1555e-05, + "loss": 0.407, + "step": 14319 + }, + { + "epoch": 0.801881509687535, + "grad_norm": 1.2902450561523438, + "learning_rate": 7.156e-05, + "loss": 0.3748, + "step": 14320 + }, + { + "epoch": 0.801937506999664, + "grad_norm": 1.3496792316436768, + "learning_rate": 7.156500000000001e-05, + "loss": 0.3717, + "step": 14321 + }, + { + "epoch": 0.8019935043117931, + "grad_norm": 1.3816170692443848, + "learning_rate": 7.157000000000001e-05, + "loss": 0.3834, + "step": 14322 + }, + { + "epoch": 0.8020495016239221, + "grad_norm": 1.2812403440475464, + "learning_rate": 7.1575e-05, + "loss": 0.4117, + "step": 14323 + }, + { + "epoch": 0.8021054989360511, + "grad_norm": 1.4883240461349487, + "learning_rate": 7.158e-05, + "loss": 0.4894, + "step": 14324 + }, + { + "epoch": 0.8021614962481801, + "grad_norm": 1.299131989479065, + "learning_rate": 7.1585e-05, + "loss": 0.5771, + "step": 14325 + }, + { + "epoch": 0.8022174935603091, + "grad_norm": 1.1620566844940186, + "learning_rate": 7.159e-05, + "loss": 0.5, + "step": 14326 + }, + { + "epoch": 0.8022734908724382, + "grad_norm": 1.3025226593017578, + "learning_rate": 7.1595e-05, + "loss": 0.4366, + "step": 14327 + }, + { + "epoch": 0.8023294881845672, + "grad_norm": 1.404446005821228, + "learning_rate": 7.16e-05, + "loss": 0.7016, + "step": 14328 + }, + { + "epoch": 0.8023854854966962, + "grad_norm": 1.3371330499649048, + "learning_rate": 7.1605e-05, + "loss": 0.5923, + "step": 14329 + }, + { + "epoch": 0.8024414828088252, + "grad_norm": 1.2328046560287476, + "learning_rate": 7.161e-05, + "loss": 0.4832, + "step": 14330 + }, + { + "epoch": 0.8024974801209542, + "grad_norm": 1.2135400772094727, + "learning_rate": 7.1615e-05, + "loss": 0.39, + "step": 14331 + }, + { + "epoch": 0.8025534774330833, + "grad_norm": 1.1489354372024536, + "learning_rate": 7.162e-05, + "loss": 0.4436, + "step": 14332 + }, + { + "epoch": 
0.8026094747452123, + "grad_norm": 1.3100652694702148, + "learning_rate": 7.1625e-05, + "loss": 0.6064, + "step": 14333 + }, + { + "epoch": 0.8026654720573413, + "grad_norm": 1.2295804023742676, + "learning_rate": 7.163e-05, + "loss": 0.3946, + "step": 14334 + }, + { + "epoch": 0.8027214693694703, + "grad_norm": 3.268707513809204, + "learning_rate": 7.1635e-05, + "loss": 0.4619, + "step": 14335 + }, + { + "epoch": 0.8027774666815993, + "grad_norm": 1.4066771268844604, + "learning_rate": 7.164000000000001e-05, + "loss": 0.5977, + "step": 14336 + }, + { + "epoch": 0.8028334639937283, + "grad_norm": 1.3116049766540527, + "learning_rate": 7.164500000000001e-05, + "loss": 0.4666, + "step": 14337 + }, + { + "epoch": 0.8028894613058574, + "grad_norm": 1.5711578130722046, + "learning_rate": 7.165000000000001e-05, + "loss": 0.4854, + "step": 14338 + }, + { + "epoch": 0.8029454586179864, + "grad_norm": 1.2447172403335571, + "learning_rate": 7.1655e-05, + "loss": 0.4169, + "step": 14339 + }, + { + "epoch": 0.8030014559301154, + "grad_norm": 1.405240535736084, + "learning_rate": 7.166e-05, + "loss": 0.7066, + "step": 14340 + }, + { + "epoch": 0.8030574532422444, + "grad_norm": 1.3452883958816528, + "learning_rate": 7.1665e-05, + "loss": 0.4707, + "step": 14341 + }, + { + "epoch": 0.8031134505543734, + "grad_norm": 1.3635317087173462, + "learning_rate": 7.167000000000001e-05, + "loss": 0.448, + "step": 14342 + }, + { + "epoch": 0.8031694478665025, + "grad_norm": 1.2283295392990112, + "learning_rate": 7.167500000000001e-05, + "loss": 0.4203, + "step": 14343 + }, + { + "epoch": 0.8032254451786315, + "grad_norm": 1.4867265224456787, + "learning_rate": 7.168e-05, + "loss": 0.4884, + "step": 14344 + }, + { + "epoch": 0.8032814424907604, + "grad_norm": 1.251662015914917, + "learning_rate": 7.1685e-05, + "loss": 0.4322, + "step": 14345 + }, + { + "epoch": 0.8033374398028894, + "grad_norm": 1.4250129461288452, + "learning_rate": 7.169e-05, + "loss": 0.5191, + "step": 14346 + }, + { + 
"epoch": 0.8033934371150184, + "grad_norm": 1.3150770664215088, + "learning_rate": 7.1695e-05, + "loss": 0.4644, + "step": 14347 + }, + { + "epoch": 0.8034494344271474, + "grad_norm": 1.645158052444458, + "learning_rate": 7.17e-05, + "loss": 0.4428, + "step": 14348 + }, + { + "epoch": 0.8035054317392765, + "grad_norm": 1.4586005210876465, + "learning_rate": 7.1705e-05, + "loss": 0.5313, + "step": 14349 + }, + { + "epoch": 0.8035614290514055, + "grad_norm": 1.369107961654663, + "learning_rate": 7.171e-05, + "loss": 0.4005, + "step": 14350 + }, + { + "epoch": 0.8036174263635345, + "grad_norm": 1.3012665510177612, + "learning_rate": 7.1715e-05, + "loss": 0.5529, + "step": 14351 + }, + { + "epoch": 0.8036734236756635, + "grad_norm": 1.325852632522583, + "learning_rate": 7.172e-05, + "loss": 0.3732, + "step": 14352 + }, + { + "epoch": 0.8037294209877925, + "grad_norm": 1.499667763710022, + "learning_rate": 7.172500000000001e-05, + "loss": 0.4682, + "step": 14353 + }, + { + "epoch": 0.8037854182999216, + "grad_norm": 1.3996325731277466, + "learning_rate": 7.173e-05, + "loss": 0.4142, + "step": 14354 + }, + { + "epoch": 0.8038414156120506, + "grad_norm": 1.36122465133667, + "learning_rate": 7.1735e-05, + "loss": 0.4767, + "step": 14355 + }, + { + "epoch": 0.8038974129241796, + "grad_norm": 1.3188210725784302, + "learning_rate": 7.174e-05, + "loss": 0.5409, + "step": 14356 + }, + { + "epoch": 0.8039534102363086, + "grad_norm": 1.2389452457427979, + "learning_rate": 7.174500000000001e-05, + "loss": 0.4215, + "step": 14357 + }, + { + "epoch": 0.8040094075484376, + "grad_norm": 1.310027003288269, + "learning_rate": 7.175000000000001e-05, + "loss": 0.4265, + "step": 14358 + }, + { + "epoch": 0.8040654048605667, + "grad_norm": 1.260506272315979, + "learning_rate": 7.1755e-05, + "loss": 0.398, + "step": 14359 + }, + { + "epoch": 0.8041214021726957, + "grad_norm": 1.1169949769973755, + "learning_rate": 7.176e-05, + "loss": 0.4912, + "step": 14360 + }, + { + "epoch": 
0.8041773994848247, + "grad_norm": 1.4652414321899414, + "learning_rate": 7.1765e-05, + "loss": 0.4644, + "step": 14361 + }, + { + "epoch": 0.8042333967969537, + "grad_norm": 1.5234915018081665, + "learning_rate": 7.177e-05, + "loss": 0.5104, + "step": 14362 + }, + { + "epoch": 0.8042893941090827, + "grad_norm": 1.2871246337890625, + "learning_rate": 7.177500000000001e-05, + "loss": 0.3849, + "step": 14363 + }, + { + "epoch": 0.8043453914212118, + "grad_norm": 1.1576374769210815, + "learning_rate": 7.178000000000001e-05, + "loss": 0.4857, + "step": 14364 + }, + { + "epoch": 0.8044013887333408, + "grad_norm": 1.2182807922363281, + "learning_rate": 7.1785e-05, + "loss": 0.4108, + "step": 14365 + }, + { + "epoch": 0.8044573860454698, + "grad_norm": 1.3278043270111084, + "learning_rate": 7.179e-05, + "loss": 0.379, + "step": 14366 + }, + { + "epoch": 0.8045133833575988, + "grad_norm": 1.2018606662750244, + "learning_rate": 7.1795e-05, + "loss": 0.4943, + "step": 14367 + }, + { + "epoch": 0.8045693806697278, + "grad_norm": 1.2104520797729492, + "learning_rate": 7.18e-05, + "loss": 0.4814, + "step": 14368 + }, + { + "epoch": 0.8046253779818568, + "grad_norm": 1.061835765838623, + "learning_rate": 7.1805e-05, + "loss": 0.3731, + "step": 14369 + }, + { + "epoch": 0.8046813752939859, + "grad_norm": 1.4871950149536133, + "learning_rate": 7.181e-05, + "loss": 0.5726, + "step": 14370 + }, + { + "epoch": 0.8047373726061149, + "grad_norm": 1.4260612726211548, + "learning_rate": 7.1815e-05, + "loss": 0.4471, + "step": 14371 + }, + { + "epoch": 0.8047933699182439, + "grad_norm": 1.37614107131958, + "learning_rate": 7.182e-05, + "loss": 0.4413, + "step": 14372 + }, + { + "epoch": 0.8048493672303729, + "grad_norm": 1.2057019472122192, + "learning_rate": 7.182500000000001e-05, + "loss": 0.4537, + "step": 14373 + }, + { + "epoch": 0.8049053645425019, + "grad_norm": 1.2680689096450806, + "learning_rate": 7.183000000000001e-05, + "loss": 0.5552, + "step": 14374 + }, + { + "epoch": 
0.804961361854631, + "grad_norm": 1.451594591140747, + "learning_rate": 7.1835e-05, + "loss": 0.5571, + "step": 14375 + }, + { + "epoch": 0.80501735916676, + "grad_norm": 1.197312355041504, + "learning_rate": 7.184e-05, + "loss": 0.3676, + "step": 14376 + }, + { + "epoch": 0.805073356478889, + "grad_norm": 1.3863509893417358, + "learning_rate": 7.1845e-05, + "loss": 0.4757, + "step": 14377 + }, + { + "epoch": 0.805129353791018, + "grad_norm": 1.2120007276535034, + "learning_rate": 7.185000000000001e-05, + "loss": 0.4002, + "step": 14378 + }, + { + "epoch": 0.805185351103147, + "grad_norm": 1.3701173067092896, + "learning_rate": 7.185500000000001e-05, + "loss": 0.4783, + "step": 14379 + }, + { + "epoch": 0.8052413484152761, + "grad_norm": 1.5542365312576294, + "learning_rate": 7.186e-05, + "loss": 0.3459, + "step": 14380 + }, + { + "epoch": 0.8052973457274051, + "grad_norm": 1.4818055629730225, + "learning_rate": 7.1865e-05, + "loss": 0.5042, + "step": 14381 + }, + { + "epoch": 0.8053533430395341, + "grad_norm": 1.4172481298446655, + "learning_rate": 7.187e-05, + "loss": 0.4043, + "step": 14382 + }, + { + "epoch": 0.8054093403516631, + "grad_norm": 1.235385775566101, + "learning_rate": 7.1875e-05, + "loss": 0.3711, + "step": 14383 + }, + { + "epoch": 0.8054653376637921, + "grad_norm": 1.370848536491394, + "learning_rate": 7.188e-05, + "loss": 0.3752, + "step": 14384 + }, + { + "epoch": 0.8055213349759212, + "grad_norm": 1.4222896099090576, + "learning_rate": 7.188500000000001e-05, + "loss": 0.4127, + "step": 14385 + }, + { + "epoch": 0.8055773322880502, + "grad_norm": 1.5399459600448608, + "learning_rate": 7.189e-05, + "loss": 0.4703, + "step": 14386 + }, + { + "epoch": 0.8056333296001792, + "grad_norm": 1.208827018737793, + "learning_rate": 7.1895e-05, + "loss": 0.4682, + "step": 14387 + }, + { + "epoch": 0.8056893269123082, + "grad_norm": 1.3453835248947144, + "learning_rate": 7.19e-05, + "loss": 0.4047, + "step": 14388 + }, + { + "epoch": 0.8057453242244372, + 
"grad_norm": 1.356747031211853, + "learning_rate": 7.1905e-05, + "loss": 0.6252, + "step": 14389 + }, + { + "epoch": 0.8058013215365663, + "grad_norm": 1.1763570308685303, + "learning_rate": 7.191e-05, + "loss": 0.4417, + "step": 14390 + }, + { + "epoch": 0.8058573188486953, + "grad_norm": 1.4643850326538086, + "learning_rate": 7.1915e-05, + "loss": 0.6276, + "step": 14391 + }, + { + "epoch": 0.8059133161608243, + "grad_norm": 1.474872350692749, + "learning_rate": 7.192e-05, + "loss": 0.4014, + "step": 14392 + }, + { + "epoch": 0.8059693134729533, + "grad_norm": 1.3869831562042236, + "learning_rate": 7.1925e-05, + "loss": 0.4205, + "step": 14393 + }, + { + "epoch": 0.8060253107850823, + "grad_norm": 1.3573460578918457, + "learning_rate": 7.193000000000001e-05, + "loss": 0.4208, + "step": 14394 + }, + { + "epoch": 0.8060813080972113, + "grad_norm": 1.432698130607605, + "learning_rate": 7.193500000000001e-05, + "loss": 0.4238, + "step": 14395 + }, + { + "epoch": 0.8061373054093404, + "grad_norm": 1.7275192737579346, + "learning_rate": 7.194e-05, + "loss": 0.4974, + "step": 14396 + }, + { + "epoch": 0.8061933027214694, + "grad_norm": 1.3129981756210327, + "learning_rate": 7.1945e-05, + "loss": 0.4118, + "step": 14397 + }, + { + "epoch": 0.8062493000335984, + "grad_norm": 1.3502404689788818, + "learning_rate": 7.195e-05, + "loss": 0.4085, + "step": 14398 + }, + { + "epoch": 0.8063052973457274, + "grad_norm": 1.3459618091583252, + "learning_rate": 7.195500000000001e-05, + "loss": 0.4328, + "step": 14399 + }, + { + "epoch": 0.8063612946578564, + "grad_norm": 1.3390995264053345, + "learning_rate": 7.196000000000001e-05, + "loss": 0.3985, + "step": 14400 + }, + { + "epoch": 0.8064172919699855, + "grad_norm": 1.316535472869873, + "learning_rate": 7.1965e-05, + "loss": 0.4415, + "step": 14401 + }, + { + "epoch": 0.8064732892821145, + "grad_norm": 1.365905523300171, + "learning_rate": 7.197e-05, + "loss": 0.4783, + "step": 14402 + }, + { + "epoch": 0.8065292865942435, + 
"grad_norm": 1.3806086778640747, + "learning_rate": 7.1975e-05, + "loss": 0.4347, + "step": 14403 + }, + { + "epoch": 0.8065852839063725, + "grad_norm": 1.2446452379226685, + "learning_rate": 7.198e-05, + "loss": 0.5143, + "step": 14404 + }, + { + "epoch": 0.8066412812185015, + "grad_norm": 1.1861881017684937, + "learning_rate": 7.1985e-05, + "loss": 0.3499, + "step": 14405 + }, + { + "epoch": 0.8066972785306306, + "grad_norm": 1.2290480136871338, + "learning_rate": 7.199000000000001e-05, + "loss": 0.397, + "step": 14406 + }, + { + "epoch": 0.8067532758427596, + "grad_norm": 1.3085788488388062, + "learning_rate": 7.1995e-05, + "loss": 0.3467, + "step": 14407 + }, + { + "epoch": 0.8068092731548886, + "grad_norm": 1.5729981660842896, + "learning_rate": 7.2e-05, + "loss": 0.4784, + "step": 14408 + }, + { + "epoch": 0.8068652704670176, + "grad_norm": 1.4126801490783691, + "learning_rate": 7.2005e-05, + "loss": 0.4939, + "step": 14409 + }, + { + "epoch": 0.8069212677791466, + "grad_norm": 1.1600823402404785, + "learning_rate": 7.201e-05, + "loss": 0.4275, + "step": 14410 + }, + { + "epoch": 0.8069772650912757, + "grad_norm": 1.2501877546310425, + "learning_rate": 7.2015e-05, + "loss": 0.3635, + "step": 14411 + }, + { + "epoch": 0.8070332624034047, + "grad_norm": 1.3730149269104004, + "learning_rate": 7.202e-05, + "loss": 0.4545, + "step": 14412 + }, + { + "epoch": 0.8070892597155337, + "grad_norm": 1.0997185707092285, + "learning_rate": 7.2025e-05, + "loss": 0.3805, + "step": 14413 + }, + { + "epoch": 0.8071452570276627, + "grad_norm": 1.1816157102584839, + "learning_rate": 7.203000000000001e-05, + "loss": 0.4126, + "step": 14414 + }, + { + "epoch": 0.8072012543397917, + "grad_norm": 1.4850237369537354, + "learning_rate": 7.203500000000001e-05, + "loss": 0.4057, + "step": 14415 + }, + { + "epoch": 0.8072572516519207, + "grad_norm": 1.332733154296875, + "learning_rate": 7.204000000000001e-05, + "loss": 0.6524, + "step": 14416 + }, + { + "epoch": 0.8073132489640498, + 
"grad_norm": 1.2679888010025024, + "learning_rate": 7.2045e-05, + "loss": 0.6952, + "step": 14417 + }, + { + "epoch": 0.8073692462761788, + "grad_norm": 1.5113410949707031, + "learning_rate": 7.205e-05, + "loss": 0.5025, + "step": 14418 + }, + { + "epoch": 0.8074252435883078, + "grad_norm": 1.4489210844039917, + "learning_rate": 7.2055e-05, + "loss": 0.5082, + "step": 14419 + }, + { + "epoch": 0.8074812409004368, + "grad_norm": 2.1691977977752686, + "learning_rate": 7.206000000000001e-05, + "loss": 0.6638, + "step": 14420 + }, + { + "epoch": 0.8075372382125658, + "grad_norm": 1.3792576789855957, + "learning_rate": 7.206500000000001e-05, + "loss": 0.4289, + "step": 14421 + }, + { + "epoch": 0.8075932355246949, + "grad_norm": 3.897068500518799, + "learning_rate": 7.207e-05, + "loss": 0.417, + "step": 14422 + }, + { + "epoch": 0.8076492328368239, + "grad_norm": 1.4487427473068237, + "learning_rate": 7.2075e-05, + "loss": 0.5249, + "step": 14423 + }, + { + "epoch": 0.8077052301489529, + "grad_norm": 1.3798766136169434, + "learning_rate": 7.208e-05, + "loss": 0.4358, + "step": 14424 + }, + { + "epoch": 0.8077612274610819, + "grad_norm": 1.737931251525879, + "learning_rate": 7.2085e-05, + "loss": 0.4121, + "step": 14425 + }, + { + "epoch": 0.8078172247732109, + "grad_norm": 1.1489999294281006, + "learning_rate": 7.209e-05, + "loss": 0.4085, + "step": 14426 + }, + { + "epoch": 0.8078732220853398, + "grad_norm": 1.190069556236267, + "learning_rate": 7.209500000000001e-05, + "loss": 0.5369, + "step": 14427 + }, + { + "epoch": 0.8079292193974689, + "grad_norm": 1.5137462615966797, + "learning_rate": 7.21e-05, + "loss": 0.7515, + "step": 14428 + }, + { + "epoch": 0.8079852167095979, + "grad_norm": 1.2904857397079468, + "learning_rate": 7.2105e-05, + "loss": 0.4236, + "step": 14429 + }, + { + "epoch": 0.8080412140217269, + "grad_norm": 1.3198213577270508, + "learning_rate": 7.211e-05, + "loss": 0.4046, + "step": 14430 + }, + { + "epoch": 0.8080972113338559, + "grad_norm": 
1.3352689743041992, + "learning_rate": 7.2115e-05, + "loss": 0.4601, + "step": 14431 + }, + { + "epoch": 0.8081532086459849, + "grad_norm": 1.275355577468872, + "learning_rate": 7.212e-05, + "loss": 0.4128, + "step": 14432 + }, + { + "epoch": 0.808209205958114, + "grad_norm": 1.2645851373672485, + "learning_rate": 7.2125e-05, + "loss": 0.403, + "step": 14433 + }, + { + "epoch": 0.808265203270243, + "grad_norm": 1.3488813638687134, + "learning_rate": 7.213e-05, + "loss": 0.5088, + "step": 14434 + }, + { + "epoch": 0.808321200582372, + "grad_norm": 1.4230356216430664, + "learning_rate": 7.213500000000001e-05, + "loss": 0.6918, + "step": 14435 + }, + { + "epoch": 0.808377197894501, + "grad_norm": 1.0565516948699951, + "learning_rate": 7.214000000000001e-05, + "loss": 0.3467, + "step": 14436 + }, + { + "epoch": 0.80843319520663, + "grad_norm": 1.523375153541565, + "learning_rate": 7.214500000000001e-05, + "loss": 0.3884, + "step": 14437 + }, + { + "epoch": 0.8084891925187591, + "grad_norm": 1.429703950881958, + "learning_rate": 7.215e-05, + "loss": 0.4894, + "step": 14438 + }, + { + "epoch": 0.8085451898308881, + "grad_norm": 1.278806447982788, + "learning_rate": 7.2155e-05, + "loss": 0.4793, + "step": 14439 + }, + { + "epoch": 0.8086011871430171, + "grad_norm": 1.5889099836349487, + "learning_rate": 7.216e-05, + "loss": 0.5262, + "step": 14440 + }, + { + "epoch": 0.8086571844551461, + "grad_norm": 1.4965639114379883, + "learning_rate": 7.216500000000001e-05, + "loss": 0.4079, + "step": 14441 + }, + { + "epoch": 0.8087131817672751, + "grad_norm": 1.2740325927734375, + "learning_rate": 7.217000000000001e-05, + "loss": 0.3079, + "step": 14442 + }, + { + "epoch": 0.8087691790794042, + "grad_norm": 1.7930535078048706, + "learning_rate": 7.2175e-05, + "loss": 0.4116, + "step": 14443 + }, + { + "epoch": 0.8088251763915332, + "grad_norm": 1.1276098489761353, + "learning_rate": 7.218e-05, + "loss": 0.3954, + "step": 14444 + }, + { + "epoch": 0.8088811737036622, + "grad_norm": 
1.5948141813278198, + "learning_rate": 7.2185e-05, + "loss": 0.5563, + "step": 14445 + }, + { + "epoch": 0.8089371710157912, + "grad_norm": 2.2157845497131348, + "learning_rate": 7.219e-05, + "loss": 0.5056, + "step": 14446 + }, + { + "epoch": 0.8089931683279202, + "grad_norm": 1.2125484943389893, + "learning_rate": 7.2195e-05, + "loss": 0.3818, + "step": 14447 + }, + { + "epoch": 0.8090491656400493, + "grad_norm": 1.572281837463379, + "learning_rate": 7.22e-05, + "loss": 0.4074, + "step": 14448 + }, + { + "epoch": 0.8091051629521783, + "grad_norm": 1.3786903619766235, + "learning_rate": 7.2205e-05, + "loss": 0.4561, + "step": 14449 + }, + { + "epoch": 0.8091611602643073, + "grad_norm": 1.3767876625061035, + "learning_rate": 7.221e-05, + "loss": 0.4368, + "step": 14450 + }, + { + "epoch": 0.8092171575764363, + "grad_norm": 1.33573579788208, + "learning_rate": 7.2215e-05, + "loss": 0.3953, + "step": 14451 + }, + { + "epoch": 0.8092731548885653, + "grad_norm": 1.474442720413208, + "learning_rate": 7.222e-05, + "loss": 0.6192, + "step": 14452 + }, + { + "epoch": 0.8093291522006943, + "grad_norm": 1.7762633562088013, + "learning_rate": 7.2225e-05, + "loss": 0.3859, + "step": 14453 + }, + { + "epoch": 0.8093851495128234, + "grad_norm": 1.1925138235092163, + "learning_rate": 7.223e-05, + "loss": 0.4591, + "step": 14454 + }, + { + "epoch": 0.8094411468249524, + "grad_norm": 1.2057689428329468, + "learning_rate": 7.2235e-05, + "loss": 0.4817, + "step": 14455 + }, + { + "epoch": 0.8094971441370814, + "grad_norm": 1.4963434934616089, + "learning_rate": 7.224000000000001e-05, + "loss": 0.4412, + "step": 14456 + }, + { + "epoch": 0.8095531414492104, + "grad_norm": 1.4457064867019653, + "learning_rate": 7.224500000000001e-05, + "loss": 0.3906, + "step": 14457 + }, + { + "epoch": 0.8096091387613394, + "grad_norm": 1.3031740188598633, + "learning_rate": 7.225000000000001e-05, + "loss": 0.3942, + "step": 14458 + }, + { + "epoch": 0.8096651360734685, + "grad_norm": 
1.4185705184936523, + "learning_rate": 7.2255e-05, + "loss": 0.5671, + "step": 14459 + }, + { + "epoch": 0.8097211333855975, + "grad_norm": 1.244755744934082, + "learning_rate": 7.226e-05, + "loss": 0.5442, + "step": 14460 + }, + { + "epoch": 0.8097771306977265, + "grad_norm": 2.351407766342163, + "learning_rate": 7.2265e-05, + "loss": 0.414, + "step": 14461 + }, + { + "epoch": 0.8098331280098555, + "grad_norm": 1.2181895971298218, + "learning_rate": 7.227000000000001e-05, + "loss": 0.4345, + "step": 14462 + }, + { + "epoch": 0.8098891253219845, + "grad_norm": 1.3296558856964111, + "learning_rate": 7.227500000000001e-05, + "loss": 0.4265, + "step": 14463 + }, + { + "epoch": 0.8099451226341136, + "grad_norm": 1.4543119668960571, + "learning_rate": 7.228e-05, + "loss": 0.4575, + "step": 14464 + }, + { + "epoch": 0.8100011199462426, + "grad_norm": 1.1617560386657715, + "learning_rate": 7.2285e-05, + "loss": 0.4275, + "step": 14465 + }, + { + "epoch": 0.8100571172583716, + "grad_norm": 1.3625268936157227, + "learning_rate": 7.229e-05, + "loss": 0.5049, + "step": 14466 + }, + { + "epoch": 0.8101131145705006, + "grad_norm": 1.1242477893829346, + "learning_rate": 7.2295e-05, + "loss": 0.3465, + "step": 14467 + }, + { + "epoch": 0.8101691118826296, + "grad_norm": 1.2985645532608032, + "learning_rate": 7.23e-05, + "loss": 0.5279, + "step": 14468 + }, + { + "epoch": 0.8102251091947587, + "grad_norm": 1.2515453100204468, + "learning_rate": 7.2305e-05, + "loss": 0.4025, + "step": 14469 + }, + { + "epoch": 0.8102811065068877, + "grad_norm": 1.2598460912704468, + "learning_rate": 7.231e-05, + "loss": 0.4909, + "step": 14470 + }, + { + "epoch": 0.8103371038190167, + "grad_norm": 1.250108003616333, + "learning_rate": 7.2315e-05, + "loss": 0.4534, + "step": 14471 + }, + { + "epoch": 0.8103931011311457, + "grad_norm": 1.2872648239135742, + "learning_rate": 7.232e-05, + "loss": 0.3846, + "step": 14472 + }, + { + "epoch": 0.8104490984432747, + "grad_norm": 2.0495991706848145, + 
"learning_rate": 7.2325e-05, + "loss": 0.462, + "step": 14473 + }, + { + "epoch": 0.8105050957554037, + "grad_norm": 1.2422711849212646, + "learning_rate": 7.233000000000001e-05, + "loss": 0.3957, + "step": 14474 + }, + { + "epoch": 0.8105610930675328, + "grad_norm": 1.7333513498306274, + "learning_rate": 7.2335e-05, + "loss": 0.4894, + "step": 14475 + }, + { + "epoch": 0.8106170903796618, + "grad_norm": 1.6531480550765991, + "learning_rate": 7.234e-05, + "loss": 0.5644, + "step": 14476 + }, + { + "epoch": 0.8106730876917908, + "grad_norm": 1.3454701900482178, + "learning_rate": 7.234500000000001e-05, + "loss": 0.4615, + "step": 14477 + }, + { + "epoch": 0.8107290850039198, + "grad_norm": 1.1700000762939453, + "learning_rate": 7.235000000000001e-05, + "loss": 0.303, + "step": 14478 + }, + { + "epoch": 0.8107850823160488, + "grad_norm": 1.2846060991287231, + "learning_rate": 7.235500000000001e-05, + "loss": 0.5067, + "step": 14479 + }, + { + "epoch": 0.8108410796281779, + "grad_norm": 1.861580729484558, + "learning_rate": 7.236e-05, + "loss": 0.4014, + "step": 14480 + }, + { + "epoch": 0.8108970769403069, + "grad_norm": 1.2220993041992188, + "learning_rate": 7.2365e-05, + "loss": 0.4058, + "step": 14481 + }, + { + "epoch": 0.8109530742524359, + "grad_norm": 1.3710349798202515, + "learning_rate": 7.237e-05, + "loss": 0.4855, + "step": 14482 + }, + { + "epoch": 0.8110090715645649, + "grad_norm": 2.1424448490142822, + "learning_rate": 7.2375e-05, + "loss": 0.3967, + "step": 14483 + }, + { + "epoch": 0.8110650688766939, + "grad_norm": 1.3352257013320923, + "learning_rate": 7.238000000000001e-05, + "loss": 0.3667, + "step": 14484 + }, + { + "epoch": 0.811121066188823, + "grad_norm": 1.3583341836929321, + "learning_rate": 7.2385e-05, + "loss": 0.3491, + "step": 14485 + }, + { + "epoch": 0.811177063500952, + "grad_norm": 1.2811883687973022, + "learning_rate": 7.239e-05, + "loss": 0.3825, + "step": 14486 + }, + { + "epoch": 0.811233060813081, + "grad_norm": 
1.3344007730484009, + "learning_rate": 7.2395e-05, + "loss": 0.4904, + "step": 14487 + }, + { + "epoch": 0.81128905812521, + "grad_norm": 1.446425437927246, + "learning_rate": 7.24e-05, + "loss": 0.5377, + "step": 14488 + }, + { + "epoch": 0.811345055437339, + "grad_norm": 1.0479023456573486, + "learning_rate": 7.2405e-05, + "loss": 0.3216, + "step": 14489 + }, + { + "epoch": 0.811401052749468, + "grad_norm": 1.2680141925811768, + "learning_rate": 7.241e-05, + "loss": 0.4654, + "step": 14490 + }, + { + "epoch": 0.8114570500615971, + "grad_norm": 1.3281927108764648, + "learning_rate": 7.2415e-05, + "loss": 0.4584, + "step": 14491 + }, + { + "epoch": 0.8115130473737261, + "grad_norm": 1.3772964477539062, + "learning_rate": 7.242e-05, + "loss": 0.375, + "step": 14492 + }, + { + "epoch": 0.8115690446858551, + "grad_norm": 1.623201847076416, + "learning_rate": 7.2425e-05, + "loss": 0.4131, + "step": 14493 + }, + { + "epoch": 0.8116250419979841, + "grad_norm": 1.2621368169784546, + "learning_rate": 7.243000000000001e-05, + "loss": 0.3265, + "step": 14494 + }, + { + "epoch": 0.8116810393101132, + "grad_norm": 1.3126702308654785, + "learning_rate": 7.243500000000001e-05, + "loss": 0.3985, + "step": 14495 + }, + { + "epoch": 0.8117370366222422, + "grad_norm": 1.4472118616104126, + "learning_rate": 7.244e-05, + "loss": 0.3782, + "step": 14496 + }, + { + "epoch": 0.8117930339343712, + "grad_norm": 1.2617502212524414, + "learning_rate": 7.2445e-05, + "loss": 0.3656, + "step": 14497 + }, + { + "epoch": 0.8118490312465002, + "grad_norm": 1.3361226320266724, + "learning_rate": 7.245000000000001e-05, + "loss": 0.4149, + "step": 14498 + }, + { + "epoch": 0.8119050285586292, + "grad_norm": 1.5286263227462769, + "learning_rate": 7.245500000000001e-05, + "loss": 0.5432, + "step": 14499 + }, + { + "epoch": 0.8119610258707582, + "grad_norm": 1.36464262008667, + "learning_rate": 7.246000000000001e-05, + "loss": 0.3893, + "step": 14500 + }, + { + "epoch": 0.8120170231828873, + 
"grad_norm": 1.3603463172912598, + "learning_rate": 7.2465e-05, + "loss": 0.3698, + "step": 14501 + }, + { + "epoch": 0.8120730204950163, + "grad_norm": 1.6051450967788696, + "learning_rate": 7.247e-05, + "loss": 0.5028, + "step": 14502 + }, + { + "epoch": 0.8121290178071453, + "grad_norm": 1.5369080305099487, + "learning_rate": 7.2475e-05, + "loss": 0.4999, + "step": 14503 + }, + { + "epoch": 0.8121850151192743, + "grad_norm": 1.2656768560409546, + "learning_rate": 7.248e-05, + "loss": 0.3857, + "step": 14504 + }, + { + "epoch": 0.8122410124314033, + "grad_norm": 1.2804858684539795, + "learning_rate": 7.248500000000001e-05, + "loss": 0.3913, + "step": 14505 + }, + { + "epoch": 0.8122970097435324, + "grad_norm": 1.3018038272857666, + "learning_rate": 7.249e-05, + "loss": 0.3833, + "step": 14506 + }, + { + "epoch": 0.8123530070556614, + "grad_norm": 1.3546369075775146, + "learning_rate": 7.2495e-05, + "loss": 0.4575, + "step": 14507 + }, + { + "epoch": 0.8124090043677904, + "grad_norm": 1.5813671350479126, + "learning_rate": 7.25e-05, + "loss": 0.4787, + "step": 14508 + }, + { + "epoch": 0.8124650016799194, + "grad_norm": 1.3506122827529907, + "learning_rate": 7.2505e-05, + "loss": 0.4252, + "step": 14509 + }, + { + "epoch": 0.8125209989920483, + "grad_norm": 1.5459997653961182, + "learning_rate": 7.251e-05, + "loss": 0.479, + "step": 14510 + }, + { + "epoch": 0.8125769963041773, + "grad_norm": 1.3185906410217285, + "learning_rate": 7.2515e-05, + "loss": 0.3693, + "step": 14511 + }, + { + "epoch": 0.8126329936163064, + "grad_norm": 1.6713619232177734, + "learning_rate": 7.252e-05, + "loss": 0.4934, + "step": 14512 + }, + { + "epoch": 0.8126889909284354, + "grad_norm": 1.160532832145691, + "learning_rate": 7.2525e-05, + "loss": 0.4034, + "step": 14513 + }, + { + "epoch": 0.8127449882405644, + "grad_norm": 1.6146104335784912, + "learning_rate": 7.253e-05, + "loss": 0.3794, + "step": 14514 + }, + { + "epoch": 0.8128009855526934, + "grad_norm": 1.3655707836151123, + 
"learning_rate": 7.253500000000001e-05, + "loss": 0.5995, + "step": 14515 + }, + { + "epoch": 0.8128569828648224, + "grad_norm": 1.1747153997421265, + "learning_rate": 7.254000000000001e-05, + "loss": 0.4118, + "step": 14516 + }, + { + "epoch": 0.8129129801769515, + "grad_norm": 1.23712158203125, + "learning_rate": 7.2545e-05, + "loss": 0.2912, + "step": 14517 + }, + { + "epoch": 0.8129689774890805, + "grad_norm": 1.6816219091415405, + "learning_rate": 7.255e-05, + "loss": 0.4653, + "step": 14518 + }, + { + "epoch": 0.8130249748012095, + "grad_norm": 1.5153015851974487, + "learning_rate": 7.255500000000001e-05, + "loss": 0.4508, + "step": 14519 + }, + { + "epoch": 0.8130809721133385, + "grad_norm": 1.2474896907806396, + "learning_rate": 7.256000000000001e-05, + "loss": 0.4264, + "step": 14520 + }, + { + "epoch": 0.8131369694254675, + "grad_norm": 1.249538540840149, + "learning_rate": 7.256500000000001e-05, + "loss": 0.4027, + "step": 14521 + }, + { + "epoch": 0.8131929667375966, + "grad_norm": 1.1467176675796509, + "learning_rate": 7.257e-05, + "loss": 0.3656, + "step": 14522 + }, + { + "epoch": 0.8132489640497256, + "grad_norm": 1.2952277660369873, + "learning_rate": 7.2575e-05, + "loss": 0.3393, + "step": 14523 + }, + { + "epoch": 0.8133049613618546, + "grad_norm": 1.2148288488388062, + "learning_rate": 7.258e-05, + "loss": 0.4045, + "step": 14524 + }, + { + "epoch": 0.8133609586739836, + "grad_norm": 1.3868563175201416, + "learning_rate": 7.2585e-05, + "loss": 0.5312, + "step": 14525 + }, + { + "epoch": 0.8134169559861126, + "grad_norm": 1.3878198862075806, + "learning_rate": 7.259000000000001e-05, + "loss": 0.5242, + "step": 14526 + }, + { + "epoch": 0.8134729532982417, + "grad_norm": 1.3271074295043945, + "learning_rate": 7.2595e-05, + "loss": 0.3862, + "step": 14527 + }, + { + "epoch": 0.8135289506103707, + "grad_norm": 1.4036977291107178, + "learning_rate": 7.26e-05, + "loss": 0.5575, + "step": 14528 + }, + { + "epoch": 0.8135849479224997, + "grad_norm": 
1.1951898336410522, + "learning_rate": 7.2605e-05, + "loss": 0.3614, + "step": 14529 + }, + { + "epoch": 0.8136409452346287, + "grad_norm": 1.2342334985733032, + "learning_rate": 7.261e-05, + "loss": 0.4671, + "step": 14530 + }, + { + "epoch": 0.8136969425467577, + "grad_norm": 1.5239843130111694, + "learning_rate": 7.2615e-05, + "loss": 0.5482, + "step": 14531 + }, + { + "epoch": 0.8137529398588867, + "grad_norm": 1.4657853841781616, + "learning_rate": 7.261999999999999e-05, + "loss": 0.4421, + "step": 14532 + }, + { + "epoch": 0.8138089371710158, + "grad_norm": 1.2028380632400513, + "learning_rate": 7.2625e-05, + "loss": 0.4187, + "step": 14533 + }, + { + "epoch": 0.8138649344831448, + "grad_norm": 1.6086699962615967, + "learning_rate": 7.263e-05, + "loss": 0.5446, + "step": 14534 + }, + { + "epoch": 0.8139209317952738, + "grad_norm": 1.3743810653686523, + "learning_rate": 7.263500000000001e-05, + "loss": 0.4005, + "step": 14535 + }, + { + "epoch": 0.8139769291074028, + "grad_norm": 1.5481847524642944, + "learning_rate": 7.264000000000001e-05, + "loss": 0.5506, + "step": 14536 + }, + { + "epoch": 0.8140329264195318, + "grad_norm": 1.3572007417678833, + "learning_rate": 7.2645e-05, + "loss": 0.3101, + "step": 14537 + }, + { + "epoch": 0.8140889237316609, + "grad_norm": 1.2318710088729858, + "learning_rate": 7.265e-05, + "loss": 0.3469, + "step": 14538 + }, + { + "epoch": 0.8141449210437899, + "grad_norm": 1.2858757972717285, + "learning_rate": 7.2655e-05, + "loss": 0.3661, + "step": 14539 + }, + { + "epoch": 0.8142009183559189, + "grad_norm": 1.3333518505096436, + "learning_rate": 7.266000000000001e-05, + "loss": 0.4861, + "step": 14540 + }, + { + "epoch": 0.8142569156680479, + "grad_norm": 1.3598148822784424, + "learning_rate": 7.266500000000001e-05, + "loss": 0.4595, + "step": 14541 + }, + { + "epoch": 0.8143129129801769, + "grad_norm": 1.4633821249008179, + "learning_rate": 7.267000000000001e-05, + "loss": 0.4354, + "step": 14542 + }, + { + "epoch": 
0.814368910292306, + "grad_norm": 1.4942946434020996, + "learning_rate": 7.2675e-05, + "loss": 0.5181, + "step": 14543 + }, + { + "epoch": 0.814424907604435, + "grad_norm": 1.449614405632019, + "learning_rate": 7.268e-05, + "loss": 0.4668, + "step": 14544 + }, + { + "epoch": 0.814480904916564, + "grad_norm": 1.1244574785232544, + "learning_rate": 7.2685e-05, + "loss": 0.4041, + "step": 14545 + }, + { + "epoch": 0.814536902228693, + "grad_norm": 1.3382452726364136, + "learning_rate": 7.269e-05, + "loss": 0.4061, + "step": 14546 + }, + { + "epoch": 0.814592899540822, + "grad_norm": 1.4895119667053223, + "learning_rate": 7.269500000000001e-05, + "loss": 0.4328, + "step": 14547 + }, + { + "epoch": 0.814648896852951, + "grad_norm": 1.4116780757904053, + "learning_rate": 7.27e-05, + "loss": 0.5754, + "step": 14548 + }, + { + "epoch": 0.8147048941650801, + "grad_norm": 1.4497257471084595, + "learning_rate": 7.2705e-05, + "loss": 0.5019, + "step": 14549 + }, + { + "epoch": 0.8147608914772091, + "grad_norm": 1.269838571548462, + "learning_rate": 7.271e-05, + "loss": 0.4255, + "step": 14550 + }, + { + "epoch": 0.8148168887893381, + "grad_norm": 1.9430867433547974, + "learning_rate": 7.2715e-05, + "loss": 0.5007, + "step": 14551 + }, + { + "epoch": 0.8148728861014671, + "grad_norm": 1.705452561378479, + "learning_rate": 7.272e-05, + "loss": 0.5779, + "step": 14552 + }, + { + "epoch": 0.8149288834135962, + "grad_norm": 1.1430672407150269, + "learning_rate": 7.272499999999999e-05, + "loss": 0.4957, + "step": 14553 + }, + { + "epoch": 0.8149848807257252, + "grad_norm": 1.5716427564620972, + "learning_rate": 7.273e-05, + "loss": 0.5149, + "step": 14554 + }, + { + "epoch": 0.8150408780378542, + "grad_norm": 1.5491863489151, + "learning_rate": 7.273500000000001e-05, + "loss": 0.4984, + "step": 14555 + }, + { + "epoch": 0.8150968753499832, + "grad_norm": 1.7216907739639282, + "learning_rate": 7.274000000000001e-05, + "loss": 0.4223, + "step": 14556 + }, + { + "epoch": 
0.8151528726621122, + "grad_norm": 1.058103322982788, + "learning_rate": 7.274500000000001e-05, + "loss": 0.3283, + "step": 14557 + }, + { + "epoch": 0.8152088699742412, + "grad_norm": 1.297997236251831, + "learning_rate": 7.275e-05, + "loss": 0.4349, + "step": 14558 + }, + { + "epoch": 0.8152648672863703, + "grad_norm": 1.323997139930725, + "learning_rate": 7.2755e-05, + "loss": 0.4357, + "step": 14559 + }, + { + "epoch": 0.8153208645984993, + "grad_norm": 1.1607613563537598, + "learning_rate": 7.276e-05, + "loss": 0.3407, + "step": 14560 + }, + { + "epoch": 0.8153768619106283, + "grad_norm": 2.034667730331421, + "learning_rate": 7.2765e-05, + "loss": 0.5754, + "step": 14561 + }, + { + "epoch": 0.8154328592227573, + "grad_norm": 1.323005199432373, + "learning_rate": 7.277000000000001e-05, + "loss": 0.5629, + "step": 14562 + }, + { + "epoch": 0.8154888565348863, + "grad_norm": 1.2635260820388794, + "learning_rate": 7.277500000000001e-05, + "loss": 0.4229, + "step": 14563 + }, + { + "epoch": 0.8155448538470154, + "grad_norm": 1.1667816638946533, + "learning_rate": 7.278e-05, + "loss": 0.3357, + "step": 14564 + }, + { + "epoch": 0.8156008511591444, + "grad_norm": 1.0956239700317383, + "learning_rate": 7.2785e-05, + "loss": 0.3747, + "step": 14565 + }, + { + "epoch": 0.8156568484712734, + "grad_norm": 3.721742868423462, + "learning_rate": 7.279e-05, + "loss": 0.452, + "step": 14566 + }, + { + "epoch": 0.8157128457834024, + "grad_norm": 1.572638750076294, + "learning_rate": 7.2795e-05, + "loss": 0.5118, + "step": 14567 + }, + { + "epoch": 0.8157688430955314, + "grad_norm": 1.690908432006836, + "learning_rate": 7.280000000000001e-05, + "loss": 0.6523, + "step": 14568 + }, + { + "epoch": 0.8158248404076605, + "grad_norm": 1.463591456413269, + "learning_rate": 7.2805e-05, + "loss": 0.5766, + "step": 14569 + }, + { + "epoch": 0.8158808377197895, + "grad_norm": 1.3467861413955688, + "learning_rate": 7.281e-05, + "loss": 0.4124, + "step": 14570 + }, + { + "epoch": 
0.8159368350319185, + "grad_norm": 1.3160570859909058, + "learning_rate": 7.2815e-05, + "loss": 0.4452, + "step": 14571 + }, + { + "epoch": 0.8159928323440475, + "grad_norm": 1.3122376203536987, + "learning_rate": 7.282e-05, + "loss": 0.4212, + "step": 14572 + }, + { + "epoch": 0.8160488296561765, + "grad_norm": 1.2728044986724854, + "learning_rate": 7.2825e-05, + "loss": 0.4936, + "step": 14573 + }, + { + "epoch": 0.8161048269683056, + "grad_norm": 1.4045751094818115, + "learning_rate": 7.282999999999999e-05, + "loss": 0.4088, + "step": 14574 + }, + { + "epoch": 0.8161608242804346, + "grad_norm": 1.1426284313201904, + "learning_rate": 7.2835e-05, + "loss": 0.3795, + "step": 14575 + }, + { + "epoch": 0.8162168215925636, + "grad_norm": 1.2386523485183716, + "learning_rate": 7.284000000000001e-05, + "loss": 0.4168, + "step": 14576 + }, + { + "epoch": 0.8162728189046926, + "grad_norm": 1.2142409086227417, + "learning_rate": 7.284500000000001e-05, + "loss": 0.398, + "step": 14577 + }, + { + "epoch": 0.8163288162168216, + "grad_norm": 1.250333547592163, + "learning_rate": 7.285000000000001e-05, + "loss": 0.324, + "step": 14578 + }, + { + "epoch": 0.8163848135289506, + "grad_norm": 1.2744383811950684, + "learning_rate": 7.2855e-05, + "loss": 0.4283, + "step": 14579 + }, + { + "epoch": 0.8164408108410797, + "grad_norm": 1.6865350008010864, + "learning_rate": 7.286e-05, + "loss": 0.4068, + "step": 14580 + }, + { + "epoch": 0.8164968081532087, + "grad_norm": 1.2743353843688965, + "learning_rate": 7.2865e-05, + "loss": 0.3827, + "step": 14581 + }, + { + "epoch": 0.8165528054653377, + "grad_norm": 1.450026035308838, + "learning_rate": 7.287e-05, + "loss": 0.5665, + "step": 14582 + }, + { + "epoch": 0.8166088027774667, + "grad_norm": 1.5260260105133057, + "learning_rate": 7.287500000000001e-05, + "loss": 0.3995, + "step": 14583 + }, + { + "epoch": 0.8166648000895957, + "grad_norm": 2.230234146118164, + "learning_rate": 7.288000000000001e-05, + "loss": 0.5908, + "step": 14584 + 
}, + { + "epoch": 0.8167207974017248, + "grad_norm": 1.2641435861587524, + "learning_rate": 7.2885e-05, + "loss": 0.3928, + "step": 14585 + }, + { + "epoch": 0.8167767947138538, + "grad_norm": 1.340026617050171, + "learning_rate": 7.289e-05, + "loss": 0.4319, + "step": 14586 + }, + { + "epoch": 0.8168327920259828, + "grad_norm": 1.2149425745010376, + "learning_rate": 7.2895e-05, + "loss": 0.3706, + "step": 14587 + }, + { + "epoch": 0.8168887893381118, + "grad_norm": 1.1634408235549927, + "learning_rate": 7.29e-05, + "loss": 0.5095, + "step": 14588 + }, + { + "epoch": 0.8169447866502408, + "grad_norm": 1.4276528358459473, + "learning_rate": 7.290500000000001e-05, + "loss": 0.4456, + "step": 14589 + }, + { + "epoch": 0.8170007839623699, + "grad_norm": 1.5272024869918823, + "learning_rate": 7.291e-05, + "loss": 0.4431, + "step": 14590 + }, + { + "epoch": 0.8170567812744989, + "grad_norm": 1.952351450920105, + "learning_rate": 7.2915e-05, + "loss": 0.611, + "step": 14591 + }, + { + "epoch": 0.8171127785866279, + "grad_norm": 1.099005103111267, + "learning_rate": 7.292e-05, + "loss": 0.3382, + "step": 14592 + }, + { + "epoch": 0.8171687758987568, + "grad_norm": 1.6788164377212524, + "learning_rate": 7.2925e-05, + "loss": 0.5231, + "step": 14593 + }, + { + "epoch": 0.8172247732108858, + "grad_norm": 1.1630085706710815, + "learning_rate": 7.293e-05, + "loss": 0.3644, + "step": 14594 + }, + { + "epoch": 0.8172807705230148, + "grad_norm": 1.2604320049285889, + "learning_rate": 7.2935e-05, + "loss": 0.519, + "step": 14595 + }, + { + "epoch": 0.8173367678351439, + "grad_norm": 5.7034831047058105, + "learning_rate": 7.294e-05, + "loss": 0.4025, + "step": 14596 + }, + { + "epoch": 0.8173927651472729, + "grad_norm": 1.554646372795105, + "learning_rate": 7.294500000000001e-05, + "loss": 0.4582, + "step": 14597 + }, + { + "epoch": 0.8174487624594019, + "grad_norm": 1.5256388187408447, + "learning_rate": 7.295000000000001e-05, + "loss": 0.4781, + "step": 14598 + }, + { + "epoch": 
0.8175047597715309, + "grad_norm": 1.668744683265686, + "learning_rate": 7.295500000000001e-05, + "loss": 0.7531, + "step": 14599 + }, + { + "epoch": 0.8175607570836599, + "grad_norm": 1.4327677488327026, + "learning_rate": 7.296e-05, + "loss": 0.7833, + "step": 14600 + }, + { + "epoch": 0.817616754395789, + "grad_norm": 1.1870002746582031, + "learning_rate": 7.2965e-05, + "loss": 0.4161, + "step": 14601 + }, + { + "epoch": 0.817672751707918, + "grad_norm": 1.5035710334777832, + "learning_rate": 7.297e-05, + "loss": 0.3995, + "step": 14602 + }, + { + "epoch": 0.817728749020047, + "grad_norm": 1.594335913658142, + "learning_rate": 7.2975e-05, + "loss": 0.52, + "step": 14603 + }, + { + "epoch": 0.817784746332176, + "grad_norm": 1.2316128015518188, + "learning_rate": 7.298000000000001e-05, + "loss": 0.424, + "step": 14604 + }, + { + "epoch": 0.817840743644305, + "grad_norm": 1.5011049509048462, + "learning_rate": 7.298500000000001e-05, + "loss": 0.4467, + "step": 14605 + }, + { + "epoch": 0.817896740956434, + "grad_norm": 1.3760048151016235, + "learning_rate": 7.299e-05, + "loss": 0.3854, + "step": 14606 + }, + { + "epoch": 0.8179527382685631, + "grad_norm": 1.1981438398361206, + "learning_rate": 7.2995e-05, + "loss": 0.5182, + "step": 14607 + }, + { + "epoch": 0.8180087355806921, + "grad_norm": 1.446954369544983, + "learning_rate": 7.3e-05, + "loss": 0.5173, + "step": 14608 + }, + { + "epoch": 0.8180647328928211, + "grad_norm": 1.686732530593872, + "learning_rate": 7.3005e-05, + "loss": 0.56, + "step": 14609 + }, + { + "epoch": 0.8181207302049501, + "grad_norm": 1.051320195198059, + "learning_rate": 7.301e-05, + "loss": 0.3697, + "step": 14610 + }, + { + "epoch": 0.8181767275170791, + "grad_norm": 1.1930115222930908, + "learning_rate": 7.3015e-05, + "loss": 0.4837, + "step": 14611 + }, + { + "epoch": 0.8182327248292082, + "grad_norm": 1.5659046173095703, + "learning_rate": 7.302e-05, + "loss": 0.4421, + "step": 14612 + }, + { + "epoch": 0.8182887221413372, + 
"grad_norm": 1.5599424839019775, + "learning_rate": 7.3025e-05, + "loss": 0.4769, + "step": 14613 + }, + { + "epoch": 0.8183447194534662, + "grad_norm": 1.2555923461914062, + "learning_rate": 7.303e-05, + "loss": 0.4743, + "step": 14614 + }, + { + "epoch": 0.8184007167655952, + "grad_norm": 1.4767690896987915, + "learning_rate": 7.303500000000001e-05, + "loss": 0.4749, + "step": 14615 + }, + { + "epoch": 0.8184567140777242, + "grad_norm": 1.3402944803237915, + "learning_rate": 7.304e-05, + "loss": 0.5187, + "step": 14616 + }, + { + "epoch": 0.8185127113898533, + "grad_norm": 1.3602509498596191, + "learning_rate": 7.3045e-05, + "loss": 0.4312, + "step": 14617 + }, + { + "epoch": 0.8185687087019823, + "grad_norm": 1.3979710340499878, + "learning_rate": 7.305000000000001e-05, + "loss": 0.4078, + "step": 14618 + }, + { + "epoch": 0.8186247060141113, + "grad_norm": 1.2349612712860107, + "learning_rate": 7.305500000000001e-05, + "loss": 0.3946, + "step": 14619 + }, + { + "epoch": 0.8186807033262403, + "grad_norm": 1.2803958654403687, + "learning_rate": 7.306000000000001e-05, + "loss": 0.3911, + "step": 14620 + }, + { + "epoch": 0.8187367006383693, + "grad_norm": 1.5141361951828003, + "learning_rate": 7.3065e-05, + "loss": 0.4704, + "step": 14621 + }, + { + "epoch": 0.8187926979504984, + "grad_norm": 1.7877403497695923, + "learning_rate": 7.307e-05, + "loss": 0.5888, + "step": 14622 + }, + { + "epoch": 0.8188486952626274, + "grad_norm": 1.3140099048614502, + "learning_rate": 7.3075e-05, + "loss": 0.3834, + "step": 14623 + }, + { + "epoch": 0.8189046925747564, + "grad_norm": 1.3405500650405884, + "learning_rate": 7.308e-05, + "loss": 0.4656, + "step": 14624 + }, + { + "epoch": 0.8189606898868854, + "grad_norm": 1.3182367086410522, + "learning_rate": 7.308500000000001e-05, + "loss": 0.3299, + "step": 14625 + }, + { + "epoch": 0.8190166871990144, + "grad_norm": 1.8528672456741333, + "learning_rate": 7.309e-05, + "loss": 0.5492, + "step": 14626 + }, + { + "epoch": 
0.8190726845111435, + "grad_norm": 1.2025448083877563, + "learning_rate": 7.3095e-05, + "loss": 0.4631, + "step": 14627 + }, + { + "epoch": 0.8191286818232725, + "grad_norm": 1.1996395587921143, + "learning_rate": 7.31e-05, + "loss": 0.4278, + "step": 14628 + }, + { + "epoch": 0.8191846791354015, + "grad_norm": 1.288961410522461, + "learning_rate": 7.3105e-05, + "loss": 0.4867, + "step": 14629 + }, + { + "epoch": 0.8192406764475305, + "grad_norm": 1.4086191654205322, + "learning_rate": 7.311e-05, + "loss": 0.463, + "step": 14630 + }, + { + "epoch": 0.8192966737596595, + "grad_norm": 1.3368200063705444, + "learning_rate": 7.3115e-05, + "loss": 0.4777, + "step": 14631 + }, + { + "epoch": 0.8193526710717886, + "grad_norm": 0.9781476855278015, + "learning_rate": 7.312e-05, + "loss": 0.3705, + "step": 14632 + }, + { + "epoch": 0.8194086683839176, + "grad_norm": 1.185136318206787, + "learning_rate": 7.3125e-05, + "loss": 0.5119, + "step": 14633 + }, + { + "epoch": 0.8194646656960466, + "grad_norm": 1.1368544101715088, + "learning_rate": 7.313e-05, + "loss": 0.4276, + "step": 14634 + }, + { + "epoch": 0.8195206630081756, + "grad_norm": 1.0748132467269897, + "learning_rate": 7.3135e-05, + "loss": 0.3838, + "step": 14635 + }, + { + "epoch": 0.8195766603203046, + "grad_norm": 1.304819941520691, + "learning_rate": 7.314000000000001e-05, + "loss": 0.4756, + "step": 14636 + }, + { + "epoch": 0.8196326576324336, + "grad_norm": 1.168513298034668, + "learning_rate": 7.3145e-05, + "loss": 0.3447, + "step": 14637 + }, + { + "epoch": 0.8196886549445627, + "grad_norm": 1.4905389547348022, + "learning_rate": 7.315e-05, + "loss": 0.5992, + "step": 14638 + }, + { + "epoch": 0.8197446522566917, + "grad_norm": 1.1134607791900635, + "learning_rate": 7.315500000000001e-05, + "loss": 0.4236, + "step": 14639 + }, + { + "epoch": 0.8198006495688207, + "grad_norm": 1.1270670890808105, + "learning_rate": 7.316000000000001e-05, + "loss": 0.354, + "step": 14640 + }, + { + "epoch": 
0.8198566468809497, + "grad_norm": 1.2572729587554932, + "learning_rate": 7.316500000000001e-05, + "loss": 0.3315, + "step": 14641 + }, + { + "epoch": 0.8199126441930787, + "grad_norm": 1.289846420288086, + "learning_rate": 7.317e-05, + "loss": 0.3378, + "step": 14642 + }, + { + "epoch": 0.8199686415052078, + "grad_norm": 1.543160080909729, + "learning_rate": 7.3175e-05, + "loss": 0.5779, + "step": 14643 + }, + { + "epoch": 0.8200246388173368, + "grad_norm": 1.4106050729751587, + "learning_rate": 7.318e-05, + "loss": 0.4014, + "step": 14644 + }, + { + "epoch": 0.8200806361294658, + "grad_norm": 1.1911691427230835, + "learning_rate": 7.3185e-05, + "loss": 0.4199, + "step": 14645 + }, + { + "epoch": 0.8201366334415948, + "grad_norm": 1.0628832578659058, + "learning_rate": 7.319000000000001e-05, + "loss": 0.4342, + "step": 14646 + }, + { + "epoch": 0.8201926307537238, + "grad_norm": 1.2558344602584839, + "learning_rate": 7.3195e-05, + "loss": 0.4735, + "step": 14647 + }, + { + "epoch": 0.8202486280658529, + "grad_norm": 1.5138523578643799, + "learning_rate": 7.32e-05, + "loss": 0.6924, + "step": 14648 + }, + { + "epoch": 0.8203046253779819, + "grad_norm": 1.438155174255371, + "learning_rate": 7.3205e-05, + "loss": 0.4288, + "step": 14649 + }, + { + "epoch": 0.8203606226901109, + "grad_norm": 1.302852749824524, + "learning_rate": 7.321e-05, + "loss": 0.4153, + "step": 14650 + }, + { + "epoch": 0.8204166200022399, + "grad_norm": 1.169129729270935, + "learning_rate": 7.3215e-05, + "loss": 0.3858, + "step": 14651 + }, + { + "epoch": 0.8204726173143689, + "grad_norm": 1.8877336978912354, + "learning_rate": 7.322e-05, + "loss": 0.5608, + "step": 14652 + }, + { + "epoch": 0.820528614626498, + "grad_norm": 1.320983648300171, + "learning_rate": 7.3225e-05, + "loss": 0.4959, + "step": 14653 + }, + { + "epoch": 0.820584611938627, + "grad_norm": 1.4541881084442139, + "learning_rate": 7.323e-05, + "loss": 0.5543, + "step": 14654 + }, + { + "epoch": 0.820640609250756, + 
"grad_norm": 1.3759751319885254, + "learning_rate": 7.3235e-05, + "loss": 0.4852, + "step": 14655 + }, + { + "epoch": 0.820696606562885, + "grad_norm": 1.4528175592422485, + "learning_rate": 7.324000000000001e-05, + "loss": 0.478, + "step": 14656 + }, + { + "epoch": 0.820752603875014, + "grad_norm": 1.3313637971878052, + "learning_rate": 7.324500000000001e-05, + "loss": 0.4429, + "step": 14657 + }, + { + "epoch": 0.820808601187143, + "grad_norm": 1.0332139730453491, + "learning_rate": 7.325e-05, + "loss": 0.2853, + "step": 14658 + }, + { + "epoch": 0.8208645984992721, + "grad_norm": 1.4055781364440918, + "learning_rate": 7.3255e-05, + "loss": 0.4461, + "step": 14659 + }, + { + "epoch": 0.8209205958114011, + "grad_norm": 1.1871622800827026, + "learning_rate": 7.326e-05, + "loss": 0.5183, + "step": 14660 + }, + { + "epoch": 0.8209765931235301, + "grad_norm": 1.436928391456604, + "learning_rate": 7.326500000000001e-05, + "loss": 0.4143, + "step": 14661 + }, + { + "epoch": 0.8210325904356591, + "grad_norm": 1.2425044775009155, + "learning_rate": 7.327000000000001e-05, + "loss": 0.4609, + "step": 14662 + }, + { + "epoch": 0.8210885877477881, + "grad_norm": 1.4125325679779053, + "learning_rate": 7.3275e-05, + "loss": 0.4595, + "step": 14663 + }, + { + "epoch": 0.8211445850599172, + "grad_norm": 1.0556050539016724, + "learning_rate": 7.328e-05, + "loss": 0.3873, + "step": 14664 + }, + { + "epoch": 0.8212005823720462, + "grad_norm": 1.2359877824783325, + "learning_rate": 7.3285e-05, + "loss": 0.3886, + "step": 14665 + }, + { + "epoch": 0.8212565796841752, + "grad_norm": 1.4855868816375732, + "learning_rate": 7.329e-05, + "loss": 0.5341, + "step": 14666 + }, + { + "epoch": 0.8213125769963042, + "grad_norm": 3.715852975845337, + "learning_rate": 7.329500000000001e-05, + "loss": 0.4834, + "step": 14667 + }, + { + "epoch": 0.8213685743084332, + "grad_norm": 1.4016079902648926, + "learning_rate": 7.33e-05, + "loss": 0.502, + "step": 14668 + }, + { + "epoch": 0.8214245716205623, 
+ "grad_norm": 1.375775694847107, + "learning_rate": 7.3305e-05, + "loss": 0.3553, + "step": 14669 + }, + { + "epoch": 0.8214805689326913, + "grad_norm": 1.2052714824676514, + "learning_rate": 7.331e-05, + "loss": 0.3535, + "step": 14670 + }, + { + "epoch": 0.8215365662448203, + "grad_norm": 1.445199966430664, + "learning_rate": 7.3315e-05, + "loss": 0.5214, + "step": 14671 + }, + { + "epoch": 0.8215925635569493, + "grad_norm": 1.3662270307540894, + "learning_rate": 7.332e-05, + "loss": 0.4388, + "step": 14672 + }, + { + "epoch": 0.8216485608690783, + "grad_norm": 1.2618768215179443, + "learning_rate": 7.3325e-05, + "loss": 0.6232, + "step": 14673 + }, + { + "epoch": 0.8217045581812074, + "grad_norm": 1.0770246982574463, + "learning_rate": 7.333e-05, + "loss": 0.3527, + "step": 14674 + }, + { + "epoch": 0.8217605554933363, + "grad_norm": 1.4090774059295654, + "learning_rate": 7.3335e-05, + "loss": 0.4901, + "step": 14675 + }, + { + "epoch": 0.8218165528054653, + "grad_norm": 1.4112480878829956, + "learning_rate": 7.334000000000001e-05, + "loss": 0.4592, + "step": 14676 + }, + { + "epoch": 0.8218725501175943, + "grad_norm": 1.2244633436203003, + "learning_rate": 7.334500000000001e-05, + "loss": 0.3995, + "step": 14677 + }, + { + "epoch": 0.8219285474297233, + "grad_norm": 1.3836400508880615, + "learning_rate": 7.335000000000001e-05, + "loss": 0.4217, + "step": 14678 + }, + { + "epoch": 0.8219845447418523, + "grad_norm": 1.394639015197754, + "learning_rate": 7.3355e-05, + "loss": 0.3731, + "step": 14679 + }, + { + "epoch": 0.8220405420539814, + "grad_norm": 1.2811203002929688, + "learning_rate": 7.336e-05, + "loss": 0.4087, + "step": 14680 + }, + { + "epoch": 0.8220965393661104, + "grad_norm": 1.214834213256836, + "learning_rate": 7.3365e-05, + "loss": 0.467, + "step": 14681 + }, + { + "epoch": 0.8221525366782394, + "grad_norm": 1.3365193605422974, + "learning_rate": 7.337000000000001e-05, + "loss": 0.6025, + "step": 14682 + }, + { + "epoch": 0.8222085339903684, + 
"grad_norm": 1.3410736322402954, + "learning_rate": 7.337500000000001e-05, + "loss": 0.4645, + "step": 14683 + }, + { + "epoch": 0.8222645313024974, + "grad_norm": 1.3538296222686768, + "learning_rate": 7.338e-05, + "loss": 0.4959, + "step": 14684 + }, + { + "epoch": 0.8223205286146265, + "grad_norm": 1.3033719062805176, + "learning_rate": 7.3385e-05, + "loss": 0.4946, + "step": 14685 + }, + { + "epoch": 0.8223765259267555, + "grad_norm": 1.2510207891464233, + "learning_rate": 7.339e-05, + "loss": 0.5141, + "step": 14686 + }, + { + "epoch": 0.8224325232388845, + "grad_norm": 1.329649567604065, + "learning_rate": 7.3395e-05, + "loss": 0.4308, + "step": 14687 + }, + { + "epoch": 0.8224885205510135, + "grad_norm": 1.3454447984695435, + "learning_rate": 7.340000000000001e-05, + "loss": 0.4015, + "step": 14688 + }, + { + "epoch": 0.8225445178631425, + "grad_norm": 1.414605975151062, + "learning_rate": 7.3405e-05, + "loss": 0.4734, + "step": 14689 + }, + { + "epoch": 0.8226005151752716, + "grad_norm": 1.2809250354766846, + "learning_rate": 7.341e-05, + "loss": 0.2809, + "step": 14690 + }, + { + "epoch": 0.8226565124874006, + "grad_norm": 1.1174598932266235, + "learning_rate": 7.3415e-05, + "loss": 0.402, + "step": 14691 + }, + { + "epoch": 0.8227125097995296, + "grad_norm": 1.1026337146759033, + "learning_rate": 7.342e-05, + "loss": 0.5006, + "step": 14692 + }, + { + "epoch": 0.8227685071116586, + "grad_norm": 1.4404120445251465, + "learning_rate": 7.3425e-05, + "loss": 0.5915, + "step": 14693 + }, + { + "epoch": 0.8228245044237876, + "grad_norm": 1.2131141424179077, + "learning_rate": 7.342999999999999e-05, + "loss": 0.4807, + "step": 14694 + }, + { + "epoch": 0.8228805017359166, + "grad_norm": 2.460108757019043, + "learning_rate": 7.3435e-05, + "loss": 0.4563, + "step": 14695 + }, + { + "epoch": 0.8229364990480457, + "grad_norm": 1.672230839729309, + "learning_rate": 7.344000000000002e-05, + "loss": 0.5001, + "step": 14696 + }, + { + "epoch": 0.8229924963601747, + 
"grad_norm": 1.2148809432983398, + "learning_rate": 7.344500000000001e-05, + "loss": 0.4161, + "step": 14697 + }, + { + "epoch": 0.8230484936723037, + "grad_norm": 1.3984878063201904, + "learning_rate": 7.345000000000001e-05, + "loss": 0.4249, + "step": 14698 + }, + { + "epoch": 0.8231044909844327, + "grad_norm": 1.5426331758499146, + "learning_rate": 7.345500000000001e-05, + "loss": 0.6712, + "step": 14699 + }, + { + "epoch": 0.8231604882965617, + "grad_norm": 1.246315836906433, + "learning_rate": 7.346e-05, + "loss": 0.5127, + "step": 14700 + }, + { + "epoch": 0.8232164856086908, + "grad_norm": 1.3528317213058472, + "learning_rate": 7.3465e-05, + "loss": 0.4185, + "step": 14701 + }, + { + "epoch": 0.8232724829208198, + "grad_norm": 2.795318841934204, + "learning_rate": 7.347e-05, + "loss": 0.3799, + "step": 14702 + }, + { + "epoch": 0.8233284802329488, + "grad_norm": 1.1334058046340942, + "learning_rate": 7.347500000000001e-05, + "loss": 0.4266, + "step": 14703 + }, + { + "epoch": 0.8233844775450778, + "grad_norm": 1.4261080026626587, + "learning_rate": 7.348000000000001e-05, + "loss": 0.4662, + "step": 14704 + }, + { + "epoch": 0.8234404748572068, + "grad_norm": 1.3351155519485474, + "learning_rate": 7.3485e-05, + "loss": 0.4606, + "step": 14705 + }, + { + "epoch": 0.8234964721693359, + "grad_norm": 1.4264384508132935, + "learning_rate": 7.349e-05, + "loss": 0.4275, + "step": 14706 + }, + { + "epoch": 0.8235524694814649, + "grad_norm": 1.2684037685394287, + "learning_rate": 7.3495e-05, + "loss": 0.4041, + "step": 14707 + }, + { + "epoch": 0.8236084667935939, + "grad_norm": 1.1517019271850586, + "learning_rate": 7.35e-05, + "loss": 0.3418, + "step": 14708 + }, + { + "epoch": 0.8236644641057229, + "grad_norm": 1.2840877771377563, + "learning_rate": 7.3505e-05, + "loss": 0.4765, + "step": 14709 + }, + { + "epoch": 0.8237204614178519, + "grad_norm": 1.1933214664459229, + "learning_rate": 7.351e-05, + "loss": 0.4337, + "step": 14710 + }, + { + "epoch": 
0.823776458729981, + "grad_norm": 1.3279502391815186, + "learning_rate": 7.3515e-05, + "loss": 0.3734, + "step": 14711 + }, + { + "epoch": 0.82383245604211, + "grad_norm": 1.504241943359375, + "learning_rate": 7.352e-05, + "loss": 0.3383, + "step": 14712 + }, + { + "epoch": 0.823888453354239, + "grad_norm": 1.501292109489441, + "learning_rate": 7.3525e-05, + "loss": 0.4715, + "step": 14713 + }, + { + "epoch": 0.823944450666368, + "grad_norm": 1.2501533031463623, + "learning_rate": 7.353e-05, + "loss": 0.3866, + "step": 14714 + }, + { + "epoch": 0.824000447978497, + "grad_norm": 1.207091212272644, + "learning_rate": 7.353499999999999e-05, + "loss": 0.4442, + "step": 14715 + }, + { + "epoch": 0.824056445290626, + "grad_norm": 1.4631602764129639, + "learning_rate": 7.354e-05, + "loss": 0.445, + "step": 14716 + }, + { + "epoch": 0.8241124426027551, + "grad_norm": 1.2972570657730103, + "learning_rate": 7.354500000000001e-05, + "loss": 0.4775, + "step": 14717 + }, + { + "epoch": 0.8241684399148841, + "grad_norm": 1.363891363143921, + "learning_rate": 7.355000000000001e-05, + "loss": 0.3694, + "step": 14718 + }, + { + "epoch": 0.8242244372270131, + "grad_norm": 1.2534159421920776, + "learning_rate": 7.355500000000001e-05, + "loss": 0.3984, + "step": 14719 + }, + { + "epoch": 0.8242804345391421, + "grad_norm": 2.2425355911254883, + "learning_rate": 7.356000000000001e-05, + "loss": 0.4672, + "step": 14720 + }, + { + "epoch": 0.8243364318512711, + "grad_norm": 1.1252158880233765, + "learning_rate": 7.3565e-05, + "loss": 0.3602, + "step": 14721 + }, + { + "epoch": 0.8243924291634002, + "grad_norm": 1.5778850317001343, + "learning_rate": 7.357e-05, + "loss": 0.5639, + "step": 14722 + }, + { + "epoch": 0.8244484264755292, + "grad_norm": 1.4036636352539062, + "learning_rate": 7.3575e-05, + "loss": 0.5913, + "step": 14723 + }, + { + "epoch": 0.8245044237876582, + "grad_norm": 1.3537404537200928, + "learning_rate": 7.358000000000001e-05, + "loss": 0.4174, + "step": 14724 + }, + { 
+ "epoch": 0.8245604210997872, + "grad_norm": 1.240222692489624, + "learning_rate": 7.358500000000001e-05, + "loss": 0.5879, + "step": 14725 + }, + { + "epoch": 0.8246164184119162, + "grad_norm": 1.2327338457107544, + "learning_rate": 7.359e-05, + "loss": 0.4506, + "step": 14726 + }, + { + "epoch": 0.8246724157240453, + "grad_norm": 1.5453565120697021, + "learning_rate": 7.3595e-05, + "loss": 0.4951, + "step": 14727 + }, + { + "epoch": 0.8247284130361743, + "grad_norm": 1.184607744216919, + "learning_rate": 7.36e-05, + "loss": 0.3828, + "step": 14728 + }, + { + "epoch": 0.8247844103483033, + "grad_norm": 1.2312238216400146, + "learning_rate": 7.3605e-05, + "loss": 0.4146, + "step": 14729 + }, + { + "epoch": 0.8248404076604323, + "grad_norm": 1.4124467372894287, + "learning_rate": 7.361e-05, + "loss": 0.5357, + "step": 14730 + }, + { + "epoch": 0.8248964049725613, + "grad_norm": 1.378571629524231, + "learning_rate": 7.3615e-05, + "loss": 0.4415, + "step": 14731 + }, + { + "epoch": 0.8249524022846904, + "grad_norm": 1.1218804121017456, + "learning_rate": 7.362e-05, + "loss": 0.3804, + "step": 14732 + }, + { + "epoch": 0.8250083995968194, + "grad_norm": 1.1442683935165405, + "learning_rate": 7.3625e-05, + "loss": 0.3825, + "step": 14733 + }, + { + "epoch": 0.8250643969089484, + "grad_norm": 1.2137649059295654, + "learning_rate": 7.363e-05, + "loss": 0.3702, + "step": 14734 + }, + { + "epoch": 0.8251203942210774, + "grad_norm": 1.3204543590545654, + "learning_rate": 7.3635e-05, + "loss": 0.4778, + "step": 14735 + }, + { + "epoch": 0.8251763915332064, + "grad_norm": 1.361220359802246, + "learning_rate": 7.364e-05, + "loss": 0.3699, + "step": 14736 + }, + { + "epoch": 0.8252323888453355, + "grad_norm": 1.2736396789550781, + "learning_rate": 7.3645e-05, + "loss": 0.4897, + "step": 14737 + }, + { + "epoch": 0.8252883861574645, + "grad_norm": 1.2590088844299316, + "learning_rate": 7.365e-05, + "loss": 0.4012, + "step": 14738 + }, + { + "epoch": 0.8253443834695935, + 
"grad_norm": 1.4426013231277466, + "learning_rate": 7.365500000000001e-05, + "loss": 0.5248, + "step": 14739 + }, + { + "epoch": 0.8254003807817225, + "grad_norm": 1.2324867248535156, + "learning_rate": 7.366000000000001e-05, + "loss": 0.3447, + "step": 14740 + }, + { + "epoch": 0.8254563780938515, + "grad_norm": 1.4885910749435425, + "learning_rate": 7.366500000000001e-05, + "loss": 0.3961, + "step": 14741 + }, + { + "epoch": 0.8255123754059805, + "grad_norm": 1.3118256330490112, + "learning_rate": 7.367e-05, + "loss": 0.428, + "step": 14742 + }, + { + "epoch": 0.8255683727181096, + "grad_norm": 1.4559918642044067, + "learning_rate": 7.3675e-05, + "loss": 0.5029, + "step": 14743 + }, + { + "epoch": 0.8256243700302386, + "grad_norm": 1.3392595052719116, + "learning_rate": 7.368e-05, + "loss": 0.4989, + "step": 14744 + }, + { + "epoch": 0.8256803673423676, + "grad_norm": 1.6044344902038574, + "learning_rate": 7.368500000000001e-05, + "loss": 0.5751, + "step": 14745 + }, + { + "epoch": 0.8257363646544966, + "grad_norm": 1.5788519382476807, + "learning_rate": 7.369000000000001e-05, + "loss": 0.4325, + "step": 14746 + }, + { + "epoch": 0.8257923619666256, + "grad_norm": 1.4509316682815552, + "learning_rate": 7.3695e-05, + "loss": 0.5399, + "step": 14747 + }, + { + "epoch": 0.8258483592787547, + "grad_norm": 1.5302283763885498, + "learning_rate": 7.37e-05, + "loss": 0.4131, + "step": 14748 + }, + { + "epoch": 0.8259043565908837, + "grad_norm": 1.2519524097442627, + "learning_rate": 7.3705e-05, + "loss": 0.523, + "step": 14749 + }, + { + "epoch": 0.8259603539030127, + "grad_norm": 0.9893085360527039, + "learning_rate": 7.371e-05, + "loss": 0.3256, + "step": 14750 + }, + { + "epoch": 0.8260163512151417, + "grad_norm": 1.1434038877487183, + "learning_rate": 7.3715e-05, + "loss": 0.318, + "step": 14751 + }, + { + "epoch": 0.8260723485272707, + "grad_norm": 1.723732352256775, + "learning_rate": 7.372e-05, + "loss": 0.5284, + "step": 14752 + }, + { + "epoch": 
0.8261283458393998, + "grad_norm": 1.2246155738830566, + "learning_rate": 7.3725e-05, + "loss": 0.4872, + "step": 14753 + }, + { + "epoch": 0.8261843431515288, + "grad_norm": 1.3805774450302124, + "learning_rate": 7.373e-05, + "loss": 0.4494, + "step": 14754 + }, + { + "epoch": 0.8262403404636578, + "grad_norm": 1.4114630222320557, + "learning_rate": 7.3735e-05, + "loss": 0.484, + "step": 14755 + }, + { + "epoch": 0.8262963377757868, + "grad_norm": 1.6827677488327026, + "learning_rate": 7.374000000000001e-05, + "loss": 0.5552, + "step": 14756 + }, + { + "epoch": 0.8263523350879158, + "grad_norm": 1.3255912065505981, + "learning_rate": 7.3745e-05, + "loss": 0.4747, + "step": 14757 + }, + { + "epoch": 0.8264083324000447, + "grad_norm": 1.1039899587631226, + "learning_rate": 7.375e-05, + "loss": 0.4505, + "step": 14758 + }, + { + "epoch": 0.8264643297121738, + "grad_norm": 1.1913259029388428, + "learning_rate": 7.3755e-05, + "loss": 0.4721, + "step": 14759 + }, + { + "epoch": 0.8265203270243028, + "grad_norm": 1.2081841230392456, + "learning_rate": 7.376000000000001e-05, + "loss": 0.3813, + "step": 14760 + }, + { + "epoch": 0.8265763243364318, + "grad_norm": 1.3080998659133911, + "learning_rate": 7.376500000000001e-05, + "loss": 0.4688, + "step": 14761 + }, + { + "epoch": 0.8266323216485608, + "grad_norm": 1.1383557319641113, + "learning_rate": 7.377000000000001e-05, + "loss": 0.3989, + "step": 14762 + }, + { + "epoch": 0.8266883189606898, + "grad_norm": 1.406874179840088, + "learning_rate": 7.3775e-05, + "loss": 0.5117, + "step": 14763 + }, + { + "epoch": 0.8267443162728189, + "grad_norm": 1.470577597618103, + "learning_rate": 7.378e-05, + "loss": 0.4923, + "step": 14764 + }, + { + "epoch": 0.8268003135849479, + "grad_norm": 1.519362449645996, + "learning_rate": 7.3785e-05, + "loss": 0.4208, + "step": 14765 + }, + { + "epoch": 0.8268563108970769, + "grad_norm": 1.3467808961868286, + "learning_rate": 7.379000000000001e-05, + "loss": 0.4447, + "step": 14766 + }, + { + 
"epoch": 0.8269123082092059, + "grad_norm": 1.2084959745407104, + "learning_rate": 7.379500000000001e-05, + "loss": 0.4171, + "step": 14767 + }, + { + "epoch": 0.8269683055213349, + "grad_norm": 1.4759771823883057, + "learning_rate": 7.38e-05, + "loss": 0.3606, + "step": 14768 + }, + { + "epoch": 0.827024302833464, + "grad_norm": 1.2212929725646973, + "learning_rate": 7.3805e-05, + "loss": 0.4007, + "step": 14769 + }, + { + "epoch": 0.827080300145593, + "grad_norm": 1.5924476385116577, + "learning_rate": 7.381e-05, + "loss": 0.3969, + "step": 14770 + }, + { + "epoch": 0.827136297457722, + "grad_norm": 1.227035641670227, + "learning_rate": 7.3815e-05, + "loss": 0.5113, + "step": 14771 + }, + { + "epoch": 0.827192294769851, + "grad_norm": 1.3617016077041626, + "learning_rate": 7.382e-05, + "loss": 0.4139, + "step": 14772 + }, + { + "epoch": 0.82724829208198, + "grad_norm": 1.984702706336975, + "learning_rate": 7.3825e-05, + "loss": 0.5963, + "step": 14773 + }, + { + "epoch": 0.827304289394109, + "grad_norm": 1.2086858749389648, + "learning_rate": 7.383e-05, + "loss": 0.4192, + "step": 14774 + }, + { + "epoch": 0.8273602867062381, + "grad_norm": 1.24480140209198, + "learning_rate": 7.3835e-05, + "loss": 0.4102, + "step": 14775 + }, + { + "epoch": 0.8274162840183671, + "grad_norm": 1.2117775678634644, + "learning_rate": 7.384e-05, + "loss": 0.5034, + "step": 14776 + }, + { + "epoch": 0.8274722813304961, + "grad_norm": 1.4940330982208252, + "learning_rate": 7.384500000000001e-05, + "loss": 0.4258, + "step": 14777 + }, + { + "epoch": 0.8275282786426251, + "grad_norm": 1.0816054344177246, + "learning_rate": 7.385e-05, + "loss": 0.3952, + "step": 14778 + }, + { + "epoch": 0.8275842759547541, + "grad_norm": 1.1663109064102173, + "learning_rate": 7.3855e-05, + "loss": 0.3236, + "step": 14779 + }, + { + "epoch": 0.8276402732668832, + "grad_norm": 1.3746048212051392, + "learning_rate": 7.386e-05, + "loss": 0.3799, + "step": 14780 + }, + { + "epoch": 0.8276962705790122, + 
"grad_norm": 1.4794203042984009, + "learning_rate": 7.386500000000001e-05, + "loss": 0.4392, + "step": 14781 + }, + { + "epoch": 0.8277522678911412, + "grad_norm": 1.4070956707000732, + "learning_rate": 7.387000000000001e-05, + "loss": 0.4524, + "step": 14782 + }, + { + "epoch": 0.8278082652032702, + "grad_norm": 1.380784511566162, + "learning_rate": 7.3875e-05, + "loss": 0.3083, + "step": 14783 + }, + { + "epoch": 0.8278642625153992, + "grad_norm": 1.4081639051437378, + "learning_rate": 7.388e-05, + "loss": 0.5004, + "step": 14784 + }, + { + "epoch": 0.8279202598275283, + "grad_norm": 1.2172077894210815, + "learning_rate": 7.3885e-05, + "loss": 0.4313, + "step": 14785 + }, + { + "epoch": 0.8279762571396573, + "grad_norm": 1.530921459197998, + "learning_rate": 7.389e-05, + "loss": 0.6821, + "step": 14786 + }, + { + "epoch": 0.8280322544517863, + "grad_norm": 1.1566896438598633, + "learning_rate": 7.3895e-05, + "loss": 0.4739, + "step": 14787 + }, + { + "epoch": 0.8280882517639153, + "grad_norm": 1.351818561553955, + "learning_rate": 7.390000000000001e-05, + "loss": 0.4297, + "step": 14788 + }, + { + "epoch": 0.8281442490760443, + "grad_norm": 1.5123695135116577, + "learning_rate": 7.3905e-05, + "loss": 0.6097, + "step": 14789 + }, + { + "epoch": 0.8282002463881734, + "grad_norm": 1.398011565208435, + "learning_rate": 7.391e-05, + "loss": 0.4429, + "step": 14790 + }, + { + "epoch": 0.8282562437003024, + "grad_norm": 1.2961584329605103, + "learning_rate": 7.3915e-05, + "loss": 0.4447, + "step": 14791 + }, + { + "epoch": 0.8283122410124314, + "grad_norm": 1.4222519397735596, + "learning_rate": 7.392e-05, + "loss": 0.5819, + "step": 14792 + }, + { + "epoch": 0.8283682383245604, + "grad_norm": 1.1378669738769531, + "learning_rate": 7.3925e-05, + "loss": 0.3585, + "step": 14793 + }, + { + "epoch": 0.8284242356366894, + "grad_norm": 1.3640563488006592, + "learning_rate": 7.393e-05, + "loss": 0.3788, + "step": 14794 + }, + { + "epoch": 0.8284802329488185, + "grad_norm": 
1.566734790802002, + "learning_rate": 7.3935e-05, + "loss": 0.3884, + "step": 14795 + }, + { + "epoch": 0.8285362302609475, + "grad_norm": 1.1821701526641846, + "learning_rate": 7.394e-05, + "loss": 0.4079, + "step": 14796 + }, + { + "epoch": 0.8285922275730765, + "grad_norm": 1.4555513858795166, + "learning_rate": 7.394500000000001e-05, + "loss": 0.4253, + "step": 14797 + }, + { + "epoch": 0.8286482248852055, + "grad_norm": 1.151355266571045, + "learning_rate": 7.395000000000001e-05, + "loss": 0.5326, + "step": 14798 + }, + { + "epoch": 0.8287042221973345, + "grad_norm": 1.3573161363601685, + "learning_rate": 7.3955e-05, + "loss": 0.4826, + "step": 14799 + }, + { + "epoch": 0.8287602195094635, + "grad_norm": 1.3706074953079224, + "learning_rate": 7.396e-05, + "loss": 0.4708, + "step": 14800 + }, + { + "epoch": 0.8288162168215926, + "grad_norm": 1.7695225477218628, + "learning_rate": 7.3965e-05, + "loss": 0.5706, + "step": 14801 + }, + { + "epoch": 0.8288722141337216, + "grad_norm": 1.3132703304290771, + "learning_rate": 7.397000000000001e-05, + "loss": 0.5201, + "step": 14802 + }, + { + "epoch": 0.8289282114458506, + "grad_norm": 1.317012071609497, + "learning_rate": 7.397500000000001e-05, + "loss": 0.3876, + "step": 14803 + }, + { + "epoch": 0.8289842087579796, + "grad_norm": 1.2215791940689087, + "learning_rate": 7.398e-05, + "loss": 0.4146, + "step": 14804 + }, + { + "epoch": 0.8290402060701086, + "grad_norm": 1.2587159872055054, + "learning_rate": 7.3985e-05, + "loss": 0.5045, + "step": 14805 + }, + { + "epoch": 0.8290962033822377, + "grad_norm": 16.493318557739258, + "learning_rate": 7.399e-05, + "loss": 0.3607, + "step": 14806 + }, + { + "epoch": 0.8291522006943667, + "grad_norm": 1.1195287704467773, + "learning_rate": 7.3995e-05, + "loss": 0.3974, + "step": 14807 + }, + { + "epoch": 0.8292081980064957, + "grad_norm": 1.345656394958496, + "learning_rate": 7.4e-05, + "loss": 0.4637, + "step": 14808 + }, + { + "epoch": 0.8292641953186247, + "grad_norm": 
1.370466947555542, + "learning_rate": 7.400500000000001e-05, + "loss": 0.4364, + "step": 14809 + }, + { + "epoch": 0.8293201926307537, + "grad_norm": 1.2453478574752808, + "learning_rate": 7.401e-05, + "loss": 0.3906, + "step": 14810 + }, + { + "epoch": 0.8293761899428828, + "grad_norm": 1.512919306755066, + "learning_rate": 7.4015e-05, + "loss": 0.4683, + "step": 14811 + }, + { + "epoch": 0.8294321872550118, + "grad_norm": 1.2530479431152344, + "learning_rate": 7.402e-05, + "loss": 0.5021, + "step": 14812 + }, + { + "epoch": 0.8294881845671408, + "grad_norm": 1.4404563903808594, + "learning_rate": 7.4025e-05, + "loss": 0.4307, + "step": 14813 + }, + { + "epoch": 0.8295441818792698, + "grad_norm": 1.1528865098953247, + "learning_rate": 7.403e-05, + "loss": 0.4487, + "step": 14814 + }, + { + "epoch": 0.8296001791913988, + "grad_norm": 1.3025373220443726, + "learning_rate": 7.4035e-05, + "loss": 0.4264, + "step": 14815 + }, + { + "epoch": 0.8296561765035279, + "grad_norm": 1.4294639825820923, + "learning_rate": 7.404e-05, + "loss": 0.4792, + "step": 14816 + }, + { + "epoch": 0.8297121738156569, + "grad_norm": 1.3184431791305542, + "learning_rate": 7.404500000000001e-05, + "loss": 0.4518, + "step": 14817 + }, + { + "epoch": 0.8297681711277859, + "grad_norm": 1.3912808895111084, + "learning_rate": 7.405000000000001e-05, + "loss": 0.4099, + "step": 14818 + }, + { + "epoch": 0.8298241684399149, + "grad_norm": 1.3266398906707764, + "learning_rate": 7.405500000000001e-05, + "loss": 0.4376, + "step": 14819 + }, + { + "epoch": 0.8298801657520439, + "grad_norm": 1.2280153036117554, + "learning_rate": 7.406e-05, + "loss": 0.4315, + "step": 14820 + }, + { + "epoch": 0.829936163064173, + "grad_norm": 1.4548487663269043, + "learning_rate": 7.4065e-05, + "loss": 0.5355, + "step": 14821 + }, + { + "epoch": 0.829992160376302, + "grad_norm": 1.5744315385818481, + "learning_rate": 7.407e-05, + "loss": 0.4781, + "step": 14822 + }, + { + "epoch": 0.830048157688431, + "grad_norm": 
1.4761401414871216, + "learning_rate": 7.407500000000001e-05, + "loss": 0.4804, + "step": 14823 + }, + { + "epoch": 0.83010415500056, + "grad_norm": 1.4220203161239624, + "learning_rate": 7.408000000000001e-05, + "loss": 0.4612, + "step": 14824 + }, + { + "epoch": 0.830160152312689, + "grad_norm": 1.3385324478149414, + "learning_rate": 7.4085e-05, + "loss": 0.4546, + "step": 14825 + }, + { + "epoch": 0.830216149624818, + "grad_norm": 1.2253103256225586, + "learning_rate": 7.409e-05, + "loss": 0.3805, + "step": 14826 + }, + { + "epoch": 0.8302721469369471, + "grad_norm": 1.189700961112976, + "learning_rate": 7.4095e-05, + "loss": 0.3957, + "step": 14827 + }, + { + "epoch": 0.8303281442490761, + "grad_norm": 1.3408607244491577, + "learning_rate": 7.41e-05, + "loss": 0.4735, + "step": 14828 + }, + { + "epoch": 0.8303841415612051, + "grad_norm": 1.4462240934371948, + "learning_rate": 7.4105e-05, + "loss": 0.4593, + "step": 14829 + }, + { + "epoch": 0.8304401388733341, + "grad_norm": 1.2908709049224854, + "learning_rate": 7.411000000000001e-05, + "loss": 0.4502, + "step": 14830 + }, + { + "epoch": 0.8304961361854631, + "grad_norm": 1.2498122453689575, + "learning_rate": 7.4115e-05, + "loss": 0.3867, + "step": 14831 + }, + { + "epoch": 0.8305521334975922, + "grad_norm": 1.355921983718872, + "learning_rate": 7.412e-05, + "loss": 0.4423, + "step": 14832 + }, + { + "epoch": 0.8306081308097212, + "grad_norm": 1.0706628561019897, + "learning_rate": 7.4125e-05, + "loss": 0.3468, + "step": 14833 + }, + { + "epoch": 0.8306641281218502, + "grad_norm": 1.3038891553878784, + "learning_rate": 7.413e-05, + "loss": 0.3944, + "step": 14834 + }, + { + "epoch": 0.8307201254339792, + "grad_norm": 1.3403329849243164, + "learning_rate": 7.4135e-05, + "loss": 0.4666, + "step": 14835 + }, + { + "epoch": 0.8307761227461082, + "grad_norm": 11.348949432373047, + "learning_rate": 7.414e-05, + "loss": 0.502, + "step": 14836 + }, + { + "epoch": 0.8308321200582373, + "grad_norm": 2.9683735370635986, 
+ "learning_rate": 7.4145e-05, + "loss": 0.3808, + "step": 14837 + }, + { + "epoch": 0.8308881173703663, + "grad_norm": 1.7411690950393677, + "learning_rate": 7.415000000000001e-05, + "loss": 0.6224, + "step": 14838 + }, + { + "epoch": 0.8309441146824953, + "grad_norm": 1.753604769706726, + "learning_rate": 7.415500000000001e-05, + "loss": 0.5623, + "step": 14839 + }, + { + "epoch": 0.8310001119946243, + "grad_norm": 1.29805588722229, + "learning_rate": 7.416000000000001e-05, + "loss": 0.4957, + "step": 14840 + }, + { + "epoch": 0.8310561093067532, + "grad_norm": 1.355239987373352, + "learning_rate": 7.4165e-05, + "loss": 0.4066, + "step": 14841 + }, + { + "epoch": 0.8311121066188822, + "grad_norm": 1.1854850053787231, + "learning_rate": 7.417e-05, + "loss": 0.4089, + "step": 14842 + }, + { + "epoch": 0.8311681039310113, + "grad_norm": 1.3505926132202148, + "learning_rate": 7.4175e-05, + "loss": 0.4427, + "step": 14843 + }, + { + "epoch": 0.8312241012431403, + "grad_norm": 1.7071194648742676, + "learning_rate": 7.418000000000001e-05, + "loss": 0.5477, + "step": 14844 + }, + { + "epoch": 0.8312800985552693, + "grad_norm": 1.307734727859497, + "learning_rate": 7.418500000000001e-05, + "loss": 0.3538, + "step": 14845 + }, + { + "epoch": 0.8313360958673983, + "grad_norm": 1.3063827753067017, + "learning_rate": 7.419e-05, + "loss": 0.4007, + "step": 14846 + }, + { + "epoch": 0.8313920931795273, + "grad_norm": 1.40926194190979, + "learning_rate": 7.4195e-05, + "loss": 0.4593, + "step": 14847 + }, + { + "epoch": 0.8314480904916564, + "grad_norm": 1.2591607570648193, + "learning_rate": 7.42e-05, + "loss": 0.4995, + "step": 14848 + }, + { + "epoch": 0.8315040878037854, + "grad_norm": 1.5662548542022705, + "learning_rate": 7.4205e-05, + "loss": 0.5592, + "step": 14849 + }, + { + "epoch": 0.8315600851159144, + "grad_norm": 1.2944042682647705, + "learning_rate": 7.421e-05, + "loss": 0.4518, + "step": 14850 + }, + { + "epoch": 0.8316160824280434, + "grad_norm": 
2.1192898750305176, + "learning_rate": 7.421500000000001e-05, + "loss": 0.4295, + "step": 14851 + }, + { + "epoch": 0.8316720797401724, + "grad_norm": 1.2583346366882324, + "learning_rate": 7.422e-05, + "loss": 0.4781, + "step": 14852 + }, + { + "epoch": 0.8317280770523015, + "grad_norm": 1.570609450340271, + "learning_rate": 7.4225e-05, + "loss": 0.6466, + "step": 14853 + }, + { + "epoch": 0.8317840743644305, + "grad_norm": 1.2126221656799316, + "learning_rate": 7.423e-05, + "loss": 0.4425, + "step": 14854 + }, + { + "epoch": 0.8318400716765595, + "grad_norm": 1.2975512742996216, + "learning_rate": 7.4235e-05, + "loss": 0.3918, + "step": 14855 + }, + { + "epoch": 0.8318960689886885, + "grad_norm": 1.341474175453186, + "learning_rate": 7.424e-05, + "loss": 0.4896, + "step": 14856 + }, + { + "epoch": 0.8319520663008175, + "grad_norm": 1.5290112495422363, + "learning_rate": 7.4245e-05, + "loss": 0.4209, + "step": 14857 + }, + { + "epoch": 0.8320080636129465, + "grad_norm": 1.3061316013336182, + "learning_rate": 7.425e-05, + "loss": 0.4874, + "step": 14858 + }, + { + "epoch": 0.8320640609250756, + "grad_norm": 1.4568842649459839, + "learning_rate": 7.425500000000001e-05, + "loss": 0.4203, + "step": 14859 + }, + { + "epoch": 0.8321200582372046, + "grad_norm": 1.3135074377059937, + "learning_rate": 7.426000000000001e-05, + "loss": 0.4105, + "step": 14860 + }, + { + "epoch": 0.8321760555493336, + "grad_norm": 1.4716944694519043, + "learning_rate": 7.426500000000001e-05, + "loss": 0.5895, + "step": 14861 + }, + { + "epoch": 0.8322320528614626, + "grad_norm": 1.6510661840438843, + "learning_rate": 7.427e-05, + "loss": 0.476, + "step": 14862 + }, + { + "epoch": 0.8322880501735916, + "grad_norm": 1.1166248321533203, + "learning_rate": 7.4275e-05, + "loss": 0.3829, + "step": 14863 + }, + { + "epoch": 0.8323440474857207, + "grad_norm": 1.3078899383544922, + "learning_rate": 7.428e-05, + "loss": 0.419, + "step": 14864 + }, + { + "epoch": 0.8324000447978497, + "grad_norm": 
1.235997200012207, + "learning_rate": 7.428500000000001e-05, + "loss": 0.4324, + "step": 14865 + }, + { + "epoch": 0.8324560421099787, + "grad_norm": 1.4174054861068726, + "learning_rate": 7.429000000000001e-05, + "loss": 0.4429, + "step": 14866 + }, + { + "epoch": 0.8325120394221077, + "grad_norm": 1.2512991428375244, + "learning_rate": 7.4295e-05, + "loss": 0.5797, + "step": 14867 + }, + { + "epoch": 0.8325680367342367, + "grad_norm": 1.4484134912490845, + "learning_rate": 7.43e-05, + "loss": 0.4633, + "step": 14868 + }, + { + "epoch": 0.8326240340463658, + "grad_norm": 1.1724870204925537, + "learning_rate": 7.4305e-05, + "loss": 0.4174, + "step": 14869 + }, + { + "epoch": 0.8326800313584948, + "grad_norm": 1.6425628662109375, + "learning_rate": 7.431e-05, + "loss": 0.4574, + "step": 14870 + }, + { + "epoch": 0.8327360286706238, + "grad_norm": 1.5310993194580078, + "learning_rate": 7.4315e-05, + "loss": 0.4455, + "step": 14871 + }, + { + "epoch": 0.8327920259827528, + "grad_norm": 1.3307411670684814, + "learning_rate": 7.432e-05, + "loss": 0.3753, + "step": 14872 + }, + { + "epoch": 0.8328480232948818, + "grad_norm": 1.6938661336898804, + "learning_rate": 7.4325e-05, + "loss": 0.3857, + "step": 14873 + }, + { + "epoch": 0.8329040206070109, + "grad_norm": 1.1644052267074585, + "learning_rate": 7.433e-05, + "loss": 0.37, + "step": 14874 + }, + { + "epoch": 0.8329600179191399, + "grad_norm": 1.5414255857467651, + "learning_rate": 7.4335e-05, + "loss": 0.7109, + "step": 14875 + }, + { + "epoch": 0.8330160152312689, + "grad_norm": 1.226073145866394, + "learning_rate": 7.434e-05, + "loss": 0.4286, + "step": 14876 + }, + { + "epoch": 0.8330720125433979, + "grad_norm": 1.476332187652588, + "learning_rate": 7.434500000000001e-05, + "loss": 0.5368, + "step": 14877 + }, + { + "epoch": 0.8331280098555269, + "grad_norm": 1.1078234910964966, + "learning_rate": 7.435e-05, + "loss": 0.3977, + "step": 14878 + }, + { + "epoch": 0.833184007167656, + "grad_norm": 1.0875986814498901, 
+ "learning_rate": 7.4355e-05, + "loss": 0.4219, + "step": 14879 + }, + { + "epoch": 0.833240004479785, + "grad_norm": 1.0852160453796387, + "learning_rate": 7.436000000000001e-05, + "loss": 0.3652, + "step": 14880 + }, + { + "epoch": 0.833296001791914, + "grad_norm": 1.3678065538406372, + "learning_rate": 7.436500000000001e-05, + "loss": 0.4264, + "step": 14881 + }, + { + "epoch": 0.833351999104043, + "grad_norm": 1.3367291688919067, + "learning_rate": 7.437000000000001e-05, + "loss": 0.4543, + "step": 14882 + }, + { + "epoch": 0.833407996416172, + "grad_norm": 1.0837889909744263, + "learning_rate": 7.4375e-05, + "loss": 0.3629, + "step": 14883 + }, + { + "epoch": 0.833463993728301, + "grad_norm": 1.1942131519317627, + "learning_rate": 7.438e-05, + "loss": 0.4346, + "step": 14884 + }, + { + "epoch": 0.8335199910404301, + "grad_norm": 1.2060596942901611, + "learning_rate": 7.4385e-05, + "loss": 0.4946, + "step": 14885 + }, + { + "epoch": 0.8335759883525591, + "grad_norm": 1.5555088520050049, + "learning_rate": 7.439e-05, + "loss": 0.4262, + "step": 14886 + }, + { + "epoch": 0.8336319856646881, + "grad_norm": 1.3655695915222168, + "learning_rate": 7.439500000000001e-05, + "loss": 0.4072, + "step": 14887 + }, + { + "epoch": 0.8336879829768171, + "grad_norm": 1.4177491664886475, + "learning_rate": 7.44e-05, + "loss": 0.3854, + "step": 14888 + }, + { + "epoch": 0.8337439802889461, + "grad_norm": 1.4612762928009033, + "learning_rate": 7.4405e-05, + "loss": 0.5113, + "step": 14889 + }, + { + "epoch": 0.8337999776010752, + "grad_norm": 2.1552562713623047, + "learning_rate": 7.441e-05, + "loss": 0.4914, + "step": 14890 + }, + { + "epoch": 0.8338559749132042, + "grad_norm": 1.421797752380371, + "learning_rate": 7.4415e-05, + "loss": 0.4435, + "step": 14891 + }, + { + "epoch": 0.8339119722253332, + "grad_norm": 28.51599884033203, + "learning_rate": 7.442e-05, + "loss": 0.4078, + "step": 14892 + }, + { + "epoch": 0.8339679695374622, + "grad_norm": 1.7614316940307617, + 
"learning_rate": 7.4425e-05, + "loss": 0.4645, + "step": 14893 + }, + { + "epoch": 0.8340239668495912, + "grad_norm": 1.2875900268554688, + "learning_rate": 7.443e-05, + "loss": 0.3867, + "step": 14894 + }, + { + "epoch": 0.8340799641617203, + "grad_norm": 1.457829236984253, + "learning_rate": 7.4435e-05, + "loss": 0.5773, + "step": 14895 + }, + { + "epoch": 0.8341359614738493, + "grad_norm": 1.4923008680343628, + "learning_rate": 7.444e-05, + "loss": 0.5008, + "step": 14896 + }, + { + "epoch": 0.8341919587859783, + "grad_norm": 1.265728235244751, + "learning_rate": 7.4445e-05, + "loss": 0.4255, + "step": 14897 + }, + { + "epoch": 0.8342479560981073, + "grad_norm": 1.3114256858825684, + "learning_rate": 7.445000000000001e-05, + "loss": 0.4625, + "step": 14898 + }, + { + "epoch": 0.8343039534102363, + "grad_norm": 1.353049874305725, + "learning_rate": 7.4455e-05, + "loss": 0.4348, + "step": 14899 + }, + { + "epoch": 0.8343599507223654, + "grad_norm": 1.4858341217041016, + "learning_rate": 7.446e-05, + "loss": 0.5839, + "step": 14900 + }, + { + "epoch": 0.8344159480344944, + "grad_norm": 1.201106071472168, + "learning_rate": 7.446500000000001e-05, + "loss": 0.5097, + "step": 14901 + }, + { + "epoch": 0.8344719453466234, + "grad_norm": 1.4960027933120728, + "learning_rate": 7.447000000000001e-05, + "loss": 0.3893, + "step": 14902 + }, + { + "epoch": 0.8345279426587524, + "grad_norm": 1.461047887802124, + "learning_rate": 7.447500000000001e-05, + "loss": 0.5777, + "step": 14903 + }, + { + "epoch": 0.8345839399708814, + "grad_norm": 1.317950963973999, + "learning_rate": 7.448e-05, + "loss": 0.3558, + "step": 14904 + }, + { + "epoch": 0.8346399372830104, + "grad_norm": 1.1693061590194702, + "learning_rate": 7.4485e-05, + "loss": 0.3988, + "step": 14905 + }, + { + "epoch": 0.8346959345951395, + "grad_norm": 1.4293357133865356, + "learning_rate": 7.449e-05, + "loss": 0.3711, + "step": 14906 + }, + { + "epoch": 0.8347519319072685, + "grad_norm": 1.3089927434921265, + 
"learning_rate": 7.4495e-05, + "loss": 0.4544, + "step": 14907 + }, + { + "epoch": 0.8348079292193975, + "grad_norm": 1.3952184915542603, + "learning_rate": 7.450000000000001e-05, + "loss": 0.5343, + "step": 14908 + }, + { + "epoch": 0.8348639265315265, + "grad_norm": 1.7103863954544067, + "learning_rate": 7.4505e-05, + "loss": 0.4547, + "step": 14909 + }, + { + "epoch": 0.8349199238436555, + "grad_norm": 1.2795830965042114, + "learning_rate": 7.451e-05, + "loss": 0.4001, + "step": 14910 + }, + { + "epoch": 0.8349759211557846, + "grad_norm": 1.4509633779525757, + "learning_rate": 7.4515e-05, + "loss": 0.5354, + "step": 14911 + }, + { + "epoch": 0.8350319184679136, + "grad_norm": 1.3074486255645752, + "learning_rate": 7.452e-05, + "loss": 0.3619, + "step": 14912 + }, + { + "epoch": 0.8350879157800426, + "grad_norm": 1.2817742824554443, + "learning_rate": 7.4525e-05, + "loss": 0.4242, + "step": 14913 + }, + { + "epoch": 0.8351439130921716, + "grad_norm": 1.4056373834609985, + "learning_rate": 7.453e-05, + "loss": 0.546, + "step": 14914 + }, + { + "epoch": 0.8351999104043006, + "grad_norm": 1.1839416027069092, + "learning_rate": 7.4535e-05, + "loss": 0.4322, + "step": 14915 + }, + { + "epoch": 0.8352559077164297, + "grad_norm": 1.1238529682159424, + "learning_rate": 7.454e-05, + "loss": 0.3937, + "step": 14916 + }, + { + "epoch": 0.8353119050285587, + "grad_norm": 1.6225190162658691, + "learning_rate": 7.4545e-05, + "loss": 0.5391, + "step": 14917 + }, + { + "epoch": 0.8353679023406877, + "grad_norm": 1.2570643424987793, + "learning_rate": 7.455000000000001e-05, + "loss": 0.5262, + "step": 14918 + }, + { + "epoch": 0.8354238996528167, + "grad_norm": 1.3023627996444702, + "learning_rate": 7.455500000000001e-05, + "loss": 0.5817, + "step": 14919 + }, + { + "epoch": 0.8354798969649457, + "grad_norm": 1.1538527011871338, + "learning_rate": 7.456e-05, + "loss": 0.3635, + "step": 14920 + }, + { + "epoch": 0.8355358942770748, + "grad_norm": 1.3623279333114624, + 
"learning_rate": 7.4565e-05, + "loss": 0.45, + "step": 14921 + }, + { + "epoch": 0.8355918915892038, + "grad_norm": 1.2881258726119995, + "learning_rate": 7.457000000000001e-05, + "loss": 0.4108, + "step": 14922 + }, + { + "epoch": 0.8356478889013328, + "grad_norm": 1.0918225049972534, + "learning_rate": 7.457500000000001e-05, + "loss": 0.3451, + "step": 14923 + }, + { + "epoch": 0.8357038862134617, + "grad_norm": 1.4319071769714355, + "learning_rate": 7.458000000000001e-05, + "loss": 0.5995, + "step": 14924 + }, + { + "epoch": 0.8357598835255907, + "grad_norm": 1.2443747520446777, + "learning_rate": 7.4585e-05, + "loss": 0.4229, + "step": 14925 + }, + { + "epoch": 0.8358158808377197, + "grad_norm": 1.3476805686950684, + "learning_rate": 7.459e-05, + "loss": 0.3255, + "step": 14926 + }, + { + "epoch": 0.8358718781498488, + "grad_norm": 1.546398401260376, + "learning_rate": 7.4595e-05, + "loss": 0.4513, + "step": 14927 + }, + { + "epoch": 0.8359278754619778, + "grad_norm": 1.3001435995101929, + "learning_rate": 7.46e-05, + "loss": 0.4581, + "step": 14928 + }, + { + "epoch": 0.8359838727741068, + "grad_norm": 1.1330454349517822, + "learning_rate": 7.460500000000001e-05, + "loss": 0.3801, + "step": 14929 + }, + { + "epoch": 0.8360398700862358, + "grad_norm": 1.2715420722961426, + "learning_rate": 7.461e-05, + "loss": 0.3954, + "step": 14930 + }, + { + "epoch": 0.8360958673983648, + "grad_norm": 1.3216328620910645, + "learning_rate": 7.4615e-05, + "loss": 0.4547, + "step": 14931 + }, + { + "epoch": 0.8361518647104939, + "grad_norm": 1.295122742652893, + "learning_rate": 7.462e-05, + "loss": 0.5081, + "step": 14932 + }, + { + "epoch": 0.8362078620226229, + "grad_norm": 1.2410364151000977, + "learning_rate": 7.4625e-05, + "loss": 0.3546, + "step": 14933 + }, + { + "epoch": 0.8362638593347519, + "grad_norm": 1.381272554397583, + "learning_rate": 7.463e-05, + "loss": 0.4826, + "step": 14934 + }, + { + "epoch": 0.8363198566468809, + "grad_norm": 1.2284221649169922, + 
"learning_rate": 7.463499999999999e-05, + "loss": 0.4441, + "step": 14935 + }, + { + "epoch": 0.8363758539590099, + "grad_norm": 1.4489960670471191, + "learning_rate": 7.464e-05, + "loss": 0.4896, + "step": 14936 + }, + { + "epoch": 0.836431851271139, + "grad_norm": 1.1005908250808716, + "learning_rate": 7.4645e-05, + "loss": 0.3141, + "step": 14937 + }, + { + "epoch": 0.836487848583268, + "grad_norm": 1.5796070098876953, + "learning_rate": 7.465000000000001e-05, + "loss": 0.5244, + "step": 14938 + }, + { + "epoch": 0.836543845895397, + "grad_norm": 1.3035825490951538, + "learning_rate": 7.465500000000001e-05, + "loss": 0.3867, + "step": 14939 + }, + { + "epoch": 0.836599843207526, + "grad_norm": 1.3744882345199585, + "learning_rate": 7.466000000000001e-05, + "loss": 0.4519, + "step": 14940 + }, + { + "epoch": 0.836655840519655, + "grad_norm": 1.293918490409851, + "learning_rate": 7.4665e-05, + "loss": 0.4301, + "step": 14941 + }, + { + "epoch": 0.836711837831784, + "grad_norm": 1.397953987121582, + "learning_rate": 7.467e-05, + "loss": 0.3475, + "step": 14942 + }, + { + "epoch": 0.8367678351439131, + "grad_norm": 1.3367972373962402, + "learning_rate": 7.467500000000001e-05, + "loss": 0.4344, + "step": 14943 + }, + { + "epoch": 0.8368238324560421, + "grad_norm": 1.3484013080596924, + "learning_rate": 7.468000000000001e-05, + "loss": 0.4412, + "step": 14944 + }, + { + "epoch": 0.8368798297681711, + "grad_norm": 1.146432638168335, + "learning_rate": 7.468500000000001e-05, + "loss": 0.3475, + "step": 14945 + }, + { + "epoch": 0.8369358270803001, + "grad_norm": 1.3615920543670654, + "learning_rate": 7.469e-05, + "loss": 0.4965, + "step": 14946 + }, + { + "epoch": 0.8369918243924291, + "grad_norm": 1.2948979139328003, + "learning_rate": 7.4695e-05, + "loss": 0.4753, + "step": 14947 + }, + { + "epoch": 0.8370478217045582, + "grad_norm": 1.672621726989746, + "learning_rate": 7.47e-05, + "loss": 0.3665, + "step": 14948 + }, + { + "epoch": 0.8371038190166872, + "grad_norm": 
1.4697469472885132, + "learning_rate": 7.4705e-05, + "loss": 0.5146, + "step": 14949 + }, + { + "epoch": 0.8371598163288162, + "grad_norm": 1.4369267225265503, + "learning_rate": 7.471000000000001e-05, + "loss": 0.4534, + "step": 14950 + }, + { + "epoch": 0.8372158136409452, + "grad_norm": 1.263033390045166, + "learning_rate": 7.4715e-05, + "loss": 0.3747, + "step": 14951 + }, + { + "epoch": 0.8372718109530742, + "grad_norm": 1.2970012426376343, + "learning_rate": 7.472e-05, + "loss": 0.4123, + "step": 14952 + }, + { + "epoch": 0.8373278082652033, + "grad_norm": 1.3430755138397217, + "learning_rate": 7.4725e-05, + "loss": 0.5154, + "step": 14953 + }, + { + "epoch": 0.8373838055773323, + "grad_norm": 1.4384161233901978, + "learning_rate": 7.473e-05, + "loss": 0.4757, + "step": 14954 + }, + { + "epoch": 0.8374398028894613, + "grad_norm": 1.372983694076538, + "learning_rate": 7.4735e-05, + "loss": 0.5302, + "step": 14955 + }, + { + "epoch": 0.8374958002015903, + "grad_norm": 1.2524195909500122, + "learning_rate": 7.473999999999999e-05, + "loss": 0.3936, + "step": 14956 + }, + { + "epoch": 0.8375517975137193, + "grad_norm": 1.3363986015319824, + "learning_rate": 7.4745e-05, + "loss": 0.5856, + "step": 14957 + }, + { + "epoch": 0.8376077948258484, + "grad_norm": 1.2847176790237427, + "learning_rate": 7.475000000000001e-05, + "loss": 0.4496, + "step": 14958 + }, + { + "epoch": 0.8376637921379774, + "grad_norm": 1.5691107511520386, + "learning_rate": 7.475500000000001e-05, + "loss": 0.8147, + "step": 14959 + }, + { + "epoch": 0.8377197894501064, + "grad_norm": 1.3995745182037354, + "learning_rate": 7.476000000000001e-05, + "loss": 0.4678, + "step": 14960 + }, + { + "epoch": 0.8377757867622354, + "grad_norm": 1.3471348285675049, + "learning_rate": 7.4765e-05, + "loss": 0.4194, + "step": 14961 + }, + { + "epoch": 0.8378317840743644, + "grad_norm": 1.2455815076828003, + "learning_rate": 7.477e-05, + "loss": 0.3867, + "step": 14962 + }, + { + "epoch": 0.8378877813864934, + 
"grad_norm": 1.4678806066513062, + "learning_rate": 7.4775e-05, + "loss": 0.4391, + "step": 14963 + }, + { + "epoch": 0.8379437786986225, + "grad_norm": 1.3188345432281494, + "learning_rate": 7.478e-05, + "loss": 0.4947, + "step": 14964 + }, + { + "epoch": 0.8379997760107515, + "grad_norm": 1.3664520978927612, + "learning_rate": 7.478500000000001e-05, + "loss": 0.4572, + "step": 14965 + }, + { + "epoch": 0.8380557733228805, + "grad_norm": 1.267961025238037, + "learning_rate": 7.479000000000001e-05, + "loss": 0.3737, + "step": 14966 + }, + { + "epoch": 0.8381117706350095, + "grad_norm": 1.320089340209961, + "learning_rate": 7.4795e-05, + "loss": 0.4796, + "step": 14967 + }, + { + "epoch": 0.8381677679471385, + "grad_norm": 1.3131184577941895, + "learning_rate": 7.48e-05, + "loss": 0.4372, + "step": 14968 + }, + { + "epoch": 0.8382237652592676, + "grad_norm": 1.4215002059936523, + "learning_rate": 7.4805e-05, + "loss": 0.4411, + "step": 14969 + }, + { + "epoch": 0.8382797625713966, + "grad_norm": 1.3251869678497314, + "learning_rate": 7.481e-05, + "loss": 0.4582, + "step": 14970 + }, + { + "epoch": 0.8383357598835256, + "grad_norm": 1.3846755027770996, + "learning_rate": 7.481500000000001e-05, + "loss": 0.5226, + "step": 14971 + }, + { + "epoch": 0.8383917571956546, + "grad_norm": 1.3473392724990845, + "learning_rate": 7.482e-05, + "loss": 0.5661, + "step": 14972 + }, + { + "epoch": 0.8384477545077836, + "grad_norm": 1.5047401189804077, + "learning_rate": 7.4825e-05, + "loss": 0.5873, + "step": 14973 + }, + { + "epoch": 0.8385037518199127, + "grad_norm": 1.4454703330993652, + "learning_rate": 7.483e-05, + "loss": 0.6128, + "step": 14974 + }, + { + "epoch": 0.8385597491320417, + "grad_norm": 1.2447439432144165, + "learning_rate": 7.4835e-05, + "loss": 0.3712, + "step": 14975 + }, + { + "epoch": 0.8386157464441707, + "grad_norm": 1.2866668701171875, + "learning_rate": 7.484e-05, + "loss": 0.3689, + "step": 14976 + }, + { + "epoch": 0.8386717437562997, + "grad_norm": 
1.4072551727294922, + "learning_rate": 7.484499999999999e-05, + "loss": 0.4984, + "step": 14977 + }, + { + "epoch": 0.8387277410684287, + "grad_norm": 1.4915257692337036, + "learning_rate": 7.485e-05, + "loss": 0.3813, + "step": 14978 + }, + { + "epoch": 0.8387837383805578, + "grad_norm": 1.3246511220932007, + "learning_rate": 7.485500000000001e-05, + "loss": 0.4371, + "step": 14979 + }, + { + "epoch": 0.8388397356926868, + "grad_norm": 1.0489832162857056, + "learning_rate": 7.486000000000001e-05, + "loss": 0.3958, + "step": 14980 + }, + { + "epoch": 0.8388957330048158, + "grad_norm": 1.1089919805526733, + "learning_rate": 7.486500000000001e-05, + "loss": 0.3411, + "step": 14981 + }, + { + "epoch": 0.8389517303169448, + "grad_norm": 1.6093395948410034, + "learning_rate": 7.487e-05, + "loss": 0.4069, + "step": 14982 + }, + { + "epoch": 0.8390077276290738, + "grad_norm": 1.4661673307418823, + "learning_rate": 7.4875e-05, + "loss": 0.5336, + "step": 14983 + }, + { + "epoch": 0.8390637249412028, + "grad_norm": 1.3630739450454712, + "learning_rate": 7.488e-05, + "loss": 0.4706, + "step": 14984 + }, + { + "epoch": 0.8391197222533319, + "grad_norm": 1.1160573959350586, + "learning_rate": 7.4885e-05, + "loss": 0.3812, + "step": 14985 + }, + { + "epoch": 0.8391757195654609, + "grad_norm": 1.460625410079956, + "learning_rate": 7.489000000000001e-05, + "loss": 0.615, + "step": 14986 + }, + { + "epoch": 0.8392317168775899, + "grad_norm": 1.4919368028640747, + "learning_rate": 7.489500000000001e-05, + "loss": 0.6785, + "step": 14987 + }, + { + "epoch": 0.8392877141897189, + "grad_norm": 1.2921801805496216, + "learning_rate": 7.49e-05, + "loss": 0.3843, + "step": 14988 + }, + { + "epoch": 0.8393437115018479, + "grad_norm": 1.3943148851394653, + "learning_rate": 7.4905e-05, + "loss": 0.5812, + "step": 14989 + }, + { + "epoch": 0.839399708813977, + "grad_norm": 1.2905611991882324, + "learning_rate": 7.491e-05, + "loss": 0.4547, + "step": 14990 + }, + { + "epoch": 
0.839455706126106, + "grad_norm": 1.599746823310852, + "learning_rate": 7.4915e-05, + "loss": 0.4198, + "step": 14991 + }, + { + "epoch": 0.839511703438235, + "grad_norm": 1.3307349681854248, + "learning_rate": 7.492000000000001e-05, + "loss": 0.4406, + "step": 14992 + }, + { + "epoch": 0.839567700750364, + "grad_norm": 1.2734228372573853, + "learning_rate": 7.4925e-05, + "loss": 0.5028, + "step": 14993 + }, + { + "epoch": 0.839623698062493, + "grad_norm": 1.4966728687286377, + "learning_rate": 7.493e-05, + "loss": 0.4875, + "step": 14994 + }, + { + "epoch": 0.8396796953746221, + "grad_norm": 1.447335958480835, + "learning_rate": 7.4935e-05, + "loss": 0.4251, + "step": 14995 + }, + { + "epoch": 0.8397356926867511, + "grad_norm": 1.4132452011108398, + "learning_rate": 7.494e-05, + "loss": 0.5464, + "step": 14996 + }, + { + "epoch": 0.8397916899988801, + "grad_norm": 1.608242154121399, + "learning_rate": 7.4945e-05, + "loss": 0.3826, + "step": 14997 + }, + { + "epoch": 0.8398476873110091, + "grad_norm": 1.1616274118423462, + "learning_rate": 7.495e-05, + "loss": 0.3667, + "step": 14998 + }, + { + "epoch": 0.8399036846231381, + "grad_norm": 1.2818100452423096, + "learning_rate": 7.4955e-05, + "loss": 0.4147, + "step": 14999 + }, + { + "epoch": 0.8399596819352672, + "grad_norm": 1.1615766286849976, + "learning_rate": 7.496000000000001e-05, + "loss": 0.3461, + "step": 15000 + }, + { + "epoch": 0.8400156792473962, + "grad_norm": 1.5612907409667969, + "learning_rate": 7.496500000000001e-05, + "loss": 0.4007, + "step": 15001 + }, + { + "epoch": 0.8400716765595252, + "grad_norm": 1.4862791299819946, + "learning_rate": 7.497000000000001e-05, + "loss": 0.4315, + "step": 15002 + }, + { + "epoch": 0.8401276738716542, + "grad_norm": 1.3636221885681152, + "learning_rate": 7.4975e-05, + "loss": 0.5253, + "step": 15003 + }, + { + "epoch": 0.8401836711837832, + "grad_norm": 1.2589805126190186, + "learning_rate": 7.498e-05, + "loss": 0.3413, + "step": 15004 + }, + { + "epoch": 
0.8402396684959123, + "grad_norm": 1.2923073768615723, + "learning_rate": 7.4985e-05, + "loss": 0.3613, + "step": 15005 + }, + { + "epoch": 0.8402956658080412, + "grad_norm": 1.5071004629135132, + "learning_rate": 7.499e-05, + "loss": 0.385, + "step": 15006 + }, + { + "epoch": 0.8403516631201702, + "grad_norm": 1.355750560760498, + "learning_rate": 7.499500000000001e-05, + "loss": 0.3953, + "step": 15007 + }, + { + "epoch": 0.8404076604322992, + "grad_norm": 1.3409547805786133, + "learning_rate": 7.500000000000001e-05, + "loss": 0.3501, + "step": 15008 + }, + { + "epoch": 0.8404636577444282, + "grad_norm": 1.5930057764053345, + "learning_rate": 7.5005e-05, + "loss": 0.4344, + "step": 15009 + }, + { + "epoch": 0.8405196550565572, + "grad_norm": 1.585240364074707, + "learning_rate": 7.501e-05, + "loss": 0.7247, + "step": 15010 + }, + { + "epoch": 0.8405756523686863, + "grad_norm": 1.1724711656570435, + "learning_rate": 7.5015e-05, + "loss": 0.4556, + "step": 15011 + }, + { + "epoch": 0.8406316496808153, + "grad_norm": 1.4890203475952148, + "learning_rate": 7.502e-05, + "loss": 0.5924, + "step": 15012 + }, + { + "epoch": 0.8406876469929443, + "grad_norm": 1.615662932395935, + "learning_rate": 7.502500000000001e-05, + "loss": 0.4121, + "step": 15013 + }, + { + "epoch": 0.8407436443050733, + "grad_norm": 1.2852932214736938, + "learning_rate": 7.503e-05, + "loss": 0.393, + "step": 15014 + }, + { + "epoch": 0.8407996416172023, + "grad_norm": 1.6026692390441895, + "learning_rate": 7.5035e-05, + "loss": 0.4125, + "step": 15015 + }, + { + "epoch": 0.8408556389293314, + "grad_norm": 1.4187570810317993, + "learning_rate": 7.504e-05, + "loss": 0.4448, + "step": 15016 + }, + { + "epoch": 0.8409116362414604, + "grad_norm": 1.3563053607940674, + "learning_rate": 7.5045e-05, + "loss": 0.4576, + "step": 15017 + }, + { + "epoch": 0.8409676335535894, + "grad_norm": 1.2016352415084839, + "learning_rate": 7.505e-05, + "loss": 0.4305, + "step": 15018 + }, + { + "epoch": 
0.8410236308657184, + "grad_norm": 1.245765209197998, + "learning_rate": 7.5055e-05, + "loss": 0.3862, + "step": 15019 + }, + { + "epoch": 0.8410796281778474, + "grad_norm": 2.2996387481689453, + "learning_rate": 7.506e-05, + "loss": 0.5382, + "step": 15020 + }, + { + "epoch": 0.8411356254899764, + "grad_norm": 1.327736735343933, + "learning_rate": 7.506500000000001e-05, + "loss": 0.5347, + "step": 15021 + }, + { + "epoch": 0.8411916228021055, + "grad_norm": 1.5099338293075562, + "learning_rate": 7.507000000000001e-05, + "loss": 0.5358, + "step": 15022 + }, + { + "epoch": 0.8412476201142345, + "grad_norm": 1.208007574081421, + "learning_rate": 7.507500000000001e-05, + "loss": 0.5175, + "step": 15023 + }, + { + "epoch": 0.8413036174263635, + "grad_norm": 1.517074465751648, + "learning_rate": 7.508e-05, + "loss": 0.5982, + "step": 15024 + }, + { + "epoch": 0.8413596147384925, + "grad_norm": 1.3720316886901855, + "learning_rate": 7.5085e-05, + "loss": 0.474, + "step": 15025 + }, + { + "epoch": 0.8414156120506215, + "grad_norm": 1.3069417476654053, + "learning_rate": 7.509e-05, + "loss": 0.4922, + "step": 15026 + }, + { + "epoch": 0.8414716093627506, + "grad_norm": 1.3628120422363281, + "learning_rate": 7.5095e-05, + "loss": 0.4216, + "step": 15027 + }, + { + "epoch": 0.8415276066748796, + "grad_norm": 1.2434688806533813, + "learning_rate": 7.510000000000001e-05, + "loss": 0.629, + "step": 15028 + }, + { + "epoch": 0.8415836039870086, + "grad_norm": 1.5129505395889282, + "learning_rate": 7.510500000000001e-05, + "loss": 0.422, + "step": 15029 + }, + { + "epoch": 0.8416396012991376, + "grad_norm": 1.1953586339950562, + "learning_rate": 7.511e-05, + "loss": 0.4075, + "step": 15030 + }, + { + "epoch": 0.8416955986112666, + "grad_norm": 2.293485403060913, + "learning_rate": 7.5115e-05, + "loss": 0.6196, + "step": 15031 + }, + { + "epoch": 0.8417515959233957, + "grad_norm": 1.3731416463851929, + "learning_rate": 7.512e-05, + "loss": 0.4824, + "step": 15032 + }, + { + 
"epoch": 0.8418075932355247, + "grad_norm": 1.2921339273452759, + "learning_rate": 7.5125e-05, + "loss": 0.4374, + "step": 15033 + }, + { + "epoch": 0.8418635905476537, + "grad_norm": 1.2573421001434326, + "learning_rate": 7.513e-05, + "loss": 0.3117, + "step": 15034 + }, + { + "epoch": 0.8419195878597827, + "grad_norm": 1.2236018180847168, + "learning_rate": 7.5135e-05, + "loss": 0.4238, + "step": 15035 + }, + { + "epoch": 0.8419755851719117, + "grad_norm": 1.2526265382766724, + "learning_rate": 7.514e-05, + "loss": 0.3968, + "step": 15036 + }, + { + "epoch": 0.8420315824840408, + "grad_norm": 1.4436498880386353, + "learning_rate": 7.5145e-05, + "loss": 0.4659, + "step": 15037 + }, + { + "epoch": 0.8420875797961698, + "grad_norm": 1.447879433631897, + "learning_rate": 7.515e-05, + "loss": 0.4036, + "step": 15038 + }, + { + "epoch": 0.8421435771082988, + "grad_norm": 1.3573596477508545, + "learning_rate": 7.515500000000001e-05, + "loss": 0.3824, + "step": 15039 + }, + { + "epoch": 0.8421995744204278, + "grad_norm": 1.569237470626831, + "learning_rate": 7.516e-05, + "loss": 0.395, + "step": 15040 + }, + { + "epoch": 0.8422555717325568, + "grad_norm": 1.4944884777069092, + "learning_rate": 7.5165e-05, + "loss": 0.4338, + "step": 15041 + }, + { + "epoch": 0.8423115690446858, + "grad_norm": 1.386260747909546, + "learning_rate": 7.517000000000001e-05, + "loss": 0.5996, + "step": 15042 + }, + { + "epoch": 0.8423675663568149, + "grad_norm": 2.475252389907837, + "learning_rate": 7.517500000000001e-05, + "loss": 0.5768, + "step": 15043 + }, + { + "epoch": 0.8424235636689439, + "grad_norm": 1.4581243991851807, + "learning_rate": 7.518000000000001e-05, + "loss": 0.3723, + "step": 15044 + }, + { + "epoch": 0.8424795609810729, + "grad_norm": 1.2736772298812866, + "learning_rate": 7.5185e-05, + "loss": 0.5126, + "step": 15045 + }, + { + "epoch": 0.8425355582932019, + "grad_norm": 1.3051806688308716, + "learning_rate": 7.519e-05, + "loss": 0.4082, + "step": 15046 + }, + { + 
"epoch": 0.8425915556053309, + "grad_norm": 1.5884802341461182, + "learning_rate": 7.5195e-05, + "loss": 0.4608, + "step": 15047 + }, + { + "epoch": 0.84264755291746, + "grad_norm": 1.7127121686935425, + "learning_rate": 7.52e-05, + "loss": 0.5517, + "step": 15048 + }, + { + "epoch": 0.842703550229589, + "grad_norm": 1.2679857015609741, + "learning_rate": 7.520500000000001e-05, + "loss": 0.4228, + "step": 15049 + }, + { + "epoch": 0.842759547541718, + "grad_norm": 1.5982797145843506, + "learning_rate": 7.521e-05, + "loss": 0.5136, + "step": 15050 + }, + { + "epoch": 0.842815544853847, + "grad_norm": 1.4622713327407837, + "learning_rate": 7.5215e-05, + "loss": 0.4572, + "step": 15051 + }, + { + "epoch": 0.842871542165976, + "grad_norm": 1.3255765438079834, + "learning_rate": 7.522e-05, + "loss": 0.4086, + "step": 15052 + }, + { + "epoch": 0.8429275394781051, + "grad_norm": 1.319079041481018, + "learning_rate": 7.5225e-05, + "loss": 0.4433, + "step": 15053 + }, + { + "epoch": 0.8429835367902341, + "grad_norm": 1.2004789113998413, + "learning_rate": 7.523e-05, + "loss": 0.3553, + "step": 15054 + }, + { + "epoch": 0.8430395341023631, + "grad_norm": 1.1800658702850342, + "learning_rate": 7.5235e-05, + "loss": 0.5275, + "step": 15055 + }, + { + "epoch": 0.8430955314144921, + "grad_norm": 1.158860206604004, + "learning_rate": 7.524e-05, + "loss": 0.3951, + "step": 15056 + }, + { + "epoch": 0.8431515287266211, + "grad_norm": 1.3727558851242065, + "learning_rate": 7.5245e-05, + "loss": 0.5408, + "step": 15057 + }, + { + "epoch": 0.8432075260387502, + "grad_norm": 1.2245267629623413, + "learning_rate": 7.525e-05, + "loss": 0.4581, + "step": 15058 + }, + { + "epoch": 0.8432635233508792, + "grad_norm": 1.5283687114715576, + "learning_rate": 7.525500000000001e-05, + "loss": 0.6485, + "step": 15059 + }, + { + "epoch": 0.8433195206630082, + "grad_norm": 1.5196453332901, + "learning_rate": 7.526000000000001e-05, + "loss": 0.5316, + "step": 15060 + }, + { + "epoch": 
0.8433755179751372, + "grad_norm": 1.3572794198989868, + "learning_rate": 7.5265e-05, + "loss": 0.4793, + "step": 15061 + }, + { + "epoch": 0.8434315152872662, + "grad_norm": 2.0908303260803223, + "learning_rate": 7.527e-05, + "loss": 0.432, + "step": 15062 + }, + { + "epoch": 0.8434875125993953, + "grad_norm": 1.378475546836853, + "learning_rate": 7.5275e-05, + "loss": 0.4168, + "step": 15063 + }, + { + "epoch": 0.8435435099115243, + "grad_norm": 1.2672103643417358, + "learning_rate": 7.528000000000001e-05, + "loss": 0.4288, + "step": 15064 + }, + { + "epoch": 0.8435995072236533, + "grad_norm": 1.2985295057296753, + "learning_rate": 7.528500000000001e-05, + "loss": 0.4394, + "step": 15065 + }, + { + "epoch": 0.8436555045357823, + "grad_norm": 1.3514337539672852, + "learning_rate": 7.529e-05, + "loss": 0.5229, + "step": 15066 + }, + { + "epoch": 0.8437115018479113, + "grad_norm": 1.2348365783691406, + "learning_rate": 7.5295e-05, + "loss": 0.6591, + "step": 15067 + }, + { + "epoch": 0.8437674991600403, + "grad_norm": 1.2063206434249878, + "learning_rate": 7.53e-05, + "loss": 0.4903, + "step": 15068 + }, + { + "epoch": 0.8438234964721694, + "grad_norm": 1.1621588468551636, + "learning_rate": 7.5305e-05, + "loss": 0.5143, + "step": 15069 + }, + { + "epoch": 0.8438794937842984, + "grad_norm": 1.2593555450439453, + "learning_rate": 7.531000000000001e-05, + "loss": 0.4102, + "step": 15070 + }, + { + "epoch": 0.8439354910964274, + "grad_norm": 1.4822927713394165, + "learning_rate": 7.5315e-05, + "loss": 0.5137, + "step": 15071 + }, + { + "epoch": 0.8439914884085564, + "grad_norm": 1.429641604423523, + "learning_rate": 7.532e-05, + "loss": 0.5827, + "step": 15072 + }, + { + "epoch": 0.8440474857206854, + "grad_norm": 1.4930412769317627, + "learning_rate": 7.5325e-05, + "loss": 0.5212, + "step": 15073 + }, + { + "epoch": 0.8441034830328145, + "grad_norm": 1.2361456155776978, + "learning_rate": 7.533e-05, + "loss": 0.4542, + "step": 15074 + }, + { + "epoch": 
0.8441594803449435, + "grad_norm": 1.2064471244812012, + "learning_rate": 7.5335e-05, + "loss": 0.3553, + "step": 15075 + }, + { + "epoch": 0.8442154776570725, + "grad_norm": 1.3847750425338745, + "learning_rate": 7.534e-05, + "loss": 0.5201, + "step": 15076 + }, + { + "epoch": 0.8442714749692015, + "grad_norm": 1.3404045104980469, + "learning_rate": 7.5345e-05, + "loss": 0.4421, + "step": 15077 + }, + { + "epoch": 0.8443274722813305, + "grad_norm": 1.3262113332748413, + "learning_rate": 7.535e-05, + "loss": 0.5306, + "step": 15078 + }, + { + "epoch": 0.8443834695934596, + "grad_norm": 1.6203056573867798, + "learning_rate": 7.535500000000001e-05, + "loss": 0.4103, + "step": 15079 + }, + { + "epoch": 0.8444394669055886, + "grad_norm": 1.3159303665161133, + "learning_rate": 7.536000000000001e-05, + "loss": 0.4167, + "step": 15080 + }, + { + "epoch": 0.8444954642177176, + "grad_norm": 1.398086428642273, + "learning_rate": 7.536500000000001e-05, + "loss": 0.514, + "step": 15081 + }, + { + "epoch": 0.8445514615298466, + "grad_norm": 1.2203478813171387, + "learning_rate": 7.537e-05, + "loss": 0.3646, + "step": 15082 + }, + { + "epoch": 0.8446074588419756, + "grad_norm": 1.266960859298706, + "learning_rate": 7.5375e-05, + "loss": 0.4621, + "step": 15083 + }, + { + "epoch": 0.8446634561541047, + "grad_norm": 1.3884551525115967, + "learning_rate": 7.538e-05, + "loss": 0.4161, + "step": 15084 + }, + { + "epoch": 0.8447194534662337, + "grad_norm": 1.5007433891296387, + "learning_rate": 7.538500000000001e-05, + "loss": 0.5216, + "step": 15085 + }, + { + "epoch": 0.8447754507783627, + "grad_norm": 3.044949531555176, + "learning_rate": 7.539000000000001e-05, + "loss": 0.406, + "step": 15086 + }, + { + "epoch": 0.8448314480904917, + "grad_norm": 1.3742420673370361, + "learning_rate": 7.5395e-05, + "loss": 0.5471, + "step": 15087 + }, + { + "epoch": 0.8448874454026207, + "grad_norm": 1.5436530113220215, + "learning_rate": 7.54e-05, + "loss": 0.516, + "step": 15088 + }, + { + 
"epoch": 0.8449434427147496, + "grad_norm": 1.3303571939468384, + "learning_rate": 7.5405e-05, + "loss": 0.4711, + "step": 15089 + }, + { + "epoch": 0.8449994400268787, + "grad_norm": 1.3449771404266357, + "learning_rate": 7.541e-05, + "loss": 0.3937, + "step": 15090 + }, + { + "epoch": 0.8450554373390077, + "grad_norm": 1.3330497741699219, + "learning_rate": 7.541500000000001e-05, + "loss": 0.3399, + "step": 15091 + }, + { + "epoch": 0.8451114346511367, + "grad_norm": 1.2649072408676147, + "learning_rate": 7.542e-05, + "loss": 0.4364, + "step": 15092 + }, + { + "epoch": 0.8451674319632657, + "grad_norm": 1.2320575714111328, + "learning_rate": 7.5425e-05, + "loss": 0.4069, + "step": 15093 + }, + { + "epoch": 0.8452234292753947, + "grad_norm": 1.6531530618667603, + "learning_rate": 7.543e-05, + "loss": 0.4329, + "step": 15094 + }, + { + "epoch": 0.8452794265875238, + "grad_norm": 1.2341358661651611, + "learning_rate": 7.5435e-05, + "loss": 0.3878, + "step": 15095 + }, + { + "epoch": 0.8453354238996528, + "grad_norm": 2.1931910514831543, + "learning_rate": 7.544e-05, + "loss": 0.4141, + "step": 15096 + }, + { + "epoch": 0.8453914212117818, + "grad_norm": 1.1421024799346924, + "learning_rate": 7.5445e-05, + "loss": 0.4015, + "step": 15097 + }, + { + "epoch": 0.8454474185239108, + "grad_norm": 1.6613743305206299, + "learning_rate": 7.545e-05, + "loss": 0.4879, + "step": 15098 + }, + { + "epoch": 0.8455034158360398, + "grad_norm": 1.3891104459762573, + "learning_rate": 7.545500000000002e-05, + "loss": 0.3483, + "step": 15099 + }, + { + "epoch": 0.8455594131481688, + "grad_norm": 1.3898457288742065, + "learning_rate": 7.546000000000001e-05, + "loss": 0.6164, + "step": 15100 + }, + { + "epoch": 0.8456154104602979, + "grad_norm": 1.2472468614578247, + "learning_rate": 7.546500000000001e-05, + "loss": 0.4381, + "step": 15101 + }, + { + "epoch": 0.8456714077724269, + "grad_norm": 1.5568019151687622, + "learning_rate": 7.547000000000001e-05, + "loss": 0.5354, + "step": 15102 
+ }, + { + "epoch": 0.8457274050845559, + "grad_norm": 1.1311805248260498, + "learning_rate": 7.5475e-05, + "loss": 0.4659, + "step": 15103 + }, + { + "epoch": 0.8457834023966849, + "grad_norm": 1.4916313886642456, + "learning_rate": 7.548e-05, + "loss": 0.5287, + "step": 15104 + }, + { + "epoch": 0.8458393997088139, + "grad_norm": 1.3321473598480225, + "learning_rate": 7.5485e-05, + "loss": 0.4549, + "step": 15105 + }, + { + "epoch": 0.845895397020943, + "grad_norm": 1.2652349472045898, + "learning_rate": 7.549000000000001e-05, + "loss": 0.4562, + "step": 15106 + }, + { + "epoch": 0.845951394333072, + "grad_norm": 1.383254885673523, + "learning_rate": 7.549500000000001e-05, + "loss": 0.3811, + "step": 15107 + }, + { + "epoch": 0.846007391645201, + "grad_norm": 1.3633276224136353, + "learning_rate": 7.55e-05, + "loss": 0.4664, + "step": 15108 + }, + { + "epoch": 0.84606338895733, + "grad_norm": 1.2604163885116577, + "learning_rate": 7.5505e-05, + "loss": 0.4612, + "step": 15109 + }, + { + "epoch": 0.846119386269459, + "grad_norm": 1.3349034786224365, + "learning_rate": 7.551e-05, + "loss": 0.4329, + "step": 15110 + }, + { + "epoch": 0.8461753835815881, + "grad_norm": 1.3420207500457764, + "learning_rate": 7.5515e-05, + "loss": 0.3875, + "step": 15111 + }, + { + "epoch": 0.8462313808937171, + "grad_norm": 1.4629493951797485, + "learning_rate": 7.552e-05, + "loss": 0.4527, + "step": 15112 + }, + { + "epoch": 0.8462873782058461, + "grad_norm": 1.209176778793335, + "learning_rate": 7.5525e-05, + "loss": 0.4839, + "step": 15113 + }, + { + "epoch": 0.8463433755179751, + "grad_norm": 1.5998519659042358, + "learning_rate": 7.553e-05, + "loss": 0.4067, + "step": 15114 + }, + { + "epoch": 0.8463993728301041, + "grad_norm": 1.3559772968292236, + "learning_rate": 7.5535e-05, + "loss": 0.3687, + "step": 15115 + }, + { + "epoch": 0.8464553701422332, + "grad_norm": 3.905630350112915, + "learning_rate": 7.554e-05, + "loss": 0.3848, + "step": 15116 + }, + { + "epoch": 
0.8465113674543622, + "grad_norm": 1.3527567386627197, + "learning_rate": 7.5545e-05, + "loss": 0.4556, + "step": 15117 + }, + { + "epoch": 0.8465673647664912, + "grad_norm": 1.3242403268814087, + "learning_rate": 7.555e-05, + "loss": 0.4031, + "step": 15118 + }, + { + "epoch": 0.8466233620786202, + "grad_norm": 1.3410563468933105, + "learning_rate": 7.5555e-05, + "loss": 0.5529, + "step": 15119 + }, + { + "epoch": 0.8466793593907492, + "grad_norm": 1.413779854774475, + "learning_rate": 7.556000000000002e-05, + "loss": 0.444, + "step": 15120 + }, + { + "epoch": 0.8467353567028783, + "grad_norm": 1.4130003452301025, + "learning_rate": 7.556500000000001e-05, + "loss": 0.4252, + "step": 15121 + }, + { + "epoch": 0.8467913540150073, + "grad_norm": 1.1051877737045288, + "learning_rate": 7.557000000000001e-05, + "loss": 0.3686, + "step": 15122 + }, + { + "epoch": 0.8468473513271363, + "grad_norm": 1.4178258180618286, + "learning_rate": 7.557500000000001e-05, + "loss": 0.374, + "step": 15123 + }, + { + "epoch": 0.8469033486392653, + "grad_norm": 1.5272403955459595, + "learning_rate": 7.558e-05, + "loss": 0.5934, + "step": 15124 + }, + { + "epoch": 0.8469593459513943, + "grad_norm": 1.3880999088287354, + "learning_rate": 7.5585e-05, + "loss": 0.4118, + "step": 15125 + }, + { + "epoch": 0.8470153432635233, + "grad_norm": 1.6361058950424194, + "learning_rate": 7.559e-05, + "loss": 0.5314, + "step": 15126 + }, + { + "epoch": 0.8470713405756524, + "grad_norm": 1.244228720664978, + "learning_rate": 7.559500000000001e-05, + "loss": 0.4525, + "step": 15127 + }, + { + "epoch": 0.8471273378877814, + "grad_norm": 2.738976001739502, + "learning_rate": 7.560000000000001e-05, + "loss": 0.4815, + "step": 15128 + }, + { + "epoch": 0.8471833351999104, + "grad_norm": 1.1920626163482666, + "learning_rate": 7.5605e-05, + "loss": 0.4106, + "step": 15129 + }, + { + "epoch": 0.8472393325120394, + "grad_norm": 1.376732587814331, + "learning_rate": 7.561e-05, + "loss": 0.4497, + "step": 15130 + 
}, + { + "epoch": 0.8472953298241684, + "grad_norm": 1.3465228080749512, + "learning_rate": 7.5615e-05, + "loss": 0.4248, + "step": 15131 + }, + { + "epoch": 0.8473513271362975, + "grad_norm": 1.2644684314727783, + "learning_rate": 7.562e-05, + "loss": 0.5093, + "step": 15132 + }, + { + "epoch": 0.8474073244484265, + "grad_norm": 1.762965440750122, + "learning_rate": 7.5625e-05, + "loss": 0.446, + "step": 15133 + }, + { + "epoch": 0.8474633217605555, + "grad_norm": 1.3486056327819824, + "learning_rate": 7.563e-05, + "loss": 0.4694, + "step": 15134 + }, + { + "epoch": 0.8475193190726845, + "grad_norm": 1.1375880241394043, + "learning_rate": 7.5635e-05, + "loss": 0.3783, + "step": 15135 + }, + { + "epoch": 0.8475753163848135, + "grad_norm": 1.2902443408966064, + "learning_rate": 7.564e-05, + "loss": 0.4339, + "step": 15136 + }, + { + "epoch": 0.8476313136969426, + "grad_norm": 1.3332070112228394, + "learning_rate": 7.5645e-05, + "loss": 0.4528, + "step": 15137 + }, + { + "epoch": 0.8476873110090716, + "grad_norm": 1.3735326528549194, + "learning_rate": 7.565e-05, + "loss": 0.421, + "step": 15138 + }, + { + "epoch": 0.8477433083212006, + "grad_norm": 1.3245290517807007, + "learning_rate": 7.565499999999999e-05, + "loss": 0.4636, + "step": 15139 + }, + { + "epoch": 0.8477993056333296, + "grad_norm": 1.122619867324829, + "learning_rate": 7.566e-05, + "loss": 0.3773, + "step": 15140 + }, + { + "epoch": 0.8478553029454586, + "grad_norm": 1.3315730094909668, + "learning_rate": 7.5665e-05, + "loss": 0.378, + "step": 15141 + }, + { + "epoch": 0.8479113002575877, + "grad_norm": 1.272456169128418, + "learning_rate": 7.567000000000001e-05, + "loss": 0.4726, + "step": 15142 + }, + { + "epoch": 0.8479672975697167, + "grad_norm": 1.2605549097061157, + "learning_rate": 7.567500000000001e-05, + "loss": 0.4612, + "step": 15143 + }, + { + "epoch": 0.8480232948818457, + "grad_norm": 1.3905287981033325, + "learning_rate": 7.568000000000001e-05, + "loss": 0.4521, + "step": 15144 + }, + { 
+ "epoch": 0.8480792921939747, + "grad_norm": 1.5169981718063354, + "learning_rate": 7.5685e-05, + "loss": 0.405, + "step": 15145 + }, + { + "epoch": 0.8481352895061037, + "grad_norm": 1.1962016820907593, + "learning_rate": 7.569e-05, + "loss": 0.3827, + "step": 15146 + }, + { + "epoch": 0.8481912868182327, + "grad_norm": 1.2600289583206177, + "learning_rate": 7.5695e-05, + "loss": 0.3677, + "step": 15147 + }, + { + "epoch": 0.8482472841303618, + "grad_norm": 1.3277665376663208, + "learning_rate": 7.570000000000001e-05, + "loss": 0.5631, + "step": 15148 + }, + { + "epoch": 0.8483032814424908, + "grad_norm": 1.6062098741531372, + "learning_rate": 7.570500000000001e-05, + "loss": 0.5264, + "step": 15149 + }, + { + "epoch": 0.8483592787546198, + "grad_norm": 1.3468025922775269, + "learning_rate": 7.571e-05, + "loss": 0.4625, + "step": 15150 + }, + { + "epoch": 0.8484152760667488, + "grad_norm": 1.3188852071762085, + "learning_rate": 7.5715e-05, + "loss": 0.4286, + "step": 15151 + }, + { + "epoch": 0.8484712733788778, + "grad_norm": 1.5755048990249634, + "learning_rate": 7.572e-05, + "loss": 0.5279, + "step": 15152 + }, + { + "epoch": 0.8485272706910069, + "grad_norm": 1.7589013576507568, + "learning_rate": 7.5725e-05, + "loss": 0.3309, + "step": 15153 + }, + { + "epoch": 0.8485832680031359, + "grad_norm": 1.310774564743042, + "learning_rate": 7.573e-05, + "loss": 0.3594, + "step": 15154 + }, + { + "epoch": 0.8486392653152649, + "grad_norm": 1.4295710325241089, + "learning_rate": 7.5735e-05, + "loss": 0.4235, + "step": 15155 + }, + { + "epoch": 0.8486952626273939, + "grad_norm": 1.4306899309158325, + "learning_rate": 7.574e-05, + "loss": 0.4259, + "step": 15156 + }, + { + "epoch": 0.8487512599395229, + "grad_norm": 1.4165624380111694, + "learning_rate": 7.5745e-05, + "loss": 0.5563, + "step": 15157 + }, + { + "epoch": 0.848807257251652, + "grad_norm": 1.251570701599121, + "learning_rate": 7.575e-05, + "loss": 0.4512, + "step": 15158 + }, + { + "epoch": 
0.848863254563781, + "grad_norm": 1.3429430723190308, + "learning_rate": 7.5755e-05, + "loss": 0.4967, + "step": 15159 + }, + { + "epoch": 0.84891925187591, + "grad_norm": 1.4190011024475098, + "learning_rate": 7.576e-05, + "loss": 0.4926, + "step": 15160 + }, + { + "epoch": 0.848975249188039, + "grad_norm": 1.4082642793655396, + "learning_rate": 7.5765e-05, + "loss": 0.4795, + "step": 15161 + }, + { + "epoch": 0.849031246500168, + "grad_norm": 1.2854467630386353, + "learning_rate": 7.577e-05, + "loss": 0.4818, + "step": 15162 + }, + { + "epoch": 0.849087243812297, + "grad_norm": 1.1384862661361694, + "learning_rate": 7.577500000000001e-05, + "loss": 0.3906, + "step": 15163 + }, + { + "epoch": 0.8491432411244261, + "grad_norm": 1.175520658493042, + "learning_rate": 7.578000000000001e-05, + "loss": 0.3959, + "step": 15164 + }, + { + "epoch": 0.8491992384365551, + "grad_norm": 1.3986434936523438, + "learning_rate": 7.578500000000001e-05, + "loss": 0.6178, + "step": 15165 + }, + { + "epoch": 0.8492552357486841, + "grad_norm": 1.4496233463287354, + "learning_rate": 7.579e-05, + "loss": 0.4642, + "step": 15166 + }, + { + "epoch": 0.8493112330608131, + "grad_norm": 1.6438530683517456, + "learning_rate": 7.5795e-05, + "loss": 0.607, + "step": 15167 + }, + { + "epoch": 0.8493672303729422, + "grad_norm": 1.8109285831451416, + "learning_rate": 7.58e-05, + "loss": 0.4636, + "step": 15168 + }, + { + "epoch": 0.8494232276850712, + "grad_norm": 1.416939377784729, + "learning_rate": 7.580500000000001e-05, + "loss": 0.3799, + "step": 15169 + }, + { + "epoch": 0.8494792249972002, + "grad_norm": 1.3079711198806763, + "learning_rate": 7.581000000000001e-05, + "loss": 0.4225, + "step": 15170 + }, + { + "epoch": 0.8495352223093292, + "grad_norm": 1.6238187551498413, + "learning_rate": 7.5815e-05, + "loss": 0.5868, + "step": 15171 + }, + { + "epoch": 0.8495912196214581, + "grad_norm": 1.2737096548080444, + "learning_rate": 7.582e-05, + "loss": 0.328, + "step": 15172 + }, + { + "epoch": 
0.8496472169335871, + "grad_norm": 1.239679217338562, + "learning_rate": 7.5825e-05, + "loss": 0.3602, + "step": 15173 + }, + { + "epoch": 0.8497032142457162, + "grad_norm": 1.4801266193389893, + "learning_rate": 7.583e-05, + "loss": 0.3883, + "step": 15174 + }, + { + "epoch": 0.8497592115578452, + "grad_norm": 1.7090065479278564, + "learning_rate": 7.5835e-05, + "loss": 0.5019, + "step": 15175 + }, + { + "epoch": 0.8498152088699742, + "grad_norm": 1.245547890663147, + "learning_rate": 7.584e-05, + "loss": 0.3724, + "step": 15176 + }, + { + "epoch": 0.8498712061821032, + "grad_norm": 1.361806869506836, + "learning_rate": 7.5845e-05, + "loss": 0.5009, + "step": 15177 + }, + { + "epoch": 0.8499272034942322, + "grad_norm": 1.7137019634246826, + "learning_rate": 7.585e-05, + "loss": 0.6438, + "step": 15178 + }, + { + "epoch": 0.8499832008063612, + "grad_norm": 1.3690975904464722, + "learning_rate": 7.5855e-05, + "loss": 0.4153, + "step": 15179 + }, + { + "epoch": 0.8500391981184903, + "grad_norm": 1.3201725482940674, + "learning_rate": 7.586000000000001e-05, + "loss": 0.4907, + "step": 15180 + }, + { + "epoch": 0.8500951954306193, + "grad_norm": 1.3027015924453735, + "learning_rate": 7.5865e-05, + "loss": 0.475, + "step": 15181 + }, + { + "epoch": 0.8501511927427483, + "grad_norm": 1.2826457023620605, + "learning_rate": 7.587e-05, + "loss": 0.456, + "step": 15182 + }, + { + "epoch": 0.8502071900548773, + "grad_norm": 1.317867398262024, + "learning_rate": 7.5875e-05, + "loss": 0.5059, + "step": 15183 + }, + { + "epoch": 0.8502631873670063, + "grad_norm": 2.2673516273498535, + "learning_rate": 7.588000000000001e-05, + "loss": 0.6258, + "step": 15184 + }, + { + "epoch": 0.8503191846791354, + "grad_norm": 1.1475238800048828, + "learning_rate": 7.588500000000001e-05, + "loss": 0.3576, + "step": 15185 + }, + { + "epoch": 0.8503751819912644, + "grad_norm": 1.4983441829681396, + "learning_rate": 7.589000000000001e-05, + "loss": 0.5368, + "step": 15186 + }, + { + "epoch": 
0.8504311793033934, + "grad_norm": 1.2127751111984253, + "learning_rate": 7.5895e-05, + "loss": 0.4064, + "step": 15187 + }, + { + "epoch": 0.8504871766155224, + "grad_norm": 1.471183180809021, + "learning_rate": 7.59e-05, + "loss": 0.4226, + "step": 15188 + }, + { + "epoch": 0.8505431739276514, + "grad_norm": 1.3438286781311035, + "learning_rate": 7.5905e-05, + "loss": 0.4563, + "step": 15189 + }, + { + "epoch": 0.8505991712397805, + "grad_norm": 1.1882891654968262, + "learning_rate": 7.591e-05, + "loss": 0.521, + "step": 15190 + }, + { + "epoch": 0.8506551685519095, + "grad_norm": 1.0482243299484253, + "learning_rate": 7.591500000000001e-05, + "loss": 0.4479, + "step": 15191 + }, + { + "epoch": 0.8507111658640385, + "grad_norm": 1.3335113525390625, + "learning_rate": 7.592e-05, + "loss": 0.3779, + "step": 15192 + }, + { + "epoch": 0.8507671631761675, + "grad_norm": 1.2946522235870361, + "learning_rate": 7.5925e-05, + "loss": 0.4433, + "step": 15193 + }, + { + "epoch": 0.8508231604882965, + "grad_norm": 1.3130769729614258, + "learning_rate": 7.593e-05, + "loss": 0.5303, + "step": 15194 + }, + { + "epoch": 0.8508791578004256, + "grad_norm": 1.0719996690750122, + "learning_rate": 7.5935e-05, + "loss": 0.4205, + "step": 15195 + }, + { + "epoch": 0.8509351551125546, + "grad_norm": 1.2909539937973022, + "learning_rate": 7.594e-05, + "loss": 0.4152, + "step": 15196 + }, + { + "epoch": 0.8509911524246836, + "grad_norm": 1.605582594871521, + "learning_rate": 7.5945e-05, + "loss": 0.5042, + "step": 15197 + }, + { + "epoch": 0.8510471497368126, + "grad_norm": 1.7889682054519653, + "learning_rate": 7.595e-05, + "loss": 0.4765, + "step": 15198 + }, + { + "epoch": 0.8511031470489416, + "grad_norm": 1.495755910873413, + "learning_rate": 7.5955e-05, + "loss": 0.4569, + "step": 15199 + }, + { + "epoch": 0.8511591443610707, + "grad_norm": 1.4306620359420776, + "learning_rate": 7.596000000000001e-05, + "loss": 0.5663, + "step": 15200 + }, + { + "epoch": 0.8512151416731997, + 
"grad_norm": 1.3635423183441162, + "learning_rate": 7.596500000000001e-05, + "loss": 0.4369, + "step": 15201 + }, + { + "epoch": 0.8512711389853287, + "grad_norm": 1.4563435316085815, + "learning_rate": 7.597e-05, + "loss": 0.4164, + "step": 15202 + }, + { + "epoch": 0.8513271362974577, + "grad_norm": 1.192082166671753, + "learning_rate": 7.5975e-05, + "loss": 0.3142, + "step": 15203 + }, + { + "epoch": 0.8513831336095867, + "grad_norm": 1.3374476432800293, + "learning_rate": 7.598e-05, + "loss": 0.3901, + "step": 15204 + }, + { + "epoch": 0.8514391309217157, + "grad_norm": 1.3649131059646606, + "learning_rate": 7.598500000000001e-05, + "loss": 0.4348, + "step": 15205 + }, + { + "epoch": 0.8514951282338448, + "grad_norm": 84.07178497314453, + "learning_rate": 7.599000000000001e-05, + "loss": 0.5072, + "step": 15206 + }, + { + "epoch": 0.8515511255459738, + "grad_norm": 1.9043371677398682, + "learning_rate": 7.5995e-05, + "loss": 0.5572, + "step": 15207 + }, + { + "epoch": 0.8516071228581028, + "grad_norm": 1.4176578521728516, + "learning_rate": 7.6e-05, + "loss": 0.5742, + "step": 15208 + }, + { + "epoch": 0.8516631201702318, + "grad_norm": 1.2849376201629639, + "learning_rate": 7.6005e-05, + "loss": 0.437, + "step": 15209 + }, + { + "epoch": 0.8517191174823608, + "grad_norm": 1.14579439163208, + "learning_rate": 7.601e-05, + "loss": 0.3665, + "step": 15210 + }, + { + "epoch": 0.8517751147944899, + "grad_norm": 1.326216459274292, + "learning_rate": 7.6015e-05, + "loss": 0.4595, + "step": 15211 + }, + { + "epoch": 0.8518311121066189, + "grad_norm": 1.5373824834823608, + "learning_rate": 7.602000000000001e-05, + "loss": 0.4353, + "step": 15212 + }, + { + "epoch": 0.8518871094187479, + "grad_norm": 1.2989906072616577, + "learning_rate": 7.6025e-05, + "loss": 0.3503, + "step": 15213 + }, + { + "epoch": 0.8519431067308769, + "grad_norm": 1.641305685043335, + "learning_rate": 7.603e-05, + "loss": 0.4397, + "step": 15214 + }, + { + "epoch": 0.8519991040430059, + 
"grad_norm": 1.5231834650039673, + "learning_rate": 7.6035e-05, + "loss": 0.4273, + "step": 15215 + }, + { + "epoch": 0.852055101355135, + "grad_norm": 6.989379405975342, + "learning_rate": 7.604e-05, + "loss": 0.3557, + "step": 15216 + }, + { + "epoch": 0.852111098667264, + "grad_norm": 1.4232429265975952, + "learning_rate": 7.6045e-05, + "loss": 0.4825, + "step": 15217 + }, + { + "epoch": 0.852167095979393, + "grad_norm": 8.40113353729248, + "learning_rate": 7.605e-05, + "loss": 0.3653, + "step": 15218 + }, + { + "epoch": 0.852223093291522, + "grad_norm": 1.3674389123916626, + "learning_rate": 7.6055e-05, + "loss": 0.4705, + "step": 15219 + }, + { + "epoch": 0.852279090603651, + "grad_norm": 1.4717456102371216, + "learning_rate": 7.606000000000001e-05, + "loss": 0.483, + "step": 15220 + }, + { + "epoch": 0.85233508791578, + "grad_norm": 1.2045587301254272, + "learning_rate": 7.606500000000001e-05, + "loss": 0.3697, + "step": 15221 + }, + { + "epoch": 0.8523910852279091, + "grad_norm": 1.6508337259292603, + "learning_rate": 7.607000000000001e-05, + "loss": 0.4677, + "step": 15222 + }, + { + "epoch": 0.8524470825400381, + "grad_norm": 1.5064278841018677, + "learning_rate": 7.6075e-05, + "loss": 0.4107, + "step": 15223 + }, + { + "epoch": 0.8525030798521671, + "grad_norm": 1.36526620388031, + "learning_rate": 7.608e-05, + "loss": 0.5868, + "step": 15224 + }, + { + "epoch": 0.8525590771642961, + "grad_norm": 1.386756181716919, + "learning_rate": 7.6085e-05, + "loss": 0.4697, + "step": 15225 + }, + { + "epoch": 0.8526150744764251, + "grad_norm": 1.6597256660461426, + "learning_rate": 7.609000000000001e-05, + "loss": 0.5056, + "step": 15226 + }, + { + "epoch": 0.8526710717885542, + "grad_norm": 1.4845725297927856, + "learning_rate": 7.609500000000001e-05, + "loss": 0.4151, + "step": 15227 + }, + { + "epoch": 0.8527270691006832, + "grad_norm": 1.6620419025421143, + "learning_rate": 7.61e-05, + "loss": 0.4783, + "step": 15228 + }, + { + "epoch": 0.8527830664128122, + 
"grad_norm": 1.298387050628662, + "learning_rate": 7.6105e-05, + "loss": 0.4438, + "step": 15229 + }, + { + "epoch": 0.8528390637249412, + "grad_norm": 1.2491012811660767, + "learning_rate": 7.611e-05, + "loss": 0.3297, + "step": 15230 + }, + { + "epoch": 0.8528950610370702, + "grad_norm": 1.2535948753356934, + "learning_rate": 7.6115e-05, + "loss": 0.4548, + "step": 15231 + }, + { + "epoch": 0.8529510583491993, + "grad_norm": 1.3734540939331055, + "learning_rate": 7.612e-05, + "loss": 0.3724, + "step": 15232 + }, + { + "epoch": 0.8530070556613283, + "grad_norm": 1.4256091117858887, + "learning_rate": 7.612500000000001e-05, + "loss": 0.5826, + "step": 15233 + }, + { + "epoch": 0.8530630529734573, + "grad_norm": 1.5222828388214111, + "learning_rate": 7.613e-05, + "loss": 0.5786, + "step": 15234 + }, + { + "epoch": 0.8531190502855863, + "grad_norm": 1.5543458461761475, + "learning_rate": 7.6135e-05, + "loss": 0.5745, + "step": 15235 + }, + { + "epoch": 0.8531750475977153, + "grad_norm": 1.5437970161437988, + "learning_rate": 7.614e-05, + "loss": 0.4744, + "step": 15236 + }, + { + "epoch": 0.8532310449098444, + "grad_norm": 1.250725507736206, + "learning_rate": 7.6145e-05, + "loss": 0.4594, + "step": 15237 + }, + { + "epoch": 0.8532870422219734, + "grad_norm": 1.443010926246643, + "learning_rate": 7.615e-05, + "loss": 0.3214, + "step": 15238 + }, + { + "epoch": 0.8533430395341024, + "grad_norm": 1.4554890394210815, + "learning_rate": 7.6155e-05, + "loss": 0.5692, + "step": 15239 + }, + { + "epoch": 0.8533990368462314, + "grad_norm": 1.387817621231079, + "learning_rate": 7.616e-05, + "loss": 0.3915, + "step": 15240 + }, + { + "epoch": 0.8534550341583604, + "grad_norm": 1.4982044696807861, + "learning_rate": 7.616500000000001e-05, + "loss": 0.4794, + "step": 15241 + }, + { + "epoch": 0.8535110314704895, + "grad_norm": 1.5566614866256714, + "learning_rate": 7.617000000000001e-05, + "loss": 0.5169, + "step": 15242 + }, + { + "epoch": 0.8535670287826185, + "grad_norm": 
1.342090368270874, + "learning_rate": 7.617500000000001e-05, + "loss": 0.5644, + "step": 15243 + }, + { + "epoch": 0.8536230260947475, + "grad_norm": 1.3355064392089844, + "learning_rate": 7.618e-05, + "loss": 0.4928, + "step": 15244 + }, + { + "epoch": 0.8536790234068765, + "grad_norm": 1.677080512046814, + "learning_rate": 7.6185e-05, + "loss": 0.4778, + "step": 15245 + }, + { + "epoch": 0.8537350207190055, + "grad_norm": 1.5180878639221191, + "learning_rate": 7.619e-05, + "loss": 0.5091, + "step": 15246 + }, + { + "epoch": 0.8537910180311346, + "grad_norm": 1.383988857269287, + "learning_rate": 7.619500000000001e-05, + "loss": 0.4522, + "step": 15247 + }, + { + "epoch": 0.8538470153432636, + "grad_norm": 1.5268155336380005, + "learning_rate": 7.620000000000001e-05, + "loss": 0.4806, + "step": 15248 + }, + { + "epoch": 0.8539030126553926, + "grad_norm": 1.1285520792007446, + "learning_rate": 7.6205e-05, + "loss": 0.4177, + "step": 15249 + }, + { + "epoch": 0.8539590099675216, + "grad_norm": 1.116123914718628, + "learning_rate": 7.621e-05, + "loss": 0.3905, + "step": 15250 + }, + { + "epoch": 0.8540150072796506, + "grad_norm": 1.492278814315796, + "learning_rate": 7.6215e-05, + "loss": 0.544, + "step": 15251 + }, + { + "epoch": 0.8540710045917796, + "grad_norm": 1.5980716943740845, + "learning_rate": 7.622e-05, + "loss": 0.4494, + "step": 15252 + }, + { + "epoch": 0.8541270019039087, + "grad_norm": 1.2674134969711304, + "learning_rate": 7.6225e-05, + "loss": 0.4129, + "step": 15253 + }, + { + "epoch": 0.8541829992160376, + "grad_norm": 1.328264832496643, + "learning_rate": 7.623000000000001e-05, + "loss": 0.3812, + "step": 15254 + }, + { + "epoch": 0.8542389965281666, + "grad_norm": 1.4931414127349854, + "learning_rate": 7.6235e-05, + "loss": 0.5149, + "step": 15255 + }, + { + "epoch": 0.8542949938402956, + "grad_norm": 1.322326898574829, + "learning_rate": 7.624e-05, + "loss": 0.4716, + "step": 15256 + }, + { + "epoch": 0.8543509911524246, + "grad_norm": 
1.2993319034576416, + "learning_rate": 7.6245e-05, + "loss": 0.3725, + "step": 15257 + }, + { + "epoch": 0.8544069884645537, + "grad_norm": 1.541629433631897, + "learning_rate": 7.625e-05, + "loss": 0.5011, + "step": 15258 + }, + { + "epoch": 0.8544629857766827, + "grad_norm": 1.1747040748596191, + "learning_rate": 7.6255e-05, + "loss": 0.489, + "step": 15259 + }, + { + "epoch": 0.8545189830888117, + "grad_norm": 1.365580677986145, + "learning_rate": 7.625999999999999e-05, + "loss": 0.4291, + "step": 15260 + }, + { + "epoch": 0.8545749804009407, + "grad_norm": 1.0650475025177002, + "learning_rate": 7.6265e-05, + "loss": 0.3304, + "step": 15261 + }, + { + "epoch": 0.8546309777130697, + "grad_norm": 1.2407748699188232, + "learning_rate": 7.627000000000001e-05, + "loss": 0.394, + "step": 15262 + }, + { + "epoch": 0.8546869750251987, + "grad_norm": 1.5234675407409668, + "learning_rate": 7.627500000000001e-05, + "loss": 0.5103, + "step": 15263 + }, + { + "epoch": 0.8547429723373278, + "grad_norm": 1.4129886627197266, + "learning_rate": 7.628000000000001e-05, + "loss": 0.5637, + "step": 15264 + }, + { + "epoch": 0.8547989696494568, + "grad_norm": 1.5237212181091309, + "learning_rate": 7.6285e-05, + "loss": 0.4244, + "step": 15265 + }, + { + "epoch": 0.8548549669615858, + "grad_norm": 1.416741132736206, + "learning_rate": 7.629e-05, + "loss": 0.5953, + "step": 15266 + }, + { + "epoch": 0.8549109642737148, + "grad_norm": 1.2391144037246704, + "learning_rate": 7.6295e-05, + "loss": 0.4641, + "step": 15267 + }, + { + "epoch": 0.8549669615858438, + "grad_norm": 1.3451381921768188, + "learning_rate": 7.630000000000001e-05, + "loss": 0.4012, + "step": 15268 + }, + { + "epoch": 0.8550229588979729, + "grad_norm": 1.365315318107605, + "learning_rate": 7.630500000000001e-05, + "loss": 0.5176, + "step": 15269 + }, + { + "epoch": 0.8550789562101019, + "grad_norm": 1.6232542991638184, + "learning_rate": 7.631e-05, + "loss": 0.4692, + "step": 15270 + }, + { + "epoch": 
0.8551349535222309, + "grad_norm": 1.7078979015350342, + "learning_rate": 7.6315e-05, + "loss": 0.5395, + "step": 15271 + }, + { + "epoch": 0.8551909508343599, + "grad_norm": 1.5448113679885864, + "learning_rate": 7.632e-05, + "loss": 0.5784, + "step": 15272 + }, + { + "epoch": 0.8552469481464889, + "grad_norm": 1.3233789205551147, + "learning_rate": 7.6325e-05, + "loss": 0.4248, + "step": 15273 + }, + { + "epoch": 0.855302945458618, + "grad_norm": 1.5929086208343506, + "learning_rate": 7.633e-05, + "loss": 0.4113, + "step": 15274 + }, + { + "epoch": 0.855358942770747, + "grad_norm": 1.4089661836624146, + "learning_rate": 7.633500000000001e-05, + "loss": 0.4067, + "step": 15275 + }, + { + "epoch": 0.855414940082876, + "grad_norm": 1.22316575050354, + "learning_rate": 7.634e-05, + "loss": 0.4686, + "step": 15276 + }, + { + "epoch": 0.855470937395005, + "grad_norm": 1.2601726055145264, + "learning_rate": 7.6345e-05, + "loss": 0.3796, + "step": 15277 + }, + { + "epoch": 0.855526934707134, + "grad_norm": 1.2563194036483765, + "learning_rate": 7.635e-05, + "loss": 0.5263, + "step": 15278 + }, + { + "epoch": 0.855582932019263, + "grad_norm": 1.494057059288025, + "learning_rate": 7.6355e-05, + "loss": 0.4191, + "step": 15279 + }, + { + "epoch": 0.8556389293313921, + "grad_norm": 1.328596830368042, + "learning_rate": 7.636e-05, + "loss": 0.4457, + "step": 15280 + }, + { + "epoch": 0.8556949266435211, + "grad_norm": 1.3479852676391602, + "learning_rate": 7.6365e-05, + "loss": 0.4119, + "step": 15281 + }, + { + "epoch": 0.8557509239556501, + "grad_norm": 1.1821396350860596, + "learning_rate": 7.637e-05, + "loss": 0.3704, + "step": 15282 + }, + { + "epoch": 0.8558069212677791, + "grad_norm": 1.317995309829712, + "learning_rate": 7.637500000000001e-05, + "loss": 0.6254, + "step": 15283 + }, + { + "epoch": 0.8558629185799081, + "grad_norm": 1.3521196842193604, + "learning_rate": 7.638000000000001e-05, + "loss": 0.4053, + "step": 15284 + }, + { + "epoch": 0.8559189158920372, + 
"grad_norm": 1.6777294874191284, + "learning_rate": 7.638500000000001e-05, + "loss": 0.6206, + "step": 15285 + }, + { + "epoch": 0.8559749132041662, + "grad_norm": 1.53383469581604, + "learning_rate": 7.639e-05, + "loss": 0.5084, + "step": 15286 + }, + { + "epoch": 0.8560309105162952, + "grad_norm": 1.3479366302490234, + "learning_rate": 7.6395e-05, + "loss": 0.5463, + "step": 15287 + }, + { + "epoch": 0.8560869078284242, + "grad_norm": 1.2493224143981934, + "learning_rate": 7.64e-05, + "loss": 0.3651, + "step": 15288 + }, + { + "epoch": 0.8561429051405532, + "grad_norm": 1.287906527519226, + "learning_rate": 7.6405e-05, + "loss": 0.4258, + "step": 15289 + }, + { + "epoch": 0.8561989024526823, + "grad_norm": 1.672400712966919, + "learning_rate": 7.641000000000001e-05, + "loss": 0.5002, + "step": 15290 + }, + { + "epoch": 0.8562548997648113, + "grad_norm": 1.20218026638031, + "learning_rate": 7.6415e-05, + "loss": 0.5053, + "step": 15291 + }, + { + "epoch": 0.8563108970769403, + "grad_norm": 1.4538565874099731, + "learning_rate": 7.642e-05, + "loss": 0.4315, + "step": 15292 + }, + { + "epoch": 0.8563668943890693, + "grad_norm": 1.3052338361740112, + "learning_rate": 7.6425e-05, + "loss": 0.4419, + "step": 15293 + }, + { + "epoch": 0.8564228917011983, + "grad_norm": 1.3309745788574219, + "learning_rate": 7.643e-05, + "loss": 0.5276, + "step": 15294 + }, + { + "epoch": 0.8564788890133274, + "grad_norm": 1.4033682346343994, + "learning_rate": 7.6435e-05, + "loss": 0.4153, + "step": 15295 + }, + { + "epoch": 0.8565348863254564, + "grad_norm": 1.365225911140442, + "learning_rate": 7.644e-05, + "loss": 0.5216, + "step": 15296 + }, + { + "epoch": 0.8565908836375854, + "grad_norm": 1.053643822669983, + "learning_rate": 7.6445e-05, + "loss": 0.3019, + "step": 15297 + }, + { + "epoch": 0.8566468809497144, + "grad_norm": 1.4441500902175903, + "learning_rate": 7.645e-05, + "loss": 0.5822, + "step": 15298 + }, + { + "epoch": 0.8567028782618434, + "grad_norm": 1.664686679840088, 
+ "learning_rate": 7.6455e-05, + "loss": 0.5443, + "step": 15299 + }, + { + "epoch": 0.8567588755739725, + "grad_norm": 1.169580101966858, + "learning_rate": 7.646e-05, + "loss": 0.5139, + "step": 15300 + }, + { + "epoch": 0.8568148728861015, + "grad_norm": 1.4870423078536987, + "learning_rate": 7.646500000000001e-05, + "loss": 0.4219, + "step": 15301 + }, + { + "epoch": 0.8568708701982305, + "grad_norm": 1.3848826885223389, + "learning_rate": 7.647e-05, + "loss": 0.5761, + "step": 15302 + }, + { + "epoch": 0.8569268675103595, + "grad_norm": 1.1729873418807983, + "learning_rate": 7.6475e-05, + "loss": 0.3544, + "step": 15303 + }, + { + "epoch": 0.8569828648224885, + "grad_norm": 3.545872926712036, + "learning_rate": 7.648000000000001e-05, + "loss": 0.5091, + "step": 15304 + }, + { + "epoch": 0.8570388621346176, + "grad_norm": 1.288378119468689, + "learning_rate": 7.648500000000001e-05, + "loss": 0.4387, + "step": 15305 + }, + { + "epoch": 0.8570948594467466, + "grad_norm": 1.4308961629867554, + "learning_rate": 7.649000000000001e-05, + "loss": 0.4135, + "step": 15306 + }, + { + "epoch": 0.8571508567588756, + "grad_norm": 1.287602424621582, + "learning_rate": 7.6495e-05, + "loss": 0.4679, + "step": 15307 + }, + { + "epoch": 0.8572068540710046, + "grad_norm": 1.4903717041015625, + "learning_rate": 7.65e-05, + "loss": 0.5455, + "step": 15308 + }, + { + "epoch": 0.8572628513831336, + "grad_norm": 1.3370749950408936, + "learning_rate": 7.6505e-05, + "loss": 0.5237, + "step": 15309 + }, + { + "epoch": 0.8573188486952626, + "grad_norm": 1.2127388715744019, + "learning_rate": 7.651e-05, + "loss": 0.4822, + "step": 15310 + }, + { + "epoch": 0.8573748460073917, + "grad_norm": 1.4833921194076538, + "learning_rate": 7.651500000000001e-05, + "loss": 0.4984, + "step": 15311 + }, + { + "epoch": 0.8574308433195207, + "grad_norm": 2.3310141563415527, + "learning_rate": 7.652e-05, + "loss": 0.7687, + "step": 15312 + }, + { + "epoch": 0.8574868406316497, + "grad_norm": 
1.4999405145645142, + "learning_rate": 7.6525e-05, + "loss": 0.5486, + "step": 15313 + }, + { + "epoch": 0.8575428379437787, + "grad_norm": 1.5353020429611206, + "learning_rate": 7.653e-05, + "loss": 0.515, + "step": 15314 + }, + { + "epoch": 0.8575988352559077, + "grad_norm": 1.1975631713867188, + "learning_rate": 7.6535e-05, + "loss": 0.3858, + "step": 15315 + }, + { + "epoch": 0.8576548325680368, + "grad_norm": 1.2844549417495728, + "learning_rate": 7.654e-05, + "loss": 0.5304, + "step": 15316 + }, + { + "epoch": 0.8577108298801658, + "grad_norm": 1.4918015003204346, + "learning_rate": 7.6545e-05, + "loss": 0.4314, + "step": 15317 + }, + { + "epoch": 0.8577668271922948, + "grad_norm": 1.3247824907302856, + "learning_rate": 7.655e-05, + "loss": 0.4832, + "step": 15318 + }, + { + "epoch": 0.8578228245044238, + "grad_norm": 1.0839030742645264, + "learning_rate": 7.6555e-05, + "loss": 0.3746, + "step": 15319 + }, + { + "epoch": 0.8578788218165528, + "grad_norm": 1.2216871976852417, + "learning_rate": 7.656e-05, + "loss": 0.378, + "step": 15320 + }, + { + "epoch": 0.8579348191286819, + "grad_norm": 1.0671459436416626, + "learning_rate": 7.656500000000001e-05, + "loss": 0.3347, + "step": 15321 + }, + { + "epoch": 0.8579908164408109, + "grad_norm": 1.2441599369049072, + "learning_rate": 7.657000000000001e-05, + "loss": 0.3581, + "step": 15322 + }, + { + "epoch": 0.8580468137529399, + "grad_norm": 1.1995137929916382, + "learning_rate": 7.6575e-05, + "loss": 0.3563, + "step": 15323 + }, + { + "epoch": 0.8581028110650689, + "grad_norm": 1.481691837310791, + "learning_rate": 7.658e-05, + "loss": 0.4686, + "step": 15324 + }, + { + "epoch": 0.8581588083771979, + "grad_norm": 1.2402544021606445, + "learning_rate": 7.658500000000001e-05, + "loss": 0.4165, + "step": 15325 + }, + { + "epoch": 0.858214805689327, + "grad_norm": 1.3916577100753784, + "learning_rate": 7.659000000000001e-05, + "loss": 0.3222, + "step": 15326 + }, + { + "epoch": 0.858270803001456, + "grad_norm": 
1.6940088272094727, + "learning_rate": 7.659500000000001e-05, + "loss": 0.468, + "step": 15327 + }, + { + "epoch": 0.858326800313585, + "grad_norm": 1.287348747253418, + "learning_rate": 7.66e-05, + "loss": 0.3956, + "step": 15328 + }, + { + "epoch": 0.858382797625714, + "grad_norm": 3.0163638591766357, + "learning_rate": 7.6605e-05, + "loss": 0.7145, + "step": 15329 + }, + { + "epoch": 0.858438794937843, + "grad_norm": 1.291735053062439, + "learning_rate": 7.661e-05, + "loss": 0.4072, + "step": 15330 + }, + { + "epoch": 0.858494792249972, + "grad_norm": 1.3850016593933105, + "learning_rate": 7.6615e-05, + "loss": 0.4995, + "step": 15331 + }, + { + "epoch": 0.8585507895621011, + "grad_norm": 1.3612180948257446, + "learning_rate": 7.662000000000001e-05, + "loss": 0.3674, + "step": 15332 + }, + { + "epoch": 0.8586067868742301, + "grad_norm": 1.2428797483444214, + "learning_rate": 7.6625e-05, + "loss": 0.4511, + "step": 15333 + }, + { + "epoch": 0.8586627841863591, + "grad_norm": 1.3600984811782837, + "learning_rate": 7.663e-05, + "loss": 0.4131, + "step": 15334 + }, + { + "epoch": 0.8587187814984881, + "grad_norm": 1.4094133377075195, + "learning_rate": 7.6635e-05, + "loss": 0.4748, + "step": 15335 + }, + { + "epoch": 0.8587747788106171, + "grad_norm": 2.0889029502868652, + "learning_rate": 7.664e-05, + "loss": 0.3707, + "step": 15336 + }, + { + "epoch": 0.858830776122746, + "grad_norm": 1.264032006263733, + "learning_rate": 7.6645e-05, + "loss": 0.3294, + "step": 15337 + }, + { + "epoch": 0.8588867734348751, + "grad_norm": 1.217186689376831, + "learning_rate": 7.664999999999999e-05, + "loss": 0.3299, + "step": 15338 + }, + { + "epoch": 0.8589427707470041, + "grad_norm": 2.11641001701355, + "learning_rate": 7.6655e-05, + "loss": 0.4554, + "step": 15339 + }, + { + "epoch": 0.8589987680591331, + "grad_norm": 1.3534648418426514, + "learning_rate": 7.666e-05, + "loss": 0.4348, + "step": 15340 + }, + { + "epoch": 0.8590547653712621, + "grad_norm": 0.9543183445930481, + 
"learning_rate": 7.666500000000001e-05, + "loss": 0.333, + "step": 15341 + }, + { + "epoch": 0.8591107626833911, + "grad_norm": 1.2656171321868896, + "learning_rate": 7.667000000000001e-05, + "loss": 0.4672, + "step": 15342 + }, + { + "epoch": 0.8591667599955202, + "grad_norm": 2.572000741958618, + "learning_rate": 7.667500000000001e-05, + "loss": 0.5044, + "step": 15343 + }, + { + "epoch": 0.8592227573076492, + "grad_norm": 1.4788674116134644, + "learning_rate": 7.668e-05, + "loss": 0.3862, + "step": 15344 + }, + { + "epoch": 0.8592787546197782, + "grad_norm": 1.557162880897522, + "learning_rate": 7.6685e-05, + "loss": 0.3956, + "step": 15345 + }, + { + "epoch": 0.8593347519319072, + "grad_norm": 1.3017799854278564, + "learning_rate": 7.669000000000001e-05, + "loss": 0.3904, + "step": 15346 + }, + { + "epoch": 0.8593907492440362, + "grad_norm": 2.0657570362091064, + "learning_rate": 7.669500000000001e-05, + "loss": 0.46, + "step": 15347 + }, + { + "epoch": 0.8594467465561653, + "grad_norm": 1.1990329027175903, + "learning_rate": 7.670000000000001e-05, + "loss": 0.3548, + "step": 15348 + }, + { + "epoch": 0.8595027438682943, + "grad_norm": 2.280388116836548, + "learning_rate": 7.6705e-05, + "loss": 0.4669, + "step": 15349 + }, + { + "epoch": 0.8595587411804233, + "grad_norm": 1.4168832302093506, + "learning_rate": 7.671e-05, + "loss": 0.4556, + "step": 15350 + }, + { + "epoch": 0.8596147384925523, + "grad_norm": 1.4790887832641602, + "learning_rate": 7.6715e-05, + "loss": 0.3881, + "step": 15351 + }, + { + "epoch": 0.8596707358046813, + "grad_norm": 1.7048856019973755, + "learning_rate": 7.672e-05, + "loss": 0.4468, + "step": 15352 + }, + { + "epoch": 0.8597267331168104, + "grad_norm": 1.3507384061813354, + "learning_rate": 7.672500000000001e-05, + "loss": 0.4268, + "step": 15353 + }, + { + "epoch": 0.8597827304289394, + "grad_norm": 1.2550270557403564, + "learning_rate": 7.673e-05, + "loss": 0.4574, + "step": 15354 + }, + { + "epoch": 0.8598387277410684, + 
"grad_norm": 1.3710527420043945, + "learning_rate": 7.6735e-05, + "loss": 0.422, + "step": 15355 + }, + { + "epoch": 0.8598947250531974, + "grad_norm": 1.1961610317230225, + "learning_rate": 7.674e-05, + "loss": 0.3967, + "step": 15356 + }, + { + "epoch": 0.8599507223653264, + "grad_norm": 1.5828375816345215, + "learning_rate": 7.6745e-05, + "loss": 0.4511, + "step": 15357 + }, + { + "epoch": 0.8600067196774555, + "grad_norm": 1.3332316875457764, + "learning_rate": 7.675e-05, + "loss": 0.4498, + "step": 15358 + }, + { + "epoch": 0.8600627169895845, + "grad_norm": 1.31644868850708, + "learning_rate": 7.675499999999999e-05, + "loss": 0.4281, + "step": 15359 + }, + { + "epoch": 0.8601187143017135, + "grad_norm": 1.2825725078582764, + "learning_rate": 7.676e-05, + "loss": 0.4345, + "step": 15360 + }, + { + "epoch": 0.8601747116138425, + "grad_norm": 1.291054368019104, + "learning_rate": 7.676500000000001e-05, + "loss": 0.3941, + "step": 15361 + }, + { + "epoch": 0.8602307089259715, + "grad_norm": 1.2731778621673584, + "learning_rate": 7.677000000000001e-05, + "loss": 0.438, + "step": 15362 + }, + { + "epoch": 0.8602867062381006, + "grad_norm": 1.3587656021118164, + "learning_rate": 7.677500000000001e-05, + "loss": 0.414, + "step": 15363 + }, + { + "epoch": 0.8603427035502296, + "grad_norm": 1.2961088418960571, + "learning_rate": 7.678000000000001e-05, + "loss": 0.4495, + "step": 15364 + }, + { + "epoch": 0.8603987008623586, + "grad_norm": 1.5349632501602173, + "learning_rate": 7.6785e-05, + "loss": 0.6048, + "step": 15365 + }, + { + "epoch": 0.8604546981744876, + "grad_norm": 1.4466733932495117, + "learning_rate": 7.679e-05, + "loss": 0.392, + "step": 15366 + }, + { + "epoch": 0.8605106954866166, + "grad_norm": 1.298704981803894, + "learning_rate": 7.6795e-05, + "loss": 0.4187, + "step": 15367 + }, + { + "epoch": 0.8605666927987456, + "grad_norm": 1.2818284034729004, + "learning_rate": 7.680000000000001e-05, + "loss": 0.4395, + "step": 15368 + }, + { + "epoch": 
0.8606226901108747, + "grad_norm": 1.5905777215957642, + "learning_rate": 7.680500000000001e-05, + "loss": 0.4923, + "step": 15369 + }, + { + "epoch": 0.8606786874230037, + "grad_norm": 1.232959508895874, + "learning_rate": 7.681e-05, + "loss": 0.3201, + "step": 15370 + }, + { + "epoch": 0.8607346847351327, + "grad_norm": 1.4469730854034424, + "learning_rate": 7.6815e-05, + "loss": 0.4906, + "step": 15371 + }, + { + "epoch": 0.8607906820472617, + "grad_norm": 1.1165945529937744, + "learning_rate": 7.682e-05, + "loss": 0.4475, + "step": 15372 + }, + { + "epoch": 0.8608466793593907, + "grad_norm": 1.3485347032546997, + "learning_rate": 7.6825e-05, + "loss": 0.4444, + "step": 15373 + }, + { + "epoch": 0.8609026766715198, + "grad_norm": 1.6563959121704102, + "learning_rate": 7.683000000000001e-05, + "loss": 0.4601, + "step": 15374 + }, + { + "epoch": 0.8609586739836488, + "grad_norm": 1.7581827640533447, + "learning_rate": 7.6835e-05, + "loss": 0.4572, + "step": 15375 + }, + { + "epoch": 0.8610146712957778, + "grad_norm": 1.1497478485107422, + "learning_rate": 7.684e-05, + "loss": 0.3935, + "step": 15376 + }, + { + "epoch": 0.8610706686079068, + "grad_norm": 1.2260955572128296, + "learning_rate": 7.6845e-05, + "loss": 0.3778, + "step": 15377 + }, + { + "epoch": 0.8611266659200358, + "grad_norm": 1.366969108581543, + "learning_rate": 7.685e-05, + "loss": 0.4371, + "step": 15378 + }, + { + "epoch": 0.8611826632321649, + "grad_norm": 1.4714306592941284, + "learning_rate": 7.6855e-05, + "loss": 0.4312, + "step": 15379 + }, + { + "epoch": 0.8612386605442939, + "grad_norm": 1.4076884984970093, + "learning_rate": 7.685999999999999e-05, + "loss": 0.466, + "step": 15380 + }, + { + "epoch": 0.8612946578564229, + "grad_norm": 1.2253717184066772, + "learning_rate": 7.6865e-05, + "loss": 0.4248, + "step": 15381 + }, + { + "epoch": 0.8613506551685519, + "grad_norm": 1.476537823677063, + "learning_rate": 7.687000000000001e-05, + "loss": 0.3281, + "step": 15382 + }, + { + "epoch": 
0.8614066524806809, + "grad_norm": 1.4724180698394775, + "learning_rate": 7.687500000000001e-05, + "loss": 0.4495, + "step": 15383 + }, + { + "epoch": 0.86146264979281, + "grad_norm": 1.313353180885315, + "learning_rate": 7.688000000000001e-05, + "loss": 0.4246, + "step": 15384 + }, + { + "epoch": 0.861518647104939, + "grad_norm": 1.5207279920578003, + "learning_rate": 7.6885e-05, + "loss": 0.5077, + "step": 15385 + }, + { + "epoch": 0.861574644417068, + "grad_norm": 1.2960842847824097, + "learning_rate": 7.689e-05, + "loss": 0.4312, + "step": 15386 + }, + { + "epoch": 0.861630641729197, + "grad_norm": 1.3373053073883057, + "learning_rate": 7.6895e-05, + "loss": 0.5328, + "step": 15387 + }, + { + "epoch": 0.861686639041326, + "grad_norm": 1.7143656015396118, + "learning_rate": 7.69e-05, + "loss": 0.5065, + "step": 15388 + }, + { + "epoch": 0.861742636353455, + "grad_norm": 1.6221736669540405, + "learning_rate": 7.690500000000001e-05, + "loss": 0.4077, + "step": 15389 + }, + { + "epoch": 0.8617986336655841, + "grad_norm": 1.740646243095398, + "learning_rate": 7.691000000000001e-05, + "loss": 0.4296, + "step": 15390 + }, + { + "epoch": 0.8618546309777131, + "grad_norm": 1.6884620189666748, + "learning_rate": 7.6915e-05, + "loss": 0.6193, + "step": 15391 + }, + { + "epoch": 0.8619106282898421, + "grad_norm": 1.6343050003051758, + "learning_rate": 7.692e-05, + "loss": 0.6356, + "step": 15392 + }, + { + "epoch": 0.8619666256019711, + "grad_norm": 1.3310794830322266, + "learning_rate": 7.6925e-05, + "loss": 0.4613, + "step": 15393 + }, + { + "epoch": 0.8620226229141001, + "grad_norm": 1.2125768661499023, + "learning_rate": 7.693e-05, + "loss": 0.3971, + "step": 15394 + }, + { + "epoch": 0.8620786202262292, + "grad_norm": 1.165642499923706, + "learning_rate": 7.693500000000001e-05, + "loss": 0.4871, + "step": 15395 + }, + { + "epoch": 0.8621346175383582, + "grad_norm": 1.8752145767211914, + "learning_rate": 7.694e-05, + "loss": 0.4689, + "step": 15396 + }, + { + "epoch": 
0.8621906148504872, + "grad_norm": 1.4456344842910767, + "learning_rate": 7.6945e-05, + "loss": 0.4598, + "step": 15397 + }, + { + "epoch": 0.8622466121626162, + "grad_norm": 1.1199686527252197, + "learning_rate": 7.695e-05, + "loss": 0.371, + "step": 15398 + }, + { + "epoch": 0.8623026094747452, + "grad_norm": 1.0419329404830933, + "learning_rate": 7.6955e-05, + "loss": 0.3077, + "step": 15399 + }, + { + "epoch": 0.8623586067868743, + "grad_norm": 1.233064889907837, + "learning_rate": 7.696e-05, + "loss": 0.3843, + "step": 15400 + }, + { + "epoch": 0.8624146040990033, + "grad_norm": 1.3966174125671387, + "learning_rate": 7.696499999999999e-05, + "loss": 0.5682, + "step": 15401 + }, + { + "epoch": 0.8624706014111323, + "grad_norm": 2.0899782180786133, + "learning_rate": 7.697e-05, + "loss": 0.5866, + "step": 15402 + }, + { + "epoch": 0.8625265987232613, + "grad_norm": 1.5108164548873901, + "learning_rate": 7.697500000000001e-05, + "loss": 0.4806, + "step": 15403 + }, + { + "epoch": 0.8625825960353903, + "grad_norm": 1.3947193622589111, + "learning_rate": 7.698000000000001e-05, + "loss": 0.4549, + "step": 15404 + }, + { + "epoch": 0.8626385933475194, + "grad_norm": 1.3674523830413818, + "learning_rate": 7.698500000000001e-05, + "loss": 0.5378, + "step": 15405 + }, + { + "epoch": 0.8626945906596484, + "grad_norm": 1.3286527395248413, + "learning_rate": 7.699e-05, + "loss": 0.604, + "step": 15406 + }, + { + "epoch": 0.8627505879717774, + "grad_norm": 1.4222450256347656, + "learning_rate": 7.6995e-05, + "loss": 0.4108, + "step": 15407 + }, + { + "epoch": 0.8628065852839064, + "grad_norm": 1.323704481124878, + "learning_rate": 7.7e-05, + "loss": 0.5305, + "step": 15408 + }, + { + "epoch": 0.8628625825960354, + "grad_norm": 1.5488990545272827, + "learning_rate": 7.7005e-05, + "loss": 0.6045, + "step": 15409 + }, + { + "epoch": 0.8629185799081645, + "grad_norm": 1.2930275201797485, + "learning_rate": 7.701000000000001e-05, + "loss": 0.5057, + "step": 15410 + }, + { + 
"epoch": 0.8629745772202935, + "grad_norm": 1.2379000186920166, + "learning_rate": 7.701500000000001e-05, + "loss": 0.4044, + "step": 15411 + }, + { + "epoch": 0.8630305745324225, + "grad_norm": 1.2456347942352295, + "learning_rate": 7.702e-05, + "loss": 0.4925, + "step": 15412 + }, + { + "epoch": 0.8630865718445515, + "grad_norm": 1.2400336265563965, + "learning_rate": 7.7025e-05, + "loss": 0.3953, + "step": 15413 + }, + { + "epoch": 0.8631425691566805, + "grad_norm": 1.3767483234405518, + "learning_rate": 7.703e-05, + "loss": 0.3056, + "step": 15414 + }, + { + "epoch": 0.8631985664688095, + "grad_norm": 1.6071982383728027, + "learning_rate": 7.7035e-05, + "loss": 0.7268, + "step": 15415 + }, + { + "epoch": 0.8632545637809386, + "grad_norm": 1.1186975240707397, + "learning_rate": 7.704000000000001e-05, + "loss": 0.3486, + "step": 15416 + }, + { + "epoch": 0.8633105610930676, + "grad_norm": 1.5963722467422485, + "learning_rate": 7.7045e-05, + "loss": 0.4169, + "step": 15417 + }, + { + "epoch": 0.8633665584051966, + "grad_norm": 1.4779984951019287, + "learning_rate": 7.705e-05, + "loss": 0.5113, + "step": 15418 + }, + { + "epoch": 0.8634225557173256, + "grad_norm": 1.7189549207687378, + "learning_rate": 7.7055e-05, + "loss": 0.6054, + "step": 15419 + }, + { + "epoch": 0.8634785530294545, + "grad_norm": 1.1392791271209717, + "learning_rate": 7.706e-05, + "loss": 0.3658, + "step": 15420 + }, + { + "epoch": 0.8635345503415836, + "grad_norm": 1.5345208644866943, + "learning_rate": 7.7065e-05, + "loss": 0.5942, + "step": 15421 + }, + { + "epoch": 0.8635905476537126, + "grad_norm": 1.671932339668274, + "learning_rate": 7.707e-05, + "loss": 0.4105, + "step": 15422 + }, + { + "epoch": 0.8636465449658416, + "grad_norm": 1.54603111743927, + "learning_rate": 7.7075e-05, + "loss": 0.4105, + "step": 15423 + }, + { + "epoch": 0.8637025422779706, + "grad_norm": 1.509049892425537, + "learning_rate": 7.708000000000001e-05, + "loss": 0.4501, + "step": 15424 + }, + { + "epoch": 
0.8637585395900996, + "grad_norm": 1.3530577421188354, + "learning_rate": 7.708500000000001e-05, + "loss": 0.5009, + "step": 15425 + }, + { + "epoch": 0.8638145369022286, + "grad_norm": 1.230531930923462, + "learning_rate": 7.709000000000001e-05, + "loss": 0.3958, + "step": 15426 + }, + { + "epoch": 0.8638705342143577, + "grad_norm": 1.2434871196746826, + "learning_rate": 7.7095e-05, + "loss": 0.5073, + "step": 15427 + }, + { + "epoch": 0.8639265315264867, + "grad_norm": 1.2971305847167969, + "learning_rate": 7.71e-05, + "loss": 0.4372, + "step": 15428 + }, + { + "epoch": 0.8639825288386157, + "grad_norm": 1.4570695161819458, + "learning_rate": 7.7105e-05, + "loss": 0.4053, + "step": 15429 + }, + { + "epoch": 0.8640385261507447, + "grad_norm": 1.3198705911636353, + "learning_rate": 7.711e-05, + "loss": 0.381, + "step": 15430 + }, + { + "epoch": 0.8640945234628737, + "grad_norm": 1.161210536956787, + "learning_rate": 7.711500000000001e-05, + "loss": 0.4301, + "step": 15431 + }, + { + "epoch": 0.8641505207750028, + "grad_norm": 1.0020915269851685, + "learning_rate": 7.712000000000001e-05, + "loss": 0.3555, + "step": 15432 + }, + { + "epoch": 0.8642065180871318, + "grad_norm": 1.200257420539856, + "learning_rate": 7.7125e-05, + "loss": 0.4197, + "step": 15433 + }, + { + "epoch": 0.8642625153992608, + "grad_norm": 1.7010304927825928, + "learning_rate": 7.713e-05, + "loss": 0.5349, + "step": 15434 + }, + { + "epoch": 0.8643185127113898, + "grad_norm": 1.8268228769302368, + "learning_rate": 7.7135e-05, + "loss": 0.5794, + "step": 15435 + }, + { + "epoch": 0.8643745100235188, + "grad_norm": 1.5086874961853027, + "learning_rate": 7.714e-05, + "loss": 0.4878, + "step": 15436 + }, + { + "epoch": 0.8644305073356479, + "grad_norm": 1.0217642784118652, + "learning_rate": 7.7145e-05, + "loss": 0.3234, + "step": 15437 + }, + { + "epoch": 0.8644865046477769, + "grad_norm": 5.203007698059082, + "learning_rate": 7.715e-05, + "loss": 0.511, + "step": 15438 + }, + { + "epoch": 
0.8645425019599059, + "grad_norm": 1.233720302581787, + "learning_rate": 7.7155e-05, + "loss": 0.4566, + "step": 15439 + }, + { + "epoch": 0.8645984992720349, + "grad_norm": 1.3341400623321533, + "learning_rate": 7.716e-05, + "loss": 0.404, + "step": 15440 + }, + { + "epoch": 0.8646544965841639, + "grad_norm": 1.4108461141586304, + "learning_rate": 7.7165e-05, + "loss": 0.4925, + "step": 15441 + }, + { + "epoch": 0.864710493896293, + "grad_norm": 1.3659836053848267, + "learning_rate": 7.717000000000001e-05, + "loss": 0.4156, + "step": 15442 + }, + { + "epoch": 0.864766491208422, + "grad_norm": 1.1967355012893677, + "learning_rate": 7.7175e-05, + "loss": 0.386, + "step": 15443 + }, + { + "epoch": 0.864822488520551, + "grad_norm": 1.2768745422363281, + "learning_rate": 7.718e-05, + "loss": 0.4394, + "step": 15444 + }, + { + "epoch": 0.86487848583268, + "grad_norm": 1.3104383945465088, + "learning_rate": 7.718500000000001e-05, + "loss": 0.3974, + "step": 15445 + }, + { + "epoch": 0.864934483144809, + "grad_norm": 1.3468825817108154, + "learning_rate": 7.719000000000001e-05, + "loss": 0.4178, + "step": 15446 + }, + { + "epoch": 0.864990480456938, + "grad_norm": 1.2500470876693726, + "learning_rate": 7.719500000000001e-05, + "loss": 0.4704, + "step": 15447 + }, + { + "epoch": 0.8650464777690671, + "grad_norm": 1.2548214197158813, + "learning_rate": 7.72e-05, + "loss": 0.3746, + "step": 15448 + }, + { + "epoch": 0.8651024750811961, + "grad_norm": 1.4752436876296997, + "learning_rate": 7.7205e-05, + "loss": 0.5735, + "step": 15449 + }, + { + "epoch": 0.8651584723933251, + "grad_norm": 1.3975766897201538, + "learning_rate": 7.721e-05, + "loss": 0.4493, + "step": 15450 + }, + { + "epoch": 0.8652144697054541, + "grad_norm": 6.8397417068481445, + "learning_rate": 7.7215e-05, + "loss": 0.4978, + "step": 15451 + }, + { + "epoch": 0.8652704670175831, + "grad_norm": 1.2908713817596436, + "learning_rate": 7.722000000000001e-05, + "loss": 0.4509, + "step": 15452 + }, + { + "epoch": 
0.8653264643297122, + "grad_norm": 1.1639848947525024, + "learning_rate": 7.722500000000001e-05, + "loss": 0.5475, + "step": 15453 + }, + { + "epoch": 0.8653824616418412, + "grad_norm": 1.2309406995773315, + "learning_rate": 7.723e-05, + "loss": 0.4391, + "step": 15454 + }, + { + "epoch": 0.8654384589539702, + "grad_norm": 1.3136157989501953, + "learning_rate": 7.7235e-05, + "loss": 0.4677, + "step": 15455 + }, + { + "epoch": 0.8654944562660992, + "grad_norm": 1.334875226020813, + "learning_rate": 7.724e-05, + "loss": 0.3689, + "step": 15456 + }, + { + "epoch": 0.8655504535782282, + "grad_norm": 1.2034419775009155, + "learning_rate": 7.7245e-05, + "loss": 0.4591, + "step": 15457 + }, + { + "epoch": 0.8656064508903573, + "grad_norm": 1.3061434030532837, + "learning_rate": 7.725e-05, + "loss": 0.4413, + "step": 15458 + }, + { + "epoch": 0.8656624482024863, + "grad_norm": 1.155679702758789, + "learning_rate": 7.7255e-05, + "loss": 0.386, + "step": 15459 + }, + { + "epoch": 0.8657184455146153, + "grad_norm": 1.4922846555709839, + "learning_rate": 7.726e-05, + "loss": 0.5156, + "step": 15460 + }, + { + "epoch": 0.8657744428267443, + "grad_norm": 1.457108497619629, + "learning_rate": 7.7265e-05, + "loss": 0.5872, + "step": 15461 + }, + { + "epoch": 0.8658304401388733, + "grad_norm": 1.2564916610717773, + "learning_rate": 7.727000000000001e-05, + "loss": 0.4891, + "step": 15462 + }, + { + "epoch": 0.8658864374510024, + "grad_norm": 1.2155927419662476, + "learning_rate": 7.727500000000001e-05, + "loss": 0.4745, + "step": 15463 + }, + { + "epoch": 0.8659424347631314, + "grad_norm": 1.302109956741333, + "learning_rate": 7.728e-05, + "loss": 0.4322, + "step": 15464 + }, + { + "epoch": 0.8659984320752604, + "grad_norm": 1.4536240100860596, + "learning_rate": 7.7285e-05, + "loss": 0.4242, + "step": 15465 + }, + { + "epoch": 0.8660544293873894, + "grad_norm": 1.3679760694503784, + "learning_rate": 7.729e-05, + "loss": 0.4916, + "step": 15466 + }, + { + "epoch": 
0.8661104266995184, + "grad_norm": 1.3587265014648438, + "learning_rate": 7.729500000000001e-05, + "loss": 0.4235, + "step": 15467 + }, + { + "epoch": 0.8661664240116475, + "grad_norm": 1.4559144973754883, + "learning_rate": 7.730000000000001e-05, + "loss": 0.4911, + "step": 15468 + }, + { + "epoch": 0.8662224213237765, + "grad_norm": 1.8773778676986694, + "learning_rate": 7.7305e-05, + "loss": 0.6553, + "step": 15469 + }, + { + "epoch": 0.8662784186359055, + "grad_norm": 1.2014228105545044, + "learning_rate": 7.731e-05, + "loss": 0.3914, + "step": 15470 + }, + { + "epoch": 0.8663344159480345, + "grad_norm": 1.4217487573623657, + "learning_rate": 7.7315e-05, + "loss": 0.3885, + "step": 15471 + }, + { + "epoch": 0.8663904132601635, + "grad_norm": 1.4292007684707642, + "learning_rate": 7.732e-05, + "loss": 0.4123, + "step": 15472 + }, + { + "epoch": 0.8664464105722925, + "grad_norm": 1.520644187927246, + "learning_rate": 7.732500000000001e-05, + "loss": 0.6345, + "step": 15473 + }, + { + "epoch": 0.8665024078844216, + "grad_norm": 1.1639766693115234, + "learning_rate": 7.733e-05, + "loss": 0.4798, + "step": 15474 + }, + { + "epoch": 0.8665584051965506, + "grad_norm": 1.337589144706726, + "learning_rate": 7.7335e-05, + "loss": 0.3851, + "step": 15475 + }, + { + "epoch": 0.8666144025086796, + "grad_norm": 1.5339282751083374, + "learning_rate": 7.734e-05, + "loss": 0.4823, + "step": 15476 + }, + { + "epoch": 0.8666703998208086, + "grad_norm": 1.498666524887085, + "learning_rate": 7.7345e-05, + "loss": 0.5346, + "step": 15477 + }, + { + "epoch": 0.8667263971329376, + "grad_norm": 1.5173438787460327, + "learning_rate": 7.735e-05, + "loss": 0.4974, + "step": 15478 + }, + { + "epoch": 0.8667823944450667, + "grad_norm": 1.3957185745239258, + "learning_rate": 7.7355e-05, + "loss": 0.5111, + "step": 15479 + }, + { + "epoch": 0.8668383917571957, + "grad_norm": 1.2964214086532593, + "learning_rate": 7.736e-05, + "loss": 0.4033, + "step": 15480 + }, + { + "epoch": 
0.8668943890693247, + "grad_norm": 1.4217289686203003, + "learning_rate": 7.7365e-05, + "loss": 0.5553, + "step": 15481 + }, + { + "epoch": 0.8669503863814537, + "grad_norm": 1.6912500858306885, + "learning_rate": 7.737000000000001e-05, + "loss": 0.4515, + "step": 15482 + }, + { + "epoch": 0.8670063836935827, + "grad_norm": 1.4011566638946533, + "learning_rate": 7.737500000000001e-05, + "loss": 0.467, + "step": 15483 + }, + { + "epoch": 0.8670623810057118, + "grad_norm": 1.3058698177337646, + "learning_rate": 7.738000000000001e-05, + "loss": 0.4093, + "step": 15484 + }, + { + "epoch": 0.8671183783178408, + "grad_norm": 1.1876654624938965, + "learning_rate": 7.7385e-05, + "loss": 0.5413, + "step": 15485 + }, + { + "epoch": 0.8671743756299698, + "grad_norm": 1.3856619596481323, + "learning_rate": 7.739e-05, + "loss": 0.5422, + "step": 15486 + }, + { + "epoch": 0.8672303729420988, + "grad_norm": 1.1427805423736572, + "learning_rate": 7.7395e-05, + "loss": 0.3884, + "step": 15487 + }, + { + "epoch": 0.8672863702542278, + "grad_norm": 1.272996425628662, + "learning_rate": 7.740000000000001e-05, + "loss": 0.6489, + "step": 15488 + }, + { + "epoch": 0.8673423675663569, + "grad_norm": 1.6561285257339478, + "learning_rate": 7.740500000000001e-05, + "loss": 0.5978, + "step": 15489 + }, + { + "epoch": 0.8673983648784859, + "grad_norm": 1.341091513633728, + "learning_rate": 7.741e-05, + "loss": 0.4276, + "step": 15490 + }, + { + "epoch": 0.8674543621906149, + "grad_norm": 1.3549751043319702, + "learning_rate": 7.7415e-05, + "loss": 0.4541, + "step": 15491 + }, + { + "epoch": 0.8675103595027439, + "grad_norm": 1.370042085647583, + "learning_rate": 7.742e-05, + "loss": 0.379, + "step": 15492 + }, + { + "epoch": 0.8675663568148729, + "grad_norm": 1.3413833379745483, + "learning_rate": 7.7425e-05, + "loss": 0.4339, + "step": 15493 + }, + { + "epoch": 0.867622354127002, + "grad_norm": 1.3769487142562866, + "learning_rate": 7.743000000000001e-05, + "loss": 0.4813, + "step": 15494 + 
}, + { + "epoch": 0.867678351439131, + "grad_norm": 1.2802037000656128, + "learning_rate": 7.7435e-05, + "loss": 0.492, + "step": 15495 + }, + { + "epoch": 0.86773434875126, + "grad_norm": 1.201843500137329, + "learning_rate": 7.744e-05, + "loss": 0.3885, + "step": 15496 + }, + { + "epoch": 0.867790346063389, + "grad_norm": 1.21647310256958, + "learning_rate": 7.7445e-05, + "loss": 0.3994, + "step": 15497 + }, + { + "epoch": 0.867846343375518, + "grad_norm": 1.3415783643722534, + "learning_rate": 7.745e-05, + "loss": 0.3511, + "step": 15498 + }, + { + "epoch": 0.867902340687647, + "grad_norm": 1.1152540445327759, + "learning_rate": 7.7455e-05, + "loss": 0.406, + "step": 15499 + }, + { + "epoch": 0.8679583379997761, + "grad_norm": 1.1693726778030396, + "learning_rate": 7.746e-05, + "loss": 0.4369, + "step": 15500 + }, + { + "epoch": 0.8680143353119051, + "grad_norm": 1.5938900709152222, + "learning_rate": 7.7465e-05, + "loss": 0.5564, + "step": 15501 + }, + { + "epoch": 0.868070332624034, + "grad_norm": 1.3347742557525635, + "learning_rate": 7.747000000000002e-05, + "loss": 0.481, + "step": 15502 + }, + { + "epoch": 0.868126329936163, + "grad_norm": 1.2332980632781982, + "learning_rate": 7.747500000000001e-05, + "loss": 0.4362, + "step": 15503 + }, + { + "epoch": 0.868182327248292, + "grad_norm": 1.2018414735794067, + "learning_rate": 7.748000000000001e-05, + "loss": 0.4191, + "step": 15504 + }, + { + "epoch": 0.868238324560421, + "grad_norm": 1.2105592489242554, + "learning_rate": 7.748500000000001e-05, + "loss": 0.517, + "step": 15505 + }, + { + "epoch": 0.8682943218725501, + "grad_norm": 1.053380012512207, + "learning_rate": 7.749e-05, + "loss": 0.4104, + "step": 15506 + }, + { + "epoch": 0.8683503191846791, + "grad_norm": 1.3285924196243286, + "learning_rate": 7.7495e-05, + "loss": 0.4991, + "step": 15507 + }, + { + "epoch": 0.8684063164968081, + "grad_norm": 1.1642602682113647, + "learning_rate": 7.75e-05, + "loss": 0.3911, + "step": 15508 + }, + { + "epoch": 
0.8684623138089371, + "grad_norm": 1.2761082649230957, + "learning_rate": 7.750500000000001e-05, + "loss": 0.4166, + "step": 15509 + }, + { + "epoch": 0.8685183111210661, + "grad_norm": 1.2622443437576294, + "learning_rate": 7.751000000000001e-05, + "loss": 0.4354, + "step": 15510 + }, + { + "epoch": 0.8685743084331952, + "grad_norm": 1.3382610082626343, + "learning_rate": 7.7515e-05, + "loss": 0.3583, + "step": 15511 + }, + { + "epoch": 0.8686303057453242, + "grad_norm": 1.1973257064819336, + "learning_rate": 7.752e-05, + "loss": 0.3613, + "step": 15512 + }, + { + "epoch": 0.8686863030574532, + "grad_norm": 1.2233986854553223, + "learning_rate": 7.7525e-05, + "loss": 0.4168, + "step": 15513 + }, + { + "epoch": 0.8687423003695822, + "grad_norm": 1.197396159172058, + "learning_rate": 7.753e-05, + "loss": 0.344, + "step": 15514 + }, + { + "epoch": 0.8687982976817112, + "grad_norm": 1.5905447006225586, + "learning_rate": 7.7535e-05, + "loss": 0.6329, + "step": 15515 + }, + { + "epoch": 0.8688542949938403, + "grad_norm": 1.442287802696228, + "learning_rate": 7.754e-05, + "loss": 0.3922, + "step": 15516 + }, + { + "epoch": 0.8689102923059693, + "grad_norm": 1.1113076210021973, + "learning_rate": 7.7545e-05, + "loss": 0.4077, + "step": 15517 + }, + { + "epoch": 0.8689662896180983, + "grad_norm": 1.2457160949707031, + "learning_rate": 7.755e-05, + "loss": 0.471, + "step": 15518 + }, + { + "epoch": 0.8690222869302273, + "grad_norm": 1.5347809791564941, + "learning_rate": 7.7555e-05, + "loss": 0.3644, + "step": 15519 + }, + { + "epoch": 0.8690782842423563, + "grad_norm": 1.2472450733184814, + "learning_rate": 7.756e-05, + "loss": 0.4156, + "step": 15520 + }, + { + "epoch": 0.8691342815544854, + "grad_norm": 1.208138108253479, + "learning_rate": 7.7565e-05, + "loss": 0.468, + "step": 15521 + }, + { + "epoch": 0.8691902788666144, + "grad_norm": 1.2988399267196655, + "learning_rate": 7.757e-05, + "loss": 0.4566, + "step": 15522 + }, + { + "epoch": 0.8692462761787434, + 
"grad_norm": 1.5431101322174072, + "learning_rate": 7.757500000000002e-05, + "loss": 0.4289, + "step": 15523 + }, + { + "epoch": 0.8693022734908724, + "grad_norm": 1.3862583637237549, + "learning_rate": 7.758000000000001e-05, + "loss": 0.5599, + "step": 15524 + }, + { + "epoch": 0.8693582708030014, + "grad_norm": 1.4205782413482666, + "learning_rate": 7.758500000000001e-05, + "loss": 0.4798, + "step": 15525 + }, + { + "epoch": 0.8694142681151305, + "grad_norm": 1.4005579948425293, + "learning_rate": 7.759000000000001e-05, + "loss": 0.3991, + "step": 15526 + }, + { + "epoch": 0.8694702654272595, + "grad_norm": 1.517439603805542, + "learning_rate": 7.7595e-05, + "loss": 0.4242, + "step": 15527 + }, + { + "epoch": 0.8695262627393885, + "grad_norm": 1.5895737409591675, + "learning_rate": 7.76e-05, + "loss": 0.5689, + "step": 15528 + }, + { + "epoch": 0.8695822600515175, + "grad_norm": 1.289066195487976, + "learning_rate": 7.7605e-05, + "loss": 0.4296, + "step": 15529 + }, + { + "epoch": 0.8696382573636465, + "grad_norm": 1.2449251413345337, + "learning_rate": 7.761000000000001e-05, + "loss": 0.4266, + "step": 15530 + }, + { + "epoch": 0.8696942546757755, + "grad_norm": 1.6113264560699463, + "learning_rate": 7.761500000000001e-05, + "loss": 0.5132, + "step": 15531 + }, + { + "epoch": 0.8697502519879046, + "grad_norm": 1.1368285417556763, + "learning_rate": 7.762e-05, + "loss": 0.439, + "step": 15532 + }, + { + "epoch": 0.8698062493000336, + "grad_norm": 1.2893623113632202, + "learning_rate": 7.7625e-05, + "loss": 0.4703, + "step": 15533 + }, + { + "epoch": 0.8698622466121626, + "grad_norm": 1.5299174785614014, + "learning_rate": 7.763e-05, + "loss": 0.4481, + "step": 15534 + }, + { + "epoch": 0.8699182439242916, + "grad_norm": 1.1706514358520508, + "learning_rate": 7.7635e-05, + "loss": 0.3218, + "step": 15535 + }, + { + "epoch": 0.8699742412364206, + "grad_norm": 1.2985281944274902, + "learning_rate": 7.764e-05, + "loss": 0.3237, + "step": 15536 + }, + { + "epoch": 
0.8700302385485497, + "grad_norm": 1.3335989713668823, + "learning_rate": 7.7645e-05, + "loss": 0.4881, + "step": 15537 + }, + { + "epoch": 0.8700862358606787, + "grad_norm": 1.3737127780914307, + "learning_rate": 7.765e-05, + "loss": 0.3775, + "step": 15538 + }, + { + "epoch": 0.8701422331728077, + "grad_norm": 1.4452928304672241, + "learning_rate": 7.7655e-05, + "loss": 0.3665, + "step": 15539 + }, + { + "epoch": 0.8701982304849367, + "grad_norm": 1.1835672855377197, + "learning_rate": 7.766e-05, + "loss": 0.3957, + "step": 15540 + }, + { + "epoch": 0.8702542277970657, + "grad_norm": 1.1987941265106201, + "learning_rate": 7.7665e-05, + "loss": 0.4849, + "step": 15541 + }, + { + "epoch": 0.8703102251091948, + "grad_norm": 1.0974875688552856, + "learning_rate": 7.767e-05, + "loss": 0.4018, + "step": 15542 + }, + { + "epoch": 0.8703662224213238, + "grad_norm": 1.0797059535980225, + "learning_rate": 7.7675e-05, + "loss": 0.3083, + "step": 15543 + }, + { + "epoch": 0.8704222197334528, + "grad_norm": 1.196189284324646, + "learning_rate": 7.768e-05, + "loss": 0.3826, + "step": 15544 + }, + { + "epoch": 0.8704782170455818, + "grad_norm": 1.3408902883529663, + "learning_rate": 7.768500000000001e-05, + "loss": 0.4565, + "step": 15545 + }, + { + "epoch": 0.8705342143577108, + "grad_norm": 1.1600315570831299, + "learning_rate": 7.769000000000001e-05, + "loss": 0.3222, + "step": 15546 + }, + { + "epoch": 0.8705902116698399, + "grad_norm": 1.8778573274612427, + "learning_rate": 7.769500000000001e-05, + "loss": 0.5961, + "step": 15547 + }, + { + "epoch": 0.8706462089819689, + "grad_norm": 1.2116731405258179, + "learning_rate": 7.77e-05, + "loss": 0.4234, + "step": 15548 + }, + { + "epoch": 0.8707022062940979, + "grad_norm": 1.3063853979110718, + "learning_rate": 7.7705e-05, + "loss": 0.38, + "step": 15549 + }, + { + "epoch": 0.8707582036062269, + "grad_norm": 1.646710753440857, + "learning_rate": 7.771e-05, + "loss": 0.433, + "step": 15550 + }, + { + "epoch": 
0.8708142009183559, + "grad_norm": 1.3051859140396118, + "learning_rate": 7.771500000000001e-05, + "loss": 0.4738, + "step": 15551 + }, + { + "epoch": 0.870870198230485, + "grad_norm": 1.4985663890838623, + "learning_rate": 7.772000000000001e-05, + "loss": 0.4587, + "step": 15552 + }, + { + "epoch": 0.870926195542614, + "grad_norm": 1.1227905750274658, + "learning_rate": 7.7725e-05, + "loss": 0.3363, + "step": 15553 + }, + { + "epoch": 0.870982192854743, + "grad_norm": 1.4132035970687866, + "learning_rate": 7.773e-05, + "loss": 0.5337, + "step": 15554 + }, + { + "epoch": 0.871038190166872, + "grad_norm": 1.3326305150985718, + "learning_rate": 7.7735e-05, + "loss": 0.4576, + "step": 15555 + }, + { + "epoch": 0.871094187479001, + "grad_norm": 1.2112631797790527, + "learning_rate": 7.774e-05, + "loss": 0.405, + "step": 15556 + }, + { + "epoch": 0.87115018479113, + "grad_norm": 1.383243203163147, + "learning_rate": 7.7745e-05, + "loss": 0.6684, + "step": 15557 + }, + { + "epoch": 0.8712061821032591, + "grad_norm": 1.0421710014343262, + "learning_rate": 7.775e-05, + "loss": 0.3893, + "step": 15558 + }, + { + "epoch": 0.8712621794153881, + "grad_norm": 1.2611445188522339, + "learning_rate": 7.7755e-05, + "loss": 0.5054, + "step": 15559 + }, + { + "epoch": 0.8713181767275171, + "grad_norm": 1.14493727684021, + "learning_rate": 7.776e-05, + "loss": 0.352, + "step": 15560 + }, + { + "epoch": 0.8713741740396461, + "grad_norm": 1.6791402101516724, + "learning_rate": 7.7765e-05, + "loss": 0.5136, + "step": 15561 + }, + { + "epoch": 0.8714301713517751, + "grad_norm": 1.4859578609466553, + "learning_rate": 7.777e-05, + "loss": 0.3403, + "step": 15562 + }, + { + "epoch": 0.8714861686639042, + "grad_norm": 1.1438791751861572, + "learning_rate": 7.7775e-05, + "loss": 0.3434, + "step": 15563 + }, + { + "epoch": 0.8715421659760332, + "grad_norm": 1.1297763586044312, + "learning_rate": 7.778e-05, + "loss": 0.4069, + "step": 15564 + }, + { + "epoch": 0.8715981632881622, + "grad_norm": 
1.4422338008880615, + "learning_rate": 7.7785e-05, + "loss": 0.4356, + "step": 15565 + }, + { + "epoch": 0.8716541606002912, + "grad_norm": 1.3220010995864868, + "learning_rate": 7.779000000000001e-05, + "loss": 0.4501, + "step": 15566 + }, + { + "epoch": 0.8717101579124202, + "grad_norm": 1.1621001958847046, + "learning_rate": 7.779500000000001e-05, + "loss": 0.4435, + "step": 15567 + }, + { + "epoch": 0.8717661552245493, + "grad_norm": 1.2144877910614014, + "learning_rate": 7.780000000000001e-05, + "loss": 0.4044, + "step": 15568 + }, + { + "epoch": 0.8718221525366783, + "grad_norm": 1.7398021221160889, + "learning_rate": 7.7805e-05, + "loss": 0.7119, + "step": 15569 + }, + { + "epoch": 0.8718781498488073, + "grad_norm": 1.3021275997161865, + "learning_rate": 7.781e-05, + "loss": 0.5303, + "step": 15570 + }, + { + "epoch": 0.8719341471609363, + "grad_norm": 1.234926700592041, + "learning_rate": 7.7815e-05, + "loss": 0.4424, + "step": 15571 + }, + { + "epoch": 0.8719901444730653, + "grad_norm": 1.5147863626480103, + "learning_rate": 7.782000000000001e-05, + "loss": 0.4176, + "step": 15572 + }, + { + "epoch": 0.8720461417851944, + "grad_norm": 1.1744723320007324, + "learning_rate": 7.782500000000001e-05, + "loss": 0.3972, + "step": 15573 + }, + { + "epoch": 0.8721021390973234, + "grad_norm": 1.2651808261871338, + "learning_rate": 7.783e-05, + "loss": 0.3941, + "step": 15574 + }, + { + "epoch": 0.8721581364094524, + "grad_norm": 1.311531901359558, + "learning_rate": 7.7835e-05, + "loss": 0.4906, + "step": 15575 + }, + { + "epoch": 0.8722141337215814, + "grad_norm": 1.4220561981201172, + "learning_rate": 7.784e-05, + "loss": 0.4883, + "step": 15576 + }, + { + "epoch": 0.8722701310337104, + "grad_norm": 1.3571114540100098, + "learning_rate": 7.7845e-05, + "loss": 0.6236, + "step": 15577 + }, + { + "epoch": 0.8723261283458394, + "grad_norm": 1.26026451587677, + "learning_rate": 7.785e-05, + "loss": 0.5265, + "step": 15578 + }, + { + "epoch": 0.8723821256579685, + 
"grad_norm": 1.316271185874939, + "learning_rate": 7.7855e-05, + "loss": 0.4858, + "step": 15579 + }, + { + "epoch": 0.8724381229700975, + "grad_norm": 1.3338090181350708, + "learning_rate": 7.786e-05, + "loss": 0.4271, + "step": 15580 + }, + { + "epoch": 0.8724941202822265, + "grad_norm": 1.221513271331787, + "learning_rate": 7.7865e-05, + "loss": 0.3846, + "step": 15581 + }, + { + "epoch": 0.8725501175943555, + "grad_norm": 1.2444828748703003, + "learning_rate": 7.787e-05, + "loss": 0.4285, + "step": 15582 + }, + { + "epoch": 0.8726061149064845, + "grad_norm": 1.3883877992630005, + "learning_rate": 7.787500000000001e-05, + "loss": 0.3942, + "step": 15583 + }, + { + "epoch": 0.8726621122186136, + "grad_norm": 1.7320107221603394, + "learning_rate": 7.788e-05, + "loss": 0.4515, + "step": 15584 + }, + { + "epoch": 0.8727181095307425, + "grad_norm": 1.1985026597976685, + "learning_rate": 7.7885e-05, + "loss": 0.4076, + "step": 15585 + }, + { + "epoch": 0.8727741068428715, + "grad_norm": 1.548317551612854, + "learning_rate": 7.789e-05, + "loss": 0.3293, + "step": 15586 + }, + { + "epoch": 0.8728301041550005, + "grad_norm": 1.435633897781372, + "learning_rate": 7.789500000000001e-05, + "loss": 0.4572, + "step": 15587 + }, + { + "epoch": 0.8728861014671295, + "grad_norm": 1.9437674283981323, + "learning_rate": 7.790000000000001e-05, + "loss": 0.508, + "step": 15588 + }, + { + "epoch": 0.8729420987792585, + "grad_norm": 5.298740863800049, + "learning_rate": 7.790500000000001e-05, + "loss": 0.5213, + "step": 15589 + }, + { + "epoch": 0.8729980960913876, + "grad_norm": 1.305383324623108, + "learning_rate": 7.791e-05, + "loss": 0.4151, + "step": 15590 + }, + { + "epoch": 0.8730540934035166, + "grad_norm": 1.3252203464508057, + "learning_rate": 7.7915e-05, + "loss": 0.4882, + "step": 15591 + }, + { + "epoch": 0.8731100907156456, + "grad_norm": 1.3112610578536987, + "learning_rate": 7.792e-05, + "loss": 0.3805, + "step": 15592 + }, + { + "epoch": 0.8731660880277746, + 
"grad_norm": 1.2379817962646484, + "learning_rate": 7.792500000000001e-05, + "loss": 0.4596, + "step": 15593 + }, + { + "epoch": 0.8732220853399036, + "grad_norm": 1.6146538257598877, + "learning_rate": 7.793000000000001e-05, + "loss": 0.4251, + "step": 15594 + }, + { + "epoch": 0.8732780826520327, + "grad_norm": 1.3385467529296875, + "learning_rate": 7.7935e-05, + "loss": 0.4956, + "step": 15595 + }, + { + "epoch": 0.8733340799641617, + "grad_norm": 1.4091204404830933, + "learning_rate": 7.794e-05, + "loss": 0.4343, + "step": 15596 + }, + { + "epoch": 0.8733900772762907, + "grad_norm": 1.2925491333007812, + "learning_rate": 7.7945e-05, + "loss": 0.4075, + "step": 15597 + }, + { + "epoch": 0.8734460745884197, + "grad_norm": 1.2249665260314941, + "learning_rate": 7.795e-05, + "loss": 0.4046, + "step": 15598 + }, + { + "epoch": 0.8735020719005487, + "grad_norm": 1.1581363677978516, + "learning_rate": 7.7955e-05, + "loss": 0.3682, + "step": 15599 + }, + { + "epoch": 0.8735580692126778, + "grad_norm": 1.7862924337387085, + "learning_rate": 7.796e-05, + "loss": 0.526, + "step": 15600 + }, + { + "epoch": 0.8736140665248068, + "grad_norm": 1.3446723222732544, + "learning_rate": 7.7965e-05, + "loss": 0.4079, + "step": 15601 + }, + { + "epoch": 0.8736700638369358, + "grad_norm": 1.334972620010376, + "learning_rate": 7.797e-05, + "loss": 0.4373, + "step": 15602 + }, + { + "epoch": 0.8737260611490648, + "grad_norm": 1.2533605098724365, + "learning_rate": 7.797500000000001e-05, + "loss": 0.3595, + "step": 15603 + }, + { + "epoch": 0.8737820584611938, + "grad_norm": 1.8424146175384521, + "learning_rate": 7.798000000000001e-05, + "loss": 0.566, + "step": 15604 + }, + { + "epoch": 0.8738380557733229, + "grad_norm": 1.4950603246688843, + "learning_rate": 7.7985e-05, + "loss": 0.378, + "step": 15605 + }, + { + "epoch": 0.8738940530854519, + "grad_norm": 1.2845555543899536, + "learning_rate": 7.799e-05, + "loss": 0.42, + "step": 15606 + }, + { + "epoch": 0.8739500503975809, + 
"grad_norm": 1.5531342029571533, + "learning_rate": 7.7995e-05, + "loss": 0.515, + "step": 15607 + }, + { + "epoch": 0.8740060477097099, + "grad_norm": 1.1322746276855469, + "learning_rate": 7.800000000000001e-05, + "loss": 0.354, + "step": 15608 + }, + { + "epoch": 0.8740620450218389, + "grad_norm": 1.6688734292984009, + "learning_rate": 7.800500000000001e-05, + "loss": 0.4178, + "step": 15609 + }, + { + "epoch": 0.874118042333968, + "grad_norm": 1.2702504396438599, + "learning_rate": 7.801000000000001e-05, + "loss": 0.5135, + "step": 15610 + }, + { + "epoch": 0.874174039646097, + "grad_norm": 1.1688611507415771, + "learning_rate": 7.8015e-05, + "loss": 0.3942, + "step": 15611 + }, + { + "epoch": 0.874230036958226, + "grad_norm": 1.407023549079895, + "learning_rate": 7.802e-05, + "loss": 0.4159, + "step": 15612 + }, + { + "epoch": 0.874286034270355, + "grad_norm": 1.5352704524993896, + "learning_rate": 7.8025e-05, + "loss": 0.4463, + "step": 15613 + }, + { + "epoch": 0.874342031582484, + "grad_norm": 1.5974230766296387, + "learning_rate": 7.803e-05, + "loss": 0.4942, + "step": 15614 + }, + { + "epoch": 0.874398028894613, + "grad_norm": 1.779258131980896, + "learning_rate": 7.803500000000001e-05, + "loss": 0.515, + "step": 15615 + }, + { + "epoch": 0.8744540262067421, + "grad_norm": 1.6793160438537598, + "learning_rate": 7.804e-05, + "loss": 0.4746, + "step": 15616 + }, + { + "epoch": 0.8745100235188711, + "grad_norm": 1.193279504776001, + "learning_rate": 7.8045e-05, + "loss": 0.4896, + "step": 15617 + }, + { + "epoch": 0.8745660208310001, + "grad_norm": 1.3777776956558228, + "learning_rate": 7.805e-05, + "loss": 0.5491, + "step": 15618 + }, + { + "epoch": 0.8746220181431291, + "grad_norm": 1.351875901222229, + "learning_rate": 7.8055e-05, + "loss": 0.4955, + "step": 15619 + }, + { + "epoch": 0.8746780154552581, + "grad_norm": 1.1543796062469482, + "learning_rate": 7.806e-05, + "loss": 0.3552, + "step": 15620 + }, + { + "epoch": 0.8747340127673872, + "grad_norm": 
1.2773261070251465, + "learning_rate": 7.8065e-05, + "loss": 0.4395, + "step": 15621 + }, + { + "epoch": 0.8747900100795162, + "grad_norm": 1.5070065259933472, + "learning_rate": 7.807e-05, + "loss": 0.6016, + "step": 15622 + }, + { + "epoch": 0.8748460073916452, + "grad_norm": 1.2362004518508911, + "learning_rate": 7.807500000000001e-05, + "loss": 0.4658, + "step": 15623 + }, + { + "epoch": 0.8749020047037742, + "grad_norm": 1.290040135383606, + "learning_rate": 7.808000000000001e-05, + "loss": 0.5284, + "step": 15624 + }, + { + "epoch": 0.8749580020159032, + "grad_norm": 5.069690704345703, + "learning_rate": 7.808500000000001e-05, + "loss": 0.57, + "step": 15625 + }, + { + "epoch": 0.8750139993280323, + "grad_norm": 1.8353949785232544, + "learning_rate": 7.809e-05, + "loss": 0.6024, + "step": 15626 + }, + { + "epoch": 0.8750699966401613, + "grad_norm": 1.3642324209213257, + "learning_rate": 7.8095e-05, + "loss": 0.3545, + "step": 15627 + }, + { + "epoch": 0.8751259939522903, + "grad_norm": 1.3173775672912598, + "learning_rate": 7.81e-05, + "loss": 0.4785, + "step": 15628 + }, + { + "epoch": 0.8751819912644193, + "grad_norm": 1.51857590675354, + "learning_rate": 7.810500000000001e-05, + "loss": 0.39, + "step": 15629 + }, + { + "epoch": 0.8752379885765483, + "grad_norm": 1.6354382038116455, + "learning_rate": 7.811000000000001e-05, + "loss": 0.4176, + "step": 15630 + }, + { + "epoch": 0.8752939858886774, + "grad_norm": 1.3690122365951538, + "learning_rate": 7.811500000000001e-05, + "loss": 0.4327, + "step": 15631 + }, + { + "epoch": 0.8753499832008064, + "grad_norm": 1.4135335683822632, + "learning_rate": 7.812e-05, + "loss": 0.5097, + "step": 15632 + }, + { + "epoch": 0.8754059805129354, + "grad_norm": 1.1350250244140625, + "learning_rate": 7.8125e-05, + "loss": 0.4193, + "step": 15633 + }, + { + "epoch": 0.8754619778250644, + "grad_norm": 1.281648874282837, + "learning_rate": 7.813e-05, + "loss": 0.37, + "step": 15634 + }, + { + "epoch": 0.8755179751371934, + 
"grad_norm": 1.2919368743896484, + "learning_rate": 7.8135e-05, + "loss": 0.4193, + "step": 15635 + }, + { + "epoch": 0.8755739724493224, + "grad_norm": 1.283706784248352, + "learning_rate": 7.814000000000001e-05, + "loss": 0.4654, + "step": 15636 + }, + { + "epoch": 0.8756299697614515, + "grad_norm": 1.662026047706604, + "learning_rate": 7.8145e-05, + "loss": 0.3832, + "step": 15637 + }, + { + "epoch": 0.8756859670735805, + "grad_norm": 1.2785576581954956, + "learning_rate": 7.815e-05, + "loss": 0.3598, + "step": 15638 + }, + { + "epoch": 0.8757419643857095, + "grad_norm": 1.264336109161377, + "learning_rate": 7.8155e-05, + "loss": 0.4208, + "step": 15639 + }, + { + "epoch": 0.8757979616978385, + "grad_norm": 1.2264667749404907, + "learning_rate": 7.816e-05, + "loss": 0.4188, + "step": 15640 + }, + { + "epoch": 0.8758539590099675, + "grad_norm": 1.6351864337921143, + "learning_rate": 7.8165e-05, + "loss": 0.5137, + "step": 15641 + }, + { + "epoch": 0.8759099563220966, + "grad_norm": 1.2195281982421875, + "learning_rate": 7.817e-05, + "loss": 0.4903, + "step": 15642 + }, + { + "epoch": 0.8759659536342256, + "grad_norm": 1.5257734060287476, + "learning_rate": 7.8175e-05, + "loss": 0.4517, + "step": 15643 + }, + { + "epoch": 0.8760219509463546, + "grad_norm": 1.3661330938339233, + "learning_rate": 7.818000000000001e-05, + "loss": 0.4797, + "step": 15644 + }, + { + "epoch": 0.8760779482584836, + "grad_norm": 1.2897977828979492, + "learning_rate": 7.818500000000001e-05, + "loss": 0.5837, + "step": 15645 + }, + { + "epoch": 0.8761339455706126, + "grad_norm": 1.2548011541366577, + "learning_rate": 7.819000000000001e-05, + "loss": 0.5132, + "step": 15646 + }, + { + "epoch": 0.8761899428827417, + "grad_norm": 1.4747647047042847, + "learning_rate": 7.8195e-05, + "loss": 0.5048, + "step": 15647 + }, + { + "epoch": 0.8762459401948707, + "grad_norm": 1.3684711456298828, + "learning_rate": 7.82e-05, + "loss": 0.4655, + "step": 15648 + }, + { + "epoch": 0.8763019375069997, + 
"grad_norm": 1.2558674812316895, + "learning_rate": 7.8205e-05, + "loss": 0.3309, + "step": 15649 + }, + { + "epoch": 0.8763579348191287, + "grad_norm": 1.528770089149475, + "learning_rate": 7.821000000000001e-05, + "loss": 0.502, + "step": 15650 + }, + { + "epoch": 0.8764139321312577, + "grad_norm": 1.3130013942718506, + "learning_rate": 7.821500000000001e-05, + "loss": 0.4539, + "step": 15651 + }, + { + "epoch": 0.8764699294433868, + "grad_norm": 3.9340603351593018, + "learning_rate": 7.822e-05, + "loss": 0.4584, + "step": 15652 + }, + { + "epoch": 0.8765259267555158, + "grad_norm": 1.3083142042160034, + "learning_rate": 7.8225e-05, + "loss": 0.3608, + "step": 15653 + }, + { + "epoch": 0.8765819240676448, + "grad_norm": 1.1950018405914307, + "learning_rate": 7.823e-05, + "loss": 0.4288, + "step": 15654 + }, + { + "epoch": 0.8766379213797738, + "grad_norm": 1.6428717374801636, + "learning_rate": 7.8235e-05, + "loss": 0.6502, + "step": 15655 + }, + { + "epoch": 0.8766939186919028, + "grad_norm": 1.3999062776565552, + "learning_rate": 7.824e-05, + "loss": 0.5594, + "step": 15656 + }, + { + "epoch": 0.8767499160040318, + "grad_norm": 1.3282313346862793, + "learning_rate": 7.824500000000001e-05, + "loss": 0.4249, + "step": 15657 + }, + { + "epoch": 0.8768059133161609, + "grad_norm": 1.4244219064712524, + "learning_rate": 7.825e-05, + "loss": 0.3722, + "step": 15658 + }, + { + "epoch": 0.8768619106282899, + "grad_norm": 1.458259105682373, + "learning_rate": 7.8255e-05, + "loss": 0.3885, + "step": 15659 + }, + { + "epoch": 0.8769179079404189, + "grad_norm": 1.1726912260055542, + "learning_rate": 7.826e-05, + "loss": 0.4037, + "step": 15660 + }, + { + "epoch": 0.8769739052525479, + "grad_norm": 1.3612245321273804, + "learning_rate": 7.8265e-05, + "loss": 0.4036, + "step": 15661 + }, + { + "epoch": 0.8770299025646769, + "grad_norm": 1.4680825471878052, + "learning_rate": 7.827e-05, + "loss": 0.3367, + "step": 15662 + }, + { + "epoch": 0.877085899876806, + "grad_norm": 
2.020721435546875, + "learning_rate": 7.827499999999999e-05, + "loss": 0.5772, + "step": 15663 + }, + { + "epoch": 0.877141897188935, + "grad_norm": 1.4212771654129028, + "learning_rate": 7.828e-05, + "loss": 0.6183, + "step": 15664 + }, + { + "epoch": 0.877197894501064, + "grad_norm": 1.1750584840774536, + "learning_rate": 7.828500000000001e-05, + "loss": 0.3967, + "step": 15665 + }, + { + "epoch": 0.877253891813193, + "grad_norm": 1.2902772426605225, + "learning_rate": 7.829000000000001e-05, + "loss": 0.4072, + "step": 15666 + }, + { + "epoch": 0.877309889125322, + "grad_norm": 1.3469256162643433, + "learning_rate": 7.829500000000001e-05, + "loss": 0.3589, + "step": 15667 + }, + { + "epoch": 0.877365886437451, + "grad_norm": 1.5020853281021118, + "learning_rate": 7.83e-05, + "loss": 0.6214, + "step": 15668 + }, + { + "epoch": 0.87742188374958, + "grad_norm": 1.1948949098587036, + "learning_rate": 7.8305e-05, + "loss": 0.4081, + "step": 15669 + }, + { + "epoch": 0.877477881061709, + "grad_norm": 1.28453528881073, + "learning_rate": 7.831e-05, + "loss": 0.3609, + "step": 15670 + }, + { + "epoch": 0.877533878373838, + "grad_norm": 1.2847203016281128, + "learning_rate": 7.831500000000001e-05, + "loss": 0.4036, + "step": 15671 + }, + { + "epoch": 0.877589875685967, + "grad_norm": 1.308562994003296, + "learning_rate": 7.832000000000001e-05, + "loss": 0.3373, + "step": 15672 + }, + { + "epoch": 0.877645872998096, + "grad_norm": 1.245770812034607, + "learning_rate": 7.8325e-05, + "loss": 0.4617, + "step": 15673 + }, + { + "epoch": 0.8777018703102251, + "grad_norm": 1.3665430545806885, + "learning_rate": 7.833e-05, + "loss": 0.622, + "step": 15674 + }, + { + "epoch": 0.8777578676223541, + "grad_norm": 1.1770883798599243, + "learning_rate": 7.8335e-05, + "loss": 0.3847, + "step": 15675 + }, + { + "epoch": 0.8778138649344831, + "grad_norm": 1.5456660985946655, + "learning_rate": 7.834e-05, + "loss": 0.3619, + "step": 15676 + }, + { + "epoch": 0.8778698622466121, + 
"grad_norm": 1.3694406747817993, + "learning_rate": 7.8345e-05, + "loss": 0.3838, + "step": 15677 + }, + { + "epoch": 0.8779258595587411, + "grad_norm": 1.4143106937408447, + "learning_rate": 7.835000000000001e-05, + "loss": 0.5196, + "step": 15678 + }, + { + "epoch": 0.8779818568708702, + "grad_norm": 1.2907812595367432, + "learning_rate": 7.8355e-05, + "loss": 0.3555, + "step": 15679 + }, + { + "epoch": 0.8780378541829992, + "grad_norm": 1.3018662929534912, + "learning_rate": 7.836e-05, + "loss": 0.4758, + "step": 15680 + }, + { + "epoch": 0.8780938514951282, + "grad_norm": 1.2399076223373413, + "learning_rate": 7.8365e-05, + "loss": 0.32, + "step": 15681 + }, + { + "epoch": 0.8781498488072572, + "grad_norm": 1.7134053707122803, + "learning_rate": 7.837e-05, + "loss": 0.3929, + "step": 15682 + }, + { + "epoch": 0.8782058461193862, + "grad_norm": 1.286105990409851, + "learning_rate": 7.8375e-05, + "loss": 0.4124, + "step": 15683 + }, + { + "epoch": 0.8782618434315153, + "grad_norm": 1.7660576105117798, + "learning_rate": 7.838e-05, + "loss": 0.4636, + "step": 15684 + }, + { + "epoch": 0.8783178407436443, + "grad_norm": 1.1781138181686401, + "learning_rate": 7.8385e-05, + "loss": 0.5726, + "step": 15685 + }, + { + "epoch": 0.8783738380557733, + "grad_norm": 1.2534667253494263, + "learning_rate": 7.839000000000001e-05, + "loss": 0.3376, + "step": 15686 + }, + { + "epoch": 0.8784298353679023, + "grad_norm": 1.2173240184783936, + "learning_rate": 7.839500000000001e-05, + "loss": 0.4651, + "step": 15687 + }, + { + "epoch": 0.8784858326800313, + "grad_norm": 1.2226678133010864, + "learning_rate": 7.840000000000001e-05, + "loss": 0.5286, + "step": 15688 + }, + { + "epoch": 0.8785418299921604, + "grad_norm": 1.1224699020385742, + "learning_rate": 7.8405e-05, + "loss": 0.5105, + "step": 15689 + }, + { + "epoch": 0.8785978273042894, + "grad_norm": 1.8798848390579224, + "learning_rate": 7.841e-05, + "loss": 0.5881, + "step": 15690 + }, + { + "epoch": 0.8786538246164184, + 
"grad_norm": 1.2383431196212769, + "learning_rate": 7.8415e-05, + "loss": 0.3522, + "step": 15691 + }, + { + "epoch": 0.8787098219285474, + "grad_norm": 1.2047781944274902, + "learning_rate": 7.842e-05, + "loss": 0.3768, + "step": 15692 + }, + { + "epoch": 0.8787658192406764, + "grad_norm": 1.237811803817749, + "learning_rate": 7.842500000000001e-05, + "loss": 0.3701, + "step": 15693 + }, + { + "epoch": 0.8788218165528054, + "grad_norm": 1.475827693939209, + "learning_rate": 7.843e-05, + "loss": 0.4622, + "step": 15694 + }, + { + "epoch": 0.8788778138649345, + "grad_norm": 1.2367972135543823, + "learning_rate": 7.8435e-05, + "loss": 0.4071, + "step": 15695 + }, + { + "epoch": 0.8789338111770635, + "grad_norm": 1.360946774482727, + "learning_rate": 7.844e-05, + "loss": 0.5247, + "step": 15696 + }, + { + "epoch": 0.8789898084891925, + "grad_norm": 1.1919349431991577, + "learning_rate": 7.8445e-05, + "loss": 0.3536, + "step": 15697 + }, + { + "epoch": 0.8790458058013215, + "grad_norm": 1.7997097969055176, + "learning_rate": 7.845e-05, + "loss": 0.6277, + "step": 15698 + }, + { + "epoch": 0.8791018031134505, + "grad_norm": 1.5822625160217285, + "learning_rate": 7.845500000000001e-05, + "loss": 0.6634, + "step": 15699 + }, + { + "epoch": 0.8791578004255796, + "grad_norm": 1.187794804573059, + "learning_rate": 7.846e-05, + "loss": 0.3938, + "step": 15700 + }, + { + "epoch": 0.8792137977377086, + "grad_norm": 1.319350242614746, + "learning_rate": 7.8465e-05, + "loss": 0.3277, + "step": 15701 + }, + { + "epoch": 0.8792697950498376, + "grad_norm": 1.2361481189727783, + "learning_rate": 7.847e-05, + "loss": 0.4237, + "step": 15702 + }, + { + "epoch": 0.8793257923619666, + "grad_norm": 1.2398425340652466, + "learning_rate": 7.8475e-05, + "loss": 0.4074, + "step": 15703 + }, + { + "epoch": 0.8793817896740956, + "grad_norm": 1.4108473062515259, + "learning_rate": 7.848000000000001e-05, + "loss": 0.5114, + "step": 15704 + }, + { + "epoch": 0.8794377869862247, + "grad_norm": 
1.2769901752471924, + "learning_rate": 7.8485e-05, + "loss": 0.5534, + "step": 15705 + }, + { + "epoch": 0.8794937842983537, + "grad_norm": 1.167389154434204, + "learning_rate": 7.849e-05, + "loss": 0.3794, + "step": 15706 + }, + { + "epoch": 0.8795497816104827, + "grad_norm": 1.3811312913894653, + "learning_rate": 7.849500000000001e-05, + "loss": 0.4559, + "step": 15707 + }, + { + "epoch": 0.8796057789226117, + "grad_norm": 1.0768953561782837, + "learning_rate": 7.850000000000001e-05, + "loss": 0.3338, + "step": 15708 + }, + { + "epoch": 0.8796617762347407, + "grad_norm": 1.198590874671936, + "learning_rate": 7.850500000000001e-05, + "loss": 0.3763, + "step": 15709 + }, + { + "epoch": 0.8797177735468698, + "grad_norm": 1.430693507194519, + "learning_rate": 7.851e-05, + "loss": 0.4116, + "step": 15710 + }, + { + "epoch": 0.8797737708589988, + "grad_norm": 1.407012939453125, + "learning_rate": 7.8515e-05, + "loss": 0.4667, + "step": 15711 + }, + { + "epoch": 0.8798297681711278, + "grad_norm": 1.4793479442596436, + "learning_rate": 7.852e-05, + "loss": 0.3616, + "step": 15712 + }, + { + "epoch": 0.8798857654832568, + "grad_norm": 1.2668476104736328, + "learning_rate": 7.8525e-05, + "loss": 0.58, + "step": 15713 + }, + { + "epoch": 0.8799417627953858, + "grad_norm": 1.2861369848251343, + "learning_rate": 7.853000000000001e-05, + "loss": 0.5646, + "step": 15714 + }, + { + "epoch": 0.8799977601075148, + "grad_norm": 1.1182711124420166, + "learning_rate": 7.8535e-05, + "loss": 0.385, + "step": 15715 + }, + { + "epoch": 0.8800537574196439, + "grad_norm": 1.5841948986053467, + "learning_rate": 7.854e-05, + "loss": 0.4527, + "step": 15716 + }, + { + "epoch": 0.8801097547317729, + "grad_norm": 1.2600511312484741, + "learning_rate": 7.8545e-05, + "loss": 0.4581, + "step": 15717 + }, + { + "epoch": 0.8801657520439019, + "grad_norm": 1.081499695777893, + "learning_rate": 7.855e-05, + "loss": 0.363, + "step": 15718 + }, + { + "epoch": 0.8802217493560309, + "grad_norm": 
1.4237945079803467, + "learning_rate": 7.8555e-05, + "loss": 0.5863, + "step": 15719 + }, + { + "epoch": 0.8802777466681599, + "grad_norm": 1.3789386749267578, + "learning_rate": 7.856000000000001e-05, + "loss": 0.3903, + "step": 15720 + }, + { + "epoch": 0.880333743980289, + "grad_norm": 1.7766565084457397, + "learning_rate": 7.8565e-05, + "loss": 0.5534, + "step": 15721 + }, + { + "epoch": 0.880389741292418, + "grad_norm": 1.305375576019287, + "learning_rate": 7.857e-05, + "loss": 0.4963, + "step": 15722 + }, + { + "epoch": 0.880445738604547, + "grad_norm": 1.1422768831253052, + "learning_rate": 7.8575e-05, + "loss": 0.3225, + "step": 15723 + }, + { + "epoch": 0.880501735916676, + "grad_norm": 1.2995232343673706, + "learning_rate": 7.858000000000001e-05, + "loss": 0.3413, + "step": 15724 + }, + { + "epoch": 0.880557733228805, + "grad_norm": 1.1238868236541748, + "learning_rate": 7.858500000000001e-05, + "loss": 0.3428, + "step": 15725 + }, + { + "epoch": 0.8806137305409341, + "grad_norm": 2.277445077896118, + "learning_rate": 7.859e-05, + "loss": 0.4319, + "step": 15726 + }, + { + "epoch": 0.8806697278530631, + "grad_norm": 1.423354983329773, + "learning_rate": 7.8595e-05, + "loss": 0.5173, + "step": 15727 + }, + { + "epoch": 0.8807257251651921, + "grad_norm": 1.0879390239715576, + "learning_rate": 7.860000000000001e-05, + "loss": 0.4703, + "step": 15728 + }, + { + "epoch": 0.8807817224773211, + "grad_norm": 1.377094030380249, + "learning_rate": 7.860500000000001e-05, + "loss": 0.459, + "step": 15729 + }, + { + "epoch": 0.8808377197894501, + "grad_norm": 1.2914865016937256, + "learning_rate": 7.861000000000001e-05, + "loss": 0.4543, + "step": 15730 + }, + { + "epoch": 0.8808937171015792, + "grad_norm": 1.1602896451950073, + "learning_rate": 7.8615e-05, + "loss": 0.3261, + "step": 15731 + }, + { + "epoch": 0.8809497144137082, + "grad_norm": 1.5694911479949951, + "learning_rate": 7.862e-05, + "loss": 0.4, + "step": 15732 + }, + { + "epoch": 0.8810057117258372, + 
"grad_norm": 1.3546191453933716, + "learning_rate": 7.8625e-05, + "loss": 0.3636, + "step": 15733 + }, + { + "epoch": 0.8810617090379662, + "grad_norm": 1.2792226076126099, + "learning_rate": 7.863e-05, + "loss": 0.4535, + "step": 15734 + }, + { + "epoch": 0.8811177063500952, + "grad_norm": 1.7178654670715332, + "learning_rate": 7.863500000000001e-05, + "loss": 0.3848, + "step": 15735 + }, + { + "epoch": 0.8811737036622243, + "grad_norm": 1.3578240871429443, + "learning_rate": 7.864e-05, + "loss": 0.4088, + "step": 15736 + }, + { + "epoch": 0.8812297009743533, + "grad_norm": 1.5478277206420898, + "learning_rate": 7.8645e-05, + "loss": 0.4679, + "step": 15737 + }, + { + "epoch": 0.8812856982864823, + "grad_norm": 1.336410403251648, + "learning_rate": 7.865e-05, + "loss": 0.4307, + "step": 15738 + }, + { + "epoch": 0.8813416955986113, + "grad_norm": 1.3684492111206055, + "learning_rate": 7.8655e-05, + "loss": 0.4762, + "step": 15739 + }, + { + "epoch": 0.8813976929107403, + "grad_norm": 1.430402398109436, + "learning_rate": 7.866e-05, + "loss": 0.6, + "step": 15740 + }, + { + "epoch": 0.8814536902228693, + "grad_norm": 1.1227734088897705, + "learning_rate": 7.866499999999999e-05, + "loss": 0.3133, + "step": 15741 + }, + { + "epoch": 0.8815096875349984, + "grad_norm": 1.2948923110961914, + "learning_rate": 7.867e-05, + "loss": 0.4961, + "step": 15742 + }, + { + "epoch": 0.8815656848471274, + "grad_norm": 1.7092138528823853, + "learning_rate": 7.8675e-05, + "loss": 0.5236, + "step": 15743 + }, + { + "epoch": 0.8816216821592564, + "grad_norm": 1.4647992849349976, + "learning_rate": 7.868000000000001e-05, + "loss": 0.5177, + "step": 15744 + }, + { + "epoch": 0.8816776794713854, + "grad_norm": 1.3417359590530396, + "learning_rate": 7.868500000000001e-05, + "loss": 0.4036, + "step": 15745 + }, + { + "epoch": 0.8817336767835144, + "grad_norm": 1.4374005794525146, + "learning_rate": 7.869000000000001e-05, + "loss": 0.4864, + "step": 15746 + }, + { + "epoch": 
0.8817896740956435, + "grad_norm": 1.2635575532913208, + "learning_rate": 7.8695e-05, + "loss": 0.4443, + "step": 15747 + }, + { + "epoch": 0.8818456714077725, + "grad_norm": 1.4097075462341309, + "learning_rate": 7.87e-05, + "loss": 0.4626, + "step": 15748 + }, + { + "epoch": 0.8819016687199015, + "grad_norm": 1.5398057699203491, + "learning_rate": 7.870500000000001e-05, + "loss": 0.6255, + "step": 15749 + }, + { + "epoch": 0.8819576660320304, + "grad_norm": 1.2846860885620117, + "learning_rate": 7.871000000000001e-05, + "loss": 0.422, + "step": 15750 + }, + { + "epoch": 0.8820136633441594, + "grad_norm": 1.2987191677093506, + "learning_rate": 7.871500000000001e-05, + "loss": 0.4639, + "step": 15751 + }, + { + "epoch": 0.8820696606562884, + "grad_norm": 1.2411824464797974, + "learning_rate": 7.872e-05, + "loss": 0.4454, + "step": 15752 + }, + { + "epoch": 0.8821256579684175, + "grad_norm": 1.18653404712677, + "learning_rate": 7.8725e-05, + "loss": 0.4113, + "step": 15753 + }, + { + "epoch": 0.8821816552805465, + "grad_norm": 1.386753797531128, + "learning_rate": 7.873e-05, + "loss": 0.5086, + "step": 15754 + }, + { + "epoch": 0.8822376525926755, + "grad_norm": 1.205653429031372, + "learning_rate": 7.8735e-05, + "loss": 0.4265, + "step": 15755 + }, + { + "epoch": 0.8822936499048045, + "grad_norm": 1.688432216644287, + "learning_rate": 7.874000000000001e-05, + "loss": 0.531, + "step": 15756 + }, + { + "epoch": 0.8823496472169335, + "grad_norm": 1.3085002899169922, + "learning_rate": 7.8745e-05, + "loss": 0.4349, + "step": 15757 + }, + { + "epoch": 0.8824056445290626, + "grad_norm": 1.0334367752075195, + "learning_rate": 7.875e-05, + "loss": 0.4277, + "step": 15758 + }, + { + "epoch": 0.8824616418411916, + "grad_norm": 1.3466542959213257, + "learning_rate": 7.8755e-05, + "loss": 0.4581, + "step": 15759 + }, + { + "epoch": 0.8825176391533206, + "grad_norm": 1.5849618911743164, + "learning_rate": 7.876e-05, + "loss": 0.5612, + "step": 15760 + }, + { + "epoch": 
0.8825736364654496, + "grad_norm": 1.3699510097503662, + "learning_rate": 7.8765e-05, + "loss": 0.4414, + "step": 15761 + }, + { + "epoch": 0.8826296337775786, + "grad_norm": 1.2993191480636597, + "learning_rate": 7.876999999999999e-05, + "loss": 0.4806, + "step": 15762 + }, + { + "epoch": 0.8826856310897077, + "grad_norm": 1.302834153175354, + "learning_rate": 7.8775e-05, + "loss": 0.3442, + "step": 15763 + }, + { + "epoch": 0.8827416284018367, + "grad_norm": 1.1885366439819336, + "learning_rate": 7.878e-05, + "loss": 0.4186, + "step": 15764 + }, + { + "epoch": 0.8827976257139657, + "grad_norm": 2.4305038452148438, + "learning_rate": 7.878500000000001e-05, + "loss": 0.786, + "step": 15765 + }, + { + "epoch": 0.8828536230260947, + "grad_norm": 1.4897726774215698, + "learning_rate": 7.879000000000001e-05, + "loss": 0.5225, + "step": 15766 + }, + { + "epoch": 0.8829096203382237, + "grad_norm": 1.3332767486572266, + "learning_rate": 7.879500000000001e-05, + "loss": 0.4555, + "step": 15767 + }, + { + "epoch": 0.8829656176503528, + "grad_norm": 1.3837586641311646, + "learning_rate": 7.88e-05, + "loss": 0.5187, + "step": 15768 + }, + { + "epoch": 0.8830216149624818, + "grad_norm": 1.3469719886779785, + "learning_rate": 7.8805e-05, + "loss": 0.3342, + "step": 15769 + }, + { + "epoch": 0.8830776122746108, + "grad_norm": 1.5540140867233276, + "learning_rate": 7.881e-05, + "loss": 0.4922, + "step": 15770 + }, + { + "epoch": 0.8831336095867398, + "grad_norm": 1.3087104558944702, + "learning_rate": 7.881500000000001e-05, + "loss": 0.4845, + "step": 15771 + }, + { + "epoch": 0.8831896068988688, + "grad_norm": 1.4179688692092896, + "learning_rate": 7.882000000000001e-05, + "loss": 0.422, + "step": 15772 + }, + { + "epoch": 0.8832456042109978, + "grad_norm": 1.3407858610153198, + "learning_rate": 7.8825e-05, + "loss": 0.5993, + "step": 15773 + }, + { + "epoch": 0.8833016015231269, + "grad_norm": 1.298976182937622, + "learning_rate": 7.883e-05, + "loss": 0.6, + "step": 15774 + }, 
+ { + "epoch": 0.8833575988352559, + "grad_norm": 2.641345739364624, + "learning_rate": 7.8835e-05, + "loss": 0.4339, + "step": 15775 + }, + { + "epoch": 0.8834135961473849, + "grad_norm": 1.4696484804153442, + "learning_rate": 7.884e-05, + "loss": 0.7233, + "step": 15776 + }, + { + "epoch": 0.8834695934595139, + "grad_norm": 1.3720401525497437, + "learning_rate": 7.884500000000001e-05, + "loss": 0.4915, + "step": 15777 + }, + { + "epoch": 0.8835255907716429, + "grad_norm": 1.4552339315414429, + "learning_rate": 7.885e-05, + "loss": 0.5732, + "step": 15778 + }, + { + "epoch": 0.883581588083772, + "grad_norm": 1.0989445447921753, + "learning_rate": 7.8855e-05, + "loss": 0.4422, + "step": 15779 + }, + { + "epoch": 0.883637585395901, + "grad_norm": 1.638524055480957, + "learning_rate": 7.886e-05, + "loss": 0.6128, + "step": 15780 + }, + { + "epoch": 0.88369358270803, + "grad_norm": 1.354801893234253, + "learning_rate": 7.8865e-05, + "loss": 0.4369, + "step": 15781 + }, + { + "epoch": 0.883749580020159, + "grad_norm": 1.3780988454818726, + "learning_rate": 7.887e-05, + "loss": 0.5277, + "step": 15782 + }, + { + "epoch": 0.883805577332288, + "grad_norm": 2.5515425205230713, + "learning_rate": 7.887499999999999e-05, + "loss": 0.4847, + "step": 15783 + }, + { + "epoch": 0.8838615746444171, + "grad_norm": 1.3097612857818604, + "learning_rate": 7.888e-05, + "loss": 0.489, + "step": 15784 + }, + { + "epoch": 0.8839175719565461, + "grad_norm": 1.5259464979171753, + "learning_rate": 7.888500000000001e-05, + "loss": 0.6229, + "step": 15785 + }, + { + "epoch": 0.8839735692686751, + "grad_norm": 1.48321533203125, + "learning_rate": 7.889000000000001e-05, + "loss": 0.4613, + "step": 15786 + }, + { + "epoch": 0.8840295665808041, + "grad_norm": 1.4632923603057861, + "learning_rate": 7.889500000000001e-05, + "loss": 0.5773, + "step": 15787 + }, + { + "epoch": 0.8840855638929331, + "grad_norm": 1.2408758401870728, + "learning_rate": 7.890000000000001e-05, + "loss": 0.4215, + "step": 
15788 + }, + { + "epoch": 0.8841415612050622, + "grad_norm": 1.5239912271499634, + "learning_rate": 7.8905e-05, + "loss": 0.404, + "step": 15789 + }, + { + "epoch": 0.8841975585171912, + "grad_norm": 1.1883434057235718, + "learning_rate": 7.891e-05, + "loss": 0.4913, + "step": 15790 + }, + { + "epoch": 0.8842535558293202, + "grad_norm": 1.1972849369049072, + "learning_rate": 7.8915e-05, + "loss": 0.3678, + "step": 15791 + }, + { + "epoch": 0.8843095531414492, + "grad_norm": 1.5056698322296143, + "learning_rate": 7.892000000000001e-05, + "loss": 0.4903, + "step": 15792 + }, + { + "epoch": 0.8843655504535782, + "grad_norm": 1.1644717454910278, + "learning_rate": 7.892500000000001e-05, + "loss": 0.3522, + "step": 15793 + }, + { + "epoch": 0.8844215477657072, + "grad_norm": 1.3911675214767456, + "learning_rate": 7.893e-05, + "loss": 0.4528, + "step": 15794 + }, + { + "epoch": 0.8844775450778363, + "grad_norm": 1.78517484664917, + "learning_rate": 7.8935e-05, + "loss": 0.6006, + "step": 15795 + }, + { + "epoch": 0.8845335423899653, + "grad_norm": 1.4201488494873047, + "learning_rate": 7.894e-05, + "loss": 0.4007, + "step": 15796 + }, + { + "epoch": 0.8845895397020943, + "grad_norm": 1.1176270246505737, + "learning_rate": 7.8945e-05, + "loss": 0.3761, + "step": 15797 + }, + { + "epoch": 0.8846455370142233, + "grad_norm": 1.5850352048873901, + "learning_rate": 7.895000000000001e-05, + "loss": 0.5135, + "step": 15798 + }, + { + "epoch": 0.8847015343263523, + "grad_norm": 1.4305088520050049, + "learning_rate": 7.8955e-05, + "loss": 0.5224, + "step": 15799 + }, + { + "epoch": 0.8847575316384814, + "grad_norm": 1.3241698741912842, + "learning_rate": 7.896e-05, + "loss": 0.4852, + "step": 15800 + }, + { + "epoch": 0.8848135289506104, + "grad_norm": 1.5602823495864868, + "learning_rate": 7.8965e-05, + "loss": 0.5188, + "step": 15801 + }, + { + "epoch": 0.8848695262627394, + "grad_norm": 1.3524249792099, + "learning_rate": 7.897e-05, + "loss": 0.462, + "step": 15802 + }, + { + 
"epoch": 0.8849255235748684, + "grad_norm": 1.0575517416000366, + "learning_rate": 7.8975e-05, + "loss": 0.3493, + "step": 15803 + }, + { + "epoch": 0.8849815208869974, + "grad_norm": 1.414480447769165, + "learning_rate": 7.897999999999999e-05, + "loss": 0.4526, + "step": 15804 + }, + { + "epoch": 0.8850375181991265, + "grad_norm": 1.4470492601394653, + "learning_rate": 7.8985e-05, + "loss": 0.3683, + "step": 15805 + }, + { + "epoch": 0.8850935155112555, + "grad_norm": 1.4290904998779297, + "learning_rate": 7.899000000000001e-05, + "loss": 0.5473, + "step": 15806 + }, + { + "epoch": 0.8851495128233845, + "grad_norm": 1.4914425611495972, + "learning_rate": 7.899500000000001e-05, + "loss": 0.5157, + "step": 15807 + }, + { + "epoch": 0.8852055101355135, + "grad_norm": 2.0020601749420166, + "learning_rate": 7.900000000000001e-05, + "loss": 0.5264, + "step": 15808 + }, + { + "epoch": 0.8852615074476425, + "grad_norm": 1.4715309143066406, + "learning_rate": 7.9005e-05, + "loss": 0.5628, + "step": 15809 + }, + { + "epoch": 0.8853175047597716, + "grad_norm": 1.3492498397827148, + "learning_rate": 7.901e-05, + "loss": 0.5557, + "step": 15810 + }, + { + "epoch": 0.8853735020719006, + "grad_norm": 1.3634693622589111, + "learning_rate": 7.9015e-05, + "loss": 0.4594, + "step": 15811 + }, + { + "epoch": 0.8854294993840296, + "grad_norm": 1.4548414945602417, + "learning_rate": 7.902e-05, + "loss": 0.3877, + "step": 15812 + }, + { + "epoch": 0.8854854966961586, + "grad_norm": 1.302975058555603, + "learning_rate": 7.902500000000001e-05, + "loss": 0.4062, + "step": 15813 + }, + { + "epoch": 0.8855414940082876, + "grad_norm": 1.6429636478424072, + "learning_rate": 7.903000000000001e-05, + "loss": 0.644, + "step": 15814 + }, + { + "epoch": 0.8855974913204167, + "grad_norm": 1.5866872072219849, + "learning_rate": 7.9035e-05, + "loss": 0.5736, + "step": 15815 + }, + { + "epoch": 0.8856534886325457, + "grad_norm": 1.7080777883529663, + "learning_rate": 7.904e-05, + "loss": 0.529, + 
"step": 15816 + }, + { + "epoch": 0.8857094859446747, + "grad_norm": 1.3886007070541382, + "learning_rate": 7.9045e-05, + "loss": 0.3237, + "step": 15817 + }, + { + "epoch": 0.8857654832568037, + "grad_norm": 1.5394459962844849, + "learning_rate": 7.905e-05, + "loss": 0.4803, + "step": 15818 + }, + { + "epoch": 0.8858214805689327, + "grad_norm": 1.3017991781234741, + "learning_rate": 7.905500000000001e-05, + "loss": 0.4171, + "step": 15819 + }, + { + "epoch": 0.8858774778810617, + "grad_norm": 1.491532802581787, + "learning_rate": 7.906e-05, + "loss": 0.5411, + "step": 15820 + }, + { + "epoch": 0.8859334751931908, + "grad_norm": 1.353472113609314, + "learning_rate": 7.9065e-05, + "loss": 0.4454, + "step": 15821 + }, + { + "epoch": 0.8859894725053198, + "grad_norm": 18.59945297241211, + "learning_rate": 7.907e-05, + "loss": 0.472, + "step": 15822 + }, + { + "epoch": 0.8860454698174488, + "grad_norm": 1.55799400806427, + "learning_rate": 7.9075e-05, + "loss": 0.514, + "step": 15823 + }, + { + "epoch": 0.8861014671295778, + "grad_norm": 1.2203803062438965, + "learning_rate": 7.908e-05, + "loss": 0.4197, + "step": 15824 + }, + { + "epoch": 0.8861574644417068, + "grad_norm": 1.3588980436325073, + "learning_rate": 7.9085e-05, + "loss": 0.3994, + "step": 15825 + }, + { + "epoch": 0.8862134617538359, + "grad_norm": 1.366632342338562, + "learning_rate": 7.909e-05, + "loss": 0.4418, + "step": 15826 + }, + { + "epoch": 0.8862694590659649, + "grad_norm": 1.4658832550048828, + "learning_rate": 7.909500000000001e-05, + "loss": 0.5261, + "step": 15827 + }, + { + "epoch": 0.8863254563780939, + "grad_norm": 1.6701146364212036, + "learning_rate": 7.910000000000001e-05, + "loss": 0.4231, + "step": 15828 + }, + { + "epoch": 0.8863814536902229, + "grad_norm": 1.3779981136322021, + "learning_rate": 7.910500000000001e-05, + "loss": 0.5005, + "step": 15829 + }, + { + "epoch": 0.8864374510023519, + "grad_norm": 1.3617491722106934, + "learning_rate": 7.911e-05, + "loss": 0.4605, + "step": 
15830 + }, + { + "epoch": 0.886493448314481, + "grad_norm": 1.4876869916915894, + "learning_rate": 7.9115e-05, + "loss": 0.5499, + "step": 15831 + }, + { + "epoch": 0.88654944562661, + "grad_norm": 1.2605429887771606, + "learning_rate": 7.912e-05, + "loss": 0.3691, + "step": 15832 + }, + { + "epoch": 0.8866054429387389, + "grad_norm": 1.374754548072815, + "learning_rate": 7.9125e-05, + "loss": 0.395, + "step": 15833 + }, + { + "epoch": 0.8866614402508679, + "grad_norm": 1.4355651140213013, + "learning_rate": 7.913000000000001e-05, + "loss": 0.4694, + "step": 15834 + }, + { + "epoch": 0.8867174375629969, + "grad_norm": 2.087085723876953, + "learning_rate": 7.913500000000001e-05, + "loss": 0.4419, + "step": 15835 + }, + { + "epoch": 0.8867734348751259, + "grad_norm": 1.4105737209320068, + "learning_rate": 7.914e-05, + "loss": 0.4611, + "step": 15836 + }, + { + "epoch": 0.886829432187255, + "grad_norm": 1.5528287887573242, + "learning_rate": 7.9145e-05, + "loss": 0.4813, + "step": 15837 + }, + { + "epoch": 0.886885429499384, + "grad_norm": 1.5225160121917725, + "learning_rate": 7.915e-05, + "loss": 0.3797, + "step": 15838 + }, + { + "epoch": 0.886941426811513, + "grad_norm": 1.4016565084457397, + "learning_rate": 7.9155e-05, + "loss": 0.436, + "step": 15839 + }, + { + "epoch": 0.886997424123642, + "grad_norm": 1.6190400123596191, + "learning_rate": 7.916e-05, + "loss": 0.6496, + "step": 15840 + }, + { + "epoch": 0.887053421435771, + "grad_norm": 1.536169171333313, + "learning_rate": 7.9165e-05, + "loss": 0.491, + "step": 15841 + }, + { + "epoch": 0.8871094187479001, + "grad_norm": 1.1836466789245605, + "learning_rate": 7.917e-05, + "loss": 0.3229, + "step": 15842 + }, + { + "epoch": 0.8871654160600291, + "grad_norm": 1.2826460599899292, + "learning_rate": 7.9175e-05, + "loss": 0.3797, + "step": 15843 + }, + { + "epoch": 0.8872214133721581, + "grad_norm": 1.3410890102386475, + "learning_rate": 7.918e-05, + "loss": 0.4594, + "step": 15844 + }, + { + "epoch": 
0.8872774106842871, + "grad_norm": 1.5151267051696777, + "learning_rate": 7.918500000000001e-05, + "loss": 0.6094, + "step": 15845 + }, + { + "epoch": 0.8873334079964161, + "grad_norm": 1.3532662391662598, + "learning_rate": 7.919e-05, + "loss": 0.552, + "step": 15846 + }, + { + "epoch": 0.8873894053085452, + "grad_norm": 1.1911065578460693, + "learning_rate": 7.9195e-05, + "loss": 0.3841, + "step": 15847 + }, + { + "epoch": 0.8874454026206742, + "grad_norm": 1.689738392829895, + "learning_rate": 7.920000000000001e-05, + "loss": 0.4415, + "step": 15848 + }, + { + "epoch": 0.8875013999328032, + "grad_norm": 1.3184703588485718, + "learning_rate": 7.920500000000001e-05, + "loss": 0.4592, + "step": 15849 + }, + { + "epoch": 0.8875573972449322, + "grad_norm": 1.3035863637924194, + "learning_rate": 7.921000000000001e-05, + "loss": 0.3915, + "step": 15850 + }, + { + "epoch": 0.8876133945570612, + "grad_norm": 1.5418294668197632, + "learning_rate": 7.9215e-05, + "loss": 0.4651, + "step": 15851 + }, + { + "epoch": 0.8876693918691902, + "grad_norm": 1.2926342487335205, + "learning_rate": 7.922e-05, + "loss": 0.3957, + "step": 15852 + }, + { + "epoch": 0.8877253891813193, + "grad_norm": 1.4467759132385254, + "learning_rate": 7.9225e-05, + "loss": 0.4977, + "step": 15853 + }, + { + "epoch": 0.8877813864934483, + "grad_norm": 1.3778395652770996, + "learning_rate": 7.923e-05, + "loss": 0.4759, + "step": 15854 + }, + { + "epoch": 0.8878373838055773, + "grad_norm": 1.3714171648025513, + "learning_rate": 7.923500000000001e-05, + "loss": 0.3592, + "step": 15855 + }, + { + "epoch": 0.8878933811177063, + "grad_norm": 1.3562978506088257, + "learning_rate": 7.924000000000001e-05, + "loss": 0.4602, + "step": 15856 + }, + { + "epoch": 0.8879493784298353, + "grad_norm": 1.3002963066101074, + "learning_rate": 7.9245e-05, + "loss": 0.4336, + "step": 15857 + }, + { + "epoch": 0.8880053757419644, + "grad_norm": 1.0840893983840942, + "learning_rate": 7.925e-05, + "loss": 0.4134, + "step": 15858 
+ }, + { + "epoch": 0.8880613730540934, + "grad_norm": 1.65232515335083, + "learning_rate": 7.9255e-05, + "loss": 0.6223, + "step": 15859 + }, + { + "epoch": 0.8881173703662224, + "grad_norm": 1.4145983457565308, + "learning_rate": 7.926e-05, + "loss": 0.5579, + "step": 15860 + }, + { + "epoch": 0.8881733676783514, + "grad_norm": 1.2491377592086792, + "learning_rate": 7.9265e-05, + "loss": 0.4983, + "step": 15861 + }, + { + "epoch": 0.8882293649904804, + "grad_norm": 1.2925677299499512, + "learning_rate": 7.927e-05, + "loss": 0.4141, + "step": 15862 + }, + { + "epoch": 0.8882853623026095, + "grad_norm": 1.2815871238708496, + "learning_rate": 7.9275e-05, + "loss": 0.3611, + "step": 15863 + }, + { + "epoch": 0.8883413596147385, + "grad_norm": 1.209580421447754, + "learning_rate": 7.928e-05, + "loss": 0.3394, + "step": 15864 + }, + { + "epoch": 0.8883973569268675, + "grad_norm": 1.3125176429748535, + "learning_rate": 7.928500000000001e-05, + "loss": 0.4889, + "step": 15865 + }, + { + "epoch": 0.8884533542389965, + "grad_norm": 1.1995253562927246, + "learning_rate": 7.929000000000001e-05, + "loss": 0.4111, + "step": 15866 + }, + { + "epoch": 0.8885093515511255, + "grad_norm": 1.5066097974777222, + "learning_rate": 7.9295e-05, + "loss": 0.3786, + "step": 15867 + }, + { + "epoch": 0.8885653488632546, + "grad_norm": 1.250152349472046, + "learning_rate": 7.93e-05, + "loss": 0.3288, + "step": 15868 + }, + { + "epoch": 0.8886213461753836, + "grad_norm": 1.4152107238769531, + "learning_rate": 7.9305e-05, + "loss": 0.5992, + "step": 15869 + }, + { + "epoch": 0.8886773434875126, + "grad_norm": 2.087266683578491, + "learning_rate": 7.931000000000001e-05, + "loss": 0.4672, + "step": 15870 + }, + { + "epoch": 0.8887333407996416, + "grad_norm": 1.2819197177886963, + "learning_rate": 7.931500000000001e-05, + "loss": 0.3629, + "step": 15871 + }, + { + "epoch": 0.8887893381117706, + "grad_norm": 1.2254847288131714, + "learning_rate": 7.932e-05, + "loss": 0.4562, + "step": 15872 + }, + 
{ + "epoch": 0.8888453354238997, + "grad_norm": 1.9349656105041504, + "learning_rate": 7.9325e-05, + "loss": 0.7662, + "step": 15873 + }, + { + "epoch": 0.8889013327360287, + "grad_norm": 3.4781572818756104, + "learning_rate": 7.933e-05, + "loss": 0.4305, + "step": 15874 + }, + { + "epoch": 0.8889573300481577, + "grad_norm": 1.4061557054519653, + "learning_rate": 7.9335e-05, + "loss": 0.5388, + "step": 15875 + }, + { + "epoch": 0.8890133273602867, + "grad_norm": 1.6819357872009277, + "learning_rate": 7.934000000000001e-05, + "loss": 0.5555, + "step": 15876 + }, + { + "epoch": 0.8890693246724157, + "grad_norm": 1.3523751497268677, + "learning_rate": 7.934500000000001e-05, + "loss": 0.3901, + "step": 15877 + }, + { + "epoch": 0.8891253219845447, + "grad_norm": 1.488936185836792, + "learning_rate": 7.935e-05, + "loss": 0.6156, + "step": 15878 + }, + { + "epoch": 0.8891813192966738, + "grad_norm": 1.2007721662521362, + "learning_rate": 7.9355e-05, + "loss": 0.3833, + "step": 15879 + }, + { + "epoch": 0.8892373166088028, + "grad_norm": 1.336026668548584, + "learning_rate": 7.936e-05, + "loss": 0.5105, + "step": 15880 + }, + { + "epoch": 0.8892933139209318, + "grad_norm": 1.1636326313018799, + "learning_rate": 7.9365e-05, + "loss": 0.4252, + "step": 15881 + }, + { + "epoch": 0.8893493112330608, + "grad_norm": 1.8537273406982422, + "learning_rate": 7.937e-05, + "loss": 0.4205, + "step": 15882 + }, + { + "epoch": 0.8894053085451898, + "grad_norm": 1.2801110744476318, + "learning_rate": 7.9375e-05, + "loss": 0.6122, + "step": 15883 + }, + { + "epoch": 0.8894613058573189, + "grad_norm": 1.5170996189117432, + "learning_rate": 7.938e-05, + "loss": 0.439, + "step": 15884 + }, + { + "epoch": 0.8895173031694479, + "grad_norm": 1.350251317024231, + "learning_rate": 7.9385e-05, + "loss": 0.4352, + "step": 15885 + }, + { + "epoch": 0.8895733004815769, + "grad_norm": 1.1521936655044556, + "learning_rate": 7.939000000000001e-05, + "loss": 0.4359, + "step": 15886 + }, + { + "epoch": 
0.8896292977937059, + "grad_norm": 1.2937862873077393, + "learning_rate": 7.939500000000001e-05, + "loss": 0.3774, + "step": 15887 + }, + { + "epoch": 0.8896852951058349, + "grad_norm": 1.3156973123550415, + "learning_rate": 7.94e-05, + "loss": 0.4469, + "step": 15888 + }, + { + "epoch": 0.889741292417964, + "grad_norm": 1.879453420639038, + "learning_rate": 7.9405e-05, + "loss": 0.5854, + "step": 15889 + }, + { + "epoch": 0.889797289730093, + "grad_norm": 18.626432418823242, + "learning_rate": 7.941e-05, + "loss": 0.4712, + "step": 15890 + }, + { + "epoch": 0.889853287042222, + "grad_norm": 1.63935387134552, + "learning_rate": 7.941500000000001e-05, + "loss": 0.435, + "step": 15891 + }, + { + "epoch": 0.889909284354351, + "grad_norm": 1.7789877653121948, + "learning_rate": 7.942000000000001e-05, + "loss": 0.4656, + "step": 15892 + }, + { + "epoch": 0.88996528166648, + "grad_norm": 1.30344557762146, + "learning_rate": 7.9425e-05, + "loss": 0.4466, + "step": 15893 + }, + { + "epoch": 0.890021278978609, + "grad_norm": 1.3301328420639038, + "learning_rate": 7.943e-05, + "loss": 0.4277, + "step": 15894 + }, + { + "epoch": 0.8900772762907381, + "grad_norm": 1.4361047744750977, + "learning_rate": 7.9435e-05, + "loss": 0.4576, + "step": 15895 + }, + { + "epoch": 0.8901332736028671, + "grad_norm": 1.6864228248596191, + "learning_rate": 7.944e-05, + "loss": 0.3797, + "step": 15896 + }, + { + "epoch": 0.8901892709149961, + "grad_norm": 1.6416587829589844, + "learning_rate": 7.944500000000001e-05, + "loss": 0.4188, + "step": 15897 + }, + { + "epoch": 0.8902452682271251, + "grad_norm": 1.5706303119659424, + "learning_rate": 7.945e-05, + "loss": 0.4691, + "step": 15898 + }, + { + "epoch": 0.8903012655392541, + "grad_norm": 1.7088900804519653, + "learning_rate": 7.9455e-05, + "loss": 0.6489, + "step": 15899 + }, + { + "epoch": 0.8903572628513832, + "grad_norm": 1.3953651189804077, + "learning_rate": 7.946e-05, + "loss": 0.4973, + "step": 15900 + }, + { + "epoch": 
0.8904132601635122, + "grad_norm": 1.3055979013442993, + "learning_rate": 7.9465e-05, + "loss": 0.3984, + "step": 15901 + }, + { + "epoch": 0.8904692574756412, + "grad_norm": 1.4346344470977783, + "learning_rate": 7.947e-05, + "loss": 0.5526, + "step": 15902 + }, + { + "epoch": 0.8905252547877702, + "grad_norm": 1.500628113746643, + "learning_rate": 7.9475e-05, + "loss": 0.4486, + "step": 15903 + }, + { + "epoch": 0.8905812520998992, + "grad_norm": 1.1303764581680298, + "learning_rate": 7.948e-05, + "loss": 0.3409, + "step": 15904 + }, + { + "epoch": 0.8906372494120283, + "grad_norm": 1.253481149673462, + "learning_rate": 7.9485e-05, + "loss": 0.4313, + "step": 15905 + }, + { + "epoch": 0.8906932467241573, + "grad_norm": 1.2080345153808594, + "learning_rate": 7.949000000000001e-05, + "loss": 0.4027, + "step": 15906 + }, + { + "epoch": 0.8907492440362863, + "grad_norm": 1.382400393486023, + "learning_rate": 7.949500000000001e-05, + "loss": 0.4287, + "step": 15907 + }, + { + "epoch": 0.8908052413484153, + "grad_norm": 1.2246397733688354, + "learning_rate": 7.950000000000001e-05, + "loss": 0.4408, + "step": 15908 + }, + { + "epoch": 0.8908612386605443, + "grad_norm": 1.305453896522522, + "learning_rate": 7.9505e-05, + "loss": 0.4523, + "step": 15909 + }, + { + "epoch": 0.8909172359726734, + "grad_norm": 1.3066540956497192, + "learning_rate": 7.951e-05, + "loss": 0.3914, + "step": 15910 + }, + { + "epoch": 0.8909732332848024, + "grad_norm": 1.8098177909851074, + "learning_rate": 7.9515e-05, + "loss": 0.5402, + "step": 15911 + }, + { + "epoch": 0.8910292305969314, + "grad_norm": 1.3305491209030151, + "learning_rate": 7.952000000000001e-05, + "loss": 0.4655, + "step": 15912 + }, + { + "epoch": 0.8910852279090604, + "grad_norm": 1.63957941532135, + "learning_rate": 7.952500000000001e-05, + "loss": 0.3788, + "step": 15913 + }, + { + "epoch": 0.8911412252211894, + "grad_norm": 1.2962210178375244, + "learning_rate": 7.953e-05, + "loss": 0.5002, + "step": 15914 + }, + { + 
"epoch": 0.8911972225333185, + "grad_norm": 1.2458339929580688, + "learning_rate": 7.9535e-05, + "loss": 0.3197, + "step": 15915 + }, + { + "epoch": 0.8912532198454474, + "grad_norm": 1.050899863243103, + "learning_rate": 7.954e-05, + "loss": 0.3315, + "step": 15916 + }, + { + "epoch": 0.8913092171575764, + "grad_norm": 1.336843490600586, + "learning_rate": 7.9545e-05, + "loss": 0.4845, + "step": 15917 + }, + { + "epoch": 0.8913652144697054, + "grad_norm": 1.130553126335144, + "learning_rate": 7.955e-05, + "loss": 0.3586, + "step": 15918 + }, + { + "epoch": 0.8914212117818344, + "grad_norm": 1.126340389251709, + "learning_rate": 7.9555e-05, + "loss": 0.3741, + "step": 15919 + }, + { + "epoch": 0.8914772090939634, + "grad_norm": 1.239076018333435, + "learning_rate": 7.956e-05, + "loss": 0.4854, + "step": 15920 + }, + { + "epoch": 0.8915332064060925, + "grad_norm": 29.588964462280273, + "learning_rate": 7.9565e-05, + "loss": 0.4257, + "step": 15921 + }, + { + "epoch": 0.8915892037182215, + "grad_norm": 1.1070128679275513, + "learning_rate": 7.957e-05, + "loss": 0.3545, + "step": 15922 + }, + { + "epoch": 0.8916452010303505, + "grad_norm": 1.6583384275436401, + "learning_rate": 7.9575e-05, + "loss": 0.4977, + "step": 15923 + }, + { + "epoch": 0.8917011983424795, + "grad_norm": 1.3087083101272583, + "learning_rate": 7.958e-05, + "loss": 0.5173, + "step": 15924 + }, + { + "epoch": 0.8917571956546085, + "grad_norm": 1.3332713842391968, + "learning_rate": 7.9585e-05, + "loss": 0.5326, + "step": 15925 + }, + { + "epoch": 0.8918131929667376, + "grad_norm": 1.3929810523986816, + "learning_rate": 7.959000000000002e-05, + "loss": 0.4613, + "step": 15926 + }, + { + "epoch": 0.8918691902788666, + "grad_norm": 1.5982316732406616, + "learning_rate": 7.959500000000001e-05, + "loss": 0.4301, + "step": 15927 + }, + { + "epoch": 0.8919251875909956, + "grad_norm": 1.7531770467758179, + "learning_rate": 7.960000000000001e-05, + "loss": 0.4654, + "step": 15928 + }, + { + "epoch": 
0.8919811849031246, + "grad_norm": 1.43354070186615, + "learning_rate": 7.960500000000001e-05, + "loss": 0.4953, + "step": 15929 + }, + { + "epoch": 0.8920371822152536, + "grad_norm": 1.5353807210922241, + "learning_rate": 7.961e-05, + "loss": 0.4647, + "step": 15930 + }, + { + "epoch": 0.8920931795273827, + "grad_norm": 1.599859356880188, + "learning_rate": 7.9615e-05, + "loss": 0.4708, + "step": 15931 + }, + { + "epoch": 0.8921491768395117, + "grad_norm": 1.3043773174285889, + "learning_rate": 7.962e-05, + "loss": 0.4345, + "step": 15932 + }, + { + "epoch": 0.8922051741516407, + "grad_norm": 1.63169527053833, + "learning_rate": 7.962500000000001e-05, + "loss": 0.4663, + "step": 15933 + }, + { + "epoch": 0.8922611714637697, + "grad_norm": 2.183722734451294, + "learning_rate": 7.963000000000001e-05, + "loss": 0.4604, + "step": 15934 + }, + { + "epoch": 0.8923171687758987, + "grad_norm": 1.592522144317627, + "learning_rate": 7.9635e-05, + "loss": 0.5142, + "step": 15935 + }, + { + "epoch": 0.8923731660880277, + "grad_norm": 1.4822566509246826, + "learning_rate": 7.964e-05, + "loss": 0.5215, + "step": 15936 + }, + { + "epoch": 0.8924291634001568, + "grad_norm": 1.3690813779830933, + "learning_rate": 7.9645e-05, + "loss": 0.5041, + "step": 15937 + }, + { + "epoch": 0.8924851607122858, + "grad_norm": 1.424587607383728, + "learning_rate": 7.965e-05, + "loss": 0.4089, + "step": 15938 + }, + { + "epoch": 0.8925411580244148, + "grad_norm": 1.2622556686401367, + "learning_rate": 7.9655e-05, + "loss": 0.3578, + "step": 15939 + }, + { + "epoch": 0.8925971553365438, + "grad_norm": 1.181199550628662, + "learning_rate": 7.966e-05, + "loss": 0.3952, + "step": 15940 + }, + { + "epoch": 0.8926531526486728, + "grad_norm": 1.1826465129852295, + "learning_rate": 7.9665e-05, + "loss": 0.4911, + "step": 15941 + }, + { + "epoch": 0.8927091499608019, + "grad_norm": 1.2569098472595215, + "learning_rate": 7.967e-05, + "loss": 0.4143, + "step": 15942 + }, + { + "epoch": 0.8927651472729309, + 
"grad_norm": 1.435709834098816, + "learning_rate": 7.9675e-05, + "loss": 0.4349, + "step": 15943 + }, + { + "epoch": 0.8928211445850599, + "grad_norm": 1.2740654945373535, + "learning_rate": 7.968e-05, + "loss": 0.425, + "step": 15944 + }, + { + "epoch": 0.8928771418971889, + "grad_norm": 1.3047778606414795, + "learning_rate": 7.9685e-05, + "loss": 0.3786, + "step": 15945 + }, + { + "epoch": 0.8929331392093179, + "grad_norm": 1.2718031406402588, + "learning_rate": 7.969e-05, + "loss": 0.3694, + "step": 15946 + }, + { + "epoch": 0.892989136521447, + "grad_norm": 1.4117116928100586, + "learning_rate": 7.9695e-05, + "loss": 0.5094, + "step": 15947 + }, + { + "epoch": 0.893045133833576, + "grad_norm": 1.591858983039856, + "learning_rate": 7.970000000000001e-05, + "loss": 0.5699, + "step": 15948 + }, + { + "epoch": 0.893101131145705, + "grad_norm": 1.3295998573303223, + "learning_rate": 7.970500000000001e-05, + "loss": 0.3798, + "step": 15949 + }, + { + "epoch": 0.893157128457834, + "grad_norm": 1.3497673273086548, + "learning_rate": 7.971000000000001e-05, + "loss": 0.3794, + "step": 15950 + }, + { + "epoch": 0.893213125769963, + "grad_norm": 1.523391604423523, + "learning_rate": 7.9715e-05, + "loss": 0.5808, + "step": 15951 + }, + { + "epoch": 0.893269123082092, + "grad_norm": 1.3123940229415894, + "learning_rate": 7.972e-05, + "loss": 0.4704, + "step": 15952 + }, + { + "epoch": 0.8933251203942211, + "grad_norm": 1.4037657976150513, + "learning_rate": 7.9725e-05, + "loss": 0.4478, + "step": 15953 + }, + { + "epoch": 0.8933811177063501, + "grad_norm": 21.667104721069336, + "learning_rate": 7.973000000000001e-05, + "loss": 0.3895, + "step": 15954 + }, + { + "epoch": 0.8934371150184791, + "grad_norm": 1.4874041080474854, + "learning_rate": 7.973500000000001e-05, + "loss": 0.4544, + "step": 15955 + }, + { + "epoch": 0.8934931123306081, + "grad_norm": 1.2745970487594604, + "learning_rate": 7.974e-05, + "loss": 0.3299, + "step": 15956 + }, + { + "epoch": 0.8935491096427371, 
+ "grad_norm": 1.364397644996643, + "learning_rate": 7.9745e-05, + "loss": 0.5139, + "step": 15957 + }, + { + "epoch": 0.8936051069548662, + "grad_norm": 1.5993179082870483, + "learning_rate": 7.975e-05, + "loss": 0.4982, + "step": 15958 + }, + { + "epoch": 0.8936611042669952, + "grad_norm": 1.1834697723388672, + "learning_rate": 7.9755e-05, + "loss": 0.3477, + "step": 15959 + }, + { + "epoch": 0.8937171015791242, + "grad_norm": 1.264442801475525, + "learning_rate": 7.976e-05, + "loss": 0.3819, + "step": 15960 + }, + { + "epoch": 0.8937730988912532, + "grad_norm": 1.2564032077789307, + "learning_rate": 7.9765e-05, + "loss": 0.3424, + "step": 15961 + }, + { + "epoch": 0.8938290962033822, + "grad_norm": 1.2035491466522217, + "learning_rate": 7.977e-05, + "loss": 0.361, + "step": 15962 + }, + { + "epoch": 0.8938850935155113, + "grad_norm": 1.5151845216751099, + "learning_rate": 7.9775e-05, + "loss": 0.3303, + "step": 15963 + }, + { + "epoch": 0.8939410908276403, + "grad_norm": 1.3209103345870972, + "learning_rate": 7.978e-05, + "loss": 0.468, + "step": 15964 + }, + { + "epoch": 0.8939970881397693, + "grad_norm": 1.5421687364578247, + "learning_rate": 7.9785e-05, + "loss": 0.5283, + "step": 15965 + }, + { + "epoch": 0.8940530854518983, + "grad_norm": 1.4798215627670288, + "learning_rate": 7.979000000000001e-05, + "loss": 0.4598, + "step": 15966 + }, + { + "epoch": 0.8941090827640273, + "grad_norm": 1.294493556022644, + "learning_rate": 7.9795e-05, + "loss": 0.5248, + "step": 15967 + }, + { + "epoch": 0.8941650800761564, + "grad_norm": 1.600492000579834, + "learning_rate": 7.98e-05, + "loss": 0.5206, + "step": 15968 + }, + { + "epoch": 0.8942210773882854, + "grad_norm": 1.415737509727478, + "learning_rate": 7.980500000000001e-05, + "loss": 0.4547, + "step": 15969 + }, + { + "epoch": 0.8942770747004144, + "grad_norm": 1.2604701519012451, + "learning_rate": 7.981000000000001e-05, + "loss": 0.4808, + "step": 15970 + }, + { + "epoch": 0.8943330720125434, + "grad_norm": 
1.2768656015396118, + "learning_rate": 7.981500000000001e-05, + "loss": 0.3808, + "step": 15971 + }, + { + "epoch": 0.8943890693246724, + "grad_norm": 1.1699621677398682, + "learning_rate": 7.982e-05, + "loss": 0.366, + "step": 15972 + }, + { + "epoch": 0.8944450666368015, + "grad_norm": 1.4129713773727417, + "learning_rate": 7.9825e-05, + "loss": 0.4452, + "step": 15973 + }, + { + "epoch": 0.8945010639489305, + "grad_norm": 1.3117674589157104, + "learning_rate": 7.983e-05, + "loss": 0.4753, + "step": 15974 + }, + { + "epoch": 0.8945570612610595, + "grad_norm": 1.3465110063552856, + "learning_rate": 7.983500000000001e-05, + "loss": 0.468, + "step": 15975 + }, + { + "epoch": 0.8946130585731885, + "grad_norm": 1.24776291847229, + "learning_rate": 7.984000000000001e-05, + "loss": 0.437, + "step": 15976 + }, + { + "epoch": 0.8946690558853175, + "grad_norm": 1.4752442836761475, + "learning_rate": 7.9845e-05, + "loss": 0.3954, + "step": 15977 + }, + { + "epoch": 0.8947250531974466, + "grad_norm": 1.5850889682769775, + "learning_rate": 7.985e-05, + "loss": 0.4348, + "step": 15978 + }, + { + "epoch": 0.8947810505095756, + "grad_norm": 1.7630277872085571, + "learning_rate": 7.9855e-05, + "loss": 0.3952, + "step": 15979 + }, + { + "epoch": 0.8948370478217046, + "grad_norm": 1.4778448343276978, + "learning_rate": 7.986e-05, + "loss": 0.6347, + "step": 15980 + }, + { + "epoch": 0.8948930451338336, + "grad_norm": 1.1664304733276367, + "learning_rate": 7.9865e-05, + "loss": 0.3586, + "step": 15981 + }, + { + "epoch": 0.8949490424459626, + "grad_norm": 1.3973556756973267, + "learning_rate": 7.987e-05, + "loss": 0.4527, + "step": 15982 + }, + { + "epoch": 0.8950050397580916, + "grad_norm": 1.5247793197631836, + "learning_rate": 7.9875e-05, + "loss": 0.4154, + "step": 15983 + }, + { + "epoch": 0.8950610370702207, + "grad_norm": 1.149987816810608, + "learning_rate": 7.988e-05, + "loss": 0.3322, + "step": 15984 + }, + { + "epoch": 0.8951170343823497, + "grad_norm": 
1.3923696279525757, + "learning_rate": 7.9885e-05, + "loss": 0.4424, + "step": 15985 + }, + { + "epoch": 0.8951730316944787, + "grad_norm": 1.550440788269043, + "learning_rate": 7.989000000000001e-05, + "loss": 0.3841, + "step": 15986 + }, + { + "epoch": 0.8952290290066077, + "grad_norm": 1.2467762231826782, + "learning_rate": 7.9895e-05, + "loss": 0.5213, + "step": 15987 + }, + { + "epoch": 0.8952850263187367, + "grad_norm": 1.3135510683059692, + "learning_rate": 7.99e-05, + "loss": 0.4367, + "step": 15988 + }, + { + "epoch": 0.8953410236308658, + "grad_norm": 1.4266338348388672, + "learning_rate": 7.9905e-05, + "loss": 0.5519, + "step": 15989 + }, + { + "epoch": 0.8953970209429948, + "grad_norm": 1.6020103693008423, + "learning_rate": 7.991000000000001e-05, + "loss": 0.4317, + "step": 15990 + }, + { + "epoch": 0.8954530182551238, + "grad_norm": 1.1448312997817993, + "learning_rate": 7.991500000000001e-05, + "loss": 0.3692, + "step": 15991 + }, + { + "epoch": 0.8955090155672528, + "grad_norm": 1.4078099727630615, + "learning_rate": 7.992000000000001e-05, + "loss": 0.5585, + "step": 15992 + }, + { + "epoch": 0.8955650128793818, + "grad_norm": 1.2579874992370605, + "learning_rate": 7.9925e-05, + "loss": 0.4945, + "step": 15993 + }, + { + "epoch": 0.8956210101915109, + "grad_norm": 1.5178617238998413, + "learning_rate": 7.993e-05, + "loss": 0.524, + "step": 15994 + }, + { + "epoch": 0.8956770075036399, + "grad_norm": 1.4215368032455444, + "learning_rate": 7.9935e-05, + "loss": 0.4524, + "step": 15995 + }, + { + "epoch": 0.8957330048157689, + "grad_norm": 1.0902271270751953, + "learning_rate": 7.994000000000001e-05, + "loss": 0.3212, + "step": 15996 + }, + { + "epoch": 0.8957890021278979, + "grad_norm": 1.4094494581222534, + "learning_rate": 7.994500000000001e-05, + "loss": 0.4249, + "step": 15997 + }, + { + "epoch": 0.8958449994400268, + "grad_norm": 1.415575623512268, + "learning_rate": 7.995e-05, + "loss": 0.4705, + "step": 15998 + }, + { + "epoch": 
0.8959009967521558, + "grad_norm": 1.1373982429504395, + "learning_rate": 7.9955e-05, + "loss": 0.3789, + "step": 15999 + }, + { + "epoch": 0.8959569940642849, + "grad_norm": 1.6810451745986938, + "learning_rate": 7.996e-05, + "loss": 0.7872, + "step": 16000 + }, + { + "epoch": 0.8960129913764139, + "grad_norm": 1.4576246738433838, + "learning_rate": 7.9965e-05, + "loss": 0.361, + "step": 16001 + }, + { + "epoch": 0.8960689886885429, + "grad_norm": 3.6870405673980713, + "learning_rate": 7.997e-05, + "loss": 0.4053, + "step": 16002 + }, + { + "epoch": 0.8961249860006719, + "grad_norm": 1.5046871900558472, + "learning_rate": 7.9975e-05, + "loss": 0.4212, + "step": 16003 + }, + { + "epoch": 0.8961809833128009, + "grad_norm": 1.788367509841919, + "learning_rate": 7.998e-05, + "loss": 0.5489, + "step": 16004 + }, + { + "epoch": 0.89623698062493, + "grad_norm": 1.2205194234848022, + "learning_rate": 7.9985e-05, + "loss": 0.3749, + "step": 16005 + }, + { + "epoch": 0.896292977937059, + "grad_norm": 1.5499311685562134, + "learning_rate": 7.999000000000001e-05, + "loss": 0.4994, + "step": 16006 + }, + { + "epoch": 0.896348975249188, + "grad_norm": 1.4454174041748047, + "learning_rate": 7.999500000000001e-05, + "loss": 0.588, + "step": 16007 + }, + { + "epoch": 0.896404972561317, + "grad_norm": 1.3178184032440186, + "learning_rate": 8e-05, + "loss": 0.4279, + "step": 16008 + }, + { + "epoch": 0.896460969873446, + "grad_norm": 1.215785264968872, + "learning_rate": 8.0005e-05, + "loss": 0.5083, + "step": 16009 + }, + { + "epoch": 0.896516967185575, + "grad_norm": 1.4268912076950073, + "learning_rate": 8.001e-05, + "loss": 0.4572, + "step": 16010 + }, + { + "epoch": 0.8965729644977041, + "grad_norm": 1.2209932804107666, + "learning_rate": 8.001500000000001e-05, + "loss": 0.3982, + "step": 16011 + }, + { + "epoch": 0.8966289618098331, + "grad_norm": 1.4563535451889038, + "learning_rate": 8.002000000000001e-05, + "loss": 0.4739, + "step": 16012 + }, + { + "epoch": 
0.8966849591219621, + "grad_norm": 1.3663113117218018, + "learning_rate": 8.002500000000001e-05, + "loss": 0.4169, + "step": 16013 + }, + { + "epoch": 0.8967409564340911, + "grad_norm": 1.5242372751235962, + "learning_rate": 8.003e-05, + "loss": 0.4739, + "step": 16014 + }, + { + "epoch": 0.8967969537462201, + "grad_norm": 1.2546685934066772, + "learning_rate": 8.0035e-05, + "loss": 0.423, + "step": 16015 + }, + { + "epoch": 0.8968529510583492, + "grad_norm": 1.2493762969970703, + "learning_rate": 8.004e-05, + "loss": 0.4325, + "step": 16016 + }, + { + "epoch": 0.8969089483704782, + "grad_norm": 1.390856146812439, + "learning_rate": 8.0045e-05, + "loss": 0.5716, + "step": 16017 + }, + { + "epoch": 0.8969649456826072, + "grad_norm": 1.460411787033081, + "learning_rate": 8.005000000000001e-05, + "loss": 0.4483, + "step": 16018 + }, + { + "epoch": 0.8970209429947362, + "grad_norm": 1.5322096347808838, + "learning_rate": 8.0055e-05, + "loss": 0.4411, + "step": 16019 + }, + { + "epoch": 0.8970769403068652, + "grad_norm": 1.2709147930145264, + "learning_rate": 8.006e-05, + "loss": 0.5024, + "step": 16020 + }, + { + "epoch": 0.8971329376189943, + "grad_norm": 1.3853907585144043, + "learning_rate": 8.0065e-05, + "loss": 0.4838, + "step": 16021 + }, + { + "epoch": 0.8971889349311233, + "grad_norm": 1.5160138607025146, + "learning_rate": 8.007e-05, + "loss": 0.4062, + "step": 16022 + }, + { + "epoch": 0.8972449322432523, + "grad_norm": 1.3848271369934082, + "learning_rate": 8.0075e-05, + "loss": 0.4543, + "step": 16023 + }, + { + "epoch": 0.8973009295553813, + "grad_norm": 1.159733772277832, + "learning_rate": 8.008e-05, + "loss": 0.4621, + "step": 16024 + }, + { + "epoch": 0.8973569268675103, + "grad_norm": 1.551933765411377, + "learning_rate": 8.0085e-05, + "loss": 0.5854, + "step": 16025 + }, + { + "epoch": 0.8974129241796394, + "grad_norm": 1.224943995475769, + "learning_rate": 8.009e-05, + "loss": 0.4938, + "step": 16026 + }, + { + "epoch": 0.8974689214917684, + 
"grad_norm": 1.6137648820877075, + "learning_rate": 8.009500000000001e-05, + "loss": 0.5849, + "step": 16027 + }, + { + "epoch": 0.8975249188038974, + "grad_norm": 1.6982605457305908, + "learning_rate": 8.010000000000001e-05, + "loss": 0.5129, + "step": 16028 + }, + { + "epoch": 0.8975809161160264, + "grad_norm": 1.256736397743225, + "learning_rate": 8.0105e-05, + "loss": 0.5327, + "step": 16029 + }, + { + "epoch": 0.8976369134281554, + "grad_norm": 1.533632755279541, + "learning_rate": 8.011e-05, + "loss": 0.5459, + "step": 16030 + }, + { + "epoch": 0.8976929107402845, + "grad_norm": 1.3175910711288452, + "learning_rate": 8.0115e-05, + "loss": 0.5595, + "step": 16031 + }, + { + "epoch": 0.8977489080524135, + "grad_norm": 1.1984654664993286, + "learning_rate": 8.012000000000001e-05, + "loss": 0.3818, + "step": 16032 + }, + { + "epoch": 0.8978049053645425, + "grad_norm": 1.3063366413116455, + "learning_rate": 8.012500000000001e-05, + "loss": 0.3868, + "step": 16033 + }, + { + "epoch": 0.8978609026766715, + "grad_norm": 1.4270581007003784, + "learning_rate": 8.013000000000001e-05, + "loss": 0.4259, + "step": 16034 + }, + { + "epoch": 0.8979168999888005, + "grad_norm": 1.5793156623840332, + "learning_rate": 8.0135e-05, + "loss": 0.4375, + "step": 16035 + }, + { + "epoch": 0.8979728973009296, + "grad_norm": 1.4425928592681885, + "learning_rate": 8.014e-05, + "loss": 0.5081, + "step": 16036 + }, + { + "epoch": 0.8980288946130586, + "grad_norm": 1.380345344543457, + "learning_rate": 8.0145e-05, + "loss": 0.5136, + "step": 16037 + }, + { + "epoch": 0.8980848919251876, + "grad_norm": 1.4106237888336182, + "learning_rate": 8.015e-05, + "loss": 0.4984, + "step": 16038 + }, + { + "epoch": 0.8981408892373166, + "grad_norm": 1.3798160552978516, + "learning_rate": 8.015500000000001e-05, + "loss": 0.4104, + "step": 16039 + }, + { + "epoch": 0.8981968865494456, + "grad_norm": 1.4829031229019165, + "learning_rate": 8.016e-05, + "loss": 0.4247, + "step": 16040 + }, + { + "epoch": 
0.8982528838615746, + "grad_norm": 1.0722377300262451, + "learning_rate": 8.0165e-05, + "loss": 0.3371, + "step": 16041 + }, + { + "epoch": 0.8983088811737037, + "grad_norm": 1.2194339036941528, + "learning_rate": 8.017e-05, + "loss": 0.4685, + "step": 16042 + }, + { + "epoch": 0.8983648784858327, + "grad_norm": 1.2947598695755005, + "learning_rate": 8.0175e-05, + "loss": 0.3746, + "step": 16043 + }, + { + "epoch": 0.8984208757979617, + "grad_norm": 1.1339155435562134, + "learning_rate": 8.018e-05, + "loss": 0.4044, + "step": 16044 + }, + { + "epoch": 0.8984768731100907, + "grad_norm": 1.5063127279281616, + "learning_rate": 8.0185e-05, + "loss": 0.4086, + "step": 16045 + }, + { + "epoch": 0.8985328704222197, + "grad_norm": 1.4349595308303833, + "learning_rate": 8.019e-05, + "loss": 0.5806, + "step": 16046 + }, + { + "epoch": 0.8985888677343488, + "grad_norm": 1.2671962976455688, + "learning_rate": 8.019500000000001e-05, + "loss": 0.3964, + "step": 16047 + }, + { + "epoch": 0.8986448650464778, + "grad_norm": 2.124438524246216, + "learning_rate": 8.020000000000001e-05, + "loss": 0.4742, + "step": 16048 + }, + { + "epoch": 0.8987008623586068, + "grad_norm": 1.3917243480682373, + "learning_rate": 8.020500000000001e-05, + "loss": 0.5335, + "step": 16049 + }, + { + "epoch": 0.8987568596707358, + "grad_norm": 1.321627140045166, + "learning_rate": 8.021e-05, + "loss": 0.4422, + "step": 16050 + }, + { + "epoch": 0.8988128569828648, + "grad_norm": 1.4380525350570679, + "learning_rate": 8.0215e-05, + "loss": 0.4308, + "step": 16051 + }, + { + "epoch": 0.8988688542949939, + "grad_norm": 1.2992310523986816, + "learning_rate": 8.022e-05, + "loss": 0.4118, + "step": 16052 + }, + { + "epoch": 0.8989248516071229, + "grad_norm": 1.675029993057251, + "learning_rate": 8.022500000000001e-05, + "loss": 0.6676, + "step": 16053 + }, + { + "epoch": 0.8989808489192519, + "grad_norm": 1.4144927263259888, + "learning_rate": 8.023000000000001e-05, + "loss": 0.3992, + "step": 16054 + }, + { + 
"epoch": 0.8990368462313809, + "grad_norm": 1.5401591062545776, + "learning_rate": 8.023500000000001e-05, + "loss": 0.4584, + "step": 16055 + }, + { + "epoch": 0.8990928435435099, + "grad_norm": 1.4692164659500122, + "learning_rate": 8.024e-05, + "loss": 0.5731, + "step": 16056 + }, + { + "epoch": 0.899148840855639, + "grad_norm": 1.1132906675338745, + "learning_rate": 8.0245e-05, + "loss": 0.3796, + "step": 16057 + }, + { + "epoch": 0.899204838167768, + "grad_norm": 1.2593834400177002, + "learning_rate": 8.025e-05, + "loss": 0.4325, + "step": 16058 + }, + { + "epoch": 0.899260835479897, + "grad_norm": 1.2930701971054077, + "learning_rate": 8.0255e-05, + "loss": 0.4755, + "step": 16059 + }, + { + "epoch": 0.899316832792026, + "grad_norm": 1.2894867658615112, + "learning_rate": 8.026000000000001e-05, + "loss": 0.6381, + "step": 16060 + }, + { + "epoch": 0.899372830104155, + "grad_norm": 1.3451348543167114, + "learning_rate": 8.0265e-05, + "loss": 0.4582, + "step": 16061 + }, + { + "epoch": 0.899428827416284, + "grad_norm": 1.569953441619873, + "learning_rate": 8.027e-05, + "loss": 0.4933, + "step": 16062 + }, + { + "epoch": 0.8994848247284131, + "grad_norm": 1.1205308437347412, + "learning_rate": 8.0275e-05, + "loss": 0.329, + "step": 16063 + }, + { + "epoch": 0.8995408220405421, + "grad_norm": 1.2865349054336548, + "learning_rate": 8.028e-05, + "loss": 0.4729, + "step": 16064 + }, + { + "epoch": 0.8995968193526711, + "grad_norm": 1.3408750295639038, + "learning_rate": 8.0285e-05, + "loss": 0.4201, + "step": 16065 + }, + { + "epoch": 0.8996528166648001, + "grad_norm": 1.244759440422058, + "learning_rate": 8.028999999999999e-05, + "loss": 0.4149, + "step": 16066 + }, + { + "epoch": 0.8997088139769291, + "grad_norm": 1.2448692321777344, + "learning_rate": 8.0295e-05, + "loss": 0.3632, + "step": 16067 + }, + { + "epoch": 0.8997648112890582, + "grad_norm": 1.2463021278381348, + "learning_rate": 8.030000000000001e-05, + "loss": 0.4785, + "step": 16068 + }, + { + "epoch": 
0.8998208086011872, + "grad_norm": 1.367929458618164, + "learning_rate": 8.030500000000001e-05, + "loss": 0.4811, + "step": 16069 + }, + { + "epoch": 0.8998768059133162, + "grad_norm": 1.4838855266571045, + "learning_rate": 8.031000000000001e-05, + "loss": 0.425, + "step": 16070 + }, + { + "epoch": 0.8999328032254452, + "grad_norm": 1.6777338981628418, + "learning_rate": 8.0315e-05, + "loss": 0.5884, + "step": 16071 + }, + { + "epoch": 0.8999888005375742, + "grad_norm": 1.2199866771697998, + "learning_rate": 8.032e-05, + "loss": 0.4392, + "step": 16072 + }, + { + "epoch": 0.9000447978497033, + "grad_norm": 1.167232632637024, + "learning_rate": 8.0325e-05, + "loss": 0.4452, + "step": 16073 + }, + { + "epoch": 0.9001007951618323, + "grad_norm": 1.5272061824798584, + "learning_rate": 8.033000000000001e-05, + "loss": 0.4721, + "step": 16074 + }, + { + "epoch": 0.9001567924739613, + "grad_norm": 1.4772456884384155, + "learning_rate": 8.033500000000001e-05, + "loss": 0.5618, + "step": 16075 + }, + { + "epoch": 0.9002127897860903, + "grad_norm": 1.3673405647277832, + "learning_rate": 8.034e-05, + "loss": 0.4879, + "step": 16076 + }, + { + "epoch": 0.9002687870982193, + "grad_norm": 1.3085155487060547, + "learning_rate": 8.0345e-05, + "loss": 0.3939, + "step": 16077 + }, + { + "epoch": 0.9003247844103484, + "grad_norm": 1.544012188911438, + "learning_rate": 8.035e-05, + "loss": 0.3759, + "step": 16078 + }, + { + "epoch": 0.9003807817224774, + "grad_norm": 1.4712990522384644, + "learning_rate": 8.0355e-05, + "loss": 0.5326, + "step": 16079 + }, + { + "epoch": 0.9004367790346064, + "grad_norm": 1.9467556476593018, + "learning_rate": 8.036e-05, + "loss": 0.3925, + "step": 16080 + }, + { + "epoch": 0.9004927763467353, + "grad_norm": 1.3120015859603882, + "learning_rate": 8.036500000000001e-05, + "loss": 0.3204, + "step": 16081 + }, + { + "epoch": 0.9005487736588643, + "grad_norm": null, + "learning_rate": 8.036500000000001e-05, + "loss": 0.4318, + "step": 16082 + }, + { + 
"epoch": 0.9006047709709933, + "grad_norm": 1.3546849489212036, + "learning_rate": 8.037e-05, + "loss": 0.3974, + "step": 16083 + }, + { + "epoch": 0.9006607682831224, + "grad_norm": 1.37135910987854, + "learning_rate": 8.0375e-05, + "loss": 0.5232, + "step": 16084 + }, + { + "epoch": 0.9007167655952514, + "grad_norm": 1.3977696895599365, + "learning_rate": 8.038e-05, + "loss": 0.4193, + "step": 16085 + }, + { + "epoch": 0.9007727629073804, + "grad_norm": 1.3352080583572388, + "learning_rate": 8.0385e-05, + "loss": 0.472, + "step": 16086 + }, + { + "epoch": 0.9008287602195094, + "grad_norm": 1.2196705341339111, + "learning_rate": 8.039e-05, + "loss": 0.356, + "step": 16087 + }, + { + "epoch": 0.9008847575316384, + "grad_norm": 1.4114445447921753, + "learning_rate": 8.0395e-05, + "loss": 0.4789, + "step": 16088 + }, + { + "epoch": 0.9009407548437675, + "grad_norm": 1.3660703897476196, + "learning_rate": 8.04e-05, + "loss": 0.4888, + "step": 16089 + }, + { + "epoch": 0.9009967521558965, + "grad_norm": 1.3786678314208984, + "learning_rate": 8.040500000000001e-05, + "loss": 0.4506, + "step": 16090 + }, + { + "epoch": 0.9010527494680255, + "grad_norm": 1.1101665496826172, + "learning_rate": 8.041000000000001e-05, + "loss": 0.3506, + "step": 16091 + }, + { + "epoch": 0.9011087467801545, + "grad_norm": 1.319434404373169, + "learning_rate": 8.041500000000001e-05, + "loss": 0.3757, + "step": 16092 + }, + { + "epoch": 0.9011647440922835, + "grad_norm": 1.656463861465454, + "learning_rate": 8.042e-05, + "loss": 0.5559, + "step": 16093 + }, + { + "epoch": 0.9012207414044126, + "grad_norm": 1.1022865772247314, + "learning_rate": 8.0425e-05, + "loss": 0.3704, + "step": 16094 + }, + { + "epoch": 0.9012767387165416, + "grad_norm": 1.38589346408844, + "learning_rate": 8.043e-05, + "loss": 0.4217, + "step": 16095 + }, + { + "epoch": 0.9013327360286706, + "grad_norm": 1.690219521522522, + "learning_rate": 8.0435e-05, + "loss": 0.5664, + "step": 16096 + }, + { + "epoch": 
0.9013887333407996, + "grad_norm": 1.1399365663528442, + "learning_rate": 8.044000000000001e-05, + "loss": 0.4389, + "step": 16097 + }, + { + "epoch": 0.9014447306529286, + "grad_norm": 1.3624540567398071, + "learning_rate": 8.0445e-05, + "loss": 0.4216, + "step": 16098 + }, + { + "epoch": 0.9015007279650576, + "grad_norm": 1.3338139057159424, + "learning_rate": 8.045e-05, + "loss": 0.4295, + "step": 16099 + }, + { + "epoch": 0.9015567252771867, + "grad_norm": 1.6596754789352417, + "learning_rate": 8.0455e-05, + "loss": 0.4851, + "step": 16100 + }, + { + "epoch": 0.9016127225893157, + "grad_norm": 1.45106840133667, + "learning_rate": 8.046e-05, + "loss": 0.4712, + "step": 16101 + }, + { + "epoch": 0.9016687199014447, + "grad_norm": 1.7162147760391235, + "learning_rate": 8.0465e-05, + "loss": 0.5216, + "step": 16102 + }, + { + "epoch": 0.9017247172135737, + "grad_norm": 1.3345248699188232, + "learning_rate": 8.047000000000001e-05, + "loss": 0.4811, + "step": 16103 + }, + { + "epoch": 0.9017807145257027, + "grad_norm": 1.5572048425674438, + "learning_rate": 8.0475e-05, + "loss": 0.5564, + "step": 16104 + }, + { + "epoch": 0.9018367118378318, + "grad_norm": 1.1450647115707397, + "learning_rate": 8.048e-05, + "loss": 0.4413, + "step": 16105 + }, + { + "epoch": 0.9018927091499608, + "grad_norm": 1.807526707649231, + "learning_rate": 8.0485e-05, + "loss": 0.6043, + "step": 16106 + }, + { + "epoch": 0.9019487064620898, + "grad_norm": 1.389998435974121, + "learning_rate": 8.049e-05, + "loss": 0.497, + "step": 16107 + }, + { + "epoch": 0.9020047037742188, + "grad_norm": 1.0663912296295166, + "learning_rate": 8.049500000000001e-05, + "loss": 0.3759, + "step": 16108 + }, + { + "epoch": 0.9020607010863478, + "grad_norm": 1.4233379364013672, + "learning_rate": 8.05e-05, + "loss": 0.4, + "step": 16109 + }, + { + "epoch": 0.9021166983984769, + "grad_norm": 1.518378496170044, + "learning_rate": 8.0505e-05, + "loss": 0.4115, + "step": 16110 + }, + { + "epoch": 0.9021726957106059, + 
"grad_norm": 1.3768906593322754, + "learning_rate": 8.051000000000001e-05, + "loss": 0.5397, + "step": 16111 + }, + { + "epoch": 0.9022286930227349, + "grad_norm": 1.376766562461853, + "learning_rate": 8.051500000000001e-05, + "loss": 0.4534, + "step": 16112 + }, + { + "epoch": 0.9022846903348639, + "grad_norm": 1.288089394569397, + "learning_rate": 8.052000000000001e-05, + "loss": 0.4622, + "step": 16113 + }, + { + "epoch": 0.9023406876469929, + "grad_norm": 1.2125897407531738, + "learning_rate": 8.0525e-05, + "loss": 0.4396, + "step": 16114 + }, + { + "epoch": 0.902396684959122, + "grad_norm": 2.0193521976470947, + "learning_rate": 8.053e-05, + "loss": 0.6333, + "step": 16115 + }, + { + "epoch": 0.902452682271251, + "grad_norm": 1.7033817768096924, + "learning_rate": 8.0535e-05, + "loss": 0.5278, + "step": 16116 + }, + { + "epoch": 0.90250867958338, + "grad_norm": 1.4649571180343628, + "learning_rate": 8.054e-05, + "loss": 0.5143, + "step": 16117 + }, + { + "epoch": 0.902564676895509, + "grad_norm": 1.4818642139434814, + "learning_rate": 8.054500000000001e-05, + "loss": 0.4286, + "step": 16118 + }, + { + "epoch": 0.902620674207638, + "grad_norm": 1.3658850193023682, + "learning_rate": 8.055e-05, + "loss": 0.551, + "step": 16119 + }, + { + "epoch": 0.902676671519767, + "grad_norm": 1.3626432418823242, + "learning_rate": 8.0555e-05, + "loss": 0.4361, + "step": 16120 + }, + { + "epoch": 0.9027326688318961, + "grad_norm": 1.1653423309326172, + "learning_rate": 8.056e-05, + "loss": 0.3681, + "step": 16121 + }, + { + "epoch": 0.9027886661440251, + "grad_norm": 1.0749971866607666, + "learning_rate": 8.0565e-05, + "loss": 0.3763, + "step": 16122 + }, + { + "epoch": 0.9028446634561541, + "grad_norm": 1.35155189037323, + "learning_rate": 8.057e-05, + "loss": 0.393, + "step": 16123 + }, + { + "epoch": 0.9029006607682831, + "grad_norm": 1.3368000984191895, + "learning_rate": 8.057500000000001e-05, + "loss": 0.3983, + "step": 16124 + }, + { + "epoch": 0.9029566580804121, + 
"grad_norm": 1.3507204055786133, + "learning_rate": 8.058e-05, + "loss": 0.4552, + "step": 16125 + }, + { + "epoch": 0.9030126553925412, + "grad_norm": 1.1446025371551514, + "learning_rate": 8.0585e-05, + "loss": 0.3386, + "step": 16126 + }, + { + "epoch": 0.9030686527046702, + "grad_norm": 1.4161683320999146, + "learning_rate": 8.059e-05, + "loss": 0.4751, + "step": 16127 + }, + { + "epoch": 0.9031246500167992, + "grad_norm": 1.2866870164871216, + "learning_rate": 8.059500000000001e-05, + "loss": 0.45, + "step": 16128 + }, + { + "epoch": 0.9031806473289282, + "grad_norm": 1.401611089706421, + "learning_rate": 8.060000000000001e-05, + "loss": 0.5611, + "step": 16129 + }, + { + "epoch": 0.9032366446410572, + "grad_norm": 1.4381974935531616, + "learning_rate": 8.0605e-05, + "loss": 0.6571, + "step": 16130 + }, + { + "epoch": 0.9032926419531863, + "grad_norm": 1.134797215461731, + "learning_rate": 8.061e-05, + "loss": 0.3239, + "step": 16131 + }, + { + "epoch": 0.9033486392653153, + "grad_norm": 1.5025545358657837, + "learning_rate": 8.061500000000001e-05, + "loss": 0.4626, + "step": 16132 + }, + { + "epoch": 0.9034046365774443, + "grad_norm": 1.3810558319091797, + "learning_rate": 8.062000000000001e-05, + "loss": 0.5958, + "step": 16133 + }, + { + "epoch": 0.9034606338895733, + "grad_norm": 1.4801636934280396, + "learning_rate": 8.062500000000001e-05, + "loss": 0.4656, + "step": 16134 + }, + { + "epoch": 0.9035166312017023, + "grad_norm": 1.2599828243255615, + "learning_rate": 8.063e-05, + "loss": 0.4976, + "step": 16135 + }, + { + "epoch": 0.9035726285138314, + "grad_norm": 1.2133804559707642, + "learning_rate": 8.0635e-05, + "loss": 0.4485, + "step": 16136 + }, + { + "epoch": 0.9036286258259604, + "grad_norm": 1.4889907836914062, + "learning_rate": 8.064e-05, + "loss": 0.5225, + "step": 16137 + }, + { + "epoch": 0.9036846231380894, + "grad_norm": 1.5218982696533203, + "learning_rate": 8.0645e-05, + "loss": 0.4227, + "step": 16138 + }, + { + "epoch": 
0.9037406204502184, + "grad_norm": 1.1445194482803345, + "learning_rate": 8.065000000000001e-05, + "loss": 0.3879, + "step": 16139 + }, + { + "epoch": 0.9037966177623474, + "grad_norm": 1.486558198928833, + "learning_rate": 8.0655e-05, + "loss": 0.5492, + "step": 16140 + }, + { + "epoch": 0.9038526150744765, + "grad_norm": 1.294519305229187, + "learning_rate": 8.066e-05, + "loss": 0.5715, + "step": 16141 + }, + { + "epoch": 0.9039086123866055, + "grad_norm": 1.384840726852417, + "learning_rate": 8.0665e-05, + "loss": 0.5428, + "step": 16142 + }, + { + "epoch": 0.9039646096987345, + "grad_norm": 1.2878063917160034, + "learning_rate": 8.067e-05, + "loss": 0.4181, + "step": 16143 + }, + { + "epoch": 0.9040206070108635, + "grad_norm": 1.132713794708252, + "learning_rate": 8.0675e-05, + "loss": 0.3261, + "step": 16144 + }, + { + "epoch": 0.9040766043229925, + "grad_norm": 1.303546667098999, + "learning_rate": 8.068e-05, + "loss": 0.3057, + "step": 16145 + }, + { + "epoch": 0.9041326016351215, + "grad_norm": 1.181791067123413, + "learning_rate": 8.0685e-05, + "loss": 0.3682, + "step": 16146 + }, + { + "epoch": 0.9041885989472506, + "grad_norm": 1.2825514078140259, + "learning_rate": 8.069e-05, + "loss": 0.4431, + "step": 16147 + }, + { + "epoch": 0.9042445962593796, + "grad_norm": 1.2796927690505981, + "learning_rate": 8.0695e-05, + "loss": 0.496, + "step": 16148 + }, + { + "epoch": 0.9043005935715086, + "grad_norm": 1.2571866512298584, + "learning_rate": 8.070000000000001e-05, + "loss": 0.4749, + "step": 16149 + }, + { + "epoch": 0.9043565908836376, + "grad_norm": 1.2255233526229858, + "learning_rate": 8.070500000000001e-05, + "loss": 0.4372, + "step": 16150 + }, + { + "epoch": 0.9044125881957666, + "grad_norm": 1.5090402364730835, + "learning_rate": 8.071e-05, + "loss": 0.7129, + "step": 16151 + }, + { + "epoch": 0.9044685855078957, + "grad_norm": 1.2965089082717896, + "learning_rate": 8.0715e-05, + "loss": 0.4358, + "step": 16152 + }, + { + "epoch": 
0.9045245828200247, + "grad_norm": 1.4195002317428589, + "learning_rate": 8.072000000000001e-05, + "loss": 0.4692, + "step": 16153 + }, + { + "epoch": 0.9045805801321537, + "grad_norm": 1.3298155069351196, + "learning_rate": 8.072500000000001e-05, + "loss": 0.4046, + "step": 16154 + }, + { + "epoch": 0.9046365774442827, + "grad_norm": 1.4429786205291748, + "learning_rate": 8.073000000000001e-05, + "loss": 0.5544, + "step": 16155 + }, + { + "epoch": 0.9046925747564117, + "grad_norm": 1.4150007963180542, + "learning_rate": 8.0735e-05, + "loss": 0.4725, + "step": 16156 + }, + { + "epoch": 0.9047485720685408, + "grad_norm": 1.5700832605361938, + "learning_rate": 8.074e-05, + "loss": 0.6968, + "step": 16157 + }, + { + "epoch": 0.9048045693806698, + "grad_norm": 1.4435895681381226, + "learning_rate": 8.0745e-05, + "loss": 0.6842, + "step": 16158 + }, + { + "epoch": 0.9048605666927988, + "grad_norm": 18.472076416015625, + "learning_rate": 8.075e-05, + "loss": 0.7216, + "step": 16159 + }, + { + "epoch": 0.9049165640049278, + "grad_norm": 1.087300181388855, + "learning_rate": 8.075500000000001e-05, + "loss": 0.4338, + "step": 16160 + }, + { + "epoch": 0.9049725613170568, + "grad_norm": 1.6640185117721558, + "learning_rate": 8.076e-05, + "loss": 0.8026, + "step": 16161 + }, + { + "epoch": 0.9050285586291859, + "grad_norm": 1.339402675628662, + "learning_rate": 8.0765e-05, + "loss": 0.424, + "step": 16162 + }, + { + "epoch": 0.9050845559413149, + "grad_norm": 1.7281773090362549, + "learning_rate": 8.077e-05, + "loss": 0.7183, + "step": 16163 + }, + { + "epoch": 0.9051405532534438, + "grad_norm": 1.195365309715271, + "learning_rate": 8.0775e-05, + "loss": 0.4868, + "step": 16164 + }, + { + "epoch": 0.9051965505655728, + "grad_norm": 1.2876406908035278, + "learning_rate": 8.078e-05, + "loss": 0.5234, + "step": 16165 + }, + { + "epoch": 0.9052525478777018, + "grad_norm": 1.2299528121948242, + "learning_rate": 8.078499999999999e-05, + "loss": 0.4793, + "step": 16166 + }, + { + 
"epoch": 0.9053085451898308, + "grad_norm": 1.245281457901001, + "learning_rate": 8.079e-05, + "loss": 0.4979, + "step": 16167 + }, + { + "epoch": 0.9053645425019599, + "grad_norm": 1.3213170766830444, + "learning_rate": 8.0795e-05, + "loss": 0.5371, + "step": 16168 + }, + { + "epoch": 0.9054205398140889, + "grad_norm": 1.53616201877594, + "learning_rate": 8.080000000000001e-05, + "loss": 0.6766, + "step": 16169 + }, + { + "epoch": 0.9054765371262179, + "grad_norm": 1.4785538911819458, + "learning_rate": 8.080500000000001e-05, + "loss": 0.4185, + "step": 16170 + }, + { + "epoch": 0.9055325344383469, + "grad_norm": 1.1670349836349487, + "learning_rate": 8.081000000000001e-05, + "loss": 0.3507, + "step": 16171 + }, + { + "epoch": 0.9055885317504759, + "grad_norm": 1.273755431175232, + "learning_rate": 8.0815e-05, + "loss": 0.4314, + "step": 16172 + }, + { + "epoch": 0.905644529062605, + "grad_norm": 1.750573992729187, + "learning_rate": 8.082e-05, + "loss": 0.4869, + "step": 16173 + }, + { + "epoch": 0.905700526374734, + "grad_norm": 1.0891785621643066, + "learning_rate": 8.082500000000001e-05, + "loss": 0.3164, + "step": 16174 + }, + { + "epoch": 0.905756523686863, + "grad_norm": 1.1787875890731812, + "learning_rate": 8.083000000000001e-05, + "loss": 0.3236, + "step": 16175 + }, + { + "epoch": 0.905812520998992, + "grad_norm": 1.3453223705291748, + "learning_rate": 8.083500000000001e-05, + "loss": 0.3288, + "step": 16176 + }, + { + "epoch": 0.905868518311121, + "grad_norm": 1.140633463859558, + "learning_rate": 8.084e-05, + "loss": 0.3574, + "step": 16177 + }, + { + "epoch": 0.90592451562325, + "grad_norm": 1.3836084604263306, + "learning_rate": 8.0845e-05, + "loss": 0.4461, + "step": 16178 + }, + { + "epoch": 0.9059805129353791, + "grad_norm": 3.173196792602539, + "learning_rate": 8.085e-05, + "loss": 0.3756, + "step": 16179 + }, + { + "epoch": 0.9060365102475081, + "grad_norm": 1.382127046585083, + "learning_rate": 8.0855e-05, + "loss": 0.5083, + "step": 16180 + 
}, + { + "epoch": 0.9060925075596371, + "grad_norm": 1.0873388051986694, + "learning_rate": 8.086000000000001e-05, + "loss": 0.4796, + "step": 16181 + }, + { + "epoch": 0.9061485048717661, + "grad_norm": 1.4298620223999023, + "learning_rate": 8.0865e-05, + "loss": 0.4708, + "step": 16182 + }, + { + "epoch": 0.9062045021838951, + "grad_norm": 1.3999799489974976, + "learning_rate": 8.087e-05, + "loss": 0.6052, + "step": 16183 + }, + { + "epoch": 0.9062604994960242, + "grad_norm": 1.205410361289978, + "learning_rate": 8.0875e-05, + "loss": 0.4133, + "step": 16184 + }, + { + "epoch": 0.9063164968081532, + "grad_norm": 1.6735310554504395, + "learning_rate": 8.088e-05, + "loss": 0.4577, + "step": 16185 + }, + { + "epoch": 0.9063724941202822, + "grad_norm": 1.5331871509552002, + "learning_rate": 8.0885e-05, + "loss": 0.4554, + "step": 16186 + }, + { + "epoch": 0.9064284914324112, + "grad_norm": 1.5027611255645752, + "learning_rate": 8.088999999999999e-05, + "loss": 0.57, + "step": 16187 + }, + { + "epoch": 0.9064844887445402, + "grad_norm": 1.3087024688720703, + "learning_rate": 8.0895e-05, + "loss": 0.3807, + "step": 16188 + }, + { + "epoch": 0.9065404860566693, + "grad_norm": 1.2069069147109985, + "learning_rate": 8.090000000000001e-05, + "loss": 0.344, + "step": 16189 + }, + { + "epoch": 0.9065964833687983, + "grad_norm": 1.1890544891357422, + "learning_rate": 8.090500000000001e-05, + "loss": 0.4263, + "step": 16190 + }, + { + "epoch": 0.9066524806809273, + "grad_norm": 1.5398627519607544, + "learning_rate": 8.091000000000001e-05, + "loss": 0.4006, + "step": 16191 + }, + { + "epoch": 0.9067084779930563, + "grad_norm": 1.345105528831482, + "learning_rate": 8.091500000000001e-05, + "loss": 0.5528, + "step": 16192 + }, + { + "epoch": 0.9067644753051853, + "grad_norm": 1.2979389429092407, + "learning_rate": 8.092e-05, + "loss": 0.3823, + "step": 16193 + }, + { + "epoch": 0.9068204726173144, + "grad_norm": 1.7370052337646484, + "learning_rate": 8.0925e-05, + "loss": 0.7235, 
+ "step": 16194 + }, + { + "epoch": 0.9068764699294434, + "grad_norm": 1.5376818180084229, + "learning_rate": 8.093e-05, + "loss": 0.4005, + "step": 16195 + }, + { + "epoch": 0.9069324672415724, + "grad_norm": 1.472191572189331, + "learning_rate": 8.093500000000001e-05, + "loss": 0.399, + "step": 16196 + }, + { + "epoch": 0.9069884645537014, + "grad_norm": 1.2682433128356934, + "learning_rate": 8.094000000000001e-05, + "loss": 0.4769, + "step": 16197 + }, + { + "epoch": 0.9070444618658304, + "grad_norm": 1.5884227752685547, + "learning_rate": 8.0945e-05, + "loss": 0.5694, + "step": 16198 + }, + { + "epoch": 0.9071004591779595, + "grad_norm": 1.2543954849243164, + "learning_rate": 8.095e-05, + "loss": 0.5715, + "step": 16199 + }, + { + "epoch": 0.9071564564900885, + "grad_norm": 1.4897866249084473, + "learning_rate": 8.0955e-05, + "loss": 0.4489, + "step": 16200 + }, + { + "epoch": 0.9072124538022175, + "grad_norm": 1.388708233833313, + "learning_rate": 8.096e-05, + "loss": 0.5487, + "step": 16201 + }, + { + "epoch": 0.9072684511143465, + "grad_norm": 1.382103443145752, + "learning_rate": 8.096500000000001e-05, + "loss": 0.458, + "step": 16202 + }, + { + "epoch": 0.9073244484264755, + "grad_norm": 1.297139286994934, + "learning_rate": 8.097e-05, + "loss": 0.4633, + "step": 16203 + }, + { + "epoch": 0.9073804457386045, + "grad_norm": 1.2164654731750488, + "learning_rate": 8.0975e-05, + "loss": 0.3735, + "step": 16204 + }, + { + "epoch": 0.9074364430507336, + "grad_norm": 1.4001030921936035, + "learning_rate": 8.098e-05, + "loss": 0.4218, + "step": 16205 + }, + { + "epoch": 0.9074924403628626, + "grad_norm": 1.3910261392593384, + "learning_rate": 8.0985e-05, + "loss": 0.5399, + "step": 16206 + }, + { + "epoch": 0.9075484376749916, + "grad_norm": 1.693255066871643, + "learning_rate": 8.099e-05, + "loss": 0.5294, + "step": 16207 + }, + { + "epoch": 0.9076044349871206, + "grad_norm": 1.3666377067565918, + "learning_rate": 8.099499999999999e-05, + "loss": 0.4751, + 
"step": 16208 + }, + { + "epoch": 0.9076604322992496, + "grad_norm": 9.84433650970459, + "learning_rate": 8.1e-05, + "loss": 0.512, + "step": 16209 + }, + { + "epoch": 0.9077164296113787, + "grad_norm": 1.3591711521148682, + "learning_rate": 8.100500000000001e-05, + "loss": 0.4619, + "step": 16210 + }, + { + "epoch": 0.9077724269235077, + "grad_norm": 1.3341845273971558, + "learning_rate": 8.101000000000001e-05, + "loss": 0.4369, + "step": 16211 + }, + { + "epoch": 0.9078284242356367, + "grad_norm": 1.8678618669509888, + "learning_rate": 8.101500000000001e-05, + "loss": 0.4691, + "step": 16212 + }, + { + "epoch": 0.9078844215477657, + "grad_norm": 1.4997316598892212, + "learning_rate": 8.102000000000001e-05, + "loss": 0.5044, + "step": 16213 + }, + { + "epoch": 0.9079404188598947, + "grad_norm": 1.2690948247909546, + "learning_rate": 8.1025e-05, + "loss": 0.3845, + "step": 16214 + }, + { + "epoch": 0.9079964161720238, + "grad_norm": 1.580435037612915, + "learning_rate": 8.103e-05, + "loss": 0.5397, + "step": 16215 + }, + { + "epoch": 0.9080524134841528, + "grad_norm": 1.1663674116134644, + "learning_rate": 8.1035e-05, + "loss": 0.4878, + "step": 16216 + }, + { + "epoch": 0.9081084107962818, + "grad_norm": 1.1223293542861938, + "learning_rate": 8.104000000000001e-05, + "loss": 0.3977, + "step": 16217 + }, + { + "epoch": 0.9081644081084108, + "grad_norm": 1.6381984949111938, + "learning_rate": 8.104500000000001e-05, + "loss": 0.5847, + "step": 16218 + }, + { + "epoch": 0.9082204054205398, + "grad_norm": 1.454047679901123, + "learning_rate": 8.105e-05, + "loss": 0.4213, + "step": 16219 + }, + { + "epoch": 0.9082764027326689, + "grad_norm": 2.117156982421875, + "learning_rate": 8.1055e-05, + "loss": 0.4909, + "step": 16220 + }, + { + "epoch": 0.9083324000447979, + "grad_norm": 1.3888218402862549, + "learning_rate": 8.106e-05, + "loss": 0.3746, + "step": 16221 + }, + { + "epoch": 0.9083883973569269, + "grad_norm": 1.1721341609954834, + "learning_rate": 8.1065e-05, + 
"loss": 0.3608, + "step": 16222 + }, + { + "epoch": 0.9084443946690559, + "grad_norm": 1.574838399887085, + "learning_rate": 8.107000000000001e-05, + "loss": 0.5261, + "step": 16223 + }, + { + "epoch": 0.9085003919811849, + "grad_norm": 1.385015845298767, + "learning_rate": 8.1075e-05, + "loss": 0.4958, + "step": 16224 + }, + { + "epoch": 0.908556389293314, + "grad_norm": 1.306646704673767, + "learning_rate": 8.108e-05, + "loss": 0.4316, + "step": 16225 + }, + { + "epoch": 0.908612386605443, + "grad_norm": 1.5103237628936768, + "learning_rate": 8.1085e-05, + "loss": 0.6305, + "step": 16226 + }, + { + "epoch": 0.908668383917572, + "grad_norm": 1.5798276662826538, + "learning_rate": 8.109e-05, + "loss": 0.5126, + "step": 16227 + }, + { + "epoch": 0.908724381229701, + "grad_norm": 1.7766233682632446, + "learning_rate": 8.1095e-05, + "loss": 0.6594, + "step": 16228 + }, + { + "epoch": 0.90878037854183, + "grad_norm": 1.152177333831787, + "learning_rate": 8.11e-05, + "loss": 0.3107, + "step": 16229 + }, + { + "epoch": 0.908836375853959, + "grad_norm": 1.5890731811523438, + "learning_rate": 8.1105e-05, + "loss": 0.439, + "step": 16230 + }, + { + "epoch": 0.9088923731660881, + "grad_norm": 1.157092809677124, + "learning_rate": 8.111000000000001e-05, + "loss": 0.4707, + "step": 16231 + }, + { + "epoch": 0.9089483704782171, + "grad_norm": 1.3347845077514648, + "learning_rate": 8.111500000000001e-05, + "loss": 0.4352, + "step": 16232 + }, + { + "epoch": 0.9090043677903461, + "grad_norm": 1.407378911972046, + "learning_rate": 8.112000000000001e-05, + "loss": 0.3694, + "step": 16233 + }, + { + "epoch": 0.9090603651024751, + "grad_norm": 1.2295944690704346, + "learning_rate": 8.112500000000001e-05, + "loss": 0.3802, + "step": 16234 + }, + { + "epoch": 0.9091163624146041, + "grad_norm": 1.3493835926055908, + "learning_rate": 8.113e-05, + "loss": 0.4413, + "step": 16235 + }, + { + "epoch": 0.9091723597267332, + "grad_norm": 1.043889045715332, + "learning_rate": 8.1135e-05, + 
"loss": 0.4428, + "step": 16236 + }, + { + "epoch": 0.9092283570388622, + "grad_norm": 1.533015489578247, + "learning_rate": 8.114e-05, + "loss": 0.5359, + "step": 16237 + }, + { + "epoch": 0.9092843543509912, + "grad_norm": 1.045485258102417, + "learning_rate": 8.114500000000001e-05, + "loss": 0.3455, + "step": 16238 + }, + { + "epoch": 0.9093403516631202, + "grad_norm": 1.2184590101242065, + "learning_rate": 8.115000000000001e-05, + "loss": 0.3954, + "step": 16239 + }, + { + "epoch": 0.9093963489752492, + "grad_norm": 1.220358967781067, + "learning_rate": 8.1155e-05, + "loss": 0.3347, + "step": 16240 + }, + { + "epoch": 0.9094523462873783, + "grad_norm": 1.1589940786361694, + "learning_rate": 8.116e-05, + "loss": 0.5209, + "step": 16241 + }, + { + "epoch": 0.9095083435995073, + "grad_norm": 1.555674433708191, + "learning_rate": 8.1165e-05, + "loss": 0.5367, + "step": 16242 + }, + { + "epoch": 0.9095643409116363, + "grad_norm": 1.1752678155899048, + "learning_rate": 8.117e-05, + "loss": 0.4052, + "step": 16243 + }, + { + "epoch": 0.9096203382237653, + "grad_norm": 1.2515757083892822, + "learning_rate": 8.1175e-05, + "loss": 0.4064, + "step": 16244 + }, + { + "epoch": 0.9096763355358943, + "grad_norm": 1.4909850358963013, + "learning_rate": 8.118e-05, + "loss": 0.4566, + "step": 16245 + }, + { + "epoch": 0.9097323328480232, + "grad_norm": 1.183108925819397, + "learning_rate": 8.1185e-05, + "loss": 0.43, + "step": 16246 + }, + { + "epoch": 0.9097883301601523, + "grad_norm": 1.329343318939209, + "learning_rate": 8.119e-05, + "loss": 0.4417, + "step": 16247 + }, + { + "epoch": 0.9098443274722813, + "grad_norm": 1.2194442749023438, + "learning_rate": 8.1195e-05, + "loss": 0.403, + "step": 16248 + }, + { + "epoch": 0.9099003247844103, + "grad_norm": 1.368298888206482, + "learning_rate": 8.120000000000001e-05, + "loss": 0.4122, + "step": 16249 + }, + { + "epoch": 0.9099563220965393, + "grad_norm": 1.3849939107894897, + "learning_rate": 8.1205e-05, + "loss": 0.3577, + 
"step": 16250 + }, + { + "epoch": 0.9100123194086683, + "grad_norm": 1.4291472434997559, + "learning_rate": 8.121e-05, + "loss": 0.5803, + "step": 16251 + }, + { + "epoch": 0.9100683167207974, + "grad_norm": 1.1422356367111206, + "learning_rate": 8.121500000000001e-05, + "loss": 0.3736, + "step": 16252 + }, + { + "epoch": 0.9101243140329264, + "grad_norm": 1.247540831565857, + "learning_rate": 8.122000000000001e-05, + "loss": 0.4379, + "step": 16253 + }, + { + "epoch": 0.9101803113450554, + "grad_norm": 1.3040695190429688, + "learning_rate": 8.122500000000001e-05, + "loss": 0.506, + "step": 16254 + }, + { + "epoch": 0.9102363086571844, + "grad_norm": 1.257175087928772, + "learning_rate": 8.123e-05, + "loss": 0.3589, + "step": 16255 + }, + { + "epoch": 0.9102923059693134, + "grad_norm": 1.5271435976028442, + "learning_rate": 8.1235e-05, + "loss": 0.4279, + "step": 16256 + }, + { + "epoch": 0.9103483032814425, + "grad_norm": 1.2693156003952026, + "learning_rate": 8.124e-05, + "loss": 0.4413, + "step": 16257 + }, + { + "epoch": 0.9104043005935715, + "grad_norm": 1.3279603719711304, + "learning_rate": 8.1245e-05, + "loss": 0.4359, + "step": 16258 + }, + { + "epoch": 0.9104602979057005, + "grad_norm": 1.4256255626678467, + "learning_rate": 8.125000000000001e-05, + "loss": 0.4211, + "step": 16259 + }, + { + "epoch": 0.9105162952178295, + "grad_norm": 1.497703194618225, + "learning_rate": 8.125500000000001e-05, + "loss": 0.4449, + "step": 16260 + }, + { + "epoch": 0.9105722925299585, + "grad_norm": 1.348031997680664, + "learning_rate": 8.126e-05, + "loss": 0.3811, + "step": 16261 + }, + { + "epoch": 0.9106282898420875, + "grad_norm": 1.5301815271377563, + "learning_rate": 8.1265e-05, + "loss": 0.6615, + "step": 16262 + }, + { + "epoch": 0.9106842871542166, + "grad_norm": 1.6751320362091064, + "learning_rate": 8.127e-05, + "loss": 0.5039, + "step": 16263 + }, + { + "epoch": 0.9107402844663456, + "grad_norm": 1.3533034324645996, + "learning_rate": 8.1275e-05, + "loss": 
0.379, + "step": 16264 + }, + { + "epoch": 0.9107962817784746, + "grad_norm": 1.2872544527053833, + "learning_rate": 8.128e-05, + "loss": 0.4944, + "step": 16265 + }, + { + "epoch": 0.9108522790906036, + "grad_norm": 1.281751036643982, + "learning_rate": 8.1285e-05, + "loss": 0.4542, + "step": 16266 + }, + { + "epoch": 0.9109082764027326, + "grad_norm": 1.4266624450683594, + "learning_rate": 8.129e-05, + "loss": 0.4478, + "step": 16267 + }, + { + "epoch": 0.9109642737148617, + "grad_norm": 1.7636620998382568, + "learning_rate": 8.1295e-05, + "loss": 0.6262, + "step": 16268 + }, + { + "epoch": 0.9110202710269907, + "grad_norm": 1.3292865753173828, + "learning_rate": 8.13e-05, + "loss": 0.456, + "step": 16269 + }, + { + "epoch": 0.9110762683391197, + "grad_norm": 1.4947370290756226, + "learning_rate": 8.130500000000001e-05, + "loss": 0.5257, + "step": 16270 + }, + { + "epoch": 0.9111322656512487, + "grad_norm": 1.2445459365844727, + "learning_rate": 8.131e-05, + "loss": 0.4555, + "step": 16271 + }, + { + "epoch": 0.9111882629633777, + "grad_norm": 1.5176284313201904, + "learning_rate": 8.1315e-05, + "loss": 0.6189, + "step": 16272 + }, + { + "epoch": 0.9112442602755068, + "grad_norm": 1.2148562669754028, + "learning_rate": 8.132e-05, + "loss": 0.4785, + "step": 16273 + }, + { + "epoch": 0.9113002575876358, + "grad_norm": 1.1544235944747925, + "learning_rate": 8.132500000000001e-05, + "loss": 0.4687, + "step": 16274 + }, + { + "epoch": 0.9113562548997648, + "grad_norm": 1.3045356273651123, + "learning_rate": 8.133000000000001e-05, + "loss": 0.5035, + "step": 16275 + }, + { + "epoch": 0.9114122522118938, + "grad_norm": 1.4231603145599365, + "learning_rate": 8.1335e-05, + "loss": 0.4949, + "step": 16276 + }, + { + "epoch": 0.9114682495240228, + "grad_norm": 1.5219913721084595, + "learning_rate": 8.134e-05, + "loss": 0.5055, + "step": 16277 + }, + { + "epoch": 0.9115242468361519, + "grad_norm": 1.6135090589523315, + "learning_rate": 8.1345e-05, + "loss": 0.5868, + 
"step": 16278 + }, + { + "epoch": 0.9115802441482809, + "grad_norm": 1.9982261657714844, + "learning_rate": 8.135e-05, + "loss": 0.7034, + "step": 16279 + }, + { + "epoch": 0.9116362414604099, + "grad_norm": 1.4176361560821533, + "learning_rate": 8.135500000000001e-05, + "loss": 0.3944, + "step": 16280 + }, + { + "epoch": 0.9116922387725389, + "grad_norm": 1.4432766437530518, + "learning_rate": 8.136000000000001e-05, + "loss": 0.4427, + "step": 16281 + }, + { + "epoch": 0.9117482360846679, + "grad_norm": 1.5466388463974, + "learning_rate": 8.1365e-05, + "loss": 0.4778, + "step": 16282 + }, + { + "epoch": 0.911804233396797, + "grad_norm": 1.649107575416565, + "learning_rate": 8.137e-05, + "loss": 0.546, + "step": 16283 + }, + { + "epoch": 0.911860230708926, + "grad_norm": 1.3462942838668823, + "learning_rate": 8.1375e-05, + "loss": 0.38, + "step": 16284 + }, + { + "epoch": 0.911916228021055, + "grad_norm": 1.2513823509216309, + "learning_rate": 8.138e-05, + "loss": 0.571, + "step": 16285 + }, + { + "epoch": 0.911972225333184, + "grad_norm": 1.1332285404205322, + "learning_rate": 8.1385e-05, + "loss": 0.3993, + "step": 16286 + }, + { + "epoch": 0.912028222645313, + "grad_norm": 1.279436707496643, + "learning_rate": 8.139e-05, + "loss": 0.5434, + "step": 16287 + }, + { + "epoch": 0.912084219957442, + "grad_norm": 1.4637783765792847, + "learning_rate": 8.1395e-05, + "loss": 0.4162, + "step": 16288 + }, + { + "epoch": 0.9121402172695711, + "grad_norm": 1.2769397497177124, + "learning_rate": 8.14e-05, + "loss": 0.4708, + "step": 16289 + }, + { + "epoch": 0.9121962145817001, + "grad_norm": 1.3884077072143555, + "learning_rate": 8.140500000000001e-05, + "loss": 0.4197, + "step": 16290 + }, + { + "epoch": 0.9122522118938291, + "grad_norm": 1.5181313753128052, + "learning_rate": 8.141000000000001e-05, + "loss": 0.4158, + "step": 16291 + }, + { + "epoch": 0.9123082092059581, + "grad_norm": 1.2915698289871216, + "learning_rate": 8.1415e-05, + "loss": 0.4737, + "step": 16292 + 
}, + { + "epoch": 0.9123642065180871, + "grad_norm": 1.173693299293518, + "learning_rate": 8.142e-05, + "loss": 0.3346, + "step": 16293 + }, + { + "epoch": 0.9124202038302162, + "grad_norm": 1.1621513366699219, + "learning_rate": 8.1425e-05, + "loss": 0.4959, + "step": 16294 + }, + { + "epoch": 0.9124762011423452, + "grad_norm": 1.2622287273406982, + "learning_rate": 8.143000000000001e-05, + "loss": 0.3461, + "step": 16295 + }, + { + "epoch": 0.9125321984544742, + "grad_norm": 1.4559603929519653, + "learning_rate": 8.143500000000001e-05, + "loss": 0.5704, + "step": 16296 + }, + { + "epoch": 0.9125881957666032, + "grad_norm": 1.3613815307617188, + "learning_rate": 8.144e-05, + "loss": 0.4548, + "step": 16297 + }, + { + "epoch": 0.9126441930787322, + "grad_norm": 1.4421823024749756, + "learning_rate": 8.1445e-05, + "loss": 0.4864, + "step": 16298 + }, + { + "epoch": 0.9127001903908613, + "grad_norm": 1.381778359413147, + "learning_rate": 8.145e-05, + "loss": 0.5673, + "step": 16299 + }, + { + "epoch": 0.9127561877029903, + "grad_norm": 1.7580300569534302, + "learning_rate": 8.1455e-05, + "loss": 0.4055, + "step": 16300 + }, + { + "epoch": 0.9128121850151193, + "grad_norm": 1.568524956703186, + "learning_rate": 8.146000000000001e-05, + "loss": 0.597, + "step": 16301 + }, + { + "epoch": 0.9128681823272483, + "grad_norm": 1.4807724952697754, + "learning_rate": 8.146500000000001e-05, + "loss": 0.6351, + "step": 16302 + }, + { + "epoch": 0.9129241796393773, + "grad_norm": 1.2681360244750977, + "learning_rate": 8.147e-05, + "loss": 0.3429, + "step": 16303 + }, + { + "epoch": 0.9129801769515064, + "grad_norm": 1.101849913597107, + "learning_rate": 8.1475e-05, + "loss": 0.3718, + "step": 16304 + }, + { + "epoch": 0.9130361742636354, + "grad_norm": 1.3666799068450928, + "learning_rate": 8.148e-05, + "loss": 0.3794, + "step": 16305 + }, + { + "epoch": 0.9130921715757644, + "grad_norm": 1.3681020736694336, + "learning_rate": 8.1485e-05, + "loss": 0.5026, + "step": 16306 + }, + 
{ + "epoch": 0.9131481688878934, + "grad_norm": 1.272249698638916, + "learning_rate": 8.149e-05, + "loss": 0.3594, + "step": 16307 + }, + { + "epoch": 0.9132041662000224, + "grad_norm": 1.4186328649520874, + "learning_rate": 8.1495e-05, + "loss": 0.5804, + "step": 16308 + }, + { + "epoch": 0.9132601635121514, + "grad_norm": 1.6269071102142334, + "learning_rate": 8.15e-05, + "loss": 0.4691, + "step": 16309 + }, + { + "epoch": 0.9133161608242805, + "grad_norm": 1.2336229085922241, + "learning_rate": 8.150500000000001e-05, + "loss": 0.5393, + "step": 16310 + }, + { + "epoch": 0.9133721581364095, + "grad_norm": 1.4158024787902832, + "learning_rate": 8.151000000000001e-05, + "loss": 0.4173, + "step": 16311 + }, + { + "epoch": 0.9134281554485385, + "grad_norm": 1.4624764919281006, + "learning_rate": 8.151500000000001e-05, + "loss": 0.4952, + "step": 16312 + }, + { + "epoch": 0.9134841527606675, + "grad_norm": 1.443229079246521, + "learning_rate": 8.152e-05, + "loss": 0.6761, + "step": 16313 + }, + { + "epoch": 0.9135401500727965, + "grad_norm": 1.2824628353118896, + "learning_rate": 8.1525e-05, + "loss": 0.6185, + "step": 16314 + }, + { + "epoch": 0.9135961473849256, + "grad_norm": 1.2834609746932983, + "learning_rate": 8.153e-05, + "loss": 0.4322, + "step": 16315 + }, + { + "epoch": 0.9136521446970546, + "grad_norm": 1.5608112812042236, + "learning_rate": 8.153500000000001e-05, + "loss": 0.5059, + "step": 16316 + }, + { + "epoch": 0.9137081420091836, + "grad_norm": 1.5314184427261353, + "learning_rate": 8.154000000000001e-05, + "loss": 0.415, + "step": 16317 + }, + { + "epoch": 0.9137641393213126, + "grad_norm": 1.286604881286621, + "learning_rate": 8.1545e-05, + "loss": 0.5446, + "step": 16318 + }, + { + "epoch": 0.9138201366334416, + "grad_norm": 1.4087798595428467, + "learning_rate": 8.155e-05, + "loss": 0.4123, + "step": 16319 + }, + { + "epoch": 0.9138761339455707, + "grad_norm": 1.2750279903411865, + "learning_rate": 8.1555e-05, + "loss": 0.5283, + "step": 16320 + 
}, + { + "epoch": 0.9139321312576997, + "grad_norm": 1.311269760131836, + "learning_rate": 8.156e-05, + "loss": 0.6315, + "step": 16321 + }, + { + "epoch": 0.9139881285698287, + "grad_norm": 1.63417387008667, + "learning_rate": 8.1565e-05, + "loss": 0.4848, + "step": 16322 + }, + { + "epoch": 0.9140441258819577, + "grad_norm": 1.385966181755066, + "learning_rate": 8.157e-05, + "loss": 0.4425, + "step": 16323 + }, + { + "epoch": 0.9141001231940867, + "grad_norm": 1.4401918649673462, + "learning_rate": 8.1575e-05, + "loss": 0.4885, + "step": 16324 + }, + { + "epoch": 0.9141561205062158, + "grad_norm": 1.2513331174850464, + "learning_rate": 8.158e-05, + "loss": 0.4367, + "step": 16325 + }, + { + "epoch": 0.9142121178183448, + "grad_norm": 1.186728596687317, + "learning_rate": 8.1585e-05, + "loss": 0.5121, + "step": 16326 + }, + { + "epoch": 0.9142681151304738, + "grad_norm": 1.219286322593689, + "learning_rate": 8.159e-05, + "loss": 0.3755, + "step": 16327 + }, + { + "epoch": 0.9143241124426028, + "grad_norm": 1.8178082704544067, + "learning_rate": 8.1595e-05, + "loss": 0.3789, + "step": 16328 + }, + { + "epoch": 0.9143801097547317, + "grad_norm": 1.340780258178711, + "learning_rate": 8.16e-05, + "loss": 0.4837, + "step": 16329 + }, + { + "epoch": 0.9144361070668607, + "grad_norm": 1.390148401260376, + "learning_rate": 8.160500000000002e-05, + "loss": 0.4514, + "step": 16330 + }, + { + "epoch": 0.9144921043789898, + "grad_norm": 1.273046612739563, + "learning_rate": 8.161000000000001e-05, + "loss": 0.4115, + "step": 16331 + }, + { + "epoch": 0.9145481016911188, + "grad_norm": 1.2908871173858643, + "learning_rate": 8.161500000000001e-05, + "loss": 0.4347, + "step": 16332 + }, + { + "epoch": 0.9146040990032478, + "grad_norm": 1.2404327392578125, + "learning_rate": 8.162000000000001e-05, + "loss": 0.3864, + "step": 16333 + }, + { + "epoch": 0.9146600963153768, + "grad_norm": 1.2438278198242188, + "learning_rate": 8.1625e-05, + "loss": 0.4607, + "step": 16334 + }, + { + 
"epoch": 0.9147160936275058, + "grad_norm": 1.335219383239746, + "learning_rate": 8.163e-05, + "loss": 0.4362, + "step": 16335 + }, + { + "epoch": 0.9147720909396349, + "grad_norm": 1.4740371704101562, + "learning_rate": 8.1635e-05, + "loss": 0.4317, + "step": 16336 + }, + { + "epoch": 0.9148280882517639, + "grad_norm": 1.3565070629119873, + "learning_rate": 8.164000000000001e-05, + "loss": 0.4093, + "step": 16337 + }, + { + "epoch": 0.9148840855638929, + "grad_norm": 1.1543887853622437, + "learning_rate": 8.164500000000001e-05, + "loss": 0.4376, + "step": 16338 + }, + { + "epoch": 0.9149400828760219, + "grad_norm": 1.3757209777832031, + "learning_rate": 8.165e-05, + "loss": 0.356, + "step": 16339 + }, + { + "epoch": 0.9149960801881509, + "grad_norm": 1.2366504669189453, + "learning_rate": 8.1655e-05, + "loss": 0.3948, + "step": 16340 + }, + { + "epoch": 0.91505207750028, + "grad_norm": 1.3505820035934448, + "learning_rate": 8.166e-05, + "loss": 0.4726, + "step": 16341 + }, + { + "epoch": 0.915108074812409, + "grad_norm": 1.359368920326233, + "learning_rate": 8.1665e-05, + "loss": 0.4518, + "step": 16342 + }, + { + "epoch": 0.915164072124538, + "grad_norm": 1.362165093421936, + "learning_rate": 8.167e-05, + "loss": 0.5659, + "step": 16343 + }, + { + "epoch": 0.915220069436667, + "grad_norm": 1.3214900493621826, + "learning_rate": 8.1675e-05, + "loss": 0.4294, + "step": 16344 + }, + { + "epoch": 0.915276066748796, + "grad_norm": 1.562992811203003, + "learning_rate": 8.168e-05, + "loss": 0.4319, + "step": 16345 + }, + { + "epoch": 0.915332064060925, + "grad_norm": 1.7184698581695557, + "learning_rate": 8.1685e-05, + "loss": 0.5534, + "step": 16346 + }, + { + "epoch": 0.9153880613730541, + "grad_norm": 1.6614658832550049, + "learning_rate": 8.169e-05, + "loss": 0.5746, + "step": 16347 + }, + { + "epoch": 0.9154440586851831, + "grad_norm": 1.4695688486099243, + "learning_rate": 8.1695e-05, + "loss": 0.5482, + "step": 16348 + }, + { + "epoch": 0.9155000559973121, + 
"grad_norm": 1.4225592613220215, + "learning_rate": 8.17e-05, + "loss": 0.5707, + "step": 16349 + }, + { + "epoch": 0.9155560533094411, + "grad_norm": 1.2848695516586304, + "learning_rate": 8.1705e-05, + "loss": 0.344, + "step": 16350 + }, + { + "epoch": 0.9156120506215701, + "grad_norm": 1.327746868133545, + "learning_rate": 8.171e-05, + "loss": 0.5246, + "step": 16351 + }, + { + "epoch": 0.9156680479336992, + "grad_norm": 1.410145878791809, + "learning_rate": 8.171500000000001e-05, + "loss": 0.5809, + "step": 16352 + }, + { + "epoch": 0.9157240452458282, + "grad_norm": 1.1476210355758667, + "learning_rate": 8.172000000000001e-05, + "loss": 0.3379, + "step": 16353 + }, + { + "epoch": 0.9157800425579572, + "grad_norm": 1.2684223651885986, + "learning_rate": 8.172500000000001e-05, + "loss": 0.4586, + "step": 16354 + }, + { + "epoch": 0.9158360398700862, + "grad_norm": 1.221681833267212, + "learning_rate": 8.173e-05, + "loss": 0.4008, + "step": 16355 + }, + { + "epoch": 0.9158920371822152, + "grad_norm": 1.29313325881958, + "learning_rate": 8.1735e-05, + "loss": 0.3409, + "step": 16356 + }, + { + "epoch": 0.9159480344943443, + "grad_norm": 1.157299518585205, + "learning_rate": 8.174e-05, + "loss": 0.4208, + "step": 16357 + }, + { + "epoch": 0.9160040318064733, + "grad_norm": 1.4694918394088745, + "learning_rate": 8.174500000000001e-05, + "loss": 0.6095, + "step": 16358 + }, + { + "epoch": 0.9160600291186023, + "grad_norm": 1.538213849067688, + "learning_rate": 8.175000000000001e-05, + "loss": 0.4755, + "step": 16359 + }, + { + "epoch": 0.9161160264307313, + "grad_norm": 1.3335171937942505, + "learning_rate": 8.1755e-05, + "loss": 0.4496, + "step": 16360 + }, + { + "epoch": 0.9161720237428603, + "grad_norm": 1.4281189441680908, + "learning_rate": 8.176e-05, + "loss": 0.4759, + "step": 16361 + }, + { + "epoch": 0.9162280210549893, + "grad_norm": 1.1768161058425903, + "learning_rate": 8.1765e-05, + "loss": 0.3872, + "step": 16362 + }, + { + "epoch": 0.9162840183671184, 
+ "grad_norm": 1.4018878936767578, + "learning_rate": 8.177e-05, + "loss": 0.4976, + "step": 16363 + }, + { + "epoch": 0.9163400156792474, + "grad_norm": 1.1655353307724, + "learning_rate": 8.1775e-05, + "loss": 0.4554, + "step": 16364 + }, + { + "epoch": 0.9163960129913764, + "grad_norm": 1.2682573795318604, + "learning_rate": 8.178e-05, + "loss": 0.3436, + "step": 16365 + }, + { + "epoch": 0.9164520103035054, + "grad_norm": 1.6166516542434692, + "learning_rate": 8.1785e-05, + "loss": 0.4677, + "step": 16366 + }, + { + "epoch": 0.9165080076156344, + "grad_norm": 1.3583062887191772, + "learning_rate": 8.179e-05, + "loss": 0.4421, + "step": 16367 + }, + { + "epoch": 0.9165640049277635, + "grad_norm": 1.5479578971862793, + "learning_rate": 8.1795e-05, + "loss": 0.6007, + "step": 16368 + }, + { + "epoch": 0.9166200022398925, + "grad_norm": 1.1865326166152954, + "learning_rate": 8.18e-05, + "loss": 0.4101, + "step": 16369 + }, + { + "epoch": 0.9166759995520215, + "grad_norm": 1.5260560512542725, + "learning_rate": 8.180500000000001e-05, + "loss": 0.5892, + "step": 16370 + }, + { + "epoch": 0.9167319968641505, + "grad_norm": 1.3571499586105347, + "learning_rate": 8.181e-05, + "loss": 0.3783, + "step": 16371 + }, + { + "epoch": 0.9167879941762795, + "grad_norm": 1.2588247060775757, + "learning_rate": 8.1815e-05, + "loss": 0.4172, + "step": 16372 + }, + { + "epoch": 0.9168439914884086, + "grad_norm": 1.2398922443389893, + "learning_rate": 8.182000000000001e-05, + "loss": 0.4312, + "step": 16373 + }, + { + "epoch": 0.9168999888005376, + "grad_norm": 1.4239126443862915, + "learning_rate": 8.182500000000001e-05, + "loss": 0.428, + "step": 16374 + }, + { + "epoch": 0.9169559861126666, + "grad_norm": 1.1611047983169556, + "learning_rate": 8.183000000000001e-05, + "loss": 0.3953, + "step": 16375 + }, + { + "epoch": 0.9170119834247956, + "grad_norm": 1.212927222251892, + "learning_rate": 8.1835e-05, + "loss": 0.4476, + "step": 16376 + }, + { + "epoch": 0.9170679807369246, + 
"grad_norm": 1.2236628532409668, + "learning_rate": 8.184e-05, + "loss": 0.4802, + "step": 16377 + }, + { + "epoch": 0.9171239780490537, + "grad_norm": 1.3253746032714844, + "learning_rate": 8.1845e-05, + "loss": 0.5255, + "step": 16378 + }, + { + "epoch": 0.9171799753611827, + "grad_norm": 1.1778866052627563, + "learning_rate": 8.185000000000001e-05, + "loss": 0.464, + "step": 16379 + }, + { + "epoch": 0.9172359726733117, + "grad_norm": 1.138843059539795, + "learning_rate": 8.185500000000001e-05, + "loss": 0.331, + "step": 16380 + }, + { + "epoch": 0.9172919699854407, + "grad_norm": 1.3189870119094849, + "learning_rate": 8.186e-05, + "loss": 0.397, + "step": 16381 + }, + { + "epoch": 0.9173479672975697, + "grad_norm": 1.7547982931137085, + "learning_rate": 8.1865e-05, + "loss": 0.5269, + "step": 16382 + }, + { + "epoch": 0.9174039646096988, + "grad_norm": 1.3204976320266724, + "learning_rate": 8.187e-05, + "loss": 0.3563, + "step": 16383 + }, + { + "epoch": 0.9174599619218278, + "grad_norm": 1.531217098236084, + "learning_rate": 8.1875e-05, + "loss": 0.6081, + "step": 16384 + }, + { + "epoch": 0.9175159592339568, + "grad_norm": 1.2082085609436035, + "learning_rate": 8.188e-05, + "loss": 0.3193, + "step": 16385 + }, + { + "epoch": 0.9175719565460858, + "grad_norm": 1.1349927186965942, + "learning_rate": 8.1885e-05, + "loss": 0.3463, + "step": 16386 + }, + { + "epoch": 0.9176279538582148, + "grad_norm": 1.5495916604995728, + "learning_rate": 8.189e-05, + "loss": 0.4939, + "step": 16387 + }, + { + "epoch": 0.9176839511703438, + "grad_norm": 1.2847042083740234, + "learning_rate": 8.1895e-05, + "loss": 0.5711, + "step": 16388 + }, + { + "epoch": 0.9177399484824729, + "grad_norm": 1.3056803941726685, + "learning_rate": 8.19e-05, + "loss": 0.3994, + "step": 16389 + }, + { + "epoch": 0.9177959457946019, + "grad_norm": 1.2236343622207642, + "learning_rate": 8.1905e-05, + "loss": 0.3643, + "step": 16390 + }, + { + "epoch": 0.9178519431067309, + "grad_norm": 
1.272200584411621, + "learning_rate": 8.191000000000001e-05, + "loss": 0.4945, + "step": 16391 + }, + { + "epoch": 0.9179079404188599, + "grad_norm": 1.504387378692627, + "learning_rate": 8.1915e-05, + "loss": 0.4055, + "step": 16392 + }, + { + "epoch": 0.9179639377309889, + "grad_norm": 1.4934040307998657, + "learning_rate": 8.192e-05, + "loss": 0.4389, + "step": 16393 + }, + { + "epoch": 0.918019935043118, + "grad_norm": 1.3719617128372192, + "learning_rate": 8.192500000000001e-05, + "loss": 0.4529, + "step": 16394 + }, + { + "epoch": 0.918075932355247, + "grad_norm": 1.6728545427322388, + "learning_rate": 8.193000000000001e-05, + "loss": 0.4228, + "step": 16395 + }, + { + "epoch": 0.918131929667376, + "grad_norm": 1.1509687900543213, + "learning_rate": 8.193500000000001e-05, + "loss": 0.341, + "step": 16396 + }, + { + "epoch": 0.918187926979505, + "grad_norm": 1.6074631214141846, + "learning_rate": 8.194e-05, + "loss": 0.5055, + "step": 16397 + }, + { + "epoch": 0.918243924291634, + "grad_norm": 1.5949623584747314, + "learning_rate": 8.1945e-05, + "loss": 0.4419, + "step": 16398 + }, + { + "epoch": 0.9182999216037631, + "grad_norm": 1.4974658489227295, + "learning_rate": 8.195e-05, + "loss": 0.4307, + "step": 16399 + }, + { + "epoch": 0.9183559189158921, + "grad_norm": 1.205846905708313, + "learning_rate": 8.195500000000001e-05, + "loss": 0.4287, + "step": 16400 + }, + { + "epoch": 0.9184119162280211, + "grad_norm": 1.346282958984375, + "learning_rate": 8.196000000000001e-05, + "loss": 0.4806, + "step": 16401 + }, + { + "epoch": 0.9184679135401501, + "grad_norm": 1.577929139137268, + "learning_rate": 8.1965e-05, + "loss": 0.5559, + "step": 16402 + }, + { + "epoch": 0.9185239108522791, + "grad_norm": 1.194756269454956, + "learning_rate": 8.197e-05, + "loss": 0.42, + "step": 16403 + }, + { + "epoch": 0.9185799081644082, + "grad_norm": 1.3071774244308472, + "learning_rate": 8.1975e-05, + "loss": 0.4382, + "step": 16404 + }, + { + "epoch": 0.9186359054765372, + 
"grad_norm": 1.2871538400650024, + "learning_rate": 8.198e-05, + "loss": 0.5422, + "step": 16405 + }, + { + "epoch": 0.9186919027886662, + "grad_norm": 1.2749103307724, + "learning_rate": 8.1985e-05, + "loss": 0.5609, + "step": 16406 + }, + { + "epoch": 0.9187479001007952, + "grad_norm": 1.362830638885498, + "learning_rate": 8.199e-05, + "loss": 0.5608, + "step": 16407 + }, + { + "epoch": 0.9188038974129242, + "grad_norm": 1.5428667068481445, + "learning_rate": 8.1995e-05, + "loss": 0.4483, + "step": 16408 + }, + { + "epoch": 0.9188598947250533, + "grad_norm": 1.3721121549606323, + "learning_rate": 8.2e-05, + "loss": 0.4578, + "step": 16409 + }, + { + "epoch": 0.9189158920371823, + "grad_norm": 1.4253727197647095, + "learning_rate": 8.2005e-05, + "loss": 0.4589, + "step": 16410 + }, + { + "epoch": 0.9189718893493113, + "grad_norm": 1.0003160238265991, + "learning_rate": 8.201000000000001e-05, + "loss": 0.3628, + "step": 16411 + }, + { + "epoch": 0.9190278866614402, + "grad_norm": 1.298473834991455, + "learning_rate": 8.2015e-05, + "loss": 0.3898, + "step": 16412 + }, + { + "epoch": 0.9190838839735692, + "grad_norm": 1.5291424989700317, + "learning_rate": 8.202e-05, + "loss": 0.5072, + "step": 16413 + }, + { + "epoch": 0.9191398812856982, + "grad_norm": 1.2101683616638184, + "learning_rate": 8.2025e-05, + "loss": 0.4064, + "step": 16414 + }, + { + "epoch": 0.9191958785978273, + "grad_norm": 1.035366415977478, + "learning_rate": 8.203000000000001e-05, + "loss": 0.3954, + "step": 16415 + }, + { + "epoch": 0.9192518759099563, + "grad_norm": 1.2232056856155396, + "learning_rate": 8.203500000000001e-05, + "loss": 0.3527, + "step": 16416 + }, + { + "epoch": 0.9193078732220853, + "grad_norm": 1.4133915901184082, + "learning_rate": 8.204000000000001e-05, + "loss": 0.4413, + "step": 16417 + }, + { + "epoch": 0.9193638705342143, + "grad_norm": 1.235101580619812, + "learning_rate": 8.2045e-05, + "loss": 0.3635, + "step": 16418 + }, + { + "epoch": 0.9194198678463433, + 
"grad_norm": 1.292309045791626, + "learning_rate": 8.205e-05, + "loss": 0.5364, + "step": 16419 + }, + { + "epoch": 0.9194758651584723, + "grad_norm": 1.4182237386703491, + "learning_rate": 8.2055e-05, + "loss": 0.3983, + "step": 16420 + }, + { + "epoch": 0.9195318624706014, + "grad_norm": 1.3632667064666748, + "learning_rate": 8.206e-05, + "loss": 0.4713, + "step": 16421 + }, + { + "epoch": 0.9195878597827304, + "grad_norm": 1.3614498376846313, + "learning_rate": 8.206500000000001e-05, + "loss": 0.4299, + "step": 16422 + }, + { + "epoch": 0.9196438570948594, + "grad_norm": 0.9320577383041382, + "learning_rate": 8.207e-05, + "loss": 0.2702, + "step": 16423 + }, + { + "epoch": 0.9196998544069884, + "grad_norm": 1.6794301271438599, + "learning_rate": 8.2075e-05, + "loss": 0.4165, + "step": 16424 + }, + { + "epoch": 0.9197558517191174, + "grad_norm": 1.3629662990570068, + "learning_rate": 8.208e-05, + "loss": 0.39, + "step": 16425 + }, + { + "epoch": 0.9198118490312465, + "grad_norm": 1.3306039571762085, + "learning_rate": 8.2085e-05, + "loss": 0.3896, + "step": 16426 + }, + { + "epoch": 0.9198678463433755, + "grad_norm": 1.297487497329712, + "learning_rate": 8.209e-05, + "loss": 0.4326, + "step": 16427 + }, + { + "epoch": 0.9199238436555045, + "grad_norm": 1.2556846141815186, + "learning_rate": 8.2095e-05, + "loss": 0.4173, + "step": 16428 + }, + { + "epoch": 0.9199798409676335, + "grad_norm": 1.4238240718841553, + "learning_rate": 8.21e-05, + "loss": 0.4721, + "step": 16429 + }, + { + "epoch": 0.9200358382797625, + "grad_norm": 1.0150232315063477, + "learning_rate": 8.2105e-05, + "loss": 0.3388, + "step": 16430 + }, + { + "epoch": 0.9200918355918916, + "grad_norm": 1.9842538833618164, + "learning_rate": 8.211000000000001e-05, + "loss": 0.5868, + "step": 16431 + }, + { + "epoch": 0.9201478329040206, + "grad_norm": 1.5507006645202637, + "learning_rate": 8.211500000000001e-05, + "loss": 0.5215, + "step": 16432 + }, + { + "epoch": 0.9202038302161496, + "grad_norm": 
1.2344601154327393, + "learning_rate": 8.212e-05, + "loss": 0.5651, + "step": 16433 + }, + { + "epoch": 0.9202598275282786, + "grad_norm": 2.3398501873016357, + "learning_rate": 8.2125e-05, + "loss": 0.5618, + "step": 16434 + }, + { + "epoch": 0.9203158248404076, + "grad_norm": 1.16732919216156, + "learning_rate": 8.213e-05, + "loss": 0.5229, + "step": 16435 + }, + { + "epoch": 0.9203718221525367, + "grad_norm": 1.2803624868392944, + "learning_rate": 8.213500000000001e-05, + "loss": 0.4516, + "step": 16436 + }, + { + "epoch": 0.9204278194646657, + "grad_norm": 1.6683622598648071, + "learning_rate": 8.214000000000001e-05, + "loss": 0.5961, + "step": 16437 + }, + { + "epoch": 0.9204838167767947, + "grad_norm": 1.0868639945983887, + "learning_rate": 8.214500000000001e-05, + "loss": 0.4601, + "step": 16438 + }, + { + "epoch": 0.9205398140889237, + "grad_norm": 1.460677981376648, + "learning_rate": 8.215e-05, + "loss": 0.5205, + "step": 16439 + }, + { + "epoch": 0.9205958114010527, + "grad_norm": 1.5984394550323486, + "learning_rate": 8.2155e-05, + "loss": 0.4798, + "step": 16440 + }, + { + "epoch": 0.9206518087131818, + "grad_norm": 1.2280601263046265, + "learning_rate": 8.216e-05, + "loss": 0.5228, + "step": 16441 + }, + { + "epoch": 0.9207078060253108, + "grad_norm": 1.8393840789794922, + "learning_rate": 8.2165e-05, + "loss": 0.4127, + "step": 16442 + }, + { + "epoch": 0.9207638033374398, + "grad_norm": 1.3817163705825806, + "learning_rate": 8.217000000000001e-05, + "loss": 0.4368, + "step": 16443 + }, + { + "epoch": 0.9208198006495688, + "grad_norm": 2.4562225341796875, + "learning_rate": 8.2175e-05, + "loss": 0.6111, + "step": 16444 + }, + { + "epoch": 0.9208757979616978, + "grad_norm": 1.3783890008926392, + "learning_rate": 8.218e-05, + "loss": 0.3892, + "step": 16445 + }, + { + "epoch": 0.9209317952738268, + "grad_norm": 1.3932751417160034, + "learning_rate": 8.2185e-05, + "loss": 0.4043, + "step": 16446 + }, + { + "epoch": 0.9209877925859559, + "grad_norm": 
1.3778951168060303, + "learning_rate": 8.219e-05, + "loss": 0.4582, + "step": 16447 + }, + { + "epoch": 0.9210437898980849, + "grad_norm": 1.257495641708374, + "learning_rate": 8.2195e-05, + "loss": 0.4668, + "step": 16448 + }, + { + "epoch": 0.9210997872102139, + "grad_norm": 1.1436564922332764, + "learning_rate": 8.22e-05, + "loss": 0.3684, + "step": 16449 + }, + { + "epoch": 0.9211557845223429, + "grad_norm": 1.3091379404067993, + "learning_rate": 8.2205e-05, + "loss": 0.398, + "step": 16450 + }, + { + "epoch": 0.9212117818344719, + "grad_norm": 1.2673567533493042, + "learning_rate": 8.221000000000001e-05, + "loss": 0.4482, + "step": 16451 + }, + { + "epoch": 0.921267779146601, + "grad_norm": 1.4292402267456055, + "learning_rate": 8.221500000000001e-05, + "loss": 0.5005, + "step": 16452 + }, + { + "epoch": 0.92132377645873, + "grad_norm": 1.2949180603027344, + "learning_rate": 8.222000000000001e-05, + "loss": 0.3964, + "step": 16453 + }, + { + "epoch": 0.921379773770859, + "grad_norm": 1.2452545166015625, + "learning_rate": 8.2225e-05, + "loss": 0.4522, + "step": 16454 + }, + { + "epoch": 0.921435771082988, + "grad_norm": 1.1206986904144287, + "learning_rate": 8.223e-05, + "loss": 0.3496, + "step": 16455 + }, + { + "epoch": 0.921491768395117, + "grad_norm": 1.346388578414917, + "learning_rate": 8.2235e-05, + "loss": 0.405, + "step": 16456 + }, + { + "epoch": 0.9215477657072461, + "grad_norm": 1.2031829357147217, + "learning_rate": 8.224000000000001e-05, + "loss": 0.3709, + "step": 16457 + }, + { + "epoch": 0.9216037630193751, + "grad_norm": 1.5863611698150635, + "learning_rate": 8.224500000000001e-05, + "loss": 0.556, + "step": 16458 + }, + { + "epoch": 0.9216597603315041, + "grad_norm": 1.2165085077285767, + "learning_rate": 8.225000000000001e-05, + "loss": 0.3987, + "step": 16459 + }, + { + "epoch": 0.9217157576436331, + "grad_norm": 1.18646240234375, + "learning_rate": 8.2255e-05, + "loss": 0.4075, + "step": 16460 + }, + { + "epoch": 0.9217717549557621, + 
"grad_norm": 1.255533218383789, + "learning_rate": 8.226e-05, + "loss": 0.3696, + "step": 16461 + }, + { + "epoch": 0.9218277522678912, + "grad_norm": 1.4369428157806396, + "learning_rate": 8.2265e-05, + "loss": 0.5192, + "step": 16462 + }, + { + "epoch": 0.9218837495800202, + "grad_norm": 1.2948023080825806, + "learning_rate": 8.227e-05, + "loss": 0.4968, + "step": 16463 + }, + { + "epoch": 0.9219397468921492, + "grad_norm": 1.2731658220291138, + "learning_rate": 8.227500000000001e-05, + "loss": 0.455, + "step": 16464 + }, + { + "epoch": 0.9219957442042782, + "grad_norm": 1.2603859901428223, + "learning_rate": 8.228e-05, + "loss": 0.3907, + "step": 16465 + }, + { + "epoch": 0.9220517415164072, + "grad_norm": 1.4726279973983765, + "learning_rate": 8.2285e-05, + "loss": 0.4225, + "step": 16466 + }, + { + "epoch": 0.9221077388285362, + "grad_norm": 1.221868634223938, + "learning_rate": 8.229e-05, + "loss": 0.5006, + "step": 16467 + }, + { + "epoch": 0.9221637361406653, + "grad_norm": 1.2886098623275757, + "learning_rate": 8.2295e-05, + "loss": 0.507, + "step": 16468 + }, + { + "epoch": 0.9222197334527943, + "grad_norm": 1.2557547092437744, + "learning_rate": 8.23e-05, + "loss": 0.4825, + "step": 16469 + }, + { + "epoch": 0.9222757307649233, + "grad_norm": 1.2773897647857666, + "learning_rate": 8.230499999999999e-05, + "loss": 0.3528, + "step": 16470 + }, + { + "epoch": 0.9223317280770523, + "grad_norm": 1.585194706916809, + "learning_rate": 8.231e-05, + "loss": 0.4966, + "step": 16471 + }, + { + "epoch": 0.9223877253891813, + "grad_norm": 1.293116807937622, + "learning_rate": 8.231500000000001e-05, + "loss": 0.653, + "step": 16472 + }, + { + "epoch": 0.9224437227013104, + "grad_norm": 1.1291966438293457, + "learning_rate": 8.232000000000001e-05, + "loss": 0.4101, + "step": 16473 + }, + { + "epoch": 0.9224997200134394, + "grad_norm": 1.5031859874725342, + "learning_rate": 8.232500000000001e-05, + "loss": 0.6122, + "step": 16474 + }, + { + "epoch": 0.9225557173255684, 
+ "grad_norm": 1.3225125074386597, + "learning_rate": 8.233e-05, + "loss": 0.346, + "step": 16475 + }, + { + "epoch": 0.9226117146376974, + "grad_norm": 1.2840086221694946, + "learning_rate": 8.2335e-05, + "loss": 0.395, + "step": 16476 + }, + { + "epoch": 0.9226677119498264, + "grad_norm": 1.2742666006088257, + "learning_rate": 8.234e-05, + "loss": 0.4647, + "step": 16477 + }, + { + "epoch": 0.9227237092619555, + "grad_norm": 1.4567025899887085, + "learning_rate": 8.234500000000001e-05, + "loss": 0.4579, + "step": 16478 + }, + { + "epoch": 0.9227797065740845, + "grad_norm": 1.147629976272583, + "learning_rate": 8.235000000000001e-05, + "loss": 0.4129, + "step": 16479 + }, + { + "epoch": 0.9228357038862135, + "grad_norm": 1.72379469871521, + "learning_rate": 8.235500000000001e-05, + "loss": 0.5756, + "step": 16480 + }, + { + "epoch": 0.9228917011983425, + "grad_norm": 1.4656895399093628, + "learning_rate": 8.236e-05, + "loss": 0.4804, + "step": 16481 + }, + { + "epoch": 0.9229476985104715, + "grad_norm": 1.4475468397140503, + "learning_rate": 8.2365e-05, + "loss": 0.4392, + "step": 16482 + }, + { + "epoch": 0.9230036958226006, + "grad_norm": 1.415730595588684, + "learning_rate": 8.237e-05, + "loss": 0.451, + "step": 16483 + }, + { + "epoch": 0.9230596931347296, + "grad_norm": 1.2868967056274414, + "learning_rate": 8.2375e-05, + "loss": 0.5013, + "step": 16484 + }, + { + "epoch": 0.9231156904468586, + "grad_norm": 1.1820437908172607, + "learning_rate": 8.238000000000001e-05, + "loss": 0.4294, + "step": 16485 + }, + { + "epoch": 0.9231716877589876, + "grad_norm": 1.4844865798950195, + "learning_rate": 8.2385e-05, + "loss": 0.5896, + "step": 16486 + }, + { + "epoch": 0.9232276850711166, + "grad_norm": 1.2885370254516602, + "learning_rate": 8.239e-05, + "loss": 0.443, + "step": 16487 + }, + { + "epoch": 0.9232836823832457, + "grad_norm": 1.252520203590393, + "learning_rate": 8.2395e-05, + "loss": 0.3869, + "step": 16488 + }, + { + "epoch": 0.9233396796953747, + 
"grad_norm": 1.2477056980133057, + "learning_rate": 8.24e-05, + "loss": 0.5054, + "step": 16489 + }, + { + "epoch": 0.9233956770075037, + "grad_norm": 1.156029462814331, + "learning_rate": 8.2405e-05, + "loss": 0.4791, + "step": 16490 + }, + { + "epoch": 0.9234516743196327, + "grad_norm": 1.2756543159484863, + "learning_rate": 8.241e-05, + "loss": 0.4116, + "step": 16491 + }, + { + "epoch": 0.9235076716317617, + "grad_norm": 1.3437877893447876, + "learning_rate": 8.2415e-05, + "loss": 0.4112, + "step": 16492 + }, + { + "epoch": 0.9235636689438907, + "grad_norm": 4.821994304656982, + "learning_rate": 8.242000000000001e-05, + "loss": 0.4393, + "step": 16493 + }, + { + "epoch": 0.9236196662560197, + "grad_norm": 4.62672758102417, + "learning_rate": 8.242500000000001e-05, + "loss": 0.4004, + "step": 16494 + }, + { + "epoch": 0.9236756635681487, + "grad_norm": 1.2945533990859985, + "learning_rate": 8.243000000000001e-05, + "loss": 0.3963, + "step": 16495 + }, + { + "epoch": 0.9237316608802777, + "grad_norm": 1.3269444704055786, + "learning_rate": 8.2435e-05, + "loss": 0.343, + "step": 16496 + }, + { + "epoch": 0.9237876581924067, + "grad_norm": 1.295454740524292, + "learning_rate": 8.244e-05, + "loss": 0.4811, + "step": 16497 + }, + { + "epoch": 0.9238436555045357, + "grad_norm": 1.302964448928833, + "learning_rate": 8.2445e-05, + "loss": 0.5828, + "step": 16498 + }, + { + "epoch": 0.9238996528166648, + "grad_norm": 1.400241494178772, + "learning_rate": 8.245e-05, + "loss": 0.5116, + "step": 16499 + }, + { + "epoch": 0.9239556501287938, + "grad_norm": 1.2892831563949585, + "learning_rate": 8.245500000000001e-05, + "loss": 0.4536, + "step": 16500 + }, + { + "epoch": 0.9240116474409228, + "grad_norm": 1.1384472846984863, + "learning_rate": 8.246e-05, + "loss": 0.3839, + "step": 16501 + }, + { + "epoch": 0.9240676447530518, + "grad_norm": 1.221259355545044, + "learning_rate": 8.2465e-05, + "loss": 0.3242, + "step": 16502 + }, + { + "epoch": 0.9241236420651808, + 
"grad_norm": 1.291494607925415, + "learning_rate": 8.247e-05, + "loss": 0.4998, + "step": 16503 + }, + { + "epoch": 0.9241796393773098, + "grad_norm": 1.2921441793441772, + "learning_rate": 8.2475e-05, + "loss": 0.4613, + "step": 16504 + }, + { + "epoch": 0.9242356366894389, + "grad_norm": 1.4710813760757446, + "learning_rate": 8.248e-05, + "loss": 0.369, + "step": 16505 + }, + { + "epoch": 0.9242916340015679, + "grad_norm": 1.471215009689331, + "learning_rate": 8.248500000000001e-05, + "loss": 0.4822, + "step": 16506 + }, + { + "epoch": 0.9243476313136969, + "grad_norm": 1.4474198818206787, + "learning_rate": 8.249e-05, + "loss": 0.5187, + "step": 16507 + }, + { + "epoch": 0.9244036286258259, + "grad_norm": 1.4380496740341187, + "learning_rate": 8.2495e-05, + "loss": 0.5645, + "step": 16508 + }, + { + "epoch": 0.9244596259379549, + "grad_norm": 1.4628536701202393, + "learning_rate": 8.25e-05, + "loss": 0.4177, + "step": 16509 + }, + { + "epoch": 0.924515623250084, + "grad_norm": 1.2975531816482544, + "learning_rate": 8.2505e-05, + "loss": 0.513, + "step": 16510 + }, + { + "epoch": 0.924571620562213, + "grad_norm": 1.196489930152893, + "learning_rate": 8.251e-05, + "loss": 0.4424, + "step": 16511 + }, + { + "epoch": 0.924627617874342, + "grad_norm": 1.557321548461914, + "learning_rate": 8.2515e-05, + "loss": 0.5826, + "step": 16512 + }, + { + "epoch": 0.924683615186471, + "grad_norm": 1.4021459817886353, + "learning_rate": 8.252e-05, + "loss": 0.681, + "step": 16513 + }, + { + "epoch": 0.9247396124986, + "grad_norm": 1.1830254793167114, + "learning_rate": 8.252500000000001e-05, + "loss": 0.3778, + "step": 16514 + }, + { + "epoch": 0.9247956098107291, + "grad_norm": 1.2348469495773315, + "learning_rate": 8.253000000000001e-05, + "loss": 0.393, + "step": 16515 + }, + { + "epoch": 0.9248516071228581, + "grad_norm": 1.3843134641647339, + "learning_rate": 8.253500000000001e-05, + "loss": 0.4101, + "step": 16516 + }, + { + "epoch": 0.9249076044349871, + "grad_norm": 
1.307335376739502, + "learning_rate": 8.254e-05, + "loss": 0.3418, + "step": 16517 + }, + { + "epoch": 0.9249636017471161, + "grad_norm": 1.4313379526138306, + "learning_rate": 8.2545e-05, + "loss": 0.4476, + "step": 16518 + }, + { + "epoch": 0.9250195990592451, + "grad_norm": 1.4691137075424194, + "learning_rate": 8.255e-05, + "loss": 0.457, + "step": 16519 + }, + { + "epoch": 0.9250755963713742, + "grad_norm": 3.8030176162719727, + "learning_rate": 8.2555e-05, + "loss": 0.4801, + "step": 16520 + }, + { + "epoch": 0.9251315936835032, + "grad_norm": 1.2685890197753906, + "learning_rate": 8.256000000000001e-05, + "loss": 0.4545, + "step": 16521 + }, + { + "epoch": 0.9251875909956322, + "grad_norm": 1.1432857513427734, + "learning_rate": 8.2565e-05, + "loss": 0.3794, + "step": 16522 + }, + { + "epoch": 0.9252435883077612, + "grad_norm": 1.3015574216842651, + "learning_rate": 8.257e-05, + "loss": 0.3819, + "step": 16523 + }, + { + "epoch": 0.9252995856198902, + "grad_norm": 1.5656224489212036, + "learning_rate": 8.2575e-05, + "loss": 0.5617, + "step": 16524 + }, + { + "epoch": 0.9253555829320192, + "grad_norm": 1.4016870260238647, + "learning_rate": 8.258e-05, + "loss": 0.609, + "step": 16525 + }, + { + "epoch": 0.9254115802441483, + "grad_norm": 1.5549887418746948, + "learning_rate": 8.2585e-05, + "loss": 0.4389, + "step": 16526 + }, + { + "epoch": 0.9254675775562773, + "grad_norm": 1.205660343170166, + "learning_rate": 8.259000000000001e-05, + "loss": 0.4418, + "step": 16527 + }, + { + "epoch": 0.9255235748684063, + "grad_norm": 1.5776665210723877, + "learning_rate": 8.2595e-05, + "loss": 0.5697, + "step": 16528 + }, + { + "epoch": 0.9255795721805353, + "grad_norm": 1.3645782470703125, + "learning_rate": 8.26e-05, + "loss": 0.4751, + "step": 16529 + }, + { + "epoch": 0.9256355694926643, + "grad_norm": 1.3706555366516113, + "learning_rate": 8.2605e-05, + "loss": 0.4326, + "step": 16530 + }, + { + "epoch": 0.9256915668047934, + "grad_norm": 1.230024814605713, + 
"learning_rate": 8.261e-05, + "loss": 0.3903, + "step": 16531 + }, + { + "epoch": 0.9257475641169224, + "grad_norm": 1.4342620372772217, + "learning_rate": 8.261500000000001e-05, + "loss": 0.7193, + "step": 16532 + }, + { + "epoch": 0.9258035614290514, + "grad_norm": 1.3708159923553467, + "learning_rate": 8.262e-05, + "loss": 0.4468, + "step": 16533 + }, + { + "epoch": 0.9258595587411804, + "grad_norm": 1.3850219249725342, + "learning_rate": 8.2625e-05, + "loss": 0.5445, + "step": 16534 + }, + { + "epoch": 0.9259155560533094, + "grad_norm": 1.1610344648361206, + "learning_rate": 8.263000000000001e-05, + "loss": 0.3923, + "step": 16535 + }, + { + "epoch": 0.9259715533654385, + "grad_norm": 1.4086459875106812, + "learning_rate": 8.263500000000001e-05, + "loss": 0.5273, + "step": 16536 + }, + { + "epoch": 0.9260275506775675, + "grad_norm": 1.4805762767791748, + "learning_rate": 8.264000000000001e-05, + "loss": 0.4365, + "step": 16537 + }, + { + "epoch": 0.9260835479896965, + "grad_norm": 1.6059681177139282, + "learning_rate": 8.2645e-05, + "loss": 0.5667, + "step": 16538 + }, + { + "epoch": 0.9261395453018255, + "grad_norm": 1.3339136838912964, + "learning_rate": 8.265e-05, + "loss": 0.3483, + "step": 16539 + }, + { + "epoch": 0.9261955426139545, + "grad_norm": 1.4039710760116577, + "learning_rate": 8.2655e-05, + "loss": 0.3865, + "step": 16540 + }, + { + "epoch": 0.9262515399260836, + "grad_norm": 1.1990165710449219, + "learning_rate": 8.266e-05, + "loss": 0.3419, + "step": 16541 + }, + { + "epoch": 0.9263075372382126, + "grad_norm": 1.5688766241073608, + "learning_rate": 8.266500000000001e-05, + "loss": 0.5469, + "step": 16542 + }, + { + "epoch": 0.9263635345503416, + "grad_norm": 1.3112396001815796, + "learning_rate": 8.267e-05, + "loss": 0.4534, + "step": 16543 + }, + { + "epoch": 0.9264195318624706, + "grad_norm": 1.5226678848266602, + "learning_rate": 8.2675e-05, + "loss": 0.5196, + "step": 16544 + }, + { + "epoch": 0.9264755291745996, + "grad_norm": 
1.2306163311004639, + "learning_rate": 8.268e-05, + "loss": 0.4405, + "step": 16545 + }, + { + "epoch": 0.9265315264867287, + "grad_norm": 1.2804595232009888, + "learning_rate": 8.2685e-05, + "loss": 0.4288, + "step": 16546 + }, + { + "epoch": 0.9265875237988577, + "grad_norm": 1.1443030834197998, + "learning_rate": 8.269e-05, + "loss": 0.4405, + "step": 16547 + }, + { + "epoch": 0.9266435211109867, + "grad_norm": 1.2803187370300293, + "learning_rate": 8.269500000000001e-05, + "loss": 0.4594, + "step": 16548 + }, + { + "epoch": 0.9266995184231157, + "grad_norm": 1.303547978401184, + "learning_rate": 8.27e-05, + "loss": 0.3836, + "step": 16549 + }, + { + "epoch": 0.9267555157352447, + "grad_norm": 1.2507973909378052, + "learning_rate": 8.2705e-05, + "loss": 0.5085, + "step": 16550 + }, + { + "epoch": 0.9268115130473737, + "grad_norm": 1.3687591552734375, + "learning_rate": 8.271e-05, + "loss": 0.539, + "step": 16551 + }, + { + "epoch": 0.9268675103595028, + "grad_norm": 1.1975895166397095, + "learning_rate": 8.271500000000001e-05, + "loss": 0.4621, + "step": 16552 + }, + { + "epoch": 0.9269235076716318, + "grad_norm": 1.4076459407806396, + "learning_rate": 8.272000000000001e-05, + "loss": 0.4201, + "step": 16553 + }, + { + "epoch": 0.9269795049837608, + "grad_norm": 1.4311432838439941, + "learning_rate": 8.2725e-05, + "loss": 0.5233, + "step": 16554 + }, + { + "epoch": 0.9270355022958898, + "grad_norm": 1.3195104598999023, + "learning_rate": 8.273e-05, + "loss": 0.4051, + "step": 16555 + }, + { + "epoch": 0.9270914996080188, + "grad_norm": 1.2419683933258057, + "learning_rate": 8.273500000000001e-05, + "loss": 0.4731, + "step": 16556 + }, + { + "epoch": 0.9271474969201479, + "grad_norm": 1.3806172609329224, + "learning_rate": 8.274000000000001e-05, + "loss": 0.5336, + "step": 16557 + }, + { + "epoch": 0.9272034942322769, + "grad_norm": 1.577425241470337, + "learning_rate": 8.274500000000001e-05, + "loss": 0.43, + "step": 16558 + }, + { + "epoch": 0.9272594915444059, 
+ "grad_norm": 1.5444074869155884, + "learning_rate": 8.275e-05, + "loss": 0.4589, + "step": 16559 + }, + { + "epoch": 0.9273154888565349, + "grad_norm": 1.5758018493652344, + "learning_rate": 8.2755e-05, + "loss": 0.5047, + "step": 16560 + }, + { + "epoch": 0.9273714861686639, + "grad_norm": 1.2698769569396973, + "learning_rate": 8.276e-05, + "loss": 0.4111, + "step": 16561 + }, + { + "epoch": 0.927427483480793, + "grad_norm": 1.5544837713241577, + "learning_rate": 8.2765e-05, + "loss": 0.4808, + "step": 16562 + }, + { + "epoch": 0.927483480792922, + "grad_norm": 1.2893900871276855, + "learning_rate": 8.277000000000001e-05, + "loss": 0.4719, + "step": 16563 + }, + { + "epoch": 0.927539478105051, + "grad_norm": 1.3675788640975952, + "learning_rate": 8.2775e-05, + "loss": 0.403, + "step": 16564 + }, + { + "epoch": 0.92759547541718, + "grad_norm": 1.1784521341323853, + "learning_rate": 8.278e-05, + "loss": 0.4722, + "step": 16565 + }, + { + "epoch": 0.927651472729309, + "grad_norm": 1.1031605005264282, + "learning_rate": 8.2785e-05, + "loss": 0.4225, + "step": 16566 + }, + { + "epoch": 0.927707470041438, + "grad_norm": 1.4057633876800537, + "learning_rate": 8.279e-05, + "loss": 0.4327, + "step": 16567 + }, + { + "epoch": 0.9277634673535671, + "grad_norm": 1.7156649827957153, + "learning_rate": 8.2795e-05, + "loss": 0.7462, + "step": 16568 + }, + { + "epoch": 0.9278194646656961, + "grad_norm": 1.4485633373260498, + "learning_rate": 8.28e-05, + "loss": 0.4922, + "step": 16569 + }, + { + "epoch": 0.9278754619778251, + "grad_norm": 1.3397701978683472, + "learning_rate": 8.2805e-05, + "loss": 0.499, + "step": 16570 + }, + { + "epoch": 0.9279314592899541, + "grad_norm": 1.3101890087127686, + "learning_rate": 8.281e-05, + "loss": 0.4668, + "step": 16571 + }, + { + "epoch": 0.9279874566020831, + "grad_norm": 1.1900054216384888, + "learning_rate": 8.281500000000001e-05, + "loss": 0.4898, + "step": 16572 + }, + { + "epoch": 0.9280434539142122, + "grad_norm": 
1.3926546573638916, + "learning_rate": 8.282000000000001e-05, + "loss": 0.5736, + "step": 16573 + }, + { + "epoch": 0.9280994512263412, + "grad_norm": 1.7720388174057007, + "learning_rate": 8.282500000000001e-05, + "loss": 0.4774, + "step": 16574 + }, + { + "epoch": 0.9281554485384702, + "grad_norm": 1.2900577783584595, + "learning_rate": 8.283e-05, + "loss": 0.3573, + "step": 16575 + }, + { + "epoch": 0.9282114458505992, + "grad_norm": 1.13142991065979, + "learning_rate": 8.2835e-05, + "loss": 0.3229, + "step": 16576 + }, + { + "epoch": 0.9282674431627281, + "grad_norm": 1.5441309213638306, + "learning_rate": 8.284000000000001e-05, + "loss": 0.5427, + "step": 16577 + }, + { + "epoch": 0.9283234404748572, + "grad_norm": 1.2781037092208862, + "learning_rate": 8.284500000000001e-05, + "loss": 0.408, + "step": 16578 + }, + { + "epoch": 0.9283794377869862, + "grad_norm": 1.2270132303237915, + "learning_rate": 8.285000000000001e-05, + "loss": 0.3863, + "step": 16579 + }, + { + "epoch": 0.9284354350991152, + "grad_norm": 1.265037178993225, + "learning_rate": 8.2855e-05, + "loss": 0.4642, + "step": 16580 + }, + { + "epoch": 0.9284914324112442, + "grad_norm": 1.491784930229187, + "learning_rate": 8.286e-05, + "loss": 0.5653, + "step": 16581 + }, + { + "epoch": 0.9285474297233732, + "grad_norm": 1.302323341369629, + "learning_rate": 8.2865e-05, + "loss": 0.3473, + "step": 16582 + }, + { + "epoch": 0.9286034270355022, + "grad_norm": 1.445154070854187, + "learning_rate": 8.287e-05, + "loss": 0.4246, + "step": 16583 + }, + { + "epoch": 0.9286594243476313, + "grad_norm": 1.2335448265075684, + "learning_rate": 8.287500000000001e-05, + "loss": 0.4162, + "step": 16584 + }, + { + "epoch": 0.9287154216597603, + "grad_norm": 1.3359366655349731, + "learning_rate": 8.288e-05, + "loss": 0.448, + "step": 16585 + }, + { + "epoch": 0.9287714189718893, + "grad_norm": 1.3777626752853394, + "learning_rate": 8.2885e-05, + "loss": 0.3824, + "step": 16586 + }, + { + "epoch": 0.9288274162840183, 
+ "grad_norm": 1.393613338470459, + "learning_rate": 8.289e-05, + "loss": 0.5083, + "step": 16587 + }, + { + "epoch": 0.9288834135961473, + "grad_norm": 1.359660029411316, + "learning_rate": 8.2895e-05, + "loss": 0.4547, + "step": 16588 + }, + { + "epoch": 0.9289394109082764, + "grad_norm": 1.5443047285079956, + "learning_rate": 8.29e-05, + "loss": 0.3964, + "step": 16589 + }, + { + "epoch": 0.9289954082204054, + "grad_norm": 1.789296269416809, + "learning_rate": 8.290499999999999e-05, + "loss": 0.5635, + "step": 16590 + }, + { + "epoch": 0.9290514055325344, + "grad_norm": 1.4631083011627197, + "learning_rate": 8.291e-05, + "loss": 0.359, + "step": 16591 + }, + { + "epoch": 0.9291074028446634, + "grad_norm": 1.3464879989624023, + "learning_rate": 8.291500000000002e-05, + "loss": 0.5024, + "step": 16592 + }, + { + "epoch": 0.9291634001567924, + "grad_norm": 1.7417012453079224, + "learning_rate": 8.292000000000001e-05, + "loss": 0.5396, + "step": 16593 + }, + { + "epoch": 0.9292193974689215, + "grad_norm": 1.5067992210388184, + "learning_rate": 8.292500000000001e-05, + "loss": 0.4364, + "step": 16594 + }, + { + "epoch": 0.9292753947810505, + "grad_norm": 1.6404314041137695, + "learning_rate": 8.293000000000001e-05, + "loss": 0.5575, + "step": 16595 + }, + { + "epoch": 0.9293313920931795, + "grad_norm": 1.6139005422592163, + "learning_rate": 8.2935e-05, + "loss": 0.6856, + "step": 16596 + }, + { + "epoch": 0.9293873894053085, + "grad_norm": 3.3078603744506836, + "learning_rate": 8.294e-05, + "loss": 0.4122, + "step": 16597 + }, + { + "epoch": 0.9294433867174375, + "grad_norm": 2.09658145904541, + "learning_rate": 8.2945e-05, + "loss": 0.5927, + "step": 16598 + }, + { + "epoch": 0.9294993840295666, + "grad_norm": 1.4565671682357788, + "learning_rate": 8.295000000000001e-05, + "loss": 0.5263, + "step": 16599 + }, + { + "epoch": 0.9295553813416956, + "grad_norm": 1.625476360321045, + "learning_rate": 8.295500000000001e-05, + "loss": 0.6797, + "step": 16600 + }, + { + 
"epoch": 0.9296113786538246, + "grad_norm": 1.2399108409881592, + "learning_rate": 8.296e-05, + "loss": 0.4101, + "step": 16601 + }, + { + "epoch": 0.9296673759659536, + "grad_norm": 1.4441423416137695, + "learning_rate": 8.2965e-05, + "loss": 0.4181, + "step": 16602 + }, + { + "epoch": 0.9297233732780826, + "grad_norm": 1.3000669479370117, + "learning_rate": 8.297e-05, + "loss": 0.3524, + "step": 16603 + }, + { + "epoch": 0.9297793705902117, + "grad_norm": 1.5105751752853394, + "learning_rate": 8.2975e-05, + "loss": 0.5565, + "step": 16604 + }, + { + "epoch": 0.9298353679023407, + "grad_norm": 1.344003677368164, + "learning_rate": 8.298000000000001e-05, + "loss": 0.4283, + "step": 16605 + }, + { + "epoch": 0.9298913652144697, + "grad_norm": 1.0473260879516602, + "learning_rate": 8.2985e-05, + "loss": 0.3195, + "step": 16606 + }, + { + "epoch": 0.9299473625265987, + "grad_norm": 8.235751152038574, + "learning_rate": 8.299e-05, + "loss": 0.3389, + "step": 16607 + }, + { + "epoch": 0.9300033598387277, + "grad_norm": 1.305876612663269, + "learning_rate": 8.2995e-05, + "loss": 0.4715, + "step": 16608 + }, + { + "epoch": 0.9300593571508567, + "grad_norm": 1.1514440774917603, + "learning_rate": 8.3e-05, + "loss": 0.4138, + "step": 16609 + }, + { + "epoch": 0.9301153544629858, + "grad_norm": 1.5276670455932617, + "learning_rate": 8.3005e-05, + "loss": 0.4012, + "step": 16610 + }, + { + "epoch": 0.9301713517751148, + "grad_norm": 1.2899521589279175, + "learning_rate": 8.300999999999999e-05, + "loss": 0.4227, + "step": 16611 + }, + { + "epoch": 0.9302273490872438, + "grad_norm": 1.303055763244629, + "learning_rate": 8.3015e-05, + "loss": 0.5667, + "step": 16612 + }, + { + "epoch": 0.9302833463993728, + "grad_norm": 1.1715697050094604, + "learning_rate": 8.302000000000001e-05, + "loss": 0.4125, + "step": 16613 + }, + { + "epoch": 0.9303393437115018, + "grad_norm": 1.5779167413711548, + "learning_rate": 8.302500000000001e-05, + "loss": 0.5931, + "step": 16614 + }, + { + 
"epoch": 0.9303953410236309, + "grad_norm": 1.1428412199020386, + "learning_rate": 8.303000000000001e-05, + "loss": 0.3914, + "step": 16615 + }, + { + "epoch": 0.9304513383357599, + "grad_norm": 1.3851007223129272, + "learning_rate": 8.303500000000001e-05, + "loss": 0.567, + "step": 16616 + }, + { + "epoch": 0.9305073356478889, + "grad_norm": 1.3200896978378296, + "learning_rate": 8.304e-05, + "loss": 0.3061, + "step": 16617 + }, + { + "epoch": 0.9305633329600179, + "grad_norm": 1.2225103378295898, + "learning_rate": 8.3045e-05, + "loss": 0.2826, + "step": 16618 + }, + { + "epoch": 0.9306193302721469, + "grad_norm": 1.1365717649459839, + "learning_rate": 8.305e-05, + "loss": 0.3784, + "step": 16619 + }, + { + "epoch": 0.930675327584276, + "grad_norm": 1.8045048713684082, + "learning_rate": 8.305500000000001e-05, + "loss": 0.6675, + "step": 16620 + }, + { + "epoch": 0.930731324896405, + "grad_norm": 1.4891531467437744, + "learning_rate": 8.306000000000001e-05, + "loss": 0.4465, + "step": 16621 + }, + { + "epoch": 0.930787322208534, + "grad_norm": 1.1786224842071533, + "learning_rate": 8.3065e-05, + "loss": 0.4102, + "step": 16622 + }, + { + "epoch": 0.930843319520663, + "grad_norm": 1.4224238395690918, + "learning_rate": 8.307e-05, + "loss": 0.4362, + "step": 16623 + }, + { + "epoch": 0.930899316832792, + "grad_norm": 1.1878151893615723, + "learning_rate": 8.3075e-05, + "loss": 0.4253, + "step": 16624 + }, + { + "epoch": 0.930955314144921, + "grad_norm": 1.362742304801941, + "learning_rate": 8.308e-05, + "loss": 0.4014, + "step": 16625 + }, + { + "epoch": 0.9310113114570501, + "grad_norm": 1.4134865999221802, + "learning_rate": 8.308500000000001e-05, + "loss": 0.5373, + "step": 16626 + }, + { + "epoch": 0.9310673087691791, + "grad_norm": 1.3262542486190796, + "learning_rate": 8.309e-05, + "loss": 0.3624, + "step": 16627 + }, + { + "epoch": 0.9311233060813081, + "grad_norm": 1.2941330671310425, + "learning_rate": 8.3095e-05, + "loss": 0.4494, + "step": 16628 + }, + { 
+ "epoch": 0.9311793033934371, + "grad_norm": 1.194333553314209, + "learning_rate": 8.31e-05, + "loss": 0.4017, + "step": 16629 + }, + { + "epoch": 0.9312353007055661, + "grad_norm": 1.6164854764938354, + "learning_rate": 8.3105e-05, + "loss": 0.5246, + "step": 16630 + }, + { + "epoch": 0.9312912980176952, + "grad_norm": 1.2223023176193237, + "learning_rate": 8.311e-05, + "loss": 0.4499, + "step": 16631 + }, + { + "epoch": 0.9313472953298242, + "grad_norm": 1.225380539894104, + "learning_rate": 8.3115e-05, + "loss": 0.446, + "step": 16632 + }, + { + "epoch": 0.9314032926419532, + "grad_norm": 1.4562822580337524, + "learning_rate": 8.312e-05, + "loss": 0.537, + "step": 16633 + }, + { + "epoch": 0.9314592899540822, + "grad_norm": 1.2595573663711548, + "learning_rate": 8.312500000000001e-05, + "loss": 0.4092, + "step": 16634 + }, + { + "epoch": 0.9315152872662112, + "grad_norm": 1.570261001586914, + "learning_rate": 8.313000000000001e-05, + "loss": 0.4921, + "step": 16635 + }, + { + "epoch": 0.9315712845783403, + "grad_norm": 1.8107719421386719, + "learning_rate": 8.313500000000001e-05, + "loss": 0.4667, + "step": 16636 + }, + { + "epoch": 0.9316272818904693, + "grad_norm": 1.3004285097122192, + "learning_rate": 8.314000000000001e-05, + "loss": 0.61, + "step": 16637 + }, + { + "epoch": 0.9316832792025983, + "grad_norm": 1.2579715251922607, + "learning_rate": 8.3145e-05, + "loss": 0.3779, + "step": 16638 + }, + { + "epoch": 0.9317392765147273, + "grad_norm": 1.3704969882965088, + "learning_rate": 8.315e-05, + "loss": 0.4827, + "step": 16639 + }, + { + "epoch": 0.9317952738268563, + "grad_norm": 1.4027427434921265, + "learning_rate": 8.3155e-05, + "loss": 0.3904, + "step": 16640 + }, + { + "epoch": 0.9318512711389854, + "grad_norm": 1.5530561208724976, + "learning_rate": 8.316000000000001e-05, + "loss": 0.6129, + "step": 16641 + }, + { + "epoch": 0.9319072684511144, + "grad_norm": 1.5409584045410156, + "learning_rate": 8.316500000000001e-05, + "loss": 0.575, + "step": 
16642 + }, + { + "epoch": 0.9319632657632434, + "grad_norm": 1.448671817779541, + "learning_rate": 8.317e-05, + "loss": 0.4654, + "step": 16643 + }, + { + "epoch": 0.9320192630753724, + "grad_norm": 1.2616862058639526, + "learning_rate": 8.3175e-05, + "loss": 0.5618, + "step": 16644 + }, + { + "epoch": 0.9320752603875014, + "grad_norm": 1.637648344039917, + "learning_rate": 8.318e-05, + "loss": 0.473, + "step": 16645 + }, + { + "epoch": 0.9321312576996305, + "grad_norm": 1.5618937015533447, + "learning_rate": 8.3185e-05, + "loss": 0.4767, + "step": 16646 + }, + { + "epoch": 0.9321872550117595, + "grad_norm": 1.2212520837783813, + "learning_rate": 8.319e-05, + "loss": 0.4776, + "step": 16647 + }, + { + "epoch": 0.9322432523238885, + "grad_norm": 1.1408857107162476, + "learning_rate": 8.3195e-05, + "loss": 0.3799, + "step": 16648 + }, + { + "epoch": 0.9322992496360175, + "grad_norm": 1.4778286218643188, + "learning_rate": 8.32e-05, + "loss": 0.38, + "step": 16649 + }, + { + "epoch": 0.9323552469481465, + "grad_norm": 1.2437379360198975, + "learning_rate": 8.3205e-05, + "loss": 0.4526, + "step": 16650 + }, + { + "epoch": 0.9324112442602756, + "grad_norm": 5.390810012817383, + "learning_rate": 8.321e-05, + "loss": 0.3792, + "step": 16651 + }, + { + "epoch": 0.9324672415724046, + "grad_norm": 1.3441483974456787, + "learning_rate": 8.3215e-05, + "loss": 0.4469, + "step": 16652 + }, + { + "epoch": 0.9325232388845336, + "grad_norm": 1.81027090549469, + "learning_rate": 8.322e-05, + "loss": 0.4594, + "step": 16653 + }, + { + "epoch": 0.9325792361966626, + "grad_norm": 1.5646641254425049, + "learning_rate": 8.3225e-05, + "loss": 0.5921, + "step": 16654 + }, + { + "epoch": 0.9326352335087916, + "grad_norm": 1.2193459272384644, + "learning_rate": 8.323000000000001e-05, + "loss": 0.3902, + "step": 16655 + }, + { + "epoch": 0.9326912308209206, + "grad_norm": 1.1926816701889038, + "learning_rate": 8.323500000000001e-05, + "loss": 0.4753, + "step": 16656 + }, + { + "epoch": 
0.9327472281330497, + "grad_norm": 1.4411406517028809, + "learning_rate": 8.324000000000001e-05, + "loss": 0.4647, + "step": 16657 + }, + { + "epoch": 0.9328032254451787, + "grad_norm": 1.874061107635498, + "learning_rate": 8.324500000000001e-05, + "loss": 0.5395, + "step": 16658 + }, + { + "epoch": 0.9328592227573077, + "grad_norm": 1.3584909439086914, + "learning_rate": 8.325e-05, + "loss": 0.4404, + "step": 16659 + }, + { + "epoch": 0.9329152200694366, + "grad_norm": 1.2228422164916992, + "learning_rate": 8.3255e-05, + "loss": 0.2974, + "step": 16660 + }, + { + "epoch": 0.9329712173815656, + "grad_norm": 1.4384647607803345, + "learning_rate": 8.326e-05, + "loss": 0.4368, + "step": 16661 + }, + { + "epoch": 0.9330272146936947, + "grad_norm": 1.2427054643630981, + "learning_rate": 8.326500000000001e-05, + "loss": 0.4453, + "step": 16662 + }, + { + "epoch": 0.9330832120058237, + "grad_norm": 1.569198489189148, + "learning_rate": 8.327000000000001e-05, + "loss": 0.5041, + "step": 16663 + }, + { + "epoch": 0.9331392093179527, + "grad_norm": 1.2273691892623901, + "learning_rate": 8.3275e-05, + "loss": 0.4874, + "step": 16664 + }, + { + "epoch": 0.9331952066300817, + "grad_norm": 1.4037173986434937, + "learning_rate": 8.328e-05, + "loss": 0.5313, + "step": 16665 + }, + { + "epoch": 0.9332512039422107, + "grad_norm": 1.7003096342086792, + "learning_rate": 8.3285e-05, + "loss": 0.4736, + "step": 16666 + }, + { + "epoch": 0.9333072012543397, + "grad_norm": 1.7694745063781738, + "learning_rate": 8.329e-05, + "loss": 0.5357, + "step": 16667 + }, + { + "epoch": 0.9333631985664688, + "grad_norm": 1.3882302045822144, + "learning_rate": 8.3295e-05, + "loss": 0.3981, + "step": 16668 + }, + { + "epoch": 0.9334191958785978, + "grad_norm": 1.2965500354766846, + "learning_rate": 8.33e-05, + "loss": 0.4082, + "step": 16669 + }, + { + "epoch": 0.9334751931907268, + "grad_norm": 1.3104054927825928, + "learning_rate": 8.3305e-05, + "loss": 0.4348, + "step": 16670 + }, + { + "epoch": 
0.9335311905028558, + "grad_norm": 1.2672455310821533, + "learning_rate": 8.331e-05, + "loss": 0.3781, + "step": 16671 + }, + { + "epoch": 0.9335871878149848, + "grad_norm": 1.2638646364212036, + "learning_rate": 8.3315e-05, + "loss": 0.5076, + "step": 16672 + }, + { + "epoch": 0.9336431851271139, + "grad_norm": 1.0242950916290283, + "learning_rate": 8.332000000000001e-05, + "loss": 0.5022, + "step": 16673 + }, + { + "epoch": 0.9336991824392429, + "grad_norm": 1.3578529357910156, + "learning_rate": 8.3325e-05, + "loss": 0.5606, + "step": 16674 + }, + { + "epoch": 0.9337551797513719, + "grad_norm": 1.5048909187316895, + "learning_rate": 8.333e-05, + "loss": 0.368, + "step": 16675 + }, + { + "epoch": 0.9338111770635009, + "grad_norm": 1.1758440732955933, + "learning_rate": 8.3335e-05, + "loss": 0.372, + "step": 16676 + }, + { + "epoch": 0.9338671743756299, + "grad_norm": 1.7584550380706787, + "learning_rate": 8.334000000000001e-05, + "loss": 0.6781, + "step": 16677 + }, + { + "epoch": 0.933923171687759, + "grad_norm": 1.4949352741241455, + "learning_rate": 8.334500000000001e-05, + "loss": 0.5093, + "step": 16678 + }, + { + "epoch": 0.933979168999888, + "grad_norm": 1.2756378650665283, + "learning_rate": 8.335e-05, + "loss": 0.4561, + "step": 16679 + }, + { + "epoch": 0.934035166312017, + "grad_norm": 1.7087390422821045, + "learning_rate": 8.3355e-05, + "loss": 0.3433, + "step": 16680 + }, + { + "epoch": 0.934091163624146, + "grad_norm": 1.3154563903808594, + "learning_rate": 8.336e-05, + "loss": 0.5667, + "step": 16681 + }, + { + "epoch": 0.934147160936275, + "grad_norm": 1.68277108669281, + "learning_rate": 8.3365e-05, + "loss": 0.4285, + "step": 16682 + }, + { + "epoch": 0.934203158248404, + "grad_norm": 1.7122166156768799, + "learning_rate": 8.337000000000001e-05, + "loss": 0.5155, + "step": 16683 + }, + { + "epoch": 0.9342591555605331, + "grad_norm": 1.2633670568466187, + "learning_rate": 8.337500000000001e-05, + "loss": 0.434, + "step": 16684 + }, + { + "epoch": 
0.9343151528726621, + "grad_norm": 1.2328050136566162, + "learning_rate": 8.338e-05, + "loss": 0.3519, + "step": 16685 + }, + { + "epoch": 0.9343711501847911, + "grad_norm": 1.4168370962142944, + "learning_rate": 8.3385e-05, + "loss": 0.4409, + "step": 16686 + }, + { + "epoch": 0.9344271474969201, + "grad_norm": 1.3801827430725098, + "learning_rate": 8.339e-05, + "loss": 0.4707, + "step": 16687 + }, + { + "epoch": 0.9344831448090491, + "grad_norm": 1.3615226745605469, + "learning_rate": 8.3395e-05, + "loss": 0.5223, + "step": 16688 + }, + { + "epoch": 0.9345391421211782, + "grad_norm": 1.6777215003967285, + "learning_rate": 8.34e-05, + "loss": 0.6625, + "step": 16689 + }, + { + "epoch": 0.9345951394333072, + "grad_norm": 1.1215518712997437, + "learning_rate": 8.3405e-05, + "loss": 0.4153, + "step": 16690 + }, + { + "epoch": 0.9346511367454362, + "grad_norm": 1.509817361831665, + "learning_rate": 8.341e-05, + "loss": 0.5476, + "step": 16691 + }, + { + "epoch": 0.9347071340575652, + "grad_norm": 1.4509403705596924, + "learning_rate": 8.3415e-05, + "loss": 0.44, + "step": 16692 + }, + { + "epoch": 0.9347631313696942, + "grad_norm": 1.4264498949050903, + "learning_rate": 8.342000000000001e-05, + "loss": 0.5042, + "step": 16693 + }, + { + "epoch": 0.9348191286818233, + "grad_norm": 1.282489538192749, + "learning_rate": 8.342500000000001e-05, + "loss": 0.4117, + "step": 16694 + }, + { + "epoch": 0.9348751259939523, + "grad_norm": 1.2684929370880127, + "learning_rate": 8.343e-05, + "loss": 0.528, + "step": 16695 + }, + { + "epoch": 0.9349311233060813, + "grad_norm": 1.458742380142212, + "learning_rate": 8.3435e-05, + "loss": 0.409, + "step": 16696 + }, + { + "epoch": 0.9349871206182103, + "grad_norm": 1.156498908996582, + "learning_rate": 8.344e-05, + "loss": 0.563, + "step": 16697 + }, + { + "epoch": 0.9350431179303393, + "grad_norm": 1.2255337238311768, + "learning_rate": 8.344500000000001e-05, + "loss": 0.4638, + "step": 16698 + }, + { + "epoch": 0.9350991152424684, + 
"grad_norm": 1.8082275390625, + "learning_rate": 8.345000000000001e-05, + "loss": 0.5849, + "step": 16699 + }, + { + "epoch": 0.9351551125545974, + "grad_norm": 1.4461427927017212, + "learning_rate": 8.3455e-05, + "loss": 0.4325, + "step": 16700 + }, + { + "epoch": 0.9352111098667264, + "grad_norm": 1.1311047077178955, + "learning_rate": 8.346e-05, + "loss": 0.3848, + "step": 16701 + }, + { + "epoch": 0.9352671071788554, + "grad_norm": 1.0894145965576172, + "learning_rate": 8.3465e-05, + "loss": 0.3834, + "step": 16702 + }, + { + "epoch": 0.9353231044909844, + "grad_norm": 3.1649255752563477, + "learning_rate": 8.347e-05, + "loss": 0.3808, + "step": 16703 + }, + { + "epoch": 0.9353791018031135, + "grad_norm": 1.191104769706726, + "learning_rate": 8.347500000000001e-05, + "loss": 0.3527, + "step": 16704 + }, + { + "epoch": 0.9354350991152425, + "grad_norm": 1.2181001901626587, + "learning_rate": 8.348000000000001e-05, + "loss": 0.3814, + "step": 16705 + }, + { + "epoch": 0.9354910964273715, + "grad_norm": 1.340221881866455, + "learning_rate": 8.3485e-05, + "loss": 0.3855, + "step": 16706 + }, + { + "epoch": 0.9355470937395005, + "grad_norm": 1.5500530004501343, + "learning_rate": 8.349e-05, + "loss": 0.4846, + "step": 16707 + }, + { + "epoch": 0.9356030910516295, + "grad_norm": 1.3324302434921265, + "learning_rate": 8.3495e-05, + "loss": 0.4714, + "step": 16708 + }, + { + "epoch": 0.9356590883637586, + "grad_norm": 1.3207728862762451, + "learning_rate": 8.35e-05, + "loss": 0.5499, + "step": 16709 + }, + { + "epoch": 0.9357150856758876, + "grad_norm": 1.2513694763183594, + "learning_rate": 8.3505e-05, + "loss": 0.3918, + "step": 16710 + }, + { + "epoch": 0.9357710829880166, + "grad_norm": 1.3769958019256592, + "learning_rate": 8.351e-05, + "loss": 0.4827, + "step": 16711 + }, + { + "epoch": 0.9358270803001456, + "grad_norm": 1.632336139678955, + "learning_rate": 8.3515e-05, + "loss": 0.5865, + "step": 16712 + }, + { + "epoch": 0.9358830776122746, + "grad_norm": 
1.361846923828125, + "learning_rate": 8.352000000000001e-05, + "loss": 0.5407, + "step": 16713 + }, + { + "epoch": 0.9359390749244036, + "grad_norm": 1.3150957822799683, + "learning_rate": 8.352500000000001e-05, + "loss": 0.5172, + "step": 16714 + }, + { + "epoch": 0.9359950722365327, + "grad_norm": 1.3028266429901123, + "learning_rate": 8.353000000000001e-05, + "loss": 0.4127, + "step": 16715 + }, + { + "epoch": 0.9360510695486617, + "grad_norm": 1.2794989347457886, + "learning_rate": 8.3535e-05, + "loss": 0.4052, + "step": 16716 + }, + { + "epoch": 0.9361070668607907, + "grad_norm": 1.2054892778396606, + "learning_rate": 8.354e-05, + "loss": 0.3915, + "step": 16717 + }, + { + "epoch": 0.9361630641729197, + "grad_norm": 1.4225001335144043, + "learning_rate": 8.3545e-05, + "loss": 0.4819, + "step": 16718 + }, + { + "epoch": 0.9362190614850487, + "grad_norm": 1.4767318964004517, + "learning_rate": 8.355000000000001e-05, + "loss": 0.3541, + "step": 16719 + }, + { + "epoch": 0.9362750587971778, + "grad_norm": 1.5410804748535156, + "learning_rate": 8.355500000000001e-05, + "loss": 0.4682, + "step": 16720 + }, + { + "epoch": 0.9363310561093068, + "grad_norm": 1.2901997566223145, + "learning_rate": 8.356e-05, + "loss": 0.4525, + "step": 16721 + }, + { + "epoch": 0.9363870534214358, + "grad_norm": 1.4899600744247437, + "learning_rate": 8.3565e-05, + "loss": 0.5563, + "step": 16722 + }, + { + "epoch": 0.9364430507335648, + "grad_norm": 1.5106761455535889, + "learning_rate": 8.357e-05, + "loss": 0.4132, + "step": 16723 + }, + { + "epoch": 0.9364990480456938, + "grad_norm": 1.3609869480133057, + "learning_rate": 8.3575e-05, + "loss": 0.4436, + "step": 16724 + }, + { + "epoch": 0.9365550453578229, + "grad_norm": 1.3122451305389404, + "learning_rate": 8.358e-05, + "loss": 0.4533, + "step": 16725 + }, + { + "epoch": 0.9366110426699519, + "grad_norm": 1.235106110572815, + "learning_rate": 8.358500000000001e-05, + "loss": 0.4028, + "step": 16726 + }, + { + "epoch": 
0.9366670399820809, + "grad_norm": 1.664569616317749, + "learning_rate": 8.359e-05, + "loss": 0.4744, + "step": 16727 + }, + { + "epoch": 0.9367230372942099, + "grad_norm": 1.2943017482757568, + "learning_rate": 8.3595e-05, + "loss": 0.5061, + "step": 16728 + }, + { + "epoch": 0.9367790346063389, + "grad_norm": 2.0248494148254395, + "learning_rate": 8.36e-05, + "loss": 0.5164, + "step": 16729 + }, + { + "epoch": 0.936835031918468, + "grad_norm": 1.248183250427246, + "learning_rate": 8.3605e-05, + "loss": 0.4729, + "step": 16730 + }, + { + "epoch": 0.936891029230597, + "grad_norm": 1.6285631656646729, + "learning_rate": 8.361e-05, + "loss": 0.4802, + "step": 16731 + }, + { + "epoch": 0.936947026542726, + "grad_norm": 1.0719648599624634, + "learning_rate": 8.3615e-05, + "loss": 0.4056, + "step": 16732 + }, + { + "epoch": 0.937003023854855, + "grad_norm": 1.513091802597046, + "learning_rate": 8.362000000000002e-05, + "loss": 0.6647, + "step": 16733 + }, + { + "epoch": 0.937059021166984, + "grad_norm": 1.4053661823272705, + "learning_rate": 8.362500000000001e-05, + "loss": 0.4836, + "step": 16734 + }, + { + "epoch": 0.937115018479113, + "grad_norm": 1.584080696105957, + "learning_rate": 8.363000000000001e-05, + "loss": 0.527, + "step": 16735 + }, + { + "epoch": 0.9371710157912421, + "grad_norm": 1.1603384017944336, + "learning_rate": 8.363500000000001e-05, + "loss": 0.3775, + "step": 16736 + }, + { + "epoch": 0.9372270131033711, + "grad_norm": 1.2846858501434326, + "learning_rate": 8.364e-05, + "loss": 0.5528, + "step": 16737 + }, + { + "epoch": 0.9372830104155001, + "grad_norm": 1.8479763269424438, + "learning_rate": 8.3645e-05, + "loss": 0.5682, + "step": 16738 + }, + { + "epoch": 0.9373390077276291, + "grad_norm": 1.514147400856018, + "learning_rate": 8.365e-05, + "loss": 0.3887, + "step": 16739 + }, + { + "epoch": 0.9373950050397581, + "grad_norm": 1.4268624782562256, + "learning_rate": 8.365500000000001e-05, + "loss": 0.57, + "step": 16740 + }, + { + "epoch": 
0.9374510023518872, + "grad_norm": 1.175947904586792, + "learning_rate": 8.366000000000001e-05, + "loss": 0.4704, + "step": 16741 + }, + { + "epoch": 0.9375069996640161, + "grad_norm": 1.1671470403671265, + "learning_rate": 8.3665e-05, + "loss": 0.3743, + "step": 16742 + }, + { + "epoch": 0.9375629969761451, + "grad_norm": 1.4053637981414795, + "learning_rate": 8.367e-05, + "loss": 0.4791, + "step": 16743 + }, + { + "epoch": 0.9376189942882741, + "grad_norm": 1.3294618129730225, + "learning_rate": 8.3675e-05, + "loss": 0.4177, + "step": 16744 + }, + { + "epoch": 0.9376749916004031, + "grad_norm": 1.4835890531539917, + "learning_rate": 8.368e-05, + "loss": 0.6239, + "step": 16745 + }, + { + "epoch": 0.9377309889125321, + "grad_norm": 1.3424429893493652, + "learning_rate": 8.3685e-05, + "loss": 0.4418, + "step": 16746 + }, + { + "epoch": 0.9377869862246612, + "grad_norm": 1.3203662633895874, + "learning_rate": 8.369000000000001e-05, + "loss": 0.4023, + "step": 16747 + }, + { + "epoch": 0.9378429835367902, + "grad_norm": 1.531296968460083, + "learning_rate": 8.3695e-05, + "loss": 0.4345, + "step": 16748 + }, + { + "epoch": 0.9378989808489192, + "grad_norm": 1.4427509307861328, + "learning_rate": 8.37e-05, + "loss": 0.4551, + "step": 16749 + }, + { + "epoch": 0.9379549781610482, + "grad_norm": 1.5274933576583862, + "learning_rate": 8.3705e-05, + "loss": 0.6143, + "step": 16750 + }, + { + "epoch": 0.9380109754731772, + "grad_norm": 1.2396273612976074, + "learning_rate": 8.371e-05, + "loss": 0.3572, + "step": 16751 + }, + { + "epoch": 0.9380669727853063, + "grad_norm": 1.183813214302063, + "learning_rate": 8.3715e-05, + "loss": 0.403, + "step": 16752 + }, + { + "epoch": 0.9381229700974353, + "grad_norm": 1.4945588111877441, + "learning_rate": 8.372e-05, + "loss": 0.4316, + "step": 16753 + }, + { + "epoch": 0.9381789674095643, + "grad_norm": 1.4189181327819824, + "learning_rate": 8.3725e-05, + "loss": 0.4808, + "step": 16754 + }, + { + "epoch": 0.9382349647216933, + 
"grad_norm": 1.2214527130126953, + "learning_rate": 8.373000000000001e-05, + "loss": 0.467, + "step": 16755 + }, + { + "epoch": 0.9382909620338223, + "grad_norm": 1.5569618940353394, + "learning_rate": 8.373500000000001e-05, + "loss": 0.458, + "step": 16756 + }, + { + "epoch": 0.9383469593459514, + "grad_norm": 1.3493481874465942, + "learning_rate": 8.374000000000001e-05, + "loss": 0.4004, + "step": 16757 + }, + { + "epoch": 0.9384029566580804, + "grad_norm": 1.4711847305297852, + "learning_rate": 8.3745e-05, + "loss": 0.4783, + "step": 16758 + }, + { + "epoch": 0.9384589539702094, + "grad_norm": 1.4054988622665405, + "learning_rate": 8.375e-05, + "loss": 0.5384, + "step": 16759 + }, + { + "epoch": 0.9385149512823384, + "grad_norm": 1.324813723564148, + "learning_rate": 8.3755e-05, + "loss": 0.3708, + "step": 16760 + }, + { + "epoch": 0.9385709485944674, + "grad_norm": 1.344856858253479, + "learning_rate": 8.376000000000001e-05, + "loss": 0.5535, + "step": 16761 + }, + { + "epoch": 0.9386269459065965, + "grad_norm": 1.283390760421753, + "learning_rate": 8.376500000000001e-05, + "loss": 0.6027, + "step": 16762 + }, + { + "epoch": 0.9386829432187255, + "grad_norm": 1.4102137088775635, + "learning_rate": 8.377e-05, + "loss": 0.7469, + "step": 16763 + }, + { + "epoch": 0.9387389405308545, + "grad_norm": 1.2193641662597656, + "learning_rate": 8.3775e-05, + "loss": 0.4421, + "step": 16764 + }, + { + "epoch": 0.9387949378429835, + "grad_norm": 1.1807458400726318, + "learning_rate": 8.378e-05, + "loss": 0.5354, + "step": 16765 + }, + { + "epoch": 0.9388509351551125, + "grad_norm": 1.3945485353469849, + "learning_rate": 8.3785e-05, + "loss": 0.3762, + "step": 16766 + }, + { + "epoch": 0.9389069324672416, + "grad_norm": 1.2609732151031494, + "learning_rate": 8.379e-05, + "loss": 0.3866, + "step": 16767 + }, + { + "epoch": 0.9389629297793706, + "grad_norm": 1.4377027750015259, + "learning_rate": 8.3795e-05, + "loss": 0.5106, + "step": 16768 + }, + { + "epoch": 
0.9390189270914996, + "grad_norm": 1.3723695278167725, + "learning_rate": 8.38e-05, + "loss": 0.4197, + "step": 16769 + }, + { + "epoch": 0.9390749244036286, + "grad_norm": 1.7024558782577515, + "learning_rate": 8.3805e-05, + "loss": 0.533, + "step": 16770 + }, + { + "epoch": 0.9391309217157576, + "grad_norm": 1.4607455730438232, + "learning_rate": 8.381e-05, + "loss": 0.3979, + "step": 16771 + }, + { + "epoch": 0.9391869190278866, + "grad_norm": 1.6503533124923706, + "learning_rate": 8.3815e-05, + "loss": 0.5965, + "step": 16772 + }, + { + "epoch": 0.9392429163400157, + "grad_norm": 1.5065014362335205, + "learning_rate": 8.382e-05, + "loss": 0.712, + "step": 16773 + }, + { + "epoch": 0.9392989136521447, + "grad_norm": 1.4836997985839844, + "learning_rate": 8.3825e-05, + "loss": 0.5417, + "step": 16774 + }, + { + "epoch": 0.9393549109642737, + "grad_norm": 1.4093387126922607, + "learning_rate": 8.383e-05, + "loss": 0.4704, + "step": 16775 + }, + { + "epoch": 0.9394109082764027, + "grad_norm": 1.4547462463378906, + "learning_rate": 8.383500000000001e-05, + "loss": 0.4811, + "step": 16776 + }, + { + "epoch": 0.9394669055885317, + "grad_norm": 1.3972203731536865, + "learning_rate": 8.384000000000001e-05, + "loss": 0.4777, + "step": 16777 + }, + { + "epoch": 0.9395229029006608, + "grad_norm": 1.5986653566360474, + "learning_rate": 8.384500000000001e-05, + "loss": 0.4574, + "step": 16778 + }, + { + "epoch": 0.9395789002127898, + "grad_norm": 1.2717307806015015, + "learning_rate": 8.385e-05, + "loss": 0.3281, + "step": 16779 + }, + { + "epoch": 0.9396348975249188, + "grad_norm": 1.1803277730941772, + "learning_rate": 8.3855e-05, + "loss": 0.3698, + "step": 16780 + }, + { + "epoch": 0.9396908948370478, + "grad_norm": 1.5105961561203003, + "learning_rate": 8.386e-05, + "loss": 0.4268, + "step": 16781 + }, + { + "epoch": 0.9397468921491768, + "grad_norm": 1.2344133853912354, + "learning_rate": 8.386500000000001e-05, + "loss": 0.4531, + "step": 16782 + }, + { + "epoch": 
0.9398028894613059, + "grad_norm": 1.1643778085708618, + "learning_rate": 8.387000000000001e-05, + "loss": 0.3649, + "step": 16783 + }, + { + "epoch": 0.9398588867734349, + "grad_norm": 1.440381646156311, + "learning_rate": 8.3875e-05, + "loss": 0.5847, + "step": 16784 + }, + { + "epoch": 0.9399148840855639, + "grad_norm": 1.1849454641342163, + "learning_rate": 8.388e-05, + "loss": 0.3915, + "step": 16785 + }, + { + "epoch": 0.9399708813976929, + "grad_norm": 1.1867284774780273, + "learning_rate": 8.3885e-05, + "loss": 0.4209, + "step": 16786 + }, + { + "epoch": 0.9400268787098219, + "grad_norm": 1.245200276374817, + "learning_rate": 8.389e-05, + "loss": 0.4678, + "step": 16787 + }, + { + "epoch": 0.940082876021951, + "grad_norm": 1.2511872053146362, + "learning_rate": 8.3895e-05, + "loss": 0.4868, + "step": 16788 + }, + { + "epoch": 0.94013887333408, + "grad_norm": 2.0155186653137207, + "learning_rate": 8.39e-05, + "loss": 0.4851, + "step": 16789 + }, + { + "epoch": 0.940194870646209, + "grad_norm": 1.2310574054718018, + "learning_rate": 8.3905e-05, + "loss": 0.4153, + "step": 16790 + }, + { + "epoch": 0.940250867958338, + "grad_norm": 1.2299989461898804, + "learning_rate": 8.391e-05, + "loss": 0.4983, + "step": 16791 + }, + { + "epoch": 0.940306865270467, + "grad_norm": 1.5917248725891113, + "learning_rate": 8.3915e-05, + "loss": 0.4836, + "step": 16792 + }, + { + "epoch": 0.940362862582596, + "grad_norm": 1.1603355407714844, + "learning_rate": 8.392e-05, + "loss": 0.4125, + "step": 16793 + }, + { + "epoch": 0.9404188598947251, + "grad_norm": 1.0655100345611572, + "learning_rate": 8.392500000000001e-05, + "loss": 0.3189, + "step": 16794 + }, + { + "epoch": 0.9404748572068541, + "grad_norm": 1.6109964847564697, + "learning_rate": 8.393e-05, + "loss": 0.52, + "step": 16795 + }, + { + "epoch": 0.9405308545189831, + "grad_norm": 1.5058531761169434, + "learning_rate": 8.3935e-05, + "loss": 0.4478, + "step": 16796 + }, + { + "epoch": 0.9405868518311121, + "grad_norm": 
1.4384710788726807, + "learning_rate": 8.394000000000001e-05, + "loss": 0.4512, + "step": 16797 + }, + { + "epoch": 0.9406428491432411, + "grad_norm": 1.2084736824035645, + "learning_rate": 8.394500000000001e-05, + "loss": 0.3652, + "step": 16798 + }, + { + "epoch": 0.9406988464553702, + "grad_norm": 1.5859885215759277, + "learning_rate": 8.395000000000001e-05, + "loss": 0.4277, + "step": 16799 + }, + { + "epoch": 0.9407548437674992, + "grad_norm": 1.2028608322143555, + "learning_rate": 8.3955e-05, + "loss": 0.4893, + "step": 16800 + }, + { + "epoch": 0.9408108410796282, + "grad_norm": 1.3841071128845215, + "learning_rate": 8.396e-05, + "loss": 0.439, + "step": 16801 + }, + { + "epoch": 0.9408668383917572, + "grad_norm": 1.3934539556503296, + "learning_rate": 8.3965e-05, + "loss": 0.408, + "step": 16802 + }, + { + "epoch": 0.9409228357038862, + "grad_norm": 1.1731603145599365, + "learning_rate": 8.397000000000001e-05, + "loss": 0.3979, + "step": 16803 + }, + { + "epoch": 0.9409788330160153, + "grad_norm": 1.3989534378051758, + "learning_rate": 8.397500000000001e-05, + "loss": 0.432, + "step": 16804 + }, + { + "epoch": 0.9410348303281443, + "grad_norm": 1.3040473461151123, + "learning_rate": 8.398e-05, + "loss": 0.4161, + "step": 16805 + }, + { + "epoch": 0.9410908276402733, + "grad_norm": 1.228775978088379, + "learning_rate": 8.3985e-05, + "loss": 0.3721, + "step": 16806 + }, + { + "epoch": 0.9411468249524023, + "grad_norm": 1.2703782320022583, + "learning_rate": 8.399e-05, + "loss": 0.5118, + "step": 16807 + }, + { + "epoch": 0.9412028222645313, + "grad_norm": 1.5137567520141602, + "learning_rate": 8.3995e-05, + "loss": 0.4539, + "step": 16808 + }, + { + "epoch": 0.9412588195766604, + "grad_norm": 1.775665283203125, + "learning_rate": 8.4e-05, + "loss": 0.583, + "step": 16809 + }, + { + "epoch": 0.9413148168887894, + "grad_norm": 1.314042091369629, + "learning_rate": 8.4005e-05, + "loss": 0.426, + "step": 16810 + }, + { + "epoch": 0.9413708142009184, + 
"grad_norm": 1.167601466178894, + "learning_rate": 8.401e-05, + "loss": 0.5671, + "step": 16811 + }, + { + "epoch": 0.9414268115130474, + "grad_norm": 1.4067963361740112, + "learning_rate": 8.4015e-05, + "loss": 0.4454, + "step": 16812 + }, + { + "epoch": 0.9414828088251764, + "grad_norm": 2.502490282058716, + "learning_rate": 8.402e-05, + "loss": 0.4565, + "step": 16813 + }, + { + "epoch": 0.9415388061373055, + "grad_norm": 2.032898426055908, + "learning_rate": 8.402500000000001e-05, + "loss": 0.7062, + "step": 16814 + }, + { + "epoch": 0.9415948034494345, + "grad_norm": 1.5732282400131226, + "learning_rate": 8.403000000000001e-05, + "loss": 0.4778, + "step": 16815 + }, + { + "epoch": 0.9416508007615635, + "grad_norm": 1.5170363187789917, + "learning_rate": 8.4035e-05, + "loss": 0.5445, + "step": 16816 + }, + { + "epoch": 0.9417067980736925, + "grad_norm": 1.4666023254394531, + "learning_rate": 8.404e-05, + "loss": 0.433, + "step": 16817 + }, + { + "epoch": 0.9417627953858215, + "grad_norm": 1.441066026687622, + "learning_rate": 8.404500000000001e-05, + "loss": 0.6679, + "step": 16818 + }, + { + "epoch": 0.9418187926979505, + "grad_norm": 1.2354779243469238, + "learning_rate": 8.405000000000001e-05, + "loss": 0.4044, + "step": 16819 + }, + { + "epoch": 0.9418747900100796, + "grad_norm": 1.2528882026672363, + "learning_rate": 8.405500000000001e-05, + "loss": 0.4331, + "step": 16820 + }, + { + "epoch": 0.9419307873222086, + "grad_norm": 1.175680160522461, + "learning_rate": 8.406e-05, + "loss": 0.4404, + "step": 16821 + }, + { + "epoch": 0.9419867846343376, + "grad_norm": 2.039055347442627, + "learning_rate": 8.4065e-05, + "loss": 0.4098, + "step": 16822 + }, + { + "epoch": 0.9420427819464666, + "grad_norm": 1.3916031122207642, + "learning_rate": 8.407e-05, + "loss": 0.4205, + "step": 16823 + }, + { + "epoch": 0.9420987792585956, + "grad_norm": 1.3693307638168335, + "learning_rate": 8.4075e-05, + "loss": 0.5207, + "step": 16824 + }, + { + "epoch": 
0.9421547765707246, + "grad_norm": 1.4005661010742188, + "learning_rate": 8.408000000000001e-05, + "loss": 0.524, + "step": 16825 + }, + { + "epoch": 0.9422107738828536, + "grad_norm": 1.458531141281128, + "learning_rate": 8.4085e-05, + "loss": 0.5229, + "step": 16826 + }, + { + "epoch": 0.9422667711949826, + "grad_norm": 1.1541227102279663, + "learning_rate": 8.409e-05, + "loss": 0.3867, + "step": 16827 + }, + { + "epoch": 0.9423227685071116, + "grad_norm": 1.5439306497573853, + "learning_rate": 8.4095e-05, + "loss": 0.531, + "step": 16828 + }, + { + "epoch": 0.9423787658192406, + "grad_norm": 1.3004478216171265, + "learning_rate": 8.41e-05, + "loss": 0.4577, + "step": 16829 + }, + { + "epoch": 0.9424347631313696, + "grad_norm": 1.5048997402191162, + "learning_rate": 8.4105e-05, + "loss": 0.5917, + "step": 16830 + }, + { + "epoch": 0.9424907604434987, + "grad_norm": 1.692022681236267, + "learning_rate": 8.411e-05, + "loss": 0.4814, + "step": 16831 + }, + { + "epoch": 0.9425467577556277, + "grad_norm": 1.2445706129074097, + "learning_rate": 8.4115e-05, + "loss": 0.5822, + "step": 16832 + }, + { + "epoch": 0.9426027550677567, + "grad_norm": 1.32802152633667, + "learning_rate": 8.412e-05, + "loss": 0.499, + "step": 16833 + }, + { + "epoch": 0.9426587523798857, + "grad_norm": 1.4759972095489502, + "learning_rate": 8.412500000000001e-05, + "loss": 0.5567, + "step": 16834 + }, + { + "epoch": 0.9427147496920147, + "grad_norm": 1.3070430755615234, + "learning_rate": 8.413000000000001e-05, + "loss": 0.361, + "step": 16835 + }, + { + "epoch": 0.9427707470041438, + "grad_norm": 1.3818913698196411, + "learning_rate": 8.4135e-05, + "loss": 0.4857, + "step": 16836 + }, + { + "epoch": 0.9428267443162728, + "grad_norm": 1.1885535717010498, + "learning_rate": 8.414e-05, + "loss": 0.4102, + "step": 16837 + }, + { + "epoch": 0.9428827416284018, + "grad_norm": 1.4455609321594238, + "learning_rate": 8.4145e-05, + "loss": 0.3973, + "step": 16838 + }, + { + "epoch": 0.9429387389405308, 
+ "grad_norm": 1.3386549949645996, + "learning_rate": 8.415000000000001e-05, + "loss": 0.5643, + "step": 16839 + }, + { + "epoch": 0.9429947362526598, + "grad_norm": 1.6348649263381958, + "learning_rate": 8.415500000000001e-05, + "loss": 0.4125, + "step": 16840 + }, + { + "epoch": 0.9430507335647889, + "grad_norm": 1.3806694746017456, + "learning_rate": 8.416000000000001e-05, + "loss": 0.4462, + "step": 16841 + }, + { + "epoch": 0.9431067308769179, + "grad_norm": 1.826649785041809, + "learning_rate": 8.4165e-05, + "loss": 0.4605, + "step": 16842 + }, + { + "epoch": 0.9431627281890469, + "grad_norm": 1.4051538705825806, + "learning_rate": 8.417e-05, + "loss": 0.5421, + "step": 16843 + }, + { + "epoch": 0.9432187255011759, + "grad_norm": 1.68427312374115, + "learning_rate": 8.4175e-05, + "loss": 0.5481, + "step": 16844 + }, + { + "epoch": 0.9432747228133049, + "grad_norm": 1.3187127113342285, + "learning_rate": 8.418e-05, + "loss": 0.405, + "step": 16845 + }, + { + "epoch": 0.943330720125434, + "grad_norm": 1.2708790302276611, + "learning_rate": 8.418500000000001e-05, + "loss": 0.6116, + "step": 16846 + }, + { + "epoch": 0.943386717437563, + "grad_norm": 1.282198190689087, + "learning_rate": 8.419e-05, + "loss": 0.3726, + "step": 16847 + }, + { + "epoch": 0.943442714749692, + "grad_norm": 1.1929875612258911, + "learning_rate": 8.4195e-05, + "loss": 0.4536, + "step": 16848 + }, + { + "epoch": 0.943498712061821, + "grad_norm": 1.3808897733688354, + "learning_rate": 8.42e-05, + "loss": 0.4514, + "step": 16849 + }, + { + "epoch": 0.94355470937395, + "grad_norm": 1.4703110456466675, + "learning_rate": 8.4205e-05, + "loss": 0.4597, + "step": 16850 + }, + { + "epoch": 0.943610706686079, + "grad_norm": 1.349044680595398, + "learning_rate": 8.421e-05, + "loss": 0.6292, + "step": 16851 + }, + { + "epoch": 0.9436667039982081, + "grad_norm": 1.147835373878479, + "learning_rate": 8.4215e-05, + "loss": 0.4185, + "step": 16852 + }, + { + "epoch": 0.9437227013103371, + "grad_norm": 
1.3355592489242554, + "learning_rate": 8.422e-05, + "loss": 0.5128, + "step": 16853 + }, + { + "epoch": 0.9437786986224661, + "grad_norm": 1.4008398056030273, + "learning_rate": 8.422500000000001e-05, + "loss": 0.5362, + "step": 16854 + }, + { + "epoch": 0.9438346959345951, + "grad_norm": 1.6245315074920654, + "learning_rate": 8.423000000000001e-05, + "loss": 0.6372, + "step": 16855 + }, + { + "epoch": 0.9438906932467241, + "grad_norm": 2.5869712829589844, + "learning_rate": 8.423500000000001e-05, + "loss": 0.5831, + "step": 16856 + }, + { + "epoch": 0.9439466905588532, + "grad_norm": 1.4545773267745972, + "learning_rate": 8.424e-05, + "loss": 0.5986, + "step": 16857 + }, + { + "epoch": 0.9440026878709822, + "grad_norm": 1.1646499633789062, + "learning_rate": 8.4245e-05, + "loss": 0.475, + "step": 16858 + }, + { + "epoch": 0.9440586851831112, + "grad_norm": 1.5359723567962646, + "learning_rate": 8.425e-05, + "loss": 0.3858, + "step": 16859 + }, + { + "epoch": 0.9441146824952402, + "grad_norm": 1.070334553718567, + "learning_rate": 8.425500000000001e-05, + "loss": 0.3192, + "step": 16860 + }, + { + "epoch": 0.9441706798073692, + "grad_norm": 1.770363211631775, + "learning_rate": 8.426000000000001e-05, + "loss": 0.4904, + "step": 16861 + }, + { + "epoch": 0.9442266771194983, + "grad_norm": 1.579593300819397, + "learning_rate": 8.426500000000001e-05, + "loss": 0.586, + "step": 16862 + }, + { + "epoch": 0.9442826744316273, + "grad_norm": 1.3765833377838135, + "learning_rate": 8.427e-05, + "loss": 0.5836, + "step": 16863 + }, + { + "epoch": 0.9443386717437563, + "grad_norm": 1.5055797100067139, + "learning_rate": 8.4275e-05, + "loss": 0.4513, + "step": 16864 + }, + { + "epoch": 0.9443946690558853, + "grad_norm": 1.366385579109192, + "learning_rate": 8.428e-05, + "loss": 0.4448, + "step": 16865 + }, + { + "epoch": 0.9444506663680143, + "grad_norm": 1.3146216869354248, + "learning_rate": 8.4285e-05, + "loss": 0.4196, + "step": 16866 + }, + { + "epoch": 0.9445066636801434, 
+ "grad_norm": 1.6299651861190796, + "learning_rate": 8.429000000000001e-05, + "loss": 0.517, + "step": 16867 + }, + { + "epoch": 0.9445626609922724, + "grad_norm": 1.3505542278289795, + "learning_rate": 8.4295e-05, + "loss": 0.4939, + "step": 16868 + }, + { + "epoch": 0.9446186583044014, + "grad_norm": 1.7189971208572388, + "learning_rate": 8.43e-05, + "loss": 0.491, + "step": 16869 + }, + { + "epoch": 0.9446746556165304, + "grad_norm": 1.3334531784057617, + "learning_rate": 8.4305e-05, + "loss": 0.4253, + "step": 16870 + }, + { + "epoch": 0.9447306529286594, + "grad_norm": 1.3857353925704956, + "learning_rate": 8.431e-05, + "loss": 0.4126, + "step": 16871 + }, + { + "epoch": 0.9447866502407885, + "grad_norm": 1.5548064708709717, + "learning_rate": 8.4315e-05, + "loss": 0.5157, + "step": 16872 + }, + { + "epoch": 0.9448426475529175, + "grad_norm": 1.399116039276123, + "learning_rate": 8.431999999999999e-05, + "loss": 0.413, + "step": 16873 + }, + { + "epoch": 0.9448986448650465, + "grad_norm": 1.8226274251937866, + "learning_rate": 8.4325e-05, + "loss": 0.4868, + "step": 16874 + }, + { + "epoch": 0.9449546421771755, + "grad_norm": 1.6294915676116943, + "learning_rate": 8.433000000000001e-05, + "loss": 0.511, + "step": 16875 + }, + { + "epoch": 0.9450106394893045, + "grad_norm": 1.645871877670288, + "learning_rate": 8.433500000000001e-05, + "loss": 0.3838, + "step": 16876 + }, + { + "epoch": 0.9450666368014335, + "grad_norm": 1.2966974973678589, + "learning_rate": 8.434000000000001e-05, + "loss": 0.4449, + "step": 16877 + }, + { + "epoch": 0.9451226341135626, + "grad_norm": 1.1157574653625488, + "learning_rate": 8.4345e-05, + "loss": 0.3294, + "step": 16878 + }, + { + "epoch": 0.9451786314256916, + "grad_norm": 1.3588204383850098, + "learning_rate": 8.435e-05, + "loss": 0.5036, + "step": 16879 + }, + { + "epoch": 0.9452346287378206, + "grad_norm": 1.3987594842910767, + "learning_rate": 8.4355e-05, + "loss": 0.5028, + "step": 16880 + }, + { + "epoch": 
0.9452906260499496, + "grad_norm": 1.1793015003204346, + "learning_rate": 8.436000000000001e-05, + "loss": 0.3366, + "step": 16881 + }, + { + "epoch": 0.9453466233620786, + "grad_norm": 1.2872474193572998, + "learning_rate": 8.436500000000001e-05, + "loss": 0.4089, + "step": 16882 + }, + { + "epoch": 0.9454026206742077, + "grad_norm": 1.6301637887954712, + "learning_rate": 8.437000000000001e-05, + "loss": 0.4551, + "step": 16883 + }, + { + "epoch": 0.9454586179863367, + "grad_norm": 1.6496950387954712, + "learning_rate": 8.4375e-05, + "loss": 0.5353, + "step": 16884 + }, + { + "epoch": 0.9455146152984657, + "grad_norm": 1.4343160390853882, + "learning_rate": 8.438e-05, + "loss": 0.4322, + "step": 16885 + }, + { + "epoch": 0.9455706126105947, + "grad_norm": 1.707632303237915, + "learning_rate": 8.4385e-05, + "loss": 0.392, + "step": 16886 + }, + { + "epoch": 0.9456266099227237, + "grad_norm": 1.0887588262557983, + "learning_rate": 8.439e-05, + "loss": 0.3682, + "step": 16887 + }, + { + "epoch": 0.9456826072348528, + "grad_norm": 1.4382503032684326, + "learning_rate": 8.439500000000001e-05, + "loss": 0.4761, + "step": 16888 + }, + { + "epoch": 0.9457386045469818, + "grad_norm": 1.2576593160629272, + "learning_rate": 8.44e-05, + "loss": 0.4996, + "step": 16889 + }, + { + "epoch": 0.9457946018591108, + "grad_norm": 1.357924461364746, + "learning_rate": 8.4405e-05, + "loss": 0.4564, + "step": 16890 + }, + { + "epoch": 0.9458505991712398, + "grad_norm": 1.4293816089630127, + "learning_rate": 8.441e-05, + "loss": 0.4203, + "step": 16891 + }, + { + "epoch": 0.9459065964833688, + "grad_norm": 1.5781736373901367, + "learning_rate": 8.4415e-05, + "loss": 0.4303, + "step": 16892 + }, + { + "epoch": 0.9459625937954979, + "grad_norm": 1.687091588973999, + "learning_rate": 8.442e-05, + "loss": 0.5184, + "step": 16893 + }, + { + "epoch": 0.9460185911076269, + "grad_norm": 1.4222277402877808, + "learning_rate": 8.442499999999999e-05, + "loss": 0.508, + "step": 16894 + }, + { + 
"epoch": 0.9460745884197559, + "grad_norm": 1.134165644645691, + "learning_rate": 8.443e-05, + "loss": 0.4689, + "step": 16895 + }, + { + "epoch": 0.9461305857318849, + "grad_norm": 1.3414344787597656, + "learning_rate": 8.443500000000001e-05, + "loss": 0.4429, + "step": 16896 + }, + { + "epoch": 0.9461865830440139, + "grad_norm": 1.2850489616394043, + "learning_rate": 8.444000000000001e-05, + "loss": 0.4379, + "step": 16897 + }, + { + "epoch": 0.946242580356143, + "grad_norm": 1.5533628463745117, + "learning_rate": 8.444500000000001e-05, + "loss": 0.3544, + "step": 16898 + }, + { + "epoch": 0.946298577668272, + "grad_norm": 1.3350168466567993, + "learning_rate": 8.445e-05, + "loss": 0.4305, + "step": 16899 + }, + { + "epoch": 0.946354574980401, + "grad_norm": 1.4601842164993286, + "learning_rate": 8.4455e-05, + "loss": 0.4564, + "step": 16900 + }, + { + "epoch": 0.94641057229253, + "grad_norm": 1.246171474456787, + "learning_rate": 8.446e-05, + "loss": 0.3276, + "step": 16901 + }, + { + "epoch": 0.946466569604659, + "grad_norm": 1.3086272478103638, + "learning_rate": 8.4465e-05, + "loss": 0.4221, + "step": 16902 + }, + { + "epoch": 0.946522566916788, + "grad_norm": 1.4926385879516602, + "learning_rate": 8.447000000000001e-05, + "loss": 0.5245, + "step": 16903 + }, + { + "epoch": 0.9465785642289171, + "grad_norm": 1.481048822402954, + "learning_rate": 8.447500000000001e-05, + "loss": 0.395, + "step": 16904 + }, + { + "epoch": 0.9466345615410461, + "grad_norm": 1.3172568082809448, + "learning_rate": 8.448e-05, + "loss": 0.4195, + "step": 16905 + }, + { + "epoch": 0.9466905588531751, + "grad_norm": 1.406826376914978, + "learning_rate": 8.4485e-05, + "loss": 0.4089, + "step": 16906 + }, + { + "epoch": 0.9467465561653041, + "grad_norm": 1.5490084886550903, + "learning_rate": 8.449e-05, + "loss": 0.4135, + "step": 16907 + }, + { + "epoch": 0.946802553477433, + "grad_norm": 1.8110581636428833, + "learning_rate": 8.4495e-05, + "loss": 0.6009, + "step": 16908 + }, + { + 
"epoch": 0.946858550789562, + "grad_norm": 1.189125895500183, + "learning_rate": 8.450000000000001e-05, + "loss": 0.5155, + "step": 16909 + }, + { + "epoch": 0.9469145481016911, + "grad_norm": 1.2542072534561157, + "learning_rate": 8.4505e-05, + "loss": 0.4491, + "step": 16910 + }, + { + "epoch": 0.9469705454138201, + "grad_norm": 1.4154157638549805, + "learning_rate": 8.451e-05, + "loss": 0.5446, + "step": 16911 + }, + { + "epoch": 0.9470265427259491, + "grad_norm": 1.1997610330581665, + "learning_rate": 8.4515e-05, + "loss": 0.5155, + "step": 16912 + }, + { + "epoch": 0.9470825400380781, + "grad_norm": 1.0751558542251587, + "learning_rate": 8.452e-05, + "loss": 0.3765, + "step": 16913 + }, + { + "epoch": 0.9471385373502071, + "grad_norm": 1.3078124523162842, + "learning_rate": 8.4525e-05, + "loss": 0.5072, + "step": 16914 + }, + { + "epoch": 0.9471945346623362, + "grad_norm": 1.3146898746490479, + "learning_rate": 8.453e-05, + "loss": 0.4415, + "step": 16915 + }, + { + "epoch": 0.9472505319744652, + "grad_norm": 1.3087681531906128, + "learning_rate": 8.4535e-05, + "loss": 0.3962, + "step": 16916 + }, + { + "epoch": 0.9473065292865942, + "grad_norm": 1.1553888320922852, + "learning_rate": 8.454000000000001e-05, + "loss": 0.4969, + "step": 16917 + }, + { + "epoch": 0.9473625265987232, + "grad_norm": 1.4546152353286743, + "learning_rate": 8.454500000000001e-05, + "loss": 0.5268, + "step": 16918 + }, + { + "epoch": 0.9474185239108522, + "grad_norm": 1.5179592370986938, + "learning_rate": 8.455000000000001e-05, + "loss": 0.4177, + "step": 16919 + }, + { + "epoch": 0.9474745212229813, + "grad_norm": 1.39425790309906, + "learning_rate": 8.4555e-05, + "loss": 0.568, + "step": 16920 + }, + { + "epoch": 0.9475305185351103, + "grad_norm": 1.3313343524932861, + "learning_rate": 8.456e-05, + "loss": 0.419, + "step": 16921 + }, + { + "epoch": 0.9475865158472393, + "grad_norm": 1.3817088603973389, + "learning_rate": 8.4565e-05, + "loss": 0.5545, + "step": 16922 + }, + { + 
"epoch": 0.9476425131593683, + "grad_norm": 1.2225130796432495, + "learning_rate": 8.457e-05, + "loss": 0.4863, + "step": 16923 + }, + { + "epoch": 0.9476985104714973, + "grad_norm": 1.3912243843078613, + "learning_rate": 8.457500000000001e-05, + "loss": 0.5267, + "step": 16924 + }, + { + "epoch": 0.9477545077836264, + "grad_norm": 1.3930424451828003, + "learning_rate": 8.458e-05, + "loss": 0.3551, + "step": 16925 + }, + { + "epoch": 0.9478105050957554, + "grad_norm": 1.426986575126648, + "learning_rate": 8.4585e-05, + "loss": 0.4643, + "step": 16926 + }, + { + "epoch": 0.9478665024078844, + "grad_norm": 1.3757551908493042, + "learning_rate": 8.459e-05, + "loss": 0.4426, + "step": 16927 + }, + { + "epoch": 0.9479224997200134, + "grad_norm": 1.4466967582702637, + "learning_rate": 8.4595e-05, + "loss": 0.4384, + "step": 16928 + }, + { + "epoch": 0.9479784970321424, + "grad_norm": 1.2996398210525513, + "learning_rate": 8.46e-05, + "loss": 0.3729, + "step": 16929 + }, + { + "epoch": 0.9480344943442715, + "grad_norm": 1.6008819341659546, + "learning_rate": 8.460500000000001e-05, + "loss": 0.4521, + "step": 16930 + }, + { + "epoch": 0.9480904916564005, + "grad_norm": 1.7591370344161987, + "learning_rate": 8.461e-05, + "loss": 0.561, + "step": 16931 + }, + { + "epoch": 0.9481464889685295, + "grad_norm": 1.2559250593185425, + "learning_rate": 8.4615e-05, + "loss": 0.3675, + "step": 16932 + }, + { + "epoch": 0.9482024862806585, + "grad_norm": 1.5842159986495972, + "learning_rate": 8.462e-05, + "loss": 0.6105, + "step": 16933 + }, + { + "epoch": 0.9482584835927875, + "grad_norm": 1.522939920425415, + "learning_rate": 8.4625e-05, + "loss": 0.4102, + "step": 16934 + }, + { + "epoch": 0.9483144809049165, + "grad_norm": 1.306370496749878, + "learning_rate": 8.463000000000001e-05, + "loss": 0.515, + "step": 16935 + }, + { + "epoch": 0.9483704782170456, + "grad_norm": 1.504814624786377, + "learning_rate": 8.4635e-05, + "loss": 0.5157, + "step": 16936 + }, + { + "epoch": 
0.9484264755291746, + "grad_norm": 1.1809239387512207, + "learning_rate": 8.464e-05, + "loss": 0.4107, + "step": 16937 + }, + { + "epoch": 0.9484824728413036, + "grad_norm": 1.5091184377670288, + "learning_rate": 8.464500000000001e-05, + "loss": 0.4104, + "step": 16938 + }, + { + "epoch": 0.9485384701534326, + "grad_norm": 1.0965172052383423, + "learning_rate": 8.465000000000001e-05, + "loss": 0.2987, + "step": 16939 + }, + { + "epoch": 0.9485944674655616, + "grad_norm": 1.2307863235473633, + "learning_rate": 8.465500000000001e-05, + "loss": 0.535, + "step": 16940 + }, + { + "epoch": 0.9486504647776907, + "grad_norm": 1.4812849760055542, + "learning_rate": 8.466e-05, + "loss": 0.4636, + "step": 16941 + }, + { + "epoch": 0.9487064620898197, + "grad_norm": 1.4107130765914917, + "learning_rate": 8.4665e-05, + "loss": 0.622, + "step": 16942 + }, + { + "epoch": 0.9487624594019487, + "grad_norm": 1.3292529582977295, + "learning_rate": 8.467e-05, + "loss": 0.4732, + "step": 16943 + }, + { + "epoch": 0.9488184567140777, + "grad_norm": 1.2414759397506714, + "learning_rate": 8.4675e-05, + "loss": 0.3468, + "step": 16944 + }, + { + "epoch": 0.9488744540262067, + "grad_norm": 1.1517809629440308, + "learning_rate": 8.468000000000001e-05, + "loss": 0.3438, + "step": 16945 + }, + { + "epoch": 0.9489304513383358, + "grad_norm": 1.5465229749679565, + "learning_rate": 8.4685e-05, + "loss": 0.4394, + "step": 16946 + }, + { + "epoch": 0.9489864486504648, + "grad_norm": 1.4621566534042358, + "learning_rate": 8.469e-05, + "loss": 0.3958, + "step": 16947 + }, + { + "epoch": 0.9490424459625938, + "grad_norm": 1.2637304067611694, + "learning_rate": 8.4695e-05, + "loss": 0.3675, + "step": 16948 + }, + { + "epoch": 0.9490984432747228, + "grad_norm": 1.4915884733200073, + "learning_rate": 8.47e-05, + "loss": 0.4249, + "step": 16949 + }, + { + "epoch": 0.9491544405868518, + "grad_norm": 1.419655203819275, + "learning_rate": 8.4705e-05, + "loss": 0.6283, + "step": 16950 + }, + { + "epoch": 
0.9492104378989809, + "grad_norm": 1.1955921649932861, + "learning_rate": 8.471000000000001e-05, + "loss": 0.48, + "step": 16951 + }, + { + "epoch": 0.9492664352111099, + "grad_norm": 1.340035319328308, + "learning_rate": 8.4715e-05, + "loss": 0.6691, + "step": 16952 + }, + { + "epoch": 0.9493224325232389, + "grad_norm": 1.3427761793136597, + "learning_rate": 8.472e-05, + "loss": 0.503, + "step": 16953 + }, + { + "epoch": 0.9493784298353679, + "grad_norm": 1.4067109823226929, + "learning_rate": 8.4725e-05, + "loss": 0.6549, + "step": 16954 + }, + { + "epoch": 0.9494344271474969, + "grad_norm": 1.3630707263946533, + "learning_rate": 8.473000000000001e-05, + "loss": 0.4791, + "step": 16955 + }, + { + "epoch": 0.949490424459626, + "grad_norm": 1.6496031284332275, + "learning_rate": 8.473500000000001e-05, + "loss": 0.6371, + "step": 16956 + }, + { + "epoch": 0.949546421771755, + "grad_norm": 1.4741486310958862, + "learning_rate": 8.474e-05, + "loss": 0.3872, + "step": 16957 + }, + { + "epoch": 0.949602419083884, + "grad_norm": 1.1507982015609741, + "learning_rate": 8.4745e-05, + "loss": 0.4622, + "step": 16958 + }, + { + "epoch": 0.949658416396013, + "grad_norm": 1.512468695640564, + "learning_rate": 8.475000000000001e-05, + "loss": 0.6612, + "step": 16959 + }, + { + "epoch": 0.949714413708142, + "grad_norm": 1.4250510931015015, + "learning_rate": 8.475500000000001e-05, + "loss": 0.4561, + "step": 16960 + }, + { + "epoch": 0.949770411020271, + "grad_norm": 1.3143805265426636, + "learning_rate": 8.476000000000001e-05, + "loss": 0.4001, + "step": 16961 + }, + { + "epoch": 0.9498264083324001, + "grad_norm": 1.4907798767089844, + "learning_rate": 8.4765e-05, + "loss": 0.5428, + "step": 16962 + }, + { + "epoch": 0.9498824056445291, + "grad_norm": 1.3315340280532837, + "learning_rate": 8.477e-05, + "loss": 0.4319, + "step": 16963 + }, + { + "epoch": 0.9499384029566581, + "grad_norm": 1.20774507522583, + "learning_rate": 8.4775e-05, + "loss": 0.4153, + "step": 16964 + }, + { 
+ "epoch": 0.9499944002687871, + "grad_norm": 1.1649962663650513, + "learning_rate": 8.478e-05, + "loss": 0.4366, + "step": 16965 + }, + { + "epoch": 0.9500503975809161, + "grad_norm": 1.3137903213500977, + "learning_rate": 8.478500000000001e-05, + "loss": 0.4325, + "step": 16966 + }, + { + "epoch": 0.9501063948930452, + "grad_norm": 1.3243414163589478, + "learning_rate": 8.479e-05, + "loss": 0.3953, + "step": 16967 + }, + { + "epoch": 0.9501623922051742, + "grad_norm": 1.4575765132904053, + "learning_rate": 8.4795e-05, + "loss": 0.4123, + "step": 16968 + }, + { + "epoch": 0.9502183895173032, + "grad_norm": 1.3398209810256958, + "learning_rate": 8.48e-05, + "loss": 0.4228, + "step": 16969 + }, + { + "epoch": 0.9502743868294322, + "grad_norm": 1.8145989179611206, + "learning_rate": 8.4805e-05, + "loss": 0.5517, + "step": 16970 + }, + { + "epoch": 0.9503303841415612, + "grad_norm": 1.2563401460647583, + "learning_rate": 8.481e-05, + "loss": 0.4388, + "step": 16971 + }, + { + "epoch": 0.9503863814536903, + "grad_norm": 1.535429835319519, + "learning_rate": 8.4815e-05, + "loss": 0.4084, + "step": 16972 + }, + { + "epoch": 0.9504423787658193, + "grad_norm": 1.1452304124832153, + "learning_rate": 8.482e-05, + "loss": 0.3845, + "step": 16973 + }, + { + "epoch": 0.9504983760779483, + "grad_norm": 1.621180772781372, + "learning_rate": 8.4825e-05, + "loss": 0.4521, + "step": 16974 + }, + { + "epoch": 0.9505543733900773, + "grad_norm": 1.3514586687088013, + "learning_rate": 8.483000000000001e-05, + "loss": 0.6353, + "step": 16975 + }, + { + "epoch": 0.9506103707022063, + "grad_norm": 1.2678074836730957, + "learning_rate": 8.483500000000001e-05, + "loss": 0.4231, + "step": 16976 + }, + { + "epoch": 0.9506663680143354, + "grad_norm": 2.2612082958221436, + "learning_rate": 8.484000000000001e-05, + "loss": 0.4763, + "step": 16977 + }, + { + "epoch": 0.9507223653264644, + "grad_norm": 1.457580327987671, + "learning_rate": 8.4845e-05, + "loss": 0.5832, + "step": 16978 + }, + { + 
"epoch": 0.9507783626385934, + "grad_norm": 3.180603265762329, + "learning_rate": 8.485e-05, + "loss": 0.4155, + "step": 16979 + }, + { + "epoch": 0.9508343599507224, + "grad_norm": 1.9469780921936035, + "learning_rate": 8.485500000000001e-05, + "loss": 0.3662, + "step": 16980 + }, + { + "epoch": 0.9508903572628514, + "grad_norm": 1.345381498336792, + "learning_rate": 8.486000000000001e-05, + "loss": 0.4756, + "step": 16981 + }, + { + "epoch": 0.9509463545749804, + "grad_norm": 1.5418356657028198, + "learning_rate": 8.486500000000001e-05, + "loss": 0.4512, + "step": 16982 + }, + { + "epoch": 0.9510023518871095, + "grad_norm": 1.2618006467819214, + "learning_rate": 8.487e-05, + "loss": 0.5224, + "step": 16983 + }, + { + "epoch": 0.9510583491992385, + "grad_norm": 1.2593483924865723, + "learning_rate": 8.4875e-05, + "loss": 0.4596, + "step": 16984 + }, + { + "epoch": 0.9511143465113675, + "grad_norm": 1.267127513885498, + "learning_rate": 8.488e-05, + "loss": 0.5895, + "step": 16985 + }, + { + "epoch": 0.9511703438234965, + "grad_norm": 1.5504367351531982, + "learning_rate": 8.4885e-05, + "loss": 0.5344, + "step": 16986 + }, + { + "epoch": 0.9512263411356255, + "grad_norm": 1.2736339569091797, + "learning_rate": 8.489000000000001e-05, + "loss": 0.4524, + "step": 16987 + }, + { + "epoch": 0.9512823384477546, + "grad_norm": 1.1661581993103027, + "learning_rate": 8.4895e-05, + "loss": 0.343, + "step": 16988 + }, + { + "epoch": 0.9513383357598836, + "grad_norm": 1.334139347076416, + "learning_rate": 8.49e-05, + "loss": 0.5219, + "step": 16989 + }, + { + "epoch": 0.9513943330720125, + "grad_norm": 1.4772909879684448, + "learning_rate": 8.4905e-05, + "loss": 0.555, + "step": 16990 + }, + { + "epoch": 0.9514503303841415, + "grad_norm": 1.1652812957763672, + "learning_rate": 8.491e-05, + "loss": 0.3417, + "step": 16991 + }, + { + "epoch": 0.9515063276962705, + "grad_norm": 1.2831329107284546, + "learning_rate": 8.4915e-05, + "loss": 0.4816, + "step": 16992 + }, + { + 
"epoch": 0.9515623250083995, + "grad_norm": 1.5837223529815674, + "learning_rate": 8.492e-05, + "loss": 0.4307, + "step": 16993 + }, + { + "epoch": 0.9516183223205286, + "grad_norm": 1.555816888809204, + "learning_rate": 8.4925e-05, + "loss": 0.4459, + "step": 16994 + }, + { + "epoch": 0.9516743196326576, + "grad_norm": 1.3275021314620972, + "learning_rate": 8.493000000000002e-05, + "loss": 0.4356, + "step": 16995 + }, + { + "epoch": 0.9517303169447866, + "grad_norm": 1.6596986055374146, + "learning_rate": 8.493500000000001e-05, + "loss": 0.4259, + "step": 16996 + }, + { + "epoch": 0.9517863142569156, + "grad_norm": 1.3667863607406616, + "learning_rate": 8.494000000000001e-05, + "loss": 0.5015, + "step": 16997 + }, + { + "epoch": 0.9518423115690446, + "grad_norm": 1.5858535766601562, + "learning_rate": 8.494500000000001e-05, + "loss": 0.3896, + "step": 16998 + }, + { + "epoch": 0.9518983088811737, + "grad_norm": 1.3232752084732056, + "learning_rate": 8.495e-05, + "loss": 0.3544, + "step": 16999 + }, + { + "epoch": 0.9519543061933027, + "grad_norm": 1.571776270866394, + "learning_rate": 8.4955e-05, + "loss": 0.4382, + "step": 17000 + }, + { + "epoch": 0.9520103035054317, + "grad_norm": 1.4432038068771362, + "learning_rate": 8.496e-05, + "loss": 0.4218, + "step": 17001 + }, + { + "epoch": 0.9520663008175607, + "grad_norm": 1.3933186531066895, + "learning_rate": 8.496500000000001e-05, + "loss": 0.4635, + "step": 17002 + }, + { + "epoch": 0.9521222981296897, + "grad_norm": 1.338154911994934, + "learning_rate": 8.497000000000001e-05, + "loss": 0.4775, + "step": 17003 + }, + { + "epoch": 0.9521782954418188, + "grad_norm": 1.3936713933944702, + "learning_rate": 8.4975e-05, + "loss": 0.5565, + "step": 17004 + }, + { + "epoch": 0.9522342927539478, + "grad_norm": 1.5405932664871216, + "learning_rate": 8.498e-05, + "loss": 0.5839, + "step": 17005 + }, + { + "epoch": 0.9522902900660768, + "grad_norm": 1.1904425621032715, + "learning_rate": 8.4985e-05, + "loss": 0.3904, + 
"step": 17006 + }, + { + "epoch": 0.9523462873782058, + "grad_norm": 1.8471018075942993, + "learning_rate": 8.499e-05, + "loss": 0.4705, + "step": 17007 + }, + { + "epoch": 0.9524022846903348, + "grad_norm": 1.429958462715149, + "learning_rate": 8.499500000000001e-05, + "loss": 0.4797, + "step": 17008 + }, + { + "epoch": 0.9524582820024639, + "grad_norm": 1.300558090209961, + "learning_rate": 8.5e-05, + "loss": 0.3766, + "step": 17009 + }, + { + "epoch": 0.9525142793145929, + "grad_norm": 1.1773066520690918, + "learning_rate": 8.5005e-05, + "loss": 0.4107, + "step": 17010 + }, + { + "epoch": 0.9525702766267219, + "grad_norm": 1.355825662612915, + "learning_rate": 8.501e-05, + "loss": 0.6292, + "step": 17011 + }, + { + "epoch": 0.9526262739388509, + "grad_norm": 1.293701171875, + "learning_rate": 8.5015e-05, + "loss": 0.4225, + "step": 17012 + }, + { + "epoch": 0.9526822712509799, + "grad_norm": 1.2822965383529663, + "learning_rate": 8.502e-05, + "loss": 0.5179, + "step": 17013 + }, + { + "epoch": 0.952738268563109, + "grad_norm": 1.3549543619155884, + "learning_rate": 8.502499999999999e-05, + "loss": 0.4233, + "step": 17014 + }, + { + "epoch": 0.952794265875238, + "grad_norm": 1.1742892265319824, + "learning_rate": 8.503e-05, + "loss": 0.3484, + "step": 17015 + }, + { + "epoch": 0.952850263187367, + "grad_norm": 1.4947805404663086, + "learning_rate": 8.503500000000002e-05, + "loss": 0.4144, + "step": 17016 + }, + { + "epoch": 0.952906260499496, + "grad_norm": 2.0162227153778076, + "learning_rate": 8.504000000000001e-05, + "loss": 0.5194, + "step": 17017 + }, + { + "epoch": 0.952962257811625, + "grad_norm": 1.4323337078094482, + "learning_rate": 8.504500000000001e-05, + "loss": 0.5476, + "step": 17018 + }, + { + "epoch": 0.953018255123754, + "grad_norm": 1.0337882041931152, + "learning_rate": 8.505000000000001e-05, + "loss": 0.3104, + "step": 17019 + }, + { + "epoch": 0.9530742524358831, + "grad_norm": 1.3247967958450317, + "learning_rate": 8.5055e-05, + "loss": 
0.4969, + "step": 17020 + }, + { + "epoch": 0.9531302497480121, + "grad_norm": 1.4347736835479736, + "learning_rate": 8.506e-05, + "loss": 0.4299, + "step": 17021 + }, + { + "epoch": 0.9531862470601411, + "grad_norm": 1.526903510093689, + "learning_rate": 8.5065e-05, + "loss": 0.3932, + "step": 17022 + }, + { + "epoch": 0.9532422443722701, + "grad_norm": 1.2358578443527222, + "learning_rate": 8.507000000000001e-05, + "loss": 0.3121, + "step": 17023 + }, + { + "epoch": 0.9532982416843991, + "grad_norm": 2.889700174331665, + "learning_rate": 8.507500000000001e-05, + "loss": 0.7682, + "step": 17024 + }, + { + "epoch": 0.9533542389965282, + "grad_norm": 1.4642601013183594, + "learning_rate": 8.508e-05, + "loss": 0.3698, + "step": 17025 + }, + { + "epoch": 0.9534102363086572, + "grad_norm": 1.27633798122406, + "learning_rate": 8.5085e-05, + "loss": 0.4462, + "step": 17026 + }, + { + "epoch": 0.9534662336207862, + "grad_norm": 1.7518889904022217, + "learning_rate": 8.509e-05, + "loss": 0.5751, + "step": 17027 + }, + { + "epoch": 0.9535222309329152, + "grad_norm": 1.443544864654541, + "learning_rate": 8.5095e-05, + "loss": 0.378, + "step": 17028 + }, + { + "epoch": 0.9535782282450442, + "grad_norm": 1.3720136880874634, + "learning_rate": 8.510000000000001e-05, + "loss": 0.4845, + "step": 17029 + }, + { + "epoch": 0.9536342255571733, + "grad_norm": 1.889662265777588, + "learning_rate": 8.5105e-05, + "loss": 0.4681, + "step": 17030 + }, + { + "epoch": 0.9536902228693023, + "grad_norm": 1.6645822525024414, + "learning_rate": 8.511e-05, + "loss": 0.3319, + "step": 17031 + }, + { + "epoch": 0.9537462201814313, + "grad_norm": 1.5410239696502686, + "learning_rate": 8.5115e-05, + "loss": 0.527, + "step": 17032 + }, + { + "epoch": 0.9538022174935603, + "grad_norm": 1.3207294940948486, + "learning_rate": 8.512e-05, + "loss": 0.4837, + "step": 17033 + }, + { + "epoch": 0.9538582148056893, + "grad_norm": 1.1162564754486084, + "learning_rate": 8.5125e-05, + "loss": 0.3487, + "step": 
17034 + }, + { + "epoch": 0.9539142121178183, + "grad_norm": 1.2722265720367432, + "learning_rate": 8.512999999999999e-05, + "loss": 0.4862, + "step": 17035 + }, + { + "epoch": 0.9539702094299474, + "grad_norm": 1.4618171453475952, + "learning_rate": 8.5135e-05, + "loss": 0.4082, + "step": 17036 + }, + { + "epoch": 0.9540262067420764, + "grad_norm": 1.6251577138900757, + "learning_rate": 8.514000000000001e-05, + "loss": 0.4704, + "step": 17037 + }, + { + "epoch": 0.9540822040542054, + "grad_norm": 1.2648488283157349, + "learning_rate": 8.514500000000001e-05, + "loss": 0.3907, + "step": 17038 + }, + { + "epoch": 0.9541382013663344, + "grad_norm": 1.171286702156067, + "learning_rate": 8.515000000000001e-05, + "loss": 0.36, + "step": 17039 + }, + { + "epoch": 0.9541941986784634, + "grad_norm": 1.8883570432662964, + "learning_rate": 8.515500000000001e-05, + "loss": 0.4311, + "step": 17040 + }, + { + "epoch": 0.9542501959905925, + "grad_norm": 1.5154889822006226, + "learning_rate": 8.516e-05, + "loss": 0.6413, + "step": 17041 + }, + { + "epoch": 0.9543061933027215, + "grad_norm": 1.3099116086959839, + "learning_rate": 8.5165e-05, + "loss": 0.4196, + "step": 17042 + }, + { + "epoch": 0.9543621906148505, + "grad_norm": 1.5377181768417358, + "learning_rate": 8.517e-05, + "loss": 0.5744, + "step": 17043 + }, + { + "epoch": 0.9544181879269795, + "grad_norm": 1.5144118070602417, + "learning_rate": 8.517500000000001e-05, + "loss": 0.3586, + "step": 17044 + }, + { + "epoch": 0.9544741852391085, + "grad_norm": 1.6528390645980835, + "learning_rate": 8.518000000000001e-05, + "loss": 0.4618, + "step": 17045 + }, + { + "epoch": 0.9545301825512376, + "grad_norm": 2.705566883087158, + "learning_rate": 8.5185e-05, + "loss": 0.3898, + "step": 17046 + }, + { + "epoch": 0.9545861798633666, + "grad_norm": 1.237256646156311, + "learning_rate": 8.519e-05, + "loss": 0.4158, + "step": 17047 + }, + { + "epoch": 0.9546421771754956, + "grad_norm": 1.4359909296035767, + "learning_rate": 
8.5195e-05, + "loss": 0.5538, + "step": 17048 + }, + { + "epoch": 0.9546981744876246, + "grad_norm": 1.3809127807617188, + "learning_rate": 8.52e-05, + "loss": 0.4524, + "step": 17049 + }, + { + "epoch": 0.9547541717997536, + "grad_norm": 1.2590758800506592, + "learning_rate": 8.5205e-05, + "loss": 0.4239, + "step": 17050 + }, + { + "epoch": 0.9548101691118827, + "grad_norm": 1.2650855779647827, + "learning_rate": 8.521e-05, + "loss": 0.5073, + "step": 17051 + }, + { + "epoch": 0.9548661664240117, + "grad_norm": 1.4202791452407837, + "learning_rate": 8.5215e-05, + "loss": 0.4295, + "step": 17052 + }, + { + "epoch": 0.9549221637361407, + "grad_norm": 1.4298592805862427, + "learning_rate": 8.522e-05, + "loss": 0.645, + "step": 17053 + }, + { + "epoch": 0.9549781610482697, + "grad_norm": 1.6812329292297363, + "learning_rate": 8.5225e-05, + "loss": 0.4883, + "step": 17054 + }, + { + "epoch": 0.9550341583603987, + "grad_norm": 1.1203745603561401, + "learning_rate": 8.523e-05, + "loss": 0.4202, + "step": 17055 + }, + { + "epoch": 0.9550901556725278, + "grad_norm": 1.4017579555511475, + "learning_rate": 8.5235e-05, + "loss": 0.3603, + "step": 17056 + }, + { + "epoch": 0.9551461529846568, + "grad_norm": 1.3865840435028076, + "learning_rate": 8.524e-05, + "loss": 0.436, + "step": 17057 + }, + { + "epoch": 0.9552021502967858, + "grad_norm": 1.513572096824646, + "learning_rate": 8.524500000000001e-05, + "loss": 0.4439, + "step": 17058 + }, + { + "epoch": 0.9552581476089148, + "grad_norm": 1.321823000907898, + "learning_rate": 8.525000000000001e-05, + "loss": 0.4888, + "step": 17059 + }, + { + "epoch": 0.9553141449210438, + "grad_norm": 1.6148601770401, + "learning_rate": 8.525500000000001e-05, + "loss": 0.4308, + "step": 17060 + }, + { + "epoch": 0.9553701422331728, + "grad_norm": 1.5410012006759644, + "learning_rate": 8.526000000000001e-05, + "loss": 0.5282, + "step": 17061 + }, + { + "epoch": 0.9554261395453019, + "grad_norm": 1.0875070095062256, + "learning_rate": 
8.5265e-05, + "loss": 0.3218, + "step": 17062 + }, + { + "epoch": 0.9554821368574309, + "grad_norm": 1.3766624927520752, + "learning_rate": 8.527e-05, + "loss": 0.4481, + "step": 17063 + }, + { + "epoch": 0.9555381341695599, + "grad_norm": 1.3956143856048584, + "learning_rate": 8.5275e-05, + "loss": 0.4638, + "step": 17064 + }, + { + "epoch": 0.9555941314816889, + "grad_norm": 1.3810175657272339, + "learning_rate": 8.528000000000001e-05, + "loss": 0.579, + "step": 17065 + }, + { + "epoch": 0.9556501287938179, + "grad_norm": 1.3077311515808105, + "learning_rate": 8.528500000000001e-05, + "loss": 0.3388, + "step": 17066 + }, + { + "epoch": 0.955706126105947, + "grad_norm": 2.1643025875091553, + "learning_rate": 8.529e-05, + "loss": 0.4967, + "step": 17067 + }, + { + "epoch": 0.955762123418076, + "grad_norm": 1.6373200416564941, + "learning_rate": 8.5295e-05, + "loss": 0.5301, + "step": 17068 + }, + { + "epoch": 0.955818120730205, + "grad_norm": 1.2569293975830078, + "learning_rate": 8.53e-05, + "loss": 0.3281, + "step": 17069 + }, + { + "epoch": 0.955874118042334, + "grad_norm": 1.2961771488189697, + "learning_rate": 8.5305e-05, + "loss": 0.5222, + "step": 17070 + }, + { + "epoch": 0.955930115354463, + "grad_norm": 1.2537667751312256, + "learning_rate": 8.531e-05, + "loss": 0.5211, + "step": 17071 + }, + { + "epoch": 0.9559861126665921, + "grad_norm": 1.3391064405441284, + "learning_rate": 8.5315e-05, + "loss": 0.4964, + "step": 17072 + }, + { + "epoch": 0.956042109978721, + "grad_norm": 1.6892999410629272, + "learning_rate": 8.532e-05, + "loss": 0.7413, + "step": 17073 + }, + { + "epoch": 0.95609810729085, + "grad_norm": 1.190972924232483, + "learning_rate": 8.5325e-05, + "loss": 0.4132, + "step": 17074 + }, + { + "epoch": 0.956154104602979, + "grad_norm": 1.376509666442871, + "learning_rate": 8.533e-05, + "loss": 0.4137, + "step": 17075 + }, + { + "epoch": 0.956210101915108, + "grad_norm": 1.2693827152252197, + "learning_rate": 8.533500000000001e-05, + "loss": 
0.3201, + "step": 17076 + }, + { + "epoch": 0.956266099227237, + "grad_norm": 1.4007991552352905, + "learning_rate": 8.534e-05, + "loss": 0.4881, + "step": 17077 + }, + { + "epoch": 0.9563220965393661, + "grad_norm": 1.4203362464904785, + "learning_rate": 8.5345e-05, + "loss": 0.5946, + "step": 17078 + }, + { + "epoch": 0.9563780938514951, + "grad_norm": 1.3453959226608276, + "learning_rate": 8.535e-05, + "loss": 0.4659, + "step": 17079 + }, + { + "epoch": 0.9564340911636241, + "grad_norm": 1.370445966720581, + "learning_rate": 8.535500000000001e-05, + "loss": 0.3657, + "step": 17080 + }, + { + "epoch": 0.9564900884757531, + "grad_norm": 1.295682430267334, + "learning_rate": 8.536000000000001e-05, + "loss": 0.2921, + "step": 17081 + }, + { + "epoch": 0.9565460857878821, + "grad_norm": 1.332443356513977, + "learning_rate": 8.536500000000001e-05, + "loss": 0.3777, + "step": 17082 + }, + { + "epoch": 0.9566020831000112, + "grad_norm": 1.307560682296753, + "learning_rate": 8.537e-05, + "loss": 0.5362, + "step": 17083 + }, + { + "epoch": 0.9566580804121402, + "grad_norm": 1.2536908388137817, + "learning_rate": 8.5375e-05, + "loss": 0.4425, + "step": 17084 + }, + { + "epoch": 0.9567140777242692, + "grad_norm": 1.1129144430160522, + "learning_rate": 8.538e-05, + "loss": 0.3942, + "step": 17085 + }, + { + "epoch": 0.9567700750363982, + "grad_norm": 1.0530842542648315, + "learning_rate": 8.538500000000001e-05, + "loss": 0.4956, + "step": 17086 + }, + { + "epoch": 0.9568260723485272, + "grad_norm": 1.2498310804367065, + "learning_rate": 8.539000000000001e-05, + "loss": 0.4406, + "step": 17087 + }, + { + "epoch": 0.9568820696606563, + "grad_norm": 1.247690200805664, + "learning_rate": 8.5395e-05, + "loss": 0.3611, + "step": 17088 + }, + { + "epoch": 0.9569380669727853, + "grad_norm": 1.384846568107605, + "learning_rate": 8.54e-05, + "loss": 0.5635, + "step": 17089 + }, + { + "epoch": 0.9569940642849143, + "grad_norm": 1.3289271593093872, + "learning_rate": 8.5405e-05, + 
"loss": 0.5475, + "step": 17090 + }, + { + "epoch": 0.9570500615970433, + "grad_norm": 1.2485624551773071, + "learning_rate": 8.541e-05, + "loss": 0.4813, + "step": 17091 + }, + { + "epoch": 0.9571060589091723, + "grad_norm": 1.3284590244293213, + "learning_rate": 8.5415e-05, + "loss": 0.4166, + "step": 17092 + }, + { + "epoch": 0.9571620562213013, + "grad_norm": 1.539612054824829, + "learning_rate": 8.542e-05, + "loss": 0.3568, + "step": 17093 + }, + { + "epoch": 0.9572180535334304, + "grad_norm": 1.638933777809143, + "learning_rate": 8.5425e-05, + "loss": 0.4398, + "step": 17094 + }, + { + "epoch": 0.9572740508455594, + "grad_norm": 1.4553638696670532, + "learning_rate": 8.543e-05, + "loss": 0.5311, + "step": 17095 + }, + { + "epoch": 0.9573300481576884, + "grad_norm": 1.381110668182373, + "learning_rate": 8.543500000000001e-05, + "loss": 0.5229, + "step": 17096 + }, + { + "epoch": 0.9573860454698174, + "grad_norm": 1.2284245491027832, + "learning_rate": 8.544000000000001e-05, + "loss": 0.4791, + "step": 17097 + }, + { + "epoch": 0.9574420427819464, + "grad_norm": 1.2967196702957153, + "learning_rate": 8.5445e-05, + "loss": 0.3985, + "step": 17098 + }, + { + "epoch": 0.9574980400940755, + "grad_norm": 1.2861863374710083, + "learning_rate": 8.545e-05, + "loss": 0.4384, + "step": 17099 + }, + { + "epoch": 0.9575540374062045, + "grad_norm": 1.610902190208435, + "learning_rate": 8.5455e-05, + "loss": 0.5805, + "step": 17100 + }, + { + "epoch": 0.9576100347183335, + "grad_norm": 1.3004497289657593, + "learning_rate": 8.546000000000001e-05, + "loss": 0.4394, + "step": 17101 + }, + { + "epoch": 0.9576660320304625, + "grad_norm": 1.4884735345840454, + "learning_rate": 8.546500000000001e-05, + "loss": 0.6698, + "step": 17102 + }, + { + "epoch": 0.9577220293425915, + "grad_norm": 1.7032983303070068, + "learning_rate": 8.547e-05, + "loss": 0.6283, + "step": 17103 + }, + { + "epoch": 0.9577780266547206, + "grad_norm": 1.5157755613327026, + "learning_rate": 8.5475e-05, + 
"loss": 0.5819, + "step": 17104 + }, + { + "epoch": 0.9578340239668496, + "grad_norm": 1.2883331775665283, + "learning_rate": 8.548e-05, + "loss": 0.4398, + "step": 17105 + }, + { + "epoch": 0.9578900212789786, + "grad_norm": 1.4675449132919312, + "learning_rate": 8.5485e-05, + "loss": 0.385, + "step": 17106 + }, + { + "epoch": 0.9579460185911076, + "grad_norm": 1.3032020330429077, + "learning_rate": 8.549000000000001e-05, + "loss": 0.4696, + "step": 17107 + }, + { + "epoch": 0.9580020159032366, + "grad_norm": 1.1724374294281006, + "learning_rate": 8.549500000000001e-05, + "loss": 0.369, + "step": 17108 + }, + { + "epoch": 0.9580580132153657, + "grad_norm": 1.5858454704284668, + "learning_rate": 8.55e-05, + "loss": 0.5147, + "step": 17109 + }, + { + "epoch": 0.9581140105274947, + "grad_norm": 1.1673816442489624, + "learning_rate": 8.5505e-05, + "loss": 0.3849, + "step": 17110 + }, + { + "epoch": 0.9581700078396237, + "grad_norm": 1.6120600700378418, + "learning_rate": 8.551e-05, + "loss": 0.3709, + "step": 17111 + }, + { + "epoch": 0.9582260051517527, + "grad_norm": 1.4594321250915527, + "learning_rate": 8.5515e-05, + "loss": 0.5787, + "step": 17112 + }, + { + "epoch": 0.9582820024638817, + "grad_norm": 1.3463350534439087, + "learning_rate": 8.552e-05, + "loss": 0.4482, + "step": 17113 + }, + { + "epoch": 0.9583379997760108, + "grad_norm": 1.336738109588623, + "learning_rate": 8.5525e-05, + "loss": 0.4269, + "step": 17114 + }, + { + "epoch": 0.9583939970881398, + "grad_norm": 1.1716670989990234, + "learning_rate": 8.553e-05, + "loss": 0.4297, + "step": 17115 + }, + { + "epoch": 0.9584499944002688, + "grad_norm": 1.1067936420440674, + "learning_rate": 8.553500000000001e-05, + "loss": 0.3488, + "step": 17116 + }, + { + "epoch": 0.9585059917123978, + "grad_norm": 1.4455829858779907, + "learning_rate": 8.554000000000001e-05, + "loss": 0.4391, + "step": 17117 + }, + { + "epoch": 0.9585619890245268, + "grad_norm": 1.3242000341415405, + "learning_rate": 
8.554500000000001e-05, + "loss": 0.4274, + "step": 17118 + }, + { + "epoch": 0.9586179863366558, + "grad_norm": 1.7039105892181396, + "learning_rate": 8.555e-05, + "loss": 0.4254, + "step": 17119 + }, + { + "epoch": 0.9586739836487849, + "grad_norm": 1.2343722581863403, + "learning_rate": 8.5555e-05, + "loss": 0.4459, + "step": 17120 + }, + { + "epoch": 0.9587299809609139, + "grad_norm": 1.8182319402694702, + "learning_rate": 8.556e-05, + "loss": 0.4805, + "step": 17121 + }, + { + "epoch": 0.9587859782730429, + "grad_norm": 1.6502269506454468, + "learning_rate": 8.556500000000001e-05, + "loss": 0.5003, + "step": 17122 + }, + { + "epoch": 0.9588419755851719, + "grad_norm": 1.1833775043487549, + "learning_rate": 8.557000000000001e-05, + "loss": 0.4476, + "step": 17123 + }, + { + "epoch": 0.9588979728973009, + "grad_norm": 1.6448407173156738, + "learning_rate": 8.5575e-05, + "loss": 0.3923, + "step": 17124 + }, + { + "epoch": 0.95895397020943, + "grad_norm": 1.1200000047683716, + "learning_rate": 8.558e-05, + "loss": 0.3327, + "step": 17125 + }, + { + "epoch": 0.959009967521559, + "grad_norm": 1.3580704927444458, + "learning_rate": 8.5585e-05, + "loss": 0.5893, + "step": 17126 + }, + { + "epoch": 0.959065964833688, + "grad_norm": 1.5426150560379028, + "learning_rate": 8.559e-05, + "loss": 0.5055, + "step": 17127 + }, + { + "epoch": 0.959121962145817, + "grad_norm": 1.2711679935455322, + "learning_rate": 8.559500000000001e-05, + "loss": 0.4388, + "step": 17128 + }, + { + "epoch": 0.959177959457946, + "grad_norm": 1.3287243843078613, + "learning_rate": 8.560000000000001e-05, + "loss": 0.4177, + "step": 17129 + }, + { + "epoch": 0.9592339567700751, + "grad_norm": 1.3373806476593018, + "learning_rate": 8.5605e-05, + "loss": 0.4425, + "step": 17130 + }, + { + "epoch": 0.9592899540822041, + "grad_norm": 1.3446122407913208, + "learning_rate": 8.561e-05, + "loss": 0.4756, + "step": 17131 + }, + { + "epoch": 0.9593459513943331, + "grad_norm": 1.3530346155166626, + 
"learning_rate": 8.5615e-05, + "loss": 0.365, + "step": 17132 + }, + { + "epoch": 0.9594019487064621, + "grad_norm": 1.2455790042877197, + "learning_rate": 8.562e-05, + "loss": 0.5752, + "step": 17133 + }, + { + "epoch": 0.9594579460185911, + "grad_norm": 1.099302053451538, + "learning_rate": 8.5625e-05, + "loss": 0.3418, + "step": 17134 + }, + { + "epoch": 0.9595139433307202, + "grad_norm": 1.6025974750518799, + "learning_rate": 8.563e-05, + "loss": 0.4204, + "step": 17135 + }, + { + "epoch": 0.9595699406428492, + "grad_norm": 1.3592160940170288, + "learning_rate": 8.5635e-05, + "loss": 0.524, + "step": 17136 + }, + { + "epoch": 0.9596259379549782, + "grad_norm": 1.4095449447631836, + "learning_rate": 8.564000000000001e-05, + "loss": 0.4715, + "step": 17137 + }, + { + "epoch": 0.9596819352671072, + "grad_norm": 1.1044148206710815, + "learning_rate": 8.564500000000001e-05, + "loss": 0.5169, + "step": 17138 + }, + { + "epoch": 0.9597379325792362, + "grad_norm": 1.1242592334747314, + "learning_rate": 8.565000000000001e-05, + "loss": 0.4757, + "step": 17139 + }, + { + "epoch": 0.9597939298913652, + "grad_norm": 1.244508147239685, + "learning_rate": 8.5655e-05, + "loss": 0.3963, + "step": 17140 + }, + { + "epoch": 0.9598499272034943, + "grad_norm": 1.54185152053833, + "learning_rate": 8.566e-05, + "loss": 0.4904, + "step": 17141 + }, + { + "epoch": 0.9599059245156233, + "grad_norm": 1.4076370000839233, + "learning_rate": 8.5665e-05, + "loss": 0.4381, + "step": 17142 + }, + { + "epoch": 0.9599619218277523, + "grad_norm": 1.2301522493362427, + "learning_rate": 8.567000000000001e-05, + "loss": 0.4052, + "step": 17143 + }, + { + "epoch": 0.9600179191398813, + "grad_norm": 1.486867070198059, + "learning_rate": 8.567500000000001e-05, + "loss": 0.4805, + "step": 17144 + }, + { + "epoch": 0.9600739164520103, + "grad_norm": 1.4362130165100098, + "learning_rate": 8.568e-05, + "loss": 0.3875, + "step": 17145 + }, + { + "epoch": 0.9601299137641394, + "grad_norm": 
1.2532141208648682, + "learning_rate": 8.5685e-05, + "loss": 0.4779, + "step": 17146 + }, + { + "epoch": 0.9601859110762684, + "grad_norm": 1.3376730680465698, + "learning_rate": 8.569e-05, + "loss": 0.4556, + "step": 17147 + }, + { + "epoch": 0.9602419083883974, + "grad_norm": 1.3041802644729614, + "learning_rate": 8.5695e-05, + "loss": 0.6248, + "step": 17148 + }, + { + "epoch": 0.9602979057005264, + "grad_norm": 3.7895519733428955, + "learning_rate": 8.57e-05, + "loss": 0.3966, + "step": 17149 + }, + { + "epoch": 0.9603539030126554, + "grad_norm": 1.2806825637817383, + "learning_rate": 8.570500000000001e-05, + "loss": 0.5187, + "step": 17150 + }, + { + "epoch": 0.9604099003247845, + "grad_norm": 1.4069244861602783, + "learning_rate": 8.571e-05, + "loss": 0.5099, + "step": 17151 + }, + { + "epoch": 0.9604658976369135, + "grad_norm": 1.3413307666778564, + "learning_rate": 8.5715e-05, + "loss": 0.3637, + "step": 17152 + }, + { + "epoch": 0.9605218949490425, + "grad_norm": 1.2224640846252441, + "learning_rate": 8.572e-05, + "loss": 0.4022, + "step": 17153 + }, + { + "epoch": 0.9605778922611715, + "grad_norm": 1.2687058448791504, + "learning_rate": 8.5725e-05, + "loss": 0.532, + "step": 17154 + }, + { + "epoch": 0.9606338895733005, + "grad_norm": 1.3773020505905151, + "learning_rate": 8.573e-05, + "loss": 0.4421, + "step": 17155 + }, + { + "epoch": 0.9606898868854294, + "grad_norm": 1.491414189338684, + "learning_rate": 8.5735e-05, + "loss": 0.5085, + "step": 17156 + }, + { + "epoch": 0.9607458841975585, + "grad_norm": 1.4372575283050537, + "learning_rate": 8.574000000000002e-05, + "loss": 0.4531, + "step": 17157 + }, + { + "epoch": 0.9608018815096875, + "grad_norm": 1.0840134620666504, + "learning_rate": 8.574500000000001e-05, + "loss": 0.3929, + "step": 17158 + }, + { + "epoch": 0.9608578788218165, + "grad_norm": 1.4550862312316895, + "learning_rate": 8.575000000000001e-05, + "loss": 0.4343, + "step": 17159 + }, + { + "epoch": 0.9609138761339455, + "grad_norm": 
1.675118327140808, + "learning_rate": 8.575500000000001e-05, + "loss": 0.4982, + "step": 17160 + }, + { + "epoch": 0.9609698734460745, + "grad_norm": 1.2108522653579712, + "learning_rate": 8.576e-05, + "loss": 0.3745, + "step": 17161 + }, + { + "epoch": 0.9610258707582036, + "grad_norm": 1.415676474571228, + "learning_rate": 8.5765e-05, + "loss": 0.5187, + "step": 17162 + }, + { + "epoch": 0.9610818680703326, + "grad_norm": 1.544421911239624, + "learning_rate": 8.577e-05, + "loss": 0.4471, + "step": 17163 + }, + { + "epoch": 0.9611378653824616, + "grad_norm": 1.3246055841445923, + "learning_rate": 8.577500000000001e-05, + "loss": 0.4553, + "step": 17164 + }, + { + "epoch": 0.9611938626945906, + "grad_norm": 1.423762321472168, + "learning_rate": 8.578000000000001e-05, + "loss": 0.5542, + "step": 17165 + }, + { + "epoch": 0.9612498600067196, + "grad_norm": 1.6242868900299072, + "learning_rate": 8.5785e-05, + "loss": 0.4095, + "step": 17166 + }, + { + "epoch": 0.9613058573188487, + "grad_norm": 1.2840968370437622, + "learning_rate": 8.579e-05, + "loss": 0.5696, + "step": 17167 + }, + { + "epoch": 0.9613618546309777, + "grad_norm": 1.1831138134002686, + "learning_rate": 8.5795e-05, + "loss": 0.3876, + "step": 17168 + }, + { + "epoch": 0.9614178519431067, + "grad_norm": 1.7165281772613525, + "learning_rate": 8.58e-05, + "loss": 0.4842, + "step": 17169 + }, + { + "epoch": 0.9614738492552357, + "grad_norm": 1.2815358638763428, + "learning_rate": 8.5805e-05, + "loss": 0.46, + "step": 17170 + }, + { + "epoch": 0.9615298465673647, + "grad_norm": 1.4860469102859497, + "learning_rate": 8.581000000000001e-05, + "loss": 0.4817, + "step": 17171 + }, + { + "epoch": 0.9615858438794938, + "grad_norm": 1.146111011505127, + "learning_rate": 8.5815e-05, + "loss": 0.4267, + "step": 17172 + }, + { + "epoch": 0.9616418411916228, + "grad_norm": 1.5226686000823975, + "learning_rate": 8.582e-05, + "loss": 0.3614, + "step": 17173 + }, + { + "epoch": 0.9616978385037518, + "grad_norm": 
1.1706711053848267, + "learning_rate": 8.5825e-05, + "loss": 0.3884, + "step": 17174 + }, + { + "epoch": 0.9617538358158808, + "grad_norm": 1.2649924755096436, + "learning_rate": 8.583e-05, + "loss": 0.317, + "step": 17175 + }, + { + "epoch": 0.9618098331280098, + "grad_norm": 1.4722168445587158, + "learning_rate": 8.5835e-05, + "loss": 0.4549, + "step": 17176 + }, + { + "epoch": 0.9618658304401388, + "grad_norm": 1.4678112268447876, + "learning_rate": 8.584e-05, + "loss": 0.452, + "step": 17177 + }, + { + "epoch": 0.9619218277522679, + "grad_norm": 1.369017243385315, + "learning_rate": 8.5845e-05, + "loss": 0.4349, + "step": 17178 + }, + { + "epoch": 0.9619778250643969, + "grad_norm": 1.6606252193450928, + "learning_rate": 8.585000000000001e-05, + "loss": 0.4326, + "step": 17179 + }, + { + "epoch": 0.9620338223765259, + "grad_norm": 1.2879390716552734, + "learning_rate": 8.585500000000001e-05, + "loss": 0.4629, + "step": 17180 + }, + { + "epoch": 0.9620898196886549, + "grad_norm": 1.4565268754959106, + "learning_rate": 8.586000000000001e-05, + "loss": 0.4547, + "step": 17181 + }, + { + "epoch": 0.9621458170007839, + "grad_norm": 1.0399436950683594, + "learning_rate": 8.5865e-05, + "loss": 0.3099, + "step": 17182 + }, + { + "epoch": 0.962201814312913, + "grad_norm": 1.2132089138031006, + "learning_rate": 8.587e-05, + "loss": 0.4141, + "step": 17183 + }, + { + "epoch": 0.962257811625042, + "grad_norm": 1.2838457822799683, + "learning_rate": 8.5875e-05, + "loss": 0.3578, + "step": 17184 + }, + { + "epoch": 0.962313808937171, + "grad_norm": 1.459041953086853, + "learning_rate": 8.588000000000001e-05, + "loss": 0.444, + "step": 17185 + }, + { + "epoch": 0.9623698062493, + "grad_norm": 1.8635562658309937, + "learning_rate": 8.588500000000001e-05, + "loss": 0.4549, + "step": 17186 + }, + { + "epoch": 0.962425803561429, + "grad_norm": 1.378082036972046, + "learning_rate": 8.589e-05, + "loss": 0.4109, + "step": 17187 + }, + { + "epoch": 0.9624818008735581, + "grad_norm": 
1.1600459814071655, + "learning_rate": 8.5895e-05, + "loss": 0.3097, + "step": 17188 + }, + { + "epoch": 0.9625377981856871, + "grad_norm": 1.5207675695419312, + "learning_rate": 8.59e-05, + "loss": 0.4211, + "step": 17189 + }, + { + "epoch": 0.9625937954978161, + "grad_norm": 1.275919795036316, + "learning_rate": 8.5905e-05, + "loss": 0.5067, + "step": 17190 + }, + { + "epoch": 0.9626497928099451, + "grad_norm": 1.1985430717468262, + "learning_rate": 8.591e-05, + "loss": 0.4454, + "step": 17191 + }, + { + "epoch": 0.9627057901220741, + "grad_norm": 1.5325583219528198, + "learning_rate": 8.5915e-05, + "loss": 0.5317, + "step": 17192 + }, + { + "epoch": 0.9627617874342032, + "grad_norm": 1.1471023559570312, + "learning_rate": 8.592e-05, + "loss": 0.388, + "step": 17193 + }, + { + "epoch": 0.9628177847463322, + "grad_norm": 1.355360507965088, + "learning_rate": 8.5925e-05, + "loss": 0.5246, + "step": 17194 + }, + { + "epoch": 0.9628737820584612, + "grad_norm": 1.3444854021072388, + "learning_rate": 8.593e-05, + "loss": 0.5281, + "step": 17195 + }, + { + "epoch": 0.9629297793705902, + "grad_norm": 1.592160701751709, + "learning_rate": 8.5935e-05, + "loss": 0.4489, + "step": 17196 + }, + { + "epoch": 0.9629857766827192, + "grad_norm": 1.5497907400131226, + "learning_rate": 8.594000000000001e-05, + "loss": 0.5357, + "step": 17197 + }, + { + "epoch": 0.9630417739948482, + "grad_norm": 1.4236985445022583, + "learning_rate": 8.5945e-05, + "loss": 0.4859, + "step": 17198 + }, + { + "epoch": 0.9630977713069773, + "grad_norm": 1.5251559019088745, + "learning_rate": 8.595e-05, + "loss": 0.5002, + "step": 17199 + }, + { + "epoch": 0.9631537686191063, + "grad_norm": 1.4532705545425415, + "learning_rate": 8.595500000000001e-05, + "loss": 0.5314, + "step": 17200 + }, + { + "epoch": 0.9632097659312353, + "grad_norm": 1.423785924911499, + "learning_rate": 8.596000000000001e-05, + "loss": 0.582, + "step": 17201 + }, + { + "epoch": 0.9632657632433643, + "grad_norm": 
1.3997337818145752, + "learning_rate": 8.596500000000001e-05, + "loss": 0.4636, + "step": 17202 + }, + { + "epoch": 0.9633217605554933, + "grad_norm": 1.3916946649551392, + "learning_rate": 8.597e-05, + "loss": 0.4764, + "step": 17203 + }, + { + "epoch": 0.9633777578676224, + "grad_norm": 1.5298632383346558, + "learning_rate": 8.5975e-05, + "loss": 0.4478, + "step": 17204 + }, + { + "epoch": 0.9634337551797514, + "grad_norm": 1.5225152969360352, + "learning_rate": 8.598e-05, + "loss": 0.5898, + "step": 17205 + }, + { + "epoch": 0.9634897524918804, + "grad_norm": 1.1563955545425415, + "learning_rate": 8.598500000000001e-05, + "loss": 0.539, + "step": 17206 + }, + { + "epoch": 0.9635457498040094, + "grad_norm": 1.3186798095703125, + "learning_rate": 8.599000000000001e-05, + "loss": 0.472, + "step": 17207 + }, + { + "epoch": 0.9636017471161384, + "grad_norm": 1.4567782878875732, + "learning_rate": 8.5995e-05, + "loss": 0.5397, + "step": 17208 + }, + { + "epoch": 0.9636577444282675, + "grad_norm": 1.0257200002670288, + "learning_rate": 8.6e-05, + "loss": 0.3696, + "step": 17209 + }, + { + "epoch": 0.9637137417403965, + "grad_norm": 1.7328156232833862, + "learning_rate": 8.6005e-05, + "loss": 0.3124, + "step": 17210 + }, + { + "epoch": 0.9637697390525255, + "grad_norm": 1.5354329347610474, + "learning_rate": 8.601e-05, + "loss": 0.483, + "step": 17211 + }, + { + "epoch": 0.9638257363646545, + "grad_norm": 1.1250163316726685, + "learning_rate": 8.6015e-05, + "loss": 0.4669, + "step": 17212 + }, + { + "epoch": 0.9638817336767835, + "grad_norm": 1.317323088645935, + "learning_rate": 8.602e-05, + "loss": 0.3965, + "step": 17213 + }, + { + "epoch": 0.9639377309889126, + "grad_norm": 1.4232598543167114, + "learning_rate": 8.6025e-05, + "loss": 0.4235, + "step": 17214 + }, + { + "epoch": 0.9639937283010416, + "grad_norm": 1.5435523986816406, + "learning_rate": 8.603e-05, + "loss": 0.4698, + "step": 17215 + }, + { + "epoch": 0.9640497256131706, + "grad_norm": 
1.3362163305282593, + "learning_rate": 8.6035e-05, + "loss": 0.3945, + "step": 17216 + }, + { + "epoch": 0.9641057229252996, + "grad_norm": 1.4581693410873413, + "learning_rate": 8.604000000000001e-05, + "loss": 0.5974, + "step": 17217 + }, + { + "epoch": 0.9641617202374286, + "grad_norm": 1.4092007875442505, + "learning_rate": 8.604500000000001e-05, + "loss": 0.5306, + "step": 17218 + }, + { + "epoch": 0.9642177175495577, + "grad_norm": 1.7647417783737183, + "learning_rate": 8.605e-05, + "loss": 0.5479, + "step": 17219 + }, + { + "epoch": 0.9642737148616867, + "grad_norm": 1.4271140098571777, + "learning_rate": 8.6055e-05, + "loss": 0.5704, + "step": 17220 + }, + { + "epoch": 0.9643297121738157, + "grad_norm": 1.4965245723724365, + "learning_rate": 8.606000000000001e-05, + "loss": 0.5712, + "step": 17221 + }, + { + "epoch": 0.9643857094859447, + "grad_norm": 1.320960283279419, + "learning_rate": 8.606500000000001e-05, + "loss": 0.5041, + "step": 17222 + }, + { + "epoch": 0.9644417067980737, + "grad_norm": 1.5413872003555298, + "learning_rate": 8.607000000000001e-05, + "loss": 0.5382, + "step": 17223 + }, + { + "epoch": 0.9644977041102027, + "grad_norm": 1.509645938873291, + "learning_rate": 8.6075e-05, + "loss": 0.3631, + "step": 17224 + }, + { + "epoch": 0.9645537014223318, + "grad_norm": 1.2352864742279053, + "learning_rate": 8.608e-05, + "loss": 0.3771, + "step": 17225 + }, + { + "epoch": 0.9646096987344608, + "grad_norm": 1.4004476070404053, + "learning_rate": 8.6085e-05, + "loss": 0.4592, + "step": 17226 + }, + { + "epoch": 0.9646656960465898, + "grad_norm": 1.2401401996612549, + "learning_rate": 8.609e-05, + "loss": 0.4409, + "step": 17227 + }, + { + "epoch": 0.9647216933587188, + "grad_norm": 1.4171693325042725, + "learning_rate": 8.609500000000001e-05, + "loss": 0.4446, + "step": 17228 + }, + { + "epoch": 0.9647776906708478, + "grad_norm": 1.1623423099517822, + "learning_rate": 8.61e-05, + "loss": 0.5162, + "step": 17229 + }, + { + "epoch": 
0.9648336879829769, + "grad_norm": 1.4732013940811157, + "learning_rate": 8.6105e-05, + "loss": 0.411, + "step": 17230 + }, + { + "epoch": 0.9648896852951059, + "grad_norm": 1.3330683708190918, + "learning_rate": 8.611e-05, + "loss": 0.463, + "step": 17231 + }, + { + "epoch": 0.9649456826072349, + "grad_norm": 1.6553144454956055, + "learning_rate": 8.6115e-05, + "loss": 0.4626, + "step": 17232 + }, + { + "epoch": 0.9650016799193639, + "grad_norm": 1.2183603048324585, + "learning_rate": 8.612e-05, + "loss": 0.4028, + "step": 17233 + }, + { + "epoch": 0.9650576772314929, + "grad_norm": 1.3921277523040771, + "learning_rate": 8.6125e-05, + "loss": 0.5236, + "step": 17234 + }, + { + "epoch": 0.965113674543622, + "grad_norm": 1.318587303161621, + "learning_rate": 8.613e-05, + "loss": 0.3898, + "step": 17235 + }, + { + "epoch": 0.965169671855751, + "grad_norm": 1.4311283826828003, + "learning_rate": 8.6135e-05, + "loss": 0.3776, + "step": 17236 + }, + { + "epoch": 0.96522566916788, + "grad_norm": 2.413759231567383, + "learning_rate": 8.614000000000001e-05, + "loss": 0.602, + "step": 17237 + }, + { + "epoch": 0.9652816664800089, + "grad_norm": 2.402284860610962, + "learning_rate": 8.614500000000001e-05, + "loss": 0.4754, + "step": 17238 + }, + { + "epoch": 0.9653376637921379, + "grad_norm": 1.4711947441101074, + "learning_rate": 8.615000000000001e-05, + "loss": 0.5307, + "step": 17239 + }, + { + "epoch": 0.9653936611042669, + "grad_norm": 1.5300489664077759, + "learning_rate": 8.6155e-05, + "loss": 0.5064, + "step": 17240 + }, + { + "epoch": 0.965449658416396, + "grad_norm": 1.4783146381378174, + "learning_rate": 8.616e-05, + "loss": 0.3733, + "step": 17241 + }, + { + "epoch": 0.965505655728525, + "grad_norm": 1.3357300758361816, + "learning_rate": 8.616500000000001e-05, + "loss": 0.4655, + "step": 17242 + }, + { + "epoch": 0.965561653040654, + "grad_norm": 1.3752564191818237, + "learning_rate": 8.617000000000001e-05, + "loss": 0.5256, + "step": 17243 + }, + { + "epoch": 
0.965617650352783, + "grad_norm": 1.4365041255950928, + "learning_rate": 8.617500000000001e-05, + "loss": 0.4837, + "step": 17244 + }, + { + "epoch": 0.965673647664912, + "grad_norm": 1.1746888160705566, + "learning_rate": 8.618e-05, + "loss": 0.464, + "step": 17245 + }, + { + "epoch": 0.9657296449770411, + "grad_norm": 1.3307267427444458, + "learning_rate": 8.6185e-05, + "loss": 0.4821, + "step": 17246 + }, + { + "epoch": 0.9657856422891701, + "grad_norm": 1.5634570121765137, + "learning_rate": 8.619e-05, + "loss": 0.5751, + "step": 17247 + }, + { + "epoch": 0.9658416396012991, + "grad_norm": 1.3047250509262085, + "learning_rate": 8.6195e-05, + "loss": 0.4271, + "step": 17248 + }, + { + "epoch": 0.9658976369134281, + "grad_norm": 1.172465443611145, + "learning_rate": 8.620000000000001e-05, + "loss": 0.4076, + "step": 17249 + }, + { + "epoch": 0.9659536342255571, + "grad_norm": 1.4473350048065186, + "learning_rate": 8.6205e-05, + "loss": 0.5583, + "step": 17250 + }, + { + "epoch": 0.9660096315376862, + "grad_norm": 1.2676153182983398, + "learning_rate": 8.621e-05, + "loss": 0.3651, + "step": 17251 + }, + { + "epoch": 0.9660656288498152, + "grad_norm": 1.218522071838379, + "learning_rate": 8.6215e-05, + "loss": 0.4182, + "step": 17252 + }, + { + "epoch": 0.9661216261619442, + "grad_norm": 1.5967514514923096, + "learning_rate": 8.622e-05, + "loss": 0.5798, + "step": 17253 + }, + { + "epoch": 0.9661776234740732, + "grad_norm": 1.5131630897521973, + "learning_rate": 8.6225e-05, + "loss": 0.6109, + "step": 17254 + }, + { + "epoch": 0.9662336207862022, + "grad_norm": 1.3282312154769897, + "learning_rate": 8.623e-05, + "loss": 0.3214, + "step": 17255 + }, + { + "epoch": 0.9662896180983312, + "grad_norm": 1.2723884582519531, + "learning_rate": 8.6235e-05, + "loss": 0.3748, + "step": 17256 + }, + { + "epoch": 0.9663456154104603, + "grad_norm": 4.182846546173096, + "learning_rate": 8.624000000000001e-05, + "loss": 0.4868, + "step": 17257 + }, + { + "epoch": 
0.9664016127225893, + "grad_norm": 1.388085961341858, + "learning_rate": 8.624500000000001e-05, + "loss": 0.3495, + "step": 17258 + }, + { + "epoch": 0.9664576100347183, + "grad_norm": 1.6867340803146362, + "learning_rate": 8.625000000000001e-05, + "loss": 0.3838, + "step": 17259 + }, + { + "epoch": 0.9665136073468473, + "grad_norm": 1.412258505821228, + "learning_rate": 8.625500000000001e-05, + "loss": 0.322, + "step": 17260 + }, + { + "epoch": 0.9665696046589763, + "grad_norm": 1.4775829315185547, + "learning_rate": 8.626e-05, + "loss": 0.6763, + "step": 17261 + }, + { + "epoch": 0.9666256019711054, + "grad_norm": 1.2024606466293335, + "learning_rate": 8.6265e-05, + "loss": 0.3556, + "step": 17262 + }, + { + "epoch": 0.9666815992832344, + "grad_norm": 1.649871587753296, + "learning_rate": 8.627000000000001e-05, + "loss": 0.5428, + "step": 17263 + }, + { + "epoch": 0.9667375965953634, + "grad_norm": 1.4487943649291992, + "learning_rate": 8.627500000000001e-05, + "loss": 0.514, + "step": 17264 + }, + { + "epoch": 0.9667935939074924, + "grad_norm": 1.4263319969177246, + "learning_rate": 8.628000000000001e-05, + "loss": 0.422, + "step": 17265 + }, + { + "epoch": 0.9668495912196214, + "grad_norm": 1.298326015472412, + "learning_rate": 8.6285e-05, + "loss": 0.4818, + "step": 17266 + }, + { + "epoch": 0.9669055885317505, + "grad_norm": 1.5739593505859375, + "learning_rate": 8.629e-05, + "loss": 0.547, + "step": 17267 + }, + { + "epoch": 0.9669615858438795, + "grad_norm": 1.1422499418258667, + "learning_rate": 8.6295e-05, + "loss": 0.327, + "step": 17268 + }, + { + "epoch": 0.9670175831560085, + "grad_norm": 1.441144585609436, + "learning_rate": 8.63e-05, + "loss": 0.5159, + "step": 17269 + }, + { + "epoch": 0.9670735804681375, + "grad_norm": 1.6179550886154175, + "learning_rate": 8.630500000000001e-05, + "loss": 0.5163, + "step": 17270 + }, + { + "epoch": 0.9671295777802665, + "grad_norm": 1.4002225399017334, + "learning_rate": 8.631e-05, + "loss": 0.4987, + "step": 
17271 + }, + { + "epoch": 0.9671855750923956, + "grad_norm": 1.424810528755188, + "learning_rate": 8.6315e-05, + "loss": 0.4542, + "step": 17272 + }, + { + "epoch": 0.9672415724045246, + "grad_norm": 1.5833181142807007, + "learning_rate": 8.632e-05, + "loss": 0.6777, + "step": 17273 + }, + { + "epoch": 0.9672975697166536, + "grad_norm": 1.6691884994506836, + "learning_rate": 8.6325e-05, + "loss": 0.4705, + "step": 17274 + }, + { + "epoch": 0.9673535670287826, + "grad_norm": 1.240027666091919, + "learning_rate": 8.633e-05, + "loss": 0.5124, + "step": 17275 + }, + { + "epoch": 0.9674095643409116, + "grad_norm": 1.2922923564910889, + "learning_rate": 8.633499999999999e-05, + "loss": 0.4151, + "step": 17276 + }, + { + "epoch": 0.9674655616530407, + "grad_norm": 1.161521077156067, + "learning_rate": 8.634e-05, + "loss": 0.3309, + "step": 17277 + }, + { + "epoch": 0.9675215589651697, + "grad_norm": 1.3965418338775635, + "learning_rate": 8.634500000000001e-05, + "loss": 0.4749, + "step": 17278 + }, + { + "epoch": 0.9675775562772987, + "grad_norm": 1.2737889289855957, + "learning_rate": 8.635000000000001e-05, + "loss": 0.4331, + "step": 17279 + }, + { + "epoch": 0.9676335535894277, + "grad_norm": 1.2584105730056763, + "learning_rate": 8.635500000000001e-05, + "loss": 0.4195, + "step": 17280 + }, + { + "epoch": 0.9676895509015567, + "grad_norm": 1.2243163585662842, + "learning_rate": 8.636e-05, + "loss": 0.429, + "step": 17281 + }, + { + "epoch": 0.9677455482136857, + "grad_norm": 1.387087345123291, + "learning_rate": 8.6365e-05, + "loss": 0.5707, + "step": 17282 + }, + { + "epoch": 0.9678015455258148, + "grad_norm": 1.210732340812683, + "learning_rate": 8.637e-05, + "loss": 0.5474, + "step": 17283 + }, + { + "epoch": 0.9678575428379438, + "grad_norm": 1.1186721324920654, + "learning_rate": 8.637500000000001e-05, + "loss": 0.3673, + "step": 17284 + }, + { + "epoch": 0.9679135401500728, + "grad_norm": 1.5173031091690063, + "learning_rate": 8.638000000000001e-05, + "loss": 
0.4944, + "step": 17285 + }, + { + "epoch": 0.9679695374622018, + "grad_norm": 1.1591531038284302, + "learning_rate": 8.638500000000001e-05, + "loss": 0.4421, + "step": 17286 + }, + { + "epoch": 0.9680255347743308, + "grad_norm": 1.3933157920837402, + "learning_rate": 8.639e-05, + "loss": 0.443, + "step": 17287 + }, + { + "epoch": 0.9680815320864599, + "grad_norm": 1.1293524503707886, + "learning_rate": 8.6395e-05, + "loss": 0.3323, + "step": 17288 + }, + { + "epoch": 0.9681375293985889, + "grad_norm": 1.3659924268722534, + "learning_rate": 8.64e-05, + "loss": 0.3717, + "step": 17289 + }, + { + "epoch": 0.9681935267107179, + "grad_norm": 1.6097309589385986, + "learning_rate": 8.6405e-05, + "loss": 0.4152, + "step": 17290 + }, + { + "epoch": 0.9682495240228469, + "grad_norm": 1.181439995765686, + "learning_rate": 8.641000000000001e-05, + "loss": 0.404, + "step": 17291 + }, + { + "epoch": 0.9683055213349759, + "grad_norm": 1.6815600395202637, + "learning_rate": 8.6415e-05, + "loss": 0.6659, + "step": 17292 + }, + { + "epoch": 0.968361518647105, + "grad_norm": 1.570206642150879, + "learning_rate": 8.642e-05, + "loss": 0.4618, + "step": 17293 + }, + { + "epoch": 0.968417515959234, + "grad_norm": 2.163844347000122, + "learning_rate": 8.6425e-05, + "loss": 0.643, + "step": 17294 + }, + { + "epoch": 0.968473513271363, + "grad_norm": 1.467824101448059, + "learning_rate": 8.643e-05, + "loss": 0.5083, + "step": 17295 + }, + { + "epoch": 0.968529510583492, + "grad_norm": 1.1489348411560059, + "learning_rate": 8.6435e-05, + "loss": 0.3732, + "step": 17296 + }, + { + "epoch": 0.968585507895621, + "grad_norm": 1.3897266387939453, + "learning_rate": 8.643999999999999e-05, + "loss": 0.4392, + "step": 17297 + }, + { + "epoch": 0.96864150520775, + "grad_norm": 1.2860050201416016, + "learning_rate": 8.6445e-05, + "loss": 0.4827, + "step": 17298 + }, + { + "epoch": 0.9686975025198791, + "grad_norm": 1.547238826751709, + "learning_rate": 8.645000000000001e-05, + "loss": 0.4105, + 
"step": 17299 + }, + { + "epoch": 0.9687534998320081, + "grad_norm": 1.42068350315094, + "learning_rate": 8.645500000000001e-05, + "loss": 0.4412, + "step": 17300 + }, + { + "epoch": 0.9688094971441371, + "grad_norm": 1.328653335571289, + "learning_rate": 8.646000000000001e-05, + "loss": 0.4863, + "step": 17301 + }, + { + "epoch": 0.9688654944562661, + "grad_norm": 1.281003713607788, + "learning_rate": 8.6465e-05, + "loss": 0.4571, + "step": 17302 + }, + { + "epoch": 0.9689214917683951, + "grad_norm": 1.40518057346344, + "learning_rate": 8.647e-05, + "loss": 0.3568, + "step": 17303 + }, + { + "epoch": 0.9689774890805242, + "grad_norm": 1.4284865856170654, + "learning_rate": 8.6475e-05, + "loss": 0.5044, + "step": 17304 + }, + { + "epoch": 0.9690334863926532, + "grad_norm": 1.402702808380127, + "learning_rate": 8.648e-05, + "loss": 0.5023, + "step": 17305 + }, + { + "epoch": 0.9690894837047822, + "grad_norm": 1.465108871459961, + "learning_rate": 8.648500000000001e-05, + "loss": 0.5385, + "step": 17306 + }, + { + "epoch": 0.9691454810169112, + "grad_norm": 1.6162699460983276, + "learning_rate": 8.649000000000001e-05, + "loss": 0.5153, + "step": 17307 + }, + { + "epoch": 0.9692014783290402, + "grad_norm": 1.3178231716156006, + "learning_rate": 8.6495e-05, + "loss": 0.4102, + "step": 17308 + }, + { + "epoch": 0.9692574756411693, + "grad_norm": 1.271968960762024, + "learning_rate": 8.65e-05, + "loss": 0.3879, + "step": 17309 + }, + { + "epoch": 0.9693134729532983, + "grad_norm": 1.488120436668396, + "learning_rate": 8.6505e-05, + "loss": 0.4599, + "step": 17310 + }, + { + "epoch": 0.9693694702654273, + "grad_norm": 1.620308518409729, + "learning_rate": 8.651e-05, + "loss": 0.665, + "step": 17311 + }, + { + "epoch": 0.9694254675775563, + "grad_norm": 1.3775193691253662, + "learning_rate": 8.651500000000001e-05, + "loss": 0.4418, + "step": 17312 + }, + { + "epoch": 0.9694814648896853, + "grad_norm": 1.3837276697158813, + "learning_rate": 8.652e-05, + "loss": 0.5351, + 
"step": 17313 + }, + { + "epoch": 0.9695374622018144, + "grad_norm": 1.4161573648452759, + "learning_rate": 8.6525e-05, + "loss": 0.4661, + "step": 17314 + }, + { + "epoch": 0.9695934595139434, + "grad_norm": 1.5309301614761353, + "learning_rate": 8.653e-05, + "loss": 0.5751, + "step": 17315 + }, + { + "epoch": 0.9696494568260724, + "grad_norm": 1.2019197940826416, + "learning_rate": 8.6535e-05, + "loss": 0.4031, + "step": 17316 + }, + { + "epoch": 0.9697054541382014, + "grad_norm": 1.1792582273483276, + "learning_rate": 8.654e-05, + "loss": 0.4181, + "step": 17317 + }, + { + "epoch": 0.9697614514503304, + "grad_norm": 1.3851470947265625, + "learning_rate": 8.6545e-05, + "loss": 0.5365, + "step": 17318 + }, + { + "epoch": 0.9698174487624595, + "grad_norm": 1.6818479299545288, + "learning_rate": 8.655e-05, + "loss": 0.4797, + "step": 17319 + }, + { + "epoch": 0.9698734460745885, + "grad_norm": 1.4252855777740479, + "learning_rate": 8.655500000000001e-05, + "loss": 0.4359, + "step": 17320 + }, + { + "epoch": 0.9699294433867174, + "grad_norm": 1.3790574073791504, + "learning_rate": 8.656000000000001e-05, + "loss": 0.7042, + "step": 17321 + }, + { + "epoch": 0.9699854406988464, + "grad_norm": 1.2539957761764526, + "learning_rate": 8.656500000000001e-05, + "loss": 0.4094, + "step": 17322 + }, + { + "epoch": 0.9700414380109754, + "grad_norm": 1.3838392496109009, + "learning_rate": 8.657e-05, + "loss": 0.4915, + "step": 17323 + }, + { + "epoch": 0.9700974353231044, + "grad_norm": 1.2799766063690186, + "learning_rate": 8.6575e-05, + "loss": 0.4512, + "step": 17324 + }, + { + "epoch": 0.9701534326352335, + "grad_norm": 1.4863401651382446, + "learning_rate": 8.658e-05, + "loss": 0.4828, + "step": 17325 + }, + { + "epoch": 0.9702094299473625, + "grad_norm": 1.1538506746292114, + "learning_rate": 8.6585e-05, + "loss": 0.3684, + "step": 17326 + }, + { + "epoch": 0.9702654272594915, + "grad_norm": 1.3310805559158325, + "learning_rate": 8.659000000000001e-05, + "loss": 0.3406, + 
"step": 17327 + }, + { + "epoch": 0.9703214245716205, + "grad_norm": 1.559973120689392, + "learning_rate": 8.659500000000001e-05, + "loss": 0.3151, + "step": 17328 + }, + { + "epoch": 0.9703774218837495, + "grad_norm": 1.297188401222229, + "learning_rate": 8.66e-05, + "loss": 0.3679, + "step": 17329 + }, + { + "epoch": 0.9704334191958786, + "grad_norm": 1.1994104385375977, + "learning_rate": 8.6605e-05, + "loss": 0.3181, + "step": 17330 + }, + { + "epoch": 0.9704894165080076, + "grad_norm": 1.5522327423095703, + "learning_rate": 8.661e-05, + "loss": 0.4533, + "step": 17331 + }, + { + "epoch": 0.9705454138201366, + "grad_norm": 1.4846142530441284, + "learning_rate": 8.6615e-05, + "loss": 0.497, + "step": 17332 + }, + { + "epoch": 0.9706014111322656, + "grad_norm": 4.936802864074707, + "learning_rate": 8.662000000000001e-05, + "loss": 0.4634, + "step": 17333 + }, + { + "epoch": 0.9706574084443946, + "grad_norm": 1.461236834526062, + "learning_rate": 8.6625e-05, + "loss": 0.4195, + "step": 17334 + }, + { + "epoch": 0.9707134057565237, + "grad_norm": 1.3690143823623657, + "learning_rate": 8.663e-05, + "loss": 0.3715, + "step": 17335 + }, + { + "epoch": 0.9707694030686527, + "grad_norm": 1.4367308616638184, + "learning_rate": 8.6635e-05, + "loss": 0.4884, + "step": 17336 + }, + { + "epoch": 0.9708254003807817, + "grad_norm": 1.3397955894470215, + "learning_rate": 8.664e-05, + "loss": 0.3518, + "step": 17337 + }, + { + "epoch": 0.9708813976929107, + "grad_norm": 1.7689563035964966, + "learning_rate": 8.664500000000001e-05, + "loss": 0.5787, + "step": 17338 + }, + { + "epoch": 0.9709373950050397, + "grad_norm": 1.5664008855819702, + "learning_rate": 8.665e-05, + "loss": 0.3727, + "step": 17339 + }, + { + "epoch": 0.9709933923171687, + "grad_norm": 1.1659214496612549, + "learning_rate": 8.6655e-05, + "loss": 0.4004, + "step": 17340 + }, + { + "epoch": 0.9710493896292978, + "grad_norm": 1.307281255722046, + "learning_rate": 8.666000000000001e-05, + "loss": 0.3862, + "step": 
17341 + }, + { + "epoch": 0.9711053869414268, + "grad_norm": 1.6505941152572632, + "learning_rate": 8.666500000000001e-05, + "loss": 0.4579, + "step": 17342 + }, + { + "epoch": 0.9711613842535558, + "grad_norm": 1.163074254989624, + "learning_rate": 8.667000000000001e-05, + "loss": 0.4259, + "step": 17343 + }, + { + "epoch": 0.9712173815656848, + "grad_norm": 1.2999498844146729, + "learning_rate": 8.6675e-05, + "loss": 0.4441, + "step": 17344 + }, + { + "epoch": 0.9712733788778138, + "grad_norm": 2.1191458702087402, + "learning_rate": 8.668e-05, + "loss": 0.426, + "step": 17345 + }, + { + "epoch": 0.9713293761899429, + "grad_norm": 1.8392921686172485, + "learning_rate": 8.6685e-05, + "loss": 0.4483, + "step": 17346 + }, + { + "epoch": 0.9713853735020719, + "grad_norm": 1.2343018054962158, + "learning_rate": 8.669e-05, + "loss": 0.5183, + "step": 17347 + }, + { + "epoch": 0.9714413708142009, + "grad_norm": 1.1489077806472778, + "learning_rate": 8.669500000000001e-05, + "loss": 0.3422, + "step": 17348 + }, + { + "epoch": 0.9714973681263299, + "grad_norm": 1.1706421375274658, + "learning_rate": 8.67e-05, + "loss": 0.3887, + "step": 17349 + }, + { + "epoch": 0.9715533654384589, + "grad_norm": 1.3635989427566528, + "learning_rate": 8.6705e-05, + "loss": 0.5152, + "step": 17350 + }, + { + "epoch": 0.971609362750588, + "grad_norm": 2.037436008453369, + "learning_rate": 8.671e-05, + "loss": 0.3749, + "step": 17351 + }, + { + "epoch": 0.971665360062717, + "grad_norm": 1.750329852104187, + "learning_rate": 8.6715e-05, + "loss": 0.6067, + "step": 17352 + }, + { + "epoch": 0.971721357374846, + "grad_norm": 1.270072340965271, + "learning_rate": 8.672e-05, + "loss": 0.4039, + "step": 17353 + }, + { + "epoch": 0.971777354686975, + "grad_norm": 1.7762908935546875, + "learning_rate": 8.672500000000001e-05, + "loss": 0.5357, + "step": 17354 + }, + { + "epoch": 0.971833351999104, + "grad_norm": 1.4279133081436157, + "learning_rate": 8.673e-05, + "loss": 0.4171, + "step": 17355 + }, + 
{ + "epoch": 0.971889349311233, + "grad_norm": 1.1249388456344604, + "learning_rate": 8.6735e-05, + "loss": 0.3568, + "step": 17356 + }, + { + "epoch": 0.9719453466233621, + "grad_norm": 1.4502819776535034, + "learning_rate": 8.674e-05, + "loss": 0.4598, + "step": 17357 + }, + { + "epoch": 0.9720013439354911, + "grad_norm": 1.2537726163864136, + "learning_rate": 8.674500000000001e-05, + "loss": 0.3844, + "step": 17358 + }, + { + "epoch": 0.9720573412476201, + "grad_norm": 1.2995517253875732, + "learning_rate": 8.675000000000001e-05, + "loss": 0.5004, + "step": 17359 + }, + { + "epoch": 0.9721133385597491, + "grad_norm": 1.4475985765457153, + "learning_rate": 8.6755e-05, + "loss": 0.4624, + "step": 17360 + }, + { + "epoch": 0.9721693358718781, + "grad_norm": 1.2398110628128052, + "learning_rate": 8.676e-05, + "loss": 0.3665, + "step": 17361 + }, + { + "epoch": 0.9722253331840072, + "grad_norm": 1.439234972000122, + "learning_rate": 8.676500000000001e-05, + "loss": 0.4756, + "step": 17362 + }, + { + "epoch": 0.9722813304961362, + "grad_norm": 1.49754798412323, + "learning_rate": 8.677000000000001e-05, + "loss": 0.455, + "step": 17363 + }, + { + "epoch": 0.9723373278082652, + "grad_norm": 1.302474021911621, + "learning_rate": 8.677500000000001e-05, + "loss": 0.4373, + "step": 17364 + }, + { + "epoch": 0.9723933251203942, + "grad_norm": 1.0640931129455566, + "learning_rate": 8.678e-05, + "loss": 0.3671, + "step": 17365 + }, + { + "epoch": 0.9724493224325232, + "grad_norm": 1.2272655963897705, + "learning_rate": 8.6785e-05, + "loss": 0.4005, + "step": 17366 + }, + { + "epoch": 0.9725053197446523, + "grad_norm": 1.508810043334961, + "learning_rate": 8.679e-05, + "loss": 0.4766, + "step": 17367 + }, + { + "epoch": 0.9725613170567813, + "grad_norm": 1.224143147468567, + "learning_rate": 8.6795e-05, + "loss": 0.4397, + "step": 17368 + }, + { + "epoch": 0.9726173143689103, + "grad_norm": 1.2601372003555298, + "learning_rate": 8.680000000000001e-05, + "loss": 0.4508, + 
"step": 17369 + }, + { + "epoch": 0.9726733116810393, + "grad_norm": 1.5283092260360718, + "learning_rate": 8.6805e-05, + "loss": 0.3897, + "step": 17370 + }, + { + "epoch": 0.9727293089931683, + "grad_norm": 1.430024266242981, + "learning_rate": 8.681e-05, + "loss": 0.5453, + "step": 17371 + }, + { + "epoch": 0.9727853063052974, + "grad_norm": 1.3398207426071167, + "learning_rate": 8.6815e-05, + "loss": 0.4539, + "step": 17372 + }, + { + "epoch": 0.9728413036174264, + "grad_norm": 1.4060691595077515, + "learning_rate": 8.682e-05, + "loss": 0.4397, + "step": 17373 + }, + { + "epoch": 0.9728973009295554, + "grad_norm": 1.418944001197815, + "learning_rate": 8.6825e-05, + "loss": 0.4962, + "step": 17374 + }, + { + "epoch": 0.9729532982416844, + "grad_norm": 1.4422852993011475, + "learning_rate": 8.683e-05, + "loss": 0.5276, + "step": 17375 + }, + { + "epoch": 0.9730092955538134, + "grad_norm": 1.3235855102539062, + "learning_rate": 8.6835e-05, + "loss": 0.4473, + "step": 17376 + }, + { + "epoch": 0.9730652928659425, + "grad_norm": 1.2004975080490112, + "learning_rate": 8.684e-05, + "loss": 0.418, + "step": 17377 + }, + { + "epoch": 0.9731212901780715, + "grad_norm": 1.4729422330856323, + "learning_rate": 8.684500000000001e-05, + "loss": 0.455, + "step": 17378 + }, + { + "epoch": 0.9731772874902005, + "grad_norm": 1.4228463172912598, + "learning_rate": 8.685000000000001e-05, + "loss": 0.5516, + "step": 17379 + }, + { + "epoch": 0.9732332848023295, + "grad_norm": 1.2448140382766724, + "learning_rate": 8.685500000000001e-05, + "loss": 0.4987, + "step": 17380 + }, + { + "epoch": 0.9732892821144585, + "grad_norm": 1.2316290140151978, + "learning_rate": 8.686e-05, + "loss": 0.4023, + "step": 17381 + }, + { + "epoch": 0.9733452794265876, + "grad_norm": 1.3800770044326782, + "learning_rate": 8.6865e-05, + "loss": 0.5761, + "step": 17382 + }, + { + "epoch": 0.9734012767387166, + "grad_norm": 1.0687819719314575, + "learning_rate": 8.687000000000001e-05, + "loss": 0.4053, + 
"step": 17383 + }, + { + "epoch": 0.9734572740508456, + "grad_norm": 1.395995855331421, + "learning_rate": 8.687500000000001e-05, + "loss": 0.4674, + "step": 17384 + }, + { + "epoch": 0.9735132713629746, + "grad_norm": 1.3109594583511353, + "learning_rate": 8.688000000000001e-05, + "loss": 0.4991, + "step": 17385 + }, + { + "epoch": 0.9735692686751036, + "grad_norm": 1.2651557922363281, + "learning_rate": 8.6885e-05, + "loss": 0.4577, + "step": 17386 + }, + { + "epoch": 0.9736252659872326, + "grad_norm": 1.2861329317092896, + "learning_rate": 8.689e-05, + "loss": 0.4672, + "step": 17387 + }, + { + "epoch": 0.9736812632993617, + "grad_norm": 1.2998154163360596, + "learning_rate": 8.6895e-05, + "loss": 0.3578, + "step": 17388 + }, + { + "epoch": 0.9737372606114907, + "grad_norm": 1.3852159976959229, + "learning_rate": 8.69e-05, + "loss": 0.4885, + "step": 17389 + }, + { + "epoch": 0.9737932579236197, + "grad_norm": 1.4287030696868896, + "learning_rate": 8.690500000000001e-05, + "loss": 0.4675, + "step": 17390 + }, + { + "epoch": 0.9738492552357487, + "grad_norm": 1.1838635206222534, + "learning_rate": 8.691e-05, + "loss": 0.4898, + "step": 17391 + }, + { + "epoch": 0.9739052525478777, + "grad_norm": 1.3776044845581055, + "learning_rate": 8.6915e-05, + "loss": 0.4377, + "step": 17392 + }, + { + "epoch": 0.9739612498600068, + "grad_norm": 1.543620228767395, + "learning_rate": 8.692e-05, + "loss": 0.5551, + "step": 17393 + }, + { + "epoch": 0.9740172471721358, + "grad_norm": 1.5677738189697266, + "learning_rate": 8.6925e-05, + "loss": 0.5707, + "step": 17394 + }, + { + "epoch": 0.9740732444842648, + "grad_norm": 1.1838555335998535, + "learning_rate": 8.693e-05, + "loss": 0.4575, + "step": 17395 + }, + { + "epoch": 0.9741292417963938, + "grad_norm": 1.5340828895568848, + "learning_rate": 8.6935e-05, + "loss": 0.5805, + "step": 17396 + }, + { + "epoch": 0.9741852391085228, + "grad_norm": 1.3245877027511597, + "learning_rate": 8.694e-05, + "loss": 0.5607, + "step": 17397 + 
}, + { + "epoch": 0.9742412364206519, + "grad_norm": 1.3191213607788086, + "learning_rate": 8.6945e-05, + "loss": 0.5458, + "step": 17398 + }, + { + "epoch": 0.9742972337327809, + "grad_norm": 1.0440481901168823, + "learning_rate": 8.695000000000001e-05, + "loss": 0.3783, + "step": 17399 + }, + { + "epoch": 0.9743532310449099, + "grad_norm": 1.5507785081863403, + "learning_rate": 8.695500000000001e-05, + "loss": 0.443, + "step": 17400 + }, + { + "epoch": 0.9744092283570389, + "grad_norm": 1.5314637422561646, + "learning_rate": 8.696000000000001e-05, + "loss": 0.5015, + "step": 17401 + }, + { + "epoch": 0.9744652256691679, + "grad_norm": 1.307962417602539, + "learning_rate": 8.6965e-05, + "loss": 0.4961, + "step": 17402 + }, + { + "epoch": 0.974521222981297, + "grad_norm": 1.4043619632720947, + "learning_rate": 8.697e-05, + "loss": 0.5223, + "step": 17403 + }, + { + "epoch": 0.9745772202934259, + "grad_norm": 1.1751432418823242, + "learning_rate": 8.6975e-05, + "loss": 0.3579, + "step": 17404 + }, + { + "epoch": 0.9746332176055549, + "grad_norm": 1.4457268714904785, + "learning_rate": 8.698000000000001e-05, + "loss": 0.431, + "step": 17405 + }, + { + "epoch": 0.9746892149176839, + "grad_norm": 1.4647364616394043, + "learning_rate": 8.698500000000001e-05, + "loss": 0.5258, + "step": 17406 + }, + { + "epoch": 0.9747452122298129, + "grad_norm": 1.5925337076187134, + "learning_rate": 8.699e-05, + "loss": 0.3815, + "step": 17407 + }, + { + "epoch": 0.9748012095419419, + "grad_norm": 1.443375587463379, + "learning_rate": 8.6995e-05, + "loss": 0.4828, + "step": 17408 + }, + { + "epoch": 0.974857206854071, + "grad_norm": 1.2334703207015991, + "learning_rate": 8.7e-05, + "loss": 0.4202, + "step": 17409 + }, + { + "epoch": 0.9749132041662, + "grad_norm": 1.5328794717788696, + "learning_rate": 8.7005e-05, + "loss": 0.3994, + "step": 17410 + }, + { + "epoch": 0.974969201478329, + "grad_norm": 1.2115094661712646, + "learning_rate": 8.701000000000001e-05, + "loss": 0.3725, + 
"step": 17411 + }, + { + "epoch": 0.975025198790458, + "grad_norm": 1.4653737545013428, + "learning_rate": 8.7015e-05, + "loss": 0.5088, + "step": 17412 + }, + { + "epoch": 0.975081196102587, + "grad_norm": 1.5247368812561035, + "learning_rate": 8.702e-05, + "loss": 0.5465, + "step": 17413 + }, + { + "epoch": 0.975137193414716, + "grad_norm": 1.2807185649871826, + "learning_rate": 8.7025e-05, + "loss": 0.3851, + "step": 17414 + }, + { + "epoch": 0.9751931907268451, + "grad_norm": 1.2373749017715454, + "learning_rate": 8.703e-05, + "loss": 0.343, + "step": 17415 + }, + { + "epoch": 0.9752491880389741, + "grad_norm": 1.3598071336746216, + "learning_rate": 8.7035e-05, + "loss": 0.6644, + "step": 17416 + }, + { + "epoch": 0.9753051853511031, + "grad_norm": 1.1725260019302368, + "learning_rate": 8.704e-05, + "loss": 0.4689, + "step": 17417 + }, + { + "epoch": 0.9753611826632321, + "grad_norm": 1.242869257926941, + "learning_rate": 8.7045e-05, + "loss": 0.4059, + "step": 17418 + }, + { + "epoch": 0.9754171799753611, + "grad_norm": 1.1475459337234497, + "learning_rate": 8.705000000000002e-05, + "loss": 0.4537, + "step": 17419 + }, + { + "epoch": 0.9754731772874902, + "grad_norm": 1.1499022245407104, + "learning_rate": 8.705500000000001e-05, + "loss": 0.3748, + "step": 17420 + }, + { + "epoch": 0.9755291745996192, + "grad_norm": 1.149317979812622, + "learning_rate": 8.706000000000001e-05, + "loss": 0.4822, + "step": 17421 + }, + { + "epoch": 0.9755851719117482, + "grad_norm": 1.4652608633041382, + "learning_rate": 8.706500000000001e-05, + "loss": 0.4387, + "step": 17422 + }, + { + "epoch": 0.9756411692238772, + "grad_norm": 1.5358394384384155, + "learning_rate": 8.707e-05, + "loss": 0.5631, + "step": 17423 + }, + { + "epoch": 0.9756971665360062, + "grad_norm": 1.3846830129623413, + "learning_rate": 8.7075e-05, + "loss": 0.4613, + "step": 17424 + }, + { + "epoch": 0.9757531638481353, + "grad_norm": 1.6016181707382202, + "learning_rate": 8.708e-05, + "loss": 0.5172, + 
"step": 17425 + }, + { + "epoch": 0.9758091611602643, + "grad_norm": 1.251336693763733, + "learning_rate": 8.708500000000001e-05, + "loss": 0.4126, + "step": 17426 + }, + { + "epoch": 0.9758651584723933, + "grad_norm": 1.4441380500793457, + "learning_rate": 8.709000000000001e-05, + "loss": 0.3877, + "step": 17427 + }, + { + "epoch": 0.9759211557845223, + "grad_norm": 1.1034600734710693, + "learning_rate": 8.7095e-05, + "loss": 0.4298, + "step": 17428 + }, + { + "epoch": 0.9759771530966513, + "grad_norm": 1.1990065574645996, + "learning_rate": 8.71e-05, + "loss": 0.4104, + "step": 17429 + }, + { + "epoch": 0.9760331504087804, + "grad_norm": 1.752121090888977, + "learning_rate": 8.7105e-05, + "loss": 0.664, + "step": 17430 + }, + { + "epoch": 0.9760891477209094, + "grad_norm": 1.2689826488494873, + "learning_rate": 8.711e-05, + "loss": 0.517, + "step": 17431 + }, + { + "epoch": 0.9761451450330384, + "grad_norm": 1.443172812461853, + "learning_rate": 8.711500000000001e-05, + "loss": 0.4914, + "step": 17432 + }, + { + "epoch": 0.9762011423451674, + "grad_norm": 1.3688075542449951, + "learning_rate": 8.712e-05, + "loss": 0.4629, + "step": 17433 + }, + { + "epoch": 0.9762571396572964, + "grad_norm": 1.4039663076400757, + "learning_rate": 8.7125e-05, + "loss": 0.4362, + "step": 17434 + }, + { + "epoch": 0.9763131369694255, + "grad_norm": 1.5527126789093018, + "learning_rate": 8.713e-05, + "loss": 0.5913, + "step": 17435 + }, + { + "epoch": 0.9763691342815545, + "grad_norm": 1.6251251697540283, + "learning_rate": 8.7135e-05, + "loss": 0.4114, + "step": 17436 + }, + { + "epoch": 0.9764251315936835, + "grad_norm": 1.7148405313491821, + "learning_rate": 8.714e-05, + "loss": 0.531, + "step": 17437 + }, + { + "epoch": 0.9764811289058125, + "grad_norm": 1.2098383903503418, + "learning_rate": 8.714499999999999e-05, + "loss": 0.3567, + "step": 17438 + }, + { + "epoch": 0.9765371262179415, + "grad_norm": 1.2557868957519531, + "learning_rate": 8.715e-05, + "loss": 0.3424, + "step": 
17439 + }, + { + "epoch": 0.9765931235300706, + "grad_norm": 1.4591310024261475, + "learning_rate": 8.715500000000002e-05, + "loss": 0.5535, + "step": 17440 + }, + { + "epoch": 0.9766491208421996, + "grad_norm": 1.4864718914031982, + "learning_rate": 8.716000000000001e-05, + "loss": 0.4163, + "step": 17441 + }, + { + "epoch": 0.9767051181543286, + "grad_norm": 1.585402488708496, + "learning_rate": 8.716500000000001e-05, + "loss": 0.4101, + "step": 17442 + }, + { + "epoch": 0.9767611154664576, + "grad_norm": 1.4526938199996948, + "learning_rate": 8.717000000000001e-05, + "loss": 0.5043, + "step": 17443 + }, + { + "epoch": 0.9768171127785866, + "grad_norm": 1.1431570053100586, + "learning_rate": 8.7175e-05, + "loss": 0.3464, + "step": 17444 + }, + { + "epoch": 0.9768731100907156, + "grad_norm": 1.8126105070114136, + "learning_rate": 8.718e-05, + "loss": 0.656, + "step": 17445 + }, + { + "epoch": 0.9769291074028447, + "grad_norm": 1.3256500959396362, + "learning_rate": 8.7185e-05, + "loss": 0.4084, + "step": 17446 + }, + { + "epoch": 0.9769851047149737, + "grad_norm": 1.631635069847107, + "learning_rate": 8.719000000000001e-05, + "loss": 0.5274, + "step": 17447 + }, + { + "epoch": 0.9770411020271027, + "grad_norm": 1.4243981838226318, + "learning_rate": 8.719500000000001e-05, + "loss": 0.4474, + "step": 17448 + }, + { + "epoch": 0.9770970993392317, + "grad_norm": 1.605637788772583, + "learning_rate": 8.72e-05, + "loss": 0.4262, + "step": 17449 + }, + { + "epoch": 0.9771530966513607, + "grad_norm": 1.4703837633132935, + "learning_rate": 8.7205e-05, + "loss": 0.5151, + "step": 17450 + }, + { + "epoch": 0.9772090939634898, + "grad_norm": 1.3215372562408447, + "learning_rate": 8.721e-05, + "loss": 0.3395, + "step": 17451 + }, + { + "epoch": 0.9772650912756188, + "grad_norm": 1.419661283493042, + "learning_rate": 8.7215e-05, + "loss": 0.5087, + "step": 17452 + }, + { + "epoch": 0.9773210885877478, + "grad_norm": 1.253587245941162, + "learning_rate": 8.722e-05, + "loss": 
0.4516, + "step": 17453 + }, + { + "epoch": 0.9773770858998768, + "grad_norm": 1.3379052877426147, + "learning_rate": 8.7225e-05, + "loss": 0.3923, + "step": 17454 + }, + { + "epoch": 0.9774330832120058, + "grad_norm": 1.2216262817382812, + "learning_rate": 8.723e-05, + "loss": 0.4176, + "step": 17455 + }, + { + "epoch": 0.9774890805241349, + "grad_norm": 1.610276460647583, + "learning_rate": 8.7235e-05, + "loss": 0.5338, + "step": 17456 + }, + { + "epoch": 0.9775450778362639, + "grad_norm": 1.5220662355422974, + "learning_rate": 8.724e-05, + "loss": 0.7544, + "step": 17457 + }, + { + "epoch": 0.9776010751483929, + "grad_norm": 1.516425371170044, + "learning_rate": 8.7245e-05, + "loss": 0.4472, + "step": 17458 + }, + { + "epoch": 0.9776570724605219, + "grad_norm": 1.4756171703338623, + "learning_rate": 8.725e-05, + "loss": 0.4746, + "step": 17459 + }, + { + "epoch": 0.9777130697726509, + "grad_norm": 1.1557035446166992, + "learning_rate": 8.7255e-05, + "loss": 0.4096, + "step": 17460 + }, + { + "epoch": 0.97776906708478, + "grad_norm": 4.008048057556152, + "learning_rate": 8.726000000000001e-05, + "loss": 0.5853, + "step": 17461 + }, + { + "epoch": 0.977825064396909, + "grad_norm": 1.3212580680847168, + "learning_rate": 8.726500000000001e-05, + "loss": 0.4024, + "step": 17462 + }, + { + "epoch": 0.977881061709038, + "grad_norm": 1.3305388689041138, + "learning_rate": 8.727000000000001e-05, + "loss": 0.3946, + "step": 17463 + }, + { + "epoch": 0.977937059021167, + "grad_norm": 1.2800099849700928, + "learning_rate": 8.727500000000001e-05, + "loss": 0.4303, + "step": 17464 + }, + { + "epoch": 0.977993056333296, + "grad_norm": 1.755967140197754, + "learning_rate": 8.728e-05, + "loss": 0.6422, + "step": 17465 + }, + { + "epoch": 0.978049053645425, + "grad_norm": 1.3023176193237305, + "learning_rate": 8.7285e-05, + "loss": 0.4761, + "step": 17466 + }, + { + "epoch": 0.9781050509575541, + "grad_norm": 1.6691690683364868, + "learning_rate": 8.729e-05, + "loss": 0.4489, + 
"step": 17467 + }, + { + "epoch": 0.9781610482696831, + "grad_norm": 1.2901387214660645, + "learning_rate": 8.729500000000001e-05, + "loss": 0.3703, + "step": 17468 + }, + { + "epoch": 0.9782170455818121, + "grad_norm": 1.5257415771484375, + "learning_rate": 8.730000000000001e-05, + "loss": 0.5251, + "step": 17469 + }, + { + "epoch": 0.9782730428939411, + "grad_norm": 1.2863874435424805, + "learning_rate": 8.7305e-05, + "loss": 0.5034, + "step": 17470 + }, + { + "epoch": 0.9783290402060701, + "grad_norm": 1.2762781381607056, + "learning_rate": 8.731e-05, + "loss": 0.6128, + "step": 17471 + }, + { + "epoch": 0.9783850375181992, + "grad_norm": 1.3336706161499023, + "learning_rate": 8.7315e-05, + "loss": 0.5472, + "step": 17472 + }, + { + "epoch": 0.9784410348303282, + "grad_norm": 1.3449336290359497, + "learning_rate": 8.732e-05, + "loss": 0.4165, + "step": 17473 + }, + { + "epoch": 0.9784970321424572, + "grad_norm": 1.3002980947494507, + "learning_rate": 8.7325e-05, + "loss": 0.4092, + "step": 17474 + }, + { + "epoch": 0.9785530294545862, + "grad_norm": 1.4328842163085938, + "learning_rate": 8.733e-05, + "loss": 0.3841, + "step": 17475 + }, + { + "epoch": 0.9786090267667152, + "grad_norm": 1.2100359201431274, + "learning_rate": 8.7335e-05, + "loss": 0.3087, + "step": 17476 + }, + { + "epoch": 0.9786650240788443, + "grad_norm": 1.296263337135315, + "learning_rate": 8.734e-05, + "loss": 0.4679, + "step": 17477 + }, + { + "epoch": 0.9787210213909733, + "grad_norm": 1.3720515966415405, + "learning_rate": 8.7345e-05, + "loss": 0.5072, + "step": 17478 + }, + { + "epoch": 0.9787770187031023, + "grad_norm": 1.2821012735366821, + "learning_rate": 8.735000000000001e-05, + "loss": 0.4174, + "step": 17479 + }, + { + "epoch": 0.9788330160152313, + "grad_norm": 1.033850073814392, + "learning_rate": 8.7355e-05, + "loss": 0.429, + "step": 17480 + }, + { + "epoch": 0.9788890133273603, + "grad_norm": 1.2118123769760132, + "learning_rate": 8.736e-05, + "loss": 0.3842, + "step": 17481 
+ }, + { + "epoch": 0.9789450106394894, + "grad_norm": 1.198876142501831, + "learning_rate": 8.7365e-05, + "loss": 0.4736, + "step": 17482 + }, + { + "epoch": 0.9790010079516184, + "grad_norm": 2.4156970977783203, + "learning_rate": 8.737000000000001e-05, + "loss": 0.5503, + "step": 17483 + }, + { + "epoch": 0.9790570052637474, + "grad_norm": 1.1911977529525757, + "learning_rate": 8.737500000000001e-05, + "loss": 0.4204, + "step": 17484 + }, + { + "epoch": 0.9791130025758764, + "grad_norm": 1.6712185144424438, + "learning_rate": 8.738000000000001e-05, + "loss": 0.4103, + "step": 17485 + }, + { + "epoch": 0.9791689998880053, + "grad_norm": 1.511715292930603, + "learning_rate": 8.7385e-05, + "loss": 0.4508, + "step": 17486 + }, + { + "epoch": 0.9792249972001343, + "grad_norm": 1.4734218120574951, + "learning_rate": 8.739e-05, + "loss": 0.4137, + "step": 17487 + }, + { + "epoch": 0.9792809945122634, + "grad_norm": 1.347332239151001, + "learning_rate": 8.7395e-05, + "loss": 0.4178, + "step": 17488 + }, + { + "epoch": 0.9793369918243924, + "grad_norm": 1.2072690725326538, + "learning_rate": 8.740000000000001e-05, + "loss": 0.4265, + "step": 17489 + }, + { + "epoch": 0.9793929891365214, + "grad_norm": 1.4905452728271484, + "learning_rate": 8.740500000000001e-05, + "loss": 0.5137, + "step": 17490 + }, + { + "epoch": 0.9794489864486504, + "grad_norm": 1.2617844343185425, + "learning_rate": 8.741e-05, + "loss": 0.303, + "step": 17491 + }, + { + "epoch": 0.9795049837607794, + "grad_norm": 1.215404748916626, + "learning_rate": 8.7415e-05, + "loss": 0.4278, + "step": 17492 + }, + { + "epoch": 0.9795609810729085, + "grad_norm": 1.5266979932785034, + "learning_rate": 8.742e-05, + "loss": 0.435, + "step": 17493 + }, + { + "epoch": 0.9796169783850375, + "grad_norm": 1.3276944160461426, + "learning_rate": 8.7425e-05, + "loss": 0.5681, + "step": 17494 + }, + { + "epoch": 0.9796729756971665, + "grad_norm": 1.4566097259521484, + "learning_rate": 8.743e-05, + "loss": 0.5974, + "step": 
17495 + }, + { + "epoch": 0.9797289730092955, + "grad_norm": 1.4847010374069214, + "learning_rate": 8.7435e-05, + "loss": 0.4646, + "step": 17496 + }, + { + "epoch": 0.9797849703214245, + "grad_norm": 1.2967168092727661, + "learning_rate": 8.744e-05, + "loss": 0.4152, + "step": 17497 + }, + { + "epoch": 0.9798409676335536, + "grad_norm": 1.37638521194458, + "learning_rate": 8.7445e-05, + "loss": 0.3974, + "step": 17498 + }, + { + "epoch": 0.9798969649456826, + "grad_norm": 1.160549283027649, + "learning_rate": 8.745000000000001e-05, + "loss": 0.4809, + "step": 17499 + }, + { + "epoch": 0.9799529622578116, + "grad_norm": 1.4034357070922852, + "learning_rate": 8.745500000000001e-05, + "loss": 0.4828, + "step": 17500 + }, + { + "epoch": 0.9800089595699406, + "grad_norm": 1.6993513107299805, + "learning_rate": 8.746e-05, + "loss": 0.403, + "step": 17501 + }, + { + "epoch": 0.9800649568820696, + "grad_norm": 1.4433549642562866, + "learning_rate": 8.7465e-05, + "loss": 0.5207, + "step": 17502 + }, + { + "epoch": 0.9801209541941986, + "grad_norm": 1.9773705005645752, + "learning_rate": 8.747e-05, + "loss": 0.6333, + "step": 17503 + }, + { + "epoch": 0.9801769515063277, + "grad_norm": 1.4764324426651, + "learning_rate": 8.747500000000001e-05, + "loss": 0.5438, + "step": 17504 + }, + { + "epoch": 0.9802329488184567, + "grad_norm": 1.410233736038208, + "learning_rate": 8.748000000000001e-05, + "loss": 0.5832, + "step": 17505 + }, + { + "epoch": 0.9802889461305857, + "grad_norm": 1.5553776025772095, + "learning_rate": 8.748500000000001e-05, + "loss": 0.5932, + "step": 17506 + }, + { + "epoch": 0.9803449434427147, + "grad_norm": 1.5294225215911865, + "learning_rate": 8.749e-05, + "loss": 0.6163, + "step": 17507 + }, + { + "epoch": 0.9804009407548437, + "grad_norm": 1.3601484298706055, + "learning_rate": 8.7495e-05, + "loss": 0.5146, + "step": 17508 + }, + { + "epoch": 0.9804569380669728, + "grad_norm": 1.3084996938705444, + "learning_rate": 8.75e-05, + "loss": 0.4682, + 
"step": 17509 + }, + { + "epoch": 0.9805129353791018, + "grad_norm": 1.2972066402435303, + "learning_rate": 8.750500000000001e-05, + "loss": 0.4591, + "step": 17510 + }, + { + "epoch": 0.9805689326912308, + "grad_norm": 1.3795164823532104, + "learning_rate": 8.751000000000001e-05, + "loss": 0.406, + "step": 17511 + }, + { + "epoch": 0.9806249300033598, + "grad_norm": 1.280336618423462, + "learning_rate": 8.7515e-05, + "loss": 0.4563, + "step": 17512 + }, + { + "epoch": 0.9806809273154888, + "grad_norm": 1.3981839418411255, + "learning_rate": 8.752e-05, + "loss": 0.4962, + "step": 17513 + }, + { + "epoch": 0.9807369246276179, + "grad_norm": 1.3525123596191406, + "learning_rate": 8.7525e-05, + "loss": 0.3947, + "step": 17514 + }, + { + "epoch": 0.9807929219397469, + "grad_norm": 1.4933942556381226, + "learning_rate": 8.753e-05, + "loss": 0.4733, + "step": 17515 + }, + { + "epoch": 0.9808489192518759, + "grad_norm": 1.405686378479004, + "learning_rate": 8.7535e-05, + "loss": 0.467, + "step": 17516 + }, + { + "epoch": 0.9809049165640049, + "grad_norm": 1.1117249727249146, + "learning_rate": 8.754e-05, + "loss": 0.3743, + "step": 17517 + }, + { + "epoch": 0.9809609138761339, + "grad_norm": 1.4071615934371948, + "learning_rate": 8.7545e-05, + "loss": 0.3418, + "step": 17518 + }, + { + "epoch": 0.981016911188263, + "grad_norm": 1.627893090248108, + "learning_rate": 8.755e-05, + "loss": 0.6502, + "step": 17519 + }, + { + "epoch": 0.981072908500392, + "grad_norm": 1.286058783531189, + "learning_rate": 8.755500000000001e-05, + "loss": 0.4648, + "step": 17520 + }, + { + "epoch": 0.981128905812521, + "grad_norm": 1.3681645393371582, + "learning_rate": 8.756000000000001e-05, + "loss": 0.4158, + "step": 17521 + }, + { + "epoch": 0.98118490312465, + "grad_norm": 1.4059162139892578, + "learning_rate": 8.7565e-05, + "loss": 0.4763, + "step": 17522 + }, + { + "epoch": 0.981240900436779, + "grad_norm": 126.30540466308594, + "learning_rate": 8.757e-05, + "loss": 0.4891, + "step": 
17523 + }, + { + "epoch": 0.981296897748908, + "grad_norm": 1.467044472694397, + "learning_rate": 8.7575e-05, + "loss": 0.576, + "step": 17524 + }, + { + "epoch": 0.9813528950610371, + "grad_norm": 1.4908976554870605, + "learning_rate": 8.758000000000001e-05, + "loss": 0.506, + "step": 17525 + }, + { + "epoch": 0.9814088923731661, + "grad_norm": 1.2298582792282104, + "learning_rate": 8.758500000000001e-05, + "loss": 0.4315, + "step": 17526 + }, + { + "epoch": 0.9814648896852951, + "grad_norm": 1.304078221321106, + "learning_rate": 8.759e-05, + "loss": 0.42, + "step": 17527 + }, + { + "epoch": 0.9815208869974241, + "grad_norm": 1.5696616172790527, + "learning_rate": 8.7595e-05, + "loss": 0.5689, + "step": 17528 + }, + { + "epoch": 0.9815768843095531, + "grad_norm": 1.8061628341674805, + "learning_rate": 8.76e-05, + "loss": 0.5197, + "step": 17529 + }, + { + "epoch": 0.9816328816216822, + "grad_norm": 2.1518683433532715, + "learning_rate": 8.7605e-05, + "loss": 0.597, + "step": 17530 + }, + { + "epoch": 0.9816888789338112, + "grad_norm": 1.4769752025604248, + "learning_rate": 8.761000000000001e-05, + "loss": 0.5969, + "step": 17531 + }, + { + "epoch": 0.9817448762459402, + "grad_norm": 1.3505018949508667, + "learning_rate": 8.761500000000001e-05, + "loss": 0.4329, + "step": 17532 + }, + { + "epoch": 0.9818008735580692, + "grad_norm": 1.226112723350525, + "learning_rate": 8.762e-05, + "loss": 0.4346, + "step": 17533 + }, + { + "epoch": 0.9818568708701982, + "grad_norm": 1.3486183881759644, + "learning_rate": 8.7625e-05, + "loss": 0.409, + "step": 17534 + }, + { + "epoch": 0.9819128681823273, + "grad_norm": 1.6185193061828613, + "learning_rate": 8.763e-05, + "loss": 0.4879, + "step": 17535 + }, + { + "epoch": 0.9819688654944563, + "grad_norm": 1.1983236074447632, + "learning_rate": 8.7635e-05, + "loss": 0.2968, + "step": 17536 + }, + { + "epoch": 0.9820248628065853, + "grad_norm": 1.2322864532470703, + "learning_rate": 8.764e-05, + "loss": 0.4065, + "step": 17537 + }, 
+ { + "epoch": 0.9820808601187143, + "grad_norm": 1.4110888242721558, + "learning_rate": 8.7645e-05, + "loss": 0.4467, + "step": 17538 + }, + { + "epoch": 0.9821368574308433, + "grad_norm": 1.2793995141983032, + "learning_rate": 8.765e-05, + "loss": 0.4855, + "step": 17539 + }, + { + "epoch": 0.9821928547429724, + "grad_norm": 1.2857661247253418, + "learning_rate": 8.765500000000001e-05, + "loss": 0.4428, + "step": 17540 + }, + { + "epoch": 0.9822488520551014, + "grad_norm": 1.4148931503295898, + "learning_rate": 8.766000000000001e-05, + "loss": 0.3957, + "step": 17541 + }, + { + "epoch": 0.9823048493672304, + "grad_norm": 1.4639850854873657, + "learning_rate": 8.766500000000001e-05, + "loss": 0.5383, + "step": 17542 + }, + { + "epoch": 0.9823608466793594, + "grad_norm": 1.7329463958740234, + "learning_rate": 8.767e-05, + "loss": 0.3988, + "step": 17543 + }, + { + "epoch": 0.9824168439914884, + "grad_norm": 1.2866063117980957, + "learning_rate": 8.7675e-05, + "loss": 0.4273, + "step": 17544 + }, + { + "epoch": 0.9824728413036175, + "grad_norm": 1.3373000621795654, + "learning_rate": 8.768e-05, + "loss": 0.5587, + "step": 17545 + }, + { + "epoch": 0.9825288386157465, + "grad_norm": 1.371846318244934, + "learning_rate": 8.768500000000001e-05, + "loss": 0.451, + "step": 17546 + }, + { + "epoch": 0.9825848359278755, + "grad_norm": 2.8197245597839355, + "learning_rate": 8.769000000000001e-05, + "loss": 0.4816, + "step": 17547 + }, + { + "epoch": 0.9826408332400045, + "grad_norm": 3.393686532974243, + "learning_rate": 8.7695e-05, + "loss": 0.4298, + "step": 17548 + }, + { + "epoch": 0.9826968305521335, + "grad_norm": 1.4075634479522705, + "learning_rate": 8.77e-05, + "loss": 0.4598, + "step": 17549 + }, + { + "epoch": 0.9827528278642625, + "grad_norm": 1.9391776323318481, + "learning_rate": 8.7705e-05, + "loss": 0.5691, + "step": 17550 + }, + { + "epoch": 0.9828088251763916, + "grad_norm": 1.6095472574234009, + "learning_rate": 8.771e-05, + "loss": 0.7115, + "step": 
17551 + }, + { + "epoch": 0.9828648224885206, + "grad_norm": 1.1371383666992188, + "learning_rate": 8.7715e-05, + "loss": 0.3423, + "step": 17552 + }, + { + "epoch": 0.9829208198006496, + "grad_norm": 1.347645878791809, + "learning_rate": 8.772000000000001e-05, + "loss": 0.463, + "step": 17553 + }, + { + "epoch": 0.9829768171127786, + "grad_norm": 1.7110610008239746, + "learning_rate": 8.7725e-05, + "loss": 0.5672, + "step": 17554 + }, + { + "epoch": 0.9830328144249076, + "grad_norm": 1.2346199750900269, + "learning_rate": 8.773e-05, + "loss": 0.4478, + "step": 17555 + }, + { + "epoch": 0.9830888117370367, + "grad_norm": 1.3220741748809814, + "learning_rate": 8.7735e-05, + "loss": 0.4089, + "step": 17556 + }, + { + "epoch": 0.9831448090491657, + "grad_norm": 1.3453768491744995, + "learning_rate": 8.774e-05, + "loss": 0.4418, + "step": 17557 + }, + { + "epoch": 0.9832008063612947, + "grad_norm": 1.3212966918945312, + "learning_rate": 8.7745e-05, + "loss": 0.383, + "step": 17558 + }, + { + "epoch": 0.9832568036734237, + "grad_norm": 1.2123596668243408, + "learning_rate": 8.775e-05, + "loss": 0.4264, + "step": 17559 + }, + { + "epoch": 0.9833128009855527, + "grad_norm": 1.3323320150375366, + "learning_rate": 8.775500000000002e-05, + "loss": 0.4671, + "step": 17560 + }, + { + "epoch": 0.9833687982976818, + "grad_norm": 1.3458378314971924, + "learning_rate": 8.776000000000001e-05, + "loss": 0.407, + "step": 17561 + }, + { + "epoch": 0.9834247956098108, + "grad_norm": 1.4096357822418213, + "learning_rate": 8.776500000000001e-05, + "loss": 0.4173, + "step": 17562 + }, + { + "epoch": 0.9834807929219398, + "grad_norm": 1.3322290182113647, + "learning_rate": 8.777000000000001e-05, + "loss": 0.3692, + "step": 17563 + }, + { + "epoch": 0.9835367902340688, + "grad_norm": 1.205859661102295, + "learning_rate": 8.7775e-05, + "loss": 0.4507, + "step": 17564 + }, + { + "epoch": 0.9835927875461978, + "grad_norm": 1.7954727411270142, + "learning_rate": 8.778e-05, + "loss": 0.4743, + 
"step": 17565 + }, + { + "epoch": 0.9836487848583269, + "grad_norm": 1.2893177270889282, + "learning_rate": 8.7785e-05, + "loss": 0.379, + "step": 17566 + }, + { + "epoch": 0.9837047821704559, + "grad_norm": 1.3160830736160278, + "learning_rate": 8.779000000000001e-05, + "loss": 0.4332, + "step": 17567 + }, + { + "epoch": 0.9837607794825849, + "grad_norm": 1.1817042827606201, + "learning_rate": 8.779500000000001e-05, + "loss": 0.5421, + "step": 17568 + }, + { + "epoch": 0.9838167767947138, + "grad_norm": 1.2465851306915283, + "learning_rate": 8.78e-05, + "loss": 0.4129, + "step": 17569 + }, + { + "epoch": 0.9838727741068428, + "grad_norm": 1.3811452388763428, + "learning_rate": 8.7805e-05, + "loss": 0.4752, + "step": 17570 + }, + { + "epoch": 0.9839287714189718, + "grad_norm": 1.6648184061050415, + "learning_rate": 8.781e-05, + "loss": 0.4578, + "step": 17571 + }, + { + "epoch": 0.9839847687311009, + "grad_norm": 1.3105189800262451, + "learning_rate": 8.7815e-05, + "loss": 0.5465, + "step": 17572 + }, + { + "epoch": 0.9840407660432299, + "grad_norm": 1.455780029296875, + "learning_rate": 8.782e-05, + "loss": 0.4989, + "step": 17573 + }, + { + "epoch": 0.9840967633553589, + "grad_norm": 1.006490707397461, + "learning_rate": 8.782500000000001e-05, + "loss": 0.4122, + "step": 17574 + }, + { + "epoch": 0.9841527606674879, + "grad_norm": 1.4180700778961182, + "learning_rate": 8.783e-05, + "loss": 0.5415, + "step": 17575 + }, + { + "epoch": 0.9842087579796169, + "grad_norm": 1.6900314092636108, + "learning_rate": 8.7835e-05, + "loss": 0.5968, + "step": 17576 + }, + { + "epoch": 0.984264755291746, + "grad_norm": 1.7504936456680298, + "learning_rate": 8.784e-05, + "loss": 0.5195, + "step": 17577 + }, + { + "epoch": 0.984320752603875, + "grad_norm": 1.4187743663787842, + "learning_rate": 8.7845e-05, + "loss": 0.45, + "step": 17578 + }, + { + "epoch": 0.984376749916004, + "grad_norm": 1.480238914489746, + "learning_rate": 8.785e-05, + "loss": 0.5978, + "step": 17579 + }, + { 
+ "epoch": 0.984432747228133, + "grad_norm": 1.2080800533294678, + "learning_rate": 8.7855e-05, + "loss": 0.4829, + "step": 17580 + }, + { + "epoch": 0.984488744540262, + "grad_norm": 1.6299505233764648, + "learning_rate": 8.786e-05, + "loss": 0.501, + "step": 17581 + }, + { + "epoch": 0.984544741852391, + "grad_norm": 1.3159761428833008, + "learning_rate": 8.786500000000001e-05, + "loss": 0.4247, + "step": 17582 + }, + { + "epoch": 0.9846007391645201, + "grad_norm": 1.294948935508728, + "learning_rate": 8.787000000000001e-05, + "loss": 0.3801, + "step": 17583 + }, + { + "epoch": 0.9846567364766491, + "grad_norm": 1.212965488433838, + "learning_rate": 8.787500000000001e-05, + "loss": 0.3939, + "step": 17584 + }, + { + "epoch": 0.9847127337887781, + "grad_norm": 1.37157142162323, + "learning_rate": 8.788e-05, + "loss": 0.634, + "step": 17585 + }, + { + "epoch": 0.9847687311009071, + "grad_norm": 2.70932936668396, + "learning_rate": 8.7885e-05, + "loss": 0.5367, + "step": 17586 + }, + { + "epoch": 0.9848247284130361, + "grad_norm": 1.6919304132461548, + "learning_rate": 8.789e-05, + "loss": 0.5444, + "step": 17587 + }, + { + "epoch": 0.9848807257251652, + "grad_norm": 1.4852466583251953, + "learning_rate": 8.789500000000001e-05, + "loss": 0.4759, + "step": 17588 + }, + { + "epoch": 0.9849367230372942, + "grad_norm": 1.3218944072723389, + "learning_rate": 8.790000000000001e-05, + "loss": 0.4551, + "step": 17589 + }, + { + "epoch": 0.9849927203494232, + "grad_norm": 1.262381672859192, + "learning_rate": 8.7905e-05, + "loss": 0.3755, + "step": 17590 + }, + { + "epoch": 0.9850487176615522, + "grad_norm": 1.518549919128418, + "learning_rate": 8.791e-05, + "loss": 0.4507, + "step": 17591 + }, + { + "epoch": 0.9851047149736812, + "grad_norm": 1.26132333278656, + "learning_rate": 8.7915e-05, + "loss": 0.5181, + "step": 17592 + }, + { + "epoch": 0.9851607122858103, + "grad_norm": 1.578444480895996, + "learning_rate": 8.792e-05, + "loss": 0.4636, + "step": 17593 + }, + { + 
"epoch": 0.9852167095979393, + "grad_norm": 1.3818082809448242, + "learning_rate": 8.7925e-05, + "loss": 0.5473, + "step": 17594 + }, + { + "epoch": 0.9852727069100683, + "grad_norm": 1.6644206047058105, + "learning_rate": 8.793000000000001e-05, + "loss": 0.4045, + "step": 17595 + }, + { + "epoch": 0.9853287042221973, + "grad_norm": 1.2437721490859985, + "learning_rate": 8.7935e-05, + "loss": 0.5042, + "step": 17596 + }, + { + "epoch": 0.9853847015343263, + "grad_norm": 1.3465697765350342, + "learning_rate": 8.794e-05, + "loss": 0.423, + "step": 17597 + }, + { + "epoch": 0.9854406988464554, + "grad_norm": 1.3346095085144043, + "learning_rate": 8.7945e-05, + "loss": 0.4192, + "step": 17598 + }, + { + "epoch": 0.9854966961585844, + "grad_norm": 1.2999358177185059, + "learning_rate": 8.795e-05, + "loss": 0.461, + "step": 17599 + }, + { + "epoch": 0.9855526934707134, + "grad_norm": 1.4455510377883911, + "learning_rate": 8.795500000000001e-05, + "loss": 0.5385, + "step": 17600 + }, + { + "epoch": 0.9856086907828424, + "grad_norm": 1.5050334930419922, + "learning_rate": 8.796e-05, + "loss": 0.5009, + "step": 17601 + }, + { + "epoch": 0.9856646880949714, + "grad_norm": 1.2636960744857788, + "learning_rate": 8.7965e-05, + "loss": 0.4595, + "step": 17602 + }, + { + "epoch": 0.9857206854071004, + "grad_norm": 1.377002477645874, + "learning_rate": 8.797000000000001e-05, + "loss": 0.4101, + "step": 17603 + }, + { + "epoch": 0.9857766827192295, + "grad_norm": 1.3277605772018433, + "learning_rate": 8.797500000000001e-05, + "loss": 0.4982, + "step": 17604 + }, + { + "epoch": 0.9858326800313585, + "grad_norm": 1.494991660118103, + "learning_rate": 8.798000000000001e-05, + "loss": 0.4443, + "step": 17605 + }, + { + "epoch": 0.9858886773434875, + "grad_norm": 1.3216522932052612, + "learning_rate": 8.7985e-05, + "loss": 0.4982, + "step": 17606 + }, + { + "epoch": 0.9859446746556165, + "grad_norm": 1.3640851974487305, + "learning_rate": 8.799e-05, + "loss": 0.417, + "step": 17607 + }, 
+ { + "epoch": 0.9860006719677455, + "grad_norm": 1.2586225271224976, + "learning_rate": 8.7995e-05, + "loss": 0.499, + "step": 17608 + }, + { + "epoch": 0.9860566692798746, + "grad_norm": 1.2728242874145508, + "learning_rate": 8.800000000000001e-05, + "loss": 0.3669, + "step": 17609 + }, + { + "epoch": 0.9861126665920036, + "grad_norm": 1.3577446937561035, + "learning_rate": 8.800500000000001e-05, + "loss": 0.6406, + "step": 17610 + }, + { + "epoch": 0.9861686639041326, + "grad_norm": 1.105636715888977, + "learning_rate": 8.801e-05, + "loss": 0.3837, + "step": 17611 + }, + { + "epoch": 0.9862246612162616, + "grad_norm": 1.332719087600708, + "learning_rate": 8.8015e-05, + "loss": 0.5369, + "step": 17612 + }, + { + "epoch": 0.9862806585283906, + "grad_norm": 1.1900907754898071, + "learning_rate": 8.802e-05, + "loss": 0.4035, + "step": 17613 + }, + { + "epoch": 0.9863366558405197, + "grad_norm": 1.3678020238876343, + "learning_rate": 8.8025e-05, + "loss": 0.4893, + "step": 17614 + }, + { + "epoch": 0.9863926531526487, + "grad_norm": 1.2033085823059082, + "learning_rate": 8.803e-05, + "loss": 0.4647, + "step": 17615 + }, + { + "epoch": 0.9864486504647777, + "grad_norm": 1.5403302907943726, + "learning_rate": 8.8035e-05, + "loss": 0.4371, + "step": 17616 + }, + { + "epoch": 0.9865046477769067, + "grad_norm": 1.3274661302566528, + "learning_rate": 8.804e-05, + "loss": 0.446, + "step": 17617 + }, + { + "epoch": 0.9865606450890357, + "grad_norm": 1.631810188293457, + "learning_rate": 8.8045e-05, + "loss": 0.4998, + "step": 17618 + }, + { + "epoch": 0.9866166424011648, + "grad_norm": 11.381399154663086, + "learning_rate": 8.805e-05, + "loss": 0.4679, + "step": 17619 + }, + { + "epoch": 0.9866726397132938, + "grad_norm": 1.4126825332641602, + "learning_rate": 8.805500000000001e-05, + "loss": 0.4284, + "step": 17620 + }, + { + "epoch": 0.9867286370254228, + "grad_norm": 1.444777488708496, + "learning_rate": 8.806000000000001e-05, + "loss": 0.4622, + "step": 17621 + }, + { + 
"epoch": 0.9867846343375518, + "grad_norm": 1.8333213329315186, + "learning_rate": 8.8065e-05, + "loss": 0.6327, + "step": 17622 + }, + { + "epoch": 0.9868406316496808, + "grad_norm": 1.1897823810577393, + "learning_rate": 8.807e-05, + "loss": 0.4171, + "step": 17623 + }, + { + "epoch": 0.9868966289618099, + "grad_norm": 1.3115020990371704, + "learning_rate": 8.807500000000001e-05, + "loss": 0.3187, + "step": 17624 + }, + { + "epoch": 0.9869526262739389, + "grad_norm": 1.45095956325531, + "learning_rate": 8.808000000000001e-05, + "loss": 0.4641, + "step": 17625 + }, + { + "epoch": 0.9870086235860679, + "grad_norm": 1.350498914718628, + "learning_rate": 8.808500000000001e-05, + "loss": 0.4694, + "step": 17626 + }, + { + "epoch": 0.9870646208981969, + "grad_norm": 1.168498158454895, + "learning_rate": 8.809e-05, + "loss": 0.4279, + "step": 17627 + }, + { + "epoch": 0.9871206182103259, + "grad_norm": 1.2061278820037842, + "learning_rate": 8.8095e-05, + "loss": 0.5163, + "step": 17628 + }, + { + "epoch": 0.987176615522455, + "grad_norm": 1.3452043533325195, + "learning_rate": 8.81e-05, + "loss": 0.5216, + "step": 17629 + }, + { + "epoch": 0.987232612834584, + "grad_norm": 1.3370094299316406, + "learning_rate": 8.8105e-05, + "loss": 0.4456, + "step": 17630 + }, + { + "epoch": 0.987288610146713, + "grad_norm": 1.491627812385559, + "learning_rate": 8.811000000000001e-05, + "loss": 0.5166, + "step": 17631 + }, + { + "epoch": 0.987344607458842, + "grad_norm": 1.3679686784744263, + "learning_rate": 8.8115e-05, + "loss": 0.3736, + "step": 17632 + }, + { + "epoch": 0.987400604770971, + "grad_norm": 1.4824379682540894, + "learning_rate": 8.812e-05, + "loss": 0.4467, + "step": 17633 + }, + { + "epoch": 0.9874566020831, + "grad_norm": 0.9939557909965515, + "learning_rate": 8.8125e-05, + "loss": 0.4024, + "step": 17634 + }, + { + "epoch": 0.9875125993952291, + "grad_norm": 1.2251510620117188, + "learning_rate": 8.813e-05, + "loss": 0.4063, + "step": 17635 + }, + { + "epoch": 
0.9875685967073581, + "grad_norm": 1.4843429327011108, + "learning_rate": 8.8135e-05, + "loss": 0.5691, + "step": 17636 + }, + { + "epoch": 0.9876245940194871, + "grad_norm": 1.5110747814178467, + "learning_rate": 8.814e-05, + "loss": 0.7143, + "step": 17637 + }, + { + "epoch": 0.9876805913316161, + "grad_norm": 1.2455945014953613, + "learning_rate": 8.8145e-05, + "loss": 0.3352, + "step": 17638 + }, + { + "epoch": 0.9877365886437451, + "grad_norm": 1.3711780309677124, + "learning_rate": 8.815e-05, + "loss": 0.5051, + "step": 17639 + }, + { + "epoch": 0.9877925859558742, + "grad_norm": 1.326949954032898, + "learning_rate": 8.8155e-05, + "loss": 0.5136, + "step": 17640 + }, + { + "epoch": 0.9878485832680032, + "grad_norm": 1.5293852090835571, + "learning_rate": 8.816000000000001e-05, + "loss": 0.5172, + "step": 17641 + }, + { + "epoch": 0.9879045805801322, + "grad_norm": 1.548806071281433, + "learning_rate": 8.816500000000001e-05, + "loss": 0.4185, + "step": 17642 + }, + { + "epoch": 0.9879605778922612, + "grad_norm": 1.25412917137146, + "learning_rate": 8.817e-05, + "loss": 0.495, + "step": 17643 + }, + { + "epoch": 0.9880165752043902, + "grad_norm": 1.476805329322815, + "learning_rate": 8.8175e-05, + "loss": 0.4058, + "step": 17644 + }, + { + "epoch": 0.9880725725165193, + "grad_norm": 1.4195153713226318, + "learning_rate": 8.818000000000001e-05, + "loss": 0.4833, + "step": 17645 + }, + { + "epoch": 0.9881285698286483, + "grad_norm": 1.246290683746338, + "learning_rate": 8.818500000000001e-05, + "loss": 0.3048, + "step": 17646 + }, + { + "epoch": 0.9881845671407773, + "grad_norm": 1.3022139072418213, + "learning_rate": 8.819000000000001e-05, + "loss": 0.437, + "step": 17647 + }, + { + "epoch": 0.9882405644529063, + "grad_norm": 1.6030832529067993, + "learning_rate": 8.8195e-05, + "loss": 0.3815, + "step": 17648 + }, + { + "epoch": 0.9882965617650353, + "grad_norm": 1.3837579488754272, + "learning_rate": 8.82e-05, + "loss": 0.5043, + "step": 17649 + }, + { + 
"epoch": 0.9883525590771643, + "grad_norm": 1.3769114017486572, + "learning_rate": 8.8205e-05, + "loss": 0.4242, + "step": 17650 + }, + { + "epoch": 0.9884085563892934, + "grad_norm": 1.2110369205474854, + "learning_rate": 8.821e-05, + "loss": 0.5027, + "step": 17651 + }, + { + "epoch": 0.9884645537014223, + "grad_norm": 1.4913884401321411, + "learning_rate": 8.821500000000001e-05, + "loss": 0.4245, + "step": 17652 + }, + { + "epoch": 0.9885205510135513, + "grad_norm": 1.4101661443710327, + "learning_rate": 8.822e-05, + "loss": 0.4611, + "step": 17653 + }, + { + "epoch": 0.9885765483256803, + "grad_norm": 1.5386029481887817, + "learning_rate": 8.8225e-05, + "loss": 0.7085, + "step": 17654 + }, + { + "epoch": 0.9886325456378093, + "grad_norm": 1.4647287130355835, + "learning_rate": 8.823e-05, + "loss": 0.5565, + "step": 17655 + }, + { + "epoch": 0.9886885429499384, + "grad_norm": 1.3486679792404175, + "learning_rate": 8.8235e-05, + "loss": 0.4155, + "step": 17656 + }, + { + "epoch": 0.9887445402620674, + "grad_norm": 1.1760742664337158, + "learning_rate": 8.824e-05, + "loss": 0.42, + "step": 17657 + }, + { + "epoch": 0.9888005375741964, + "grad_norm": 10.632444381713867, + "learning_rate": 8.8245e-05, + "loss": 0.4341, + "step": 17658 + }, + { + "epoch": 0.9888565348863254, + "grad_norm": 1.3989155292510986, + "learning_rate": 8.825e-05, + "loss": 0.491, + "step": 17659 + }, + { + "epoch": 0.9889125321984544, + "grad_norm": 1.2893259525299072, + "learning_rate": 8.8255e-05, + "loss": 0.4318, + "step": 17660 + }, + { + "epoch": 0.9889685295105834, + "grad_norm": 1.8009469509124756, + "learning_rate": 8.826000000000001e-05, + "loss": 0.5432, + "step": 17661 + }, + { + "epoch": 0.9890245268227125, + "grad_norm": 1.3617624044418335, + "learning_rate": 8.826500000000001e-05, + "loss": 0.4691, + "step": 17662 + }, + { + "epoch": 0.9890805241348415, + "grad_norm": 1.30439293384552, + "learning_rate": 8.827000000000001e-05, + "loss": 0.4716, + "step": 17663 + }, + { + 
"epoch": 0.9891365214469705, + "grad_norm": 1.1191641092300415, + "learning_rate": 8.8275e-05, + "loss": 0.3351, + "step": 17664 + }, + { + "epoch": 0.9891925187590995, + "grad_norm": 1.3394598960876465, + "learning_rate": 8.828e-05, + "loss": 0.4725, + "step": 17665 + }, + { + "epoch": 0.9892485160712285, + "grad_norm": 1.580841302871704, + "learning_rate": 8.828500000000001e-05, + "loss": 0.6894, + "step": 17666 + }, + { + "epoch": 0.9893045133833576, + "grad_norm": 1.1006057262420654, + "learning_rate": 8.829000000000001e-05, + "loss": 0.3338, + "step": 17667 + }, + { + "epoch": 0.9893605106954866, + "grad_norm": 1.2439556121826172, + "learning_rate": 8.829500000000001e-05, + "loss": 0.5283, + "step": 17668 + }, + { + "epoch": 0.9894165080076156, + "grad_norm": 1.3054133653640747, + "learning_rate": 8.83e-05, + "loss": 0.4204, + "step": 17669 + }, + { + "epoch": 0.9894725053197446, + "grad_norm": 1.3397842645645142, + "learning_rate": 8.8305e-05, + "loss": 0.39, + "step": 17670 + }, + { + "epoch": 0.9895285026318736, + "grad_norm": 1.3987393379211426, + "learning_rate": 8.831e-05, + "loss": 0.5048, + "step": 17671 + }, + { + "epoch": 0.9895844999440027, + "grad_norm": 1.7089719772338867, + "learning_rate": 8.8315e-05, + "loss": 0.4487, + "step": 17672 + }, + { + "epoch": 0.9896404972561317, + "grad_norm": 1.145412802696228, + "learning_rate": 8.832000000000001e-05, + "loss": 0.452, + "step": 17673 + }, + { + "epoch": 0.9896964945682607, + "grad_norm": 1.261096715927124, + "learning_rate": 8.8325e-05, + "loss": 0.3973, + "step": 17674 + }, + { + "epoch": 0.9897524918803897, + "grad_norm": 1.509187936782837, + "learning_rate": 8.833e-05, + "loss": 0.5809, + "step": 17675 + }, + { + "epoch": 0.9898084891925187, + "grad_norm": 1.092221975326538, + "learning_rate": 8.8335e-05, + "loss": 0.3629, + "step": 17676 + }, + { + "epoch": 0.9898644865046478, + "grad_norm": 1.2134952545166016, + "learning_rate": 8.834e-05, + "loss": 0.4333, + "step": 17677 + }, + { + "epoch": 
0.9899204838167768, + "grad_norm": 1.1407698392868042, + "learning_rate": 8.8345e-05, + "loss": 0.3594, + "step": 17678 + }, + { + "epoch": 0.9899764811289058, + "grad_norm": 1.4732252359390259, + "learning_rate": 8.834999999999999e-05, + "loss": 0.3671, + "step": 17679 + }, + { + "epoch": 0.9900324784410348, + "grad_norm": 1.455226182937622, + "learning_rate": 8.8355e-05, + "loss": 0.448, + "step": 17680 + }, + { + "epoch": 0.9900884757531638, + "grad_norm": 1.2765408754348755, + "learning_rate": 8.836000000000001e-05, + "loss": 0.464, + "step": 17681 + }, + { + "epoch": 0.9901444730652929, + "grad_norm": 1.222954511642456, + "learning_rate": 8.836500000000001e-05, + "loss": 0.4208, + "step": 17682 + }, + { + "epoch": 0.9902004703774219, + "grad_norm": 2.3051717281341553, + "learning_rate": 8.837000000000001e-05, + "loss": 0.647, + "step": 17683 + }, + { + "epoch": 0.9902564676895509, + "grad_norm": 1.7088819742202759, + "learning_rate": 8.837500000000001e-05, + "loss": 0.4154, + "step": 17684 + }, + { + "epoch": 0.9903124650016799, + "grad_norm": 1.267587661743164, + "learning_rate": 8.838e-05, + "loss": 0.404, + "step": 17685 + }, + { + "epoch": 0.9903684623138089, + "grad_norm": 1.8990023136138916, + "learning_rate": 8.8385e-05, + "loss": 0.4948, + "step": 17686 + }, + { + "epoch": 0.990424459625938, + "grad_norm": 1.133603572845459, + "learning_rate": 8.839000000000001e-05, + "loss": 0.3743, + "step": 17687 + }, + { + "epoch": 0.990480456938067, + "grad_norm": 1.6615586280822754, + "learning_rate": 8.839500000000001e-05, + "loss": 0.5648, + "step": 17688 + }, + { + "epoch": 0.990536454250196, + "grad_norm": 1.5147998332977295, + "learning_rate": 8.840000000000001e-05, + "loss": 0.4682, + "step": 17689 + }, + { + "epoch": 0.990592451562325, + "grad_norm": 1.6206152439117432, + "learning_rate": 8.8405e-05, + "loss": 0.47, + "step": 17690 + }, + { + "epoch": 0.990648448874454, + "grad_norm": 1.3913646936416626, + "learning_rate": 8.841e-05, + "loss": 0.4006, + 
"step": 17691 + }, + { + "epoch": 0.990704446186583, + "grad_norm": 1.4173760414123535, + "learning_rate": 8.8415e-05, + "loss": 0.4517, + "step": 17692 + }, + { + "epoch": 0.9907604434987121, + "grad_norm": 1.551391839981079, + "learning_rate": 8.842e-05, + "loss": 0.5697, + "step": 17693 + }, + { + "epoch": 0.9908164408108411, + "grad_norm": 1.2823461294174194, + "learning_rate": 8.842500000000001e-05, + "loss": 0.3911, + "step": 17694 + }, + { + "epoch": 0.9908724381229701, + "grad_norm": 1.2825812101364136, + "learning_rate": 8.843e-05, + "loss": 0.5166, + "step": 17695 + }, + { + "epoch": 0.9909284354350991, + "grad_norm": 1.237261176109314, + "learning_rate": 8.8435e-05, + "loss": 0.4583, + "step": 17696 + }, + { + "epoch": 0.9909844327472281, + "grad_norm": 1.2546817064285278, + "learning_rate": 8.844e-05, + "loss": 0.4322, + "step": 17697 + }, + { + "epoch": 0.9910404300593572, + "grad_norm": 1.210726261138916, + "learning_rate": 8.8445e-05, + "loss": 0.4521, + "step": 17698 + }, + { + "epoch": 0.9910964273714862, + "grad_norm": 1.7791892290115356, + "learning_rate": 8.845e-05, + "loss": 0.4756, + "step": 17699 + }, + { + "epoch": 0.9911524246836152, + "grad_norm": 1.4492498636245728, + "learning_rate": 8.845499999999999e-05, + "loss": 0.4875, + "step": 17700 + }, + { + "epoch": 0.9912084219957442, + "grad_norm": 1.4883558750152588, + "learning_rate": 8.846e-05, + "loss": 0.4393, + "step": 17701 + }, + { + "epoch": 0.9912644193078732, + "grad_norm": 1.568536639213562, + "learning_rate": 8.846500000000001e-05, + "loss": 0.5384, + "step": 17702 + }, + { + "epoch": 0.9913204166200023, + "grad_norm": 1.1386370658874512, + "learning_rate": 8.847000000000001e-05, + "loss": 0.4197, + "step": 17703 + }, + { + "epoch": 0.9913764139321313, + "grad_norm": 1.130841851234436, + "learning_rate": 8.847500000000001e-05, + "loss": 0.553, + "step": 17704 + }, + { + "epoch": 0.9914324112442603, + "grad_norm": 1.9779821634292603, + "learning_rate": 8.848e-05, + "loss": 0.4041, 
+ "step": 17705 + }, + { + "epoch": 0.9914884085563893, + "grad_norm": 1.2233911752700806, + "learning_rate": 8.8485e-05, + "loss": 0.4139, + "step": 17706 + }, + { + "epoch": 0.9915444058685183, + "grad_norm": 1.1328318119049072, + "learning_rate": 8.849e-05, + "loss": 0.3593, + "step": 17707 + }, + { + "epoch": 0.9916004031806473, + "grad_norm": 1.307611107826233, + "learning_rate": 8.849500000000001e-05, + "loss": 0.4044, + "step": 17708 + }, + { + "epoch": 0.9916564004927764, + "grad_norm": 1.3479758501052856, + "learning_rate": 8.850000000000001e-05, + "loss": 0.4071, + "step": 17709 + }, + { + "epoch": 0.9917123978049054, + "grad_norm": 1.8989241123199463, + "learning_rate": 8.850500000000001e-05, + "loss": 0.5629, + "step": 17710 + }, + { + "epoch": 0.9917683951170344, + "grad_norm": 1.6310759782791138, + "learning_rate": 8.851e-05, + "loss": 0.3843, + "step": 17711 + }, + { + "epoch": 0.9918243924291634, + "grad_norm": 1.456925392150879, + "learning_rate": 8.8515e-05, + "loss": 0.416, + "step": 17712 + }, + { + "epoch": 0.9918803897412924, + "grad_norm": 1.4533954858779907, + "learning_rate": 8.852e-05, + "loss": 0.5127, + "step": 17713 + }, + { + "epoch": 0.9919363870534215, + "grad_norm": 1.5278843641281128, + "learning_rate": 8.8525e-05, + "loss": 0.5731, + "step": 17714 + }, + { + "epoch": 0.9919923843655505, + "grad_norm": 1.6043784618377686, + "learning_rate": 8.853000000000001e-05, + "loss": 0.4541, + "step": 17715 + }, + { + "epoch": 0.9920483816776795, + "grad_norm": 1.2234134674072266, + "learning_rate": 8.8535e-05, + "loss": 0.4573, + "step": 17716 + }, + { + "epoch": 0.9921043789898085, + "grad_norm": 1.176305890083313, + "learning_rate": 8.854e-05, + "loss": 0.3466, + "step": 17717 + }, + { + "epoch": 0.9921603763019375, + "grad_norm": 1.4414507150650024, + "learning_rate": 8.8545e-05, + "loss": 0.4582, + "step": 17718 + }, + { + "epoch": 0.9922163736140666, + "grad_norm": 1.346954345703125, + "learning_rate": 8.855e-05, + "loss": 0.4941, + 
"step": 17719 + }, + { + "epoch": 0.9922723709261956, + "grad_norm": 1.308688998222351, + "learning_rate": 8.8555e-05, + "loss": 0.5205, + "step": 17720 + }, + { + "epoch": 0.9923283682383246, + "grad_norm": 1.1533242464065552, + "learning_rate": 8.856e-05, + "loss": 0.5297, + "step": 17721 + }, + { + "epoch": 0.9923843655504536, + "grad_norm": 1.1984797716140747, + "learning_rate": 8.8565e-05, + "loss": 0.43, + "step": 17722 + }, + { + "epoch": 0.9924403628625826, + "grad_norm": 1.5509682893753052, + "learning_rate": 8.857000000000001e-05, + "loss": 0.4958, + "step": 17723 + }, + { + "epoch": 0.9924963601747117, + "grad_norm": 1.3433796167373657, + "learning_rate": 8.857500000000001e-05, + "loss": 0.3858, + "step": 17724 + }, + { + "epoch": 0.9925523574868407, + "grad_norm": 1.4598404169082642, + "learning_rate": 8.858000000000001e-05, + "loss": 0.4344, + "step": 17725 + }, + { + "epoch": 0.9926083547989697, + "grad_norm": 1.2984787225723267, + "learning_rate": 8.8585e-05, + "loss": 0.4054, + "step": 17726 + }, + { + "epoch": 0.9926643521110987, + "grad_norm": 1.8481416702270508, + "learning_rate": 8.859e-05, + "loss": 0.3646, + "step": 17727 + }, + { + "epoch": 0.9927203494232277, + "grad_norm": 1.2608082294464111, + "learning_rate": 8.8595e-05, + "loss": 0.3594, + "step": 17728 + }, + { + "epoch": 0.9927763467353568, + "grad_norm": 1.4416066408157349, + "learning_rate": 8.86e-05, + "loss": 0.5569, + "step": 17729 + }, + { + "epoch": 0.9928323440474858, + "grad_norm": 1.2480077743530273, + "learning_rate": 8.860500000000001e-05, + "loss": 0.3737, + "step": 17730 + }, + { + "epoch": 0.9928883413596148, + "grad_norm": 1.1432256698608398, + "learning_rate": 8.861000000000001e-05, + "loss": 0.4381, + "step": 17731 + }, + { + "epoch": 0.9929443386717438, + "grad_norm": 1.6371777057647705, + "learning_rate": 8.8615e-05, + "loss": 0.5299, + "step": 17732 + }, + { + "epoch": 0.9930003359838728, + "grad_norm": 1.286447525024414, + "learning_rate": 8.862e-05, + "loss": 
0.4627, + "step": 17733 + }, + { + "epoch": 0.9930563332960017, + "grad_norm": 1.3441230058670044, + "learning_rate": 8.8625e-05, + "loss": 0.38, + "step": 17734 + }, + { + "epoch": 0.9931123306081308, + "grad_norm": 1.1864452362060547, + "learning_rate": 8.863e-05, + "loss": 0.4335, + "step": 17735 + }, + { + "epoch": 0.9931683279202598, + "grad_norm": 1.2555714845657349, + "learning_rate": 8.863500000000001e-05, + "loss": 0.4425, + "step": 17736 + }, + { + "epoch": 0.9932243252323888, + "grad_norm": 1.1101124286651611, + "learning_rate": 8.864e-05, + "loss": 0.3574, + "step": 17737 + }, + { + "epoch": 0.9932803225445178, + "grad_norm": 1.2899377346038818, + "learning_rate": 8.8645e-05, + "loss": 0.3888, + "step": 17738 + }, + { + "epoch": 0.9933363198566468, + "grad_norm": 1.324385643005371, + "learning_rate": 8.865e-05, + "loss": 0.4137, + "step": 17739 + }, + { + "epoch": 0.9933923171687759, + "grad_norm": 1.7079662084579468, + "learning_rate": 8.8655e-05, + "loss": 0.6243, + "step": 17740 + }, + { + "epoch": 0.9934483144809049, + "grad_norm": 1.450404405593872, + "learning_rate": 8.866000000000001e-05, + "loss": 0.5835, + "step": 17741 + }, + { + "epoch": 0.9935043117930339, + "grad_norm": 1.3371881246566772, + "learning_rate": 8.8665e-05, + "loss": 0.4234, + "step": 17742 + }, + { + "epoch": 0.9935603091051629, + "grad_norm": 1.562181830406189, + "learning_rate": 8.867e-05, + "loss": 0.4009, + "step": 17743 + }, + { + "epoch": 0.9936163064172919, + "grad_norm": 1.3748550415039062, + "learning_rate": 8.867500000000001e-05, + "loss": 0.4405, + "step": 17744 + }, + { + "epoch": 0.993672303729421, + "grad_norm": 1.4513928890228271, + "learning_rate": 8.868000000000001e-05, + "loss": 0.425, + "step": 17745 + }, + { + "epoch": 0.99372830104155, + "grad_norm": 1.3562967777252197, + "learning_rate": 8.868500000000001e-05, + "loss": 0.3928, + "step": 17746 + }, + { + "epoch": 0.993784298353679, + "grad_norm": 1.7254388332366943, + "learning_rate": 8.869e-05, + "loss": 
0.502, + "step": 17747 + }, + { + "epoch": 0.993840295665808, + "grad_norm": 1.351060152053833, + "learning_rate": 8.8695e-05, + "loss": 0.3679, + "step": 17748 + }, + { + "epoch": 0.993896292977937, + "grad_norm": 1.322018027305603, + "learning_rate": 8.87e-05, + "loss": 0.461, + "step": 17749 + }, + { + "epoch": 0.993952290290066, + "grad_norm": 1.4136666059494019, + "learning_rate": 8.8705e-05, + "loss": 0.4954, + "step": 17750 + }, + { + "epoch": 0.9940082876021951, + "grad_norm": 1.2768642902374268, + "learning_rate": 8.871000000000001e-05, + "loss": 0.4126, + "step": 17751 + }, + { + "epoch": 0.9940642849143241, + "grad_norm": 1.3570771217346191, + "learning_rate": 8.871500000000001e-05, + "loss": 0.4779, + "step": 17752 + }, + { + "epoch": 0.9941202822264531, + "grad_norm": 1.1728743314743042, + "learning_rate": 8.872e-05, + "loss": 0.5258, + "step": 17753 + }, + { + "epoch": 0.9941762795385821, + "grad_norm": 1.4993699789047241, + "learning_rate": 8.8725e-05, + "loss": 0.4644, + "step": 17754 + }, + { + "epoch": 0.9942322768507111, + "grad_norm": 1.169426441192627, + "learning_rate": 8.873e-05, + "loss": 0.4376, + "step": 17755 + }, + { + "epoch": 0.9942882741628402, + "grad_norm": 1.6827619075775146, + "learning_rate": 8.8735e-05, + "loss": 0.5448, + "step": 17756 + }, + { + "epoch": 0.9943442714749692, + "grad_norm": 1.4702149629592896, + "learning_rate": 8.874000000000001e-05, + "loss": 0.4164, + "step": 17757 + }, + { + "epoch": 0.9944002687870982, + "grad_norm": 1.4925825595855713, + "learning_rate": 8.8745e-05, + "loss": 0.4294, + "step": 17758 + }, + { + "epoch": 0.9944562660992272, + "grad_norm": 1.1866360902786255, + "learning_rate": 8.875e-05, + "loss": 0.3493, + "step": 17759 + }, + { + "epoch": 0.9945122634113562, + "grad_norm": 1.4568305015563965, + "learning_rate": 8.8755e-05, + "loss": 0.4478, + "step": 17760 + }, + { + "epoch": 0.9945682607234853, + "grad_norm": 1.296949863433838, + "learning_rate": 8.876e-05, + "loss": 0.3953, + "step": 
17761 + }, + { + "epoch": 0.9946242580356143, + "grad_norm": 1.423387050628662, + "learning_rate": 8.876500000000001e-05, + "loss": 0.5378, + "step": 17762 + }, + { + "epoch": 0.9946802553477433, + "grad_norm": 1.4243385791778564, + "learning_rate": 8.877e-05, + "loss": 0.4648, + "step": 17763 + }, + { + "epoch": 0.9947362526598723, + "grad_norm": 1.4275507926940918, + "learning_rate": 8.8775e-05, + "loss": 0.5134, + "step": 17764 + }, + { + "epoch": 0.9947922499720013, + "grad_norm": 1.2002226114273071, + "learning_rate": 8.878000000000001e-05, + "loss": 0.3779, + "step": 17765 + }, + { + "epoch": 0.9948482472841303, + "grad_norm": 1.3369922637939453, + "learning_rate": 8.878500000000001e-05, + "loss": 0.4891, + "step": 17766 + }, + { + "epoch": 0.9949042445962594, + "grad_norm": 2.1659765243530273, + "learning_rate": 8.879000000000001e-05, + "loss": 0.4689, + "step": 17767 + }, + { + "epoch": 0.9949602419083884, + "grad_norm": 1.449819803237915, + "learning_rate": 8.8795e-05, + "loss": 0.4796, + "step": 17768 + }, + { + "epoch": 0.9950162392205174, + "grad_norm": 1.4384304285049438, + "learning_rate": 8.88e-05, + "loss": 0.5106, + "step": 17769 + }, + { + "epoch": 0.9950722365326464, + "grad_norm": 1.7336833477020264, + "learning_rate": 8.8805e-05, + "loss": 0.5333, + "step": 17770 + }, + { + "epoch": 0.9951282338447754, + "grad_norm": 1.467752456665039, + "learning_rate": 8.881e-05, + "loss": 0.5923, + "step": 17771 + }, + { + "epoch": 0.9951842311569045, + "grad_norm": 1.5609163045883179, + "learning_rate": 8.881500000000001e-05, + "loss": 0.5225, + "step": 17772 + }, + { + "epoch": 0.9952402284690335, + "grad_norm": 1.3803917169570923, + "learning_rate": 8.882000000000001e-05, + "loss": 0.4381, + "step": 17773 + }, + { + "epoch": 0.9952962257811625, + "grad_norm": 1.325348138809204, + "learning_rate": 8.8825e-05, + "loss": 0.5434, + "step": 17774 + }, + { + "epoch": 0.9953522230932915, + "grad_norm": 1.1791011095046997, + "learning_rate": 8.883e-05, + "loss": 
0.3529, + "step": 17775 + }, + { + "epoch": 0.9954082204054205, + "grad_norm": 1.1796578168869019, + "learning_rate": 8.8835e-05, + "loss": 0.4318, + "step": 17776 + }, + { + "epoch": 0.9954642177175496, + "grad_norm": 1.7437174320220947, + "learning_rate": 8.884e-05, + "loss": 0.462, + "step": 17777 + }, + { + "epoch": 0.9955202150296786, + "grad_norm": 1.3527717590332031, + "learning_rate": 8.8845e-05, + "loss": 0.5139, + "step": 17778 + }, + { + "epoch": 0.9955762123418076, + "grad_norm": 1.0909597873687744, + "learning_rate": 8.885e-05, + "loss": 0.3406, + "step": 17779 + }, + { + "epoch": 0.9956322096539366, + "grad_norm": 15.612887382507324, + "learning_rate": 8.8855e-05, + "loss": 0.5652, + "step": 17780 + }, + { + "epoch": 0.9956882069660656, + "grad_norm": 1.3589320182800293, + "learning_rate": 8.886e-05, + "loss": 0.4209, + "step": 17781 + }, + { + "epoch": 0.9957442042781947, + "grad_norm": 1.6181668043136597, + "learning_rate": 8.886500000000001e-05, + "loss": 0.4392, + "step": 17782 + }, + { + "epoch": 0.9958002015903237, + "grad_norm": 1.5727357864379883, + "learning_rate": 8.887000000000001e-05, + "loss": 0.5498, + "step": 17783 + }, + { + "epoch": 0.9958561989024527, + "grad_norm": 1.449902892112732, + "learning_rate": 8.8875e-05, + "loss": 0.3859, + "step": 17784 + }, + { + "epoch": 0.9959121962145817, + "grad_norm": 1.5982789993286133, + "learning_rate": 8.888e-05, + "loss": 0.4078, + "step": 17785 + }, + { + "epoch": 0.9959681935267107, + "grad_norm": 1.2767308950424194, + "learning_rate": 8.888500000000001e-05, + "loss": 0.4659, + "step": 17786 + }, + { + "epoch": 0.9960241908388398, + "grad_norm": 1.3726425170898438, + "learning_rate": 8.889000000000001e-05, + "loss": 0.4481, + "step": 17787 + }, + { + "epoch": 0.9960801881509688, + "grad_norm": 1.338931918144226, + "learning_rate": 8.889500000000001e-05, + "loss": 0.4569, + "step": 17788 + }, + { + "epoch": 0.9961361854630978, + "grad_norm": 1.3055534362792969, + "learning_rate": 8.89e-05, + 
"loss": 0.3763, + "step": 17789 + }, + { + "epoch": 0.9961921827752268, + "grad_norm": 1.2787420749664307, + "learning_rate": 8.8905e-05, + "loss": 0.4308, + "step": 17790 + }, + { + "epoch": 0.9962481800873558, + "grad_norm": 1.5374587774276733, + "learning_rate": 8.891e-05, + "loss": 0.5375, + "step": 17791 + }, + { + "epoch": 0.9963041773994848, + "grad_norm": 1.3534173965454102, + "learning_rate": 8.8915e-05, + "loss": 0.4976, + "step": 17792 + }, + { + "epoch": 0.9963601747116139, + "grad_norm": 1.5223244428634644, + "learning_rate": 8.892000000000001e-05, + "loss": 0.4936, + "step": 17793 + }, + { + "epoch": 0.9964161720237429, + "grad_norm": 1.176383137702942, + "learning_rate": 8.8925e-05, + "loss": 0.373, + "step": 17794 + }, + { + "epoch": 0.9964721693358719, + "grad_norm": 1.2590463161468506, + "learning_rate": 8.893e-05, + "loss": 0.4684, + "step": 17795 + }, + { + "epoch": 0.9965281666480009, + "grad_norm": 1.581672191619873, + "learning_rate": 8.8935e-05, + "loss": 0.5669, + "step": 17796 + }, + { + "epoch": 0.9965841639601299, + "grad_norm": 2.492479085922241, + "learning_rate": 8.894e-05, + "loss": 0.5244, + "step": 17797 + }, + { + "epoch": 0.996640161272259, + "grad_norm": 1.3283013105392456, + "learning_rate": 8.8945e-05, + "loss": 0.55, + "step": 17798 + }, + { + "epoch": 0.996696158584388, + "grad_norm": 1.2206213474273682, + "learning_rate": 8.895e-05, + "loss": 0.3917, + "step": 17799 + }, + { + "epoch": 0.996752155896517, + "grad_norm": 1.5171903371810913, + "learning_rate": 8.8955e-05, + "loss": 0.5488, + "step": 17800 + }, + { + "epoch": 0.996808153208646, + "grad_norm": 1.507001280784607, + "learning_rate": 8.896e-05, + "loss": 0.5199, + "step": 17801 + }, + { + "epoch": 0.996864150520775, + "grad_norm": 1.26813542842865, + "learning_rate": 8.896500000000001e-05, + "loss": 0.3859, + "step": 17802 + }, + { + "epoch": 0.9969201478329041, + "grad_norm": 2.1569724082946777, + "learning_rate": 8.897000000000001e-05, + "loss": 0.5384, + "step": 
17803 + }, + { + "epoch": 0.9969761451450331, + "grad_norm": 2.1141278743743896, + "learning_rate": 8.897500000000001e-05, + "loss": 0.4599, + "step": 17804 + }, + { + "epoch": 0.9970321424571621, + "grad_norm": 1.3248432874679565, + "learning_rate": 8.898e-05, + "loss": 0.4108, + "step": 17805 + }, + { + "epoch": 0.9970881397692911, + "grad_norm": 1.2197864055633545, + "learning_rate": 8.8985e-05, + "loss": 0.4157, + "step": 17806 + }, + { + "epoch": 0.9971441370814201, + "grad_norm": 1.5786558389663696, + "learning_rate": 8.899e-05, + "loss": 0.5085, + "step": 17807 + }, + { + "epoch": 0.9972001343935492, + "grad_norm": 1.2038183212280273, + "learning_rate": 8.899500000000001e-05, + "loss": 0.3443, + "step": 17808 + }, + { + "epoch": 0.9972561317056782, + "grad_norm": 1.4003777503967285, + "learning_rate": 8.900000000000001e-05, + "loss": 0.4955, + "step": 17809 + }, + { + "epoch": 0.9973121290178072, + "grad_norm": 1.3404408693313599, + "learning_rate": 8.9005e-05, + "loss": 0.424, + "step": 17810 + }, + { + "epoch": 0.9973681263299362, + "grad_norm": 1.5213918685913086, + "learning_rate": 8.901e-05, + "loss": 0.4937, + "step": 17811 + }, + { + "epoch": 0.9974241236420652, + "grad_norm": 1.4018992185592651, + "learning_rate": 8.9015e-05, + "loss": 0.4621, + "step": 17812 + }, + { + "epoch": 0.9974801209541942, + "grad_norm": 1.1799334287643433, + "learning_rate": 8.902e-05, + "loss": 0.3614, + "step": 17813 + }, + { + "epoch": 0.9975361182663233, + "grad_norm": 1.24405038356781, + "learning_rate": 8.902500000000001e-05, + "loss": 0.4992, + "step": 17814 + }, + { + "epoch": 0.9975921155784523, + "grad_norm": 1.2421824932098389, + "learning_rate": 8.903e-05, + "loss": 0.3443, + "step": 17815 + }, + { + "epoch": 0.9976481128905813, + "grad_norm": 1.294106125831604, + "learning_rate": 8.9035e-05, + "loss": 0.5638, + "step": 17816 + }, + { + "epoch": 0.9977041102027102, + "grad_norm": 1.2034837007522583, + "learning_rate": 8.904e-05, + "loss": 0.4029, + "step": 17817 
+ }, + { + "epoch": 0.9977601075148392, + "grad_norm": 1.393774151802063, + "learning_rate": 8.9045e-05, + "loss": 0.5215, + "step": 17818 + }, + { + "epoch": 0.9978161048269683, + "grad_norm": 5.649298667907715, + "learning_rate": 8.905e-05, + "loss": 0.4517, + "step": 17819 + }, + { + "epoch": 0.9978721021390973, + "grad_norm": 1.5890209674835205, + "learning_rate": 8.9055e-05, + "loss": 0.5306, + "step": 17820 + }, + { + "epoch": 0.9979280994512263, + "grad_norm": 1.5135754346847534, + "learning_rate": 8.906e-05, + "loss": 0.4627, + "step": 17821 + }, + { + "epoch": 0.9979840967633553, + "grad_norm": 1.3312042951583862, + "learning_rate": 8.906500000000002e-05, + "loss": 0.4954, + "step": 17822 + }, + { + "epoch": 0.9980400940754843, + "grad_norm": 1.1777615547180176, + "learning_rate": 8.907000000000001e-05, + "loss": 0.3609, + "step": 17823 + }, + { + "epoch": 0.9980960913876133, + "grad_norm": 1.5533791780471802, + "learning_rate": 8.907500000000001e-05, + "loss": 0.4959, + "step": 17824 + }, + { + "epoch": 0.9981520886997424, + "grad_norm": 1.2568227052688599, + "learning_rate": 8.908000000000001e-05, + "loss": 0.4182, + "step": 17825 + }, + { + "epoch": 0.9982080860118714, + "grad_norm": 1.339856743812561, + "learning_rate": 8.9085e-05, + "loss": 0.4881, + "step": 17826 + }, + { + "epoch": 0.9982640833240004, + "grad_norm": 1.3573658466339111, + "learning_rate": 8.909e-05, + "loss": 0.5631, + "step": 17827 + }, + { + "epoch": 0.9983200806361294, + "grad_norm": 1.392850637435913, + "learning_rate": 8.9095e-05, + "loss": 0.3904, + "step": 17828 + }, + { + "epoch": 0.9983760779482584, + "grad_norm": 1.140561580657959, + "learning_rate": 8.910000000000001e-05, + "loss": 0.3946, + "step": 17829 + }, + { + "epoch": 0.9984320752603875, + "grad_norm": 1.7466707229614258, + "learning_rate": 8.910500000000001e-05, + "loss": 0.6854, + "step": 17830 + }, + { + "epoch": 0.9984880725725165, + "grad_norm": 1.1528984308242798, + "learning_rate": 8.911e-05, + "loss": 
0.4211, + "step": 17831 + }, + { + "epoch": 0.9985440698846455, + "grad_norm": 1.7609009742736816, + "learning_rate": 8.9115e-05, + "loss": 0.4003, + "step": 17832 + }, + { + "epoch": 0.9986000671967745, + "grad_norm": 1.2808367013931274, + "learning_rate": 8.912e-05, + "loss": 0.4723, + "step": 17833 + }, + { + "epoch": 0.9986560645089035, + "grad_norm": 1.5797677040100098, + "learning_rate": 8.9125e-05, + "loss": 0.4325, + "step": 17834 + }, + { + "epoch": 0.9987120618210326, + "grad_norm": 1.264971375465393, + "learning_rate": 8.913000000000001e-05, + "loss": 0.4001, + "step": 17835 + }, + { + "epoch": 0.9987680591331616, + "grad_norm": 1.4042302370071411, + "learning_rate": 8.9135e-05, + "loss": 0.3977, + "step": 17836 + }, + { + "epoch": 0.9988240564452906, + "grad_norm": 1.3492649793624878, + "learning_rate": 8.914e-05, + "loss": 0.4877, + "step": 17837 + }, + { + "epoch": 0.9988800537574196, + "grad_norm": 1.1757004261016846, + "learning_rate": 8.9145e-05, + "loss": 0.4391, + "step": 17838 + }, + { + "epoch": 0.9989360510695486, + "grad_norm": 1.409857153892517, + "learning_rate": 8.915e-05, + "loss": 0.4832, + "step": 17839 + }, + { + "epoch": 0.9989920483816777, + "grad_norm": 1.3181517124176025, + "learning_rate": 8.9155e-05, + "loss": 0.3506, + "step": 17840 + }, + { + "epoch": 0.9990480456938067, + "grad_norm": 1.33633553981781, + "learning_rate": 8.916e-05, + "loss": 0.5155, + "step": 17841 + }, + { + "epoch": 0.9991040430059357, + "grad_norm": 1.723892092704773, + "learning_rate": 8.9165e-05, + "loss": 0.5702, + "step": 17842 + }, + { + "epoch": 0.9991600403180647, + "grad_norm": 1.2972443103790283, + "learning_rate": 8.917000000000002e-05, + "loss": 0.4963, + "step": 17843 + }, + { + "epoch": 0.9992160376301937, + "grad_norm": 1.8608943223953247, + "learning_rate": 8.917500000000001e-05, + "loss": 0.5375, + "step": 17844 + }, + { + "epoch": 0.9992720349423228, + "grad_norm": 1.3446626663208008, + "learning_rate": 8.918000000000001e-05, + "loss": 
0.5504, + "step": 17845 + }, + { + "epoch": 0.9993280322544518, + "grad_norm": 1.269340991973877, + "learning_rate": 8.918500000000001e-05, + "loss": 0.4592, + "step": 17846 + }, + { + "epoch": 0.9993840295665808, + "grad_norm": 1.349295735359192, + "learning_rate": 8.919e-05, + "loss": 0.4749, + "step": 17847 + }, + { + "epoch": 0.9994400268787098, + "grad_norm": 1.5455998182296753, + "learning_rate": 8.9195e-05, + "loss": 0.5692, + "step": 17848 + }, + { + "epoch": 0.9994960241908388, + "grad_norm": 1.5913318395614624, + "learning_rate": 8.92e-05, + "loss": 0.3864, + "step": 17849 + }, + { + "epoch": 0.9995520215029678, + "grad_norm": 1.3120813369750977, + "learning_rate": 8.920500000000001e-05, + "loss": 0.4912, + "step": 17850 + }, + { + "epoch": 0.9996080188150969, + "grad_norm": 1.6308808326721191, + "learning_rate": 8.921000000000001e-05, + "loss": 0.5462, + "step": 17851 + }, + { + "epoch": 0.9996640161272259, + "grad_norm": 1.414174199104309, + "learning_rate": 8.9215e-05, + "loss": 0.5364, + "step": 17852 + }, + { + "epoch": 0.9997200134393549, + "grad_norm": 1.3153877258300781, + "learning_rate": 8.922e-05, + "loss": 0.4214, + "step": 17853 + }, + { + "epoch": 0.9997760107514839, + "grad_norm": 1.3772268295288086, + "learning_rate": 8.9225e-05, + "loss": 0.5077, + "step": 17854 + }, + { + "epoch": 0.9998320080636129, + "grad_norm": 1.1395400762557983, + "learning_rate": 8.923e-05, + "loss": 0.4192, + "step": 17855 + }, + { + "epoch": 0.999888005375742, + "grad_norm": 1.4553617238998413, + "learning_rate": 8.9235e-05, + "loss": 0.5172, + "step": 17856 + }, + { + "epoch": 0.999944002687871, + "grad_norm": 1.2411185503005981, + "learning_rate": 8.924e-05, + "loss": 0.3903, + "step": 17857 + }, + { + "epoch": 1.0, + "grad_norm": 3.379507541656494, + "learning_rate": 8.9245e-05, + "loss": 0.2554, + "step": 17858 + }, + { + "epoch": 1.000055997312129, + "grad_norm": 1.1951563358306885, + "learning_rate": 8.925e-05, + "loss": 0.3561, + "step": 17859 + }, + { + 
"epoch": 1.000111994624258, + "grad_norm": 1.2772942781448364, + "learning_rate": 8.9255e-05, + "loss": 0.4737, + "step": 17860 + }, + { + "epoch": 1.000167991936387, + "grad_norm": 1.2272050380706787, + "learning_rate": 8.926e-05, + "loss": 0.3712, + "step": 17861 + }, + { + "epoch": 1.000223989248516, + "grad_norm": 1.2424622774124146, + "learning_rate": 8.9265e-05, + "loss": 0.44, + "step": 17862 + }, + { + "epoch": 1.000279986560645, + "grad_norm": 1.1765683889389038, + "learning_rate": 8.927e-05, + "loss": 0.4108, + "step": 17863 + }, + { + "epoch": 1.0003359838727741, + "grad_norm": 1.3974347114562988, + "learning_rate": 8.927500000000002e-05, + "loss": 0.4063, + "step": 17864 + }, + { + "epoch": 1.0003919811849031, + "grad_norm": 1.3876872062683105, + "learning_rate": 8.928000000000001e-05, + "loss": 0.3698, + "step": 17865 + }, + { + "epoch": 1.0004479784970322, + "grad_norm": 1.5575116872787476, + "learning_rate": 8.928500000000001e-05, + "loss": 0.3858, + "step": 17866 + }, + { + "epoch": 1.0005039758091612, + "grad_norm": 1.4825502634048462, + "learning_rate": 8.929000000000001e-05, + "loss": 0.4251, + "step": 17867 + }, + { + "epoch": 1.0005599731212902, + "grad_norm": 2.1418943405151367, + "learning_rate": 8.9295e-05, + "loss": 0.4481, + "step": 17868 + }, + { + "epoch": 1.0006159704334192, + "grad_norm": 1.5139989852905273, + "learning_rate": 8.93e-05, + "loss": 0.4508, + "step": 17869 + }, + { + "epoch": 1.0006719677455482, + "grad_norm": 1.6377043724060059, + "learning_rate": 8.9305e-05, + "loss": 0.42, + "step": 17870 + }, + { + "epoch": 1.0007279650576772, + "grad_norm": 1.472362995147705, + "learning_rate": 8.931000000000001e-05, + "loss": 0.5715, + "step": 17871 + }, + { + "epoch": 1.0007839623698063, + "grad_norm": 1.52193284034729, + "learning_rate": 8.931500000000001e-05, + "loss": 0.517, + "step": 17872 + }, + { + "epoch": 1.0008399596819353, + "grad_norm": 2.265660047531128, + "learning_rate": 8.932e-05, + "loss": 0.4281, + "step": 17873 + 
}, + { + "epoch": 1.0008959569940643, + "grad_norm": 1.5788400173187256, + "learning_rate": 8.9325e-05, + "loss": 0.4571, + "step": 17874 + }, + { + "epoch": 1.0009519543061933, + "grad_norm": 1.347461223602295, + "learning_rate": 8.933e-05, + "loss": 0.3671, + "step": 17875 + }, + { + "epoch": 1.0010079516183223, + "grad_norm": 1.3150992393493652, + "learning_rate": 8.9335e-05, + "loss": 0.4848, + "step": 17876 + }, + { + "epoch": 1.0010639489304514, + "grad_norm": 1.1444026231765747, + "learning_rate": 8.934e-05, + "loss": 0.3704, + "step": 17877 + }, + { + "epoch": 1.0011199462425804, + "grad_norm": 1.5898854732513428, + "learning_rate": 8.9345e-05, + "loss": 0.4398, + "step": 17878 + }, + { + "epoch": 1.0011759435547094, + "grad_norm": 1.4028594493865967, + "learning_rate": 8.935e-05, + "loss": 0.3731, + "step": 17879 + }, + { + "epoch": 1.0012319408668384, + "grad_norm": 1.3735319375991821, + "learning_rate": 8.9355e-05, + "loss": 0.4651, + "step": 17880 + }, + { + "epoch": 1.0012879381789674, + "grad_norm": 1.1216611862182617, + "learning_rate": 8.936e-05, + "loss": 0.4876, + "step": 17881 + }, + { + "epoch": 1.0013439354910965, + "grad_norm": 1.6765284538269043, + "learning_rate": 8.936500000000001e-05, + "loss": 0.6061, + "step": 17882 + }, + { + "epoch": 1.0013999328032255, + "grad_norm": 1.298638939857483, + "learning_rate": 8.937e-05, + "loss": 0.3629, + "step": 17883 + }, + { + "epoch": 1.0014559301153545, + "grad_norm": 1.259941577911377, + "learning_rate": 8.9375e-05, + "loss": 0.2662, + "step": 17884 + }, + { + "epoch": 1.0015119274274835, + "grad_norm": 1.2943047285079956, + "learning_rate": 8.938e-05, + "loss": 0.4017, + "step": 17885 + }, + { + "epoch": 1.0015679247396125, + "grad_norm": 1.2645395994186401, + "learning_rate": 8.938500000000001e-05, + "loss": 0.4045, + "step": 17886 + }, + { + "epoch": 1.0016239220517416, + "grad_norm": 1.259494662284851, + "learning_rate": 8.939000000000001e-05, + "loss": 0.3124, + "step": 17887 + }, + { + 
"epoch": 1.0016799193638706, + "grad_norm": 1.5386265516281128, + "learning_rate": 8.939500000000001e-05, + "loss": 0.5339, + "step": 17888 + }, + { + "epoch": 1.0017359166759996, + "grad_norm": 2.078693389892578, + "learning_rate": 8.94e-05, + "loss": 0.414, + "step": 17889 + }, + { + "epoch": 1.0017919139881286, + "grad_norm": 1.1553360223770142, + "learning_rate": 8.9405e-05, + "loss": 0.478, + "step": 17890 + }, + { + "epoch": 1.0018479113002576, + "grad_norm": 1.3309661149978638, + "learning_rate": 8.941e-05, + "loss": 0.4211, + "step": 17891 + }, + { + "epoch": 1.0019039086123867, + "grad_norm": 1.216221809387207, + "learning_rate": 8.941500000000001e-05, + "loss": 0.5105, + "step": 17892 + }, + { + "epoch": 1.0019599059245157, + "grad_norm": 1.3124853372573853, + "learning_rate": 8.942000000000001e-05, + "loss": 0.5729, + "step": 17893 + }, + { + "epoch": 1.0020159032366447, + "grad_norm": 1.3122268915176392, + "learning_rate": 8.9425e-05, + "loss": 0.4651, + "step": 17894 + }, + { + "epoch": 1.0020719005487737, + "grad_norm": 1.1882063150405884, + "learning_rate": 8.943e-05, + "loss": 0.3396, + "step": 17895 + }, + { + "epoch": 1.0021278978609027, + "grad_norm": 1.3756940364837646, + "learning_rate": 8.9435e-05, + "loss": 0.3477, + "step": 17896 + }, + { + "epoch": 1.0021838951730317, + "grad_norm": 1.2090256214141846, + "learning_rate": 8.944e-05, + "loss": 0.4787, + "step": 17897 + }, + { + "epoch": 1.0022398924851608, + "grad_norm": 1.4074785709381104, + "learning_rate": 8.9445e-05, + "loss": 0.5455, + "step": 17898 + }, + { + "epoch": 1.0022958897972898, + "grad_norm": 1.383531928062439, + "learning_rate": 8.945e-05, + "loss": 0.4804, + "step": 17899 + }, + { + "epoch": 1.0023518871094188, + "grad_norm": 1.4040472507476807, + "learning_rate": 8.9455e-05, + "loss": 0.5287, + "step": 17900 + }, + { + "epoch": 1.0024078844215478, + "grad_norm": 1.351027488708496, + "learning_rate": 8.946e-05, + "loss": 0.3817, + "step": 17901 + }, + { + "epoch": 
1.0024638817336768, + "grad_norm": 1.2312889099121094, + "learning_rate": 8.9465e-05, + "loss": 0.4973, + "step": 17902 + }, + { + "epoch": 1.0025198790458059, + "grad_norm": 1.5979794263839722, + "learning_rate": 8.947000000000001e-05, + "loss": 0.5239, + "step": 17903 + }, + { + "epoch": 1.0025758763579349, + "grad_norm": 1.2218527793884277, + "learning_rate": 8.9475e-05, + "loss": 0.5034, + "step": 17904 + }, + { + "epoch": 1.002631873670064, + "grad_norm": 1.7417958974838257, + "learning_rate": 8.948e-05, + "loss": 0.5016, + "step": 17905 + }, + { + "epoch": 1.002687870982193, + "grad_norm": 1.3673913478851318, + "learning_rate": 8.9485e-05, + "loss": 0.3737, + "step": 17906 + }, + { + "epoch": 1.002743868294322, + "grad_norm": 1.341664433479309, + "learning_rate": 8.949000000000001e-05, + "loss": 0.4445, + "step": 17907 + }, + { + "epoch": 1.002799865606451, + "grad_norm": 1.3660603761672974, + "learning_rate": 8.949500000000001e-05, + "loss": 0.4294, + "step": 17908 + }, + { + "epoch": 1.00285586291858, + "grad_norm": 1.244755744934082, + "learning_rate": 8.950000000000001e-05, + "loss": 0.5295, + "step": 17909 + }, + { + "epoch": 1.002911860230709, + "grad_norm": 1.594760775566101, + "learning_rate": 8.9505e-05, + "loss": 0.3796, + "step": 17910 + }, + { + "epoch": 1.002967857542838, + "grad_norm": 1.0852961540222168, + "learning_rate": 8.951e-05, + "loss": 0.3523, + "step": 17911 + }, + { + "epoch": 1.003023854854967, + "grad_norm": 1.42945396900177, + "learning_rate": 8.9515e-05, + "loss": 0.4033, + "step": 17912 + }, + { + "epoch": 1.003079852167096, + "grad_norm": 1.4561173915863037, + "learning_rate": 8.952000000000001e-05, + "loss": 0.3425, + "step": 17913 + }, + { + "epoch": 1.003135849479225, + "grad_norm": 1.454376459121704, + "learning_rate": 8.952500000000001e-05, + "loss": 0.368, + "step": 17914 + }, + { + "epoch": 1.003191846791354, + "grad_norm": 1.29525887966156, + "learning_rate": 8.953e-05, + "loss": 0.3543, + "step": 17915 + }, + { + 
"epoch": 1.003247844103483, + "grad_norm": 1.374119520187378, + "learning_rate": 8.9535e-05, + "loss": 0.486, + "step": 17916 + }, + { + "epoch": 1.0033038414156121, + "grad_norm": 1.313292384147644, + "learning_rate": 8.954e-05, + "loss": 0.3136, + "step": 17917 + }, + { + "epoch": 1.0033598387277411, + "grad_norm": 1.3198938369750977, + "learning_rate": 8.9545e-05, + "loss": 0.3961, + "step": 17918 + }, + { + "epoch": 1.0034158360398702, + "grad_norm": 1.3486634492874146, + "learning_rate": 8.955e-05, + "loss": 0.3769, + "step": 17919 + }, + { + "epoch": 1.0034718333519992, + "grad_norm": 1.2486048936843872, + "learning_rate": 8.9555e-05, + "loss": 0.3641, + "step": 17920 + }, + { + "epoch": 1.0035278306641282, + "grad_norm": 1.2347756624221802, + "learning_rate": 8.956e-05, + "loss": 0.4007, + "step": 17921 + }, + { + "epoch": 1.0035838279762572, + "grad_norm": 1.5285927057266235, + "learning_rate": 8.9565e-05, + "loss": 0.5858, + "step": 17922 + }, + { + "epoch": 1.0036398252883862, + "grad_norm": 1.3821576833724976, + "learning_rate": 8.957000000000001e-05, + "loss": 0.3995, + "step": 17923 + }, + { + "epoch": 1.0036958226005153, + "grad_norm": 1.405597448348999, + "learning_rate": 8.957500000000001e-05, + "loss": 0.423, + "step": 17924 + }, + { + "epoch": 1.0037518199126443, + "grad_norm": 1.061468243598938, + "learning_rate": 8.958e-05, + "loss": 0.3636, + "step": 17925 + }, + { + "epoch": 1.0038078172247733, + "grad_norm": 1.2281023263931274, + "learning_rate": 8.9585e-05, + "loss": 0.3992, + "step": 17926 + }, + { + "epoch": 1.0038638145369023, + "grad_norm": 1.4296306371688843, + "learning_rate": 8.959e-05, + "loss": 0.4056, + "step": 17927 + }, + { + "epoch": 1.0039198118490313, + "grad_norm": 1.3915297985076904, + "learning_rate": 8.959500000000001e-05, + "loss": 0.4944, + "step": 17928 + }, + { + "epoch": 1.0039758091611604, + "grad_norm": 1.2303251028060913, + "learning_rate": 8.960000000000001e-05, + "loss": 0.4113, + "step": 17929 + }, + { + 
"epoch": 1.0040318064732894, + "grad_norm": 1.4397661685943604, + "learning_rate": 8.960500000000001e-05, + "loss": 0.4788, + "step": 17930 + }, + { + "epoch": 1.0040878037854184, + "grad_norm": 1.4920543432235718, + "learning_rate": 8.961e-05, + "loss": 0.4156, + "step": 17931 + }, + { + "epoch": 1.0041438010975474, + "grad_norm": 1.8774982690811157, + "learning_rate": 8.9615e-05, + "loss": 0.4992, + "step": 17932 + }, + { + "epoch": 1.0041997984096764, + "grad_norm": 1.4324315786361694, + "learning_rate": 8.962e-05, + "loss": 0.3682, + "step": 17933 + }, + { + "epoch": 1.0042557957218055, + "grad_norm": 1.3314342498779297, + "learning_rate": 8.962500000000001e-05, + "loss": 0.4155, + "step": 17934 + }, + { + "epoch": 1.0043117930339345, + "grad_norm": 1.4088034629821777, + "learning_rate": 8.963000000000001e-05, + "loss": 0.497, + "step": 17935 + }, + { + "epoch": 1.0043677903460635, + "grad_norm": 1.3769229650497437, + "learning_rate": 8.9635e-05, + "loss": 0.4401, + "step": 17936 + }, + { + "epoch": 1.0044237876581925, + "grad_norm": 1.2523490190505981, + "learning_rate": 8.964e-05, + "loss": 0.3812, + "step": 17937 + }, + { + "epoch": 1.0044797849703215, + "grad_norm": 1.2642509937286377, + "learning_rate": 8.9645e-05, + "loss": 0.4266, + "step": 17938 + }, + { + "epoch": 1.0045357822824506, + "grad_norm": 1.5512166023254395, + "learning_rate": 8.965e-05, + "loss": 0.4254, + "step": 17939 + }, + { + "epoch": 1.0045917795945796, + "grad_norm": 1.2381703853607178, + "learning_rate": 8.9655e-05, + "loss": 0.3338, + "step": 17940 + }, + { + "epoch": 1.0046477769067084, + "grad_norm": 1.2523268461227417, + "learning_rate": 8.966e-05, + "loss": 0.4377, + "step": 17941 + }, + { + "epoch": 1.0047037742188374, + "grad_norm": 1.6331632137298584, + "learning_rate": 8.9665e-05, + "loss": 0.5569, + "step": 17942 + }, + { + "epoch": 1.0047597715309664, + "grad_norm": 1.2084189653396606, + "learning_rate": 8.967000000000001e-05, + "loss": 0.5117, + "step": 17943 + }, + { + 
"epoch": 1.0048157688430954, + "grad_norm": 1.5852941274642944, + "learning_rate": 8.967500000000001e-05, + "loss": 0.4931, + "step": 17944 + }, + { + "epoch": 1.0048717661552244, + "grad_norm": 1.2750952243804932, + "learning_rate": 8.968000000000001e-05, + "loss": 0.4169, + "step": 17945 + }, + { + "epoch": 1.0049277634673535, + "grad_norm": 1.368571162223816, + "learning_rate": 8.9685e-05, + "loss": 0.4283, + "step": 17946 + }, + { + "epoch": 1.0049837607794825, + "grad_norm": 1.2909531593322754, + "learning_rate": 8.969e-05, + "loss": 0.4161, + "step": 17947 + }, + { + "epoch": 1.0050397580916115, + "grad_norm": 1.4677278995513916, + "learning_rate": 8.9695e-05, + "loss": 0.3967, + "step": 17948 + }, + { + "epoch": 1.0050957554037405, + "grad_norm": 1.840155839920044, + "learning_rate": 8.970000000000001e-05, + "loss": 0.4164, + "step": 17949 + }, + { + "epoch": 1.0051517527158695, + "grad_norm": 1.129811406135559, + "learning_rate": 8.970500000000001e-05, + "loss": 0.3269, + "step": 17950 + }, + { + "epoch": 1.0052077500279986, + "grad_norm": 1.4732576608657837, + "learning_rate": 8.971e-05, + "loss": 0.4069, + "step": 17951 + }, + { + "epoch": 1.0052637473401276, + "grad_norm": 1.497945785522461, + "learning_rate": 8.9715e-05, + "loss": 0.4268, + "step": 17952 + }, + { + "epoch": 1.0053197446522566, + "grad_norm": 1.3697718381881714, + "learning_rate": 8.972e-05, + "loss": 0.467, + "step": 17953 + }, + { + "epoch": 1.0053757419643856, + "grad_norm": 1.3891284465789795, + "learning_rate": 8.9725e-05, + "loss": 0.3637, + "step": 17954 + }, + { + "epoch": 1.0054317392765146, + "grad_norm": 1.3554348945617676, + "learning_rate": 8.973e-05, + "loss": 0.5177, + "step": 17955 + }, + { + "epoch": 1.0054877365886437, + "grad_norm": 1.5390160083770752, + "learning_rate": 8.973500000000001e-05, + "loss": 0.4884, + "step": 17956 + }, + { + "epoch": 1.0055437339007727, + "grad_norm": 1.1982821226119995, + "learning_rate": 8.974e-05, + "loss": 0.4356, + "step": 17957 + }, 
+ { + "epoch": 1.0055997312129017, + "grad_norm": 1.4162977933883667, + "learning_rate": 8.9745e-05, + "loss": 0.4273, + "step": 17958 + }, + { + "epoch": 1.0056557285250307, + "grad_norm": 1.566547155380249, + "learning_rate": 8.975e-05, + "loss": 0.5884, + "step": 17959 + }, + { + "epoch": 1.0057117258371597, + "grad_norm": 1.2957085371017456, + "learning_rate": 8.9755e-05, + "loss": 0.3804, + "step": 17960 + }, + { + "epoch": 1.0057677231492888, + "grad_norm": 1.5151442289352417, + "learning_rate": 8.976e-05, + "loss": 0.5292, + "step": 17961 + }, + { + "epoch": 1.0058237204614178, + "grad_norm": 1.4708261489868164, + "learning_rate": 8.9765e-05, + "loss": 0.4407, + "step": 17962 + }, + { + "epoch": 1.0058797177735468, + "grad_norm": 1.3044770956039429, + "learning_rate": 8.977000000000002e-05, + "loss": 0.6537, + "step": 17963 + }, + { + "epoch": 1.0059357150856758, + "grad_norm": 1.1265450716018677, + "learning_rate": 8.977500000000001e-05, + "loss": 0.4736, + "step": 17964 + }, + { + "epoch": 1.0059917123978048, + "grad_norm": 1.3219960927963257, + "learning_rate": 8.978000000000001e-05, + "loss": 0.3908, + "step": 17965 + }, + { + "epoch": 1.0060477097099338, + "grad_norm": 1.1987253427505493, + "learning_rate": 8.978500000000001e-05, + "loss": 0.4455, + "step": 17966 + }, + { + "epoch": 1.0061037070220629, + "grad_norm": 1.7745444774627686, + "learning_rate": 8.979e-05, + "loss": 0.5543, + "step": 17967 + }, + { + "epoch": 1.0061597043341919, + "grad_norm": 1.3833746910095215, + "learning_rate": 8.9795e-05, + "loss": 0.5038, + "step": 17968 + }, + { + "epoch": 1.006215701646321, + "grad_norm": 1.6064419746398926, + "learning_rate": 8.98e-05, + "loss": 0.4707, + "step": 17969 + }, + { + "epoch": 1.00627169895845, + "grad_norm": 1.8970528841018677, + "learning_rate": 8.980500000000001e-05, + "loss": 0.532, + "step": 17970 + }, + { + "epoch": 1.006327696270579, + "grad_norm": 1.2309457063674927, + "learning_rate": 8.981000000000001e-05, + "loss": 0.375, + 
"step": 17971 + }, + { + "epoch": 1.006383693582708, + "grad_norm": 1.2881358861923218, + "learning_rate": 8.9815e-05, + "loss": 0.4442, + "step": 17972 + }, + { + "epoch": 1.006439690894837, + "grad_norm": 2.184856653213501, + "learning_rate": 8.982e-05, + "loss": 0.3165, + "step": 17973 + }, + { + "epoch": 1.006495688206966, + "grad_norm": 1.2457536458969116, + "learning_rate": 8.9825e-05, + "loss": 0.3519, + "step": 17974 + }, + { + "epoch": 1.006551685519095, + "grad_norm": 1.2922468185424805, + "learning_rate": 8.983e-05, + "loss": 0.361, + "step": 17975 + }, + { + "epoch": 1.006607682831224, + "grad_norm": 1.3228814601898193, + "learning_rate": 8.9835e-05, + "loss": 0.377, + "step": 17976 + }, + { + "epoch": 1.006663680143353, + "grad_norm": 1.4710068702697754, + "learning_rate": 8.984000000000001e-05, + "loss": 0.4726, + "step": 17977 + }, + { + "epoch": 1.006719677455482, + "grad_norm": 1.1677091121673584, + "learning_rate": 8.9845e-05, + "loss": 0.5768, + "step": 17978 + }, + { + "epoch": 1.006775674767611, + "grad_norm": 1.220658302307129, + "learning_rate": 8.985e-05, + "loss": 0.4234, + "step": 17979 + }, + { + "epoch": 1.0068316720797401, + "grad_norm": 1.3482164144515991, + "learning_rate": 8.9855e-05, + "loss": 0.3233, + "step": 17980 + }, + { + "epoch": 1.0068876693918691, + "grad_norm": 1.226611614227295, + "learning_rate": 8.986e-05, + "loss": 0.4663, + "step": 17981 + }, + { + "epoch": 1.0069436667039982, + "grad_norm": 2.298543691635132, + "learning_rate": 8.9865e-05, + "loss": 0.4426, + "step": 17982 + }, + { + "epoch": 1.0069996640161272, + "grad_norm": 1.257027268409729, + "learning_rate": 8.987e-05, + "loss": 0.454, + "step": 17983 + }, + { + "epoch": 1.0070556613282562, + "grad_norm": 1.2130361795425415, + "learning_rate": 8.9875e-05, + "loss": 0.5599, + "step": 17984 + }, + { + "epoch": 1.0071116586403852, + "grad_norm": 1.4071629047393799, + "learning_rate": 8.988000000000001e-05, + "loss": 0.4986, + "step": 17985 + }, + { + "epoch": 
1.0071676559525142, + "grad_norm": 1.417580485343933, + "learning_rate": 8.988500000000001e-05, + "loss": 0.426, + "step": 17986 + }, + { + "epoch": 1.0072236532646432, + "grad_norm": 1.2887449264526367, + "learning_rate": 8.989000000000001e-05, + "loss": 0.3997, + "step": 17987 + }, + { + "epoch": 1.0072796505767723, + "grad_norm": 0.9595540165901184, + "learning_rate": 8.9895e-05, + "loss": 0.2766, + "step": 17988 + }, + { + "epoch": 1.0073356478889013, + "grad_norm": 1.271773099899292, + "learning_rate": 8.99e-05, + "loss": 0.4387, + "step": 17989 + }, + { + "epoch": 1.0073916452010303, + "grad_norm": 1.2614787817001343, + "learning_rate": 8.9905e-05, + "loss": 0.3617, + "step": 17990 + }, + { + "epoch": 1.0074476425131593, + "grad_norm": 1.2842191457748413, + "learning_rate": 8.991000000000001e-05, + "loss": 0.3945, + "step": 17991 + }, + { + "epoch": 1.0075036398252883, + "grad_norm": 1.3162118196487427, + "learning_rate": 8.991500000000001e-05, + "loss": 0.402, + "step": 17992 + }, + { + "epoch": 1.0075596371374174, + "grad_norm": 1.1518367528915405, + "learning_rate": 8.992e-05, + "loss": 0.453, + "step": 17993 + }, + { + "epoch": 1.0076156344495464, + "grad_norm": 1.467724323272705, + "learning_rate": 8.9925e-05, + "loss": 0.3989, + "step": 17994 + }, + { + "epoch": 1.0076716317616754, + "grad_norm": 1.5824946165084839, + "learning_rate": 8.993e-05, + "loss": 0.4059, + "step": 17995 + }, + { + "epoch": 1.0077276290738044, + "grad_norm": 1.2596561908721924, + "learning_rate": 8.9935e-05, + "loss": 0.3671, + "step": 17996 + }, + { + "epoch": 1.0077836263859334, + "grad_norm": 1.2339483499526978, + "learning_rate": 8.994e-05, + "loss": 0.359, + "step": 17997 + }, + { + "epoch": 1.0078396236980625, + "grad_norm": 1.5595121383666992, + "learning_rate": 8.994500000000001e-05, + "loss": 0.4069, + "step": 17998 + }, + { + "epoch": 1.0078956210101915, + "grad_norm": 1.2758089303970337, + "learning_rate": 8.995e-05, + "loss": 0.5216, + "step": 17999 + }, + { + 
"epoch": 1.0079516183223205, + "grad_norm": 1.5260242223739624, + "learning_rate": 8.9955e-05, + "loss": 0.49, + "step": 18000 + }, + { + "epoch": 1.0080076156344495, + "grad_norm": 1.3660736083984375, + "learning_rate": 8.996e-05, + "loss": 0.3987, + "step": 18001 + }, + { + "epoch": 1.0080636129465785, + "grad_norm": 1.0238418579101562, + "learning_rate": 8.9965e-05, + "loss": 0.3057, + "step": 18002 + }, + { + "epoch": 1.0081196102587076, + "grad_norm": 1.3377869129180908, + "learning_rate": 8.997000000000001e-05, + "loss": 0.5516, + "step": 18003 + }, + { + "epoch": 1.0081756075708366, + "grad_norm": 1.399433970451355, + "learning_rate": 8.9975e-05, + "loss": 0.433, + "step": 18004 + }, + { + "epoch": 1.0082316048829656, + "grad_norm": 1.4594414234161377, + "learning_rate": 8.998e-05, + "loss": 0.3464, + "step": 18005 + }, + { + "epoch": 1.0082876021950946, + "grad_norm": 1.1530085802078247, + "learning_rate": 8.998500000000001e-05, + "loss": 0.3205, + "step": 18006 + }, + { + "epoch": 1.0083435995072236, + "grad_norm": 1.2954506874084473, + "learning_rate": 8.999000000000001e-05, + "loss": 0.3969, + "step": 18007 + }, + { + "epoch": 1.0083995968193527, + "grad_norm": 1.3576916456222534, + "learning_rate": 8.999500000000001e-05, + "loss": 0.4051, + "step": 18008 + }, + { + "epoch": 1.0084555941314817, + "grad_norm": 1.2132610082626343, + "learning_rate": 9e-05, + "loss": 0.4627, + "step": 18009 + }, + { + "epoch": 1.0085115914436107, + "grad_norm": 1.4459677934646606, + "learning_rate": 9.0005e-05, + "loss": 0.4382, + "step": 18010 + }, + { + "epoch": 1.0085675887557397, + "grad_norm": 1.4301097393035889, + "learning_rate": 9.001e-05, + "loss": 0.477, + "step": 18011 + }, + { + "epoch": 1.0086235860678687, + "grad_norm": 1.1272304058074951, + "learning_rate": 9.001500000000001e-05, + "loss": 0.3235, + "step": 18012 + }, + { + "epoch": 1.0086795833799977, + "grad_norm": 1.4270473718643188, + "learning_rate": 9.002000000000001e-05, + "loss": 0.4742, + "step": 
18013 + }, + { + "epoch": 1.0087355806921268, + "grad_norm": 1.3131418228149414, + "learning_rate": 9.0025e-05, + "loss": 0.3297, + "step": 18014 + }, + { + "epoch": 1.0087915780042558, + "grad_norm": 1.2703737020492554, + "learning_rate": 9.003e-05, + "loss": 0.3442, + "step": 18015 + }, + { + "epoch": 1.0088475753163848, + "grad_norm": 1.1593832969665527, + "learning_rate": 9.0035e-05, + "loss": 0.3833, + "step": 18016 + }, + { + "epoch": 1.0089035726285138, + "grad_norm": 1.2208229303359985, + "learning_rate": 9.004e-05, + "loss": 0.2724, + "step": 18017 + }, + { + "epoch": 1.0089595699406428, + "grad_norm": 1.154589056968689, + "learning_rate": 9.0045e-05, + "loss": 0.3801, + "step": 18018 + }, + { + "epoch": 1.0090155672527719, + "grad_norm": 1.2062815427780151, + "learning_rate": 9.005000000000001e-05, + "loss": 0.3698, + "step": 18019 + }, + { + "epoch": 1.0090715645649009, + "grad_norm": 1.1477242708206177, + "learning_rate": 9.0055e-05, + "loss": 0.3661, + "step": 18020 + }, + { + "epoch": 1.00912756187703, + "grad_norm": 1.3812172412872314, + "learning_rate": 9.006e-05, + "loss": 0.4077, + "step": 18021 + }, + { + "epoch": 1.009183559189159, + "grad_norm": 1.115505337715149, + "learning_rate": 9.0065e-05, + "loss": 0.4108, + "step": 18022 + }, + { + "epoch": 1.009239556501288, + "grad_norm": 1.1311782598495483, + "learning_rate": 9.007e-05, + "loss": 0.3722, + "step": 18023 + }, + { + "epoch": 1.009295553813417, + "grad_norm": 1.1441736221313477, + "learning_rate": 9.007500000000001e-05, + "loss": 0.3694, + "step": 18024 + }, + { + "epoch": 1.009351551125546, + "grad_norm": 1.6131408214569092, + "learning_rate": 9.008e-05, + "loss": 0.4297, + "step": 18025 + }, + { + "epoch": 1.009407548437675, + "grad_norm": 1.1815921068191528, + "learning_rate": 9.0085e-05, + "loss": 0.4238, + "step": 18026 + }, + { + "epoch": 1.009463545749804, + "grad_norm": 1.164927363395691, + "learning_rate": 9.009000000000001e-05, + "loss": 0.4517, + "step": 18027 + }, + { + 
"epoch": 1.009519543061933, + "grad_norm": 1.2917762994766235, + "learning_rate": 9.009500000000001e-05, + "loss": 0.3382, + "step": 18028 + }, + { + "epoch": 1.009575540374062, + "grad_norm": 1.2980742454528809, + "learning_rate": 9.010000000000001e-05, + "loss": 0.4674, + "step": 18029 + }, + { + "epoch": 1.009631537686191, + "grad_norm": 1.3152427673339844, + "learning_rate": 9.0105e-05, + "loss": 0.5338, + "step": 18030 + }, + { + "epoch": 1.00968753499832, + "grad_norm": 1.1749911308288574, + "learning_rate": 9.011e-05, + "loss": 0.5392, + "step": 18031 + }, + { + "epoch": 1.009743532310449, + "grad_norm": 1.306301236152649, + "learning_rate": 9.0115e-05, + "loss": 0.4575, + "step": 18032 + }, + { + "epoch": 1.0097995296225781, + "grad_norm": 1.0710113048553467, + "learning_rate": 9.012e-05, + "loss": 0.3521, + "step": 18033 + }, + { + "epoch": 1.0098555269347071, + "grad_norm": 1.607362985610962, + "learning_rate": 9.012500000000001e-05, + "loss": 0.4986, + "step": 18034 + }, + { + "epoch": 1.0099115242468362, + "grad_norm": 1.424028992652893, + "learning_rate": 9.013e-05, + "loss": 0.4623, + "step": 18035 + }, + { + "epoch": 1.0099675215589652, + "grad_norm": 1.2317618131637573, + "learning_rate": 9.0135e-05, + "loss": 0.3437, + "step": 18036 + }, + { + "epoch": 1.0100235188710942, + "grad_norm": 1.0915732383728027, + "learning_rate": 9.014e-05, + "loss": 0.3834, + "step": 18037 + }, + { + "epoch": 1.0100795161832232, + "grad_norm": 1.811660647392273, + "learning_rate": 9.0145e-05, + "loss": 0.3943, + "step": 18038 + }, + { + "epoch": 1.0101355134953522, + "grad_norm": 1.3503577709197998, + "learning_rate": 9.015e-05, + "loss": 0.4379, + "step": 18039 + }, + { + "epoch": 1.0101915108074813, + "grad_norm": 1.38282310962677, + "learning_rate": 9.0155e-05, + "loss": 0.5986, + "step": 18040 + }, + { + "epoch": 1.0102475081196103, + "grad_norm": 1.1291764974594116, + "learning_rate": 9.016e-05, + "loss": 0.3217, + "step": 18041 + }, + { + "epoch": 
1.0103035054317393, + "grad_norm": 1.450632929801941, + "learning_rate": 9.0165e-05, + "loss": 0.3424, + "step": 18042 + }, + { + "epoch": 1.0103595027438683, + "grad_norm": 1.3379839658737183, + "learning_rate": 9.017e-05, + "loss": 0.3377, + "step": 18043 + }, + { + "epoch": 1.0104155000559973, + "grad_norm": 1.598143458366394, + "learning_rate": 9.017500000000001e-05, + "loss": 0.4294, + "step": 18044 + }, + { + "epoch": 1.0104714973681264, + "grad_norm": 1.1232203245162964, + "learning_rate": 9.018000000000001e-05, + "loss": 0.3088, + "step": 18045 + }, + { + "epoch": 1.0105274946802554, + "grad_norm": 1.3722296953201294, + "learning_rate": 9.0185e-05, + "loss": 0.466, + "step": 18046 + }, + { + "epoch": 1.0105834919923844, + "grad_norm": 2.807011127471924, + "learning_rate": 9.019e-05, + "loss": 0.5928, + "step": 18047 + }, + { + "epoch": 1.0106394893045134, + "grad_norm": 1.2643423080444336, + "learning_rate": 9.019500000000001e-05, + "loss": 0.4106, + "step": 18048 + }, + { + "epoch": 1.0106954866166424, + "grad_norm": 1.3486216068267822, + "learning_rate": 9.020000000000001e-05, + "loss": 0.4426, + "step": 18049 + }, + { + "epoch": 1.0107514839287715, + "grad_norm": 1.446391224861145, + "learning_rate": 9.020500000000001e-05, + "loss": 0.3916, + "step": 18050 + }, + { + "epoch": 1.0108074812409005, + "grad_norm": 1.3402318954467773, + "learning_rate": 9.021e-05, + "loss": 0.3619, + "step": 18051 + }, + { + "epoch": 1.0108634785530295, + "grad_norm": 1.302295446395874, + "learning_rate": 9.0215e-05, + "loss": 0.2946, + "step": 18052 + }, + { + "epoch": 1.0109194758651585, + "grad_norm": 1.367607831954956, + "learning_rate": 9.022e-05, + "loss": 0.4433, + "step": 18053 + }, + { + "epoch": 1.0109754731772875, + "grad_norm": 1.4880040884017944, + "learning_rate": 9.0225e-05, + "loss": 0.4124, + "step": 18054 + }, + { + "epoch": 1.0110314704894166, + "grad_norm": 1.5772989988327026, + "learning_rate": 9.023000000000001e-05, + "loss": 0.5283, + "step": 18055 + }, 
+ { + "epoch": 1.0110874678015456, + "grad_norm": 1.2875374555587769, + "learning_rate": 9.0235e-05, + "loss": 0.5414, + "step": 18056 + }, + { + "epoch": 1.0111434651136746, + "grad_norm": 1.363318920135498, + "learning_rate": 9.024e-05, + "loss": 0.484, + "step": 18057 + }, + { + "epoch": 1.0111994624258036, + "grad_norm": 1.4215304851531982, + "learning_rate": 9.0245e-05, + "loss": 0.3954, + "step": 18058 + }, + { + "epoch": 1.0112554597379326, + "grad_norm": 1.273647665977478, + "learning_rate": 9.025e-05, + "loss": 0.3881, + "step": 18059 + }, + { + "epoch": 1.0113114570500616, + "grad_norm": 1.1219748258590698, + "learning_rate": 9.0255e-05, + "loss": 0.432, + "step": 18060 + }, + { + "epoch": 1.0113674543621907, + "grad_norm": 1.283856749534607, + "learning_rate": 9.026e-05, + "loss": 0.3797, + "step": 18061 + }, + { + "epoch": 1.0114234516743197, + "grad_norm": 1.2342506647109985, + "learning_rate": 9.0265e-05, + "loss": 0.3694, + "step": 18062 + }, + { + "epoch": 1.0114794489864487, + "grad_norm": 1.2998380661010742, + "learning_rate": 9.027e-05, + "loss": 0.3719, + "step": 18063 + }, + { + "epoch": 1.0115354462985777, + "grad_norm": 1.250680685043335, + "learning_rate": 9.027500000000001e-05, + "loss": 0.4127, + "step": 18064 + }, + { + "epoch": 1.0115914436107067, + "grad_norm": 1.2504264116287231, + "learning_rate": 9.028000000000001e-05, + "loss": 0.4795, + "step": 18065 + }, + { + "epoch": 1.0116474409228358, + "grad_norm": 1.4863789081573486, + "learning_rate": 9.028500000000001e-05, + "loss": 0.4474, + "step": 18066 + }, + { + "epoch": 1.0117034382349648, + "grad_norm": 1.2361491918563843, + "learning_rate": 9.029e-05, + "loss": 0.4698, + "step": 18067 + }, + { + "epoch": 1.0117594355470938, + "grad_norm": 1.6292223930358887, + "learning_rate": 9.0295e-05, + "loss": 0.4569, + "step": 18068 + }, + { + "epoch": 1.0118154328592228, + "grad_norm": 1.5858267545700073, + "learning_rate": 9.030000000000001e-05, + "loss": 0.3559, + "step": 18069 + }, + { + 
"epoch": 1.0118714301713518, + "grad_norm": 1.1331160068511963, + "learning_rate": 9.030500000000001e-05, + "loss": 0.295, + "step": 18070 + }, + { + "epoch": 1.0119274274834809, + "grad_norm": 1.5241893529891968, + "learning_rate": 9.031000000000001e-05, + "loss": 0.4411, + "step": 18071 + }, + { + "epoch": 1.0119834247956099, + "grad_norm": 1.3934721946716309, + "learning_rate": 9.0315e-05, + "loss": 0.5948, + "step": 18072 + }, + { + "epoch": 1.012039422107739, + "grad_norm": 1.6788913011550903, + "learning_rate": 9.032e-05, + "loss": 0.3659, + "step": 18073 + }, + { + "epoch": 1.012095419419868, + "grad_norm": 1.335060715675354, + "learning_rate": 9.0325e-05, + "loss": 0.31, + "step": 18074 + }, + { + "epoch": 1.012151416731997, + "grad_norm": 1.4532748460769653, + "learning_rate": 9.033e-05, + "loss": 0.524, + "step": 18075 + }, + { + "epoch": 1.012207414044126, + "grad_norm": 1.3463493585586548, + "learning_rate": 9.033500000000001e-05, + "loss": 0.3406, + "step": 18076 + }, + { + "epoch": 1.012263411356255, + "grad_norm": 1.3908874988555908, + "learning_rate": 9.034e-05, + "loss": 0.3856, + "step": 18077 + }, + { + "epoch": 1.012319408668384, + "grad_norm": 1.3213330507278442, + "learning_rate": 9.0345e-05, + "loss": 0.3887, + "step": 18078 + }, + { + "epoch": 1.012375405980513, + "grad_norm": 1.2592127323150635, + "learning_rate": 9.035e-05, + "loss": 0.3366, + "step": 18079 + }, + { + "epoch": 1.012431403292642, + "grad_norm": 1.0710480213165283, + "learning_rate": 9.0355e-05, + "loss": 0.3704, + "step": 18080 + }, + { + "epoch": 1.012487400604771, + "grad_norm": 1.4461910724639893, + "learning_rate": 9.036e-05, + "loss": 0.5877, + "step": 18081 + }, + { + "epoch": 1.0125433979169, + "grad_norm": 1.5060564279556274, + "learning_rate": 9.0365e-05, + "loss": 0.5551, + "step": 18082 + }, + { + "epoch": 1.012599395229029, + "grad_norm": 1.3501626253128052, + "learning_rate": 9.037e-05, + "loss": 0.4159, + "step": 18083 + }, + { + "epoch": 1.012655392541158, + 
"grad_norm": 1.2449951171875, + "learning_rate": 9.037500000000001e-05, + "loss": 0.4655, + "step": 18084 + }, + { + "epoch": 1.0127113898532871, + "grad_norm": 1.2275044918060303, + "learning_rate": 9.038000000000001e-05, + "loss": 0.3742, + "step": 18085 + }, + { + "epoch": 1.0127673871654161, + "grad_norm": 1.4492491483688354, + "learning_rate": 9.038500000000001e-05, + "loss": 0.4606, + "step": 18086 + }, + { + "epoch": 1.0128233844775452, + "grad_norm": 1.2193833589553833, + "learning_rate": 9.039000000000001e-05, + "loss": 0.447, + "step": 18087 + }, + { + "epoch": 1.0128793817896742, + "grad_norm": 1.247789978981018, + "learning_rate": 9.0395e-05, + "loss": 0.446, + "step": 18088 + }, + { + "epoch": 1.0129353791018032, + "grad_norm": 1.3559398651123047, + "learning_rate": 9.04e-05, + "loss": 0.5124, + "step": 18089 + }, + { + "epoch": 1.0129913764139322, + "grad_norm": 1.4070031642913818, + "learning_rate": 9.040500000000001e-05, + "loss": 0.39, + "step": 18090 + }, + { + "epoch": 1.0130473737260612, + "grad_norm": 1.2927995920181274, + "learning_rate": 9.041000000000001e-05, + "loss": 0.3433, + "step": 18091 + }, + { + "epoch": 1.0131033710381903, + "grad_norm": 1.4199851751327515, + "learning_rate": 9.041500000000001e-05, + "loss": 0.3848, + "step": 18092 + }, + { + "epoch": 1.0131593683503193, + "grad_norm": 1.090008020401001, + "learning_rate": 9.042e-05, + "loss": 0.3827, + "step": 18093 + }, + { + "epoch": 1.0132153656624483, + "grad_norm": 1.307547926902771, + "learning_rate": 9.0425e-05, + "loss": 0.4308, + "step": 18094 + }, + { + "epoch": 1.0132713629745773, + "grad_norm": 1.404300332069397, + "learning_rate": 9.043e-05, + "loss": 0.5085, + "step": 18095 + }, + { + "epoch": 1.0133273602867063, + "grad_norm": 1.1837518215179443, + "learning_rate": 9.0435e-05, + "loss": 0.404, + "step": 18096 + }, + { + "epoch": 1.0133833575988354, + "grad_norm": 1.3322854042053223, + "learning_rate": 9.044000000000001e-05, + "loss": 0.4325, + "step": 18097 + }, + { 
+ "epoch": 1.0134393549109644, + "grad_norm": 1.3566287755966187, + "learning_rate": 9.0445e-05, + "loss": 0.5156, + "step": 18098 + }, + { + "epoch": 1.0134953522230934, + "grad_norm": 1.5615484714508057, + "learning_rate": 9.045e-05, + "loss": 0.3841, + "step": 18099 + }, + { + "epoch": 1.0135513495352224, + "grad_norm": 3.3497726917266846, + "learning_rate": 9.0455e-05, + "loss": 0.4027, + "step": 18100 + }, + { + "epoch": 1.0136073468473514, + "grad_norm": 1.3943313360214233, + "learning_rate": 9.046e-05, + "loss": 0.3069, + "step": 18101 + }, + { + "epoch": 1.0136633441594805, + "grad_norm": 1.2476478815078735, + "learning_rate": 9.0465e-05, + "loss": 0.5064, + "step": 18102 + }, + { + "epoch": 1.0137193414716095, + "grad_norm": 1.4256521463394165, + "learning_rate": 9.046999999999999e-05, + "loss": 0.4229, + "step": 18103 + }, + { + "epoch": 1.0137753387837385, + "grad_norm": 1.2707245349884033, + "learning_rate": 9.0475e-05, + "loss": 0.5447, + "step": 18104 + }, + { + "epoch": 1.0138313360958675, + "grad_norm": 1.1792269945144653, + "learning_rate": 9.048000000000001e-05, + "loss": 0.4014, + "step": 18105 + }, + { + "epoch": 1.0138873334079963, + "grad_norm": 1.4103285074234009, + "learning_rate": 9.048500000000001e-05, + "loss": 0.5126, + "step": 18106 + }, + { + "epoch": 1.0139433307201253, + "grad_norm": 1.3352609872817993, + "learning_rate": 9.049000000000001e-05, + "loss": 0.3986, + "step": 18107 + }, + { + "epoch": 1.0139993280322543, + "grad_norm": 1.3411178588867188, + "learning_rate": 9.049500000000001e-05, + "loss": 0.32, + "step": 18108 + }, + { + "epoch": 1.0140553253443834, + "grad_norm": 1.361093282699585, + "learning_rate": 9.05e-05, + "loss": 0.4446, + "step": 18109 + }, + { + "epoch": 1.0141113226565124, + "grad_norm": 1.238129734992981, + "learning_rate": 9.0505e-05, + "loss": 0.3947, + "step": 18110 + }, + { + "epoch": 1.0141673199686414, + "grad_norm": 1.7928906679153442, + "learning_rate": 9.051000000000001e-05, + "loss": 0.3852, + 
"step": 18111 + }, + { + "epoch": 1.0142233172807704, + "grad_norm": 1.4398564100265503, + "learning_rate": 9.051500000000001e-05, + "loss": 0.4593, + "step": 18112 + }, + { + "epoch": 1.0142793145928994, + "grad_norm": 1.3197253942489624, + "learning_rate": 9.052000000000001e-05, + "loss": 0.3959, + "step": 18113 + }, + { + "epoch": 1.0143353119050285, + "grad_norm": 1.3324925899505615, + "learning_rate": 9.0525e-05, + "loss": 0.3748, + "step": 18114 + }, + { + "epoch": 1.0143913092171575, + "grad_norm": 1.260272741317749, + "learning_rate": 9.053e-05, + "loss": 0.3949, + "step": 18115 + }, + { + "epoch": 1.0144473065292865, + "grad_norm": 1.5691372156143188, + "learning_rate": 9.0535e-05, + "loss": 0.3, + "step": 18116 + }, + { + "epoch": 1.0145033038414155, + "grad_norm": 1.142928123474121, + "learning_rate": 9.054e-05, + "loss": 0.3668, + "step": 18117 + }, + { + "epoch": 1.0145593011535445, + "grad_norm": 1.4724918603897095, + "learning_rate": 9.054500000000001e-05, + "loss": 0.4484, + "step": 18118 + }, + { + "epoch": 1.0146152984656736, + "grad_norm": 1.3410335779190063, + "learning_rate": 9.055e-05, + "loss": 0.4898, + "step": 18119 + }, + { + "epoch": 1.0146712957778026, + "grad_norm": 1.319427728652954, + "learning_rate": 9.0555e-05, + "loss": 0.4391, + "step": 18120 + }, + { + "epoch": 1.0147272930899316, + "grad_norm": 1.2103686332702637, + "learning_rate": 9.056e-05, + "loss": 0.3887, + "step": 18121 + }, + { + "epoch": 1.0147832904020606, + "grad_norm": 1.0283188819885254, + "learning_rate": 9.0565e-05, + "loss": 0.3699, + "step": 18122 + }, + { + "epoch": 1.0148392877141896, + "grad_norm": 1.727451205253601, + "learning_rate": 9.057e-05, + "loss": 0.6659, + "step": 18123 + }, + { + "epoch": 1.0148952850263186, + "grad_norm": 1.2413078546524048, + "learning_rate": 9.0575e-05, + "loss": 0.4249, + "step": 18124 + }, + { + "epoch": 1.0149512823384477, + "grad_norm": 2.3689627647399902, + "learning_rate": 9.058e-05, + "loss": 0.4267, + "step": 18125 + }, 
+ { + "epoch": 1.0150072796505767, + "grad_norm": 1.4286799430847168, + "learning_rate": 9.058500000000001e-05, + "loss": 0.3831, + "step": 18126 + }, + { + "epoch": 1.0150632769627057, + "grad_norm": 1.223140835762024, + "learning_rate": 9.059000000000001e-05, + "loss": 0.4263, + "step": 18127 + }, + { + "epoch": 1.0151192742748347, + "grad_norm": 1.2765775918960571, + "learning_rate": 9.059500000000001e-05, + "loss": 0.4055, + "step": 18128 + }, + { + "epoch": 1.0151752715869637, + "grad_norm": 1.4111768007278442, + "learning_rate": 9.06e-05, + "loss": 0.4041, + "step": 18129 + }, + { + "epoch": 1.0152312688990928, + "grad_norm": 1.454330325126648, + "learning_rate": 9.0605e-05, + "loss": 0.3906, + "step": 18130 + }, + { + "epoch": 1.0152872662112218, + "grad_norm": 1.5828416347503662, + "learning_rate": 9.061e-05, + "loss": 0.4863, + "step": 18131 + }, + { + "epoch": 1.0153432635233508, + "grad_norm": 1.7145265340805054, + "learning_rate": 9.0615e-05, + "loss": 0.5591, + "step": 18132 + }, + { + "epoch": 1.0153992608354798, + "grad_norm": 1.3797694444656372, + "learning_rate": 9.062000000000001e-05, + "loss": 0.4024, + "step": 18133 + }, + { + "epoch": 1.0154552581476088, + "grad_norm": 1.3734862804412842, + "learning_rate": 9.062500000000001e-05, + "loss": 0.3615, + "step": 18134 + }, + { + "epoch": 1.0155112554597379, + "grad_norm": 1.5914740562438965, + "learning_rate": 9.063e-05, + "loss": 0.3757, + "step": 18135 + }, + { + "epoch": 1.0155672527718669, + "grad_norm": 1.196204423904419, + "learning_rate": 9.0635e-05, + "loss": 0.3574, + "step": 18136 + }, + { + "epoch": 1.015623250083996, + "grad_norm": 1.4615166187286377, + "learning_rate": 9.064e-05, + "loss": 0.4394, + "step": 18137 + }, + { + "epoch": 1.015679247396125, + "grad_norm": 1.5073596239089966, + "learning_rate": 9.0645e-05, + "loss": 0.3384, + "step": 18138 + }, + { + "epoch": 1.015735244708254, + "grad_norm": 1.6935042142868042, + "learning_rate": 9.065000000000001e-05, + "loss": 0.6493, + 
"step": 18139 + }, + { + "epoch": 1.015791242020383, + "grad_norm": 1.2316350936889648, + "learning_rate": 9.0655e-05, + "loss": 0.4317, + "step": 18140 + }, + { + "epoch": 1.015847239332512, + "grad_norm": 1.423403024673462, + "learning_rate": 9.066e-05, + "loss": 0.4692, + "step": 18141 + }, + { + "epoch": 1.015903236644641, + "grad_norm": 1.4836894273757935, + "learning_rate": 9.0665e-05, + "loss": 0.4839, + "step": 18142 + }, + { + "epoch": 1.01595923395677, + "grad_norm": 1.208901047706604, + "learning_rate": 9.067e-05, + "loss": 0.2979, + "step": 18143 + }, + { + "epoch": 1.016015231268899, + "grad_norm": 1.1822084188461304, + "learning_rate": 9.0675e-05, + "loss": 0.3664, + "step": 18144 + }, + { + "epoch": 1.016071228581028, + "grad_norm": 1.3010916709899902, + "learning_rate": 9.068e-05, + "loss": 0.4969, + "step": 18145 + }, + { + "epoch": 1.016127225893157, + "grad_norm": 1.1672778129577637, + "learning_rate": 9.0685e-05, + "loss": 0.4616, + "step": 18146 + }, + { + "epoch": 1.016183223205286, + "grad_norm": 1.3962794542312622, + "learning_rate": 9.069000000000001e-05, + "loss": 0.3049, + "step": 18147 + }, + { + "epoch": 1.016239220517415, + "grad_norm": 1.4669468402862549, + "learning_rate": 9.069500000000001e-05, + "loss": 0.4411, + "step": 18148 + }, + { + "epoch": 1.0162952178295441, + "grad_norm": 1.3549784421920776, + "learning_rate": 9.070000000000001e-05, + "loss": 0.3852, + "step": 18149 + }, + { + "epoch": 1.0163512151416731, + "grad_norm": 1.1801265478134155, + "learning_rate": 9.0705e-05, + "loss": 0.3111, + "step": 18150 + }, + { + "epoch": 1.0164072124538022, + "grad_norm": 1.7732163667678833, + "learning_rate": 9.071e-05, + "loss": 0.4371, + "step": 18151 + }, + { + "epoch": 1.0164632097659312, + "grad_norm": 1.2452828884124756, + "learning_rate": 9.0715e-05, + "loss": 0.4395, + "step": 18152 + }, + { + "epoch": 1.0165192070780602, + "grad_norm": 1.396883487701416, + "learning_rate": 9.072e-05, + "loss": 0.4847, + "step": 18153 + }, + { + 
"epoch": 1.0165752043901892, + "grad_norm": 1.3976457118988037, + "learning_rate": 9.072500000000001e-05, + "loss": 0.4056, + "step": 18154 + }, + { + "epoch": 1.0166312017023182, + "grad_norm": 1.2956854104995728, + "learning_rate": 9.073000000000001e-05, + "loss": 0.402, + "step": 18155 + }, + { + "epoch": 1.0166871990144473, + "grad_norm": 1.400383472442627, + "learning_rate": 9.0735e-05, + "loss": 0.3289, + "step": 18156 + }, + { + "epoch": 1.0167431963265763, + "grad_norm": 1.3283774852752686, + "learning_rate": 9.074e-05, + "loss": 0.3727, + "step": 18157 + }, + { + "epoch": 1.0167991936387053, + "grad_norm": 1.4578040838241577, + "learning_rate": 9.0745e-05, + "loss": 0.4059, + "step": 18158 + }, + { + "epoch": 1.0168551909508343, + "grad_norm": 1.25513756275177, + "learning_rate": 9.075e-05, + "loss": 0.4249, + "step": 18159 + }, + { + "epoch": 1.0169111882629633, + "grad_norm": 2.642794370651245, + "learning_rate": 9.075500000000001e-05, + "loss": 0.5193, + "step": 18160 + }, + { + "epoch": 1.0169671855750924, + "grad_norm": 1.7181202173233032, + "learning_rate": 9.076e-05, + "loss": 0.4596, + "step": 18161 + }, + { + "epoch": 1.0170231828872214, + "grad_norm": 1.3589255809783936, + "learning_rate": 9.0765e-05, + "loss": 0.615, + "step": 18162 + }, + { + "epoch": 1.0170791801993504, + "grad_norm": 1.5083166360855103, + "learning_rate": 9.077e-05, + "loss": 0.4295, + "step": 18163 + }, + { + "epoch": 1.0171351775114794, + "grad_norm": 2.4867968559265137, + "learning_rate": 9.0775e-05, + "loss": 0.4186, + "step": 18164 + }, + { + "epoch": 1.0171911748236084, + "grad_norm": 1.244420051574707, + "learning_rate": 9.078000000000001e-05, + "loss": 0.399, + "step": 18165 + }, + { + "epoch": 1.0172471721357375, + "grad_norm": 1.086611270904541, + "learning_rate": 9.0785e-05, + "loss": 0.4294, + "step": 18166 + }, + { + "epoch": 1.0173031694478665, + "grad_norm": 1.6594682931900024, + "learning_rate": 9.079e-05, + "loss": 0.6585, + "step": 18167 + }, + { + "epoch": 
1.0173591667599955, + "grad_norm": null, + "learning_rate": 9.079e-05, + "loss": 0.4973, + "step": 18168 + }, + { + "epoch": 1.0174151640721245, + "grad_norm": 1.4147491455078125, + "learning_rate": 9.079500000000001e-05, + "loss": 0.3646, + "step": 18169 + }, + { + "epoch": 1.0174711613842535, + "grad_norm": 1.4743132591247559, + "learning_rate": 9.080000000000001e-05, + "loss": 0.4226, + "step": 18170 + }, + { + "epoch": 1.0175271586963825, + "grad_norm": 1.4382675886154175, + "learning_rate": 9.080500000000001e-05, + "loss": 0.4246, + "step": 18171 + }, + { + "epoch": 1.0175831560085116, + "grad_norm": 1.3491721153259277, + "learning_rate": 9.081e-05, + "loss": 0.5039, + "step": 18172 + }, + { + "epoch": 1.0176391533206406, + "grad_norm": 1.3085360527038574, + "learning_rate": 9.0815e-05, + "loss": 0.4194, + "step": 18173 + }, + { + "epoch": 1.0176951506327696, + "grad_norm": 1.3242313861846924, + "learning_rate": 9.082e-05, + "loss": 0.4244, + "step": 18174 + }, + { + "epoch": 1.0177511479448986, + "grad_norm": 1.317463755607605, + "learning_rate": 9.0825e-05, + "loss": 0.4964, + "step": 18175 + }, + { + "epoch": 1.0178071452570276, + "grad_norm": 1.1602150201797485, + "learning_rate": 9.083000000000001e-05, + "loss": 0.341, + "step": 18176 + }, + { + "epoch": 1.0178631425691567, + "grad_norm": 1.1702351570129395, + "learning_rate": 9.083500000000001e-05, + "loss": 0.3677, + "step": 18177 + }, + { + "epoch": 1.0179191398812857, + "grad_norm": 1.5528980493545532, + "learning_rate": 9.084e-05, + "loss": 0.4968, + "step": 18178 + }, + { + "epoch": 1.0179751371934147, + "grad_norm": 1.2101691961288452, + "learning_rate": 9.0845e-05, + "loss": 0.3585, + "step": 18179 + }, + { + "epoch": 1.0180311345055437, + "grad_norm": 1.367945671081543, + "learning_rate": 9.085e-05, + "loss": 0.3716, + "step": 18180 + }, + { + "epoch": 1.0180871318176727, + "grad_norm": 1.5212262868881226, + "learning_rate": 9.0855e-05, + "loss": 0.4099, + "step": 18181 + }, + { + "epoch": 
1.0181431291298018, + "grad_norm": 1.1195323467254639, + "learning_rate": 9.086e-05, + "loss": 0.3555, + "step": 18182 + }, + { + "epoch": 1.0181991264419308, + "grad_norm": 1.7062853574752808, + "learning_rate": 9.0865e-05, + "loss": 0.4578, + "step": 18183 + }, + { + "epoch": 1.0182551237540598, + "grad_norm": 1.607496976852417, + "learning_rate": 9.087e-05, + "loss": 0.5234, + "step": 18184 + }, + { + "epoch": 1.0183111210661888, + "grad_norm": 1.255609393119812, + "learning_rate": 9.0875e-05, + "loss": 0.5072, + "step": 18185 + }, + { + "epoch": 1.0183671183783178, + "grad_norm": 1.1043859720230103, + "learning_rate": 9.088000000000001e-05, + "loss": 0.4505, + "step": 18186 + }, + { + "epoch": 1.0184231156904469, + "grad_norm": 1.301988959312439, + "learning_rate": 9.088500000000001e-05, + "loss": 0.4329, + "step": 18187 + }, + { + "epoch": 1.0184791130025759, + "grad_norm": 1.3064353466033936, + "learning_rate": 9.089e-05, + "loss": 0.3995, + "step": 18188 + }, + { + "epoch": 1.018535110314705, + "grad_norm": 1.32597815990448, + "learning_rate": 9.0895e-05, + "loss": 0.4248, + "step": 18189 + }, + { + "epoch": 1.018591107626834, + "grad_norm": 1.489014983177185, + "learning_rate": 9.090000000000001e-05, + "loss": 0.4669, + "step": 18190 + }, + { + "epoch": 1.018647104938963, + "grad_norm": 1.5068203210830688, + "learning_rate": 9.090500000000001e-05, + "loss": 0.4461, + "step": 18191 + }, + { + "epoch": 1.018703102251092, + "grad_norm": 1.1500208377838135, + "learning_rate": 9.091000000000001e-05, + "loss": 0.3835, + "step": 18192 + }, + { + "epoch": 1.018759099563221, + "grad_norm": 1.5500932931900024, + "learning_rate": 9.0915e-05, + "loss": 0.4162, + "step": 18193 + }, + { + "epoch": 1.01881509687535, + "grad_norm": 1.2251145839691162, + "learning_rate": 9.092e-05, + "loss": 0.3043, + "step": 18194 + }, + { + "epoch": 1.018871094187479, + "grad_norm": 1.2725141048431396, + "learning_rate": 9.0925e-05, + "loss": 0.4343, + "step": 18195 + }, + { + "epoch": 
1.018927091499608, + "grad_norm": 1.1705259084701538, + "learning_rate": 9.093e-05, + "loss": 0.3781, + "step": 18196 + }, + { + "epoch": 1.018983088811737, + "grad_norm": 1.4429186582565308, + "learning_rate": 9.093500000000001e-05, + "loss": 0.5007, + "step": 18197 + }, + { + "epoch": 1.019039086123866, + "grad_norm": 1.3703272342681885, + "learning_rate": 9.094000000000001e-05, + "loss": 0.54, + "step": 18198 + }, + { + "epoch": 1.019095083435995, + "grad_norm": 1.161527395248413, + "learning_rate": 9.0945e-05, + "loss": 0.3372, + "step": 18199 + }, + { + "epoch": 1.019151080748124, + "grad_norm": 1.3180170059204102, + "learning_rate": 9.095e-05, + "loss": 0.4064, + "step": 18200 + }, + { + "epoch": 1.0192070780602531, + "grad_norm": 1.463131070137024, + "learning_rate": 9.0955e-05, + "loss": 0.5625, + "step": 18201 + }, + { + "epoch": 1.0192630753723821, + "grad_norm": 1.2182964086532593, + "learning_rate": 9.096e-05, + "loss": 0.3779, + "step": 18202 + }, + { + "epoch": 1.0193190726845112, + "grad_norm": 1.4279230833053589, + "learning_rate": 9.0965e-05, + "loss": 0.4652, + "step": 18203 + }, + { + "epoch": 1.0193750699966402, + "grad_norm": 1.2451354265213013, + "learning_rate": 9.097e-05, + "loss": 0.4531, + "step": 18204 + }, + { + "epoch": 1.0194310673087692, + "grad_norm": 1.2272124290466309, + "learning_rate": 9.0975e-05, + "loss": 0.3787, + "step": 18205 + }, + { + "epoch": 1.0194870646208982, + "grad_norm": 1.1763030290603638, + "learning_rate": 9.098000000000001e-05, + "loss": 0.4008, + "step": 18206 + }, + { + "epoch": 1.0195430619330272, + "grad_norm": 1.3578581809997559, + "learning_rate": 9.098500000000001e-05, + "loss": 0.4206, + "step": 18207 + }, + { + "epoch": 1.0195990592451563, + "grad_norm": 1.4249365329742432, + "learning_rate": 9.099000000000001e-05, + "loss": 0.4374, + "step": 18208 + }, + { + "epoch": 1.0196550565572853, + "grad_norm": 1.328666090965271, + "learning_rate": 9.0995e-05, + "loss": 0.4919, + "step": 18209 + }, + { + 
"epoch": 1.0197110538694143, + "grad_norm": 1.2589588165283203, + "learning_rate": 9.1e-05, + "loss": 0.5186, + "step": 18210 + }, + { + "epoch": 1.0197670511815433, + "grad_norm": 1.3986668586730957, + "learning_rate": 9.1005e-05, + "loss": 0.3669, + "step": 18211 + }, + { + "epoch": 1.0198230484936723, + "grad_norm": 1.1582605838775635, + "learning_rate": 9.101000000000001e-05, + "loss": 0.4514, + "step": 18212 + }, + { + "epoch": 1.0198790458058014, + "grad_norm": 1.40287184715271, + "learning_rate": 9.101500000000001e-05, + "loss": 0.5558, + "step": 18213 + }, + { + "epoch": 1.0199350431179304, + "grad_norm": 1.3524171113967896, + "learning_rate": 9.102e-05, + "loss": 0.4443, + "step": 18214 + }, + { + "epoch": 1.0199910404300594, + "grad_norm": 1.5427931547164917, + "learning_rate": 9.1025e-05, + "loss": 0.4846, + "step": 18215 + }, + { + "epoch": 1.0200470377421884, + "grad_norm": 1.319544792175293, + "learning_rate": 9.103e-05, + "loss": 0.4859, + "step": 18216 + }, + { + "epoch": 1.0201030350543174, + "grad_norm": 1.5592738389968872, + "learning_rate": 9.1035e-05, + "loss": 0.4762, + "step": 18217 + }, + { + "epoch": 1.0201590323664464, + "grad_norm": 1.3388237953186035, + "learning_rate": 9.104000000000001e-05, + "loss": 0.578, + "step": 18218 + }, + { + "epoch": 1.0202150296785755, + "grad_norm": 1.6388988494873047, + "learning_rate": 9.1045e-05, + "loss": 0.4988, + "step": 18219 + }, + { + "epoch": 1.0202710269907045, + "grad_norm": 1.4908218383789062, + "learning_rate": 9.105e-05, + "loss": 0.3613, + "step": 18220 + }, + { + "epoch": 1.0203270243028335, + "grad_norm": 1.2812426090240479, + "learning_rate": 9.1055e-05, + "loss": 0.2655, + "step": 18221 + }, + { + "epoch": 1.0203830216149625, + "grad_norm": 1.3546313047409058, + "learning_rate": 9.106e-05, + "loss": 0.3827, + "step": 18222 + }, + { + "epoch": 1.0204390189270915, + "grad_norm": 1.247186541557312, + "learning_rate": 9.1065e-05, + "loss": 0.4516, + "step": 18223 + }, + { + "epoch": 
1.0204950162392206, + "grad_norm": 1.2498414516448975, + "learning_rate": 9.107e-05, + "loss": 0.4024, + "step": 18224 + }, + { + "epoch": 1.0205510135513496, + "grad_norm": 1.2576067447662354, + "learning_rate": 9.1075e-05, + "loss": 0.4612, + "step": 18225 + }, + { + "epoch": 1.0206070108634786, + "grad_norm": 1.5343748331069946, + "learning_rate": 9.108000000000002e-05, + "loss": 0.4571, + "step": 18226 + }, + { + "epoch": 1.0206630081756076, + "grad_norm": 1.9127240180969238, + "learning_rate": 9.108500000000001e-05, + "loss": 0.369, + "step": 18227 + }, + { + "epoch": 1.0207190054877366, + "grad_norm": 1.4005156755447388, + "learning_rate": 9.109000000000001e-05, + "loss": 0.508, + "step": 18228 + }, + { + "epoch": 1.0207750027998657, + "grad_norm": 1.408104658126831, + "learning_rate": 9.109500000000001e-05, + "loss": 0.468, + "step": 18229 + }, + { + "epoch": 1.0208310001119947, + "grad_norm": 1.2827550172805786, + "learning_rate": 9.11e-05, + "loss": 0.5104, + "step": 18230 + }, + { + "epoch": 1.0208869974241237, + "grad_norm": 1.292176604270935, + "learning_rate": 9.1105e-05, + "loss": 0.4796, + "step": 18231 + }, + { + "epoch": 1.0209429947362527, + "grad_norm": 1.4443269968032837, + "learning_rate": 9.111e-05, + "loss": 0.3355, + "step": 18232 + }, + { + "epoch": 1.0209989920483817, + "grad_norm": 1.3418875932693481, + "learning_rate": 9.111500000000001e-05, + "loss": 0.5003, + "step": 18233 + }, + { + "epoch": 1.0210549893605108, + "grad_norm": 1.4856966733932495, + "learning_rate": 9.112000000000001e-05, + "loss": 0.3811, + "step": 18234 + }, + { + "epoch": 1.0211109866726398, + "grad_norm": 1.056575059890747, + "learning_rate": 9.1125e-05, + "loss": 0.2906, + "step": 18235 + }, + { + "epoch": 1.0211669839847688, + "grad_norm": 1.321616291999817, + "learning_rate": 9.113e-05, + "loss": 0.3858, + "step": 18236 + }, + { + "epoch": 1.0212229812968978, + "grad_norm": 1.1759700775146484, + "learning_rate": 9.1135e-05, + "loss": 0.3259, + "step": 18237 + }, 
+ { + "epoch": 1.0212789786090268, + "grad_norm": 1.299067735671997, + "learning_rate": 9.114e-05, + "loss": 0.4186, + "step": 18238 + }, + { + "epoch": 1.0213349759211559, + "grad_norm": 1.4101417064666748, + "learning_rate": 9.114500000000001e-05, + "loss": 0.4097, + "step": 18239 + }, + { + "epoch": 1.0213909732332849, + "grad_norm": 1.4865729808807373, + "learning_rate": 9.115e-05, + "loss": 0.4895, + "step": 18240 + }, + { + "epoch": 1.021446970545414, + "grad_norm": 1.4383394718170166, + "learning_rate": 9.1155e-05, + "loss": 0.4237, + "step": 18241 + }, + { + "epoch": 1.021502967857543, + "grad_norm": 1.3762346506118774, + "learning_rate": 9.116e-05, + "loss": 0.4098, + "step": 18242 + }, + { + "epoch": 1.021558965169672, + "grad_norm": 1.2840474843978882, + "learning_rate": 9.1165e-05, + "loss": 0.396, + "step": 18243 + }, + { + "epoch": 1.021614962481801, + "grad_norm": 1.7581106424331665, + "learning_rate": 9.117e-05, + "loss": 0.3805, + "step": 18244 + }, + { + "epoch": 1.02167095979393, + "grad_norm": 1.226104497909546, + "learning_rate": 9.1175e-05, + "loss": 0.3699, + "step": 18245 + }, + { + "epoch": 1.021726957106059, + "grad_norm": 1.236308217048645, + "learning_rate": 9.118e-05, + "loss": 0.4676, + "step": 18246 + }, + { + "epoch": 1.021782954418188, + "grad_norm": 1.1668379306793213, + "learning_rate": 9.118500000000002e-05, + "loss": 0.3804, + "step": 18247 + }, + { + "epoch": 1.021838951730317, + "grad_norm": 1.8728046417236328, + "learning_rate": 9.119000000000001e-05, + "loss": 0.5154, + "step": 18248 + }, + { + "epoch": 1.021894949042446, + "grad_norm": 1.1699916124343872, + "learning_rate": 9.119500000000001e-05, + "loss": 0.4142, + "step": 18249 + }, + { + "epoch": 1.021950946354575, + "grad_norm": 1.4368023872375488, + "learning_rate": 9.120000000000001e-05, + "loss": 0.5335, + "step": 18250 + }, + { + "epoch": 1.022006943666704, + "grad_norm": 1.3821005821228027, + "learning_rate": 9.1205e-05, + "loss": 0.417, + "step": 18251 + }, + { + 
"epoch": 1.022062940978833, + "grad_norm": 1.9109565019607544, + "learning_rate": 9.121e-05, + "loss": 0.5542, + "step": 18252 + }, + { + "epoch": 1.0221189382909621, + "grad_norm": 1.257783055305481, + "learning_rate": 9.1215e-05, + "loss": 0.3066, + "step": 18253 + }, + { + "epoch": 1.0221749356030911, + "grad_norm": 1.4408705234527588, + "learning_rate": 9.122000000000001e-05, + "loss": 0.4391, + "step": 18254 + }, + { + "epoch": 1.0222309329152202, + "grad_norm": 1.3678799867630005, + "learning_rate": 9.122500000000001e-05, + "loss": 0.3597, + "step": 18255 + }, + { + "epoch": 1.0222869302273492, + "grad_norm": 1.3051496744155884, + "learning_rate": 9.123e-05, + "loss": 0.4144, + "step": 18256 + }, + { + "epoch": 1.0223429275394782, + "grad_norm": 1.2545008659362793, + "learning_rate": 9.1235e-05, + "loss": 0.4483, + "step": 18257 + }, + { + "epoch": 1.0223989248516072, + "grad_norm": 1.4460846185684204, + "learning_rate": 9.124e-05, + "loss": 0.3909, + "step": 18258 + }, + { + "epoch": 1.0224549221637362, + "grad_norm": 1.2341158390045166, + "learning_rate": 9.1245e-05, + "loss": 0.387, + "step": 18259 + }, + { + "epoch": 1.0225109194758653, + "grad_norm": 1.3547271490097046, + "learning_rate": 9.125e-05, + "loss": 0.4874, + "step": 18260 + }, + { + "epoch": 1.0225669167879943, + "grad_norm": 1.2583049535751343, + "learning_rate": 9.1255e-05, + "loss": 0.3336, + "step": 18261 + }, + { + "epoch": 1.0226229141001233, + "grad_norm": 1.4579052925109863, + "learning_rate": 9.126e-05, + "loss": 0.4597, + "step": 18262 + }, + { + "epoch": 1.0226789114122523, + "grad_norm": 1.7459715604782104, + "learning_rate": 9.1265e-05, + "loss": 0.4076, + "step": 18263 + }, + { + "epoch": 1.0227349087243813, + "grad_norm": 1.2544258832931519, + "learning_rate": 9.127e-05, + "loss": 0.338, + "step": 18264 + }, + { + "epoch": 1.0227909060365103, + "grad_norm": 2.3433997631073, + "learning_rate": 9.1275e-05, + "loss": 0.5613, + "step": 18265 + }, + { + "epoch": 1.0228469033486394, + 
"grad_norm": 1.5074681043624878, + "learning_rate": 9.128e-05, + "loss": 0.4918, + "step": 18266 + }, + { + "epoch": 1.0229029006607684, + "grad_norm": 1.2791783809661865, + "learning_rate": 9.1285e-05, + "loss": 0.4341, + "step": 18267 + }, + { + "epoch": 1.0229588979728974, + "grad_norm": 1.2252360582351685, + "learning_rate": 9.129000000000002e-05, + "loss": 0.3463, + "step": 18268 + }, + { + "epoch": 1.0230148952850264, + "grad_norm": 1.2673494815826416, + "learning_rate": 9.129500000000001e-05, + "loss": 0.3853, + "step": 18269 + }, + { + "epoch": 1.0230708925971554, + "grad_norm": 1.4124587774276733, + "learning_rate": 9.130000000000001e-05, + "loss": 0.3272, + "step": 18270 + }, + { + "epoch": 1.0231268899092845, + "grad_norm": 1.2844294309616089, + "learning_rate": 9.130500000000001e-05, + "loss": 0.3963, + "step": 18271 + }, + { + "epoch": 1.0231828872214135, + "grad_norm": 1.359400749206543, + "learning_rate": 9.131e-05, + "loss": 0.5973, + "step": 18272 + }, + { + "epoch": 1.0232388845335423, + "grad_norm": 1.4733316898345947, + "learning_rate": 9.1315e-05, + "loss": 0.5078, + "step": 18273 + }, + { + "epoch": 1.0232948818456713, + "grad_norm": 1.5604074001312256, + "learning_rate": 9.132e-05, + "loss": 0.6136, + "step": 18274 + }, + { + "epoch": 1.0233508791578003, + "grad_norm": 1.272163987159729, + "learning_rate": 9.132500000000001e-05, + "loss": 0.3756, + "step": 18275 + }, + { + "epoch": 1.0234068764699293, + "grad_norm": 1.4646979570388794, + "learning_rate": 9.133000000000001e-05, + "loss": 0.5123, + "step": 18276 + }, + { + "epoch": 1.0234628737820584, + "grad_norm": 1.6235682964324951, + "learning_rate": 9.1335e-05, + "loss": 0.4594, + "step": 18277 + }, + { + "epoch": 1.0235188710941874, + "grad_norm": 1.4288482666015625, + "learning_rate": 9.134e-05, + "loss": 0.4221, + "step": 18278 + }, + { + "epoch": 1.0235748684063164, + "grad_norm": 1.1828737258911133, + "learning_rate": 9.1345e-05, + "loss": 0.3298, + "step": 18279 + }, + { + "epoch": 
1.0236308657184454, + "grad_norm": 1.3354418277740479, + "learning_rate": 9.135e-05, + "loss": 0.4301, + "step": 18280 + }, + { + "epoch": 1.0236868630305744, + "grad_norm": 1.3682681322097778, + "learning_rate": 9.1355e-05, + "loss": 0.5212, + "step": 18281 + }, + { + "epoch": 1.0237428603427035, + "grad_norm": 1.308532953262329, + "learning_rate": 9.136e-05, + "loss": 0.4403, + "step": 18282 + }, + { + "epoch": 1.0237988576548325, + "grad_norm": 1.3649508953094482, + "learning_rate": 9.1365e-05, + "loss": 0.3935, + "step": 18283 + }, + { + "epoch": 1.0238548549669615, + "grad_norm": 1.3734993934631348, + "learning_rate": 9.137e-05, + "loss": 0.4426, + "step": 18284 + }, + { + "epoch": 1.0239108522790905, + "grad_norm": 1.473650336265564, + "learning_rate": 9.1375e-05, + "loss": 0.5127, + "step": 18285 + }, + { + "epoch": 1.0239668495912195, + "grad_norm": 1.4458955526351929, + "learning_rate": 9.138e-05, + "loss": 0.4166, + "step": 18286 + }, + { + "epoch": 1.0240228469033485, + "grad_norm": 1.4608911275863647, + "learning_rate": 9.138500000000001e-05, + "loss": 0.6066, + "step": 18287 + }, + { + "epoch": 1.0240788442154776, + "grad_norm": 1.498123049736023, + "learning_rate": 9.139e-05, + "loss": 0.5515, + "step": 18288 + }, + { + "epoch": 1.0241348415276066, + "grad_norm": 1.2164201736450195, + "learning_rate": 9.1395e-05, + "loss": 0.4468, + "step": 18289 + }, + { + "epoch": 1.0241908388397356, + "grad_norm": 1.4614959955215454, + "learning_rate": 9.140000000000001e-05, + "loss": 0.4695, + "step": 18290 + }, + { + "epoch": 1.0242468361518646, + "grad_norm": 1.2819448709487915, + "learning_rate": 9.140500000000001e-05, + "loss": 0.302, + "step": 18291 + }, + { + "epoch": 1.0243028334639936, + "grad_norm": 1.484433650970459, + "learning_rate": 9.141000000000001e-05, + "loss": 0.418, + "step": 18292 + }, + { + "epoch": 1.0243588307761227, + "grad_norm": 1.1295392513275146, + "learning_rate": 9.1415e-05, + "loss": 0.377, + "step": 18293 + }, + { + "epoch": 
1.0244148280882517, + "grad_norm": 1.4290921688079834, + "learning_rate": 9.142e-05, + "loss": 0.4408, + "step": 18294 + }, + { + "epoch": 1.0244708254003807, + "grad_norm": 1.4190647602081299, + "learning_rate": 9.1425e-05, + "loss": 0.4337, + "step": 18295 + }, + { + "epoch": 1.0245268227125097, + "grad_norm": 1.1803679466247559, + "learning_rate": 9.143000000000001e-05, + "loss": 0.4562, + "step": 18296 + }, + { + "epoch": 1.0245828200246387, + "grad_norm": 1.1149933338165283, + "learning_rate": 9.143500000000001e-05, + "loss": 0.3951, + "step": 18297 + }, + { + "epoch": 1.0246388173367678, + "grad_norm": 1.1982587575912476, + "learning_rate": 9.144e-05, + "loss": 0.3147, + "step": 18298 + }, + { + "epoch": 1.0246948146488968, + "grad_norm": 1.5093990564346313, + "learning_rate": 9.1445e-05, + "loss": 0.65, + "step": 18299 + }, + { + "epoch": 1.0247508119610258, + "grad_norm": 1.3103488683700562, + "learning_rate": 9.145e-05, + "loss": 0.3724, + "step": 18300 + }, + { + "epoch": 1.0248068092731548, + "grad_norm": 1.3719797134399414, + "learning_rate": 9.1455e-05, + "loss": 0.3564, + "step": 18301 + }, + { + "epoch": 1.0248628065852838, + "grad_norm": 1.3001863956451416, + "learning_rate": 9.146e-05, + "loss": 0.5071, + "step": 18302 + }, + { + "epoch": 1.0249188038974129, + "grad_norm": 1.3157297372817993, + "learning_rate": 9.1465e-05, + "loss": 0.4178, + "step": 18303 + }, + { + "epoch": 1.0249748012095419, + "grad_norm": 1.3867977857589722, + "learning_rate": 9.147e-05, + "loss": 0.4625, + "step": 18304 + }, + { + "epoch": 1.025030798521671, + "grad_norm": 1.1817615032196045, + "learning_rate": 9.1475e-05, + "loss": 0.397, + "step": 18305 + }, + { + "epoch": 1.0250867958338, + "grad_norm": 1.3469206094741821, + "learning_rate": 9.148e-05, + "loss": 0.4254, + "step": 18306 + }, + { + "epoch": 1.025142793145929, + "grad_norm": 1.2162150144577026, + "learning_rate": 9.148500000000001e-05, + "loss": 0.4049, + "step": 18307 + }, + { + "epoch": 1.025198790458058, + 
"grad_norm": 1.859230399131775, + "learning_rate": 9.149e-05, + "loss": 0.6715, + "step": 18308 + }, + { + "epoch": 1.025254787770187, + "grad_norm": 1.5233372449874878, + "learning_rate": 9.1495e-05, + "loss": 0.5154, + "step": 18309 + }, + { + "epoch": 1.025310785082316, + "grad_norm": 1.410709023475647, + "learning_rate": 9.15e-05, + "loss": 0.4479, + "step": 18310 + }, + { + "epoch": 1.025366782394445, + "grad_norm": 1.3257741928100586, + "learning_rate": 9.150500000000001e-05, + "loss": 0.5305, + "step": 18311 + }, + { + "epoch": 1.025422779706574, + "grad_norm": 1.2185670137405396, + "learning_rate": 9.151000000000001e-05, + "loss": 0.4023, + "step": 18312 + }, + { + "epoch": 1.025478777018703, + "grad_norm": 1.3654381036758423, + "learning_rate": 9.151500000000001e-05, + "loss": 0.4937, + "step": 18313 + }, + { + "epoch": 1.025534774330832, + "grad_norm": 1.2604769468307495, + "learning_rate": 9.152e-05, + "loss": 0.3389, + "step": 18314 + }, + { + "epoch": 1.025590771642961, + "grad_norm": 1.3107566833496094, + "learning_rate": 9.1525e-05, + "loss": 0.399, + "step": 18315 + }, + { + "epoch": 1.02564676895509, + "grad_norm": 1.2344932556152344, + "learning_rate": 9.153e-05, + "loss": 0.388, + "step": 18316 + }, + { + "epoch": 1.0257027662672191, + "grad_norm": 1.4607726335525513, + "learning_rate": 9.153500000000001e-05, + "loss": 0.4296, + "step": 18317 + }, + { + "epoch": 1.0257587635793481, + "grad_norm": 1.532738208770752, + "learning_rate": 9.154000000000001e-05, + "loss": 0.3561, + "step": 18318 + }, + { + "epoch": 1.0258147608914772, + "grad_norm": 1.3724843263626099, + "learning_rate": 9.1545e-05, + "loss": 0.3672, + "step": 18319 + }, + { + "epoch": 1.0258707582036062, + "grad_norm": 1.2287768125534058, + "learning_rate": 9.155e-05, + "loss": 0.4009, + "step": 18320 + }, + { + "epoch": 1.0259267555157352, + "grad_norm": 1.1697520017623901, + "learning_rate": 9.1555e-05, + "loss": 0.3992, + "step": 18321 + }, + { + "epoch": 1.0259827528278642, + 
"grad_norm": 1.450138807296753, + "learning_rate": 9.156e-05, + "loss": 0.5989, + "step": 18322 + }, + { + "epoch": 1.0260387501399932, + "grad_norm": 1.4806028604507446, + "learning_rate": 9.1565e-05, + "loss": 0.4256, + "step": 18323 + }, + { + "epoch": 1.0260947474521223, + "grad_norm": 1.3607051372528076, + "learning_rate": 9.157e-05, + "loss": 0.4582, + "step": 18324 + }, + { + "epoch": 1.0261507447642513, + "grad_norm": 1.2938125133514404, + "learning_rate": 9.1575e-05, + "loss": 0.4895, + "step": 18325 + }, + { + "epoch": 1.0262067420763803, + "grad_norm": 1.1564923524856567, + "learning_rate": 9.158e-05, + "loss": 0.347, + "step": 18326 + }, + { + "epoch": 1.0262627393885093, + "grad_norm": 1.0445786714553833, + "learning_rate": 9.158500000000001e-05, + "loss": 0.3474, + "step": 18327 + }, + { + "epoch": 1.0263187367006383, + "grad_norm": 1.220381259918213, + "learning_rate": 9.159000000000001e-05, + "loss": 0.3853, + "step": 18328 + }, + { + "epoch": 1.0263747340127674, + "grad_norm": 1.3459187746047974, + "learning_rate": 9.1595e-05, + "loss": 0.3873, + "step": 18329 + }, + { + "epoch": 1.0264307313248964, + "grad_norm": 1.1996020078659058, + "learning_rate": 9.16e-05, + "loss": 0.3603, + "step": 18330 + }, + { + "epoch": 1.0264867286370254, + "grad_norm": 1.67203688621521, + "learning_rate": 9.1605e-05, + "loss": 0.5885, + "step": 18331 + }, + { + "epoch": 1.0265427259491544, + "grad_norm": 1.098457932472229, + "learning_rate": 9.161000000000001e-05, + "loss": 0.3171, + "step": 18332 + }, + { + "epoch": 1.0265987232612834, + "grad_norm": 1.4290902614593506, + "learning_rate": 9.161500000000001e-05, + "loss": 0.4719, + "step": 18333 + }, + { + "epoch": 1.0266547205734124, + "grad_norm": 1.4965934753417969, + "learning_rate": 9.162000000000001e-05, + "loss": 0.43, + "step": 18334 + }, + { + "epoch": 1.0267107178855415, + "grad_norm": 1.3025615215301514, + "learning_rate": 9.1625e-05, + "loss": 0.353, + "step": 18335 + }, + { + "epoch": 1.0267667151976705, 
+ "grad_norm": 1.5858601331710815, + "learning_rate": 9.163e-05, + "loss": 0.5798, + "step": 18336 + }, + { + "epoch": 1.0268227125097995, + "grad_norm": 1.6514794826507568, + "learning_rate": 9.1635e-05, + "loss": 0.5493, + "step": 18337 + }, + { + "epoch": 1.0268787098219285, + "grad_norm": 1.5772913694381714, + "learning_rate": 9.164000000000001e-05, + "loss": 0.4204, + "step": 18338 + }, + { + "epoch": 1.0269347071340575, + "grad_norm": 1.5542999505996704, + "learning_rate": 9.164500000000001e-05, + "loss": 0.4284, + "step": 18339 + }, + { + "epoch": 1.0269907044461866, + "grad_norm": 1.5654528141021729, + "learning_rate": 9.165e-05, + "loss": 0.3435, + "step": 18340 + }, + { + "epoch": 1.0270467017583156, + "grad_norm": 1.1904045343399048, + "learning_rate": 9.1655e-05, + "loss": 0.4665, + "step": 18341 + }, + { + "epoch": 1.0271026990704446, + "grad_norm": 1.2764575481414795, + "learning_rate": 9.166e-05, + "loss": 0.3937, + "step": 18342 + }, + { + "epoch": 1.0271586963825736, + "grad_norm": 1.2397255897521973, + "learning_rate": 9.1665e-05, + "loss": 0.4035, + "step": 18343 + }, + { + "epoch": 1.0272146936947026, + "grad_norm": 1.5256158113479614, + "learning_rate": 9.167e-05, + "loss": 0.4795, + "step": 18344 + }, + { + "epoch": 1.0272706910068317, + "grad_norm": 1.3618723154067993, + "learning_rate": 9.1675e-05, + "loss": 0.5174, + "step": 18345 + }, + { + "epoch": 1.0273266883189607, + "grad_norm": 1.2115530967712402, + "learning_rate": 9.168e-05, + "loss": 0.338, + "step": 18346 + }, + { + "epoch": 1.0273826856310897, + "grad_norm": 1.500746250152588, + "learning_rate": 9.168500000000001e-05, + "loss": 0.4163, + "step": 18347 + }, + { + "epoch": 1.0274386829432187, + "grad_norm": 1.482271432876587, + "learning_rate": 9.169000000000001e-05, + "loss": 0.4528, + "step": 18348 + }, + { + "epoch": 1.0274946802553477, + "grad_norm": 1.9786421060562134, + "learning_rate": 9.169500000000001e-05, + "loss": 0.451, + "step": 18349 + }, + { + "epoch": 
1.0275506775674768, + "grad_norm": 1.3467090129852295, + "learning_rate": 9.17e-05, + "loss": 0.3977, + "step": 18350 + }, + { + "epoch": 1.0276066748796058, + "grad_norm": 1.366949439048767, + "learning_rate": 9.1705e-05, + "loss": 0.5047, + "step": 18351 + }, + { + "epoch": 1.0276626721917348, + "grad_norm": 1.2228416204452515, + "learning_rate": 9.171e-05, + "loss": 0.3472, + "step": 18352 + }, + { + "epoch": 1.0277186695038638, + "grad_norm": 1.6578097343444824, + "learning_rate": 9.171500000000001e-05, + "loss": 0.4835, + "step": 18353 + }, + { + "epoch": 1.0277746668159928, + "grad_norm": 1.2654086351394653, + "learning_rate": 9.172000000000001e-05, + "loss": 0.3836, + "step": 18354 + }, + { + "epoch": 1.0278306641281219, + "grad_norm": 1.4109214544296265, + "learning_rate": 9.172500000000001e-05, + "loss": 0.487, + "step": 18355 + }, + { + "epoch": 1.0278866614402509, + "grad_norm": 1.3260384798049927, + "learning_rate": 9.173e-05, + "loss": 0.4185, + "step": 18356 + }, + { + "epoch": 1.02794265875238, + "grad_norm": 1.336680293083191, + "learning_rate": 9.1735e-05, + "loss": 0.3993, + "step": 18357 + }, + { + "epoch": 1.027998656064509, + "grad_norm": 1.0807775259017944, + "learning_rate": 9.174e-05, + "loss": 0.3301, + "step": 18358 + }, + { + "epoch": 1.028054653376638, + "grad_norm": 1.4308130741119385, + "learning_rate": 9.1745e-05, + "loss": 0.5478, + "step": 18359 + }, + { + "epoch": 1.028110650688767, + "grad_norm": 1.6996850967407227, + "learning_rate": 9.175000000000001e-05, + "loss": 0.5778, + "step": 18360 + }, + { + "epoch": 1.028166648000896, + "grad_norm": 1.3647295236587524, + "learning_rate": 9.1755e-05, + "loss": 0.4445, + "step": 18361 + }, + { + "epoch": 1.028222645313025, + "grad_norm": 1.6652171611785889, + "learning_rate": 9.176e-05, + "loss": 0.556, + "step": 18362 + }, + { + "epoch": 1.028278642625154, + "grad_norm": 1.6018441915512085, + "learning_rate": 9.1765e-05, + "loss": 0.5107, + "step": 18363 + }, + { + "epoch": 
1.028334639937283, + "grad_norm": 1.3393938541412354, + "learning_rate": 9.177e-05, + "loss": 0.4032, + "step": 18364 + }, + { + "epoch": 1.028390637249412, + "grad_norm": 1.1796034574508667, + "learning_rate": 9.1775e-05, + "loss": 0.4536, + "step": 18365 + }, + { + "epoch": 1.028446634561541, + "grad_norm": 1.275696039199829, + "learning_rate": 9.178e-05, + "loss": 0.4154, + "step": 18366 + }, + { + "epoch": 1.02850263187367, + "grad_norm": 3.7940006256103516, + "learning_rate": 9.178500000000002e-05, + "loss": 0.5143, + "step": 18367 + }, + { + "epoch": 1.028558629185799, + "grad_norm": 1.4529520273208618, + "learning_rate": 9.179000000000001e-05, + "loss": 0.5135, + "step": 18368 + }, + { + "epoch": 1.0286146264979281, + "grad_norm": 1.36226487159729, + "learning_rate": 9.179500000000001e-05, + "loss": 0.4201, + "step": 18369 + }, + { + "epoch": 1.0286706238100571, + "grad_norm": 1.2332403659820557, + "learning_rate": 9.180000000000001e-05, + "loss": 0.3876, + "step": 18370 + }, + { + "epoch": 1.0287266211221862, + "grad_norm": 1.2992181777954102, + "learning_rate": 9.1805e-05, + "loss": 0.6808, + "step": 18371 + }, + { + "epoch": 1.0287826184343152, + "grad_norm": 1.4992225170135498, + "learning_rate": 9.181e-05, + "loss": 0.3968, + "step": 18372 + }, + { + "epoch": 1.0288386157464442, + "grad_norm": 1.1045231819152832, + "learning_rate": 9.1815e-05, + "loss": 0.3763, + "step": 18373 + }, + { + "epoch": 1.0288946130585732, + "grad_norm": 1.3332529067993164, + "learning_rate": 9.182000000000001e-05, + "loss": 0.3534, + "step": 18374 + }, + { + "epoch": 1.0289506103707022, + "grad_norm": 1.446587085723877, + "learning_rate": 9.182500000000001e-05, + "loss": 0.3957, + "step": 18375 + }, + { + "epoch": 1.0290066076828313, + "grad_norm": 1.5129408836364746, + "learning_rate": 9.183000000000001e-05, + "loss": 0.5899, + "step": 18376 + }, + { + "epoch": 1.0290626049949603, + "grad_norm": 1.2569142580032349, + "learning_rate": 9.1835e-05, + "loss": 0.3884, + "step": 
18377 + }, + { + "epoch": 1.0291186023070893, + "grad_norm": 1.3488404750823975, + "learning_rate": 9.184e-05, + "loss": 0.4009, + "step": 18378 + }, + { + "epoch": 1.0291745996192183, + "grad_norm": 1.46926748752594, + "learning_rate": 9.1845e-05, + "loss": 0.4344, + "step": 18379 + }, + { + "epoch": 1.0292305969313473, + "grad_norm": 1.3317769765853882, + "learning_rate": 9.185e-05, + "loss": 0.4538, + "step": 18380 + }, + { + "epoch": 1.0292865942434763, + "grad_norm": 1.6697031259536743, + "learning_rate": 9.185500000000001e-05, + "loss": 0.3738, + "step": 18381 + }, + { + "epoch": 1.0293425915556054, + "grad_norm": 1.295625925064087, + "learning_rate": 9.186e-05, + "loss": 0.5232, + "step": 18382 + }, + { + "epoch": 1.0293985888677344, + "grad_norm": 1.269540786743164, + "learning_rate": 9.1865e-05, + "loss": 0.4541, + "step": 18383 + }, + { + "epoch": 1.0294545861798634, + "grad_norm": 1.8306798934936523, + "learning_rate": 9.187e-05, + "loss": 0.3955, + "step": 18384 + }, + { + "epoch": 1.0295105834919924, + "grad_norm": 1.1614495515823364, + "learning_rate": 9.1875e-05, + "loss": 0.3892, + "step": 18385 + }, + { + "epoch": 1.0295665808041214, + "grad_norm": 1.5643037557601929, + "learning_rate": 9.188e-05, + "loss": 0.398, + "step": 18386 + }, + { + "epoch": 1.0296225781162505, + "grad_norm": 1.1431219577789307, + "learning_rate": 9.1885e-05, + "loss": 0.4726, + "step": 18387 + }, + { + "epoch": 1.0296785754283795, + "grad_norm": 1.1050772666931152, + "learning_rate": 9.189e-05, + "loss": 0.3813, + "step": 18388 + }, + { + "epoch": 1.0297345727405085, + "grad_norm": 3.193002223968506, + "learning_rate": 9.189500000000001e-05, + "loss": 0.524, + "step": 18389 + }, + { + "epoch": 1.0297905700526375, + "grad_norm": 1.2781938314437866, + "learning_rate": 9.190000000000001e-05, + "loss": 0.3412, + "step": 18390 + }, + { + "epoch": 1.0298465673647665, + "grad_norm": 1.3508274555206299, + "learning_rate": 9.190500000000001e-05, + "loss": 0.5309, + "step": 18391 + 
}, + { + "epoch": 1.0299025646768956, + "grad_norm": 1.6212399005889893, + "learning_rate": 9.191e-05, + "loss": 0.5822, + "step": 18392 + }, + { + "epoch": 1.0299585619890246, + "grad_norm": 1.4462803602218628, + "learning_rate": 9.1915e-05, + "loss": 0.2874, + "step": 18393 + }, + { + "epoch": 1.0300145593011536, + "grad_norm": 1.1816555261611938, + "learning_rate": 9.192e-05, + "loss": 0.4075, + "step": 18394 + }, + { + "epoch": 1.0300705566132826, + "grad_norm": 1.5166535377502441, + "learning_rate": 9.192500000000001e-05, + "loss": 0.389, + "step": 18395 + }, + { + "epoch": 1.0301265539254116, + "grad_norm": 1.4264452457427979, + "learning_rate": 9.193000000000001e-05, + "loss": 0.4658, + "step": 18396 + }, + { + "epoch": 1.0301825512375407, + "grad_norm": 1.33519446849823, + "learning_rate": 9.1935e-05, + "loss": 0.5087, + "step": 18397 + }, + { + "epoch": 1.0302385485496697, + "grad_norm": 1.2397849559783936, + "learning_rate": 9.194e-05, + "loss": 0.4526, + "step": 18398 + }, + { + "epoch": 1.0302945458617987, + "grad_norm": 1.933770775794983, + "learning_rate": 9.1945e-05, + "loss": 0.3647, + "step": 18399 + }, + { + "epoch": 1.0303505431739277, + "grad_norm": 1.5117770433425903, + "learning_rate": 9.195e-05, + "loss": 0.4367, + "step": 18400 + }, + { + "epoch": 1.0304065404860567, + "grad_norm": 1.671377182006836, + "learning_rate": 9.1955e-05, + "loss": 0.3343, + "step": 18401 + }, + { + "epoch": 1.0304625377981858, + "grad_norm": 1.203366756439209, + "learning_rate": 9.196000000000001e-05, + "loss": 0.3819, + "step": 18402 + }, + { + "epoch": 1.0305185351103148, + "grad_norm": 1.5717040300369263, + "learning_rate": 9.1965e-05, + "loss": 0.4543, + "step": 18403 + }, + { + "epoch": 1.0305745324224438, + "grad_norm": 1.1201183795928955, + "learning_rate": 9.197e-05, + "loss": 0.2794, + "step": 18404 + }, + { + "epoch": 1.0306305297345728, + "grad_norm": 1.88822603225708, + "learning_rate": 9.1975e-05, + "loss": 0.4396, + "step": 18405 + }, + { + "epoch": 
1.0306865270467018, + "grad_norm": 1.2081702947616577, + "learning_rate": 9.198e-05, + "loss": 0.3072, + "step": 18406 + }, + { + "epoch": 1.0307425243588308, + "grad_norm": 1.5484460592269897, + "learning_rate": 9.1985e-05, + "loss": 0.5021, + "step": 18407 + }, + { + "epoch": 1.0307985216709599, + "grad_norm": 1.197572946548462, + "learning_rate": 9.199e-05, + "loss": 0.3615, + "step": 18408 + }, + { + "epoch": 1.0308545189830889, + "grad_norm": 1.2413281202316284, + "learning_rate": 9.1995e-05, + "loss": 0.4234, + "step": 18409 + }, + { + "epoch": 1.030910516295218, + "grad_norm": 1.3042575120925903, + "learning_rate": 9.200000000000001e-05, + "loss": 0.4667, + "step": 18410 + }, + { + "epoch": 1.030966513607347, + "grad_norm": 1.9163047075271606, + "learning_rate": 9.200500000000001e-05, + "loss": 0.2956, + "step": 18411 + }, + { + "epoch": 1.031022510919476, + "grad_norm": 1.655187726020813, + "learning_rate": 9.201000000000001e-05, + "loss": 0.3838, + "step": 18412 + }, + { + "epoch": 1.031078508231605, + "grad_norm": 1.4335438013076782, + "learning_rate": 9.2015e-05, + "loss": 0.4645, + "step": 18413 + }, + { + "epoch": 1.031134505543734, + "grad_norm": 1.4914640188217163, + "learning_rate": 9.202e-05, + "loss": 0.403, + "step": 18414 + }, + { + "epoch": 1.031190502855863, + "grad_norm": 1.4218559265136719, + "learning_rate": 9.2025e-05, + "loss": 0.3799, + "step": 18415 + }, + { + "epoch": 1.031246500167992, + "grad_norm": 1.4433684349060059, + "learning_rate": 9.203000000000001e-05, + "loss": 0.4391, + "step": 18416 + }, + { + "epoch": 1.031302497480121, + "grad_norm": 1.1900036334991455, + "learning_rate": 9.203500000000001e-05, + "loss": 0.5269, + "step": 18417 + }, + { + "epoch": 1.03135849479225, + "grad_norm": 1.30314040184021, + "learning_rate": 9.204e-05, + "loss": 0.3646, + "step": 18418 + }, + { + "epoch": 1.031414492104379, + "grad_norm": 1.6013752222061157, + "learning_rate": 9.2045e-05, + "loss": 0.3689, + "step": 18419 + }, + { + "epoch": 
1.031470489416508, + "grad_norm": 1.0355218648910522, + "learning_rate": 9.205e-05, + "loss": 0.3397, + "step": 18420 + }, + { + "epoch": 1.0315264867286371, + "grad_norm": 1.4996814727783203, + "learning_rate": 9.2055e-05, + "loss": 0.3549, + "step": 18421 + }, + { + "epoch": 1.0315824840407661, + "grad_norm": 1.2774845361709595, + "learning_rate": 9.206e-05, + "loss": 0.4775, + "step": 18422 + }, + { + "epoch": 1.0316384813528952, + "grad_norm": 1.398349642753601, + "learning_rate": 9.206500000000001e-05, + "loss": 0.3908, + "step": 18423 + }, + { + "epoch": 1.0316944786650242, + "grad_norm": 1.645747423171997, + "learning_rate": 9.207e-05, + "loss": 0.5009, + "step": 18424 + }, + { + "epoch": 1.0317504759771532, + "grad_norm": 1.55572509765625, + "learning_rate": 9.2075e-05, + "loss": 0.5306, + "step": 18425 + }, + { + "epoch": 1.0318064732892822, + "grad_norm": 1.4952770471572876, + "learning_rate": 9.208e-05, + "loss": 0.4225, + "step": 18426 + }, + { + "epoch": 1.0318624706014112, + "grad_norm": 1.4356719255447388, + "learning_rate": 9.2085e-05, + "loss": 0.5911, + "step": 18427 + }, + { + "epoch": 1.0319184679135402, + "grad_norm": 1.3004816770553589, + "learning_rate": 9.209000000000001e-05, + "loss": 0.4836, + "step": 18428 + }, + { + "epoch": 1.0319744652256693, + "grad_norm": 1.3479303121566772, + "learning_rate": 9.2095e-05, + "loss": 0.4258, + "step": 18429 + }, + { + "epoch": 1.0320304625377983, + "grad_norm": 1.1808431148529053, + "learning_rate": 9.21e-05, + "loss": 0.4005, + "step": 18430 + }, + { + "epoch": 1.0320864598499273, + "grad_norm": 1.135550618171692, + "learning_rate": 9.210500000000001e-05, + "loss": 0.3771, + "step": 18431 + }, + { + "epoch": 1.0321424571620563, + "grad_norm": 1.316703200340271, + "learning_rate": 9.211000000000001e-05, + "loss": 0.4245, + "step": 18432 + }, + { + "epoch": 1.0321984544741853, + "grad_norm": 1.4228503704071045, + "learning_rate": 9.211500000000001e-05, + "loss": 0.4636, + "step": 18433 + }, + { + 
"epoch": 1.0322544517863144, + "grad_norm": 1.084437370300293, + "learning_rate": 9.212e-05, + "loss": 0.3343, + "step": 18434 + }, + { + "epoch": 1.0323104490984432, + "grad_norm": 1.2737746238708496, + "learning_rate": 9.2125e-05, + "loss": 0.4865, + "step": 18435 + }, + { + "epoch": 1.0323664464105722, + "grad_norm": 1.5287773609161377, + "learning_rate": 9.213e-05, + "loss": 0.5212, + "step": 18436 + }, + { + "epoch": 1.0324224437227012, + "grad_norm": 1.218571662902832, + "learning_rate": 9.2135e-05, + "loss": 0.3727, + "step": 18437 + }, + { + "epoch": 1.0324784410348302, + "grad_norm": 1.2497469186782837, + "learning_rate": 9.214000000000001e-05, + "loss": 0.4716, + "step": 18438 + }, + { + "epoch": 1.0325344383469592, + "grad_norm": 1.2556761503219604, + "learning_rate": 9.2145e-05, + "loss": 0.5003, + "step": 18439 + }, + { + "epoch": 1.0325904356590883, + "grad_norm": 1.5302544832229614, + "learning_rate": 9.215e-05, + "loss": 0.353, + "step": 18440 + }, + { + "epoch": 1.0326464329712173, + "grad_norm": 1.4618993997573853, + "learning_rate": 9.2155e-05, + "loss": 0.5285, + "step": 18441 + }, + { + "epoch": 1.0327024302833463, + "grad_norm": 1.3802485466003418, + "learning_rate": 9.216e-05, + "loss": 0.4933, + "step": 18442 + }, + { + "epoch": 1.0327584275954753, + "grad_norm": 1.2149531841278076, + "learning_rate": 9.2165e-05, + "loss": 0.4026, + "step": 18443 + }, + { + "epoch": 1.0328144249076043, + "grad_norm": 1.3150397539138794, + "learning_rate": 9.217000000000001e-05, + "loss": 0.5103, + "step": 18444 + }, + { + "epoch": 1.0328704222197334, + "grad_norm": 1.1010187864303589, + "learning_rate": 9.2175e-05, + "loss": 0.4004, + "step": 18445 + }, + { + "epoch": 1.0329264195318624, + "grad_norm": 1.3528146743774414, + "learning_rate": 9.218e-05, + "loss": 0.4566, + "step": 18446 + }, + { + "epoch": 1.0329824168439914, + "grad_norm": 1.2506998777389526, + "learning_rate": 9.2185e-05, + "loss": 0.378, + "step": 18447 + }, + { + "epoch": 
1.0330384141561204, + "grad_norm": 1.246854543685913, + "learning_rate": 9.219000000000001e-05, + "loss": 0.351, + "step": 18448 + }, + { + "epoch": 1.0330944114682494, + "grad_norm": 1.083481788635254, + "learning_rate": 9.219500000000001e-05, + "loss": 0.3952, + "step": 18449 + }, + { + "epoch": 1.0331504087803784, + "grad_norm": 1.1672245264053345, + "learning_rate": 9.22e-05, + "loss": 0.3209, + "step": 18450 + }, + { + "epoch": 1.0332064060925075, + "grad_norm": 1.3521337509155273, + "learning_rate": 9.2205e-05, + "loss": 0.4215, + "step": 18451 + }, + { + "epoch": 1.0332624034046365, + "grad_norm": 2.070510149002075, + "learning_rate": 9.221000000000001e-05, + "loss": 0.4247, + "step": 18452 + }, + { + "epoch": 1.0333184007167655, + "grad_norm": 1.2192219495773315, + "learning_rate": 9.221500000000001e-05, + "loss": 0.4238, + "step": 18453 + }, + { + "epoch": 1.0333743980288945, + "grad_norm": 1.336152195930481, + "learning_rate": 9.222000000000001e-05, + "loss": 0.3947, + "step": 18454 + }, + { + "epoch": 1.0334303953410235, + "grad_norm": 1.402063250541687, + "learning_rate": 9.2225e-05, + "loss": 0.4521, + "step": 18455 + }, + { + "epoch": 1.0334863926531526, + "grad_norm": 1.3092293739318848, + "learning_rate": 9.223e-05, + "loss": 0.4352, + "step": 18456 + }, + { + "epoch": 1.0335423899652816, + "grad_norm": 1.3286820650100708, + "learning_rate": 9.2235e-05, + "loss": 0.4113, + "step": 18457 + }, + { + "epoch": 1.0335983872774106, + "grad_norm": 2.4885976314544678, + "learning_rate": 9.224e-05, + "loss": 0.3486, + "step": 18458 + }, + { + "epoch": 1.0336543845895396, + "grad_norm": 1.1975092887878418, + "learning_rate": 9.224500000000001e-05, + "loss": 0.3241, + "step": 18459 + }, + { + "epoch": 1.0337103819016686, + "grad_norm": 1.226528525352478, + "learning_rate": 9.225e-05, + "loss": 0.4842, + "step": 18460 + }, + { + "epoch": 1.0337663792137977, + "grad_norm": 1.5238783359527588, + "learning_rate": 9.2255e-05, + "loss": 0.4655, + "step": 18461 + }, 
+ { + "epoch": 1.0338223765259267, + "grad_norm": 1.4806811809539795, + "learning_rate": 9.226e-05, + "loss": 0.4675, + "step": 18462 + }, + { + "epoch": 1.0338783738380557, + "grad_norm": 1.292662501335144, + "learning_rate": 9.2265e-05, + "loss": 0.369, + "step": 18463 + }, + { + "epoch": 1.0339343711501847, + "grad_norm": 1.2743792533874512, + "learning_rate": 9.227e-05, + "loss": 0.4125, + "step": 18464 + }, + { + "epoch": 1.0339903684623137, + "grad_norm": 1.2489051818847656, + "learning_rate": 9.2275e-05, + "loss": 0.3937, + "step": 18465 + }, + { + "epoch": 1.0340463657744428, + "grad_norm": 1.2261844873428345, + "learning_rate": 9.228e-05, + "loss": 0.3851, + "step": 18466 + }, + { + "epoch": 1.0341023630865718, + "grad_norm": 1.483903169631958, + "learning_rate": 9.2285e-05, + "loss": 0.5393, + "step": 18467 + }, + { + "epoch": 1.0341583603987008, + "grad_norm": 1.15414297580719, + "learning_rate": 9.229000000000001e-05, + "loss": 0.4449, + "step": 18468 + }, + { + "epoch": 1.0342143577108298, + "grad_norm": 1.0663937330245972, + "learning_rate": 9.229500000000001e-05, + "loss": 0.3002, + "step": 18469 + }, + { + "epoch": 1.0342703550229588, + "grad_norm": 1.4299031496047974, + "learning_rate": 9.230000000000001e-05, + "loss": 0.4362, + "step": 18470 + }, + { + "epoch": 1.0343263523350879, + "grad_norm": 1.37778902053833, + "learning_rate": 9.2305e-05, + "loss": 0.4495, + "step": 18471 + }, + { + "epoch": 1.0343823496472169, + "grad_norm": 1.7153373956680298, + "learning_rate": 9.231e-05, + "loss": 0.4534, + "step": 18472 + }, + { + "epoch": 1.034438346959346, + "grad_norm": 1.1465803384780884, + "learning_rate": 9.231500000000001e-05, + "loss": 0.3172, + "step": 18473 + }, + { + "epoch": 1.034494344271475, + "grad_norm": 1.223066806793213, + "learning_rate": 9.232000000000001e-05, + "loss": 0.5094, + "step": 18474 + }, + { + "epoch": 1.034550341583604, + "grad_norm": 1.3934199810028076, + "learning_rate": 9.232500000000001e-05, + "loss": 0.3479, + "step": 
18475 + }, + { + "epoch": 1.034606338895733, + "grad_norm": 1.3378269672393799, + "learning_rate": 9.233e-05, + "loss": 0.496, + "step": 18476 + }, + { + "epoch": 1.034662336207862, + "grad_norm": 1.2652432918548584, + "learning_rate": 9.2335e-05, + "loss": 0.461, + "step": 18477 + }, + { + "epoch": 1.034718333519991, + "grad_norm": 1.155977725982666, + "learning_rate": 9.234e-05, + "loss": 0.2798, + "step": 18478 + }, + { + "epoch": 1.03477433083212, + "grad_norm": 1.3027223348617554, + "learning_rate": 9.2345e-05, + "loss": 0.4, + "step": 18479 + }, + { + "epoch": 1.034830328144249, + "grad_norm": 1.2647991180419922, + "learning_rate": 9.235000000000001e-05, + "loss": 0.4596, + "step": 18480 + }, + { + "epoch": 1.034886325456378, + "grad_norm": 1.5008424520492554, + "learning_rate": 9.2355e-05, + "loss": 0.4096, + "step": 18481 + }, + { + "epoch": 1.034942322768507, + "grad_norm": 1.7912589311599731, + "learning_rate": 9.236e-05, + "loss": 0.4737, + "step": 18482 + }, + { + "epoch": 1.034998320080636, + "grad_norm": 1.201886534690857, + "learning_rate": 9.2365e-05, + "loss": 0.5048, + "step": 18483 + }, + { + "epoch": 1.035054317392765, + "grad_norm": 1.1117593050003052, + "learning_rate": 9.237e-05, + "loss": 0.3959, + "step": 18484 + }, + { + "epoch": 1.0351103147048941, + "grad_norm": 1.2172096967697144, + "learning_rate": 9.2375e-05, + "loss": 0.4206, + "step": 18485 + }, + { + "epoch": 1.0351663120170231, + "grad_norm": 1.2553060054779053, + "learning_rate": 9.238e-05, + "loss": 0.5313, + "step": 18486 + }, + { + "epoch": 1.0352223093291522, + "grad_norm": 1.4407403469085693, + "learning_rate": 9.2385e-05, + "loss": 0.4232, + "step": 18487 + }, + { + "epoch": 1.0352783066412812, + "grad_norm": 1.471834659576416, + "learning_rate": 9.239000000000001e-05, + "loss": 0.383, + "step": 18488 + }, + { + "epoch": 1.0353343039534102, + "grad_norm": 1.1300737857818604, + "learning_rate": 9.239500000000001e-05, + "loss": 0.3147, + "step": 18489 + }, + { + "epoch": 
1.0353903012655392, + "grad_norm": 1.2885539531707764, + "learning_rate": 9.240000000000001e-05, + "loss": 0.3815, + "step": 18490 + }, + { + "epoch": 1.0354462985776682, + "grad_norm": 1.3831018209457397, + "learning_rate": 9.240500000000001e-05, + "loss": 0.4184, + "step": 18491 + }, + { + "epoch": 1.0355022958897973, + "grad_norm": 1.3933441638946533, + "learning_rate": 9.241e-05, + "loss": 0.501, + "step": 18492 + }, + { + "epoch": 1.0355582932019263, + "grad_norm": 1.4546265602111816, + "learning_rate": 9.2415e-05, + "loss": 0.3986, + "step": 18493 + }, + { + "epoch": 1.0356142905140553, + "grad_norm": 1.6683248281478882, + "learning_rate": 9.242000000000001e-05, + "loss": 0.42, + "step": 18494 + }, + { + "epoch": 1.0356702878261843, + "grad_norm": 1.5890135765075684, + "learning_rate": 9.242500000000001e-05, + "loss": 0.3606, + "step": 18495 + }, + { + "epoch": 1.0357262851383133, + "grad_norm": 1.2694878578186035, + "learning_rate": 9.243000000000001e-05, + "loss": 0.3669, + "step": 18496 + }, + { + "epoch": 1.0357822824504423, + "grad_norm": 1.2919161319732666, + "learning_rate": 9.2435e-05, + "loss": 0.3652, + "step": 18497 + }, + { + "epoch": 1.0358382797625714, + "grad_norm": 1.3796987533569336, + "learning_rate": 9.244e-05, + "loss": 0.6024, + "step": 18498 + }, + { + "epoch": 1.0358942770747004, + "grad_norm": 1.7107839584350586, + "learning_rate": 9.2445e-05, + "loss": 0.6868, + "step": 18499 + }, + { + "epoch": 1.0359502743868294, + "grad_norm": 1.4201178550720215, + "learning_rate": 9.245e-05, + "loss": 0.3733, + "step": 18500 + }, + { + "epoch": 1.0360062716989584, + "grad_norm": 1.321673035621643, + "learning_rate": 9.245500000000001e-05, + "loss": 0.3997, + "step": 18501 + }, + { + "epoch": 1.0360622690110874, + "grad_norm": 1.5087130069732666, + "learning_rate": 9.246e-05, + "loss": 0.422, + "step": 18502 + }, + { + "epoch": 1.0361182663232165, + "grad_norm": 1.3422858715057373, + "learning_rate": 9.2465e-05, + "loss": 0.4653, + "step": 18503 + 
}, + { + "epoch": 1.0361742636353455, + "grad_norm": 1.1116987466812134, + "learning_rate": 9.247e-05, + "loss": 0.2964, + "step": 18504 + }, + { + "epoch": 1.0362302609474745, + "grad_norm": 1.3748716115951538, + "learning_rate": 9.2475e-05, + "loss": 0.4254, + "step": 18505 + }, + { + "epoch": 1.0362862582596035, + "grad_norm": 1.7915778160095215, + "learning_rate": 9.248e-05, + "loss": 0.4567, + "step": 18506 + }, + { + "epoch": 1.0363422555717325, + "grad_norm": 1.5386706590652466, + "learning_rate": 9.248499999999999e-05, + "loss": 0.474, + "step": 18507 + }, + { + "epoch": 1.0363982528838616, + "grad_norm": 1.4944480657577515, + "learning_rate": 9.249e-05, + "loss": 0.351, + "step": 18508 + }, + { + "epoch": 1.0364542501959906, + "grad_norm": 1.542141318321228, + "learning_rate": 9.249500000000001e-05, + "loss": 0.4844, + "step": 18509 + }, + { + "epoch": 1.0365102475081196, + "grad_norm": 1.530866265296936, + "learning_rate": 9.250000000000001e-05, + "loss": 0.4474, + "step": 18510 + }, + { + "epoch": 1.0365662448202486, + "grad_norm": 1.4048144817352295, + "learning_rate": 9.250500000000001e-05, + "loss": 0.4107, + "step": 18511 + }, + { + "epoch": 1.0366222421323776, + "grad_norm": 1.313151478767395, + "learning_rate": 9.251000000000001e-05, + "loss": 0.5219, + "step": 18512 + }, + { + "epoch": 1.0366782394445067, + "grad_norm": 1.5241057872772217, + "learning_rate": 9.2515e-05, + "loss": 0.4295, + "step": 18513 + }, + { + "epoch": 1.0367342367566357, + "grad_norm": 1.1829602718353271, + "learning_rate": 9.252e-05, + "loss": 0.3259, + "step": 18514 + }, + { + "epoch": 1.0367902340687647, + "grad_norm": 1.3312814235687256, + "learning_rate": 9.252500000000001e-05, + "loss": 0.3562, + "step": 18515 + }, + { + "epoch": 1.0368462313808937, + "grad_norm": 1.2442947626113892, + "learning_rate": 9.253000000000001e-05, + "loss": 0.4495, + "step": 18516 + }, + { + "epoch": 1.0369022286930227, + "grad_norm": 1.3141032457351685, + "learning_rate": 
9.253500000000001e-05, + "loss": 0.4944, + "step": 18517 + }, + { + "epoch": 1.0369582260051518, + "grad_norm": 2.926708459854126, + "learning_rate": 9.254e-05, + "loss": 0.42, + "step": 18518 + }, + { + "epoch": 1.0370142233172808, + "grad_norm": 1.6537495851516724, + "learning_rate": 9.2545e-05, + "loss": 0.3126, + "step": 18519 + }, + { + "epoch": 1.0370702206294098, + "grad_norm": 1.189548134803772, + "learning_rate": 9.255e-05, + "loss": 0.4293, + "step": 18520 + }, + { + "epoch": 1.0371262179415388, + "grad_norm": 1.8494853973388672, + "learning_rate": 9.2555e-05, + "loss": 0.4212, + "step": 18521 + }, + { + "epoch": 1.0371822152536678, + "grad_norm": 1.5236074924468994, + "learning_rate": 9.256000000000001e-05, + "loss": 0.484, + "step": 18522 + }, + { + "epoch": 1.0372382125657968, + "grad_norm": 1.5900028944015503, + "learning_rate": 9.2565e-05, + "loss": 0.4683, + "step": 18523 + }, + { + "epoch": 1.0372942098779259, + "grad_norm": 1.4063224792480469, + "learning_rate": 9.257e-05, + "loss": 0.4586, + "step": 18524 + }, + { + "epoch": 1.0373502071900549, + "grad_norm": 1.189079999923706, + "learning_rate": 9.2575e-05, + "loss": 0.389, + "step": 18525 + }, + { + "epoch": 1.037406204502184, + "grad_norm": 1.4907859563827515, + "learning_rate": 9.258e-05, + "loss": 0.4019, + "step": 18526 + }, + { + "epoch": 1.037462201814313, + "grad_norm": 1.4381401538848877, + "learning_rate": 9.2585e-05, + "loss": 0.5002, + "step": 18527 + }, + { + "epoch": 1.037518199126442, + "grad_norm": 1.2284777164459229, + "learning_rate": 9.258999999999999e-05, + "loss": 0.4127, + "step": 18528 + }, + { + "epoch": 1.037574196438571, + "grad_norm": 1.3698266744613647, + "learning_rate": 9.2595e-05, + "loss": 0.4886, + "step": 18529 + }, + { + "epoch": 1.0376301937507, + "grad_norm": 1.2363755702972412, + "learning_rate": 9.260000000000001e-05, + "loss": 0.4111, + "step": 18530 + }, + { + "epoch": 1.037686191062829, + "grad_norm": 1.3494611978530884, + "learning_rate": 
9.260500000000001e-05, + "loss": 0.4135, + "step": 18531 + }, + { + "epoch": 1.037742188374958, + "grad_norm": 1.5822937488555908, + "learning_rate": 9.261000000000001e-05, + "loss": 0.5227, + "step": 18532 + }, + { + "epoch": 1.037798185687087, + "grad_norm": 1.212691307067871, + "learning_rate": 9.261500000000001e-05, + "loss": 0.3829, + "step": 18533 + }, + { + "epoch": 1.037854182999216, + "grad_norm": 1.24557626247406, + "learning_rate": 9.262e-05, + "loss": 0.3104, + "step": 18534 + }, + { + "epoch": 1.037910180311345, + "grad_norm": 1.5108773708343506, + "learning_rate": 9.2625e-05, + "loss": 0.5821, + "step": 18535 + }, + { + "epoch": 1.037966177623474, + "grad_norm": 1.411641240119934, + "learning_rate": 9.263e-05, + "loss": 0.4712, + "step": 18536 + }, + { + "epoch": 1.0380221749356031, + "grad_norm": 1.4698002338409424, + "learning_rate": 9.263500000000001e-05, + "loss": 0.4311, + "step": 18537 + }, + { + "epoch": 1.0380781722477321, + "grad_norm": 1.2794344425201416, + "learning_rate": 9.264000000000001e-05, + "loss": 0.5081, + "step": 18538 + }, + { + "epoch": 1.0381341695598612, + "grad_norm": 1.553958773612976, + "learning_rate": 9.2645e-05, + "loss": 0.3672, + "step": 18539 + }, + { + "epoch": 1.0381901668719902, + "grad_norm": 1.2659039497375488, + "learning_rate": 9.265e-05, + "loss": 0.4383, + "step": 18540 + }, + { + "epoch": 1.0382461641841192, + "grad_norm": 1.397771954536438, + "learning_rate": 9.2655e-05, + "loss": 0.4917, + "step": 18541 + }, + { + "epoch": 1.0383021614962482, + "grad_norm": 1.0613263845443726, + "learning_rate": 9.266e-05, + "loss": 0.303, + "step": 18542 + }, + { + "epoch": 1.0383581588083772, + "grad_norm": 1.3095909357070923, + "learning_rate": 9.266500000000001e-05, + "loss": 0.3641, + "step": 18543 + }, + { + "epoch": 1.0384141561205062, + "grad_norm": 1.1477950811386108, + "learning_rate": 9.267e-05, + "loss": 0.4294, + "step": 18544 + }, + { + "epoch": 1.0384701534326353, + "grad_norm": 1.4217934608459473, + 
"learning_rate": 9.2675e-05, + "loss": 0.3987, + "step": 18545 + }, + { + "epoch": 1.0385261507447643, + "grad_norm": 1.280185580253601, + "learning_rate": 9.268e-05, + "loss": 0.3985, + "step": 18546 + }, + { + "epoch": 1.0385821480568933, + "grad_norm": 1.384057641029358, + "learning_rate": 9.2685e-05, + "loss": 0.6648, + "step": 18547 + }, + { + "epoch": 1.0386381453690223, + "grad_norm": 1.3243428468704224, + "learning_rate": 9.269e-05, + "loss": 0.4763, + "step": 18548 + }, + { + "epoch": 1.0386941426811513, + "grad_norm": 1.3901629447937012, + "learning_rate": 9.2695e-05, + "loss": 0.5321, + "step": 18549 + }, + { + "epoch": 1.0387501399932804, + "grad_norm": 1.280961513519287, + "learning_rate": 9.27e-05, + "loss": 0.4644, + "step": 18550 + }, + { + "epoch": 1.0388061373054094, + "grad_norm": 1.2251585721969604, + "learning_rate": 9.270500000000001e-05, + "loss": 0.4093, + "step": 18551 + }, + { + "epoch": 1.0388621346175384, + "grad_norm": 1.186393141746521, + "learning_rate": 9.271000000000001e-05, + "loss": 0.3122, + "step": 18552 + }, + { + "epoch": 1.0389181319296674, + "grad_norm": 1.3896955251693726, + "learning_rate": 9.271500000000001e-05, + "loss": 0.4324, + "step": 18553 + }, + { + "epoch": 1.0389741292417964, + "grad_norm": 1.251495361328125, + "learning_rate": 9.272e-05, + "loss": 0.3037, + "step": 18554 + }, + { + "epoch": 1.0390301265539255, + "grad_norm": 1.3746366500854492, + "learning_rate": 9.2725e-05, + "loss": 0.4114, + "step": 18555 + }, + { + "epoch": 1.0390861238660545, + "grad_norm": 1.6332991123199463, + "learning_rate": 9.273e-05, + "loss": 0.4307, + "step": 18556 + }, + { + "epoch": 1.0391421211781835, + "grad_norm": 1.337018609046936, + "learning_rate": 9.2735e-05, + "loss": 0.4085, + "step": 18557 + }, + { + "epoch": 1.0391981184903125, + "grad_norm": 1.35471510887146, + "learning_rate": 9.274000000000001e-05, + "loss": 0.5234, + "step": 18558 + }, + { + "epoch": 1.0392541158024415, + "grad_norm": 1.4117554426193237, + 
"learning_rate": 9.274500000000001e-05, + "loss": 0.4672, + "step": 18559 + }, + { + "epoch": 1.0393101131145706, + "grad_norm": 1.4028817415237427, + "learning_rate": 9.275e-05, + "loss": 0.4823, + "step": 18560 + }, + { + "epoch": 1.0393661104266996, + "grad_norm": 1.233064889907837, + "learning_rate": 9.2755e-05, + "loss": 0.5555, + "step": 18561 + }, + { + "epoch": 1.0394221077388286, + "grad_norm": 1.1610888242721558, + "learning_rate": 9.276e-05, + "loss": 0.3154, + "step": 18562 + }, + { + "epoch": 1.0394781050509576, + "grad_norm": 1.180823564529419, + "learning_rate": 9.2765e-05, + "loss": 0.3393, + "step": 18563 + }, + { + "epoch": 1.0395341023630866, + "grad_norm": 1.627644419670105, + "learning_rate": 9.277000000000001e-05, + "loss": 0.5276, + "step": 18564 + }, + { + "epoch": 1.0395900996752157, + "grad_norm": 1.388357162475586, + "learning_rate": 9.2775e-05, + "loss": 0.4377, + "step": 18565 + }, + { + "epoch": 1.0396460969873447, + "grad_norm": 1.2008401155471802, + "learning_rate": 9.278e-05, + "loss": 0.4566, + "step": 18566 + }, + { + "epoch": 1.0397020942994737, + "grad_norm": 1.2332335710525513, + "learning_rate": 9.2785e-05, + "loss": 0.4352, + "step": 18567 + }, + { + "epoch": 1.0397580916116027, + "grad_norm": 1.3955116271972656, + "learning_rate": 9.279e-05, + "loss": 0.3749, + "step": 18568 + }, + { + "epoch": 1.0398140889237317, + "grad_norm": 1.4083069562911987, + "learning_rate": 9.279500000000001e-05, + "loss": 0.4986, + "step": 18569 + }, + { + "epoch": 1.0398700862358607, + "grad_norm": 1.2220817804336548, + "learning_rate": 9.28e-05, + "loss": 0.3388, + "step": 18570 + }, + { + "epoch": 1.0399260835479898, + "grad_norm": 3.772998571395874, + "learning_rate": 9.2805e-05, + "loss": 0.4862, + "step": 18571 + }, + { + "epoch": 1.0399820808601188, + "grad_norm": 1.30063796043396, + "learning_rate": 9.281000000000001e-05, + "loss": 0.5021, + "step": 18572 + }, + { + "epoch": 1.0400380781722478, + "grad_norm": 1.389096736907959, + 
"learning_rate": 9.281500000000001e-05, + "loss": 0.4203, + "step": 18573 + }, + { + "epoch": 1.0400940754843768, + "grad_norm": 1.1973382234573364, + "learning_rate": 9.282000000000001e-05, + "loss": 0.46, + "step": 18574 + }, + { + "epoch": 1.0401500727965058, + "grad_norm": 1.3328988552093506, + "learning_rate": 9.2825e-05, + "loss": 0.4274, + "step": 18575 + }, + { + "epoch": 1.0402060701086349, + "grad_norm": 1.541388988494873, + "learning_rate": 9.283e-05, + "loss": 0.5745, + "step": 18576 + }, + { + "epoch": 1.0402620674207639, + "grad_norm": 1.273723840713501, + "learning_rate": 9.2835e-05, + "loss": 0.3905, + "step": 18577 + }, + { + "epoch": 1.040318064732893, + "grad_norm": 1.2285714149475098, + "learning_rate": 9.284e-05, + "loss": 0.3658, + "step": 18578 + }, + { + "epoch": 1.040374062045022, + "grad_norm": 1.3290058374404907, + "learning_rate": 9.284500000000001e-05, + "loss": 0.3227, + "step": 18579 + }, + { + "epoch": 1.040430059357151, + "grad_norm": 1.489951491355896, + "learning_rate": 9.285000000000001e-05, + "loss": 0.4602, + "step": 18580 + }, + { + "epoch": 1.04048605666928, + "grad_norm": 1.428558111190796, + "learning_rate": 9.2855e-05, + "loss": 0.4313, + "step": 18581 + }, + { + "epoch": 1.040542053981409, + "grad_norm": 1.4195235967636108, + "learning_rate": 9.286e-05, + "loss": 0.4642, + "step": 18582 + }, + { + "epoch": 1.040598051293538, + "grad_norm": 1.3619436025619507, + "learning_rate": 9.2865e-05, + "loss": 0.4122, + "step": 18583 + }, + { + "epoch": 1.040654048605667, + "grad_norm": 1.297425627708435, + "learning_rate": 9.287e-05, + "loss": 0.4537, + "step": 18584 + }, + { + "epoch": 1.040710045917796, + "grad_norm": 1.6776831150054932, + "learning_rate": 9.2875e-05, + "loss": 0.5702, + "step": 18585 + }, + { + "epoch": 1.040766043229925, + "grad_norm": 1.1916131973266602, + "learning_rate": 9.288e-05, + "loss": 0.3784, + "step": 18586 + }, + { + "epoch": 1.040822040542054, + "grad_norm": 1.7893197536468506, + "learning_rate": 
9.2885e-05, + "loss": 0.4912, + "step": 18587 + }, + { + "epoch": 1.040878037854183, + "grad_norm": 1.0918089151382446, + "learning_rate": 9.289e-05, + "loss": 0.3608, + "step": 18588 + }, + { + "epoch": 1.040934035166312, + "grad_norm": 1.0850043296813965, + "learning_rate": 9.289500000000001e-05, + "loss": 0.321, + "step": 18589 + }, + { + "epoch": 1.0409900324784411, + "grad_norm": 1.3966138362884521, + "learning_rate": 9.290000000000001e-05, + "loss": 0.4968, + "step": 18590 + }, + { + "epoch": 1.0410460297905701, + "grad_norm": 1.399404525756836, + "learning_rate": 9.2905e-05, + "loss": 0.4167, + "step": 18591 + }, + { + "epoch": 1.0411020271026992, + "grad_norm": 1.5733047723770142, + "learning_rate": 9.291e-05, + "loss": 0.399, + "step": 18592 + }, + { + "epoch": 1.0411580244148282, + "grad_norm": 1.2910435199737549, + "learning_rate": 9.291500000000001e-05, + "loss": 0.5577, + "step": 18593 + }, + { + "epoch": 1.0412140217269572, + "grad_norm": 1.4125996828079224, + "learning_rate": 9.292000000000001e-05, + "loss": 0.4281, + "step": 18594 + }, + { + "epoch": 1.0412700190390862, + "grad_norm": 1.448743224143982, + "learning_rate": 9.292500000000001e-05, + "loss": 0.373, + "step": 18595 + }, + { + "epoch": 1.0413260163512152, + "grad_norm": 1.3208740949630737, + "learning_rate": 9.293e-05, + "loss": 0.4838, + "step": 18596 + }, + { + "epoch": 1.0413820136633443, + "grad_norm": 2.6943812370300293, + "learning_rate": 9.2935e-05, + "loss": 0.3669, + "step": 18597 + }, + { + "epoch": 1.0414380109754733, + "grad_norm": 1.666571021080017, + "learning_rate": 9.294e-05, + "loss": 0.4782, + "step": 18598 + }, + { + "epoch": 1.0414940082876023, + "grad_norm": 1.346853256225586, + "learning_rate": 9.2945e-05, + "loss": 0.4829, + "step": 18599 + }, + { + "epoch": 1.0415500055997313, + "grad_norm": 1.322637915611267, + "learning_rate": 9.295000000000001e-05, + "loss": 0.3847, + "step": 18600 + }, + { + "epoch": 1.0416060029118603, + "grad_norm": 1.4859142303466797, + 
"learning_rate": 9.295500000000001e-05, + "loss": 0.4769, + "step": 18601 + }, + { + "epoch": 1.0416620002239894, + "grad_norm": 1.3755728006362915, + "learning_rate": 9.296e-05, + "loss": 0.4521, + "step": 18602 + }, + { + "epoch": 1.0417179975361184, + "grad_norm": 1.2686610221862793, + "learning_rate": 9.2965e-05, + "loss": 0.3776, + "step": 18603 + }, + { + "epoch": 1.0417739948482474, + "grad_norm": 1.4579969644546509, + "learning_rate": 9.297e-05, + "loss": 0.5432, + "step": 18604 + }, + { + "epoch": 1.0418299921603762, + "grad_norm": 1.4644408226013184, + "learning_rate": 9.2975e-05, + "loss": 0.4334, + "step": 18605 + }, + { + "epoch": 1.0418859894725052, + "grad_norm": 1.4493407011032104, + "learning_rate": 9.298e-05, + "loss": 0.4742, + "step": 18606 + }, + { + "epoch": 1.0419419867846342, + "grad_norm": 1.270177960395813, + "learning_rate": 9.2985e-05, + "loss": 0.3793, + "step": 18607 + }, + { + "epoch": 1.0419979840967633, + "grad_norm": 1.3753674030303955, + "learning_rate": 9.299e-05, + "loss": 0.4165, + "step": 18608 + }, + { + "epoch": 1.0420539814088923, + "grad_norm": 1.2743052244186401, + "learning_rate": 9.299500000000001e-05, + "loss": 0.443, + "step": 18609 + }, + { + "epoch": 1.0421099787210213, + "grad_norm": 1.2452137470245361, + "learning_rate": 9.300000000000001e-05, + "loss": 0.3657, + "step": 18610 + }, + { + "epoch": 1.0421659760331503, + "grad_norm": 1.2093377113342285, + "learning_rate": 9.300500000000001e-05, + "loss": 0.4707, + "step": 18611 + }, + { + "epoch": 1.0422219733452793, + "grad_norm": 1.4324147701263428, + "learning_rate": 9.301e-05, + "loss": 0.392, + "step": 18612 + }, + { + "epoch": 1.0422779706574083, + "grad_norm": 1.2902770042419434, + "learning_rate": 9.3015e-05, + "loss": 0.4421, + "step": 18613 + }, + { + "epoch": 1.0423339679695374, + "grad_norm": 1.3648157119750977, + "learning_rate": 9.302e-05, + "loss": 0.4302, + "step": 18614 + }, + { + "epoch": 1.0423899652816664, + "grad_norm": 1.7705236673355103, + 
"learning_rate": 9.302500000000001e-05, + "loss": 0.4566, + "step": 18615 + }, + { + "epoch": 1.0424459625937954, + "grad_norm": 1.3672006130218506, + "learning_rate": 9.303000000000001e-05, + "loss": 0.4822, + "step": 18616 + }, + { + "epoch": 1.0425019599059244, + "grad_norm": 1.1181634664535522, + "learning_rate": 9.3035e-05, + "loss": 0.3502, + "step": 18617 + }, + { + "epoch": 1.0425579572180534, + "grad_norm": 1.2181497812271118, + "learning_rate": 9.304e-05, + "loss": 0.3622, + "step": 18618 + }, + { + "epoch": 1.0426139545301825, + "grad_norm": 1.4582486152648926, + "learning_rate": 9.3045e-05, + "loss": 0.3453, + "step": 18619 + }, + { + "epoch": 1.0426699518423115, + "grad_norm": 1.049888253211975, + "learning_rate": 9.305e-05, + "loss": 0.385, + "step": 18620 + }, + { + "epoch": 1.0427259491544405, + "grad_norm": 1.425157904624939, + "learning_rate": 9.305500000000001e-05, + "loss": 0.4258, + "step": 18621 + }, + { + "epoch": 1.0427819464665695, + "grad_norm": 1.3080180883407593, + "learning_rate": 9.306000000000001e-05, + "loss": 0.4685, + "step": 18622 + }, + { + "epoch": 1.0428379437786985, + "grad_norm": 1.5657936334609985, + "learning_rate": 9.3065e-05, + "loss": 0.6739, + "step": 18623 + }, + { + "epoch": 1.0428939410908276, + "grad_norm": 1.6061358451843262, + "learning_rate": 9.307e-05, + "loss": 0.4863, + "step": 18624 + }, + { + "epoch": 1.0429499384029566, + "grad_norm": 1.7165899276733398, + "learning_rate": 9.3075e-05, + "loss": 0.6032, + "step": 18625 + }, + { + "epoch": 1.0430059357150856, + "grad_norm": 1.1328483819961548, + "learning_rate": 9.308e-05, + "loss": 0.3495, + "step": 18626 + }, + { + "epoch": 1.0430619330272146, + "grad_norm": 1.3610625267028809, + "learning_rate": 9.3085e-05, + "loss": 0.4514, + "step": 18627 + }, + { + "epoch": 1.0431179303393436, + "grad_norm": 1.328114628791809, + "learning_rate": 9.309e-05, + "loss": 0.4111, + "step": 18628 + }, + { + "epoch": 1.0431739276514727, + "grad_norm": 1.3617134094238281, + 
"learning_rate": 9.309500000000002e-05, + "loss": 0.4748, + "step": 18629 + }, + { + "epoch": 1.0432299249636017, + "grad_norm": 4.176542282104492, + "learning_rate": 9.310000000000001e-05, + "loss": 0.4475, + "step": 18630 + }, + { + "epoch": 1.0432859222757307, + "grad_norm": 1.5613341331481934, + "learning_rate": 9.310500000000001e-05, + "loss": 0.472, + "step": 18631 + }, + { + "epoch": 1.0433419195878597, + "grad_norm": 1.3893775939941406, + "learning_rate": 9.311000000000001e-05, + "loss": 0.4798, + "step": 18632 + }, + { + "epoch": 1.0433979168999887, + "grad_norm": 1.4824298620224, + "learning_rate": 9.3115e-05, + "loss": 0.4829, + "step": 18633 + }, + { + "epoch": 1.0434539142121178, + "grad_norm": 1.209849238395691, + "learning_rate": 9.312e-05, + "loss": 0.36, + "step": 18634 + }, + { + "epoch": 1.0435099115242468, + "grad_norm": 1.2994004487991333, + "learning_rate": 9.3125e-05, + "loss": 0.5197, + "step": 18635 + }, + { + "epoch": 1.0435659088363758, + "grad_norm": 1.2604016065597534, + "learning_rate": 9.313000000000001e-05, + "loss": 0.4214, + "step": 18636 + }, + { + "epoch": 1.0436219061485048, + "grad_norm": 1.1662832498550415, + "learning_rate": 9.313500000000001e-05, + "loss": 0.3889, + "step": 18637 + }, + { + "epoch": 1.0436779034606338, + "grad_norm": 1.609951376914978, + "learning_rate": 9.314e-05, + "loss": 0.3799, + "step": 18638 + }, + { + "epoch": 1.0437339007727628, + "grad_norm": 1.2422561645507812, + "learning_rate": 9.3145e-05, + "loss": 0.4896, + "step": 18639 + }, + { + "epoch": 1.0437898980848919, + "grad_norm": 1.5622755289077759, + "learning_rate": 9.315e-05, + "loss": 0.4953, + "step": 18640 + }, + { + "epoch": 1.0438458953970209, + "grad_norm": 1.173612356185913, + "learning_rate": 9.3155e-05, + "loss": 0.4147, + "step": 18641 + }, + { + "epoch": 1.04390189270915, + "grad_norm": 1.1745073795318604, + "learning_rate": 9.316000000000001e-05, + "loss": 0.366, + "step": 18642 + }, + { + "epoch": 1.043957890021279, + "grad_norm": 
1.1597683429718018, + "learning_rate": 9.3165e-05, + "loss": 0.3775, + "step": 18643 + }, + { + "epoch": 1.044013887333408, + "grad_norm": 1.605271339416504, + "learning_rate": 9.317e-05, + "loss": 0.3384, + "step": 18644 + }, + { + "epoch": 1.044069884645537, + "grad_norm": 1.1824640035629272, + "learning_rate": 9.3175e-05, + "loss": 0.5947, + "step": 18645 + }, + { + "epoch": 1.044125881957666, + "grad_norm": 1.3627582788467407, + "learning_rate": 9.318e-05, + "loss": 0.4234, + "step": 18646 + }, + { + "epoch": 1.044181879269795, + "grad_norm": 1.2604939937591553, + "learning_rate": 9.3185e-05, + "loss": 0.4672, + "step": 18647 + }, + { + "epoch": 1.044237876581924, + "grad_norm": 1.429374098777771, + "learning_rate": 9.319e-05, + "loss": 0.47, + "step": 18648 + }, + { + "epoch": 1.044293873894053, + "grad_norm": 1.1948559284210205, + "learning_rate": 9.3195e-05, + "loss": 0.3278, + "step": 18649 + }, + { + "epoch": 1.044349871206182, + "grad_norm": 1.5485488176345825, + "learning_rate": 9.320000000000002e-05, + "loss": 0.6168, + "step": 18650 + }, + { + "epoch": 1.044405868518311, + "grad_norm": 1.5370079278945923, + "learning_rate": 9.320500000000001e-05, + "loss": 0.3781, + "step": 18651 + }, + { + "epoch": 1.04446186583044, + "grad_norm": 1.2994308471679688, + "learning_rate": 9.321000000000001e-05, + "loss": 0.437, + "step": 18652 + }, + { + "epoch": 1.0445178631425691, + "grad_norm": 1.4331862926483154, + "learning_rate": 9.321500000000001e-05, + "loss": 0.5217, + "step": 18653 + }, + { + "epoch": 1.0445738604546981, + "grad_norm": 1.451013445854187, + "learning_rate": 9.322e-05, + "loss": 0.5015, + "step": 18654 + }, + { + "epoch": 1.0446298577668272, + "grad_norm": 1.409241795539856, + "learning_rate": 9.3225e-05, + "loss": 0.5115, + "step": 18655 + }, + { + "epoch": 1.0446858550789562, + "grad_norm": 1.174866795539856, + "learning_rate": 9.323e-05, + "loss": 0.4376, + "step": 18656 + }, + { + "epoch": 1.0447418523910852, + "grad_norm": 
1.2477177381515503, + "learning_rate": 9.323500000000001e-05, + "loss": 0.3533, + "step": 18657 + }, + { + "epoch": 1.0447978497032142, + "grad_norm": 1.4175711870193481, + "learning_rate": 9.324000000000001e-05, + "loss": 0.3326, + "step": 18658 + }, + { + "epoch": 1.0448538470153432, + "grad_norm": 1.3794015645980835, + "learning_rate": 9.3245e-05, + "loss": 0.3185, + "step": 18659 + }, + { + "epoch": 1.0449098443274722, + "grad_norm": 1.3675583600997925, + "learning_rate": 9.325e-05, + "loss": 0.3837, + "step": 18660 + }, + { + "epoch": 1.0449658416396013, + "grad_norm": 1.3625034093856812, + "learning_rate": 9.3255e-05, + "loss": 0.5808, + "step": 18661 + }, + { + "epoch": 1.0450218389517303, + "grad_norm": 2.7807505130767822, + "learning_rate": 9.326e-05, + "loss": 0.4191, + "step": 18662 + }, + { + "epoch": 1.0450778362638593, + "grad_norm": 1.6819387674331665, + "learning_rate": 9.326500000000001e-05, + "loss": 0.5341, + "step": 18663 + }, + { + "epoch": 1.0451338335759883, + "grad_norm": 1.4379454851150513, + "learning_rate": 9.327e-05, + "loss": 0.4973, + "step": 18664 + }, + { + "epoch": 1.0451898308881173, + "grad_norm": 1.2826933860778809, + "learning_rate": 9.3275e-05, + "loss": 0.4619, + "step": 18665 + }, + { + "epoch": 1.0452458282002464, + "grad_norm": 1.3095221519470215, + "learning_rate": 9.328e-05, + "loss": 0.451, + "step": 18666 + }, + { + "epoch": 1.0453018255123754, + "grad_norm": 1.3317927122116089, + "learning_rate": 9.3285e-05, + "loss": 0.4667, + "step": 18667 + }, + { + "epoch": 1.0453578228245044, + "grad_norm": 1.3291807174682617, + "learning_rate": 9.329e-05, + "loss": 0.353, + "step": 18668 + }, + { + "epoch": 1.0454138201366334, + "grad_norm": 1.4625712633132935, + "learning_rate": 9.3295e-05, + "loss": 0.4523, + "step": 18669 + }, + { + "epoch": 1.0454698174487624, + "grad_norm": 1.2807096242904663, + "learning_rate": 9.33e-05, + "loss": 0.451, + "step": 18670 + }, + { + "epoch": 1.0455258147608915, + "grad_norm": 
1.2990058660507202, + "learning_rate": 9.330500000000002e-05, + "loss": 0.4871, + "step": 18671 + }, + { + "epoch": 1.0455818120730205, + "grad_norm": 1.5651144981384277, + "learning_rate": 9.331000000000001e-05, + "loss": 0.3684, + "step": 18672 + }, + { + "epoch": 1.0456378093851495, + "grad_norm": 3.8242337703704834, + "learning_rate": 9.331500000000001e-05, + "loss": 0.3723, + "step": 18673 + }, + { + "epoch": 1.0456938066972785, + "grad_norm": 1.4493603706359863, + "learning_rate": 9.332000000000001e-05, + "loss": 0.4018, + "step": 18674 + }, + { + "epoch": 1.0457498040094075, + "grad_norm": 1.471799612045288, + "learning_rate": 9.3325e-05, + "loss": 0.5327, + "step": 18675 + }, + { + "epoch": 1.0458058013215366, + "grad_norm": 1.222642183303833, + "learning_rate": 9.333e-05, + "loss": 0.3335, + "step": 18676 + }, + { + "epoch": 1.0458617986336656, + "grad_norm": 1.3658721446990967, + "learning_rate": 9.3335e-05, + "loss": 0.4231, + "step": 18677 + }, + { + "epoch": 1.0459177959457946, + "grad_norm": 1.2885686159133911, + "learning_rate": 9.334000000000001e-05, + "loss": 0.3942, + "step": 18678 + }, + { + "epoch": 1.0459737932579236, + "grad_norm": 1.3054735660552979, + "learning_rate": 9.334500000000001e-05, + "loss": 0.3332, + "step": 18679 + }, + { + "epoch": 1.0460297905700526, + "grad_norm": 1.490426778793335, + "learning_rate": 9.335e-05, + "loss": 0.4277, + "step": 18680 + }, + { + "epoch": 1.0460857878821817, + "grad_norm": 1.3818128108978271, + "learning_rate": 9.3355e-05, + "loss": 0.3945, + "step": 18681 + }, + { + "epoch": 1.0461417851943107, + "grad_norm": 1.3044003248214722, + "learning_rate": 9.336e-05, + "loss": 0.4266, + "step": 18682 + }, + { + "epoch": 1.0461977825064397, + "grad_norm": 1.2530335187911987, + "learning_rate": 9.3365e-05, + "loss": 0.3952, + "step": 18683 + }, + { + "epoch": 1.0462537798185687, + "grad_norm": 1.3521382808685303, + "learning_rate": 9.337e-05, + "loss": 0.4088, + "step": 18684 + }, + { + "epoch": 
1.0463097771306977, + "grad_norm": 1.6739455461502075, + "learning_rate": 9.3375e-05, + "loss": 0.4374, + "step": 18685 + }, + { + "epoch": 1.0463657744428267, + "grad_norm": 1.3913389444351196, + "learning_rate": 9.338e-05, + "loss": 0.4193, + "step": 18686 + }, + { + "epoch": 1.0464217717549558, + "grad_norm": 1.3583130836486816, + "learning_rate": 9.3385e-05, + "loss": 0.5621, + "step": 18687 + }, + { + "epoch": 1.0464777690670848, + "grad_norm": 2.3243002891540527, + "learning_rate": 9.339e-05, + "loss": 0.4014, + "step": 18688 + }, + { + "epoch": 1.0465337663792138, + "grad_norm": 1.4661026000976562, + "learning_rate": 9.3395e-05, + "loss": 0.4266, + "step": 18689 + }, + { + "epoch": 1.0465897636913428, + "grad_norm": 1.2348803281784058, + "learning_rate": 9.340000000000001e-05, + "loss": 0.3395, + "step": 18690 + }, + { + "epoch": 1.0466457610034718, + "grad_norm": 1.4044138193130493, + "learning_rate": 9.3405e-05, + "loss": 0.4421, + "step": 18691 + }, + { + "epoch": 1.0467017583156009, + "grad_norm": 1.2582522630691528, + "learning_rate": 9.341000000000002e-05, + "loss": 0.3806, + "step": 18692 + }, + { + "epoch": 1.0467577556277299, + "grad_norm": 1.372855544090271, + "learning_rate": 9.341500000000001e-05, + "loss": 0.4049, + "step": 18693 + }, + { + "epoch": 1.046813752939859, + "grad_norm": 1.436766505241394, + "learning_rate": 9.342000000000001e-05, + "loss": 0.4663, + "step": 18694 + }, + { + "epoch": 1.046869750251988, + "grad_norm": 1.523422360420227, + "learning_rate": 9.342500000000001e-05, + "loss": 0.4943, + "step": 18695 + }, + { + "epoch": 1.046925747564117, + "grad_norm": 1.4499166011810303, + "learning_rate": 9.343e-05, + "loss": 0.5359, + "step": 18696 + }, + { + "epoch": 1.046981744876246, + "grad_norm": 1.202656626701355, + "learning_rate": 9.3435e-05, + "loss": 0.4509, + "step": 18697 + }, + { + "epoch": 1.047037742188375, + "grad_norm": 1.2682124376296997, + "learning_rate": 9.344e-05, + "loss": 0.4227, + "step": 18698 + }, + { + 
"epoch": 1.047093739500504, + "grad_norm": 1.3726247549057007, + "learning_rate": 9.344500000000001e-05, + "loss": 0.4103, + "step": 18699 + }, + { + "epoch": 1.047149736812633, + "grad_norm": 1.339306116104126, + "learning_rate": 9.345000000000001e-05, + "loss": 0.585, + "step": 18700 + }, + { + "epoch": 1.047205734124762, + "grad_norm": 2.1495375633239746, + "learning_rate": 9.3455e-05, + "loss": 0.4186, + "step": 18701 + }, + { + "epoch": 1.047261731436891, + "grad_norm": 1.4257663488388062, + "learning_rate": 9.346e-05, + "loss": 0.4874, + "step": 18702 + }, + { + "epoch": 1.04731772874902, + "grad_norm": 1.301926851272583, + "learning_rate": 9.3465e-05, + "loss": 0.581, + "step": 18703 + }, + { + "epoch": 1.047373726061149, + "grad_norm": 1.357356071472168, + "learning_rate": 9.347e-05, + "loss": 0.4752, + "step": 18704 + }, + { + "epoch": 1.047429723373278, + "grad_norm": 1.335082769393921, + "learning_rate": 9.3475e-05, + "loss": 0.5403, + "step": 18705 + }, + { + "epoch": 1.0474857206854071, + "grad_norm": 1.386541724205017, + "learning_rate": 9.348e-05, + "loss": 0.3512, + "step": 18706 + }, + { + "epoch": 1.0475417179975361, + "grad_norm": 1.3624595403671265, + "learning_rate": 9.3485e-05, + "loss": 0.5465, + "step": 18707 + }, + { + "epoch": 1.0475977153096652, + "grad_norm": 1.5704821348190308, + "learning_rate": 9.349e-05, + "loss": 0.4433, + "step": 18708 + }, + { + "epoch": 1.0476537126217942, + "grad_norm": 1.186461091041565, + "learning_rate": 9.3495e-05, + "loss": 0.3858, + "step": 18709 + }, + { + "epoch": 1.0477097099339232, + "grad_norm": 1.1894114017486572, + "learning_rate": 9.350000000000001e-05, + "loss": 0.3432, + "step": 18710 + }, + { + "epoch": 1.0477657072460522, + "grad_norm": 1.700202226638794, + "learning_rate": 9.350500000000001e-05, + "loss": 0.482, + "step": 18711 + }, + { + "epoch": 1.0478217045581812, + "grad_norm": 1.487416386604309, + "learning_rate": 9.351e-05, + "loss": 0.4917, + "step": 18712 + }, + { + "epoch": 
1.0478777018703103, + "grad_norm": 1.6582293510437012, + "learning_rate": 9.3515e-05, + "loss": 0.4482, + "step": 18713 + }, + { + "epoch": 1.0479336991824393, + "grad_norm": 1.4676433801651, + "learning_rate": 9.352000000000001e-05, + "loss": 0.4053, + "step": 18714 + }, + { + "epoch": 1.0479896964945683, + "grad_norm": 1.5263688564300537, + "learning_rate": 9.352500000000001e-05, + "loss": 0.5713, + "step": 18715 + }, + { + "epoch": 1.0480456938066973, + "grad_norm": 1.2526209354400635, + "learning_rate": 9.353000000000001e-05, + "loss": 0.6906, + "step": 18716 + }, + { + "epoch": 1.0481016911188263, + "grad_norm": 1.3255534172058105, + "learning_rate": 9.3535e-05, + "loss": 0.4322, + "step": 18717 + }, + { + "epoch": 1.0481576884309554, + "grad_norm": 1.4175266027450562, + "learning_rate": 9.354e-05, + "loss": 0.3491, + "step": 18718 + }, + { + "epoch": 1.0482136857430844, + "grad_norm": 1.4038658142089844, + "learning_rate": 9.3545e-05, + "loss": 0.3943, + "step": 18719 + }, + { + "epoch": 1.0482696830552134, + "grad_norm": 1.6249943971633911, + "learning_rate": 9.355000000000001e-05, + "loss": 0.4448, + "step": 18720 + }, + { + "epoch": 1.0483256803673424, + "grad_norm": 1.6626101732254028, + "learning_rate": 9.355500000000001e-05, + "loss": 0.4816, + "step": 18721 + }, + { + "epoch": 1.0483816776794714, + "grad_norm": 1.1830973625183105, + "learning_rate": 9.356e-05, + "loss": 0.4251, + "step": 18722 + }, + { + "epoch": 1.0484376749916005, + "grad_norm": 1.3155674934387207, + "learning_rate": 9.3565e-05, + "loss": 0.4556, + "step": 18723 + }, + { + "epoch": 1.0484936723037295, + "grad_norm": 1.5291848182678223, + "learning_rate": 9.357e-05, + "loss": 0.5577, + "step": 18724 + }, + { + "epoch": 1.0485496696158585, + "grad_norm": 10.447802543640137, + "learning_rate": 9.3575e-05, + "loss": 0.4634, + "step": 18725 + }, + { + "epoch": 1.0486056669279875, + "grad_norm": 1.464624047279358, + "learning_rate": 9.358e-05, + "loss": 0.5386, + "step": 18726 + }, + { + 
"epoch": 1.0486616642401165, + "grad_norm": 1.2470518350601196, + "learning_rate": 9.3585e-05, + "loss": 0.4673, + "step": 18727 + }, + { + "epoch": 1.0487176615522456, + "grad_norm": 1.7545257806777954, + "learning_rate": 9.359e-05, + "loss": 0.4946, + "step": 18728 + }, + { + "epoch": 1.0487736588643746, + "grad_norm": 1.2152029275894165, + "learning_rate": 9.3595e-05, + "loss": 0.3201, + "step": 18729 + }, + { + "epoch": 1.0488296561765036, + "grad_norm": 1.2757127285003662, + "learning_rate": 9.360000000000001e-05, + "loss": 0.4564, + "step": 18730 + }, + { + "epoch": 1.0488856534886326, + "grad_norm": 1.3454784154891968, + "learning_rate": 9.360500000000001e-05, + "loss": 0.4314, + "step": 18731 + }, + { + "epoch": 1.0489416508007616, + "grad_norm": 1.4702589511871338, + "learning_rate": 9.361e-05, + "loss": 0.4204, + "step": 18732 + }, + { + "epoch": 1.0489976481128906, + "grad_norm": 1.1373652219772339, + "learning_rate": 9.3615e-05, + "loss": 0.3671, + "step": 18733 + }, + { + "epoch": 1.0490536454250197, + "grad_norm": 1.4210302829742432, + "learning_rate": 9.362e-05, + "loss": 0.419, + "step": 18734 + }, + { + "epoch": 1.0491096427371487, + "grad_norm": 1.38583505153656, + "learning_rate": 9.362500000000001e-05, + "loss": 0.4431, + "step": 18735 + }, + { + "epoch": 1.0491656400492777, + "grad_norm": 1.4215130805969238, + "learning_rate": 9.363000000000001e-05, + "loss": 0.5827, + "step": 18736 + }, + { + "epoch": 1.0492216373614067, + "grad_norm": 1.4129101037979126, + "learning_rate": 9.363500000000001e-05, + "loss": 0.4041, + "step": 18737 + }, + { + "epoch": 1.0492776346735357, + "grad_norm": 1.5683997869491577, + "learning_rate": 9.364e-05, + "loss": 0.5229, + "step": 18738 + }, + { + "epoch": 1.0493336319856648, + "grad_norm": 1.083092451095581, + "learning_rate": 9.3645e-05, + "loss": 0.3352, + "step": 18739 + }, + { + "epoch": 1.0493896292977938, + "grad_norm": 1.3357940912246704, + "learning_rate": 9.365e-05, + "loss": 0.5065, + "step": 18740 + }, 
+ { + "epoch": 1.0494456266099228, + "grad_norm": 1.3490136861801147, + "learning_rate": 9.365500000000001e-05, + "loss": 0.4825, + "step": 18741 + }, + { + "epoch": 1.0495016239220518, + "grad_norm": 1.1608885526657104, + "learning_rate": 9.366000000000001e-05, + "loss": 0.3584, + "step": 18742 + }, + { + "epoch": 1.0495576212341808, + "grad_norm": 3.1415653228759766, + "learning_rate": 9.3665e-05, + "loss": 0.4131, + "step": 18743 + }, + { + "epoch": 1.0496136185463099, + "grad_norm": 1.2254018783569336, + "learning_rate": 9.367e-05, + "loss": 0.3774, + "step": 18744 + }, + { + "epoch": 1.0496696158584389, + "grad_norm": 1.1179746389389038, + "learning_rate": 9.3675e-05, + "loss": 0.3453, + "step": 18745 + }, + { + "epoch": 1.049725613170568, + "grad_norm": 1.2203619480133057, + "learning_rate": 9.368e-05, + "loss": 0.5304, + "step": 18746 + }, + { + "epoch": 1.049781610482697, + "grad_norm": 1.4112677574157715, + "learning_rate": 9.3685e-05, + "loss": 0.7577, + "step": 18747 + }, + { + "epoch": 1.049837607794826, + "grad_norm": 1.3277983665466309, + "learning_rate": 9.369e-05, + "loss": 0.3917, + "step": 18748 + }, + { + "epoch": 1.049893605106955, + "grad_norm": 1.2955080270767212, + "learning_rate": 9.3695e-05, + "loss": 0.3897, + "step": 18749 + }, + { + "epoch": 1.049949602419084, + "grad_norm": 1.3035756349563599, + "learning_rate": 9.370000000000001e-05, + "loss": 0.3826, + "step": 18750 + }, + { + "epoch": 1.050005599731213, + "grad_norm": 1.3221250772476196, + "learning_rate": 9.370500000000001e-05, + "loss": 0.3784, + "step": 18751 + }, + { + "epoch": 1.050061597043342, + "grad_norm": 1.2354234457015991, + "learning_rate": 9.371000000000001e-05, + "loss": 0.3577, + "step": 18752 + }, + { + "epoch": 1.050117594355471, + "grad_norm": 1.1802183389663696, + "learning_rate": 9.3715e-05, + "loss": 0.4117, + "step": 18753 + }, + { + "epoch": 1.0501735916676, + "grad_norm": 1.2856926918029785, + "learning_rate": 9.372e-05, + "loss": 0.4059, + "step": 18754 + }, 
+ { + "epoch": 1.050229588979729, + "grad_norm": 1.5393387079238892, + "learning_rate": 9.3725e-05, + "loss": 0.3178, + "step": 18755 + }, + { + "epoch": 1.050285586291858, + "grad_norm": 1.3233131170272827, + "learning_rate": 9.373000000000001e-05, + "loss": 0.3248, + "step": 18756 + }, + { + "epoch": 1.050341583603987, + "grad_norm": 1.6761584281921387, + "learning_rate": 9.373500000000001e-05, + "loss": 0.4015, + "step": 18757 + }, + { + "epoch": 1.0503975809161161, + "grad_norm": 1.2219070196151733, + "learning_rate": 9.374000000000001e-05, + "loss": 0.6149, + "step": 18758 + }, + { + "epoch": 1.0504535782282451, + "grad_norm": 1.2063491344451904, + "learning_rate": 9.3745e-05, + "loss": 0.3681, + "step": 18759 + }, + { + "epoch": 1.0505095755403742, + "grad_norm": 1.8176213502883911, + "learning_rate": 9.375e-05, + "loss": 0.5237, + "step": 18760 + }, + { + "epoch": 1.0505655728525032, + "grad_norm": 1.5886811017990112, + "learning_rate": 9.3755e-05, + "loss": 0.461, + "step": 18761 + }, + { + "epoch": 1.0506215701646322, + "grad_norm": 1.32243812084198, + "learning_rate": 9.376e-05, + "loss": 0.4032, + "step": 18762 + }, + { + "epoch": 1.0506775674767612, + "grad_norm": 1.169313669204712, + "learning_rate": 9.376500000000001e-05, + "loss": 0.4166, + "step": 18763 + }, + { + "epoch": 1.0507335647888902, + "grad_norm": 1.393068790435791, + "learning_rate": 9.377e-05, + "loss": 0.4246, + "step": 18764 + }, + { + "epoch": 1.050789562101019, + "grad_norm": 1.259882926940918, + "learning_rate": 9.3775e-05, + "loss": 0.4142, + "step": 18765 + }, + { + "epoch": 1.050845559413148, + "grad_norm": 1.1903698444366455, + "learning_rate": 9.378e-05, + "loss": 0.3495, + "step": 18766 + }, + { + "epoch": 1.050901556725277, + "grad_norm": 1.4354101419448853, + "learning_rate": 9.3785e-05, + "loss": 0.3947, + "step": 18767 + }, + { + "epoch": 1.050957554037406, + "grad_norm": 1.1266363859176636, + "learning_rate": 9.379e-05, + "loss": 0.3524, + "step": 18768 + }, + { + 
"epoch": 1.0510135513495351, + "grad_norm": 1.3983913660049438, + "learning_rate": 9.3795e-05, + "loss": 0.4935, + "step": 18769 + }, + { + "epoch": 1.0510695486616641, + "grad_norm": 1.611491084098816, + "learning_rate": 9.38e-05, + "loss": 0.5854, + "step": 18770 + }, + { + "epoch": 1.0511255459737932, + "grad_norm": 1.1961493492126465, + "learning_rate": 9.380500000000001e-05, + "loss": 0.403, + "step": 18771 + }, + { + "epoch": 1.0511815432859222, + "grad_norm": 1.6286276578903198, + "learning_rate": 9.381000000000001e-05, + "loss": 0.3887, + "step": 18772 + }, + { + "epoch": 1.0512375405980512, + "grad_norm": 1.3481632471084595, + "learning_rate": 9.381500000000001e-05, + "loss": 0.3905, + "step": 18773 + }, + { + "epoch": 1.0512935379101802, + "grad_norm": 1.800219178199768, + "learning_rate": 9.382e-05, + "loss": 0.6068, + "step": 18774 + }, + { + "epoch": 1.0513495352223092, + "grad_norm": 1.3384664058685303, + "learning_rate": 9.3825e-05, + "loss": 0.3587, + "step": 18775 + }, + { + "epoch": 1.0514055325344382, + "grad_norm": 1.3007813692092896, + "learning_rate": 9.383e-05, + "loss": 0.3219, + "step": 18776 + }, + { + "epoch": 1.0514615298465673, + "grad_norm": 1.9404252767562866, + "learning_rate": 9.383500000000001e-05, + "loss": 0.535, + "step": 18777 + }, + { + "epoch": 1.0515175271586963, + "grad_norm": 1.2272975444793701, + "learning_rate": 9.384000000000001e-05, + "loss": 0.432, + "step": 18778 + }, + { + "epoch": 1.0515735244708253, + "grad_norm": 1.2873376607894897, + "learning_rate": 9.384500000000001e-05, + "loss": 0.3672, + "step": 18779 + }, + { + "epoch": 1.0516295217829543, + "grad_norm": 1.1425484418869019, + "learning_rate": 9.385e-05, + "loss": 0.3483, + "step": 18780 + }, + { + "epoch": 1.0516855190950833, + "grad_norm": 1.2005585432052612, + "learning_rate": 9.3855e-05, + "loss": 0.4503, + "step": 18781 + }, + { + "epoch": 1.0517415164072124, + "grad_norm": 1.5329253673553467, + "learning_rate": 9.386e-05, + "loss": 0.565, + "step": 
18782 + }, + { + "epoch": 1.0517975137193414, + "grad_norm": 1.3012638092041016, + "learning_rate": 9.3865e-05, + "loss": 0.4552, + "step": 18783 + }, + { + "epoch": 1.0518535110314704, + "grad_norm": 1.4215017557144165, + "learning_rate": 9.387000000000001e-05, + "loss": 0.385, + "step": 18784 + }, + { + "epoch": 1.0519095083435994, + "grad_norm": 1.4076894521713257, + "learning_rate": 9.3875e-05, + "loss": 0.4185, + "step": 18785 + }, + { + "epoch": 1.0519655056557284, + "grad_norm": 1.6196259260177612, + "learning_rate": 9.388e-05, + "loss": 0.4357, + "step": 18786 + }, + { + "epoch": 1.0520215029678575, + "grad_norm": 1.3829759359359741, + "learning_rate": 9.3885e-05, + "loss": 0.5274, + "step": 18787 + }, + { + "epoch": 1.0520775002799865, + "grad_norm": 1.3827085494995117, + "learning_rate": 9.389e-05, + "loss": 0.3359, + "step": 18788 + }, + { + "epoch": 1.0521334975921155, + "grad_norm": 1.3541009426116943, + "learning_rate": 9.3895e-05, + "loss": 0.3799, + "step": 18789 + }, + { + "epoch": 1.0521894949042445, + "grad_norm": 7.985986232757568, + "learning_rate": 9.39e-05, + "loss": 0.478, + "step": 18790 + }, + { + "epoch": 1.0522454922163735, + "grad_norm": 1.4385887384414673, + "learning_rate": 9.3905e-05, + "loss": 0.495, + "step": 18791 + }, + { + "epoch": 1.0523014895285026, + "grad_norm": 1.43264639377594, + "learning_rate": 9.391000000000001e-05, + "loss": 0.492, + "step": 18792 + }, + { + "epoch": 1.0523574868406316, + "grad_norm": 1.3586641550064087, + "learning_rate": 9.391500000000001e-05, + "loss": 0.4638, + "step": 18793 + }, + { + "epoch": 1.0524134841527606, + "grad_norm": 1.3236429691314697, + "learning_rate": 9.392000000000001e-05, + "loss": 0.4102, + "step": 18794 + }, + { + "epoch": 1.0524694814648896, + "grad_norm": 1.5813895463943481, + "learning_rate": 9.3925e-05, + "loss": 0.7058, + "step": 18795 + }, + { + "epoch": 1.0525254787770186, + "grad_norm": 1.8021186590194702, + "learning_rate": 9.393e-05, + "loss": 0.3773, + "step": 18796 + 
}, + { + "epoch": 1.0525814760891476, + "grad_norm": 1.561477780342102, + "learning_rate": 9.3935e-05, + "loss": 0.4236, + "step": 18797 + }, + { + "epoch": 1.0526374734012767, + "grad_norm": 1.3825465440750122, + "learning_rate": 9.394000000000001e-05, + "loss": 0.4629, + "step": 18798 + }, + { + "epoch": 1.0526934707134057, + "grad_norm": 2.319859504699707, + "learning_rate": 9.394500000000001e-05, + "loss": 0.3467, + "step": 18799 + }, + { + "epoch": 1.0527494680255347, + "grad_norm": 1.4497575759887695, + "learning_rate": 9.395000000000001e-05, + "loss": 0.3631, + "step": 18800 + }, + { + "epoch": 1.0528054653376637, + "grad_norm": 1.0492933988571167, + "learning_rate": 9.3955e-05, + "loss": 0.2956, + "step": 18801 + }, + { + "epoch": 1.0528614626497927, + "grad_norm": 1.265881061553955, + "learning_rate": 9.396e-05, + "loss": 0.463, + "step": 18802 + }, + { + "epoch": 1.0529174599619218, + "grad_norm": 1.474715232849121, + "learning_rate": 9.3965e-05, + "loss": 0.5241, + "step": 18803 + }, + { + "epoch": 1.0529734572740508, + "grad_norm": 1.3244526386260986, + "learning_rate": 9.397e-05, + "loss": 0.3758, + "step": 18804 + }, + { + "epoch": 1.0530294545861798, + "grad_norm": 1.293935775756836, + "learning_rate": 9.397500000000001e-05, + "loss": 0.3872, + "step": 18805 + }, + { + "epoch": 1.0530854518983088, + "grad_norm": 1.3484982252120972, + "learning_rate": 9.398e-05, + "loss": 0.4226, + "step": 18806 + }, + { + "epoch": 1.0531414492104378, + "grad_norm": 1.3209012746810913, + "learning_rate": 9.3985e-05, + "loss": 0.4594, + "step": 18807 + }, + { + "epoch": 1.0531974465225669, + "grad_norm": 1.4305565357208252, + "learning_rate": 9.399e-05, + "loss": 0.4233, + "step": 18808 + }, + { + "epoch": 1.0532534438346959, + "grad_norm": 1.4356799125671387, + "learning_rate": 9.3995e-05, + "loss": 0.4133, + "step": 18809 + }, + { + "epoch": 1.053309441146825, + "grad_norm": 1.3230232000350952, + "learning_rate": 9.4e-05, + "loss": 0.4562, + "step": 18810 + }, + { + 
"epoch": 1.053365438458954, + "grad_norm": 1.3221404552459717, + "learning_rate": 9.4005e-05, + "loss": 0.3312, + "step": 18811 + }, + { + "epoch": 1.053421435771083, + "grad_norm": 1.6068238019943237, + "learning_rate": 9.401e-05, + "loss": 0.5273, + "step": 18812 + }, + { + "epoch": 1.053477433083212, + "grad_norm": 1.308494210243225, + "learning_rate": 9.401500000000001e-05, + "loss": 0.5055, + "step": 18813 + }, + { + "epoch": 1.053533430395341, + "grad_norm": 1.3897660970687866, + "learning_rate": 9.402000000000001e-05, + "loss": 0.4352, + "step": 18814 + }, + { + "epoch": 1.05358942770747, + "grad_norm": 1.3756804466247559, + "learning_rate": 9.402500000000001e-05, + "loss": 0.5146, + "step": 18815 + }, + { + "epoch": 1.053645425019599, + "grad_norm": 1.3395336866378784, + "learning_rate": 9.403e-05, + "loss": 0.5114, + "step": 18816 + }, + { + "epoch": 1.053701422331728, + "grad_norm": 1.4863812923431396, + "learning_rate": 9.4035e-05, + "loss": 0.4387, + "step": 18817 + }, + { + "epoch": 1.053757419643857, + "grad_norm": 1.3136694431304932, + "learning_rate": 9.404e-05, + "loss": 0.5058, + "step": 18818 + }, + { + "epoch": 1.053813416955986, + "grad_norm": 1.463541865348816, + "learning_rate": 9.404500000000001e-05, + "loss": 0.372, + "step": 18819 + }, + { + "epoch": 1.053869414268115, + "grad_norm": 1.429172158241272, + "learning_rate": 9.405000000000001e-05, + "loss": 0.5003, + "step": 18820 + }, + { + "epoch": 1.053925411580244, + "grad_norm": 1.1957285404205322, + "learning_rate": 9.4055e-05, + "loss": 0.4128, + "step": 18821 + }, + { + "epoch": 1.0539814088923731, + "grad_norm": 1.3483213186264038, + "learning_rate": 9.406e-05, + "loss": 0.4783, + "step": 18822 + }, + { + "epoch": 1.0540374062045021, + "grad_norm": 1.742538332939148, + "learning_rate": 9.4065e-05, + "loss": 0.4073, + "step": 18823 + }, + { + "epoch": 1.0540934035166312, + "grad_norm": 1.7641878128051758, + "learning_rate": 9.407e-05, + "loss": 0.4633, + "step": 18824 + }, + { + 
"epoch": 1.0541494008287602, + "grad_norm": 1.4936950206756592, + "learning_rate": 9.4075e-05, + "loss": 0.5027, + "step": 18825 + }, + { + "epoch": 1.0542053981408892, + "grad_norm": 1.5363181829452515, + "learning_rate": 9.408000000000001e-05, + "loss": 0.3967, + "step": 18826 + }, + { + "epoch": 1.0542613954530182, + "grad_norm": 1.4092928171157837, + "learning_rate": 9.4085e-05, + "loss": 0.4312, + "step": 18827 + }, + { + "epoch": 1.0543173927651472, + "grad_norm": 1.1858136653900146, + "learning_rate": 9.409e-05, + "loss": 0.5055, + "step": 18828 + }, + { + "epoch": 1.0543733900772763, + "grad_norm": 1.7902591228485107, + "learning_rate": 9.4095e-05, + "loss": 0.479, + "step": 18829 + }, + { + "epoch": 1.0544293873894053, + "grad_norm": 1.6157180070877075, + "learning_rate": 9.41e-05, + "loss": 0.369, + "step": 18830 + }, + { + "epoch": 1.0544853847015343, + "grad_norm": 1.1800177097320557, + "learning_rate": 9.410500000000001e-05, + "loss": 0.332, + "step": 18831 + }, + { + "epoch": 1.0545413820136633, + "grad_norm": 1.1747642755508423, + "learning_rate": 9.411e-05, + "loss": 0.4682, + "step": 18832 + }, + { + "epoch": 1.0545973793257923, + "grad_norm": 1.514975905418396, + "learning_rate": 9.4115e-05, + "loss": 0.4393, + "step": 18833 + }, + { + "epoch": 1.0546533766379214, + "grad_norm": 1.2441729307174683, + "learning_rate": 9.412000000000001e-05, + "loss": 0.4483, + "step": 18834 + }, + { + "epoch": 1.0547093739500504, + "grad_norm": 1.2949442863464355, + "learning_rate": 9.412500000000001e-05, + "loss": 0.4996, + "step": 18835 + }, + { + "epoch": 1.0547653712621794, + "grad_norm": 1.818418264389038, + "learning_rate": 9.413000000000001e-05, + "loss": 0.5126, + "step": 18836 + }, + { + "epoch": 1.0548213685743084, + "grad_norm": 1.7463511228561401, + "learning_rate": 9.4135e-05, + "loss": 0.5482, + "step": 18837 + }, + { + "epoch": 1.0548773658864374, + "grad_norm": 1.280035376548767, + "learning_rate": 9.414e-05, + "loss": 0.4009, + "step": 18838 + }, + 
{ + "epoch": 1.0549333631985665, + "grad_norm": 1.5921680927276611, + "learning_rate": 9.4145e-05, + "loss": 0.6184, + "step": 18839 + }, + { + "epoch": 1.0549893605106955, + "grad_norm": 1.3982776403427124, + "learning_rate": 9.415e-05, + "loss": 0.3379, + "step": 18840 + }, + { + "epoch": 1.0550453578228245, + "grad_norm": 1.1702890396118164, + "learning_rate": 9.415500000000001e-05, + "loss": 0.3119, + "step": 18841 + }, + { + "epoch": 1.0551013551349535, + "grad_norm": 1.1738208532333374, + "learning_rate": 9.416e-05, + "loss": 0.3463, + "step": 18842 + }, + { + "epoch": 1.0551573524470825, + "grad_norm": 1.2477097511291504, + "learning_rate": 9.4165e-05, + "loss": 0.4123, + "step": 18843 + }, + { + "epoch": 1.0552133497592115, + "grad_norm": 1.3173540830612183, + "learning_rate": 9.417e-05, + "loss": 0.4082, + "step": 18844 + }, + { + "epoch": 1.0552693470713406, + "grad_norm": 1.4024620056152344, + "learning_rate": 9.4175e-05, + "loss": 0.498, + "step": 18845 + }, + { + "epoch": 1.0553253443834696, + "grad_norm": 1.6407623291015625, + "learning_rate": 9.418e-05, + "loss": 0.5054, + "step": 18846 + }, + { + "epoch": 1.0553813416955986, + "grad_norm": 1.3435959815979004, + "learning_rate": 9.418500000000001e-05, + "loss": 0.5827, + "step": 18847 + }, + { + "epoch": 1.0554373390077276, + "grad_norm": 1.2907919883728027, + "learning_rate": 9.419e-05, + "loss": 0.4695, + "step": 18848 + }, + { + "epoch": 1.0554933363198566, + "grad_norm": 1.2824134826660156, + "learning_rate": 9.4195e-05, + "loss": 0.4491, + "step": 18849 + }, + { + "epoch": 1.0555493336319857, + "grad_norm": 1.4175795316696167, + "learning_rate": 9.42e-05, + "loss": 0.4447, + "step": 18850 + }, + { + "epoch": 1.0556053309441147, + "grad_norm": 2.4582059383392334, + "learning_rate": 9.420500000000001e-05, + "loss": 0.4851, + "step": 18851 + }, + { + "epoch": 1.0556613282562437, + "grad_norm": 1.510203242301941, + "learning_rate": 9.421000000000001e-05, + "loss": 0.4234, + "step": 18852 + }, + { + 
"epoch": 1.0557173255683727, + "grad_norm": 1.3534705638885498, + "learning_rate": 9.4215e-05, + "loss": 0.4249, + "step": 18853 + }, + { + "epoch": 1.0557733228805017, + "grad_norm": 1.524954915046692, + "learning_rate": 9.422e-05, + "loss": 0.5413, + "step": 18854 + }, + { + "epoch": 1.0558293201926308, + "grad_norm": 1.2934017181396484, + "learning_rate": 9.422500000000001e-05, + "loss": 0.4422, + "step": 18855 + }, + { + "epoch": 1.0558853175047598, + "grad_norm": 1.3412871360778809, + "learning_rate": 9.423000000000001e-05, + "loss": 0.4235, + "step": 18856 + }, + { + "epoch": 1.0559413148168888, + "grad_norm": 1.5189380645751953, + "learning_rate": 9.423500000000001e-05, + "loss": 0.4656, + "step": 18857 + }, + { + "epoch": 1.0559973121290178, + "grad_norm": 1.3191028833389282, + "learning_rate": 9.424e-05, + "loss": 0.4504, + "step": 18858 + }, + { + "epoch": 1.0560533094411468, + "grad_norm": 1.3765827417373657, + "learning_rate": 9.4245e-05, + "loss": 0.4747, + "step": 18859 + }, + { + "epoch": 1.0561093067532759, + "grad_norm": 1.6227412223815918, + "learning_rate": 9.425e-05, + "loss": 0.488, + "step": 18860 + }, + { + "epoch": 1.0561653040654049, + "grad_norm": 2.1638755798339844, + "learning_rate": 9.4255e-05, + "loss": 0.4988, + "step": 18861 + }, + { + "epoch": 1.056221301377534, + "grad_norm": 1.5395900011062622, + "learning_rate": 9.426000000000001e-05, + "loss": 0.594, + "step": 18862 + }, + { + "epoch": 1.056277298689663, + "grad_norm": 1.609810709953308, + "learning_rate": 9.4265e-05, + "loss": 0.4513, + "step": 18863 + }, + { + "epoch": 1.056333296001792, + "grad_norm": 1.3589756488800049, + "learning_rate": 9.427e-05, + "loss": 0.4122, + "step": 18864 + }, + { + "epoch": 1.056389293313921, + "grad_norm": 1.2109688520431519, + "learning_rate": 9.4275e-05, + "loss": 0.3998, + "step": 18865 + }, + { + "epoch": 1.05644529062605, + "grad_norm": 1.3328644037246704, + "learning_rate": 9.428e-05, + "loss": 0.447, + "step": 18866 + }, + { + "epoch": 
1.056501287938179, + "grad_norm": 1.28719961643219, + "learning_rate": 9.4285e-05, + "loss": 0.4122, + "step": 18867 + }, + { + "epoch": 1.056557285250308, + "grad_norm": 1.3460605144500732, + "learning_rate": 9.429000000000001e-05, + "loss": 0.4158, + "step": 18868 + }, + { + "epoch": 1.056613282562437, + "grad_norm": 1.3511147499084473, + "learning_rate": 9.4295e-05, + "loss": 0.4722, + "step": 18869 + }, + { + "epoch": 1.056669279874566, + "grad_norm": 1.66860830783844, + "learning_rate": 9.43e-05, + "loss": 0.4749, + "step": 18870 + }, + { + "epoch": 1.056725277186695, + "grad_norm": 1.2137311697006226, + "learning_rate": 9.430500000000001e-05, + "loss": 0.3863, + "step": 18871 + }, + { + "epoch": 1.056781274498824, + "grad_norm": 1.441867709159851, + "learning_rate": 9.431000000000001e-05, + "loss": 0.5332, + "step": 18872 + }, + { + "epoch": 1.056837271810953, + "grad_norm": 1.3843780755996704, + "learning_rate": 9.431500000000001e-05, + "loss": 0.3801, + "step": 18873 + }, + { + "epoch": 1.0568932691230821, + "grad_norm": 1.5861767530441284, + "learning_rate": 9.432e-05, + "loss": 0.478, + "step": 18874 + }, + { + "epoch": 1.0569492664352111, + "grad_norm": 1.8049677610397339, + "learning_rate": 9.4325e-05, + "loss": 0.5031, + "step": 18875 + }, + { + "epoch": 1.0570052637473402, + "grad_norm": 1.302350640296936, + "learning_rate": 9.433000000000001e-05, + "loss": 0.5332, + "step": 18876 + }, + { + "epoch": 1.0570612610594692, + "grad_norm": 1.555769920349121, + "learning_rate": 9.433500000000001e-05, + "loss": 0.4586, + "step": 18877 + }, + { + "epoch": 1.0571172583715982, + "grad_norm": 1.334062933921814, + "learning_rate": 9.434000000000001e-05, + "loss": 0.4274, + "step": 18878 + }, + { + "epoch": 1.0571732556837272, + "grad_norm": 1.4169632196426392, + "learning_rate": 9.4345e-05, + "loss": 0.4329, + "step": 18879 + }, + { + "epoch": 1.0572292529958562, + "grad_norm": 1.3843140602111816, + "learning_rate": 9.435e-05, + "loss": 0.4196, + "step": 18880 + 
}, + { + "epoch": 1.0572852503079853, + "grad_norm": 1.5343679189682007, + "learning_rate": 9.4355e-05, + "loss": 0.434, + "step": 18881 + }, + { + "epoch": 1.0573412476201143, + "grad_norm": 1.3447123765945435, + "learning_rate": 9.436e-05, + "loss": 0.4374, + "step": 18882 + }, + { + "epoch": 1.0573972449322433, + "grad_norm": 1.2755141258239746, + "learning_rate": 9.436500000000001e-05, + "loss": 0.497, + "step": 18883 + }, + { + "epoch": 1.0574532422443723, + "grad_norm": 1.4663509130477905, + "learning_rate": 9.437e-05, + "loss": 0.449, + "step": 18884 + }, + { + "epoch": 1.0575092395565013, + "grad_norm": 1.2889498472213745, + "learning_rate": 9.4375e-05, + "loss": 0.4059, + "step": 18885 + }, + { + "epoch": 1.0575652368686304, + "grad_norm": 1.7589125633239746, + "learning_rate": 9.438e-05, + "loss": 0.3877, + "step": 18886 + }, + { + "epoch": 1.0576212341807594, + "grad_norm": 1.254963994026184, + "learning_rate": 9.4385e-05, + "loss": 0.3096, + "step": 18887 + }, + { + "epoch": 1.0576772314928884, + "grad_norm": 1.3054019212722778, + "learning_rate": 9.439e-05, + "loss": 0.3835, + "step": 18888 + }, + { + "epoch": 1.0577332288050174, + "grad_norm": 1.5005621910095215, + "learning_rate": 9.439500000000001e-05, + "loss": 0.4646, + "step": 18889 + }, + { + "epoch": 1.0577892261171464, + "grad_norm": 1.1724367141723633, + "learning_rate": 9.44e-05, + "loss": 0.4213, + "step": 18890 + }, + { + "epoch": 1.0578452234292754, + "grad_norm": 1.463555097579956, + "learning_rate": 9.4405e-05, + "loss": 0.4219, + "step": 18891 + }, + { + "epoch": 1.0579012207414045, + "grad_norm": 1.779472827911377, + "learning_rate": 9.441000000000001e-05, + "loss": 0.3379, + "step": 18892 + }, + { + "epoch": 1.0579572180535335, + "grad_norm": 1.2690653800964355, + "learning_rate": 9.441500000000001e-05, + "loss": 0.3729, + "step": 18893 + }, + { + "epoch": 1.0580132153656625, + "grad_norm": 1.3225834369659424, + "learning_rate": 9.442000000000001e-05, + "loss": 0.4082, + "step": 
18894 + }, + { + "epoch": 1.0580692126777915, + "grad_norm": 1.572891354560852, + "learning_rate": 9.4425e-05, + "loss": 0.3969, + "step": 18895 + }, + { + "epoch": 1.0581252099899205, + "grad_norm": 1.2776237726211548, + "learning_rate": 9.443e-05, + "loss": 0.4541, + "step": 18896 + }, + { + "epoch": 1.0581812073020496, + "grad_norm": 1.635740041732788, + "learning_rate": 9.443500000000001e-05, + "loss": 0.4054, + "step": 18897 + }, + { + "epoch": 1.0582372046141786, + "grad_norm": 1.3353666067123413, + "learning_rate": 9.444000000000001e-05, + "loss": 0.4354, + "step": 18898 + }, + { + "epoch": 1.0582932019263076, + "grad_norm": 1.283207654953003, + "learning_rate": 9.444500000000001e-05, + "loss": 0.4254, + "step": 18899 + }, + { + "epoch": 1.0583491992384366, + "grad_norm": 1.3528674840927124, + "learning_rate": 9.445e-05, + "loss": 0.3926, + "step": 18900 + }, + { + "epoch": 1.0584051965505656, + "grad_norm": 1.2042877674102783, + "learning_rate": 9.4455e-05, + "loss": 0.4369, + "step": 18901 + }, + { + "epoch": 1.0584611938626947, + "grad_norm": 2.829955816268921, + "learning_rate": 9.446e-05, + "loss": 0.4321, + "step": 18902 + }, + { + "epoch": 1.0585171911748237, + "grad_norm": 1.424607515335083, + "learning_rate": 9.4465e-05, + "loss": 0.3693, + "step": 18903 + }, + { + "epoch": 1.0585731884869527, + "grad_norm": 1.0847461223602295, + "learning_rate": 9.447000000000001e-05, + "loss": 0.3607, + "step": 18904 + }, + { + "epoch": 1.0586291857990817, + "grad_norm": 1.390052080154419, + "learning_rate": 9.4475e-05, + "loss": 0.4118, + "step": 18905 + }, + { + "epoch": 1.0586851831112107, + "grad_norm": 1.8173706531524658, + "learning_rate": 9.448e-05, + "loss": 0.5275, + "step": 18906 + }, + { + "epoch": 1.0587411804233398, + "grad_norm": 1.606350302696228, + "learning_rate": 9.4485e-05, + "loss": 0.3748, + "step": 18907 + }, + { + "epoch": 1.0587971777354688, + "grad_norm": 1.3614684343338013, + "learning_rate": 9.449e-05, + "loss": 0.384, + "step": 18908 + 
}, + { + "epoch": 1.0588531750475978, + "grad_norm": 1.325899600982666, + "learning_rate": 9.4495e-05, + "loss": 0.3822, + "step": 18909 + }, + { + "epoch": 1.0589091723597268, + "grad_norm": 1.5942211151123047, + "learning_rate": 9.449999999999999e-05, + "loss": 0.3482, + "step": 18910 + }, + { + "epoch": 1.0589651696718558, + "grad_norm": 1.237679362297058, + "learning_rate": 9.4505e-05, + "loss": 0.3954, + "step": 18911 + }, + { + "epoch": 1.0590211669839849, + "grad_norm": 1.4813698530197144, + "learning_rate": 9.451000000000002e-05, + "loss": 0.4544, + "step": 18912 + }, + { + "epoch": 1.0590771642961139, + "grad_norm": 1.36385977268219, + "learning_rate": 9.451500000000001e-05, + "loss": 0.4298, + "step": 18913 + }, + { + "epoch": 1.059133161608243, + "grad_norm": 1.14071786403656, + "learning_rate": 9.452000000000001e-05, + "loss": 0.4289, + "step": 18914 + }, + { + "epoch": 1.059189158920372, + "grad_norm": 1.6904393434524536, + "learning_rate": 9.452500000000001e-05, + "loss": 0.4813, + "step": 18915 + }, + { + "epoch": 1.059245156232501, + "grad_norm": 1.3985432386398315, + "learning_rate": 9.453e-05, + "loss": 0.3883, + "step": 18916 + }, + { + "epoch": 1.05930115354463, + "grad_norm": 1.5503175258636475, + "learning_rate": 9.4535e-05, + "loss": 0.3758, + "step": 18917 + }, + { + "epoch": 1.059357150856759, + "grad_norm": 1.3641659021377563, + "learning_rate": 9.454000000000001e-05, + "loss": 0.4202, + "step": 18918 + }, + { + "epoch": 1.059413148168888, + "grad_norm": 1.790743350982666, + "learning_rate": 9.454500000000001e-05, + "loss": 0.3753, + "step": 18919 + }, + { + "epoch": 1.059469145481017, + "grad_norm": 1.3057456016540527, + "learning_rate": 9.455000000000001e-05, + "loss": 0.4931, + "step": 18920 + }, + { + "epoch": 1.059525142793146, + "grad_norm": 1.4196068048477173, + "learning_rate": 9.4555e-05, + "loss": 0.399, + "step": 18921 + }, + { + "epoch": 1.059581140105275, + "grad_norm": 1.8194833993911743, + "learning_rate": 9.456e-05, + 
"loss": 0.4895, + "step": 18922 + }, + { + "epoch": 1.059637137417404, + "grad_norm": 1.3155800104141235, + "learning_rate": 9.4565e-05, + "loss": 0.4106, + "step": 18923 + }, + { + "epoch": 1.059693134729533, + "grad_norm": 1.8820222616195679, + "learning_rate": 9.457e-05, + "loss": 0.4736, + "step": 18924 + }, + { + "epoch": 1.059749132041662, + "grad_norm": 1.1942620277404785, + "learning_rate": 9.457500000000001e-05, + "loss": 0.3874, + "step": 18925 + }, + { + "epoch": 1.0598051293537911, + "grad_norm": 1.4331386089324951, + "learning_rate": 9.458e-05, + "loss": 0.5039, + "step": 18926 + }, + { + "epoch": 1.0598611266659201, + "grad_norm": 1.2756425142288208, + "learning_rate": 9.4585e-05, + "loss": 0.5379, + "step": 18927 + }, + { + "epoch": 1.0599171239780492, + "grad_norm": 1.3890256881713867, + "learning_rate": 9.459e-05, + "loss": 0.3719, + "step": 18928 + }, + { + "epoch": 1.0599731212901782, + "grad_norm": 1.3086320161819458, + "learning_rate": 9.4595e-05, + "loss": 0.5043, + "step": 18929 + }, + { + "epoch": 1.0600291186023072, + "grad_norm": 1.5591317415237427, + "learning_rate": 9.46e-05, + "loss": 0.3195, + "step": 18930 + }, + { + "epoch": 1.0600851159144362, + "grad_norm": 1.288242220878601, + "learning_rate": 9.460499999999999e-05, + "loss": 0.3377, + "step": 18931 + }, + { + "epoch": 1.0601411132265652, + "grad_norm": 1.243438720703125, + "learning_rate": 9.461e-05, + "loss": 0.3815, + "step": 18932 + }, + { + "epoch": 1.0601971105386943, + "grad_norm": 1.3783605098724365, + "learning_rate": 9.461500000000001e-05, + "loss": 0.3725, + "step": 18933 + }, + { + "epoch": 1.0602531078508233, + "grad_norm": 1.2204675674438477, + "learning_rate": 9.462000000000001e-05, + "loss": 0.4538, + "step": 18934 + }, + { + "epoch": 1.0603091051629523, + "grad_norm": 1.437117099761963, + "learning_rate": 9.462500000000001e-05, + "loss": 0.4731, + "step": 18935 + }, + { + "epoch": 1.060365102475081, + "grad_norm": 1.4627983570098877, + "learning_rate": 
9.463000000000001e-05, + "loss": 0.4642, + "step": 18936 + }, + { + "epoch": 1.06042109978721, + "grad_norm": 1.3718695640563965, + "learning_rate": 9.4635e-05, + "loss": 0.5397, + "step": 18937 + }, + { + "epoch": 1.0604770970993391, + "grad_norm": 1.5296236276626587, + "learning_rate": 9.464e-05, + "loss": 0.4356, + "step": 18938 + }, + { + "epoch": 1.0605330944114681, + "grad_norm": 1.440040111541748, + "learning_rate": 9.4645e-05, + "loss": 0.5243, + "step": 18939 + }, + { + "epoch": 1.0605890917235972, + "grad_norm": 21.09760284423828, + "learning_rate": 9.465000000000001e-05, + "loss": 0.3917, + "step": 18940 + }, + { + "epoch": 1.0606450890357262, + "grad_norm": 1.457410216331482, + "learning_rate": 9.465500000000001e-05, + "loss": 0.3693, + "step": 18941 + }, + { + "epoch": 1.0607010863478552, + "grad_norm": 1.5546989440917969, + "learning_rate": 9.466e-05, + "loss": 0.4964, + "step": 18942 + }, + { + "epoch": 1.0607570836599842, + "grad_norm": 1.2661175727844238, + "learning_rate": 9.4665e-05, + "loss": 0.4675, + "step": 18943 + }, + { + "epoch": 1.0608130809721132, + "grad_norm": 1.2626347541809082, + "learning_rate": 9.467e-05, + "loss": 0.3433, + "step": 18944 + }, + { + "epoch": 1.0608690782842423, + "grad_norm": 1.317460298538208, + "learning_rate": 9.4675e-05, + "loss": 0.35, + "step": 18945 + }, + { + "epoch": 1.0609250755963713, + "grad_norm": 1.5489500761032104, + "learning_rate": 9.468000000000001e-05, + "loss": 0.4688, + "step": 18946 + }, + { + "epoch": 1.0609810729085003, + "grad_norm": 1.44960618019104, + "learning_rate": 9.4685e-05, + "loss": 0.4104, + "step": 18947 + }, + { + "epoch": 1.0610370702206293, + "grad_norm": 1.649051308631897, + "learning_rate": 9.469e-05, + "loss": 0.3846, + "step": 18948 + }, + { + "epoch": 1.0610930675327583, + "grad_norm": 1.4297840595245361, + "learning_rate": 9.4695e-05, + "loss": 0.4522, + "step": 18949 + }, + { + "epoch": 1.0611490648448874, + "grad_norm": 1.7101448774337769, + "learning_rate": 9.47e-05, 
+ "loss": 0.4883, + "step": 18950 + }, + { + "epoch": 1.0612050621570164, + "grad_norm": 1.4615607261657715, + "learning_rate": 9.4705e-05, + "loss": 0.3946, + "step": 18951 + }, + { + "epoch": 1.0612610594691454, + "grad_norm": 1.315509557723999, + "learning_rate": 9.471e-05, + "loss": 0.491, + "step": 18952 + }, + { + "epoch": 1.0613170567812744, + "grad_norm": 1.490499496459961, + "learning_rate": 9.4715e-05, + "loss": 0.4898, + "step": 18953 + }, + { + "epoch": 1.0613730540934034, + "grad_norm": 1.748432993888855, + "learning_rate": 9.472000000000001e-05, + "loss": 0.5113, + "step": 18954 + }, + { + "epoch": 1.0614290514055325, + "grad_norm": 1.2763144969940186, + "learning_rate": 9.472500000000001e-05, + "loss": 0.4567, + "step": 18955 + }, + { + "epoch": 1.0614850487176615, + "grad_norm": 1.5092469453811646, + "learning_rate": 9.473000000000001e-05, + "loss": 0.4869, + "step": 18956 + }, + { + "epoch": 1.0615410460297905, + "grad_norm": 1.5785478353500366, + "learning_rate": 9.473500000000001e-05, + "loss": 0.5633, + "step": 18957 + }, + { + "epoch": 1.0615970433419195, + "grad_norm": 1.1675353050231934, + "learning_rate": 9.474e-05, + "loss": 0.42, + "step": 18958 + }, + { + "epoch": 1.0616530406540485, + "grad_norm": 1.4426264762878418, + "learning_rate": 9.4745e-05, + "loss": 0.5753, + "step": 18959 + }, + { + "epoch": 1.0617090379661775, + "grad_norm": 2.0994317531585693, + "learning_rate": 9.475e-05, + "loss": 0.4554, + "step": 18960 + }, + { + "epoch": 1.0617650352783066, + "grad_norm": 1.3798431158065796, + "learning_rate": 9.475500000000001e-05, + "loss": 0.3845, + "step": 18961 + }, + { + "epoch": 1.0618210325904356, + "grad_norm": 1.6509004831314087, + "learning_rate": 9.476000000000001e-05, + "loss": 0.5474, + "step": 18962 + }, + { + "epoch": 1.0618770299025646, + "grad_norm": 1.4672154188156128, + "learning_rate": 9.4765e-05, + "loss": 0.5159, + "step": 18963 + }, + { + "epoch": 1.0619330272146936, + "grad_norm": 1.2632747888565063, + 
"learning_rate": 9.477e-05, + "loss": 0.4353, + "step": 18964 + }, + { + "epoch": 1.0619890245268226, + "grad_norm": 1.3404821157455444, + "learning_rate": 9.4775e-05, + "loss": 0.3383, + "step": 18965 + }, + { + "epoch": 1.0620450218389517, + "grad_norm": 1.3777574300765991, + "learning_rate": 9.478e-05, + "loss": 0.5561, + "step": 18966 + }, + { + "epoch": 1.0621010191510807, + "grad_norm": 1.2554987668991089, + "learning_rate": 9.478500000000001e-05, + "loss": 0.3572, + "step": 18967 + }, + { + "epoch": 1.0621570164632097, + "grad_norm": 1.453681468963623, + "learning_rate": 9.479e-05, + "loss": 0.3301, + "step": 18968 + }, + { + "epoch": 1.0622130137753387, + "grad_norm": 1.1490294933319092, + "learning_rate": 9.4795e-05, + "loss": 0.3352, + "step": 18969 + }, + { + "epoch": 1.0622690110874677, + "grad_norm": 2.009647846221924, + "learning_rate": 9.48e-05, + "loss": 0.6631, + "step": 18970 + }, + { + "epoch": 1.0623250083995968, + "grad_norm": 1.4922118186950684, + "learning_rate": 9.4805e-05, + "loss": 0.5982, + "step": 18971 + }, + { + "epoch": 1.0623810057117258, + "grad_norm": 1.34416663646698, + "learning_rate": 9.481000000000001e-05, + "loss": 0.4867, + "step": 18972 + }, + { + "epoch": 1.0624370030238548, + "grad_norm": 1.2582811117172241, + "learning_rate": 9.4815e-05, + "loss": 0.3392, + "step": 18973 + }, + { + "epoch": 1.0624930003359838, + "grad_norm": 1.1551052331924438, + "learning_rate": 9.482e-05, + "loss": 0.3629, + "step": 18974 + }, + { + "epoch": 1.0625489976481128, + "grad_norm": 1.3015782833099365, + "learning_rate": 9.482500000000001e-05, + "loss": 0.3792, + "step": 18975 + }, + { + "epoch": 1.0626049949602419, + "grad_norm": 1.3310070037841797, + "learning_rate": 9.483000000000001e-05, + "loss": 0.3958, + "step": 18976 + }, + { + "epoch": 1.0626609922723709, + "grad_norm": 1.3582322597503662, + "learning_rate": 9.483500000000001e-05, + "loss": 0.3922, + "step": 18977 + }, + { + "epoch": 1.0627169895845, + "grad_norm": 1.480780005455017, 
+ "learning_rate": 9.484e-05, + "loss": 0.3405, + "step": 18978 + }, + { + "epoch": 1.062772986896629, + "grad_norm": 1.2930588722229004, + "learning_rate": 9.4845e-05, + "loss": 0.4571, + "step": 18979 + }, + { + "epoch": 1.062828984208758, + "grad_norm": 1.5896929502487183, + "learning_rate": 9.485e-05, + "loss": 0.5357, + "step": 18980 + }, + { + "epoch": 1.062884981520887, + "grad_norm": 1.4487299919128418, + "learning_rate": 9.4855e-05, + "loss": 0.5018, + "step": 18981 + }, + { + "epoch": 1.062940978833016, + "grad_norm": 1.6484873294830322, + "learning_rate": 9.486000000000001e-05, + "loss": 0.6373, + "step": 18982 + }, + { + "epoch": 1.062996976145145, + "grad_norm": 1.294859766960144, + "learning_rate": 9.486500000000001e-05, + "loss": 0.3981, + "step": 18983 + }, + { + "epoch": 1.063052973457274, + "grad_norm": 1.2828603982925415, + "learning_rate": 9.487e-05, + "loss": 0.5343, + "step": 18984 + }, + { + "epoch": 1.063108970769403, + "grad_norm": 1.579521656036377, + "learning_rate": 9.4875e-05, + "loss": 0.4794, + "step": 18985 + }, + { + "epoch": 1.063164968081532, + "grad_norm": 5.587820529937744, + "learning_rate": 9.488e-05, + "loss": 0.3993, + "step": 18986 + }, + { + "epoch": 1.063220965393661, + "grad_norm": 1.2712122201919556, + "learning_rate": 9.4885e-05, + "loss": 0.3959, + "step": 18987 + }, + { + "epoch": 1.06327696270579, + "grad_norm": 1.1356114149093628, + "learning_rate": 9.489e-05, + "loss": 0.4589, + "step": 18988 + }, + { + "epoch": 1.063332960017919, + "grad_norm": 2.2960779666900635, + "learning_rate": 9.4895e-05, + "loss": 0.2882, + "step": 18989 + }, + { + "epoch": 1.0633889573300481, + "grad_norm": 1.2106236219406128, + "learning_rate": 9.49e-05, + "loss": 0.3994, + "step": 18990 + }, + { + "epoch": 1.0634449546421771, + "grad_norm": 1.1267436742782593, + "learning_rate": 9.4905e-05, + "loss": 0.3675, + "step": 18991 + }, + { + "epoch": 1.0635009519543062, + "grad_norm": 1.2601475715637207, + "learning_rate": 
9.491000000000001e-05, + "loss": 0.3452, + "step": 18992 + }, + { + "epoch": 1.0635569492664352, + "grad_norm": 1.4580140113830566, + "learning_rate": 9.491500000000001e-05, + "loss": 0.4447, + "step": 18993 + }, + { + "epoch": 1.0636129465785642, + "grad_norm": 1.2755945920944214, + "learning_rate": 9.492e-05, + "loss": 0.4647, + "step": 18994 + }, + { + "epoch": 1.0636689438906932, + "grad_norm": 1.2138370275497437, + "learning_rate": 9.4925e-05, + "loss": 0.387, + "step": 18995 + }, + { + "epoch": 1.0637249412028222, + "grad_norm": 1.2418153285980225, + "learning_rate": 9.493000000000001e-05, + "loss": 0.3565, + "step": 18996 + }, + { + "epoch": 1.0637809385149513, + "grad_norm": 1.2355155944824219, + "learning_rate": 9.493500000000001e-05, + "loss": 0.4235, + "step": 18997 + }, + { + "epoch": 1.0638369358270803, + "grad_norm": 1.5285604000091553, + "learning_rate": 9.494000000000001e-05, + "loss": 0.3818, + "step": 18998 + }, + { + "epoch": 1.0638929331392093, + "grad_norm": 1.3427233695983887, + "learning_rate": 9.4945e-05, + "loss": 0.4629, + "step": 18999 + }, + { + "epoch": 1.0639489304513383, + "grad_norm": 1.4263298511505127, + "learning_rate": 9.495e-05, + "loss": 0.5107, + "step": 19000 + }, + { + "epoch": 1.0640049277634673, + "grad_norm": 1.3277918100357056, + "learning_rate": 9.4955e-05, + "loss": 0.4914, + "step": 19001 + }, + { + "epoch": 1.0640609250755964, + "grad_norm": 1.5316247940063477, + "learning_rate": 9.496e-05, + "loss": 0.5536, + "step": 19002 + }, + { + "epoch": 1.0641169223877254, + "grad_norm": 1.4550143480300903, + "learning_rate": 9.496500000000001e-05, + "loss": 0.3668, + "step": 19003 + }, + { + "epoch": 1.0641729196998544, + "grad_norm": 1.1481560468673706, + "learning_rate": 9.497000000000001e-05, + "loss": 0.3922, + "step": 19004 + }, + { + "epoch": 1.0642289170119834, + "grad_norm": 1.3690632581710815, + "learning_rate": 9.4975e-05, + "loss": 0.5181, + "step": 19005 + }, + { + "epoch": 1.0642849143241124, + "grad_norm": 
1.583183765411377, + "learning_rate": 9.498e-05, + "loss": 0.4482, + "step": 19006 + }, + { + "epoch": 1.0643409116362414, + "grad_norm": 1.3478230237960815, + "learning_rate": 9.4985e-05, + "loss": 0.3658, + "step": 19007 + }, + { + "epoch": 1.0643969089483705, + "grad_norm": 3.4860141277313232, + "learning_rate": 9.499e-05, + "loss": 0.4589, + "step": 19008 + }, + { + "epoch": 1.0644529062604995, + "grad_norm": 1.381873369216919, + "learning_rate": 9.4995e-05, + "loss": 0.4322, + "step": 19009 + }, + { + "epoch": 1.0645089035726285, + "grad_norm": 1.5568053722381592, + "learning_rate": 9.5e-05, + "loss": 0.3266, + "step": 19010 + }, + { + "epoch": 1.0645649008847575, + "grad_norm": 1.5077698230743408, + "learning_rate": 9.5005e-05, + "loss": 0.3767, + "step": 19011 + }, + { + "epoch": 1.0646208981968865, + "grad_norm": 1.1655912399291992, + "learning_rate": 9.501e-05, + "loss": 0.3115, + "step": 19012 + }, + { + "epoch": 1.0646768955090156, + "grad_norm": 1.681504487991333, + "learning_rate": 9.501500000000001e-05, + "loss": 0.4994, + "step": 19013 + }, + { + "epoch": 1.0647328928211446, + "grad_norm": 1.292641520500183, + "learning_rate": 9.502000000000001e-05, + "loss": 0.4135, + "step": 19014 + }, + { + "epoch": 1.0647888901332736, + "grad_norm": 1.6553596258163452, + "learning_rate": 9.5025e-05, + "loss": 0.4628, + "step": 19015 + }, + { + "epoch": 1.0648448874454026, + "grad_norm": 1.3552294969558716, + "learning_rate": 9.503e-05, + "loss": 0.4012, + "step": 19016 + }, + { + "epoch": 1.0649008847575316, + "grad_norm": 1.2220262289047241, + "learning_rate": 9.5035e-05, + "loss": 0.4394, + "step": 19017 + }, + { + "epoch": 1.0649568820696607, + "grad_norm": 1.2594716548919678, + "learning_rate": 9.504000000000001e-05, + "loss": 0.3832, + "step": 19018 + }, + { + "epoch": 1.0650128793817897, + "grad_norm": 1.3809083700180054, + "learning_rate": 9.504500000000001e-05, + "loss": 0.5147, + "step": 19019 + }, + { + "epoch": 1.0650688766939187, + "grad_norm": 
1.7652674913406372, + "learning_rate": 9.505e-05, + "loss": 0.3956, + "step": 19020 + }, + { + "epoch": 1.0651248740060477, + "grad_norm": 1.3767143487930298, + "learning_rate": 9.5055e-05, + "loss": 0.4599, + "step": 19021 + }, + { + "epoch": 1.0651808713181767, + "grad_norm": 1.350810170173645, + "learning_rate": 9.506e-05, + "loss": 0.4033, + "step": 19022 + }, + { + "epoch": 1.0652368686303058, + "grad_norm": 2.0616953372955322, + "learning_rate": 9.5065e-05, + "loss": 0.4707, + "step": 19023 + }, + { + "epoch": 1.0652928659424348, + "grad_norm": 1.2971410751342773, + "learning_rate": 9.507000000000001e-05, + "loss": 0.4084, + "step": 19024 + }, + { + "epoch": 1.0653488632545638, + "grad_norm": 1.3095693588256836, + "learning_rate": 9.507500000000001e-05, + "loss": 0.3893, + "step": 19025 + }, + { + "epoch": 1.0654048605666928, + "grad_norm": 1.9630202054977417, + "learning_rate": 9.508e-05, + "loss": 0.4432, + "step": 19026 + }, + { + "epoch": 1.0654608578788218, + "grad_norm": 1.2579559087753296, + "learning_rate": 9.5085e-05, + "loss": 0.5054, + "step": 19027 + }, + { + "epoch": 1.0655168551909509, + "grad_norm": 1.3713284730911255, + "learning_rate": 9.509e-05, + "loss": 0.4316, + "step": 19028 + }, + { + "epoch": 1.0655728525030799, + "grad_norm": 1.4795838594436646, + "learning_rate": 9.5095e-05, + "loss": 0.4322, + "step": 19029 + }, + { + "epoch": 1.065628849815209, + "grad_norm": 1.1935054063796997, + "learning_rate": 9.51e-05, + "loss": 0.3195, + "step": 19030 + }, + { + "epoch": 1.065684847127338, + "grad_norm": 1.5417760610580444, + "learning_rate": 9.5105e-05, + "loss": 0.4727, + "step": 19031 + }, + { + "epoch": 1.065740844439467, + "grad_norm": 1.6064913272857666, + "learning_rate": 9.511e-05, + "loss": 0.4643, + "step": 19032 + }, + { + "epoch": 1.065796841751596, + "grad_norm": 1.329726219177246, + "learning_rate": 9.511500000000001e-05, + "loss": 0.4171, + "step": 19033 + }, + { + "epoch": 1.065852839063725, + "grad_norm": 1.7679561376571655, 
+ "learning_rate": 9.512000000000001e-05, + "loss": 0.5562, + "step": 19034 + }, + { + "epoch": 1.065908836375854, + "grad_norm": 1.3980681896209717, + "learning_rate": 9.512500000000001e-05, + "loss": 0.5964, + "step": 19035 + }, + { + "epoch": 1.065964833687983, + "grad_norm": 1.4354372024536133, + "learning_rate": 9.513e-05, + "loss": 0.5044, + "step": 19036 + }, + { + "epoch": 1.066020831000112, + "grad_norm": 1.3081278800964355, + "learning_rate": 9.5135e-05, + "loss": 0.3527, + "step": 19037 + }, + { + "epoch": 1.066076828312241, + "grad_norm": 1.4948159456253052, + "learning_rate": 9.514e-05, + "loss": 0.4191, + "step": 19038 + }, + { + "epoch": 1.06613282562437, + "grad_norm": 1.452414631843567, + "learning_rate": 9.514500000000001e-05, + "loss": 0.5055, + "step": 19039 + }, + { + "epoch": 1.066188822936499, + "grad_norm": 1.4004466533660889, + "learning_rate": 9.515000000000001e-05, + "loss": 0.3828, + "step": 19040 + }, + { + "epoch": 1.066244820248628, + "grad_norm": 1.7150640487670898, + "learning_rate": 9.5155e-05, + "loss": 0.3618, + "step": 19041 + }, + { + "epoch": 1.0663008175607571, + "grad_norm": 1.6523611545562744, + "learning_rate": 9.516e-05, + "loss": 0.4781, + "step": 19042 + }, + { + "epoch": 1.0663568148728861, + "grad_norm": 1.2870184183120728, + "learning_rate": 9.5165e-05, + "loss": 0.4917, + "step": 19043 + }, + { + "epoch": 1.0664128121850152, + "grad_norm": 1.2794761657714844, + "learning_rate": 9.517e-05, + "loss": 0.5292, + "step": 19044 + }, + { + "epoch": 1.0664688094971442, + "grad_norm": 1.593009352684021, + "learning_rate": 9.517500000000001e-05, + "loss": 0.4965, + "step": 19045 + }, + { + "epoch": 1.0665248068092732, + "grad_norm": 1.3673381805419922, + "learning_rate": 9.518000000000001e-05, + "loss": 0.4164, + "step": 19046 + }, + { + "epoch": 1.0665808041214022, + "grad_norm": 2.2797043323516846, + "learning_rate": 9.5185e-05, + "loss": 0.49, + "step": 19047 + }, + { + "epoch": 1.0666368014335312, + "grad_norm": 
1.3565945625305176, + "learning_rate": 9.519e-05, + "loss": 0.464, + "step": 19048 + }, + { + "epoch": 1.0666927987456603, + "grad_norm": 1.221509575843811, + "learning_rate": 9.5195e-05, + "loss": 0.3296, + "step": 19049 + }, + { + "epoch": 1.0667487960577893, + "grad_norm": 1.562795639038086, + "learning_rate": 9.52e-05, + "loss": 0.3887, + "step": 19050 + }, + { + "epoch": 1.0668047933699183, + "grad_norm": 1.1983307600021362, + "learning_rate": 9.5205e-05, + "loss": 0.4013, + "step": 19051 + }, + { + "epoch": 1.0668607906820473, + "grad_norm": 1.505826473236084, + "learning_rate": 9.521e-05, + "loss": 0.4318, + "step": 19052 + }, + { + "epoch": 1.0669167879941763, + "grad_norm": 1.3908984661102295, + "learning_rate": 9.521500000000002e-05, + "loss": 0.4813, + "step": 19053 + }, + { + "epoch": 1.0669727853063053, + "grad_norm": 1.2964426279067993, + "learning_rate": 9.522000000000001e-05, + "loss": 0.3802, + "step": 19054 + }, + { + "epoch": 1.0670287826184344, + "grad_norm": 1.3526910543441772, + "learning_rate": 9.522500000000001e-05, + "loss": 0.4145, + "step": 19055 + }, + { + "epoch": 1.0670847799305634, + "grad_norm": 1.2798247337341309, + "learning_rate": 9.523000000000001e-05, + "loss": 0.4685, + "step": 19056 + }, + { + "epoch": 1.0671407772426924, + "grad_norm": 1.429029941558838, + "learning_rate": 9.5235e-05, + "loss": 0.382, + "step": 19057 + }, + { + "epoch": 1.0671967745548214, + "grad_norm": 1.4787951707839966, + "learning_rate": 9.524e-05, + "loss": 0.4634, + "step": 19058 + }, + { + "epoch": 1.0672527718669504, + "grad_norm": 1.306098461151123, + "learning_rate": 9.5245e-05, + "loss": 0.5, + "step": 19059 + }, + { + "epoch": 1.0673087691790795, + "grad_norm": 1.5572845935821533, + "learning_rate": 9.525000000000001e-05, + "loss": 0.4532, + "step": 19060 + }, + { + "epoch": 1.0673647664912085, + "grad_norm": 1.5002474784851074, + "learning_rate": 9.525500000000001e-05, + "loss": 0.4291, + "step": 19061 + }, + { + "epoch": 1.0674207638033375, + 
"grad_norm": 1.4859400987625122, + "learning_rate": 9.526e-05, + "loss": 0.4613, + "step": 19062 + }, + { + "epoch": 1.0674767611154665, + "grad_norm": 1.4809750318527222, + "learning_rate": 9.5265e-05, + "loss": 0.439, + "step": 19063 + }, + { + "epoch": 1.0675327584275955, + "grad_norm": 1.3810079097747803, + "learning_rate": 9.527e-05, + "loss": 0.3838, + "step": 19064 + }, + { + "epoch": 1.0675887557397246, + "grad_norm": 1.2954858541488647, + "learning_rate": 9.5275e-05, + "loss": 0.3757, + "step": 19065 + }, + { + "epoch": 1.0676447530518536, + "grad_norm": 1.359278917312622, + "learning_rate": 9.528000000000001e-05, + "loss": 0.4049, + "step": 19066 + }, + { + "epoch": 1.0677007503639826, + "grad_norm": 1.168003797531128, + "learning_rate": 9.5285e-05, + "loss": 0.392, + "step": 19067 + }, + { + "epoch": 1.0677567476761116, + "grad_norm": 1.284511923789978, + "learning_rate": 9.529e-05, + "loss": 0.3769, + "step": 19068 + }, + { + "epoch": 1.0678127449882406, + "grad_norm": 1.1729944944381714, + "learning_rate": 9.5295e-05, + "loss": 0.3494, + "step": 19069 + }, + { + "epoch": 1.0678687423003697, + "grad_norm": 1.1762340068817139, + "learning_rate": 9.53e-05, + "loss": 0.3331, + "step": 19070 + }, + { + "epoch": 1.0679247396124987, + "grad_norm": 1.6314793825149536, + "learning_rate": 9.5305e-05, + "loss": 0.5729, + "step": 19071 + }, + { + "epoch": 1.0679807369246277, + "grad_norm": 1.3847637176513672, + "learning_rate": 9.531e-05, + "loss": 0.4225, + "step": 19072 + }, + { + "epoch": 1.0680367342367567, + "grad_norm": 1.3760457038879395, + "learning_rate": 9.5315e-05, + "loss": 0.4248, + "step": 19073 + }, + { + "epoch": 1.0680927315488857, + "grad_norm": 1.4888118505477905, + "learning_rate": 9.532000000000002e-05, + "loss": 0.5156, + "step": 19074 + }, + { + "epoch": 1.0681487288610148, + "grad_norm": 1.3431413173675537, + "learning_rate": 9.532500000000001e-05, + "loss": 0.3934, + "step": 19075 + }, + { + "epoch": 1.0682047261731438, + "grad_norm": 
1.3805855512619019, + "learning_rate": 9.533000000000001e-05, + "loss": 0.5146, + "step": 19076 + }, + { + "epoch": 1.0682607234852728, + "grad_norm": 1.291238784790039, + "learning_rate": 9.533500000000001e-05, + "loss": 0.3844, + "step": 19077 + }, + { + "epoch": 1.0683167207974018, + "grad_norm": 1.8542561531066895, + "learning_rate": 9.534e-05, + "loss": 0.4608, + "step": 19078 + }, + { + "epoch": 1.0683727181095308, + "grad_norm": 1.3636932373046875, + "learning_rate": 9.5345e-05, + "loss": 0.4417, + "step": 19079 + }, + { + "epoch": 1.0684287154216598, + "grad_norm": 1.1776975393295288, + "learning_rate": 9.535e-05, + "loss": 0.3979, + "step": 19080 + }, + { + "epoch": 1.0684847127337889, + "grad_norm": 1.3916935920715332, + "learning_rate": 9.535500000000001e-05, + "loss": 0.4552, + "step": 19081 + }, + { + "epoch": 1.0685407100459179, + "grad_norm": 1.281245470046997, + "learning_rate": 9.536000000000001e-05, + "loss": 0.4894, + "step": 19082 + }, + { + "epoch": 1.068596707358047, + "grad_norm": 1.235292911529541, + "learning_rate": 9.5365e-05, + "loss": 0.5557, + "step": 19083 + }, + { + "epoch": 1.068652704670176, + "grad_norm": 1.126031756401062, + "learning_rate": 9.537e-05, + "loss": 0.3541, + "step": 19084 + }, + { + "epoch": 1.068708701982305, + "grad_norm": 1.4543650150299072, + "learning_rate": 9.5375e-05, + "loss": 0.3107, + "step": 19085 + }, + { + "epoch": 1.068764699294434, + "grad_norm": 1.264474868774414, + "learning_rate": 9.538e-05, + "loss": 0.5231, + "step": 19086 + }, + { + "epoch": 1.068820696606563, + "grad_norm": 1.4919121265411377, + "learning_rate": 9.5385e-05, + "loss": 0.4949, + "step": 19087 + }, + { + "epoch": 1.068876693918692, + "grad_norm": 1.2485785484313965, + "learning_rate": 9.539e-05, + "loss": 0.3303, + "step": 19088 + }, + { + "epoch": 1.068932691230821, + "grad_norm": 1.2648602724075317, + "learning_rate": 9.5395e-05, + "loss": 0.3607, + "step": 19089 + }, + { + "epoch": 1.06898868854295, + "grad_norm": 
1.0592108964920044, + "learning_rate": 9.54e-05, + "loss": 0.2496, + "step": 19090 + }, + { + "epoch": 1.069044685855079, + "grad_norm": 1.2441421747207642, + "learning_rate": 9.5405e-05, + "loss": 0.4313, + "step": 19091 + }, + { + "epoch": 1.069100683167208, + "grad_norm": 1.3437831401824951, + "learning_rate": 9.541e-05, + "loss": 0.4893, + "step": 19092 + }, + { + "epoch": 1.0691566804793369, + "grad_norm": 1.3966460227966309, + "learning_rate": 9.541500000000001e-05, + "loss": 0.4479, + "step": 19093 + }, + { + "epoch": 1.069212677791466, + "grad_norm": 1.3548359870910645, + "learning_rate": 9.542e-05, + "loss": 0.4455, + "step": 19094 + }, + { + "epoch": 1.069268675103595, + "grad_norm": 2.854562520980835, + "learning_rate": 9.542500000000002e-05, + "loss": 0.3788, + "step": 19095 + }, + { + "epoch": 1.069324672415724, + "grad_norm": 1.4204295873641968, + "learning_rate": 9.543000000000001e-05, + "loss": 0.5879, + "step": 19096 + }, + { + "epoch": 1.069380669727853, + "grad_norm": 1.1766201257705688, + "learning_rate": 9.543500000000001e-05, + "loss": 0.3723, + "step": 19097 + }, + { + "epoch": 1.069436667039982, + "grad_norm": 1.2578608989715576, + "learning_rate": 9.544000000000001e-05, + "loss": 0.4654, + "step": 19098 + }, + { + "epoch": 1.069492664352111, + "grad_norm": 1.2997828722000122, + "learning_rate": 9.5445e-05, + "loss": 0.478, + "step": 19099 + }, + { + "epoch": 1.06954866166424, + "grad_norm": 1.2996234893798828, + "learning_rate": 9.545e-05, + "loss": 0.5364, + "step": 19100 + }, + { + "epoch": 1.069604658976369, + "grad_norm": 1.490668535232544, + "learning_rate": 9.5455e-05, + "loss": 0.5021, + "step": 19101 + }, + { + "epoch": 1.069660656288498, + "grad_norm": 1.256386160850525, + "learning_rate": 9.546000000000001e-05, + "loss": 0.5062, + "step": 19102 + }, + { + "epoch": 1.069716653600627, + "grad_norm": 1.200835943222046, + "learning_rate": 9.546500000000001e-05, + "loss": 0.3749, + "step": 19103 + }, + { + "epoch": 1.069772650912756, + 
"grad_norm": 1.45115065574646, + "learning_rate": 9.547e-05, + "loss": 0.5045, + "step": 19104 + }, + { + "epoch": 1.069828648224885, + "grad_norm": 1.0986417531967163, + "learning_rate": 9.5475e-05, + "loss": 0.4245, + "step": 19105 + }, + { + "epoch": 1.0698846455370141, + "grad_norm": 1.5335986614227295, + "learning_rate": 9.548e-05, + "loss": 0.3657, + "step": 19106 + }, + { + "epoch": 1.0699406428491431, + "grad_norm": 1.6153874397277832, + "learning_rate": 9.5485e-05, + "loss": 0.386, + "step": 19107 + }, + { + "epoch": 1.0699966401612722, + "grad_norm": 1.272781252861023, + "learning_rate": 9.549e-05, + "loss": 0.4127, + "step": 19108 + }, + { + "epoch": 1.0700526374734012, + "grad_norm": 1.1290336847305298, + "learning_rate": 9.5495e-05, + "loss": 0.33, + "step": 19109 + }, + { + "epoch": 1.0701086347855302, + "grad_norm": 1.4628112316131592, + "learning_rate": 9.55e-05, + "loss": 0.3112, + "step": 19110 + }, + { + "epoch": 1.0701646320976592, + "grad_norm": 2.051401138305664, + "learning_rate": 9.5505e-05, + "loss": 0.4451, + "step": 19111 + }, + { + "epoch": 1.0702206294097882, + "grad_norm": 1.1548988819122314, + "learning_rate": 9.551e-05, + "loss": 0.3769, + "step": 19112 + }, + { + "epoch": 1.0702766267219173, + "grad_norm": 1.3833144903182983, + "learning_rate": 9.551500000000001e-05, + "loss": 0.3899, + "step": 19113 + }, + { + "epoch": 1.0703326240340463, + "grad_norm": 1.3498610258102417, + "learning_rate": 9.552000000000001e-05, + "loss": 0.4589, + "step": 19114 + }, + { + "epoch": 1.0703886213461753, + "grad_norm": 1.2717036008834839, + "learning_rate": 9.5525e-05, + "loss": 0.3871, + "step": 19115 + }, + { + "epoch": 1.0704446186583043, + "grad_norm": 1.3312551975250244, + "learning_rate": 9.553e-05, + "loss": 0.4529, + "step": 19116 + }, + { + "epoch": 1.0705006159704333, + "grad_norm": 1.3813267946243286, + "learning_rate": 9.553500000000001e-05, + "loss": 0.4138, + "step": 19117 + }, + { + "epoch": 1.0705566132825624, + "grad_norm": 
1.2773473262786865, + "learning_rate": 9.554000000000001e-05, + "loss": 0.4857, + "step": 19118 + }, + { + "epoch": 1.0706126105946914, + "grad_norm": 1.2579201459884644, + "learning_rate": 9.554500000000001e-05, + "loss": 0.4754, + "step": 19119 + }, + { + "epoch": 1.0706686079068204, + "grad_norm": 1.289850115776062, + "learning_rate": 9.555e-05, + "loss": 0.4207, + "step": 19120 + }, + { + "epoch": 1.0707246052189494, + "grad_norm": 1.1401543617248535, + "learning_rate": 9.5555e-05, + "loss": 0.4304, + "step": 19121 + }, + { + "epoch": 1.0707806025310784, + "grad_norm": 1.5424062013626099, + "learning_rate": 9.556e-05, + "loss": 0.4428, + "step": 19122 + }, + { + "epoch": 1.0708365998432074, + "grad_norm": 1.9160706996917725, + "learning_rate": 9.556500000000001e-05, + "loss": 0.6462, + "step": 19123 + }, + { + "epoch": 1.0708925971553365, + "grad_norm": 1.756384253501892, + "learning_rate": 9.557000000000001e-05, + "loss": 0.4831, + "step": 19124 + }, + { + "epoch": 1.0709485944674655, + "grad_norm": 1.5352463722229004, + "learning_rate": 9.5575e-05, + "loss": 0.4154, + "step": 19125 + }, + { + "epoch": 1.0710045917795945, + "grad_norm": 1.402227520942688, + "learning_rate": 9.558e-05, + "loss": 0.4776, + "step": 19126 + }, + { + "epoch": 1.0710605890917235, + "grad_norm": 1.381800889968872, + "learning_rate": 9.5585e-05, + "loss": 0.3569, + "step": 19127 + }, + { + "epoch": 1.0711165864038525, + "grad_norm": 1.2688425779342651, + "learning_rate": 9.559e-05, + "loss": 0.3758, + "step": 19128 + }, + { + "epoch": 1.0711725837159816, + "grad_norm": 1.3217486143112183, + "learning_rate": 9.5595e-05, + "loss": 0.5328, + "step": 19129 + }, + { + "epoch": 1.0712285810281106, + "grad_norm": 1.3088239431381226, + "learning_rate": 9.56e-05, + "loss": 0.4262, + "step": 19130 + }, + { + "epoch": 1.0712845783402396, + "grad_norm": 1.3540040254592896, + "learning_rate": 9.5605e-05, + "loss": 0.4453, + "step": 19131 + }, + { + "epoch": 1.0713405756523686, + "grad_norm": 
1.3389081954956055, + "learning_rate": 9.561e-05, + "loss": 0.3616, + "step": 19132 + }, + { + "epoch": 1.0713965729644976, + "grad_norm": 1.123694658279419, + "learning_rate": 9.561500000000001e-05, + "loss": 0.4807, + "step": 19133 + }, + { + "epoch": 1.0714525702766267, + "grad_norm": 1.4012280702590942, + "learning_rate": 9.562000000000001e-05, + "loss": 0.5071, + "step": 19134 + }, + { + "epoch": 1.0715085675887557, + "grad_norm": 1.4097554683685303, + "learning_rate": 9.562500000000001e-05, + "loss": 0.4201, + "step": 19135 + }, + { + "epoch": 1.0715645649008847, + "grad_norm": 1.218183994293213, + "learning_rate": 9.563e-05, + "loss": 0.3332, + "step": 19136 + }, + { + "epoch": 1.0716205622130137, + "grad_norm": 1.2299362421035767, + "learning_rate": 9.5635e-05, + "loss": 0.3997, + "step": 19137 + }, + { + "epoch": 1.0716765595251427, + "grad_norm": 1.709261178970337, + "learning_rate": 9.564000000000001e-05, + "loss": 0.459, + "step": 19138 + }, + { + "epoch": 1.0717325568372718, + "grad_norm": 1.3686734437942505, + "learning_rate": 9.564500000000001e-05, + "loss": 0.4295, + "step": 19139 + }, + { + "epoch": 1.0717885541494008, + "grad_norm": 1.5993621349334717, + "learning_rate": 9.565000000000001e-05, + "loss": 0.4597, + "step": 19140 + }, + { + "epoch": 1.0718445514615298, + "grad_norm": 1.6603903770446777, + "learning_rate": 9.5655e-05, + "loss": 0.5369, + "step": 19141 + }, + { + "epoch": 1.0719005487736588, + "grad_norm": 1.1640290021896362, + "learning_rate": 9.566e-05, + "loss": 0.4805, + "step": 19142 + }, + { + "epoch": 1.0719565460857878, + "grad_norm": 1.2247138023376465, + "learning_rate": 9.5665e-05, + "loss": 0.3195, + "step": 19143 + }, + { + "epoch": 1.0720125433979169, + "grad_norm": 1.2630603313446045, + "learning_rate": 9.567000000000001e-05, + "loss": 0.3868, + "step": 19144 + }, + { + "epoch": 1.0720685407100459, + "grad_norm": 1.3521305322647095, + "learning_rate": 9.567500000000001e-05, + "loss": 0.4709, + "step": 19145 + }, + { + 
"epoch": 1.0721245380221749, + "grad_norm": 1.1388635635375977, + "learning_rate": 9.568e-05, + "loss": 0.2951, + "step": 19146 + }, + { + "epoch": 1.072180535334304, + "grad_norm": 1.536655306816101, + "learning_rate": 9.5685e-05, + "loss": 0.3331, + "step": 19147 + }, + { + "epoch": 1.072236532646433, + "grad_norm": 1.6921892166137695, + "learning_rate": 9.569e-05, + "loss": 0.4127, + "step": 19148 + }, + { + "epoch": 1.072292529958562, + "grad_norm": 1.5893220901489258, + "learning_rate": 9.5695e-05, + "loss": 0.396, + "step": 19149 + }, + { + "epoch": 1.072348527270691, + "grad_norm": 2.503692865371704, + "learning_rate": 9.57e-05, + "loss": 0.4211, + "step": 19150 + }, + { + "epoch": 1.07240452458282, + "grad_norm": 1.5990415811538696, + "learning_rate": 9.5705e-05, + "loss": 0.491, + "step": 19151 + }, + { + "epoch": 1.072460521894949, + "grad_norm": 1.5531582832336426, + "learning_rate": 9.571e-05, + "loss": 0.5286, + "step": 19152 + }, + { + "epoch": 1.072516519207078, + "grad_norm": 1.3615837097167969, + "learning_rate": 9.5715e-05, + "loss": 0.436, + "step": 19153 + }, + { + "epoch": 1.072572516519207, + "grad_norm": 1.5618793964385986, + "learning_rate": 9.572000000000001e-05, + "loss": 0.4309, + "step": 19154 + }, + { + "epoch": 1.072628513831336, + "grad_norm": 1.3727819919586182, + "learning_rate": 9.572500000000001e-05, + "loss": 0.5035, + "step": 19155 + }, + { + "epoch": 1.072684511143465, + "grad_norm": 1.286844253540039, + "learning_rate": 9.573e-05, + "loss": 0.3753, + "step": 19156 + }, + { + "epoch": 1.072740508455594, + "grad_norm": 1.4722813367843628, + "learning_rate": 9.5735e-05, + "loss": 0.3495, + "step": 19157 + }, + { + "epoch": 1.0727965057677231, + "grad_norm": 1.631972074508667, + "learning_rate": 9.574e-05, + "loss": 0.4748, + "step": 19158 + }, + { + "epoch": 1.0728525030798521, + "grad_norm": 1.638663411140442, + "learning_rate": 9.574500000000001e-05, + "loss": 0.4136, + "step": 19159 + }, + { + "epoch": 1.0729085003919812, + 
"grad_norm": 1.4947776794433594, + "learning_rate": 9.575000000000001e-05, + "loss": 0.5904, + "step": 19160 + }, + { + "epoch": 1.0729644977041102, + "grad_norm": 2.0101702213287354, + "learning_rate": 9.575500000000001e-05, + "loss": 0.5917, + "step": 19161 + }, + { + "epoch": 1.0730204950162392, + "grad_norm": 1.1375631093978882, + "learning_rate": 9.576e-05, + "loss": 0.3687, + "step": 19162 + }, + { + "epoch": 1.0730764923283682, + "grad_norm": 1.2634828090667725, + "learning_rate": 9.5765e-05, + "loss": 0.3762, + "step": 19163 + }, + { + "epoch": 1.0731324896404972, + "grad_norm": 1.2566579580307007, + "learning_rate": 9.577e-05, + "loss": 0.4033, + "step": 19164 + }, + { + "epoch": 1.0731884869526263, + "grad_norm": 1.6312873363494873, + "learning_rate": 9.5775e-05, + "loss": 0.5984, + "step": 19165 + }, + { + "epoch": 1.0732444842647553, + "grad_norm": 1.2147444486618042, + "learning_rate": 9.578000000000001e-05, + "loss": 0.4717, + "step": 19166 + }, + { + "epoch": 1.0733004815768843, + "grad_norm": 1.5377756357192993, + "learning_rate": 9.5785e-05, + "loss": 0.4054, + "step": 19167 + }, + { + "epoch": 1.0733564788890133, + "grad_norm": 1.4068466424942017, + "learning_rate": 9.579e-05, + "loss": 0.3693, + "step": 19168 + }, + { + "epoch": 1.0734124762011423, + "grad_norm": 1.3765770196914673, + "learning_rate": 9.5795e-05, + "loss": 0.5127, + "step": 19169 + }, + { + "epoch": 1.0734684735132713, + "grad_norm": 1.680324673652649, + "learning_rate": 9.58e-05, + "loss": 0.5666, + "step": 19170 + }, + { + "epoch": 1.0735244708254004, + "grad_norm": 1.2332838773727417, + "learning_rate": 9.5805e-05, + "loss": 0.4255, + "step": 19171 + }, + { + "epoch": 1.0735804681375294, + "grad_norm": 1.4358876943588257, + "learning_rate": 9.581e-05, + "loss": 0.3233, + "step": 19172 + }, + { + "epoch": 1.0736364654496584, + "grad_norm": 1.2500696182250977, + "learning_rate": 9.5815e-05, + "loss": 0.4533, + "step": 19173 + }, + { + "epoch": 1.0736924627617874, + "grad_norm": 
1.4486896991729736, + "learning_rate": 9.582000000000001e-05, + "loss": 0.4665, + "step": 19174 + }, + { + "epoch": 1.0737484600739164, + "grad_norm": 1.3838056325912476, + "learning_rate": 9.582500000000001e-05, + "loss": 0.4326, + "step": 19175 + }, + { + "epoch": 1.0738044573860455, + "grad_norm": 1.5006517171859741, + "learning_rate": 9.583000000000001e-05, + "loss": 0.5085, + "step": 19176 + }, + { + "epoch": 1.0738604546981745, + "grad_norm": 1.25216543674469, + "learning_rate": 9.5835e-05, + "loss": 0.4017, + "step": 19177 + }, + { + "epoch": 1.0739164520103035, + "grad_norm": 1.4105465412139893, + "learning_rate": 9.584e-05, + "loss": 0.482, + "step": 19178 + }, + { + "epoch": 1.0739724493224325, + "grad_norm": 1.3745070695877075, + "learning_rate": 9.5845e-05, + "loss": 0.4596, + "step": 19179 + }, + { + "epoch": 1.0740284466345615, + "grad_norm": 1.3403490781784058, + "learning_rate": 9.585000000000001e-05, + "loss": 0.4411, + "step": 19180 + }, + { + "epoch": 1.0740844439466906, + "grad_norm": 1.0515056848526, + "learning_rate": 9.585500000000001e-05, + "loss": 0.3387, + "step": 19181 + }, + { + "epoch": 1.0741404412588196, + "grad_norm": 1.3254003524780273, + "learning_rate": 9.586000000000001e-05, + "loss": 0.3439, + "step": 19182 + }, + { + "epoch": 1.0741964385709486, + "grad_norm": 1.3216105699539185, + "learning_rate": 9.5865e-05, + "loss": 0.3528, + "step": 19183 + }, + { + "epoch": 1.0742524358830776, + "grad_norm": 1.2448352575302124, + "learning_rate": 9.587e-05, + "loss": 0.3025, + "step": 19184 + }, + { + "epoch": 1.0743084331952066, + "grad_norm": 1.499056339263916, + "learning_rate": 9.5875e-05, + "loss": 0.4477, + "step": 19185 + }, + { + "epoch": 1.0743644305073357, + "grad_norm": 1.3132444620132446, + "learning_rate": 9.588e-05, + "loss": 0.5061, + "step": 19186 + }, + { + "epoch": 1.0744204278194647, + "grad_norm": 1.2185949087142944, + "learning_rate": 9.588500000000001e-05, + "loss": 0.2891, + "step": 19187 + }, + { + "epoch": 
1.0744764251315937, + "grad_norm": 1.2195074558258057, + "learning_rate": 9.589e-05, + "loss": 0.326, + "step": 19188 + }, + { + "epoch": 1.0745324224437227, + "grad_norm": 1.6875252723693848, + "learning_rate": 9.5895e-05, + "loss": 0.5185, + "step": 19189 + }, + { + "epoch": 1.0745884197558517, + "grad_norm": 1.3300776481628418, + "learning_rate": 9.59e-05, + "loss": 0.3762, + "step": 19190 + }, + { + "epoch": 1.0746444170679808, + "grad_norm": 1.4575303792953491, + "learning_rate": 9.5905e-05, + "loss": 0.4495, + "step": 19191 + }, + { + "epoch": 1.0747004143801098, + "grad_norm": 1.3437873125076294, + "learning_rate": 9.591e-05, + "loss": 0.5067, + "step": 19192 + }, + { + "epoch": 1.0747564116922388, + "grad_norm": 1.2452030181884766, + "learning_rate": 9.5915e-05, + "loss": 0.3846, + "step": 19193 + }, + { + "epoch": 1.0748124090043678, + "grad_norm": 1.2425785064697266, + "learning_rate": 9.592e-05, + "loss": 0.3747, + "step": 19194 + }, + { + "epoch": 1.0748684063164968, + "grad_norm": 1.5638777017593384, + "learning_rate": 9.592500000000001e-05, + "loss": 0.4093, + "step": 19195 + }, + { + "epoch": 1.0749244036286258, + "grad_norm": 1.5321054458618164, + "learning_rate": 9.593000000000001e-05, + "loss": 0.4748, + "step": 19196 + }, + { + "epoch": 1.0749804009407549, + "grad_norm": 1.5617907047271729, + "learning_rate": 9.593500000000001e-05, + "loss": 0.4677, + "step": 19197 + }, + { + "epoch": 1.0750363982528839, + "grad_norm": 2.0262441635131836, + "learning_rate": 9.594e-05, + "loss": 0.6669, + "step": 19198 + }, + { + "epoch": 1.075092395565013, + "grad_norm": 1.2591593265533447, + "learning_rate": 9.5945e-05, + "loss": 0.5268, + "step": 19199 + }, + { + "epoch": 1.075148392877142, + "grad_norm": 1.3188189268112183, + "learning_rate": 9.595e-05, + "loss": 0.4543, + "step": 19200 + }, + { + "epoch": 1.075204390189271, + "grad_norm": 1.450954794883728, + "learning_rate": 9.595500000000001e-05, + "loss": 0.4581, + "step": 19201 + }, + { + "epoch": 
1.0752603875014, + "grad_norm": 1.6723936796188354, + "learning_rate": 9.596000000000001e-05, + "loss": 0.5799, + "step": 19202 + }, + { + "epoch": 1.075316384813529, + "grad_norm": 1.4126452207565308, + "learning_rate": 9.596500000000001e-05, + "loss": 0.4893, + "step": 19203 + }, + { + "epoch": 1.075372382125658, + "grad_norm": 1.315180778503418, + "learning_rate": 9.597e-05, + "loss": 0.4201, + "step": 19204 + }, + { + "epoch": 1.075428379437787, + "grad_norm": 1.350109577178955, + "learning_rate": 9.5975e-05, + "loss": 0.4438, + "step": 19205 + }, + { + "epoch": 1.075484376749916, + "grad_norm": 1.246303677558899, + "learning_rate": 9.598e-05, + "loss": 0.3991, + "step": 19206 + }, + { + "epoch": 1.075540374062045, + "grad_norm": 1.2813001871109009, + "learning_rate": 9.5985e-05, + "loss": 0.42, + "step": 19207 + }, + { + "epoch": 1.075596371374174, + "grad_norm": 1.320867657661438, + "learning_rate": 9.599000000000001e-05, + "loss": 0.4516, + "step": 19208 + }, + { + "epoch": 1.075652368686303, + "grad_norm": 1.4397691488265991, + "learning_rate": 9.5995e-05, + "loss": 0.4549, + "step": 19209 + }, + { + "epoch": 1.0757083659984321, + "grad_norm": 1.1421473026275635, + "learning_rate": 9.6e-05, + "loss": 0.3526, + "step": 19210 + }, + { + "epoch": 1.0757643633105611, + "grad_norm": 1.3536735773086548, + "learning_rate": 9.6005e-05, + "loss": 0.3772, + "step": 19211 + }, + { + "epoch": 1.0758203606226902, + "grad_norm": 1.417104959487915, + "learning_rate": 9.601e-05, + "loss": 0.4627, + "step": 19212 + }, + { + "epoch": 1.0758763579348192, + "grad_norm": 1.2568718194961548, + "learning_rate": 9.6015e-05, + "loss": 0.3361, + "step": 19213 + }, + { + "epoch": 1.0759323552469482, + "grad_norm": 1.1949340105056763, + "learning_rate": 9.602e-05, + "loss": 0.4164, + "step": 19214 + }, + { + "epoch": 1.0759883525590772, + "grad_norm": 1.399598479270935, + "learning_rate": 9.6025e-05, + "loss": 0.4094, + "step": 19215 + }, + { + "epoch": 1.0760443498712062, + 
"grad_norm": 1.5406945943832397, + "learning_rate": 9.603000000000001e-05, + "loss": 0.4701, + "step": 19216 + }, + { + "epoch": 1.0761003471833352, + "grad_norm": 1.3081169128417969, + "learning_rate": 9.603500000000001e-05, + "loss": 0.4717, + "step": 19217 + }, + { + "epoch": 1.0761563444954643, + "grad_norm": 1.2680391073226929, + "learning_rate": 9.604000000000001e-05, + "loss": 0.4676, + "step": 19218 + }, + { + "epoch": 1.0762123418075933, + "grad_norm": 1.303810954093933, + "learning_rate": 9.6045e-05, + "loss": 0.41, + "step": 19219 + }, + { + "epoch": 1.0762683391197223, + "grad_norm": 1.2404943704605103, + "learning_rate": 9.605e-05, + "loss": 0.445, + "step": 19220 + }, + { + "epoch": 1.0763243364318513, + "grad_norm": 1.246006727218628, + "learning_rate": 9.6055e-05, + "loss": 0.458, + "step": 19221 + }, + { + "epoch": 1.0763803337439803, + "grad_norm": 1.2949378490447998, + "learning_rate": 9.606000000000001e-05, + "loss": 0.4365, + "step": 19222 + }, + { + "epoch": 1.0764363310561094, + "grad_norm": 1.7878241539001465, + "learning_rate": 9.606500000000001e-05, + "loss": 0.4829, + "step": 19223 + }, + { + "epoch": 1.0764923283682384, + "grad_norm": 1.2644611597061157, + "learning_rate": 9.607000000000001e-05, + "loss": 0.455, + "step": 19224 + }, + { + "epoch": 1.0765483256803674, + "grad_norm": 1.3939383029937744, + "learning_rate": 9.6075e-05, + "loss": 0.4113, + "step": 19225 + }, + { + "epoch": 1.0766043229924964, + "grad_norm": 1.207818627357483, + "learning_rate": 9.608e-05, + "loss": 0.3622, + "step": 19226 + }, + { + "epoch": 1.0766603203046254, + "grad_norm": 1.1098518371582031, + "learning_rate": 9.6085e-05, + "loss": 0.2977, + "step": 19227 + }, + { + "epoch": 1.0767163176167545, + "grad_norm": 1.2676883935928345, + "learning_rate": 9.609e-05, + "loss": 0.3818, + "step": 19228 + }, + { + "epoch": 1.0767723149288835, + "grad_norm": 1.369973063468933, + "learning_rate": 9.609500000000001e-05, + "loss": 0.4579, + "step": 19229 + }, + { + 
"epoch": 1.0768283122410125, + "grad_norm": 1.4226983785629272, + "learning_rate": 9.61e-05, + "loss": 0.3708, + "step": 19230 + }, + { + "epoch": 1.0768843095531415, + "grad_norm": 1.5574685335159302, + "learning_rate": 9.6105e-05, + "loss": 0.5837, + "step": 19231 + }, + { + "epoch": 1.0769403068652705, + "grad_norm": 1.4205504655838013, + "learning_rate": 9.611e-05, + "loss": 0.5139, + "step": 19232 + }, + { + "epoch": 1.0769963041773996, + "grad_norm": 1.4705474376678467, + "learning_rate": 9.6115e-05, + "loss": 0.5394, + "step": 19233 + }, + { + "epoch": 1.0770523014895286, + "grad_norm": 1.4634571075439453, + "learning_rate": 9.612000000000001e-05, + "loss": 0.448, + "step": 19234 + }, + { + "epoch": 1.0771082988016576, + "grad_norm": 1.6852033138275146, + "learning_rate": 9.6125e-05, + "loss": 0.5067, + "step": 19235 + }, + { + "epoch": 1.0771642961137866, + "grad_norm": 1.2660261392593384, + "learning_rate": 9.613e-05, + "loss": 0.3121, + "step": 19236 + }, + { + "epoch": 1.0772202934259156, + "grad_norm": 1.2085484266281128, + "learning_rate": 9.613500000000001e-05, + "loss": 0.4393, + "step": 19237 + }, + { + "epoch": 1.0772762907380447, + "grad_norm": 1.3156460523605347, + "learning_rate": 9.614000000000001e-05, + "loss": 0.5263, + "step": 19238 + }, + { + "epoch": 1.0773322880501737, + "grad_norm": 1.5212336778640747, + "learning_rate": 9.614500000000001e-05, + "loss": 0.4745, + "step": 19239 + }, + { + "epoch": 1.0773882853623027, + "grad_norm": 1.2039270401000977, + "learning_rate": 9.615e-05, + "loss": 0.3301, + "step": 19240 + }, + { + "epoch": 1.0774442826744317, + "grad_norm": 1.3416954278945923, + "learning_rate": 9.6155e-05, + "loss": 0.2978, + "step": 19241 + }, + { + "epoch": 1.0775002799865607, + "grad_norm": 1.633178472518921, + "learning_rate": 9.616e-05, + "loss": 0.3377, + "step": 19242 + }, + { + "epoch": 1.0775562772986897, + "grad_norm": 1.3483723402023315, + "learning_rate": 9.616500000000001e-05, + "loss": 0.4082, + "step": 19243 + 
}, + { + "epoch": 1.0776122746108188, + "grad_norm": 1.492039680480957, + "learning_rate": 9.617000000000001e-05, + "loss": 0.5695, + "step": 19244 + }, + { + "epoch": 1.0776682719229478, + "grad_norm": 1.0585488080978394, + "learning_rate": 9.6175e-05, + "loss": 0.3845, + "step": 19245 + }, + { + "epoch": 1.0777242692350768, + "grad_norm": 1.646497368812561, + "learning_rate": 9.618e-05, + "loss": 0.4469, + "step": 19246 + }, + { + "epoch": 1.0777802665472058, + "grad_norm": 1.4915268421173096, + "learning_rate": 9.6185e-05, + "loss": 0.4839, + "step": 19247 + }, + { + "epoch": 1.0778362638593348, + "grad_norm": 1.1967825889587402, + "learning_rate": 9.619e-05, + "loss": 0.3524, + "step": 19248 + }, + { + "epoch": 1.0778922611714639, + "grad_norm": 1.4445247650146484, + "learning_rate": 9.6195e-05, + "loss": 0.3228, + "step": 19249 + }, + { + "epoch": 1.0779482584835929, + "grad_norm": 1.1791483163833618, + "learning_rate": 9.620000000000001e-05, + "loss": 0.3623, + "step": 19250 + }, + { + "epoch": 1.078004255795722, + "grad_norm": 1.2073333263397217, + "learning_rate": 9.6205e-05, + "loss": 0.3775, + "step": 19251 + }, + { + "epoch": 1.078060253107851, + "grad_norm": 1.238425612449646, + "learning_rate": 9.621e-05, + "loss": 0.3242, + "step": 19252 + }, + { + "epoch": 1.07811625041998, + "grad_norm": 1.1087088584899902, + "learning_rate": 9.6215e-05, + "loss": 0.3238, + "step": 19253 + }, + { + "epoch": 1.078172247732109, + "grad_norm": 1.262492060661316, + "learning_rate": 9.622000000000001e-05, + "loss": 0.3786, + "step": 19254 + }, + { + "epoch": 1.078228245044238, + "grad_norm": 1.1630018949508667, + "learning_rate": 9.622500000000001e-05, + "loss": 0.342, + "step": 19255 + }, + { + "epoch": 1.078284242356367, + "grad_norm": 1.9120094776153564, + "learning_rate": 9.623e-05, + "loss": 0.6781, + "step": 19256 + }, + { + "epoch": 1.078340239668496, + "grad_norm": 1.4458720684051514, + "learning_rate": 9.6235e-05, + "loss": 0.4242, + "step": 19257 + }, + { + 
"epoch": 1.078396236980625, + "grad_norm": 1.3422973155975342, + "learning_rate": 9.624000000000001e-05, + "loss": 0.3918, + "step": 19258 + }, + { + "epoch": 1.078452234292754, + "grad_norm": 1.4883275032043457, + "learning_rate": 9.624500000000001e-05, + "loss": 0.6011, + "step": 19259 + }, + { + "epoch": 1.078508231604883, + "grad_norm": 1.652848720550537, + "learning_rate": 9.625000000000001e-05, + "loss": 0.4489, + "step": 19260 + }, + { + "epoch": 1.078564228917012, + "grad_norm": 1.555734395980835, + "learning_rate": 9.6255e-05, + "loss": 0.3759, + "step": 19261 + }, + { + "epoch": 1.078620226229141, + "grad_norm": 1.3970298767089844, + "learning_rate": 9.626e-05, + "loss": 0.4004, + "step": 19262 + }, + { + "epoch": 1.0786762235412701, + "grad_norm": 1.3191635608673096, + "learning_rate": 9.6265e-05, + "loss": 0.3588, + "step": 19263 + }, + { + "epoch": 1.0787322208533991, + "grad_norm": 1.6098352670669556, + "learning_rate": 9.627e-05, + "loss": 0.4608, + "step": 19264 + }, + { + "epoch": 1.0787882181655282, + "grad_norm": 1.4320333003997803, + "learning_rate": 9.627500000000001e-05, + "loss": 0.4395, + "step": 19265 + }, + { + "epoch": 1.0788442154776572, + "grad_norm": 1.2062251567840576, + "learning_rate": 9.628e-05, + "loss": 0.3891, + "step": 19266 + }, + { + "epoch": 1.0789002127897862, + "grad_norm": 1.2579712867736816, + "learning_rate": 9.6285e-05, + "loss": 0.4037, + "step": 19267 + }, + { + "epoch": 1.0789562101019152, + "grad_norm": 1.2883987426757812, + "learning_rate": 9.629e-05, + "loss": 0.4493, + "step": 19268 + }, + { + "epoch": 1.079012207414044, + "grad_norm": 1.0617892742156982, + "learning_rate": 9.6295e-05, + "loss": 0.3791, + "step": 19269 + }, + { + "epoch": 1.079068204726173, + "grad_norm": 1.2531203031539917, + "learning_rate": 9.63e-05, + "loss": 0.3705, + "step": 19270 + }, + { + "epoch": 1.079124202038302, + "grad_norm": 1.6580188274383545, + "learning_rate": 9.630500000000001e-05, + "loss": 0.5208, + "step": 19271 + }, + { + 
"epoch": 1.079180199350431, + "grad_norm": 1.5703518390655518, + "learning_rate": 9.631e-05, + "loss": 0.5463, + "step": 19272 + }, + { + "epoch": 1.07923619666256, + "grad_norm": 1.3446650505065918, + "learning_rate": 9.6315e-05, + "loss": 0.4433, + "step": 19273 + }, + { + "epoch": 1.0792921939746891, + "grad_norm": 1.2784892320632935, + "learning_rate": 9.632e-05, + "loss": 0.3716, + "step": 19274 + }, + { + "epoch": 1.0793481912868181, + "grad_norm": 1.4385497570037842, + "learning_rate": 9.632500000000001e-05, + "loss": 0.4583, + "step": 19275 + }, + { + "epoch": 1.0794041885989472, + "grad_norm": 1.7008953094482422, + "learning_rate": 9.633000000000001e-05, + "loss": 0.6018, + "step": 19276 + }, + { + "epoch": 1.0794601859110762, + "grad_norm": 1.5577605962753296, + "learning_rate": 9.6335e-05, + "loss": 0.4435, + "step": 19277 + }, + { + "epoch": 1.0795161832232052, + "grad_norm": 1.368937373161316, + "learning_rate": 9.634e-05, + "loss": 0.3812, + "step": 19278 + }, + { + "epoch": 1.0795721805353342, + "grad_norm": 1.4653809070587158, + "learning_rate": 9.634500000000001e-05, + "loss": 0.4979, + "step": 19279 + }, + { + "epoch": 1.0796281778474632, + "grad_norm": 1.2361409664154053, + "learning_rate": 9.635000000000001e-05, + "loss": 0.4288, + "step": 19280 + }, + { + "epoch": 1.0796841751595923, + "grad_norm": 1.3378677368164062, + "learning_rate": 9.635500000000001e-05, + "loss": 0.4036, + "step": 19281 + }, + { + "epoch": 1.0797401724717213, + "grad_norm": 1.4238026142120361, + "learning_rate": 9.636e-05, + "loss": 0.4819, + "step": 19282 + }, + { + "epoch": 1.0797961697838503, + "grad_norm": 1.2204433679580688, + "learning_rate": 9.6365e-05, + "loss": 0.4173, + "step": 19283 + }, + { + "epoch": 1.0798521670959793, + "grad_norm": 1.1371172666549683, + "learning_rate": 9.637e-05, + "loss": 0.3358, + "step": 19284 + }, + { + "epoch": 1.0799081644081083, + "grad_norm": 1.2987414598464966, + "learning_rate": 9.6375e-05, + "loss": 0.4411, + "step": 19285 + }, 
+ { + "epoch": 1.0799641617202373, + "grad_norm": 1.4364862442016602, + "learning_rate": 9.638000000000001e-05, + "loss": 0.3644, + "step": 19286 + }, + { + "epoch": 1.0800201590323664, + "grad_norm": 1.258409857749939, + "learning_rate": 9.6385e-05, + "loss": 0.4312, + "step": 19287 + }, + { + "epoch": 1.0800761563444954, + "grad_norm": 1.504977822303772, + "learning_rate": 9.639e-05, + "loss": 0.426, + "step": 19288 + }, + { + "epoch": 1.0801321536566244, + "grad_norm": 1.5396018028259277, + "learning_rate": 9.6395e-05, + "loss": 0.5399, + "step": 19289 + }, + { + "epoch": 1.0801881509687534, + "grad_norm": 1.4223546981811523, + "learning_rate": 9.64e-05, + "loss": 0.5389, + "step": 19290 + }, + { + "epoch": 1.0802441482808824, + "grad_norm": 1.4549106359481812, + "learning_rate": 9.6405e-05, + "loss": 0.5428, + "step": 19291 + }, + { + "epoch": 1.0803001455930115, + "grad_norm": 1.9123473167419434, + "learning_rate": 9.641000000000001e-05, + "loss": 0.4298, + "step": 19292 + }, + { + "epoch": 1.0803561429051405, + "grad_norm": 1.4224637746810913, + "learning_rate": 9.6415e-05, + "loss": 0.3685, + "step": 19293 + }, + { + "epoch": 1.0804121402172695, + "grad_norm": 1.3219951391220093, + "learning_rate": 9.642e-05, + "loss": 0.4869, + "step": 19294 + }, + { + "epoch": 1.0804681375293985, + "grad_norm": 1.3343758583068848, + "learning_rate": 9.642500000000001e-05, + "loss": 0.492, + "step": 19295 + }, + { + "epoch": 1.0805241348415275, + "grad_norm": 1.5717768669128418, + "learning_rate": 9.643000000000001e-05, + "loss": 0.6136, + "step": 19296 + }, + { + "epoch": 1.0805801321536566, + "grad_norm": 1.5233100652694702, + "learning_rate": 9.643500000000001e-05, + "loss": 0.4301, + "step": 19297 + }, + { + "epoch": 1.0806361294657856, + "grad_norm": 1.520534634590149, + "learning_rate": 9.644e-05, + "loss": 0.482, + "step": 19298 + }, + { + "epoch": 1.0806921267779146, + "grad_norm": 1.3004443645477295, + "learning_rate": 9.6445e-05, + "loss": 0.3732, + "step": 19299 
+ }, + { + "epoch": 1.0807481240900436, + "grad_norm": 1.151798963546753, + "learning_rate": 9.645000000000001e-05, + "loss": 0.298, + "step": 19300 + }, + { + "epoch": 1.0808041214021726, + "grad_norm": 1.3476203680038452, + "learning_rate": 9.645500000000001e-05, + "loss": 0.4183, + "step": 19301 + }, + { + "epoch": 1.0808601187143017, + "grad_norm": 1.2625181674957275, + "learning_rate": 9.646000000000001e-05, + "loss": 0.4778, + "step": 19302 + }, + { + "epoch": 1.0809161160264307, + "grad_norm": 1.2647287845611572, + "learning_rate": 9.6465e-05, + "loss": 0.4746, + "step": 19303 + }, + { + "epoch": 1.0809721133385597, + "grad_norm": 1.4775749444961548, + "learning_rate": 9.647e-05, + "loss": 0.4654, + "step": 19304 + }, + { + "epoch": 1.0810281106506887, + "grad_norm": 1.6221728324890137, + "learning_rate": 9.6475e-05, + "loss": 0.387, + "step": 19305 + }, + { + "epoch": 1.0810841079628177, + "grad_norm": 1.1883046627044678, + "learning_rate": 9.648e-05, + "loss": 0.3965, + "step": 19306 + }, + { + "epoch": 1.0811401052749467, + "grad_norm": 1.416805624961853, + "learning_rate": 9.648500000000001e-05, + "loss": 0.4267, + "step": 19307 + }, + { + "epoch": 1.0811961025870758, + "grad_norm": 1.3891547918319702, + "learning_rate": 9.649e-05, + "loss": 0.4943, + "step": 19308 + }, + { + "epoch": 1.0812520998992048, + "grad_norm": 1.1932092905044556, + "learning_rate": 9.6495e-05, + "loss": 0.4577, + "step": 19309 + }, + { + "epoch": 1.0813080972113338, + "grad_norm": 1.4168490171432495, + "learning_rate": 9.65e-05, + "loss": 0.499, + "step": 19310 + }, + { + "epoch": 1.0813640945234628, + "grad_norm": 1.3673006296157837, + "learning_rate": 9.6505e-05, + "loss": 0.4831, + "step": 19311 + }, + { + "epoch": 1.0814200918355918, + "grad_norm": 1.3717122077941895, + "learning_rate": 9.651e-05, + "loss": 0.4299, + "step": 19312 + }, + { + "epoch": 1.0814760891477209, + "grad_norm": 1.4127898216247559, + "learning_rate": 9.6515e-05, + "loss": 0.462, + "step": 19313 + }, + 
{ + "epoch": 1.0815320864598499, + "grad_norm": 1.3625534772872925, + "learning_rate": 9.652e-05, + "loss": 0.3846, + "step": 19314 + }, + { + "epoch": 1.081588083771979, + "grad_norm": 1.3918216228485107, + "learning_rate": 9.652500000000002e-05, + "loss": 0.3508, + "step": 19315 + }, + { + "epoch": 1.081644081084108, + "grad_norm": 1.4035346508026123, + "learning_rate": 9.653000000000001e-05, + "loss": 0.6266, + "step": 19316 + }, + { + "epoch": 1.081700078396237, + "grad_norm": 1.3695547580718994, + "learning_rate": 9.653500000000001e-05, + "loss": 0.3242, + "step": 19317 + }, + { + "epoch": 1.081756075708366, + "grad_norm": 1.5471813678741455, + "learning_rate": 9.654000000000001e-05, + "loss": 0.4938, + "step": 19318 + }, + { + "epoch": 1.081812073020495, + "grad_norm": 1.4240856170654297, + "learning_rate": 9.6545e-05, + "loss": 0.3783, + "step": 19319 + }, + { + "epoch": 1.081868070332624, + "grad_norm": 1.166264533996582, + "learning_rate": 9.655e-05, + "loss": 0.3181, + "step": 19320 + }, + { + "epoch": 1.081924067644753, + "grad_norm": 1.5024768114089966, + "learning_rate": 9.655500000000001e-05, + "loss": 0.606, + "step": 19321 + }, + { + "epoch": 1.081980064956882, + "grad_norm": 1.1460301876068115, + "learning_rate": 9.656000000000001e-05, + "loss": 0.436, + "step": 19322 + }, + { + "epoch": 1.082036062269011, + "grad_norm": 1.4128327369689941, + "learning_rate": 9.656500000000001e-05, + "loss": 0.4741, + "step": 19323 + }, + { + "epoch": 1.08209205958114, + "grad_norm": 1.320252776145935, + "learning_rate": 9.657e-05, + "loss": 0.4336, + "step": 19324 + }, + { + "epoch": 1.082148056893269, + "grad_norm": 1.1598433256149292, + "learning_rate": 9.6575e-05, + "loss": 0.3989, + "step": 19325 + }, + { + "epoch": 1.0822040542053981, + "grad_norm": 1.2715833187103271, + "learning_rate": 9.658e-05, + "loss": 0.3745, + "step": 19326 + }, + { + "epoch": 1.0822600515175271, + "grad_norm": 1.4066200256347656, + "learning_rate": 9.6585e-05, + "loss": 0.4034, + 
"step": 19327 + }, + { + "epoch": 1.0823160488296562, + "grad_norm": 1.2118477821350098, + "learning_rate": 9.659000000000001e-05, + "loss": 0.4449, + "step": 19328 + }, + { + "epoch": 1.0823720461417852, + "grad_norm": 1.5100220441818237, + "learning_rate": 9.6595e-05, + "loss": 0.5064, + "step": 19329 + }, + { + "epoch": 1.0824280434539142, + "grad_norm": 1.2827850580215454, + "learning_rate": 9.66e-05, + "loss": 0.4138, + "step": 19330 + }, + { + "epoch": 1.0824840407660432, + "grad_norm": 1.5457252264022827, + "learning_rate": 9.6605e-05, + "loss": 0.5519, + "step": 19331 + }, + { + "epoch": 1.0825400380781722, + "grad_norm": 1.2562763690948486, + "learning_rate": 9.661e-05, + "loss": 0.4989, + "step": 19332 + }, + { + "epoch": 1.0825960353903012, + "grad_norm": 1.4032789468765259, + "learning_rate": 9.6615e-05, + "loss": 0.4255, + "step": 19333 + }, + { + "epoch": 1.0826520327024303, + "grad_norm": 1.6235026121139526, + "learning_rate": 9.661999999999999e-05, + "loss": 0.4823, + "step": 19334 + }, + { + "epoch": 1.0827080300145593, + "grad_norm": 1.156258225440979, + "learning_rate": 9.6625e-05, + "loss": 0.3425, + "step": 19335 + }, + { + "epoch": 1.0827640273266883, + "grad_norm": 1.303497552871704, + "learning_rate": 9.663000000000002e-05, + "loss": 0.4756, + "step": 19336 + }, + { + "epoch": 1.0828200246388173, + "grad_norm": 1.3825955390930176, + "learning_rate": 9.663500000000001e-05, + "loss": 0.3229, + "step": 19337 + }, + { + "epoch": 1.0828760219509463, + "grad_norm": 1.3050435781478882, + "learning_rate": 9.664000000000001e-05, + "loss": 0.4469, + "step": 19338 + }, + { + "epoch": 1.0829320192630754, + "grad_norm": 1.271113395690918, + "learning_rate": 9.664500000000001e-05, + "loss": 0.385, + "step": 19339 + }, + { + "epoch": 1.0829880165752044, + "grad_norm": 1.216892957687378, + "learning_rate": 9.665e-05, + "loss": 0.3499, + "step": 19340 + }, + { + "epoch": 1.0830440138873334, + "grad_norm": 1.3005635738372803, + "learning_rate": 9.6655e-05, + 
"loss": 0.4114, + "step": 19341 + }, + { + "epoch": 1.0831000111994624, + "grad_norm": 1.6403224468231201, + "learning_rate": 9.666e-05, + "loss": 0.4046, + "step": 19342 + }, + { + "epoch": 1.0831560085115914, + "grad_norm": 1.3765748739242554, + "learning_rate": 9.666500000000001e-05, + "loss": 0.5173, + "step": 19343 + }, + { + "epoch": 1.0832120058237205, + "grad_norm": 1.6299631595611572, + "learning_rate": 9.667000000000001e-05, + "loss": 0.3675, + "step": 19344 + }, + { + "epoch": 1.0832680031358495, + "grad_norm": 1.4607957601547241, + "learning_rate": 9.6675e-05, + "loss": 0.4178, + "step": 19345 + }, + { + "epoch": 1.0833240004479785, + "grad_norm": 1.4849029779434204, + "learning_rate": 9.668e-05, + "loss": 0.5477, + "step": 19346 + }, + { + "epoch": 1.0833799977601075, + "grad_norm": 1.2126410007476807, + "learning_rate": 9.6685e-05, + "loss": 0.3774, + "step": 19347 + }, + { + "epoch": 1.0834359950722365, + "grad_norm": 1.4253648519515991, + "learning_rate": 9.669e-05, + "loss": 0.3828, + "step": 19348 + }, + { + "epoch": 1.0834919923843656, + "grad_norm": 1.1440986394882202, + "learning_rate": 9.669500000000001e-05, + "loss": 0.2512, + "step": 19349 + }, + { + "epoch": 1.0835479896964946, + "grad_norm": 1.4302659034729004, + "learning_rate": 9.67e-05, + "loss": 0.3265, + "step": 19350 + }, + { + "epoch": 1.0836039870086236, + "grad_norm": 1.5102487802505493, + "learning_rate": 9.6705e-05, + "loss": 0.3999, + "step": 19351 + }, + { + "epoch": 1.0836599843207526, + "grad_norm": 1.5125442743301392, + "learning_rate": 9.671e-05, + "loss": 0.5036, + "step": 19352 + }, + { + "epoch": 1.0837159816328816, + "grad_norm": 1.411964774131775, + "learning_rate": 9.6715e-05, + "loss": 0.6515, + "step": 19353 + }, + { + "epoch": 1.0837719789450107, + "grad_norm": 1.6002410650253296, + "learning_rate": 9.672e-05, + "loss": 0.6385, + "step": 19354 + }, + { + "epoch": 1.0838279762571397, + "grad_norm": 1.8580633401870728, + "learning_rate": 9.6725e-05, + "loss": 0.605, 
+ "step": 19355 + }, + { + "epoch": 1.0838839735692687, + "grad_norm": 1.1607344150543213, + "learning_rate": 9.673e-05, + "loss": 0.4321, + "step": 19356 + }, + { + "epoch": 1.0839399708813977, + "grad_norm": 1.320904016494751, + "learning_rate": 9.673500000000001e-05, + "loss": 0.4001, + "step": 19357 + }, + { + "epoch": 1.0839959681935267, + "grad_norm": 1.255415439605713, + "learning_rate": 9.674000000000001e-05, + "loss": 0.4552, + "step": 19358 + }, + { + "epoch": 1.0840519655056557, + "grad_norm": 1.2322289943695068, + "learning_rate": 9.674500000000001e-05, + "loss": 0.4472, + "step": 19359 + }, + { + "epoch": 1.0841079628177848, + "grad_norm": 1.4008407592773438, + "learning_rate": 9.675000000000001e-05, + "loss": 0.5539, + "step": 19360 + }, + { + "epoch": 1.0841639601299138, + "grad_norm": 1.384464144706726, + "learning_rate": 9.6755e-05, + "loss": 0.3652, + "step": 19361 + }, + { + "epoch": 1.0842199574420428, + "grad_norm": 1.4799270629882812, + "learning_rate": 9.676e-05, + "loss": 0.4102, + "step": 19362 + }, + { + "epoch": 1.0842759547541718, + "grad_norm": 1.5826606750488281, + "learning_rate": 9.6765e-05, + "loss": 0.5595, + "step": 19363 + }, + { + "epoch": 1.0843319520663008, + "grad_norm": 1.3750200271606445, + "learning_rate": 9.677000000000001e-05, + "loss": 0.4728, + "step": 19364 + }, + { + "epoch": 1.0843879493784299, + "grad_norm": 1.9382109642028809, + "learning_rate": 9.677500000000001e-05, + "loss": 0.5068, + "step": 19365 + }, + { + "epoch": 1.0844439466905589, + "grad_norm": 1.368626356124878, + "learning_rate": 9.678e-05, + "loss": 0.3895, + "step": 19366 + }, + { + "epoch": 1.084499944002688, + "grad_norm": 1.2477309703826904, + "learning_rate": 9.6785e-05, + "loss": 0.3411, + "step": 19367 + }, + { + "epoch": 1.084555941314817, + "grad_norm": 1.4445922374725342, + "learning_rate": 9.679e-05, + "loss": 0.3236, + "step": 19368 + }, + { + "epoch": 1.084611938626946, + "grad_norm": 1.6062517166137695, + "learning_rate": 9.6795e-05, + 
"loss": 0.4774, + "step": 19369 + }, + { + "epoch": 1.084667935939075, + "grad_norm": 1.1364208459854126, + "learning_rate": 9.680000000000001e-05, + "loss": 0.4174, + "step": 19370 + }, + { + "epoch": 1.084723933251204, + "grad_norm": 1.2720904350280762, + "learning_rate": 9.6805e-05, + "loss": 0.3245, + "step": 19371 + }, + { + "epoch": 1.084779930563333, + "grad_norm": 1.477702260017395, + "learning_rate": 9.681e-05, + "loss": 0.5477, + "step": 19372 + }, + { + "epoch": 1.084835927875462, + "grad_norm": 1.1432007551193237, + "learning_rate": 9.6815e-05, + "loss": 0.4232, + "step": 19373 + }, + { + "epoch": 1.084891925187591, + "grad_norm": 1.412406086921692, + "learning_rate": 9.682e-05, + "loss": 0.4568, + "step": 19374 + }, + { + "epoch": 1.08494792249972, + "grad_norm": 1.2835264205932617, + "learning_rate": 9.682500000000001e-05, + "loss": 0.377, + "step": 19375 + }, + { + "epoch": 1.085003919811849, + "grad_norm": 1.3556221723556519, + "learning_rate": 9.683e-05, + "loss": 0.3866, + "step": 19376 + }, + { + "epoch": 1.085059917123978, + "grad_norm": 1.238083004951477, + "learning_rate": 9.6835e-05, + "loss": 0.54, + "step": 19377 + }, + { + "epoch": 1.085115914436107, + "grad_norm": 1.3918308019638062, + "learning_rate": 9.684000000000001e-05, + "loss": 0.4972, + "step": 19378 + }, + { + "epoch": 1.0851719117482361, + "grad_norm": 1.1738877296447754, + "learning_rate": 9.684500000000001e-05, + "loss": 0.3268, + "step": 19379 + }, + { + "epoch": 1.0852279090603651, + "grad_norm": 1.3730249404907227, + "learning_rate": 9.685000000000001e-05, + "loss": 0.4502, + "step": 19380 + }, + { + "epoch": 1.0852839063724942, + "grad_norm": 1.495382308959961, + "learning_rate": 9.685500000000001e-05, + "loss": 0.428, + "step": 19381 + }, + { + "epoch": 1.0853399036846232, + "grad_norm": 1.2371774911880493, + "learning_rate": 9.686e-05, + "loss": 0.3295, + "step": 19382 + }, + { + "epoch": 1.0853959009967522, + "grad_norm": 1.256272792816162, + "learning_rate": 
9.6865e-05, + "loss": 0.5049, + "step": 19383 + }, + { + "epoch": 1.0854518983088812, + "grad_norm": 1.5617402791976929, + "learning_rate": 9.687e-05, + "loss": 0.3444, + "step": 19384 + }, + { + "epoch": 1.0855078956210102, + "grad_norm": 1.2238343954086304, + "learning_rate": 9.687500000000001e-05, + "loss": 0.4621, + "step": 19385 + }, + { + "epoch": 1.0855638929331393, + "grad_norm": 1.7954620122909546, + "learning_rate": 9.688000000000001e-05, + "loss": 0.536, + "step": 19386 + }, + { + "epoch": 1.0856198902452683, + "grad_norm": 1.3573501110076904, + "learning_rate": 9.6885e-05, + "loss": 0.4388, + "step": 19387 + }, + { + "epoch": 1.0856758875573973, + "grad_norm": 1.399285078048706, + "learning_rate": 9.689e-05, + "loss": 0.5818, + "step": 19388 + }, + { + "epoch": 1.0857318848695263, + "grad_norm": 1.4428678750991821, + "learning_rate": 9.6895e-05, + "loss": 0.4234, + "step": 19389 + }, + { + "epoch": 1.0857878821816553, + "grad_norm": 1.4424530267715454, + "learning_rate": 9.69e-05, + "loss": 0.4421, + "step": 19390 + }, + { + "epoch": 1.0858438794937844, + "grad_norm": 1.754023790359497, + "learning_rate": 9.6905e-05, + "loss": 0.4428, + "step": 19391 + }, + { + "epoch": 1.0858998768059134, + "grad_norm": 1.343117594718933, + "learning_rate": 9.691e-05, + "loss": 0.312, + "step": 19392 + }, + { + "epoch": 1.0859558741180424, + "grad_norm": 1.1642510890960693, + "learning_rate": 9.6915e-05, + "loss": 0.3357, + "step": 19393 + }, + { + "epoch": 1.0860118714301714, + "grad_norm": 1.524693250656128, + "learning_rate": 9.692e-05, + "loss": 0.375, + "step": 19394 + }, + { + "epoch": 1.0860678687423004, + "grad_norm": 1.7570533752441406, + "learning_rate": 9.6925e-05, + "loss": 0.5246, + "step": 19395 + }, + { + "epoch": 1.0861238660544295, + "grad_norm": 1.2837694883346558, + "learning_rate": 9.693000000000001e-05, + "loss": 0.3471, + "step": 19396 + }, + { + "epoch": 1.0861798633665585, + "grad_norm": 1.1543211936950684, + "learning_rate": 9.6935e-05, + 
"loss": 0.3616, + "step": 19397 + }, + { + "epoch": 1.0862358606786875, + "grad_norm": 1.3171840906143188, + "learning_rate": 9.694e-05, + "loss": 0.4241, + "step": 19398 + }, + { + "epoch": 1.0862918579908165, + "grad_norm": 1.5649409294128418, + "learning_rate": 9.694500000000001e-05, + "loss": 0.4944, + "step": 19399 + }, + { + "epoch": 1.0863478553029455, + "grad_norm": 1.2931212186813354, + "learning_rate": 9.695000000000001e-05, + "loss": 0.5089, + "step": 19400 + }, + { + "epoch": 1.0864038526150746, + "grad_norm": 1.3104225397109985, + "learning_rate": 9.695500000000001e-05, + "loss": 0.4478, + "step": 19401 + }, + { + "epoch": 1.0864598499272036, + "grad_norm": 1.1763811111450195, + "learning_rate": 9.696000000000001e-05, + "loss": 0.4075, + "step": 19402 + }, + { + "epoch": 1.0865158472393326, + "grad_norm": 1.2341376543045044, + "learning_rate": 9.6965e-05, + "loss": 0.5711, + "step": 19403 + }, + { + "epoch": 1.0865718445514616, + "grad_norm": 1.2019206285476685, + "learning_rate": 9.697e-05, + "loss": 0.3148, + "step": 19404 + }, + { + "epoch": 1.0866278418635906, + "grad_norm": 1.6193609237670898, + "learning_rate": 9.6975e-05, + "loss": 0.4713, + "step": 19405 + }, + { + "epoch": 1.0866838391757196, + "grad_norm": 1.4534461498260498, + "learning_rate": 9.698000000000001e-05, + "loss": 0.4004, + "step": 19406 + }, + { + "epoch": 1.0867398364878487, + "grad_norm": 1.1196978092193604, + "learning_rate": 9.698500000000001e-05, + "loss": 0.3419, + "step": 19407 + }, + { + "epoch": 1.0867958337999777, + "grad_norm": 1.272823452949524, + "learning_rate": 9.699e-05, + "loss": 0.4797, + "step": 19408 + }, + { + "epoch": 1.0868518311121067, + "grad_norm": 6.032217979431152, + "learning_rate": 9.6995e-05, + "loss": 0.5645, + "step": 19409 + }, + { + "epoch": 1.0869078284242357, + "grad_norm": 1.2282921075820923, + "learning_rate": 9.7e-05, + "loss": 0.494, + "step": 19410 + }, + { + "epoch": 1.0869638257363647, + "grad_norm": 1.432591199874878, + 
"learning_rate": 9.7005e-05, + "loss": 0.4727, + "step": 19411 + }, + { + "epoch": 1.0870198230484938, + "grad_norm": 1.428158164024353, + "learning_rate": 9.701e-05, + "loss": 0.3715, + "step": 19412 + }, + { + "epoch": 1.0870758203606228, + "grad_norm": 1.33553147315979, + "learning_rate": 9.7015e-05, + "loss": 0.4723, + "step": 19413 + }, + { + "epoch": 1.0871318176727518, + "grad_norm": 1.2606146335601807, + "learning_rate": 9.702e-05, + "loss": 0.4094, + "step": 19414 + }, + { + "epoch": 1.0871878149848808, + "grad_norm": 1.3955553770065308, + "learning_rate": 9.7025e-05, + "loss": 0.3692, + "step": 19415 + }, + { + "epoch": 1.0872438122970098, + "grad_norm": 1.202028751373291, + "learning_rate": 9.703000000000001e-05, + "loss": 0.4275, + "step": 19416 + }, + { + "epoch": 1.0872998096091389, + "grad_norm": 1.497829556465149, + "learning_rate": 9.703500000000001e-05, + "loss": 0.6001, + "step": 19417 + }, + { + "epoch": 1.0873558069212679, + "grad_norm": 1.1924201250076294, + "learning_rate": 9.704e-05, + "loss": 0.3614, + "step": 19418 + }, + { + "epoch": 1.087411804233397, + "grad_norm": 1.5991381406784058, + "learning_rate": 9.7045e-05, + "loss": 0.5625, + "step": 19419 + }, + { + "epoch": 1.087467801545526, + "grad_norm": 1.3351833820343018, + "learning_rate": 9.705e-05, + "loss": 0.5205, + "step": 19420 + }, + { + "epoch": 1.087523798857655, + "grad_norm": 1.2472409009933472, + "learning_rate": 9.705500000000001e-05, + "loss": 0.3638, + "step": 19421 + }, + { + "epoch": 1.087579796169784, + "grad_norm": 1.1877214908599854, + "learning_rate": 9.706000000000001e-05, + "loss": 0.4038, + "step": 19422 + }, + { + "epoch": 1.087635793481913, + "grad_norm": 1.28447425365448, + "learning_rate": 9.7065e-05, + "loss": 0.4491, + "step": 19423 + }, + { + "epoch": 1.0876917907940418, + "grad_norm": 1.2487019300460815, + "learning_rate": 9.707e-05, + "loss": 0.5416, + "step": 19424 + }, + { + "epoch": 1.0877477881061708, + "grad_norm": 1.412729024887085, + 
"learning_rate": 9.7075e-05, + "loss": 0.4928, + "step": 19425 + }, + { + "epoch": 1.0878037854182998, + "grad_norm": 1.8077267408370972, + "learning_rate": 9.708e-05, + "loss": 0.4177, + "step": 19426 + }, + { + "epoch": 1.0878597827304288, + "grad_norm": 1.2820024490356445, + "learning_rate": 9.708500000000001e-05, + "loss": 0.4856, + "step": 19427 + }, + { + "epoch": 1.0879157800425578, + "grad_norm": 1.74324369430542, + "learning_rate": 9.709000000000001e-05, + "loss": 0.3792, + "step": 19428 + }, + { + "epoch": 1.0879717773546869, + "grad_norm": 1.2279449701309204, + "learning_rate": 9.7095e-05, + "loss": 0.341, + "step": 19429 + }, + { + "epoch": 1.0880277746668159, + "grad_norm": 1.4273266792297363, + "learning_rate": 9.71e-05, + "loss": 0.4763, + "step": 19430 + }, + { + "epoch": 1.088083771978945, + "grad_norm": 1.2334628105163574, + "learning_rate": 9.7105e-05, + "loss": 0.3511, + "step": 19431 + }, + { + "epoch": 1.088139769291074, + "grad_norm": 1.3678807020187378, + "learning_rate": 9.711e-05, + "loss": 0.3458, + "step": 19432 + }, + { + "epoch": 1.088195766603203, + "grad_norm": 1.2043136358261108, + "learning_rate": 9.7115e-05, + "loss": 0.348, + "step": 19433 + }, + { + "epoch": 1.088251763915332, + "grad_norm": 1.4696546792984009, + "learning_rate": 9.712e-05, + "loss": 0.5889, + "step": 19434 + }, + { + "epoch": 1.088307761227461, + "grad_norm": 1.3678463697433472, + "learning_rate": 9.7125e-05, + "loss": 0.5672, + "step": 19435 + }, + { + "epoch": 1.08836375853959, + "grad_norm": 1.7388309240341187, + "learning_rate": 9.713000000000001e-05, + "loss": 0.3944, + "step": 19436 + }, + { + "epoch": 1.088419755851719, + "grad_norm": 1.696213722229004, + "learning_rate": 9.713500000000001e-05, + "loss": 0.3871, + "step": 19437 + }, + { + "epoch": 1.088475753163848, + "grad_norm": 1.5011556148529053, + "learning_rate": 9.714000000000001e-05, + "loss": 0.4874, + "step": 19438 + }, + { + "epoch": 1.088531750475977, + "grad_norm": 1.4647469520568848, + 
"learning_rate": 9.7145e-05, + "loss": 0.3355, + "step": 19439 + }, + { + "epoch": 1.088587747788106, + "grad_norm": 1.3213003873825073, + "learning_rate": 9.715e-05, + "loss": 0.33, + "step": 19440 + }, + { + "epoch": 1.088643745100235, + "grad_norm": 1.250311255455017, + "learning_rate": 9.7155e-05, + "loss": 0.4597, + "step": 19441 + }, + { + "epoch": 1.0886997424123641, + "grad_norm": 1.4594535827636719, + "learning_rate": 9.716000000000001e-05, + "loss": 0.4793, + "step": 19442 + }, + { + "epoch": 1.0887557397244931, + "grad_norm": 1.3642311096191406, + "learning_rate": 9.716500000000001e-05, + "loss": 0.4608, + "step": 19443 + }, + { + "epoch": 1.0888117370366222, + "grad_norm": 2.3334622383117676, + "learning_rate": 9.717e-05, + "loss": 0.5047, + "step": 19444 + }, + { + "epoch": 1.0888677343487512, + "grad_norm": 1.3684736490249634, + "learning_rate": 9.7175e-05, + "loss": 0.3814, + "step": 19445 + }, + { + "epoch": 1.0889237316608802, + "grad_norm": 1.1973755359649658, + "learning_rate": 9.718e-05, + "loss": 0.4501, + "step": 19446 + }, + { + "epoch": 1.0889797289730092, + "grad_norm": 1.4940571784973145, + "learning_rate": 9.7185e-05, + "loss": 0.6219, + "step": 19447 + }, + { + "epoch": 1.0890357262851382, + "grad_norm": 1.252989649772644, + "learning_rate": 9.719000000000001e-05, + "loss": 0.41, + "step": 19448 + }, + { + "epoch": 1.0890917235972672, + "grad_norm": 1.185983419418335, + "learning_rate": 9.719500000000001e-05, + "loss": 0.4355, + "step": 19449 + }, + { + "epoch": 1.0891477209093963, + "grad_norm": 1.1518192291259766, + "learning_rate": 9.72e-05, + "loss": 0.3613, + "step": 19450 + }, + { + "epoch": 1.0892037182215253, + "grad_norm": 1.5924357175827026, + "learning_rate": 9.7205e-05, + "loss": 0.4771, + "step": 19451 + }, + { + "epoch": 1.0892597155336543, + "grad_norm": 1.1367547512054443, + "learning_rate": 9.721e-05, + "loss": 0.4243, + "step": 19452 + }, + { + "epoch": 1.0893157128457833, + "grad_norm": 1.4977604150772095, + 
"learning_rate": 9.7215e-05, + "loss": 0.3691, + "step": 19453 + }, + { + "epoch": 1.0893717101579123, + "grad_norm": 1.2454397678375244, + "learning_rate": 9.722e-05, + "loss": 0.5785, + "step": 19454 + }, + { + "epoch": 1.0894277074700414, + "grad_norm": 1.5086647272109985, + "learning_rate": 9.7225e-05, + "loss": 0.3714, + "step": 19455 + }, + { + "epoch": 1.0894837047821704, + "grad_norm": 1.3603461980819702, + "learning_rate": 9.723000000000002e-05, + "loss": 0.5453, + "step": 19456 + }, + { + "epoch": 1.0895397020942994, + "grad_norm": 1.7683531045913696, + "learning_rate": 9.723500000000001e-05, + "loss": 0.7335, + "step": 19457 + }, + { + "epoch": 1.0895956994064284, + "grad_norm": 2.0208988189697266, + "learning_rate": 9.724000000000001e-05, + "loss": 0.4242, + "step": 19458 + }, + { + "epoch": 1.0896516967185574, + "grad_norm": 1.403065800666809, + "learning_rate": 9.724500000000001e-05, + "loss": 0.4319, + "step": 19459 + }, + { + "epoch": 1.0897076940306865, + "grad_norm": 1.7681505680084229, + "learning_rate": 9.725e-05, + "loss": 0.5649, + "step": 19460 + }, + { + "epoch": 1.0897636913428155, + "grad_norm": 5.0404133796691895, + "learning_rate": 9.7255e-05, + "loss": 0.3528, + "step": 19461 + }, + { + "epoch": 1.0898196886549445, + "grad_norm": 1.498591661453247, + "learning_rate": 9.726e-05, + "loss": 0.5161, + "step": 19462 + }, + { + "epoch": 1.0898756859670735, + "grad_norm": 1.8758131265640259, + "learning_rate": 9.726500000000001e-05, + "loss": 0.5007, + "step": 19463 + }, + { + "epoch": 1.0899316832792025, + "grad_norm": 1.260574221611023, + "learning_rate": 9.727000000000001e-05, + "loss": 0.3745, + "step": 19464 + }, + { + "epoch": 1.0899876805913316, + "grad_norm": 1.312031865119934, + "learning_rate": 9.7275e-05, + "loss": 0.4584, + "step": 19465 + }, + { + "epoch": 1.0900436779034606, + "grad_norm": 1.5896401405334473, + "learning_rate": 9.728e-05, + "loss": 0.47, + "step": 19466 + }, + { + "epoch": 1.0900996752155896, + "grad_norm": 
1.0249112844467163, + "learning_rate": 9.7285e-05, + "loss": 0.3238, + "step": 19467 + }, + { + "epoch": 1.0901556725277186, + "grad_norm": 1.251260757446289, + "learning_rate": 9.729e-05, + "loss": 0.3825, + "step": 19468 + }, + { + "epoch": 1.0902116698398476, + "grad_norm": 1.4393593072891235, + "learning_rate": 9.729500000000001e-05, + "loss": 0.4745, + "step": 19469 + }, + { + "epoch": 1.0902676671519766, + "grad_norm": 1.361510157585144, + "learning_rate": 9.730000000000001e-05, + "loss": 0.419, + "step": 19470 + }, + { + "epoch": 1.0903236644641057, + "grad_norm": 1.2182210683822632, + "learning_rate": 9.7305e-05, + "loss": 0.4122, + "step": 19471 + }, + { + "epoch": 1.0903796617762347, + "grad_norm": 1.2895756959915161, + "learning_rate": 9.731e-05, + "loss": 0.406, + "step": 19472 + }, + { + "epoch": 1.0904356590883637, + "grad_norm": 1.3974794149398804, + "learning_rate": 9.7315e-05, + "loss": 0.5104, + "step": 19473 + }, + { + "epoch": 1.0904916564004927, + "grad_norm": 1.6768306493759155, + "learning_rate": 9.732e-05, + "loss": 0.4613, + "step": 19474 + }, + { + "epoch": 1.0905476537126217, + "grad_norm": 1.3968340158462524, + "learning_rate": 9.7325e-05, + "loss": 0.4519, + "step": 19475 + }, + { + "epoch": 1.0906036510247508, + "grad_norm": 1.7102560997009277, + "learning_rate": 9.733e-05, + "loss": 0.5399, + "step": 19476 + }, + { + "epoch": 1.0906596483368798, + "grad_norm": 1.3042027950286865, + "learning_rate": 9.733500000000002e-05, + "loss": 0.4404, + "step": 19477 + }, + { + "epoch": 1.0907156456490088, + "grad_norm": 1.2797521352767944, + "learning_rate": 9.734000000000001e-05, + "loss": 0.4302, + "step": 19478 + }, + { + "epoch": 1.0907716429611378, + "grad_norm": 1.3735034465789795, + "learning_rate": 9.734500000000001e-05, + "loss": 0.4131, + "step": 19479 + }, + { + "epoch": 1.0908276402732668, + "grad_norm": 1.4389700889587402, + "learning_rate": 9.735000000000001e-05, + "loss": 0.5966, + "step": 19480 + }, + { + "epoch": 
1.0908836375853959, + "grad_norm": 1.6658045053482056, + "learning_rate": 9.7355e-05, + "loss": 0.5419, + "step": 19481 + }, + { + "epoch": 1.0909396348975249, + "grad_norm": 1.3026015758514404, + "learning_rate": 9.736e-05, + "loss": 0.5054, + "step": 19482 + }, + { + "epoch": 1.090995632209654, + "grad_norm": 1.3731954097747803, + "learning_rate": 9.7365e-05, + "loss": 0.4267, + "step": 19483 + }, + { + "epoch": 1.091051629521783, + "grad_norm": 1.4135109186172485, + "learning_rate": 9.737000000000001e-05, + "loss": 0.451, + "step": 19484 + }, + { + "epoch": 1.091107626833912, + "grad_norm": 1.2790257930755615, + "learning_rate": 9.737500000000001e-05, + "loss": 0.3954, + "step": 19485 + }, + { + "epoch": 1.091163624146041, + "grad_norm": 1.4345476627349854, + "learning_rate": 9.738e-05, + "loss": 0.4735, + "step": 19486 + }, + { + "epoch": 1.09121962145817, + "grad_norm": 1.2349601984024048, + "learning_rate": 9.7385e-05, + "loss": 0.3137, + "step": 19487 + }, + { + "epoch": 1.091275618770299, + "grad_norm": 1.20396089553833, + "learning_rate": 9.739e-05, + "loss": 0.4095, + "step": 19488 + }, + { + "epoch": 1.091331616082428, + "grad_norm": 1.3240493535995483, + "learning_rate": 9.7395e-05, + "loss": 0.4282, + "step": 19489 + }, + { + "epoch": 1.091387613394557, + "grad_norm": 1.3587307929992676, + "learning_rate": 9.74e-05, + "loss": 0.5745, + "step": 19490 + }, + { + "epoch": 1.091443610706686, + "grad_norm": 1.4722732305526733, + "learning_rate": 9.7405e-05, + "loss": 0.4488, + "step": 19491 + }, + { + "epoch": 1.091499608018815, + "grad_norm": 1.6319332122802734, + "learning_rate": 9.741e-05, + "loss": 0.4726, + "step": 19492 + }, + { + "epoch": 1.091555605330944, + "grad_norm": 1.2616441249847412, + "learning_rate": 9.7415e-05, + "loss": 0.4085, + "step": 19493 + }, + { + "epoch": 1.091611602643073, + "grad_norm": 1.4479644298553467, + "learning_rate": 9.742e-05, + "loss": 0.4895, + "step": 19494 + }, + { + "epoch": 1.0916675999552021, + "grad_norm": 
1.1906683444976807, + "learning_rate": 9.7425e-05, + "loss": 0.3664, + "step": 19495 + }, + { + "epoch": 1.0917235972673311, + "grad_norm": 1.1659542322158813, + "learning_rate": 9.743000000000001e-05, + "loss": 0.455, + "step": 19496 + }, + { + "epoch": 1.0917795945794602, + "grad_norm": 1.4838449954986572, + "learning_rate": 9.7435e-05, + "loss": 0.4508, + "step": 19497 + }, + { + "epoch": 1.0918355918915892, + "grad_norm": 1.3363466262817383, + "learning_rate": 9.744000000000002e-05, + "loss": 0.3449, + "step": 19498 + }, + { + "epoch": 1.0918915892037182, + "grad_norm": 1.0771715641021729, + "learning_rate": 9.744500000000001e-05, + "loss": 0.3355, + "step": 19499 + }, + { + "epoch": 1.0919475865158472, + "grad_norm": 1.1941698789596558, + "learning_rate": 9.745000000000001e-05, + "loss": 0.4237, + "step": 19500 + }, + { + "epoch": 1.0920035838279762, + "grad_norm": 1.3635395765304565, + "learning_rate": 9.745500000000001e-05, + "loss": 0.5534, + "step": 19501 + }, + { + "epoch": 1.0920595811401053, + "grad_norm": 1.4608535766601562, + "learning_rate": 9.746e-05, + "loss": 0.3937, + "step": 19502 + }, + { + "epoch": 1.0921155784522343, + "grad_norm": 1.342836856842041, + "learning_rate": 9.7465e-05, + "loss": 0.4632, + "step": 19503 + }, + { + "epoch": 1.0921715757643633, + "grad_norm": 1.4179961681365967, + "learning_rate": 9.747e-05, + "loss": 0.5522, + "step": 19504 + }, + { + "epoch": 1.0922275730764923, + "grad_norm": 1.4744871854782104, + "learning_rate": 9.747500000000001e-05, + "loss": 0.4017, + "step": 19505 + }, + { + "epoch": 1.0922835703886213, + "grad_norm": 1.4139132499694824, + "learning_rate": 9.748000000000001e-05, + "loss": 0.4151, + "step": 19506 + }, + { + "epoch": 1.0923395677007504, + "grad_norm": 1.4782044887542725, + "learning_rate": 9.7485e-05, + "loss": 0.603, + "step": 19507 + }, + { + "epoch": 1.0923955650128794, + "grad_norm": 1.4270756244659424, + "learning_rate": 9.749e-05, + "loss": 0.4025, + "step": 19508 + }, + { + "epoch": 
1.0924515623250084, + "grad_norm": 1.2587467432022095, + "learning_rate": 9.7495e-05, + "loss": 0.3908, + "step": 19509 + }, + { + "epoch": 1.0925075596371374, + "grad_norm": 1.3345550298690796, + "learning_rate": 9.75e-05, + "loss": 0.3895, + "step": 19510 + }, + { + "epoch": 1.0925635569492664, + "grad_norm": 1.5031352043151855, + "learning_rate": 9.7505e-05, + "loss": 0.4788, + "step": 19511 + }, + { + "epoch": 1.0926195542613955, + "grad_norm": 1.3319941759109497, + "learning_rate": 9.751e-05, + "loss": 0.5293, + "step": 19512 + }, + { + "epoch": 1.0926755515735245, + "grad_norm": 1.1762086153030396, + "learning_rate": 9.7515e-05, + "loss": 0.4477, + "step": 19513 + }, + { + "epoch": 1.0927315488856535, + "grad_norm": 1.3730016946792603, + "learning_rate": 9.752e-05, + "loss": 0.4041, + "step": 19514 + }, + { + "epoch": 1.0927875461977825, + "grad_norm": 1.591992735862732, + "learning_rate": 9.7525e-05, + "loss": 0.481, + "step": 19515 + }, + { + "epoch": 1.0928435435099115, + "grad_norm": 1.3490495681762695, + "learning_rate": 9.753e-05, + "loss": 0.3612, + "step": 19516 + }, + { + "epoch": 1.0928995408220405, + "grad_norm": 1.2400485277175903, + "learning_rate": 9.753500000000001e-05, + "loss": 0.4447, + "step": 19517 + }, + { + "epoch": 1.0929555381341696, + "grad_norm": 1.4158413410186768, + "learning_rate": 9.754e-05, + "loss": 0.5537, + "step": 19518 + }, + { + "epoch": 1.0930115354462986, + "grad_norm": 1.2359211444854736, + "learning_rate": 9.7545e-05, + "loss": 0.5455, + "step": 19519 + }, + { + "epoch": 1.0930675327584276, + "grad_norm": 1.2567752599716187, + "learning_rate": 9.755000000000001e-05, + "loss": 0.364, + "step": 19520 + }, + { + "epoch": 1.0931235300705566, + "grad_norm": 1.3764561414718628, + "learning_rate": 9.755500000000001e-05, + "loss": 0.4302, + "step": 19521 + }, + { + "epoch": 1.0931795273826856, + "grad_norm": 1.321619987487793, + "learning_rate": 9.756000000000001e-05, + "loss": 0.4692, + "step": 19522 + }, + { + "epoch": 
1.0932355246948147, + "grad_norm": 1.33164381980896, + "learning_rate": 9.7565e-05, + "loss": 0.3829, + "step": 19523 + }, + { + "epoch": 1.0932915220069437, + "grad_norm": 1.5573786497116089, + "learning_rate": 9.757e-05, + "loss": 0.4894, + "step": 19524 + }, + { + "epoch": 1.0933475193190727, + "grad_norm": 1.4197397232055664, + "learning_rate": 9.7575e-05, + "loss": 0.3993, + "step": 19525 + }, + { + "epoch": 1.0934035166312017, + "grad_norm": 1.4187829494476318, + "learning_rate": 9.758000000000001e-05, + "loss": 0.4583, + "step": 19526 + }, + { + "epoch": 1.0934595139433307, + "grad_norm": 1.202857255935669, + "learning_rate": 9.758500000000001e-05, + "loss": 0.3423, + "step": 19527 + }, + { + "epoch": 1.0935155112554598, + "grad_norm": 1.4020442962646484, + "learning_rate": 9.759e-05, + "loss": 0.4366, + "step": 19528 + }, + { + "epoch": 1.0935715085675888, + "grad_norm": 1.3176480531692505, + "learning_rate": 9.7595e-05, + "loss": 0.4086, + "step": 19529 + }, + { + "epoch": 1.0936275058797178, + "grad_norm": 1.1130186319351196, + "learning_rate": 9.76e-05, + "loss": 0.3393, + "step": 19530 + }, + { + "epoch": 1.0936835031918468, + "grad_norm": 1.2685071229934692, + "learning_rate": 9.7605e-05, + "loss": 0.4529, + "step": 19531 + }, + { + "epoch": 1.0937395005039758, + "grad_norm": 1.6565825939178467, + "learning_rate": 9.761e-05, + "loss": 0.4969, + "step": 19532 + }, + { + "epoch": 1.0937954978161049, + "grad_norm": 1.3428248167037964, + "learning_rate": 9.7615e-05, + "loss": 0.384, + "step": 19533 + }, + { + "epoch": 1.0938514951282339, + "grad_norm": 1.4593104124069214, + "learning_rate": 9.762e-05, + "loss": 0.4969, + "step": 19534 + }, + { + "epoch": 1.093907492440363, + "grad_norm": 1.5722178220748901, + "learning_rate": 9.7625e-05, + "loss": 0.5102, + "step": 19535 + }, + { + "epoch": 1.093963489752492, + "grad_norm": 1.4600510597229004, + "learning_rate": 9.763e-05, + "loss": 0.3806, + "step": 19536 + }, + { + "epoch": 1.094019487064621, + 
"grad_norm": 1.3687782287597656, + "learning_rate": 9.763500000000001e-05, + "loss": 0.5298, + "step": 19537 + }, + { + "epoch": 1.09407548437675, + "grad_norm": 1.763283610343933, + "learning_rate": 9.764000000000001e-05, + "loss": 0.4613, + "step": 19538 + }, + { + "epoch": 1.094131481688879, + "grad_norm": 1.4355372190475464, + "learning_rate": 9.7645e-05, + "loss": 0.4203, + "step": 19539 + }, + { + "epoch": 1.094187479001008, + "grad_norm": 1.6046817302703857, + "learning_rate": 9.765e-05, + "loss": 0.3555, + "step": 19540 + }, + { + "epoch": 1.094243476313137, + "grad_norm": 1.6429340839385986, + "learning_rate": 9.765500000000001e-05, + "loss": 0.4662, + "step": 19541 + }, + { + "epoch": 1.094299473625266, + "grad_norm": 1.4519562721252441, + "learning_rate": 9.766000000000001e-05, + "loss": 0.4883, + "step": 19542 + }, + { + "epoch": 1.094355470937395, + "grad_norm": 1.3624401092529297, + "learning_rate": 9.766500000000001e-05, + "loss": 0.4762, + "step": 19543 + }, + { + "epoch": 1.094411468249524, + "grad_norm": 1.355677843093872, + "learning_rate": 9.767e-05, + "loss": 0.3493, + "step": 19544 + }, + { + "epoch": 1.094467465561653, + "grad_norm": 6.697079658508301, + "learning_rate": 9.7675e-05, + "loss": 0.4235, + "step": 19545 + }, + { + "epoch": 1.094523462873782, + "grad_norm": 1.138710379600525, + "learning_rate": 9.768e-05, + "loss": 0.3862, + "step": 19546 + }, + { + "epoch": 1.0945794601859111, + "grad_norm": 1.5354340076446533, + "learning_rate": 9.768500000000001e-05, + "loss": 0.4287, + "step": 19547 + }, + { + "epoch": 1.0946354574980401, + "grad_norm": 1.3873952627182007, + "learning_rate": 9.769000000000001e-05, + "loss": 0.3725, + "step": 19548 + }, + { + "epoch": 1.0946914548101692, + "grad_norm": 1.2548458576202393, + "learning_rate": 9.7695e-05, + "loss": 0.3202, + "step": 19549 + }, + { + "epoch": 1.0947474521222982, + "grad_norm": 1.577109932899475, + "learning_rate": 9.77e-05, + "loss": 0.587, + "step": 19550 + }, + { + "epoch": 
1.0948034494344272, + "grad_norm": 1.3174759149551392, + "learning_rate": 9.7705e-05, + "loss": 0.4666, + "step": 19551 + }, + { + "epoch": 1.0948594467465562, + "grad_norm": 1.461652159690857, + "learning_rate": 9.771e-05, + "loss": 0.4711, + "step": 19552 + }, + { + "epoch": 1.0949154440586852, + "grad_norm": 1.3167462348937988, + "learning_rate": 9.7715e-05, + "loss": 0.4441, + "step": 19553 + }, + { + "epoch": 1.0949714413708143, + "grad_norm": 1.1816465854644775, + "learning_rate": 9.772e-05, + "loss": 0.3607, + "step": 19554 + }, + { + "epoch": 1.0950274386829433, + "grad_norm": 1.3829306364059448, + "learning_rate": 9.7725e-05, + "loss": 0.4564, + "step": 19555 + }, + { + "epoch": 1.0950834359950723, + "grad_norm": 1.3023033142089844, + "learning_rate": 9.773e-05, + "loss": 0.3954, + "step": 19556 + }, + { + "epoch": 1.0951394333072013, + "grad_norm": 1.0449341535568237, + "learning_rate": 9.773500000000001e-05, + "loss": 0.3799, + "step": 19557 + }, + { + "epoch": 1.0951954306193303, + "grad_norm": 1.3946932554244995, + "learning_rate": 9.774000000000001e-05, + "loss": 0.428, + "step": 19558 + }, + { + "epoch": 1.0952514279314594, + "grad_norm": 1.5530643463134766, + "learning_rate": 9.774500000000001e-05, + "loss": 0.565, + "step": 19559 + }, + { + "epoch": 1.0953074252435884, + "grad_norm": 1.2083131074905396, + "learning_rate": 9.775e-05, + "loss": 0.2811, + "step": 19560 + }, + { + "epoch": 1.0953634225557174, + "grad_norm": 1.2603507041931152, + "learning_rate": 9.7755e-05, + "loss": 0.596, + "step": 19561 + }, + { + "epoch": 1.0954194198678464, + "grad_norm": 1.5251781940460205, + "learning_rate": 9.776000000000001e-05, + "loss": 0.4129, + "step": 19562 + }, + { + "epoch": 1.0954754171799754, + "grad_norm": 1.2989113330841064, + "learning_rate": 9.776500000000001e-05, + "loss": 0.4234, + "step": 19563 + }, + { + "epoch": 1.0955314144921044, + "grad_norm": 1.4201045036315918, + "learning_rate": 9.777000000000001e-05, + "loss": 0.4128, + "step": 19564 + 
}, + { + "epoch": 1.0955874118042335, + "grad_norm": 1.3607844114303589, + "learning_rate": 9.7775e-05, + "loss": 0.6465, + "step": 19565 + }, + { + "epoch": 1.0956434091163625, + "grad_norm": 1.6310099363327026, + "learning_rate": 9.778e-05, + "loss": 0.4283, + "step": 19566 + }, + { + "epoch": 1.0956994064284915, + "grad_norm": 1.1884732246398926, + "learning_rate": 9.7785e-05, + "loss": 0.3746, + "step": 19567 + }, + { + "epoch": 1.0957554037406205, + "grad_norm": 1.3570232391357422, + "learning_rate": 9.779e-05, + "loss": 0.4903, + "step": 19568 + }, + { + "epoch": 1.0958114010527495, + "grad_norm": 1.44740629196167, + "learning_rate": 9.779500000000001e-05, + "loss": 0.4056, + "step": 19569 + }, + { + "epoch": 1.0958673983648786, + "grad_norm": 1.1972736120224, + "learning_rate": 9.78e-05, + "loss": 0.3, + "step": 19570 + }, + { + "epoch": 1.0959233956770076, + "grad_norm": 1.7060189247131348, + "learning_rate": 9.7805e-05, + "loss": 0.4381, + "step": 19571 + }, + { + "epoch": 1.0959793929891366, + "grad_norm": 1.6647682189941406, + "learning_rate": 9.781e-05, + "loss": 0.4836, + "step": 19572 + }, + { + "epoch": 1.0960353903012656, + "grad_norm": 1.5249102115631104, + "learning_rate": 9.7815e-05, + "loss": 0.4272, + "step": 19573 + }, + { + "epoch": 1.0960913876133946, + "grad_norm": 1.2867717742919922, + "learning_rate": 9.782e-05, + "loss": 0.5391, + "step": 19574 + }, + { + "epoch": 1.0961473849255237, + "grad_norm": 1.3553507328033447, + "learning_rate": 9.7825e-05, + "loss": 0.4903, + "step": 19575 + }, + { + "epoch": 1.0962033822376527, + "grad_norm": 1.1969085931777954, + "learning_rate": 9.783e-05, + "loss": 0.4377, + "step": 19576 + }, + { + "epoch": 1.0962593795497817, + "grad_norm": 1.2163243293762207, + "learning_rate": 9.783500000000001e-05, + "loss": 0.3771, + "step": 19577 + }, + { + "epoch": 1.0963153768619107, + "grad_norm": 2.052074432373047, + "learning_rate": 9.784000000000001e-05, + "loss": 0.3332, + "step": 19578 + }, + { + "epoch": 
1.0963713741740397, + "grad_norm": 1.3686301708221436, + "learning_rate": 9.784500000000001e-05, + "loss": 0.5045, + "step": 19579 + }, + { + "epoch": 1.0964273714861688, + "grad_norm": 1.6917316913604736, + "learning_rate": 9.785e-05, + "loss": 0.4673, + "step": 19580 + }, + { + "epoch": 1.0964833687982978, + "grad_norm": 1.682552456855774, + "learning_rate": 9.7855e-05, + "loss": 0.4664, + "step": 19581 + }, + { + "epoch": 1.0965393661104268, + "grad_norm": 1.4329495429992676, + "learning_rate": 9.786e-05, + "loss": 0.4861, + "step": 19582 + }, + { + "epoch": 1.0965953634225558, + "grad_norm": 1.289271354675293, + "learning_rate": 9.786500000000001e-05, + "loss": 0.3885, + "step": 19583 + }, + { + "epoch": 1.0966513607346848, + "grad_norm": 1.4165812730789185, + "learning_rate": 9.787000000000001e-05, + "loss": 0.4461, + "step": 19584 + }, + { + "epoch": 1.0967073580468139, + "grad_norm": 1.2960909605026245, + "learning_rate": 9.787500000000001e-05, + "loss": 0.5017, + "step": 19585 + }, + { + "epoch": 1.0967633553589429, + "grad_norm": 1.6284939050674438, + "learning_rate": 9.788e-05, + "loss": 0.453, + "step": 19586 + }, + { + "epoch": 1.096819352671072, + "grad_norm": 1.2743760347366333, + "learning_rate": 9.7885e-05, + "loss": 0.2907, + "step": 19587 + }, + { + "epoch": 1.096875349983201, + "grad_norm": 1.3700569868087769, + "learning_rate": 9.789e-05, + "loss": 0.4638, + "step": 19588 + }, + { + "epoch": 1.09693134729533, + "grad_norm": 1.3631771802902222, + "learning_rate": 9.7895e-05, + "loss": 0.3999, + "step": 19589 + }, + { + "epoch": 1.096987344607459, + "grad_norm": 1.553858757019043, + "learning_rate": 9.790000000000001e-05, + "loss": 0.514, + "step": 19590 + }, + { + "epoch": 1.097043341919588, + "grad_norm": 1.1983616352081299, + "learning_rate": 9.7905e-05, + "loss": 0.3723, + "step": 19591 + }, + { + "epoch": 1.097099339231717, + "grad_norm": 1.3355696201324463, + "learning_rate": 9.791e-05, + "loss": 0.3802, + "step": 19592 + }, + { + "epoch": 
1.097155336543846, + "grad_norm": 1.3248353004455566, + "learning_rate": 9.7915e-05, + "loss": 0.5072, + "step": 19593 + }, + { + "epoch": 1.097211333855975, + "grad_norm": 1.1041831970214844, + "learning_rate": 9.792e-05, + "loss": 0.3715, + "step": 19594 + }, + { + "epoch": 1.097267331168104, + "grad_norm": 1.2678688764572144, + "learning_rate": 9.7925e-05, + "loss": 0.4019, + "step": 19595 + }, + { + "epoch": 1.097323328480233, + "grad_norm": 1.381998896598816, + "learning_rate": 9.793e-05, + "loss": 0.5244, + "step": 19596 + }, + { + "epoch": 1.097379325792362, + "grad_norm": 1.5407527685165405, + "learning_rate": 9.7935e-05, + "loss": 0.4575, + "step": 19597 + }, + { + "epoch": 1.097435323104491, + "grad_norm": 1.1095999479293823, + "learning_rate": 9.794000000000001e-05, + "loss": 0.3277, + "step": 19598 + }, + { + "epoch": 1.0974913204166201, + "grad_norm": 1.348344326019287, + "learning_rate": 9.794500000000001e-05, + "loss": 0.4471, + "step": 19599 + }, + { + "epoch": 1.097547317728749, + "grad_norm": 1.2373028993606567, + "learning_rate": 9.795000000000001e-05, + "loss": 0.5059, + "step": 19600 + }, + { + "epoch": 1.097603315040878, + "grad_norm": 1.3573837280273438, + "learning_rate": 9.7955e-05, + "loss": 0.4558, + "step": 19601 + }, + { + "epoch": 1.097659312353007, + "grad_norm": 1.5080405473709106, + "learning_rate": 9.796e-05, + "loss": 0.5021, + "step": 19602 + }, + { + "epoch": 1.097715309665136, + "grad_norm": 1.1538140773773193, + "learning_rate": 9.7965e-05, + "loss": 0.365, + "step": 19603 + }, + { + "epoch": 1.097771306977265, + "grad_norm": 1.2251487970352173, + "learning_rate": 9.797000000000001e-05, + "loss": 0.3905, + "step": 19604 + }, + { + "epoch": 1.097827304289394, + "grad_norm": 1.2731026411056519, + "learning_rate": 9.797500000000001e-05, + "loss": 0.3414, + "step": 19605 + }, + { + "epoch": 1.097883301601523, + "grad_norm": 1.5197020769119263, + "learning_rate": 9.798000000000001e-05, + "loss": 0.5708, + "step": 19606 + }, + { + 
"epoch": 1.097939298913652, + "grad_norm": 1.5866721868515015, + "learning_rate": 9.7985e-05, + "loss": 0.5346, + "step": 19607 + }, + { + "epoch": 1.097995296225781, + "grad_norm": 1.284306526184082, + "learning_rate": 9.799e-05, + "loss": 0.4819, + "step": 19608 + }, + { + "epoch": 1.09805129353791, + "grad_norm": 1.3480887413024902, + "learning_rate": 9.7995e-05, + "loss": 0.4383, + "step": 19609 + }, + { + "epoch": 1.098107290850039, + "grad_norm": 1.904942274093628, + "learning_rate": 9.8e-05, + "loss": 0.433, + "step": 19610 + }, + { + "epoch": 1.0981632881621681, + "grad_norm": 1.3913054466247559, + "learning_rate": 9.800500000000001e-05, + "loss": 0.4088, + "step": 19611 + }, + { + "epoch": 1.0982192854742971, + "grad_norm": 1.3774702548980713, + "learning_rate": 9.801e-05, + "loss": 0.3585, + "step": 19612 + }, + { + "epoch": 1.0982752827864262, + "grad_norm": 1.439720630645752, + "learning_rate": 9.8015e-05, + "loss": 0.4768, + "step": 19613 + }, + { + "epoch": 1.0983312800985552, + "grad_norm": 1.4266635179519653, + "learning_rate": 9.802e-05, + "loss": 0.4101, + "step": 19614 + }, + { + "epoch": 1.0983872774106842, + "grad_norm": 1.3650580644607544, + "learning_rate": 9.8025e-05, + "loss": 0.326, + "step": 19615 + }, + { + "epoch": 1.0984432747228132, + "grad_norm": 1.794957160949707, + "learning_rate": 9.803e-05, + "loss": 0.5428, + "step": 19616 + }, + { + "epoch": 1.0984992720349422, + "grad_norm": 1.2034342288970947, + "learning_rate": 9.8035e-05, + "loss": 0.361, + "step": 19617 + }, + { + "epoch": 1.0985552693470713, + "grad_norm": 1.705862045288086, + "learning_rate": 9.804e-05, + "loss": 0.5285, + "step": 19618 + }, + { + "epoch": 1.0986112666592003, + "grad_norm": 1.4839178323745728, + "learning_rate": 9.804500000000001e-05, + "loss": 0.5371, + "step": 19619 + }, + { + "epoch": 1.0986672639713293, + "grad_norm": 1.4856160879135132, + "learning_rate": 9.805000000000001e-05, + "loss": 0.4007, + "step": 19620 + }, + { + "epoch": 
1.0987232612834583, + "grad_norm": 1.6990814208984375, + "learning_rate": 9.805500000000001e-05, + "loss": 0.4334, + "step": 19621 + }, + { + "epoch": 1.0987792585955873, + "grad_norm": 1.3838716745376587, + "learning_rate": 9.806e-05, + "loss": 0.5617, + "step": 19622 + }, + { + "epoch": 1.0988352559077164, + "grad_norm": 1.3111406564712524, + "learning_rate": 9.8065e-05, + "loss": 0.5367, + "step": 19623 + }, + { + "epoch": 1.0988912532198454, + "grad_norm": 1.1596437692642212, + "learning_rate": 9.807e-05, + "loss": 0.3757, + "step": 19624 + }, + { + "epoch": 1.0989472505319744, + "grad_norm": 1.465855360031128, + "learning_rate": 9.807500000000001e-05, + "loss": 0.4408, + "step": 19625 + }, + { + "epoch": 1.0990032478441034, + "grad_norm": 1.2543396949768066, + "learning_rate": 9.808000000000001e-05, + "loss": 0.4489, + "step": 19626 + }, + { + "epoch": 1.0990592451562324, + "grad_norm": 1.525762677192688, + "learning_rate": 9.808500000000001e-05, + "loss": 0.5929, + "step": 19627 + }, + { + "epoch": 1.0991152424683615, + "grad_norm": 1.179938554763794, + "learning_rate": 9.809e-05, + "loss": 0.4061, + "step": 19628 + }, + { + "epoch": 1.0991712397804905, + "grad_norm": 1.2849570512771606, + "learning_rate": 9.8095e-05, + "loss": 0.4987, + "step": 19629 + }, + { + "epoch": 1.0992272370926195, + "grad_norm": 1.1673990488052368, + "learning_rate": 9.81e-05, + "loss": 0.3156, + "step": 19630 + }, + { + "epoch": 1.0992832344047485, + "grad_norm": 1.4120322465896606, + "learning_rate": 9.8105e-05, + "loss": 0.4933, + "step": 19631 + }, + { + "epoch": 1.0993392317168775, + "grad_norm": 1.819898009300232, + "learning_rate": 9.811000000000001e-05, + "loss": 0.4087, + "step": 19632 + }, + { + "epoch": 1.0993952290290065, + "grad_norm": 1.1839394569396973, + "learning_rate": 9.8115e-05, + "loss": 0.4765, + "step": 19633 + }, + { + "epoch": 1.0994512263411356, + "grad_norm": 1.347901463508606, + "learning_rate": 9.812e-05, + "loss": 0.417, + "step": 19634 + }, + { + 
"epoch": 1.0995072236532646, + "grad_norm": 1.4624589681625366, + "learning_rate": 9.8125e-05, + "loss": 0.4186, + "step": 19635 + }, + { + "epoch": 1.0995632209653936, + "grad_norm": 1.3179152011871338, + "learning_rate": 9.813e-05, + "loss": 0.3827, + "step": 19636 + }, + { + "epoch": 1.0996192182775226, + "grad_norm": 1.1995933055877686, + "learning_rate": 9.8135e-05, + "loss": 0.3866, + "step": 19637 + }, + { + "epoch": 1.0996752155896516, + "grad_norm": 1.2979129552841187, + "learning_rate": 9.814e-05, + "loss": 0.4895, + "step": 19638 + }, + { + "epoch": 1.0997312129017807, + "grad_norm": 1.2624669075012207, + "learning_rate": 9.8145e-05, + "loss": 0.4745, + "step": 19639 + }, + { + "epoch": 1.0997872102139097, + "grad_norm": 1.2604899406433105, + "learning_rate": 9.815000000000001e-05, + "loss": 0.4489, + "step": 19640 + }, + { + "epoch": 1.0998432075260387, + "grad_norm": 1.3490601778030396, + "learning_rate": 9.815500000000001e-05, + "loss": 0.414, + "step": 19641 + }, + { + "epoch": 1.0998992048381677, + "grad_norm": 1.4333354234695435, + "learning_rate": 9.816000000000001e-05, + "loss": 0.43, + "step": 19642 + }, + { + "epoch": 1.0999552021502967, + "grad_norm": 1.474410057067871, + "learning_rate": 9.8165e-05, + "loss": 0.5682, + "step": 19643 + }, + { + "epoch": 1.1000111994624258, + "grad_norm": 1.4290974140167236, + "learning_rate": 9.817e-05, + "loss": 0.4719, + "step": 19644 + }, + { + "epoch": 1.1000671967745548, + "grad_norm": 1.4126131534576416, + "learning_rate": 9.8175e-05, + "loss": 0.4099, + "step": 19645 + }, + { + "epoch": 1.1001231940866838, + "grad_norm": 1.2462862730026245, + "learning_rate": 9.818000000000001e-05, + "loss": 0.3927, + "step": 19646 + }, + { + "epoch": 1.1001791913988128, + "grad_norm": 1.3805636167526245, + "learning_rate": 9.818500000000001e-05, + "loss": 0.3539, + "step": 19647 + }, + { + "epoch": 1.1002351887109418, + "grad_norm": 1.6017307043075562, + "learning_rate": 9.819000000000001e-05, + "loss": 0.434, + 
"step": 19648 + }, + { + "epoch": 1.1002911860230709, + "grad_norm": 1.4915167093276978, + "learning_rate": 9.8195e-05, + "loss": 0.2893, + "step": 19649 + }, + { + "epoch": 1.1003471833351999, + "grad_norm": 1.3202970027923584, + "learning_rate": 9.82e-05, + "loss": 0.4855, + "step": 19650 + }, + { + "epoch": 1.100403180647329, + "grad_norm": 1.255157232284546, + "learning_rate": 9.8205e-05, + "loss": 0.383, + "step": 19651 + }, + { + "epoch": 1.100459177959458, + "grad_norm": 1.5192474126815796, + "learning_rate": 9.821e-05, + "loss": 0.5071, + "step": 19652 + }, + { + "epoch": 1.100515175271587, + "grad_norm": 1.2128627300262451, + "learning_rate": 9.821500000000001e-05, + "loss": 0.352, + "step": 19653 + }, + { + "epoch": 1.100571172583716, + "grad_norm": 1.457595944404602, + "learning_rate": 9.822e-05, + "loss": 0.3849, + "step": 19654 + }, + { + "epoch": 1.100627169895845, + "grad_norm": 1.1318013668060303, + "learning_rate": 9.8225e-05, + "loss": 0.3474, + "step": 19655 + }, + { + "epoch": 1.100683167207974, + "grad_norm": 1.130577564239502, + "learning_rate": 9.823e-05, + "loss": 0.4295, + "step": 19656 + }, + { + "epoch": 1.100739164520103, + "grad_norm": 1.4926098585128784, + "learning_rate": 9.8235e-05, + "loss": 0.3817, + "step": 19657 + }, + { + "epoch": 1.100795161832232, + "grad_norm": 1.287928819656372, + "learning_rate": 9.824000000000001e-05, + "loss": 0.3621, + "step": 19658 + }, + { + "epoch": 1.100851159144361, + "grad_norm": 1.5619772672653198, + "learning_rate": 9.8245e-05, + "loss": 0.4055, + "step": 19659 + }, + { + "epoch": 1.10090715645649, + "grad_norm": 1.1738126277923584, + "learning_rate": 9.825e-05, + "loss": 0.301, + "step": 19660 + }, + { + "epoch": 1.100963153768619, + "grad_norm": 1.506620168685913, + "learning_rate": 9.825500000000001e-05, + "loss": 0.4636, + "step": 19661 + }, + { + "epoch": 1.101019151080748, + "grad_norm": 1.37851881980896, + "learning_rate": 9.826000000000001e-05, + "loss": 0.4036, + "step": 19662 + }, + { + 
"epoch": 1.1010751483928771, + "grad_norm": 1.145391821861267, + "learning_rate": 9.826500000000001e-05, + "loss": 0.4037, + "step": 19663 + }, + { + "epoch": 1.1011311457050061, + "grad_norm": 1.5921082496643066, + "learning_rate": 9.827e-05, + "loss": 0.4837, + "step": 19664 + }, + { + "epoch": 1.1011871430171352, + "grad_norm": 1.2360310554504395, + "learning_rate": 9.8275e-05, + "loss": 0.3909, + "step": 19665 + }, + { + "epoch": 1.1012431403292642, + "grad_norm": 1.3479084968566895, + "learning_rate": 9.828e-05, + "loss": 0.3891, + "step": 19666 + }, + { + "epoch": 1.1012991376413932, + "grad_norm": 1.4180879592895508, + "learning_rate": 9.8285e-05, + "loss": 0.4824, + "step": 19667 + }, + { + "epoch": 1.1013551349535222, + "grad_norm": 1.416598916053772, + "learning_rate": 9.829000000000001e-05, + "loss": 0.4931, + "step": 19668 + }, + { + "epoch": 1.1014111322656512, + "grad_norm": 1.3986800909042358, + "learning_rate": 9.8295e-05, + "loss": 0.4377, + "step": 19669 + }, + { + "epoch": 1.1014671295777803, + "grad_norm": 1.4680436849594116, + "learning_rate": 9.83e-05, + "loss": 0.4207, + "step": 19670 + }, + { + "epoch": 1.1015231268899093, + "grad_norm": 1.2534722089767456, + "learning_rate": 9.8305e-05, + "loss": 0.4465, + "step": 19671 + }, + { + "epoch": 1.1015791242020383, + "grad_norm": 1.4206799268722534, + "learning_rate": 9.831e-05, + "loss": 0.3915, + "step": 19672 + }, + { + "epoch": 1.1016351215141673, + "grad_norm": 1.613774299621582, + "learning_rate": 9.8315e-05, + "loss": 0.5944, + "step": 19673 + }, + { + "epoch": 1.1016911188262963, + "grad_norm": 1.422844409942627, + "learning_rate": 9.832000000000001e-05, + "loss": 0.4587, + "step": 19674 + }, + { + "epoch": 1.1017471161384254, + "grad_norm": 1.4094058275222778, + "learning_rate": 9.8325e-05, + "loss": 0.4965, + "step": 19675 + }, + { + "epoch": 1.1018031134505544, + "grad_norm": 1.4469799995422363, + "learning_rate": 9.833e-05, + "loss": 0.4653, + "step": 19676 + }, + { + "epoch": 
1.1018591107626834, + "grad_norm": 1.2660422325134277, + "learning_rate": 9.8335e-05, + "loss": 0.4329, + "step": 19677 + }, + { + "epoch": 1.1019151080748124, + "grad_norm": 1.3411662578582764, + "learning_rate": 9.834000000000001e-05, + "loss": 0.4491, + "step": 19678 + }, + { + "epoch": 1.1019711053869414, + "grad_norm": 1.233513593673706, + "learning_rate": 9.834500000000001e-05, + "loss": 0.5012, + "step": 19679 + }, + { + "epoch": 1.1020271026990704, + "grad_norm": 1.741809368133545, + "learning_rate": 9.835e-05, + "loss": 0.4552, + "step": 19680 + }, + { + "epoch": 1.1020831000111995, + "grad_norm": 1.3911317586898804, + "learning_rate": 9.8355e-05, + "loss": 0.4226, + "step": 19681 + }, + { + "epoch": 1.1021390973233285, + "grad_norm": 1.219521403312683, + "learning_rate": 9.836000000000001e-05, + "loss": 0.3876, + "step": 19682 + }, + { + "epoch": 1.1021950946354575, + "grad_norm": 1.204106092453003, + "learning_rate": 9.836500000000001e-05, + "loss": 0.3628, + "step": 19683 + }, + { + "epoch": 1.1022510919475865, + "grad_norm": 1.6147651672363281, + "learning_rate": 9.837000000000001e-05, + "loss": 0.4808, + "step": 19684 + }, + { + "epoch": 1.1023070892597155, + "grad_norm": 1.4001786708831787, + "learning_rate": 9.8375e-05, + "loss": 0.4303, + "step": 19685 + }, + { + "epoch": 1.1023630865718446, + "grad_norm": 1.4113482236862183, + "learning_rate": 9.838e-05, + "loss": 0.5507, + "step": 19686 + }, + { + "epoch": 1.1024190838839736, + "grad_norm": 1.3065470457077026, + "learning_rate": 9.8385e-05, + "loss": 0.4914, + "step": 19687 + }, + { + "epoch": 1.1024750811961026, + "grad_norm": 1.4770466089248657, + "learning_rate": 9.839e-05, + "loss": 0.4447, + "step": 19688 + }, + { + "epoch": 1.1025310785082316, + "grad_norm": 1.696991205215454, + "learning_rate": 9.839500000000001e-05, + "loss": 0.6721, + "step": 19689 + }, + { + "epoch": 1.1025870758203606, + "grad_norm": 1.3368000984191895, + "learning_rate": 9.84e-05, + "loss": 0.3748, + "step": 19690 + 
}, + { + "epoch": 1.1026430731324897, + "grad_norm": 1.4557217359542847, + "learning_rate": 9.8405e-05, + "loss": 0.3309, + "step": 19691 + }, + { + "epoch": 1.1026990704446187, + "grad_norm": 1.1468091011047363, + "learning_rate": 9.841e-05, + "loss": 0.323, + "step": 19692 + }, + { + "epoch": 1.1027550677567477, + "grad_norm": 1.4091640710830688, + "learning_rate": 9.8415e-05, + "loss": 0.4994, + "step": 19693 + }, + { + "epoch": 1.1028110650688767, + "grad_norm": 1.1214548349380493, + "learning_rate": 9.842e-05, + "loss": 0.4106, + "step": 19694 + }, + { + "epoch": 1.1028670623810057, + "grad_norm": 1.9984763860702515, + "learning_rate": 9.842500000000001e-05, + "loss": 0.5373, + "step": 19695 + }, + { + "epoch": 1.1029230596931348, + "grad_norm": 1.2726832628250122, + "learning_rate": 9.843e-05, + "loss": 0.4828, + "step": 19696 + }, + { + "epoch": 1.1029790570052638, + "grad_norm": 1.4429244995117188, + "learning_rate": 9.8435e-05, + "loss": 0.4125, + "step": 19697 + }, + { + "epoch": 1.1030350543173928, + "grad_norm": 1.4349812269210815, + "learning_rate": 9.844000000000001e-05, + "loss": 0.3782, + "step": 19698 + }, + { + "epoch": 1.1030910516295218, + "grad_norm": 1.1398873329162598, + "learning_rate": 9.844500000000001e-05, + "loss": 0.3644, + "step": 19699 + }, + { + "epoch": 1.1031470489416508, + "grad_norm": 1.466614842414856, + "learning_rate": 9.845000000000001e-05, + "loss": 0.4185, + "step": 19700 + }, + { + "epoch": 1.1032030462537799, + "grad_norm": 1.4406763315200806, + "learning_rate": 9.8455e-05, + "loss": 0.5374, + "step": 19701 + }, + { + "epoch": 1.1032590435659089, + "grad_norm": 1.4726649522781372, + "learning_rate": 9.846e-05, + "loss": 0.4873, + "step": 19702 + }, + { + "epoch": 1.103315040878038, + "grad_norm": 1.8432420492172241, + "learning_rate": 9.846500000000001e-05, + "loss": 0.5391, + "step": 19703 + }, + { + "epoch": 1.103371038190167, + "grad_norm": 1.6330406665802002, + "learning_rate": 9.847000000000001e-05, + "loss": 0.4411, 
+ "step": 19704 + }, + { + "epoch": 1.103427035502296, + "grad_norm": 1.1099885702133179, + "learning_rate": 9.847500000000001e-05, + "loss": 0.358, + "step": 19705 + }, + { + "epoch": 1.103483032814425, + "grad_norm": 1.2998239994049072, + "learning_rate": 9.848e-05, + "loss": 0.6747, + "step": 19706 + }, + { + "epoch": 1.103539030126554, + "grad_norm": 1.2724019289016724, + "learning_rate": 9.8485e-05, + "loss": 0.3967, + "step": 19707 + }, + { + "epoch": 1.103595027438683, + "grad_norm": 1.0637582540512085, + "learning_rate": 9.849e-05, + "loss": 0.5166, + "step": 19708 + }, + { + "epoch": 1.103651024750812, + "grad_norm": 1.5849372148513794, + "learning_rate": 9.8495e-05, + "loss": 0.5009, + "step": 19709 + }, + { + "epoch": 1.103707022062941, + "grad_norm": 1.1329911947250366, + "learning_rate": 9.850000000000001e-05, + "loss": 0.4318, + "step": 19710 + }, + { + "epoch": 1.10376301937507, + "grad_norm": 1.33184814453125, + "learning_rate": 9.8505e-05, + "loss": 0.3949, + "step": 19711 + }, + { + "epoch": 1.103819016687199, + "grad_norm": 1.7777830362319946, + "learning_rate": 9.851e-05, + "loss": 0.2906, + "step": 19712 + }, + { + "epoch": 1.103875013999328, + "grad_norm": 1.3954609632492065, + "learning_rate": 9.8515e-05, + "loss": 0.4586, + "step": 19713 + }, + { + "epoch": 1.103931011311457, + "grad_norm": 1.442158818244934, + "learning_rate": 9.852e-05, + "loss": 0.4672, + "step": 19714 + }, + { + "epoch": 1.1039870086235861, + "grad_norm": 1.2966798543930054, + "learning_rate": 9.8525e-05, + "loss": 0.3713, + "step": 19715 + }, + { + "epoch": 1.1040430059357151, + "grad_norm": 1.3441522121429443, + "learning_rate": 9.853e-05, + "loss": 0.5063, + "step": 19716 + }, + { + "epoch": 1.1040990032478442, + "grad_norm": 1.5825473070144653, + "learning_rate": 9.8535e-05, + "loss": 0.4611, + "step": 19717 + }, + { + "epoch": 1.1041550005599732, + "grad_norm": 1.2520579099655151, + "learning_rate": 9.854000000000002e-05, + "loss": 0.4068, + "step": 19718 + }, + { + 
"epoch": 1.1042109978721022, + "grad_norm": 1.5830234289169312, + "learning_rate": 9.854500000000001e-05, + "loss": 0.544, + "step": 19719 + }, + { + "epoch": 1.1042669951842312, + "grad_norm": 1.355677604675293, + "learning_rate": 9.855000000000001e-05, + "loss": 0.4227, + "step": 19720 + }, + { + "epoch": 1.1043229924963602, + "grad_norm": 1.1585386991500854, + "learning_rate": 9.855500000000001e-05, + "loss": 0.412, + "step": 19721 + }, + { + "epoch": 1.1043789898084893, + "grad_norm": 1.249297857284546, + "learning_rate": 9.856e-05, + "loss": 0.4418, + "step": 19722 + }, + { + "epoch": 1.1044349871206183, + "grad_norm": 1.499921202659607, + "learning_rate": 9.8565e-05, + "loss": 0.4302, + "step": 19723 + }, + { + "epoch": 1.1044909844327473, + "grad_norm": 1.3647310733795166, + "learning_rate": 9.857000000000001e-05, + "loss": 0.3631, + "step": 19724 + }, + { + "epoch": 1.1045469817448763, + "grad_norm": 1.7964094877243042, + "learning_rate": 9.857500000000001e-05, + "loss": 0.5309, + "step": 19725 + }, + { + "epoch": 1.1046029790570053, + "grad_norm": 1.1209633350372314, + "learning_rate": 9.858000000000001e-05, + "loss": 0.3896, + "step": 19726 + }, + { + "epoch": 1.1046589763691343, + "grad_norm": 1.3948016166687012, + "learning_rate": 9.8585e-05, + "loss": 0.3297, + "step": 19727 + }, + { + "epoch": 1.1047149736812634, + "grad_norm": 1.5150035619735718, + "learning_rate": 9.859e-05, + "loss": 0.419, + "step": 19728 + }, + { + "epoch": 1.1047709709933924, + "grad_norm": 1.659343957901001, + "learning_rate": 9.8595e-05, + "loss": 0.6311, + "step": 19729 + }, + { + "epoch": 1.1048269683055214, + "grad_norm": 1.4386720657348633, + "learning_rate": 9.86e-05, + "loss": 0.4453, + "step": 19730 + }, + { + "epoch": 1.1048829656176504, + "grad_norm": 1.4890894889831543, + "learning_rate": 9.860500000000001e-05, + "loss": 0.518, + "step": 19731 + }, + { + "epoch": 1.1049389629297794, + "grad_norm": 1.4367374181747437, + "learning_rate": 9.861e-05, + "loss": 0.5873, + 
"step": 19732 + }, + { + "epoch": 1.1049949602419085, + "grad_norm": 1.6929882764816284, + "learning_rate": 9.8615e-05, + "loss": 0.4449, + "step": 19733 + }, + { + "epoch": 1.1050509575540375, + "grad_norm": 1.3177821636199951, + "learning_rate": 9.862e-05, + "loss": 0.3507, + "step": 19734 + }, + { + "epoch": 1.1051069548661665, + "grad_norm": 1.3555843830108643, + "learning_rate": 9.8625e-05, + "loss": 0.4354, + "step": 19735 + }, + { + "epoch": 1.1051629521782955, + "grad_norm": 1.4369921684265137, + "learning_rate": 9.863e-05, + "loss": 0.4411, + "step": 19736 + }, + { + "epoch": 1.1052189494904245, + "grad_norm": 1.2860368490219116, + "learning_rate": 9.8635e-05, + "loss": 0.3695, + "step": 19737 + }, + { + "epoch": 1.1052749468025536, + "grad_norm": 1.36696457862854, + "learning_rate": 9.864e-05, + "loss": 0.4118, + "step": 19738 + }, + { + "epoch": 1.1053309441146826, + "grad_norm": 1.2955561876296997, + "learning_rate": 9.864500000000002e-05, + "loss": 0.3879, + "step": 19739 + }, + { + "epoch": 1.1053869414268116, + "grad_norm": 1.7504791021347046, + "learning_rate": 9.865000000000001e-05, + "loss": 0.4655, + "step": 19740 + }, + { + "epoch": 1.1054429387389406, + "grad_norm": 1.4993438720703125, + "learning_rate": 9.865500000000001e-05, + "loss": 0.4497, + "step": 19741 + }, + { + "epoch": 1.1054989360510696, + "grad_norm": 1.411922574043274, + "learning_rate": 9.866000000000001e-05, + "loss": 0.5643, + "step": 19742 + }, + { + "epoch": 1.1055549333631987, + "grad_norm": 1.3271328210830688, + "learning_rate": 9.8665e-05, + "loss": 0.4518, + "step": 19743 + }, + { + "epoch": 1.1056109306753277, + "grad_norm": 1.5522270202636719, + "learning_rate": 9.867e-05, + "loss": 0.4669, + "step": 19744 + }, + { + "epoch": 1.1056669279874567, + "grad_norm": 1.6213088035583496, + "learning_rate": 9.8675e-05, + "loss": 0.39, + "step": 19745 + }, + { + "epoch": 1.1057229252995857, + "grad_norm": 1.1774853467941284, + "learning_rate": 9.868000000000001e-05, + "loss": 
0.4392, + "step": 19746 + }, + { + "epoch": 1.1057789226117147, + "grad_norm": 1.3302592039108276, + "learning_rate": 9.868500000000001e-05, + "loss": 0.4107, + "step": 19747 + }, + { + "epoch": 1.1058349199238438, + "grad_norm": 1.5091776847839355, + "learning_rate": 9.869e-05, + "loss": 0.4309, + "step": 19748 + }, + { + "epoch": 1.1058909172359728, + "grad_norm": 1.4652016162872314, + "learning_rate": 9.8695e-05, + "loss": 0.4422, + "step": 19749 + }, + { + "epoch": 1.1059469145481018, + "grad_norm": 1.6019277572631836, + "learning_rate": 9.87e-05, + "loss": 0.4204, + "step": 19750 + }, + { + "epoch": 1.1060029118602308, + "grad_norm": 1.4288212060928345, + "learning_rate": 9.8705e-05, + "loss": 0.4841, + "step": 19751 + }, + { + "epoch": 1.1060589091723598, + "grad_norm": 1.8394572734832764, + "learning_rate": 9.871000000000001e-05, + "loss": 0.4747, + "step": 19752 + }, + { + "epoch": 1.1061149064844888, + "grad_norm": 1.221510648727417, + "learning_rate": 9.8715e-05, + "loss": 0.463, + "step": 19753 + }, + { + "epoch": 1.1061709037966179, + "grad_norm": 1.2516493797302246, + "learning_rate": 9.872e-05, + "loss": 0.3969, + "step": 19754 + }, + { + "epoch": 1.1062269011087467, + "grad_norm": 1.5816762447357178, + "learning_rate": 9.8725e-05, + "loss": 0.449, + "step": 19755 + }, + { + "epoch": 1.1062828984208757, + "grad_norm": 1.6973276138305664, + "learning_rate": 9.873e-05, + "loss": 0.4978, + "step": 19756 + }, + { + "epoch": 1.1063388957330047, + "grad_norm": 1.1648826599121094, + "learning_rate": 9.8735e-05, + "loss": 0.3786, + "step": 19757 + }, + { + "epoch": 1.1063948930451337, + "grad_norm": 1.2789723873138428, + "learning_rate": 9.874e-05, + "loss": 0.3556, + "step": 19758 + }, + { + "epoch": 1.1064508903572627, + "grad_norm": 1.9684841632843018, + "learning_rate": 9.8745e-05, + "loss": 0.4716, + "step": 19759 + }, + { + "epoch": 1.1065068876693918, + "grad_norm": 1.2908488512039185, + "learning_rate": 9.875000000000002e-05, + "loss": 0.4018, + 
"step": 19760 + }, + { + "epoch": 1.1065628849815208, + "grad_norm": 1.6126617193222046, + "learning_rate": 9.875500000000001e-05, + "loss": 0.4275, + "step": 19761 + }, + { + "epoch": 1.1066188822936498, + "grad_norm": 1.5051411390304565, + "learning_rate": 9.876000000000001e-05, + "loss": 0.4489, + "step": 19762 + }, + { + "epoch": 1.1066748796057788, + "grad_norm": 1.4963515996932983, + "learning_rate": 9.876500000000001e-05, + "loss": 0.4229, + "step": 19763 + }, + { + "epoch": 1.1067308769179078, + "grad_norm": 1.419785499572754, + "learning_rate": 9.877e-05, + "loss": 0.4226, + "step": 19764 + }, + { + "epoch": 1.1067868742300369, + "grad_norm": 1.2918188571929932, + "learning_rate": 9.8775e-05, + "loss": 0.4028, + "step": 19765 + }, + { + "epoch": 1.1068428715421659, + "grad_norm": 1.3430591821670532, + "learning_rate": 9.878e-05, + "loss": 0.4244, + "step": 19766 + }, + { + "epoch": 1.106898868854295, + "grad_norm": 1.2693862915039062, + "learning_rate": 9.878500000000001e-05, + "loss": 0.4661, + "step": 19767 + }, + { + "epoch": 1.106954866166424, + "grad_norm": 1.434861421585083, + "learning_rate": 9.879000000000001e-05, + "loss": 0.5121, + "step": 19768 + }, + { + "epoch": 1.107010863478553, + "grad_norm": 1.250809669494629, + "learning_rate": 9.8795e-05, + "loss": 0.3467, + "step": 19769 + }, + { + "epoch": 1.107066860790682, + "grad_norm": 1.286453127861023, + "learning_rate": 9.88e-05, + "loss": 0.5006, + "step": 19770 + }, + { + "epoch": 1.107122858102811, + "grad_norm": 1.2133138179779053, + "learning_rate": 9.8805e-05, + "loss": 0.4423, + "step": 19771 + }, + { + "epoch": 1.10717885541494, + "grad_norm": 1.489060401916504, + "learning_rate": 9.881e-05, + "loss": 0.3541, + "step": 19772 + }, + { + "epoch": 1.107234852727069, + "grad_norm": 1.31699800491333, + "learning_rate": 9.881500000000001e-05, + "loss": 0.3656, + "step": 19773 + }, + { + "epoch": 1.107290850039198, + "grad_norm": 1.2022706270217896, + "learning_rate": 9.882e-05, + "loss": 
0.4431, + "step": 19774 + }, + { + "epoch": 1.107346847351327, + "grad_norm": 1.1489864587783813, + "learning_rate": 9.8825e-05, + "loss": 0.3205, + "step": 19775 + }, + { + "epoch": 1.107402844663456, + "grad_norm": 1.2494655847549438, + "learning_rate": 9.883e-05, + "loss": 0.3814, + "step": 19776 + }, + { + "epoch": 1.107458841975585, + "grad_norm": 1.5846800804138184, + "learning_rate": 9.8835e-05, + "loss": 0.4948, + "step": 19777 + }, + { + "epoch": 1.107514839287714, + "grad_norm": 1.374254584312439, + "learning_rate": 9.884e-05, + "loss": 0.5802, + "step": 19778 + }, + { + "epoch": 1.1075708365998431, + "grad_norm": 1.5100247859954834, + "learning_rate": 9.8845e-05, + "loss": 0.4543, + "step": 19779 + }, + { + "epoch": 1.1076268339119721, + "grad_norm": 1.4424753189086914, + "learning_rate": 9.885e-05, + "loss": 0.4236, + "step": 19780 + }, + { + "epoch": 1.1076828312241012, + "grad_norm": 1.4685970544815063, + "learning_rate": 9.885500000000001e-05, + "loss": 0.4548, + "step": 19781 + }, + { + "epoch": 1.1077388285362302, + "grad_norm": 1.7806181907653809, + "learning_rate": 9.886000000000001e-05, + "loss": 0.3814, + "step": 19782 + }, + { + "epoch": 1.1077948258483592, + "grad_norm": 1.264974594116211, + "learning_rate": 9.886500000000001e-05, + "loss": 0.3284, + "step": 19783 + }, + { + "epoch": 1.1078508231604882, + "grad_norm": 1.2328931093215942, + "learning_rate": 9.887000000000001e-05, + "loss": 0.5104, + "step": 19784 + }, + { + "epoch": 1.1079068204726172, + "grad_norm": 1.28622567653656, + "learning_rate": 9.8875e-05, + "loss": 0.4541, + "step": 19785 + }, + { + "epoch": 1.1079628177847463, + "grad_norm": 1.5155245065689087, + "learning_rate": 9.888e-05, + "loss": 0.4498, + "step": 19786 + }, + { + "epoch": 1.1080188150968753, + "grad_norm": 1.269075632095337, + "learning_rate": 9.8885e-05, + "loss": 0.2918, + "step": 19787 + }, + { + "epoch": 1.1080748124090043, + "grad_norm": 2.4623186588287354, + "learning_rate": 9.889000000000001e-05, + 
"loss": 0.4697, + "step": 19788 + }, + { + "epoch": 1.1081308097211333, + "grad_norm": 1.508831262588501, + "learning_rate": 9.889500000000001e-05, + "loss": 0.629, + "step": 19789 + }, + { + "epoch": 1.1081868070332623, + "grad_norm": 1.5116913318634033, + "learning_rate": 9.89e-05, + "loss": 0.5501, + "step": 19790 + }, + { + "epoch": 1.1082428043453914, + "grad_norm": 1.5507465600967407, + "learning_rate": 9.8905e-05, + "loss": 0.443, + "step": 19791 + }, + { + "epoch": 1.1082988016575204, + "grad_norm": 1.4017212390899658, + "learning_rate": 9.891e-05, + "loss": 0.4627, + "step": 19792 + }, + { + "epoch": 1.1083547989696494, + "grad_norm": 1.7601009607315063, + "learning_rate": 9.8915e-05, + "loss": 0.323, + "step": 19793 + }, + { + "epoch": 1.1084107962817784, + "grad_norm": 1.5531072616577148, + "learning_rate": 9.892e-05, + "loss": 0.4752, + "step": 19794 + }, + { + "epoch": 1.1084667935939074, + "grad_norm": 1.5044505596160889, + "learning_rate": 9.8925e-05, + "loss": 0.5064, + "step": 19795 + }, + { + "epoch": 1.1085227909060364, + "grad_norm": 1.2802830934524536, + "learning_rate": 9.893e-05, + "loss": 0.5113, + "step": 19796 + }, + { + "epoch": 1.1085787882181655, + "grad_norm": 1.742136001586914, + "learning_rate": 9.8935e-05, + "loss": 0.7409, + "step": 19797 + }, + { + "epoch": 1.1086347855302945, + "grad_norm": 1.4399957656860352, + "learning_rate": 9.894e-05, + "loss": 0.5201, + "step": 19798 + }, + { + "epoch": 1.1086907828424235, + "grad_norm": 1.4919958114624023, + "learning_rate": 9.894500000000001e-05, + "loss": 0.4162, + "step": 19799 + }, + { + "epoch": 1.1087467801545525, + "grad_norm": 1.313231348991394, + "learning_rate": 9.895e-05, + "loss": 0.377, + "step": 19800 + }, + { + "epoch": 1.1088027774666815, + "grad_norm": 1.7701061964035034, + "learning_rate": 9.8955e-05, + "loss": 0.4284, + "step": 19801 + }, + { + "epoch": 1.1088587747788106, + "grad_norm": 1.6026102304458618, + "learning_rate": 9.896000000000001e-05, + "loss": 0.4224, + 
"step": 19802 + }, + { + "epoch": 1.1089147720909396, + "grad_norm": 1.1849709749221802, + "learning_rate": 9.896500000000001e-05, + "loss": 0.4069, + "step": 19803 + }, + { + "epoch": 1.1089707694030686, + "grad_norm": 1.3921117782592773, + "learning_rate": 9.897000000000001e-05, + "loss": 0.4451, + "step": 19804 + }, + { + "epoch": 1.1090267667151976, + "grad_norm": 1.1940969228744507, + "learning_rate": 9.897500000000001e-05, + "loss": 0.4032, + "step": 19805 + }, + { + "epoch": 1.1090827640273266, + "grad_norm": 1.337113857269287, + "learning_rate": 9.898e-05, + "loss": 0.4439, + "step": 19806 + }, + { + "epoch": 1.1091387613394557, + "grad_norm": 1.444793701171875, + "learning_rate": 9.8985e-05, + "loss": 0.494, + "step": 19807 + }, + { + "epoch": 1.1091947586515847, + "grad_norm": 1.5827836990356445, + "learning_rate": 9.899e-05, + "loss": 0.4808, + "step": 19808 + }, + { + "epoch": 1.1092507559637137, + "grad_norm": 1.4604209661483765, + "learning_rate": 9.899500000000001e-05, + "loss": 0.4782, + "step": 19809 + }, + { + "epoch": 1.1093067532758427, + "grad_norm": 1.7103921175003052, + "learning_rate": 9.900000000000001e-05, + "loss": 0.7892, + "step": 19810 + }, + { + "epoch": 1.1093627505879717, + "grad_norm": 1.7148162126541138, + "learning_rate": 9.9005e-05, + "loss": 0.3339, + "step": 19811 + }, + { + "epoch": 1.1094187479001008, + "grad_norm": 1.3999277353286743, + "learning_rate": 9.901e-05, + "loss": 0.5298, + "step": 19812 + }, + { + "epoch": 1.1094747452122298, + "grad_norm": 1.165982723236084, + "learning_rate": 9.9015e-05, + "loss": 0.4168, + "step": 19813 + }, + { + "epoch": 1.1095307425243588, + "grad_norm": 1.240397334098816, + "learning_rate": 9.902e-05, + "loss": 0.3886, + "step": 19814 + }, + { + "epoch": 1.1095867398364878, + "grad_norm": 1.2412090301513672, + "learning_rate": 9.9025e-05, + "loss": 0.32, + "step": 19815 + }, + { + "epoch": 1.1096427371486168, + "grad_norm": 1.3912166357040405, + "learning_rate": 9.903e-05, + "loss": 
0.4508, + "step": 19816 + }, + { + "epoch": 1.1096987344607459, + "grad_norm": 1.2844611406326294, + "learning_rate": 9.9035e-05, + "loss": 0.4375, + "step": 19817 + }, + { + "epoch": 1.1097547317728749, + "grad_norm": 2.117445707321167, + "learning_rate": 9.904e-05, + "loss": 0.601, + "step": 19818 + }, + { + "epoch": 1.1098107290850039, + "grad_norm": 1.6978082656860352, + "learning_rate": 9.904500000000001e-05, + "loss": 0.512, + "step": 19819 + }, + { + "epoch": 1.109866726397133, + "grad_norm": 1.3439422845840454, + "learning_rate": 9.905000000000001e-05, + "loss": 0.473, + "step": 19820 + }, + { + "epoch": 1.109922723709262, + "grad_norm": 1.410443663597107, + "learning_rate": 9.9055e-05, + "loss": 0.401, + "step": 19821 + }, + { + "epoch": 1.109978721021391, + "grad_norm": 1.4224430322647095, + "learning_rate": 9.906e-05, + "loss": 0.4945, + "step": 19822 + }, + { + "epoch": 1.11003471833352, + "grad_norm": 1.2218562364578247, + "learning_rate": 9.9065e-05, + "loss": 0.4759, + "step": 19823 + }, + { + "epoch": 1.110090715645649, + "grad_norm": 1.3283394575119019, + "learning_rate": 9.907000000000001e-05, + "loss": 0.4862, + "step": 19824 + }, + { + "epoch": 1.110146712957778, + "grad_norm": 1.2634313106536865, + "learning_rate": 9.907500000000001e-05, + "loss": 0.4187, + "step": 19825 + }, + { + "epoch": 1.110202710269907, + "grad_norm": 1.288685917854309, + "learning_rate": 9.908000000000001e-05, + "loss": 0.3703, + "step": 19826 + }, + { + "epoch": 1.110258707582036, + "grad_norm": 1.2056894302368164, + "learning_rate": 9.9085e-05, + "loss": 0.4127, + "step": 19827 + }, + { + "epoch": 1.110314704894165, + "grad_norm": 1.2373862266540527, + "learning_rate": 9.909e-05, + "loss": 0.4087, + "step": 19828 + }, + { + "epoch": 1.110370702206294, + "grad_norm": 1.2632445096969604, + "learning_rate": 9.9095e-05, + "loss": 0.4093, + "step": 19829 + }, + { + "epoch": 1.110426699518423, + "grad_norm": 1.4656373262405396, + "learning_rate": 9.910000000000001e-05, + 
"loss": 0.5283, + "step": 19830 + }, + { + "epoch": 1.1104826968305521, + "grad_norm": 1.6199040412902832, + "learning_rate": 9.910500000000001e-05, + "loss": 0.4204, + "step": 19831 + }, + { + "epoch": 1.1105386941426811, + "grad_norm": 1.6431338787078857, + "learning_rate": 9.911e-05, + "loss": 0.4061, + "step": 19832 + }, + { + "epoch": 1.1105946914548102, + "grad_norm": 1.3754061460494995, + "learning_rate": 9.9115e-05, + "loss": 0.3841, + "step": 19833 + }, + { + "epoch": 1.1106506887669392, + "grad_norm": 1.3536350727081299, + "learning_rate": 9.912e-05, + "loss": 0.4043, + "step": 19834 + }, + { + "epoch": 1.1107066860790682, + "grad_norm": 1.239001750946045, + "learning_rate": 9.9125e-05, + "loss": 0.4993, + "step": 19835 + }, + { + "epoch": 1.1107626833911972, + "grad_norm": 1.9410868883132935, + "learning_rate": 9.913e-05, + "loss": 0.5781, + "step": 19836 + }, + { + "epoch": 1.1108186807033262, + "grad_norm": 1.3264325857162476, + "learning_rate": 9.9135e-05, + "loss": 0.3521, + "step": 19837 + }, + { + "epoch": 1.1108746780154553, + "grad_norm": 1.7010470628738403, + "learning_rate": 9.914e-05, + "loss": 0.4823, + "step": 19838 + }, + { + "epoch": 1.1109306753275843, + "grad_norm": 1.9255300760269165, + "learning_rate": 9.914500000000001e-05, + "loss": 0.5195, + "step": 19839 + }, + { + "epoch": 1.1109866726397133, + "grad_norm": 1.4976234436035156, + "learning_rate": 9.915000000000001e-05, + "loss": 0.4206, + "step": 19840 + }, + { + "epoch": 1.1110426699518423, + "grad_norm": 1.4784563779830933, + "learning_rate": 9.915500000000001e-05, + "loss": 0.3892, + "step": 19841 + }, + { + "epoch": 1.1110986672639713, + "grad_norm": 1.702567458152771, + "learning_rate": 9.916e-05, + "loss": 0.467, + "step": 19842 + }, + { + "epoch": 1.1111546645761003, + "grad_norm": 1.2280988693237305, + "learning_rate": 9.9165e-05, + "loss": 0.4062, + "step": 19843 + }, + { + "epoch": 1.1112106618882294, + "grad_norm": 1.4703607559204102, + "learning_rate": 9.917e-05, + 
"loss": 0.5394, + "step": 19844 + }, + { + "epoch": 1.1112666592003584, + "grad_norm": 1.282779335975647, + "learning_rate": 9.917500000000001e-05, + "loss": 0.3558, + "step": 19845 + }, + { + "epoch": 1.1113226565124874, + "grad_norm": 1.4535640478134155, + "learning_rate": 9.918000000000001e-05, + "loss": 0.4454, + "step": 19846 + }, + { + "epoch": 1.1113786538246164, + "grad_norm": 1.484246850013733, + "learning_rate": 9.9185e-05, + "loss": 0.4246, + "step": 19847 + }, + { + "epoch": 1.1114346511367454, + "grad_norm": 1.4134314060211182, + "learning_rate": 9.919e-05, + "loss": 0.3681, + "step": 19848 + }, + { + "epoch": 1.1114906484488745, + "grad_norm": 1.7610067129135132, + "learning_rate": 9.9195e-05, + "loss": 0.5749, + "step": 19849 + }, + { + "epoch": 1.1115466457610035, + "grad_norm": 1.642107605934143, + "learning_rate": 9.92e-05, + "loss": 0.48, + "step": 19850 + }, + { + "epoch": 1.1116026430731325, + "grad_norm": 1.2779661417007446, + "learning_rate": 9.920500000000001e-05, + "loss": 0.3769, + "step": 19851 + }, + { + "epoch": 1.1116586403852615, + "grad_norm": 1.3027851581573486, + "learning_rate": 9.921000000000001e-05, + "loss": 0.4104, + "step": 19852 + }, + { + "epoch": 1.1117146376973905, + "grad_norm": 1.3508186340332031, + "learning_rate": 9.9215e-05, + "loss": 0.4403, + "step": 19853 + }, + { + "epoch": 1.1117706350095196, + "grad_norm": 1.39118230342865, + "learning_rate": 9.922e-05, + "loss": 0.4971, + "step": 19854 + }, + { + "epoch": 1.1118266323216486, + "grad_norm": 1.4822081327438354, + "learning_rate": 9.9225e-05, + "loss": 0.4282, + "step": 19855 + }, + { + "epoch": 1.1118826296337776, + "grad_norm": 1.4425870180130005, + "learning_rate": 9.923e-05, + "loss": 0.3867, + "step": 19856 + }, + { + "epoch": 1.1119386269459066, + "grad_norm": 1.5058995485305786, + "learning_rate": 9.9235e-05, + "loss": 0.3943, + "step": 19857 + }, + { + "epoch": 1.1119946242580356, + "grad_norm": 1.1833412647247314, + "learning_rate": 9.924e-05, + "loss": 
0.384, + "step": 19858 + }, + { + "epoch": 1.1120506215701647, + "grad_norm": 1.3850737810134888, + "learning_rate": 9.924500000000002e-05, + "loss": 0.4203, + "step": 19859 + }, + { + "epoch": 1.1121066188822937, + "grad_norm": 1.5803650617599487, + "learning_rate": 9.925000000000001e-05, + "loss": 0.4912, + "step": 19860 + }, + { + "epoch": 1.1121626161944227, + "grad_norm": 1.3808683156967163, + "learning_rate": 9.925500000000001e-05, + "loss": 0.4308, + "step": 19861 + }, + { + "epoch": 1.1122186135065517, + "grad_norm": 1.4314091205596924, + "learning_rate": 9.926000000000001e-05, + "loss": 0.5991, + "step": 19862 + }, + { + "epoch": 1.1122746108186807, + "grad_norm": 1.2249228954315186, + "learning_rate": 9.9265e-05, + "loss": 0.5417, + "step": 19863 + }, + { + "epoch": 1.1123306081308098, + "grad_norm": 1.2787408828735352, + "learning_rate": 9.927e-05, + "loss": 0.3841, + "step": 19864 + }, + { + "epoch": 1.1123866054429388, + "grad_norm": 1.3893275260925293, + "learning_rate": 9.9275e-05, + "loss": 0.467, + "step": 19865 + }, + { + "epoch": 1.1124426027550678, + "grad_norm": 1.149018406867981, + "learning_rate": 9.928000000000001e-05, + "loss": 0.39, + "step": 19866 + }, + { + "epoch": 1.1124986000671968, + "grad_norm": 1.4450526237487793, + "learning_rate": 9.928500000000001e-05, + "loss": 0.371, + "step": 19867 + }, + { + "epoch": 1.1125545973793258, + "grad_norm": 1.4963886737823486, + "learning_rate": 9.929e-05, + "loss": 0.4029, + "step": 19868 + }, + { + "epoch": 1.1126105946914548, + "grad_norm": 1.2389373779296875, + "learning_rate": 9.9295e-05, + "loss": 0.4354, + "step": 19869 + }, + { + "epoch": 1.1126665920035839, + "grad_norm": 1.412307858467102, + "learning_rate": 9.93e-05, + "loss": 0.4996, + "step": 19870 + }, + { + "epoch": 1.1127225893157129, + "grad_norm": 1.8511191606521606, + "learning_rate": 9.9305e-05, + "loss": 0.4936, + "step": 19871 + }, + { + "epoch": 1.112778586627842, + "grad_norm": 1.1849406957626343, + "learning_rate": 
9.931000000000001e-05, + "loss": 0.3702, + "step": 19872 + }, + { + "epoch": 1.112834583939971, + "grad_norm": 1.2159291505813599, + "learning_rate": 9.931500000000001e-05, + "loss": 0.414, + "step": 19873 + }, + { + "epoch": 1.1128905812521, + "grad_norm": 1.3139448165893555, + "learning_rate": 9.932e-05, + "loss": 0.3703, + "step": 19874 + }, + { + "epoch": 1.112946578564229, + "grad_norm": 1.3350764513015747, + "learning_rate": 9.9325e-05, + "loss": 0.6695, + "step": 19875 + }, + { + "epoch": 1.113002575876358, + "grad_norm": 1.1906070709228516, + "learning_rate": 9.933e-05, + "loss": 0.4074, + "step": 19876 + }, + { + "epoch": 1.113058573188487, + "grad_norm": 1.3370988368988037, + "learning_rate": 9.9335e-05, + "loss": 0.553, + "step": 19877 + }, + { + "epoch": 1.113114570500616, + "grad_norm": 1.351703405380249, + "learning_rate": 9.934e-05, + "loss": 0.4098, + "step": 19878 + }, + { + "epoch": 1.113170567812745, + "grad_norm": 1.4156376123428345, + "learning_rate": 9.9345e-05, + "loss": 0.4852, + "step": 19879 + }, + { + "epoch": 1.113226565124874, + "grad_norm": 1.2463830709457397, + "learning_rate": 9.935000000000002e-05, + "loss": 0.3755, + "step": 19880 + }, + { + "epoch": 1.113282562437003, + "grad_norm": 1.4752771854400635, + "learning_rate": 9.935500000000001e-05, + "loss": 0.4217, + "step": 19881 + }, + { + "epoch": 1.113338559749132, + "grad_norm": 1.4596432447433472, + "learning_rate": 9.936000000000001e-05, + "loss": 0.4529, + "step": 19882 + }, + { + "epoch": 1.1133945570612611, + "grad_norm": 1.6576263904571533, + "learning_rate": 9.936500000000001e-05, + "loss": 0.5571, + "step": 19883 + }, + { + "epoch": 1.1134505543733901, + "grad_norm": 1.398348331451416, + "learning_rate": 9.937e-05, + "loss": 0.4008, + "step": 19884 + }, + { + "epoch": 1.1135065516855192, + "grad_norm": 1.4356671571731567, + "learning_rate": 9.9375e-05, + "loss": 0.4391, + "step": 19885 + }, + { + "epoch": 1.1135625489976482, + "grad_norm": 1.416871190071106, + 
"learning_rate": 9.938e-05, + "loss": 0.5502, + "step": 19886 + }, + { + "epoch": 1.1136185463097772, + "grad_norm": 1.1983762979507446, + "learning_rate": 9.938500000000001e-05, + "loss": 0.4317, + "step": 19887 + }, + { + "epoch": 1.1136745436219062, + "grad_norm": 1.3804419040679932, + "learning_rate": 9.939000000000001e-05, + "loss": 0.4029, + "step": 19888 + }, + { + "epoch": 1.1137305409340352, + "grad_norm": 1.318977952003479, + "learning_rate": 9.9395e-05, + "loss": 0.5044, + "step": 19889 + }, + { + "epoch": 1.1137865382461642, + "grad_norm": 1.2622560262680054, + "learning_rate": 9.94e-05, + "loss": 0.3935, + "step": 19890 + }, + { + "epoch": 1.1138425355582933, + "grad_norm": 1.322582483291626, + "learning_rate": 9.9405e-05, + "loss": 0.4329, + "step": 19891 + }, + { + "epoch": 1.1138985328704223, + "grad_norm": 1.4327465295791626, + "learning_rate": 9.941e-05, + "loss": 0.4451, + "step": 19892 + }, + { + "epoch": 1.1139545301825513, + "grad_norm": 1.2685604095458984, + "learning_rate": 9.9415e-05, + "loss": 0.3814, + "step": 19893 + }, + { + "epoch": 1.1140105274946803, + "grad_norm": 1.1945087909698486, + "learning_rate": 9.942000000000001e-05, + "loss": 0.4192, + "step": 19894 + }, + { + "epoch": 1.1140665248068093, + "grad_norm": 1.2283843755722046, + "learning_rate": 9.9425e-05, + "loss": 0.5873, + "step": 19895 + }, + { + "epoch": 1.1141225221189384, + "grad_norm": 1.350475788116455, + "learning_rate": 9.943e-05, + "loss": 0.3933, + "step": 19896 + }, + { + "epoch": 1.1141785194310674, + "grad_norm": 1.5221939086914062, + "learning_rate": 9.9435e-05, + "loss": 0.4215, + "step": 19897 + }, + { + "epoch": 1.1142345167431964, + "grad_norm": 1.5168911218643188, + "learning_rate": 9.944e-05, + "loss": 0.4496, + "step": 19898 + }, + { + "epoch": 1.1142905140553254, + "grad_norm": 1.6667182445526123, + "learning_rate": 9.9445e-05, + "loss": 0.4945, + "step": 19899 + }, + { + "epoch": 1.1143465113674544, + "grad_norm": 1.4806978702545166, + 
"learning_rate": 9.945e-05, + "loss": 0.3982, + "step": 19900 + }, + { + "epoch": 1.1144025086795835, + "grad_norm": 1.5881752967834473, + "learning_rate": 9.945500000000002e-05, + "loss": 0.4349, + "step": 19901 + }, + { + "epoch": 1.1144585059917125, + "grad_norm": 1.4468464851379395, + "learning_rate": 9.946000000000001e-05, + "loss": 0.5028, + "step": 19902 + }, + { + "epoch": 1.1145145033038415, + "grad_norm": 1.4290721416473389, + "learning_rate": 9.946500000000001e-05, + "loss": 0.5494, + "step": 19903 + }, + { + "epoch": 1.1145705006159705, + "grad_norm": 1.4594035148620605, + "learning_rate": 9.947000000000001e-05, + "loss": 0.4004, + "step": 19904 + }, + { + "epoch": 1.1146264979280995, + "grad_norm": 1.3635236024856567, + "learning_rate": 9.9475e-05, + "loss": 0.5512, + "step": 19905 + }, + { + "epoch": 1.1146824952402286, + "grad_norm": 1.4220784902572632, + "learning_rate": 9.948e-05, + "loss": 0.3574, + "step": 19906 + }, + { + "epoch": 1.1147384925523576, + "grad_norm": 1.4105898141860962, + "learning_rate": 9.9485e-05, + "loss": 0.5975, + "step": 19907 + }, + { + "epoch": 1.1147944898644866, + "grad_norm": 1.3315365314483643, + "learning_rate": 9.949000000000001e-05, + "loss": 0.4772, + "step": 19908 + }, + { + "epoch": 1.1148504871766156, + "grad_norm": 1.4663732051849365, + "learning_rate": 9.949500000000001e-05, + "loss": 0.4859, + "step": 19909 + }, + { + "epoch": 1.1149064844887446, + "grad_norm": 1.4048585891723633, + "learning_rate": 9.95e-05, + "loss": 0.4054, + "step": 19910 + }, + { + "epoch": 1.1149624818008737, + "grad_norm": 1.4173774719238281, + "learning_rate": 9.9505e-05, + "loss": 0.395, + "step": 19911 + }, + { + "epoch": 1.1150184791130027, + "grad_norm": 1.297888994216919, + "learning_rate": 9.951e-05, + "loss": 0.3928, + "step": 19912 + }, + { + "epoch": 1.1150744764251317, + "grad_norm": 1.144591212272644, + "learning_rate": 9.9515e-05, + "loss": 0.3601, + "step": 19913 + }, + { + "epoch": 1.1151304737372607, + "grad_norm": 
1.7306883335113525, + "learning_rate": 9.952e-05, + "loss": 0.4475, + "step": 19914 + }, + { + "epoch": 1.1151864710493897, + "grad_norm": 1.7416670322418213, + "learning_rate": 9.952500000000001e-05, + "loss": 0.5564, + "step": 19915 + }, + { + "epoch": 1.1152424683615187, + "grad_norm": 1.2478415966033936, + "learning_rate": 9.953e-05, + "loss": 0.5177, + "step": 19916 + }, + { + "epoch": 1.1152984656736478, + "grad_norm": 1.2330882549285889, + "learning_rate": 9.9535e-05, + "loss": 0.4486, + "step": 19917 + }, + { + "epoch": 1.1153544629857768, + "grad_norm": 1.2023470401763916, + "learning_rate": 9.954e-05, + "loss": 0.5095, + "step": 19918 + }, + { + "epoch": 1.1154104602979058, + "grad_norm": 1.6836299896240234, + "learning_rate": 9.9545e-05, + "loss": 0.5091, + "step": 19919 + }, + { + "epoch": 1.1154664576100348, + "grad_norm": 1.297433614730835, + "learning_rate": 9.955000000000001e-05, + "loss": 0.3423, + "step": 19920 + }, + { + "epoch": 1.1155224549221638, + "grad_norm": 1.5119662284851074, + "learning_rate": 9.9555e-05, + "loss": 0.4965, + "step": 19921 + }, + { + "epoch": 1.1155784522342929, + "grad_norm": 1.4331485033035278, + "learning_rate": 9.956e-05, + "loss": 0.4122, + "step": 19922 + }, + { + "epoch": 1.1156344495464219, + "grad_norm": 1.4502153396606445, + "learning_rate": 9.956500000000001e-05, + "loss": 0.4129, + "step": 19923 + }, + { + "epoch": 1.115690446858551, + "grad_norm": 1.3752232789993286, + "learning_rate": 9.957000000000001e-05, + "loss": 0.408, + "step": 19924 + }, + { + "epoch": 1.11574644417068, + "grad_norm": 1.5951974391937256, + "learning_rate": 9.957500000000001e-05, + "loss": 0.5723, + "step": 19925 + }, + { + "epoch": 1.115802441482809, + "grad_norm": 1.2176891565322876, + "learning_rate": 9.958e-05, + "loss": 0.4047, + "step": 19926 + }, + { + "epoch": 1.115858438794938, + "grad_norm": 1.3480055332183838, + "learning_rate": 9.9585e-05, + "loss": 0.4002, + "step": 19927 + }, + { + "epoch": 1.115914436107067, + 
"grad_norm": 1.6599169969558716, + "learning_rate": 9.959e-05, + "loss": 0.6165, + "step": 19928 + }, + { + "epoch": 1.115970433419196, + "grad_norm": 1.3681401014328003, + "learning_rate": 9.959500000000001e-05, + "loss": 0.7238, + "step": 19929 + }, + { + "epoch": 1.116026430731325, + "grad_norm": 1.1669130325317383, + "learning_rate": 9.960000000000001e-05, + "loss": 0.3322, + "step": 19930 + }, + { + "epoch": 1.1160824280434538, + "grad_norm": 1.4666856527328491, + "learning_rate": 9.9605e-05, + "loss": 0.5052, + "step": 19931 + }, + { + "epoch": 1.1161384253555828, + "grad_norm": 1.4829007387161255, + "learning_rate": 9.961e-05, + "loss": 0.3775, + "step": 19932 + }, + { + "epoch": 1.1161944226677118, + "grad_norm": 1.5757139921188354, + "learning_rate": 9.9615e-05, + "loss": 0.3391, + "step": 19933 + }, + { + "epoch": 1.1162504199798409, + "grad_norm": 1.6869962215423584, + "learning_rate": 9.962e-05, + "loss": 0.6753, + "step": 19934 + }, + { + "epoch": 1.1163064172919699, + "grad_norm": 1.349286675453186, + "learning_rate": 9.9625e-05, + "loss": 0.4492, + "step": 19935 + }, + { + "epoch": 1.116362414604099, + "grad_norm": 1.3248045444488525, + "learning_rate": 9.963e-05, + "loss": 0.3965, + "step": 19936 + }, + { + "epoch": 1.116418411916228, + "grad_norm": 1.3681490421295166, + "learning_rate": 9.9635e-05, + "loss": 0.4602, + "step": 19937 + }, + { + "epoch": 1.116474409228357, + "grad_norm": 1.4884835481643677, + "learning_rate": 9.964e-05, + "loss": 0.4888, + "step": 19938 + }, + { + "epoch": 1.116530406540486, + "grad_norm": 1.4044023752212524, + "learning_rate": 9.9645e-05, + "loss": 0.6349, + "step": 19939 + }, + { + "epoch": 1.116586403852615, + "grad_norm": 1.4194635152816772, + "learning_rate": 9.965000000000001e-05, + "loss": 0.5979, + "step": 19940 + }, + { + "epoch": 1.116642401164744, + "grad_norm": 1.5223853588104248, + "learning_rate": 9.965500000000001e-05, + "loss": 0.4812, + "step": 19941 + }, + { + "epoch": 1.116698398476873, + 
"grad_norm": 1.4153226613998413, + "learning_rate": 9.966e-05, + "loss": 0.4602, + "step": 19942 + }, + { + "epoch": 1.116754395789002, + "grad_norm": 1.2923203706741333, + "learning_rate": 9.9665e-05, + "loss": 0.4366, + "step": 19943 + }, + { + "epoch": 1.116810393101131, + "grad_norm": 1.382320523262024, + "learning_rate": 9.967000000000001e-05, + "loss": 0.4938, + "step": 19944 + }, + { + "epoch": 1.11686639041326, + "grad_norm": 1.4697731733322144, + "learning_rate": 9.967500000000001e-05, + "loss": 0.4776, + "step": 19945 + }, + { + "epoch": 1.116922387725389, + "grad_norm": 1.3057854175567627, + "learning_rate": 9.968000000000001e-05, + "loss": 0.489, + "step": 19946 + }, + { + "epoch": 1.1169783850375181, + "grad_norm": 1.5513603687286377, + "learning_rate": 9.9685e-05, + "loss": 0.4308, + "step": 19947 + }, + { + "epoch": 1.1170343823496471, + "grad_norm": 1.413629174232483, + "learning_rate": 9.969e-05, + "loss": 0.413, + "step": 19948 + }, + { + "epoch": 1.1170903796617762, + "grad_norm": 1.5906438827514648, + "learning_rate": 9.9695e-05, + "loss": 0.3332, + "step": 19949 + }, + { + "epoch": 1.1171463769739052, + "grad_norm": 1.3415582180023193, + "learning_rate": 9.970000000000001e-05, + "loss": 0.5089, + "step": 19950 + }, + { + "epoch": 1.1172023742860342, + "grad_norm": 1.3672477006912231, + "learning_rate": 9.970500000000001e-05, + "loss": 0.4277, + "step": 19951 + }, + { + "epoch": 1.1172583715981632, + "grad_norm": 1.3722107410430908, + "learning_rate": 9.971e-05, + "loss": 0.4536, + "step": 19952 + }, + { + "epoch": 1.1173143689102922, + "grad_norm": 1.2795250415802002, + "learning_rate": 9.9715e-05, + "loss": 0.4078, + "step": 19953 + }, + { + "epoch": 1.1173703662224213, + "grad_norm": 1.5060938596725464, + "learning_rate": 9.972e-05, + "loss": 0.4411, + "step": 19954 + }, + { + "epoch": 1.1174263635345503, + "grad_norm": 1.2826478481292725, + "learning_rate": 9.9725e-05, + "loss": 0.4602, + "step": 19955 + }, + { + "epoch": 1.1174823608466793, 
+ "grad_norm": 1.434885859489441, + "learning_rate": 9.973e-05, + "loss": 0.453, + "step": 19956 + }, + { + "epoch": 1.1175383581588083, + "grad_norm": 1.210551381111145, + "learning_rate": 9.9735e-05, + "loss": 0.4582, + "step": 19957 + }, + { + "epoch": 1.1175943554709373, + "grad_norm": 1.3893661499023438, + "learning_rate": 9.974e-05, + "loss": 0.4822, + "step": 19958 + }, + { + "epoch": 1.1176503527830663, + "grad_norm": 1.260366678237915, + "learning_rate": 9.9745e-05, + "loss": 0.3964, + "step": 19959 + }, + { + "epoch": 1.1177063500951954, + "grad_norm": 1.0513705015182495, + "learning_rate": 9.975000000000001e-05, + "loss": 0.3309, + "step": 19960 + }, + { + "epoch": 1.1177623474073244, + "grad_norm": 4.4490790367126465, + "learning_rate": 9.975500000000001e-05, + "loss": 0.5086, + "step": 19961 + }, + { + "epoch": 1.1178183447194534, + "grad_norm": 1.1814807653427124, + "learning_rate": 9.976000000000001e-05, + "loss": 0.4661, + "step": 19962 + }, + { + "epoch": 1.1178743420315824, + "grad_norm": 1.4177594184875488, + "learning_rate": 9.9765e-05, + "loss": 0.454, + "step": 19963 + }, + { + "epoch": 1.1179303393437114, + "grad_norm": 1.2462059259414673, + "learning_rate": 9.977e-05, + "loss": 0.5212, + "step": 19964 + }, + { + "epoch": 1.1179863366558405, + "grad_norm": 1.2088459730148315, + "learning_rate": 9.977500000000001e-05, + "loss": 0.3805, + "step": 19965 + }, + { + "epoch": 1.1180423339679695, + "grad_norm": 1.2562636137008667, + "learning_rate": 9.978000000000001e-05, + "loss": 0.4543, + "step": 19966 + }, + { + "epoch": 1.1180983312800985, + "grad_norm": 1.3372293710708618, + "learning_rate": 9.978500000000001e-05, + "loss": 0.639, + "step": 19967 + }, + { + "epoch": 1.1181543285922275, + "grad_norm": 1.5341663360595703, + "learning_rate": 9.979e-05, + "loss": 0.4539, + "step": 19968 + }, + { + "epoch": 1.1182103259043565, + "grad_norm": 1.4747451543807983, + "learning_rate": 9.9795e-05, + "loss": 0.4447, + "step": 19969 + }, + { + "epoch": 
1.1182663232164856, + "grad_norm": 1.4931836128234863, + "learning_rate": 9.98e-05, + "loss": 0.4791, + "step": 19970 + }, + { + "epoch": 1.1183223205286146, + "grad_norm": 1.6403145790100098, + "learning_rate": 9.9805e-05, + "loss": 0.4906, + "step": 19971 + }, + { + "epoch": 1.1183783178407436, + "grad_norm": 1.296626091003418, + "learning_rate": 9.981000000000001e-05, + "loss": 0.4241, + "step": 19972 + }, + { + "epoch": 1.1184343151528726, + "grad_norm": 1.3703420162200928, + "learning_rate": 9.9815e-05, + "loss": 0.3721, + "step": 19973 + }, + { + "epoch": 1.1184903124650016, + "grad_norm": 1.6404469013214111, + "learning_rate": 9.982e-05, + "loss": 0.4843, + "step": 19974 + }, + { + "epoch": 1.1185463097771307, + "grad_norm": 1.3929030895233154, + "learning_rate": 9.9825e-05, + "loss": 0.518, + "step": 19975 + }, + { + "epoch": 1.1186023070892597, + "grad_norm": 1.3429993391036987, + "learning_rate": 9.983e-05, + "loss": 0.3587, + "step": 19976 + }, + { + "epoch": 1.1186583044013887, + "grad_norm": 1.367019534111023, + "learning_rate": 9.9835e-05, + "loss": 0.6241, + "step": 19977 + }, + { + "epoch": 1.1187143017135177, + "grad_norm": 1.4297236204147339, + "learning_rate": 9.984e-05, + "loss": 0.557, + "step": 19978 + }, + { + "epoch": 1.1187702990256467, + "grad_norm": 1.4048442840576172, + "learning_rate": 9.9845e-05, + "loss": 0.4619, + "step": 19979 + }, + { + "epoch": 1.1188262963377757, + "grad_norm": 1.6790038347244263, + "learning_rate": 9.985000000000001e-05, + "loss": 0.5104, + "step": 19980 + }, + { + "epoch": 1.1188822936499048, + "grad_norm": 1.2666529417037964, + "learning_rate": 9.985500000000001e-05, + "loss": 0.4194, + "step": 19981 + }, + { + "epoch": 1.1189382909620338, + "grad_norm": 1.4653370380401611, + "learning_rate": 9.986000000000001e-05, + "loss": 0.3837, + "step": 19982 + }, + { + "epoch": 1.1189942882741628, + "grad_norm": 1.3848682641983032, + "learning_rate": 9.986500000000001e-05, + "loss": 0.5892, + "step": 19983 + }, + { + 
"epoch": 1.1190502855862918, + "grad_norm": 1.2159497737884521, + "learning_rate": 9.987e-05, + "loss": 0.4306, + "step": 19984 + }, + { + "epoch": 1.1191062828984208, + "grad_norm": 1.0413435697555542, + "learning_rate": 9.9875e-05, + "loss": 0.3403, + "step": 19985 + }, + { + "epoch": 1.1191622802105499, + "grad_norm": 1.1079907417297363, + "learning_rate": 9.988000000000001e-05, + "loss": 0.3271, + "step": 19986 + }, + { + "epoch": 1.1192182775226789, + "grad_norm": 1.1294519901275635, + "learning_rate": 9.988500000000001e-05, + "loss": 0.3808, + "step": 19987 + }, + { + "epoch": 1.119274274834808, + "grad_norm": 1.372883677482605, + "learning_rate": 9.989000000000001e-05, + "loss": 0.4385, + "step": 19988 + }, + { + "epoch": 1.119330272146937, + "grad_norm": 1.23002028465271, + "learning_rate": 9.9895e-05, + "loss": 0.4712, + "step": 19989 + }, + { + "epoch": 1.119386269459066, + "grad_norm": 1.3323321342468262, + "learning_rate": 9.99e-05, + "loss": 0.3743, + "step": 19990 + }, + { + "epoch": 1.119442266771195, + "grad_norm": 1.47197425365448, + "learning_rate": 9.9905e-05, + "loss": 0.4424, + "step": 19991 + }, + { + "epoch": 1.119498264083324, + "grad_norm": 1.7073791027069092, + "learning_rate": 9.991e-05, + "loss": 0.4088, + "step": 19992 + }, + { + "epoch": 1.119554261395453, + "grad_norm": 1.2094861268997192, + "learning_rate": 9.991500000000001e-05, + "loss": 0.3556, + "step": 19993 + }, + { + "epoch": 1.119610258707582, + "grad_norm": 1.3336609601974487, + "learning_rate": 9.992e-05, + "loss": 0.4196, + "step": 19994 + }, + { + "epoch": 1.119666256019711, + "grad_norm": 1.3385021686553955, + "learning_rate": 9.9925e-05, + "loss": 0.4877, + "step": 19995 + }, + { + "epoch": 1.11972225333184, + "grad_norm": 1.3331232070922852, + "learning_rate": 9.993e-05, + "loss": 0.429, + "step": 19996 + }, + { + "epoch": 1.119778250643969, + "grad_norm": 1.5061585903167725, + "learning_rate": 9.9935e-05, + "loss": 0.4839, + "step": 19997 + }, + { + "epoch": 
1.119834247956098, + "grad_norm": 1.2313766479492188, + "learning_rate": 9.994e-05, + "loss": 0.4743, + "step": 19998 + }, + { + "epoch": 1.1198902452682271, + "grad_norm": 1.3281816244125366, + "learning_rate": 9.9945e-05, + "loss": 0.4664, + "step": 19999 + }, + { + "epoch": 1.1199462425803561, + "grad_norm": 1.4939744472503662, + "learning_rate": 9.995e-05, + "loss": 0.6259, + "step": 20000 + }, + { + "epoch": 1.1200022398924852, + "grad_norm": 1.670356035232544, + "learning_rate": 9.995500000000001e-05, + "loss": 0.3992, + "step": 20001 + }, + { + "epoch": 1.1200582372046142, + "grad_norm": 1.2830830812454224, + "learning_rate": 9.996000000000001e-05, + "loss": 0.4945, + "step": 20002 + }, + { + "epoch": 1.1201142345167432, + "grad_norm": 1.5486961603164673, + "learning_rate": 9.996500000000001e-05, + "loss": 0.3299, + "step": 20003 + }, + { + "epoch": 1.1201702318288722, + "grad_norm": 1.5304434299468994, + "learning_rate": 9.997e-05, + "loss": 0.5126, + "step": 20004 + }, + { + "epoch": 1.1202262291410012, + "grad_norm": 1.4435508251190186, + "learning_rate": 9.9975e-05, + "loss": 0.4213, + "step": 20005 + }, + { + "epoch": 1.1202822264531302, + "grad_norm": 1.157757043838501, + "learning_rate": 9.998e-05, + "loss": 0.34, + "step": 20006 + }, + { + "epoch": 1.1203382237652593, + "grad_norm": 1.3835796117782593, + "learning_rate": 9.998500000000001e-05, + "loss": 0.5336, + "step": 20007 + }, + { + "epoch": 1.1203942210773883, + "grad_norm": 1.747640609741211, + "learning_rate": 9.999000000000001e-05, + "loss": 0.4485, + "step": 20008 + }, + { + "epoch": 1.1204502183895173, + "grad_norm": 1.5142403841018677, + "learning_rate": 9.999500000000001e-05, + "loss": 0.6584, + "step": 20009 + }, + { + "epoch": 1.1205062157016463, + "grad_norm": 1.684420108795166, + "learning_rate": 0.0001, + "loss": 0.5178, + "step": 20010 + }, + { + "epoch": 1.1205622130137753, + "grad_norm": 1.2592800855636597, + "learning_rate": 9.999973684210526e-05, + "loss": 0.4609, + "step": 
20011 + }, + { + "epoch": 1.1206182103259044, + "grad_norm": 1.562971830368042, + "learning_rate": 9.999947368421054e-05, + "loss": 0.5383, + "step": 20012 + }, + { + "epoch": 1.1206742076380334, + "grad_norm": 1.318636417388916, + "learning_rate": 9.999921052631578e-05, + "loss": 0.3895, + "step": 20013 + }, + { + "epoch": 1.1207302049501624, + "grad_norm": 1.1930203437805176, + "learning_rate": 9.999894736842106e-05, + "loss": 0.3013, + "step": 20014 + }, + { + "epoch": 1.1207862022622914, + "grad_norm": 1.4527639150619507, + "learning_rate": 9.999868421052632e-05, + "loss": 0.3946, + "step": 20015 + }, + { + "epoch": 1.1208421995744204, + "grad_norm": 1.5577555894851685, + "learning_rate": 9.999842105263159e-05, + "loss": 0.5031, + "step": 20016 + }, + { + "epoch": 1.1208981968865495, + "grad_norm": 1.3654688596725464, + "learning_rate": 9.999815789473685e-05, + "loss": 0.4687, + "step": 20017 + }, + { + "epoch": 1.1209541941986785, + "grad_norm": 1.3246850967407227, + "learning_rate": 9.999789473684211e-05, + "loss": 0.4485, + "step": 20018 + }, + { + "epoch": 1.1210101915108075, + "grad_norm": 1.4335442781448364, + "learning_rate": 9.999763157894737e-05, + "loss": 0.4369, + "step": 20019 + }, + { + "epoch": 1.1210661888229365, + "grad_norm": 1.2487568855285645, + "learning_rate": 9.999736842105264e-05, + "loss": 0.4527, + "step": 20020 + }, + { + "epoch": 1.1211221861350655, + "grad_norm": 1.457713007926941, + "learning_rate": 9.99971052631579e-05, + "loss": 0.4996, + "step": 20021 + }, + { + "epoch": 1.1211781834471946, + "grad_norm": 1.7885032892227173, + "learning_rate": 9.999684210526316e-05, + "loss": 0.456, + "step": 20022 + }, + { + "epoch": 1.1212341807593236, + "grad_norm": 1.3303115367889404, + "learning_rate": 9.999657894736842e-05, + "loss": 0.3477, + "step": 20023 + }, + { + "epoch": 1.1212901780714526, + "grad_norm": 1.3550374507904053, + "learning_rate": 9.99963157894737e-05, + "loss": 0.3813, + "step": 20024 + }, + { + "epoch": 
1.1213461753835816, + "grad_norm": 1.4085867404937744, + "learning_rate": 9.999605263157895e-05, + "loss": 0.4964, + "step": 20025 + }, + { + "epoch": 1.1214021726957106, + "grad_norm": 1.7398312091827393, + "learning_rate": 9.999578947368421e-05, + "loss": 0.4673, + "step": 20026 + }, + { + "epoch": 1.1214581700078396, + "grad_norm": 1.3218722343444824, + "learning_rate": 9.999552631578947e-05, + "loss": 0.3901, + "step": 20027 + }, + { + "epoch": 1.1215141673199687, + "grad_norm": 1.3908706903457642, + "learning_rate": 9.999526315789473e-05, + "loss": 0.5142, + "step": 20028 + }, + { + "epoch": 1.1215701646320977, + "grad_norm": 1.4564788341522217, + "learning_rate": 9.999500000000001e-05, + "loss": 0.4245, + "step": 20029 + }, + { + "epoch": 1.1216261619442267, + "grad_norm": 1.5256010293960571, + "learning_rate": 9.999473684210527e-05, + "loss": 0.3972, + "step": 20030 + }, + { + "epoch": 1.1216821592563557, + "grad_norm": 1.153856873512268, + "learning_rate": 9.999447368421053e-05, + "loss": 0.3373, + "step": 20031 + }, + { + "epoch": 1.1217381565684847, + "grad_norm": 1.2020797729492188, + "learning_rate": 9.999421052631579e-05, + "loss": 0.3982, + "step": 20032 + }, + { + "epoch": 1.1217941538806138, + "grad_norm": 1.2495990991592407, + "learning_rate": 9.999394736842106e-05, + "loss": 0.2986, + "step": 20033 + }, + { + "epoch": 1.1218501511927428, + "grad_norm": 1.400131106376648, + "learning_rate": 9.999368421052632e-05, + "loss": 0.3613, + "step": 20034 + }, + { + "epoch": 1.1219061485048718, + "grad_norm": 1.214959979057312, + "learning_rate": 9.999342105263159e-05, + "loss": 0.44, + "step": 20035 + }, + { + "epoch": 1.1219621458170008, + "grad_norm": 1.2081224918365479, + "learning_rate": 9.999315789473684e-05, + "loss": 0.3817, + "step": 20036 + }, + { + "epoch": 1.1220181431291298, + "grad_norm": 1.3629376888275146, + "learning_rate": 9.999289473684211e-05, + "loss": 0.5077, + "step": 20037 + }, + { + "epoch": 1.1220741404412589, + "grad_norm": 
1.4008897542953491, + "learning_rate": 9.999263157894737e-05, + "loss": 0.47, + "step": 20038 + }, + { + "epoch": 1.1221301377533879, + "grad_norm": 1.4684243202209473, + "learning_rate": 9.999236842105265e-05, + "loss": 0.3975, + "step": 20039 + }, + { + "epoch": 1.122186135065517, + "grad_norm": 1.8027886152267456, + "learning_rate": 9.999210526315789e-05, + "loss": 0.4434, + "step": 20040 + }, + { + "epoch": 1.122242132377646, + "grad_norm": 1.8867990970611572, + "learning_rate": 9.999184210526316e-05, + "loss": 0.362, + "step": 20041 + }, + { + "epoch": 1.122298129689775, + "grad_norm": 1.4636904001235962, + "learning_rate": 9.999157894736842e-05, + "loss": 0.4318, + "step": 20042 + }, + { + "epoch": 1.122354127001904, + "grad_norm": 1.3516290187835693, + "learning_rate": 9.99913157894737e-05, + "loss": 0.5048, + "step": 20043 + }, + { + "epoch": 1.122410124314033, + "grad_norm": 1.6282988786697388, + "learning_rate": 9.999105263157896e-05, + "loss": 0.643, + "step": 20044 + }, + { + "epoch": 1.122466121626162, + "grad_norm": 1.0735915899276733, + "learning_rate": 9.99907894736842e-05, + "loss": 0.3689, + "step": 20045 + }, + { + "epoch": 1.122522118938291, + "grad_norm": 1.4593976736068726, + "learning_rate": 9.999052631578948e-05, + "loss": 0.3651, + "step": 20046 + }, + { + "epoch": 1.12257811625042, + "grad_norm": 1.146026849746704, + "learning_rate": 9.999026315789474e-05, + "loss": 0.3888, + "step": 20047 + }, + { + "epoch": 1.122634113562549, + "grad_norm": 1.2889912128448486, + "learning_rate": 9.999000000000001e-05, + "loss": 0.3881, + "step": 20048 + }, + { + "epoch": 1.122690110874678, + "grad_norm": 1.4708292484283447, + "learning_rate": 9.998973684210527e-05, + "loss": 0.4643, + "step": 20049 + }, + { + "epoch": 1.122746108186807, + "grad_norm": 1.4523096084594727, + "learning_rate": 9.998947368421053e-05, + "loss": 0.5212, + "step": 20050 + }, + { + "epoch": 1.122802105498936, + "grad_norm": 1.3572546243667603, + "learning_rate": 
9.998921052631579e-05, + "loss": 0.485, + "step": 20051 + }, + { + "epoch": 1.1228581028110651, + "grad_norm": 1.9971601963043213, + "learning_rate": 9.998894736842106e-05, + "loss": 0.3975, + "step": 20052 + }, + { + "epoch": 1.1229141001231941, + "grad_norm": 1.4978028535842896, + "learning_rate": 9.998868421052632e-05, + "loss": 0.4827, + "step": 20053 + }, + { + "epoch": 1.1229700974353232, + "grad_norm": 1.2745866775512695, + "learning_rate": 9.998842105263158e-05, + "loss": 0.3795, + "step": 20054 + }, + { + "epoch": 1.1230260947474522, + "grad_norm": 1.484288215637207, + "learning_rate": 9.998815789473684e-05, + "loss": 0.3018, + "step": 20055 + }, + { + "epoch": 1.1230820920595812, + "grad_norm": 1.2436482906341553, + "learning_rate": 9.998789473684211e-05, + "loss": 0.3835, + "step": 20056 + }, + { + "epoch": 1.1231380893717102, + "grad_norm": 1.1885489225387573, + "learning_rate": 9.998763157894737e-05, + "loss": 0.3402, + "step": 20057 + }, + { + "epoch": 1.1231940866838392, + "grad_norm": 1.3961933851242065, + "learning_rate": 9.998736842105263e-05, + "loss": 0.5356, + "step": 20058 + }, + { + "epoch": 1.1232500839959683, + "grad_norm": 1.2171860933303833, + "learning_rate": 9.99871052631579e-05, + "loss": 0.4313, + "step": 20059 + }, + { + "epoch": 1.1233060813080973, + "grad_norm": 1.3281238079071045, + "learning_rate": 9.998684210526317e-05, + "loss": 0.5049, + "step": 20060 + }, + { + "epoch": 1.1233620786202263, + "grad_norm": 1.1421741247177124, + "learning_rate": 9.998657894736843e-05, + "loss": 0.3011, + "step": 20061 + }, + { + "epoch": 1.1234180759323553, + "grad_norm": 1.3167433738708496, + "learning_rate": 9.998631578947369e-05, + "loss": 0.6157, + "step": 20062 + }, + { + "epoch": 1.1234740732444843, + "grad_norm": 1.498759150505066, + "learning_rate": 9.998605263157895e-05, + "loss": 0.3853, + "step": 20063 + }, + { + "epoch": 1.1235300705566134, + "grad_norm": 1.7297914028167725, + "learning_rate": 9.99857894736842e-05, + "loss": 0.4637, 
+ "step": 20064 + }, + { + "epoch": 1.1235860678687424, + "grad_norm": 1.525895357131958, + "learning_rate": 9.998552631578948e-05, + "loss": 0.5171, + "step": 20065 + }, + { + "epoch": 1.1236420651808714, + "grad_norm": 1.3542238473892212, + "learning_rate": 9.998526315789474e-05, + "loss": 0.3932, + "step": 20066 + }, + { + "epoch": 1.1236980624930004, + "grad_norm": 1.4197075366973877, + "learning_rate": 9.998500000000001e-05, + "loss": 0.406, + "step": 20067 + }, + { + "epoch": 1.1237540598051294, + "grad_norm": 1.2412537336349487, + "learning_rate": 9.998473684210526e-05, + "loss": 0.3571, + "step": 20068 + }, + { + "epoch": 1.1238100571172585, + "grad_norm": 1.256371021270752, + "learning_rate": 9.998447368421053e-05, + "loss": 0.3646, + "step": 20069 + }, + { + "epoch": 1.1238660544293875, + "grad_norm": 1.223979115486145, + "learning_rate": 9.998421052631579e-05, + "loss": 0.4346, + "step": 20070 + }, + { + "epoch": 1.1239220517415165, + "grad_norm": 1.237862229347229, + "learning_rate": 9.998394736842107e-05, + "loss": 0.4622, + "step": 20071 + }, + { + "epoch": 1.1239780490536455, + "grad_norm": 1.4763144254684448, + "learning_rate": 9.998368421052632e-05, + "loss": 0.4447, + "step": 20072 + }, + { + "epoch": 1.1240340463657745, + "grad_norm": 1.2453192472457886, + "learning_rate": 9.998342105263158e-05, + "loss": 0.4321, + "step": 20073 + }, + { + "epoch": 1.1240900436779035, + "grad_norm": 1.6521111726760864, + "learning_rate": 9.998315789473684e-05, + "loss": 0.4248, + "step": 20074 + }, + { + "epoch": 1.1241460409900326, + "grad_norm": 1.4103846549987793, + "learning_rate": 9.998289473684212e-05, + "loss": 0.5108, + "step": 20075 + }, + { + "epoch": 1.1242020383021616, + "grad_norm": 1.3992470502853394, + "learning_rate": 9.998263157894738e-05, + "loss": 0.432, + "step": 20076 + }, + { + "epoch": 1.1242580356142906, + "grad_norm": 1.2738618850708008, + "learning_rate": 9.998236842105264e-05, + "loss": 0.4344, + "step": 20077 + }, + { + "epoch": 
1.1243140329264196, + "grad_norm": 1.4086074829101562, + "learning_rate": 9.99821052631579e-05, + "loss": 0.4177, + "step": 20078 + }, + { + "epoch": 1.1243700302385486, + "grad_norm": 1.2592560052871704, + "learning_rate": 9.998184210526317e-05, + "loss": 0.3631, + "step": 20079 + }, + { + "epoch": 1.1244260275506777, + "grad_norm": 1.4146602153778076, + "learning_rate": 9.998157894736843e-05, + "loss": 0.4133, + "step": 20080 + }, + { + "epoch": 1.1244820248628067, + "grad_norm": 1.422480821609497, + "learning_rate": 9.998131578947369e-05, + "loss": 0.4386, + "step": 20081 + }, + { + "epoch": 1.1245380221749357, + "grad_norm": 1.299598217010498, + "learning_rate": 9.998105263157895e-05, + "loss": 0.4574, + "step": 20082 + }, + { + "epoch": 1.1245940194870647, + "grad_norm": 1.1920071840286255, + "learning_rate": 9.998078947368421e-05, + "loss": 0.4934, + "step": 20083 + }, + { + "epoch": 1.1246500167991937, + "grad_norm": 1.2635859251022339, + "learning_rate": 9.998052631578948e-05, + "loss": 0.4196, + "step": 20084 + }, + { + "epoch": 1.1247060141113228, + "grad_norm": 1.3033627271652222, + "learning_rate": 9.998026315789474e-05, + "loss": 0.3937, + "step": 20085 + }, + { + "epoch": 1.1247620114234516, + "grad_norm": 1.5347754955291748, + "learning_rate": 9.998e-05, + "loss": 0.5463, + "step": 20086 + }, + { + "epoch": 1.1248180087355806, + "grad_norm": 1.1636805534362793, + "learning_rate": 9.997973684210526e-05, + "loss": 0.3372, + "step": 20087 + }, + { + "epoch": 1.1248740060477096, + "grad_norm": 1.404335379600525, + "learning_rate": 9.997947368421053e-05, + "loss": 0.3842, + "step": 20088 + }, + { + "epoch": 1.1249300033598386, + "grad_norm": 1.210408329963684, + "learning_rate": 9.99792105263158e-05, + "loss": 0.4594, + "step": 20089 + }, + { + "epoch": 1.1249860006719676, + "grad_norm": 1.3475546836853027, + "learning_rate": 9.997894736842107e-05, + "loss": 0.4462, + "step": 20090 + }, + { + "epoch": 1.1250419979840967, + "grad_norm": 1.3855321407318115, 
+ "learning_rate": 9.997868421052631e-05, + "loss": 0.4473, + "step": 20091 + }, + { + "epoch": 1.1250979952962257, + "grad_norm": 1.3210816383361816, + "learning_rate": 9.997842105263159e-05, + "loss": 0.4109, + "step": 20092 + }, + { + "epoch": 1.1251539926083547, + "grad_norm": 1.2670806646347046, + "learning_rate": 9.997815789473685e-05, + "loss": 0.3964, + "step": 20093 + }, + { + "epoch": 1.1252099899204837, + "grad_norm": 1.292494297027588, + "learning_rate": 9.997789473684212e-05, + "loss": 0.3616, + "step": 20094 + }, + { + "epoch": 1.1252659872326127, + "grad_norm": 1.4878965616226196, + "learning_rate": 9.997763157894737e-05, + "loss": 0.4488, + "step": 20095 + }, + { + "epoch": 1.1253219845447417, + "grad_norm": 1.2568440437316895, + "learning_rate": 9.997736842105264e-05, + "loss": 0.425, + "step": 20096 + }, + { + "epoch": 1.1253779818568708, + "grad_norm": 1.6238255500793457, + "learning_rate": 9.99771052631579e-05, + "loss": 0.4368, + "step": 20097 + }, + { + "epoch": 1.1254339791689998, + "grad_norm": 1.3254094123840332, + "learning_rate": 9.997684210526316e-05, + "loss": 0.3848, + "step": 20098 + }, + { + "epoch": 1.1254899764811288, + "grad_norm": 1.2998703718185425, + "learning_rate": 9.997657894736843e-05, + "loss": 0.3981, + "step": 20099 + }, + { + "epoch": 1.1255459737932578, + "grad_norm": 1.4188164472579956, + "learning_rate": 9.997631578947368e-05, + "loss": 0.569, + "step": 20100 + }, + { + "epoch": 1.1256019711053868, + "grad_norm": 1.50592839717865, + "learning_rate": 9.997605263157895e-05, + "loss": 0.665, + "step": 20101 + }, + { + "epoch": 1.1256579684175159, + "grad_norm": 1.3922353982925415, + "learning_rate": 9.997578947368421e-05, + "loss": 0.5353, + "step": 20102 + }, + { + "epoch": 1.1257139657296449, + "grad_norm": 1.3044599294662476, + "learning_rate": 9.997552631578948e-05, + "loss": 0.439, + "step": 20103 + }, + { + "epoch": 1.125769963041774, + "grad_norm": 1.473437786102295, + "learning_rate": 9.997526315789474e-05, + 
"loss": 0.5626, + "step": 20104 + }, + { + "epoch": 1.125825960353903, + "grad_norm": 1.2697563171386719, + "learning_rate": 9.9975e-05, + "loss": 0.501, + "step": 20105 + }, + { + "epoch": 1.125881957666032, + "grad_norm": 1.4197789430618286, + "learning_rate": 9.997473684210526e-05, + "loss": 0.5738, + "step": 20106 + }, + { + "epoch": 1.125937954978161, + "grad_norm": 1.2517338991165161, + "learning_rate": 9.997447368421054e-05, + "loss": 0.434, + "step": 20107 + }, + { + "epoch": 1.12599395229029, + "grad_norm": 1.3196479082107544, + "learning_rate": 9.99742105263158e-05, + "loss": 0.5725, + "step": 20108 + }, + { + "epoch": 1.126049949602419, + "grad_norm": 1.2089532613754272, + "learning_rate": 9.997394736842106e-05, + "loss": 0.4245, + "step": 20109 + }, + { + "epoch": 1.126105946914548, + "grad_norm": 1.2290751934051514, + "learning_rate": 9.997368421052632e-05, + "loss": 0.5232, + "step": 20110 + }, + { + "epoch": 1.126161944226677, + "grad_norm": 1.5362193584442139, + "learning_rate": 9.997342105263159e-05, + "loss": 0.5124, + "step": 20111 + }, + { + "epoch": 1.126217941538806, + "grad_norm": 1.1183627843856812, + "learning_rate": 9.997315789473685e-05, + "loss": 0.3339, + "step": 20112 + }, + { + "epoch": 1.126273938850935, + "grad_norm": 1.3534480333328247, + "learning_rate": 9.997289473684211e-05, + "loss": 0.37, + "step": 20113 + }, + { + "epoch": 1.126329936163064, + "grad_norm": 1.407397747039795, + "learning_rate": 9.997263157894737e-05, + "loss": 0.3819, + "step": 20114 + }, + { + "epoch": 1.1263859334751931, + "grad_norm": 1.2968662977218628, + "learning_rate": 9.997236842105263e-05, + "loss": 0.4112, + "step": 20115 + }, + { + "epoch": 1.1264419307873221, + "grad_norm": 1.645027756690979, + "learning_rate": 9.99721052631579e-05, + "loss": 0.5773, + "step": 20116 + }, + { + "epoch": 1.1264979280994512, + "grad_norm": 1.0771293640136719, + "learning_rate": 9.997184210526316e-05, + "loss": 0.3627, + "step": 20117 + }, + { + "epoch": 
1.1265539254115802, + "grad_norm": 1.7107199430465698, + "learning_rate": 9.997157894736842e-05, + "loss": 0.6184, + "step": 20118 + }, + { + "epoch": 1.1266099227237092, + "grad_norm": 1.67527174949646, + "learning_rate": 9.997131578947368e-05, + "loss": 0.6594, + "step": 20119 + }, + { + "epoch": 1.1266659200358382, + "grad_norm": 1.3696786165237427, + "learning_rate": 9.997105263157895e-05, + "loss": 0.3945, + "step": 20120 + }, + { + "epoch": 1.1267219173479672, + "grad_norm": 1.4254775047302246, + "learning_rate": 9.997078947368421e-05, + "loss": 0.5711, + "step": 20121 + }, + { + "epoch": 1.1267779146600962, + "grad_norm": 1.559033989906311, + "learning_rate": 9.997052631578949e-05, + "loss": 0.5888, + "step": 20122 + }, + { + "epoch": 1.1268339119722253, + "grad_norm": 1.3969683647155762, + "learning_rate": 9.997026315789473e-05, + "loss": 0.4524, + "step": 20123 + }, + { + "epoch": 1.1268899092843543, + "grad_norm": 1.4289509057998657, + "learning_rate": 9.997e-05, + "loss": 0.4408, + "step": 20124 + }, + { + "epoch": 1.1269459065964833, + "grad_norm": 1.3135063648223877, + "learning_rate": 9.996973684210527e-05, + "loss": 0.5331, + "step": 20125 + }, + { + "epoch": 1.1270019039086123, + "grad_norm": 1.2597249746322632, + "learning_rate": 9.996947368421054e-05, + "loss": 0.3825, + "step": 20126 + }, + { + "epoch": 1.1270579012207413, + "grad_norm": 1.3031622171401978, + "learning_rate": 9.99692105263158e-05, + "loss": 0.5424, + "step": 20127 + }, + { + "epoch": 1.1271138985328704, + "grad_norm": 1.5318948030471802, + "learning_rate": 9.996894736842106e-05, + "loss": 0.4662, + "step": 20128 + }, + { + "epoch": 1.1271698958449994, + "grad_norm": 1.6408461332321167, + "learning_rate": 9.996868421052632e-05, + "loss": 0.4568, + "step": 20129 + }, + { + "epoch": 1.1272258931571284, + "grad_norm": 1.4029548168182373, + "learning_rate": 9.996842105263159e-05, + "loss": 0.5072, + "step": 20130 + }, + { + "epoch": 1.1272818904692574, + "grad_norm": 
1.8679447174072266, + "learning_rate": 9.996815789473685e-05, + "loss": 0.6695, + "step": 20131 + }, + { + "epoch": 1.1273378877813864, + "grad_norm": 1.3182162046432495, + "learning_rate": 9.99678947368421e-05, + "loss": 0.477, + "step": 20132 + }, + { + "epoch": 1.1273938850935155, + "grad_norm": 1.8410577774047852, + "learning_rate": 9.996763157894737e-05, + "loss": 0.5332, + "step": 20133 + }, + { + "epoch": 1.1274498824056445, + "grad_norm": 1.2833086252212524, + "learning_rate": 9.996736842105263e-05, + "loss": 0.4246, + "step": 20134 + }, + { + "epoch": 1.1275058797177735, + "grad_norm": 1.196503758430481, + "learning_rate": 9.99671052631579e-05, + "loss": 0.4582, + "step": 20135 + }, + { + "epoch": 1.1275618770299025, + "grad_norm": 1.2653039693832397, + "learning_rate": 9.996684210526316e-05, + "loss": 0.4371, + "step": 20136 + }, + { + "epoch": 1.1276178743420315, + "grad_norm": 1.2870479822158813, + "learning_rate": 9.996657894736842e-05, + "loss": 0.3413, + "step": 20137 + }, + { + "epoch": 1.1276738716541606, + "grad_norm": 1.4446899890899658, + "learning_rate": 9.996631578947368e-05, + "loss": 0.3806, + "step": 20138 + }, + { + "epoch": 1.1277298689662896, + "grad_norm": 1.706510066986084, + "learning_rate": 9.996605263157896e-05, + "loss": 0.6093, + "step": 20139 + }, + { + "epoch": 1.1277858662784186, + "grad_norm": 1.2224639654159546, + "learning_rate": 9.996578947368422e-05, + "loss": 0.4163, + "step": 20140 + }, + { + "epoch": 1.1278418635905476, + "grad_norm": 1.2918223142623901, + "learning_rate": 9.996552631578948e-05, + "loss": 0.4171, + "step": 20141 + }, + { + "epoch": 1.1278978609026766, + "grad_norm": 1.895620346069336, + "learning_rate": 9.996526315789474e-05, + "loss": 0.434, + "step": 20142 + }, + { + "epoch": 1.1279538582148056, + "grad_norm": 1.5262978076934814, + "learning_rate": 9.996500000000001e-05, + "loss": 0.4186, + "step": 20143 + }, + { + "epoch": 1.1280098555269347, + "grad_norm": 1.4467686414718628, + "learning_rate": 
9.996473684210527e-05, + "loss": 0.4421, + "step": 20144 + }, + { + "epoch": 1.1280658528390637, + "grad_norm": 1.2918657064437866, + "learning_rate": 9.996447368421054e-05, + "loss": 0.4428, + "step": 20145 + }, + { + "epoch": 1.1281218501511927, + "grad_norm": 1.3140455484390259, + "learning_rate": 9.996421052631579e-05, + "loss": 0.4878, + "step": 20146 + }, + { + "epoch": 1.1281778474633217, + "grad_norm": 1.4031810760498047, + "learning_rate": 9.996394736842106e-05, + "loss": 0.5236, + "step": 20147 + }, + { + "epoch": 1.1282338447754507, + "grad_norm": 1.2455710172653198, + "learning_rate": 9.996368421052632e-05, + "loss": 0.4509, + "step": 20148 + }, + { + "epoch": 1.1282898420875798, + "grad_norm": 1.3842425346374512, + "learning_rate": 9.996342105263158e-05, + "loss": 0.4766, + "step": 20149 + }, + { + "epoch": 1.1283458393997088, + "grad_norm": 1.1807523965835571, + "learning_rate": 9.996315789473684e-05, + "loss": 0.401, + "step": 20150 + }, + { + "epoch": 1.1284018367118378, + "grad_norm": 1.1878485679626465, + "learning_rate": 9.99628947368421e-05, + "loss": 0.455, + "step": 20151 + }, + { + "epoch": 1.1284578340239668, + "grad_norm": 1.2592793703079224, + "learning_rate": 9.996263157894737e-05, + "loss": 0.4084, + "step": 20152 + }, + { + "epoch": 1.1285138313360958, + "grad_norm": 1.6455912590026855, + "learning_rate": 9.996236842105263e-05, + "loss": 0.4427, + "step": 20153 + }, + { + "epoch": 1.1285698286482249, + "grad_norm": 1.357524037361145, + "learning_rate": 9.996210526315791e-05, + "loss": 0.3725, + "step": 20154 + }, + { + "epoch": 1.1286258259603539, + "grad_norm": 1.5846613645553589, + "learning_rate": 9.996184210526315e-05, + "loss": 0.4286, + "step": 20155 + }, + { + "epoch": 1.128681823272483, + "grad_norm": 1.5454031229019165, + "learning_rate": 9.996157894736843e-05, + "loss": 0.5492, + "step": 20156 + }, + { + "epoch": 1.128737820584612, + "grad_norm": 1.4770495891571045, + "learning_rate": 9.996131578947369e-05, + "loss": 0.4957, + 
"step": 20157 + }, + { + "epoch": 1.128793817896741, + "grad_norm": 1.6208133697509766, + "learning_rate": 9.996105263157896e-05, + "loss": 0.4944, + "step": 20158 + }, + { + "epoch": 1.12884981520887, + "grad_norm": 1.5294944047927856, + "learning_rate": 9.996078947368422e-05, + "loss": 0.5993, + "step": 20159 + }, + { + "epoch": 1.128905812520999, + "grad_norm": 1.2093846797943115, + "learning_rate": 9.996052631578948e-05, + "loss": 0.3749, + "step": 20160 + }, + { + "epoch": 1.128961809833128, + "grad_norm": 1.3696269989013672, + "learning_rate": 9.996026315789474e-05, + "loss": 0.4462, + "step": 20161 + }, + { + "epoch": 1.129017807145257, + "grad_norm": 1.315756916999817, + "learning_rate": 9.996000000000001e-05, + "loss": 0.4758, + "step": 20162 + }, + { + "epoch": 1.129073804457386, + "grad_norm": 1.4041450023651123, + "learning_rate": 9.995973684210527e-05, + "loss": 0.3999, + "step": 20163 + }, + { + "epoch": 1.129129801769515, + "grad_norm": 1.455556035041809, + "learning_rate": 9.995947368421053e-05, + "loss": 0.4594, + "step": 20164 + }, + { + "epoch": 1.129185799081644, + "grad_norm": 1.2051758766174316, + "learning_rate": 9.995921052631579e-05, + "loss": 0.3883, + "step": 20165 + }, + { + "epoch": 1.129241796393773, + "grad_norm": 1.2870639562606812, + "learning_rate": 9.995894736842105e-05, + "loss": 0.5278, + "step": 20166 + }, + { + "epoch": 1.129297793705902, + "grad_norm": 1.8627378940582275, + "learning_rate": 9.995868421052632e-05, + "loss": 0.4947, + "step": 20167 + }, + { + "epoch": 1.1293537910180311, + "grad_norm": 1.3675086498260498, + "learning_rate": 9.995842105263158e-05, + "loss": 0.3386, + "step": 20168 + }, + { + "epoch": 1.1294097883301601, + "grad_norm": 1.2064725160598755, + "learning_rate": 9.995815789473684e-05, + "loss": 0.4058, + "step": 20169 + }, + { + "epoch": 1.1294657856422892, + "grad_norm": 1.3493707180023193, + "learning_rate": 9.99578947368421e-05, + "loss": 0.4858, + "step": 20170 + }, + { + "epoch": 
1.1295217829544182, + "grad_norm": 1.31057870388031, + "learning_rate": 9.995763157894738e-05, + "loss": 0.3552, + "step": 20171 + }, + { + "epoch": 1.1295777802665472, + "grad_norm": 1.3686400651931763, + "learning_rate": 9.995736842105264e-05, + "loss": 0.4329, + "step": 20172 + }, + { + "epoch": 1.1296337775786762, + "grad_norm": 1.170465350151062, + "learning_rate": 9.99571052631579e-05, + "loss": 0.3591, + "step": 20173 + }, + { + "epoch": 1.1296897748908052, + "grad_norm": 1.2574832439422607, + "learning_rate": 9.995684210526316e-05, + "loss": 0.4382, + "step": 20174 + }, + { + "epoch": 1.1297457722029343, + "grad_norm": 1.652482509613037, + "learning_rate": 9.995657894736843e-05, + "loss": 0.4608, + "step": 20175 + }, + { + "epoch": 1.1298017695150633, + "grad_norm": 1.4721181392669678, + "learning_rate": 9.995631578947369e-05, + "loss": 0.4936, + "step": 20176 + }, + { + "epoch": 1.1298577668271923, + "grad_norm": 1.2871087789535522, + "learning_rate": 9.995605263157896e-05, + "loss": 0.3359, + "step": 20177 + }, + { + "epoch": 1.1299137641393213, + "grad_norm": 1.4253180027008057, + "learning_rate": 9.995578947368421e-05, + "loss": 0.4732, + "step": 20178 + }, + { + "epoch": 1.1299697614514503, + "grad_norm": 1.4310779571533203, + "learning_rate": 9.995552631578948e-05, + "loss": 0.45, + "step": 20179 + }, + { + "epoch": 1.1300257587635794, + "grad_norm": 1.3896969556808472, + "learning_rate": 9.995526315789474e-05, + "loss": 0.4549, + "step": 20180 + }, + { + "epoch": 1.1300817560757084, + "grad_norm": 4.925571441650391, + "learning_rate": 9.995500000000001e-05, + "loss": 0.4441, + "step": 20181 + }, + { + "epoch": 1.1301377533878374, + "grad_norm": 1.9883801937103271, + "learning_rate": 9.995473684210527e-05, + "loss": 0.431, + "step": 20182 + }, + { + "epoch": 1.1301937506999664, + "grad_norm": 1.221846580505371, + "learning_rate": 9.995447368421052e-05, + "loss": 0.3792, + "step": 20183 + }, + { + "epoch": 1.1302497480120954, + "grad_norm": 
1.3770896196365356, + "learning_rate": 9.99542105263158e-05, + "loss": 0.5133, + "step": 20184 + }, + { + "epoch": 1.1303057453242245, + "grad_norm": 1.4019097089767456, + "learning_rate": 9.995394736842105e-05, + "loss": 0.5023, + "step": 20185 + }, + { + "epoch": 1.1303617426363535, + "grad_norm": 1.2293190956115723, + "learning_rate": 9.995368421052633e-05, + "loss": 0.4406, + "step": 20186 + }, + { + "epoch": 1.1304177399484825, + "grad_norm": 1.328400731086731, + "learning_rate": 9.995342105263157e-05, + "loss": 0.4856, + "step": 20187 + }, + { + "epoch": 1.1304737372606115, + "grad_norm": 1.5837477445602417, + "learning_rate": 9.995315789473685e-05, + "loss": 0.4598, + "step": 20188 + }, + { + "epoch": 1.1305297345727405, + "grad_norm": 1.52598237991333, + "learning_rate": 9.99528947368421e-05, + "loss": 0.4943, + "step": 20189 + }, + { + "epoch": 1.1305857318848695, + "grad_norm": 1.2836613655090332, + "learning_rate": 9.995263157894738e-05, + "loss": 0.431, + "step": 20190 + }, + { + "epoch": 1.1306417291969986, + "grad_norm": 2.122936487197876, + "learning_rate": 9.995236842105264e-05, + "loss": 0.5085, + "step": 20191 + }, + { + "epoch": 1.1306977265091276, + "grad_norm": 1.7128816843032837, + "learning_rate": 9.99521052631579e-05, + "loss": 0.4088, + "step": 20192 + }, + { + "epoch": 1.1307537238212566, + "grad_norm": 2.03788685798645, + "learning_rate": 9.995184210526316e-05, + "loss": 0.6117, + "step": 20193 + }, + { + "epoch": 1.1308097211333856, + "grad_norm": 1.322121024131775, + "learning_rate": 9.995157894736843e-05, + "loss": 0.323, + "step": 20194 + }, + { + "epoch": 1.1308657184455146, + "grad_norm": 1.515406847000122, + "learning_rate": 9.995131578947369e-05, + "loss": 0.4084, + "step": 20195 + }, + { + "epoch": 1.1309217157576437, + "grad_norm": 1.3654356002807617, + "learning_rate": 9.995105263157895e-05, + "loss": 0.3751, + "step": 20196 + }, + { + "epoch": 1.1309777130697727, + "grad_norm": 1.6777520179748535, + "learning_rate": 
9.995078947368421e-05, + "loss": 0.5004, + "step": 20197 + }, + { + "epoch": 1.1310337103819017, + "grad_norm": 1.3237303495407104, + "learning_rate": 9.995052631578948e-05, + "loss": 0.4237, + "step": 20198 + }, + { + "epoch": 1.1310897076940307, + "grad_norm": 1.4779011011123657, + "learning_rate": 9.995026315789474e-05, + "loss": 0.3831, + "step": 20199 + }, + { + "epoch": 1.1311457050061597, + "grad_norm": 1.2313202619552612, + "learning_rate": 9.995e-05, + "loss": 0.3035, + "step": 20200 + }, + { + "epoch": 1.1312017023182888, + "grad_norm": 1.7041383981704712, + "learning_rate": 9.994973684210526e-05, + "loss": 0.4756, + "step": 20201 + }, + { + "epoch": 1.1312576996304178, + "grad_norm": 1.3493001461029053, + "learning_rate": 9.994947368421052e-05, + "loss": 0.6141, + "step": 20202 + }, + { + "epoch": 1.1313136969425468, + "grad_norm": 1.3731756210327148, + "learning_rate": 9.99492105263158e-05, + "loss": 0.3948, + "step": 20203 + }, + { + "epoch": 1.1313696942546758, + "grad_norm": 1.9412846565246582, + "learning_rate": 9.994894736842106e-05, + "loss": 0.5277, + "step": 20204 + }, + { + "epoch": 1.1314256915668048, + "grad_norm": 1.3892247676849365, + "learning_rate": 9.994868421052632e-05, + "loss": 0.5429, + "step": 20205 + }, + { + "epoch": 1.1314816888789339, + "grad_norm": 1.6253856420516968, + "learning_rate": 9.994842105263158e-05, + "loss": 0.611, + "step": 20206 + }, + { + "epoch": 1.1315376861910629, + "grad_norm": 1.4898595809936523, + "learning_rate": 9.994815789473685e-05, + "loss": 0.3932, + "step": 20207 + }, + { + "epoch": 1.131593683503192, + "grad_norm": 1.2663226127624512, + "learning_rate": 9.994789473684211e-05, + "loss": 0.5623, + "step": 20208 + }, + { + "epoch": 1.131649680815321, + "grad_norm": 1.4919707775115967, + "learning_rate": 9.994763157894738e-05, + "loss": 0.4961, + "step": 20209 + }, + { + "epoch": 1.13170567812745, + "grad_norm": 1.314507007598877, + "learning_rate": 9.994736842105263e-05, + "loss": 0.503, + "step": 20210 
+ }, + { + "epoch": 1.131761675439579, + "grad_norm": 1.7011282444000244, + "learning_rate": 9.99471052631579e-05, + "loss": 0.4651, + "step": 20211 + }, + { + "epoch": 1.131817672751708, + "grad_norm": 1.2699919939041138, + "learning_rate": 9.994684210526316e-05, + "loss": 0.493, + "step": 20212 + }, + { + "epoch": 1.131873670063837, + "grad_norm": 1.4917173385620117, + "learning_rate": 9.994657894736843e-05, + "loss": 0.4717, + "step": 20213 + }, + { + "epoch": 1.131929667375966, + "grad_norm": 1.2997092008590698, + "learning_rate": 9.99463157894737e-05, + "loss": 0.5401, + "step": 20214 + }, + { + "epoch": 1.131985664688095, + "grad_norm": 2.0021791458129883, + "learning_rate": 9.994605263157895e-05, + "loss": 0.6804, + "step": 20215 + }, + { + "epoch": 1.132041662000224, + "grad_norm": 1.6106153726577759, + "learning_rate": 9.994578947368421e-05, + "loss": 0.3436, + "step": 20216 + }, + { + "epoch": 1.132097659312353, + "grad_norm": 1.16148841381073, + "learning_rate": 9.994552631578949e-05, + "loss": 0.4427, + "step": 20217 + }, + { + "epoch": 1.132153656624482, + "grad_norm": 1.8368303775787354, + "learning_rate": 9.994526315789475e-05, + "loss": 0.5691, + "step": 20218 + }, + { + "epoch": 1.132209653936611, + "grad_norm": 1.274310827255249, + "learning_rate": 9.9945e-05, + "loss": 0.4706, + "step": 20219 + }, + { + "epoch": 1.1322656512487401, + "grad_norm": 1.1377419233322144, + "learning_rate": 9.994473684210527e-05, + "loss": 0.4518, + "step": 20220 + }, + { + "epoch": 1.1323216485608691, + "grad_norm": 1.1848833560943604, + "learning_rate": 9.994447368421053e-05, + "loss": 0.3248, + "step": 20221 + }, + { + "epoch": 1.1323776458729982, + "grad_norm": 1.4867243766784668, + "learning_rate": 9.99442105263158e-05, + "loss": 0.4576, + "step": 20222 + }, + { + "epoch": 1.1324336431851272, + "grad_norm": 1.7923411130905151, + "learning_rate": 9.994394736842106e-05, + "loss": 0.4153, + "step": 20223 + }, + { + "epoch": 1.1324896404972562, + "grad_norm": 
1.1907631158828735, + "learning_rate": 9.994368421052632e-05, + "loss": 0.3663, + "step": 20224 + }, + { + "epoch": 1.1325456378093852, + "grad_norm": 1.2464754581451416, + "learning_rate": 9.994342105263158e-05, + "loss": 0.4706, + "step": 20225 + }, + { + "epoch": 1.1326016351215142, + "grad_norm": 1.1599808931350708, + "learning_rate": 9.994315789473685e-05, + "loss": 0.4196, + "step": 20226 + }, + { + "epoch": 1.1326576324336433, + "grad_norm": 1.3001772165298462, + "learning_rate": 9.994289473684211e-05, + "loss": 0.4365, + "step": 20227 + }, + { + "epoch": 1.1327136297457723, + "grad_norm": 1.241241455078125, + "learning_rate": 9.994263157894737e-05, + "loss": 0.3607, + "step": 20228 + }, + { + "epoch": 1.1327696270579013, + "grad_norm": 1.176579475402832, + "learning_rate": 9.994236842105263e-05, + "loss": 0.2961, + "step": 20229 + }, + { + "epoch": 1.1328256243700303, + "grad_norm": 1.525162696838379, + "learning_rate": 9.99421052631579e-05, + "loss": 0.5938, + "step": 20230 + }, + { + "epoch": 1.1328816216821593, + "grad_norm": 1.3000282049179077, + "learning_rate": 9.994184210526316e-05, + "loss": 0.4874, + "step": 20231 + }, + { + "epoch": 1.1329376189942884, + "grad_norm": 1.2870616912841797, + "learning_rate": 9.994157894736844e-05, + "loss": 0.4187, + "step": 20232 + }, + { + "epoch": 1.1329936163064174, + "grad_norm": 1.3188973665237427, + "learning_rate": 9.994131578947368e-05, + "loss": 0.4117, + "step": 20233 + }, + { + "epoch": 1.1330496136185464, + "grad_norm": 1.2505483627319336, + "learning_rate": 9.994105263157896e-05, + "loss": 0.302, + "step": 20234 + }, + { + "epoch": 1.1331056109306754, + "grad_norm": 1.1003673076629639, + "learning_rate": 9.994078947368422e-05, + "loss": 0.383, + "step": 20235 + }, + { + "epoch": 1.1331616082428044, + "grad_norm": 1.0437648296356201, + "learning_rate": 9.994052631578948e-05, + "loss": 0.4162, + "step": 20236 + }, + { + "epoch": 1.1332176055549334, + "grad_norm": 1.2069073915481567, + "learning_rate": 
9.994026315789475e-05, + "loss": 0.3115, + "step": 20237 + }, + { + "epoch": 1.1332736028670625, + "grad_norm": 1.2497919797897339, + "learning_rate": 9.994e-05, + "loss": 0.4764, + "step": 20238 + }, + { + "epoch": 1.1333296001791915, + "grad_norm": 1.428944706916809, + "learning_rate": 9.993973684210527e-05, + "loss": 0.5107, + "step": 20239 + }, + { + "epoch": 1.1333855974913205, + "grad_norm": 1.3723713159561157, + "learning_rate": 9.993947368421053e-05, + "loss": 0.5085, + "step": 20240 + }, + { + "epoch": 1.1334415948034495, + "grad_norm": 1.3960775136947632, + "learning_rate": 9.99392105263158e-05, + "loss": 0.5389, + "step": 20241 + }, + { + "epoch": 1.1334975921155785, + "grad_norm": 1.1459434032440186, + "learning_rate": 9.993894736842105e-05, + "loss": 0.3664, + "step": 20242 + }, + { + "epoch": 1.1335535894277076, + "grad_norm": 1.4901725053787231, + "learning_rate": 9.993868421052632e-05, + "loss": 0.5682, + "step": 20243 + }, + { + "epoch": 1.1336095867398366, + "grad_norm": 1.2028594017028809, + "learning_rate": 9.993842105263158e-05, + "loss": 0.2462, + "step": 20244 + }, + { + "epoch": 1.1336655840519656, + "grad_norm": 1.3778281211853027, + "learning_rate": 9.993815789473685e-05, + "loss": 0.4629, + "step": 20245 + }, + { + "epoch": 1.1337215813640946, + "grad_norm": 1.487456202507019, + "learning_rate": 9.993789473684211e-05, + "loss": 0.4372, + "step": 20246 + }, + { + "epoch": 1.1337775786762236, + "grad_norm": 2.2966628074645996, + "learning_rate": 9.993763157894737e-05, + "loss": 0.6028, + "step": 20247 + }, + { + "epoch": 1.1338335759883527, + "grad_norm": 1.343872308731079, + "learning_rate": 9.993736842105263e-05, + "loss": 0.3569, + "step": 20248 + }, + { + "epoch": 1.1338895733004817, + "grad_norm": 1.3263475894927979, + "learning_rate": 9.99371052631579e-05, + "loss": 0.4684, + "step": 20249 + }, + { + "epoch": 1.1339455706126107, + "grad_norm": 1.538833737373352, + "learning_rate": 9.993684210526317e-05, + "loss": 0.5163, + "step": 
20250 + }, + { + "epoch": 1.1340015679247397, + "grad_norm": 1.429449200630188, + "learning_rate": 9.993657894736843e-05, + "loss": 0.454, + "step": 20251 + }, + { + "epoch": 1.1340575652368687, + "grad_norm": 1.3572133779525757, + "learning_rate": 9.993631578947369e-05, + "loss": 0.6305, + "step": 20252 + }, + { + "epoch": 1.1341135625489978, + "grad_norm": 1.3160983324050903, + "learning_rate": 9.993605263157895e-05, + "loss": 0.3628, + "step": 20253 + }, + { + "epoch": 1.1341695598611268, + "grad_norm": 1.3186285495758057, + "learning_rate": 9.993578947368422e-05, + "loss": 0.467, + "step": 20254 + }, + { + "epoch": 1.1342255571732558, + "grad_norm": 1.1341521739959717, + "learning_rate": 9.993552631578948e-05, + "loss": 0.3895, + "step": 20255 + }, + { + "epoch": 1.1342815544853848, + "grad_norm": 1.5238111019134521, + "learning_rate": 9.993526315789474e-05, + "loss": 0.3964, + "step": 20256 + }, + { + "epoch": 1.1343375517975138, + "grad_norm": 1.2447681427001953, + "learning_rate": 9.9935e-05, + "loss": 0.4356, + "step": 20257 + }, + { + "epoch": 1.1343935491096429, + "grad_norm": 1.2781203985214233, + "learning_rate": 9.993473684210527e-05, + "loss": 0.3634, + "step": 20258 + }, + { + "epoch": 1.1344495464217719, + "grad_norm": 1.5016010999679565, + "learning_rate": 9.993447368421053e-05, + "loss": 0.4863, + "step": 20259 + }, + { + "epoch": 1.134505543733901, + "grad_norm": 1.2829502820968628, + "learning_rate": 9.993421052631579e-05, + "loss": 0.465, + "step": 20260 + }, + { + "epoch": 1.13456154104603, + "grad_norm": 1.2866705656051636, + "learning_rate": 9.993394736842105e-05, + "loss": 0.3207, + "step": 20261 + }, + { + "epoch": 1.134617538358159, + "grad_norm": 1.5260947942733765, + "learning_rate": 9.993368421052632e-05, + "loss": 0.4029, + "step": 20262 + }, + { + "epoch": 1.134673535670288, + "grad_norm": 1.3858051300048828, + "learning_rate": 9.993342105263158e-05, + "loss": 0.3886, + "step": 20263 + }, + { + "epoch": 1.134729532982417, + 
"grad_norm": 1.4056178331375122, + "learning_rate": 9.993315789473686e-05, + "loss": 0.6166, + "step": 20264 + }, + { + "epoch": 1.134785530294546, + "grad_norm": 2.4712114334106445, + "learning_rate": 9.99328947368421e-05, + "loss": 0.4633, + "step": 20265 + }, + { + "epoch": 1.1348415276066748, + "grad_norm": 1.2079639434814453, + "learning_rate": 9.993263157894738e-05, + "loss": 0.385, + "step": 20266 + }, + { + "epoch": 1.1348975249188038, + "grad_norm": 1.3643262386322021, + "learning_rate": 9.993236842105264e-05, + "loss": 0.4943, + "step": 20267 + }, + { + "epoch": 1.1349535222309328, + "grad_norm": 1.1451470851898193, + "learning_rate": 9.993210526315791e-05, + "loss": 0.3125, + "step": 20268 + }, + { + "epoch": 1.1350095195430618, + "grad_norm": 1.769522786140442, + "learning_rate": 9.993184210526317e-05, + "loss": 0.4586, + "step": 20269 + }, + { + "epoch": 1.1350655168551909, + "grad_norm": 9.267542839050293, + "learning_rate": 9.993157894736841e-05, + "loss": 0.4237, + "step": 20270 + }, + { + "epoch": 1.1351215141673199, + "grad_norm": 3.177318572998047, + "learning_rate": 9.993131578947369e-05, + "loss": 0.4726, + "step": 20271 + }, + { + "epoch": 1.135177511479449, + "grad_norm": 1.3887708187103271, + "learning_rate": 9.993105263157895e-05, + "loss": 0.4423, + "step": 20272 + }, + { + "epoch": 1.135233508791578, + "grad_norm": 1.4719115495681763, + "learning_rate": 9.993078947368422e-05, + "loss": 0.449, + "step": 20273 + }, + { + "epoch": 1.135289506103707, + "grad_norm": 1.4405137300491333, + "learning_rate": 9.993052631578948e-05, + "loss": 0.4776, + "step": 20274 + }, + { + "epoch": 1.135345503415836, + "grad_norm": 1.2587157487869263, + "learning_rate": 9.993026315789474e-05, + "loss": 0.4237, + "step": 20275 + }, + { + "epoch": 1.135401500727965, + "grad_norm": 1.2803078889846802, + "learning_rate": 9.993e-05, + "loss": 0.4338, + "step": 20276 + }, + { + "epoch": 1.135457498040094, + "grad_norm": 1.1684249639511108, + "learning_rate": 
9.992973684210527e-05, + "loss": 0.3643, + "step": 20277 + }, + { + "epoch": 1.135513495352223, + "grad_norm": 1.356095314025879, + "learning_rate": 9.992947368421053e-05, + "loss": 0.5039, + "step": 20278 + }, + { + "epoch": 1.135569492664352, + "grad_norm": 1.1681362390518188, + "learning_rate": 9.992921052631579e-05, + "loss": 0.4442, + "step": 20279 + }, + { + "epoch": 1.135625489976481, + "grad_norm": 1.3019719123840332, + "learning_rate": 9.992894736842105e-05, + "loss": 0.4194, + "step": 20280 + }, + { + "epoch": 1.13568148728861, + "grad_norm": 1.1020950078964233, + "learning_rate": 9.992868421052633e-05, + "loss": 0.3683, + "step": 20281 + }, + { + "epoch": 1.135737484600739, + "grad_norm": 1.5293177366256714, + "learning_rate": 9.992842105263159e-05, + "loss": 0.494, + "step": 20282 + }, + { + "epoch": 1.135793481912868, + "grad_norm": 1.2927979230880737, + "learning_rate": 9.992815789473685e-05, + "loss": 0.4483, + "step": 20283 + }, + { + "epoch": 1.1358494792249971, + "grad_norm": 1.9020200967788696, + "learning_rate": 9.99278947368421e-05, + "loss": 0.5442, + "step": 20284 + }, + { + "epoch": 1.1359054765371261, + "grad_norm": 1.4373677968978882, + "learning_rate": 9.992763157894738e-05, + "loss": 0.4161, + "step": 20285 + }, + { + "epoch": 1.1359614738492552, + "grad_norm": 1.4182528257369995, + "learning_rate": 9.992736842105264e-05, + "loss": 0.4502, + "step": 20286 + }, + { + "epoch": 1.1360174711613842, + "grad_norm": 1.112560510635376, + "learning_rate": 9.99271052631579e-05, + "loss": 0.3414, + "step": 20287 + }, + { + "epoch": 1.1360734684735132, + "grad_norm": 1.262067198753357, + "learning_rate": 9.992684210526316e-05, + "loss": 0.4059, + "step": 20288 + }, + { + "epoch": 1.1361294657856422, + "grad_norm": 1.3725446462631226, + "learning_rate": 9.992657894736842e-05, + "loss": 0.44, + "step": 20289 + }, + { + "epoch": 1.1361854630977712, + "grad_norm": 1.2827565670013428, + "learning_rate": 9.992631578947369e-05, + "loss": 0.4307, + "step": 
20290 + }, + { + "epoch": 1.1362414604099003, + "grad_norm": 1.291059136390686, + "learning_rate": 9.992605263157895e-05, + "loss": 0.4441, + "step": 20291 + }, + { + "epoch": 1.1362974577220293, + "grad_norm": 1.5054551362991333, + "learning_rate": 9.992578947368422e-05, + "loss": 0.477, + "step": 20292 + }, + { + "epoch": 1.1363534550341583, + "grad_norm": 1.8364559412002563, + "learning_rate": 9.992552631578947e-05, + "loss": 0.4822, + "step": 20293 + }, + { + "epoch": 1.1364094523462873, + "grad_norm": 1.191460371017456, + "learning_rate": 9.992526315789474e-05, + "loss": 0.4269, + "step": 20294 + }, + { + "epoch": 1.1364654496584163, + "grad_norm": 3.196669578552246, + "learning_rate": 9.9925e-05, + "loss": 0.5738, + "step": 20295 + }, + { + "epoch": 1.1365214469705454, + "grad_norm": 1.3577088117599487, + "learning_rate": 9.992473684210528e-05, + "loss": 0.4304, + "step": 20296 + }, + { + "epoch": 1.1365774442826744, + "grad_norm": 1.5136973857879639, + "learning_rate": 9.992447368421052e-05, + "loss": 0.4075, + "step": 20297 + }, + { + "epoch": 1.1366334415948034, + "grad_norm": 1.4799716472625732, + "learning_rate": 9.99242105263158e-05, + "loss": 0.4673, + "step": 20298 + }, + { + "epoch": 1.1366894389069324, + "grad_norm": 1.289697527885437, + "learning_rate": 9.992394736842106e-05, + "loss": 0.3812, + "step": 20299 + }, + { + "epoch": 1.1367454362190614, + "grad_norm": 1.0879672765731812, + "learning_rate": 9.992368421052633e-05, + "loss": 0.3818, + "step": 20300 + }, + { + "epoch": 1.1368014335311905, + "grad_norm": 1.3830888271331787, + "learning_rate": 9.992342105263159e-05, + "loss": 0.4774, + "step": 20301 + }, + { + "epoch": 1.1368574308433195, + "grad_norm": 1.3574031591415405, + "learning_rate": 9.992315789473685e-05, + "loss": 0.6394, + "step": 20302 + }, + { + "epoch": 1.1369134281554485, + "grad_norm": 1.266954779624939, + "learning_rate": 9.992289473684211e-05, + "loss": 0.4814, + "step": 20303 + }, + { + "epoch": 1.1369694254675775, + 
"grad_norm": 1.6034536361694336, + "learning_rate": 9.992263157894737e-05, + "loss": 0.5257, + "step": 20304 + }, + { + "epoch": 1.1370254227797065, + "grad_norm": 1.2043932676315308, + "learning_rate": 9.992236842105264e-05, + "loss": 0.4585, + "step": 20305 + }, + { + "epoch": 1.1370814200918355, + "grad_norm": 1.4146957397460938, + "learning_rate": 9.99221052631579e-05, + "loss": 0.4716, + "step": 20306 + }, + { + "epoch": 1.1371374174039646, + "grad_norm": 1.373159646987915, + "learning_rate": 9.992184210526316e-05, + "loss": 0.3746, + "step": 20307 + }, + { + "epoch": 1.1371934147160936, + "grad_norm": 1.1979814767837524, + "learning_rate": 9.992157894736842e-05, + "loss": 0.3212, + "step": 20308 + }, + { + "epoch": 1.1372494120282226, + "grad_norm": 1.7314233779907227, + "learning_rate": 9.99213157894737e-05, + "loss": 0.3733, + "step": 20309 + }, + { + "epoch": 1.1373054093403516, + "grad_norm": 1.1924324035644531, + "learning_rate": 9.992105263157895e-05, + "loss": 0.4887, + "step": 20310 + }, + { + "epoch": 1.1373614066524806, + "grad_norm": 1.3776707649230957, + "learning_rate": 9.992078947368421e-05, + "loss": 0.3593, + "step": 20311 + }, + { + "epoch": 1.1374174039646097, + "grad_norm": 1.463600754737854, + "learning_rate": 9.992052631578947e-05, + "loss": 0.4736, + "step": 20312 + }, + { + "epoch": 1.1374734012767387, + "grad_norm": 1.376007080078125, + "learning_rate": 9.992026315789475e-05, + "loss": 0.4454, + "step": 20313 + }, + { + "epoch": 1.1375293985888677, + "grad_norm": 1.5093474388122559, + "learning_rate": 9.992e-05, + "loss": 0.4242, + "step": 20314 + }, + { + "epoch": 1.1375853959009967, + "grad_norm": 1.3137797117233276, + "learning_rate": 9.991973684210527e-05, + "loss": 0.4289, + "step": 20315 + }, + { + "epoch": 1.1376413932131257, + "grad_norm": 1.4704359769821167, + "learning_rate": 9.991947368421053e-05, + "loss": 0.4248, + "step": 20316 + }, + { + "epoch": 1.1376973905252548, + "grad_norm": 1.3900424242019653, + "learning_rate": 
9.99192105263158e-05, + "loss": 0.412, + "step": 20317 + }, + { + "epoch": 1.1377533878373838, + "grad_norm": 1.325015664100647, + "learning_rate": 9.991894736842106e-05, + "loss": 0.5808, + "step": 20318 + }, + { + "epoch": 1.1378093851495128, + "grad_norm": 1.2525914907455444, + "learning_rate": 9.991868421052633e-05, + "loss": 0.5415, + "step": 20319 + }, + { + "epoch": 1.1378653824616418, + "grad_norm": 1.4956260919570923, + "learning_rate": 9.991842105263158e-05, + "loss": 0.4928, + "step": 20320 + }, + { + "epoch": 1.1379213797737708, + "grad_norm": 1.1873290538787842, + "learning_rate": 9.991815789473685e-05, + "loss": 0.3795, + "step": 20321 + }, + { + "epoch": 1.1379773770858999, + "grad_norm": 1.2872254848480225, + "learning_rate": 9.991789473684211e-05, + "loss": 0.4509, + "step": 20322 + }, + { + "epoch": 1.1380333743980289, + "grad_norm": 1.3473589420318604, + "learning_rate": 9.991763157894737e-05, + "loss": 0.343, + "step": 20323 + }, + { + "epoch": 1.138089371710158, + "grad_norm": 1.6654990911483765, + "learning_rate": 9.991736842105264e-05, + "loss": 0.5751, + "step": 20324 + }, + { + "epoch": 1.138145369022287, + "grad_norm": 1.8152382373809814, + "learning_rate": 9.991710526315789e-05, + "loss": 0.3431, + "step": 20325 + }, + { + "epoch": 1.138201366334416, + "grad_norm": 1.5003809928894043, + "learning_rate": 9.991684210526316e-05, + "loss": 0.4206, + "step": 20326 + }, + { + "epoch": 1.138257363646545, + "grad_norm": 1.2405860424041748, + "learning_rate": 9.991657894736842e-05, + "loss": 0.3798, + "step": 20327 + }, + { + "epoch": 1.138313360958674, + "grad_norm": 1.306026577949524, + "learning_rate": 9.99163157894737e-05, + "loss": 0.4388, + "step": 20328 + }, + { + "epoch": 1.138369358270803, + "grad_norm": 1.517681360244751, + "learning_rate": 9.991605263157896e-05, + "loss": 0.5016, + "step": 20329 + }, + { + "epoch": 1.138425355582932, + "grad_norm": 2.5119831562042236, + "learning_rate": 9.991578947368422e-05, + "loss": 0.4232, + "step": 
20330 + }, + { + "epoch": 1.138481352895061, + "grad_norm": 1.238312005996704, + "learning_rate": 9.991552631578948e-05, + "loss": 0.3336, + "step": 20331 + }, + { + "epoch": 1.13853735020719, + "grad_norm": 1.3370847702026367, + "learning_rate": 9.991526315789475e-05, + "loss": 0.474, + "step": 20332 + }, + { + "epoch": 1.138593347519319, + "grad_norm": 3.627495527267456, + "learning_rate": 9.991500000000001e-05, + "loss": 0.5222, + "step": 20333 + }, + { + "epoch": 1.138649344831448, + "grad_norm": 1.314353585243225, + "learning_rate": 9.991473684210527e-05, + "loss": 0.4736, + "step": 20334 + }, + { + "epoch": 1.138705342143577, + "grad_norm": 1.693156361579895, + "learning_rate": 9.991447368421053e-05, + "loss": 0.5348, + "step": 20335 + }, + { + "epoch": 1.1387613394557061, + "grad_norm": 5.299622535705566, + "learning_rate": 9.99142105263158e-05, + "loss": 0.4973, + "step": 20336 + }, + { + "epoch": 1.1388173367678351, + "grad_norm": 1.2720335721969604, + "learning_rate": 9.991394736842106e-05, + "loss": 0.3379, + "step": 20337 + }, + { + "epoch": 1.1388733340799642, + "grad_norm": 1.6211950778961182, + "learning_rate": 9.991368421052632e-05, + "loss": 0.4027, + "step": 20338 + }, + { + "epoch": 1.1389293313920932, + "grad_norm": 1.5061919689178467, + "learning_rate": 9.991342105263158e-05, + "loss": 0.6308, + "step": 20339 + }, + { + "epoch": 1.1389853287042222, + "grad_norm": 1.3513188362121582, + "learning_rate": 9.991315789473684e-05, + "loss": 0.4841, + "step": 20340 + }, + { + "epoch": 1.1390413260163512, + "grad_norm": 1.410709023475647, + "learning_rate": 9.991289473684211e-05, + "loss": 0.3373, + "step": 20341 + }, + { + "epoch": 1.1390973233284802, + "grad_norm": 1.2556793689727783, + "learning_rate": 9.991263157894737e-05, + "loss": 0.4272, + "step": 20342 + }, + { + "epoch": 1.1391533206406093, + "grad_norm": 1.5008658170700073, + "learning_rate": 9.991236842105263e-05, + "loss": 0.5107, + "step": 20343 + }, + { + "epoch": 1.1392093179527383, + 
"grad_norm": 1.5143998861312866, + "learning_rate": 9.991210526315789e-05, + "loss": 0.6184, + "step": 20344 + }, + { + "epoch": 1.1392653152648673, + "grad_norm": 1.224623441696167, + "learning_rate": 9.991184210526317e-05, + "loss": 0.4284, + "step": 20345 + }, + { + "epoch": 1.1393213125769963, + "grad_norm": 1.4190151691436768, + "learning_rate": 9.991157894736843e-05, + "loss": 0.4862, + "step": 20346 + }, + { + "epoch": 1.1393773098891253, + "grad_norm": 1.2998725175857544, + "learning_rate": 9.991131578947368e-05, + "loss": 0.4061, + "step": 20347 + }, + { + "epoch": 1.1394333072012544, + "grad_norm": 1.2712948322296143, + "learning_rate": 9.991105263157894e-05, + "loss": 0.4702, + "step": 20348 + }, + { + "epoch": 1.1394893045133834, + "grad_norm": 1.305171012878418, + "learning_rate": 9.991078947368422e-05, + "loss": 0.3515, + "step": 20349 + }, + { + "epoch": 1.1395453018255124, + "grad_norm": 1.5488324165344238, + "learning_rate": 9.991052631578948e-05, + "loss": 0.4834, + "step": 20350 + }, + { + "epoch": 1.1396012991376414, + "grad_norm": 1.2901452779769897, + "learning_rate": 9.991026315789475e-05, + "loss": 0.4051, + "step": 20351 + }, + { + "epoch": 1.1396572964497704, + "grad_norm": 1.3085746765136719, + "learning_rate": 9.991e-05, + "loss": 0.4558, + "step": 20352 + }, + { + "epoch": 1.1397132937618994, + "grad_norm": 1.3028777837753296, + "learning_rate": 9.990973684210527e-05, + "loss": 0.5685, + "step": 20353 + }, + { + "epoch": 1.1397692910740285, + "grad_norm": 1.3169550895690918, + "learning_rate": 9.990947368421053e-05, + "loss": 0.3991, + "step": 20354 + }, + { + "epoch": 1.1398252883861575, + "grad_norm": 1.3387531042099, + "learning_rate": 9.99092105263158e-05, + "loss": 0.3692, + "step": 20355 + }, + { + "epoch": 1.1398812856982865, + "grad_norm": 1.3168439865112305, + "learning_rate": 9.990894736842106e-05, + "loss": 0.3662, + "step": 20356 + }, + { + "epoch": 1.1399372830104155, + "grad_norm": 1.3575280904769897, + "learning_rate": 
9.990868421052631e-05, + "loss": 0.4071, + "step": 20357 + }, + { + "epoch": 1.1399932803225445, + "grad_norm": 1.6750175952911377, + "learning_rate": 9.990842105263158e-05, + "loss": 0.6067, + "step": 20358 + }, + { + "epoch": 1.1400492776346736, + "grad_norm": 1.2230628728866577, + "learning_rate": 9.990815789473684e-05, + "loss": 0.4325, + "step": 20359 + }, + { + "epoch": 1.1401052749468026, + "grad_norm": 1.4564143419265747, + "learning_rate": 9.990789473684212e-05, + "loss": 0.3646, + "step": 20360 + }, + { + "epoch": 1.1401612722589316, + "grad_norm": 1.7707886695861816, + "learning_rate": 9.990763157894738e-05, + "loss": 0.3448, + "step": 20361 + }, + { + "epoch": 1.1402172695710606, + "grad_norm": 1.2103395462036133, + "learning_rate": 9.990736842105264e-05, + "loss": 0.4444, + "step": 20362 + }, + { + "epoch": 1.1402732668831896, + "grad_norm": 1.2842963933944702, + "learning_rate": 9.99071052631579e-05, + "loss": 0.5412, + "step": 20363 + }, + { + "epoch": 1.1403292641953187, + "grad_norm": 1.1807801723480225, + "learning_rate": 9.990684210526317e-05, + "loss": 0.4317, + "step": 20364 + }, + { + "epoch": 1.1403852615074477, + "grad_norm": 1.2823046445846558, + "learning_rate": 9.990657894736843e-05, + "loss": 0.5272, + "step": 20365 + }, + { + "epoch": 1.1404412588195767, + "grad_norm": 1.4724619388580322, + "learning_rate": 9.990631578947369e-05, + "loss": 0.4194, + "step": 20366 + }, + { + "epoch": 1.1404972561317057, + "grad_norm": 1.0696430206298828, + "learning_rate": 9.990605263157895e-05, + "loss": 0.311, + "step": 20367 + }, + { + "epoch": 1.1405532534438347, + "grad_norm": 1.2921104431152344, + "learning_rate": 9.990578947368422e-05, + "loss": 0.434, + "step": 20368 + }, + { + "epoch": 1.1406092507559638, + "grad_norm": 1.2019782066345215, + "learning_rate": 9.990552631578948e-05, + "loss": 0.3851, + "step": 20369 + }, + { + "epoch": 1.1406652480680928, + "grad_norm": 1.3826487064361572, + "learning_rate": 9.990526315789474e-05, + "loss": 
0.5371, + "step": 20370 + }, + { + "epoch": 1.1407212453802218, + "grad_norm": 1.5002570152282715, + "learning_rate": 9.9905e-05, + "loss": 0.357, + "step": 20371 + }, + { + "epoch": 1.1407772426923508, + "grad_norm": 1.426370620727539, + "learning_rate": 9.990473684210527e-05, + "loss": 0.5019, + "step": 20372 + }, + { + "epoch": 1.1408332400044798, + "grad_norm": 1.5056532621383667, + "learning_rate": 9.990447368421053e-05, + "loss": 0.5796, + "step": 20373 + }, + { + "epoch": 1.1408892373166089, + "grad_norm": 1.2421802282333374, + "learning_rate": 9.990421052631579e-05, + "loss": 0.3594, + "step": 20374 + }, + { + "epoch": 1.1409452346287379, + "grad_norm": 1.8825669288635254, + "learning_rate": 9.990394736842105e-05, + "loss": 0.4201, + "step": 20375 + }, + { + "epoch": 1.141001231940867, + "grad_norm": 1.3323009014129639, + "learning_rate": 9.990368421052631e-05, + "loss": 0.4136, + "step": 20376 + }, + { + "epoch": 1.141057229252996, + "grad_norm": 1.405066728591919, + "learning_rate": 9.990342105263159e-05, + "loss": 0.4292, + "step": 20377 + }, + { + "epoch": 1.141113226565125, + "grad_norm": 1.2221298217773438, + "learning_rate": 9.990315789473684e-05, + "loss": 0.3495, + "step": 20378 + }, + { + "epoch": 1.141169223877254, + "grad_norm": 1.2865514755249023, + "learning_rate": 9.990289473684212e-05, + "loss": 0.3648, + "step": 20379 + }, + { + "epoch": 1.141225221189383, + "grad_norm": 1.406097173690796, + "learning_rate": 9.990263157894736e-05, + "loss": 0.428, + "step": 20380 + }, + { + "epoch": 1.141281218501512, + "grad_norm": 2.4247918128967285, + "learning_rate": 9.990236842105264e-05, + "loss": 0.4855, + "step": 20381 + }, + { + "epoch": 1.141337215813641, + "grad_norm": 1.4736703634262085, + "learning_rate": 9.99021052631579e-05, + "loss": 0.4213, + "step": 20382 + }, + { + "epoch": 1.14139321312577, + "grad_norm": 1.3291274309158325, + "learning_rate": 9.990184210526317e-05, + "loss": 0.3984, + "step": 20383 + }, + { + "epoch": 1.141449210437899, 
+ "grad_norm": 1.3679965734481812, + "learning_rate": 9.990157894736843e-05, + "loss": 0.4346, + "step": 20384 + }, + { + "epoch": 1.141505207750028, + "grad_norm": 1.220797061920166, + "learning_rate": 9.990131578947369e-05, + "loss": 0.4462, + "step": 20385 + }, + { + "epoch": 1.141561205062157, + "grad_norm": 1.4401963949203491, + "learning_rate": 9.990105263157895e-05, + "loss": 0.357, + "step": 20386 + }, + { + "epoch": 1.141617202374286, + "grad_norm": 1.3059804439544678, + "learning_rate": 9.990078947368422e-05, + "loss": 0.4317, + "step": 20387 + }, + { + "epoch": 1.1416731996864151, + "grad_norm": 2.1954119205474854, + "learning_rate": 9.990052631578948e-05, + "loss": 0.576, + "step": 20388 + }, + { + "epoch": 1.1417291969985441, + "grad_norm": 1.312447190284729, + "learning_rate": 9.990026315789474e-05, + "loss": 0.4316, + "step": 20389 + }, + { + "epoch": 1.1417851943106732, + "grad_norm": 1.3905044794082642, + "learning_rate": 9.99e-05, + "loss": 0.6226, + "step": 20390 + }, + { + "epoch": 1.1418411916228022, + "grad_norm": 1.4395277500152588, + "learning_rate": 9.989973684210526e-05, + "loss": 0.6674, + "step": 20391 + }, + { + "epoch": 1.1418971889349312, + "grad_norm": 1.7190442085266113, + "learning_rate": 9.989947368421054e-05, + "loss": 0.4215, + "step": 20392 + }, + { + "epoch": 1.1419531862470602, + "grad_norm": 1.292906641960144, + "learning_rate": 9.98992105263158e-05, + "loss": 0.4206, + "step": 20393 + }, + { + "epoch": 1.1420091835591892, + "grad_norm": 1.3469399213790894, + "learning_rate": 9.989894736842105e-05, + "loss": 0.3768, + "step": 20394 + }, + { + "epoch": 1.1420651808713183, + "grad_norm": 2.9747273921966553, + "learning_rate": 9.989868421052631e-05, + "loss": 0.4534, + "step": 20395 + }, + { + "epoch": 1.1421211781834473, + "grad_norm": 1.2159113883972168, + "learning_rate": 9.989842105263159e-05, + "loss": 0.4651, + "step": 20396 + }, + { + "epoch": 1.1421771754955763, + "grad_norm": 1.318577527999878, + "learning_rate": 
9.989815789473685e-05, + "loss": 0.412, + "step": 20397 + }, + { + "epoch": 1.1422331728077053, + "grad_norm": 1.2219117879867554, + "learning_rate": 9.989789473684211e-05, + "loss": 0.4052, + "step": 20398 + }, + { + "epoch": 1.1422891701198343, + "grad_norm": 1.5057100057601929, + "learning_rate": 9.989763157894737e-05, + "loss": 0.4254, + "step": 20399 + }, + { + "epoch": 1.1423451674319633, + "grad_norm": 1.3558597564697266, + "learning_rate": 9.989736842105264e-05, + "loss": 0.4651, + "step": 20400 + }, + { + "epoch": 1.1424011647440924, + "grad_norm": 9.762716293334961, + "learning_rate": 9.98971052631579e-05, + "loss": 0.4503, + "step": 20401 + }, + { + "epoch": 1.1424571620562214, + "grad_norm": 1.5793808698654175, + "learning_rate": 9.989684210526316e-05, + "loss": 0.4069, + "step": 20402 + }, + { + "epoch": 1.1425131593683504, + "grad_norm": 1.2100074291229248, + "learning_rate": 9.989657894736842e-05, + "loss": 0.33, + "step": 20403 + }, + { + "epoch": 1.1425691566804794, + "grad_norm": 1.594765543937683, + "learning_rate": 9.989631578947369e-05, + "loss": 0.5182, + "step": 20404 + }, + { + "epoch": 1.1426251539926084, + "grad_norm": 1.6750227212905884, + "learning_rate": 9.989605263157895e-05, + "loss": 0.4615, + "step": 20405 + }, + { + "epoch": 1.1426811513047375, + "grad_norm": 1.3448501825332642, + "learning_rate": 9.989578947368423e-05, + "loss": 0.512, + "step": 20406 + }, + { + "epoch": 1.1427371486168665, + "grad_norm": 1.508351445198059, + "learning_rate": 9.989552631578947e-05, + "loss": 0.5387, + "step": 20407 + }, + { + "epoch": 1.1427931459289955, + "grad_norm": 1.4007755517959595, + "learning_rate": 9.989526315789473e-05, + "loss": 0.4336, + "step": 20408 + }, + { + "epoch": 1.1428491432411245, + "grad_norm": 1.5024051666259766, + "learning_rate": 9.9895e-05, + "loss": 0.3761, + "step": 20409 + }, + { + "epoch": 1.1429051405532535, + "grad_norm": 1.227728009223938, + "learning_rate": 9.989473684210526e-05, + "loss": 0.5032, + "step": 20410 
+ }, + { + "epoch": 1.1429611378653823, + "grad_norm": 1.4767719507217407, + "learning_rate": 9.989447368421054e-05, + "loss": 0.4621, + "step": 20411 + }, + { + "epoch": 1.1430171351775114, + "grad_norm": 1.2244586944580078, + "learning_rate": 9.989421052631578e-05, + "loss": 0.4291, + "step": 20412 + }, + { + "epoch": 1.1430731324896404, + "grad_norm": 1.3493458032608032, + "learning_rate": 9.989394736842106e-05, + "loss": 0.3902, + "step": 20413 + }, + { + "epoch": 1.1431291298017694, + "grad_norm": 1.400938868522644, + "learning_rate": 9.989368421052632e-05, + "loss": 0.421, + "step": 20414 + }, + { + "epoch": 1.1431851271138984, + "grad_norm": 1.3430699110031128, + "learning_rate": 9.989342105263159e-05, + "loss": 0.429, + "step": 20415 + }, + { + "epoch": 1.1432411244260274, + "grad_norm": 1.5283825397491455, + "learning_rate": 9.989315789473685e-05, + "loss": 0.5953, + "step": 20416 + }, + { + "epoch": 1.1432971217381565, + "grad_norm": 1.3737848997116089, + "learning_rate": 9.989289473684211e-05, + "loss": 0.4791, + "step": 20417 + }, + { + "epoch": 1.1433531190502855, + "grad_norm": 1.2857928276062012, + "learning_rate": 9.989263157894737e-05, + "loss": 0.5354, + "step": 20418 + }, + { + "epoch": 1.1434091163624145, + "grad_norm": 1.208808422088623, + "learning_rate": 9.989236842105264e-05, + "loss": 0.4551, + "step": 20419 + }, + { + "epoch": 1.1434651136745435, + "grad_norm": 1.372432827949524, + "learning_rate": 9.98921052631579e-05, + "loss": 0.3604, + "step": 20420 + }, + { + "epoch": 1.1435211109866725, + "grad_norm": 1.3018593788146973, + "learning_rate": 9.989184210526316e-05, + "loss": 0.3871, + "step": 20421 + }, + { + "epoch": 1.1435771082988015, + "grad_norm": 1.326655387878418, + "learning_rate": 9.989157894736842e-05, + "loss": 0.3472, + "step": 20422 + }, + { + "epoch": 1.1436331056109306, + "grad_norm": 1.8529084920883179, + "learning_rate": 9.98913157894737e-05, + "loss": 0.5448, + "step": 20423 + }, + { + "epoch": 1.1436891029230596, + 
"grad_norm": 1.4630001783370972, + "learning_rate": 9.989105263157896e-05, + "loss": 0.4667, + "step": 20424 + }, + { + "epoch": 1.1437451002351886, + "grad_norm": 1.5406907796859741, + "learning_rate": 9.989078947368421e-05, + "loss": 0.4948, + "step": 20425 + }, + { + "epoch": 1.1438010975473176, + "grad_norm": 1.7303768396377563, + "learning_rate": 9.989052631578947e-05, + "loss": 0.4362, + "step": 20426 + }, + { + "epoch": 1.1438570948594466, + "grad_norm": 1.3066902160644531, + "learning_rate": 9.989026315789473e-05, + "loss": 0.4374, + "step": 20427 + }, + { + "epoch": 1.1439130921715757, + "grad_norm": 1.3579692840576172, + "learning_rate": 9.989000000000001e-05, + "loss": 0.4402, + "step": 20428 + }, + { + "epoch": 1.1439690894837047, + "grad_norm": 1.7817784547805786, + "learning_rate": 9.988973684210527e-05, + "loss": 0.5121, + "step": 20429 + }, + { + "epoch": 1.1440250867958337, + "grad_norm": 1.3389939069747925, + "learning_rate": 9.988947368421053e-05, + "loss": 0.4115, + "step": 20430 + }, + { + "epoch": 1.1440810841079627, + "grad_norm": 1.320111632347107, + "learning_rate": 9.988921052631579e-05, + "loss": 0.4343, + "step": 20431 + }, + { + "epoch": 1.1441370814200917, + "grad_norm": 1.242687463760376, + "learning_rate": 9.988894736842106e-05, + "loss": 0.3491, + "step": 20432 + }, + { + "epoch": 1.1441930787322208, + "grad_norm": 1.3836536407470703, + "learning_rate": 9.988868421052632e-05, + "loss": 0.3486, + "step": 20433 + }, + { + "epoch": 1.1442490760443498, + "grad_norm": 1.419258952140808, + "learning_rate": 9.988842105263159e-05, + "loss": 0.4177, + "step": 20434 + }, + { + "epoch": 1.1443050733564788, + "grad_norm": 1.140465259552002, + "learning_rate": 9.988815789473684e-05, + "loss": 0.4119, + "step": 20435 + }, + { + "epoch": 1.1443610706686078, + "grad_norm": 1.4027020931243896, + "learning_rate": 9.988789473684211e-05, + "loss": 0.3921, + "step": 20436 + }, + { + "epoch": 1.1444170679807368, + "grad_norm": 1.3744571208953857, + 
"learning_rate": 9.988763157894737e-05, + "loss": 0.4064, + "step": 20437 + }, + { + "epoch": 1.1444730652928659, + "grad_norm": 1.6248918771743774, + "learning_rate": 9.988736842105265e-05, + "loss": 0.5104, + "step": 20438 + }, + { + "epoch": 1.1445290626049949, + "grad_norm": 1.2133638858795166, + "learning_rate": 9.98871052631579e-05, + "loss": 0.3985, + "step": 20439 + }, + { + "epoch": 1.144585059917124, + "grad_norm": 1.2253282070159912, + "learning_rate": 9.988684210526316e-05, + "loss": 0.4253, + "step": 20440 + }, + { + "epoch": 1.144641057229253, + "grad_norm": 1.3476512432098389, + "learning_rate": 9.988657894736842e-05, + "loss": 0.389, + "step": 20441 + }, + { + "epoch": 1.144697054541382, + "grad_norm": 1.5335700511932373, + "learning_rate": 9.98863157894737e-05, + "loss": 0.3119, + "step": 20442 + }, + { + "epoch": 1.144753051853511, + "grad_norm": 1.3555032014846802, + "learning_rate": 9.988605263157896e-05, + "loss": 0.5063, + "step": 20443 + }, + { + "epoch": 1.14480904916564, + "grad_norm": 1.432375431060791, + "learning_rate": 9.98857894736842e-05, + "loss": 0.5124, + "step": 20444 + }, + { + "epoch": 1.144865046477769, + "grad_norm": 1.4518640041351318, + "learning_rate": 9.988552631578948e-05, + "loss": 0.426, + "step": 20445 + }, + { + "epoch": 1.144921043789898, + "grad_norm": 1.2916922569274902, + "learning_rate": 9.988526315789474e-05, + "loss": 0.4223, + "step": 20446 + }, + { + "epoch": 1.144977041102027, + "grad_norm": 1.2688654661178589, + "learning_rate": 9.988500000000001e-05, + "loss": 0.3681, + "step": 20447 + }, + { + "epoch": 1.145033038414156, + "grad_norm": 2.6093363761901855, + "learning_rate": 9.988473684210527e-05, + "loss": 0.4633, + "step": 20448 + }, + { + "epoch": 1.145089035726285, + "grad_norm": 1.2063078880310059, + "learning_rate": 9.988447368421053e-05, + "loss": 0.3911, + "step": 20449 + }, + { + "epoch": 1.145145033038414, + "grad_norm": 1.247310757637024, + "learning_rate": 9.988421052631579e-05, + "loss": 
0.3752, + "step": 20450 + }, + { + "epoch": 1.145201030350543, + "grad_norm": 1.5782006978988647, + "learning_rate": 9.988394736842106e-05, + "loss": 0.4034, + "step": 20451 + }, + { + "epoch": 1.1452570276626721, + "grad_norm": 1.9347234964370728, + "learning_rate": 9.988368421052632e-05, + "loss": 0.5993, + "step": 20452 + }, + { + "epoch": 1.1453130249748011, + "grad_norm": 1.2036774158477783, + "learning_rate": 9.988342105263158e-05, + "loss": 0.4467, + "step": 20453 + }, + { + "epoch": 1.1453690222869302, + "grad_norm": 1.3685601949691772, + "learning_rate": 9.988315789473684e-05, + "loss": 0.4881, + "step": 20454 + }, + { + "epoch": 1.1454250195990592, + "grad_norm": 2.390435218811035, + "learning_rate": 9.988289473684212e-05, + "loss": 0.4933, + "step": 20455 + }, + { + "epoch": 1.1454810169111882, + "grad_norm": 1.2845882177352905, + "learning_rate": 9.988263157894737e-05, + "loss": 0.4687, + "step": 20456 + }, + { + "epoch": 1.1455370142233172, + "grad_norm": 1.82760751247406, + "learning_rate": 9.988236842105263e-05, + "loss": 0.4768, + "step": 20457 + }, + { + "epoch": 1.1455930115354462, + "grad_norm": 1.1389309167861938, + "learning_rate": 9.98821052631579e-05, + "loss": 0.4183, + "step": 20458 + }, + { + "epoch": 1.1456490088475753, + "grad_norm": 1.4520819187164307, + "learning_rate": 9.988184210526317e-05, + "loss": 0.3848, + "step": 20459 + }, + { + "epoch": 1.1457050061597043, + "grad_norm": 1.26947021484375, + "learning_rate": 9.988157894736843e-05, + "loss": 0.4503, + "step": 20460 + }, + { + "epoch": 1.1457610034718333, + "grad_norm": 1.2558846473693848, + "learning_rate": 9.988131578947369e-05, + "loss": 0.3364, + "step": 20461 + }, + { + "epoch": 1.1458170007839623, + "grad_norm": 1.8046354055404663, + "learning_rate": 9.988105263157895e-05, + "loss": 0.6396, + "step": 20462 + }, + { + "epoch": 1.1458729980960913, + "grad_norm": 1.3201208114624023, + "learning_rate": 9.98807894736842e-05, + "loss": 0.4427, + "step": 20463 + }, + { + "epoch": 
1.1459289954082204, + "grad_norm": 1.6858083009719849, + "learning_rate": 9.988052631578948e-05, + "loss": 0.5229, + "step": 20464 + }, + { + "epoch": 1.1459849927203494, + "grad_norm": 1.1688151359558105, + "learning_rate": 9.988026315789474e-05, + "loss": 0.43, + "step": 20465 + }, + { + "epoch": 1.1460409900324784, + "grad_norm": 1.6450614929199219, + "learning_rate": 9.988000000000001e-05, + "loss": 0.4932, + "step": 20466 + }, + { + "epoch": 1.1460969873446074, + "grad_norm": 1.4939018487930298, + "learning_rate": 9.987973684210526e-05, + "loss": 0.4432, + "step": 20467 + }, + { + "epoch": 1.1461529846567364, + "grad_norm": 1.5850712060928345, + "learning_rate": 9.987947368421053e-05, + "loss": 0.4831, + "step": 20468 + }, + { + "epoch": 1.1462089819688654, + "grad_norm": 1.2346611022949219, + "learning_rate": 9.987921052631579e-05, + "loss": 0.4935, + "step": 20469 + }, + { + "epoch": 1.1462649792809945, + "grad_norm": 1.5039423704147339, + "learning_rate": 9.987894736842107e-05, + "loss": 0.4986, + "step": 20470 + }, + { + "epoch": 1.1463209765931235, + "grad_norm": 1.4204386472702026, + "learning_rate": 9.987868421052632e-05, + "loss": 0.3778, + "step": 20471 + }, + { + "epoch": 1.1463769739052525, + "grad_norm": 1.4285898208618164, + "learning_rate": 9.987842105263158e-05, + "loss": 0.4692, + "step": 20472 + }, + { + "epoch": 1.1464329712173815, + "grad_norm": 1.6953669786453247, + "learning_rate": 9.987815789473684e-05, + "loss": 0.3309, + "step": 20473 + }, + { + "epoch": 1.1464889685295105, + "grad_norm": 1.5123811960220337, + "learning_rate": 9.987789473684212e-05, + "loss": 0.4395, + "step": 20474 + }, + { + "epoch": 1.1465449658416396, + "grad_norm": 1.2125428915023804, + "learning_rate": 9.987763157894738e-05, + "loss": 0.3578, + "step": 20475 + }, + { + "epoch": 1.1466009631537686, + "grad_norm": 1.2554445266723633, + "learning_rate": 9.987736842105264e-05, + "loss": 0.41, + "step": 20476 + }, + { + "epoch": 1.1466569604658976, + "grad_norm": 
1.1073169708251953, + "learning_rate": 9.98771052631579e-05, + "loss": 0.3199, + "step": 20477 + }, + { + "epoch": 1.1467129577780266, + "grad_norm": 1.5877275466918945, + "learning_rate": 9.987684210526316e-05, + "loss": 0.4738, + "step": 20478 + }, + { + "epoch": 1.1467689550901556, + "grad_norm": 1.2053771018981934, + "learning_rate": 9.987657894736843e-05, + "loss": 0.4658, + "step": 20479 + }, + { + "epoch": 1.1468249524022847, + "grad_norm": 1.2012308835983276, + "learning_rate": 9.987631578947369e-05, + "loss": 0.4932, + "step": 20480 + }, + { + "epoch": 1.1468809497144137, + "grad_norm": 1.4577291011810303, + "learning_rate": 9.987605263157895e-05, + "loss": 0.4037, + "step": 20481 + }, + { + "epoch": 1.1469369470265427, + "grad_norm": 1.980048656463623, + "learning_rate": 9.987578947368421e-05, + "loss": 0.5397, + "step": 20482 + }, + { + "epoch": 1.1469929443386717, + "grad_norm": 1.3027416467666626, + "learning_rate": 9.987552631578948e-05, + "loss": 0.4341, + "step": 20483 + }, + { + "epoch": 1.1470489416508007, + "grad_norm": 1.3045016527175903, + "learning_rate": 9.987526315789474e-05, + "loss": 0.4104, + "step": 20484 + }, + { + "epoch": 1.1471049389629298, + "grad_norm": 1.537992000579834, + "learning_rate": 9.9875e-05, + "loss": 0.554, + "step": 20485 + }, + { + "epoch": 1.1471609362750588, + "grad_norm": 1.3715176582336426, + "learning_rate": 9.987473684210526e-05, + "loss": 0.4554, + "step": 20486 + }, + { + "epoch": 1.1472169335871878, + "grad_norm": 1.564469337463379, + "learning_rate": 9.987447368421053e-05, + "loss": 0.4798, + "step": 20487 + }, + { + "epoch": 1.1472729308993168, + "grad_norm": 1.3704088926315308, + "learning_rate": 9.98742105263158e-05, + "loss": 0.4266, + "step": 20488 + }, + { + "epoch": 1.1473289282114458, + "grad_norm": 1.655909538269043, + "learning_rate": 9.987394736842107e-05, + "loss": 0.5037, + "step": 20489 + }, + { + "epoch": 1.1473849255235749, + "grad_norm": 1.3380017280578613, + "learning_rate": 
9.987368421052631e-05, + "loss": 0.5295, + "step": 20490 + }, + { + "epoch": 1.1474409228357039, + "grad_norm": 1.4491338729858398, + "learning_rate": 9.987342105263159e-05, + "loss": 0.3792, + "step": 20491 + }, + { + "epoch": 1.1474969201478329, + "grad_norm": 1.2327592372894287, + "learning_rate": 9.987315789473685e-05, + "loss": 0.3807, + "step": 20492 + }, + { + "epoch": 1.147552917459962, + "grad_norm": 1.2314577102661133, + "learning_rate": 9.987289473684212e-05, + "loss": 0.3485, + "step": 20493 + }, + { + "epoch": 1.147608914772091, + "grad_norm": 1.2795847654342651, + "learning_rate": 9.987263157894738e-05, + "loss": 0.4447, + "step": 20494 + }, + { + "epoch": 1.14766491208422, + "grad_norm": 1.2632372379302979, + "learning_rate": 9.987236842105263e-05, + "loss": 0.4008, + "step": 20495 + }, + { + "epoch": 1.147720909396349, + "grad_norm": 1.2883623838424683, + "learning_rate": 9.98721052631579e-05, + "loss": 0.4505, + "step": 20496 + }, + { + "epoch": 1.147776906708478, + "grad_norm": 1.2738240957260132, + "learning_rate": 9.987184210526316e-05, + "loss": 0.3776, + "step": 20497 + }, + { + "epoch": 1.147832904020607, + "grad_norm": 1.2178534269332886, + "learning_rate": 9.987157894736843e-05, + "loss": 0.3194, + "step": 20498 + }, + { + "epoch": 1.147888901332736, + "grad_norm": 1.4835726022720337, + "learning_rate": 9.987131578947368e-05, + "loss": 0.383, + "step": 20499 + }, + { + "epoch": 1.147944898644865, + "grad_norm": 1.263297438621521, + "learning_rate": 9.987105263157895e-05, + "loss": 0.3836, + "step": 20500 + }, + { + "epoch": 1.148000895956994, + "grad_norm": 1.3925824165344238, + "learning_rate": 9.987078947368421e-05, + "loss": 0.4921, + "step": 20501 + }, + { + "epoch": 1.148056893269123, + "grad_norm": 1.2069382667541504, + "learning_rate": 9.987052631578948e-05, + "loss": 0.4491, + "step": 20502 + }, + { + "epoch": 1.148112890581252, + "grad_norm": 1.632684588432312, + "learning_rate": 9.987026315789474e-05, + "loss": 0.5105, + "step": 
20503 + }, + { + "epoch": 1.1481688878933811, + "grad_norm": 1.347937822341919, + "learning_rate": 9.987e-05, + "loss": 0.4321, + "step": 20504 + }, + { + "epoch": 1.1482248852055101, + "grad_norm": 1.5664825439453125, + "learning_rate": 9.986973684210526e-05, + "loss": 0.4648, + "step": 20505 + }, + { + "epoch": 1.1482808825176392, + "grad_norm": 1.4892882108688354, + "learning_rate": 9.986947368421054e-05, + "loss": 0.4631, + "step": 20506 + }, + { + "epoch": 1.1483368798297682, + "grad_norm": 1.3442851305007935, + "learning_rate": 9.98692105263158e-05, + "loss": 0.5604, + "step": 20507 + }, + { + "epoch": 1.1483928771418972, + "grad_norm": 1.3856630325317383, + "learning_rate": 9.986894736842106e-05, + "loss": 0.3672, + "step": 20508 + }, + { + "epoch": 1.1484488744540262, + "grad_norm": 1.2508807182312012, + "learning_rate": 9.986868421052632e-05, + "loss": 0.4337, + "step": 20509 + }, + { + "epoch": 1.1485048717661552, + "grad_norm": 1.9127460718154907, + "learning_rate": 9.986842105263159e-05, + "loss": 0.4016, + "step": 20510 + }, + { + "epoch": 1.1485608690782843, + "grad_norm": 1.5735092163085938, + "learning_rate": 9.986815789473685e-05, + "loss": 0.5388, + "step": 20511 + }, + { + "epoch": 1.1486168663904133, + "grad_norm": 1.1594423055648804, + "learning_rate": 9.986789473684211e-05, + "loss": 0.3916, + "step": 20512 + }, + { + "epoch": 1.1486728637025423, + "grad_norm": 1.6421160697937012, + "learning_rate": 9.986763157894737e-05, + "loss": 0.5958, + "step": 20513 + }, + { + "epoch": 1.1487288610146713, + "grad_norm": 1.5584092140197754, + "learning_rate": 9.986736842105263e-05, + "loss": 0.4277, + "step": 20514 + }, + { + "epoch": 1.1487848583268003, + "grad_norm": 1.321160078048706, + "learning_rate": 9.98671052631579e-05, + "loss": 0.4582, + "step": 20515 + }, + { + "epoch": 1.1488408556389293, + "grad_norm": 2.9948270320892334, + "learning_rate": 9.986684210526316e-05, + "loss": 0.4916, + "step": 20516 + }, + { + "epoch": 1.1488968529510584, + 
"grad_norm": 1.7630518674850464, + "learning_rate": 9.986657894736842e-05, + "loss": 0.4982, + "step": 20517 + }, + { + "epoch": 1.1489528502631874, + "grad_norm": 1.5841964483261108, + "learning_rate": 9.986631578947368e-05, + "loss": 0.4953, + "step": 20518 + }, + { + "epoch": 1.1490088475753164, + "grad_norm": 1.4199410676956177, + "learning_rate": 9.986605263157895e-05, + "loss": 0.3478, + "step": 20519 + }, + { + "epoch": 1.1490648448874454, + "grad_norm": 1.2151843309402466, + "learning_rate": 9.986578947368421e-05, + "loss": 0.3648, + "step": 20520 + }, + { + "epoch": 1.1491208421995744, + "grad_norm": 1.4104688167572021, + "learning_rate": 9.986552631578949e-05, + "loss": 0.4732, + "step": 20521 + }, + { + "epoch": 1.1491768395117035, + "grad_norm": 1.6134860515594482, + "learning_rate": 9.986526315789473e-05, + "loss": 0.4832, + "step": 20522 + }, + { + "epoch": 1.1492328368238325, + "grad_norm": 1.2508913278579712, + "learning_rate": 9.986500000000001e-05, + "loss": 0.4126, + "step": 20523 + }, + { + "epoch": 1.1492888341359615, + "grad_norm": 7.151427268981934, + "learning_rate": 9.986473684210527e-05, + "loss": 0.5626, + "step": 20524 + }, + { + "epoch": 1.1493448314480905, + "grad_norm": 1.5135068893432617, + "learning_rate": 9.986447368421054e-05, + "loss": 0.5287, + "step": 20525 + }, + { + "epoch": 1.1494008287602195, + "grad_norm": 1.3105688095092773, + "learning_rate": 9.98642105263158e-05, + "loss": 0.453, + "step": 20526 + }, + { + "epoch": 1.1494568260723486, + "grad_norm": 1.422707438468933, + "learning_rate": 9.986394736842106e-05, + "loss": 0.4645, + "step": 20527 + }, + { + "epoch": 1.1495128233844776, + "grad_norm": 1.54299795627594, + "learning_rate": 9.986368421052632e-05, + "loss": 0.497, + "step": 20528 + }, + { + "epoch": 1.1495688206966066, + "grad_norm": 1.470493197441101, + "learning_rate": 9.986342105263158e-05, + "loss": 0.4095, + "step": 20529 + }, + { + "epoch": 1.1496248180087356, + "grad_norm": 1.6525027751922607, + 
"learning_rate": 9.986315789473685e-05, + "loss": 0.4128, + "step": 20530 + }, + { + "epoch": 1.1496808153208646, + "grad_norm": 1.5500452518463135, + "learning_rate": 9.986289473684211e-05, + "loss": 0.3531, + "step": 20531 + }, + { + "epoch": 1.1497368126329937, + "grad_norm": 1.5278432369232178, + "learning_rate": 9.986263157894737e-05, + "loss": 0.6385, + "step": 20532 + }, + { + "epoch": 1.1497928099451227, + "grad_norm": 1.5010216236114502, + "learning_rate": 9.986236842105263e-05, + "loss": 0.4497, + "step": 20533 + }, + { + "epoch": 1.1498488072572517, + "grad_norm": 1.275917649269104, + "learning_rate": 9.98621052631579e-05, + "loss": 0.4798, + "step": 20534 + }, + { + "epoch": 1.1499048045693807, + "grad_norm": 1.313026785850525, + "learning_rate": 9.986184210526316e-05, + "loss": 0.4179, + "step": 20535 + }, + { + "epoch": 1.1499608018815097, + "grad_norm": 1.8314224481582642, + "learning_rate": 9.986157894736842e-05, + "loss": 0.3713, + "step": 20536 + }, + { + "epoch": 1.1500167991936388, + "grad_norm": 1.3292884826660156, + "learning_rate": 9.986131578947368e-05, + "loss": 0.4248, + "step": 20537 + }, + { + "epoch": 1.1500727965057678, + "grad_norm": 1.2726421356201172, + "learning_rate": 9.986105263157896e-05, + "loss": 0.3801, + "step": 20538 + }, + { + "epoch": 1.1501287938178968, + "grad_norm": 1.2855806350708008, + "learning_rate": 9.986078947368422e-05, + "loss": 0.4511, + "step": 20539 + }, + { + "epoch": 1.1501847911300258, + "grad_norm": 1.3955669403076172, + "learning_rate": 9.986052631578948e-05, + "loss": 0.5053, + "step": 20540 + }, + { + "epoch": 1.1502407884421548, + "grad_norm": 1.2548118829727173, + "learning_rate": 9.986026315789474e-05, + "loss": 0.3955, + "step": 20541 + }, + { + "epoch": 1.1502967857542838, + "grad_norm": 1.2880767583847046, + "learning_rate": 9.986000000000001e-05, + "loss": 0.3904, + "step": 20542 + }, + { + "epoch": 1.1503527830664129, + "grad_norm": 1.262155294418335, + "learning_rate": 9.985973684210527e-05, 
+ "loss": 0.4347, + "step": 20543 + }, + { + "epoch": 1.1504087803785419, + "grad_norm": 1.4613949060440063, + "learning_rate": 9.985947368421054e-05, + "loss": 0.47, + "step": 20544 + }, + { + "epoch": 1.150464777690671, + "grad_norm": 1.3043702840805054, + "learning_rate": 9.985921052631579e-05, + "loss": 0.3711, + "step": 20545 + }, + { + "epoch": 1.1505207750028, + "grad_norm": 1.5589770078659058, + "learning_rate": 9.985894736842105e-05, + "loss": 0.3877, + "step": 20546 + }, + { + "epoch": 1.150576772314929, + "grad_norm": 1.381598711013794, + "learning_rate": 9.985868421052632e-05, + "loss": 0.4932, + "step": 20547 + }, + { + "epoch": 1.150632769627058, + "grad_norm": 1.4977377653121948, + "learning_rate": 9.985842105263158e-05, + "loss": 0.5111, + "step": 20548 + }, + { + "epoch": 1.150688766939187, + "grad_norm": 2.3996832370758057, + "learning_rate": 9.985815789473684e-05, + "loss": 0.4615, + "step": 20549 + }, + { + "epoch": 1.150744764251316, + "grad_norm": 1.307060956954956, + "learning_rate": 9.98578947368421e-05, + "loss": 0.5004, + "step": 20550 + }, + { + "epoch": 1.150800761563445, + "grad_norm": 1.3533759117126465, + "learning_rate": 9.985763157894737e-05, + "loss": 0.3433, + "step": 20551 + }, + { + "epoch": 1.150856758875574, + "grad_norm": 1.4080413579940796, + "learning_rate": 9.985736842105263e-05, + "loss": 0.3815, + "step": 20552 + }, + { + "epoch": 1.150912756187703, + "grad_norm": 1.3012210130691528, + "learning_rate": 9.985710526315791e-05, + "loss": 0.529, + "step": 20553 + }, + { + "epoch": 1.150968753499832, + "grad_norm": 1.5659222602844238, + "learning_rate": 9.985684210526315e-05, + "loss": 0.4356, + "step": 20554 + }, + { + "epoch": 1.151024750811961, + "grad_norm": 1.316042423248291, + "learning_rate": 9.985657894736843e-05, + "loss": 0.4283, + "step": 20555 + }, + { + "epoch": 1.1510807481240901, + "grad_norm": 1.5393595695495605, + "learning_rate": 9.985631578947369e-05, + "loss": 0.4821, + "step": 20556 + }, + { + "epoch": 
1.1511367454362191, + "grad_norm": 1.5968881845474243, + "learning_rate": 9.985605263157896e-05, + "loss": 0.5475, + "step": 20557 + }, + { + "epoch": 1.1511927427483482, + "grad_norm": 1.4276247024536133, + "learning_rate": 9.985578947368422e-05, + "loss": 0.4981, + "step": 20558 + }, + { + "epoch": 1.1512487400604772, + "grad_norm": 1.418346881866455, + "learning_rate": 9.985552631578948e-05, + "loss": 0.4105, + "step": 20559 + }, + { + "epoch": 1.1513047373726062, + "grad_norm": 1.4498636722564697, + "learning_rate": 9.985526315789474e-05, + "loss": 0.4128, + "step": 20560 + }, + { + "epoch": 1.1513607346847352, + "grad_norm": 2.0232760906219482, + "learning_rate": 9.985500000000001e-05, + "loss": 0.6095, + "step": 20561 + }, + { + "epoch": 1.1514167319968642, + "grad_norm": 1.7056657075881958, + "learning_rate": 9.985473684210527e-05, + "loss": 0.3965, + "step": 20562 + }, + { + "epoch": 1.1514727293089932, + "grad_norm": 1.3510469198226929, + "learning_rate": 9.985447368421053e-05, + "loss": 0.3811, + "step": 20563 + }, + { + "epoch": 1.1515287266211223, + "grad_norm": 1.7105083465576172, + "learning_rate": 9.985421052631579e-05, + "loss": 0.4305, + "step": 20564 + }, + { + "epoch": 1.1515847239332513, + "grad_norm": 1.1439239978790283, + "learning_rate": 9.985394736842105e-05, + "loss": 0.3628, + "step": 20565 + }, + { + "epoch": 1.1516407212453803, + "grad_norm": 1.3918046951293945, + "learning_rate": 9.985368421052632e-05, + "loss": 0.4256, + "step": 20566 + }, + { + "epoch": 1.1516967185575093, + "grad_norm": 1.1805944442749023, + "learning_rate": 9.985342105263158e-05, + "loss": 0.4436, + "step": 20567 + }, + { + "epoch": 1.1517527158696383, + "grad_norm": 1.303356409072876, + "learning_rate": 9.985315789473684e-05, + "loss": 0.363, + "step": 20568 + }, + { + "epoch": 1.1518087131817674, + "grad_norm": 9.980767250061035, + "learning_rate": 9.98528947368421e-05, + "loss": 0.4461, + "step": 20569 + }, + { + "epoch": 1.1518647104938964, + "grad_norm": 
1.2744040489196777, + "learning_rate": 9.985263157894738e-05, + "loss": 0.4167, + "step": 20570 + }, + { + "epoch": 1.1519207078060254, + "grad_norm": 1.841589331626892, + "learning_rate": 9.985236842105264e-05, + "loss": 0.3982, + "step": 20571 + }, + { + "epoch": 1.1519767051181544, + "grad_norm": 1.4686000347137451, + "learning_rate": 9.98521052631579e-05, + "loss": 0.4736, + "step": 20572 + }, + { + "epoch": 1.1520327024302834, + "grad_norm": 2.5662789344787598, + "learning_rate": 9.985184210526316e-05, + "loss": 0.3899, + "step": 20573 + }, + { + "epoch": 1.1520886997424125, + "grad_norm": 3.3797099590301514, + "learning_rate": 9.985157894736843e-05, + "loss": 0.661, + "step": 20574 + }, + { + "epoch": 1.1521446970545415, + "grad_norm": 1.292691946029663, + "learning_rate": 9.985131578947369e-05, + "loss": 0.5272, + "step": 20575 + }, + { + "epoch": 1.1522006943666705, + "grad_norm": 1.3117496967315674, + "learning_rate": 9.985105263157896e-05, + "loss": 0.42, + "step": 20576 + }, + { + "epoch": 1.1522566916787995, + "grad_norm": 1.5081813335418701, + "learning_rate": 9.985078947368421e-05, + "loss": 0.6265, + "step": 20577 + }, + { + "epoch": 1.1523126889909285, + "grad_norm": 1.4189059734344482, + "learning_rate": 9.985052631578948e-05, + "loss": 0.5678, + "step": 20578 + }, + { + "epoch": 1.1523686863030576, + "grad_norm": 1.3685775995254517, + "learning_rate": 9.985026315789474e-05, + "loss": 0.449, + "step": 20579 + }, + { + "epoch": 1.1524246836151866, + "grad_norm": 1.1283231973648071, + "learning_rate": 9.985000000000001e-05, + "loss": 0.3494, + "step": 20580 + }, + { + "epoch": 1.1524806809273156, + "grad_norm": 1.373646855354309, + "learning_rate": 9.984973684210527e-05, + "loss": 0.5035, + "step": 20581 + }, + { + "epoch": 1.1525366782394446, + "grad_norm": 1.3119111061096191, + "learning_rate": 9.984947368421052e-05, + "loss": 0.4533, + "step": 20582 + }, + { + "epoch": 1.1525926755515736, + "grad_norm": 1.592946171760559, + "learning_rate": 
9.98492105263158e-05, + "loss": 0.4196, + "step": 20583 + }, + { + "epoch": 1.1526486728637027, + "grad_norm": 1.5488710403442383, + "learning_rate": 9.984894736842105e-05, + "loss": 0.5623, + "step": 20584 + }, + { + "epoch": 1.1527046701758317, + "grad_norm": 1.075756549835205, + "learning_rate": 9.984868421052633e-05, + "loss": 0.295, + "step": 20585 + }, + { + "epoch": 1.1527606674879607, + "grad_norm": 1.412971019744873, + "learning_rate": 9.984842105263159e-05, + "loss": 0.3412, + "step": 20586 + }, + { + "epoch": 1.1528166648000897, + "grad_norm": 1.327127456665039, + "learning_rate": 9.984815789473685e-05, + "loss": 0.3978, + "step": 20587 + }, + { + "epoch": 1.1528726621122187, + "grad_norm": 1.4128966331481934, + "learning_rate": 9.98478947368421e-05, + "loss": 0.513, + "step": 20588 + }, + { + "epoch": 1.1529286594243477, + "grad_norm": 1.3311222791671753, + "learning_rate": 9.984763157894738e-05, + "loss": 0.3648, + "step": 20589 + }, + { + "epoch": 1.1529846567364768, + "grad_norm": 1.5850287675857544, + "learning_rate": 9.984736842105264e-05, + "loss": 0.6009, + "step": 20590 + }, + { + "epoch": 1.1530406540486058, + "grad_norm": 1.4272699356079102, + "learning_rate": 9.98471052631579e-05, + "loss": 0.4599, + "step": 20591 + }, + { + "epoch": 1.1530966513607348, + "grad_norm": 1.2315454483032227, + "learning_rate": 9.984684210526316e-05, + "loss": 0.4101, + "step": 20592 + }, + { + "epoch": 1.1531526486728638, + "grad_norm": 1.3531090021133423, + "learning_rate": 9.984657894736843e-05, + "loss": 0.5647, + "step": 20593 + }, + { + "epoch": 1.1532086459849928, + "grad_norm": 1.4263765811920166, + "learning_rate": 9.984631578947369e-05, + "loss": 0.4509, + "step": 20594 + }, + { + "epoch": 1.1532646432971219, + "grad_norm": 1.655713438987732, + "learning_rate": 9.984605263157895e-05, + "loss": 0.4888, + "step": 20595 + }, + { + "epoch": 1.1533206406092509, + "grad_norm": 1.4959297180175781, + "learning_rate": 9.984578947368421e-05, + "loss": 0.4943, + 
"step": 20596 + }, + { + "epoch": 1.1533766379213797, + "grad_norm": 1.14749276638031, + "learning_rate": 9.984552631578948e-05, + "loss": 0.3827, + "step": 20597 + }, + { + "epoch": 1.1534326352335087, + "grad_norm": 1.2836191654205322, + "learning_rate": 9.984526315789474e-05, + "loss": 0.4814, + "step": 20598 + }, + { + "epoch": 1.1534886325456377, + "grad_norm": 1.748325228691101, + "learning_rate": 9.9845e-05, + "loss": 0.4994, + "step": 20599 + }, + { + "epoch": 1.1535446298577667, + "grad_norm": 1.253800392150879, + "learning_rate": 9.984473684210526e-05, + "loss": 0.3932, + "step": 20600 + }, + { + "epoch": 1.1536006271698958, + "grad_norm": 1.2215180397033691, + "learning_rate": 9.984447368421052e-05, + "loss": 0.4599, + "step": 20601 + }, + { + "epoch": 1.1536566244820248, + "grad_norm": 1.1674331426620483, + "learning_rate": 9.98442105263158e-05, + "loss": 0.3978, + "step": 20602 + }, + { + "epoch": 1.1537126217941538, + "grad_norm": 1.3800455331802368, + "learning_rate": 9.984394736842106e-05, + "loss": 0.4135, + "step": 20603 + }, + { + "epoch": 1.1537686191062828, + "grad_norm": 1.2307895421981812, + "learning_rate": 9.984368421052632e-05, + "loss": 0.4272, + "step": 20604 + }, + { + "epoch": 1.1538246164184118, + "grad_norm": 1.4166638851165771, + "learning_rate": 9.984342105263158e-05, + "loss": 0.4664, + "step": 20605 + }, + { + "epoch": 1.1538806137305408, + "grad_norm": 1.3545113801956177, + "learning_rate": 9.984315789473685e-05, + "loss": 0.4026, + "step": 20606 + }, + { + "epoch": 1.1539366110426699, + "grad_norm": 1.3365097045898438, + "learning_rate": 9.984289473684211e-05, + "loss": 0.4322, + "step": 20607 + }, + { + "epoch": 1.1539926083547989, + "grad_norm": 1.1657264232635498, + "learning_rate": 9.984263157894738e-05, + "loss": 0.3561, + "step": 20608 + }, + { + "epoch": 1.154048605666928, + "grad_norm": 1.6107335090637207, + "learning_rate": 9.984236842105263e-05, + "loss": 0.4779, + "step": 20609 + }, + { + "epoch": 1.154104602979057, 
+ "grad_norm": 1.2185475826263428, + "learning_rate": 9.98421052631579e-05, + "loss": 0.4163, + "step": 20610 + }, + { + "epoch": 1.154160600291186, + "grad_norm": 1.1416774988174438, + "learning_rate": 9.984184210526316e-05, + "loss": 0.3986, + "step": 20611 + }, + { + "epoch": 1.154216597603315, + "grad_norm": 1.601789951324463, + "learning_rate": 9.984157894736843e-05, + "loss": 0.5496, + "step": 20612 + }, + { + "epoch": 1.154272594915444, + "grad_norm": 1.754992127418518, + "learning_rate": 9.98413157894737e-05, + "loss": 0.4872, + "step": 20613 + }, + { + "epoch": 1.154328592227573, + "grad_norm": 1.2180742025375366, + "learning_rate": 9.984105263157895e-05, + "loss": 0.5108, + "step": 20614 + }, + { + "epoch": 1.154384589539702, + "grad_norm": 1.204809546470642, + "learning_rate": 9.984078947368421e-05, + "loss": 0.4099, + "step": 20615 + }, + { + "epoch": 1.154440586851831, + "grad_norm": 1.3774548768997192, + "learning_rate": 9.984052631578947e-05, + "loss": 0.5555, + "step": 20616 + }, + { + "epoch": 1.15449658416396, + "grad_norm": 1.548224925994873, + "learning_rate": 9.984026315789475e-05, + "loss": 0.4562, + "step": 20617 + }, + { + "epoch": 1.154552581476089, + "grad_norm": 1.4371669292449951, + "learning_rate": 9.984e-05, + "loss": 0.4748, + "step": 20618 + }, + { + "epoch": 1.154608578788218, + "grad_norm": 1.6385565996170044, + "learning_rate": 9.983973684210527e-05, + "loss": 0.5372, + "step": 20619 + }, + { + "epoch": 1.1546645761003471, + "grad_norm": 1.3957754373550415, + "learning_rate": 9.983947368421053e-05, + "loss": 0.3931, + "step": 20620 + }, + { + "epoch": 1.1547205734124761, + "grad_norm": 1.3508726358413696, + "learning_rate": 9.98392105263158e-05, + "loss": 0.591, + "step": 20621 + }, + { + "epoch": 1.1547765707246052, + "grad_norm": 1.2779326438903809, + "learning_rate": 9.983894736842106e-05, + "loss": 0.4228, + "step": 20622 + }, + { + "epoch": 1.1548325680367342, + "grad_norm": 1.6317874193191528, + "learning_rate": 
9.983868421052632e-05, + "loss": 0.7641, + "step": 20623 + }, + { + "epoch": 1.1548885653488632, + "grad_norm": 1.2334160804748535, + "learning_rate": 9.983842105263158e-05, + "loss": 0.3859, + "step": 20624 + }, + { + "epoch": 1.1549445626609922, + "grad_norm": 1.157307744026184, + "learning_rate": 9.983815789473685e-05, + "loss": 0.3748, + "step": 20625 + }, + { + "epoch": 1.1550005599731212, + "grad_norm": 1.2137049436569214, + "learning_rate": 9.983789473684211e-05, + "loss": 0.3819, + "step": 20626 + }, + { + "epoch": 1.1550565572852503, + "grad_norm": 1.1553492546081543, + "learning_rate": 9.983763157894737e-05, + "loss": 0.3774, + "step": 20627 + }, + { + "epoch": 1.1551125545973793, + "grad_norm": 1.3425161838531494, + "learning_rate": 9.983736842105263e-05, + "loss": 0.359, + "step": 20628 + }, + { + "epoch": 1.1551685519095083, + "grad_norm": 1.3786944150924683, + "learning_rate": 9.98371052631579e-05, + "loss": 0.4054, + "step": 20629 + }, + { + "epoch": 1.1552245492216373, + "grad_norm": 1.623252511024475, + "learning_rate": 9.983684210526316e-05, + "loss": 0.3692, + "step": 20630 + }, + { + "epoch": 1.1552805465337663, + "grad_norm": 1.4085582494735718, + "learning_rate": 9.983657894736844e-05, + "loss": 0.4427, + "step": 20631 + }, + { + "epoch": 1.1553365438458953, + "grad_norm": 1.2663112878799438, + "learning_rate": 9.983631578947368e-05, + "loss": 0.3932, + "step": 20632 + }, + { + "epoch": 1.1553925411580244, + "grad_norm": 1.4995399713516235, + "learning_rate": 9.983605263157894e-05, + "loss": 0.5565, + "step": 20633 + }, + { + "epoch": 1.1554485384701534, + "grad_norm": 1.4257159233093262, + "learning_rate": 9.983578947368422e-05, + "loss": 0.5338, + "step": 20634 + }, + { + "epoch": 1.1555045357822824, + "grad_norm": 1.286906361579895, + "learning_rate": 9.983552631578948e-05, + "loss": 0.4541, + "step": 20635 + }, + { + "epoch": 1.1555605330944114, + "grad_norm": 1.3742042779922485, + "learning_rate": 9.983526315789475e-05, + "loss": 0.4161, 
+ "step": 20636 + }, + { + "epoch": 1.1556165304065404, + "grad_norm": 1.2167344093322754, + "learning_rate": 9.9835e-05, + "loss": 0.3699, + "step": 20637 + }, + { + "epoch": 1.1556725277186695, + "grad_norm": 2.8422434329986572, + "learning_rate": 9.983473684210527e-05, + "loss": 0.4744, + "step": 20638 + }, + { + "epoch": 1.1557285250307985, + "grad_norm": 1.3396075963974, + "learning_rate": 9.983447368421053e-05, + "loss": 0.4409, + "step": 20639 + }, + { + "epoch": 1.1557845223429275, + "grad_norm": 1.3033193349838257, + "learning_rate": 9.98342105263158e-05, + "loss": 0.3278, + "step": 20640 + }, + { + "epoch": 1.1558405196550565, + "grad_norm": 1.7807316780090332, + "learning_rate": 9.983394736842106e-05, + "loss": 0.407, + "step": 20641 + }, + { + "epoch": 1.1558965169671855, + "grad_norm": 1.249908208847046, + "learning_rate": 9.983368421052632e-05, + "loss": 0.386, + "step": 20642 + }, + { + "epoch": 1.1559525142793146, + "grad_norm": 1.384415864944458, + "learning_rate": 9.983342105263158e-05, + "loss": 0.4447, + "step": 20643 + }, + { + "epoch": 1.1560085115914436, + "grad_norm": 1.1966683864593506, + "learning_rate": 9.983315789473685e-05, + "loss": 0.3824, + "step": 20644 + }, + { + "epoch": 1.1560645089035726, + "grad_norm": 1.3831819295883179, + "learning_rate": 9.983289473684211e-05, + "loss": 0.4426, + "step": 20645 + }, + { + "epoch": 1.1561205062157016, + "grad_norm": 1.4994510412216187, + "learning_rate": 9.983263157894737e-05, + "loss": 0.4747, + "step": 20646 + }, + { + "epoch": 1.1561765035278306, + "grad_norm": 1.6412901878356934, + "learning_rate": 9.983236842105263e-05, + "loss": 0.4636, + "step": 20647 + }, + { + "epoch": 1.1562325008399597, + "grad_norm": 1.3096891641616821, + "learning_rate": 9.98321052631579e-05, + "loss": 0.5052, + "step": 20648 + }, + { + "epoch": 1.1562884981520887, + "grad_norm": 1.3484374284744263, + "learning_rate": 9.983184210526317e-05, + "loss": 0.453, + "step": 20649 + }, + { + "epoch": 1.1563444954642177, + 
"grad_norm": 1.302478551864624, + "learning_rate": 9.983157894736843e-05, + "loss": 0.3929, + "step": 20650 + }, + { + "epoch": 1.1564004927763467, + "grad_norm": 1.4896066188812256, + "learning_rate": 9.983131578947369e-05, + "loss": 0.5056, + "step": 20651 + }, + { + "epoch": 1.1564564900884757, + "grad_norm": 1.3241114616394043, + "learning_rate": 9.983105263157895e-05, + "loss": 0.3247, + "step": 20652 + }, + { + "epoch": 1.1565124874006047, + "grad_norm": 1.3620420694351196, + "learning_rate": 9.983078947368422e-05, + "loss": 0.4644, + "step": 20653 + }, + { + "epoch": 1.1565684847127338, + "grad_norm": 1.547397255897522, + "learning_rate": 9.983052631578948e-05, + "loss": 0.346, + "step": 20654 + }, + { + "epoch": 1.1566244820248628, + "grad_norm": 1.4957448244094849, + "learning_rate": 9.983026315789474e-05, + "loss": 0.5126, + "step": 20655 + }, + { + "epoch": 1.1566804793369918, + "grad_norm": 1.281531810760498, + "learning_rate": 9.983e-05, + "loss": 0.3975, + "step": 20656 + }, + { + "epoch": 1.1567364766491208, + "grad_norm": 1.444127082824707, + "learning_rate": 9.982973684210527e-05, + "loss": 0.4188, + "step": 20657 + }, + { + "epoch": 1.1567924739612498, + "grad_norm": 1.3772447109222412, + "learning_rate": 9.982947368421053e-05, + "loss": 0.3654, + "step": 20658 + }, + { + "epoch": 1.1568484712733789, + "grad_norm": 1.1936354637145996, + "learning_rate": 9.982921052631579e-05, + "loss": 0.3566, + "step": 20659 + }, + { + "epoch": 1.1569044685855079, + "grad_norm": 1.5570268630981445, + "learning_rate": 9.982894736842105e-05, + "loss": 0.3684, + "step": 20660 + }, + { + "epoch": 1.156960465897637, + "grad_norm": 1.8814806938171387, + "learning_rate": 9.982868421052632e-05, + "loss": 0.3629, + "step": 20661 + }, + { + "epoch": 1.157016463209766, + "grad_norm": 1.587518334388733, + "learning_rate": 9.982842105263158e-05, + "loss": 0.4118, + "step": 20662 + }, + { + "epoch": 1.157072460521895, + "grad_norm": 1.2009578943252563, + "learning_rate": 
9.982815789473686e-05, + "loss": 0.359, + "step": 20663 + }, + { + "epoch": 1.157128457834024, + "grad_norm": 1.4607332944869995, + "learning_rate": 9.98278947368421e-05, + "loss": 0.4445, + "step": 20664 + }, + { + "epoch": 1.157184455146153, + "grad_norm": 2.0582501888275146, + "learning_rate": 9.982763157894738e-05, + "loss": 0.4486, + "step": 20665 + }, + { + "epoch": 1.157240452458282, + "grad_norm": 1.9901195764541626, + "learning_rate": 9.982736842105264e-05, + "loss": 0.4279, + "step": 20666 + }, + { + "epoch": 1.157296449770411, + "grad_norm": 2.280203342437744, + "learning_rate": 9.98271052631579e-05, + "loss": 0.5867, + "step": 20667 + }, + { + "epoch": 1.15735244708254, + "grad_norm": 1.4749000072479248, + "learning_rate": 9.982684210526317e-05, + "loss": 0.4305, + "step": 20668 + }, + { + "epoch": 1.157408444394669, + "grad_norm": 1.3941112756729126, + "learning_rate": 9.982657894736842e-05, + "loss": 0.5329, + "step": 20669 + }, + { + "epoch": 1.157464441706798, + "grad_norm": 1.2100077867507935, + "learning_rate": 9.982631578947369e-05, + "loss": 0.4537, + "step": 20670 + }, + { + "epoch": 1.157520439018927, + "grad_norm": 1.497073769569397, + "learning_rate": 9.982605263157895e-05, + "loss": 0.4575, + "step": 20671 + }, + { + "epoch": 1.1575764363310561, + "grad_norm": 1.3474293947219849, + "learning_rate": 9.982578947368422e-05, + "loss": 0.4577, + "step": 20672 + }, + { + "epoch": 1.1576324336431851, + "grad_norm": 1.2917795181274414, + "learning_rate": 9.982552631578948e-05, + "loss": 0.3241, + "step": 20673 + }, + { + "epoch": 1.1576884309553142, + "grad_norm": 1.353052020072937, + "learning_rate": 9.982526315789474e-05, + "loss": 0.5881, + "step": 20674 + }, + { + "epoch": 1.1577444282674432, + "grad_norm": 1.6958577632904053, + "learning_rate": 9.9825e-05, + "loss": 0.7388, + "step": 20675 + }, + { + "epoch": 1.1578004255795722, + "grad_norm": 1.5526212453842163, + "learning_rate": 9.982473684210527e-05, + "loss": 0.4998, + "step": 20676 + }, 
+ { + "epoch": 1.1578564228917012, + "grad_norm": 1.227476716041565, + "learning_rate": 9.982447368421053e-05, + "loss": 0.4287, + "step": 20677 + }, + { + "epoch": 1.1579124202038302, + "grad_norm": 1.4508161544799805, + "learning_rate": 9.98242105263158e-05, + "loss": 0.4634, + "step": 20678 + }, + { + "epoch": 1.1579684175159592, + "grad_norm": 1.591030240058899, + "learning_rate": 9.982394736842105e-05, + "loss": 0.5235, + "step": 20679 + }, + { + "epoch": 1.1580244148280883, + "grad_norm": 1.2589104175567627, + "learning_rate": 9.982368421052633e-05, + "loss": 0.4743, + "step": 20680 + }, + { + "epoch": 1.1580804121402173, + "grad_norm": 1.3506782054901123, + "learning_rate": 9.982342105263159e-05, + "loss": 0.4832, + "step": 20681 + }, + { + "epoch": 1.1581364094523463, + "grad_norm": 2.7089169025421143, + "learning_rate": 9.982315789473685e-05, + "loss": 0.3896, + "step": 20682 + }, + { + "epoch": 1.1581924067644753, + "grad_norm": 1.3554126024246216, + "learning_rate": 9.98228947368421e-05, + "loss": 0.4197, + "step": 20683 + }, + { + "epoch": 1.1582484040766043, + "grad_norm": 1.5139764547348022, + "learning_rate": 9.982263157894738e-05, + "loss": 0.4632, + "step": 20684 + }, + { + "epoch": 1.1583044013887334, + "grad_norm": 1.5954136848449707, + "learning_rate": 9.982236842105264e-05, + "loss": 0.5863, + "step": 20685 + }, + { + "epoch": 1.1583603987008624, + "grad_norm": 1.5175155401229858, + "learning_rate": 9.98221052631579e-05, + "loss": 0.3828, + "step": 20686 + }, + { + "epoch": 1.1584163960129914, + "grad_norm": 1.2836077213287354, + "learning_rate": 9.982184210526316e-05, + "loss": 0.4393, + "step": 20687 + }, + { + "epoch": 1.1584723933251204, + "grad_norm": 1.2348936796188354, + "learning_rate": 9.982157894736842e-05, + "loss": 0.4897, + "step": 20688 + }, + { + "epoch": 1.1585283906372494, + "grad_norm": 1.135157823562622, + "learning_rate": 9.982131578947369e-05, + "loss": 0.375, + "step": 20689 + }, + { + "epoch": 1.1585843879493785, + 
"grad_norm": 1.1472995281219482, + "learning_rate": 9.982105263157895e-05, + "loss": 0.4163, + "step": 20690 + }, + { + "epoch": 1.1586403852615075, + "grad_norm": 1.7743520736694336, + "learning_rate": 9.982078947368422e-05, + "loss": 0.5812, + "step": 20691 + }, + { + "epoch": 1.1586963825736365, + "grad_norm": 1.519546627998352, + "learning_rate": 9.982052631578947e-05, + "loss": 0.4854, + "step": 20692 + }, + { + "epoch": 1.1587523798857655, + "grad_norm": 1.6588484048843384, + "learning_rate": 9.982026315789474e-05, + "loss": 0.4428, + "step": 20693 + }, + { + "epoch": 1.1588083771978945, + "grad_norm": 1.4660861492156982, + "learning_rate": 9.982e-05, + "loss": 0.5322, + "step": 20694 + }, + { + "epoch": 1.1588643745100236, + "grad_norm": 1.6412279605865479, + "learning_rate": 9.981973684210528e-05, + "loss": 0.3968, + "step": 20695 + }, + { + "epoch": 1.1589203718221526, + "grad_norm": 1.2905727624893188, + "learning_rate": 9.981947368421054e-05, + "loss": 0.3578, + "step": 20696 + }, + { + "epoch": 1.1589763691342816, + "grad_norm": 1.4095821380615234, + "learning_rate": 9.98192105263158e-05, + "loss": 0.5055, + "step": 20697 + }, + { + "epoch": 1.1590323664464106, + "grad_norm": 1.3190431594848633, + "learning_rate": 9.981894736842106e-05, + "loss": 0.4439, + "step": 20698 + }, + { + "epoch": 1.1590883637585396, + "grad_norm": 1.1977996826171875, + "learning_rate": 9.981868421052633e-05, + "loss": 0.3916, + "step": 20699 + }, + { + "epoch": 1.1591443610706686, + "grad_norm": 1.425007700920105, + "learning_rate": 9.981842105263159e-05, + "loss": 0.3826, + "step": 20700 + }, + { + "epoch": 1.1592003583827977, + "grad_norm": 1.4595378637313843, + "learning_rate": 9.981815789473685e-05, + "loss": 0.4243, + "step": 20701 + }, + { + "epoch": 1.1592563556949267, + "grad_norm": 1.5491729974746704, + "learning_rate": 9.981789473684211e-05, + "loss": 0.5941, + "step": 20702 + }, + { + "epoch": 1.1593123530070557, + "grad_norm": 1.4100964069366455, + "learning_rate": 
9.981763157894737e-05, + "loss": 0.5043, + "step": 20703 + }, + { + "epoch": 1.1593683503191847, + "grad_norm": 1.2770140171051025, + "learning_rate": 9.981736842105264e-05, + "loss": 0.4207, + "step": 20704 + }, + { + "epoch": 1.1594243476313137, + "grad_norm": 2.4358294010162354, + "learning_rate": 9.98171052631579e-05, + "loss": 0.3678, + "step": 20705 + }, + { + "epoch": 1.1594803449434428, + "grad_norm": 1.441752314567566, + "learning_rate": 9.981684210526316e-05, + "loss": 0.5353, + "step": 20706 + }, + { + "epoch": 1.1595363422555718, + "grad_norm": 1.4326294660568237, + "learning_rate": 9.981657894736842e-05, + "loss": 0.4606, + "step": 20707 + }, + { + "epoch": 1.1595923395677008, + "grad_norm": 1.4000003337860107, + "learning_rate": 9.98163157894737e-05, + "loss": 0.442, + "step": 20708 + }, + { + "epoch": 1.1596483368798298, + "grad_norm": 1.3965154886245728, + "learning_rate": 9.981605263157895e-05, + "loss": 0.4879, + "step": 20709 + }, + { + "epoch": 1.1597043341919588, + "grad_norm": 1.3141099214553833, + "learning_rate": 9.981578947368421e-05, + "loss": 0.3911, + "step": 20710 + }, + { + "epoch": 1.1597603315040879, + "grad_norm": 1.27518630027771, + "learning_rate": 9.981552631578947e-05, + "loss": 0.3508, + "step": 20711 + }, + { + "epoch": 1.1598163288162169, + "grad_norm": 1.3997254371643066, + "learning_rate": 9.981526315789475e-05, + "loss": 0.3492, + "step": 20712 + }, + { + "epoch": 1.159872326128346, + "grad_norm": 1.1966441869735718, + "learning_rate": 9.9815e-05, + "loss": 0.4952, + "step": 20713 + }, + { + "epoch": 1.159928323440475, + "grad_norm": 1.563307762145996, + "learning_rate": 9.981473684210527e-05, + "loss": 0.4078, + "step": 20714 + }, + { + "epoch": 1.159984320752604, + "grad_norm": 1.7178142070770264, + "learning_rate": 9.981447368421053e-05, + "loss": 0.4521, + "step": 20715 + }, + { + "epoch": 1.160040318064733, + "grad_norm": 1.086503028869629, + "learning_rate": 9.98142105263158e-05, + "loss": 0.3867, + "step": 20716 + 
}, + { + "epoch": 1.160096315376862, + "grad_norm": 1.123307466506958, + "learning_rate": 9.981394736842106e-05, + "loss": 0.346, + "step": 20717 + }, + { + "epoch": 1.160152312688991, + "grad_norm": 1.4775654077529907, + "learning_rate": 9.981368421052633e-05, + "loss": 0.5772, + "step": 20718 + }, + { + "epoch": 1.16020831000112, + "grad_norm": 1.3776569366455078, + "learning_rate": 9.981342105263158e-05, + "loss": 0.441, + "step": 20719 + }, + { + "epoch": 1.160264307313249, + "grad_norm": 1.5174689292907715, + "learning_rate": 9.981315789473684e-05, + "loss": 0.4709, + "step": 20720 + }, + { + "epoch": 1.160320304625378, + "grad_norm": 1.0853722095489502, + "learning_rate": 9.981289473684211e-05, + "loss": 0.4545, + "step": 20721 + }, + { + "epoch": 1.160376301937507, + "grad_norm": 1.148170828819275, + "learning_rate": 9.981263157894737e-05, + "loss": 0.368, + "step": 20722 + }, + { + "epoch": 1.160432299249636, + "grad_norm": 1.2531620264053345, + "learning_rate": 9.981236842105264e-05, + "loss": 0.395, + "step": 20723 + }, + { + "epoch": 1.160488296561765, + "grad_norm": 1.188633680343628, + "learning_rate": 9.981210526315789e-05, + "loss": 0.3, + "step": 20724 + }, + { + "epoch": 1.1605442938738941, + "grad_norm": 1.3251160383224487, + "learning_rate": 9.981184210526316e-05, + "loss": 0.4416, + "step": 20725 + }, + { + "epoch": 1.1606002911860231, + "grad_norm": 1.6037322282791138, + "learning_rate": 9.981157894736842e-05, + "loss": 0.4821, + "step": 20726 + }, + { + "epoch": 1.1606562884981522, + "grad_norm": 1.3819395303726196, + "learning_rate": 9.98113157894737e-05, + "loss": 0.4847, + "step": 20727 + }, + { + "epoch": 1.1607122858102812, + "grad_norm": 1.2603572607040405, + "learning_rate": 9.981105263157896e-05, + "loss": 0.401, + "step": 20728 + }, + { + "epoch": 1.1607682831224102, + "grad_norm": 1.2100732326507568, + "learning_rate": 9.981078947368422e-05, + "loss": 0.5267, + "step": 20729 + }, + { + "epoch": 1.1608242804345392, + "grad_norm": 
2.2614517211914062, + "learning_rate": 9.981052631578948e-05, + "loss": 0.4493, + "step": 20730 + }, + { + "epoch": 1.1608802777466682, + "grad_norm": 1.3757879734039307, + "learning_rate": 9.981026315789475e-05, + "loss": 0.4935, + "step": 20731 + }, + { + "epoch": 1.1609362750587973, + "grad_norm": 1.8969056606292725, + "learning_rate": 9.981000000000001e-05, + "loss": 0.3687, + "step": 20732 + }, + { + "epoch": 1.1609922723709263, + "grad_norm": 1.234013319015503, + "learning_rate": 9.980973684210527e-05, + "loss": 0.3627, + "step": 20733 + }, + { + "epoch": 1.1610482696830553, + "grad_norm": 1.4058195352554321, + "learning_rate": 9.980947368421053e-05, + "loss": 0.4655, + "step": 20734 + }, + { + "epoch": 1.1611042669951843, + "grad_norm": 1.357609748840332, + "learning_rate": 9.98092105263158e-05, + "loss": 0.4541, + "step": 20735 + }, + { + "epoch": 1.1611602643073133, + "grad_norm": 1.443774938583374, + "learning_rate": 9.980894736842106e-05, + "loss": 0.4308, + "step": 20736 + }, + { + "epoch": 1.1612162616194424, + "grad_norm": 1.4773916006088257, + "learning_rate": 9.980868421052632e-05, + "loss": 0.4524, + "step": 20737 + }, + { + "epoch": 1.1612722589315714, + "grad_norm": 1.1345545053482056, + "learning_rate": 9.980842105263158e-05, + "loss": 0.489, + "step": 20738 + }, + { + "epoch": 1.1613282562437004, + "grad_norm": 1.2463173866271973, + "learning_rate": 9.980815789473684e-05, + "loss": 0.3472, + "step": 20739 + }, + { + "epoch": 1.1613842535558294, + "grad_norm": 1.7290700674057007, + "learning_rate": 9.980789473684211e-05, + "loss": 0.424, + "step": 20740 + }, + { + "epoch": 1.1614402508679584, + "grad_norm": 1.5474966764450073, + "learning_rate": 9.980763157894737e-05, + "loss": 0.415, + "step": 20741 + }, + { + "epoch": 1.1614962481800872, + "grad_norm": 1.5076913833618164, + "learning_rate": 9.980736842105263e-05, + "loss": 0.4521, + "step": 20742 + }, + { + "epoch": 1.1615522454922163, + "grad_norm": 1.579359531402588, + "learning_rate": 
9.980710526315789e-05, + "loss": 0.5414, + "step": 20743 + }, + { + "epoch": 1.1616082428043453, + "grad_norm": 1.6798981428146362, + "learning_rate": 9.980684210526317e-05, + "loss": 0.3569, + "step": 20744 + }, + { + "epoch": 1.1616642401164743, + "grad_norm": 1.8302743434906006, + "learning_rate": 9.980657894736843e-05, + "loss": 0.4196, + "step": 20745 + }, + { + "epoch": 1.1617202374286033, + "grad_norm": 1.3661141395568848, + "learning_rate": 9.98063157894737e-05, + "loss": 0.4603, + "step": 20746 + }, + { + "epoch": 1.1617762347407323, + "grad_norm": 1.3627281188964844, + "learning_rate": 9.980605263157894e-05, + "loss": 0.3453, + "step": 20747 + }, + { + "epoch": 1.1618322320528613, + "grad_norm": 1.2853434085845947, + "learning_rate": 9.980578947368422e-05, + "loss": 0.3704, + "step": 20748 + }, + { + "epoch": 1.1618882293649904, + "grad_norm": 1.2826396226882935, + "learning_rate": 9.980552631578948e-05, + "loss": 0.4098, + "step": 20749 + }, + { + "epoch": 1.1619442266771194, + "grad_norm": 1.340430736541748, + "learning_rate": 9.980526315789475e-05, + "loss": 0.5137, + "step": 20750 + }, + { + "epoch": 1.1620002239892484, + "grad_norm": 1.456680417060852, + "learning_rate": 9.9805e-05, + "loss": 0.4012, + "step": 20751 + }, + { + "epoch": 1.1620562213013774, + "grad_norm": 1.3966798782348633, + "learning_rate": 9.980473684210527e-05, + "loss": 0.4541, + "step": 20752 + }, + { + "epoch": 1.1621122186135064, + "grad_norm": 1.2240643501281738, + "learning_rate": 9.980447368421053e-05, + "loss": 0.51, + "step": 20753 + }, + { + "epoch": 1.1621682159256355, + "grad_norm": 1.1132729053497314, + "learning_rate": 9.980421052631579e-05, + "loss": 0.3773, + "step": 20754 + }, + { + "epoch": 1.1622242132377645, + "grad_norm": 1.4580998420715332, + "learning_rate": 9.980394736842106e-05, + "loss": 0.4872, + "step": 20755 + }, + { + "epoch": 1.1622802105498935, + "grad_norm": 1.3250617980957031, + "learning_rate": 9.980368421052631e-05, + "loss": 0.3912, + "step": 
20756 + }, + { + "epoch": 1.1623362078620225, + "grad_norm": 1.5409741401672363, + "learning_rate": 9.980342105263158e-05, + "loss": 0.3819, + "step": 20757 + }, + { + "epoch": 1.1623922051741515, + "grad_norm": 1.1932024955749512, + "learning_rate": 9.980315789473684e-05, + "loss": 0.4859, + "step": 20758 + }, + { + "epoch": 1.1624482024862806, + "grad_norm": 1.4459530115127563, + "learning_rate": 9.980289473684212e-05, + "loss": 0.4208, + "step": 20759 + }, + { + "epoch": 1.1625041997984096, + "grad_norm": 1.2691309452056885, + "learning_rate": 9.980263157894738e-05, + "loss": 0.4219, + "step": 20760 + }, + { + "epoch": 1.1625601971105386, + "grad_norm": 1.4836251735687256, + "learning_rate": 9.980236842105264e-05, + "loss": 0.3515, + "step": 20761 + }, + { + "epoch": 1.1626161944226676, + "grad_norm": 1.3650662899017334, + "learning_rate": 9.98021052631579e-05, + "loss": 0.4355, + "step": 20762 + }, + { + "epoch": 1.1626721917347966, + "grad_norm": 1.3967193365097046, + "learning_rate": 9.980184210526317e-05, + "loss": 0.4403, + "step": 20763 + }, + { + "epoch": 1.1627281890469257, + "grad_norm": 1.680420160293579, + "learning_rate": 9.980157894736843e-05, + "loss": 0.6033, + "step": 20764 + }, + { + "epoch": 1.1627841863590547, + "grad_norm": 1.2812414169311523, + "learning_rate": 9.980131578947369e-05, + "loss": 0.3003, + "step": 20765 + }, + { + "epoch": 1.1628401836711837, + "grad_norm": 1.4538753032684326, + "learning_rate": 9.980105263157895e-05, + "loss": 0.4283, + "step": 20766 + }, + { + "epoch": 1.1628961809833127, + "grad_norm": 1.4691946506500244, + "learning_rate": 9.980078947368422e-05, + "loss": 0.4283, + "step": 20767 + }, + { + "epoch": 1.1629521782954417, + "grad_norm": 1.609779953956604, + "learning_rate": 9.980052631578948e-05, + "loss": 0.4358, + "step": 20768 + }, + { + "epoch": 1.1630081756075707, + "grad_norm": 1.4415971040725708, + "learning_rate": 9.980026315789474e-05, + "loss": 0.5102, + "step": 20769 + }, + { + "epoch": 
1.1630641729196998, + "grad_norm": 1.41593337059021, + "learning_rate": 9.98e-05, + "loss": 0.4327, + "step": 20770 + }, + { + "epoch": 1.1631201702318288, + "grad_norm": 5.720890045166016, + "learning_rate": 9.979973684210526e-05, + "loss": 0.5614, + "step": 20771 + }, + { + "epoch": 1.1631761675439578, + "grad_norm": 1.302211880683899, + "learning_rate": 9.979947368421053e-05, + "loss": 0.4625, + "step": 20772 + }, + { + "epoch": 1.1632321648560868, + "grad_norm": 1.50058114528656, + "learning_rate": 9.979921052631579e-05, + "loss": 0.4799, + "step": 20773 + }, + { + "epoch": 1.1632881621682158, + "grad_norm": 1.2870792150497437, + "learning_rate": 9.979894736842105e-05, + "loss": 0.4244, + "step": 20774 + }, + { + "epoch": 1.1633441594803449, + "grad_norm": 1.3122467994689941, + "learning_rate": 9.979868421052631e-05, + "loss": 0.3458, + "step": 20775 + }, + { + "epoch": 1.1634001567924739, + "grad_norm": 2.0894978046417236, + "learning_rate": 9.979842105263159e-05, + "loss": 0.5191, + "step": 20776 + }, + { + "epoch": 1.163456154104603, + "grad_norm": 1.4296149015426636, + "learning_rate": 9.979815789473685e-05, + "loss": 0.4258, + "step": 20777 + }, + { + "epoch": 1.163512151416732, + "grad_norm": 1.4373489618301392, + "learning_rate": 9.979789473684212e-05, + "loss": 0.4261, + "step": 20778 + }, + { + "epoch": 1.163568148728861, + "grad_norm": 1.5073524713516235, + "learning_rate": 9.979763157894736e-05, + "loss": 0.4356, + "step": 20779 + }, + { + "epoch": 1.16362414604099, + "grad_norm": 1.6112574338912964, + "learning_rate": 9.979736842105264e-05, + "loss": 0.606, + "step": 20780 + }, + { + "epoch": 1.163680143353119, + "grad_norm": 1.4365694522857666, + "learning_rate": 9.97971052631579e-05, + "loss": 0.454, + "step": 20781 + }, + { + "epoch": 1.163736140665248, + "grad_norm": 1.2466692924499512, + "learning_rate": 9.979684210526317e-05, + "loss": 0.3782, + "step": 20782 + }, + { + "epoch": 1.163792137977377, + "grad_norm": 1.4135067462921143, + 
"learning_rate": 9.979657894736843e-05, + "loss": 0.5012, + "step": 20783 + }, + { + "epoch": 1.163848135289506, + "grad_norm": 1.2118281126022339, + "learning_rate": 9.979631578947369e-05, + "loss": 0.4247, + "step": 20784 + }, + { + "epoch": 1.163904132601635, + "grad_norm": 1.41605544090271, + "learning_rate": 9.979605263157895e-05, + "loss": 0.4478, + "step": 20785 + }, + { + "epoch": 1.163960129913764, + "grad_norm": 1.5420175790786743, + "learning_rate": 9.979578947368422e-05, + "loss": 0.4679, + "step": 20786 + }, + { + "epoch": 1.164016127225893, + "grad_norm": 1.2664000988006592, + "learning_rate": 9.979552631578948e-05, + "loss": 0.4349, + "step": 20787 + }, + { + "epoch": 1.1640721245380221, + "grad_norm": 1.2067443132400513, + "learning_rate": 9.979526315789474e-05, + "loss": 0.4742, + "step": 20788 + }, + { + "epoch": 1.1641281218501511, + "grad_norm": 1.4331861734390259, + "learning_rate": 9.9795e-05, + "loss": 0.4219, + "step": 20789 + }, + { + "epoch": 1.1641841191622802, + "grad_norm": 2.042576789855957, + "learning_rate": 9.979473684210526e-05, + "loss": 0.4347, + "step": 20790 + }, + { + "epoch": 1.1642401164744092, + "grad_norm": 1.2423555850982666, + "learning_rate": 9.979447368421054e-05, + "loss": 0.4861, + "step": 20791 + }, + { + "epoch": 1.1642961137865382, + "grad_norm": 1.7242858409881592, + "learning_rate": 9.97942105263158e-05, + "loss": 0.6151, + "step": 20792 + }, + { + "epoch": 1.1643521110986672, + "grad_norm": 1.4775879383087158, + "learning_rate": 9.979394736842105e-05, + "loss": 0.4475, + "step": 20793 + }, + { + "epoch": 1.1644081084107962, + "grad_norm": 1.215835452079773, + "learning_rate": 9.979368421052631e-05, + "loss": 0.4152, + "step": 20794 + }, + { + "epoch": 1.1644641057229252, + "grad_norm": 1.7902518510818481, + "learning_rate": 9.979342105263159e-05, + "loss": 0.5051, + "step": 20795 + }, + { + "epoch": 1.1645201030350543, + "grad_norm": 1.3555628061294556, + "learning_rate": 9.979315789473685e-05, + "loss": 
0.4597, + "step": 20796 + }, + { + "epoch": 1.1645761003471833, + "grad_norm": 1.3614206314086914, + "learning_rate": 9.979289473684211e-05, + "loss": 0.4464, + "step": 20797 + }, + { + "epoch": 1.1646320976593123, + "grad_norm": 1.5785999298095703, + "learning_rate": 9.979263157894737e-05, + "loss": 0.494, + "step": 20798 + }, + { + "epoch": 1.1646880949714413, + "grad_norm": 1.6326556205749512, + "learning_rate": 9.979236842105264e-05, + "loss": 0.5103, + "step": 20799 + }, + { + "epoch": 1.1647440922835703, + "grad_norm": 1.3406559228897095, + "learning_rate": 9.97921052631579e-05, + "loss": 0.4389, + "step": 20800 + }, + { + "epoch": 1.1648000895956994, + "grad_norm": 1.7493377923965454, + "learning_rate": 9.979184210526317e-05, + "loss": 0.5527, + "step": 20801 + }, + { + "epoch": 1.1648560869078284, + "grad_norm": 1.2731801271438599, + "learning_rate": 9.979157894736842e-05, + "loss": 0.4271, + "step": 20802 + }, + { + "epoch": 1.1649120842199574, + "grad_norm": 1.9941129684448242, + "learning_rate": 9.979131578947369e-05, + "loss": 0.4659, + "step": 20803 + }, + { + "epoch": 1.1649680815320864, + "grad_norm": 1.557101845741272, + "learning_rate": 9.979105263157895e-05, + "loss": 0.5432, + "step": 20804 + }, + { + "epoch": 1.1650240788442154, + "grad_norm": 1.1992847919464111, + "learning_rate": 9.979078947368423e-05, + "loss": 0.4119, + "step": 20805 + }, + { + "epoch": 1.1650800761563445, + "grad_norm": 1.6048921346664429, + "learning_rate": 9.979052631578947e-05, + "loss": 0.5382, + "step": 20806 + }, + { + "epoch": 1.1651360734684735, + "grad_norm": 1.4330283403396606, + "learning_rate": 9.979026315789473e-05, + "loss": 0.498, + "step": 20807 + }, + { + "epoch": 1.1651920707806025, + "grad_norm": 1.2458750009536743, + "learning_rate": 9.979e-05, + "loss": 0.3971, + "step": 20808 + }, + { + "epoch": 1.1652480680927315, + "grad_norm": 1.2125712633132935, + "learning_rate": 9.978973684210526e-05, + "loss": 0.3569, + "step": 20809 + }, + { + "epoch": 
1.1653040654048605, + "grad_norm": 1.4834809303283691, + "learning_rate": 9.978947368421054e-05, + "loss": 0.3777, + "step": 20810 + }, + { + "epoch": 1.1653600627169896, + "grad_norm": 1.3419028520584106, + "learning_rate": 9.978921052631578e-05, + "loss": 0.4946, + "step": 20811 + }, + { + "epoch": 1.1654160600291186, + "grad_norm": 1.4450846910476685, + "learning_rate": 9.978894736842106e-05, + "loss": 0.4897, + "step": 20812 + }, + { + "epoch": 1.1654720573412476, + "grad_norm": 1.1625169515609741, + "learning_rate": 9.978868421052632e-05, + "loss": 0.3192, + "step": 20813 + }, + { + "epoch": 1.1655280546533766, + "grad_norm": 1.422334909439087, + "learning_rate": 9.978842105263159e-05, + "loss": 0.413, + "step": 20814 + }, + { + "epoch": 1.1655840519655056, + "grad_norm": 1.3449368476867676, + "learning_rate": 9.978815789473685e-05, + "loss": 0.477, + "step": 20815 + }, + { + "epoch": 1.1656400492776346, + "grad_norm": 1.451908826828003, + "learning_rate": 9.978789473684211e-05, + "loss": 0.4372, + "step": 20816 + }, + { + "epoch": 1.1656960465897637, + "grad_norm": 1.3651716709136963, + "learning_rate": 9.978763157894737e-05, + "loss": 0.4729, + "step": 20817 + }, + { + "epoch": 1.1657520439018927, + "grad_norm": 1.708986759185791, + "learning_rate": 9.978736842105264e-05, + "loss": 0.4606, + "step": 20818 + }, + { + "epoch": 1.1658080412140217, + "grad_norm": 1.6787852048873901, + "learning_rate": 9.97871052631579e-05, + "loss": 0.496, + "step": 20819 + }, + { + "epoch": 1.1658640385261507, + "grad_norm": 1.1393404006958008, + "learning_rate": 9.978684210526316e-05, + "loss": 0.3712, + "step": 20820 + }, + { + "epoch": 1.1659200358382797, + "grad_norm": 1.606389045715332, + "learning_rate": 9.978657894736842e-05, + "loss": 0.468, + "step": 20821 + }, + { + "epoch": 1.1659760331504088, + "grad_norm": 1.4294898509979248, + "learning_rate": 9.97863157894737e-05, + "loss": 0.4755, + "step": 20822 + }, + { + "epoch": 1.1660320304625378, + "grad_norm": 
1.9825870990753174, + "learning_rate": 9.978605263157896e-05, + "loss": 0.4481, + "step": 20823 + }, + { + "epoch": 1.1660880277746668, + "grad_norm": 1.1417534351348877, + "learning_rate": 9.978578947368421e-05, + "loss": 0.3835, + "step": 20824 + }, + { + "epoch": 1.1661440250867958, + "grad_norm": 1.6894428730010986, + "learning_rate": 9.978552631578947e-05, + "loss": 0.6405, + "step": 20825 + }, + { + "epoch": 1.1662000223989248, + "grad_norm": 1.5436031818389893, + "learning_rate": 9.978526315789473e-05, + "loss": 0.5195, + "step": 20826 + }, + { + "epoch": 1.1662560197110539, + "grad_norm": 1.8321113586425781, + "learning_rate": 9.978500000000001e-05, + "loss": 0.4712, + "step": 20827 + }, + { + "epoch": 1.1663120170231829, + "grad_norm": 1.4048885107040405, + "learning_rate": 9.978473684210527e-05, + "loss": 0.4554, + "step": 20828 + }, + { + "epoch": 1.166368014335312, + "grad_norm": 1.2561533451080322, + "learning_rate": 9.978447368421053e-05, + "loss": 0.4446, + "step": 20829 + }, + { + "epoch": 1.166424011647441, + "grad_norm": 1.5617958307266235, + "learning_rate": 9.978421052631579e-05, + "loss": 0.4572, + "step": 20830 + }, + { + "epoch": 1.16648000895957, + "grad_norm": 2.4134650230407715, + "learning_rate": 9.978394736842106e-05, + "loss": 0.3244, + "step": 20831 + }, + { + "epoch": 1.166536006271699, + "grad_norm": 2.2136783599853516, + "learning_rate": 9.978368421052632e-05, + "loss": 0.5679, + "step": 20832 + }, + { + "epoch": 1.166592003583828, + "grad_norm": 1.5566372871398926, + "learning_rate": 9.978342105263159e-05, + "loss": 0.3798, + "step": 20833 + }, + { + "epoch": 1.166648000895957, + "grad_norm": 1.5471076965332031, + "learning_rate": 9.978315789473684e-05, + "loss": 0.2983, + "step": 20834 + }, + { + "epoch": 1.166703998208086, + "grad_norm": 1.1122899055480957, + "learning_rate": 9.978289473684211e-05, + "loss": 0.3883, + "step": 20835 + }, + { + "epoch": 1.166759995520215, + "grad_norm": 1.3770607709884644, + "learning_rate": 
9.978263157894737e-05, + "loss": 0.414, + "step": 20836 + }, + { + "epoch": 1.166815992832344, + "grad_norm": 1.6381101608276367, + "learning_rate": 9.978236842105265e-05, + "loss": 0.4451, + "step": 20837 + }, + { + "epoch": 1.166871990144473, + "grad_norm": 1.3374508619308472, + "learning_rate": 9.97821052631579e-05, + "loss": 0.6068, + "step": 20838 + }, + { + "epoch": 1.166927987456602, + "grad_norm": 1.1922324895858765, + "learning_rate": 9.978184210526317e-05, + "loss": 0.4348, + "step": 20839 + }, + { + "epoch": 1.166983984768731, + "grad_norm": 1.3648905754089355, + "learning_rate": 9.978157894736842e-05, + "loss": 0.5869, + "step": 20840 + }, + { + "epoch": 1.1670399820808601, + "grad_norm": 1.3370119333267212, + "learning_rate": 9.978131578947368e-05, + "loss": 0.3223, + "step": 20841 + }, + { + "epoch": 1.1670959793929891, + "grad_norm": 1.3336783647537231, + "learning_rate": 9.978105263157896e-05, + "loss": 0.3605, + "step": 20842 + }, + { + "epoch": 1.1671519767051182, + "grad_norm": 1.1952753067016602, + "learning_rate": 9.978078947368422e-05, + "loss": 0.409, + "step": 20843 + }, + { + "epoch": 1.1672079740172472, + "grad_norm": 1.3847600221633911, + "learning_rate": 9.978052631578948e-05, + "loss": 0.473, + "step": 20844 + }, + { + "epoch": 1.1672639713293762, + "grad_norm": 1.280964970588684, + "learning_rate": 9.978026315789474e-05, + "loss": 0.4478, + "step": 20845 + }, + { + "epoch": 1.1673199686415052, + "grad_norm": 1.550473928451538, + "learning_rate": 9.978000000000001e-05, + "loss": 0.3973, + "step": 20846 + }, + { + "epoch": 1.1673759659536342, + "grad_norm": 1.2537569999694824, + "learning_rate": 9.977973684210527e-05, + "loss": 0.4855, + "step": 20847 + }, + { + "epoch": 1.1674319632657633, + "grad_norm": 1.225132703781128, + "learning_rate": 9.977947368421053e-05, + "loss": 0.3958, + "step": 20848 + }, + { + "epoch": 1.1674879605778923, + "grad_norm": 1.2652547359466553, + "learning_rate": 9.977921052631579e-05, + "loss": 0.4369, + 
"step": 20849 + }, + { + "epoch": 1.1675439578900213, + "grad_norm": 1.440535306930542, + "learning_rate": 9.977894736842106e-05, + "loss": 0.5253, + "step": 20850 + }, + { + "epoch": 1.1675999552021503, + "grad_norm": 1.3133693933486938, + "learning_rate": 9.977868421052632e-05, + "loss": 0.3711, + "step": 20851 + }, + { + "epoch": 1.1676559525142793, + "grad_norm": 1.2892177104949951, + "learning_rate": 9.977842105263158e-05, + "loss": 0.4003, + "step": 20852 + }, + { + "epoch": 1.1677119498264084, + "grad_norm": 1.1579002141952515, + "learning_rate": 9.977815789473684e-05, + "loss": 0.4041, + "step": 20853 + }, + { + "epoch": 1.1677679471385374, + "grad_norm": 1.5492885112762451, + "learning_rate": 9.977789473684212e-05, + "loss": 0.4028, + "step": 20854 + }, + { + "epoch": 1.1678239444506664, + "grad_norm": 1.4084981679916382, + "learning_rate": 9.977763157894737e-05, + "loss": 0.5682, + "step": 20855 + }, + { + "epoch": 1.1678799417627954, + "grad_norm": 1.1969753503799438, + "learning_rate": 9.977736842105265e-05, + "loss": 0.4436, + "step": 20856 + }, + { + "epoch": 1.1679359390749244, + "grad_norm": 1.3001867532730103, + "learning_rate": 9.97771052631579e-05, + "loss": 0.3737, + "step": 20857 + }, + { + "epoch": 1.1679919363870535, + "grad_norm": 1.2896686792373657, + "learning_rate": 9.977684210526315e-05, + "loss": 0.3692, + "step": 20858 + }, + { + "epoch": 1.1680479336991825, + "grad_norm": 1.4139043092727661, + "learning_rate": 9.977657894736843e-05, + "loss": 0.4735, + "step": 20859 + }, + { + "epoch": 1.1681039310113115, + "grad_norm": 1.5470494031906128, + "learning_rate": 9.977631578947369e-05, + "loss": 0.5146, + "step": 20860 + }, + { + "epoch": 1.1681599283234405, + "grad_norm": 1.162338376045227, + "learning_rate": 9.977605263157895e-05, + "loss": 0.3699, + "step": 20861 + }, + { + "epoch": 1.1682159256355695, + "grad_norm": 1.4471659660339355, + "learning_rate": 9.97757894736842e-05, + "loss": 0.571, + "step": 20862 + }, + { + "epoch": 
1.1682719229476985, + "grad_norm": 1.1782408952713013, + "learning_rate": 9.977552631578948e-05, + "loss": 0.4531, + "step": 20863 + }, + { + "epoch": 1.1683279202598276, + "grad_norm": 1.2059881687164307, + "learning_rate": 9.977526315789474e-05, + "loss": 0.4108, + "step": 20864 + }, + { + "epoch": 1.1683839175719566, + "grad_norm": 1.3328951597213745, + "learning_rate": 9.977500000000001e-05, + "loss": 0.5086, + "step": 20865 + }, + { + "epoch": 1.1684399148840856, + "grad_norm": 1.190344214439392, + "learning_rate": 9.977473684210526e-05, + "loss": 0.4355, + "step": 20866 + }, + { + "epoch": 1.1684959121962146, + "grad_norm": 2.1711649894714355, + "learning_rate": 9.977447368421053e-05, + "loss": 0.404, + "step": 20867 + }, + { + "epoch": 1.1685519095083436, + "grad_norm": 1.2574708461761475, + "learning_rate": 9.977421052631579e-05, + "loss": 0.3654, + "step": 20868 + }, + { + "epoch": 1.1686079068204727, + "grad_norm": 1.353613018989563, + "learning_rate": 9.977394736842107e-05, + "loss": 0.396, + "step": 20869 + }, + { + "epoch": 1.1686639041326017, + "grad_norm": 1.3925970792770386, + "learning_rate": 9.977368421052633e-05, + "loss": 0.5186, + "step": 20870 + }, + { + "epoch": 1.1687199014447307, + "grad_norm": 1.6180205345153809, + "learning_rate": 9.977342105263158e-05, + "loss": 0.4202, + "step": 20871 + }, + { + "epoch": 1.1687758987568597, + "grad_norm": 1.2593497037887573, + "learning_rate": 9.977315789473684e-05, + "loss": 0.4175, + "step": 20872 + }, + { + "epoch": 1.1688318960689887, + "grad_norm": 1.4117871522903442, + "learning_rate": 9.977289473684212e-05, + "loss": 0.3986, + "step": 20873 + }, + { + "epoch": 1.1688878933811178, + "grad_norm": 1.1125867366790771, + "learning_rate": 9.977263157894738e-05, + "loss": 0.296, + "step": 20874 + }, + { + "epoch": 1.1689438906932468, + "grad_norm": 1.1166563034057617, + "learning_rate": 9.977236842105264e-05, + "loss": 0.3429, + "step": 20875 + }, + { + "epoch": 1.1689998880053758, + "grad_norm": 
1.1332788467407227, + "learning_rate": 9.97721052631579e-05, + "loss": 0.3374, + "step": 20876 + }, + { + "epoch": 1.1690558853175048, + "grad_norm": 1.4194594621658325, + "learning_rate": 9.977184210526316e-05, + "loss": 0.3917, + "step": 20877 + }, + { + "epoch": 1.1691118826296338, + "grad_norm": 1.3464772701263428, + "learning_rate": 9.977157894736843e-05, + "loss": 0.5524, + "step": 20878 + }, + { + "epoch": 1.1691678799417629, + "grad_norm": 1.4248733520507812, + "learning_rate": 9.977131578947369e-05, + "loss": 0.6119, + "step": 20879 + }, + { + "epoch": 1.1692238772538919, + "grad_norm": 1.4053601026535034, + "learning_rate": 9.977105263157895e-05, + "loss": 0.4178, + "step": 20880 + }, + { + "epoch": 1.169279874566021, + "grad_norm": 1.308698058128357, + "learning_rate": 9.977078947368421e-05, + "loss": 0.4344, + "step": 20881 + }, + { + "epoch": 1.16933587187815, + "grad_norm": 1.3528685569763184, + "learning_rate": 9.977052631578948e-05, + "loss": 0.5386, + "step": 20882 + }, + { + "epoch": 1.169391869190279, + "grad_norm": 1.090651273727417, + "learning_rate": 9.977026315789474e-05, + "loss": 0.4617, + "step": 20883 + }, + { + "epoch": 1.169447866502408, + "grad_norm": 2.5601022243499756, + "learning_rate": 9.977e-05, + "loss": 0.4759, + "step": 20884 + }, + { + "epoch": 1.169503863814537, + "grad_norm": 1.2338849306106567, + "learning_rate": 9.976973684210526e-05, + "loss": 0.3925, + "step": 20885 + }, + { + "epoch": 1.169559861126666, + "grad_norm": 1.5448415279388428, + "learning_rate": 9.976947368421053e-05, + "loss": 0.6681, + "step": 20886 + }, + { + "epoch": 1.169615858438795, + "grad_norm": 1.7976655960083008, + "learning_rate": 9.97692105263158e-05, + "loss": 0.4577, + "step": 20887 + }, + { + "epoch": 1.169671855750924, + "grad_norm": 1.1426849365234375, + "learning_rate": 9.976894736842107e-05, + "loss": 0.4016, + "step": 20888 + }, + { + "epoch": 1.169727853063053, + "grad_norm": 1.3843919038772583, + "learning_rate": 9.976868421052631e-05, 
+ "loss": 0.5273, + "step": 20889 + }, + { + "epoch": 1.169783850375182, + "grad_norm": 1.341097116470337, + "learning_rate": 9.976842105263159e-05, + "loss": 0.4748, + "step": 20890 + }, + { + "epoch": 1.169839847687311, + "grad_norm": 1.3938333988189697, + "learning_rate": 9.976815789473685e-05, + "loss": 0.586, + "step": 20891 + }, + { + "epoch": 1.16989584499944, + "grad_norm": 1.2222189903259277, + "learning_rate": 9.976789473684211e-05, + "loss": 0.4304, + "step": 20892 + }, + { + "epoch": 1.1699518423115691, + "grad_norm": 1.2287477254867554, + "learning_rate": 9.976763157894738e-05, + "loss": 0.3932, + "step": 20893 + }, + { + "epoch": 1.1700078396236981, + "grad_norm": 1.4622381925582886, + "learning_rate": 9.976736842105263e-05, + "loss": 0.4012, + "step": 20894 + }, + { + "epoch": 1.1700638369358272, + "grad_norm": 1.5046672821044922, + "learning_rate": 9.97671052631579e-05, + "loss": 0.5457, + "step": 20895 + }, + { + "epoch": 1.1701198342479562, + "grad_norm": 1.8150392770767212, + "learning_rate": 9.976684210526316e-05, + "loss": 0.6839, + "step": 20896 + }, + { + "epoch": 1.1701758315600852, + "grad_norm": 1.2193825244903564, + "learning_rate": 9.976657894736843e-05, + "loss": 0.4728, + "step": 20897 + }, + { + "epoch": 1.1702318288722142, + "grad_norm": 1.1769912242889404, + "learning_rate": 9.976631578947368e-05, + "loss": 0.3759, + "step": 20898 + }, + { + "epoch": 1.1702878261843432, + "grad_norm": 1.4072461128234863, + "learning_rate": 9.976605263157895e-05, + "loss": 0.3223, + "step": 20899 + }, + { + "epoch": 1.1703438234964723, + "grad_norm": 1.457567572593689, + "learning_rate": 9.976578947368421e-05, + "loss": 0.5848, + "step": 20900 + }, + { + "epoch": 1.1703998208086013, + "grad_norm": 1.1242045164108276, + "learning_rate": 9.976552631578949e-05, + "loss": 0.3526, + "step": 20901 + }, + { + "epoch": 1.1704558181207303, + "grad_norm": 5.152798175811768, + "learning_rate": 9.976526315789474e-05, + "loss": 0.5271, + "step": 20902 + }, + { + 
"epoch": 1.1705118154328593, + "grad_norm": 1.4303909540176392, + "learning_rate": 9.9765e-05, + "loss": 0.4677, + "step": 20903 + }, + { + "epoch": 1.1705678127449883, + "grad_norm": 1.189496397972107, + "learning_rate": 9.976473684210526e-05, + "loss": 0.3779, + "step": 20904 + }, + { + "epoch": 1.1706238100571174, + "grad_norm": 1.820021629333496, + "learning_rate": 9.976447368421054e-05, + "loss": 0.5416, + "step": 20905 + }, + { + "epoch": 1.1706798073692464, + "grad_norm": 2.0237650871276855, + "learning_rate": 9.97642105263158e-05, + "loss": 0.6019, + "step": 20906 + }, + { + "epoch": 1.1707358046813754, + "grad_norm": 1.4961506128311157, + "learning_rate": 9.976394736842106e-05, + "loss": 0.4167, + "step": 20907 + }, + { + "epoch": 1.1707918019935044, + "grad_norm": 1.4235213994979858, + "learning_rate": 9.976368421052632e-05, + "loss": 0.4684, + "step": 20908 + }, + { + "epoch": 1.1708477993056334, + "grad_norm": 1.7053412199020386, + "learning_rate": 9.976342105263158e-05, + "loss": 0.404, + "step": 20909 + }, + { + "epoch": 1.1709037966177624, + "grad_norm": 1.4406343698501587, + "learning_rate": 9.976315789473685e-05, + "loss": 0.4105, + "step": 20910 + }, + { + "epoch": 1.1709597939298915, + "grad_norm": 1.739396572113037, + "learning_rate": 9.976289473684211e-05, + "loss": 0.551, + "step": 20911 + }, + { + "epoch": 1.1710157912420205, + "grad_norm": 1.2890076637268066, + "learning_rate": 9.976263157894737e-05, + "loss": 0.4523, + "step": 20912 + }, + { + "epoch": 1.1710717885541495, + "grad_norm": 1.667888879776001, + "learning_rate": 9.976236842105263e-05, + "loss": 0.3999, + "step": 20913 + }, + { + "epoch": 1.1711277858662785, + "grad_norm": 1.4856075048446655, + "learning_rate": 9.97621052631579e-05, + "loss": 0.5666, + "step": 20914 + }, + { + "epoch": 1.1711837831784075, + "grad_norm": 1.2786290645599365, + "learning_rate": 9.976184210526316e-05, + "loss": 0.4664, + "step": 20915 + }, + { + "epoch": 1.1712397804905366, + "grad_norm": 
1.2476434707641602, + "learning_rate": 9.976157894736842e-05, + "loss": 0.4253, + "step": 20916 + }, + { + "epoch": 1.1712957778026656, + "grad_norm": 1.4082708358764648, + "learning_rate": 9.976131578947368e-05, + "loss": 0.4569, + "step": 20917 + }, + { + "epoch": 1.1713517751147946, + "grad_norm": 4.578914642333984, + "learning_rate": 9.976105263157895e-05, + "loss": 0.3723, + "step": 20918 + }, + { + "epoch": 1.1714077724269236, + "grad_norm": 1.360572338104248, + "learning_rate": 9.976078947368421e-05, + "loss": 0.4925, + "step": 20919 + }, + { + "epoch": 1.1714637697390526, + "grad_norm": 1.3907374143600464, + "learning_rate": 9.976052631578949e-05, + "loss": 0.5032, + "step": 20920 + }, + { + "epoch": 1.1715197670511817, + "grad_norm": 1.2744126319885254, + "learning_rate": 9.976026315789473e-05, + "loss": 0.3994, + "step": 20921 + }, + { + "epoch": 1.1715757643633107, + "grad_norm": 1.6579738855361938, + "learning_rate": 9.976000000000001e-05, + "loss": 0.5932, + "step": 20922 + }, + { + "epoch": 1.1716317616754397, + "grad_norm": 1.3585381507873535, + "learning_rate": 9.975973684210527e-05, + "loss": 0.4995, + "step": 20923 + }, + { + "epoch": 1.1716877589875687, + "grad_norm": 1.6113086938858032, + "learning_rate": 9.975947368421054e-05, + "loss": 0.3571, + "step": 20924 + }, + { + "epoch": 1.1717437562996977, + "grad_norm": 1.8635375499725342, + "learning_rate": 9.97592105263158e-05, + "loss": 0.6094, + "step": 20925 + }, + { + "epoch": 1.1717997536118268, + "grad_norm": 1.3696166276931763, + "learning_rate": 9.975894736842106e-05, + "loss": 0.5199, + "step": 20926 + }, + { + "epoch": 1.1718557509239558, + "grad_norm": 1.7059210538864136, + "learning_rate": 9.975868421052632e-05, + "loss": 0.6103, + "step": 20927 + }, + { + "epoch": 1.1719117482360846, + "grad_norm": 1.4262621402740479, + "learning_rate": 9.975842105263158e-05, + "loss": 0.5573, + "step": 20928 + }, + { + "epoch": 1.1719677455482136, + "grad_norm": 1.6460988521575928, + "learning_rate": 
9.975815789473685e-05, + "loss": 0.6694, + "step": 20929 + }, + { + "epoch": 1.1720237428603426, + "grad_norm": 1.3685556650161743, + "learning_rate": 9.975789473684211e-05, + "loss": 0.4285, + "step": 20930 + }, + { + "epoch": 1.1720797401724716, + "grad_norm": 1.4430515766143799, + "learning_rate": 9.975763157894737e-05, + "loss": 0.4029, + "step": 20931 + }, + { + "epoch": 1.1721357374846006, + "grad_norm": 1.3129043579101562, + "learning_rate": 9.975736842105263e-05, + "loss": 0.4033, + "step": 20932 + }, + { + "epoch": 1.1721917347967297, + "grad_norm": 1.3137214183807373, + "learning_rate": 9.97571052631579e-05, + "loss": 0.3447, + "step": 20933 + }, + { + "epoch": 1.1722477321088587, + "grad_norm": 1.2377527952194214, + "learning_rate": 9.975684210526316e-05, + "loss": 0.4837, + "step": 20934 + }, + { + "epoch": 1.1723037294209877, + "grad_norm": 1.4682475328445435, + "learning_rate": 9.975657894736842e-05, + "loss": 0.3254, + "step": 20935 + }, + { + "epoch": 1.1723597267331167, + "grad_norm": 1.4442527294158936, + "learning_rate": 9.975631578947368e-05, + "loss": 0.5485, + "step": 20936 + }, + { + "epoch": 1.1724157240452457, + "grad_norm": 1.2747081518173218, + "learning_rate": 9.975605263157896e-05, + "loss": 0.3679, + "step": 20937 + }, + { + "epoch": 1.1724717213573748, + "grad_norm": 1.3651938438415527, + "learning_rate": 9.975578947368422e-05, + "loss": 0.5902, + "step": 20938 + }, + { + "epoch": 1.1725277186695038, + "grad_norm": 1.3734140396118164, + "learning_rate": 9.975552631578948e-05, + "loss": 0.4437, + "step": 20939 + }, + { + "epoch": 1.1725837159816328, + "grad_norm": 3.9022696018218994, + "learning_rate": 9.975526315789474e-05, + "loss": 0.4237, + "step": 20940 + }, + { + "epoch": 1.1726397132937618, + "grad_norm": 1.5584402084350586, + "learning_rate": 9.975500000000001e-05, + "loss": 0.496, + "step": 20941 + }, + { + "epoch": 1.1726957106058908, + "grad_norm": 1.225854516029358, + "learning_rate": 9.975473684210527e-05, + "loss": 
0.4197, + "step": 20942 + }, + { + "epoch": 1.1727517079180199, + "grad_norm": 1.3228957653045654, + "learning_rate": 9.975447368421054e-05, + "loss": 0.4086, + "step": 20943 + }, + { + "epoch": 1.1728077052301489, + "grad_norm": 1.4241491556167603, + "learning_rate": 9.975421052631579e-05, + "loss": 0.5034, + "step": 20944 + }, + { + "epoch": 1.172863702542278, + "grad_norm": 1.351749300956726, + "learning_rate": 9.975394736842105e-05, + "loss": 0.4078, + "step": 20945 + }, + { + "epoch": 1.172919699854407, + "grad_norm": 1.266481637954712, + "learning_rate": 9.975368421052632e-05, + "loss": 0.4345, + "step": 20946 + }, + { + "epoch": 1.172975697166536, + "grad_norm": 1.4176220893859863, + "learning_rate": 9.975342105263158e-05, + "loss": 0.5739, + "step": 20947 + }, + { + "epoch": 1.173031694478665, + "grad_norm": 1.3308656215667725, + "learning_rate": 9.975315789473685e-05, + "loss": 0.4177, + "step": 20948 + }, + { + "epoch": 1.173087691790794, + "grad_norm": 1.5689510107040405, + "learning_rate": 9.97528947368421e-05, + "loss": 0.49, + "step": 20949 + }, + { + "epoch": 1.173143689102923, + "grad_norm": 1.4418233633041382, + "learning_rate": 9.975263157894737e-05, + "loss": 0.4703, + "step": 20950 + }, + { + "epoch": 1.173199686415052, + "grad_norm": 1.4436392784118652, + "learning_rate": 9.975236842105263e-05, + "loss": 0.4242, + "step": 20951 + }, + { + "epoch": 1.173255683727181, + "grad_norm": 1.423810362815857, + "learning_rate": 9.975210526315791e-05, + "loss": 0.3812, + "step": 20952 + }, + { + "epoch": 1.17331168103931, + "grad_norm": 1.2401375770568848, + "learning_rate": 9.975184210526315e-05, + "loss": 0.4388, + "step": 20953 + }, + { + "epoch": 1.173367678351439, + "grad_norm": 1.3293055295944214, + "learning_rate": 9.975157894736843e-05, + "loss": 0.4964, + "step": 20954 + }, + { + "epoch": 1.173423675663568, + "grad_norm": 1.3621879816055298, + "learning_rate": 9.975131578947369e-05, + "loss": 0.4066, + "step": 20955 + }, + { + "epoch": 
1.173479672975697, + "grad_norm": 1.1478829383850098, + "learning_rate": 9.975105263157896e-05, + "loss": 0.3469, + "step": 20956 + }, + { + "epoch": 1.1735356702878261, + "grad_norm": 1.63117253780365, + "learning_rate": 9.975078947368422e-05, + "loss": 0.6342, + "step": 20957 + }, + { + "epoch": 1.1735916675999551, + "grad_norm": 1.3149467706680298, + "learning_rate": 9.975052631578948e-05, + "loss": 0.3948, + "step": 20958 + }, + { + "epoch": 1.1736476649120842, + "grad_norm": 1.46638822555542, + "learning_rate": 9.975026315789474e-05, + "loss": 0.4828, + "step": 20959 + }, + { + "epoch": 1.1737036622242132, + "grad_norm": 1.3919765949249268, + "learning_rate": 9.975000000000001e-05, + "loss": 0.4549, + "step": 20960 + }, + { + "epoch": 1.1737596595363422, + "grad_norm": 1.3568085432052612, + "learning_rate": 9.974973684210527e-05, + "loss": 0.4909, + "step": 20961 + }, + { + "epoch": 1.1738156568484712, + "grad_norm": 1.5416512489318848, + "learning_rate": 9.974947368421053e-05, + "loss": 0.6284, + "step": 20962 + }, + { + "epoch": 1.1738716541606002, + "grad_norm": 1.1301103830337524, + "learning_rate": 9.974921052631579e-05, + "loss": 0.4017, + "step": 20963 + }, + { + "epoch": 1.1739276514727293, + "grad_norm": 1.399775505065918, + "learning_rate": 9.974894736842105e-05, + "loss": 0.49, + "step": 20964 + }, + { + "epoch": 1.1739836487848583, + "grad_norm": 1.6052110195159912, + "learning_rate": 9.974868421052632e-05, + "loss": 0.5099, + "step": 20965 + }, + { + "epoch": 1.1740396460969873, + "grad_norm": 1.214698314666748, + "learning_rate": 9.974842105263158e-05, + "loss": 0.4041, + "step": 20966 + }, + { + "epoch": 1.1740956434091163, + "grad_norm": 1.6586273908615112, + "learning_rate": 9.974815789473684e-05, + "loss": 0.4254, + "step": 20967 + }, + { + "epoch": 1.1741516407212453, + "grad_norm": 1.307340383529663, + "learning_rate": 9.97478947368421e-05, + "loss": 0.4385, + "step": 20968 + }, + { + "epoch": 1.1742076380333744, + "grad_norm": 
1.3171969652175903, + "learning_rate": 9.974763157894738e-05, + "loss": 0.5378, + "step": 20969 + }, + { + "epoch": 1.1742636353455034, + "grad_norm": 1.87435781955719, + "learning_rate": 9.974736842105264e-05, + "loss": 0.5037, + "step": 20970 + }, + { + "epoch": 1.1743196326576324, + "grad_norm": 1.3954530954360962, + "learning_rate": 9.97471052631579e-05, + "loss": 0.5401, + "step": 20971 + }, + { + "epoch": 1.1743756299697614, + "grad_norm": 1.3167064189910889, + "learning_rate": 9.974684210526316e-05, + "loss": 0.4602, + "step": 20972 + }, + { + "epoch": 1.1744316272818904, + "grad_norm": 1.5326517820358276, + "learning_rate": 9.974657894736843e-05, + "loss": 0.4996, + "step": 20973 + }, + { + "epoch": 1.1744876245940195, + "grad_norm": 1.471760869026184, + "learning_rate": 9.974631578947369e-05, + "loss": 0.3486, + "step": 20974 + }, + { + "epoch": 1.1745436219061485, + "grad_norm": 1.2557377815246582, + "learning_rate": 9.974605263157896e-05, + "loss": 0.4018, + "step": 20975 + }, + { + "epoch": 1.1745996192182775, + "grad_norm": 1.3838822841644287, + "learning_rate": 9.974578947368421e-05, + "loss": 0.4234, + "step": 20976 + }, + { + "epoch": 1.1746556165304065, + "grad_norm": 1.2633038759231567, + "learning_rate": 9.974552631578948e-05, + "loss": 0.4, + "step": 20977 + }, + { + "epoch": 1.1747116138425355, + "grad_norm": 1.2640166282653809, + "learning_rate": 9.974526315789474e-05, + "loss": 0.3977, + "step": 20978 + }, + { + "epoch": 1.1747676111546645, + "grad_norm": 1.1059231758117676, + "learning_rate": 9.9745e-05, + "loss": 0.3712, + "step": 20979 + }, + { + "epoch": 1.1748236084667936, + "grad_norm": 1.2637633085250854, + "learning_rate": 9.974473684210527e-05, + "loss": 0.5684, + "step": 20980 + }, + { + "epoch": 1.1748796057789226, + "grad_norm": 1.5723506212234497, + "learning_rate": 9.974447368421052e-05, + "loss": 0.4166, + "step": 20981 + }, + { + "epoch": 1.1749356030910516, + "grad_norm": 1.3939402103424072, + "learning_rate": 
9.97442105263158e-05, + "loss": 0.4191, + "step": 20982 + }, + { + "epoch": 1.1749916004031806, + "grad_norm": 1.2307891845703125, + "learning_rate": 9.974394736842105e-05, + "loss": 0.3587, + "step": 20983 + }, + { + "epoch": 1.1750475977153096, + "grad_norm": 1.5806177854537964, + "learning_rate": 9.974368421052633e-05, + "loss": 0.5389, + "step": 20984 + }, + { + "epoch": 1.1751035950274387, + "grad_norm": 1.2895103693008423, + "learning_rate": 9.974342105263159e-05, + "loss": 0.469, + "step": 20985 + }, + { + "epoch": 1.1751595923395677, + "grad_norm": 1.590126395225525, + "learning_rate": 9.974315789473685e-05, + "loss": 0.5257, + "step": 20986 + }, + { + "epoch": 1.1752155896516967, + "grad_norm": 1.3139359951019287, + "learning_rate": 9.97428947368421e-05, + "loss": 0.4209, + "step": 20987 + }, + { + "epoch": 1.1752715869638257, + "grad_norm": 1.2274155616760254, + "learning_rate": 9.974263157894738e-05, + "loss": 0.4931, + "step": 20988 + }, + { + "epoch": 1.1753275842759547, + "grad_norm": 1.2153725624084473, + "learning_rate": 9.974236842105264e-05, + "loss": 0.5214, + "step": 20989 + }, + { + "epoch": 1.1753835815880838, + "grad_norm": 1.3513826131820679, + "learning_rate": 9.97421052631579e-05, + "loss": 0.4725, + "step": 20990 + }, + { + "epoch": 1.1754395789002128, + "grad_norm": 1.201668381690979, + "learning_rate": 9.974184210526316e-05, + "loss": 0.5036, + "step": 20991 + }, + { + "epoch": 1.1754955762123418, + "grad_norm": 1.4883041381835938, + "learning_rate": 9.974157894736843e-05, + "loss": 0.4364, + "step": 20992 + }, + { + "epoch": 1.1755515735244708, + "grad_norm": 1.4872490167617798, + "learning_rate": 9.974131578947369e-05, + "loss": 0.446, + "step": 20993 + }, + { + "epoch": 1.1756075708365998, + "grad_norm": 1.2378628253936768, + "learning_rate": 9.974105263157895e-05, + "loss": 0.4013, + "step": 20994 + }, + { + "epoch": 1.1756635681487289, + "grad_norm": 1.235908031463623, + "learning_rate": 9.974078947368421e-05, + "loss": 0.3845, + 
"step": 20995 + }, + { + "epoch": 1.1757195654608579, + "grad_norm": 1.1609922647476196, + "learning_rate": 9.974052631578947e-05, + "loss": 0.5331, + "step": 20996 + }, + { + "epoch": 1.175775562772987, + "grad_norm": 1.4406769275665283, + "learning_rate": 9.974026315789474e-05, + "loss": 0.4935, + "step": 20997 + }, + { + "epoch": 1.175831560085116, + "grad_norm": 1.3535735607147217, + "learning_rate": 9.974e-05, + "loss": 0.3864, + "step": 20998 + }, + { + "epoch": 1.175887557397245, + "grad_norm": 1.3318134546279907, + "learning_rate": 9.973973684210526e-05, + "loss": 0.4113, + "step": 20999 + }, + { + "epoch": 1.175943554709374, + "grad_norm": 1.5921144485473633, + "learning_rate": 9.973947368421052e-05, + "loss": 0.3982, + "step": 21000 + }, + { + "epoch": 1.175999552021503, + "grad_norm": 1.7537912130355835, + "learning_rate": 9.97392105263158e-05, + "loss": 0.4266, + "step": 21001 + }, + { + "epoch": 1.176055549333632, + "grad_norm": 1.2887009382247925, + "learning_rate": 9.973894736842106e-05, + "loss": 0.4111, + "step": 21002 + }, + { + "epoch": 1.176111546645761, + "grad_norm": 1.4059815406799316, + "learning_rate": 9.973868421052633e-05, + "loss": 0.3887, + "step": 21003 + }, + { + "epoch": 1.17616754395789, + "grad_norm": 1.3824046850204468, + "learning_rate": 9.973842105263158e-05, + "loss": 0.4825, + "step": 21004 + }, + { + "epoch": 1.176223541270019, + "grad_norm": 2.390695571899414, + "learning_rate": 9.973815789473685e-05, + "loss": 0.5852, + "step": 21005 + }, + { + "epoch": 1.176279538582148, + "grad_norm": 1.2309800386428833, + "learning_rate": 9.973789473684211e-05, + "loss": 0.3685, + "step": 21006 + }, + { + "epoch": 1.176335535894277, + "grad_norm": 1.358850121498108, + "learning_rate": 9.973763157894738e-05, + "loss": 0.3702, + "step": 21007 + }, + { + "epoch": 1.176391533206406, + "grad_norm": 2.115166187286377, + "learning_rate": 9.973736842105263e-05, + "loss": 0.5503, + "step": 21008 + }, + { + "epoch": 1.1764475305185351, + 
"grad_norm": 1.2767928838729858, + "learning_rate": 9.97371052631579e-05, + "loss": 0.4729, + "step": 21009 + }, + { + "epoch": 1.1765035278306641, + "grad_norm": 1.5280072689056396, + "learning_rate": 9.973684210526316e-05, + "loss": 0.3253, + "step": 21010 + }, + { + "epoch": 1.1765595251427932, + "grad_norm": 1.5275877714157104, + "learning_rate": 9.973657894736843e-05, + "loss": 0.4214, + "step": 21011 + }, + { + "epoch": 1.1766155224549222, + "grad_norm": 1.173607587814331, + "learning_rate": 9.97363157894737e-05, + "loss": 0.3094, + "step": 21012 + }, + { + "epoch": 1.1766715197670512, + "grad_norm": 1.2253994941711426, + "learning_rate": 9.973605263157894e-05, + "loss": 0.4247, + "step": 21013 + }, + { + "epoch": 1.1767275170791802, + "grad_norm": 1.1790021657943726, + "learning_rate": 9.973578947368421e-05, + "loss": 0.353, + "step": 21014 + }, + { + "epoch": 1.1767835143913092, + "grad_norm": 1.2789949178695679, + "learning_rate": 9.973552631578947e-05, + "loss": 0.3371, + "step": 21015 + }, + { + "epoch": 1.1768395117034383, + "grad_norm": 1.2025184631347656, + "learning_rate": 9.973526315789475e-05, + "loss": 0.474, + "step": 21016 + }, + { + "epoch": 1.1768955090155673, + "grad_norm": 1.3859963417053223, + "learning_rate": 9.9735e-05, + "loss": 0.5285, + "step": 21017 + }, + { + "epoch": 1.1769515063276963, + "grad_norm": 1.4734511375427246, + "learning_rate": 9.973473684210527e-05, + "loss": 0.4577, + "step": 21018 + }, + { + "epoch": 1.1770075036398253, + "grad_norm": 1.6155064105987549, + "learning_rate": 9.973447368421053e-05, + "loss": 0.4629, + "step": 21019 + }, + { + "epoch": 1.1770635009519543, + "grad_norm": 1.3375678062438965, + "learning_rate": 9.97342105263158e-05, + "loss": 0.3894, + "step": 21020 + }, + { + "epoch": 1.1771194982640834, + "grad_norm": 1.2337805032730103, + "learning_rate": 9.973394736842106e-05, + "loss": 0.3927, + "step": 21021 + }, + { + "epoch": 1.1771754955762124, + "grad_norm": 1.4827440977096558, + "learning_rate": 
9.973368421052632e-05, + "loss": 0.4476, + "step": 21022 + }, + { + "epoch": 1.1772314928883414, + "grad_norm": 1.2658737897872925, + "learning_rate": 9.973342105263158e-05, + "loss": 0.4027, + "step": 21023 + }, + { + "epoch": 1.1772874902004704, + "grad_norm": 1.3625433444976807, + "learning_rate": 9.973315789473685e-05, + "loss": 0.4178, + "step": 21024 + }, + { + "epoch": 1.1773434875125994, + "grad_norm": 1.4328525066375732, + "learning_rate": 9.973289473684211e-05, + "loss": 0.4612, + "step": 21025 + }, + { + "epoch": 1.1773994848247284, + "grad_norm": 1.4753912687301636, + "learning_rate": 9.973263157894737e-05, + "loss": 0.4367, + "step": 21026 + }, + { + "epoch": 1.1774554821368575, + "grad_norm": 1.2543789148330688, + "learning_rate": 9.973236842105263e-05, + "loss": 0.4802, + "step": 21027 + }, + { + "epoch": 1.1775114794489865, + "grad_norm": 1.206464409828186, + "learning_rate": 9.97321052631579e-05, + "loss": 0.4139, + "step": 21028 + }, + { + "epoch": 1.1775674767611155, + "grad_norm": 1.302475094795227, + "learning_rate": 9.973184210526316e-05, + "loss": 0.4461, + "step": 21029 + }, + { + "epoch": 1.1776234740732445, + "grad_norm": 1.0936827659606934, + "learning_rate": 9.973157894736842e-05, + "loss": 0.4318, + "step": 21030 + }, + { + "epoch": 1.1776794713853735, + "grad_norm": 1.2795578241348267, + "learning_rate": 9.973131578947368e-05, + "loss": 0.3422, + "step": 21031 + }, + { + "epoch": 1.1777354686975026, + "grad_norm": 1.5770725011825562, + "learning_rate": 9.973105263157894e-05, + "loss": 0.4887, + "step": 21032 + }, + { + "epoch": 1.1777914660096316, + "grad_norm": 1.2700650691986084, + "learning_rate": 9.973078947368422e-05, + "loss": 0.5678, + "step": 21033 + }, + { + "epoch": 1.1778474633217606, + "grad_norm": 1.371291160583496, + "learning_rate": 9.973052631578948e-05, + "loss": 0.4998, + "step": 21034 + }, + { + "epoch": 1.1779034606338896, + "grad_norm": 1.370296597480774, + "learning_rate": 9.973026315789475e-05, + "loss": 0.4893, 
+ "step": 21035 + }, + { + "epoch": 1.1779594579460186, + "grad_norm": 1.5093014240264893, + "learning_rate": 9.973e-05, + "loss": 0.4992, + "step": 21036 + }, + { + "epoch": 1.1780154552581477, + "grad_norm": 1.3816587924957275, + "learning_rate": 9.972973684210527e-05, + "loss": 0.3892, + "step": 21037 + }, + { + "epoch": 1.1780714525702767, + "grad_norm": 1.5106549263000488, + "learning_rate": 9.972947368421053e-05, + "loss": 0.4294, + "step": 21038 + }, + { + "epoch": 1.1781274498824057, + "grad_norm": 1.2390880584716797, + "learning_rate": 9.97292105263158e-05, + "loss": 0.3547, + "step": 21039 + }, + { + "epoch": 1.1781834471945347, + "grad_norm": 1.52642822265625, + "learning_rate": 9.972894736842106e-05, + "loss": 0.5257, + "step": 21040 + }, + { + "epoch": 1.1782394445066637, + "grad_norm": 1.2675113677978516, + "learning_rate": 9.972868421052632e-05, + "loss": 0.4144, + "step": 21041 + }, + { + "epoch": 1.1782954418187928, + "grad_norm": 1.4749233722686768, + "learning_rate": 9.972842105263158e-05, + "loss": 0.4099, + "step": 21042 + }, + { + "epoch": 1.1783514391309218, + "grad_norm": 1.2252509593963623, + "learning_rate": 9.972815789473685e-05, + "loss": 0.4446, + "step": 21043 + }, + { + "epoch": 1.1784074364430508, + "grad_norm": 1.230692982673645, + "learning_rate": 9.972789473684211e-05, + "loss": 0.4159, + "step": 21044 + }, + { + "epoch": 1.1784634337551798, + "grad_norm": 1.4128847122192383, + "learning_rate": 9.972763157894737e-05, + "loss": 0.4486, + "step": 21045 + }, + { + "epoch": 1.1785194310673088, + "grad_norm": 1.276092290878296, + "learning_rate": 9.972736842105263e-05, + "loss": 0.4334, + "step": 21046 + }, + { + "epoch": 1.1785754283794379, + "grad_norm": 1.8333954811096191, + "learning_rate": 9.97271052631579e-05, + "loss": 0.5329, + "step": 21047 + }, + { + "epoch": 1.1786314256915669, + "grad_norm": 1.220476746559143, + "learning_rate": 9.972684210526317e-05, + "loss": 0.425, + "step": 21048 + }, + { + "epoch": 1.1786874230036959, 
+ "grad_norm": 1.54896879196167, + "learning_rate": 9.972657894736843e-05, + "loss": 0.5269, + "step": 21049 + }, + { + "epoch": 1.178743420315825, + "grad_norm": 1.5744624137878418, + "learning_rate": 9.972631578947369e-05, + "loss": 0.3989, + "step": 21050 + }, + { + "epoch": 1.178799417627954, + "grad_norm": 1.5536669492721558, + "learning_rate": 9.972605263157895e-05, + "loss": 0.4169, + "step": 21051 + }, + { + "epoch": 1.178855414940083, + "grad_norm": 1.5690683126449585, + "learning_rate": 9.972578947368422e-05, + "loss": 0.4538, + "step": 21052 + }, + { + "epoch": 1.178911412252212, + "grad_norm": 1.5268265008926392, + "learning_rate": 9.972552631578948e-05, + "loss": 0.3558, + "step": 21053 + }, + { + "epoch": 1.178967409564341, + "grad_norm": 1.5727897882461548, + "learning_rate": 9.972526315789474e-05, + "loss": 0.6762, + "step": 21054 + }, + { + "epoch": 1.17902340687647, + "grad_norm": 1.347290277481079, + "learning_rate": 9.9725e-05, + "loss": 0.4515, + "step": 21055 + }, + { + "epoch": 1.179079404188599, + "grad_norm": 1.4351496696472168, + "learning_rate": 9.972473684210527e-05, + "loss": 0.5595, + "step": 21056 + }, + { + "epoch": 1.179135401500728, + "grad_norm": 1.2999906539916992, + "learning_rate": 9.972447368421053e-05, + "loss": 0.478, + "step": 21057 + }, + { + "epoch": 1.179191398812857, + "grad_norm": 1.2803971767425537, + "learning_rate": 9.97242105263158e-05, + "loss": 0.503, + "step": 21058 + }, + { + "epoch": 1.179247396124986, + "grad_norm": 2.6762804985046387, + "learning_rate": 9.972394736842105e-05, + "loss": 0.5161, + "step": 21059 + }, + { + "epoch": 1.179303393437115, + "grad_norm": 1.2523714303970337, + "learning_rate": 9.972368421052632e-05, + "loss": 0.3979, + "step": 21060 + }, + { + "epoch": 1.1793593907492441, + "grad_norm": 1.3335241079330444, + "learning_rate": 9.972342105263158e-05, + "loss": 0.424, + "step": 21061 + }, + { + "epoch": 1.1794153880613731, + "grad_norm": 1.3811438083648682, + "learning_rate": 
9.972315789473686e-05, + "loss": 0.4412, + "step": 21062 + }, + { + "epoch": 1.1794713853735022, + "grad_norm": 1.3896058797836304, + "learning_rate": 9.97228947368421e-05, + "loss": 0.5693, + "step": 21063 + }, + { + "epoch": 1.1795273826856312, + "grad_norm": 1.2304675579071045, + "learning_rate": 9.972263157894738e-05, + "loss": 0.3832, + "step": 21064 + }, + { + "epoch": 1.1795833799977602, + "grad_norm": 1.9563837051391602, + "learning_rate": 9.972236842105264e-05, + "loss": 0.4691, + "step": 21065 + }, + { + "epoch": 1.1796393773098892, + "grad_norm": 1.3286216259002686, + "learning_rate": 9.97221052631579e-05, + "loss": 0.446, + "step": 21066 + }, + { + "epoch": 1.1796953746220182, + "grad_norm": 1.3801134824752808, + "learning_rate": 9.972184210526317e-05, + "loss": 0.5168, + "step": 21067 + }, + { + "epoch": 1.1797513719341473, + "grad_norm": 1.3077819347381592, + "learning_rate": 9.972157894736842e-05, + "loss": 0.4792, + "step": 21068 + }, + { + "epoch": 1.1798073692462763, + "grad_norm": 1.4988596439361572, + "learning_rate": 9.972131578947369e-05, + "loss": 0.5545, + "step": 21069 + }, + { + "epoch": 1.1798633665584053, + "grad_norm": 1.228618860244751, + "learning_rate": 9.972105263157895e-05, + "loss": 0.407, + "step": 21070 + }, + { + "epoch": 1.1799193638705343, + "grad_norm": 1.342016339302063, + "learning_rate": 9.972078947368422e-05, + "loss": 0.4125, + "step": 21071 + }, + { + "epoch": 1.1799753611826633, + "grad_norm": 1.7181639671325684, + "learning_rate": 9.972052631578948e-05, + "loss": 0.3964, + "step": 21072 + }, + { + "epoch": 1.1800313584947921, + "grad_norm": 1.7656135559082031, + "learning_rate": 9.972026315789474e-05, + "loss": 0.532, + "step": 21073 + }, + { + "epoch": 1.1800873558069211, + "grad_norm": 1.4386123418807983, + "learning_rate": 9.972e-05, + "loss": 0.4891, + "step": 21074 + }, + { + "epoch": 1.1801433531190502, + "grad_norm": 1.3285212516784668, + "learning_rate": 9.971973684210527e-05, + "loss": 0.4108, + "step": 
21075 + }, + { + "epoch": 1.1801993504311792, + "grad_norm": 1.3960325717926025, + "learning_rate": 9.971947368421053e-05, + "loss": 0.3639, + "step": 21076 + }, + { + "epoch": 1.1802553477433082, + "grad_norm": 1.492347002029419, + "learning_rate": 9.97192105263158e-05, + "loss": 0.5306, + "step": 21077 + }, + { + "epoch": 1.1803113450554372, + "grad_norm": 1.1340129375457764, + "learning_rate": 9.971894736842105e-05, + "loss": 0.3658, + "step": 21078 + }, + { + "epoch": 1.1803673423675662, + "grad_norm": 1.175807237625122, + "learning_rate": 9.971868421052633e-05, + "loss": 0.344, + "step": 21079 + }, + { + "epoch": 1.1804233396796953, + "grad_norm": 1.6824955940246582, + "learning_rate": 9.971842105263159e-05, + "loss": 0.5288, + "step": 21080 + }, + { + "epoch": 1.1804793369918243, + "grad_norm": 1.2158433198928833, + "learning_rate": 9.971815789473685e-05, + "loss": 0.3972, + "step": 21081 + }, + { + "epoch": 1.1805353343039533, + "grad_norm": 1.6033775806427002, + "learning_rate": 9.97178947368421e-05, + "loss": 0.4921, + "step": 21082 + }, + { + "epoch": 1.1805913316160823, + "grad_norm": 1.1867610216140747, + "learning_rate": 9.971763157894737e-05, + "loss": 0.3832, + "step": 21083 + }, + { + "epoch": 1.1806473289282113, + "grad_norm": 1.357165813446045, + "learning_rate": 9.971736842105264e-05, + "loss": 0.4266, + "step": 21084 + }, + { + "epoch": 1.1807033262403404, + "grad_norm": 1.2187163829803467, + "learning_rate": 9.97171052631579e-05, + "loss": 0.4727, + "step": 21085 + }, + { + "epoch": 1.1807593235524694, + "grad_norm": 1.299239993095398, + "learning_rate": 9.971684210526316e-05, + "loss": 0.4482, + "step": 21086 + }, + { + "epoch": 1.1808153208645984, + "grad_norm": 1.2567377090454102, + "learning_rate": 9.971657894736842e-05, + "loss": 0.445, + "step": 21087 + }, + { + "epoch": 1.1808713181767274, + "grad_norm": 1.2322558164596558, + "learning_rate": 9.971631578947369e-05, + "loss": 0.3458, + "step": 21088 + }, + { + "epoch": 1.1809273154888564, 
+ "grad_norm": 1.7484732866287231, + "learning_rate": 9.971605263157895e-05, + "loss": 0.4452, + "step": 21089 + }, + { + "epoch": 1.1809833128009855, + "grad_norm": 1.2590219974517822, + "learning_rate": 9.971578947368422e-05, + "loss": 0.4281, + "step": 21090 + }, + { + "epoch": 1.1810393101131145, + "grad_norm": 1.654700756072998, + "learning_rate": 9.971552631578947e-05, + "loss": 0.5242, + "step": 21091 + }, + { + "epoch": 1.1810953074252435, + "grad_norm": 1.3631558418273926, + "learning_rate": 9.971526315789474e-05, + "loss": 0.4196, + "step": 21092 + }, + { + "epoch": 1.1811513047373725, + "grad_norm": 1.3610966205596924, + "learning_rate": 9.9715e-05, + "loss": 0.4374, + "step": 21093 + }, + { + "epoch": 1.1812073020495015, + "grad_norm": 1.3292502164840698, + "learning_rate": 9.971473684210528e-05, + "loss": 0.5613, + "step": 21094 + }, + { + "epoch": 1.1812632993616305, + "grad_norm": 1.3234503269195557, + "learning_rate": 9.971447368421054e-05, + "loss": 0.4301, + "step": 21095 + }, + { + "epoch": 1.1813192966737596, + "grad_norm": 1.2691986560821533, + "learning_rate": 9.97142105263158e-05, + "loss": 0.3603, + "step": 21096 + }, + { + "epoch": 1.1813752939858886, + "grad_norm": 1.2768298387527466, + "learning_rate": 9.971394736842106e-05, + "loss": 0.3793, + "step": 21097 + }, + { + "epoch": 1.1814312912980176, + "grad_norm": 1.067457914352417, + "learning_rate": 9.971368421052633e-05, + "loss": 0.5021, + "step": 21098 + }, + { + "epoch": 1.1814872886101466, + "grad_norm": 1.585533857345581, + "learning_rate": 9.971342105263159e-05, + "loss": 0.7983, + "step": 21099 + }, + { + "epoch": 1.1815432859222756, + "grad_norm": 1.301196575164795, + "learning_rate": 9.971315789473683e-05, + "loss": 0.4705, + "step": 21100 + }, + { + "epoch": 1.1815992832344047, + "grad_norm": 1.3239070177078247, + "learning_rate": 9.971289473684211e-05, + "loss": 0.4195, + "step": 21101 + }, + { + "epoch": 1.1816552805465337, + "grad_norm": 1.5636088848114014, + 
"learning_rate": 9.971263157894737e-05, + "loss": 0.544, + "step": 21102 + }, + { + "epoch": 1.1817112778586627, + "grad_norm": 1.20651113986969, + "learning_rate": 9.971236842105264e-05, + "loss": 0.4805, + "step": 21103 + }, + { + "epoch": 1.1817672751707917, + "grad_norm": 1.2414714097976685, + "learning_rate": 9.97121052631579e-05, + "loss": 0.3596, + "step": 21104 + }, + { + "epoch": 1.1818232724829207, + "grad_norm": 1.429947853088379, + "learning_rate": 9.971184210526316e-05, + "loss": 0.4822, + "step": 21105 + }, + { + "epoch": 1.1818792697950498, + "grad_norm": 1.3667641878128052, + "learning_rate": 9.971157894736842e-05, + "loss": 0.3992, + "step": 21106 + }, + { + "epoch": 1.1819352671071788, + "grad_norm": 1.1207997798919678, + "learning_rate": 9.97113157894737e-05, + "loss": 0.3619, + "step": 21107 + }, + { + "epoch": 1.1819912644193078, + "grad_norm": 1.050496220588684, + "learning_rate": 9.971105263157895e-05, + "loss": 0.3202, + "step": 21108 + }, + { + "epoch": 1.1820472617314368, + "grad_norm": 1.368922472000122, + "learning_rate": 9.971078947368421e-05, + "loss": 0.585, + "step": 21109 + }, + { + "epoch": 1.1821032590435658, + "grad_norm": 1.4292868375778198, + "learning_rate": 9.971052631578947e-05, + "loss": 0.3907, + "step": 21110 + }, + { + "epoch": 1.1821592563556949, + "grad_norm": 1.2813223600387573, + "learning_rate": 9.971026315789475e-05, + "loss": 0.3623, + "step": 21111 + }, + { + "epoch": 1.1822152536678239, + "grad_norm": 1.4284342527389526, + "learning_rate": 9.971e-05, + "loss": 0.4354, + "step": 21112 + }, + { + "epoch": 1.182271250979953, + "grad_norm": 1.565657615661621, + "learning_rate": 9.970973684210528e-05, + "loss": 0.4779, + "step": 21113 + }, + { + "epoch": 1.182327248292082, + "grad_norm": 1.3310754299163818, + "learning_rate": 9.970947368421053e-05, + "loss": 0.4205, + "step": 21114 + }, + { + "epoch": 1.182383245604211, + "grad_norm": 1.2147763967514038, + "learning_rate": 9.97092105263158e-05, + "loss": 0.4249, + 
"step": 21115 + }, + { + "epoch": 1.18243924291634, + "grad_norm": 1.3342591524124146, + "learning_rate": 9.970894736842106e-05, + "loss": 0.4208, + "step": 21116 + }, + { + "epoch": 1.182495240228469, + "grad_norm": 1.507914662361145, + "learning_rate": 9.970868421052632e-05, + "loss": 0.6189, + "step": 21117 + }, + { + "epoch": 1.182551237540598, + "grad_norm": 1.5464972257614136, + "learning_rate": 9.970842105263158e-05, + "loss": 0.5017, + "step": 21118 + }, + { + "epoch": 1.182607234852727, + "grad_norm": 1.3173556327819824, + "learning_rate": 9.970815789473684e-05, + "loss": 0.4078, + "step": 21119 + }, + { + "epoch": 1.182663232164856, + "grad_norm": 1.3701661825180054, + "learning_rate": 9.970789473684211e-05, + "loss": 0.5615, + "step": 21120 + }, + { + "epoch": 1.182719229476985, + "grad_norm": 1.4368195533752441, + "learning_rate": 9.970763157894737e-05, + "loss": 0.4509, + "step": 21121 + }, + { + "epoch": 1.182775226789114, + "grad_norm": 1.326619029045105, + "learning_rate": 9.970736842105264e-05, + "loss": 0.4387, + "step": 21122 + }, + { + "epoch": 1.182831224101243, + "grad_norm": 1.341138243675232, + "learning_rate": 9.970710526315789e-05, + "loss": 0.4683, + "step": 21123 + }, + { + "epoch": 1.182887221413372, + "grad_norm": 1.3173452615737915, + "learning_rate": 9.970684210526316e-05, + "loss": 0.4314, + "step": 21124 + }, + { + "epoch": 1.1829432187255011, + "grad_norm": 1.32731032371521, + "learning_rate": 9.970657894736842e-05, + "loss": 0.431, + "step": 21125 + }, + { + "epoch": 1.1829992160376301, + "grad_norm": 1.4874069690704346, + "learning_rate": 9.97063157894737e-05, + "loss": 0.5038, + "step": 21126 + }, + { + "epoch": 1.1830552133497592, + "grad_norm": 1.3662054538726807, + "learning_rate": 9.970605263157896e-05, + "loss": 0.4337, + "step": 21127 + }, + { + "epoch": 1.1831112106618882, + "grad_norm": 1.2005529403686523, + "learning_rate": 9.970578947368422e-05, + "loss": 0.3957, + "step": 21128 + }, + { + "epoch": 1.1831672079740172, 
+ "grad_norm": 1.4747321605682373, + "learning_rate": 9.970552631578948e-05, + "loss": 0.4642, + "step": 21129 + }, + { + "epoch": 1.1832232052861462, + "grad_norm": 1.3802555799484253, + "learning_rate": 9.970526315789475e-05, + "loss": 0.4484, + "step": 21130 + }, + { + "epoch": 1.1832792025982752, + "grad_norm": 1.442726731300354, + "learning_rate": 9.970500000000001e-05, + "loss": 0.4193, + "step": 21131 + }, + { + "epoch": 1.1833351999104043, + "grad_norm": 1.507625937461853, + "learning_rate": 9.970473684210527e-05, + "loss": 0.5281, + "step": 21132 + }, + { + "epoch": 1.1833911972225333, + "grad_norm": 1.5044076442718506, + "learning_rate": 9.970447368421053e-05, + "loss": 0.4702, + "step": 21133 + }, + { + "epoch": 1.1834471945346623, + "grad_norm": 2.6730031967163086, + "learning_rate": 9.970421052631579e-05, + "loss": 0.338, + "step": 21134 + }, + { + "epoch": 1.1835031918467913, + "grad_norm": 1.2748754024505615, + "learning_rate": 9.970394736842106e-05, + "loss": 0.5369, + "step": 21135 + }, + { + "epoch": 1.1835591891589203, + "grad_norm": 1.3605180978775024, + "learning_rate": 9.970368421052632e-05, + "loss": 0.6243, + "step": 21136 + }, + { + "epoch": 1.1836151864710494, + "grad_norm": 1.4342129230499268, + "learning_rate": 9.970342105263158e-05, + "loss": 0.3888, + "step": 21137 + }, + { + "epoch": 1.1836711837831784, + "grad_norm": 1.2869508266448975, + "learning_rate": 9.970315789473684e-05, + "loss": 0.5126, + "step": 21138 + }, + { + "epoch": 1.1837271810953074, + "grad_norm": 1.6351805925369263, + "learning_rate": 9.970289473684211e-05, + "loss": 0.5752, + "step": 21139 + }, + { + "epoch": 1.1837831784074364, + "grad_norm": 1.45833158493042, + "learning_rate": 9.970263157894737e-05, + "loss": 0.3881, + "step": 21140 + }, + { + "epoch": 1.1838391757195654, + "grad_norm": 1.2941596508026123, + "learning_rate": 9.970236842105263e-05, + "loss": 0.5191, + "step": 21141 + }, + { + "epoch": 1.1838951730316944, + "grad_norm": 1.3377100229263306, + 
"learning_rate": 9.970210526315789e-05, + "loss": 0.3507, + "step": 21142 + }, + { + "epoch": 1.1839511703438235, + "grad_norm": 1.2845380306243896, + "learning_rate": 9.970184210526317e-05, + "loss": 0.4726, + "step": 21143 + }, + { + "epoch": 1.1840071676559525, + "grad_norm": 1.2385921478271484, + "learning_rate": 9.970157894736843e-05, + "loss": 0.4044, + "step": 21144 + }, + { + "epoch": 1.1840631649680815, + "grad_norm": 1.1426398754119873, + "learning_rate": 9.97013157894737e-05, + "loss": 0.4968, + "step": 21145 + }, + { + "epoch": 1.1841191622802105, + "grad_norm": 1.7909942865371704, + "learning_rate": 9.970105263157895e-05, + "loss": 0.3966, + "step": 21146 + }, + { + "epoch": 1.1841751595923395, + "grad_norm": 2.319905996322632, + "learning_rate": 9.970078947368422e-05, + "loss": 0.5612, + "step": 21147 + }, + { + "epoch": 1.1842311569044686, + "grad_norm": 1.4069215059280396, + "learning_rate": 9.970052631578948e-05, + "loss": 0.5357, + "step": 21148 + }, + { + "epoch": 1.1842871542165976, + "grad_norm": 1.180469036102295, + "learning_rate": 9.970026315789475e-05, + "loss": 0.3794, + "step": 21149 + }, + { + "epoch": 1.1843431515287266, + "grad_norm": 1.1504889726638794, + "learning_rate": 9.970000000000001e-05, + "loss": 0.3408, + "step": 21150 + }, + { + "epoch": 1.1843991488408556, + "grad_norm": 1.4989124536514282, + "learning_rate": 9.969973684210526e-05, + "loss": 0.5002, + "step": 21151 + }, + { + "epoch": 1.1844551461529846, + "grad_norm": 1.8440818786621094, + "learning_rate": 9.969947368421053e-05, + "loss": 0.6285, + "step": 21152 + }, + { + "epoch": 1.1845111434651137, + "grad_norm": 7.958072662353516, + "learning_rate": 9.969921052631579e-05, + "loss": 0.3883, + "step": 21153 + }, + { + "epoch": 1.1845671407772427, + "grad_norm": 1.494621992111206, + "learning_rate": 9.969894736842106e-05, + "loss": 0.5401, + "step": 21154 + }, + { + "epoch": 1.1846231380893717, + "grad_norm": 1.2599050998687744, + "learning_rate": 9.969868421052631e-05, + 
"loss": 0.4195, + "step": 21155 + }, + { + "epoch": 1.1846791354015007, + "grad_norm": 1.1688024997711182, + "learning_rate": 9.969842105263158e-05, + "loss": 0.3738, + "step": 21156 + }, + { + "epoch": 1.1847351327136297, + "grad_norm": 1.5886930227279663, + "learning_rate": 9.969815789473684e-05, + "loss": 0.4661, + "step": 21157 + }, + { + "epoch": 1.1847911300257588, + "grad_norm": 1.2828972339630127, + "learning_rate": 9.969789473684212e-05, + "loss": 0.3685, + "step": 21158 + }, + { + "epoch": 1.1848471273378878, + "grad_norm": 1.2016386985778809, + "learning_rate": 9.969763157894738e-05, + "loss": 0.3928, + "step": 21159 + }, + { + "epoch": 1.1849031246500168, + "grad_norm": 1.6821802854537964, + "learning_rate": 9.969736842105264e-05, + "loss": 0.4608, + "step": 21160 + }, + { + "epoch": 1.1849591219621458, + "grad_norm": 1.7325512170791626, + "learning_rate": 9.96971052631579e-05, + "loss": 0.376, + "step": 21161 + }, + { + "epoch": 1.1850151192742748, + "grad_norm": 1.6614699363708496, + "learning_rate": 9.969684210526317e-05, + "loss": 0.3121, + "step": 21162 + }, + { + "epoch": 1.1850711165864038, + "grad_norm": 1.2217885255813599, + "learning_rate": 9.969657894736843e-05, + "loss": 0.3949, + "step": 21163 + }, + { + "epoch": 1.1851271138985329, + "grad_norm": 1.9034477472305298, + "learning_rate": 9.969631578947369e-05, + "loss": 0.6209, + "step": 21164 + }, + { + "epoch": 1.1851831112106619, + "grad_norm": 1.560486078262329, + "learning_rate": 9.969605263157895e-05, + "loss": 0.5011, + "step": 21165 + }, + { + "epoch": 1.185239108522791, + "grad_norm": 1.3662017583847046, + "learning_rate": 9.969578947368422e-05, + "loss": 0.4632, + "step": 21166 + }, + { + "epoch": 1.18529510583492, + "grad_norm": 1.2816357612609863, + "learning_rate": 9.969552631578948e-05, + "loss": 0.371, + "step": 21167 + }, + { + "epoch": 1.185351103147049, + "grad_norm": 1.4631057977676392, + "learning_rate": 9.969526315789474e-05, + "loss": 0.491, + "step": 21168 + }, + { + 
"epoch": 1.185407100459178, + "grad_norm": 1.1913096904754639, + "learning_rate": 9.9695e-05, + "loss": 0.4349, + "step": 21169 + }, + { + "epoch": 1.185463097771307, + "grad_norm": 1.2965353727340698, + "learning_rate": 9.969473684210526e-05, + "loss": 0.4607, + "step": 21170 + }, + { + "epoch": 1.185519095083436, + "grad_norm": 1.4136563539505005, + "learning_rate": 9.969447368421053e-05, + "loss": 0.4314, + "step": 21171 + }, + { + "epoch": 1.185575092395565, + "grad_norm": 1.3343579769134521, + "learning_rate": 9.969421052631579e-05, + "loss": 0.4353, + "step": 21172 + }, + { + "epoch": 1.185631089707694, + "grad_norm": 1.5607622861862183, + "learning_rate": 9.969394736842105e-05, + "loss": 0.4919, + "step": 21173 + }, + { + "epoch": 1.185687087019823, + "grad_norm": 1.2649791240692139, + "learning_rate": 9.969368421052631e-05, + "loss": 0.5258, + "step": 21174 + }, + { + "epoch": 1.185743084331952, + "grad_norm": 1.422687292098999, + "learning_rate": 9.969342105263159e-05, + "loss": 0.556, + "step": 21175 + }, + { + "epoch": 1.185799081644081, + "grad_norm": 1.6056827306747437, + "learning_rate": 9.969315789473685e-05, + "loss": 0.439, + "step": 21176 + }, + { + "epoch": 1.1858550789562101, + "grad_norm": 1.2976654767990112, + "learning_rate": 9.969289473684212e-05, + "loss": 0.4113, + "step": 21177 + }, + { + "epoch": 1.1859110762683391, + "grad_norm": 1.3225038051605225, + "learning_rate": 9.969263157894736e-05, + "loss": 0.4313, + "step": 21178 + }, + { + "epoch": 1.1859670735804682, + "grad_norm": 1.2887746095657349, + "learning_rate": 9.969236842105264e-05, + "loss": 0.4083, + "step": 21179 + }, + { + "epoch": 1.1860230708925972, + "grad_norm": 1.6592974662780762, + "learning_rate": 9.96921052631579e-05, + "loss": 0.4067, + "step": 21180 + }, + { + "epoch": 1.1860790682047262, + "grad_norm": 1.1698036193847656, + "learning_rate": 9.969184210526317e-05, + "loss": 0.4029, + "step": 21181 + }, + { + "epoch": 1.1861350655168552, + "grad_norm": 
1.4141706228256226, + "learning_rate": 9.969157894736843e-05, + "loss": 0.4824, + "step": 21182 + }, + { + "epoch": 1.1861910628289842, + "grad_norm": 1.3743826150894165, + "learning_rate": 9.969131578947369e-05, + "loss": 0.4966, + "step": 21183 + }, + { + "epoch": 1.1862470601411133, + "grad_norm": 1.381693720817566, + "learning_rate": 9.969105263157895e-05, + "loss": 0.5144, + "step": 21184 + }, + { + "epoch": 1.1863030574532423, + "grad_norm": 1.380395770072937, + "learning_rate": 9.969078947368422e-05, + "loss": 0.3171, + "step": 21185 + }, + { + "epoch": 1.1863590547653713, + "grad_norm": 1.3432729244232178, + "learning_rate": 9.969052631578948e-05, + "loss": 0.4076, + "step": 21186 + }, + { + "epoch": 1.1864150520775003, + "grad_norm": 1.2328461408615112, + "learning_rate": 9.969026315789474e-05, + "loss": 0.3554, + "step": 21187 + }, + { + "epoch": 1.1864710493896293, + "grad_norm": 1.296217918395996, + "learning_rate": 9.969e-05, + "loss": 0.5135, + "step": 21188 + }, + { + "epoch": 1.1865270467017583, + "grad_norm": 35.66089630126953, + "learning_rate": 9.968973684210526e-05, + "loss": 0.6095, + "step": 21189 + }, + { + "epoch": 1.1865830440138874, + "grad_norm": 1.3872572183609009, + "learning_rate": 9.968947368421054e-05, + "loss": 0.4508, + "step": 21190 + }, + { + "epoch": 1.1866390413260164, + "grad_norm": 2.5166049003601074, + "learning_rate": 9.96892105263158e-05, + "loss": 0.6397, + "step": 21191 + }, + { + "epoch": 1.1866950386381454, + "grad_norm": 1.4581425189971924, + "learning_rate": 9.968894736842106e-05, + "loss": 0.4259, + "step": 21192 + }, + { + "epoch": 1.1867510359502744, + "grad_norm": 2.6536355018615723, + "learning_rate": 9.968868421052631e-05, + "loss": 0.5135, + "step": 21193 + }, + { + "epoch": 1.1868070332624034, + "grad_norm": 1.2793033123016357, + "learning_rate": 9.968842105263159e-05, + "loss": 0.3964, + "step": 21194 + }, + { + "epoch": 1.1868630305745325, + "grad_norm": 1.4136197566986084, + "learning_rate": 
9.968815789473685e-05, + "loss": 0.5737, + "step": 21195 + }, + { + "epoch": 1.1869190278866615, + "grad_norm": 1.2036255598068237, + "learning_rate": 9.968789473684211e-05, + "loss": 0.359, + "step": 21196 + }, + { + "epoch": 1.1869750251987905, + "grad_norm": 1.5432047843933105, + "learning_rate": 9.968763157894737e-05, + "loss": 0.4279, + "step": 21197 + }, + { + "epoch": 1.1870310225109195, + "grad_norm": 1.3293156623840332, + "learning_rate": 9.968736842105264e-05, + "loss": 0.3859, + "step": 21198 + }, + { + "epoch": 1.1870870198230485, + "grad_norm": 1.7360200881958008, + "learning_rate": 9.96871052631579e-05, + "loss": 0.4776, + "step": 21199 + }, + { + "epoch": 1.1871430171351776, + "grad_norm": 2.022270917892456, + "learning_rate": 9.968684210526317e-05, + "loss": 0.3391, + "step": 21200 + }, + { + "epoch": 1.1871990144473066, + "grad_norm": 1.334336280822754, + "learning_rate": 9.968657894736842e-05, + "loss": 0.5049, + "step": 21201 + }, + { + "epoch": 1.1872550117594356, + "grad_norm": 1.9071433544158936, + "learning_rate": 9.968631578947369e-05, + "loss": 0.3772, + "step": 21202 + }, + { + "epoch": 1.1873110090715646, + "grad_norm": 1.2202825546264648, + "learning_rate": 9.968605263157895e-05, + "loss": 0.4446, + "step": 21203 + }, + { + "epoch": 1.1873670063836936, + "grad_norm": 1.5728347301483154, + "learning_rate": 9.968578947368421e-05, + "loss": 0.5384, + "step": 21204 + }, + { + "epoch": 1.1874230036958227, + "grad_norm": 1.3353360891342163, + "learning_rate": 9.968552631578949e-05, + "loss": 0.3864, + "step": 21205 + }, + { + "epoch": 1.1874790010079517, + "grad_norm": 1.5640684366226196, + "learning_rate": 9.968526315789473e-05, + "loss": 0.439, + "step": 21206 + }, + { + "epoch": 1.1875349983200807, + "grad_norm": 1.3092772960662842, + "learning_rate": 9.9685e-05, + "loss": 0.4114, + "step": 21207 + }, + { + "epoch": 1.1875909956322097, + "grad_norm": 1.6550471782684326, + "learning_rate": 9.968473684210526e-05, + "loss": 0.4563, + "step": 
21208 + }, + { + "epoch": 1.1876469929443387, + "grad_norm": 1.2956535816192627, + "learning_rate": 9.968447368421054e-05, + "loss": 0.4703, + "step": 21209 + }, + { + "epoch": 1.1877029902564677, + "grad_norm": 1.3555691242218018, + "learning_rate": 9.968421052631578e-05, + "loss": 0.5481, + "step": 21210 + }, + { + "epoch": 1.1877589875685968, + "grad_norm": 1.2641892433166504, + "learning_rate": 9.968394736842106e-05, + "loss": 0.4387, + "step": 21211 + }, + { + "epoch": 1.1878149848807258, + "grad_norm": 1.3119490146636963, + "learning_rate": 9.968368421052632e-05, + "loss": 0.3682, + "step": 21212 + }, + { + "epoch": 1.1878709821928548, + "grad_norm": 1.429458737373352, + "learning_rate": 9.968342105263159e-05, + "loss": 0.4616, + "step": 21213 + }, + { + "epoch": 1.1879269795049838, + "grad_norm": 1.39130437374115, + "learning_rate": 9.968315789473685e-05, + "loss": 0.4945, + "step": 21214 + }, + { + "epoch": 1.1879829768171128, + "grad_norm": 1.6254644393920898, + "learning_rate": 9.968289473684211e-05, + "loss": 0.5361, + "step": 21215 + }, + { + "epoch": 1.1880389741292419, + "grad_norm": 1.241886854171753, + "learning_rate": 9.968263157894737e-05, + "loss": 0.4756, + "step": 21216 + }, + { + "epoch": 1.1880949714413709, + "grad_norm": 1.2532511949539185, + "learning_rate": 9.968236842105264e-05, + "loss": 0.4426, + "step": 21217 + }, + { + "epoch": 1.1881509687535, + "grad_norm": 1.3908519744873047, + "learning_rate": 9.96821052631579e-05, + "loss": 0.5752, + "step": 21218 + }, + { + "epoch": 1.188206966065629, + "grad_norm": 1.4657741785049438, + "learning_rate": 9.968184210526316e-05, + "loss": 0.4851, + "step": 21219 + }, + { + "epoch": 1.188262963377758, + "grad_norm": 1.380887746810913, + "learning_rate": 9.968157894736842e-05, + "loss": 0.4347, + "step": 21220 + }, + { + "epoch": 1.188318960689887, + "grad_norm": 1.3606361150741577, + "learning_rate": 9.968131578947368e-05, + "loss": 0.4537, + "step": 21221 + }, + { + "epoch": 1.188374958002016, + 
"grad_norm": 1.447104573249817, + "learning_rate": 9.968105263157896e-05, + "loss": 0.4163, + "step": 21222 + }, + { + "epoch": 1.188430955314145, + "grad_norm": 8.842304229736328, + "learning_rate": 9.968078947368422e-05, + "loss": 0.4515, + "step": 21223 + }, + { + "epoch": 1.188486952626274, + "grad_norm": 1.6093858480453491, + "learning_rate": 9.968052631578947e-05, + "loss": 0.5563, + "step": 21224 + }, + { + "epoch": 1.188542949938403, + "grad_norm": 1.502869963645935, + "learning_rate": 9.968026315789473e-05, + "loss": 0.4019, + "step": 21225 + }, + { + "epoch": 1.188598947250532, + "grad_norm": 1.334153413772583, + "learning_rate": 9.968000000000001e-05, + "loss": 0.5447, + "step": 21226 + }, + { + "epoch": 1.188654944562661, + "grad_norm": 1.394298791885376, + "learning_rate": 9.967973684210527e-05, + "loss": 0.4455, + "step": 21227 + }, + { + "epoch": 1.18871094187479, + "grad_norm": 1.8458468914031982, + "learning_rate": 9.967947368421053e-05, + "loss": 0.4829, + "step": 21228 + }, + { + "epoch": 1.1887669391869191, + "grad_norm": 1.3631985187530518, + "learning_rate": 9.967921052631579e-05, + "loss": 0.4603, + "step": 21229 + }, + { + "epoch": 1.1888229364990481, + "grad_norm": 1.7145485877990723, + "learning_rate": 9.967894736842106e-05, + "loss": 0.4807, + "step": 21230 + }, + { + "epoch": 1.1888789338111772, + "grad_norm": 1.5248175859451294, + "learning_rate": 9.967868421052632e-05, + "loss": 0.5424, + "step": 21231 + }, + { + "epoch": 1.1889349311233062, + "grad_norm": 1.4054481983184814, + "learning_rate": 9.96784210526316e-05, + "loss": 0.3944, + "step": 21232 + }, + { + "epoch": 1.1889909284354352, + "grad_norm": 1.3018110990524292, + "learning_rate": 9.967815789473684e-05, + "loss": 0.5388, + "step": 21233 + }, + { + "epoch": 1.1890469257475642, + "grad_norm": 1.7539788484573364, + "learning_rate": 9.967789473684211e-05, + "loss": 0.5211, + "step": 21234 + }, + { + "epoch": 1.1891029230596932, + "grad_norm": 1.2612608671188354, + 
"learning_rate": 9.967763157894737e-05, + "loss": 0.3915, + "step": 21235 + }, + { + "epoch": 1.1891589203718222, + "grad_norm": 1.3768184185028076, + "learning_rate": 9.967736842105265e-05, + "loss": 0.4413, + "step": 21236 + }, + { + "epoch": 1.1892149176839513, + "grad_norm": 1.557417392730713, + "learning_rate": 9.96771052631579e-05, + "loss": 0.5665, + "step": 21237 + }, + { + "epoch": 1.1892709149960803, + "grad_norm": 1.3070082664489746, + "learning_rate": 9.967684210526315e-05, + "loss": 0.4983, + "step": 21238 + }, + { + "epoch": 1.1893269123082093, + "grad_norm": 1.38764226436615, + "learning_rate": 9.967657894736842e-05, + "loss": 0.4311, + "step": 21239 + }, + { + "epoch": 1.1893829096203383, + "grad_norm": 1.4594128131866455, + "learning_rate": 9.967631578947368e-05, + "loss": 0.5161, + "step": 21240 + }, + { + "epoch": 1.1894389069324673, + "grad_norm": 1.1985621452331543, + "learning_rate": 9.967605263157896e-05, + "loss": 0.403, + "step": 21241 + }, + { + "epoch": 1.1894949042445964, + "grad_norm": 1.1932530403137207, + "learning_rate": 9.967578947368422e-05, + "loss": 0.4045, + "step": 21242 + }, + { + "epoch": 1.1895509015567254, + "grad_norm": 1.3379814624786377, + "learning_rate": 9.967552631578948e-05, + "loss": 0.42, + "step": 21243 + }, + { + "epoch": 1.1896068988688544, + "grad_norm": 1.2524373531341553, + "learning_rate": 9.967526315789474e-05, + "loss": 0.3603, + "step": 21244 + }, + { + "epoch": 1.1896628961809834, + "grad_norm": 1.932081699371338, + "learning_rate": 9.967500000000001e-05, + "loss": 0.5632, + "step": 21245 + }, + { + "epoch": 1.1897188934931124, + "grad_norm": 1.4900124073028564, + "learning_rate": 9.967473684210527e-05, + "loss": 0.6733, + "step": 21246 + }, + { + "epoch": 1.1897748908052415, + "grad_norm": 1.4772049188613892, + "learning_rate": 9.967447368421053e-05, + "loss": 0.5363, + "step": 21247 + }, + { + "epoch": 1.1898308881173705, + "grad_norm": 1.3865453004837036, + "learning_rate": 9.967421052631579e-05, + 
"loss": 0.5008, + "step": 21248 + }, + { + "epoch": 1.1898868854294995, + "grad_norm": 1.3593593835830688, + "learning_rate": 9.967394736842106e-05, + "loss": 0.439, + "step": 21249 + }, + { + "epoch": 1.1899428827416285, + "grad_norm": 1.2697149515151978, + "learning_rate": 9.967368421052632e-05, + "loss": 0.391, + "step": 21250 + }, + { + "epoch": 1.1899988800537575, + "grad_norm": 1.3371034860610962, + "learning_rate": 9.967342105263158e-05, + "loss": 0.447, + "step": 21251 + }, + { + "epoch": 1.1900548773658866, + "grad_norm": 1.3540842533111572, + "learning_rate": 9.967315789473684e-05, + "loss": 0.466, + "step": 21252 + }, + { + "epoch": 1.1901108746780156, + "grad_norm": 1.2161946296691895, + "learning_rate": 9.967289473684212e-05, + "loss": 0.4371, + "step": 21253 + }, + { + "epoch": 1.1901668719901446, + "grad_norm": 1.0837620496749878, + "learning_rate": 9.967263157894738e-05, + "loss": 0.406, + "step": 21254 + }, + { + "epoch": 1.1902228693022736, + "grad_norm": 1.256539225578308, + "learning_rate": 9.967236842105263e-05, + "loss": 0.4907, + "step": 21255 + }, + { + "epoch": 1.1902788666144026, + "grad_norm": 1.5777192115783691, + "learning_rate": 9.96721052631579e-05, + "loss": 0.3872, + "step": 21256 + }, + { + "epoch": 1.1903348639265316, + "grad_norm": 1.4282094240188599, + "learning_rate": 9.967184210526315e-05, + "loss": 0.5761, + "step": 21257 + }, + { + "epoch": 1.1903908612386607, + "grad_norm": 1.4186253547668457, + "learning_rate": 9.967157894736843e-05, + "loss": 0.4186, + "step": 21258 + }, + { + "epoch": 1.1904468585507895, + "grad_norm": 1.409191370010376, + "learning_rate": 9.967131578947369e-05, + "loss": 0.4498, + "step": 21259 + }, + { + "epoch": 1.1905028558629185, + "grad_norm": 1.305984377861023, + "learning_rate": 9.967105263157896e-05, + "loss": 0.4489, + "step": 21260 + }, + { + "epoch": 1.1905588531750475, + "grad_norm": 1.7411454916000366, + "learning_rate": 9.96707894736842e-05, + "loss": 0.4493, + "step": 21261 + }, + { + 
"epoch": 1.1906148504871765, + "grad_norm": 1.277187466621399, + "learning_rate": 9.967052631578948e-05, + "loss": 0.4765, + "step": 21262 + }, + { + "epoch": 1.1906708477993055, + "grad_norm": 1.2747749090194702, + "learning_rate": 9.967026315789474e-05, + "loss": 0.4391, + "step": 21263 + }, + { + "epoch": 1.1907268451114346, + "grad_norm": 1.30585515499115, + "learning_rate": 9.967000000000001e-05, + "loss": 0.4503, + "step": 21264 + }, + { + "epoch": 1.1907828424235636, + "grad_norm": 1.7725613117218018, + "learning_rate": 9.966973684210526e-05, + "loss": 0.6786, + "step": 21265 + }, + { + "epoch": 1.1908388397356926, + "grad_norm": 1.4894185066223145, + "learning_rate": 9.966947368421053e-05, + "loss": 0.4221, + "step": 21266 + }, + { + "epoch": 1.1908948370478216, + "grad_norm": 1.205344319343567, + "learning_rate": 9.966921052631579e-05, + "loss": 0.4327, + "step": 21267 + }, + { + "epoch": 1.1909508343599506, + "grad_norm": 1.4172561168670654, + "learning_rate": 9.966894736842107e-05, + "loss": 0.4166, + "step": 21268 + }, + { + "epoch": 1.1910068316720797, + "grad_norm": 1.1164103746414185, + "learning_rate": 9.966868421052633e-05, + "loss": 0.316, + "step": 21269 + }, + { + "epoch": 1.1910628289842087, + "grad_norm": 1.3036106824874878, + "learning_rate": 9.966842105263158e-05, + "loss": 0.4415, + "step": 21270 + }, + { + "epoch": 1.1911188262963377, + "grad_norm": 1.145293951034546, + "learning_rate": 9.966815789473684e-05, + "loss": 0.335, + "step": 21271 + }, + { + "epoch": 1.1911748236084667, + "grad_norm": 1.4188669919967651, + "learning_rate": 9.96678947368421e-05, + "loss": 0.5391, + "step": 21272 + }, + { + "epoch": 1.1912308209205957, + "grad_norm": 1.3428798913955688, + "learning_rate": 9.966763157894738e-05, + "loss": 0.4446, + "step": 21273 + }, + { + "epoch": 1.1912868182327248, + "grad_norm": 1.8040522336959839, + "learning_rate": 9.966736842105264e-05, + "loss": 0.5103, + "step": 21274 + }, + { + "epoch": 1.1913428155448538, + "grad_norm": 
1.2655612230300903, + "learning_rate": 9.96671052631579e-05, + "loss": 0.4005, + "step": 21275 + }, + { + "epoch": 1.1913988128569828, + "grad_norm": 1.1684056520462036, + "learning_rate": 9.966684210526316e-05, + "loss": 0.2977, + "step": 21276 + }, + { + "epoch": 1.1914548101691118, + "grad_norm": 1.3208248615264893, + "learning_rate": 9.966657894736843e-05, + "loss": 0.5333, + "step": 21277 + }, + { + "epoch": 1.1915108074812408, + "grad_norm": 1.7194039821624756, + "learning_rate": 9.966631578947369e-05, + "loss": 0.4584, + "step": 21278 + }, + { + "epoch": 1.1915668047933698, + "grad_norm": 1.319769263267517, + "learning_rate": 9.966605263157895e-05, + "loss": 0.5436, + "step": 21279 + }, + { + "epoch": 1.1916228021054989, + "grad_norm": 1.57979416847229, + "learning_rate": 9.966578947368421e-05, + "loss": 0.3921, + "step": 21280 + }, + { + "epoch": 1.1916787994176279, + "grad_norm": 1.232108235359192, + "learning_rate": 9.966552631578948e-05, + "loss": 0.3542, + "step": 21281 + }, + { + "epoch": 1.191734796729757, + "grad_norm": 1.295562505722046, + "learning_rate": 9.966526315789474e-05, + "loss": 0.4108, + "step": 21282 + }, + { + "epoch": 1.191790794041886, + "grad_norm": 1.457898736000061, + "learning_rate": 9.9665e-05, + "loss": 0.4532, + "step": 21283 + }, + { + "epoch": 1.191846791354015, + "grad_norm": 1.6455161571502686, + "learning_rate": 9.966473684210526e-05, + "loss": 0.5247, + "step": 21284 + }, + { + "epoch": 1.191902788666144, + "grad_norm": 1.4352648258209229, + "learning_rate": 9.966447368421054e-05, + "loss": 0.4688, + "step": 21285 + }, + { + "epoch": 1.191958785978273, + "grad_norm": 42.5954475402832, + "learning_rate": 9.96642105263158e-05, + "loss": 0.5555, + "step": 21286 + }, + { + "epoch": 1.192014783290402, + "grad_norm": 1.550161361694336, + "learning_rate": 9.966394736842107e-05, + "loss": 0.493, + "step": 21287 + }, + { + "epoch": 1.192070780602531, + "grad_norm": 1.437830924987793, + "learning_rate": 9.966368421052631e-05, + 
"loss": 0.4395, + "step": 21288 + }, + { + "epoch": 1.19212677791466, + "grad_norm": 1.3734545707702637, + "learning_rate": 9.966342105263159e-05, + "loss": 0.432, + "step": 21289 + }, + { + "epoch": 1.192182775226789, + "grad_norm": 1.2367222309112549, + "learning_rate": 9.966315789473685e-05, + "loss": 0.3989, + "step": 21290 + }, + { + "epoch": 1.192238772538918, + "grad_norm": 1.257411003112793, + "learning_rate": 9.966289473684211e-05, + "loss": 0.4927, + "step": 21291 + }, + { + "epoch": 1.192294769851047, + "grad_norm": 1.535960078239441, + "learning_rate": 9.966263157894738e-05, + "loss": 0.3534, + "step": 21292 + }, + { + "epoch": 1.1923507671631761, + "grad_norm": 1.664744257926941, + "learning_rate": 9.966236842105263e-05, + "loss": 0.4783, + "step": 21293 + }, + { + "epoch": 1.1924067644753051, + "grad_norm": 1.3406223058700562, + "learning_rate": 9.96621052631579e-05, + "loss": 0.5224, + "step": 21294 + }, + { + "epoch": 1.1924627617874342, + "grad_norm": 1.4745821952819824, + "learning_rate": 9.966184210526316e-05, + "loss": 0.3627, + "step": 21295 + }, + { + "epoch": 1.1925187590995632, + "grad_norm": 1.4468576908111572, + "learning_rate": 9.966157894736843e-05, + "loss": 0.4381, + "step": 21296 + }, + { + "epoch": 1.1925747564116922, + "grad_norm": 1.48208749294281, + "learning_rate": 9.966131578947369e-05, + "loss": 0.4918, + "step": 21297 + }, + { + "epoch": 1.1926307537238212, + "grad_norm": 1.2047662734985352, + "learning_rate": 9.966105263157895e-05, + "loss": 0.5098, + "step": 21298 + }, + { + "epoch": 1.1926867510359502, + "grad_norm": 1.2078361511230469, + "learning_rate": 9.966078947368421e-05, + "loss": 0.4489, + "step": 21299 + }, + { + "epoch": 1.1927427483480793, + "grad_norm": 1.2705317735671997, + "learning_rate": 9.966052631578949e-05, + "loss": 0.3713, + "step": 21300 + }, + { + "epoch": 1.1927987456602083, + "grad_norm": 1.3581575155258179, + "learning_rate": 9.966026315789474e-05, + "loss": 0.376, + "step": 21301 + }, + { + 
"epoch": 1.1928547429723373, + "grad_norm": 1.405099630355835, + "learning_rate": 9.966e-05, + "loss": 0.4952, + "step": 21302 + }, + { + "epoch": 1.1929107402844663, + "grad_norm": 1.1661505699157715, + "learning_rate": 9.965973684210526e-05, + "loss": 0.3296, + "step": 21303 + }, + { + "epoch": 1.1929667375965953, + "grad_norm": 1.4038792848587036, + "learning_rate": 9.965947368421054e-05, + "loss": 0.3139, + "step": 21304 + }, + { + "epoch": 1.1930227349087243, + "grad_norm": 1.3428869247436523, + "learning_rate": 9.96592105263158e-05, + "loss": 0.4924, + "step": 21305 + }, + { + "epoch": 1.1930787322208534, + "grad_norm": 1.3574388027191162, + "learning_rate": 9.965894736842106e-05, + "loss": 0.5391, + "step": 21306 + }, + { + "epoch": 1.1931347295329824, + "grad_norm": 1.272050142288208, + "learning_rate": 9.965868421052632e-05, + "loss": 0.401, + "step": 21307 + }, + { + "epoch": 1.1931907268451114, + "grad_norm": 1.4629734754562378, + "learning_rate": 9.965842105263158e-05, + "loss": 0.4211, + "step": 21308 + }, + { + "epoch": 1.1932467241572404, + "grad_norm": 1.3369213342666626, + "learning_rate": 9.965815789473685e-05, + "loss": 0.3077, + "step": 21309 + }, + { + "epoch": 1.1933027214693694, + "grad_norm": 1.5254815816879272, + "learning_rate": 9.965789473684211e-05, + "loss": 0.6437, + "step": 21310 + }, + { + "epoch": 1.1933587187814985, + "grad_norm": 1.391893744468689, + "learning_rate": 9.965763157894737e-05, + "loss": 0.4797, + "step": 21311 + }, + { + "epoch": 1.1934147160936275, + "grad_norm": 1.3041632175445557, + "learning_rate": 9.965736842105263e-05, + "loss": 0.4406, + "step": 21312 + }, + { + "epoch": 1.1934707134057565, + "grad_norm": 1.2722002267837524, + "learning_rate": 9.96571052631579e-05, + "loss": 0.3874, + "step": 21313 + }, + { + "epoch": 1.1935267107178855, + "grad_norm": 1.2253620624542236, + "learning_rate": 9.965684210526316e-05, + "loss": 0.4311, + "step": 21314 + }, + { + "epoch": 1.1935827080300145, + "grad_norm": 
1.3320913314819336, + "learning_rate": 9.965657894736844e-05, + "loss": 0.3835, + "step": 21315 + }, + { + "epoch": 1.1936387053421436, + "grad_norm": 1.3502252101898193, + "learning_rate": 9.965631578947368e-05, + "loss": 0.4198, + "step": 21316 + }, + { + "epoch": 1.1936947026542726, + "grad_norm": 1.4054888486862183, + "learning_rate": 9.965605263157895e-05, + "loss": 0.4118, + "step": 21317 + }, + { + "epoch": 1.1937506999664016, + "grad_norm": 1.4755635261535645, + "learning_rate": 9.965578947368421e-05, + "loss": 0.3879, + "step": 21318 + }, + { + "epoch": 1.1938066972785306, + "grad_norm": 1.1939069032669067, + "learning_rate": 9.965552631578949e-05, + "loss": 0.4434, + "step": 21319 + }, + { + "epoch": 1.1938626945906596, + "grad_norm": 1.3770779371261597, + "learning_rate": 9.965526315789473e-05, + "loss": 0.4295, + "step": 21320 + }, + { + "epoch": 1.1939186919027887, + "grad_norm": 1.2577886581420898, + "learning_rate": 9.965500000000001e-05, + "loss": 0.3482, + "step": 21321 + }, + { + "epoch": 1.1939746892149177, + "grad_norm": 1.3705942630767822, + "learning_rate": 9.965473684210527e-05, + "loss": 0.4651, + "step": 21322 + }, + { + "epoch": 1.1940306865270467, + "grad_norm": 1.2908939123153687, + "learning_rate": 9.965447368421054e-05, + "loss": 0.3609, + "step": 21323 + }, + { + "epoch": 1.1940866838391757, + "grad_norm": 1.2212295532226562, + "learning_rate": 9.96542105263158e-05, + "loss": 0.4819, + "step": 21324 + }, + { + "epoch": 1.1941426811513047, + "grad_norm": 2.922239303588867, + "learning_rate": 9.965394736842105e-05, + "loss": 0.3876, + "step": 21325 + }, + { + "epoch": 1.1941986784634337, + "grad_norm": 1.2339801788330078, + "learning_rate": 9.965368421052632e-05, + "loss": 0.3714, + "step": 21326 + }, + { + "epoch": 1.1942546757755628, + "grad_norm": 1.2695056200027466, + "learning_rate": 9.965342105263158e-05, + "loss": 0.4169, + "step": 21327 + }, + { + "epoch": 1.1943106730876918, + "grad_norm": 1.566595196723938, + "learning_rate": 
9.965315789473685e-05, + "loss": 0.4286, + "step": 21328 + }, + { + "epoch": 1.1943666703998208, + "grad_norm": 1.6339783668518066, + "learning_rate": 9.965289473684211e-05, + "loss": 0.5141, + "step": 21329 + }, + { + "epoch": 1.1944226677119498, + "grad_norm": 1.6444056034088135, + "learning_rate": 9.965263157894737e-05, + "loss": 0.4442, + "step": 21330 + }, + { + "epoch": 1.1944786650240788, + "grad_norm": 9.891744613647461, + "learning_rate": 9.965236842105263e-05, + "loss": 0.6255, + "step": 21331 + }, + { + "epoch": 1.1945346623362079, + "grad_norm": 1.4509602785110474, + "learning_rate": 9.96521052631579e-05, + "loss": 0.5928, + "step": 21332 + }, + { + "epoch": 1.1945906596483369, + "grad_norm": 1.338374376296997, + "learning_rate": 9.965184210526316e-05, + "loss": 0.4418, + "step": 21333 + }, + { + "epoch": 1.194646656960466, + "grad_norm": 2.459693670272827, + "learning_rate": 9.965157894736842e-05, + "loss": 0.4032, + "step": 21334 + }, + { + "epoch": 1.194702654272595, + "grad_norm": 1.3252931833267212, + "learning_rate": 9.965131578947368e-05, + "loss": 0.3934, + "step": 21335 + }, + { + "epoch": 1.194758651584724, + "grad_norm": 1.5309665203094482, + "learning_rate": 9.965105263157896e-05, + "loss": 0.5197, + "step": 21336 + }, + { + "epoch": 1.194814648896853, + "grad_norm": 1.3036658763885498, + "learning_rate": 9.965078947368422e-05, + "loss": 0.4037, + "step": 21337 + }, + { + "epoch": 1.194870646208982, + "grad_norm": 1.296344518661499, + "learning_rate": 9.965052631578948e-05, + "loss": 0.5048, + "step": 21338 + }, + { + "epoch": 1.194926643521111, + "grad_norm": 1.0616509914398193, + "learning_rate": 9.965026315789474e-05, + "loss": 0.339, + "step": 21339 + }, + { + "epoch": 1.19498264083324, + "grad_norm": 1.7376258373260498, + "learning_rate": 9.965000000000001e-05, + "loss": 0.5438, + "step": 21340 + }, + { + "epoch": 1.195038638145369, + "grad_norm": 1.2771480083465576, + "learning_rate": 9.964973684210527e-05, + "loss": 0.4788, + "step": 
21341 + }, + { + "epoch": 1.195094635457498, + "grad_norm": 1.6966956853866577, + "learning_rate": 9.964947368421053e-05, + "loss": 0.477, + "step": 21342 + }, + { + "epoch": 1.195150632769627, + "grad_norm": 1.2666939496994019, + "learning_rate": 9.964921052631579e-05, + "loss": 0.6467, + "step": 21343 + }, + { + "epoch": 1.195206630081756, + "grad_norm": 1.2245177030563354, + "learning_rate": 9.964894736842105e-05, + "loss": 0.4658, + "step": 21344 + }, + { + "epoch": 1.1952626273938851, + "grad_norm": 1.3075501918792725, + "learning_rate": 9.964868421052632e-05, + "loss": 0.4255, + "step": 21345 + }, + { + "epoch": 1.1953186247060141, + "grad_norm": 1.3158923387527466, + "learning_rate": 9.964842105263158e-05, + "loss": 0.4028, + "step": 21346 + }, + { + "epoch": 1.1953746220181432, + "grad_norm": 1.310836911201477, + "learning_rate": 9.964815789473686e-05, + "loss": 0.4359, + "step": 21347 + }, + { + "epoch": 1.1954306193302722, + "grad_norm": 1.4936611652374268, + "learning_rate": 9.96478947368421e-05, + "loss": 0.5623, + "step": 21348 + }, + { + "epoch": 1.1954866166424012, + "grad_norm": 1.1484125852584839, + "learning_rate": 9.964763157894737e-05, + "loss": 0.3725, + "step": 21349 + }, + { + "epoch": 1.1955426139545302, + "grad_norm": 1.3295791149139404, + "learning_rate": 9.964736842105263e-05, + "loss": 0.5219, + "step": 21350 + }, + { + "epoch": 1.1955986112666592, + "grad_norm": 1.4058802127838135, + "learning_rate": 9.964710526315791e-05, + "loss": 0.4106, + "step": 21351 + }, + { + "epoch": 1.1956546085787882, + "grad_norm": 1.1733869314193726, + "learning_rate": 9.964684210526317e-05, + "loss": 0.3727, + "step": 21352 + }, + { + "epoch": 1.1957106058909173, + "grad_norm": 1.3899941444396973, + "learning_rate": 9.964657894736843e-05, + "loss": 0.4666, + "step": 21353 + }, + { + "epoch": 1.1957666032030463, + "grad_norm": 1.5182164907455444, + "learning_rate": 9.964631578947369e-05, + "loss": 0.6035, + "step": 21354 + }, + { + "epoch": 
1.1958226005151753, + "grad_norm": 1.3368122577667236, + "learning_rate": 9.964605263157896e-05, + "loss": 0.4835, + "step": 21355 + }, + { + "epoch": 1.1958785978273043, + "grad_norm": 1.2818577289581299, + "learning_rate": 9.964578947368422e-05, + "loss": 0.5112, + "step": 21356 + }, + { + "epoch": 1.1959345951394333, + "grad_norm": 1.1480575799942017, + "learning_rate": 9.964552631578948e-05, + "loss": 0.4568, + "step": 21357 + }, + { + "epoch": 1.1959905924515624, + "grad_norm": 1.4644219875335693, + "learning_rate": 9.964526315789474e-05, + "loss": 0.4418, + "step": 21358 + }, + { + "epoch": 1.1960465897636914, + "grad_norm": 1.3618824481964111, + "learning_rate": 9.9645e-05, + "loss": 0.4163, + "step": 21359 + }, + { + "epoch": 1.1961025870758204, + "grad_norm": 1.3760063648223877, + "learning_rate": 9.964473684210527e-05, + "loss": 0.4738, + "step": 21360 + }, + { + "epoch": 1.1961585843879494, + "grad_norm": 1.3598028421401978, + "learning_rate": 9.964447368421053e-05, + "loss": 0.3665, + "step": 21361 + }, + { + "epoch": 1.1962145817000784, + "grad_norm": 1.2918466329574585, + "learning_rate": 9.964421052631579e-05, + "loss": 0.4522, + "step": 21362 + }, + { + "epoch": 1.1962705790122075, + "grad_norm": 1.4884247779846191, + "learning_rate": 9.964394736842105e-05, + "loss": 0.4283, + "step": 21363 + }, + { + "epoch": 1.1963265763243365, + "grad_norm": 1.1714822053909302, + "learning_rate": 9.964368421052632e-05, + "loss": 0.4868, + "step": 21364 + }, + { + "epoch": 1.1963825736364655, + "grad_norm": 1.141059160232544, + "learning_rate": 9.964342105263158e-05, + "loss": 0.3575, + "step": 21365 + }, + { + "epoch": 1.1964385709485945, + "grad_norm": 1.2665141820907593, + "learning_rate": 9.964315789473684e-05, + "loss": 0.4476, + "step": 21366 + }, + { + "epoch": 1.1964945682607235, + "grad_norm": 1.3462717533111572, + "learning_rate": 9.96428947368421e-05, + "loss": 0.3294, + "step": 21367 + }, + { + "epoch": 1.1965505655728526, + "grad_norm": 
1.5221977233886719, + "learning_rate": 9.964263157894738e-05, + "loss": 0.4968, + "step": 21368 + }, + { + "epoch": 1.1966065628849816, + "grad_norm": 2.3991973400115967, + "learning_rate": 9.964236842105264e-05, + "loss": 0.555, + "step": 21369 + }, + { + "epoch": 1.1966625601971106, + "grad_norm": 1.454770565032959, + "learning_rate": 9.96421052631579e-05, + "loss": 0.4786, + "step": 21370 + }, + { + "epoch": 1.1967185575092396, + "grad_norm": 1.3468774557113647, + "learning_rate": 9.964184210526316e-05, + "loss": 0.4394, + "step": 21371 + }, + { + "epoch": 1.1967745548213686, + "grad_norm": 1.3988748788833618, + "learning_rate": 9.964157894736843e-05, + "loss": 0.4111, + "step": 21372 + }, + { + "epoch": 1.1968305521334976, + "grad_norm": 1.39087975025177, + "learning_rate": 9.964131578947369e-05, + "loss": 0.4374, + "step": 21373 + }, + { + "epoch": 1.1968865494456267, + "grad_norm": 1.1370114088058472, + "learning_rate": 9.964105263157896e-05, + "loss": 0.4175, + "step": 21374 + }, + { + "epoch": 1.1969425467577557, + "grad_norm": 1.2697925567626953, + "learning_rate": 9.964078947368421e-05, + "loss": 0.3974, + "step": 21375 + }, + { + "epoch": 1.1969985440698847, + "grad_norm": 1.4969922304153442, + "learning_rate": 9.964052631578947e-05, + "loss": 0.6954, + "step": 21376 + }, + { + "epoch": 1.1970545413820137, + "grad_norm": 1.474172592163086, + "learning_rate": 9.964026315789474e-05, + "loss": 0.4591, + "step": 21377 + }, + { + "epoch": 1.1971105386941427, + "grad_norm": 1.2626137733459473, + "learning_rate": 9.964e-05, + "loss": 0.4082, + "step": 21378 + }, + { + "epoch": 1.1971665360062718, + "grad_norm": 1.0900942087173462, + "learning_rate": 9.963973684210527e-05, + "loss": 0.3125, + "step": 21379 + }, + { + "epoch": 1.1972225333184008, + "grad_norm": 1.2060264348983765, + "learning_rate": 9.963947368421052e-05, + "loss": 0.561, + "step": 21380 + }, + { + "epoch": 1.1972785306305298, + "grad_norm": 1.5008060932159424, + "learning_rate": 
9.96392105263158e-05, + "loss": 0.4794, + "step": 21381 + }, + { + "epoch": 1.1973345279426588, + "grad_norm": 1.6069817543029785, + "learning_rate": 9.963894736842105e-05, + "loss": 0.7367, + "step": 21382 + }, + { + "epoch": 1.1973905252547878, + "grad_norm": 1.6059280633926392, + "learning_rate": 9.963868421052633e-05, + "loss": 0.4607, + "step": 21383 + }, + { + "epoch": 1.1974465225669169, + "grad_norm": 1.3171371221542358, + "learning_rate": 9.963842105263159e-05, + "loss": 0.4136, + "step": 21384 + }, + { + "epoch": 1.1975025198790459, + "grad_norm": 1.3216816186904907, + "learning_rate": 9.963815789473685e-05, + "loss": 0.3876, + "step": 21385 + }, + { + "epoch": 1.197558517191175, + "grad_norm": 1.4280669689178467, + "learning_rate": 9.96378947368421e-05, + "loss": 0.3501, + "step": 21386 + }, + { + "epoch": 1.197614514503304, + "grad_norm": 1.2969985008239746, + "learning_rate": 9.963763157894738e-05, + "loss": 0.4646, + "step": 21387 + }, + { + "epoch": 1.197670511815433, + "grad_norm": 1.5294933319091797, + "learning_rate": 9.963736842105264e-05, + "loss": 0.4396, + "step": 21388 + }, + { + "epoch": 1.197726509127562, + "grad_norm": 1.4797515869140625, + "learning_rate": 9.96371052631579e-05, + "loss": 0.5789, + "step": 21389 + }, + { + "epoch": 1.197782506439691, + "grad_norm": 1.872753381729126, + "learning_rate": 9.963684210526316e-05, + "loss": 0.4848, + "step": 21390 + }, + { + "epoch": 1.19783850375182, + "grad_norm": 1.5574126243591309, + "learning_rate": 9.963657894736843e-05, + "loss": 0.4941, + "step": 21391 + }, + { + "epoch": 1.197894501063949, + "grad_norm": 12.988001823425293, + "learning_rate": 9.963631578947369e-05, + "loss": 0.5403, + "step": 21392 + }, + { + "epoch": 1.197950498376078, + "grad_norm": 1.631577730178833, + "learning_rate": 9.963605263157895e-05, + "loss": 0.4207, + "step": 21393 + }, + { + "epoch": 1.198006495688207, + "grad_norm": 1.3914836645126343, + "learning_rate": 9.963578947368421e-05, + "loss": 0.4603, + "step": 
21394 + }, + { + "epoch": 1.198062493000336, + "grad_norm": 1.272395133972168, + "learning_rate": 9.963552631578947e-05, + "loss": 0.3924, + "step": 21395 + }, + { + "epoch": 1.198118490312465, + "grad_norm": 1.0716838836669922, + "learning_rate": 9.963526315789474e-05, + "loss": 0.3835, + "step": 21396 + }, + { + "epoch": 1.198174487624594, + "grad_norm": 1.6219170093536377, + "learning_rate": 9.9635e-05, + "loss": 0.6136, + "step": 21397 + }, + { + "epoch": 1.1982304849367231, + "grad_norm": 1.3890399932861328, + "learning_rate": 9.963473684210526e-05, + "loss": 0.4818, + "step": 21398 + }, + { + "epoch": 1.1982864822488521, + "grad_norm": 1.3197038173675537, + "learning_rate": 9.963447368421052e-05, + "loss": 0.3899, + "step": 21399 + }, + { + "epoch": 1.1983424795609812, + "grad_norm": 1.3574610948562622, + "learning_rate": 9.96342105263158e-05, + "loss": 0.4244, + "step": 21400 + }, + { + "epoch": 1.1983984768731102, + "grad_norm": 1.9468427896499634, + "learning_rate": 9.963394736842106e-05, + "loss": 0.4961, + "step": 21401 + }, + { + "epoch": 1.1984544741852392, + "grad_norm": 1.6920266151428223, + "learning_rate": 9.963368421052633e-05, + "loss": 0.446, + "step": 21402 + }, + { + "epoch": 1.198510471497368, + "grad_norm": 1.560967206954956, + "learning_rate": 9.963342105263158e-05, + "loss": 0.4072, + "step": 21403 + }, + { + "epoch": 1.198566468809497, + "grad_norm": 1.432548999786377, + "learning_rate": 9.963315789473685e-05, + "loss": 0.4393, + "step": 21404 + }, + { + "epoch": 1.198622466121626, + "grad_norm": 1.4270511865615845, + "learning_rate": 9.963289473684211e-05, + "loss": 0.3981, + "step": 21405 + }, + { + "epoch": 1.198678463433755, + "grad_norm": 1.488874912261963, + "learning_rate": 9.963263157894738e-05, + "loss": 0.4374, + "step": 21406 + }, + { + "epoch": 1.198734460745884, + "grad_norm": 1.3100860118865967, + "learning_rate": 9.963236842105264e-05, + "loss": 0.5362, + "step": 21407 + }, + { + "epoch": 1.198790458058013, + "grad_norm": 
1.2270326614379883, + "learning_rate": 9.96321052631579e-05, + "loss": 0.3289, + "step": 21408 + }, + { + "epoch": 1.1988464553701421, + "grad_norm": 1.1580663919448853, + "learning_rate": 9.963184210526316e-05, + "loss": 0.4817, + "step": 21409 + }, + { + "epoch": 1.1989024526822711, + "grad_norm": 1.5381953716278076, + "learning_rate": 9.963157894736843e-05, + "loss": 0.4355, + "step": 21410 + }, + { + "epoch": 1.1989584499944002, + "grad_norm": 1.052895188331604, + "learning_rate": 9.96313157894737e-05, + "loss": 0.3743, + "step": 21411 + }, + { + "epoch": 1.1990144473065292, + "grad_norm": 1.3846203088760376, + "learning_rate": 9.963105263157894e-05, + "loss": 0.4153, + "step": 21412 + }, + { + "epoch": 1.1990704446186582, + "grad_norm": 1.3448079824447632, + "learning_rate": 9.963078947368421e-05, + "loss": 0.3799, + "step": 21413 + }, + { + "epoch": 1.1991264419307872, + "grad_norm": 1.3910526037216187, + "learning_rate": 9.963052631578947e-05, + "loss": 0.3322, + "step": 21414 + }, + { + "epoch": 1.1991824392429162, + "grad_norm": 1.280060052871704, + "learning_rate": 9.963026315789475e-05, + "loss": 0.4396, + "step": 21415 + }, + { + "epoch": 1.1992384365550453, + "grad_norm": 1.3302288055419922, + "learning_rate": 9.963e-05, + "loss": 0.3988, + "step": 21416 + }, + { + "epoch": 1.1992944338671743, + "grad_norm": 1.322451114654541, + "learning_rate": 9.962973684210527e-05, + "loss": 0.3296, + "step": 21417 + }, + { + "epoch": 1.1993504311793033, + "grad_norm": 1.6919171810150146, + "learning_rate": 9.962947368421053e-05, + "loss": 0.4415, + "step": 21418 + }, + { + "epoch": 1.1994064284914323, + "grad_norm": 1.2231345176696777, + "learning_rate": 9.96292105263158e-05, + "loss": 0.3932, + "step": 21419 + }, + { + "epoch": 1.1994624258035613, + "grad_norm": 1.4488109350204468, + "learning_rate": 9.962894736842106e-05, + "loss": 0.4924, + "step": 21420 + }, + { + "epoch": 1.1995184231156903, + "grad_norm": 1.2579987049102783, + "learning_rate": 
9.962868421052632e-05, + "loss": 0.4286, + "step": 21421 + }, + { + "epoch": 1.1995744204278194, + "grad_norm": 1.4669923782348633, + "learning_rate": 9.962842105263158e-05, + "loss": 0.4557, + "step": 21422 + }, + { + "epoch": 1.1996304177399484, + "grad_norm": 1.3621535301208496, + "learning_rate": 9.962815789473685e-05, + "loss": 0.454, + "step": 21423 + }, + { + "epoch": 1.1996864150520774, + "grad_norm": 1.6935299634933472, + "learning_rate": 9.962789473684211e-05, + "loss": 0.4028, + "step": 21424 + }, + { + "epoch": 1.1997424123642064, + "grad_norm": 1.788367509841919, + "learning_rate": 9.962763157894737e-05, + "loss": 0.4214, + "step": 21425 + }, + { + "epoch": 1.1997984096763354, + "grad_norm": 1.615911841392517, + "learning_rate": 9.962736842105263e-05, + "loss": 0.4857, + "step": 21426 + }, + { + "epoch": 1.1998544069884645, + "grad_norm": 1.5796549320220947, + "learning_rate": 9.96271052631579e-05, + "loss": 0.4454, + "step": 21427 + }, + { + "epoch": 1.1999104043005935, + "grad_norm": 1.3333300352096558, + "learning_rate": 9.962684210526316e-05, + "loss": 0.4156, + "step": 21428 + }, + { + "epoch": 1.1999664016127225, + "grad_norm": 1.4734907150268555, + "learning_rate": 9.962657894736842e-05, + "loss": 0.5741, + "step": 21429 + }, + { + "epoch": 1.2000223989248515, + "grad_norm": 1.3020298480987549, + "learning_rate": 9.962631578947368e-05, + "loss": 0.4521, + "step": 21430 + }, + { + "epoch": 1.2000783962369805, + "grad_norm": 1.2103228569030762, + "learning_rate": 9.962605263157894e-05, + "loss": 0.3992, + "step": 21431 + }, + { + "epoch": 1.2001343935491096, + "grad_norm": 1.1992607116699219, + "learning_rate": 9.962578947368422e-05, + "loss": 0.3216, + "step": 21432 + }, + { + "epoch": 1.2001903908612386, + "grad_norm": 1.7995073795318604, + "learning_rate": 9.962552631578948e-05, + "loss": 0.5035, + "step": 21433 + }, + { + "epoch": 1.2002463881733676, + "grad_norm": 1.3206636905670166, + "learning_rate": 9.962526315789475e-05, + "loss": 0.3423, 
+ "step": 21434 + }, + { + "epoch": 1.2003023854854966, + "grad_norm": 1.1745672225952148, + "learning_rate": 9.9625e-05, + "loss": 0.4342, + "step": 21435 + }, + { + "epoch": 1.2003583827976256, + "grad_norm": 1.5004053115844727, + "learning_rate": 9.962473684210527e-05, + "loss": 0.5099, + "step": 21436 + }, + { + "epoch": 1.2004143801097547, + "grad_norm": 1.3555021286010742, + "learning_rate": 9.962447368421053e-05, + "loss": 0.4008, + "step": 21437 + }, + { + "epoch": 1.2004703774218837, + "grad_norm": 1.3807787895202637, + "learning_rate": 9.96242105263158e-05, + "loss": 0.4379, + "step": 21438 + }, + { + "epoch": 1.2005263747340127, + "grad_norm": 1.484658122062683, + "learning_rate": 9.962394736842106e-05, + "loss": 0.5916, + "step": 21439 + }, + { + "epoch": 1.2005823720461417, + "grad_norm": 1.4990845918655396, + "learning_rate": 9.962368421052632e-05, + "loss": 0.3455, + "step": 21440 + }, + { + "epoch": 1.2006383693582707, + "grad_norm": 1.7396429777145386, + "learning_rate": 9.962342105263158e-05, + "loss": 0.4284, + "step": 21441 + }, + { + "epoch": 1.2006943666703997, + "grad_norm": 1.1902375221252441, + "learning_rate": 9.962315789473685e-05, + "loss": 0.3986, + "step": 21442 + }, + { + "epoch": 1.2007503639825288, + "grad_norm": 1.521207571029663, + "learning_rate": 9.962289473684211e-05, + "loss": 0.4777, + "step": 21443 + }, + { + "epoch": 1.2008063612946578, + "grad_norm": 1.5875146389007568, + "learning_rate": 9.962263157894737e-05, + "loss": 0.6852, + "step": 21444 + }, + { + "epoch": 1.2008623586067868, + "grad_norm": 1.0622169971466064, + "learning_rate": 9.962236842105263e-05, + "loss": 0.3953, + "step": 21445 + }, + { + "epoch": 1.2009183559189158, + "grad_norm": 1.5218700170516968, + "learning_rate": 9.96221052631579e-05, + "loss": 0.5035, + "step": 21446 + }, + { + "epoch": 1.2009743532310448, + "grad_norm": 1.2983635663986206, + "learning_rate": 9.962184210526317e-05, + "loss": 0.3688, + "step": 21447 + }, + { + "epoch": 
1.2010303505431739, + "grad_norm": 1.4676125049591064, + "learning_rate": 9.962157894736843e-05, + "loss": 0.4088, + "step": 21448 + }, + { + "epoch": 1.2010863478553029, + "grad_norm": 1.672722578048706, + "learning_rate": 9.962131578947369e-05, + "loss": 0.5554, + "step": 21449 + }, + { + "epoch": 1.201142345167432, + "grad_norm": 1.4020755290985107, + "learning_rate": 9.962105263157895e-05, + "loss": 0.3693, + "step": 21450 + }, + { + "epoch": 1.201198342479561, + "grad_norm": 2.088850498199463, + "learning_rate": 9.962078947368422e-05, + "loss": 0.3458, + "step": 21451 + }, + { + "epoch": 1.20125433979169, + "grad_norm": 1.5886541604995728, + "learning_rate": 9.962052631578948e-05, + "loss": 0.5027, + "step": 21452 + }, + { + "epoch": 1.201310337103819, + "grad_norm": 1.2782281637191772, + "learning_rate": 9.962026315789474e-05, + "loss": 0.4541, + "step": 21453 + }, + { + "epoch": 1.201366334415948, + "grad_norm": 1.2737568616867065, + "learning_rate": 9.962e-05, + "loss": 0.4547, + "step": 21454 + }, + { + "epoch": 1.201422331728077, + "grad_norm": 1.5377733707427979, + "learning_rate": 9.961973684210527e-05, + "loss": 0.5331, + "step": 21455 + }, + { + "epoch": 1.201478329040206, + "grad_norm": 1.5774884223937988, + "learning_rate": 9.961947368421053e-05, + "loss": 0.4701, + "step": 21456 + }, + { + "epoch": 1.201534326352335, + "grad_norm": 1.1816591024398804, + "learning_rate": 9.96192105263158e-05, + "loss": 0.3861, + "step": 21457 + }, + { + "epoch": 1.201590323664464, + "grad_norm": 1.193567156791687, + "learning_rate": 9.961894736842105e-05, + "loss": 0.408, + "step": 21458 + }, + { + "epoch": 1.201646320976593, + "grad_norm": 1.3032221794128418, + "learning_rate": 9.961868421052632e-05, + "loss": 0.4111, + "step": 21459 + }, + { + "epoch": 1.201702318288722, + "grad_norm": 1.4719740152359009, + "learning_rate": 9.961842105263158e-05, + "loss": 0.5088, + "step": 21460 + }, + { + "epoch": 1.2017583156008511, + "grad_norm": 1.2805112600326538, + 
"learning_rate": 9.961815789473686e-05, + "loss": 0.381, + "step": 21461 + }, + { + "epoch": 1.2018143129129801, + "grad_norm": 1.557629108428955, + "learning_rate": 9.961789473684212e-05, + "loss": 0.4849, + "step": 21462 + }, + { + "epoch": 1.2018703102251092, + "grad_norm": 1.222686767578125, + "learning_rate": 9.961763157894736e-05, + "loss": 0.4187, + "step": 21463 + }, + { + "epoch": 1.2019263075372382, + "grad_norm": 1.6230337619781494, + "learning_rate": 9.961736842105264e-05, + "loss": 0.5211, + "step": 21464 + }, + { + "epoch": 1.2019823048493672, + "grad_norm": 1.3267515897750854, + "learning_rate": 9.96171052631579e-05, + "loss": 0.4405, + "step": 21465 + }, + { + "epoch": 1.2020383021614962, + "grad_norm": 1.1997919082641602, + "learning_rate": 9.961684210526317e-05, + "loss": 0.307, + "step": 21466 + }, + { + "epoch": 1.2020942994736252, + "grad_norm": 1.6164220571517944, + "learning_rate": 9.961657894736842e-05, + "loss": 0.4621, + "step": 21467 + }, + { + "epoch": 1.2021502967857542, + "grad_norm": 1.169376254081726, + "learning_rate": 9.961631578947369e-05, + "loss": 0.366, + "step": 21468 + }, + { + "epoch": 1.2022062940978833, + "grad_norm": 1.418872594833374, + "learning_rate": 9.961605263157895e-05, + "loss": 0.3817, + "step": 21469 + }, + { + "epoch": 1.2022622914100123, + "grad_norm": 1.3538204431533813, + "learning_rate": 9.961578947368422e-05, + "loss": 0.4116, + "step": 21470 + }, + { + "epoch": 1.2023182887221413, + "grad_norm": 1.2791597843170166, + "learning_rate": 9.961552631578948e-05, + "loss": 0.4707, + "step": 21471 + }, + { + "epoch": 1.2023742860342703, + "grad_norm": 1.051046371459961, + "learning_rate": 9.961526315789474e-05, + "loss": 0.3611, + "step": 21472 + }, + { + "epoch": 1.2024302833463993, + "grad_norm": 1.4842638969421387, + "learning_rate": 9.9615e-05, + "loss": 0.4924, + "step": 21473 + }, + { + "epoch": 1.2024862806585284, + "grad_norm": 1.563080072402954, + "learning_rate": 9.961473684210527e-05, + "loss": 0.5495, 
+ "step": 21474 + }, + { + "epoch": 1.2025422779706574, + "grad_norm": 1.3472084999084473, + "learning_rate": 9.961447368421053e-05, + "loss": 0.3709, + "step": 21475 + }, + { + "epoch": 1.2025982752827864, + "grad_norm": 1.3370013236999512, + "learning_rate": 9.96142105263158e-05, + "loss": 0.3595, + "step": 21476 + }, + { + "epoch": 1.2026542725949154, + "grad_norm": 1.3887287378311157, + "learning_rate": 9.961394736842105e-05, + "loss": 0.4461, + "step": 21477 + }, + { + "epoch": 1.2027102699070444, + "grad_norm": 1.4351294040679932, + "learning_rate": 9.961368421052633e-05, + "loss": 0.4641, + "step": 21478 + }, + { + "epoch": 1.2027662672191735, + "grad_norm": 1.1647791862487793, + "learning_rate": 9.961342105263159e-05, + "loss": 0.3386, + "step": 21479 + }, + { + "epoch": 1.2028222645313025, + "grad_norm": 1.3226819038391113, + "learning_rate": 9.961315789473685e-05, + "loss": 0.4429, + "step": 21480 + }, + { + "epoch": 1.2028782618434315, + "grad_norm": 2.2665774822235107, + "learning_rate": 9.96128947368421e-05, + "loss": 0.441, + "step": 21481 + }, + { + "epoch": 1.2029342591555605, + "grad_norm": 1.6629362106323242, + "learning_rate": 9.961263157894737e-05, + "loss": 0.3264, + "step": 21482 + }, + { + "epoch": 1.2029902564676895, + "grad_norm": 1.1915266513824463, + "learning_rate": 9.961236842105264e-05, + "loss": 0.323, + "step": 21483 + }, + { + "epoch": 1.2030462537798186, + "grad_norm": 1.7877452373504639, + "learning_rate": 9.96121052631579e-05, + "loss": 0.4892, + "step": 21484 + }, + { + "epoch": 1.2031022510919476, + "grad_norm": 1.3100731372833252, + "learning_rate": 9.961184210526316e-05, + "loss": 0.4047, + "step": 21485 + }, + { + "epoch": 1.2031582484040766, + "grad_norm": 1.5466324090957642, + "learning_rate": 9.961157894736842e-05, + "loss": 0.4025, + "step": 21486 + }, + { + "epoch": 1.2032142457162056, + "grad_norm": 1.2910239696502686, + "learning_rate": 9.961131578947369e-05, + "loss": 0.4358, + "step": 21487 + }, + { + "epoch": 
1.2032702430283346, + "grad_norm": 1.1227089166641235, + "learning_rate": 9.961105263157895e-05, + "loss": 0.3257, + "step": 21488 + }, + { + "epoch": 1.2033262403404636, + "grad_norm": 3.1409835815429688, + "learning_rate": 9.961078947368422e-05, + "loss": 0.4212, + "step": 21489 + }, + { + "epoch": 1.2033822376525927, + "grad_norm": 1.5216270685195923, + "learning_rate": 9.961052631578947e-05, + "loss": 0.6025, + "step": 21490 + }, + { + "epoch": 1.2034382349647217, + "grad_norm": 1.5169122219085693, + "learning_rate": 9.961026315789474e-05, + "loss": 0.6018, + "step": 21491 + }, + { + "epoch": 1.2034942322768507, + "grad_norm": 1.2840487957000732, + "learning_rate": 9.961e-05, + "loss": 0.3487, + "step": 21492 + }, + { + "epoch": 1.2035502295889797, + "grad_norm": 1.481377363204956, + "learning_rate": 9.960973684210528e-05, + "loss": 0.5206, + "step": 21493 + }, + { + "epoch": 1.2036062269011087, + "grad_norm": 1.334641456604004, + "learning_rate": 9.960947368421054e-05, + "loss": 0.3421, + "step": 21494 + }, + { + "epoch": 1.2036622242132378, + "grad_norm": 1.4304063320159912, + "learning_rate": 9.96092105263158e-05, + "loss": 0.5398, + "step": 21495 + }, + { + "epoch": 1.2037182215253668, + "grad_norm": 1.308811902999878, + "learning_rate": 9.960894736842106e-05, + "loss": 0.4299, + "step": 21496 + }, + { + "epoch": 1.2037742188374958, + "grad_norm": 1.3681529760360718, + "learning_rate": 9.960868421052632e-05, + "loss": 0.5409, + "step": 21497 + }, + { + "epoch": 1.2038302161496248, + "grad_norm": 1.987178921699524, + "learning_rate": 9.960842105263159e-05, + "loss": 0.5736, + "step": 21498 + }, + { + "epoch": 1.2038862134617538, + "grad_norm": 1.3422547578811646, + "learning_rate": 9.960815789473685e-05, + "loss": 0.4158, + "step": 21499 + }, + { + "epoch": 1.2039422107738829, + "grad_norm": 1.2379730939865112, + "learning_rate": 9.960789473684211e-05, + "loss": 0.3255, + "step": 21500 + }, + { + "epoch": 1.2039982080860119, + "grad_norm": 
1.7198050022125244, + "learning_rate": 9.960763157894737e-05, + "loss": 0.561, + "step": 21501 + }, + { + "epoch": 1.204054205398141, + "grad_norm": 1.3510574102401733, + "learning_rate": 9.960736842105264e-05, + "loss": 0.4318, + "step": 21502 + }, + { + "epoch": 1.20411020271027, + "grad_norm": 1.4553757905960083, + "learning_rate": 9.96071052631579e-05, + "loss": 0.5253, + "step": 21503 + }, + { + "epoch": 1.204166200022399, + "grad_norm": 1.3594117164611816, + "learning_rate": 9.960684210526316e-05, + "loss": 0.3898, + "step": 21504 + }, + { + "epoch": 1.204222197334528, + "grad_norm": 1.6740353107452393, + "learning_rate": 9.960657894736842e-05, + "loss": 0.4418, + "step": 21505 + }, + { + "epoch": 1.204278194646657, + "grad_norm": 1.3242052793502808, + "learning_rate": 9.96063157894737e-05, + "loss": 0.4278, + "step": 21506 + }, + { + "epoch": 1.204334191958786, + "grad_norm": 1.1198610067367554, + "learning_rate": 9.960605263157895e-05, + "loss": 0.3511, + "step": 21507 + }, + { + "epoch": 1.204390189270915, + "grad_norm": 1.5841436386108398, + "learning_rate": 9.960578947368421e-05, + "loss": 0.4644, + "step": 21508 + }, + { + "epoch": 1.204446186583044, + "grad_norm": 1.480082392692566, + "learning_rate": 9.960552631578947e-05, + "loss": 0.4577, + "step": 21509 + }, + { + "epoch": 1.204502183895173, + "grad_norm": 1.6126313209533691, + "learning_rate": 9.960526315789475e-05, + "loss": 0.5515, + "step": 21510 + }, + { + "epoch": 1.204558181207302, + "grad_norm": 1.250475287437439, + "learning_rate": 9.9605e-05, + "loss": 0.4454, + "step": 21511 + }, + { + "epoch": 1.204614178519431, + "grad_norm": 1.662462830543518, + "learning_rate": 9.960473684210528e-05, + "loss": 0.5816, + "step": 21512 + }, + { + "epoch": 1.20467017583156, + "grad_norm": 1.361572504043579, + "learning_rate": 9.960447368421053e-05, + "loss": 0.4464, + "step": 21513 + }, + { + "epoch": 1.2047261731436891, + "grad_norm": 1.2813810110092163, + "learning_rate": 9.960421052631579e-05, + 
"loss": 0.4901, + "step": 21514 + }, + { + "epoch": 1.2047821704558181, + "grad_norm": 1.5056607723236084, + "learning_rate": 9.960394736842106e-05, + "loss": 0.5114, + "step": 21515 + }, + { + "epoch": 1.2048381677679472, + "grad_norm": 1.4565441608428955, + "learning_rate": 9.960368421052632e-05, + "loss": 0.5469, + "step": 21516 + }, + { + "epoch": 1.2048941650800762, + "grad_norm": 1.654646635055542, + "learning_rate": 9.960342105263159e-05, + "loss": 0.4382, + "step": 21517 + }, + { + "epoch": 1.2049501623922052, + "grad_norm": 1.1464526653289795, + "learning_rate": 9.960315789473684e-05, + "loss": 0.381, + "step": 21518 + }, + { + "epoch": 1.2050061597043342, + "grad_norm": 2.7664570808410645, + "learning_rate": 9.960289473684211e-05, + "loss": 0.6341, + "step": 21519 + }, + { + "epoch": 1.2050621570164632, + "grad_norm": 1.6704425811767578, + "learning_rate": 9.960263157894737e-05, + "loss": 0.4385, + "step": 21520 + }, + { + "epoch": 1.2051181543285923, + "grad_norm": 1.5679821968078613, + "learning_rate": 9.960236842105264e-05, + "loss": 0.4312, + "step": 21521 + }, + { + "epoch": 1.2051741516407213, + "grad_norm": 1.2296874523162842, + "learning_rate": 9.960210526315789e-05, + "loss": 0.4045, + "step": 21522 + }, + { + "epoch": 1.2052301489528503, + "grad_norm": 1.1888378858566284, + "learning_rate": 9.960184210526316e-05, + "loss": 0.4166, + "step": 21523 + }, + { + "epoch": 1.2052861462649793, + "grad_norm": 1.4093694686889648, + "learning_rate": 9.960157894736842e-05, + "loss": 0.4597, + "step": 21524 + }, + { + "epoch": 1.2053421435771083, + "grad_norm": 1.5005062818527222, + "learning_rate": 9.96013157894737e-05, + "loss": 0.4724, + "step": 21525 + }, + { + "epoch": 1.2053981408892374, + "grad_norm": 1.2124828100204468, + "learning_rate": 9.960105263157896e-05, + "loss": 0.3277, + "step": 21526 + }, + { + "epoch": 1.2054541382013664, + "grad_norm": 1.685762882232666, + "learning_rate": 9.960078947368422e-05, + "loss": 0.3987, + "step": 21527 + }, + { 
+ "epoch": 1.2055101355134954, + "grad_norm": 1.3396176099777222, + "learning_rate": 9.960052631578948e-05, + "loss": 0.511, + "step": 21528 + }, + { + "epoch": 1.2055661328256244, + "grad_norm": 1.2800570726394653, + "learning_rate": 9.960026315789475e-05, + "loss": 0.3879, + "step": 21529 + }, + { + "epoch": 1.2056221301377534, + "grad_norm": 1.1974067687988281, + "learning_rate": 9.960000000000001e-05, + "loss": 0.3899, + "step": 21530 + }, + { + "epoch": 1.2056781274498825, + "grad_norm": 1.2989071607589722, + "learning_rate": 9.959973684210527e-05, + "loss": 0.4172, + "step": 21531 + }, + { + "epoch": 1.2057341247620115, + "grad_norm": 1.431694507598877, + "learning_rate": 9.959947368421053e-05, + "loss": 0.4452, + "step": 21532 + }, + { + "epoch": 1.2057901220741405, + "grad_norm": 1.1873540878295898, + "learning_rate": 9.959921052631579e-05, + "loss": 0.4166, + "step": 21533 + }, + { + "epoch": 1.2058461193862695, + "grad_norm": 1.2419899702072144, + "learning_rate": 9.959894736842106e-05, + "loss": 0.4983, + "step": 21534 + }, + { + "epoch": 1.2059021166983985, + "grad_norm": 1.3938467502593994, + "learning_rate": 9.959868421052632e-05, + "loss": 0.5308, + "step": 21535 + }, + { + "epoch": 1.2059581140105275, + "grad_norm": 1.1994540691375732, + "learning_rate": 9.959842105263158e-05, + "loss": 0.4284, + "step": 21536 + }, + { + "epoch": 1.2060141113226566, + "grad_norm": 1.2388249635696411, + "learning_rate": 9.959815789473684e-05, + "loss": 0.3959, + "step": 21537 + }, + { + "epoch": 1.2060701086347856, + "grad_norm": 1.3360904455184937, + "learning_rate": 9.959789473684211e-05, + "loss": 0.6121, + "step": 21538 + }, + { + "epoch": 1.2061261059469146, + "grad_norm": 1.4434248208999634, + "learning_rate": 9.959763157894737e-05, + "loss": 0.4103, + "step": 21539 + }, + { + "epoch": 1.2061821032590436, + "grad_norm": 1.458721399307251, + "learning_rate": 9.959736842105263e-05, + "loss": 0.444, + "step": 21540 + }, + { + "epoch": 1.2062381005711726, + 
"grad_norm": 1.3926796913146973, + "learning_rate": 9.959710526315789e-05, + "loss": 0.5739, + "step": 21541 + }, + { + "epoch": 1.2062940978833017, + "grad_norm": 1.5073621273040771, + "learning_rate": 9.959684210526317e-05, + "loss": 0.4406, + "step": 21542 + }, + { + "epoch": 1.2063500951954307, + "grad_norm": 1.4399853944778442, + "learning_rate": 9.959657894736843e-05, + "loss": 0.3999, + "step": 21543 + }, + { + "epoch": 1.2064060925075597, + "grad_norm": 1.4274659156799316, + "learning_rate": 9.95963157894737e-05, + "loss": 0.4341, + "step": 21544 + }, + { + "epoch": 1.2064620898196887, + "grad_norm": 1.4583518505096436, + "learning_rate": 9.959605263157895e-05, + "loss": 0.4265, + "step": 21545 + }, + { + "epoch": 1.2065180871318177, + "grad_norm": 1.3247216939926147, + "learning_rate": 9.959578947368422e-05, + "loss": 0.4782, + "step": 21546 + }, + { + "epoch": 1.2065740844439468, + "grad_norm": 1.3564420938491821, + "learning_rate": 9.959552631578948e-05, + "loss": 0.5076, + "step": 21547 + }, + { + "epoch": 1.2066300817560758, + "grad_norm": 1.6099863052368164, + "learning_rate": 9.959526315789475e-05, + "loss": 0.5902, + "step": 21548 + }, + { + "epoch": 1.2066860790682048, + "grad_norm": 1.083328127861023, + "learning_rate": 9.959500000000001e-05, + "loss": 0.3665, + "step": 21549 + }, + { + "epoch": 1.2067420763803338, + "grad_norm": 1.2699973583221436, + "learning_rate": 9.959473684210526e-05, + "loss": 0.4768, + "step": 21550 + }, + { + "epoch": 1.2067980736924628, + "grad_norm": 1.5290131568908691, + "learning_rate": 9.959447368421053e-05, + "loss": 0.5987, + "step": 21551 + }, + { + "epoch": 1.2068540710045919, + "grad_norm": 1.3370295763015747, + "learning_rate": 9.959421052631579e-05, + "loss": 0.3452, + "step": 21552 + }, + { + "epoch": 1.2069100683167209, + "grad_norm": 1.2521337270736694, + "learning_rate": 9.959394736842106e-05, + "loss": 0.3931, + "step": 21553 + }, + { + "epoch": 1.20696606562885, + "grad_norm": 1.3776090145111084, + 
"learning_rate": 9.959368421052632e-05, + "loss": 0.4932, + "step": 21554 + }, + { + "epoch": 1.207022062940979, + "grad_norm": 1.7571406364440918, + "learning_rate": 9.959342105263158e-05, + "loss": 0.3998, + "step": 21555 + }, + { + "epoch": 1.207078060253108, + "grad_norm": 1.3792510032653809, + "learning_rate": 9.959315789473684e-05, + "loss": 0.4431, + "step": 21556 + }, + { + "epoch": 1.207134057565237, + "grad_norm": 1.34672212600708, + "learning_rate": 9.959289473684212e-05, + "loss": 0.3362, + "step": 21557 + }, + { + "epoch": 1.207190054877366, + "grad_norm": 1.3030239343643188, + "learning_rate": 9.959263157894738e-05, + "loss": 0.4635, + "step": 21558 + }, + { + "epoch": 1.207246052189495, + "grad_norm": 1.6461795568466187, + "learning_rate": 9.959236842105264e-05, + "loss": 0.4938, + "step": 21559 + }, + { + "epoch": 1.207302049501624, + "grad_norm": 1.3412219285964966, + "learning_rate": 9.95921052631579e-05, + "loss": 0.4992, + "step": 21560 + }, + { + "epoch": 1.207358046813753, + "grad_norm": 1.4476515054702759, + "learning_rate": 9.959184210526317e-05, + "loss": 0.4607, + "step": 21561 + }, + { + "epoch": 1.207414044125882, + "grad_norm": 1.3024920225143433, + "learning_rate": 9.959157894736843e-05, + "loss": 0.447, + "step": 21562 + }, + { + "epoch": 1.207470041438011, + "grad_norm": 1.2919872999191284, + "learning_rate": 9.959131578947369e-05, + "loss": 0.5535, + "step": 21563 + }, + { + "epoch": 1.20752603875014, + "grad_norm": 1.3789916038513184, + "learning_rate": 9.959105263157895e-05, + "loss": 0.4159, + "step": 21564 + }, + { + "epoch": 1.207582036062269, + "grad_norm": 1.3930158615112305, + "learning_rate": 9.959078947368422e-05, + "loss": 0.3977, + "step": 21565 + }, + { + "epoch": 1.2076380333743981, + "grad_norm": 1.4170221090316772, + "learning_rate": 9.959052631578948e-05, + "loss": 0.5159, + "step": 21566 + }, + { + "epoch": 1.2076940306865271, + "grad_norm": 1.5150773525238037, + "learning_rate": 9.959026315789474e-05, + "loss": 
0.429, + "step": 21567 + }, + { + "epoch": 1.2077500279986562, + "grad_norm": 6.028048992156982, + "learning_rate": 9.959e-05, + "loss": 0.5761, + "step": 21568 + }, + { + "epoch": 1.2078060253107852, + "grad_norm": 1.591076135635376, + "learning_rate": 9.958973684210526e-05, + "loss": 0.4305, + "step": 21569 + }, + { + "epoch": 1.2078620226229142, + "grad_norm": 1.4415366649627686, + "learning_rate": 9.958947368421053e-05, + "loss": 0.3811, + "step": 21570 + }, + { + "epoch": 1.2079180199350432, + "grad_norm": 1.2981065511703491, + "learning_rate": 9.958921052631579e-05, + "loss": 0.4176, + "step": 21571 + }, + { + "epoch": 1.2079740172471722, + "grad_norm": 1.2798056602478027, + "learning_rate": 9.958894736842105e-05, + "loss": 0.4618, + "step": 21572 + }, + { + "epoch": 1.2080300145593013, + "grad_norm": 1.2739033699035645, + "learning_rate": 9.958868421052631e-05, + "loss": 0.4072, + "step": 21573 + }, + { + "epoch": 1.2080860118714303, + "grad_norm": 1.510717749595642, + "learning_rate": 9.958842105263159e-05, + "loss": 0.6019, + "step": 21574 + }, + { + "epoch": 1.2081420091835593, + "grad_norm": 1.3855395317077637, + "learning_rate": 9.958815789473685e-05, + "loss": 0.5853, + "step": 21575 + }, + { + "epoch": 1.2081980064956883, + "grad_norm": 1.42533540725708, + "learning_rate": 9.958789473684212e-05, + "loss": 0.4465, + "step": 21576 + }, + { + "epoch": 1.2082540038078173, + "grad_norm": 1.3876395225524902, + "learning_rate": 9.958763157894736e-05, + "loss": 0.4108, + "step": 21577 + }, + { + "epoch": 1.2083100011199464, + "grad_norm": 1.4974334239959717, + "learning_rate": 9.958736842105264e-05, + "loss": 0.6935, + "step": 21578 + }, + { + "epoch": 1.2083659984320754, + "grad_norm": 1.5912235975265503, + "learning_rate": 9.95871052631579e-05, + "loss": 0.5145, + "step": 21579 + }, + { + "epoch": 1.2084219957442044, + "grad_norm": 1.3313535451889038, + "learning_rate": 9.958684210526317e-05, + "loss": 0.4336, + "step": 21580 + }, + { + "epoch": 
1.2084779930563334, + "grad_norm": 1.5140681266784668, + "learning_rate": 9.958657894736843e-05, + "loss": 0.4945, + "step": 21581 + }, + { + "epoch": 1.2085339903684624, + "grad_norm": 1.1587097644805908, + "learning_rate": 9.958631578947369e-05, + "loss": 0.3736, + "step": 21582 + }, + { + "epoch": 1.2085899876805914, + "grad_norm": 1.2712211608886719, + "learning_rate": 9.958605263157895e-05, + "loss": 0.3106, + "step": 21583 + }, + { + "epoch": 1.2086459849927205, + "grad_norm": 1.2500139474868774, + "learning_rate": 9.958578947368421e-05, + "loss": 0.4155, + "step": 21584 + }, + { + "epoch": 1.2087019823048495, + "grad_norm": 1.386947751045227, + "learning_rate": 9.958552631578948e-05, + "loss": 0.4118, + "step": 21585 + }, + { + "epoch": 1.2087579796169785, + "grad_norm": 1.6933709383010864, + "learning_rate": 9.958526315789474e-05, + "loss": 0.4991, + "step": 21586 + }, + { + "epoch": 1.2088139769291075, + "grad_norm": 1.3202474117279053, + "learning_rate": 9.9585e-05, + "loss": 0.4367, + "step": 21587 + }, + { + "epoch": 1.2088699742412365, + "grad_norm": 1.3380085229873657, + "learning_rate": 9.958473684210526e-05, + "loss": 0.4499, + "step": 21588 + }, + { + "epoch": 1.2089259715533653, + "grad_norm": 1.1275161504745483, + "learning_rate": 9.958447368421054e-05, + "loss": 0.3378, + "step": 21589 + }, + { + "epoch": 1.2089819688654944, + "grad_norm": 1.6810859441757202, + "learning_rate": 9.95842105263158e-05, + "loss": 0.4943, + "step": 21590 + }, + { + "epoch": 1.2090379661776234, + "grad_norm": 1.4383162260055542, + "learning_rate": 9.958394736842106e-05, + "loss": 0.541, + "step": 21591 + }, + { + "epoch": 1.2090939634897524, + "grad_norm": 1.5488122701644897, + "learning_rate": 9.958368421052632e-05, + "loss": 0.661, + "step": 21592 + }, + { + "epoch": 1.2091499608018814, + "grad_norm": 1.4834942817687988, + "learning_rate": 9.958342105263159e-05, + "loss": 0.4147, + "step": 21593 + }, + { + "epoch": 1.2092059581140104, + "grad_norm": 
1.313153624534607, + "learning_rate": 9.958315789473685e-05, + "loss": 0.3932, + "step": 21594 + }, + { + "epoch": 1.2092619554261395, + "grad_norm": 1.1389399766921997, + "learning_rate": 9.958289473684211e-05, + "loss": 0.3893, + "step": 21595 + }, + { + "epoch": 1.2093179527382685, + "grad_norm": 1.2442681789398193, + "learning_rate": 9.958263157894737e-05, + "loss": 0.4184, + "step": 21596 + }, + { + "epoch": 1.2093739500503975, + "grad_norm": 1.3375062942504883, + "learning_rate": 9.958236842105264e-05, + "loss": 0.4146, + "step": 21597 + }, + { + "epoch": 1.2094299473625265, + "grad_norm": 1.2993109226226807, + "learning_rate": 9.95821052631579e-05, + "loss": 0.4562, + "step": 21598 + }, + { + "epoch": 1.2094859446746555, + "grad_norm": 1.506517767906189, + "learning_rate": 9.958184210526317e-05, + "loss": 0.5449, + "step": 21599 + }, + { + "epoch": 1.2095419419867846, + "grad_norm": 1.4779539108276367, + "learning_rate": 9.958157894736842e-05, + "loss": 0.401, + "step": 21600 + }, + { + "epoch": 1.2095979392989136, + "grad_norm": 1.145774483680725, + "learning_rate": 9.958131578947368e-05, + "loss": 0.446, + "step": 21601 + }, + { + "epoch": 1.2096539366110426, + "grad_norm": 1.577304482460022, + "learning_rate": 9.958105263157895e-05, + "loss": 0.4408, + "step": 21602 + }, + { + "epoch": 1.2097099339231716, + "grad_norm": 1.4342930316925049, + "learning_rate": 9.958078947368421e-05, + "loss": 0.4376, + "step": 21603 + }, + { + "epoch": 1.2097659312353006, + "grad_norm": 1.370580792427063, + "learning_rate": 9.958052631578949e-05, + "loss": 0.3853, + "step": 21604 + }, + { + "epoch": 1.2098219285474296, + "grad_norm": 1.2701548337936401, + "learning_rate": 9.958026315789473e-05, + "loss": 0.5224, + "step": 21605 + }, + { + "epoch": 1.2098779258595587, + "grad_norm": 1.2481516599655151, + "learning_rate": 9.958e-05, + "loss": 0.3483, + "step": 21606 + }, + { + "epoch": 1.2099339231716877, + "grad_norm": 1.2778574228286743, + "learning_rate": 
9.957973684210527e-05, + "loss": 0.4317, + "step": 21607 + }, + { + "epoch": 1.2099899204838167, + "grad_norm": 1.218406081199646, + "learning_rate": 9.957947368421054e-05, + "loss": 0.432, + "step": 21608 + }, + { + "epoch": 1.2100459177959457, + "grad_norm": 1.25705885887146, + "learning_rate": 9.95792105263158e-05, + "loss": 0.4008, + "step": 21609 + }, + { + "epoch": 1.2101019151080747, + "grad_norm": 1.8609923124313354, + "learning_rate": 9.957894736842106e-05, + "loss": 0.5195, + "step": 21610 + }, + { + "epoch": 1.2101579124202038, + "grad_norm": 1.0980687141418457, + "learning_rate": 9.957868421052632e-05, + "loss": 0.3524, + "step": 21611 + }, + { + "epoch": 1.2102139097323328, + "grad_norm": 1.4224040508270264, + "learning_rate": 9.957842105263159e-05, + "loss": 0.4607, + "step": 21612 + }, + { + "epoch": 1.2102699070444618, + "grad_norm": 1.319006323814392, + "learning_rate": 9.957815789473685e-05, + "loss": 0.5207, + "step": 21613 + }, + { + "epoch": 1.2103259043565908, + "grad_norm": 1.6345844268798828, + "learning_rate": 9.957789473684211e-05, + "loss": 0.5688, + "step": 21614 + }, + { + "epoch": 1.2103819016687198, + "grad_norm": 1.1791088581085205, + "learning_rate": 9.957763157894737e-05, + "loss": 0.3316, + "step": 21615 + }, + { + "epoch": 1.2104378989808489, + "grad_norm": 1.996338129043579, + "learning_rate": 9.957736842105264e-05, + "loss": 0.3288, + "step": 21616 + }, + { + "epoch": 1.2104938962929779, + "grad_norm": 1.5343947410583496, + "learning_rate": 9.95771052631579e-05, + "loss": 0.3911, + "step": 21617 + }, + { + "epoch": 1.210549893605107, + "grad_norm": 1.213384747505188, + "learning_rate": 9.957684210526316e-05, + "loss": 0.4806, + "step": 21618 + }, + { + "epoch": 1.210605890917236, + "grad_norm": 1.6705191135406494, + "learning_rate": 9.957657894736842e-05, + "loss": 0.4419, + "step": 21619 + }, + { + "epoch": 1.210661888229365, + "grad_norm": 1.6513222455978394, + "learning_rate": 9.957631578947368e-05, + "loss": 0.4492, + 
"step": 21620 + }, + { + "epoch": 1.210717885541494, + "grad_norm": 1.4181777238845825, + "learning_rate": 9.957605263157896e-05, + "loss": 0.4674, + "step": 21621 + }, + { + "epoch": 1.210773882853623, + "grad_norm": 4.974144458770752, + "learning_rate": 9.957578947368422e-05, + "loss": 0.5406, + "step": 21622 + }, + { + "epoch": 1.210829880165752, + "grad_norm": 1.1779018640518188, + "learning_rate": 9.957552631578948e-05, + "loss": 0.4889, + "step": 21623 + }, + { + "epoch": 1.210885877477881, + "grad_norm": 1.3417086601257324, + "learning_rate": 9.957526315789473e-05, + "loss": 0.3843, + "step": 21624 + }, + { + "epoch": 1.21094187479001, + "grad_norm": 1.226450800895691, + "learning_rate": 9.957500000000001e-05, + "loss": 0.3566, + "step": 21625 + }, + { + "epoch": 1.210997872102139, + "grad_norm": 1.1275731325149536, + "learning_rate": 9.957473684210527e-05, + "loss": 0.3272, + "step": 21626 + }, + { + "epoch": 1.211053869414268, + "grad_norm": 1.4575562477111816, + "learning_rate": 9.957447368421053e-05, + "loss": 0.5366, + "step": 21627 + }, + { + "epoch": 1.211109866726397, + "grad_norm": 1.3971645832061768, + "learning_rate": 9.957421052631579e-05, + "loss": 0.442, + "step": 21628 + }, + { + "epoch": 1.211165864038526, + "grad_norm": 1.5425329208374023, + "learning_rate": 9.957394736842106e-05, + "loss": 0.5183, + "step": 21629 + }, + { + "epoch": 1.2112218613506551, + "grad_norm": 1.0971078872680664, + "learning_rate": 9.957368421052632e-05, + "loss": 0.3486, + "step": 21630 + }, + { + "epoch": 1.2112778586627841, + "grad_norm": 1.5646895170211792, + "learning_rate": 9.95734210526316e-05, + "loss": 0.4559, + "step": 21631 + }, + { + "epoch": 1.2113338559749132, + "grad_norm": 1.5675113201141357, + "learning_rate": 9.957315789473684e-05, + "loss": 0.423, + "step": 21632 + }, + { + "epoch": 1.2113898532870422, + "grad_norm": 1.320599913597107, + "learning_rate": 9.957289473684211e-05, + "loss": 0.3928, + "step": 21633 + }, + { + "epoch": 
1.2114458505991712, + "grad_norm": 1.4620963335037231, + "learning_rate": 9.957263157894737e-05, + "loss": 0.4002, + "step": 21634 + }, + { + "epoch": 1.2115018479113002, + "grad_norm": 1.4255797863006592, + "learning_rate": 9.957236842105263e-05, + "loss": 0.4461, + "step": 21635 + }, + { + "epoch": 1.2115578452234292, + "grad_norm": 1.2193968296051025, + "learning_rate": 9.95721052631579e-05, + "loss": 0.4334, + "step": 21636 + }, + { + "epoch": 1.2116138425355583, + "grad_norm": 1.4193572998046875, + "learning_rate": 9.957184210526315e-05, + "loss": 0.5235, + "step": 21637 + }, + { + "epoch": 1.2116698398476873, + "grad_norm": 1.3480080366134644, + "learning_rate": 9.957157894736843e-05, + "loss": 0.4982, + "step": 21638 + }, + { + "epoch": 1.2117258371598163, + "grad_norm": 1.2971700429916382, + "learning_rate": 9.957131578947368e-05, + "loss": 0.5002, + "step": 21639 + }, + { + "epoch": 1.2117818344719453, + "grad_norm": 4.556559085845947, + "learning_rate": 9.957105263157896e-05, + "loss": 0.4989, + "step": 21640 + }, + { + "epoch": 1.2118378317840743, + "grad_norm": 1.9515577554702759, + "learning_rate": 9.957078947368422e-05, + "loss": 0.4755, + "step": 21641 + }, + { + "epoch": 1.2118938290962034, + "grad_norm": 1.8058103322982788, + "learning_rate": 9.957052631578948e-05, + "loss": 0.4307, + "step": 21642 + }, + { + "epoch": 1.2119498264083324, + "grad_norm": 1.3275461196899414, + "learning_rate": 9.957026315789474e-05, + "loss": 0.43, + "step": 21643 + }, + { + "epoch": 1.2120058237204614, + "grad_norm": 3.464611530303955, + "learning_rate": 9.957000000000001e-05, + "loss": 0.4449, + "step": 21644 + }, + { + "epoch": 1.2120618210325904, + "grad_norm": 1.3496110439300537, + "learning_rate": 9.956973684210527e-05, + "loss": 0.396, + "step": 21645 + }, + { + "epoch": 1.2121178183447194, + "grad_norm": 1.2601943016052246, + "learning_rate": 9.956947368421053e-05, + "loss": 0.3952, + "step": 21646 + }, + { + "epoch": 1.2121738156568485, + "grad_norm": 
1.7452715635299683, + "learning_rate": 9.956921052631579e-05, + "loss": 0.3525, + "step": 21647 + }, + { + "epoch": 1.2122298129689775, + "grad_norm": 1.5159343481063843, + "learning_rate": 9.956894736842106e-05, + "loss": 0.504, + "step": 21648 + }, + { + "epoch": 1.2122858102811065, + "grad_norm": 1.2624573707580566, + "learning_rate": 9.956868421052632e-05, + "loss": 0.3595, + "step": 21649 + }, + { + "epoch": 1.2123418075932355, + "grad_norm": 1.3922098875045776, + "learning_rate": 9.956842105263158e-05, + "loss": 0.2768, + "step": 21650 + }, + { + "epoch": 1.2123978049053645, + "grad_norm": 1.1745513677597046, + "learning_rate": 9.956815789473684e-05, + "loss": 0.327, + "step": 21651 + }, + { + "epoch": 1.2124538022174935, + "grad_norm": 1.3081787824630737, + "learning_rate": 9.956789473684212e-05, + "loss": 0.3648, + "step": 21652 + }, + { + "epoch": 1.2125097995296226, + "grad_norm": 1.6047451496124268, + "learning_rate": 9.956763157894738e-05, + "loss": 0.3854, + "step": 21653 + }, + { + "epoch": 1.2125657968417516, + "grad_norm": 1.5055924654006958, + "learning_rate": 9.956736842105263e-05, + "loss": 0.4896, + "step": 21654 + }, + { + "epoch": 1.2126217941538806, + "grad_norm": 1.1641325950622559, + "learning_rate": 9.95671052631579e-05, + "loss": 0.3244, + "step": 21655 + }, + { + "epoch": 1.2126777914660096, + "grad_norm": 1.4093161821365356, + "learning_rate": 9.956684210526315e-05, + "loss": 0.6209, + "step": 21656 + }, + { + "epoch": 1.2127337887781386, + "grad_norm": 1.447231411933899, + "learning_rate": 9.956657894736843e-05, + "loss": 0.5382, + "step": 21657 + }, + { + "epoch": 1.2127897860902677, + "grad_norm": 1.467686653137207, + "learning_rate": 9.956631578947369e-05, + "loss": 0.5004, + "step": 21658 + }, + { + "epoch": 1.2128457834023967, + "grad_norm": 1.2182942628860474, + "learning_rate": 9.956605263157896e-05, + "loss": 0.3318, + "step": 21659 + }, + { + "epoch": 1.2129017807145257, + "grad_norm": 1.369750738143921, + "learning_rate": 
9.956578947368421e-05, + "loss": 0.4887, + "step": 21660 + }, + { + "epoch": 1.2129577780266547, + "grad_norm": 1.306819200515747, + "learning_rate": 9.956552631578948e-05, + "loss": 0.4644, + "step": 21661 + }, + { + "epoch": 1.2130137753387837, + "grad_norm": 1.2664053440093994, + "learning_rate": 9.956526315789474e-05, + "loss": 0.3991, + "step": 21662 + }, + { + "epoch": 1.2130697726509128, + "grad_norm": 1.3251110315322876, + "learning_rate": 9.956500000000001e-05, + "loss": 0.4963, + "step": 21663 + }, + { + "epoch": 1.2131257699630418, + "grad_norm": 1.359578013420105, + "learning_rate": 9.956473684210527e-05, + "loss": 0.4374, + "step": 21664 + }, + { + "epoch": 1.2131817672751708, + "grad_norm": 1.6653258800506592, + "learning_rate": 9.956447368421053e-05, + "loss": 0.4607, + "step": 21665 + }, + { + "epoch": 1.2132377645872998, + "grad_norm": 1.531223177909851, + "learning_rate": 9.956421052631579e-05, + "loss": 0.4853, + "step": 21666 + }, + { + "epoch": 1.2132937618994288, + "grad_norm": 1.707360863685608, + "learning_rate": 9.956394736842107e-05, + "loss": 0.5206, + "step": 21667 + }, + { + "epoch": 1.2133497592115579, + "grad_norm": 1.4159029722213745, + "learning_rate": 9.956368421052633e-05, + "loss": 0.4003, + "step": 21668 + }, + { + "epoch": 1.2134057565236869, + "grad_norm": 1.3171870708465576, + "learning_rate": 9.956342105263159e-05, + "loss": 0.4966, + "step": 21669 + }, + { + "epoch": 1.213461753835816, + "grad_norm": 1.4929555654525757, + "learning_rate": 9.956315789473684e-05, + "loss": 0.4954, + "step": 21670 + }, + { + "epoch": 1.213517751147945, + "grad_norm": 1.3464287519454956, + "learning_rate": 9.95628947368421e-05, + "loss": 0.3948, + "step": 21671 + }, + { + "epoch": 1.213573748460074, + "grad_norm": 1.375730037689209, + "learning_rate": 9.956263157894738e-05, + "loss": 0.4981, + "step": 21672 + }, + { + "epoch": 1.213629745772203, + "grad_norm": 1.1861927509307861, + "learning_rate": 9.956236842105264e-05, + "loss": 0.3404, + 
"step": 21673 + }, + { + "epoch": 1.213685743084332, + "grad_norm": 1.2284128665924072, + "learning_rate": 9.95621052631579e-05, + "loss": 0.4488, + "step": 21674 + }, + { + "epoch": 1.213741740396461, + "grad_norm": 1.2582566738128662, + "learning_rate": 9.956184210526316e-05, + "loss": 0.3798, + "step": 21675 + }, + { + "epoch": 1.21379773770859, + "grad_norm": 1.329533576965332, + "learning_rate": 9.956157894736843e-05, + "loss": 0.5515, + "step": 21676 + }, + { + "epoch": 1.213853735020719, + "grad_norm": 1.1948121786117554, + "learning_rate": 9.956131578947369e-05, + "loss": 0.4594, + "step": 21677 + }, + { + "epoch": 1.213909732332848, + "grad_norm": 1.0382745265960693, + "learning_rate": 9.956105263157895e-05, + "loss": 0.3061, + "step": 21678 + }, + { + "epoch": 1.213965729644977, + "grad_norm": 1.280571699142456, + "learning_rate": 9.956078947368421e-05, + "loss": 0.5167, + "step": 21679 + }, + { + "epoch": 1.214021726957106, + "grad_norm": 1.3734811544418335, + "learning_rate": 9.956052631578948e-05, + "loss": 0.5118, + "step": 21680 + }, + { + "epoch": 1.214077724269235, + "grad_norm": 1.4955744743347168, + "learning_rate": 9.956026315789474e-05, + "loss": 0.3749, + "step": 21681 + }, + { + "epoch": 1.2141337215813641, + "grad_norm": 1.4537034034729004, + "learning_rate": 9.956e-05, + "loss": 0.4924, + "step": 21682 + }, + { + "epoch": 1.2141897188934931, + "grad_norm": 1.1773545742034912, + "learning_rate": 9.955973684210526e-05, + "loss": 0.3595, + "step": 21683 + }, + { + "epoch": 1.2142457162056222, + "grad_norm": 1.5288422107696533, + "learning_rate": 9.955947368421054e-05, + "loss": 0.4485, + "step": 21684 + }, + { + "epoch": 1.2143017135177512, + "grad_norm": 1.2528376579284668, + "learning_rate": 9.95592105263158e-05, + "loss": 0.4384, + "step": 21685 + }, + { + "epoch": 1.2143577108298802, + "grad_norm": 1.4522837400436401, + "learning_rate": 9.955894736842107e-05, + "loss": 0.4189, + "step": 21686 + }, + { + "epoch": 1.2144137081420092, + 
"grad_norm": 2.0988106727600098, + "learning_rate": 9.955868421052631e-05, + "loss": 0.5293, + "step": 21687 + }, + { + "epoch": 1.2144697054541382, + "grad_norm": 1.4099358320236206, + "learning_rate": 9.955842105263157e-05, + "loss": 0.4423, + "step": 21688 + }, + { + "epoch": 1.2145257027662673, + "grad_norm": 1.4312374591827393, + "learning_rate": 9.955815789473685e-05, + "loss": 0.5194, + "step": 21689 + }, + { + "epoch": 1.2145817000783963, + "grad_norm": 1.713837742805481, + "learning_rate": 9.955789473684211e-05, + "loss": 0.3867, + "step": 21690 + }, + { + "epoch": 1.2146376973905253, + "grad_norm": 1.3737242221832275, + "learning_rate": 9.955763157894738e-05, + "loss": 0.4443, + "step": 21691 + }, + { + "epoch": 1.2146936947026543, + "grad_norm": 1.7588526010513306, + "learning_rate": 9.955736842105263e-05, + "loss": 0.7413, + "step": 21692 + }, + { + "epoch": 1.2147496920147833, + "grad_norm": 1.221935749053955, + "learning_rate": 9.95571052631579e-05, + "loss": 0.4119, + "step": 21693 + }, + { + "epoch": 1.2148056893269124, + "grad_norm": 1.4457955360412598, + "learning_rate": 9.955684210526316e-05, + "loss": 0.4465, + "step": 21694 + }, + { + "epoch": 1.2148616866390414, + "grad_norm": 1.3118256330490112, + "learning_rate": 9.955657894736843e-05, + "loss": 0.4991, + "step": 21695 + }, + { + "epoch": 1.2149176839511704, + "grad_norm": 1.287116527557373, + "learning_rate": 9.955631578947369e-05, + "loss": 0.5206, + "step": 21696 + }, + { + "epoch": 1.2149736812632994, + "grad_norm": 1.5391192436218262, + "learning_rate": 9.955605263157895e-05, + "loss": 0.4731, + "step": 21697 + }, + { + "epoch": 1.2150296785754284, + "grad_norm": 1.3973290920257568, + "learning_rate": 9.955578947368421e-05, + "loss": 0.4245, + "step": 21698 + }, + { + "epoch": 1.2150856758875574, + "grad_norm": 1.5534621477127075, + "learning_rate": 9.955552631578949e-05, + "loss": 0.5042, + "step": 21699 + }, + { + "epoch": 1.2151416731996865, + "grad_norm": 1.383452296257019, + 
"learning_rate": 9.955526315789475e-05, + "loss": 0.4167, + "step": 21700 + }, + { + "epoch": 1.2151976705118155, + "grad_norm": 1.4800420999526978, + "learning_rate": 9.9555e-05, + "loss": 0.4413, + "step": 21701 + }, + { + "epoch": 1.2152536678239445, + "grad_norm": 1.3420456647872925, + "learning_rate": 9.955473684210526e-05, + "loss": 0.5102, + "step": 21702 + }, + { + "epoch": 1.2153096651360735, + "grad_norm": 1.287355661392212, + "learning_rate": 9.955447368421054e-05, + "loss": 0.3622, + "step": 21703 + }, + { + "epoch": 1.2153656624482025, + "grad_norm": 1.442916750907898, + "learning_rate": 9.95542105263158e-05, + "loss": 0.443, + "step": 21704 + }, + { + "epoch": 1.2154216597603316, + "grad_norm": 1.3073047399520874, + "learning_rate": 9.955394736842106e-05, + "loss": 0.4629, + "step": 21705 + }, + { + "epoch": 1.2154776570724606, + "grad_norm": 1.3350088596343994, + "learning_rate": 9.955368421052632e-05, + "loss": 0.4702, + "step": 21706 + }, + { + "epoch": 1.2155336543845896, + "grad_norm": 1.529811143875122, + "learning_rate": 9.955342105263158e-05, + "loss": 0.4841, + "step": 21707 + }, + { + "epoch": 1.2155896516967186, + "grad_norm": 1.3264755010604858, + "learning_rate": 9.955315789473685e-05, + "loss": 0.4314, + "step": 21708 + }, + { + "epoch": 1.2156456490088476, + "grad_norm": 1.4241228103637695, + "learning_rate": 9.955289473684211e-05, + "loss": 0.5245, + "step": 21709 + }, + { + "epoch": 1.2157016463209767, + "grad_norm": 3.6906840801239014, + "learning_rate": 9.955263157894737e-05, + "loss": 0.4509, + "step": 21710 + }, + { + "epoch": 1.2157576436331057, + "grad_norm": 1.7241957187652588, + "learning_rate": 9.955236842105263e-05, + "loss": 0.42, + "step": 21711 + }, + { + "epoch": 1.2158136409452347, + "grad_norm": 1.3581733703613281, + "learning_rate": 9.95521052631579e-05, + "loss": 0.4015, + "step": 21712 + }, + { + "epoch": 1.2158696382573637, + "grad_norm": 1.3444708585739136, + "learning_rate": 9.955184210526316e-05, + "loss": 
0.5752, + "step": 21713 + }, + { + "epoch": 1.2159256355694927, + "grad_norm": 1.5004737377166748, + "learning_rate": 9.955157894736844e-05, + "loss": 0.4007, + "step": 21714 + }, + { + "epoch": 1.2159816328816218, + "grad_norm": 1.3935128450393677, + "learning_rate": 9.955131578947368e-05, + "loss": 0.4082, + "step": 21715 + }, + { + "epoch": 1.2160376301937508, + "grad_norm": 1.5477619171142578, + "learning_rate": 9.955105263157895e-05, + "loss": 0.6332, + "step": 21716 + }, + { + "epoch": 1.2160936275058798, + "grad_norm": 1.4518890380859375, + "learning_rate": 9.955078947368421e-05, + "loss": 0.3699, + "step": 21717 + }, + { + "epoch": 1.2161496248180088, + "grad_norm": 1.3490943908691406, + "learning_rate": 9.955052631578949e-05, + "loss": 0.4337, + "step": 21718 + }, + { + "epoch": 1.2162056221301378, + "grad_norm": 1.44427490234375, + "learning_rate": 9.955026315789473e-05, + "loss": 0.4376, + "step": 21719 + }, + { + "epoch": 1.2162616194422669, + "grad_norm": 1.369771122932434, + "learning_rate": 9.955000000000001e-05, + "loss": 0.389, + "step": 21720 + }, + { + "epoch": 1.2163176167543959, + "grad_norm": 1.5623319149017334, + "learning_rate": 9.954973684210527e-05, + "loss": 0.4553, + "step": 21721 + }, + { + "epoch": 1.2163736140665249, + "grad_norm": 1.4721201658248901, + "learning_rate": 9.954947368421053e-05, + "loss": 0.5529, + "step": 21722 + }, + { + "epoch": 1.216429611378654, + "grad_norm": 1.42538583278656, + "learning_rate": 9.95492105263158e-05, + "loss": 0.4611, + "step": 21723 + }, + { + "epoch": 1.216485608690783, + "grad_norm": 1.4518930912017822, + "learning_rate": 9.954894736842105e-05, + "loss": 0.4414, + "step": 21724 + }, + { + "epoch": 1.216541606002912, + "grad_norm": 1.5610616207122803, + "learning_rate": 9.954868421052632e-05, + "loss": 0.3666, + "step": 21725 + }, + { + "epoch": 1.216597603315041, + "grad_norm": 1.145919680595398, + "learning_rate": 9.954842105263158e-05, + "loss": 0.444, + "step": 21726 + }, + { + "epoch": 
1.21665360062717, + "grad_norm": 1.3703927993774414, + "learning_rate": 9.954815789473685e-05, + "loss": 0.4152, + "step": 21727 + }, + { + "epoch": 1.216709597939299, + "grad_norm": 1.3518468141555786, + "learning_rate": 9.954789473684211e-05, + "loss": 0.4267, + "step": 21728 + }, + { + "epoch": 1.216765595251428, + "grad_norm": 1.1545908451080322, + "learning_rate": 9.954763157894737e-05, + "loss": 0.4011, + "step": 21729 + }, + { + "epoch": 1.216821592563557, + "grad_norm": 1.2211787700653076, + "learning_rate": 9.954736842105263e-05, + "loss": 0.3923, + "step": 21730 + }, + { + "epoch": 1.216877589875686, + "grad_norm": 1.3977153301239014, + "learning_rate": 9.95471052631579e-05, + "loss": 0.5393, + "step": 21731 + }, + { + "epoch": 1.216933587187815, + "grad_norm": 1.1466578245162964, + "learning_rate": 9.954684210526316e-05, + "loss": 0.5073, + "step": 21732 + }, + { + "epoch": 1.216989584499944, + "grad_norm": 1.2234030961990356, + "learning_rate": 9.954657894736842e-05, + "loss": 0.3789, + "step": 21733 + }, + { + "epoch": 1.217045581812073, + "grad_norm": 1.222891926765442, + "learning_rate": 9.954631578947368e-05, + "loss": 0.4936, + "step": 21734 + }, + { + "epoch": 1.217101579124202, + "grad_norm": 1.2851600646972656, + "learning_rate": 9.954605263157896e-05, + "loss": 0.5131, + "step": 21735 + }, + { + "epoch": 1.217157576436331, + "grad_norm": 1.6744606494903564, + "learning_rate": 9.954578947368422e-05, + "loss": 0.5656, + "step": 21736 + }, + { + "epoch": 1.21721357374846, + "grad_norm": 1.4397356510162354, + "learning_rate": 9.954552631578948e-05, + "loss": 0.4175, + "step": 21737 + }, + { + "epoch": 1.217269571060589, + "grad_norm": 1.3127284049987793, + "learning_rate": 9.954526315789474e-05, + "loss": 0.6474, + "step": 21738 + }, + { + "epoch": 1.217325568372718, + "grad_norm": 1.4042620658874512, + "learning_rate": 9.9545e-05, + "loss": 0.4045, + "step": 21739 + }, + { + "epoch": 1.217381565684847, + "grad_norm": 1.4031119346618652, + 
"learning_rate": 9.954473684210527e-05, + "loss": 0.4204, + "step": 21740 + }, + { + "epoch": 1.217437562996976, + "grad_norm": 1.441322684288025, + "learning_rate": 9.954447368421053e-05, + "loss": 0.3801, + "step": 21741 + }, + { + "epoch": 1.217493560309105, + "grad_norm": 1.4328351020812988, + "learning_rate": 9.954421052631579e-05, + "loss": 0.4234, + "step": 21742 + }, + { + "epoch": 1.217549557621234, + "grad_norm": 1.3555734157562256, + "learning_rate": 9.954394736842105e-05, + "loss": 0.418, + "step": 21743 + }, + { + "epoch": 1.217605554933363, + "grad_norm": 1.1974542140960693, + "learning_rate": 9.954368421052632e-05, + "loss": 0.4094, + "step": 21744 + }, + { + "epoch": 1.217661552245492, + "grad_norm": 1.3164422512054443, + "learning_rate": 9.954342105263158e-05, + "loss": 0.5754, + "step": 21745 + }, + { + "epoch": 1.2177175495576211, + "grad_norm": 1.6550158262252808, + "learning_rate": 9.954315789473686e-05, + "loss": 0.4881, + "step": 21746 + }, + { + "epoch": 1.2177735468697501, + "grad_norm": 1.514971375465393, + "learning_rate": 9.95428947368421e-05, + "loss": 0.6, + "step": 21747 + }, + { + "epoch": 1.2178295441818792, + "grad_norm": 1.312474012374878, + "learning_rate": 9.954263157894737e-05, + "loss": 0.3611, + "step": 21748 + }, + { + "epoch": 1.2178855414940082, + "grad_norm": 1.4859702587127686, + "learning_rate": 9.954236842105263e-05, + "loss": 0.4517, + "step": 21749 + }, + { + "epoch": 1.2179415388061372, + "grad_norm": 1.599524736404419, + "learning_rate": 9.954210526315791e-05, + "loss": 0.486, + "step": 21750 + }, + { + "epoch": 1.2179975361182662, + "grad_norm": 1.16690993309021, + "learning_rate": 9.954184210526317e-05, + "loss": 0.33, + "step": 21751 + }, + { + "epoch": 1.2180535334303952, + "grad_norm": 1.520194172859192, + "learning_rate": 9.954157894736843e-05, + "loss": 0.4749, + "step": 21752 + }, + { + "epoch": 1.2181095307425243, + "grad_norm": 1.2532674074172974, + "learning_rate": 9.954131578947369e-05, + "loss": 0.375, 
+ "step": 21753 + }, + { + "epoch": 1.2181655280546533, + "grad_norm": 1.128656029701233, + "learning_rate": 9.954105263157896e-05, + "loss": 0.359, + "step": 21754 + }, + { + "epoch": 1.2182215253667823, + "grad_norm": 1.3241829872131348, + "learning_rate": 9.954078947368422e-05, + "loss": 0.4322, + "step": 21755 + }, + { + "epoch": 1.2182775226789113, + "grad_norm": 1.3982354402542114, + "learning_rate": 9.954052631578948e-05, + "loss": 0.6009, + "step": 21756 + }, + { + "epoch": 1.2183335199910403, + "grad_norm": 1.492138147354126, + "learning_rate": 9.954026315789474e-05, + "loss": 0.4982, + "step": 21757 + }, + { + "epoch": 1.2183895173031694, + "grad_norm": 1.5098954439163208, + "learning_rate": 9.954e-05, + "loss": 0.5411, + "step": 21758 + }, + { + "epoch": 1.2184455146152984, + "grad_norm": 1.4105461835861206, + "learning_rate": 9.953973684210527e-05, + "loss": 0.4803, + "step": 21759 + }, + { + "epoch": 1.2185015119274274, + "grad_norm": 1.2827019691467285, + "learning_rate": 9.953947368421053e-05, + "loss": 0.416, + "step": 21760 + }, + { + "epoch": 1.2185575092395564, + "grad_norm": 1.4231181144714355, + "learning_rate": 9.953921052631579e-05, + "loss": 0.4757, + "step": 21761 + }, + { + "epoch": 1.2186135065516854, + "grad_norm": 1.319778323173523, + "learning_rate": 9.953894736842105e-05, + "loss": 0.4043, + "step": 21762 + }, + { + "epoch": 1.2186695038638145, + "grad_norm": 1.1663310527801514, + "learning_rate": 9.953868421052632e-05, + "loss": 0.3814, + "step": 21763 + }, + { + "epoch": 1.2187255011759435, + "grad_norm": 1.4952927827835083, + "learning_rate": 9.953842105263158e-05, + "loss": 0.4432, + "step": 21764 + }, + { + "epoch": 1.2187814984880725, + "grad_norm": 1.2660924196243286, + "learning_rate": 9.953815789473684e-05, + "loss": 0.6356, + "step": 21765 + }, + { + "epoch": 1.2188374958002015, + "grad_norm": 1.4471004009246826, + "learning_rate": 9.95378947368421e-05, + "loss": 0.4191, + "step": 21766 + }, + { + "epoch": 
1.2188934931123305, + "grad_norm": 1.4613450765609741, + "learning_rate": 9.953763157894738e-05, + "loss": 0.487, + "step": 21767 + }, + { + "epoch": 1.2189494904244595, + "grad_norm": 2.181941032409668, + "learning_rate": 9.953736842105264e-05, + "loss": 0.3376, + "step": 21768 + }, + { + "epoch": 1.2190054877365886, + "grad_norm": 1.267209529876709, + "learning_rate": 9.953710526315791e-05, + "loss": 0.3515, + "step": 21769 + }, + { + "epoch": 1.2190614850487176, + "grad_norm": 1.8140164613723755, + "learning_rate": 9.953684210526316e-05, + "loss": 0.5538, + "step": 21770 + }, + { + "epoch": 1.2191174823608466, + "grad_norm": 1.7960492372512817, + "learning_rate": 9.953657894736843e-05, + "loss": 0.5585, + "step": 21771 + }, + { + "epoch": 1.2191734796729756, + "grad_norm": 1.3258659839630127, + "learning_rate": 9.953631578947369e-05, + "loss": 0.4138, + "step": 21772 + }, + { + "epoch": 1.2192294769851046, + "grad_norm": 1.4085932970046997, + "learning_rate": 9.953605263157896e-05, + "loss": 0.5124, + "step": 21773 + }, + { + "epoch": 1.2192854742972337, + "grad_norm": 1.3527405261993408, + "learning_rate": 9.953578947368421e-05, + "loss": 0.3983, + "step": 21774 + }, + { + "epoch": 1.2193414716093627, + "grad_norm": 1.5936415195465088, + "learning_rate": 9.953552631578947e-05, + "loss": 0.4506, + "step": 21775 + }, + { + "epoch": 1.2193974689214917, + "grad_norm": 1.4233524799346924, + "learning_rate": 9.953526315789474e-05, + "loss": 0.5028, + "step": 21776 + }, + { + "epoch": 1.2194534662336207, + "grad_norm": 1.5624884366989136, + "learning_rate": 9.9535e-05, + "loss": 0.5677, + "step": 21777 + }, + { + "epoch": 1.2195094635457497, + "grad_norm": 1.445286750793457, + "learning_rate": 9.953473684210527e-05, + "loss": 0.4837, + "step": 21778 + }, + { + "epoch": 1.2195654608578788, + "grad_norm": 1.3730077743530273, + "learning_rate": 9.953447368421052e-05, + "loss": 0.4522, + "step": 21779 + }, + { + "epoch": 1.2196214581700078, + "grad_norm": 
1.4525657892227173, + "learning_rate": 9.95342105263158e-05, + "loss": 0.5774, + "step": 21780 + }, + { + "epoch": 1.2196774554821368, + "grad_norm": 1.1443114280700684, + "learning_rate": 9.953394736842105e-05, + "loss": 0.3527, + "step": 21781 + }, + { + "epoch": 1.2197334527942658, + "grad_norm": 1.6584999561309814, + "learning_rate": 9.953368421052633e-05, + "loss": 0.3506, + "step": 21782 + }, + { + "epoch": 1.2197894501063948, + "grad_norm": 1.3194817304611206, + "learning_rate": 9.953342105263159e-05, + "loss": 0.3406, + "step": 21783 + }, + { + "epoch": 1.2198454474185239, + "grad_norm": 1.5065053701400757, + "learning_rate": 9.953315789473685e-05, + "loss": 0.4697, + "step": 21784 + }, + { + "epoch": 1.2199014447306529, + "grad_norm": 1.257745623588562, + "learning_rate": 9.95328947368421e-05, + "loss": 0.3553, + "step": 21785 + }, + { + "epoch": 1.219957442042782, + "grad_norm": 1.4275087118148804, + "learning_rate": 9.953263157894738e-05, + "loss": 0.5639, + "step": 21786 + }, + { + "epoch": 1.220013439354911, + "grad_norm": 1.3018772602081299, + "learning_rate": 9.953236842105264e-05, + "loss": 0.5343, + "step": 21787 + }, + { + "epoch": 1.22006943666704, + "grad_norm": 1.5407583713531494, + "learning_rate": 9.95321052631579e-05, + "loss": 0.5871, + "step": 21788 + }, + { + "epoch": 1.220125433979169, + "grad_norm": 1.4225414991378784, + "learning_rate": 9.953184210526316e-05, + "loss": 0.4552, + "step": 21789 + }, + { + "epoch": 1.220181431291298, + "grad_norm": 1.134092926979065, + "learning_rate": 9.953157894736843e-05, + "loss": 0.4514, + "step": 21790 + }, + { + "epoch": 1.220237428603427, + "grad_norm": 1.4558898210525513, + "learning_rate": 9.953131578947369e-05, + "loss": 0.4356, + "step": 21791 + }, + { + "epoch": 1.220293425915556, + "grad_norm": 1.0426757335662842, + "learning_rate": 9.953105263157895e-05, + "loss": 0.4332, + "step": 21792 + }, + { + "epoch": 1.220349423227685, + "grad_norm": 1.164726972579956, + "learning_rate": 
9.953078947368421e-05, + "loss": 0.3746, + "step": 21793 + }, + { + "epoch": 1.220405420539814, + "grad_norm": 1.221907138824463, + "learning_rate": 9.953052631578947e-05, + "loss": 0.5337, + "step": 21794 + }, + { + "epoch": 1.220461417851943, + "grad_norm": 1.24656343460083, + "learning_rate": 9.953026315789474e-05, + "loss": 0.3993, + "step": 21795 + }, + { + "epoch": 1.220517415164072, + "grad_norm": 1.378005862236023, + "learning_rate": 9.953e-05, + "loss": 0.4511, + "step": 21796 + }, + { + "epoch": 1.220573412476201, + "grad_norm": 1.2552727460861206, + "learning_rate": 9.952973684210526e-05, + "loss": 0.3287, + "step": 21797 + }, + { + "epoch": 1.2206294097883301, + "grad_norm": 1.2627837657928467, + "learning_rate": 9.952947368421052e-05, + "loss": 0.4697, + "step": 21798 + }, + { + "epoch": 1.2206854071004591, + "grad_norm": 1.59735906124115, + "learning_rate": 9.95292105263158e-05, + "loss": 0.6023, + "step": 21799 + }, + { + "epoch": 1.2207414044125882, + "grad_norm": 1.2051788568496704, + "learning_rate": 9.952894736842106e-05, + "loss": 0.4292, + "step": 21800 + }, + { + "epoch": 1.2207974017247172, + "grad_norm": 1.4525799751281738, + "learning_rate": 9.952868421052633e-05, + "loss": 0.4569, + "step": 21801 + }, + { + "epoch": 1.2208533990368462, + "grad_norm": 1.2709426879882812, + "learning_rate": 9.952842105263158e-05, + "loss": 0.34, + "step": 21802 + }, + { + "epoch": 1.2209093963489752, + "grad_norm": 1.4722206592559814, + "learning_rate": 9.952815789473685e-05, + "loss": 0.4586, + "step": 21803 + }, + { + "epoch": 1.2209653936611042, + "grad_norm": 1.3889596462249756, + "learning_rate": 9.952789473684211e-05, + "loss": 0.4202, + "step": 21804 + }, + { + "epoch": 1.2210213909732333, + "grad_norm": 1.5910038948059082, + "learning_rate": 9.952763157894738e-05, + "loss": 0.4474, + "step": 21805 + }, + { + "epoch": 1.2210773882853623, + "grad_norm": 1.3301708698272705, + "learning_rate": 9.952736842105264e-05, + "loss": 0.467, + "step": 21806 + }, 
+ { + "epoch": 1.2211333855974913, + "grad_norm": 1.9921144247055054, + "learning_rate": 9.95271052631579e-05, + "loss": 0.3968, + "step": 21807 + }, + { + "epoch": 1.2211893829096203, + "grad_norm": 1.4369726181030273, + "learning_rate": 9.952684210526316e-05, + "loss": 0.4611, + "step": 21808 + }, + { + "epoch": 1.2212453802217493, + "grad_norm": 1.7992271184921265, + "learning_rate": 9.952657894736842e-05, + "loss": 0.5521, + "step": 21809 + }, + { + "epoch": 1.2213013775338784, + "grad_norm": 1.1561758518218994, + "learning_rate": 9.95263157894737e-05, + "loss": 0.3971, + "step": 21810 + }, + { + "epoch": 1.2213573748460074, + "grad_norm": 1.366234540939331, + "learning_rate": 9.952605263157895e-05, + "loss": 0.5476, + "step": 21811 + }, + { + "epoch": 1.2214133721581364, + "grad_norm": 1.1349620819091797, + "learning_rate": 9.952578947368421e-05, + "loss": 0.4166, + "step": 21812 + }, + { + "epoch": 1.2214693694702654, + "grad_norm": 1.4356046915054321, + "learning_rate": 9.952552631578947e-05, + "loss": 0.4494, + "step": 21813 + }, + { + "epoch": 1.2215253667823944, + "grad_norm": 1.5290894508361816, + "learning_rate": 9.952526315789475e-05, + "loss": 0.4659, + "step": 21814 + }, + { + "epoch": 1.2215813640945234, + "grad_norm": 2.1164944171905518, + "learning_rate": 9.952500000000001e-05, + "loss": 0.4189, + "step": 21815 + }, + { + "epoch": 1.2216373614066525, + "grad_norm": 1.3442543745040894, + "learning_rate": 9.952473684210527e-05, + "loss": 0.4612, + "step": 21816 + }, + { + "epoch": 1.2216933587187815, + "grad_norm": 1.2771326303482056, + "learning_rate": 9.952447368421053e-05, + "loss": 0.4856, + "step": 21817 + }, + { + "epoch": 1.2217493560309105, + "grad_norm": 1.2080665826797485, + "learning_rate": 9.95242105263158e-05, + "loss": 0.32, + "step": 21818 + }, + { + "epoch": 1.2218053533430395, + "grad_norm": 1.4783048629760742, + "learning_rate": 9.952394736842106e-05, + "loss": 0.4008, + "step": 21819 + }, + { + "epoch": 1.2218613506551685, + 
"grad_norm": 1.331005573272705, + "learning_rate": 9.952368421052632e-05, + "loss": 0.4148, + "step": 21820 + }, + { + "epoch": 1.2219173479672976, + "grad_norm": 1.3131736516952515, + "learning_rate": 9.952342105263158e-05, + "loss": 0.5158, + "step": 21821 + }, + { + "epoch": 1.2219733452794266, + "grad_norm": 1.3083288669586182, + "learning_rate": 9.952315789473685e-05, + "loss": 0.4283, + "step": 21822 + }, + { + "epoch": 1.2220293425915556, + "grad_norm": 1.2380086183547974, + "learning_rate": 9.952289473684211e-05, + "loss": 0.505, + "step": 21823 + }, + { + "epoch": 1.2220853399036846, + "grad_norm": 1.2146620750427246, + "learning_rate": 9.952263157894739e-05, + "loss": 0.4575, + "step": 21824 + }, + { + "epoch": 1.2221413372158136, + "grad_norm": 1.1539934873580933, + "learning_rate": 9.952236842105263e-05, + "loss": 0.4541, + "step": 21825 + }, + { + "epoch": 1.2221973345279427, + "grad_norm": 1.4234150648117065, + "learning_rate": 9.952210526315789e-05, + "loss": 0.3875, + "step": 21826 + }, + { + "epoch": 1.2222533318400717, + "grad_norm": 1.3258788585662842, + "learning_rate": 9.952184210526316e-05, + "loss": 0.3533, + "step": 21827 + }, + { + "epoch": 1.2223093291522007, + "grad_norm": 1.063740611076355, + "learning_rate": 9.952157894736842e-05, + "loss": 0.3701, + "step": 21828 + }, + { + "epoch": 1.2223653264643297, + "grad_norm": 1.3509206771850586, + "learning_rate": 9.952131578947368e-05, + "loss": 0.5831, + "step": 21829 + }, + { + "epoch": 1.2224213237764587, + "grad_norm": 1.35112464427948, + "learning_rate": 9.952105263157894e-05, + "loss": 0.3951, + "step": 21830 + }, + { + "epoch": 1.2224773210885878, + "grad_norm": 1.0923048257827759, + "learning_rate": 9.952078947368422e-05, + "loss": 0.2773, + "step": 21831 + }, + { + "epoch": 1.2225333184007168, + "grad_norm": 1.3539552688598633, + "learning_rate": 9.952052631578948e-05, + "loss": 0.4607, + "step": 21832 + }, + { + "epoch": 1.2225893157128458, + "grad_norm": 2.1112303733825684, + 
"learning_rate": 9.952026315789475e-05, + "loss": 0.4197, + "step": 21833 + }, + { + "epoch": 1.2226453130249748, + "grad_norm": 1.36155366897583, + "learning_rate": 9.952e-05, + "loss": 0.4435, + "step": 21834 + }, + { + "epoch": 1.2227013103371038, + "grad_norm": 1.6987247467041016, + "learning_rate": 9.951973684210527e-05, + "loss": 0.6736, + "step": 21835 + }, + { + "epoch": 1.2227573076492328, + "grad_norm": 1.2438112497329712, + "learning_rate": 9.951947368421053e-05, + "loss": 0.3778, + "step": 21836 + }, + { + "epoch": 1.2228133049613619, + "grad_norm": 1.5742439031600952, + "learning_rate": 9.95192105263158e-05, + "loss": 0.4305, + "step": 21837 + }, + { + "epoch": 1.2228693022734909, + "grad_norm": 1.335503101348877, + "learning_rate": 9.951894736842106e-05, + "loss": 0.4913, + "step": 21838 + }, + { + "epoch": 1.22292529958562, + "grad_norm": 1.3608275651931763, + "learning_rate": 9.951868421052632e-05, + "loss": 0.3757, + "step": 21839 + }, + { + "epoch": 1.222981296897749, + "grad_norm": 1.3223425149917603, + "learning_rate": 9.951842105263158e-05, + "loss": 0.3729, + "step": 21840 + }, + { + "epoch": 1.223037294209878, + "grad_norm": 1.2835404872894287, + "learning_rate": 9.951815789473685e-05, + "loss": 0.3632, + "step": 21841 + }, + { + "epoch": 1.223093291522007, + "grad_norm": 1.4302679300308228, + "learning_rate": 9.951789473684211e-05, + "loss": 0.496, + "step": 21842 + }, + { + "epoch": 1.223149288834136, + "grad_norm": 2.2417383193969727, + "learning_rate": 9.951763157894737e-05, + "loss": 0.4393, + "step": 21843 + }, + { + "epoch": 1.223205286146265, + "grad_norm": 1.3884906768798828, + "learning_rate": 9.951736842105263e-05, + "loss": 0.5005, + "step": 21844 + }, + { + "epoch": 1.223261283458394, + "grad_norm": 1.4113117456436157, + "learning_rate": 9.95171052631579e-05, + "loss": 0.4553, + "step": 21845 + }, + { + "epoch": 1.223317280770523, + "grad_norm": 1.2487066984176636, + "learning_rate": 9.951684210526317e-05, + "loss": 0.3879, + 
"step": 21846 + }, + { + "epoch": 1.223373278082652, + "grad_norm": 1.2708920240402222, + "learning_rate": 9.951657894736843e-05, + "loss": 0.4038, + "step": 21847 + }, + { + "epoch": 1.223429275394781, + "grad_norm": 1.1820656061172485, + "learning_rate": 9.951631578947369e-05, + "loss": 0.6154, + "step": 21848 + }, + { + "epoch": 1.22348527270691, + "grad_norm": 1.363383173942566, + "learning_rate": 9.951605263157895e-05, + "loss": 0.4523, + "step": 21849 + }, + { + "epoch": 1.2235412700190391, + "grad_norm": 1.3615005016326904, + "learning_rate": 9.951578947368422e-05, + "loss": 0.4351, + "step": 21850 + }, + { + "epoch": 1.2235972673311681, + "grad_norm": 1.5357478857040405, + "learning_rate": 9.951552631578948e-05, + "loss": 0.3839, + "step": 21851 + }, + { + "epoch": 1.2236532646432972, + "grad_norm": 1.2987799644470215, + "learning_rate": 9.951526315789474e-05, + "loss": 0.4115, + "step": 21852 + }, + { + "epoch": 1.2237092619554262, + "grad_norm": 1.168209195137024, + "learning_rate": 9.9515e-05, + "loss": 0.3717, + "step": 21853 + }, + { + "epoch": 1.2237652592675552, + "grad_norm": 2.50730562210083, + "learning_rate": 9.951473684210527e-05, + "loss": 0.4838, + "step": 21854 + }, + { + "epoch": 1.2238212565796842, + "grad_norm": 1.7884670495986938, + "learning_rate": 9.951447368421053e-05, + "loss": 0.5113, + "step": 21855 + }, + { + "epoch": 1.2238772538918132, + "grad_norm": 1.4803868532180786, + "learning_rate": 9.95142105263158e-05, + "loss": 0.4637, + "step": 21856 + }, + { + "epoch": 1.2239332512039423, + "grad_norm": 1.1885650157928467, + "learning_rate": 9.951394736842105e-05, + "loss": 0.4261, + "step": 21857 + }, + { + "epoch": 1.2239892485160713, + "grad_norm": 1.30832839012146, + "learning_rate": 9.951368421052632e-05, + "loss": 0.4534, + "step": 21858 + }, + { + "epoch": 1.2240452458282003, + "grad_norm": 1.4795804023742676, + "learning_rate": 9.951342105263158e-05, + "loss": 0.4565, + "step": 21859 + }, + { + "epoch": 1.2241012431403293, + 
"grad_norm": 1.636049509048462, + "learning_rate": 9.951315789473684e-05, + "loss": 0.5965, + "step": 21860 + }, + { + "epoch": 1.2241572404524583, + "grad_norm": 1.4008960723876953, + "learning_rate": 9.951289473684212e-05, + "loss": 0.4041, + "step": 21861 + }, + { + "epoch": 1.2242132377645873, + "grad_norm": 1.2367534637451172, + "learning_rate": 9.951263157894736e-05, + "loss": 0.4344, + "step": 21862 + }, + { + "epoch": 1.2242692350767164, + "grad_norm": 1.3793079853057861, + "learning_rate": 9.951236842105264e-05, + "loss": 0.367, + "step": 21863 + }, + { + "epoch": 1.2243252323888454, + "grad_norm": 1.6336742639541626, + "learning_rate": 9.95121052631579e-05, + "loss": 0.4368, + "step": 21864 + }, + { + "epoch": 1.2243812297009744, + "grad_norm": 1.6996160745620728, + "learning_rate": 9.951184210526317e-05, + "loss": 0.4245, + "step": 21865 + }, + { + "epoch": 1.2244372270131034, + "grad_norm": 1.3252843618392944, + "learning_rate": 9.951157894736843e-05, + "loss": 0.3982, + "step": 21866 + }, + { + "epoch": 1.2244932243252324, + "grad_norm": 1.583376407623291, + "learning_rate": 9.951131578947369e-05, + "loss": 0.4504, + "step": 21867 + }, + { + "epoch": 1.2245492216373615, + "grad_norm": 1.4512195587158203, + "learning_rate": 9.951105263157895e-05, + "loss": 0.481, + "step": 21868 + }, + { + "epoch": 1.2246052189494905, + "grad_norm": 1.6560012102127075, + "learning_rate": 9.951078947368422e-05, + "loss": 0.4535, + "step": 21869 + }, + { + "epoch": 1.2246612162616195, + "grad_norm": 3.5533268451690674, + "learning_rate": 9.951052631578948e-05, + "loss": 0.4402, + "step": 21870 + }, + { + "epoch": 1.2247172135737485, + "grad_norm": 1.4186865091323853, + "learning_rate": 9.951026315789474e-05, + "loss": 0.5834, + "step": 21871 + }, + { + "epoch": 1.2247732108858775, + "grad_norm": 1.329943299293518, + "learning_rate": 9.951e-05, + "loss": 0.4864, + "step": 21872 + }, + { + "epoch": 1.2248292081980066, + "grad_norm": 1.423318862915039, + "learning_rate": 
9.950973684210527e-05, + "loss": 0.4489, + "step": 21873 + }, + { + "epoch": 1.2248852055101356, + "grad_norm": 1.3300644159317017, + "learning_rate": 9.950947368421053e-05, + "loss": 0.4682, + "step": 21874 + }, + { + "epoch": 1.2249412028222646, + "grad_norm": 1.6161000728607178, + "learning_rate": 9.95092105263158e-05, + "loss": 0.4761, + "step": 21875 + }, + { + "epoch": 1.2249972001343936, + "grad_norm": 1.3090307712554932, + "learning_rate": 9.950894736842105e-05, + "loss": 0.3507, + "step": 21876 + }, + { + "epoch": 1.2250531974465226, + "grad_norm": 1.590484380722046, + "learning_rate": 9.950868421052631e-05, + "loss": 0.5039, + "step": 21877 + }, + { + "epoch": 1.2251091947586517, + "grad_norm": 1.5652788877487183, + "learning_rate": 9.950842105263159e-05, + "loss": 0.409, + "step": 21878 + }, + { + "epoch": 1.2251651920707807, + "grad_norm": 1.3590055704116821, + "learning_rate": 9.950815789473685e-05, + "loss": 0.4055, + "step": 21879 + }, + { + "epoch": 1.2252211893829097, + "grad_norm": 7.608685493469238, + "learning_rate": 9.95078947368421e-05, + "loss": 0.3679, + "step": 21880 + }, + { + "epoch": 1.2252771866950387, + "grad_norm": 1.1881346702575684, + "learning_rate": 9.950763157894737e-05, + "loss": 0.4288, + "step": 21881 + }, + { + "epoch": 1.2253331840071677, + "grad_norm": 1.3057074546813965, + "learning_rate": 9.950736842105264e-05, + "loss": 0.519, + "step": 21882 + }, + { + "epoch": 1.2253891813192967, + "grad_norm": 1.5817879438400269, + "learning_rate": 9.95071052631579e-05, + "loss": 0.6551, + "step": 21883 + }, + { + "epoch": 1.2254451786314258, + "grad_norm": 1.1580766439437866, + "learning_rate": 9.950684210526316e-05, + "loss": 0.4142, + "step": 21884 + }, + { + "epoch": 1.2255011759435548, + "grad_norm": 1.2933225631713867, + "learning_rate": 9.950657894736842e-05, + "loss": 0.3733, + "step": 21885 + }, + { + "epoch": 1.2255571732556838, + "grad_norm": 1.273256540298462, + "learning_rate": 9.950631578947369e-05, + "loss": 0.346, + 
"step": 21886 + }, + { + "epoch": 1.2256131705678128, + "grad_norm": 1.346725583076477, + "learning_rate": 9.950605263157895e-05, + "loss": 0.4856, + "step": 21887 + }, + { + "epoch": 1.2256691678799418, + "grad_norm": 1.4436297416687012, + "learning_rate": 9.950578947368422e-05, + "loss": 0.3798, + "step": 21888 + }, + { + "epoch": 1.2257251651920709, + "grad_norm": 1.3356685638427734, + "learning_rate": 9.950552631578947e-05, + "loss": 0.4504, + "step": 21889 + }, + { + "epoch": 1.2257811625041999, + "grad_norm": 1.2018934488296509, + "learning_rate": 9.950526315789474e-05, + "loss": 0.5068, + "step": 21890 + }, + { + "epoch": 1.225837159816329, + "grad_norm": 1.6328685283660889, + "learning_rate": 9.9505e-05, + "loss": 0.4376, + "step": 21891 + }, + { + "epoch": 1.225893157128458, + "grad_norm": 1.0057116746902466, + "learning_rate": 9.950473684210528e-05, + "loss": 0.3189, + "step": 21892 + }, + { + "epoch": 1.225949154440587, + "grad_norm": 1.5768277645111084, + "learning_rate": 9.950447368421054e-05, + "loss": 0.4603, + "step": 21893 + }, + { + "epoch": 1.226005151752716, + "grad_norm": 1.7225438356399536, + "learning_rate": 9.95042105263158e-05, + "loss": 0.4968, + "step": 21894 + }, + { + "epoch": 1.226061149064845, + "grad_norm": 1.4130609035491943, + "learning_rate": 9.950394736842106e-05, + "loss": 0.4008, + "step": 21895 + }, + { + "epoch": 1.226117146376974, + "grad_norm": 1.2787760496139526, + "learning_rate": 9.950368421052632e-05, + "loss": 0.4211, + "step": 21896 + }, + { + "epoch": 1.226173143689103, + "grad_norm": 1.7041151523590088, + "learning_rate": 9.950342105263159e-05, + "loss": 0.4104, + "step": 21897 + }, + { + "epoch": 1.226229141001232, + "grad_norm": 1.1386144161224365, + "learning_rate": 9.950315789473685e-05, + "loss": 0.3393, + "step": 21898 + }, + { + "epoch": 1.226285138313361, + "grad_norm": 1.2944810390472412, + "learning_rate": 9.950289473684211e-05, + "loss": 0.3709, + "step": 21899 + }, + { + "epoch": 1.22634113562549, + 
"grad_norm": 1.467772126197815, + "learning_rate": 9.950263157894737e-05, + "loss": 0.5673, + "step": 21900 + }, + { + "epoch": 1.226397132937619, + "grad_norm": 1.4492416381835938, + "learning_rate": 9.950236842105264e-05, + "loss": 0.5022, + "step": 21901 + }, + { + "epoch": 1.2264531302497481, + "grad_norm": 1.3122985363006592, + "learning_rate": 9.95021052631579e-05, + "loss": 0.38, + "step": 21902 + }, + { + "epoch": 1.2265091275618771, + "grad_norm": 1.4372916221618652, + "learning_rate": 9.950184210526316e-05, + "loss": 0.4658, + "step": 21903 + }, + { + "epoch": 1.2265651248740062, + "grad_norm": 1.2543593645095825, + "learning_rate": 9.950157894736842e-05, + "loss": 0.4077, + "step": 21904 + }, + { + "epoch": 1.2266211221861352, + "grad_norm": 1.4239617586135864, + "learning_rate": 9.95013157894737e-05, + "loss": 0.4194, + "step": 21905 + }, + { + "epoch": 1.2266771194982642, + "grad_norm": 1.496111273765564, + "learning_rate": 9.950105263157895e-05, + "loss": 0.3906, + "step": 21906 + }, + { + "epoch": 1.2267331168103932, + "grad_norm": 1.2875492572784424, + "learning_rate": 9.950078947368421e-05, + "loss": 0.4025, + "step": 21907 + }, + { + "epoch": 1.2267891141225222, + "grad_norm": 1.4934632778167725, + "learning_rate": 9.950052631578947e-05, + "loss": 0.4507, + "step": 21908 + }, + { + "epoch": 1.2268451114346512, + "grad_norm": 1.660032033920288, + "learning_rate": 9.950026315789475e-05, + "loss": 0.5312, + "step": 21909 + }, + { + "epoch": 1.2269011087467803, + "grad_norm": 1.4773995876312256, + "learning_rate": 9.95e-05, + "loss": 0.498, + "step": 21910 + }, + { + "epoch": 1.2269571060589093, + "grad_norm": 1.4536614418029785, + "learning_rate": 9.949973684210528e-05, + "loss": 0.4295, + "step": 21911 + }, + { + "epoch": 1.2270131033710383, + "grad_norm": 1.2980177402496338, + "learning_rate": 9.949947368421053e-05, + "loss": 0.3971, + "step": 21912 + }, + { + "epoch": 1.2270691006831673, + "grad_norm": 1.3747931718826294, + "learning_rate": 
9.949921052631579e-05, + "loss": 0.458, + "step": 21913 + }, + { + "epoch": 1.2271250979952963, + "grad_norm": 1.1824984550476074, + "learning_rate": 9.949894736842106e-05, + "loss": 0.4842, + "step": 21914 + }, + { + "epoch": 1.2271810953074254, + "grad_norm": 1.315635323524475, + "learning_rate": 9.949868421052632e-05, + "loss": 0.4605, + "step": 21915 + }, + { + "epoch": 1.2272370926195544, + "grad_norm": 1.3485517501831055, + "learning_rate": 9.949842105263159e-05, + "loss": 0.4314, + "step": 21916 + }, + { + "epoch": 1.2272930899316834, + "grad_norm": 1.3250218629837036, + "learning_rate": 9.949815789473684e-05, + "loss": 0.3913, + "step": 21917 + }, + { + "epoch": 1.2273490872438124, + "grad_norm": 1.4313737154006958, + "learning_rate": 9.949789473684211e-05, + "loss": 0.4219, + "step": 21918 + }, + { + "epoch": 1.2274050845559414, + "grad_norm": 1.1929755210876465, + "learning_rate": 9.949763157894737e-05, + "loss": 0.4118, + "step": 21919 + }, + { + "epoch": 1.2274610818680702, + "grad_norm": 1.24820876121521, + "learning_rate": 9.949736842105264e-05, + "loss": 0.363, + "step": 21920 + }, + { + "epoch": 1.2275170791801993, + "grad_norm": 1.482637643814087, + "learning_rate": 9.949710526315789e-05, + "loss": 0.413, + "step": 21921 + }, + { + "epoch": 1.2275730764923283, + "grad_norm": 1.4789204597473145, + "learning_rate": 9.949684210526316e-05, + "loss": 0.4624, + "step": 21922 + }, + { + "epoch": 1.2276290738044573, + "grad_norm": 1.3046389818191528, + "learning_rate": 9.949657894736842e-05, + "loss": 0.385, + "step": 21923 + }, + { + "epoch": 1.2276850711165863, + "grad_norm": 1.3254011869430542, + "learning_rate": 9.94963157894737e-05, + "loss": 0.4519, + "step": 21924 + }, + { + "epoch": 1.2277410684287153, + "grad_norm": 1.443664789199829, + "learning_rate": 9.949605263157896e-05, + "loss": 0.5582, + "step": 21925 + }, + { + "epoch": 1.2277970657408444, + "grad_norm": 1.4166078567504883, + "learning_rate": 9.949578947368422e-05, + "loss": 0.5071, + 
"step": 21926 + }, + { + "epoch": 1.2278530630529734, + "grad_norm": 1.342435598373413, + "learning_rate": 9.949552631578948e-05, + "loss": 0.4589, + "step": 21927 + }, + { + "epoch": 1.2279090603651024, + "grad_norm": 1.318955898284912, + "learning_rate": 9.949526315789475e-05, + "loss": 0.3732, + "step": 21928 + }, + { + "epoch": 1.2279650576772314, + "grad_norm": 1.3455950021743774, + "learning_rate": 9.949500000000001e-05, + "loss": 0.3828, + "step": 21929 + }, + { + "epoch": 1.2280210549893604, + "grad_norm": 1.5169085264205933, + "learning_rate": 9.949473684210527e-05, + "loss": 0.3917, + "step": 21930 + }, + { + "epoch": 1.2280770523014894, + "grad_norm": 1.2915533781051636, + "learning_rate": 9.949447368421053e-05, + "loss": 0.396, + "step": 21931 + }, + { + "epoch": 1.2281330496136185, + "grad_norm": 1.4210155010223389, + "learning_rate": 9.949421052631579e-05, + "loss": 0.5223, + "step": 21932 + }, + { + "epoch": 1.2281890469257475, + "grad_norm": 1.20391845703125, + "learning_rate": 9.949394736842106e-05, + "loss": 0.3596, + "step": 21933 + }, + { + "epoch": 1.2282450442378765, + "grad_norm": 1.4098650217056274, + "learning_rate": 9.949368421052632e-05, + "loss": 0.6011, + "step": 21934 + }, + { + "epoch": 1.2283010415500055, + "grad_norm": 1.37461256980896, + "learning_rate": 9.949342105263158e-05, + "loss": 0.495, + "step": 21935 + }, + { + "epoch": 1.2283570388621345, + "grad_norm": 1.2328859567642212, + "learning_rate": 9.949315789473684e-05, + "loss": 0.4217, + "step": 21936 + }, + { + "epoch": 1.2284130361742636, + "grad_norm": 1.310872197151184, + "learning_rate": 9.949289473684211e-05, + "loss": 0.3175, + "step": 21937 + }, + { + "epoch": 1.2284690334863926, + "grad_norm": 1.284334421157837, + "learning_rate": 9.949263157894737e-05, + "loss": 0.4362, + "step": 21938 + }, + { + "epoch": 1.2285250307985216, + "grad_norm": 1.304630160331726, + "learning_rate": 9.949236842105263e-05, + "loss": 0.3955, + "step": 21939 + }, + { + "epoch": 
1.2285810281106506, + "grad_norm": 1.5838603973388672, + "learning_rate": 9.949210526315789e-05, + "loss": 0.5161, + "step": 21940 + }, + { + "epoch": 1.2286370254227796, + "grad_norm": 1.4570293426513672, + "learning_rate": 9.949184210526317e-05, + "loss": 0.4528, + "step": 21941 + }, + { + "epoch": 1.2286930227349087, + "grad_norm": 1.5105780363082886, + "learning_rate": 9.949157894736843e-05, + "loss": 0.3747, + "step": 21942 + }, + { + "epoch": 1.2287490200470377, + "grad_norm": 1.4986460208892822, + "learning_rate": 9.94913157894737e-05, + "loss": 0.4594, + "step": 21943 + }, + { + "epoch": 1.2288050173591667, + "grad_norm": 1.5359302759170532, + "learning_rate": 9.949105263157895e-05, + "loss": 0.4651, + "step": 21944 + }, + { + "epoch": 1.2288610146712957, + "grad_norm": 1.283356785774231, + "learning_rate": 9.949078947368422e-05, + "loss": 0.416, + "step": 21945 + }, + { + "epoch": 1.2289170119834247, + "grad_norm": 1.3096617460250854, + "learning_rate": 9.949052631578948e-05, + "loss": 0.3662, + "step": 21946 + }, + { + "epoch": 1.2289730092955538, + "grad_norm": 1.2936625480651855, + "learning_rate": 9.949026315789474e-05, + "loss": 0.4686, + "step": 21947 + }, + { + "epoch": 1.2290290066076828, + "grad_norm": 1.3283549547195435, + "learning_rate": 9.949000000000001e-05, + "loss": 0.4289, + "step": 21948 + }, + { + "epoch": 1.2290850039198118, + "grad_norm": 1.4151219129562378, + "learning_rate": 9.948973684210526e-05, + "loss": 0.62, + "step": 21949 + }, + { + "epoch": 1.2291410012319408, + "grad_norm": 1.4237349033355713, + "learning_rate": 9.948947368421053e-05, + "loss": 0.5798, + "step": 21950 + }, + { + "epoch": 1.2291969985440698, + "grad_norm": 1.4631147384643555, + "learning_rate": 9.948921052631579e-05, + "loss": 0.485, + "step": 21951 + }, + { + "epoch": 1.2292529958561988, + "grad_norm": 1.5169850587844849, + "learning_rate": 9.948894736842106e-05, + "loss": 0.5413, + "step": 21952 + }, + { + "epoch": 1.2293089931683279, + "grad_norm": 
1.290627121925354, + "learning_rate": 9.948868421052632e-05, + "loss": 0.3879, + "step": 21953 + }, + { + "epoch": 1.2293649904804569, + "grad_norm": 1.3381317853927612, + "learning_rate": 9.948842105263158e-05, + "loss": 0.4044, + "step": 21954 + }, + { + "epoch": 1.229420987792586, + "grad_norm": 1.0008268356323242, + "learning_rate": 9.948815789473684e-05, + "loss": 0.3143, + "step": 21955 + }, + { + "epoch": 1.229476985104715, + "grad_norm": 2.7601494789123535, + "learning_rate": 9.948789473684212e-05, + "loss": 0.4273, + "step": 21956 + }, + { + "epoch": 1.229532982416844, + "grad_norm": 1.354504108428955, + "learning_rate": 9.948763157894738e-05, + "loss": 0.3963, + "step": 21957 + }, + { + "epoch": 1.229588979728973, + "grad_norm": 1.7259458303451538, + "learning_rate": 9.948736842105264e-05, + "loss": 0.5035, + "step": 21958 + }, + { + "epoch": 1.229644977041102, + "grad_norm": 1.5698692798614502, + "learning_rate": 9.94871052631579e-05, + "loss": 0.4932, + "step": 21959 + }, + { + "epoch": 1.229700974353231, + "grad_norm": 1.169247031211853, + "learning_rate": 9.948684210526317e-05, + "loss": 0.3707, + "step": 21960 + }, + { + "epoch": 1.22975697166536, + "grad_norm": 1.2915327548980713, + "learning_rate": 9.948657894736843e-05, + "loss": 0.5059, + "step": 21961 + }, + { + "epoch": 1.229812968977489, + "grad_norm": 1.2802354097366333, + "learning_rate": 9.948631578947369e-05, + "loss": 0.4147, + "step": 21962 + }, + { + "epoch": 1.229868966289618, + "grad_norm": 1.6147280931472778, + "learning_rate": 9.948605263157895e-05, + "loss": 0.4679, + "step": 21963 + }, + { + "epoch": 1.229924963601747, + "grad_norm": 1.5009760856628418, + "learning_rate": 9.948578947368421e-05, + "loss": 0.4806, + "step": 21964 + }, + { + "epoch": 1.229980960913876, + "grad_norm": 1.4112187623977661, + "learning_rate": 9.948552631578948e-05, + "loss": 0.5722, + "step": 21965 + }, + { + "epoch": 1.2300369582260051, + "grad_norm": 1.3325297832489014, + "learning_rate": 
9.948526315789474e-05, + "loss": 0.4073, + "step": 21966 + }, + { + "epoch": 1.2300929555381341, + "grad_norm": 1.1587578058242798, + "learning_rate": 9.9485e-05, + "loss": 0.3948, + "step": 21967 + }, + { + "epoch": 1.2301489528502632, + "grad_norm": 1.315110683441162, + "learning_rate": 9.948473684210526e-05, + "loss": 0.3932, + "step": 21968 + }, + { + "epoch": 1.2302049501623922, + "grad_norm": 1.4653820991516113, + "learning_rate": 9.948447368421053e-05, + "loss": 0.4289, + "step": 21969 + }, + { + "epoch": 1.2302609474745212, + "grad_norm": 1.9477462768554688, + "learning_rate": 9.948421052631579e-05, + "loss": 0.7315, + "step": 21970 + }, + { + "epoch": 1.2303169447866502, + "grad_norm": 1.3053503036499023, + "learning_rate": 9.948394736842107e-05, + "loss": 0.5627, + "step": 21971 + }, + { + "epoch": 1.2303729420987792, + "grad_norm": 1.2114213705062866, + "learning_rate": 9.948368421052631e-05, + "loss": 0.4151, + "step": 21972 + }, + { + "epoch": 1.2304289394109083, + "grad_norm": 1.4587870836257935, + "learning_rate": 9.948342105263159e-05, + "loss": 0.4482, + "step": 21973 + }, + { + "epoch": 1.2304849367230373, + "grad_norm": 1.4069160223007202, + "learning_rate": 9.948315789473685e-05, + "loss": 0.4085, + "step": 21974 + }, + { + "epoch": 1.2305409340351663, + "grad_norm": 1.241890549659729, + "learning_rate": 9.948289473684212e-05, + "loss": 0.4834, + "step": 21975 + }, + { + "epoch": 1.2305969313472953, + "grad_norm": 1.5539815425872803, + "learning_rate": 9.948263157894737e-05, + "loss": 0.6285, + "step": 21976 + }, + { + "epoch": 1.2306529286594243, + "grad_norm": 1.247409462928772, + "learning_rate": 9.948236842105264e-05, + "loss": 0.4226, + "step": 21977 + }, + { + "epoch": 1.2307089259715533, + "grad_norm": 1.646337866783142, + "learning_rate": 9.94821052631579e-05, + "loss": 0.4629, + "step": 21978 + }, + { + "epoch": 1.2307649232836824, + "grad_norm": 1.3517980575561523, + "learning_rate": 9.948184210526317e-05, + "loss": 0.3765, + "step": 
21979 + }, + { + "epoch": 1.2308209205958114, + "grad_norm": 1.4111981391906738, + "learning_rate": 9.948157894736843e-05, + "loss": 0.4171, + "step": 21980 + }, + { + "epoch": 1.2308769179079404, + "grad_norm": 1.6455378532409668, + "learning_rate": 9.948131578947368e-05, + "loss": 0.5553, + "step": 21981 + }, + { + "epoch": 1.2309329152200694, + "grad_norm": 1.2768588066101074, + "learning_rate": 9.948105263157895e-05, + "loss": 0.432, + "step": 21982 + }, + { + "epoch": 1.2309889125321984, + "grad_norm": 1.9461045265197754, + "learning_rate": 9.948078947368421e-05, + "loss": 0.5182, + "step": 21983 + }, + { + "epoch": 1.2310449098443275, + "grad_norm": 1.5526859760284424, + "learning_rate": 9.948052631578948e-05, + "loss": 0.5356, + "step": 21984 + }, + { + "epoch": 1.2311009071564565, + "grad_norm": 1.4912222623825073, + "learning_rate": 9.948026315789474e-05, + "loss": 0.5068, + "step": 21985 + }, + { + "epoch": 1.2311569044685855, + "grad_norm": 1.3449803590774536, + "learning_rate": 9.948e-05, + "loss": 0.4846, + "step": 21986 + }, + { + "epoch": 1.2312129017807145, + "grad_norm": 1.4552032947540283, + "learning_rate": 9.947973684210526e-05, + "loss": 0.4275, + "step": 21987 + }, + { + "epoch": 1.2312688990928435, + "grad_norm": 1.3703938722610474, + "learning_rate": 9.947947368421054e-05, + "loss": 0.5228, + "step": 21988 + }, + { + "epoch": 1.2313248964049726, + "grad_norm": 2.451890230178833, + "learning_rate": 9.94792105263158e-05, + "loss": 0.3579, + "step": 21989 + }, + { + "epoch": 1.2313808937171016, + "grad_norm": 1.6656391620635986, + "learning_rate": 9.947894736842106e-05, + "loss": 0.5006, + "step": 21990 + }, + { + "epoch": 1.2314368910292306, + "grad_norm": 1.4365131855010986, + "learning_rate": 9.947868421052632e-05, + "loss": 0.3193, + "step": 21991 + }, + { + "epoch": 1.2314928883413596, + "grad_norm": 1.176457405090332, + "learning_rate": 9.947842105263159e-05, + "loss": 0.335, + "step": 21992 + }, + { + "epoch": 1.2315488856534886, + 
"grad_norm": 1.615265130996704, + "learning_rate": 9.947815789473685e-05, + "loss": 0.4727, + "step": 21993 + }, + { + "epoch": 1.2316048829656177, + "grad_norm": 1.2442609071731567, + "learning_rate": 9.947789473684211e-05, + "loss": 0.5069, + "step": 21994 + }, + { + "epoch": 1.2316608802777467, + "grad_norm": 1.1496793031692505, + "learning_rate": 9.947763157894737e-05, + "loss": 0.4064, + "step": 21995 + }, + { + "epoch": 1.2317168775898757, + "grad_norm": 1.6059843301773071, + "learning_rate": 9.947736842105264e-05, + "loss": 0.385, + "step": 21996 + }, + { + "epoch": 1.2317728749020047, + "grad_norm": 1.0907304286956787, + "learning_rate": 9.94771052631579e-05, + "loss": 0.3182, + "step": 21997 + }, + { + "epoch": 1.2318288722141337, + "grad_norm": 1.3871437311172485, + "learning_rate": 9.947684210526316e-05, + "loss": 0.432, + "step": 21998 + }, + { + "epoch": 1.2318848695262627, + "grad_norm": 1.5002268552780151, + "learning_rate": 9.947657894736842e-05, + "loss": 0.4025, + "step": 21999 + }, + { + "epoch": 1.2319408668383918, + "grad_norm": 1.7191928625106812, + "learning_rate": 9.947631578947368e-05, + "loss": 0.459, + "step": 22000 + }, + { + "epoch": 1.2319968641505208, + "grad_norm": 1.2988009452819824, + "learning_rate": 9.947605263157895e-05, + "loss": 0.3909, + "step": 22001 + }, + { + "epoch": 1.2320528614626498, + "grad_norm": 1.4330321550369263, + "learning_rate": 9.947578947368421e-05, + "loss": 0.4795, + "step": 22002 + }, + { + "epoch": 1.2321088587747788, + "grad_norm": 1.443952202796936, + "learning_rate": 9.947552631578949e-05, + "loss": 0.4142, + "step": 22003 + }, + { + "epoch": 1.2321648560869078, + "grad_norm": 1.4141242504119873, + "learning_rate": 9.947526315789473e-05, + "loss": 0.4104, + "step": 22004 + }, + { + "epoch": 1.2322208533990369, + "grad_norm": 1.3222407102584839, + "learning_rate": 9.9475e-05, + "loss": 0.5119, + "step": 22005 + }, + { + "epoch": 1.2322768507111659, + "grad_norm": 1.3565367460250854, + "learning_rate": 
9.947473684210527e-05, + "loss": 0.537, + "step": 22006 + }, + { + "epoch": 1.232332848023295, + "grad_norm": 1.2684520483016968, + "learning_rate": 9.947447368421054e-05, + "loss": 0.5206, + "step": 22007 + }, + { + "epoch": 1.232388845335424, + "grad_norm": 2.818842887878418, + "learning_rate": 9.94742105263158e-05, + "loss": 0.43, + "step": 22008 + }, + { + "epoch": 1.232444842647553, + "grad_norm": 1.4474869966506958, + "learning_rate": 9.947394736842106e-05, + "loss": 0.5481, + "step": 22009 + }, + { + "epoch": 1.232500839959682, + "grad_norm": 1.3364468812942505, + "learning_rate": 9.947368421052632e-05, + "loss": 0.417, + "step": 22010 + }, + { + "epoch": 1.232556837271811, + "grad_norm": 1.1697131395339966, + "learning_rate": 9.947342105263159e-05, + "loss": 0.392, + "step": 22011 + }, + { + "epoch": 1.23261283458394, + "grad_norm": 1.129098653793335, + "learning_rate": 9.947315789473685e-05, + "loss": 0.3236, + "step": 22012 + }, + { + "epoch": 1.232668831896069, + "grad_norm": 1.2740800380706787, + "learning_rate": 9.947289473684211e-05, + "loss": 0.4697, + "step": 22013 + }, + { + "epoch": 1.232724829208198, + "grad_norm": 1.2584391832351685, + "learning_rate": 9.947263157894737e-05, + "loss": 0.4366, + "step": 22014 + }, + { + "epoch": 1.232780826520327, + "grad_norm": 1.5128530263900757, + "learning_rate": 9.947236842105264e-05, + "loss": 0.4795, + "step": 22015 + }, + { + "epoch": 1.232836823832456, + "grad_norm": 1.2469381093978882, + "learning_rate": 9.94721052631579e-05, + "loss": 0.3744, + "step": 22016 + }, + { + "epoch": 1.232892821144585, + "grad_norm": 1.313928246498108, + "learning_rate": 9.947184210526316e-05, + "loss": 0.5378, + "step": 22017 + }, + { + "epoch": 1.2329488184567141, + "grad_norm": 1.3469971418380737, + "learning_rate": 9.947157894736842e-05, + "loss": 0.3226, + "step": 22018 + }, + { + "epoch": 1.2330048157688431, + "grad_norm": 1.7495861053466797, + "learning_rate": 9.947131578947368e-05, + "loss": 0.4469, + "step": 22019 + 
}, + { + "epoch": 1.2330608130809722, + "grad_norm": 1.542587399482727, + "learning_rate": 9.947105263157896e-05, + "loss": 0.4056, + "step": 22020 + }, + { + "epoch": 1.2331168103931012, + "grad_norm": 1.398809552192688, + "learning_rate": 9.947078947368422e-05, + "loss": 0.5727, + "step": 22021 + }, + { + "epoch": 1.2331728077052302, + "grad_norm": 1.3981959819793701, + "learning_rate": 9.947052631578948e-05, + "loss": 0.4962, + "step": 22022 + }, + { + "epoch": 1.2332288050173592, + "grad_norm": 1.6644326448440552, + "learning_rate": 9.947026315789473e-05, + "loss": 0.7883, + "step": 22023 + }, + { + "epoch": 1.2332848023294882, + "grad_norm": 1.3118822574615479, + "learning_rate": 9.947000000000001e-05, + "loss": 0.4914, + "step": 22024 + }, + { + "epoch": 1.2333407996416172, + "grad_norm": 1.5341873168945312, + "learning_rate": 9.946973684210527e-05, + "loss": 0.478, + "step": 22025 + }, + { + "epoch": 1.2333967969537463, + "grad_norm": 1.3558776378631592, + "learning_rate": 9.946947368421054e-05, + "loss": 0.4463, + "step": 22026 + }, + { + "epoch": 1.2334527942658753, + "grad_norm": 1.348998785018921, + "learning_rate": 9.946921052631579e-05, + "loss": 0.4356, + "step": 22027 + }, + { + "epoch": 1.2335087915780043, + "grad_norm": 1.3207428455352783, + "learning_rate": 9.946894736842106e-05, + "loss": 0.5011, + "step": 22028 + }, + { + "epoch": 1.2335647888901333, + "grad_norm": 1.0505967140197754, + "learning_rate": 9.946868421052632e-05, + "loss": 0.3547, + "step": 22029 + }, + { + "epoch": 1.2336207862022623, + "grad_norm": 1.4877405166625977, + "learning_rate": 9.94684210526316e-05, + "loss": 0.5126, + "step": 22030 + }, + { + "epoch": 1.2336767835143914, + "grad_norm": 1.4617226123809814, + "learning_rate": 9.946815789473684e-05, + "loss": 0.457, + "step": 22031 + }, + { + "epoch": 1.2337327808265204, + "grad_norm": 1.4387129545211792, + "learning_rate": 9.946789473684211e-05, + "loss": 0.5297, + "step": 22032 + }, + { + "epoch": 1.2337887781386494, + 
"grad_norm": 1.2057104110717773, + "learning_rate": 9.946763157894737e-05, + "loss": 0.44, + "step": 22033 + }, + { + "epoch": 1.2338447754507784, + "grad_norm": 1.140297532081604, + "learning_rate": 9.946736842105263e-05, + "loss": 0.4564, + "step": 22034 + }, + { + "epoch": 1.2339007727629074, + "grad_norm": 1.2437033653259277, + "learning_rate": 9.94671052631579e-05, + "loss": 0.4692, + "step": 22035 + }, + { + "epoch": 1.2339567700750365, + "grad_norm": 1.368577003479004, + "learning_rate": 9.946684210526315e-05, + "loss": 0.4024, + "step": 22036 + }, + { + "epoch": 1.2340127673871655, + "grad_norm": 1.985155463218689, + "learning_rate": 9.946657894736843e-05, + "loss": 0.5369, + "step": 22037 + }, + { + "epoch": 1.2340687646992945, + "grad_norm": 1.2214103937149048, + "learning_rate": 9.946631578947369e-05, + "loss": 0.3905, + "step": 22038 + }, + { + "epoch": 1.2341247620114235, + "grad_norm": 1.4224534034729004, + "learning_rate": 9.946605263157896e-05, + "loss": 0.4445, + "step": 22039 + }, + { + "epoch": 1.2341807593235525, + "grad_norm": 1.625382423400879, + "learning_rate": 9.946578947368422e-05, + "loss": 0.4275, + "step": 22040 + }, + { + "epoch": 1.2342367566356816, + "grad_norm": 1.3813602924346924, + "learning_rate": 9.946552631578948e-05, + "loss": 0.4503, + "step": 22041 + }, + { + "epoch": 1.2342927539478106, + "grad_norm": 1.353810429573059, + "learning_rate": 9.946526315789474e-05, + "loss": 0.5219, + "step": 22042 + }, + { + "epoch": 1.2343487512599396, + "grad_norm": 1.115501880645752, + "learning_rate": 9.946500000000001e-05, + "loss": 0.2978, + "step": 22043 + }, + { + "epoch": 1.2344047485720686, + "grad_norm": 1.4730665683746338, + "learning_rate": 9.946473684210527e-05, + "loss": 0.3858, + "step": 22044 + }, + { + "epoch": 1.2344607458841976, + "grad_norm": 5.164679527282715, + "learning_rate": 9.946447368421053e-05, + "loss": 0.3937, + "step": 22045 + }, + { + "epoch": 1.2345167431963266, + "grad_norm": 1.3251657485961914, + 
"learning_rate": 9.946421052631579e-05, + "loss": 0.4165, + "step": 22046 + }, + { + "epoch": 1.2345727405084557, + "grad_norm": 2.141411781311035, + "learning_rate": 9.946394736842106e-05, + "loss": 0.593, + "step": 22047 + }, + { + "epoch": 1.2346287378205847, + "grad_norm": 1.614847183227539, + "learning_rate": 9.946368421052632e-05, + "loss": 0.3216, + "step": 22048 + }, + { + "epoch": 1.2346847351327137, + "grad_norm": 1.5325751304626465, + "learning_rate": 9.946342105263158e-05, + "loss": 0.5522, + "step": 22049 + }, + { + "epoch": 1.2347407324448427, + "grad_norm": 1.457053542137146, + "learning_rate": 9.946315789473684e-05, + "loss": 0.5598, + "step": 22050 + }, + { + "epoch": 1.2347967297569717, + "grad_norm": 1.3280045986175537, + "learning_rate": 9.94628947368421e-05, + "loss": 0.4834, + "step": 22051 + }, + { + "epoch": 1.2348527270691008, + "grad_norm": 1.3620203733444214, + "learning_rate": 9.946263157894738e-05, + "loss": 0.4212, + "step": 22052 + }, + { + "epoch": 1.2349087243812298, + "grad_norm": 1.6541097164154053, + "learning_rate": 9.946236842105264e-05, + "loss": 0.5099, + "step": 22053 + }, + { + "epoch": 1.2349647216933588, + "grad_norm": 1.9816346168518066, + "learning_rate": 9.94621052631579e-05, + "loss": 0.6487, + "step": 22054 + }, + { + "epoch": 1.2350207190054878, + "grad_norm": 1.4045215845108032, + "learning_rate": 9.946184210526315e-05, + "loss": 0.4277, + "step": 22055 + }, + { + "epoch": 1.2350767163176168, + "grad_norm": 1.0655875205993652, + "learning_rate": 9.946157894736843e-05, + "loss": 0.3683, + "step": 22056 + }, + { + "epoch": 1.2351327136297459, + "grad_norm": 1.6678571701049805, + "learning_rate": 9.946131578947369e-05, + "loss": 0.5642, + "step": 22057 + }, + { + "epoch": 1.2351887109418749, + "grad_norm": 1.317575216293335, + "learning_rate": 9.946105263157896e-05, + "loss": 0.3587, + "step": 22058 + }, + { + "epoch": 1.235244708254004, + "grad_norm": 1.7448569536209106, + "learning_rate": 9.946078947368421e-05, + 
"loss": 0.5373, + "step": 22059 + }, + { + "epoch": 1.235300705566133, + "grad_norm": 1.504167914390564, + "learning_rate": 9.946052631578948e-05, + "loss": 0.4347, + "step": 22060 + }, + { + "epoch": 1.235356702878262, + "grad_norm": 1.193869709968567, + "learning_rate": 9.946026315789474e-05, + "loss": 0.3772, + "step": 22061 + }, + { + "epoch": 1.235412700190391, + "grad_norm": 1.2586314678192139, + "learning_rate": 9.946000000000001e-05, + "loss": 0.4539, + "step": 22062 + }, + { + "epoch": 1.23546869750252, + "grad_norm": 1.4514977931976318, + "learning_rate": 9.945973684210527e-05, + "loss": 0.3762, + "step": 22063 + }, + { + "epoch": 1.235524694814649, + "grad_norm": 2.0244147777557373, + "learning_rate": 9.945947368421053e-05, + "loss": 0.5464, + "step": 22064 + }, + { + "epoch": 1.2355806921267778, + "grad_norm": 1.2562710046768188, + "learning_rate": 9.945921052631579e-05, + "loss": 0.3524, + "step": 22065 + }, + { + "epoch": 1.2356366894389068, + "grad_norm": 1.542137861251831, + "learning_rate": 9.945894736842107e-05, + "loss": 0.552, + "step": 22066 + }, + { + "epoch": 1.2356926867510358, + "grad_norm": 1.2267441749572754, + "learning_rate": 9.945868421052633e-05, + "loss": 0.4243, + "step": 22067 + }, + { + "epoch": 1.2357486840631648, + "grad_norm": 1.5006121397018433, + "learning_rate": 9.945842105263159e-05, + "loss": 0.4032, + "step": 22068 + }, + { + "epoch": 1.2358046813752939, + "grad_norm": 2.746145725250244, + "learning_rate": 9.945815789473684e-05, + "loss": 0.4965, + "step": 22069 + }, + { + "epoch": 1.2358606786874229, + "grad_norm": 1.4521000385284424, + "learning_rate": 9.94578947368421e-05, + "loss": 0.5064, + "step": 22070 + }, + { + "epoch": 1.235916675999552, + "grad_norm": 1.5078973770141602, + "learning_rate": 9.945763157894738e-05, + "loss": 0.4564, + "step": 22071 + }, + { + "epoch": 1.235972673311681, + "grad_norm": 1.340214490890503, + "learning_rate": 9.945736842105264e-05, + "loss": 0.3385, + "step": 22072 + }, + { + "epoch": 
1.23602867062381, + "grad_norm": 2.683298110961914, + "learning_rate": 9.94571052631579e-05, + "loss": 0.6248, + "step": 22073 + }, + { + "epoch": 1.236084667935939, + "grad_norm": 2.3832719326019287, + "learning_rate": 9.945684210526316e-05, + "loss": 0.3919, + "step": 22074 + }, + { + "epoch": 1.236140665248068, + "grad_norm": 1.521113634109497, + "learning_rate": 9.945657894736843e-05, + "loss": 0.4484, + "step": 22075 + }, + { + "epoch": 1.236196662560197, + "grad_norm": 1.3267568349838257, + "learning_rate": 9.945631578947369e-05, + "loss": 0.4635, + "step": 22076 + }, + { + "epoch": 1.236252659872326, + "grad_norm": 1.3356620073318481, + "learning_rate": 9.945605263157895e-05, + "loss": 0.438, + "step": 22077 + }, + { + "epoch": 1.236308657184455, + "grad_norm": 1.4089046716690063, + "learning_rate": 9.945578947368421e-05, + "loss": 0.6273, + "step": 22078 + }, + { + "epoch": 1.236364654496584, + "grad_norm": 1.6194113492965698, + "learning_rate": 9.945552631578948e-05, + "loss": 0.4205, + "step": 22079 + }, + { + "epoch": 1.236420651808713, + "grad_norm": 1.4121274948120117, + "learning_rate": 9.945526315789474e-05, + "loss": 0.453, + "step": 22080 + }, + { + "epoch": 1.236476649120842, + "grad_norm": 1.3679141998291016, + "learning_rate": 9.945500000000002e-05, + "loss": 0.484, + "step": 22081 + }, + { + "epoch": 1.2365326464329711, + "grad_norm": 1.5797394514083862, + "learning_rate": 9.945473684210526e-05, + "loss": 0.5339, + "step": 22082 + }, + { + "epoch": 1.2365886437451001, + "grad_norm": 1.3631484508514404, + "learning_rate": 9.945447368421054e-05, + "loss": 0.3056, + "step": 22083 + }, + { + "epoch": 1.2366446410572292, + "grad_norm": 2.1892387866973877, + "learning_rate": 9.94542105263158e-05, + "loss": 0.4002, + "step": 22084 + }, + { + "epoch": 1.2367006383693582, + "grad_norm": 1.414676547050476, + "learning_rate": 9.945394736842105e-05, + "loss": 0.5185, + "step": 22085 + }, + { + "epoch": 1.2367566356814872, + "grad_norm": 1.4846559762954712, 
+ "learning_rate": 9.945368421052631e-05, + "loss": 0.5629, + "step": 22086 + }, + { + "epoch": 1.2368126329936162, + "grad_norm": 1.285040259361267, + "learning_rate": 9.945342105263157e-05, + "loss": 0.42, + "step": 22087 + }, + { + "epoch": 1.2368686303057452, + "grad_norm": 1.1735033988952637, + "learning_rate": 9.945315789473685e-05, + "loss": 0.4051, + "step": 22088 + }, + { + "epoch": 1.2369246276178743, + "grad_norm": 1.2226616144180298, + "learning_rate": 9.945289473684211e-05, + "loss": 0.3276, + "step": 22089 + }, + { + "epoch": 1.2369806249300033, + "grad_norm": 1.4109970331192017, + "learning_rate": 9.945263157894738e-05, + "loss": 0.4347, + "step": 22090 + }, + { + "epoch": 1.2370366222421323, + "grad_norm": 1.3883955478668213, + "learning_rate": 9.945236842105263e-05, + "loss": 0.3999, + "step": 22091 + }, + { + "epoch": 1.2370926195542613, + "grad_norm": 1.646531105041504, + "learning_rate": 9.94521052631579e-05, + "loss": 0.5099, + "step": 22092 + }, + { + "epoch": 1.2371486168663903, + "grad_norm": 1.2651643753051758, + "learning_rate": 9.945184210526316e-05, + "loss": 0.3708, + "step": 22093 + }, + { + "epoch": 1.2372046141785193, + "grad_norm": 1.265564203262329, + "learning_rate": 9.945157894736843e-05, + "loss": 0.398, + "step": 22094 + }, + { + "epoch": 1.2372606114906484, + "grad_norm": 1.3503236770629883, + "learning_rate": 9.945131578947369e-05, + "loss": 0.3481, + "step": 22095 + }, + { + "epoch": 1.2373166088027774, + "grad_norm": 1.5110692977905273, + "learning_rate": 9.945105263157895e-05, + "loss": 0.4045, + "step": 22096 + }, + { + "epoch": 1.2373726061149064, + "grad_norm": 1.2529535293579102, + "learning_rate": 9.945078947368421e-05, + "loss": 0.4833, + "step": 22097 + }, + { + "epoch": 1.2374286034270354, + "grad_norm": 1.2186033725738525, + "learning_rate": 9.945052631578949e-05, + "loss": 0.408, + "step": 22098 + }, + { + "epoch": 1.2374846007391644, + "grad_norm": 1.1480300426483154, + "learning_rate": 9.945026315789475e-05, + 
"loss": 0.4217, + "step": 22099 + }, + { + "epoch": 1.2375405980512935, + "grad_norm": 1.8707743883132935, + "learning_rate": 9.945e-05, + "loss": 0.4986, + "step": 22100 + }, + { + "epoch": 1.2375965953634225, + "grad_norm": 1.02902090549469, + "learning_rate": 9.944973684210526e-05, + "loss": 0.3439, + "step": 22101 + }, + { + "epoch": 1.2376525926755515, + "grad_norm": 1.3396210670471191, + "learning_rate": 9.944947368421052e-05, + "loss": 0.4738, + "step": 22102 + }, + { + "epoch": 1.2377085899876805, + "grad_norm": 1.529415488243103, + "learning_rate": 9.94492105263158e-05, + "loss": 0.562, + "step": 22103 + }, + { + "epoch": 1.2377645872998095, + "grad_norm": 1.4021307229995728, + "learning_rate": 9.944894736842106e-05, + "loss": 0.5112, + "step": 22104 + }, + { + "epoch": 1.2378205846119386, + "grad_norm": 1.6247341632843018, + "learning_rate": 9.944868421052632e-05, + "loss": 0.4893, + "step": 22105 + }, + { + "epoch": 1.2378765819240676, + "grad_norm": 2.6440608501434326, + "learning_rate": 9.944842105263158e-05, + "loss": 0.3807, + "step": 22106 + }, + { + "epoch": 1.2379325792361966, + "grad_norm": 1.268375277519226, + "learning_rate": 9.944815789473685e-05, + "loss": 0.3197, + "step": 22107 + }, + { + "epoch": 1.2379885765483256, + "grad_norm": 1.6202696561813354, + "learning_rate": 9.944789473684211e-05, + "loss": 0.5831, + "step": 22108 + }, + { + "epoch": 1.2380445738604546, + "grad_norm": 1.4476566314697266, + "learning_rate": 9.944763157894737e-05, + "loss": 0.5444, + "step": 22109 + }, + { + "epoch": 1.2381005711725837, + "grad_norm": 1.3280572891235352, + "learning_rate": 9.944736842105263e-05, + "loss": 0.3884, + "step": 22110 + }, + { + "epoch": 1.2381565684847127, + "grad_norm": 1.6256791353225708, + "learning_rate": 9.94471052631579e-05, + "loss": 0.496, + "step": 22111 + }, + { + "epoch": 1.2382125657968417, + "grad_norm": 1.2308534383773804, + "learning_rate": 9.944684210526316e-05, + "loss": 0.498, + "step": 22112 + }, + { + "epoch": 
1.2382685631089707, + "grad_norm": 1.2840787172317505, + "learning_rate": 9.944657894736844e-05, + "loss": 0.478, + "step": 22113 + }, + { + "epoch": 1.2383245604210997, + "grad_norm": 1.9302207231521606, + "learning_rate": 9.944631578947368e-05, + "loss": 0.3156, + "step": 22114 + }, + { + "epoch": 1.2383805577332287, + "grad_norm": 1.466124415397644, + "learning_rate": 9.944605263157896e-05, + "loss": 0.5097, + "step": 22115 + }, + { + "epoch": 1.2384365550453578, + "grad_norm": 1.376786470413208, + "learning_rate": 9.944578947368421e-05, + "loss": 0.5116, + "step": 22116 + }, + { + "epoch": 1.2384925523574868, + "grad_norm": 1.3685163259506226, + "learning_rate": 9.944552631578949e-05, + "loss": 0.5624, + "step": 22117 + }, + { + "epoch": 1.2385485496696158, + "grad_norm": 1.5305886268615723, + "learning_rate": 9.944526315789475e-05, + "loss": 0.4855, + "step": 22118 + }, + { + "epoch": 1.2386045469817448, + "grad_norm": 1.2085416316986084, + "learning_rate": 9.9445e-05, + "loss": 0.382, + "step": 22119 + }, + { + "epoch": 1.2386605442938738, + "grad_norm": 1.3168268203735352, + "learning_rate": 9.944473684210527e-05, + "loss": 0.5213, + "step": 22120 + }, + { + "epoch": 1.2387165416060029, + "grad_norm": 1.3097447156906128, + "learning_rate": 9.944447368421053e-05, + "loss": 0.4341, + "step": 22121 + }, + { + "epoch": 1.2387725389181319, + "grad_norm": 1.3421961069107056, + "learning_rate": 9.94442105263158e-05, + "loss": 0.3893, + "step": 22122 + }, + { + "epoch": 1.238828536230261, + "grad_norm": 1.2957892417907715, + "learning_rate": 9.944394736842105e-05, + "loss": 0.4352, + "step": 22123 + }, + { + "epoch": 1.23888453354239, + "grad_norm": 1.2423800230026245, + "learning_rate": 9.944368421052632e-05, + "loss": 0.454, + "step": 22124 + }, + { + "epoch": 1.238940530854519, + "grad_norm": 1.5311256647109985, + "learning_rate": 9.944342105263158e-05, + "loss": 0.5097, + "step": 22125 + }, + { + "epoch": 1.238996528166648, + "grad_norm": 1.586444616317749, + 
"learning_rate": 9.944315789473685e-05, + "loss": 0.4933, + "step": 22126 + }, + { + "epoch": 1.239052525478777, + "grad_norm": 1.3216956853866577, + "learning_rate": 9.944289473684211e-05, + "loss": 0.3512, + "step": 22127 + }, + { + "epoch": 1.239108522790906, + "grad_norm": 1.4504178762435913, + "learning_rate": 9.944263157894737e-05, + "loss": 0.4363, + "step": 22128 + }, + { + "epoch": 1.239164520103035, + "grad_norm": 1.3706461191177368, + "learning_rate": 9.944236842105263e-05, + "loss": 0.3395, + "step": 22129 + }, + { + "epoch": 1.239220517415164, + "grad_norm": 1.4141967296600342, + "learning_rate": 9.94421052631579e-05, + "loss": 0.3825, + "step": 22130 + }, + { + "epoch": 1.239276514727293, + "grad_norm": 1.3471068143844604, + "learning_rate": 9.944184210526316e-05, + "loss": 0.4097, + "step": 22131 + }, + { + "epoch": 1.239332512039422, + "grad_norm": 1.5354331731796265, + "learning_rate": 9.944157894736842e-05, + "loss": 0.4644, + "step": 22132 + }, + { + "epoch": 1.239388509351551, + "grad_norm": 1.3198585510253906, + "learning_rate": 9.944131578947368e-05, + "loss": 0.4278, + "step": 22133 + }, + { + "epoch": 1.2394445066636801, + "grad_norm": 1.4136360883712769, + "learning_rate": 9.944105263157896e-05, + "loss": 0.3762, + "step": 22134 + }, + { + "epoch": 1.2395005039758091, + "grad_norm": 1.4282978773117065, + "learning_rate": 9.944078947368422e-05, + "loss": 0.4303, + "step": 22135 + }, + { + "epoch": 1.2395565012879382, + "grad_norm": 1.3022985458374023, + "learning_rate": 9.944052631578949e-05, + "loss": 0.4176, + "step": 22136 + }, + { + "epoch": 1.2396124986000672, + "grad_norm": 1.4591301679611206, + "learning_rate": 9.944026315789474e-05, + "loss": 0.4183, + "step": 22137 + }, + { + "epoch": 1.2396684959121962, + "grad_norm": 1.5927734375, + "learning_rate": 9.944e-05, + "loss": 0.5347, + "step": 22138 + }, + { + "epoch": 1.2397244932243252, + "grad_norm": 1.2869540452957153, + "learning_rate": 9.943973684210527e-05, + "loss": 0.4606, + 
"step": 22139 + }, + { + "epoch": 1.2397804905364542, + "grad_norm": 1.3185064792633057, + "learning_rate": 9.943947368421053e-05, + "loss": 0.6397, + "step": 22140 + }, + { + "epoch": 1.2398364878485832, + "grad_norm": 1.3768068552017212, + "learning_rate": 9.943921052631579e-05, + "loss": 0.3936, + "step": 22141 + }, + { + "epoch": 1.2398924851607123, + "grad_norm": 1.537149429321289, + "learning_rate": 9.943894736842105e-05, + "loss": 0.4421, + "step": 22142 + }, + { + "epoch": 1.2399484824728413, + "grad_norm": 1.7025221586227417, + "learning_rate": 9.943868421052632e-05, + "loss": 0.6484, + "step": 22143 + }, + { + "epoch": 1.2400044797849703, + "grad_norm": 1.379907488822937, + "learning_rate": 9.943842105263158e-05, + "loss": 0.4589, + "step": 22144 + }, + { + "epoch": 1.2400604770970993, + "grad_norm": 1.6071739196777344, + "learning_rate": 9.943815789473686e-05, + "loss": 0.5187, + "step": 22145 + }, + { + "epoch": 1.2401164744092283, + "grad_norm": 1.2255569696426392, + "learning_rate": 9.94378947368421e-05, + "loss": 0.5301, + "step": 22146 + }, + { + "epoch": 1.2401724717213574, + "grad_norm": 1.5059415102005005, + "learning_rate": 9.943763157894737e-05, + "loss": 0.5854, + "step": 22147 + }, + { + "epoch": 1.2402284690334864, + "grad_norm": 1.4591748714447021, + "learning_rate": 9.943736842105263e-05, + "loss": 0.3683, + "step": 22148 + }, + { + "epoch": 1.2402844663456154, + "grad_norm": 1.818745493888855, + "learning_rate": 9.943710526315791e-05, + "loss": 0.5289, + "step": 22149 + }, + { + "epoch": 1.2403404636577444, + "grad_norm": 1.3020381927490234, + "learning_rate": 9.943684210526317e-05, + "loss": 0.4695, + "step": 22150 + }, + { + "epoch": 1.2403964609698734, + "grad_norm": 1.2086421251296997, + "learning_rate": 9.943657894736843e-05, + "loss": 0.3829, + "step": 22151 + }, + { + "epoch": 1.2404524582820025, + "grad_norm": 1.4827592372894287, + "learning_rate": 9.943631578947369e-05, + "loss": 0.6491, + "step": 22152 + }, + { + "epoch": 
1.2405084555941315, + "grad_norm": 1.2764463424682617, + "learning_rate": 9.943605263157896e-05, + "loss": 0.4088, + "step": 22153 + }, + { + "epoch": 1.2405644529062605, + "grad_norm": 1.2118462324142456, + "learning_rate": 9.943578947368422e-05, + "loss": 0.3283, + "step": 22154 + }, + { + "epoch": 1.2406204502183895, + "grad_norm": 1.3163617849349976, + "learning_rate": 9.943552631578948e-05, + "loss": 0.454, + "step": 22155 + }, + { + "epoch": 1.2406764475305185, + "grad_norm": 1.503927230834961, + "learning_rate": 9.943526315789474e-05, + "loss": 0.5271, + "step": 22156 + }, + { + "epoch": 1.2407324448426476, + "grad_norm": 1.8561662435531616, + "learning_rate": 9.9435e-05, + "loss": 0.5902, + "step": 22157 + }, + { + "epoch": 1.2407884421547766, + "grad_norm": 1.3484762907028198, + "learning_rate": 9.943473684210527e-05, + "loss": 0.44, + "step": 22158 + }, + { + "epoch": 1.2408444394669056, + "grad_norm": 1.370915412902832, + "learning_rate": 9.943447368421053e-05, + "loss": 0.5732, + "step": 22159 + }, + { + "epoch": 1.2409004367790346, + "grad_norm": 1.4066773653030396, + "learning_rate": 9.943421052631579e-05, + "loss": 0.45, + "step": 22160 + }, + { + "epoch": 1.2409564340911636, + "grad_norm": 1.4867023229599, + "learning_rate": 9.943394736842105e-05, + "loss": 0.4574, + "step": 22161 + }, + { + "epoch": 1.2410124314032926, + "grad_norm": 1.3713905811309814, + "learning_rate": 9.943368421052632e-05, + "loss": 0.4768, + "step": 22162 + }, + { + "epoch": 1.2410684287154217, + "grad_norm": 1.5865164995193481, + "learning_rate": 9.943342105263158e-05, + "loss": 0.4932, + "step": 22163 + }, + { + "epoch": 1.2411244260275507, + "grad_norm": 1.3177303075790405, + "learning_rate": 9.943315789473684e-05, + "loss": 0.4736, + "step": 22164 + }, + { + "epoch": 1.2411804233396797, + "grad_norm": 1.3435002565383911, + "learning_rate": 9.94328947368421e-05, + "loss": 0.5665, + "step": 22165 + }, + { + "epoch": 1.2412364206518087, + "grad_norm": 2.1346640586853027, + 
"learning_rate": 9.943263157894738e-05, + "loss": 0.4935, + "step": 22166 + }, + { + "epoch": 1.2412924179639377, + "grad_norm": 1.5035032033920288, + "learning_rate": 9.943236842105264e-05, + "loss": 0.4118, + "step": 22167 + }, + { + "epoch": 1.2413484152760668, + "grad_norm": 1.4672106504440308, + "learning_rate": 9.943210526315791e-05, + "loss": 0.3559, + "step": 22168 + }, + { + "epoch": 1.2414044125881958, + "grad_norm": 1.371329426765442, + "learning_rate": 9.943184210526316e-05, + "loss": 0.4685, + "step": 22169 + }, + { + "epoch": 1.2414604099003248, + "grad_norm": 1.3162622451782227, + "learning_rate": 9.943157894736843e-05, + "loss": 0.4055, + "step": 22170 + }, + { + "epoch": 1.2415164072124538, + "grad_norm": 1.2451874017715454, + "learning_rate": 9.943131578947369e-05, + "loss": 0.5376, + "step": 22171 + }, + { + "epoch": 1.2415724045245828, + "grad_norm": 1.2837245464324951, + "learning_rate": 9.943105263157895e-05, + "loss": 0.473, + "step": 22172 + }, + { + "epoch": 1.2416284018367119, + "grad_norm": 1.3146480321884155, + "learning_rate": 9.943078947368422e-05, + "loss": 0.3934, + "step": 22173 + }, + { + "epoch": 1.2416843991488409, + "grad_norm": 1.25737464427948, + "learning_rate": 9.943052631578947e-05, + "loss": 0.3653, + "step": 22174 + }, + { + "epoch": 1.24174039646097, + "grad_norm": 1.4885339736938477, + "learning_rate": 9.943026315789474e-05, + "loss": 0.3836, + "step": 22175 + }, + { + "epoch": 1.241796393773099, + "grad_norm": 1.474367380142212, + "learning_rate": 9.943e-05, + "loss": 0.5566, + "step": 22176 + }, + { + "epoch": 1.241852391085228, + "grad_norm": 1.4683201313018799, + "learning_rate": 9.942973684210528e-05, + "loss": 0.4191, + "step": 22177 + }, + { + "epoch": 1.241908388397357, + "grad_norm": 1.3633314371109009, + "learning_rate": 9.942947368421052e-05, + "loss": 0.4509, + "step": 22178 + }, + { + "epoch": 1.241964385709486, + "grad_norm": 1.3684865236282349, + "learning_rate": 9.94292105263158e-05, + "loss": 0.4024, + 
"step": 22179 + }, + { + "epoch": 1.242020383021615, + "grad_norm": 1.5431170463562012, + "learning_rate": 9.942894736842105e-05, + "loss": 0.5027, + "step": 22180 + }, + { + "epoch": 1.242076380333744, + "grad_norm": 1.3154023885726929, + "learning_rate": 9.942868421052633e-05, + "loss": 0.5557, + "step": 22181 + }, + { + "epoch": 1.242132377645873, + "grad_norm": 1.136061429977417, + "learning_rate": 9.942842105263159e-05, + "loss": 0.4273, + "step": 22182 + }, + { + "epoch": 1.242188374958002, + "grad_norm": 1.4089622497558594, + "learning_rate": 9.942815789473685e-05, + "loss": 0.4442, + "step": 22183 + }, + { + "epoch": 1.242244372270131, + "grad_norm": 1.4251272678375244, + "learning_rate": 9.94278947368421e-05, + "loss": 0.4511, + "step": 22184 + }, + { + "epoch": 1.24230036958226, + "grad_norm": 1.441521406173706, + "learning_rate": 9.942763157894738e-05, + "loss": 0.4659, + "step": 22185 + }, + { + "epoch": 1.242356366894389, + "grad_norm": 1.539623737335205, + "learning_rate": 9.942736842105264e-05, + "loss": 0.5026, + "step": 22186 + }, + { + "epoch": 1.2424123642065181, + "grad_norm": 1.4089666604995728, + "learning_rate": 9.94271052631579e-05, + "loss": 0.3992, + "step": 22187 + }, + { + "epoch": 1.2424683615186471, + "grad_norm": 1.3999193906784058, + "learning_rate": 9.942684210526316e-05, + "loss": 0.6125, + "step": 22188 + }, + { + "epoch": 1.2425243588307762, + "grad_norm": 1.5073438882827759, + "learning_rate": 9.942657894736842e-05, + "loss": 0.4619, + "step": 22189 + }, + { + "epoch": 1.2425803561429052, + "grad_norm": 1.6267902851104736, + "learning_rate": 9.942631578947369e-05, + "loss": 0.5663, + "step": 22190 + }, + { + "epoch": 1.2426363534550342, + "grad_norm": 1.5895700454711914, + "learning_rate": 9.942605263157895e-05, + "loss": 0.4105, + "step": 22191 + }, + { + "epoch": 1.2426923507671632, + "grad_norm": 1.6286300420761108, + "learning_rate": 9.942578947368421e-05, + "loss": 0.5225, + "step": 22192 + }, + { + "epoch": 
1.2427483480792922, + "grad_norm": 1.1963218450546265, + "learning_rate": 9.942552631578947e-05, + "loss": 0.3773, + "step": 22193 + }, + { + "epoch": 1.2428043453914213, + "grad_norm": 1.268042802810669, + "learning_rate": 9.942526315789474e-05, + "loss": 0.4605, + "step": 22194 + }, + { + "epoch": 1.2428603427035503, + "grad_norm": 1.5324139595031738, + "learning_rate": 9.9425e-05, + "loss": 0.4588, + "step": 22195 + }, + { + "epoch": 1.2429163400156793, + "grad_norm": 1.2663476467132568, + "learning_rate": 9.942473684210526e-05, + "loss": 0.4409, + "step": 22196 + }, + { + "epoch": 1.2429723373278083, + "grad_norm": 1.4255861043930054, + "learning_rate": 9.942447368421052e-05, + "loss": 0.4213, + "step": 22197 + }, + { + "epoch": 1.2430283346399373, + "grad_norm": 1.4221341609954834, + "learning_rate": 9.94242105263158e-05, + "loss": 0.4848, + "step": 22198 + }, + { + "epoch": 1.2430843319520664, + "grad_norm": 1.505860686302185, + "learning_rate": 9.942394736842106e-05, + "loss": 0.6597, + "step": 22199 + }, + { + "epoch": 1.2431403292641954, + "grad_norm": 1.4762026071548462, + "learning_rate": 9.942368421052633e-05, + "loss": 0.4869, + "step": 22200 + }, + { + "epoch": 1.2431963265763244, + "grad_norm": 1.6010220050811768, + "learning_rate": 9.942342105263158e-05, + "loss": 0.5208, + "step": 22201 + }, + { + "epoch": 1.2432523238884534, + "grad_norm": 1.171999216079712, + "learning_rate": 9.942315789473685e-05, + "loss": 0.3945, + "step": 22202 + }, + { + "epoch": 1.2433083212005824, + "grad_norm": 1.299700379371643, + "learning_rate": 9.942289473684211e-05, + "loss": 0.4803, + "step": 22203 + }, + { + "epoch": 1.2433643185127115, + "grad_norm": 1.5711313486099243, + "learning_rate": 9.942263157894738e-05, + "loss": 0.4983, + "step": 22204 + }, + { + "epoch": 1.2434203158248405, + "grad_norm": 1.389089822769165, + "learning_rate": 9.942236842105264e-05, + "loss": 0.4457, + "step": 22205 + }, + { + "epoch": 1.2434763131369695, + "grad_norm": 1.341274380683899, 
+ "learning_rate": 9.942210526315789e-05, + "loss": 0.3424, + "step": 22206 + }, + { + "epoch": 1.2435323104490985, + "grad_norm": 1.1177542209625244, + "learning_rate": 9.942184210526316e-05, + "loss": 0.3997, + "step": 22207 + }, + { + "epoch": 1.2435883077612275, + "grad_norm": 1.4258272647857666, + "learning_rate": 9.942157894736842e-05, + "loss": 0.4808, + "step": 22208 + }, + { + "epoch": 1.2436443050733565, + "grad_norm": 1.605284571647644, + "learning_rate": 9.94213157894737e-05, + "loss": 0.4504, + "step": 22209 + }, + { + "epoch": 1.2437003023854856, + "grad_norm": 1.4095648527145386, + "learning_rate": 9.942105263157895e-05, + "loss": 0.3706, + "step": 22210 + }, + { + "epoch": 1.2437562996976146, + "grad_norm": 1.2311546802520752, + "learning_rate": 9.942078947368421e-05, + "loss": 0.3636, + "step": 22211 + }, + { + "epoch": 1.2438122970097436, + "grad_norm": 1.4106169939041138, + "learning_rate": 9.942052631578947e-05, + "loss": 0.5184, + "step": 22212 + }, + { + "epoch": 1.2438682943218726, + "grad_norm": 1.5023138523101807, + "learning_rate": 9.942026315789475e-05, + "loss": 0.3936, + "step": 22213 + }, + { + "epoch": 1.2439242916340016, + "grad_norm": 1.2082806825637817, + "learning_rate": 9.942000000000001e-05, + "loss": 0.5197, + "step": 22214 + }, + { + "epoch": 1.2439802889461307, + "grad_norm": 1.411442756652832, + "learning_rate": 9.941973684210527e-05, + "loss": 0.4805, + "step": 22215 + }, + { + "epoch": 1.2440362862582597, + "grad_norm": 1.3924330472946167, + "learning_rate": 9.941947368421053e-05, + "loss": 0.3279, + "step": 22216 + }, + { + "epoch": 1.2440922835703887, + "grad_norm": 1.2783757448196411, + "learning_rate": 9.94192105263158e-05, + "loss": 0.4446, + "step": 22217 + }, + { + "epoch": 1.2441482808825177, + "grad_norm": 1.6165719032287598, + "learning_rate": 9.941894736842106e-05, + "loss": 0.5575, + "step": 22218 + }, + { + "epoch": 1.2442042781946467, + "grad_norm": 1.1490000486373901, + "learning_rate": 
9.941868421052632e-05, + "loss": 0.4069, + "step": 22219 + }, + { + "epoch": 1.2442602755067758, + "grad_norm": 1.1328785419464111, + "learning_rate": 9.941842105263158e-05, + "loss": 0.3499, + "step": 22220 + }, + { + "epoch": 1.2443162728189048, + "grad_norm": 1.3411613702774048, + "learning_rate": 9.941815789473685e-05, + "loss": 0.4329, + "step": 22221 + }, + { + "epoch": 1.2443722701310338, + "grad_norm": 1.3682115077972412, + "learning_rate": 9.941789473684211e-05, + "loss": 0.4656, + "step": 22222 + }, + { + "epoch": 1.2444282674431628, + "grad_norm": 1.3170279264450073, + "learning_rate": 9.941763157894737e-05, + "loss": 0.5161, + "step": 22223 + }, + { + "epoch": 1.2444842647552918, + "grad_norm": 1.417770504951477, + "learning_rate": 9.941736842105263e-05, + "loss": 0.3697, + "step": 22224 + }, + { + "epoch": 1.2445402620674209, + "grad_norm": 1.6053637266159058, + "learning_rate": 9.941710526315789e-05, + "loss": 0.4575, + "step": 22225 + }, + { + "epoch": 1.2445962593795499, + "grad_norm": 1.4814941883087158, + "learning_rate": 9.941684210526316e-05, + "loss": 0.3615, + "step": 22226 + }, + { + "epoch": 1.244652256691679, + "grad_norm": 1.3396278619766235, + "learning_rate": 9.941657894736842e-05, + "loss": 0.4416, + "step": 22227 + }, + { + "epoch": 1.244708254003808, + "grad_norm": 1.1928372383117676, + "learning_rate": 9.94163157894737e-05, + "loss": 0.4007, + "step": 22228 + }, + { + "epoch": 1.244764251315937, + "grad_norm": 1.2904865741729736, + "learning_rate": 9.941605263157894e-05, + "loss": 0.7117, + "step": 22229 + }, + { + "epoch": 1.244820248628066, + "grad_norm": 1.3465505838394165, + "learning_rate": 9.941578947368422e-05, + "loss": 0.4399, + "step": 22230 + }, + { + "epoch": 1.244876245940195, + "grad_norm": 1.3889199495315552, + "learning_rate": 9.941552631578948e-05, + "loss": 0.4516, + "step": 22231 + }, + { + "epoch": 1.244932243252324, + "grad_norm": 1.256568193435669, + "learning_rate": 9.941526315789475e-05, + "loss": 0.4759, + 
"step": 22232 + }, + { + "epoch": 1.244988240564453, + "grad_norm": 1.5486410856246948, + "learning_rate": 9.9415e-05, + "loss": 0.4454, + "step": 22233 + }, + { + "epoch": 1.245044237876582, + "grad_norm": 1.2451177835464478, + "learning_rate": 9.941473684210527e-05, + "loss": 0.3403, + "step": 22234 + }, + { + "epoch": 1.245100235188711, + "grad_norm": 1.2428570985794067, + "learning_rate": 9.941447368421053e-05, + "loss": 0.377, + "step": 22235 + }, + { + "epoch": 1.24515623250084, + "grad_norm": 1.336918830871582, + "learning_rate": 9.94142105263158e-05, + "loss": 0.4067, + "step": 22236 + }, + { + "epoch": 1.245212229812969, + "grad_norm": 1.4773956537246704, + "learning_rate": 9.941394736842106e-05, + "loss": 0.4488, + "step": 22237 + }, + { + "epoch": 1.245268227125098, + "grad_norm": 1.2210487127304077, + "learning_rate": 9.941368421052632e-05, + "loss": 0.4032, + "step": 22238 + }, + { + "epoch": 1.2453242244372271, + "grad_norm": 1.3869651556015015, + "learning_rate": 9.941342105263158e-05, + "loss": 0.4829, + "step": 22239 + }, + { + "epoch": 1.2453802217493561, + "grad_norm": 1.6309747695922852, + "learning_rate": 9.941315789473684e-05, + "loss": 0.4308, + "step": 22240 + }, + { + "epoch": 1.2454362190614852, + "grad_norm": 1.2051997184753418, + "learning_rate": 9.941289473684211e-05, + "loss": 0.5023, + "step": 22241 + }, + { + "epoch": 1.2454922163736142, + "grad_norm": 1.7952145338058472, + "learning_rate": 9.941263157894737e-05, + "loss": 0.3756, + "step": 22242 + }, + { + "epoch": 1.2455482136857432, + "grad_norm": 1.40574049949646, + "learning_rate": 9.941236842105263e-05, + "loss": 0.4318, + "step": 22243 + }, + { + "epoch": 1.2456042109978722, + "grad_norm": 1.3252264261245728, + "learning_rate": 9.94121052631579e-05, + "loss": 0.6048, + "step": 22244 + }, + { + "epoch": 1.2456602083100012, + "grad_norm": 1.380267858505249, + "learning_rate": 9.941184210526317e-05, + "loss": 0.4548, + "step": 22245 + }, + { + "epoch": 1.2457162056221303, + 
"grad_norm": 1.1676914691925049, + "learning_rate": 9.941157894736843e-05, + "loss": 0.4655, + "step": 22246 + }, + { + "epoch": 1.2457722029342593, + "grad_norm": 1.3717881441116333, + "learning_rate": 9.941131578947369e-05, + "loss": 0.3673, + "step": 22247 + }, + { + "epoch": 1.2458282002463883, + "grad_norm": 1.1353402137756348, + "learning_rate": 9.941105263157895e-05, + "loss": 0.355, + "step": 22248 + }, + { + "epoch": 1.2458841975585173, + "grad_norm": 1.4019495248794556, + "learning_rate": 9.941078947368422e-05, + "loss": 0.3895, + "step": 22249 + }, + { + "epoch": 1.2459401948706463, + "grad_norm": 1.2970120906829834, + "learning_rate": 9.941052631578948e-05, + "loss": 0.3942, + "step": 22250 + }, + { + "epoch": 1.2459961921827751, + "grad_norm": 1.249554991722107, + "learning_rate": 9.941026315789474e-05, + "loss": 0.3587, + "step": 22251 + }, + { + "epoch": 1.2460521894949041, + "grad_norm": 1.4984300136566162, + "learning_rate": 9.941e-05, + "loss": 0.5534, + "step": 22252 + }, + { + "epoch": 1.2461081868070332, + "grad_norm": 1.3894104957580566, + "learning_rate": 9.940973684210527e-05, + "loss": 0.4597, + "step": 22253 + }, + { + "epoch": 1.2461641841191622, + "grad_norm": 1.3068922758102417, + "learning_rate": 9.940947368421053e-05, + "loss": 0.3798, + "step": 22254 + }, + { + "epoch": 1.2462201814312912, + "grad_norm": 1.3988882303237915, + "learning_rate": 9.94092105263158e-05, + "loss": 0.4954, + "step": 22255 + }, + { + "epoch": 1.2462761787434202, + "grad_norm": 1.2204664945602417, + "learning_rate": 9.940894736842105e-05, + "loss": 0.4786, + "step": 22256 + }, + { + "epoch": 1.2463321760555492, + "grad_norm": 1.2899380922317505, + "learning_rate": 9.940868421052632e-05, + "loss": 0.4025, + "step": 22257 + }, + { + "epoch": 1.2463881733676783, + "grad_norm": 1.3493857383728027, + "learning_rate": 9.940842105263158e-05, + "loss": 0.5435, + "step": 22258 + }, + { + "epoch": 1.2464441706798073, + "grad_norm": 1.234354853630066, + "learning_rate": 
9.940815789473684e-05, + "loss": 0.3658, + "step": 22259 + }, + { + "epoch": 1.2465001679919363, + "grad_norm": 1.7639458179473877, + "learning_rate": 9.940789473684212e-05, + "loss": 0.4752, + "step": 22260 + }, + { + "epoch": 1.2465561653040653, + "grad_norm": 1.5952787399291992, + "learning_rate": 9.940763157894736e-05, + "loss": 0.508, + "step": 22261 + }, + { + "epoch": 1.2466121626161943, + "grad_norm": 1.4668351411819458, + "learning_rate": 9.940736842105264e-05, + "loss": 0.4437, + "step": 22262 + }, + { + "epoch": 1.2466681599283234, + "grad_norm": 1.3444815874099731, + "learning_rate": 9.94071052631579e-05, + "loss": 0.3824, + "step": 22263 + }, + { + "epoch": 1.2467241572404524, + "grad_norm": 1.4767779111862183, + "learning_rate": 9.940684210526317e-05, + "loss": 0.4853, + "step": 22264 + }, + { + "epoch": 1.2467801545525814, + "grad_norm": 1.5499085187911987, + "learning_rate": 9.940657894736843e-05, + "loss": 0.4013, + "step": 22265 + }, + { + "epoch": 1.2468361518647104, + "grad_norm": 1.2688138484954834, + "learning_rate": 9.940631578947369e-05, + "loss": 0.4274, + "step": 22266 + }, + { + "epoch": 1.2468921491768394, + "grad_norm": 1.3768786191940308, + "learning_rate": 9.940605263157895e-05, + "loss": 0.4536, + "step": 22267 + }, + { + "epoch": 1.2469481464889685, + "grad_norm": 1.2985590696334839, + "learning_rate": 9.940578947368422e-05, + "loss": 0.4155, + "step": 22268 + }, + { + "epoch": 1.2470041438010975, + "grad_norm": 1.2233731746673584, + "learning_rate": 9.940552631578948e-05, + "loss": 0.3946, + "step": 22269 + }, + { + "epoch": 1.2470601411132265, + "grad_norm": 1.357564091682434, + "learning_rate": 9.940526315789474e-05, + "loss": 0.3937, + "step": 22270 + }, + { + "epoch": 1.2471161384253555, + "grad_norm": 1.3891246318817139, + "learning_rate": 9.9405e-05, + "loss": 0.4024, + "step": 22271 + }, + { + "epoch": 1.2471721357374845, + "grad_norm": 1.4539968967437744, + "learning_rate": 9.940473684210527e-05, + "loss": 0.4038, + "step": 
22272 + }, + { + "epoch": 1.2472281330496136, + "grad_norm": 1.7275545597076416, + "learning_rate": 9.940447368421053e-05, + "loss": 0.5758, + "step": 22273 + }, + { + "epoch": 1.2472841303617426, + "grad_norm": 1.1803683042526245, + "learning_rate": 9.94042105263158e-05, + "loss": 0.4849, + "step": 22274 + }, + { + "epoch": 1.2473401276738716, + "grad_norm": 1.2474204301834106, + "learning_rate": 9.940394736842105e-05, + "loss": 0.4421, + "step": 22275 + }, + { + "epoch": 1.2473961249860006, + "grad_norm": 1.5943682193756104, + "learning_rate": 9.940368421052631e-05, + "loss": 0.5201, + "step": 22276 + }, + { + "epoch": 1.2474521222981296, + "grad_norm": 1.2650312185287476, + "learning_rate": 9.940342105263159e-05, + "loss": 0.3833, + "step": 22277 + }, + { + "epoch": 1.2475081196102586, + "grad_norm": 4.03690242767334, + "learning_rate": 9.940315789473685e-05, + "loss": 0.4693, + "step": 22278 + }, + { + "epoch": 1.2475641169223877, + "grad_norm": 1.3714172840118408, + "learning_rate": 9.94028947368421e-05, + "loss": 0.5724, + "step": 22279 + }, + { + "epoch": 1.2476201142345167, + "grad_norm": 1.2636523246765137, + "learning_rate": 9.940263157894737e-05, + "loss": 0.4871, + "step": 22280 + }, + { + "epoch": 1.2476761115466457, + "grad_norm": 1.3796244859695435, + "learning_rate": 9.940236842105264e-05, + "loss": 0.4291, + "step": 22281 + }, + { + "epoch": 1.2477321088587747, + "grad_norm": 1.2901058197021484, + "learning_rate": 9.94021052631579e-05, + "loss": 0.5145, + "step": 22282 + }, + { + "epoch": 1.2477881061709037, + "grad_norm": 1.2571921348571777, + "learning_rate": 9.940184210526317e-05, + "loss": 0.3346, + "step": 22283 + }, + { + "epoch": 1.2478441034830328, + "grad_norm": 1.4464619159698486, + "learning_rate": 9.940157894736842e-05, + "loss": 0.5793, + "step": 22284 + }, + { + "epoch": 1.2479001007951618, + "grad_norm": 1.5002973079681396, + "learning_rate": 9.940131578947369e-05, + "loss": 0.4873, + "step": 22285 + }, + { + "epoch": 
1.2479560981072908, + "grad_norm": 1.4570118188858032, + "learning_rate": 9.940105263157895e-05, + "loss": 0.3904, + "step": 22286 + }, + { + "epoch": 1.2480120954194198, + "grad_norm": 1.5554147958755493, + "learning_rate": 9.940078947368422e-05, + "loss": 0.4778, + "step": 22287 + }, + { + "epoch": 1.2480680927315488, + "grad_norm": 8.259346961975098, + "learning_rate": 9.940052631578947e-05, + "loss": 0.4362, + "step": 22288 + }, + { + "epoch": 1.2481240900436779, + "grad_norm": 1.4304662942886353, + "learning_rate": 9.940026315789474e-05, + "loss": 0.5186, + "step": 22289 + }, + { + "epoch": 1.2481800873558069, + "grad_norm": 1.2556445598602295, + "learning_rate": 9.94e-05, + "loss": 0.3436, + "step": 22290 + }, + { + "epoch": 1.248236084667936, + "grad_norm": 1.1076174974441528, + "learning_rate": 9.939973684210528e-05, + "loss": 0.3614, + "step": 22291 + }, + { + "epoch": 1.248292081980065, + "grad_norm": 1.23257315158844, + "learning_rate": 9.939947368421054e-05, + "loss": 0.4392, + "step": 22292 + }, + { + "epoch": 1.248348079292194, + "grad_norm": 1.283825397491455, + "learning_rate": 9.939921052631578e-05, + "loss": 0.4014, + "step": 22293 + }, + { + "epoch": 1.248404076604323, + "grad_norm": 1.176995038986206, + "learning_rate": 9.939894736842106e-05, + "loss": 0.3408, + "step": 22294 + }, + { + "epoch": 1.248460073916452, + "grad_norm": 1.4385688304901123, + "learning_rate": 9.939868421052632e-05, + "loss": 0.3907, + "step": 22295 + }, + { + "epoch": 1.248516071228581, + "grad_norm": 1.8741458654403687, + "learning_rate": 9.939842105263159e-05, + "loss": 0.8512, + "step": 22296 + }, + { + "epoch": 1.24857206854071, + "grad_norm": 1.3005434274673462, + "learning_rate": 9.939815789473685e-05, + "loss": 0.5882, + "step": 22297 + }, + { + "epoch": 1.248628065852839, + "grad_norm": 1.5760449171066284, + "learning_rate": 9.939789473684211e-05, + "loss": 0.3411, + "step": 22298 + }, + { + "epoch": 1.248684063164968, + "grad_norm": 1.1352293491363525, + 
"learning_rate": 9.939763157894737e-05, + "loss": 0.4053, + "step": 22299 + }, + { + "epoch": 1.248740060477097, + "grad_norm": 1.546398639678955, + "learning_rate": 9.939736842105264e-05, + "loss": 0.4613, + "step": 22300 + }, + { + "epoch": 1.248796057789226, + "grad_norm": 1.702652096748352, + "learning_rate": 9.93971052631579e-05, + "loss": 0.5096, + "step": 22301 + }, + { + "epoch": 1.248852055101355, + "grad_norm": 1.2364565134048462, + "learning_rate": 9.939684210526316e-05, + "loss": 0.3482, + "step": 22302 + }, + { + "epoch": 1.2489080524134841, + "grad_norm": 1.5198919773101807, + "learning_rate": 9.939657894736842e-05, + "loss": 0.585, + "step": 22303 + }, + { + "epoch": 1.2489640497256131, + "grad_norm": 1.2503507137298584, + "learning_rate": 9.93963157894737e-05, + "loss": 0.3867, + "step": 22304 + }, + { + "epoch": 1.2490200470377422, + "grad_norm": 1.4735126495361328, + "learning_rate": 9.939605263157895e-05, + "loss": 0.4261, + "step": 22305 + }, + { + "epoch": 1.2490760443498712, + "grad_norm": 1.3438857793807983, + "learning_rate": 9.939578947368421e-05, + "loss": 0.4776, + "step": 22306 + }, + { + "epoch": 1.2491320416620002, + "grad_norm": 1.3483028411865234, + "learning_rate": 9.939552631578947e-05, + "loss": 0.5217, + "step": 22307 + }, + { + "epoch": 1.2491880389741292, + "grad_norm": 1.2666211128234863, + "learning_rate": 9.939526315789475e-05, + "loss": 0.4004, + "step": 22308 + }, + { + "epoch": 1.2492440362862582, + "grad_norm": 1.336466670036316, + "learning_rate": 9.9395e-05, + "loss": 0.4599, + "step": 22309 + }, + { + "epoch": 1.2493000335983873, + "grad_norm": 1.5537548065185547, + "learning_rate": 9.939473684210527e-05, + "loss": 0.6118, + "step": 22310 + }, + { + "epoch": 1.2493560309105163, + "grad_norm": 1.4111347198486328, + "learning_rate": 9.939447368421053e-05, + "loss": 0.4152, + "step": 22311 + }, + { + "epoch": 1.2494120282226453, + "grad_norm": 1.4069594144821167, + "learning_rate": 9.939421052631579e-05, + "loss": 
0.4386, + "step": 22312 + }, + { + "epoch": 1.2494680255347743, + "grad_norm": 1.263898253440857, + "learning_rate": 9.939394736842106e-05, + "loss": 0.4336, + "step": 22313 + }, + { + "epoch": 1.2495240228469033, + "grad_norm": 1.617743730545044, + "learning_rate": 9.939368421052632e-05, + "loss": 0.6002, + "step": 22314 + }, + { + "epoch": 1.2495800201590324, + "grad_norm": 1.2375640869140625, + "learning_rate": 9.939342105263159e-05, + "loss": 0.4006, + "step": 22315 + }, + { + "epoch": 1.2496360174711614, + "grad_norm": 1.4769809246063232, + "learning_rate": 9.939315789473684e-05, + "loss": 0.4202, + "step": 22316 + }, + { + "epoch": 1.2496920147832904, + "grad_norm": 1.7477962970733643, + "learning_rate": 9.939289473684211e-05, + "loss": 0.5736, + "step": 22317 + }, + { + "epoch": 1.2497480120954194, + "grad_norm": 1.5742456912994385, + "learning_rate": 9.939263157894737e-05, + "loss": 0.4195, + "step": 22318 + }, + { + "epoch": 1.2498040094075484, + "grad_norm": 1.5923107862472534, + "learning_rate": 9.939236842105264e-05, + "loss": 0.4855, + "step": 22319 + }, + { + "epoch": 1.2498600067196775, + "grad_norm": 1.0994890928268433, + "learning_rate": 9.93921052631579e-05, + "loss": 0.2878, + "step": 22320 + }, + { + "epoch": 1.2499160040318065, + "grad_norm": 1.2776751518249512, + "learning_rate": 9.939184210526316e-05, + "loss": 0.5888, + "step": 22321 + }, + { + "epoch": 1.2499720013439355, + "grad_norm": 1.275792121887207, + "learning_rate": 9.939157894736842e-05, + "loss": 0.4022, + "step": 22322 + }, + { + "epoch": 1.2500279986560645, + "grad_norm": 1.2924615144729614, + "learning_rate": 9.93913157894737e-05, + "loss": 0.4109, + "step": 22323 + }, + { + "epoch": 1.2500839959681935, + "grad_norm": 1.181273341178894, + "learning_rate": 9.939105263157896e-05, + "loss": 0.4517, + "step": 22324 + }, + { + "epoch": 1.2501399932803225, + "grad_norm": 1.4977375268936157, + "learning_rate": 9.939078947368422e-05, + "loss": 0.4974, + "step": 22325 + }, + { + 
"epoch": 1.2501959905924516, + "grad_norm": 1.384088158607483, + "learning_rate": 9.939052631578948e-05, + "loss": 0.3408, + "step": 22326 + }, + { + "epoch": 1.2502519879045806, + "grad_norm": 1.5096538066864014, + "learning_rate": 9.939026315789474e-05, + "loss": 0.5548, + "step": 22327 + }, + { + "epoch": 1.2503079852167096, + "grad_norm": 1.2352845668792725, + "learning_rate": 9.939000000000001e-05, + "loss": 0.3673, + "step": 22328 + }, + { + "epoch": 1.2503639825288386, + "grad_norm": 1.3208589553833008, + "learning_rate": 9.938973684210527e-05, + "loss": 0.6064, + "step": 22329 + }, + { + "epoch": 1.2504199798409676, + "grad_norm": 2.8895554542541504, + "learning_rate": 9.938947368421053e-05, + "loss": 0.4623, + "step": 22330 + }, + { + "epoch": 1.2504759771530967, + "grad_norm": 1.7574106454849243, + "learning_rate": 9.938921052631579e-05, + "loss": 0.5314, + "step": 22331 + }, + { + "epoch": 1.2505319744652257, + "grad_norm": 1.6027345657348633, + "learning_rate": 9.938894736842106e-05, + "loss": 0.442, + "step": 22332 + }, + { + "epoch": 1.2505879717773547, + "grad_norm": 2.04962158203125, + "learning_rate": 9.938868421052632e-05, + "loss": 0.5385, + "step": 22333 + }, + { + "epoch": 1.2506439690894837, + "grad_norm": 1.3174147605895996, + "learning_rate": 9.938842105263158e-05, + "loss": 0.569, + "step": 22334 + }, + { + "epoch": 1.2506999664016127, + "grad_norm": 1.7815295457839966, + "learning_rate": 9.938815789473684e-05, + "loss": 0.5231, + "step": 22335 + }, + { + "epoch": 1.2507559637137418, + "grad_norm": 1.4336323738098145, + "learning_rate": 9.938789473684211e-05, + "loss": 0.6225, + "step": 22336 + }, + { + "epoch": 1.2508119610258708, + "grad_norm": 1.342552661895752, + "learning_rate": 9.938763157894737e-05, + "loss": 0.4109, + "step": 22337 + }, + { + "epoch": 1.2508679583379998, + "grad_norm": 1.2027740478515625, + "learning_rate": 9.938736842105265e-05, + "loss": 0.3825, + "step": 22338 + }, + { + "epoch": 1.2509239556501288, + 
"grad_norm": 1.3601100444793701, + "learning_rate": 9.938710526315789e-05, + "loss": 0.448, + "step": 22339 + }, + { + "epoch": 1.2509799529622578, + "grad_norm": 1.4747097492218018, + "learning_rate": 9.938684210526317e-05, + "loss": 0.4084, + "step": 22340 + }, + { + "epoch": 1.2510359502743869, + "grad_norm": 1.3954651355743408, + "learning_rate": 9.938657894736843e-05, + "loss": 0.4549, + "step": 22341 + }, + { + "epoch": 1.2510919475865159, + "grad_norm": 1.2875089645385742, + "learning_rate": 9.93863157894737e-05, + "loss": 0.3957, + "step": 22342 + }, + { + "epoch": 1.251147944898645, + "grad_norm": 1.372676968574524, + "learning_rate": 9.938605263157895e-05, + "loss": 0.4569, + "step": 22343 + }, + { + "epoch": 1.251203942210774, + "grad_norm": 1.1782981157302856, + "learning_rate": 9.93857894736842e-05, + "loss": 0.4145, + "step": 22344 + }, + { + "epoch": 1.251259939522903, + "grad_norm": 1.5319457054138184, + "learning_rate": 9.938552631578948e-05, + "loss": 0.5579, + "step": 22345 + }, + { + "epoch": 1.251315936835032, + "grad_norm": 1.3733348846435547, + "learning_rate": 9.938526315789474e-05, + "loss": 0.4205, + "step": 22346 + }, + { + "epoch": 1.251371934147161, + "grad_norm": 1.3196403980255127, + "learning_rate": 9.938500000000001e-05, + "loss": 0.3781, + "step": 22347 + }, + { + "epoch": 1.25142793145929, + "grad_norm": 1.1703699827194214, + "learning_rate": 9.938473684210526e-05, + "loss": 0.3795, + "step": 22348 + }, + { + "epoch": 1.251483928771419, + "grad_norm": 1.5440356731414795, + "learning_rate": 9.938447368421053e-05, + "loss": 0.4559, + "step": 22349 + }, + { + "epoch": 1.251539926083548, + "grad_norm": 1.190883755683899, + "learning_rate": 9.938421052631579e-05, + "loss": 0.3914, + "step": 22350 + }, + { + "epoch": 1.251595923395677, + "grad_norm": 1.065455436706543, + "learning_rate": 9.938394736842106e-05, + "loss": 0.3576, + "step": 22351 + }, + { + "epoch": 1.251651920707806, + "grad_norm": 1.5451136827468872, + "learning_rate": 
9.938368421052632e-05, + "loss": 0.5983, + "step": 22352 + }, + { + "epoch": 1.251707918019935, + "grad_norm": 1.3333629369735718, + "learning_rate": 9.938342105263158e-05, + "loss": 0.4733, + "step": 22353 + }, + { + "epoch": 1.251763915332064, + "grad_norm": 1.1256550550460815, + "learning_rate": 9.938315789473684e-05, + "loss": 0.3686, + "step": 22354 + }, + { + "epoch": 1.2518199126441931, + "grad_norm": 1.2385841608047485, + "learning_rate": 9.938289473684212e-05, + "loss": 0.4191, + "step": 22355 + }, + { + "epoch": 1.2518759099563221, + "grad_norm": 1.2452980279922485, + "learning_rate": 9.938263157894738e-05, + "loss": 0.4756, + "step": 22356 + }, + { + "epoch": 1.2519319072684512, + "grad_norm": 1.3275479078292847, + "learning_rate": 9.938236842105264e-05, + "loss": 0.3779, + "step": 22357 + }, + { + "epoch": 1.2519879045805802, + "grad_norm": 1.409300446510315, + "learning_rate": 9.93821052631579e-05, + "loss": 0.3782, + "step": 22358 + }, + { + "epoch": 1.2520439018927092, + "grad_norm": 1.6561707258224487, + "learning_rate": 9.938184210526317e-05, + "loss": 0.5491, + "step": 22359 + }, + { + "epoch": 1.2520998992048382, + "grad_norm": 1.3989508152008057, + "learning_rate": 9.938157894736843e-05, + "loss": 0.4917, + "step": 22360 + }, + { + "epoch": 1.2521558965169672, + "grad_norm": 1.2363028526306152, + "learning_rate": 9.938131578947369e-05, + "loss": 0.4661, + "step": 22361 + }, + { + "epoch": 1.2522118938290963, + "grad_norm": 1.3518849611282349, + "learning_rate": 9.938105263157895e-05, + "loss": 0.4335, + "step": 22362 + }, + { + "epoch": 1.2522678911412253, + "grad_norm": 1.5282328128814697, + "learning_rate": 9.938078947368421e-05, + "loss": 0.5125, + "step": 22363 + }, + { + "epoch": 1.2523238884533543, + "grad_norm": 1.4124184846878052, + "learning_rate": 9.938052631578948e-05, + "loss": 0.5615, + "step": 22364 + }, + { + "epoch": 1.2523798857654833, + "grad_norm": 1.2635563611984253, + "learning_rate": 9.938026315789474e-05, + "loss": 0.403, 
+ "step": 22365 + }, + { + "epoch": 1.2524358830776123, + "grad_norm": 1.281112551689148, + "learning_rate": 9.938e-05, + "loss": 0.3814, + "step": 22366 + }, + { + "epoch": 1.2524918803897414, + "grad_norm": 1.242078423500061, + "learning_rate": 9.937973684210526e-05, + "loss": 0.4, + "step": 22367 + }, + { + "epoch": 1.2525478777018704, + "grad_norm": 1.2379521131515503, + "learning_rate": 9.937947368421053e-05, + "loss": 0.4875, + "step": 22368 + }, + { + "epoch": 1.2526038750139994, + "grad_norm": 1.6423168182373047, + "learning_rate": 9.93792105263158e-05, + "loss": 0.4695, + "step": 22369 + }, + { + "epoch": 1.2526598723261284, + "grad_norm": 1.422355055809021, + "learning_rate": 9.937894736842107e-05, + "loss": 0.5009, + "step": 22370 + }, + { + "epoch": 1.2527158696382574, + "grad_norm": 1.112548828125, + "learning_rate": 9.937868421052631e-05, + "loss": 0.3949, + "step": 22371 + }, + { + "epoch": 1.2527718669503864, + "grad_norm": 1.3380110263824463, + "learning_rate": 9.937842105263159e-05, + "loss": 0.458, + "step": 22372 + }, + { + "epoch": 1.2528278642625155, + "grad_norm": 1.386528730392456, + "learning_rate": 9.937815789473685e-05, + "loss": 0.4417, + "step": 22373 + }, + { + "epoch": 1.2528838615746445, + "grad_norm": 1.596781849861145, + "learning_rate": 9.937789473684212e-05, + "loss": 0.6139, + "step": 22374 + }, + { + "epoch": 1.2529398588867735, + "grad_norm": 1.2813609838485718, + "learning_rate": 9.937763157894738e-05, + "loss": 0.41, + "step": 22375 + }, + { + "epoch": 1.2529958561989025, + "grad_norm": 2.168900728225708, + "learning_rate": 9.937736842105264e-05, + "loss": 0.6009, + "step": 22376 + }, + { + "epoch": 1.2530518535110315, + "grad_norm": 1.4229382276535034, + "learning_rate": 9.93771052631579e-05, + "loss": 0.4844, + "step": 22377 + }, + { + "epoch": 1.2531078508231606, + "grad_norm": 1.288794755935669, + "learning_rate": 9.937684210526317e-05, + "loss": 0.4613, + "step": 22378 + }, + { + "epoch": 1.2531638481352896, + 
"grad_norm": 1.322526216506958, + "learning_rate": 9.937657894736843e-05, + "loss": 0.4081, + "step": 22379 + }, + { + "epoch": 1.2532198454474186, + "grad_norm": 1.375086784362793, + "learning_rate": 9.937631578947368e-05, + "loss": 0.3796, + "step": 22380 + }, + { + "epoch": 1.2532758427595476, + "grad_norm": 1.1509244441986084, + "learning_rate": 9.937605263157895e-05, + "loss": 0.4033, + "step": 22381 + }, + { + "epoch": 1.2533318400716766, + "grad_norm": 1.2860116958618164, + "learning_rate": 9.937578947368421e-05, + "loss": 0.462, + "step": 22382 + }, + { + "epoch": 1.2533878373838057, + "grad_norm": 1.4602913856506348, + "learning_rate": 9.937552631578948e-05, + "loss": 0.4564, + "step": 22383 + }, + { + "epoch": 1.2534438346959347, + "grad_norm": 1.9868360757827759, + "learning_rate": 9.937526315789474e-05, + "loss": 0.4496, + "step": 22384 + }, + { + "epoch": 1.2534998320080635, + "grad_norm": 1.5726895332336426, + "learning_rate": 9.9375e-05, + "loss": 0.4239, + "step": 22385 + }, + { + "epoch": 1.2535558293201925, + "grad_norm": 2.0822389125823975, + "learning_rate": 9.937473684210526e-05, + "loss": 0.4286, + "step": 22386 + }, + { + "epoch": 1.2536118266323215, + "grad_norm": 1.3133800029754639, + "learning_rate": 9.937447368421054e-05, + "loss": 0.5122, + "step": 22387 + }, + { + "epoch": 1.2536678239444505, + "grad_norm": 1.394006371498108, + "learning_rate": 9.93742105263158e-05, + "loss": 0.4558, + "step": 22388 + }, + { + "epoch": 1.2537238212565796, + "grad_norm": 1.356560230255127, + "learning_rate": 9.937394736842106e-05, + "loss": 0.424, + "step": 22389 + }, + { + "epoch": 1.2537798185687086, + "grad_norm": 1.328401803970337, + "learning_rate": 9.937368421052632e-05, + "loss": 0.5114, + "step": 22390 + }, + { + "epoch": 1.2538358158808376, + "grad_norm": 1.2721163034439087, + "learning_rate": 9.937342105263159e-05, + "loss": 0.4165, + "step": 22391 + }, + { + "epoch": 1.2538918131929666, + "grad_norm": 3.9038705825805664, + "learning_rate": 
9.937315789473685e-05, + "loss": 0.3836, + "step": 22392 + }, + { + "epoch": 1.2539478105050956, + "grad_norm": 1.5038775205612183, + "learning_rate": 9.937289473684211e-05, + "loss": 0.6409, + "step": 22393 + }, + { + "epoch": 1.2540038078172246, + "grad_norm": 1.7684061527252197, + "learning_rate": 9.937263157894737e-05, + "loss": 0.4112, + "step": 22394 + }, + { + "epoch": 1.2540598051293537, + "grad_norm": 1.331947922706604, + "learning_rate": 9.937236842105264e-05, + "loss": 0.4065, + "step": 22395 + }, + { + "epoch": 1.2541158024414827, + "grad_norm": 1.4933711290359497, + "learning_rate": 9.93721052631579e-05, + "loss": 0.384, + "step": 22396 + }, + { + "epoch": 1.2541717997536117, + "grad_norm": 1.411307454109192, + "learning_rate": 9.937184210526316e-05, + "loss": 0.4413, + "step": 22397 + }, + { + "epoch": 1.2542277970657407, + "grad_norm": 1.5018677711486816, + "learning_rate": 9.937157894736842e-05, + "loss": 0.4437, + "step": 22398 + }, + { + "epoch": 1.2542837943778697, + "grad_norm": 1.2441569566726685, + "learning_rate": 9.937131578947368e-05, + "loss": 0.404, + "step": 22399 + }, + { + "epoch": 1.2543397916899988, + "grad_norm": 1.35121488571167, + "learning_rate": 9.937105263157895e-05, + "loss": 0.4505, + "step": 22400 + }, + { + "epoch": 1.2543957890021278, + "grad_norm": 1.5654253959655762, + "learning_rate": 9.937078947368421e-05, + "loss": 0.4104, + "step": 22401 + }, + { + "epoch": 1.2544517863142568, + "grad_norm": 1.331819772720337, + "learning_rate": 9.937052631578949e-05, + "loss": 0.3457, + "step": 22402 + }, + { + "epoch": 1.2545077836263858, + "grad_norm": 1.3496921062469482, + "learning_rate": 9.937026315789473e-05, + "loss": 0.4175, + "step": 22403 + }, + { + "epoch": 1.2545637809385148, + "grad_norm": 1.254142165184021, + "learning_rate": 9.937e-05, + "loss": 0.5591, + "step": 22404 + }, + { + "epoch": 1.2546197782506439, + "grad_norm": 1.2177461385726929, + "learning_rate": 9.936973684210527e-05, + "loss": 0.3637, + "step": 22405 
+ }, + { + "epoch": 1.2546757755627729, + "grad_norm": 1.4695017337799072, + "learning_rate": 9.936947368421054e-05, + "loss": 0.4761, + "step": 22406 + }, + { + "epoch": 1.254731772874902, + "grad_norm": 1.5132089853286743, + "learning_rate": 9.93692105263158e-05, + "loss": 0.634, + "step": 22407 + }, + { + "epoch": 1.254787770187031, + "grad_norm": 1.3467780351638794, + "learning_rate": 9.936894736842106e-05, + "loss": 0.6429, + "step": 22408 + }, + { + "epoch": 1.25484376749916, + "grad_norm": 1.2279789447784424, + "learning_rate": 9.936868421052632e-05, + "loss": 0.4997, + "step": 22409 + }, + { + "epoch": 1.254899764811289, + "grad_norm": 1.2986432313919067, + "learning_rate": 9.936842105263159e-05, + "loss": 0.4413, + "step": 22410 + }, + { + "epoch": 1.254955762123418, + "grad_norm": 1.3174965381622314, + "learning_rate": 9.936815789473685e-05, + "loss": 0.3556, + "step": 22411 + }, + { + "epoch": 1.255011759435547, + "grad_norm": 1.1024069786071777, + "learning_rate": 9.936789473684211e-05, + "loss": 0.3554, + "step": 22412 + }, + { + "epoch": 1.255067756747676, + "grad_norm": 13.55655574798584, + "learning_rate": 9.936763157894737e-05, + "loss": 0.4215, + "step": 22413 + }, + { + "epoch": 1.255123754059805, + "grad_norm": 1.4097927808761597, + "learning_rate": 9.936736842105263e-05, + "loss": 0.441, + "step": 22414 + }, + { + "epoch": 1.255179751371934, + "grad_norm": 1.3753725290298462, + "learning_rate": 9.93671052631579e-05, + "loss": 0.5994, + "step": 22415 + }, + { + "epoch": 1.255235748684063, + "grad_norm": 1.4475756883621216, + "learning_rate": 9.936684210526316e-05, + "loss": 0.3789, + "step": 22416 + }, + { + "epoch": 1.255291745996192, + "grad_norm": 2.690183401107788, + "learning_rate": 9.936657894736842e-05, + "loss": 0.445, + "step": 22417 + }, + { + "epoch": 1.255347743308321, + "grad_norm": 1.5758112668991089, + "learning_rate": 9.936631578947368e-05, + "loss": 0.4321, + "step": 22418 + }, + { + "epoch": 1.2554037406204501, + "grad_norm": 
1.377387285232544, + "learning_rate": 9.936605263157896e-05, + "loss": 0.5935, + "step": 22419 + }, + { + "epoch": 1.2554597379325791, + "grad_norm": 1.4579378366470337, + "learning_rate": 9.936578947368422e-05, + "loss": 0.3929, + "step": 22420 + }, + { + "epoch": 1.2555157352447082, + "grad_norm": 1.174188494682312, + "learning_rate": 9.936552631578948e-05, + "loss": 0.3638, + "step": 22421 + }, + { + "epoch": 1.2555717325568372, + "grad_norm": 1.19265615940094, + "learning_rate": 9.936526315789474e-05, + "loss": 0.3975, + "step": 22422 + }, + { + "epoch": 1.2556277298689662, + "grad_norm": 1.4435083866119385, + "learning_rate": 9.936500000000001e-05, + "loss": 0.4415, + "step": 22423 + }, + { + "epoch": 1.2556837271810952, + "grad_norm": 1.4970040321350098, + "learning_rate": 9.936473684210527e-05, + "loss": 0.5084, + "step": 22424 + }, + { + "epoch": 1.2557397244932242, + "grad_norm": 1.7829281091690063, + "learning_rate": 9.936447368421054e-05, + "loss": 0.4626, + "step": 22425 + }, + { + "epoch": 1.2557957218053533, + "grad_norm": 1.4064826965332031, + "learning_rate": 9.936421052631579e-05, + "loss": 0.4295, + "step": 22426 + }, + { + "epoch": 1.2558517191174823, + "grad_norm": 1.2395046949386597, + "learning_rate": 9.936394736842106e-05, + "loss": 0.5363, + "step": 22427 + }, + { + "epoch": 1.2559077164296113, + "grad_norm": 1.4189103841781616, + "learning_rate": 9.936368421052632e-05, + "loss": 0.346, + "step": 22428 + }, + { + "epoch": 1.2559637137417403, + "grad_norm": 1.219213843345642, + "learning_rate": 9.93634210526316e-05, + "loss": 0.4594, + "step": 22429 + }, + { + "epoch": 1.2560197110538693, + "grad_norm": 1.280315637588501, + "learning_rate": 9.936315789473685e-05, + "loss": 0.3594, + "step": 22430 + }, + { + "epoch": 1.2560757083659984, + "grad_norm": 1.197350025177002, + "learning_rate": 9.93628947368421e-05, + "loss": 0.4468, + "step": 22431 + }, + { + "epoch": 1.2561317056781274, + "grad_norm": 1.1670315265655518, + "learning_rate": 
9.936263157894737e-05, + "loss": 0.3612, + "step": 22432 + }, + { + "epoch": 1.2561877029902564, + "grad_norm": 1.382169485092163, + "learning_rate": 9.936236842105263e-05, + "loss": 0.4715, + "step": 22433 + }, + { + "epoch": 1.2562437003023854, + "grad_norm": 1.1867884397506714, + "learning_rate": 9.93621052631579e-05, + "loss": 0.3865, + "step": 22434 + }, + { + "epoch": 1.2562996976145144, + "grad_norm": 1.3531599044799805, + "learning_rate": 9.936184210526315e-05, + "loss": 0.4135, + "step": 22435 + }, + { + "epoch": 1.2563556949266435, + "grad_norm": 1.5227432250976562, + "learning_rate": 9.936157894736843e-05, + "loss": 0.4965, + "step": 22436 + }, + { + "epoch": 1.2564116922387725, + "grad_norm": 1.5554322004318237, + "learning_rate": 9.936131578947369e-05, + "loss": 0.63, + "step": 22437 + }, + { + "epoch": 1.2564676895509015, + "grad_norm": 1.5383269786834717, + "learning_rate": 9.936105263157896e-05, + "loss": 0.5737, + "step": 22438 + }, + { + "epoch": 1.2565236868630305, + "grad_norm": 1.520043134689331, + "learning_rate": 9.936078947368422e-05, + "loss": 0.5204, + "step": 22439 + }, + { + "epoch": 1.2565796841751595, + "grad_norm": 1.1973000764846802, + "learning_rate": 9.936052631578948e-05, + "loss": 0.3516, + "step": 22440 + }, + { + "epoch": 1.2566356814872885, + "grad_norm": 1.0638693571090698, + "learning_rate": 9.936026315789474e-05, + "loss": 0.3651, + "step": 22441 + }, + { + "epoch": 1.2566916787994176, + "grad_norm": 3.457150936126709, + "learning_rate": 9.936000000000001e-05, + "loss": 0.4624, + "step": 22442 + }, + { + "epoch": 1.2567476761115466, + "grad_norm": 1.3856191635131836, + "learning_rate": 9.935973684210527e-05, + "loss": 0.4131, + "step": 22443 + }, + { + "epoch": 1.2568036734236756, + "grad_norm": 1.7095839977264404, + "learning_rate": 9.935947368421053e-05, + "loss": 0.4786, + "step": 22444 + }, + { + "epoch": 1.2568596707358046, + "grad_norm": 1.5006550550460815, + "learning_rate": 9.935921052631579e-05, + "loss": 0.4271, + 
"step": 22445 + }, + { + "epoch": 1.2569156680479336, + "grad_norm": 1.334295392036438, + "learning_rate": 9.935894736842106e-05, + "loss": 0.4931, + "step": 22446 + }, + { + "epoch": 1.2569716653600627, + "grad_norm": 1.4089369773864746, + "learning_rate": 9.935868421052632e-05, + "loss": 0.5179, + "step": 22447 + }, + { + "epoch": 1.2570276626721917, + "grad_norm": 1.551820993423462, + "learning_rate": 9.935842105263158e-05, + "loss": 0.6057, + "step": 22448 + }, + { + "epoch": 1.2570836599843207, + "grad_norm": 1.658945083618164, + "learning_rate": 9.935815789473684e-05, + "loss": 0.3865, + "step": 22449 + }, + { + "epoch": 1.2571396572964497, + "grad_norm": 1.4844506978988647, + "learning_rate": 9.93578947368421e-05, + "loss": 0.4183, + "step": 22450 + }, + { + "epoch": 1.2571956546085787, + "grad_norm": 1.2960132360458374, + "learning_rate": 9.935763157894738e-05, + "loss": 0.4867, + "step": 22451 + }, + { + "epoch": 1.2572516519207078, + "grad_norm": 1.2157917022705078, + "learning_rate": 9.935736842105264e-05, + "loss": 0.4369, + "step": 22452 + }, + { + "epoch": 1.2573076492328368, + "grad_norm": 1.3133894205093384, + "learning_rate": 9.93571052631579e-05, + "loss": 0.3787, + "step": 22453 + }, + { + "epoch": 1.2573636465449658, + "grad_norm": 1.1887586116790771, + "learning_rate": 9.935684210526315e-05, + "loss": 0.3378, + "step": 22454 + }, + { + "epoch": 1.2574196438570948, + "grad_norm": 1.3167790174484253, + "learning_rate": 9.935657894736843e-05, + "loss": 0.3459, + "step": 22455 + }, + { + "epoch": 1.2574756411692238, + "grad_norm": 1.3035658597946167, + "learning_rate": 9.935631578947369e-05, + "loss": 0.4213, + "step": 22456 + }, + { + "epoch": 1.2575316384813529, + "grad_norm": 1.229737401008606, + "learning_rate": 9.935605263157896e-05, + "loss": 0.2927, + "step": 22457 + }, + { + "epoch": 1.2575876357934819, + "grad_norm": 1.2345038652420044, + "learning_rate": 9.935578947368421e-05, + "loss": 0.4091, + "step": 22458 + }, + { + "epoch": 
1.257643633105611, + "grad_norm": 1.5438814163208008, + "learning_rate": 9.935552631578948e-05, + "loss": 0.5458, + "step": 22459 + }, + { + "epoch": 1.25769963041774, + "grad_norm": 1.3399029970169067, + "learning_rate": 9.935526315789474e-05, + "loss": 0.4387, + "step": 22460 + }, + { + "epoch": 1.257755627729869, + "grad_norm": 1.5059328079223633, + "learning_rate": 9.935500000000001e-05, + "loss": 0.5205, + "step": 22461 + }, + { + "epoch": 1.257811625041998, + "grad_norm": 1.3784549236297607, + "learning_rate": 9.935473684210527e-05, + "loss": 0.4553, + "step": 22462 + }, + { + "epoch": 1.257867622354127, + "grad_norm": 1.2881489992141724, + "learning_rate": 9.935447368421053e-05, + "loss": 0.4272, + "step": 22463 + }, + { + "epoch": 1.257923619666256, + "grad_norm": 1.2905099391937256, + "learning_rate": 9.935421052631579e-05, + "loss": 0.4378, + "step": 22464 + }, + { + "epoch": 1.257979616978385, + "grad_norm": 1.4473720788955688, + "learning_rate": 9.935394736842105e-05, + "loss": 0.4647, + "step": 22465 + }, + { + "epoch": 1.258035614290514, + "grad_norm": 1.42154860496521, + "learning_rate": 9.935368421052633e-05, + "loss": 0.3559, + "step": 22466 + }, + { + "epoch": 1.258091611602643, + "grad_norm": 1.2381058931350708, + "learning_rate": 9.935342105263159e-05, + "loss": 0.4903, + "step": 22467 + }, + { + "epoch": 1.258147608914772, + "grad_norm": 1.1547553539276123, + "learning_rate": 9.935315789473685e-05, + "loss": 0.2718, + "step": 22468 + }, + { + "epoch": 1.258203606226901, + "grad_norm": 1.392749547958374, + "learning_rate": 9.93528947368421e-05, + "loss": 0.5243, + "step": 22469 + }, + { + "epoch": 1.25825960353903, + "grad_norm": 1.1658364534378052, + "learning_rate": 9.935263157894738e-05, + "loss": 0.4561, + "step": 22470 + }, + { + "epoch": 1.2583156008511591, + "grad_norm": 1.4107457399368286, + "learning_rate": 9.935236842105264e-05, + "loss": 0.3249, + "step": 22471 + }, + { + "epoch": 1.2583715981632881, + "grad_norm": 1.2678806781768799, 
+ "learning_rate": 9.93521052631579e-05, + "loss": 0.3788, + "step": 22472 + }, + { + "epoch": 1.2584275954754172, + "grad_norm": 1.3851932287216187, + "learning_rate": 9.935184210526316e-05, + "loss": 0.4094, + "step": 22473 + }, + { + "epoch": 1.2584835927875462, + "grad_norm": 1.5530339479446411, + "learning_rate": 9.935157894736843e-05, + "loss": 0.6043, + "step": 22474 + }, + { + "epoch": 1.2585395900996752, + "grad_norm": 1.711771845817566, + "learning_rate": 9.935131578947369e-05, + "loss": 0.6201, + "step": 22475 + }, + { + "epoch": 1.2585955874118042, + "grad_norm": 1.614958643913269, + "learning_rate": 9.935105263157895e-05, + "loss": 0.5384, + "step": 22476 + }, + { + "epoch": 1.2586515847239332, + "grad_norm": 1.2900781631469727, + "learning_rate": 9.935078947368421e-05, + "loss": 0.3236, + "step": 22477 + }, + { + "epoch": 1.2587075820360623, + "grad_norm": 1.4119040966033936, + "learning_rate": 9.935052631578948e-05, + "loss": 0.4346, + "step": 22478 + }, + { + "epoch": 1.2587635793481913, + "grad_norm": 1.2123664617538452, + "learning_rate": 9.935026315789474e-05, + "loss": 0.3862, + "step": 22479 + }, + { + "epoch": 1.2588195766603203, + "grad_norm": 1.2141798734664917, + "learning_rate": 9.935000000000002e-05, + "loss": 0.4515, + "step": 22480 + }, + { + "epoch": 1.2588755739724493, + "grad_norm": 1.382216453552246, + "learning_rate": 9.934973684210526e-05, + "loss": 0.4536, + "step": 22481 + }, + { + "epoch": 1.2589315712845783, + "grad_norm": 1.378426432609558, + "learning_rate": 9.934947368421052e-05, + "loss": 0.581, + "step": 22482 + }, + { + "epoch": 1.2589875685967074, + "grad_norm": 1.417020320892334, + "learning_rate": 9.93492105263158e-05, + "loss": 0.3848, + "step": 22483 + }, + { + "epoch": 1.2590435659088364, + "grad_norm": 1.6204237937927246, + "learning_rate": 9.934894736842106e-05, + "loss": 0.5406, + "step": 22484 + }, + { + "epoch": 1.2590995632209654, + "grad_norm": 1.4758626222610474, + "learning_rate": 9.934868421052633e-05, + 
"loss": 0.5592, + "step": 22485 + }, + { + "epoch": 1.2591555605330944, + "grad_norm": 3.563157320022583, + "learning_rate": 9.934842105263157e-05, + "loss": 0.3563, + "step": 22486 + }, + { + "epoch": 1.2592115578452234, + "grad_norm": 1.2028464078903198, + "learning_rate": 9.934815789473685e-05, + "loss": 0.3935, + "step": 22487 + }, + { + "epoch": 1.2592675551573524, + "grad_norm": 1.3662872314453125, + "learning_rate": 9.934789473684211e-05, + "loss": 0.3752, + "step": 22488 + }, + { + "epoch": 1.2593235524694815, + "grad_norm": 1.7939461469650269, + "learning_rate": 9.934763157894738e-05, + "loss": 0.4497, + "step": 22489 + }, + { + "epoch": 1.2593795497816105, + "grad_norm": 1.3583811521530151, + "learning_rate": 9.934736842105263e-05, + "loss": 0.4819, + "step": 22490 + }, + { + "epoch": 1.2594355470937395, + "grad_norm": 1.4784542322158813, + "learning_rate": 9.93471052631579e-05, + "loss": 0.512, + "step": 22491 + }, + { + "epoch": 1.2594915444058685, + "grad_norm": 1.7761197090148926, + "learning_rate": 9.934684210526316e-05, + "loss": 0.4361, + "step": 22492 + }, + { + "epoch": 1.2595475417179975, + "grad_norm": 1.5552124977111816, + "learning_rate": 9.934657894736843e-05, + "loss": 0.4968, + "step": 22493 + }, + { + "epoch": 1.2596035390301266, + "grad_norm": 1.419356107711792, + "learning_rate": 9.934631578947369e-05, + "loss": 0.4412, + "step": 22494 + }, + { + "epoch": 1.2596595363422556, + "grad_norm": 1.3594200611114502, + "learning_rate": 9.934605263157895e-05, + "loss": 0.4545, + "step": 22495 + }, + { + "epoch": 1.2597155336543846, + "grad_norm": 1.2832539081573486, + "learning_rate": 9.934578947368421e-05, + "loss": 0.4305, + "step": 22496 + }, + { + "epoch": 1.2597715309665136, + "grad_norm": 1.2215479612350464, + "learning_rate": 9.934552631578949e-05, + "loss": 0.4875, + "step": 22497 + }, + { + "epoch": 1.2598275282786426, + "grad_norm": 1.4554733037948608, + "learning_rate": 9.934526315789475e-05, + "loss": 0.4148, + "step": 22498 + }, + { 
+ "epoch": 1.2598835255907717, + "grad_norm": 1.5905406475067139, + "learning_rate": 9.9345e-05, + "loss": 0.4645, + "step": 22499 + }, + { + "epoch": 1.2599395229029007, + "grad_norm": 1.4548741579055786, + "learning_rate": 9.934473684210526e-05, + "loss": 0.4613, + "step": 22500 + }, + { + "epoch": 1.2599955202150297, + "grad_norm": 1.1292834281921387, + "learning_rate": 9.934447368421052e-05, + "loss": 0.3886, + "step": 22501 + }, + { + "epoch": 1.2600515175271587, + "grad_norm": 1.3880661725997925, + "learning_rate": 9.93442105263158e-05, + "loss": 0.4223, + "step": 22502 + }, + { + "epoch": 1.2601075148392877, + "grad_norm": 1.3671998977661133, + "learning_rate": 9.934394736842106e-05, + "loss": 0.4806, + "step": 22503 + }, + { + "epoch": 1.2601635121514168, + "grad_norm": 4.363045692443848, + "learning_rate": 9.934368421052632e-05, + "loss": 0.4813, + "step": 22504 + }, + { + "epoch": 1.2602195094635458, + "grad_norm": 1.4290465116500854, + "learning_rate": 9.934342105263158e-05, + "loss": 0.5236, + "step": 22505 + }, + { + "epoch": 1.2602755067756748, + "grad_norm": 2.196024179458618, + "learning_rate": 9.934315789473685e-05, + "loss": 0.3131, + "step": 22506 + }, + { + "epoch": 1.2603315040878038, + "grad_norm": 1.4151439666748047, + "learning_rate": 9.934289473684211e-05, + "loss": 0.4163, + "step": 22507 + }, + { + "epoch": 1.2603875013999328, + "grad_norm": 1.2434852123260498, + "learning_rate": 9.934263157894737e-05, + "loss": 0.4072, + "step": 22508 + }, + { + "epoch": 1.2604434987120618, + "grad_norm": 1.2777451276779175, + "learning_rate": 9.934236842105263e-05, + "loss": 0.4306, + "step": 22509 + }, + { + "epoch": 1.2604994960241909, + "grad_norm": 1.3524788618087769, + "learning_rate": 9.93421052631579e-05, + "loss": 0.4388, + "step": 22510 + }, + { + "epoch": 1.2605554933363199, + "grad_norm": 1.5618058443069458, + "learning_rate": 9.934184210526316e-05, + "loss": 0.6227, + "step": 22511 + }, + { + "epoch": 1.260611490648449, + "grad_norm": 
1.243468165397644, + "learning_rate": 9.934157894736844e-05, + "loss": 0.4222, + "step": 22512 + }, + { + "epoch": 1.260667487960578, + "grad_norm": 2.1506240367889404, + "learning_rate": 9.934131578947368e-05, + "loss": 0.4433, + "step": 22513 + }, + { + "epoch": 1.260723485272707, + "grad_norm": 3.119558572769165, + "learning_rate": 9.934105263157896e-05, + "loss": 0.4319, + "step": 22514 + }, + { + "epoch": 1.260779482584836, + "grad_norm": 1.3329764604568481, + "learning_rate": 9.934078947368421e-05, + "loss": 0.4438, + "step": 22515 + }, + { + "epoch": 1.260835479896965, + "grad_norm": 1.4162479639053345, + "learning_rate": 9.934052631578949e-05, + "loss": 0.4017, + "step": 22516 + }, + { + "epoch": 1.260891477209094, + "grad_norm": 1.6753971576690674, + "learning_rate": 9.934026315789475e-05, + "loss": 0.5185, + "step": 22517 + }, + { + "epoch": 1.260947474521223, + "grad_norm": 1.3797465562820435, + "learning_rate": 9.934e-05, + "loss": 0.3933, + "step": 22518 + }, + { + "epoch": 1.261003471833352, + "grad_norm": 1.4132909774780273, + "learning_rate": 9.933973684210527e-05, + "loss": 0.4236, + "step": 22519 + }, + { + "epoch": 1.261059469145481, + "grad_norm": 1.2999058961868286, + "learning_rate": 9.933947368421053e-05, + "loss": 0.3285, + "step": 22520 + }, + { + "epoch": 1.26111546645761, + "grad_norm": 1.4819581508636475, + "learning_rate": 9.93392105263158e-05, + "loss": 0.3924, + "step": 22521 + }, + { + "epoch": 1.261171463769739, + "grad_norm": 2.1245462894439697, + "learning_rate": 9.933894736842106e-05, + "loss": 0.4404, + "step": 22522 + }, + { + "epoch": 1.2612274610818681, + "grad_norm": 1.2867658138275146, + "learning_rate": 9.933868421052632e-05, + "loss": 0.5039, + "step": 22523 + }, + { + "epoch": 1.2612834583939971, + "grad_norm": 1.4297866821289062, + "learning_rate": 9.933842105263158e-05, + "loss": 0.4432, + "step": 22524 + }, + { + "epoch": 1.2613394557061262, + "grad_norm": 1.3429068326950073, + "learning_rate": 9.933815789473685e-05, 
+ "loss": 0.5242, + "step": 22525 + }, + { + "epoch": 1.2613954530182552, + "grad_norm": 2.5976576805114746, + "learning_rate": 9.933789473684211e-05, + "loss": 0.5616, + "step": 22526 + }, + { + "epoch": 1.2614514503303842, + "grad_norm": 1.0975341796875, + "learning_rate": 9.933763157894737e-05, + "loss": 0.3919, + "step": 22527 + }, + { + "epoch": 1.2615074476425132, + "grad_norm": 1.465174913406372, + "learning_rate": 9.933736842105263e-05, + "loss": 0.5706, + "step": 22528 + }, + { + "epoch": 1.2615634449546422, + "grad_norm": 1.300743579864502, + "learning_rate": 9.93371052631579e-05, + "loss": 0.4728, + "step": 22529 + }, + { + "epoch": 1.2616194422667713, + "grad_norm": 2.122889757156372, + "learning_rate": 9.933684210526317e-05, + "loss": 0.3431, + "step": 22530 + }, + { + "epoch": 1.2616754395789003, + "grad_norm": 1.3574353456497192, + "learning_rate": 9.933657894736842e-05, + "loss": 0.4763, + "step": 22531 + }, + { + "epoch": 1.2617314368910293, + "grad_norm": 1.1337097883224487, + "learning_rate": 9.933631578947368e-05, + "loss": 0.3846, + "step": 22532 + }, + { + "epoch": 1.2617874342031583, + "grad_norm": 1.2418605089187622, + "learning_rate": 9.933605263157896e-05, + "loss": 0.4468, + "step": 22533 + }, + { + "epoch": 1.2618434315152873, + "grad_norm": 1.3232368230819702, + "learning_rate": 9.933578947368422e-05, + "loss": 0.5621, + "step": 22534 + }, + { + "epoch": 1.2618994288274163, + "grad_norm": 1.1776726245880127, + "learning_rate": 9.933552631578948e-05, + "loss": 0.4511, + "step": 22535 + }, + { + "epoch": 1.2619554261395454, + "grad_norm": 1.4379807710647583, + "learning_rate": 9.933526315789474e-05, + "loss": 0.4569, + "step": 22536 + }, + { + "epoch": 1.2620114234516744, + "grad_norm": 1.3012895584106445, + "learning_rate": 9.9335e-05, + "loss": 0.3947, + "step": 22537 + }, + { + "epoch": 1.2620674207638034, + "grad_norm": 1.8353028297424316, + "learning_rate": 9.933473684210527e-05, + "loss": 0.3924, + "step": 22538 + }, + { + "epoch": 
1.2621234180759324, + "grad_norm": 1.2095950841903687, + "learning_rate": 9.933447368421053e-05, + "loss": 0.4401, + "step": 22539 + }, + { + "epoch": 1.2621794153880614, + "grad_norm": 1.2847932577133179, + "learning_rate": 9.93342105263158e-05, + "loss": 0.4456, + "step": 22540 + }, + { + "epoch": 1.2622354127001905, + "grad_norm": 1.5020012855529785, + "learning_rate": 9.933394736842105e-05, + "loss": 0.5991, + "step": 22541 + }, + { + "epoch": 1.2622914100123195, + "grad_norm": 1.4112101793289185, + "learning_rate": 9.933368421052632e-05, + "loss": 0.4483, + "step": 22542 + }, + { + "epoch": 1.2623474073244485, + "grad_norm": 3.058119058609009, + "learning_rate": 9.933342105263158e-05, + "loss": 0.3954, + "step": 22543 + }, + { + "epoch": 1.2624034046365775, + "grad_norm": 1.421589732170105, + "learning_rate": 9.933315789473686e-05, + "loss": 0.4271, + "step": 22544 + }, + { + "epoch": 1.2624594019487065, + "grad_norm": 1.2064660787582397, + "learning_rate": 9.93328947368421e-05, + "loss": 0.3404, + "step": 22545 + }, + { + "epoch": 1.2625153992608356, + "grad_norm": 1.434069037437439, + "learning_rate": 9.933263157894737e-05, + "loss": 0.4127, + "step": 22546 + }, + { + "epoch": 1.2625713965729646, + "grad_norm": 1.3939950466156006, + "learning_rate": 9.933236842105263e-05, + "loss": 0.4086, + "step": 22547 + }, + { + "epoch": 1.2626273938850936, + "grad_norm": 1.2280699014663696, + "learning_rate": 9.933210526315791e-05, + "loss": 0.5022, + "step": 22548 + }, + { + "epoch": 1.2626833911972226, + "grad_norm": 1.229111909866333, + "learning_rate": 9.933184210526317e-05, + "loss": 0.4686, + "step": 22549 + }, + { + "epoch": 1.2627393885093516, + "grad_norm": 1.3211842775344849, + "learning_rate": 9.933157894736843e-05, + "loss": 0.3912, + "step": 22550 + }, + { + "epoch": 1.2627953858214807, + "grad_norm": 1.461344838142395, + "learning_rate": 9.933131578947369e-05, + "loss": 0.5204, + "step": 22551 + }, + { + "epoch": 1.2628513831336097, + "grad_norm": 
1.4679924249649048, + "learning_rate": 9.933105263157895e-05, + "loss": 0.4477, + "step": 22552 + }, + { + "epoch": 1.2629073804457387, + "grad_norm": 1.2944622039794922, + "learning_rate": 9.933078947368422e-05, + "loss": 0.3761, + "step": 22553 + }, + { + "epoch": 1.2629633777578677, + "grad_norm": 1.4670464992523193, + "learning_rate": 9.933052631578948e-05, + "loss": 0.4294, + "step": 22554 + }, + { + "epoch": 1.2630193750699967, + "grad_norm": 1.3800796270370483, + "learning_rate": 9.933026315789474e-05, + "loss": 0.4694, + "step": 22555 + }, + { + "epoch": 1.2630753723821257, + "grad_norm": 1.2836834192276, + "learning_rate": 9.933e-05, + "loss": 0.437, + "step": 22556 + }, + { + "epoch": 1.2631313696942548, + "grad_norm": 1.2517718076705933, + "learning_rate": 9.932973684210527e-05, + "loss": 0.446, + "step": 22557 + }, + { + "epoch": 1.2631873670063838, + "grad_norm": 1.3426647186279297, + "learning_rate": 9.932947368421053e-05, + "loss": 0.4189, + "step": 22558 + }, + { + "epoch": 1.2632433643185128, + "grad_norm": 1.2995389699935913, + "learning_rate": 9.932921052631579e-05, + "loss": 0.3016, + "step": 22559 + }, + { + "epoch": 1.2632993616306418, + "grad_norm": 1.4668916463851929, + "learning_rate": 9.932894736842105e-05, + "loss": 0.3776, + "step": 22560 + }, + { + "epoch": 1.2633553589427708, + "grad_norm": 1.3852781057357788, + "learning_rate": 9.932868421052633e-05, + "loss": 0.5454, + "step": 22561 + }, + { + "epoch": 1.2634113562548999, + "grad_norm": 1.3087972402572632, + "learning_rate": 9.932842105263158e-05, + "loss": 0.4569, + "step": 22562 + }, + { + "epoch": 1.2634673535670289, + "grad_norm": 1.356679916381836, + "learning_rate": 9.932815789473684e-05, + "loss": 0.4116, + "step": 22563 + }, + { + "epoch": 1.263523350879158, + "grad_norm": 1.1689960956573486, + "learning_rate": 9.93278947368421e-05, + "loss": 0.3984, + "step": 22564 + }, + { + "epoch": 1.263579348191287, + "grad_norm": 1.408976674079895, + "learning_rate": 
9.932763157894738e-05, + "loss": 0.5703, + "step": 22565 + }, + { + "epoch": 1.263635345503416, + "grad_norm": 1.362504243850708, + "learning_rate": 9.932736842105264e-05, + "loss": 0.3978, + "step": 22566 + }, + { + "epoch": 1.263691342815545, + "grad_norm": 1.2659173011779785, + "learning_rate": 9.932710526315791e-05, + "loss": 0.4344, + "step": 22567 + }, + { + "epoch": 1.263747340127674, + "grad_norm": 1.2167366743087769, + "learning_rate": 9.932684210526316e-05, + "loss": 0.4085, + "step": 22568 + }, + { + "epoch": 1.263803337439803, + "grad_norm": 1.1932754516601562, + "learning_rate": 9.932657894736842e-05, + "loss": 0.3856, + "step": 22569 + }, + { + "epoch": 1.263859334751932, + "grad_norm": 1.2302398681640625, + "learning_rate": 9.932631578947369e-05, + "loss": 0.3514, + "step": 22570 + }, + { + "epoch": 1.263915332064061, + "grad_norm": 1.4113627672195435, + "learning_rate": 9.932605263157895e-05, + "loss": 0.562, + "step": 22571 + }, + { + "epoch": 1.26397132937619, + "grad_norm": 1.2027363777160645, + "learning_rate": 9.932578947368422e-05, + "loss": 0.4555, + "step": 22572 + }, + { + "epoch": 1.264027326688319, + "grad_norm": 1.3833410739898682, + "learning_rate": 9.932552631578947e-05, + "loss": 0.4095, + "step": 22573 + }, + { + "epoch": 1.264083324000448, + "grad_norm": 1.2301329374313354, + "learning_rate": 9.932526315789474e-05, + "loss": 0.3541, + "step": 22574 + }, + { + "epoch": 1.2641393213125771, + "grad_norm": 1.1803874969482422, + "learning_rate": 9.9325e-05, + "loss": 0.3284, + "step": 22575 + }, + { + "epoch": 1.2641953186247061, + "grad_norm": 1.8832403421401978, + "learning_rate": 9.932473684210528e-05, + "loss": 0.4084, + "step": 22576 + }, + { + "epoch": 1.2642513159368352, + "grad_norm": 1.1612879037857056, + "learning_rate": 9.932447368421053e-05, + "loss": 0.3429, + "step": 22577 + }, + { + "epoch": 1.2643073132489642, + "grad_norm": 1.7041854858398438, + "learning_rate": 9.93242105263158e-05, + "loss": 0.3767, + "step": 22578 + 
}, + { + "epoch": 1.2643633105610932, + "grad_norm": 1.5245976448059082, + "learning_rate": 9.932394736842105e-05, + "loss": 0.4273, + "step": 22579 + }, + { + "epoch": 1.2644193078732222, + "grad_norm": 1.4128202199935913, + "learning_rate": 9.932368421052633e-05, + "loss": 0.3816, + "step": 22580 + }, + { + "epoch": 1.2644753051853512, + "grad_norm": 1.1555976867675781, + "learning_rate": 9.932342105263159e-05, + "loss": 0.4031, + "step": 22581 + }, + { + "epoch": 1.2645313024974802, + "grad_norm": 1.5287786722183228, + "learning_rate": 9.932315789473685e-05, + "loss": 0.4134, + "step": 22582 + }, + { + "epoch": 1.2645872998096093, + "grad_norm": 1.2848385572433472, + "learning_rate": 9.932289473684211e-05, + "loss": 0.3846, + "step": 22583 + }, + { + "epoch": 1.2646432971217383, + "grad_norm": 4.332828521728516, + "learning_rate": 9.932263157894738e-05, + "loss": 0.5469, + "step": 22584 + }, + { + "epoch": 1.2646992944338673, + "grad_norm": 1.4228366613388062, + "learning_rate": 9.932236842105264e-05, + "loss": 0.6276, + "step": 22585 + }, + { + "epoch": 1.2647552917459963, + "grad_norm": 1.3125123977661133, + "learning_rate": 9.93221052631579e-05, + "loss": 0.5502, + "step": 22586 + }, + { + "epoch": 1.2648112890581253, + "grad_norm": 1.3094737529754639, + "learning_rate": 9.932184210526316e-05, + "loss": 0.4707, + "step": 22587 + }, + { + "epoch": 1.2648672863702544, + "grad_norm": 1.190169334411621, + "learning_rate": 9.932157894736842e-05, + "loss": 0.4817, + "step": 22588 + }, + { + "epoch": 1.2649232836823834, + "grad_norm": 1.2978438138961792, + "learning_rate": 9.932131578947369e-05, + "loss": 0.4921, + "step": 22589 + }, + { + "epoch": 1.2649792809945124, + "grad_norm": 1.3994370698928833, + "learning_rate": 9.932105263157895e-05, + "loss": 0.8326, + "step": 22590 + }, + { + "epoch": 1.2650352783066414, + "grad_norm": null, + "learning_rate": 9.932105263157895e-05, + "loss": 0.5344, + "step": 22591 + }, + { + "epoch": 1.2650912756187702, + 
"grad_norm": 1.2328864336013794, + "learning_rate": 9.932078947368421e-05, + "loss": 0.3464, + "step": 22592 + }, + { + "epoch": 1.2651472729308992, + "grad_norm": 1.366552710533142, + "learning_rate": 9.932052631578947e-05, + "loss": 0.481, + "step": 22593 + }, + { + "epoch": 1.2652032702430283, + "grad_norm": 1.4918768405914307, + "learning_rate": 9.932026315789474e-05, + "loss": 0.5796, + "step": 22594 + }, + { + "epoch": 1.2652592675551573, + "grad_norm": 1.2238956689834595, + "learning_rate": 9.932e-05, + "loss": 0.4425, + "step": 22595 + }, + { + "epoch": 1.2653152648672863, + "grad_norm": 1.2184131145477295, + "learning_rate": 9.931973684210526e-05, + "loss": 0.3699, + "step": 22596 + }, + { + "epoch": 1.2653712621794153, + "grad_norm": 1.5443089008331299, + "learning_rate": 9.931947368421052e-05, + "loss": 0.4648, + "step": 22597 + }, + { + "epoch": 1.2654272594915443, + "grad_norm": 1.5369009971618652, + "learning_rate": 9.93192105263158e-05, + "loss": 0.4838, + "step": 22598 + }, + { + "epoch": 1.2654832568036734, + "grad_norm": 1.3724949359893799, + "learning_rate": 9.931894736842106e-05, + "loss": 0.5331, + "step": 22599 + }, + { + "epoch": 1.2655392541158024, + "grad_norm": 1.2850879430770874, + "learning_rate": 9.931868421052633e-05, + "loss": 0.4102, + "step": 22600 + }, + { + "epoch": 1.2655952514279314, + "grad_norm": 1.2060558795928955, + "learning_rate": 9.931842105263158e-05, + "loss": 0.4479, + "step": 22601 + }, + { + "epoch": 1.2656512487400604, + "grad_norm": 1.307312250137329, + "learning_rate": 9.931815789473685e-05, + "loss": 0.4925, + "step": 22602 + }, + { + "epoch": 1.2657072460521894, + "grad_norm": 1.4637165069580078, + "learning_rate": 9.931789473684211e-05, + "loss": 0.4648, + "step": 22603 + }, + { + "epoch": 1.2657632433643184, + "grad_norm": 1.487190842628479, + "learning_rate": 9.931763157894737e-05, + "loss": 0.4919, + "step": 22604 + }, + { + "epoch": 1.2658192406764475, + "grad_norm": 1.3126341104507446, + "learning_rate": 
9.931736842105264e-05, + "loss": 0.3671, + "step": 22605 + }, + { + "epoch": 1.2658752379885765, + "grad_norm": 1.2969690561294556, + "learning_rate": 9.931710526315789e-05, + "loss": 0.5605, + "step": 22606 + }, + { + "epoch": 1.2659312353007055, + "grad_norm": 1.6308763027191162, + "learning_rate": 9.931684210526316e-05, + "loss": 0.4663, + "step": 22607 + }, + { + "epoch": 1.2659872326128345, + "grad_norm": 1.4421902894973755, + "learning_rate": 9.931657894736842e-05, + "loss": 0.5375, + "step": 22608 + }, + { + "epoch": 1.2660432299249635, + "grad_norm": 1.2509255409240723, + "learning_rate": 9.93163157894737e-05, + "loss": 0.4908, + "step": 22609 + }, + { + "epoch": 1.2660992272370926, + "grad_norm": 1.2548978328704834, + "learning_rate": 9.931605263157895e-05, + "loss": 0.4617, + "step": 22610 + }, + { + "epoch": 1.2661552245492216, + "grad_norm": 1.403206467628479, + "learning_rate": 9.931578947368421e-05, + "loss": 0.4799, + "step": 22611 + }, + { + "epoch": 1.2662112218613506, + "grad_norm": 1.8052750825881958, + "learning_rate": 9.931552631578947e-05, + "loss": 0.5925, + "step": 22612 + }, + { + "epoch": 1.2662672191734796, + "grad_norm": 6.041443824768066, + "learning_rate": 9.931526315789475e-05, + "loss": 0.4942, + "step": 22613 + }, + { + "epoch": 1.2663232164856086, + "grad_norm": 1.2583448886871338, + "learning_rate": 9.931500000000001e-05, + "loss": 0.456, + "step": 22614 + }, + { + "epoch": 1.2663792137977377, + "grad_norm": 1.2603918313980103, + "learning_rate": 9.931473684210527e-05, + "loss": 0.3992, + "step": 22615 + }, + { + "epoch": 1.2664352111098667, + "grad_norm": 1.594143271446228, + "learning_rate": 9.931447368421053e-05, + "loss": 0.6076, + "step": 22616 + }, + { + "epoch": 1.2664912084219957, + "grad_norm": 1.2620702981948853, + "learning_rate": 9.93142105263158e-05, + "loss": 0.3765, + "step": 22617 + }, + { + "epoch": 1.2665472057341247, + "grad_norm": 1.5117707252502441, + "learning_rate": 9.931394736842106e-05, + "loss": 0.4807, + 
"step": 22618 + }, + { + "epoch": 1.2666032030462537, + "grad_norm": 1.297575831413269, + "learning_rate": 9.931368421052632e-05, + "loss": 0.4377, + "step": 22619 + }, + { + "epoch": 1.2666592003583828, + "grad_norm": 1.1439862251281738, + "learning_rate": 9.931342105263158e-05, + "loss": 0.2711, + "step": 22620 + }, + { + "epoch": 1.2667151976705118, + "grad_norm": 1.2498003244400024, + "learning_rate": 9.931315789473685e-05, + "loss": 0.4216, + "step": 22621 + }, + { + "epoch": 1.2667711949826408, + "grad_norm": 1.4508490562438965, + "learning_rate": 9.931289473684211e-05, + "loss": 0.5733, + "step": 22622 + }, + { + "epoch": 1.2668271922947698, + "grad_norm": 1.2860407829284668, + "learning_rate": 9.931263157894737e-05, + "loss": 0.4151, + "step": 22623 + }, + { + "epoch": 1.2668831896068988, + "grad_norm": 1.059950590133667, + "learning_rate": 9.931236842105263e-05, + "loss": 0.4224, + "step": 22624 + }, + { + "epoch": 1.2669391869190278, + "grad_norm": 1.1142617464065552, + "learning_rate": 9.931210526315789e-05, + "loss": 0.348, + "step": 22625 + }, + { + "epoch": 1.2669951842311569, + "grad_norm": 2.086233615875244, + "learning_rate": 9.931184210526316e-05, + "loss": 0.5684, + "step": 22626 + }, + { + "epoch": 1.2670511815432859, + "grad_norm": 1.5305190086364746, + "learning_rate": 9.931157894736842e-05, + "loss": 0.4595, + "step": 22627 + }, + { + "epoch": 1.267107178855415, + "grad_norm": 1.4111287593841553, + "learning_rate": 9.93113157894737e-05, + "loss": 0.5267, + "step": 22628 + }, + { + "epoch": 1.267163176167544, + "grad_norm": 2.5089962482452393, + "learning_rate": 9.931105263157894e-05, + "loss": 0.4802, + "step": 22629 + }, + { + "epoch": 1.267219173479673, + "grad_norm": 1.3890100717544556, + "learning_rate": 9.931078947368422e-05, + "loss": 0.4647, + "step": 22630 + }, + { + "epoch": 1.267275170791802, + "grad_norm": 1.2288728952407837, + "learning_rate": 9.931052631578948e-05, + "loss": 0.4682, + "step": 22631 + }, + { + "epoch": 
1.267331168103931, + "grad_norm": 1.0214381217956543, + "learning_rate": 9.931026315789475e-05, + "loss": 0.3127, + "step": 22632 + }, + { + "epoch": 1.26738716541606, + "grad_norm": 1.4180487394332886, + "learning_rate": 9.931000000000001e-05, + "loss": 0.3874, + "step": 22633 + }, + { + "epoch": 1.267443162728189, + "grad_norm": 1.4683669805526733, + "learning_rate": 9.930973684210527e-05, + "loss": 0.424, + "step": 22634 + }, + { + "epoch": 1.267499160040318, + "grad_norm": 1.7716941833496094, + "learning_rate": 9.930947368421053e-05, + "loss": 0.5721, + "step": 22635 + }, + { + "epoch": 1.267555157352447, + "grad_norm": 1.7148070335388184, + "learning_rate": 9.93092105263158e-05, + "loss": 0.5187, + "step": 22636 + }, + { + "epoch": 1.267611154664576, + "grad_norm": 1.155671238899231, + "learning_rate": 9.930894736842106e-05, + "loss": 0.2932, + "step": 22637 + }, + { + "epoch": 1.267667151976705, + "grad_norm": 1.2717502117156982, + "learning_rate": 9.930868421052632e-05, + "loss": 0.3633, + "step": 22638 + }, + { + "epoch": 1.2677231492888341, + "grad_norm": 1.1481229066848755, + "learning_rate": 9.930842105263158e-05, + "loss": 0.3724, + "step": 22639 + }, + { + "epoch": 1.2677791466009631, + "grad_norm": 1.5281686782836914, + "learning_rate": 9.930815789473684e-05, + "loss": 0.4342, + "step": 22640 + }, + { + "epoch": 1.2678351439130922, + "grad_norm": 1.3560229539871216, + "learning_rate": 9.930789473684211e-05, + "loss": 0.3925, + "step": 22641 + }, + { + "epoch": 1.2678911412252212, + "grad_norm": 1.318763017654419, + "learning_rate": 9.930763157894737e-05, + "loss": 0.3761, + "step": 22642 + }, + { + "epoch": 1.2679471385373502, + "grad_norm": 1.2733261585235596, + "learning_rate": 9.930736842105263e-05, + "loss": 0.5068, + "step": 22643 + }, + { + "epoch": 1.2680031358494792, + "grad_norm": 1.5524920225143433, + "learning_rate": 9.93071052631579e-05, + "loss": 0.4662, + "step": 22644 + }, + { + "epoch": 1.2680591331616082, + "grad_norm": 
1.2721418142318726, + "learning_rate": 9.930684210526317e-05, + "loss": 0.4295, + "step": 22645 + }, + { + "epoch": 1.2681151304737373, + "grad_norm": 1.371688723564148, + "learning_rate": 9.930657894736843e-05, + "loss": 0.4775, + "step": 22646 + }, + { + "epoch": 1.2681711277858663, + "grad_norm": 1.32282292842865, + "learning_rate": 9.930631578947369e-05, + "loss": 0.4748, + "step": 22647 + }, + { + "epoch": 1.2682271250979953, + "grad_norm": 1.218231439590454, + "learning_rate": 9.930605263157895e-05, + "loss": 0.4281, + "step": 22648 + }, + { + "epoch": 1.2682831224101243, + "grad_norm": 1.451771855354309, + "learning_rate": 9.930578947368422e-05, + "loss": 0.4842, + "step": 22649 + }, + { + "epoch": 1.2683391197222533, + "grad_norm": 3.515385150909424, + "learning_rate": 9.930552631578948e-05, + "loss": 0.5745, + "step": 22650 + }, + { + "epoch": 1.2683951170343823, + "grad_norm": 1.4690711498260498, + "learning_rate": 9.930526315789474e-05, + "loss": 0.4232, + "step": 22651 + }, + { + "epoch": 1.2684511143465114, + "grad_norm": 2.0906107425689697, + "learning_rate": 9.9305e-05, + "loss": 0.3972, + "step": 22652 + }, + { + "epoch": 1.2685071116586404, + "grad_norm": 1.2995021343231201, + "learning_rate": 9.930473684210527e-05, + "loss": 0.5252, + "step": 22653 + }, + { + "epoch": 1.2685631089707694, + "grad_norm": 1.2416428327560425, + "learning_rate": 9.930447368421053e-05, + "loss": 0.4176, + "step": 22654 + }, + { + "epoch": 1.2686191062828984, + "grad_norm": 1.553397536277771, + "learning_rate": 9.93042105263158e-05, + "loss": 0.4392, + "step": 22655 + }, + { + "epoch": 1.2686751035950274, + "grad_norm": 1.4556041955947876, + "learning_rate": 9.930394736842105e-05, + "loss": 0.4612, + "step": 22656 + }, + { + "epoch": 1.2687311009071565, + "grad_norm": 1.3560270071029663, + "learning_rate": 9.930368421052631e-05, + "loss": 0.5722, + "step": 22657 + }, + { + "epoch": 1.2687870982192855, + "grad_norm": 1.7231885194778442, + "learning_rate": 
9.930342105263158e-05, + "loss": 0.5763, + "step": 22658 + }, + { + "epoch": 1.2688430955314145, + "grad_norm": 1.4385744333267212, + "learning_rate": 9.930315789473684e-05, + "loss": 0.3599, + "step": 22659 + }, + { + "epoch": 1.2688990928435435, + "grad_norm": 2.524521589279175, + "learning_rate": 9.930289473684212e-05, + "loss": 0.6094, + "step": 22660 + }, + { + "epoch": 1.2689550901556725, + "grad_norm": 3.413239002227783, + "learning_rate": 9.930263157894736e-05, + "loss": 0.6705, + "step": 22661 + }, + { + "epoch": 1.2690110874678016, + "grad_norm": 1.4177204370498657, + "learning_rate": 9.930236842105264e-05, + "loss": 0.4917, + "step": 22662 + }, + { + "epoch": 1.2690670847799306, + "grad_norm": 1.4034504890441895, + "learning_rate": 9.93021052631579e-05, + "loss": 0.432, + "step": 22663 + }, + { + "epoch": 1.2691230820920596, + "grad_norm": 1.50716233253479, + "learning_rate": 9.930184210526317e-05, + "loss": 0.5209, + "step": 22664 + }, + { + "epoch": 1.2691790794041886, + "grad_norm": 1.3623316287994385, + "learning_rate": 9.930157894736843e-05, + "loss": 0.4581, + "step": 22665 + }, + { + "epoch": 1.2692350767163176, + "grad_norm": 1.5193994045257568, + "learning_rate": 9.930131578947369e-05, + "loss": 0.3974, + "step": 22666 + }, + { + "epoch": 1.2692910740284467, + "grad_norm": 1.293651819229126, + "learning_rate": 9.930105263157895e-05, + "loss": 0.3696, + "step": 22667 + }, + { + "epoch": 1.2693470713405757, + "grad_norm": 1.8996182680130005, + "learning_rate": 9.930078947368422e-05, + "loss": 0.4571, + "step": 22668 + }, + { + "epoch": 1.2694030686527047, + "grad_norm": 1.2406213283538818, + "learning_rate": 9.930052631578948e-05, + "loss": 0.4494, + "step": 22669 + }, + { + "epoch": 1.2694590659648337, + "grad_norm": 4.635396957397461, + "learning_rate": 9.930026315789474e-05, + "loss": 0.5383, + "step": 22670 + }, + { + "epoch": 1.2695150632769627, + "grad_norm": 1.3629764318466187, + "learning_rate": 9.93e-05, + "loss": 0.3442, + "step": 22671 
+ }, + { + "epoch": 1.2695710605890917, + "grad_norm": 1.5809091329574585, + "learning_rate": 9.929973684210527e-05, + "loss": 0.4282, + "step": 22672 + }, + { + "epoch": 1.2696270579012208, + "grad_norm": 1.1668684482574463, + "learning_rate": 9.929947368421053e-05, + "loss": 0.3434, + "step": 22673 + }, + { + "epoch": 1.2696830552133498, + "grad_norm": 1.1767786741256714, + "learning_rate": 9.92992105263158e-05, + "loss": 0.3571, + "step": 22674 + }, + { + "epoch": 1.2697390525254788, + "grad_norm": 1.4719984531402588, + "learning_rate": 9.929894736842105e-05, + "loss": 0.4222, + "step": 22675 + }, + { + "epoch": 1.2697950498376078, + "grad_norm": 1.181861162185669, + "learning_rate": 9.929868421052631e-05, + "loss": 0.4426, + "step": 22676 + }, + { + "epoch": 1.2698510471497368, + "grad_norm": 1.6003828048706055, + "learning_rate": 9.929842105263159e-05, + "loss": 0.4858, + "step": 22677 + }, + { + "epoch": 1.2699070444618659, + "grad_norm": 2.3208487033843994, + "learning_rate": 9.929815789473685e-05, + "loss": 0.506, + "step": 22678 + }, + { + "epoch": 1.2699630417739949, + "grad_norm": 1.3948818445205688, + "learning_rate": 9.92978947368421e-05, + "loss": 0.3287, + "step": 22679 + }, + { + "epoch": 1.270019039086124, + "grad_norm": 1.5072851181030273, + "learning_rate": 9.929763157894737e-05, + "loss": 0.4279, + "step": 22680 + }, + { + "epoch": 1.270075036398253, + "grad_norm": 1.5477532148361206, + "learning_rate": 9.929736842105264e-05, + "loss": 0.5258, + "step": 22681 + }, + { + "epoch": 1.270131033710382, + "grad_norm": 1.3021478652954102, + "learning_rate": 9.92971052631579e-05, + "loss": 0.4106, + "step": 22682 + }, + { + "epoch": 1.270187031022511, + "grad_norm": 12.972148895263672, + "learning_rate": 9.929684210526317e-05, + "loss": 0.4896, + "step": 22683 + }, + { + "epoch": 1.27024302833464, + "grad_norm": 1.3581346273422241, + "learning_rate": 9.929657894736842e-05, + "loss": 0.5091, + "step": 22684 + }, + { + "epoch": 1.270299025646769, + 
"grad_norm": 2.3846018314361572, + "learning_rate": 9.929631578947369e-05, + "loss": 0.5777, + "step": 22685 + }, + { + "epoch": 1.270355022958898, + "grad_norm": 1.26251220703125, + "learning_rate": 9.929605263157895e-05, + "loss": 0.4046, + "step": 22686 + }, + { + "epoch": 1.270411020271027, + "grad_norm": 1.7669390439987183, + "learning_rate": 9.929578947368422e-05, + "loss": 0.3931, + "step": 22687 + }, + { + "epoch": 1.270467017583156, + "grad_norm": 1.5544970035552979, + "learning_rate": 9.929552631578948e-05, + "loss": 0.5815, + "step": 22688 + }, + { + "epoch": 1.270523014895285, + "grad_norm": 1.0569502115249634, + "learning_rate": 9.929526315789474e-05, + "loss": 0.3724, + "step": 22689 + }, + { + "epoch": 1.270579012207414, + "grad_norm": 1.8381023406982422, + "learning_rate": 9.9295e-05, + "loss": 0.5538, + "step": 22690 + }, + { + "epoch": 1.2706350095195431, + "grad_norm": 1.1855922937393188, + "learning_rate": 9.929473684210526e-05, + "loss": 0.4204, + "step": 22691 + }, + { + "epoch": 1.2706910068316721, + "grad_norm": 1.2583905458450317, + "learning_rate": 9.929447368421054e-05, + "loss": 0.4074, + "step": 22692 + }, + { + "epoch": 1.2707470041438012, + "grad_norm": 1.2125978469848633, + "learning_rate": 9.929421052631578e-05, + "loss": 0.4162, + "step": 22693 + }, + { + "epoch": 1.2708030014559302, + "grad_norm": 1.5250681638717651, + "learning_rate": 9.929394736842106e-05, + "loss": 0.4806, + "step": 22694 + }, + { + "epoch": 1.2708589987680592, + "grad_norm": 1.3331366777420044, + "learning_rate": 9.929368421052632e-05, + "loss": 0.3697, + "step": 22695 + }, + { + "epoch": 1.2709149960801882, + "grad_norm": 1.1396459341049194, + "learning_rate": 9.929342105263159e-05, + "loss": 0.4164, + "step": 22696 + }, + { + "epoch": 1.2709709933923172, + "grad_norm": 1.2768718004226685, + "learning_rate": 9.929315789473685e-05, + "loss": 0.3931, + "step": 22697 + }, + { + "epoch": 1.2710269907044462, + "grad_norm": 1.8912655115127563, + "learning_rate": 
9.929289473684211e-05, + "loss": 0.5287, + "step": 22698 + }, + { + "epoch": 1.2710829880165753, + "grad_norm": 1.3284953832626343, + "learning_rate": 9.929263157894737e-05, + "loss": 0.4845, + "step": 22699 + }, + { + "epoch": 1.2711389853287043, + "grad_norm": 1.2338652610778809, + "learning_rate": 9.929236842105264e-05, + "loss": 0.4182, + "step": 22700 + }, + { + "epoch": 1.2711949826408333, + "grad_norm": 1.208163857460022, + "learning_rate": 9.92921052631579e-05, + "loss": 0.4332, + "step": 22701 + }, + { + "epoch": 1.2712509799529623, + "grad_norm": 1.2945038080215454, + "learning_rate": 9.929184210526316e-05, + "loss": 0.4089, + "step": 22702 + }, + { + "epoch": 1.2713069772650913, + "grad_norm": 1.4350838661193848, + "learning_rate": 9.929157894736842e-05, + "loss": 0.5331, + "step": 22703 + }, + { + "epoch": 1.2713629745772204, + "grad_norm": 1.4823426008224487, + "learning_rate": 9.92913157894737e-05, + "loss": 0.4397, + "step": 22704 + }, + { + "epoch": 1.2714189718893494, + "grad_norm": 1.3634966611862183, + "learning_rate": 9.929105263157895e-05, + "loss": 0.4962, + "step": 22705 + }, + { + "epoch": 1.2714749692014784, + "grad_norm": 1.4452133178710938, + "learning_rate": 9.929078947368421e-05, + "loss": 0.5064, + "step": 22706 + }, + { + "epoch": 1.2715309665136074, + "grad_norm": 1.3377217054367065, + "learning_rate": 9.929052631578947e-05, + "loss": 0.4391, + "step": 22707 + }, + { + "epoch": 1.2715869638257364, + "grad_norm": 1.923106074333191, + "learning_rate": 9.929026315789473e-05, + "loss": 0.5312, + "step": 22708 + }, + { + "epoch": 1.2716429611378655, + "grad_norm": 1.1288068294525146, + "learning_rate": 9.929e-05, + "loss": 0.4602, + "step": 22709 + }, + { + "epoch": 1.2716989584499945, + "grad_norm": 1.2240934371948242, + "learning_rate": 9.928973684210527e-05, + "loss": 0.4053, + "step": 22710 + }, + { + "epoch": 1.2717549557621235, + "grad_norm": 1.4392454624176025, + "learning_rate": 9.928947368421053e-05, + "loss": 0.4006, + "step": 
22711 + }, + { + "epoch": 1.2718109530742525, + "grad_norm": 1.578526496887207, + "learning_rate": 9.928921052631579e-05, + "loss": 0.5425, + "step": 22712 + }, + { + "epoch": 1.2718669503863815, + "grad_norm": 2.0550968647003174, + "learning_rate": 9.928894736842106e-05, + "loss": 0.5106, + "step": 22713 + }, + { + "epoch": 1.2719229476985106, + "grad_norm": 1.3066192865371704, + "learning_rate": 9.928868421052632e-05, + "loss": 0.3744, + "step": 22714 + }, + { + "epoch": 1.2719789450106396, + "grad_norm": 1.4003432989120483, + "learning_rate": 9.928842105263159e-05, + "loss": 0.4796, + "step": 22715 + }, + { + "epoch": 1.2720349423227684, + "grad_norm": 1.2575757503509521, + "learning_rate": 9.928815789473684e-05, + "loss": 0.3904, + "step": 22716 + }, + { + "epoch": 1.2720909396348974, + "grad_norm": 1.1841113567352295, + "learning_rate": 9.928789473684211e-05, + "loss": 0.3849, + "step": 22717 + }, + { + "epoch": 1.2721469369470264, + "grad_norm": 1.306808352470398, + "learning_rate": 9.928763157894737e-05, + "loss": 0.434, + "step": 22718 + }, + { + "epoch": 1.2722029342591554, + "grad_norm": 1.1899160146713257, + "learning_rate": 9.928736842105264e-05, + "loss": 0.4461, + "step": 22719 + }, + { + "epoch": 1.2722589315712844, + "grad_norm": 1.0495973825454712, + "learning_rate": 9.92871052631579e-05, + "loss": 0.4398, + "step": 22720 + }, + { + "epoch": 1.2723149288834135, + "grad_norm": 1.5488747358322144, + "learning_rate": 9.928684210526316e-05, + "loss": 0.5062, + "step": 22721 + }, + { + "epoch": 1.2723709261955425, + "grad_norm": 1.3600432872772217, + "learning_rate": 9.928657894736842e-05, + "loss": 0.4004, + "step": 22722 + }, + { + "epoch": 1.2724269235076715, + "grad_norm": 1.1288678646087646, + "learning_rate": 9.92863157894737e-05, + "loss": 0.4184, + "step": 22723 + }, + { + "epoch": 1.2724829208198005, + "grad_norm": 1.2847695350646973, + "learning_rate": 9.928605263157896e-05, + "loss": 0.434, + "step": 22724 + }, + { + "epoch": 
1.2725389181319295, + "grad_norm": 1.3045952320098877, + "learning_rate": 9.928578947368422e-05, + "loss": 0.4622, + "step": 22725 + }, + { + "epoch": 1.2725949154440586, + "grad_norm": 1.2724335193634033, + "learning_rate": 9.928552631578948e-05, + "loss": 0.3853, + "step": 22726 + }, + { + "epoch": 1.2726509127561876, + "grad_norm": 1.4003771543502808, + "learning_rate": 9.928526315789474e-05, + "loss": 0.4671, + "step": 22727 + }, + { + "epoch": 1.2727069100683166, + "grad_norm": 1.9080029726028442, + "learning_rate": 9.928500000000001e-05, + "loss": 0.6663, + "step": 22728 + }, + { + "epoch": 1.2727629073804456, + "grad_norm": 1.3403178453445435, + "learning_rate": 9.928473684210527e-05, + "loss": 0.4044, + "step": 22729 + }, + { + "epoch": 1.2728189046925746, + "grad_norm": 1.050033450126648, + "learning_rate": 9.928447368421053e-05, + "loss": 0.3496, + "step": 22730 + }, + { + "epoch": 1.2728749020047037, + "grad_norm": 1.0832682847976685, + "learning_rate": 9.928421052631579e-05, + "loss": 0.4166, + "step": 22731 + }, + { + "epoch": 1.2729308993168327, + "grad_norm": 1.425012469291687, + "learning_rate": 9.928394736842106e-05, + "loss": 0.4182, + "step": 22732 + }, + { + "epoch": 1.2729868966289617, + "grad_norm": 1.3718820810317993, + "learning_rate": 9.928368421052632e-05, + "loss": 0.5987, + "step": 22733 + }, + { + "epoch": 1.2730428939410907, + "grad_norm": 1.3492567539215088, + "learning_rate": 9.928342105263158e-05, + "loss": 0.4202, + "step": 22734 + }, + { + "epoch": 1.2730988912532197, + "grad_norm": 9.869452476501465, + "learning_rate": 9.928315789473684e-05, + "loss": 0.5806, + "step": 22735 + }, + { + "epoch": 1.2731548885653488, + "grad_norm": 1.481376051902771, + "learning_rate": 9.928289473684211e-05, + "loss": 0.4489, + "step": 22736 + }, + { + "epoch": 1.2732108858774778, + "grad_norm": 1.1709439754486084, + "learning_rate": 9.928263157894737e-05, + "loss": 0.3878, + "step": 22737 + }, + { + "epoch": 1.2732668831896068, + "grad_norm": 
4.314883708953857, + "learning_rate": 9.928236842105265e-05, + "loss": 0.5497, + "step": 22738 + }, + { + "epoch": 1.2733228805017358, + "grad_norm": 1.635369062423706, + "learning_rate": 9.928210526315789e-05, + "loss": 0.4089, + "step": 22739 + }, + { + "epoch": 1.2733788778138648, + "grad_norm": 1.622637391090393, + "learning_rate": 9.928184210526317e-05, + "loss": 0.4684, + "step": 22740 + }, + { + "epoch": 1.2734348751259938, + "grad_norm": 1.3720473051071167, + "learning_rate": 9.928157894736843e-05, + "loss": 0.3942, + "step": 22741 + }, + { + "epoch": 1.2734908724381229, + "grad_norm": 1.2264175415039062, + "learning_rate": 9.92813157894737e-05, + "loss": 0.3322, + "step": 22742 + }, + { + "epoch": 1.2735468697502519, + "grad_norm": 1.4567515850067139, + "learning_rate": 9.928105263157895e-05, + "loss": 0.4325, + "step": 22743 + }, + { + "epoch": 1.273602867062381, + "grad_norm": 1.5054537057876587, + "learning_rate": 9.92807894736842e-05, + "loss": 0.4869, + "step": 22744 + }, + { + "epoch": 1.27365886437451, + "grad_norm": 1.3365561962127686, + "learning_rate": 9.928052631578948e-05, + "loss": 0.4938, + "step": 22745 + }, + { + "epoch": 1.273714861686639, + "grad_norm": 1.5146760940551758, + "learning_rate": 9.928026315789474e-05, + "loss": 0.5028, + "step": 22746 + }, + { + "epoch": 1.273770858998768, + "grad_norm": 1.0109838247299194, + "learning_rate": 9.928000000000001e-05, + "loss": 0.3685, + "step": 22747 + }, + { + "epoch": 1.273826856310897, + "grad_norm": 1.3436310291290283, + "learning_rate": 9.927973684210526e-05, + "loss": 0.5354, + "step": 22748 + }, + { + "epoch": 1.273882853623026, + "grad_norm": 1.4745495319366455, + "learning_rate": 9.927947368421053e-05, + "loss": 0.4171, + "step": 22749 + }, + { + "epoch": 1.273938850935155, + "grad_norm": 1.4597543478012085, + "learning_rate": 9.927921052631579e-05, + "loss": 0.3829, + "step": 22750 + }, + { + "epoch": 1.273994848247284, + "grad_norm": 1.4160975217819214, + "learning_rate": 
9.927894736842106e-05, + "loss": 0.4748, + "step": 22751 + }, + { + "epoch": 1.274050845559413, + "grad_norm": 1.3686274290084839, + "learning_rate": 9.927868421052632e-05, + "loss": 0.4098, + "step": 22752 + }, + { + "epoch": 1.274106842871542, + "grad_norm": 1.3850740194320679, + "learning_rate": 9.927842105263158e-05, + "loss": 0.4831, + "step": 22753 + }, + { + "epoch": 1.274162840183671, + "grad_norm": 2.055142402648926, + "learning_rate": 9.927815789473684e-05, + "loss": 0.4452, + "step": 22754 + }, + { + "epoch": 1.2742188374958001, + "grad_norm": 1.2674225568771362, + "learning_rate": 9.927789473684212e-05, + "loss": 0.3733, + "step": 22755 + }, + { + "epoch": 1.2742748348079291, + "grad_norm": 1.5821294784545898, + "learning_rate": 9.927763157894738e-05, + "loss": 0.4717, + "step": 22756 + }, + { + "epoch": 1.2743308321200582, + "grad_norm": 1.2697193622589111, + "learning_rate": 9.927736842105264e-05, + "loss": 0.3312, + "step": 22757 + }, + { + "epoch": 1.2743868294321872, + "grad_norm": 1.1778618097305298, + "learning_rate": 9.92771052631579e-05, + "loss": 0.3977, + "step": 22758 + }, + { + "epoch": 1.2744428267443162, + "grad_norm": 1.5188071727752686, + "learning_rate": 9.927684210526317e-05, + "loss": 0.5024, + "step": 22759 + }, + { + "epoch": 1.2744988240564452, + "grad_norm": 2.8332552909851074, + "learning_rate": 9.927657894736843e-05, + "loss": 0.4515, + "step": 22760 + }, + { + "epoch": 1.2745548213685742, + "grad_norm": 0.9822652339935303, + "learning_rate": 9.927631578947369e-05, + "loss": 0.2588, + "step": 22761 + }, + { + "epoch": 1.2746108186807033, + "grad_norm": 1.1252400875091553, + "learning_rate": 9.927605263157895e-05, + "loss": 0.4655, + "step": 22762 + }, + { + "epoch": 1.2746668159928323, + "grad_norm": 1.4667878150939941, + "learning_rate": 9.927578947368421e-05, + "loss": 0.5684, + "step": 22763 + }, + { + "epoch": 1.2747228133049613, + "grad_norm": 1.3001575469970703, + "learning_rate": 9.927552631578948e-05, + "loss": 0.3713, 
+ "step": 22764 + }, + { + "epoch": 1.2747788106170903, + "grad_norm": 1.185944676399231, + "learning_rate": 9.927526315789474e-05, + "loss": 0.3744, + "step": 22765 + }, + { + "epoch": 1.2748348079292193, + "grad_norm": 1.4352209568023682, + "learning_rate": 9.9275e-05, + "loss": 0.5192, + "step": 22766 + }, + { + "epoch": 1.2748908052413483, + "grad_norm": 1.1593495607376099, + "learning_rate": 9.927473684210526e-05, + "loss": 0.3618, + "step": 22767 + }, + { + "epoch": 1.2749468025534774, + "grad_norm": 1.3951646089553833, + "learning_rate": 9.927447368421053e-05, + "loss": 0.4428, + "step": 22768 + }, + { + "epoch": 1.2750027998656064, + "grad_norm": 1.2441003322601318, + "learning_rate": 9.92742105263158e-05, + "loss": 0.37, + "step": 22769 + }, + { + "epoch": 1.2750587971777354, + "grad_norm": 1.2693815231323242, + "learning_rate": 9.927394736842107e-05, + "loss": 0.3531, + "step": 22770 + }, + { + "epoch": 1.2751147944898644, + "grad_norm": 1.3525625467300415, + "learning_rate": 9.927368421052631e-05, + "loss": 0.3369, + "step": 22771 + }, + { + "epoch": 1.2751707918019934, + "grad_norm": 1.563995122909546, + "learning_rate": 9.927342105263159e-05, + "loss": 0.4396, + "step": 22772 + }, + { + "epoch": 1.2752267891141225, + "grad_norm": 1.6154165267944336, + "learning_rate": 9.927315789473685e-05, + "loss": 0.4805, + "step": 22773 + }, + { + "epoch": 1.2752827864262515, + "grad_norm": 1.5421653985977173, + "learning_rate": 9.927289473684212e-05, + "loss": 0.4318, + "step": 22774 + }, + { + "epoch": 1.2753387837383805, + "grad_norm": 1.5196174383163452, + "learning_rate": 9.927263157894738e-05, + "loss": 0.4372, + "step": 22775 + }, + { + "epoch": 1.2753947810505095, + "grad_norm": 1.498060703277588, + "learning_rate": 9.927236842105264e-05, + "loss": 0.5817, + "step": 22776 + }, + { + "epoch": 1.2754507783626385, + "grad_norm": 1.1516247987747192, + "learning_rate": 9.92721052631579e-05, + "loss": 0.4153, + "step": 22777 + }, + { + "epoch": 
1.2755067756747676, + "grad_norm": 1.5450985431671143, + "learning_rate": 9.927184210526316e-05, + "loss": 0.3987, + "step": 22778 + }, + { + "epoch": 1.2755627729868966, + "grad_norm": 1.2992115020751953, + "learning_rate": 9.927157894736843e-05, + "loss": 0.4102, + "step": 22779 + }, + { + "epoch": 1.2756187702990256, + "grad_norm": 1.2836987972259521, + "learning_rate": 9.927131578947369e-05, + "loss": 0.4377, + "step": 22780 + }, + { + "epoch": 1.2756747676111546, + "grad_norm": 1.2355024814605713, + "learning_rate": 9.927105263157895e-05, + "loss": 0.3656, + "step": 22781 + }, + { + "epoch": 1.2757307649232836, + "grad_norm": 1.5421550273895264, + "learning_rate": 9.927078947368421e-05, + "loss": 0.395, + "step": 22782 + }, + { + "epoch": 1.2757867622354127, + "grad_norm": 1.3215744495391846, + "learning_rate": 9.927052631578948e-05, + "loss": 0.3083, + "step": 22783 + }, + { + "epoch": 1.2758427595475417, + "grad_norm": 1.0425972938537598, + "learning_rate": 9.927026315789474e-05, + "loss": 0.3134, + "step": 22784 + }, + { + "epoch": 1.2758987568596707, + "grad_norm": 1.2993683815002441, + "learning_rate": 9.927e-05, + "loss": 0.4841, + "step": 22785 + }, + { + "epoch": 1.2759547541717997, + "grad_norm": 1.6722605228424072, + "learning_rate": 9.926973684210526e-05, + "loss": 0.6632, + "step": 22786 + }, + { + "epoch": 1.2760107514839287, + "grad_norm": 1.634016513824463, + "learning_rate": 9.926947368421054e-05, + "loss": 0.4314, + "step": 22787 + }, + { + "epoch": 1.2760667487960577, + "grad_norm": 1.4416072368621826, + "learning_rate": 9.92692105263158e-05, + "loss": 0.4843, + "step": 22788 + }, + { + "epoch": 1.2761227461081868, + "grad_norm": 1.86212158203125, + "learning_rate": 9.926894736842106e-05, + "loss": 0.3136, + "step": 22789 + }, + { + "epoch": 1.2761787434203158, + "grad_norm": 1.419284701347351, + "learning_rate": 9.926868421052632e-05, + "loss": 0.4174, + "step": 22790 + }, + { + "epoch": 1.2762347407324448, + "grad_norm": 1.5763509273529053, 
+ "learning_rate": 9.926842105263159e-05, + "loss": 0.4896, + "step": 22791 + }, + { + "epoch": 1.2762907380445738, + "grad_norm": 1.4166450500488281, + "learning_rate": 9.926815789473685e-05, + "loss": 0.5477, + "step": 22792 + }, + { + "epoch": 1.2763467353567028, + "grad_norm": 1.4179977178573608, + "learning_rate": 9.926789473684212e-05, + "loss": 0.5523, + "step": 22793 + }, + { + "epoch": 1.2764027326688319, + "grad_norm": 1.2980576753616333, + "learning_rate": 9.926763157894737e-05, + "loss": 0.4405, + "step": 22794 + }, + { + "epoch": 1.2764587299809609, + "grad_norm": 1.8971774578094482, + "learning_rate": 9.926736842105263e-05, + "loss": 0.3989, + "step": 22795 + }, + { + "epoch": 1.27651472729309, + "grad_norm": 1.6122095584869385, + "learning_rate": 9.92671052631579e-05, + "loss": 0.4271, + "step": 22796 + }, + { + "epoch": 1.276570724605219, + "grad_norm": 1.5946292877197266, + "learning_rate": 9.926684210526316e-05, + "loss": 0.6605, + "step": 22797 + }, + { + "epoch": 1.276626721917348, + "grad_norm": 1.4192845821380615, + "learning_rate": 9.926657894736842e-05, + "loss": 0.4882, + "step": 22798 + }, + { + "epoch": 1.276682719229477, + "grad_norm": 1.4047622680664062, + "learning_rate": 9.926631578947368e-05, + "loss": 0.5338, + "step": 22799 + }, + { + "epoch": 1.276738716541606, + "grad_norm": 2.6966567039489746, + "learning_rate": 9.926605263157895e-05, + "loss": 0.4442, + "step": 22800 + }, + { + "epoch": 1.276794713853735, + "grad_norm": 1.3931602239608765, + "learning_rate": 9.926578947368421e-05, + "loss": 0.3877, + "step": 22801 + }, + { + "epoch": 1.276850711165864, + "grad_norm": 1.3101481199264526, + "learning_rate": 9.926552631578949e-05, + "loss": 0.426, + "step": 22802 + }, + { + "epoch": 1.276906708477993, + "grad_norm": 1.3449358940124512, + "learning_rate": 9.926526315789473e-05, + "loss": 0.3264, + "step": 22803 + }, + { + "epoch": 1.276962705790122, + "grad_norm": 1.4406251907348633, + "learning_rate": 9.9265e-05, + "loss": 0.4213, 
+ "step": 22804 + }, + { + "epoch": 1.277018703102251, + "grad_norm": 1.2217750549316406, + "learning_rate": 9.926473684210527e-05, + "loss": 0.527, + "step": 22805 + }, + { + "epoch": 1.27707470041438, + "grad_norm": 1.2291377782821655, + "learning_rate": 9.926447368421054e-05, + "loss": 0.5167, + "step": 22806 + }, + { + "epoch": 1.2771306977265091, + "grad_norm": 1.5686665773391724, + "learning_rate": 9.92642105263158e-05, + "loss": 0.4669, + "step": 22807 + }, + { + "epoch": 1.2771866950386381, + "grad_norm": 1.1758569478988647, + "learning_rate": 9.926394736842106e-05, + "loss": 0.3745, + "step": 22808 + }, + { + "epoch": 1.2772426923507672, + "grad_norm": 1.3279191255569458, + "learning_rate": 9.926368421052632e-05, + "loss": 0.5415, + "step": 22809 + }, + { + "epoch": 1.2772986896628962, + "grad_norm": 1.224311351776123, + "learning_rate": 9.926342105263159e-05, + "loss": 0.4282, + "step": 22810 + }, + { + "epoch": 1.2773546869750252, + "grad_norm": 1.3506956100463867, + "learning_rate": 9.926315789473685e-05, + "loss": 0.4217, + "step": 22811 + }, + { + "epoch": 1.2774106842871542, + "grad_norm": 1.8883819580078125, + "learning_rate": 9.926289473684211e-05, + "loss": 0.4152, + "step": 22812 + }, + { + "epoch": 1.2774666815992832, + "grad_norm": 1.3244603872299194, + "learning_rate": 9.926263157894737e-05, + "loss": 0.3734, + "step": 22813 + }, + { + "epoch": 1.2775226789114122, + "grad_norm": 1.224166989326477, + "learning_rate": 9.926236842105263e-05, + "loss": 0.3873, + "step": 22814 + }, + { + "epoch": 1.2775786762235413, + "grad_norm": 1.5324288606643677, + "learning_rate": 9.92621052631579e-05, + "loss": 0.5001, + "step": 22815 + }, + { + "epoch": 1.2776346735356703, + "grad_norm": 1.5295687913894653, + "learning_rate": 9.926184210526316e-05, + "loss": 0.4734, + "step": 22816 + }, + { + "epoch": 1.2776906708477993, + "grad_norm": 1.729920744895935, + "learning_rate": 9.926157894736842e-05, + "loss": 0.5655, + "step": 22817 + }, + { + "epoch": 
1.2777466681599283, + "grad_norm": 1.1620714664459229, + "learning_rate": 9.926131578947368e-05, + "loss": 0.3914, + "step": 22818 + }, + { + "epoch": 1.2778026654720573, + "grad_norm": 1.2352561950683594, + "learning_rate": 9.926105263157896e-05, + "loss": 0.4323, + "step": 22819 + }, + { + "epoch": 1.2778586627841864, + "grad_norm": 1.185273289680481, + "learning_rate": 9.926078947368422e-05, + "loss": 0.3784, + "step": 22820 + }, + { + "epoch": 1.2779146600963154, + "grad_norm": 1.7075848579406738, + "learning_rate": 9.926052631578948e-05, + "loss": 0.4938, + "step": 22821 + }, + { + "epoch": 1.2779706574084444, + "grad_norm": 1.936911940574646, + "learning_rate": 9.926026315789474e-05, + "loss": 0.5181, + "step": 22822 + }, + { + "epoch": 1.2780266547205734, + "grad_norm": 1.4712456464767456, + "learning_rate": 9.926000000000001e-05, + "loss": 0.482, + "step": 22823 + }, + { + "epoch": 1.2780826520327024, + "grad_norm": 1.2620044946670532, + "learning_rate": 9.925973684210527e-05, + "loss": 0.3792, + "step": 22824 + }, + { + "epoch": 1.2781386493448315, + "grad_norm": 1.149962067604065, + "learning_rate": 9.925947368421054e-05, + "loss": 0.413, + "step": 22825 + }, + { + "epoch": 1.2781946466569605, + "grad_norm": 1.4623929262161255, + "learning_rate": 9.925921052631579e-05, + "loss": 0.4461, + "step": 22826 + }, + { + "epoch": 1.2782506439690895, + "grad_norm": 1.2333474159240723, + "learning_rate": 9.925894736842106e-05, + "loss": 0.5105, + "step": 22827 + }, + { + "epoch": 1.2783066412812185, + "grad_norm": 1.1953965425491333, + "learning_rate": 9.925868421052632e-05, + "loss": 0.3357, + "step": 22828 + }, + { + "epoch": 1.2783626385933475, + "grad_norm": 1.5779125690460205, + "learning_rate": 9.925842105263158e-05, + "loss": 0.4918, + "step": 22829 + }, + { + "epoch": 1.2784186359054766, + "grad_norm": 1.3370437622070312, + "learning_rate": 9.925815789473685e-05, + "loss": 0.441, + "step": 22830 + }, + { + "epoch": 1.2784746332176056, + "grad_norm": 
1.671709418296814, + "learning_rate": 9.92578947368421e-05, + "loss": 0.5659, + "step": 22831 + }, + { + "epoch": 1.2785306305297346, + "grad_norm": 1.4015928506851196, + "learning_rate": 9.925763157894737e-05, + "loss": 0.3417, + "step": 22832 + }, + { + "epoch": 1.2785866278418636, + "grad_norm": 1.3909051418304443, + "learning_rate": 9.925736842105263e-05, + "loss": 0.4817, + "step": 22833 + }, + { + "epoch": 1.2786426251539926, + "grad_norm": 1.496991515159607, + "learning_rate": 9.92571052631579e-05, + "loss": 0.4239, + "step": 22834 + }, + { + "epoch": 1.2786986224661216, + "grad_norm": 1.3109850883483887, + "learning_rate": 9.925684210526317e-05, + "loss": 0.4174, + "step": 22835 + }, + { + "epoch": 1.2787546197782507, + "grad_norm": 1.5020232200622559, + "learning_rate": 9.925657894736843e-05, + "loss": 0.3756, + "step": 22836 + }, + { + "epoch": 1.2788106170903797, + "grad_norm": 1.3662246465682983, + "learning_rate": 9.925631578947369e-05, + "loss": 0.3375, + "step": 22837 + }, + { + "epoch": 1.2788666144025087, + "grad_norm": 1.225379228591919, + "learning_rate": 9.925605263157896e-05, + "loss": 0.3597, + "step": 22838 + }, + { + "epoch": 1.2789226117146377, + "grad_norm": 1.8974720239639282, + "learning_rate": 9.925578947368422e-05, + "loss": 0.455, + "step": 22839 + }, + { + "epoch": 1.2789786090267667, + "grad_norm": 1.445088267326355, + "learning_rate": 9.925552631578948e-05, + "loss": 0.3239, + "step": 22840 + }, + { + "epoch": 1.2790346063388958, + "grad_norm": 1.2902582883834839, + "learning_rate": 9.925526315789474e-05, + "loss": 0.595, + "step": 22841 + }, + { + "epoch": 1.2790906036510248, + "grad_norm": 1.2275384664535522, + "learning_rate": 9.925500000000001e-05, + "loss": 0.3344, + "step": 22842 + }, + { + "epoch": 1.2791466009631538, + "grad_norm": 1.4865615367889404, + "learning_rate": 9.925473684210527e-05, + "loss": 0.5262, + "step": 22843 + }, + { + "epoch": 1.2792025982752828, + "grad_norm": 1.4789400100708008, + "learning_rate": 
9.925447368421053e-05, + "loss": 0.4692, + "step": 22844 + }, + { + "epoch": 1.2792585955874118, + "grad_norm": 1.9096077680587769, + "learning_rate": 9.925421052631579e-05, + "loss": 0.4197, + "step": 22845 + }, + { + "epoch": 1.2793145928995409, + "grad_norm": 9.500982284545898, + "learning_rate": 9.925394736842105e-05, + "loss": 0.3817, + "step": 22846 + }, + { + "epoch": 1.2793705902116699, + "grad_norm": 1.4734511375427246, + "learning_rate": 9.925368421052632e-05, + "loss": 0.4654, + "step": 22847 + }, + { + "epoch": 1.279426587523799, + "grad_norm": 1.6119033098220825, + "learning_rate": 9.925342105263158e-05, + "loss": 0.4628, + "step": 22848 + }, + { + "epoch": 1.279482584835928, + "grad_norm": 1.4294317960739136, + "learning_rate": 9.925315789473684e-05, + "loss": 0.4772, + "step": 22849 + }, + { + "epoch": 1.279538582148057, + "grad_norm": 1.3593608140945435, + "learning_rate": 9.92528947368421e-05, + "loss": 0.478, + "step": 22850 + }, + { + "epoch": 1.279594579460186, + "grad_norm": 1.2838557958602905, + "learning_rate": 9.925263157894738e-05, + "loss": 0.4266, + "step": 22851 + }, + { + "epoch": 1.279650576772315, + "grad_norm": 1.3299751281738281, + "learning_rate": 9.925236842105264e-05, + "loss": 0.4981, + "step": 22852 + }, + { + "epoch": 1.279706574084444, + "grad_norm": 1.2452366352081299, + "learning_rate": 9.92521052631579e-05, + "loss": 0.5585, + "step": 22853 + }, + { + "epoch": 1.279762571396573, + "grad_norm": 1.288326621055603, + "learning_rate": 9.925184210526315e-05, + "loss": 0.4661, + "step": 22854 + }, + { + "epoch": 1.279818568708702, + "grad_norm": 1.3746986389160156, + "learning_rate": 9.925157894736843e-05, + "loss": 0.4094, + "step": 22855 + }, + { + "epoch": 1.279874566020831, + "grad_norm": 1.115032434463501, + "learning_rate": 9.925131578947369e-05, + "loss": 0.4007, + "step": 22856 + }, + { + "epoch": 1.27993056333296, + "grad_norm": 1.2488641738891602, + "learning_rate": 9.925105263157896e-05, + "loss": 0.4994, + "step": 
22857 + }, + { + "epoch": 1.279986560645089, + "grad_norm": 1.425959825515747, + "learning_rate": 9.925078947368421e-05, + "loss": 0.4653, + "step": 22858 + }, + { + "epoch": 1.280042557957218, + "grad_norm": 1.4689654111862183, + "learning_rate": 9.925052631578948e-05, + "loss": 0.5289, + "step": 22859 + }, + { + "epoch": 1.2800985552693471, + "grad_norm": 1.4871768951416016, + "learning_rate": 9.925026315789474e-05, + "loss": 0.4693, + "step": 22860 + }, + { + "epoch": 1.2801545525814761, + "grad_norm": 1.5057939291000366, + "learning_rate": 9.925000000000001e-05, + "loss": 0.5627, + "step": 22861 + }, + { + "epoch": 1.2802105498936052, + "grad_norm": 1.2794889211654663, + "learning_rate": 9.924973684210527e-05, + "loss": 0.4061, + "step": 22862 + }, + { + "epoch": 1.2802665472057342, + "grad_norm": 1.3119163513183594, + "learning_rate": 9.924947368421053e-05, + "loss": 0.3685, + "step": 22863 + }, + { + "epoch": 1.2803225445178632, + "grad_norm": 1.499916672706604, + "learning_rate": 9.924921052631579e-05, + "loss": 0.4594, + "step": 22864 + }, + { + "epoch": 1.2803785418299922, + "grad_norm": 1.4060181379318237, + "learning_rate": 9.924894736842105e-05, + "loss": 0.3679, + "step": 22865 + }, + { + "epoch": 1.2804345391421212, + "grad_norm": 1.4560726881027222, + "learning_rate": 9.924868421052633e-05, + "loss": 0.4411, + "step": 22866 + }, + { + "epoch": 1.2804905364542503, + "grad_norm": 1.4118001461029053, + "learning_rate": 9.924842105263159e-05, + "loss": 0.4929, + "step": 22867 + }, + { + "epoch": 1.2805465337663793, + "grad_norm": 1.429483413696289, + "learning_rate": 9.924815789473685e-05, + "loss": 0.6687, + "step": 22868 + }, + { + "epoch": 1.2806025310785083, + "grad_norm": 1.3398168087005615, + "learning_rate": 9.92478947368421e-05, + "loss": 0.4995, + "step": 22869 + }, + { + "epoch": 1.2806585283906373, + "grad_norm": 1.3888211250305176, + "learning_rate": 9.924763157894738e-05, + "loss": 0.4382, + "step": 22870 + }, + { + "epoch": 
1.2807145257027663, + "grad_norm": 1.420980453491211, + "learning_rate": 9.924736842105264e-05, + "loss": 0.5059, + "step": 22871 + }, + { + "epoch": 1.2807705230148954, + "grad_norm": 1.3897972106933594, + "learning_rate": 9.92471052631579e-05, + "loss": 0.5511, + "step": 22872 + }, + { + "epoch": 1.2808265203270244, + "grad_norm": 1.4518322944641113, + "learning_rate": 9.924684210526316e-05, + "loss": 0.5084, + "step": 22873 + }, + { + "epoch": 1.2808825176391534, + "grad_norm": 1.3902095556259155, + "learning_rate": 9.924657894736843e-05, + "loss": 0.7499, + "step": 22874 + }, + { + "epoch": 1.2809385149512824, + "grad_norm": 1.311393141746521, + "learning_rate": 9.924631578947369e-05, + "loss": 0.4051, + "step": 22875 + }, + { + "epoch": 1.2809945122634114, + "grad_norm": 1.9083179235458374, + "learning_rate": 9.924605263157895e-05, + "loss": 0.4966, + "step": 22876 + }, + { + "epoch": 1.2810505095755405, + "grad_norm": 1.2876859903335571, + "learning_rate": 9.924578947368421e-05, + "loss": 0.4092, + "step": 22877 + }, + { + "epoch": 1.2811065068876695, + "grad_norm": 1.4630753993988037, + "learning_rate": 9.924552631578948e-05, + "loss": 0.5745, + "step": 22878 + }, + { + "epoch": 1.2811625041997985, + "grad_norm": 1.289911150932312, + "learning_rate": 9.924526315789474e-05, + "loss": 0.4048, + "step": 22879 + }, + { + "epoch": 1.2812185015119275, + "grad_norm": 1.2508763074874878, + "learning_rate": 9.924500000000002e-05, + "loss": 0.4841, + "step": 22880 + }, + { + "epoch": 1.2812744988240565, + "grad_norm": 1.3895246982574463, + "learning_rate": 9.924473684210526e-05, + "loss": 0.4195, + "step": 22881 + }, + { + "epoch": 1.2813304961361855, + "grad_norm": 1.732861876487732, + "learning_rate": 9.924447368421052e-05, + "loss": 0.4791, + "step": 22882 + }, + { + "epoch": 1.2813864934483146, + "grad_norm": 1.433110237121582, + "learning_rate": 9.92442105263158e-05, + "loss": 0.4858, + "step": 22883 + }, + { + "epoch": 1.2814424907604436, + "grad_norm": 
1.2530598640441895, + "learning_rate": 9.924394736842106e-05, + "loss": 0.4106, + "step": 22884 + }, + { + "epoch": 1.2814984880725726, + "grad_norm": 1.45210599899292, + "learning_rate": 9.924368421052633e-05, + "loss": 0.4895, + "step": 22885 + }, + { + "epoch": 1.2815544853847016, + "grad_norm": 1.2971900701522827, + "learning_rate": 9.924342105263157e-05, + "loss": 0.4203, + "step": 22886 + }, + { + "epoch": 1.2816104826968306, + "grad_norm": 1.337934136390686, + "learning_rate": 9.924315789473685e-05, + "loss": 0.438, + "step": 22887 + }, + { + "epoch": 1.2816664800089597, + "grad_norm": 1.5100983381271362, + "learning_rate": 9.924289473684211e-05, + "loss": 0.4976, + "step": 22888 + }, + { + "epoch": 1.2817224773210887, + "grad_norm": 1.305716872215271, + "learning_rate": 9.924263157894738e-05, + "loss": 0.5341, + "step": 22889 + }, + { + "epoch": 1.2817784746332177, + "grad_norm": 1.6484627723693848, + "learning_rate": 9.924236842105264e-05, + "loss": 0.5838, + "step": 22890 + }, + { + "epoch": 1.2818344719453467, + "grad_norm": 1.4973548650741577, + "learning_rate": 9.92421052631579e-05, + "loss": 0.4746, + "step": 22891 + }, + { + "epoch": 1.2818904692574757, + "grad_norm": 1.2990798950195312, + "learning_rate": 9.924184210526316e-05, + "loss": 0.3971, + "step": 22892 + }, + { + "epoch": 1.2819464665696048, + "grad_norm": 1.5169073343276978, + "learning_rate": 9.924157894736843e-05, + "loss": 0.5484, + "step": 22893 + }, + { + "epoch": 1.2820024638817338, + "grad_norm": 1.4381048679351807, + "learning_rate": 9.924131578947369e-05, + "loss": 0.4447, + "step": 22894 + }, + { + "epoch": 1.2820584611938628, + "grad_norm": 1.9256312847137451, + "learning_rate": 9.924105263157895e-05, + "loss": 0.3603, + "step": 22895 + }, + { + "epoch": 1.2821144585059918, + "grad_norm": 1.35012686252594, + "learning_rate": 9.924078947368421e-05, + "loss": 0.3953, + "step": 22896 + }, + { + "epoch": 1.2821704558181208, + "grad_norm": 1.3643207550048828, + "learning_rate": 
9.924052631578949e-05, + "loss": 0.4825, + "step": 22897 + }, + { + "epoch": 1.2822264531302499, + "grad_norm": 1.6131207942962646, + "learning_rate": 9.924026315789475e-05, + "loss": 0.5158, + "step": 22898 + }, + { + "epoch": 1.2822824504423789, + "grad_norm": 1.4574543237686157, + "learning_rate": 9.924e-05, + "loss": 0.485, + "step": 22899 + }, + { + "epoch": 1.282338447754508, + "grad_norm": 1.2956045866012573, + "learning_rate": 9.923973684210527e-05, + "loss": 0.4301, + "step": 22900 + }, + { + "epoch": 1.282394445066637, + "grad_norm": 1.2448629140853882, + "learning_rate": 9.923947368421052e-05, + "loss": 0.4139, + "step": 22901 + }, + { + "epoch": 1.282450442378766, + "grad_norm": 1.3458913564682007, + "learning_rate": 9.92392105263158e-05, + "loss": 0.5027, + "step": 22902 + }, + { + "epoch": 1.282506439690895, + "grad_norm": 1.2780959606170654, + "learning_rate": 9.923894736842106e-05, + "loss": 0.348, + "step": 22903 + }, + { + "epoch": 1.282562437003024, + "grad_norm": 1.369726300239563, + "learning_rate": 9.923868421052632e-05, + "loss": 0.4936, + "step": 22904 + }, + { + "epoch": 1.282618434315153, + "grad_norm": 1.415688395500183, + "learning_rate": 9.923842105263158e-05, + "loss": 0.4766, + "step": 22905 + }, + { + "epoch": 1.282674431627282, + "grad_norm": 1.3793411254882812, + "learning_rate": 9.923815789473685e-05, + "loss": 0.419, + "step": 22906 + }, + { + "epoch": 1.282730428939411, + "grad_norm": 1.4151756763458252, + "learning_rate": 9.923789473684211e-05, + "loss": 0.5377, + "step": 22907 + }, + { + "epoch": 1.28278642625154, + "grad_norm": 1.4490402936935425, + "learning_rate": 9.923763157894737e-05, + "loss": 0.4964, + "step": 22908 + }, + { + "epoch": 1.282842423563669, + "grad_norm": 1.4372507333755493, + "learning_rate": 9.923736842105263e-05, + "loss": 0.455, + "step": 22909 + }, + { + "epoch": 1.282898420875798, + "grad_norm": 1.2414966821670532, + "learning_rate": 9.92371052631579e-05, + "loss": 0.438, + "step": 22910 + }, + { + 
"epoch": 1.282954418187927, + "grad_norm": 1.3037598133087158, + "learning_rate": 9.923684210526316e-05, + "loss": 0.4289, + "step": 22911 + }, + { + "epoch": 1.2830104155000561, + "grad_norm": 2.441774368286133, + "learning_rate": 9.923657894736844e-05, + "loss": 0.486, + "step": 22912 + }, + { + "epoch": 1.2830664128121851, + "grad_norm": 1.678240418434143, + "learning_rate": 9.923631578947368e-05, + "loss": 0.4303, + "step": 22913 + }, + { + "epoch": 1.2831224101243142, + "grad_norm": 1.2907674312591553, + "learning_rate": 9.923605263157896e-05, + "loss": 0.4251, + "step": 22914 + }, + { + "epoch": 1.2831784074364432, + "grad_norm": 1.5444412231445312, + "learning_rate": 9.923578947368422e-05, + "loss": 0.5689, + "step": 22915 + }, + { + "epoch": 1.2832344047485722, + "grad_norm": 1.678706169128418, + "learning_rate": 9.923552631578947e-05, + "loss": 0.5201, + "step": 22916 + }, + { + "epoch": 1.2832904020607012, + "grad_norm": 1.3194148540496826, + "learning_rate": 9.923526315789475e-05, + "loss": 0.3818, + "step": 22917 + }, + { + "epoch": 1.2833463993728302, + "grad_norm": 1.344393014907837, + "learning_rate": 9.9235e-05, + "loss": 0.4487, + "step": 22918 + }, + { + "epoch": 1.2834023966849593, + "grad_norm": 1.3960899114608765, + "learning_rate": 9.923473684210527e-05, + "loss": 0.4525, + "step": 22919 + }, + { + "epoch": 1.2834583939970883, + "grad_norm": 1.4669640064239502, + "learning_rate": 9.923447368421053e-05, + "loss": 0.3751, + "step": 22920 + }, + { + "epoch": 1.2835143913092173, + "grad_norm": 1.2237739562988281, + "learning_rate": 9.92342105263158e-05, + "loss": 0.4054, + "step": 22921 + }, + { + "epoch": 1.283570388621346, + "grad_norm": 1.3809826374053955, + "learning_rate": 9.923394736842106e-05, + "loss": 0.405, + "step": 22922 + }, + { + "epoch": 1.2836263859334751, + "grad_norm": 1.3056340217590332, + "learning_rate": 9.923368421052632e-05, + "loss": 0.4087, + "step": 22923 + }, + { + "epoch": 1.2836823832456041, + "grad_norm": 
1.3003950119018555, + "learning_rate": 9.923342105263158e-05, + "loss": 0.5366, + "step": 22924 + }, + { + "epoch": 1.2837383805577331, + "grad_norm": 1.3883391618728638, + "learning_rate": 9.923315789473685e-05, + "loss": 0.4015, + "step": 22925 + }, + { + "epoch": 1.2837943778698622, + "grad_norm": 1.3687617778778076, + "learning_rate": 9.923289473684211e-05, + "loss": 0.5419, + "step": 22926 + }, + { + "epoch": 1.2838503751819912, + "grad_norm": 1.247031569480896, + "learning_rate": 9.923263157894737e-05, + "loss": 0.4375, + "step": 22927 + }, + { + "epoch": 1.2839063724941202, + "grad_norm": 1.4073357582092285, + "learning_rate": 9.923236842105263e-05, + "loss": 0.4831, + "step": 22928 + }, + { + "epoch": 1.2839623698062492, + "grad_norm": 1.2339050769805908, + "learning_rate": 9.92321052631579e-05, + "loss": 0.4407, + "step": 22929 + }, + { + "epoch": 1.2840183671183782, + "grad_norm": 1.2139928340911865, + "learning_rate": 9.923184210526317e-05, + "loss": 0.4936, + "step": 22930 + }, + { + "epoch": 1.2840743644305073, + "grad_norm": 1.5380933284759521, + "learning_rate": 9.923157894736842e-05, + "loss": 0.4634, + "step": 22931 + }, + { + "epoch": 1.2841303617426363, + "grad_norm": 1.1507809162139893, + "learning_rate": 9.923131578947368e-05, + "loss": 0.434, + "step": 22932 + }, + { + "epoch": 1.2841863590547653, + "grad_norm": 1.1982436180114746, + "learning_rate": 9.923105263157894e-05, + "loss": 0.3755, + "step": 22933 + }, + { + "epoch": 1.2842423563668943, + "grad_norm": 1.8436267375946045, + "learning_rate": 9.923078947368422e-05, + "loss": 0.4652, + "step": 22934 + }, + { + "epoch": 1.2842983536790233, + "grad_norm": 2.4296042919158936, + "learning_rate": 9.923052631578948e-05, + "loss": 0.7161, + "step": 22935 + }, + { + "epoch": 1.2843543509911524, + "grad_norm": 1.224182367324829, + "learning_rate": 9.923026315789474e-05, + "loss": 0.5098, + "step": 22936 + }, + { + "epoch": 1.2844103483032814, + "grad_norm": 1.4148139953613281, + "learning_rate": 
9.923e-05, + "loss": 0.4536, + "step": 22937 + }, + { + "epoch": 1.2844663456154104, + "grad_norm": 1.2621914148330688, + "learning_rate": 9.922973684210527e-05, + "loss": 0.4855, + "step": 22938 + }, + { + "epoch": 1.2845223429275394, + "grad_norm": 1.5679540634155273, + "learning_rate": 9.922947368421053e-05, + "loss": 0.5845, + "step": 22939 + }, + { + "epoch": 1.2845783402396684, + "grad_norm": 1.6275293827056885, + "learning_rate": 9.92292105263158e-05, + "loss": 0.4481, + "step": 22940 + }, + { + "epoch": 1.2846343375517975, + "grad_norm": 1.2808688879013062, + "learning_rate": 9.922894736842105e-05, + "loss": 0.4123, + "step": 22941 + }, + { + "epoch": 1.2846903348639265, + "grad_norm": 1.4030125141143799, + "learning_rate": 9.922868421052632e-05, + "loss": 0.5084, + "step": 22942 + }, + { + "epoch": 1.2847463321760555, + "grad_norm": 1.191109299659729, + "learning_rate": 9.922842105263158e-05, + "loss": 0.4447, + "step": 22943 + }, + { + "epoch": 1.2848023294881845, + "grad_norm": 1.2105380296707153, + "learning_rate": 9.922815789473686e-05, + "loss": 0.4081, + "step": 22944 + }, + { + "epoch": 1.2848583268003135, + "grad_norm": 1.7374064922332764, + "learning_rate": 9.92278947368421e-05, + "loss": 0.4894, + "step": 22945 + }, + { + "epoch": 1.2849143241124426, + "grad_norm": 1.5357162952423096, + "learning_rate": 9.922763157894738e-05, + "loss": 0.4604, + "step": 22946 + }, + { + "epoch": 1.2849703214245716, + "grad_norm": 1.4711189270019531, + "learning_rate": 9.922736842105263e-05, + "loss": 0.523, + "step": 22947 + }, + { + "epoch": 1.2850263187367006, + "grad_norm": 1.3166229724884033, + "learning_rate": 9.922710526315791e-05, + "loss": 0.4023, + "step": 22948 + }, + { + "epoch": 1.2850823160488296, + "grad_norm": 1.3460315465927124, + "learning_rate": 9.922684210526317e-05, + "loss": 0.4786, + "step": 22949 + }, + { + "epoch": 1.2851383133609586, + "grad_norm": 1.4328230619430542, + "learning_rate": 9.922657894736841e-05, + "loss": 0.6095, + "step": 
22950 + }, + { + "epoch": 1.2851943106730876, + "grad_norm": 1.4445209503173828, + "learning_rate": 9.922631578947369e-05, + "loss": 0.4819, + "step": 22951 + }, + { + "epoch": 1.2852503079852167, + "grad_norm": 1.5904972553253174, + "learning_rate": 9.922605263157895e-05, + "loss": 0.4077, + "step": 22952 + }, + { + "epoch": 1.2853063052973457, + "grad_norm": 1.2449522018432617, + "learning_rate": 9.922578947368422e-05, + "loss": 0.4679, + "step": 22953 + }, + { + "epoch": 1.2853623026094747, + "grad_norm": 1.2776936292648315, + "learning_rate": 9.922552631578948e-05, + "loss": 0.4167, + "step": 22954 + }, + { + "epoch": 1.2854182999216037, + "grad_norm": 5.033515930175781, + "learning_rate": 9.922526315789474e-05, + "loss": 0.4469, + "step": 22955 + }, + { + "epoch": 1.2854742972337327, + "grad_norm": 1.3768417835235596, + "learning_rate": 9.9225e-05, + "loss": 0.4248, + "step": 22956 + }, + { + "epoch": 1.2855302945458618, + "grad_norm": 1.1071889400482178, + "learning_rate": 9.922473684210527e-05, + "loss": 0.3615, + "step": 22957 + }, + { + "epoch": 1.2855862918579908, + "grad_norm": 1.2394598722457886, + "learning_rate": 9.922447368421053e-05, + "loss": 0.4888, + "step": 22958 + }, + { + "epoch": 1.2856422891701198, + "grad_norm": 1.2708501815795898, + "learning_rate": 9.922421052631579e-05, + "loss": 0.4206, + "step": 22959 + }, + { + "epoch": 1.2856982864822488, + "grad_norm": 1.3693835735321045, + "learning_rate": 9.922394736842105e-05, + "loss": 0.4744, + "step": 22960 + }, + { + "epoch": 1.2857542837943778, + "grad_norm": 1.2162554264068604, + "learning_rate": 9.922368421052633e-05, + "loss": 0.317, + "step": 22961 + }, + { + "epoch": 1.2858102811065069, + "grad_norm": 1.262914776802063, + "learning_rate": 9.922342105263158e-05, + "loss": 0.4, + "step": 22962 + }, + { + "epoch": 1.2858662784186359, + "grad_norm": 1.3579424619674683, + "learning_rate": 9.922315789473684e-05, + "loss": 0.4434, + "step": 22963 + }, + { + "epoch": 1.285922275730765, + 
"grad_norm": 1.2783117294311523, + "learning_rate": 9.92228947368421e-05, + "loss": 0.398, + "step": 22964 + }, + { + "epoch": 1.285978273042894, + "grad_norm": 1.3735289573669434, + "learning_rate": 9.922263157894738e-05, + "loss": 0.4491, + "step": 22965 + }, + { + "epoch": 1.286034270355023, + "grad_norm": 1.716067910194397, + "learning_rate": 9.922236842105264e-05, + "loss": 0.5226, + "step": 22966 + }, + { + "epoch": 1.286090267667152, + "grad_norm": 1.2945685386657715, + "learning_rate": 9.92221052631579e-05, + "loss": 0.4089, + "step": 22967 + }, + { + "epoch": 1.286146264979281, + "grad_norm": 1.5746315717697144, + "learning_rate": 9.922184210526316e-05, + "loss": 0.4545, + "step": 22968 + }, + { + "epoch": 1.28620226229141, + "grad_norm": 1.5032798051834106, + "learning_rate": 9.922157894736842e-05, + "loss": 0.4913, + "step": 22969 + }, + { + "epoch": 1.286258259603539, + "grad_norm": 1.5064482688903809, + "learning_rate": 9.922131578947369e-05, + "loss": 0.4879, + "step": 22970 + }, + { + "epoch": 1.286314256915668, + "grad_norm": 1.6576777696609497, + "learning_rate": 9.922105263157895e-05, + "loss": 0.445, + "step": 22971 + }, + { + "epoch": 1.286370254227797, + "grad_norm": 1.1466107368469238, + "learning_rate": 9.922078947368422e-05, + "loss": 0.3226, + "step": 22972 + }, + { + "epoch": 1.286426251539926, + "grad_norm": 1.3217601776123047, + "learning_rate": 9.922052631578947e-05, + "loss": 0.4444, + "step": 22973 + }, + { + "epoch": 1.286482248852055, + "grad_norm": 1.2319965362548828, + "learning_rate": 9.922026315789474e-05, + "loss": 0.4418, + "step": 22974 + }, + { + "epoch": 1.286538246164184, + "grad_norm": 1.4229353666305542, + "learning_rate": 9.922e-05, + "loss": 0.52, + "step": 22975 + }, + { + "epoch": 1.2865942434763131, + "grad_norm": 1.6926286220550537, + "learning_rate": 9.921973684210528e-05, + "loss": 0.4617, + "step": 22976 + }, + { + "epoch": 1.2866502407884421, + "grad_norm": 1.4075435400009155, + "learning_rate": 
9.921947368421054e-05, + "loss": 0.4135, + "step": 22977 + }, + { + "epoch": 1.2867062381005712, + "grad_norm": 1.431416630744934, + "learning_rate": 9.92192105263158e-05, + "loss": 0.5034, + "step": 22978 + }, + { + "epoch": 1.2867622354127002, + "grad_norm": 1.2957077026367188, + "learning_rate": 9.921894736842105e-05, + "loss": 0.4524, + "step": 22979 + }, + { + "epoch": 1.2868182327248292, + "grad_norm": 1.4212028980255127, + "learning_rate": 9.921868421052633e-05, + "loss": 0.4904, + "step": 22980 + }, + { + "epoch": 1.2868742300369582, + "grad_norm": 1.1704585552215576, + "learning_rate": 9.921842105263159e-05, + "loss": 0.483, + "step": 22981 + }, + { + "epoch": 1.2869302273490872, + "grad_norm": 1.2489521503448486, + "learning_rate": 9.921815789473685e-05, + "loss": 0.428, + "step": 22982 + }, + { + "epoch": 1.2869862246612163, + "grad_norm": 1.6500221490859985, + "learning_rate": 9.921789473684211e-05, + "loss": 0.5857, + "step": 22983 + }, + { + "epoch": 1.2870422219733453, + "grad_norm": 1.25044584274292, + "learning_rate": 9.921763157894738e-05, + "loss": 0.4966, + "step": 22984 + }, + { + "epoch": 1.2870982192854743, + "grad_norm": 1.3077419996261597, + "learning_rate": 9.921736842105264e-05, + "loss": 0.4304, + "step": 22985 + }, + { + "epoch": 1.2871542165976033, + "grad_norm": 1.6721961498260498, + "learning_rate": 9.92171052631579e-05, + "loss": 0.5613, + "step": 22986 + }, + { + "epoch": 1.2872102139097323, + "grad_norm": 1.2416599988937378, + "learning_rate": 9.921684210526316e-05, + "loss": 0.5033, + "step": 22987 + }, + { + "epoch": 1.2872662112218614, + "grad_norm": 1.3946799039840698, + "learning_rate": 9.921657894736842e-05, + "loss": 0.4012, + "step": 22988 + }, + { + "epoch": 1.2873222085339904, + "grad_norm": 1.3088847398757935, + "learning_rate": 9.921631578947369e-05, + "loss": 0.3774, + "step": 22989 + }, + { + "epoch": 1.2873782058461194, + "grad_norm": 1.2221100330352783, + "learning_rate": 9.921605263157895e-05, + "loss": 0.4021, + 
"step": 22990 + }, + { + "epoch": 1.2874342031582484, + "grad_norm": 1.2882754802703857, + "learning_rate": 9.921578947368421e-05, + "loss": 0.4187, + "step": 22991 + }, + { + "epoch": 1.2874902004703774, + "grad_norm": 1.244855523109436, + "learning_rate": 9.921552631578947e-05, + "loss": 0.4645, + "step": 22992 + }, + { + "epoch": 1.2875461977825065, + "grad_norm": 1.4222966432571411, + "learning_rate": 9.921526315789474e-05, + "loss": 0.4661, + "step": 22993 + }, + { + "epoch": 1.2876021950946355, + "grad_norm": 1.1522328853607178, + "learning_rate": 9.9215e-05, + "loss": 0.35, + "step": 22994 + }, + { + "epoch": 1.2876581924067645, + "grad_norm": 1.5621757507324219, + "learning_rate": 9.921473684210528e-05, + "loss": 0.3748, + "step": 22995 + }, + { + "epoch": 1.2877141897188935, + "grad_norm": 1.5290151834487915, + "learning_rate": 9.921447368421052e-05, + "loss": 0.5444, + "step": 22996 + }, + { + "epoch": 1.2877701870310225, + "grad_norm": 1.228925347328186, + "learning_rate": 9.92142105263158e-05, + "loss": 0.4329, + "step": 22997 + }, + { + "epoch": 1.2878261843431515, + "grad_norm": 1.1519784927368164, + "learning_rate": 9.921394736842106e-05, + "loss": 0.3533, + "step": 22998 + }, + { + "epoch": 1.2878821816552806, + "grad_norm": 1.2597635984420776, + "learning_rate": 9.921368421052633e-05, + "loss": 0.5011, + "step": 22999 + }, + { + "epoch": 1.2879381789674096, + "grad_norm": 1.2440520524978638, + "learning_rate": 9.921342105263158e-05, + "loss": 0.5224, + "step": 23000 + }, + { + "epoch": 1.2879941762795386, + "grad_norm": 1.4682767391204834, + "learning_rate": 9.921315789473685e-05, + "loss": 0.3505, + "step": 23001 + }, + { + "epoch": 1.2880501735916676, + "grad_norm": 1.5257654190063477, + "learning_rate": 9.921289473684211e-05, + "loss": 0.5408, + "step": 23002 + }, + { + "epoch": 1.2881061709037966, + "grad_norm": 1.4215542078018188, + "learning_rate": 9.921263157894737e-05, + "loss": 0.466, + "step": 23003 + }, + { + "epoch": 1.2881621682159257, 
+ "grad_norm": 1.2798796892166138, + "learning_rate": 9.921236842105264e-05, + "loss": 0.4863, + "step": 23004 + }, + { + "epoch": 1.2882181655280547, + "grad_norm": 1.1246417760849, + "learning_rate": 9.921210526315789e-05, + "loss": 0.4249, + "step": 23005 + }, + { + "epoch": 1.2882741628401837, + "grad_norm": 1.7751320600509644, + "learning_rate": 9.921184210526316e-05, + "loss": 0.4136, + "step": 23006 + }, + { + "epoch": 1.2883301601523127, + "grad_norm": 1.4135159254074097, + "learning_rate": 9.921157894736842e-05, + "loss": 0.5479, + "step": 23007 + }, + { + "epoch": 1.2883861574644417, + "grad_norm": 1.7594817876815796, + "learning_rate": 9.92113157894737e-05, + "loss": 0.4854, + "step": 23008 + }, + { + "epoch": 1.2884421547765708, + "grad_norm": 1.517059564590454, + "learning_rate": 9.921105263157895e-05, + "loss": 0.8628, + "step": 23009 + }, + { + "epoch": 1.2884981520886998, + "grad_norm": 1.2950935363769531, + "learning_rate": 9.921078947368421e-05, + "loss": 0.3913, + "step": 23010 + }, + { + "epoch": 1.2885541494008288, + "grad_norm": 1.353660225868225, + "learning_rate": 9.921052631578947e-05, + "loss": 0.4868, + "step": 23011 + }, + { + "epoch": 1.2886101467129578, + "grad_norm": 0.9766411185264587, + "learning_rate": 9.921026315789475e-05, + "loss": 0.407, + "step": 23012 + }, + { + "epoch": 1.2886661440250868, + "grad_norm": 1.3170545101165771, + "learning_rate": 9.921000000000001e-05, + "loss": 0.5691, + "step": 23013 + }, + { + "epoch": 1.2887221413372159, + "grad_norm": 5.148563385009766, + "learning_rate": 9.920973684210527e-05, + "loss": 0.3936, + "step": 23014 + }, + { + "epoch": 1.2887781386493449, + "grad_norm": 1.4028126001358032, + "learning_rate": 9.920947368421053e-05, + "loss": 0.4737, + "step": 23015 + }, + { + "epoch": 1.288834135961474, + "grad_norm": 1.4964874982833862, + "learning_rate": 9.92092105263158e-05, + "loss": 0.5049, + "step": 23016 + }, + { + "epoch": 1.288890133273603, + "grad_norm": 2.170239210128784, + 
"learning_rate": 9.920894736842106e-05, + "loss": 0.4979, + "step": 23017 + }, + { + "epoch": 1.288946130585732, + "grad_norm": 1.45847487449646, + "learning_rate": 9.920868421052632e-05, + "loss": 0.4045, + "step": 23018 + }, + { + "epoch": 1.289002127897861, + "grad_norm": 1.2943224906921387, + "learning_rate": 9.920842105263158e-05, + "loss": 0.3783, + "step": 23019 + }, + { + "epoch": 1.28905812520999, + "grad_norm": 1.3317760229110718, + "learning_rate": 9.920815789473684e-05, + "loss": 0.4034, + "step": 23020 + }, + { + "epoch": 1.289114122522119, + "grad_norm": 1.1763025522232056, + "learning_rate": 9.920789473684211e-05, + "loss": 0.4745, + "step": 23021 + }, + { + "epoch": 1.289170119834248, + "grad_norm": 1.17929208278656, + "learning_rate": 9.920763157894737e-05, + "loss": 0.4017, + "step": 23022 + }, + { + "epoch": 1.289226117146377, + "grad_norm": 1.2791979312896729, + "learning_rate": 9.920736842105263e-05, + "loss": 0.4489, + "step": 23023 + }, + { + "epoch": 1.289282114458506, + "grad_norm": 4.220419406890869, + "learning_rate": 9.920710526315789e-05, + "loss": 0.3953, + "step": 23024 + }, + { + "epoch": 1.289338111770635, + "grad_norm": 1.5760184526443481, + "learning_rate": 9.920684210526316e-05, + "loss": 0.6107, + "step": 23025 + }, + { + "epoch": 1.289394109082764, + "grad_norm": 1.4542875289916992, + "learning_rate": 9.920657894736842e-05, + "loss": 0.5295, + "step": 23026 + }, + { + "epoch": 1.289450106394893, + "grad_norm": 1.0801390409469604, + "learning_rate": 9.92063157894737e-05, + "loss": 0.4279, + "step": 23027 + }, + { + "epoch": 1.2895061037070221, + "grad_norm": 1.3517122268676758, + "learning_rate": 9.920605263157894e-05, + "loss": 0.6221, + "step": 23028 + }, + { + "epoch": 1.2895621010191511, + "grad_norm": 1.0612396001815796, + "learning_rate": 9.920578947368422e-05, + "loss": 0.379, + "step": 23029 + }, + { + "epoch": 1.2896180983312802, + "grad_norm": 1.3833634853363037, + "learning_rate": 9.920552631578948e-05, + "loss": 
0.5444, + "step": 23030 + }, + { + "epoch": 1.2896740956434092, + "grad_norm": 1.5076874494552612, + "learning_rate": 9.920526315789475e-05, + "loss": 0.4223, + "step": 23031 + }, + { + "epoch": 1.2897300929555382, + "grad_norm": 1.2255945205688477, + "learning_rate": 9.920500000000001e-05, + "loss": 0.4352, + "step": 23032 + }, + { + "epoch": 1.2897860902676672, + "grad_norm": 16.56756591796875, + "learning_rate": 9.920473684210527e-05, + "loss": 0.4737, + "step": 23033 + }, + { + "epoch": 1.2898420875797962, + "grad_norm": 1.5384169816970825, + "learning_rate": 9.920447368421053e-05, + "loss": 0.5292, + "step": 23034 + }, + { + "epoch": 1.2898980848919253, + "grad_norm": 1.4383050203323364, + "learning_rate": 9.92042105263158e-05, + "loss": 0.5717, + "step": 23035 + }, + { + "epoch": 1.2899540822040543, + "grad_norm": 1.4422905445098877, + "learning_rate": 9.920394736842106e-05, + "loss": 0.5326, + "step": 23036 + }, + { + "epoch": 1.2900100795161833, + "grad_norm": 1.395366907119751, + "learning_rate": 9.920368421052632e-05, + "loss": 0.5032, + "step": 23037 + }, + { + "epoch": 1.2900660768283123, + "grad_norm": 1.51004159450531, + "learning_rate": 9.920342105263158e-05, + "loss": 0.584, + "step": 23038 + }, + { + "epoch": 1.2901220741404413, + "grad_norm": 12.25399112701416, + "learning_rate": 9.920315789473684e-05, + "loss": 0.4347, + "step": 23039 + }, + { + "epoch": 1.2901780714525704, + "grad_norm": 1.4509413242340088, + "learning_rate": 9.920289473684211e-05, + "loss": 0.4863, + "step": 23040 + }, + { + "epoch": 1.2902340687646994, + "grad_norm": 1.5943634510040283, + "learning_rate": 9.920263157894737e-05, + "loss": 0.47, + "step": 23041 + }, + { + "epoch": 1.2902900660768284, + "grad_norm": 1.487524390220642, + "learning_rate": 9.920236842105263e-05, + "loss": 0.4692, + "step": 23042 + }, + { + "epoch": 1.2903460633889574, + "grad_norm": 1.4222885370254517, + "learning_rate": 9.92021052631579e-05, + "loss": 0.4326, + "step": 23043 + }, + { + "epoch": 
1.2904020607010864, + "grad_norm": 1.3612477779388428, + "learning_rate": 9.920184210526317e-05, + "loss": 0.4119, + "step": 23044 + }, + { + "epoch": 1.2904580580132154, + "grad_norm": 1.4671094417572021, + "learning_rate": 9.920157894736843e-05, + "loss": 0.4623, + "step": 23045 + }, + { + "epoch": 1.2905140553253442, + "grad_norm": 1.35136079788208, + "learning_rate": 9.920131578947369e-05, + "loss": 0.4039, + "step": 23046 + }, + { + "epoch": 1.2905700526374733, + "grad_norm": 1.349650502204895, + "learning_rate": 9.920105263157895e-05, + "loss": 0.4961, + "step": 23047 + }, + { + "epoch": 1.2906260499496023, + "grad_norm": 1.610949158668518, + "learning_rate": 9.920078947368422e-05, + "loss": 0.4722, + "step": 23048 + }, + { + "epoch": 1.2906820472617313, + "grad_norm": 1.4275761842727661, + "learning_rate": 9.920052631578948e-05, + "loss": 0.4182, + "step": 23049 + }, + { + "epoch": 1.2907380445738603, + "grad_norm": 1.6514980792999268, + "learning_rate": 9.920026315789475e-05, + "loss": 0.5518, + "step": 23050 + }, + { + "epoch": 1.2907940418859893, + "grad_norm": 1.4580165147781372, + "learning_rate": 9.92e-05, + "loss": 0.3794, + "step": 23051 + }, + { + "epoch": 1.2908500391981184, + "grad_norm": 1.6363708972930908, + "learning_rate": 9.919973684210527e-05, + "loss": 0.4578, + "step": 23052 + }, + { + "epoch": 1.2909060365102474, + "grad_norm": 1.6622533798217773, + "learning_rate": 9.919947368421053e-05, + "loss": 0.7171, + "step": 23053 + }, + { + "epoch": 1.2909620338223764, + "grad_norm": 1.8035329580307007, + "learning_rate": 9.919921052631579e-05, + "loss": 0.4881, + "step": 23054 + }, + { + "epoch": 1.2910180311345054, + "grad_norm": 1.3802659511566162, + "learning_rate": 9.919894736842105e-05, + "loss": 0.4492, + "step": 23055 + }, + { + "epoch": 1.2910740284466344, + "grad_norm": 2.016417980194092, + "learning_rate": 9.919868421052631e-05, + "loss": 0.5521, + "step": 23056 + }, + { + "epoch": 1.2911300257587635, + "grad_norm": 1.479732632637024, 
+ "learning_rate": 9.919842105263158e-05, + "loss": 0.4334, + "step": 23057 + }, + { + "epoch": 1.2911860230708925, + "grad_norm": 1.2081102132797241, + "learning_rate": 9.919815789473684e-05, + "loss": 0.4467, + "step": 23058 + }, + { + "epoch": 1.2912420203830215, + "grad_norm": 1.1472679376602173, + "learning_rate": 9.919789473684212e-05, + "loss": 0.3787, + "step": 23059 + }, + { + "epoch": 1.2912980176951505, + "grad_norm": 1.3147450685501099, + "learning_rate": 9.919763157894736e-05, + "loss": 0.3942, + "step": 23060 + }, + { + "epoch": 1.2913540150072795, + "grad_norm": 1.5052769184112549, + "learning_rate": 9.919736842105264e-05, + "loss": 0.4573, + "step": 23061 + }, + { + "epoch": 1.2914100123194086, + "grad_norm": 1.358510136604309, + "learning_rate": 9.91971052631579e-05, + "loss": 0.5059, + "step": 23062 + }, + { + "epoch": 1.2914660096315376, + "grad_norm": 1.321942925453186, + "learning_rate": 9.919684210526317e-05, + "loss": 0.5401, + "step": 23063 + }, + { + "epoch": 1.2915220069436666, + "grad_norm": 1.363074779510498, + "learning_rate": 9.919657894736843e-05, + "loss": 0.4345, + "step": 23064 + }, + { + "epoch": 1.2915780042557956, + "grad_norm": 1.3865578174591064, + "learning_rate": 9.919631578947369e-05, + "loss": 0.5602, + "step": 23065 + }, + { + "epoch": 1.2916340015679246, + "grad_norm": 1.3393489122390747, + "learning_rate": 9.919605263157895e-05, + "loss": 0.3869, + "step": 23066 + }, + { + "epoch": 1.2916899988800536, + "grad_norm": 1.5695127248764038, + "learning_rate": 9.919578947368422e-05, + "loss": 0.5572, + "step": 23067 + }, + { + "epoch": 1.2917459961921827, + "grad_norm": 1.1903185844421387, + "learning_rate": 9.919552631578948e-05, + "loss": 0.4136, + "step": 23068 + }, + { + "epoch": 1.2918019935043117, + "grad_norm": 1.7370045185089111, + "learning_rate": 9.919526315789474e-05, + "loss": 0.6811, + "step": 23069 + }, + { + "epoch": 1.2918579908164407, + "grad_norm": 1.3773322105407715, + "learning_rate": 9.9195e-05, + "loss": 
0.4945, + "step": 23070 + }, + { + "epoch": 1.2919139881285697, + "grad_norm": 1.3980776071548462, + "learning_rate": 9.919473684210526e-05, + "loss": 0.4945, + "step": 23071 + }, + { + "epoch": 1.2919699854406987, + "grad_norm": 1.73771071434021, + "learning_rate": 9.919447368421053e-05, + "loss": 0.4702, + "step": 23072 + }, + { + "epoch": 1.2920259827528278, + "grad_norm": 1.491405963897705, + "learning_rate": 9.91942105263158e-05, + "loss": 0.5025, + "step": 23073 + }, + { + "epoch": 1.2920819800649568, + "grad_norm": 1.2875639200210571, + "learning_rate": 9.919394736842105e-05, + "loss": 0.3839, + "step": 23074 + }, + { + "epoch": 1.2921379773770858, + "grad_norm": 1.4439479112625122, + "learning_rate": 9.919368421052631e-05, + "loss": 0.4741, + "step": 23075 + }, + { + "epoch": 1.2921939746892148, + "grad_norm": 1.5555182695388794, + "learning_rate": 9.919342105263159e-05, + "loss": 0.547, + "step": 23076 + }, + { + "epoch": 1.2922499720013438, + "grad_norm": 1.3721479177474976, + "learning_rate": 9.919315789473685e-05, + "loss": 0.3663, + "step": 23077 + }, + { + "epoch": 1.2923059693134729, + "grad_norm": 4.716894626617432, + "learning_rate": 9.91928947368421e-05, + "loss": 0.4126, + "step": 23078 + }, + { + "epoch": 1.2923619666256019, + "grad_norm": 1.2825888395309448, + "learning_rate": 9.919263157894737e-05, + "loss": 0.4151, + "step": 23079 + }, + { + "epoch": 1.292417963937731, + "grad_norm": 1.594100832939148, + "learning_rate": 9.919236842105264e-05, + "loss": 0.5598, + "step": 23080 + }, + { + "epoch": 1.29247396124986, + "grad_norm": 1.3940268754959106, + "learning_rate": 9.91921052631579e-05, + "loss": 0.4255, + "step": 23081 + }, + { + "epoch": 1.292529958561989, + "grad_norm": 1.435741901397705, + "learning_rate": 9.919184210526317e-05, + "loss": 0.5045, + "step": 23082 + }, + { + "epoch": 1.292585955874118, + "grad_norm": 1.2281544208526611, + "learning_rate": 9.919157894736842e-05, + "loss": 0.4039, + "step": 23083 + }, + { + "epoch": 
1.292641953186247, + "grad_norm": 1.472573161125183, + "learning_rate": 9.919131578947369e-05, + "loss": 0.5481, + "step": 23084 + }, + { + "epoch": 1.292697950498376, + "grad_norm": 1.5225231647491455, + "learning_rate": 9.919105263157895e-05, + "loss": 0.5484, + "step": 23085 + }, + { + "epoch": 1.292753947810505, + "grad_norm": 1.4332185983657837, + "learning_rate": 9.919078947368422e-05, + "loss": 0.5739, + "step": 23086 + }, + { + "epoch": 1.292809945122634, + "grad_norm": 1.1747983694076538, + "learning_rate": 9.919052631578948e-05, + "loss": 0.3832, + "step": 23087 + }, + { + "epoch": 1.292865942434763, + "grad_norm": 1.521575927734375, + "learning_rate": 9.919026315789473e-05, + "loss": 0.4869, + "step": 23088 + }, + { + "epoch": 1.292921939746892, + "grad_norm": 1.1403270959854126, + "learning_rate": 9.919e-05, + "loss": 0.3319, + "step": 23089 + }, + { + "epoch": 1.292977937059021, + "grad_norm": 2.0384562015533447, + "learning_rate": 9.918973684210526e-05, + "loss": 0.4807, + "step": 23090 + }, + { + "epoch": 1.29303393437115, + "grad_norm": 1.4302958250045776, + "learning_rate": 9.918947368421054e-05, + "loss": 0.5648, + "step": 23091 + }, + { + "epoch": 1.2930899316832791, + "grad_norm": 1.2359665632247925, + "learning_rate": 9.918921052631578e-05, + "loss": 0.5447, + "step": 23092 + }, + { + "epoch": 1.2931459289954081, + "grad_norm": 1.3534879684448242, + "learning_rate": 9.918894736842106e-05, + "loss": 0.6683, + "step": 23093 + }, + { + "epoch": 1.2932019263075372, + "grad_norm": 1.5503642559051514, + "learning_rate": 9.918868421052632e-05, + "loss": 0.5193, + "step": 23094 + }, + { + "epoch": 1.2932579236196662, + "grad_norm": 1.4643770456314087, + "learning_rate": 9.918842105263159e-05, + "loss": 0.4131, + "step": 23095 + }, + { + "epoch": 1.2933139209317952, + "grad_norm": 2.064157247543335, + "learning_rate": 9.918815789473685e-05, + "loss": 0.4761, + "step": 23096 + }, + { + "epoch": 1.2933699182439242, + "grad_norm": 1.501900553703308, + 
"learning_rate": 9.918789473684211e-05, + "loss": 0.4655, + "step": 23097 + }, + { + "epoch": 1.2934259155560532, + "grad_norm": 1.5062413215637207, + "learning_rate": 9.918763157894737e-05, + "loss": 0.4824, + "step": 23098 + }, + { + "epoch": 1.2934819128681823, + "grad_norm": 2.126584053039551, + "learning_rate": 9.918736842105264e-05, + "loss": 0.4974, + "step": 23099 + }, + { + "epoch": 1.2935379101803113, + "grad_norm": 1.206376075744629, + "learning_rate": 9.91871052631579e-05, + "loss": 0.4751, + "step": 23100 + }, + { + "epoch": 1.2935939074924403, + "grad_norm": 1.2563661336898804, + "learning_rate": 9.918684210526316e-05, + "loss": 0.4306, + "step": 23101 + }, + { + "epoch": 1.2936499048045693, + "grad_norm": 1.3478997945785522, + "learning_rate": 9.918657894736842e-05, + "loss": 0.4101, + "step": 23102 + }, + { + "epoch": 1.2937059021166983, + "grad_norm": 1.3861345052719116, + "learning_rate": 9.91863157894737e-05, + "loss": 0.4732, + "step": 23103 + }, + { + "epoch": 1.2937618994288274, + "grad_norm": 1.8398720026016235, + "learning_rate": 9.918605263157895e-05, + "loss": 0.5285, + "step": 23104 + }, + { + "epoch": 1.2938178967409564, + "grad_norm": 1.252652645111084, + "learning_rate": 9.918578947368423e-05, + "loss": 0.4509, + "step": 23105 + }, + { + "epoch": 1.2938738940530854, + "grad_norm": 1.211471676826477, + "learning_rate": 9.918552631578947e-05, + "loss": 0.4291, + "step": 23106 + }, + { + "epoch": 1.2939298913652144, + "grad_norm": 1.5865445137023926, + "learning_rate": 9.918526315789473e-05, + "loss": 0.6073, + "step": 23107 + }, + { + "epoch": 1.2939858886773434, + "grad_norm": 1.4167582988739014, + "learning_rate": 9.9185e-05, + "loss": 0.4617, + "step": 23108 + }, + { + "epoch": 1.2940418859894725, + "grad_norm": 1.1195567846298218, + "learning_rate": 9.918473684210527e-05, + "loss": 0.4275, + "step": 23109 + }, + { + "epoch": 1.2940978833016015, + "grad_norm": 1.261151909828186, + "learning_rate": 9.918447368421053e-05, + "loss": 
0.4719, + "step": 23110 + }, + { + "epoch": 1.2941538806137305, + "grad_norm": 1.4415532350540161, + "learning_rate": 9.918421052631579e-05, + "loss": 0.4583, + "step": 23111 + }, + { + "epoch": 1.2942098779258595, + "grad_norm": 1.394088864326477, + "learning_rate": 9.918394736842106e-05, + "loss": 0.488, + "step": 23112 + }, + { + "epoch": 1.2942658752379885, + "grad_norm": 1.8100812435150146, + "learning_rate": 9.918368421052632e-05, + "loss": 0.6291, + "step": 23113 + }, + { + "epoch": 1.2943218725501175, + "grad_norm": 1.3433568477630615, + "learning_rate": 9.918342105263159e-05, + "loss": 0.5986, + "step": 23114 + }, + { + "epoch": 1.2943778698622466, + "grad_norm": 1.442548155784607, + "learning_rate": 9.918315789473684e-05, + "loss": 0.412, + "step": 23115 + }, + { + "epoch": 1.2944338671743756, + "grad_norm": 1.5105018615722656, + "learning_rate": 9.918289473684211e-05, + "loss": 0.3668, + "step": 23116 + }, + { + "epoch": 1.2944898644865046, + "grad_norm": 1.1323000192642212, + "learning_rate": 9.918263157894737e-05, + "loss": 0.4888, + "step": 23117 + }, + { + "epoch": 1.2945458617986336, + "grad_norm": 1.354244589805603, + "learning_rate": 9.918236842105264e-05, + "loss": 0.4052, + "step": 23118 + }, + { + "epoch": 1.2946018591107626, + "grad_norm": 1.3402349948883057, + "learning_rate": 9.91821052631579e-05, + "loss": 0.3926, + "step": 23119 + }, + { + "epoch": 1.2946578564228917, + "grad_norm": 1.3419550657272339, + "learning_rate": 9.918184210526316e-05, + "loss": 0.437, + "step": 23120 + }, + { + "epoch": 1.2947138537350207, + "grad_norm": 2.1664059162139893, + "learning_rate": 9.918157894736842e-05, + "loss": 0.6513, + "step": 23121 + }, + { + "epoch": 1.2947698510471497, + "grad_norm": 1.4331457614898682, + "learning_rate": 9.91813157894737e-05, + "loss": 0.4336, + "step": 23122 + }, + { + "epoch": 1.2948258483592787, + "grad_norm": 1.2470271587371826, + "learning_rate": 9.918105263157896e-05, + "loss": 0.4391, + "step": 23123 + }, + { + "epoch": 
1.2948818456714077, + "grad_norm": 1.4891743659973145, + "learning_rate": 9.918078947368422e-05, + "loss": 0.4758, + "step": 23124 + }, + { + "epoch": 1.2949378429835368, + "grad_norm": 1.537827491760254, + "learning_rate": 9.918052631578948e-05, + "loss": 0.5983, + "step": 23125 + }, + { + "epoch": 1.2949938402956658, + "grad_norm": 1.628988265991211, + "learning_rate": 9.918026315789474e-05, + "loss": 0.4892, + "step": 23126 + }, + { + "epoch": 1.2950498376077948, + "grad_norm": 1.4828397035598755, + "learning_rate": 9.918000000000001e-05, + "loss": 0.4178, + "step": 23127 + }, + { + "epoch": 1.2951058349199238, + "grad_norm": 1.4297609329223633, + "learning_rate": 9.917973684210527e-05, + "loss": 0.6309, + "step": 23128 + }, + { + "epoch": 1.2951618322320528, + "grad_norm": 1.074594497680664, + "learning_rate": 9.917947368421053e-05, + "loss": 0.3676, + "step": 23129 + }, + { + "epoch": 1.2952178295441819, + "grad_norm": 1.2312418222427368, + "learning_rate": 9.917921052631579e-05, + "loss": 0.3847, + "step": 23130 + }, + { + "epoch": 1.2952738268563109, + "grad_norm": 1.3568131923675537, + "learning_rate": 9.917894736842106e-05, + "loss": 0.5376, + "step": 23131 + }, + { + "epoch": 1.29532982416844, + "grad_norm": 1.3103629350662231, + "learning_rate": 9.917868421052632e-05, + "loss": 0.5244, + "step": 23132 + }, + { + "epoch": 1.295385821480569, + "grad_norm": 1.4107483625411987, + "learning_rate": 9.917842105263158e-05, + "loss": 0.462, + "step": 23133 + }, + { + "epoch": 1.295441818792698, + "grad_norm": 1.229911208152771, + "learning_rate": 9.917815789473684e-05, + "loss": 0.303, + "step": 23134 + }, + { + "epoch": 1.295497816104827, + "grad_norm": 2.078338623046875, + "learning_rate": 9.917789473684211e-05, + "loss": 0.4735, + "step": 23135 + }, + { + "epoch": 1.295553813416956, + "grad_norm": 1.6818737983703613, + "learning_rate": 9.917763157894737e-05, + "loss": 0.4912, + "step": 23136 + }, + { + "epoch": 1.295609810729085, + "grad_norm": 
1.308260202407837, + "learning_rate": 9.917736842105265e-05, + "loss": 0.5235, + "step": 23137 + }, + { + "epoch": 1.295665808041214, + "grad_norm": 1.2334599494934082, + "learning_rate": 9.91771052631579e-05, + "loss": 0.3955, + "step": 23138 + }, + { + "epoch": 1.295721805353343, + "grad_norm": 1.2334140539169312, + "learning_rate": 9.917684210526317e-05, + "loss": 0.3996, + "step": 23139 + }, + { + "epoch": 1.295777802665472, + "grad_norm": 1.3915631771087646, + "learning_rate": 9.917657894736843e-05, + "loss": 0.4741, + "step": 23140 + }, + { + "epoch": 1.295833799977601, + "grad_norm": 1.263722538948059, + "learning_rate": 9.917631578947369e-05, + "loss": 0.5301, + "step": 23141 + }, + { + "epoch": 1.29588979728973, + "grad_norm": 1.5782407522201538, + "learning_rate": 9.917605263157896e-05, + "loss": 0.5274, + "step": 23142 + }, + { + "epoch": 1.295945794601859, + "grad_norm": 1.4976212978363037, + "learning_rate": 9.91757894736842e-05, + "loss": 0.4581, + "step": 23143 + }, + { + "epoch": 1.2960017919139881, + "grad_norm": 1.3090317249298096, + "learning_rate": 9.917552631578948e-05, + "loss": 0.4289, + "step": 23144 + }, + { + "epoch": 1.2960577892261171, + "grad_norm": 1.5705372095108032, + "learning_rate": 9.917526315789474e-05, + "loss": 0.4602, + "step": 23145 + }, + { + "epoch": 1.2961137865382462, + "grad_norm": 1.349718451499939, + "learning_rate": 9.917500000000001e-05, + "loss": 0.5322, + "step": 23146 + }, + { + "epoch": 1.2961697838503752, + "grad_norm": 1.5553501844406128, + "learning_rate": 9.917473684210526e-05, + "loss": 0.5053, + "step": 23147 + }, + { + "epoch": 1.2962257811625042, + "grad_norm": 1.4202433824539185, + "learning_rate": 9.917447368421053e-05, + "loss": 0.4849, + "step": 23148 + }, + { + "epoch": 1.2962817784746332, + "grad_norm": 10.982044219970703, + "learning_rate": 9.917421052631579e-05, + "loss": 0.4594, + "step": 23149 + }, + { + "epoch": 1.2963377757867622, + "grad_norm": 2.137157917022705, + "learning_rate": 
9.917394736842106e-05, + "loss": 0.4211, + "step": 23150 + }, + { + "epoch": 1.2963937730988913, + "grad_norm": 1.470205545425415, + "learning_rate": 9.917368421052632e-05, + "loss": 0.5466, + "step": 23151 + }, + { + "epoch": 1.2964497704110203, + "grad_norm": 1.2946199178695679, + "learning_rate": 9.917342105263158e-05, + "loss": 0.3932, + "step": 23152 + }, + { + "epoch": 1.2965057677231493, + "grad_norm": 1.8522518873214722, + "learning_rate": 9.917315789473684e-05, + "loss": 0.5165, + "step": 23153 + }, + { + "epoch": 1.2965617650352783, + "grad_norm": 1.4067450761795044, + "learning_rate": 9.917289473684212e-05, + "loss": 0.4807, + "step": 23154 + }, + { + "epoch": 1.2966177623474073, + "grad_norm": 1.2604808807373047, + "learning_rate": 9.917263157894738e-05, + "loss": 0.4159, + "step": 23155 + }, + { + "epoch": 1.2966737596595364, + "grad_norm": 1.3013297319412231, + "learning_rate": 9.917236842105264e-05, + "loss": 0.4577, + "step": 23156 + }, + { + "epoch": 1.2967297569716654, + "grad_norm": 1.9174307584762573, + "learning_rate": 9.91721052631579e-05, + "loss": 0.5487, + "step": 23157 + }, + { + "epoch": 1.2967857542837944, + "grad_norm": 1.2069087028503418, + "learning_rate": 9.917184210526316e-05, + "loss": 0.4228, + "step": 23158 + }, + { + "epoch": 1.2968417515959234, + "grad_norm": 1.5480200052261353, + "learning_rate": 9.917157894736843e-05, + "loss": 0.5662, + "step": 23159 + }, + { + "epoch": 1.2968977489080524, + "grad_norm": 1.130193829536438, + "learning_rate": 9.917131578947369e-05, + "loss": 0.4401, + "step": 23160 + }, + { + "epoch": 1.2969537462201814, + "grad_norm": 1.2581520080566406, + "learning_rate": 9.917105263157895e-05, + "loss": 0.3823, + "step": 23161 + }, + { + "epoch": 1.2970097435323105, + "grad_norm": 1.2995219230651855, + "learning_rate": 9.917078947368421e-05, + "loss": 0.39, + "step": 23162 + }, + { + "epoch": 1.2970657408444395, + "grad_norm": 1.1968741416931152, + "learning_rate": 9.917052631578948e-05, + "loss": 0.371, + 
"step": 23163 + }, + { + "epoch": 1.2971217381565685, + "grad_norm": 1.3204034566879272, + "learning_rate": 9.917026315789474e-05, + "loss": 0.4821, + "step": 23164 + }, + { + "epoch": 1.2971777354686975, + "grad_norm": 1.4278463125228882, + "learning_rate": 9.917e-05, + "loss": 0.4841, + "step": 23165 + }, + { + "epoch": 1.2972337327808265, + "grad_norm": 1.7300227880477905, + "learning_rate": 9.916973684210526e-05, + "loss": 0.4935, + "step": 23166 + }, + { + "epoch": 1.2972897300929556, + "grad_norm": 1.3219255208969116, + "learning_rate": 9.916947368421053e-05, + "loss": 0.4569, + "step": 23167 + }, + { + "epoch": 1.2973457274050846, + "grad_norm": 1.4268701076507568, + "learning_rate": 9.91692105263158e-05, + "loss": 0.5079, + "step": 23168 + }, + { + "epoch": 1.2974017247172136, + "grad_norm": 1.519562005996704, + "learning_rate": 9.916894736842107e-05, + "loss": 0.4115, + "step": 23169 + }, + { + "epoch": 1.2974577220293426, + "grad_norm": 1.5569610595703125, + "learning_rate": 9.916868421052631e-05, + "loss": 0.4973, + "step": 23170 + }, + { + "epoch": 1.2975137193414716, + "grad_norm": 1.2861106395721436, + "learning_rate": 9.916842105263159e-05, + "loss": 0.4534, + "step": 23171 + }, + { + "epoch": 1.2975697166536007, + "grad_norm": 1.3012754917144775, + "learning_rate": 9.916815789473685e-05, + "loss": 0.5036, + "step": 23172 + }, + { + "epoch": 1.2976257139657297, + "grad_norm": 1.5721306800842285, + "learning_rate": 9.916789473684212e-05, + "loss": 0.5698, + "step": 23173 + }, + { + "epoch": 1.2976817112778587, + "grad_norm": 1.498849868774414, + "learning_rate": 9.916763157894738e-05, + "loss": 0.5996, + "step": 23174 + }, + { + "epoch": 1.2977377085899877, + "grad_norm": 1.4276922941207886, + "learning_rate": 9.916736842105263e-05, + "loss": 0.4091, + "step": 23175 + }, + { + "epoch": 1.2977937059021167, + "grad_norm": 1.5499815940856934, + "learning_rate": 9.91671052631579e-05, + "loss": 0.3802, + "step": 23176 + }, + { + "epoch": 
1.2978497032142458, + "grad_norm": 1.2192739248275757, + "learning_rate": 9.916684210526316e-05, + "loss": 0.5103, + "step": 23177 + }, + { + "epoch": 1.2979057005263748, + "grad_norm": 1.1659903526306152, + "learning_rate": 9.916657894736843e-05, + "loss": 0.4654, + "step": 23178 + }, + { + "epoch": 1.2979616978385038, + "grad_norm": 1.4987658262252808, + "learning_rate": 9.916631578947369e-05, + "loss": 0.4804, + "step": 23179 + }, + { + "epoch": 1.2980176951506328, + "grad_norm": 2.007099151611328, + "learning_rate": 9.916605263157895e-05, + "loss": 0.5708, + "step": 23180 + }, + { + "epoch": 1.2980736924627618, + "grad_norm": 1.3453506231307983, + "learning_rate": 9.916578947368421e-05, + "loss": 0.4868, + "step": 23181 + }, + { + "epoch": 1.2981296897748908, + "grad_norm": 1.3088680505752563, + "learning_rate": 9.916552631578948e-05, + "loss": 0.4905, + "step": 23182 + }, + { + "epoch": 1.2981856870870199, + "grad_norm": 1.4027286767959595, + "learning_rate": 9.916526315789474e-05, + "loss": 0.6571, + "step": 23183 + }, + { + "epoch": 1.2982416843991489, + "grad_norm": 1.422242522239685, + "learning_rate": 9.9165e-05, + "loss": 0.4173, + "step": 23184 + }, + { + "epoch": 1.298297681711278, + "grad_norm": 1.295256495475769, + "learning_rate": 9.916473684210526e-05, + "loss": 0.4624, + "step": 23185 + }, + { + "epoch": 1.298353679023407, + "grad_norm": 1.3469479084014893, + "learning_rate": 9.916447368421054e-05, + "loss": 0.6286, + "step": 23186 + }, + { + "epoch": 1.298409676335536, + "grad_norm": 1.4076720476150513, + "learning_rate": 9.91642105263158e-05, + "loss": 0.4399, + "step": 23187 + }, + { + "epoch": 1.298465673647665, + "grad_norm": 1.4321870803833008, + "learning_rate": 9.916394736842106e-05, + "loss": 0.4432, + "step": 23188 + }, + { + "epoch": 1.298521670959794, + "grad_norm": 1.5671515464782715, + "learning_rate": 9.916368421052632e-05, + "loss": 0.4282, + "step": 23189 + }, + { + "epoch": 1.298577668271923, + "grad_norm": 1.2739958763122559, + 
"learning_rate": 9.916342105263159e-05, + "loss": 0.3489, + "step": 23190 + }, + { + "epoch": 1.298633665584052, + "grad_norm": 1.3998924493789673, + "learning_rate": 9.916315789473685e-05, + "loss": 0.4127, + "step": 23191 + }, + { + "epoch": 1.298689662896181, + "grad_norm": 1.3785351514816284, + "learning_rate": 9.916289473684211e-05, + "loss": 0.452, + "step": 23192 + }, + { + "epoch": 1.29874566020831, + "grad_norm": 1.5077903270721436, + "learning_rate": 9.916263157894737e-05, + "loss": 0.4126, + "step": 23193 + }, + { + "epoch": 1.298801657520439, + "grad_norm": 1.72370445728302, + "learning_rate": 9.916236842105263e-05, + "loss": 0.513, + "step": 23194 + }, + { + "epoch": 1.298857654832568, + "grad_norm": 1.2757314443588257, + "learning_rate": 9.91621052631579e-05, + "loss": 0.4414, + "step": 23195 + }, + { + "epoch": 1.2989136521446971, + "grad_norm": 6.817765712738037, + "learning_rate": 9.916184210526316e-05, + "loss": 0.4587, + "step": 23196 + }, + { + "epoch": 1.2989696494568261, + "grad_norm": 1.236328363418579, + "learning_rate": 9.916157894736843e-05, + "loss": 0.5042, + "step": 23197 + }, + { + "epoch": 1.2990256467689552, + "grad_norm": 1.5084718465805054, + "learning_rate": 9.916131578947368e-05, + "loss": 0.6613, + "step": 23198 + }, + { + "epoch": 1.2990816440810842, + "grad_norm": 1.5099937915802002, + "learning_rate": 9.916105263157895e-05, + "loss": 0.4576, + "step": 23199 + }, + { + "epoch": 1.2991376413932132, + "grad_norm": 1.9682420492172241, + "learning_rate": 9.916078947368421e-05, + "loss": 0.508, + "step": 23200 + }, + { + "epoch": 1.2991936387053422, + "grad_norm": 1.282922625541687, + "learning_rate": 9.916052631578949e-05, + "loss": 0.5232, + "step": 23201 + }, + { + "epoch": 1.2992496360174712, + "grad_norm": 1.4363484382629395, + "learning_rate": 9.916026315789473e-05, + "loss": 0.5648, + "step": 23202 + }, + { + "epoch": 1.2993056333296003, + "grad_norm": 1.1488429307937622, + "learning_rate": 9.916e-05, + "loss": 0.5025, + 
"step": 23203 + }, + { + "epoch": 1.2993616306417293, + "grad_norm": 1.7818005084991455, + "learning_rate": 9.915973684210527e-05, + "loss": 0.7077, + "step": 23204 + }, + { + "epoch": 1.2994176279538583, + "grad_norm": 1.6766014099121094, + "learning_rate": 9.915947368421054e-05, + "loss": 0.4379, + "step": 23205 + }, + { + "epoch": 1.2994736252659873, + "grad_norm": 1.5596418380737305, + "learning_rate": 9.91592105263158e-05, + "loss": 0.4459, + "step": 23206 + }, + { + "epoch": 1.2995296225781163, + "grad_norm": 1.570223093032837, + "learning_rate": 9.915894736842106e-05, + "loss": 0.5508, + "step": 23207 + }, + { + "epoch": 1.2995856198902453, + "grad_norm": 1.3504315614700317, + "learning_rate": 9.915868421052632e-05, + "loss": 0.4035, + "step": 23208 + }, + { + "epoch": 1.2996416172023744, + "grad_norm": 1.6046736240386963, + "learning_rate": 9.915842105263158e-05, + "loss": 0.3972, + "step": 23209 + }, + { + "epoch": 1.2996976145145034, + "grad_norm": 1.485824465751648, + "learning_rate": 9.915815789473685e-05, + "loss": 0.4442, + "step": 23210 + }, + { + "epoch": 1.2997536118266324, + "grad_norm": 1.156082272529602, + "learning_rate": 9.915789473684211e-05, + "loss": 0.4251, + "step": 23211 + }, + { + "epoch": 1.2998096091387614, + "grad_norm": 1.5914150476455688, + "learning_rate": 9.915763157894737e-05, + "loss": 0.5364, + "step": 23212 + }, + { + "epoch": 1.2998656064508904, + "grad_norm": 1.5587588548660278, + "learning_rate": 9.915736842105263e-05, + "loss": 0.5468, + "step": 23213 + }, + { + "epoch": 1.2999216037630195, + "grad_norm": 1.4740647077560425, + "learning_rate": 9.91571052631579e-05, + "loss": 0.4464, + "step": 23214 + }, + { + "epoch": 1.2999776010751485, + "grad_norm": 1.7286256551742554, + "learning_rate": 9.915684210526316e-05, + "loss": 0.5414, + "step": 23215 + }, + { + "epoch": 1.3000335983872775, + "grad_norm": 1.2766107320785522, + "learning_rate": 9.915657894736842e-05, + "loss": 0.4173, + "step": 23216 + }, + { + "epoch": 
1.3000895956994065, + "grad_norm": 6.632284641265869, + "learning_rate": 9.915631578947368e-05, + "loss": 0.5423, + "step": 23217 + }, + { + "epoch": 1.3001455930115355, + "grad_norm": 1.4115962982177734, + "learning_rate": 9.915605263157896e-05, + "loss": 0.5377, + "step": 23218 + }, + { + "epoch": 1.3002015903236646, + "grad_norm": 1.3499791622161865, + "learning_rate": 9.915578947368422e-05, + "loss": 0.4549, + "step": 23219 + }, + { + "epoch": 1.3002575876357936, + "grad_norm": 1.404026746749878, + "learning_rate": 9.915552631578948e-05, + "loss": 0.5127, + "step": 23220 + }, + { + "epoch": 1.3003135849479226, + "grad_norm": 8.091444969177246, + "learning_rate": 9.915526315789474e-05, + "loss": 0.3913, + "step": 23221 + }, + { + "epoch": 1.3003695822600516, + "grad_norm": 1.43260657787323, + "learning_rate": 9.915500000000001e-05, + "loss": 0.4801, + "step": 23222 + }, + { + "epoch": 1.3004255795721806, + "grad_norm": 1.3025494813919067, + "learning_rate": 9.915473684210527e-05, + "loss": 0.3282, + "step": 23223 + }, + { + "epoch": 1.3004815768843097, + "grad_norm": 1.3490900993347168, + "learning_rate": 9.915447368421054e-05, + "loss": 0.4077, + "step": 23224 + }, + { + "epoch": 1.3005375741964387, + "grad_norm": 1.1420457363128662, + "learning_rate": 9.915421052631579e-05, + "loss": 0.3791, + "step": 23225 + }, + { + "epoch": 1.3005935715085677, + "grad_norm": 1.450870156288147, + "learning_rate": 9.915394736842106e-05, + "loss": 0.3893, + "step": 23226 + }, + { + "epoch": 1.3006495688206967, + "grad_norm": 1.3151649236679077, + "learning_rate": 9.915368421052632e-05, + "loss": 0.4269, + "step": 23227 + }, + { + "epoch": 1.3007055661328257, + "grad_norm": 1.3947206735610962, + "learning_rate": 9.915342105263158e-05, + "loss": 0.5421, + "step": 23228 + }, + { + "epoch": 1.3007615634449547, + "grad_norm": 1.2942920923233032, + "learning_rate": 9.915315789473685e-05, + "loss": 0.4889, + "step": 23229 + }, + { + "epoch": 1.3008175607570838, + "grad_norm": 
1.3855652809143066, + "learning_rate": 9.91528947368421e-05, + "loss": 0.4664, + "step": 23230 + }, + { + "epoch": 1.3008735580692128, + "grad_norm": 1.5210803747177124, + "learning_rate": 9.915263157894737e-05, + "loss": 0.447, + "step": 23231 + }, + { + "epoch": 1.3009295553813418, + "grad_norm": 1.2672810554504395, + "learning_rate": 9.915236842105263e-05, + "loss": 0.4659, + "step": 23232 + }, + { + "epoch": 1.3009855526934708, + "grad_norm": 1.268456220626831, + "learning_rate": 9.91521052631579e-05, + "loss": 0.3986, + "step": 23233 + }, + { + "epoch": 1.3010415500055998, + "grad_norm": 1.3206121921539307, + "learning_rate": 9.915184210526317e-05, + "loss": 0.4101, + "step": 23234 + }, + { + "epoch": 1.3010975473177289, + "grad_norm": 1.2786929607391357, + "learning_rate": 9.915157894736843e-05, + "loss": 0.412, + "step": 23235 + }, + { + "epoch": 1.3011535446298579, + "grad_norm": 1.324182152748108, + "learning_rate": 9.915131578947369e-05, + "loss": 0.5193, + "step": 23236 + }, + { + "epoch": 1.301209541941987, + "grad_norm": 1.262039303779602, + "learning_rate": 9.915105263157896e-05, + "loss": 0.3655, + "step": 23237 + }, + { + "epoch": 1.301265539254116, + "grad_norm": 1.12849760055542, + "learning_rate": 9.915078947368422e-05, + "loss": 0.371, + "step": 23238 + }, + { + "epoch": 1.301321536566245, + "grad_norm": 1.2479850053787231, + "learning_rate": 9.915052631578948e-05, + "loss": 0.4755, + "step": 23239 + }, + { + "epoch": 1.301377533878374, + "grad_norm": 1.3527796268463135, + "learning_rate": 9.915026315789474e-05, + "loss": 0.5213, + "step": 23240 + }, + { + "epoch": 1.301433531190503, + "grad_norm": 1.407912015914917, + "learning_rate": 9.915000000000001e-05, + "loss": 0.4465, + "step": 23241 + }, + { + "epoch": 1.301489528502632, + "grad_norm": 1.2366207838058472, + "learning_rate": 9.914973684210527e-05, + "loss": 0.3874, + "step": 23242 + }, + { + "epoch": 1.301545525814761, + "grad_norm": 1.6261301040649414, + "learning_rate": 
9.914947368421053e-05, + "loss": 0.4496, + "step": 23243 + }, + { + "epoch": 1.30160152312689, + "grad_norm": 2.06445574760437, + "learning_rate": 9.914921052631579e-05, + "loss": 0.7028, + "step": 23244 + }, + { + "epoch": 1.301657520439019, + "grad_norm": 1.4373807907104492, + "learning_rate": 9.914894736842105e-05, + "loss": 0.4088, + "step": 23245 + }, + { + "epoch": 1.301713517751148, + "grad_norm": 1.5163663625717163, + "learning_rate": 9.914868421052632e-05, + "loss": 0.4876, + "step": 23246 + }, + { + "epoch": 1.301769515063277, + "grad_norm": 1.2399814128875732, + "learning_rate": 9.914842105263158e-05, + "loss": 0.4709, + "step": 23247 + }, + { + "epoch": 1.3018255123754061, + "grad_norm": 1.331455945968628, + "learning_rate": 9.914815789473684e-05, + "loss": 0.4659, + "step": 23248 + }, + { + "epoch": 1.3018815096875351, + "grad_norm": 1.4218822717666626, + "learning_rate": 9.91478947368421e-05, + "loss": 0.4753, + "step": 23249 + }, + { + "epoch": 1.3019375069996642, + "grad_norm": 1.5947976112365723, + "learning_rate": 9.914763157894738e-05, + "loss": 0.5404, + "step": 23250 + }, + { + "epoch": 1.3019935043117932, + "grad_norm": 1.4025335311889648, + "learning_rate": 9.914736842105264e-05, + "loss": 0.4188, + "step": 23251 + }, + { + "epoch": 1.3020495016239222, + "grad_norm": 1.6437119245529175, + "learning_rate": 9.914710526315791e-05, + "loss": 0.5713, + "step": 23252 + }, + { + "epoch": 1.302105498936051, + "grad_norm": 1.9255142211914062, + "learning_rate": 9.914684210526316e-05, + "loss": 0.6225, + "step": 23253 + }, + { + "epoch": 1.30216149624818, + "grad_norm": 1.7305785417556763, + "learning_rate": 9.914657894736843e-05, + "loss": 0.4303, + "step": 23254 + }, + { + "epoch": 1.302217493560309, + "grad_norm": 1.2466832399368286, + "learning_rate": 9.914631578947369e-05, + "loss": 0.4593, + "step": 23255 + }, + { + "epoch": 1.302273490872438, + "grad_norm": 1.3282427787780762, + "learning_rate": 9.914605263157896e-05, + "loss": 0.4284, + "step": 
23256 + }, + { + "epoch": 1.302329488184567, + "grad_norm": 1.3135789632797241, + "learning_rate": 9.914578947368421e-05, + "loss": 0.5275, + "step": 23257 + }, + { + "epoch": 1.302385485496696, + "grad_norm": 1.4246985912322998, + "learning_rate": 9.914552631578948e-05, + "loss": 0.4657, + "step": 23258 + }, + { + "epoch": 1.302441482808825, + "grad_norm": 1.2614291906356812, + "learning_rate": 9.914526315789474e-05, + "loss": 0.4454, + "step": 23259 + }, + { + "epoch": 1.3024974801209541, + "grad_norm": 1.3540866374969482, + "learning_rate": 9.914500000000001e-05, + "loss": 0.5905, + "step": 23260 + }, + { + "epoch": 1.3025534774330831, + "grad_norm": 1.4114477634429932, + "learning_rate": 9.914473684210527e-05, + "loss": 0.4183, + "step": 23261 + }, + { + "epoch": 1.3026094747452122, + "grad_norm": 1.3629343509674072, + "learning_rate": 9.914447368421052e-05, + "loss": 0.5109, + "step": 23262 + }, + { + "epoch": 1.3026654720573412, + "grad_norm": 1.3114348649978638, + "learning_rate": 9.914421052631579e-05, + "loss": 0.404, + "step": 23263 + }, + { + "epoch": 1.3027214693694702, + "grad_norm": 1.1841384172439575, + "learning_rate": 9.914394736842105e-05, + "loss": 0.4458, + "step": 23264 + }, + { + "epoch": 1.3027774666815992, + "grad_norm": 1.4203022718429565, + "learning_rate": 9.914368421052633e-05, + "loss": 0.5124, + "step": 23265 + }, + { + "epoch": 1.3028334639937282, + "grad_norm": 1.2591602802276611, + "learning_rate": 9.914342105263159e-05, + "loss": 0.3617, + "step": 23266 + }, + { + "epoch": 1.3028894613058573, + "grad_norm": 1.407633900642395, + "learning_rate": 9.914315789473685e-05, + "loss": 0.5957, + "step": 23267 + }, + { + "epoch": 1.3029454586179863, + "grad_norm": 1.4555846452713013, + "learning_rate": 9.91428947368421e-05, + "loss": 0.5717, + "step": 23268 + }, + { + "epoch": 1.3030014559301153, + "grad_norm": 1.4491373300552368, + "learning_rate": 9.914263157894738e-05, + "loss": 0.5847, + "step": 23269 + }, + { + "epoch": 
1.3030574532422443, + "grad_norm": 1.156622052192688, + "learning_rate": 9.914236842105264e-05, + "loss": 0.4499, + "step": 23270 + }, + { + "epoch": 1.3031134505543733, + "grad_norm": 1.415259599685669, + "learning_rate": 9.91421052631579e-05, + "loss": 0.4797, + "step": 23271 + }, + { + "epoch": 1.3031694478665024, + "grad_norm": 1.3011866807937622, + "learning_rate": 9.914184210526316e-05, + "loss": 0.5682, + "step": 23272 + }, + { + "epoch": 1.3032254451786314, + "grad_norm": 1.2020739316940308, + "learning_rate": 9.914157894736843e-05, + "loss": 0.4119, + "step": 23273 + }, + { + "epoch": 1.3032814424907604, + "grad_norm": 1.252511739730835, + "learning_rate": 9.914131578947369e-05, + "loss": 0.4318, + "step": 23274 + }, + { + "epoch": 1.3033374398028894, + "grad_norm": 1.3260678052902222, + "learning_rate": 9.914105263157895e-05, + "loss": 0.456, + "step": 23275 + }, + { + "epoch": 1.3033934371150184, + "grad_norm": 1.4194862842559814, + "learning_rate": 9.914078947368421e-05, + "loss": 0.3703, + "step": 23276 + }, + { + "epoch": 1.3034494344271474, + "grad_norm": 4.396236419677734, + "learning_rate": 9.914052631578948e-05, + "loss": 0.5069, + "step": 23277 + }, + { + "epoch": 1.3035054317392765, + "grad_norm": 1.5681681632995605, + "learning_rate": 9.914026315789474e-05, + "loss": 0.3922, + "step": 23278 + }, + { + "epoch": 1.3035614290514055, + "grad_norm": 1.3979411125183105, + "learning_rate": 9.914e-05, + "loss": 0.4714, + "step": 23279 + }, + { + "epoch": 1.3036174263635345, + "grad_norm": 1.3500561714172363, + "learning_rate": 9.913973684210526e-05, + "loss": 0.4894, + "step": 23280 + }, + { + "epoch": 1.3036734236756635, + "grad_norm": 1.3158615827560425, + "learning_rate": 9.913947368421052e-05, + "loss": 0.4118, + "step": 23281 + }, + { + "epoch": 1.3037294209877925, + "grad_norm": 1.5024524927139282, + "learning_rate": 9.91392105263158e-05, + "loss": 0.5543, + "step": 23282 + }, + { + "epoch": 1.3037854182999216, + "grad_norm": 1.415050745010376, + 
"learning_rate": 9.913894736842106e-05, + "loss": 0.4649, + "step": 23283 + }, + { + "epoch": 1.3038414156120506, + "grad_norm": 1.1837997436523438, + "learning_rate": 9.913868421052633e-05, + "loss": 0.3367, + "step": 23284 + }, + { + "epoch": 1.3038974129241796, + "grad_norm": 1.5360347032546997, + "learning_rate": 9.913842105263157e-05, + "loss": 0.4749, + "step": 23285 + }, + { + "epoch": 1.3039534102363086, + "grad_norm": 1.4858312606811523, + "learning_rate": 9.913815789473685e-05, + "loss": 0.5332, + "step": 23286 + }, + { + "epoch": 1.3040094075484376, + "grad_norm": 1.4720791578292847, + "learning_rate": 9.913789473684211e-05, + "loss": 0.4504, + "step": 23287 + }, + { + "epoch": 1.3040654048605667, + "grad_norm": 1.3621578216552734, + "learning_rate": 9.913763157894738e-05, + "loss": 0.4723, + "step": 23288 + }, + { + "epoch": 1.3041214021726957, + "grad_norm": 1.3581650257110596, + "learning_rate": 9.913736842105264e-05, + "loss": 0.4798, + "step": 23289 + }, + { + "epoch": 1.3041773994848247, + "grad_norm": 1.228894591331482, + "learning_rate": 9.91371052631579e-05, + "loss": 0.4839, + "step": 23290 + }, + { + "epoch": 1.3042333967969537, + "grad_norm": 1.6072814464569092, + "learning_rate": 9.913684210526316e-05, + "loss": 0.5408, + "step": 23291 + }, + { + "epoch": 1.3042893941090827, + "grad_norm": 1.314273476600647, + "learning_rate": 9.913657894736843e-05, + "loss": 0.428, + "step": 23292 + }, + { + "epoch": 1.3043453914212118, + "grad_norm": 1.6539702415466309, + "learning_rate": 9.91363157894737e-05, + "loss": 0.4635, + "step": 23293 + }, + { + "epoch": 1.3044013887333408, + "grad_norm": 1.5021799802780151, + "learning_rate": 9.913605263157895e-05, + "loss": 0.4711, + "step": 23294 + }, + { + "epoch": 1.3044573860454698, + "grad_norm": 1.4261759519577026, + "learning_rate": 9.913578947368421e-05, + "loss": 0.3836, + "step": 23295 + }, + { + "epoch": 1.3045133833575988, + "grad_norm": 1.695491909980774, + "learning_rate": 9.913552631578947e-05, + 
"loss": 0.43, + "step": 23296 + }, + { + "epoch": 1.3045693806697278, + "grad_norm": 1.4659150838851929, + "learning_rate": 9.913526315789475e-05, + "loss": 0.4848, + "step": 23297 + }, + { + "epoch": 1.3046253779818568, + "grad_norm": 1.2196158170700073, + "learning_rate": 9.9135e-05, + "loss": 0.479, + "step": 23298 + }, + { + "epoch": 1.3046813752939859, + "grad_norm": 1.092777967453003, + "learning_rate": 9.913473684210527e-05, + "loss": 0.3985, + "step": 23299 + }, + { + "epoch": 1.3047373726061149, + "grad_norm": 1.4441514015197754, + "learning_rate": 9.913447368421052e-05, + "loss": 0.5424, + "step": 23300 + }, + { + "epoch": 1.304793369918244, + "grad_norm": 1.565144658088684, + "learning_rate": 9.91342105263158e-05, + "loss": 0.4983, + "step": 23301 + }, + { + "epoch": 1.304849367230373, + "grad_norm": 1.3440790176391602, + "learning_rate": 9.913394736842106e-05, + "loss": 0.3815, + "step": 23302 + }, + { + "epoch": 1.304905364542502, + "grad_norm": 1.4669320583343506, + "learning_rate": 9.913368421052632e-05, + "loss": 0.5202, + "step": 23303 + }, + { + "epoch": 1.304961361854631, + "grad_norm": 1.23827064037323, + "learning_rate": 9.913342105263158e-05, + "loss": 0.4864, + "step": 23304 + }, + { + "epoch": 1.30501735916676, + "grad_norm": 1.6211961507797241, + "learning_rate": 9.913315789473685e-05, + "loss": 0.6918, + "step": 23305 + }, + { + "epoch": 1.305073356478889, + "grad_norm": 1.5507512092590332, + "learning_rate": 9.913289473684211e-05, + "loss": 0.4171, + "step": 23306 + }, + { + "epoch": 1.305129353791018, + "grad_norm": 1.3851318359375, + "learning_rate": 9.913263157894738e-05, + "loss": 0.59, + "step": 23307 + }, + { + "epoch": 1.305185351103147, + "grad_norm": 1.2288877964019775, + "learning_rate": 9.913236842105263e-05, + "loss": 0.574, + "step": 23308 + }, + { + "epoch": 1.305241348415276, + "grad_norm": 1.1446152925491333, + "learning_rate": 9.91321052631579e-05, + "loss": 0.4483, + "step": 23309 + }, + { + "epoch": 1.305297345727405, + 
"grad_norm": 1.4239884614944458, + "learning_rate": 9.913184210526316e-05, + "loss": 0.5982, + "step": 23310 + }, + { + "epoch": 1.305353343039534, + "grad_norm": 1.4093072414398193, + "learning_rate": 9.913157894736844e-05, + "loss": 0.5805, + "step": 23311 + }, + { + "epoch": 1.3054093403516631, + "grad_norm": 1.4945576190948486, + "learning_rate": 9.913131578947368e-05, + "loss": 0.4839, + "step": 23312 + }, + { + "epoch": 1.3054653376637921, + "grad_norm": 1.3525145053863525, + "learning_rate": 9.913105263157894e-05, + "loss": 0.4899, + "step": 23313 + }, + { + "epoch": 1.3055213349759212, + "grad_norm": 3.119349241256714, + "learning_rate": 9.913078947368422e-05, + "loss": 0.5881, + "step": 23314 + }, + { + "epoch": 1.3055773322880502, + "grad_norm": 1.3717396259307861, + "learning_rate": 9.913052631578948e-05, + "loss": 0.4957, + "step": 23315 + }, + { + "epoch": 1.3056333296001792, + "grad_norm": 1.430946707725525, + "learning_rate": 9.913026315789475e-05, + "loss": 0.5012, + "step": 23316 + }, + { + "epoch": 1.3056893269123082, + "grad_norm": 1.260886788368225, + "learning_rate": 9.913e-05, + "loss": 0.365, + "step": 23317 + }, + { + "epoch": 1.3057453242244372, + "grad_norm": 1.356167197227478, + "learning_rate": 9.912973684210527e-05, + "loss": 0.4189, + "step": 23318 + }, + { + "epoch": 1.3058013215365663, + "grad_norm": 1.2889000177383423, + "learning_rate": 9.912947368421053e-05, + "loss": 0.4207, + "step": 23319 + }, + { + "epoch": 1.3058573188486953, + "grad_norm": 1.3411356210708618, + "learning_rate": 9.91292105263158e-05, + "loss": 0.5744, + "step": 23320 + }, + { + "epoch": 1.3059133161608243, + "grad_norm": 1.5483306646347046, + "learning_rate": 9.912894736842106e-05, + "loss": 0.4928, + "step": 23321 + }, + { + "epoch": 1.3059693134729533, + "grad_norm": 1.1878876686096191, + "learning_rate": 9.912868421052632e-05, + "loss": 0.4499, + "step": 23322 + }, + { + "epoch": 1.3060253107850823, + "grad_norm": 1.2884507179260254, + "learning_rate": 
9.912842105263158e-05, + "loss": 0.4424, + "step": 23323 + }, + { + "epoch": 1.3060813080972113, + "grad_norm": 1.4340940713882446, + "learning_rate": 9.912815789473685e-05, + "loss": 0.3464, + "step": 23324 + }, + { + "epoch": 1.3061373054093404, + "grad_norm": 1.3988069295883179, + "learning_rate": 9.912789473684211e-05, + "loss": 0.4857, + "step": 23325 + }, + { + "epoch": 1.3061933027214694, + "grad_norm": 1.6385242938995361, + "learning_rate": 9.912763157894737e-05, + "loss": 0.4739, + "step": 23326 + }, + { + "epoch": 1.3062493000335984, + "grad_norm": 1.5921891927719116, + "learning_rate": 9.912736842105263e-05, + "loss": 0.4359, + "step": 23327 + }, + { + "epoch": 1.3063052973457274, + "grad_norm": 1.787548303604126, + "learning_rate": 9.91271052631579e-05, + "loss": 0.6647, + "step": 23328 + }, + { + "epoch": 1.3063612946578564, + "grad_norm": 1.3793574571609497, + "learning_rate": 9.912684210526317e-05, + "loss": 0.5143, + "step": 23329 + }, + { + "epoch": 1.3064172919699855, + "grad_norm": 1.6733791828155518, + "learning_rate": 9.912657894736843e-05, + "loss": 0.3727, + "step": 23330 + }, + { + "epoch": 1.3064732892821145, + "grad_norm": 1.2901532649993896, + "learning_rate": 9.912631578947368e-05, + "loss": 0.3639, + "step": 23331 + }, + { + "epoch": 1.3065292865942435, + "grad_norm": 1.5952303409576416, + "learning_rate": 9.912605263157894e-05, + "loss": 0.4403, + "step": 23332 + }, + { + "epoch": 1.3065852839063725, + "grad_norm": 1.2913167476654053, + "learning_rate": 9.912578947368422e-05, + "loss": 0.4291, + "step": 23333 + }, + { + "epoch": 1.3066412812185015, + "grad_norm": 1.5069353580474854, + "learning_rate": 9.912552631578948e-05, + "loss": 0.5659, + "step": 23334 + }, + { + "epoch": 1.3066972785306306, + "grad_norm": 1.4820927381515503, + "learning_rate": 9.912526315789474e-05, + "loss": 0.6137, + "step": 23335 + }, + { + "epoch": 1.3067532758427596, + "grad_norm": 1.560227870941162, + "learning_rate": 9.9125e-05, + "loss": 0.5614, + "step": 
23336 + }, + { + "epoch": 1.3068092731548886, + "grad_norm": 1.298736572265625, + "learning_rate": 9.912473684210527e-05, + "loss": 0.6134, + "step": 23337 + }, + { + "epoch": 1.3068652704670176, + "grad_norm": 1.2767888307571411, + "learning_rate": 9.912447368421053e-05, + "loss": 0.4978, + "step": 23338 + }, + { + "epoch": 1.3069212677791466, + "grad_norm": 1.344883680343628, + "learning_rate": 9.91242105263158e-05, + "loss": 0.4717, + "step": 23339 + }, + { + "epoch": 1.3069772650912757, + "grad_norm": 1.1105804443359375, + "learning_rate": 9.912394736842105e-05, + "loss": 0.4326, + "step": 23340 + }, + { + "epoch": 1.3070332624034047, + "grad_norm": 1.3717206716537476, + "learning_rate": 9.912368421052632e-05, + "loss": 0.4515, + "step": 23341 + }, + { + "epoch": 1.3070892597155337, + "grad_norm": 1.207067847251892, + "learning_rate": 9.912342105263158e-05, + "loss": 0.3183, + "step": 23342 + }, + { + "epoch": 1.3071452570276627, + "grad_norm": 1.5931912660598755, + "learning_rate": 9.912315789473686e-05, + "loss": 0.5249, + "step": 23343 + }, + { + "epoch": 1.3072012543397917, + "grad_norm": 1.3497254848480225, + "learning_rate": 9.912289473684212e-05, + "loss": 0.4493, + "step": 23344 + }, + { + "epoch": 1.3072572516519207, + "grad_norm": 1.2425850629806519, + "learning_rate": 9.912263157894738e-05, + "loss": 0.5717, + "step": 23345 + }, + { + "epoch": 1.3073132489640498, + "grad_norm": 1.7057499885559082, + "learning_rate": 9.912236842105264e-05, + "loss": 0.6663, + "step": 23346 + }, + { + "epoch": 1.3073692462761788, + "grad_norm": 1.1396676301956177, + "learning_rate": 9.912210526315791e-05, + "loss": 0.4206, + "step": 23347 + }, + { + "epoch": 1.3074252435883078, + "grad_norm": 1.438910961151123, + "learning_rate": 9.912184210526317e-05, + "loss": 0.521, + "step": 23348 + }, + { + "epoch": 1.3074812409004368, + "grad_norm": 1.5462173223495483, + "learning_rate": 9.912157894736841e-05, + "loss": 0.5551, + "step": 23349 + }, + { + "epoch": 
1.3075372382125658, + "grad_norm": 1.5258138179779053, + "learning_rate": 9.912131578947369e-05, + "loss": 0.4648, + "step": 23350 + }, + { + "epoch": 1.3075932355246949, + "grad_norm": 1.5073356628417969, + "learning_rate": 9.912105263157895e-05, + "loss": 0.4733, + "step": 23351 + }, + { + "epoch": 1.3076492328368239, + "grad_norm": 1.3298369646072388, + "learning_rate": 9.912078947368422e-05, + "loss": 0.5597, + "step": 23352 + }, + { + "epoch": 1.307705230148953, + "grad_norm": 1.5825093984603882, + "learning_rate": 9.912052631578948e-05, + "loss": 0.5574, + "step": 23353 + }, + { + "epoch": 1.307761227461082, + "grad_norm": 1.3182618618011475, + "learning_rate": 9.912026315789474e-05, + "loss": 0.5032, + "step": 23354 + }, + { + "epoch": 1.307817224773211, + "grad_norm": 1.4643890857696533, + "learning_rate": 9.912e-05, + "loss": 0.6005, + "step": 23355 + }, + { + "epoch": 1.30787322208534, + "grad_norm": 1.3440163135528564, + "learning_rate": 9.911973684210527e-05, + "loss": 0.6323, + "step": 23356 + }, + { + "epoch": 1.307929219397469, + "grad_norm": 1.2643388509750366, + "learning_rate": 9.911947368421053e-05, + "loss": 0.3923, + "step": 23357 + }, + { + "epoch": 1.307985216709598, + "grad_norm": 1.3173118829727173, + "learning_rate": 9.911921052631579e-05, + "loss": 0.5506, + "step": 23358 + }, + { + "epoch": 1.308041214021727, + "grad_norm": 1.325744867324829, + "learning_rate": 9.911894736842105e-05, + "loss": 0.3308, + "step": 23359 + }, + { + "epoch": 1.308097211333856, + "grad_norm": 1.4909019470214844, + "learning_rate": 9.911868421052633e-05, + "loss": 0.4128, + "step": 23360 + }, + { + "epoch": 1.308153208645985, + "grad_norm": 1.180491328239441, + "learning_rate": 9.911842105263159e-05, + "loss": 0.5487, + "step": 23361 + }, + { + "epoch": 1.308209205958114, + "grad_norm": 1.4190880060195923, + "learning_rate": 9.911815789473686e-05, + "loss": 0.3859, + "step": 23362 + }, + { + "epoch": 1.308265203270243, + "grad_norm": 1.1913870573043823, + 
"learning_rate": 9.91178947368421e-05, + "loss": 0.4741, + "step": 23363 + }, + { + "epoch": 1.3083212005823721, + "grad_norm": 1.2166470289230347, + "learning_rate": 9.911763157894738e-05, + "loss": 0.3879, + "step": 23364 + }, + { + "epoch": 1.3083771978945011, + "grad_norm": 1.200237512588501, + "learning_rate": 9.911736842105264e-05, + "loss": 0.4206, + "step": 23365 + }, + { + "epoch": 1.3084331952066302, + "grad_norm": 1.4765000343322754, + "learning_rate": 9.91171052631579e-05, + "loss": 0.4388, + "step": 23366 + }, + { + "epoch": 1.3084891925187592, + "grad_norm": 1.7343289852142334, + "learning_rate": 9.911684210526316e-05, + "loss": 0.5143, + "step": 23367 + }, + { + "epoch": 1.3085451898308882, + "grad_norm": 1.2377667427062988, + "learning_rate": 9.911657894736842e-05, + "loss": 0.4297, + "step": 23368 + }, + { + "epoch": 1.3086011871430172, + "grad_norm": 1.5643452405929565, + "learning_rate": 9.911631578947369e-05, + "loss": 0.4753, + "step": 23369 + }, + { + "epoch": 1.3086571844551462, + "grad_norm": 1.6922138929367065, + "learning_rate": 9.911605263157895e-05, + "loss": 0.4719, + "step": 23370 + }, + { + "epoch": 1.3087131817672752, + "grad_norm": 1.2513607740402222, + "learning_rate": 9.911578947368422e-05, + "loss": 0.401, + "step": 23371 + }, + { + "epoch": 1.3087691790794043, + "grad_norm": 1.266663908958435, + "learning_rate": 9.911552631578947e-05, + "loss": 0.4367, + "step": 23372 + }, + { + "epoch": 1.3088251763915333, + "grad_norm": 1.1690346002578735, + "learning_rate": 9.911526315789474e-05, + "loss": 0.3916, + "step": 23373 + }, + { + "epoch": 1.3088811737036623, + "grad_norm": 1.3350450992584229, + "learning_rate": 9.9115e-05, + "loss": 0.4463, + "step": 23374 + }, + { + "epoch": 1.3089371710157913, + "grad_norm": 1.3662041425704956, + "learning_rate": 9.911473684210528e-05, + "loss": 0.4163, + "step": 23375 + }, + { + "epoch": 1.3089931683279203, + "grad_norm": 1.4900022745132446, + "learning_rate": 9.911447368421054e-05, + "loss": 
0.4521, + "step": 23376 + }, + { + "epoch": 1.3090491656400491, + "grad_norm": 1.3184181451797485, + "learning_rate": 9.91142105263158e-05, + "loss": 0.4926, + "step": 23377 + }, + { + "epoch": 1.3091051629521782, + "grad_norm": 1.287375807762146, + "learning_rate": 9.911394736842105e-05, + "loss": 0.4877, + "step": 23378 + }, + { + "epoch": 1.3091611602643072, + "grad_norm": 1.5189950466156006, + "learning_rate": 9.911368421052633e-05, + "loss": 0.5017, + "step": 23379 + }, + { + "epoch": 1.3092171575764362, + "grad_norm": 1.565103530883789, + "learning_rate": 9.911342105263159e-05, + "loss": 0.6216, + "step": 23380 + }, + { + "epoch": 1.3092731548885652, + "grad_norm": 1.495690941810608, + "learning_rate": 9.911315789473685e-05, + "loss": 0.5092, + "step": 23381 + }, + { + "epoch": 1.3093291522006942, + "grad_norm": 1.5382283926010132, + "learning_rate": 9.911289473684211e-05, + "loss": 0.7018, + "step": 23382 + }, + { + "epoch": 1.3093851495128233, + "grad_norm": 1.5031582117080688, + "learning_rate": 9.911263157894737e-05, + "loss": 0.5123, + "step": 23383 + }, + { + "epoch": 1.3094411468249523, + "grad_norm": 1.1811401844024658, + "learning_rate": 9.911236842105264e-05, + "loss": 0.5173, + "step": 23384 + }, + { + "epoch": 1.3094971441370813, + "grad_norm": 1.1787773370742798, + "learning_rate": 9.91121052631579e-05, + "loss": 0.4346, + "step": 23385 + }, + { + "epoch": 1.3095531414492103, + "grad_norm": 1.3113309144973755, + "learning_rate": 9.911184210526316e-05, + "loss": 0.3488, + "step": 23386 + }, + { + "epoch": 1.3096091387613393, + "grad_norm": 1.4525781869888306, + "learning_rate": 9.911157894736842e-05, + "loss": 0.4528, + "step": 23387 + }, + { + "epoch": 1.3096651360734684, + "grad_norm": 1.2588186264038086, + "learning_rate": 9.911131578947369e-05, + "loss": 0.4124, + "step": 23388 + }, + { + "epoch": 1.3097211333855974, + "grad_norm": 1.700707197189331, + "learning_rate": 9.911105263157895e-05, + "loss": 0.4042, + "step": 23389 + }, + { + 
"epoch": 1.3097771306977264, + "grad_norm": 2.0704498291015625, + "learning_rate": 9.911078947368421e-05, + "loss": 0.5692, + "step": 23390 + }, + { + "epoch": 1.3098331280098554, + "grad_norm": 1.415603756904602, + "learning_rate": 9.911052631578947e-05, + "loss": 0.376, + "step": 23391 + }, + { + "epoch": 1.3098891253219844, + "grad_norm": 1.505025029182434, + "learning_rate": 9.911026315789475e-05, + "loss": 0.4533, + "step": 23392 + }, + { + "epoch": 1.3099451226341134, + "grad_norm": 1.4242792129516602, + "learning_rate": 9.911e-05, + "loss": 0.4603, + "step": 23393 + }, + { + "epoch": 1.3100011199462425, + "grad_norm": 1.1377774477005005, + "learning_rate": 9.910973684210528e-05, + "loss": 0.3962, + "step": 23394 + }, + { + "epoch": 1.3100571172583715, + "grad_norm": 1.831237554550171, + "learning_rate": 9.910947368421052e-05, + "loss": 0.4864, + "step": 23395 + }, + { + "epoch": 1.3101131145705005, + "grad_norm": 1.5969734191894531, + "learning_rate": 9.91092105263158e-05, + "loss": 0.4875, + "step": 23396 + }, + { + "epoch": 1.3101691118826295, + "grad_norm": 1.4190731048583984, + "learning_rate": 9.910894736842106e-05, + "loss": 0.4684, + "step": 23397 + }, + { + "epoch": 1.3102251091947585, + "grad_norm": 1.3128427267074585, + "learning_rate": 9.910868421052633e-05, + "loss": 0.5009, + "step": 23398 + }, + { + "epoch": 1.3102811065068876, + "grad_norm": 1.2901273965835571, + "learning_rate": 9.910842105263159e-05, + "loss": 0.5086, + "step": 23399 + }, + { + "epoch": 1.3103371038190166, + "grad_norm": 1.7611383199691772, + "learning_rate": 9.910815789473684e-05, + "loss": 0.461, + "step": 23400 + }, + { + "epoch": 1.3103931011311456, + "grad_norm": 1.4367409944534302, + "learning_rate": 9.910789473684211e-05, + "loss": 0.6223, + "step": 23401 + }, + { + "epoch": 1.3104490984432746, + "grad_norm": 1.338951587677002, + "learning_rate": 9.910763157894737e-05, + "loss": 0.4048, + "step": 23402 + }, + { + "epoch": 1.3105050957554036, + "grad_norm": 
1.4415680170059204, + "learning_rate": 9.910736842105264e-05, + "loss": 0.5566, + "step": 23403 + }, + { + "epoch": 1.3105610930675327, + "grad_norm": 1.296931266784668, + "learning_rate": 9.910710526315789e-05, + "loss": 0.5373, + "step": 23404 + }, + { + "epoch": 1.3106170903796617, + "grad_norm": 1.7117139101028442, + "learning_rate": 9.910684210526316e-05, + "loss": 0.4762, + "step": 23405 + }, + { + "epoch": 1.3106730876917907, + "grad_norm": 1.2670321464538574, + "learning_rate": 9.910657894736842e-05, + "loss": 0.4376, + "step": 23406 + }, + { + "epoch": 1.3107290850039197, + "grad_norm": 1.2396507263183594, + "learning_rate": 9.91063157894737e-05, + "loss": 0.5328, + "step": 23407 + }, + { + "epoch": 1.3107850823160487, + "grad_norm": 1.573288917541504, + "learning_rate": 9.910605263157895e-05, + "loss": 0.7282, + "step": 23408 + }, + { + "epoch": 1.3108410796281778, + "grad_norm": 1.1891690492630005, + "learning_rate": 9.910578947368421e-05, + "loss": 0.4956, + "step": 23409 + }, + { + "epoch": 1.3108970769403068, + "grad_norm": 1.2904771566390991, + "learning_rate": 9.910552631578947e-05, + "loss": 0.5151, + "step": 23410 + }, + { + "epoch": 1.3109530742524358, + "grad_norm": 1.278806209564209, + "learning_rate": 9.910526315789475e-05, + "loss": 0.5051, + "step": 23411 + }, + { + "epoch": 1.3110090715645648, + "grad_norm": 1.4368467330932617, + "learning_rate": 9.910500000000001e-05, + "loss": 0.447, + "step": 23412 + }, + { + "epoch": 1.3110650688766938, + "grad_norm": 1.5844480991363525, + "learning_rate": 9.910473684210527e-05, + "loss": 0.6849, + "step": 23413 + }, + { + "epoch": 1.3111210661888228, + "grad_norm": 1.3631752729415894, + "learning_rate": 9.910447368421053e-05, + "loss": 0.5, + "step": 23414 + }, + { + "epoch": 1.3111770635009519, + "grad_norm": 1.7622642517089844, + "learning_rate": 9.91042105263158e-05, + "loss": 0.7695, + "step": 23415 + }, + { + "epoch": 1.3112330608130809, + "grad_norm": 1.1892203092575073, + "learning_rate": 
9.910394736842106e-05, + "loss": 0.3501, + "step": 23416 + }, + { + "epoch": 1.31128905812521, + "grad_norm": 1.4493550062179565, + "learning_rate": 9.910368421052632e-05, + "loss": 0.5148, + "step": 23417 + }, + { + "epoch": 1.311345055437339, + "grad_norm": 2.0674352645874023, + "learning_rate": 9.910342105263158e-05, + "loss": 0.3919, + "step": 23418 + }, + { + "epoch": 1.311401052749468, + "grad_norm": 1.2321887016296387, + "learning_rate": 9.910315789473684e-05, + "loss": 0.4374, + "step": 23419 + }, + { + "epoch": 1.311457050061597, + "grad_norm": 1.5014137029647827, + "learning_rate": 9.910289473684211e-05, + "loss": 0.5516, + "step": 23420 + }, + { + "epoch": 1.311513047373726, + "grad_norm": 1.337637186050415, + "learning_rate": 9.910263157894737e-05, + "loss": 0.4281, + "step": 23421 + }, + { + "epoch": 1.311569044685855, + "grad_norm": 1.747450351715088, + "learning_rate": 9.910236842105263e-05, + "loss": 0.594, + "step": 23422 + }, + { + "epoch": 1.311625041997984, + "grad_norm": 1.2766363620758057, + "learning_rate": 9.910210526315789e-05, + "loss": 0.4447, + "step": 23423 + }, + { + "epoch": 1.311681039310113, + "grad_norm": 1.3487766981124878, + "learning_rate": 9.910184210526316e-05, + "loss": 0.5729, + "step": 23424 + }, + { + "epoch": 1.311737036622242, + "grad_norm": 1.3309218883514404, + "learning_rate": 9.910157894736842e-05, + "loss": 0.5082, + "step": 23425 + }, + { + "epoch": 1.311793033934371, + "grad_norm": 1.2421057224273682, + "learning_rate": 9.91013157894737e-05, + "loss": 0.423, + "step": 23426 + }, + { + "epoch": 1.3118490312465, + "grad_norm": 1.2512723207473755, + "learning_rate": 9.910105263157894e-05, + "loss": 0.3813, + "step": 23427 + }, + { + "epoch": 1.3119050285586291, + "grad_norm": 1.492981195449829, + "learning_rate": 9.910078947368422e-05, + "loss": 0.5883, + "step": 23428 + }, + { + "epoch": 1.3119610258707581, + "grad_norm": 1.3836313486099243, + "learning_rate": 9.910052631578948e-05, + "loss": 0.4316, + "step": 23429 
+ }, + { + "epoch": 1.3120170231828872, + "grad_norm": 1.6830849647521973, + "learning_rate": 9.910026315789475e-05, + "loss": 0.5704, + "step": 23430 + }, + { + "epoch": 1.3120730204950162, + "grad_norm": 1.978153944015503, + "learning_rate": 9.910000000000001e-05, + "loss": 0.5258, + "step": 23431 + }, + { + "epoch": 1.3121290178071452, + "grad_norm": 1.2363853454589844, + "learning_rate": 9.909973684210527e-05, + "loss": 0.4237, + "step": 23432 + }, + { + "epoch": 1.3121850151192742, + "grad_norm": 1.2318207025527954, + "learning_rate": 9.909947368421053e-05, + "loss": 0.4274, + "step": 23433 + }, + { + "epoch": 1.3122410124314032, + "grad_norm": 1.1699894666671753, + "learning_rate": 9.909921052631579e-05, + "loss": 0.4445, + "step": 23434 + }, + { + "epoch": 1.3122970097435323, + "grad_norm": 1.2759298086166382, + "learning_rate": 9.909894736842106e-05, + "loss": 0.4688, + "step": 23435 + }, + { + "epoch": 1.3123530070556613, + "grad_norm": 1.3733446598052979, + "learning_rate": 9.909868421052632e-05, + "loss": 0.4503, + "step": 23436 + }, + { + "epoch": 1.3124090043677903, + "grad_norm": 1.1985161304473877, + "learning_rate": 9.909842105263158e-05, + "loss": 0.469, + "step": 23437 + }, + { + "epoch": 1.3124650016799193, + "grad_norm": 1.4328134059906006, + "learning_rate": 9.909815789473684e-05, + "loss": 0.4739, + "step": 23438 + }, + { + "epoch": 1.3125209989920483, + "grad_norm": 1.2340866327285767, + "learning_rate": 9.909789473684211e-05, + "loss": 0.415, + "step": 23439 + }, + { + "epoch": 1.3125769963041773, + "grad_norm": 1.4035005569458008, + "learning_rate": 9.909763157894737e-05, + "loss": 0.3763, + "step": 23440 + }, + { + "epoch": 1.3126329936163064, + "grad_norm": 1.3108558654785156, + "learning_rate": 9.909736842105263e-05, + "loss": 0.371, + "step": 23441 + }, + { + "epoch": 1.3126889909284354, + "grad_norm": 1.7617404460906982, + "learning_rate": 9.90971052631579e-05, + "loss": 0.5191, + "step": 23442 + }, + { + "epoch": 1.3127449882405644, + 
"grad_norm": 1.3765901327133179, + "learning_rate": 9.909684210526317e-05, + "loss": 0.6761, + "step": 23443 + }, + { + "epoch": 1.3128009855526934, + "grad_norm": 1.166473150253296, + "learning_rate": 9.909657894736843e-05, + "loss": 0.4358, + "step": 23444 + }, + { + "epoch": 1.3128569828648224, + "grad_norm": 1.3755375146865845, + "learning_rate": 9.909631578947369e-05, + "loss": 0.4297, + "step": 23445 + }, + { + "epoch": 1.3129129801769515, + "grad_norm": 1.3300050497055054, + "learning_rate": 9.909605263157895e-05, + "loss": 0.4262, + "step": 23446 + }, + { + "epoch": 1.3129689774890805, + "grad_norm": 1.282021403312683, + "learning_rate": 9.909578947368422e-05, + "loss": 0.4449, + "step": 23447 + }, + { + "epoch": 1.3130249748012095, + "grad_norm": 1.2514032125473022, + "learning_rate": 9.909552631578948e-05, + "loss": 0.5067, + "step": 23448 + }, + { + "epoch": 1.3130809721133385, + "grad_norm": 1.432243824005127, + "learning_rate": 9.909526315789475e-05, + "loss": 0.4014, + "step": 23449 + }, + { + "epoch": 1.3131369694254675, + "grad_norm": 1.4824650287628174, + "learning_rate": 9.9095e-05, + "loss": 0.4567, + "step": 23450 + }, + { + "epoch": 1.3131929667375966, + "grad_norm": 2.4755280017852783, + "learning_rate": 9.909473684210526e-05, + "loss": 0.5068, + "step": 23451 + }, + { + "epoch": 1.3132489640497256, + "grad_norm": 1.7152857780456543, + "learning_rate": 9.909447368421053e-05, + "loss": 0.4997, + "step": 23452 + }, + { + "epoch": 1.3133049613618546, + "grad_norm": 1.4264469146728516, + "learning_rate": 9.909421052631579e-05, + "loss": 0.475, + "step": 23453 + }, + { + "epoch": 1.3133609586739836, + "grad_norm": 1.1713494062423706, + "learning_rate": 9.909394736842107e-05, + "loss": 0.317, + "step": 23454 + }, + { + "epoch": 1.3134169559861126, + "grad_norm": 1.0777666568756104, + "learning_rate": 9.909368421052631e-05, + "loss": 0.3438, + "step": 23455 + }, + { + "epoch": 1.3134729532982417, + "grad_norm": 1.5504034757614136, + "learning_rate": 
9.909342105263158e-05, + "loss": 0.7497, + "step": 23456 + }, + { + "epoch": 1.3135289506103707, + "grad_norm": 1.3663454055786133, + "learning_rate": 9.909315789473684e-05, + "loss": 0.4087, + "step": 23457 + }, + { + "epoch": 1.3135849479224997, + "grad_norm": 1.5986995697021484, + "learning_rate": 9.909289473684212e-05, + "loss": 0.5057, + "step": 23458 + }, + { + "epoch": 1.3136409452346287, + "grad_norm": 1.3435286283493042, + "learning_rate": 9.909263157894736e-05, + "loss": 0.3877, + "step": 23459 + }, + { + "epoch": 1.3136969425467577, + "grad_norm": 1.3977755308151245, + "learning_rate": 9.909236842105264e-05, + "loss": 0.5025, + "step": 23460 + }, + { + "epoch": 1.3137529398588867, + "grad_norm": 1.5699728727340698, + "learning_rate": 9.90921052631579e-05, + "loss": 0.4892, + "step": 23461 + }, + { + "epoch": 1.3138089371710158, + "grad_norm": 1.2305196523666382, + "learning_rate": 9.909184210526317e-05, + "loss": 0.4396, + "step": 23462 + }, + { + "epoch": 1.3138649344831448, + "grad_norm": 1.4703571796417236, + "learning_rate": 9.909157894736843e-05, + "loss": 0.4488, + "step": 23463 + }, + { + "epoch": 1.3139209317952738, + "grad_norm": 2.4165940284729004, + "learning_rate": 9.909131578947369e-05, + "loss": 0.4089, + "step": 23464 + }, + { + "epoch": 1.3139769291074028, + "grad_norm": 1.211911916732788, + "learning_rate": 9.909105263157895e-05, + "loss": 0.4259, + "step": 23465 + }, + { + "epoch": 1.3140329264195318, + "grad_norm": 1.1673394441604614, + "learning_rate": 9.909078947368422e-05, + "loss": 0.3949, + "step": 23466 + }, + { + "epoch": 1.3140889237316609, + "grad_norm": 1.3991273641586304, + "learning_rate": 9.909052631578948e-05, + "loss": 0.4416, + "step": 23467 + }, + { + "epoch": 1.3141449210437899, + "grad_norm": 9.993056297302246, + "learning_rate": 9.909026315789474e-05, + "loss": 0.5116, + "step": 23468 + }, + { + "epoch": 1.314200918355919, + "grad_norm": 1.3935984373092651, + "learning_rate": 9.909e-05, + "loss": 0.4229, + "step": 
23469 + }, + { + "epoch": 1.314256915668048, + "grad_norm": 1.576490044593811, + "learning_rate": 9.908973684210526e-05, + "loss": 0.5644, + "step": 23470 + }, + { + "epoch": 1.314312912980177, + "grad_norm": 1.3270258903503418, + "learning_rate": 9.908947368421053e-05, + "loss": 0.4396, + "step": 23471 + }, + { + "epoch": 1.314368910292306, + "grad_norm": 1.487135410308838, + "learning_rate": 9.90892105263158e-05, + "loss": 0.6049, + "step": 23472 + }, + { + "epoch": 1.314424907604435, + "grad_norm": 1.3678137063980103, + "learning_rate": 9.908894736842105e-05, + "loss": 0.5267, + "step": 23473 + }, + { + "epoch": 1.314480904916564, + "grad_norm": 2.1125478744506836, + "learning_rate": 9.908868421052631e-05, + "loss": 0.4868, + "step": 23474 + }, + { + "epoch": 1.314536902228693, + "grad_norm": 1.335037112236023, + "learning_rate": 9.908842105263159e-05, + "loss": 0.5048, + "step": 23475 + }, + { + "epoch": 1.314592899540822, + "grad_norm": 1.3628175258636475, + "learning_rate": 9.908815789473685e-05, + "loss": 0.4408, + "step": 23476 + }, + { + "epoch": 1.314648896852951, + "grad_norm": 1.4586461782455444, + "learning_rate": 9.90878947368421e-05, + "loss": 0.5362, + "step": 23477 + }, + { + "epoch": 1.31470489416508, + "grad_norm": 1.2521742582321167, + "learning_rate": 9.908763157894737e-05, + "loss": 0.4385, + "step": 23478 + }, + { + "epoch": 1.314760891477209, + "grad_norm": 1.889302134513855, + "learning_rate": 9.908736842105264e-05, + "loss": 0.7015, + "step": 23479 + }, + { + "epoch": 1.3148168887893381, + "grad_norm": 1.3009964227676392, + "learning_rate": 9.90871052631579e-05, + "loss": 0.4561, + "step": 23480 + }, + { + "epoch": 1.3148728861014671, + "grad_norm": 1.9913792610168457, + "learning_rate": 9.908684210526317e-05, + "loss": 0.4249, + "step": 23481 + }, + { + "epoch": 1.3149288834135962, + "grad_norm": 1.304996132850647, + "learning_rate": 9.908657894736842e-05, + "loss": 0.5047, + "step": 23482 + }, + { + "epoch": 1.3149848807257252, + 
"grad_norm": 1.3131805658340454, + "learning_rate": 9.908631578947369e-05, + "loss": 0.4517, + "step": 23483 + }, + { + "epoch": 1.3150408780378542, + "grad_norm": 1.6353704929351807, + "learning_rate": 9.908605263157895e-05, + "loss": 0.396, + "step": 23484 + }, + { + "epoch": 1.3150968753499832, + "grad_norm": 1.500420093536377, + "learning_rate": 9.908578947368423e-05, + "loss": 0.5526, + "step": 23485 + }, + { + "epoch": 1.3151528726621122, + "grad_norm": 1.3532007932662964, + "learning_rate": 9.908552631578948e-05, + "loss": 0.4128, + "step": 23486 + }, + { + "epoch": 1.3152088699742412, + "grad_norm": 1.5647958517074585, + "learning_rate": 9.908526315789473e-05, + "loss": 0.5633, + "step": 23487 + }, + { + "epoch": 1.3152648672863703, + "grad_norm": 1.302091121673584, + "learning_rate": 9.9085e-05, + "loss": 0.4586, + "step": 23488 + }, + { + "epoch": 1.3153208645984993, + "grad_norm": 1.4732964038848877, + "learning_rate": 9.908473684210526e-05, + "loss": 0.5431, + "step": 23489 + }, + { + "epoch": 1.3153768619106283, + "grad_norm": 1.4219179153442383, + "learning_rate": 9.908447368421054e-05, + "loss": 0.5483, + "step": 23490 + }, + { + "epoch": 1.3154328592227573, + "grad_norm": 1.3076339960098267, + "learning_rate": 9.90842105263158e-05, + "loss": 0.4174, + "step": 23491 + }, + { + "epoch": 1.3154888565348863, + "grad_norm": 1.2017974853515625, + "learning_rate": 9.908394736842106e-05, + "loss": 0.4726, + "step": 23492 + }, + { + "epoch": 1.3155448538470154, + "grad_norm": 1.4847337007522583, + "learning_rate": 9.908368421052632e-05, + "loss": 0.5449, + "step": 23493 + }, + { + "epoch": 1.3156008511591444, + "grad_norm": 1.6240051984786987, + "learning_rate": 9.908342105263159e-05, + "loss": 0.4613, + "step": 23494 + }, + { + "epoch": 1.3156568484712734, + "grad_norm": 1.3387105464935303, + "learning_rate": 9.908315789473685e-05, + "loss": 0.5762, + "step": 23495 + }, + { + "epoch": 1.3157128457834024, + "grad_norm": 1.5709962844848633, + "learning_rate": 
9.908289473684211e-05, + "loss": 0.4868, + "step": 23496 + }, + { + "epoch": 1.3157688430955314, + "grad_norm": 1.442383885383606, + "learning_rate": 9.908263157894737e-05, + "loss": 0.516, + "step": 23497 + }, + { + "epoch": 1.3158248404076605, + "grad_norm": 1.1911067962646484, + "learning_rate": 9.908236842105264e-05, + "loss": 0.4622, + "step": 23498 + }, + { + "epoch": 1.3158808377197895, + "grad_norm": 1.2875454425811768, + "learning_rate": 9.90821052631579e-05, + "loss": 0.3478, + "step": 23499 + }, + { + "epoch": 1.3159368350319185, + "grad_norm": 1.450999140739441, + "learning_rate": 9.908184210526316e-05, + "loss": 0.3776, + "step": 23500 + }, + { + "epoch": 1.3159928323440475, + "grad_norm": 1.3420605659484863, + "learning_rate": 9.908157894736842e-05, + "loss": 0.3914, + "step": 23501 + }, + { + "epoch": 1.3160488296561765, + "grad_norm": 1.3781684637069702, + "learning_rate": 9.90813157894737e-05, + "loss": 0.4826, + "step": 23502 + }, + { + "epoch": 1.3161048269683056, + "grad_norm": 1.4783337116241455, + "learning_rate": 9.908105263157895e-05, + "loss": 0.4982, + "step": 23503 + }, + { + "epoch": 1.3161608242804346, + "grad_norm": 1.4523431062698364, + "learning_rate": 9.908078947368421e-05, + "loss": 0.4266, + "step": 23504 + }, + { + "epoch": 1.3162168215925636, + "grad_norm": 1.678612470626831, + "learning_rate": 9.908052631578947e-05, + "loss": 0.5663, + "step": 23505 + }, + { + "epoch": 1.3162728189046926, + "grad_norm": 1.7157851457595825, + "learning_rate": 9.908026315789473e-05, + "loss": 0.4389, + "step": 23506 + }, + { + "epoch": 1.3163288162168216, + "grad_norm": 1.1345454454421997, + "learning_rate": 9.908000000000001e-05, + "loss": 0.3387, + "step": 23507 + }, + { + "epoch": 1.3163848135289506, + "grad_norm": 1.5600314140319824, + "learning_rate": 9.907973684210527e-05, + "loss": 0.4027, + "step": 23508 + }, + { + "epoch": 1.3164408108410797, + "grad_norm": 1.4446918964385986, + "learning_rate": 9.907947368421054e-05, + "loss": 0.3934, + 
"step": 23509 + }, + { + "epoch": 1.3164968081532087, + "grad_norm": 1.4340838193893433, + "learning_rate": 9.907921052631579e-05, + "loss": 0.4871, + "step": 23510 + }, + { + "epoch": 1.3165528054653377, + "grad_norm": 1.360880732536316, + "learning_rate": 9.907894736842106e-05, + "loss": 0.4362, + "step": 23511 + }, + { + "epoch": 1.3166088027774667, + "grad_norm": 1.3731484413146973, + "learning_rate": 9.907868421052632e-05, + "loss": 0.3982, + "step": 23512 + }, + { + "epoch": 1.3166648000895957, + "grad_norm": 1.2852195501327515, + "learning_rate": 9.907842105263159e-05, + "loss": 0.4248, + "step": 23513 + }, + { + "epoch": 1.3167207974017248, + "grad_norm": 1.6905219554901123, + "learning_rate": 9.907815789473684e-05, + "loss": 0.466, + "step": 23514 + }, + { + "epoch": 1.3167767947138538, + "grad_norm": 1.1920872926712036, + "learning_rate": 9.907789473684211e-05, + "loss": 0.3411, + "step": 23515 + }, + { + "epoch": 1.3168327920259828, + "grad_norm": 1.4498956203460693, + "learning_rate": 9.907763157894737e-05, + "loss": 0.4507, + "step": 23516 + }, + { + "epoch": 1.3168887893381118, + "grad_norm": 1.4717071056365967, + "learning_rate": 9.907736842105264e-05, + "loss": 0.635, + "step": 23517 + }, + { + "epoch": 1.3169447866502408, + "grad_norm": 1.6787028312683105, + "learning_rate": 9.90771052631579e-05, + "loss": 0.4732, + "step": 23518 + }, + { + "epoch": 1.3170007839623699, + "grad_norm": 1.3689689636230469, + "learning_rate": 9.907684210526316e-05, + "loss": 0.3653, + "step": 23519 + }, + { + "epoch": 1.3170567812744989, + "grad_norm": 1.5186647176742554, + "learning_rate": 9.907657894736842e-05, + "loss": 0.5114, + "step": 23520 + }, + { + "epoch": 1.317112778586628, + "grad_norm": 1.6663926839828491, + "learning_rate": 9.907631578947368e-05, + "loss": 0.5142, + "step": 23521 + }, + { + "epoch": 1.317168775898757, + "grad_norm": 1.1840616464614868, + "learning_rate": 9.907605263157896e-05, + "loss": 0.3912, + "step": 23522 + }, + { + "epoch": 
1.317224773210886, + "grad_norm": 1.6792411804199219, + "learning_rate": 9.907578947368422e-05, + "loss": 0.4961, + "step": 23523 + }, + { + "epoch": 1.317280770523015, + "grad_norm": 1.4093555212020874, + "learning_rate": 9.907552631578948e-05, + "loss": 0.4001, + "step": 23524 + }, + { + "epoch": 1.317336767835144, + "grad_norm": 1.30805242061615, + "learning_rate": 9.907526315789474e-05, + "loss": 0.4575, + "step": 23525 + }, + { + "epoch": 1.317392765147273, + "grad_norm": 1.1401379108428955, + "learning_rate": 9.907500000000001e-05, + "loss": 0.2999, + "step": 23526 + }, + { + "epoch": 1.317448762459402, + "grad_norm": 1.3849704265594482, + "learning_rate": 9.907473684210527e-05, + "loss": 0.419, + "step": 23527 + }, + { + "epoch": 1.317504759771531, + "grad_norm": 1.422619104385376, + "learning_rate": 9.907447368421053e-05, + "loss": 0.4429, + "step": 23528 + }, + { + "epoch": 1.31756075708366, + "grad_norm": 2.4414379596710205, + "learning_rate": 9.907421052631579e-05, + "loss": 0.5035, + "step": 23529 + }, + { + "epoch": 1.317616754395789, + "grad_norm": 1.6046645641326904, + "learning_rate": 9.907394736842106e-05, + "loss": 0.4716, + "step": 23530 + }, + { + "epoch": 1.317672751707918, + "grad_norm": 1.6596109867095947, + "learning_rate": 9.907368421052632e-05, + "loss": 0.6331, + "step": 23531 + }, + { + "epoch": 1.317728749020047, + "grad_norm": 1.4059385061264038, + "learning_rate": 9.907342105263158e-05, + "loss": 0.6395, + "step": 23532 + }, + { + "epoch": 1.3177847463321761, + "grad_norm": 1.3787490129470825, + "learning_rate": 9.907315789473684e-05, + "loss": 0.4719, + "step": 23533 + }, + { + "epoch": 1.3178407436443051, + "grad_norm": 1.3036671876907349, + "learning_rate": 9.907289473684211e-05, + "loss": 0.5903, + "step": 23534 + }, + { + "epoch": 1.3178967409564342, + "grad_norm": 1.2987549304962158, + "learning_rate": 9.907263157894737e-05, + "loss": 0.4609, + "step": 23535 + }, + { + "epoch": 1.3179527382685632, + "grad_norm": 
1.3021806478500366, + "learning_rate": 9.907236842105265e-05, + "loss": 0.4061, + "step": 23536 + }, + { + "epoch": 1.3180087355806922, + "grad_norm": 1.3273011445999146, + "learning_rate": 9.90721052631579e-05, + "loss": 0.5937, + "step": 23537 + }, + { + "epoch": 1.3180647328928212, + "grad_norm": 1.6703534126281738, + "learning_rate": 9.907184210526315e-05, + "loss": 0.6635, + "step": 23538 + }, + { + "epoch": 1.3181207302049502, + "grad_norm": 1.2274681329727173, + "learning_rate": 9.907157894736843e-05, + "loss": 0.418, + "step": 23539 + }, + { + "epoch": 1.3181767275170793, + "grad_norm": 1.3198310136795044, + "learning_rate": 9.907131578947369e-05, + "loss": 0.516, + "step": 23540 + }, + { + "epoch": 1.3182327248292083, + "grad_norm": 0.9746121168136597, + "learning_rate": 9.907105263157896e-05, + "loss": 0.3675, + "step": 23541 + }, + { + "epoch": 1.3182887221413373, + "grad_norm": 1.48617684841156, + "learning_rate": 9.90707894736842e-05, + "loss": 0.4091, + "step": 23542 + }, + { + "epoch": 1.3183447194534663, + "grad_norm": 1.5356407165527344, + "learning_rate": 9.907052631578948e-05, + "loss": 0.5064, + "step": 23543 + }, + { + "epoch": 1.3184007167655953, + "grad_norm": 1.353983998298645, + "learning_rate": 9.907026315789474e-05, + "loss": 0.5867, + "step": 23544 + }, + { + "epoch": 1.3184567140777244, + "grad_norm": 1.3710651397705078, + "learning_rate": 9.907000000000001e-05, + "loss": 0.515, + "step": 23545 + }, + { + "epoch": 1.3185127113898534, + "grad_norm": 1.3370361328125, + "learning_rate": 9.906973684210527e-05, + "loss": 0.3541, + "step": 23546 + }, + { + "epoch": 1.3185687087019824, + "grad_norm": 1.3643617630004883, + "learning_rate": 9.906947368421053e-05, + "loss": 0.3977, + "step": 23547 + }, + { + "epoch": 1.3186247060141114, + "grad_norm": 1.2992652654647827, + "learning_rate": 9.906921052631579e-05, + "loss": 0.5181, + "step": 23548 + }, + { + "epoch": 1.3186807033262404, + "grad_norm": 2.7935121059417725, + "learning_rate": 
9.906894736842106e-05, + "loss": 0.4438, + "step": 23549 + }, + { + "epoch": 1.3187367006383695, + "grad_norm": 1.092849612236023, + "learning_rate": 9.906868421052632e-05, + "loss": 0.3636, + "step": 23550 + }, + { + "epoch": 1.3187926979504985, + "grad_norm": 1.3572300672531128, + "learning_rate": 9.906842105263158e-05, + "loss": 0.3727, + "step": 23551 + }, + { + "epoch": 1.3188486952626275, + "grad_norm": 1.3177443742752075, + "learning_rate": 9.906815789473684e-05, + "loss": 0.4337, + "step": 23552 + }, + { + "epoch": 1.3189046925747565, + "grad_norm": 1.4217324256896973, + "learning_rate": 9.906789473684212e-05, + "loss": 0.4441, + "step": 23553 + }, + { + "epoch": 1.3189606898868855, + "grad_norm": 1.5611952543258667, + "learning_rate": 9.906763157894738e-05, + "loss": 0.4666, + "step": 23554 + }, + { + "epoch": 1.3190166871990145, + "grad_norm": 1.3093900680541992, + "learning_rate": 9.906736842105264e-05, + "loss": 0.4245, + "step": 23555 + }, + { + "epoch": 1.3190726845111436, + "grad_norm": 1.3968415260314941, + "learning_rate": 9.90671052631579e-05, + "loss": 0.4779, + "step": 23556 + }, + { + "epoch": 1.3191286818232726, + "grad_norm": 1.3176441192626953, + "learning_rate": 9.906684210526316e-05, + "loss": 0.5966, + "step": 23557 + }, + { + "epoch": 1.3191846791354016, + "grad_norm": 1.1914488077163696, + "learning_rate": 9.906657894736843e-05, + "loss": 0.3269, + "step": 23558 + }, + { + "epoch": 1.3192406764475306, + "grad_norm": 1.5402307510375977, + "learning_rate": 9.906631578947369e-05, + "loss": 0.4766, + "step": 23559 + }, + { + "epoch": 1.3192966737596596, + "grad_norm": 1.4681423902511597, + "learning_rate": 9.906605263157895e-05, + "loss": 0.6561, + "step": 23560 + }, + { + "epoch": 1.3193526710717887, + "grad_norm": 1.2147117853164673, + "learning_rate": 9.906578947368421e-05, + "loss": 0.3518, + "step": 23561 + }, + { + "epoch": 1.3194086683839177, + "grad_norm": 1.1922838687896729, + "learning_rate": 9.906552631578948e-05, + "loss": 
0.4168, + "step": 23562 + }, + { + "epoch": 1.3194646656960467, + "grad_norm": 1.4813134670257568, + "learning_rate": 9.906526315789474e-05, + "loss": 0.4051, + "step": 23563 + }, + { + "epoch": 1.3195206630081757, + "grad_norm": 1.6255486011505127, + "learning_rate": 9.9065e-05, + "loss": 0.6109, + "step": 23564 + }, + { + "epoch": 1.3195766603203047, + "grad_norm": 1.5070154666900635, + "learning_rate": 9.906473684210526e-05, + "loss": 0.5359, + "step": 23565 + }, + { + "epoch": 1.3196326576324338, + "grad_norm": 1.2078232765197754, + "learning_rate": 9.906447368421053e-05, + "loss": 0.5, + "step": 23566 + }, + { + "epoch": 1.3196886549445628, + "grad_norm": 1.221826195716858, + "learning_rate": 9.90642105263158e-05, + "loss": 0.4715, + "step": 23567 + }, + { + "epoch": 1.3197446522566918, + "grad_norm": 1.1665562391281128, + "learning_rate": 9.906394736842107e-05, + "loss": 0.4743, + "step": 23568 + }, + { + "epoch": 1.3198006495688208, + "grad_norm": 1.7769436836242676, + "learning_rate": 9.906368421052631e-05, + "loss": 0.651, + "step": 23569 + }, + { + "epoch": 1.3198566468809498, + "grad_norm": 1.6282001733779907, + "learning_rate": 9.906342105263159e-05, + "loss": 0.5087, + "step": 23570 + }, + { + "epoch": 1.3199126441930789, + "grad_norm": 1.4367485046386719, + "learning_rate": 9.906315789473685e-05, + "loss": 0.5163, + "step": 23571 + }, + { + "epoch": 1.3199686415052079, + "grad_norm": 1.1818881034851074, + "learning_rate": 9.90628947368421e-05, + "loss": 0.4329, + "step": 23572 + }, + { + "epoch": 1.320024638817337, + "grad_norm": 1.3425483703613281, + "learning_rate": 9.906263157894738e-05, + "loss": 0.4593, + "step": 23573 + }, + { + "epoch": 1.320080636129466, + "grad_norm": 1.2086644172668457, + "learning_rate": 9.906236842105263e-05, + "loss": 0.497, + "step": 23574 + }, + { + "epoch": 1.320136633441595, + "grad_norm": 1.4390406608581543, + "learning_rate": 9.90621052631579e-05, + "loss": 0.607, + "step": 23575 + }, + { + "epoch": 
1.320192630753724, + "grad_norm": 1.5852943658828735, + "learning_rate": 9.906184210526316e-05, + "loss": 0.4744, + "step": 23576 + }, + { + "epoch": 1.320248628065853, + "grad_norm": 1.4295347929000854, + "learning_rate": 9.906157894736843e-05, + "loss": 0.3604, + "step": 23577 + }, + { + "epoch": 1.320304625377982, + "grad_norm": 1.4102352857589722, + "learning_rate": 9.906131578947369e-05, + "loss": 0.4666, + "step": 23578 + }, + { + "epoch": 1.320360622690111, + "grad_norm": 1.2955905199050903, + "learning_rate": 9.906105263157895e-05, + "loss": 0.3701, + "step": 23579 + }, + { + "epoch": 1.32041662000224, + "grad_norm": 1.466480016708374, + "learning_rate": 9.906078947368421e-05, + "loss": 0.7739, + "step": 23580 + }, + { + "epoch": 1.320472617314369, + "grad_norm": 1.4667768478393555, + "learning_rate": 9.906052631578948e-05, + "loss": 0.6104, + "step": 23581 + }, + { + "epoch": 1.320528614626498, + "grad_norm": 1.45987868309021, + "learning_rate": 9.906026315789474e-05, + "loss": 0.5134, + "step": 23582 + }, + { + "epoch": 1.320584611938627, + "grad_norm": 1.2193151712417603, + "learning_rate": 9.906e-05, + "loss": 0.5586, + "step": 23583 + }, + { + "epoch": 1.3206406092507559, + "grad_norm": 1.5995523929595947, + "learning_rate": 9.905973684210526e-05, + "loss": 0.4621, + "step": 23584 + }, + { + "epoch": 1.320696606562885, + "grad_norm": 1.2353379726409912, + "learning_rate": 9.905947368421054e-05, + "loss": 0.514, + "step": 23585 + }, + { + "epoch": 1.320752603875014, + "grad_norm": 1.2061514854431152, + "learning_rate": 9.90592105263158e-05, + "loss": 0.4795, + "step": 23586 + }, + { + "epoch": 1.320808601187143, + "grad_norm": 1.2943384647369385, + "learning_rate": 9.905894736842106e-05, + "loss": 0.3866, + "step": 23587 + }, + { + "epoch": 1.320864598499272, + "grad_norm": 1.351973056793213, + "learning_rate": 9.905868421052632e-05, + "loss": 0.4919, + "step": 23588 + }, + { + "epoch": 1.320920595811401, + "grad_norm": 1.2900298833847046, + 
"learning_rate": 9.905842105263159e-05, + "loss": 0.3786, + "step": 23589 + }, + { + "epoch": 1.32097659312353, + "grad_norm": 1.6199102401733398, + "learning_rate": 9.905815789473685e-05, + "loss": 0.523, + "step": 23590 + }, + { + "epoch": 1.321032590435659, + "grad_norm": 1.2421436309814453, + "learning_rate": 9.905789473684211e-05, + "loss": 0.5326, + "step": 23591 + }, + { + "epoch": 1.321088587747788, + "grad_norm": 1.2802703380584717, + "learning_rate": 9.905763157894737e-05, + "loss": 0.4736, + "step": 23592 + }, + { + "epoch": 1.321144585059917, + "grad_norm": 1.3789219856262207, + "learning_rate": 9.905736842105263e-05, + "loss": 0.5246, + "step": 23593 + }, + { + "epoch": 1.321200582372046, + "grad_norm": 2.457754373550415, + "learning_rate": 9.90571052631579e-05, + "loss": 0.6164, + "step": 23594 + }, + { + "epoch": 1.321256579684175, + "grad_norm": 1.5327216386795044, + "learning_rate": 9.905684210526316e-05, + "loss": 0.4155, + "step": 23595 + }, + { + "epoch": 1.3213125769963041, + "grad_norm": 1.3117456436157227, + "learning_rate": 9.905657894736843e-05, + "loss": 0.502, + "step": 23596 + }, + { + "epoch": 1.3213685743084331, + "grad_norm": 1.27993905544281, + "learning_rate": 9.905631578947368e-05, + "loss": 0.4094, + "step": 23597 + }, + { + "epoch": 1.3214245716205621, + "grad_norm": 1.54668128490448, + "learning_rate": 9.905605263157895e-05, + "loss": 0.3735, + "step": 23598 + }, + { + "epoch": 1.3214805689326912, + "grad_norm": 2.8389892578125, + "learning_rate": 9.905578947368421e-05, + "loss": 0.5307, + "step": 23599 + }, + { + "epoch": 1.3215365662448202, + "grad_norm": 1.3819185495376587, + "learning_rate": 9.905552631578949e-05, + "loss": 0.5652, + "step": 23600 + }, + { + "epoch": 1.3215925635569492, + "grad_norm": 1.5571867227554321, + "learning_rate": 9.905526315789475e-05, + "loss": 0.5315, + "step": 23601 + }, + { + "epoch": 1.3216485608690782, + "grad_norm": 1.5231329202651978, + "learning_rate": 9.9055e-05, + "loss": 0.4304, + 
"step": 23602 + }, + { + "epoch": 1.3217045581812072, + "grad_norm": 1.5173362493515015, + "learning_rate": 9.905473684210527e-05, + "loss": 0.3471, + "step": 23603 + }, + { + "epoch": 1.3217605554933363, + "grad_norm": 1.2694761753082275, + "learning_rate": 9.905447368421054e-05, + "loss": 0.5271, + "step": 23604 + }, + { + "epoch": 1.3218165528054653, + "grad_norm": 1.2435005903244019, + "learning_rate": 9.90542105263158e-05, + "loss": 0.3912, + "step": 23605 + }, + { + "epoch": 1.3218725501175943, + "grad_norm": 1.2682331800460815, + "learning_rate": 9.905394736842106e-05, + "loss": 0.4828, + "step": 23606 + }, + { + "epoch": 1.3219285474297233, + "grad_norm": 1.5103245973587036, + "learning_rate": 9.905368421052632e-05, + "loss": 0.4391, + "step": 23607 + }, + { + "epoch": 1.3219845447418523, + "grad_norm": 1.295146107673645, + "learning_rate": 9.905342105263158e-05, + "loss": 0.3748, + "step": 23608 + }, + { + "epoch": 1.3220405420539814, + "grad_norm": 1.4384539127349854, + "learning_rate": 9.905315789473685e-05, + "loss": 0.5867, + "step": 23609 + }, + { + "epoch": 1.3220965393661104, + "grad_norm": 1.448047399520874, + "learning_rate": 9.905289473684211e-05, + "loss": 0.4441, + "step": 23610 + }, + { + "epoch": 1.3221525366782394, + "grad_norm": 1.8027104139328003, + "learning_rate": 9.905263157894737e-05, + "loss": 0.5361, + "step": 23611 + }, + { + "epoch": 1.3222085339903684, + "grad_norm": 1.6681878566741943, + "learning_rate": 9.905236842105263e-05, + "loss": 0.5267, + "step": 23612 + }, + { + "epoch": 1.3222645313024974, + "grad_norm": 1.4393134117126465, + "learning_rate": 9.90521052631579e-05, + "loss": 0.7328, + "step": 23613 + }, + { + "epoch": 1.3223205286146265, + "grad_norm": 1.2529213428497314, + "learning_rate": 9.905184210526316e-05, + "loss": 0.4681, + "step": 23614 + }, + { + "epoch": 1.3223765259267555, + "grad_norm": 1.8222781419754028, + "learning_rate": 9.905157894736842e-05, + "loss": 0.5243, + "step": 23615 + }, + { + "epoch": 
1.3224325232388845, + "grad_norm": 1.4331270456314087, + "learning_rate": 9.905131578947368e-05, + "loss": 0.4813, + "step": 23616 + }, + { + "epoch": 1.3224885205510135, + "grad_norm": 1.3592449426651, + "learning_rate": 9.905105263157896e-05, + "loss": 0.5771, + "step": 23617 + }, + { + "epoch": 1.3225445178631425, + "grad_norm": 1.7799861431121826, + "learning_rate": 9.905078947368422e-05, + "loss": 0.6256, + "step": 23618 + }, + { + "epoch": 1.3226005151752716, + "grad_norm": 1.8847559690475464, + "learning_rate": 9.905052631578948e-05, + "loss": 0.6124, + "step": 23619 + }, + { + "epoch": 1.3226565124874006, + "grad_norm": 1.9002000093460083, + "learning_rate": 9.905026315789474e-05, + "loss": 0.5131, + "step": 23620 + }, + { + "epoch": 1.3227125097995296, + "grad_norm": 1.2628037929534912, + "learning_rate": 9.905000000000001e-05, + "loss": 0.5107, + "step": 23621 + }, + { + "epoch": 1.3227685071116586, + "grad_norm": 1.39218270778656, + "learning_rate": 9.904973684210527e-05, + "loss": 0.4912, + "step": 23622 + }, + { + "epoch": 1.3228245044237876, + "grad_norm": 1.3301101922988892, + "learning_rate": 9.904947368421054e-05, + "loss": 0.4268, + "step": 23623 + }, + { + "epoch": 1.3228805017359166, + "grad_norm": 1.2446775436401367, + "learning_rate": 9.904921052631579e-05, + "loss": 0.3518, + "step": 23624 + }, + { + "epoch": 1.3229364990480457, + "grad_norm": 1.6048548221588135, + "learning_rate": 9.904894736842105e-05, + "loss": 0.5149, + "step": 23625 + }, + { + "epoch": 1.3229924963601747, + "grad_norm": 1.354109764099121, + "learning_rate": 9.904868421052632e-05, + "loss": 0.4675, + "step": 23626 + }, + { + "epoch": 1.3230484936723037, + "grad_norm": 1.8846253156661987, + "learning_rate": 9.904842105263158e-05, + "loss": 0.48, + "step": 23627 + }, + { + "epoch": 1.3231044909844327, + "grad_norm": 1.7262190580368042, + "learning_rate": 9.904815789473685e-05, + "loss": 0.6207, + "step": 23628 + }, + { + "epoch": 1.3231604882965617, + "grad_norm": 
1.422507405281067, + "learning_rate": 9.90478947368421e-05, + "loss": 0.4041, + "step": 23629 + }, + { + "epoch": 1.3232164856086908, + "grad_norm": 1.5060955286026, + "learning_rate": 9.904763157894737e-05, + "loss": 0.454, + "step": 23630 + }, + { + "epoch": 1.3232724829208198, + "grad_norm": 1.2696168422698975, + "learning_rate": 9.904736842105263e-05, + "loss": 0.4178, + "step": 23631 + }, + { + "epoch": 1.3233284802329488, + "grad_norm": 18.032329559326172, + "learning_rate": 9.90471052631579e-05, + "loss": 0.4343, + "step": 23632 + }, + { + "epoch": 1.3233844775450778, + "grad_norm": 1.286111831665039, + "learning_rate": 9.904684210526317e-05, + "loss": 0.5, + "step": 23633 + }, + { + "epoch": 1.3234404748572068, + "grad_norm": 1.2645292282104492, + "learning_rate": 9.904657894736843e-05, + "loss": 0.3458, + "step": 23634 + }, + { + "epoch": 1.3234964721693359, + "grad_norm": 1.7569935321807861, + "learning_rate": 9.904631578947369e-05, + "loss": 0.4716, + "step": 23635 + }, + { + "epoch": 1.3235524694814649, + "grad_norm": 5.75446891784668, + "learning_rate": 9.904605263157896e-05, + "loss": 0.4636, + "step": 23636 + }, + { + "epoch": 1.323608466793594, + "grad_norm": 1.3120683431625366, + "learning_rate": 9.904578947368422e-05, + "loss": 0.5186, + "step": 23637 + }, + { + "epoch": 1.323664464105723, + "grad_norm": 1.477587103843689, + "learning_rate": 9.904552631578948e-05, + "loss": 0.3833, + "step": 23638 + }, + { + "epoch": 1.323720461417852, + "grad_norm": 2.1010634899139404, + "learning_rate": 9.904526315789474e-05, + "loss": 0.5562, + "step": 23639 + }, + { + "epoch": 1.323776458729981, + "grad_norm": 1.5339431762695312, + "learning_rate": 9.904500000000001e-05, + "loss": 0.4913, + "step": 23640 + }, + { + "epoch": 1.32383245604211, + "grad_norm": 1.4985063076019287, + "learning_rate": 9.904473684210527e-05, + "loss": 0.4716, + "step": 23641 + }, + { + "epoch": 1.323888453354239, + "grad_norm": 1.2650494575500488, + "learning_rate": 
9.904447368421053e-05, + "loss": 0.4502, + "step": 23642 + }, + { + "epoch": 1.323944450666368, + "grad_norm": 1.399677038192749, + "learning_rate": 9.904421052631579e-05, + "loss": 0.453, + "step": 23643 + }, + { + "epoch": 1.324000447978497, + "grad_norm": 1.3583924770355225, + "learning_rate": 9.904394736842105e-05, + "loss": 0.4261, + "step": 23644 + }, + { + "epoch": 1.324056445290626, + "grad_norm": 1.2457411289215088, + "learning_rate": 9.904368421052632e-05, + "loss": 0.4543, + "step": 23645 + }, + { + "epoch": 1.324112442602755, + "grad_norm": 1.3103523254394531, + "learning_rate": 9.904342105263158e-05, + "loss": 0.4961, + "step": 23646 + }, + { + "epoch": 1.324168439914884, + "grad_norm": 2.0466365814208984, + "learning_rate": 9.904315789473684e-05, + "loss": 0.5445, + "step": 23647 + }, + { + "epoch": 1.324224437227013, + "grad_norm": 1.5205439329147339, + "learning_rate": 9.90428947368421e-05, + "loss": 0.4668, + "step": 23648 + }, + { + "epoch": 1.3242804345391421, + "grad_norm": 1.236509919166565, + "learning_rate": 9.904263157894738e-05, + "loss": 0.428, + "step": 23649 + }, + { + "epoch": 1.3243364318512711, + "grad_norm": 1.7488242387771606, + "learning_rate": 9.904236842105264e-05, + "loss": 0.523, + "step": 23650 + }, + { + "epoch": 1.3243924291634002, + "grad_norm": 1.2344638109207153, + "learning_rate": 9.904210526315791e-05, + "loss": 0.6077, + "step": 23651 + }, + { + "epoch": 1.3244484264755292, + "grad_norm": 1.4946460723876953, + "learning_rate": 9.904184210526316e-05, + "loss": 0.4272, + "step": 23652 + }, + { + "epoch": 1.3245044237876582, + "grad_norm": 1.2750486135482788, + "learning_rate": 9.904157894736843e-05, + "loss": 0.3809, + "step": 23653 + }, + { + "epoch": 1.3245604210997872, + "grad_norm": 1.2810460329055786, + "learning_rate": 9.904131578947369e-05, + "loss": 0.45, + "step": 23654 + }, + { + "epoch": 1.3246164184119162, + "grad_norm": 1.2462742328643799, + "learning_rate": 9.904105263157896e-05, + "loss": 0.452, + "step": 
23655 + }, + { + "epoch": 1.3246724157240453, + "grad_norm": 1.3404079675674438, + "learning_rate": 9.904078947368422e-05, + "loss": 0.426, + "step": 23656 + }, + { + "epoch": 1.3247284130361743, + "grad_norm": 2.8471829891204834, + "learning_rate": 9.904052631578948e-05, + "loss": 0.4118, + "step": 23657 + }, + { + "epoch": 1.3247844103483033, + "grad_norm": 1.2869397401809692, + "learning_rate": 9.904026315789474e-05, + "loss": 0.3979, + "step": 23658 + }, + { + "epoch": 1.3248404076604323, + "grad_norm": 1.3661129474639893, + "learning_rate": 9.904e-05, + "loss": 0.5137, + "step": 23659 + }, + { + "epoch": 1.3248964049725613, + "grad_norm": 1.7654578685760498, + "learning_rate": 9.903973684210527e-05, + "loss": 0.5692, + "step": 23660 + }, + { + "epoch": 1.3249524022846904, + "grad_norm": 1.4775665998458862, + "learning_rate": 9.903947368421052e-05, + "loss": 0.4675, + "step": 23661 + }, + { + "epoch": 1.3250083995968194, + "grad_norm": 1.3470083475112915, + "learning_rate": 9.903921052631579e-05, + "loss": 0.461, + "step": 23662 + }, + { + "epoch": 1.3250643969089484, + "grad_norm": 1.2037103176116943, + "learning_rate": 9.903894736842105e-05, + "loss": 0.3816, + "step": 23663 + }, + { + "epoch": 1.3251203942210774, + "grad_norm": 1.2058035135269165, + "learning_rate": 9.903868421052633e-05, + "loss": 0.372, + "step": 23664 + }, + { + "epoch": 1.3251763915332064, + "grad_norm": 1.8195207118988037, + "learning_rate": 9.903842105263159e-05, + "loss": 0.5475, + "step": 23665 + }, + { + "epoch": 1.3252323888453355, + "grad_norm": 1.3020286560058594, + "learning_rate": 9.903815789473685e-05, + "loss": 0.4242, + "step": 23666 + }, + { + "epoch": 1.3252883861574645, + "grad_norm": 1.6143763065338135, + "learning_rate": 9.90378947368421e-05, + "loss": 0.4134, + "step": 23667 + }, + { + "epoch": 1.3253443834695935, + "grad_norm": 1.2927172183990479, + "learning_rate": 9.903763157894738e-05, + "loss": 0.3848, + "step": 23668 + }, + { + "epoch": 1.3254003807817225, + 
"grad_norm": 1.150124430656433, + "learning_rate": 9.903736842105264e-05, + "loss": 0.44, + "step": 23669 + }, + { + "epoch": 1.3254563780938515, + "grad_norm": 1.4438631534576416, + "learning_rate": 9.90371052631579e-05, + "loss": 0.4656, + "step": 23670 + }, + { + "epoch": 1.3255123754059805, + "grad_norm": 1.6583176851272583, + "learning_rate": 9.903684210526316e-05, + "loss": 0.5393, + "step": 23671 + }, + { + "epoch": 1.3255683727181096, + "grad_norm": 1.572582721710205, + "learning_rate": 9.903657894736843e-05, + "loss": 0.4246, + "step": 23672 + }, + { + "epoch": 1.3256243700302386, + "grad_norm": 1.8211469650268555, + "learning_rate": 9.903631578947369e-05, + "loss": 0.5976, + "step": 23673 + }, + { + "epoch": 1.3256803673423676, + "grad_norm": 1.1383578777313232, + "learning_rate": 9.903605263157895e-05, + "loss": 0.3202, + "step": 23674 + }, + { + "epoch": 1.3257363646544966, + "grad_norm": 1.207336664199829, + "learning_rate": 9.903578947368421e-05, + "loss": 0.4443, + "step": 23675 + }, + { + "epoch": 1.3257923619666256, + "grad_norm": 1.387158751487732, + "learning_rate": 9.903552631578947e-05, + "loss": 0.5514, + "step": 23676 + }, + { + "epoch": 1.3258483592787547, + "grad_norm": 1.4295538663864136, + "learning_rate": 9.903526315789474e-05, + "loss": 0.4565, + "step": 23677 + }, + { + "epoch": 1.3259043565908837, + "grad_norm": 1.262218952178955, + "learning_rate": 9.9035e-05, + "loss": 0.3952, + "step": 23678 + }, + { + "epoch": 1.3259603539030127, + "grad_norm": 2.4699628353118896, + "learning_rate": 9.903473684210526e-05, + "loss": 0.571, + "step": 23679 + }, + { + "epoch": 1.3260163512151417, + "grad_norm": 1.5111093521118164, + "learning_rate": 9.903447368421052e-05, + "loss": 0.4995, + "step": 23680 + }, + { + "epoch": 1.3260723485272707, + "grad_norm": 1.4468804597854614, + "learning_rate": 9.90342105263158e-05, + "loss": 0.4995, + "step": 23681 + }, + { + "epoch": 1.3261283458393998, + "grad_norm": 1.4177590608596802, + "learning_rate": 
9.903394736842106e-05, + "loss": 0.4481, + "step": 23682 + }, + { + "epoch": 1.3261843431515288, + "grad_norm": 1.2426563501358032, + "learning_rate": 9.903368421052633e-05, + "loss": 0.6114, + "step": 23683 + }, + { + "epoch": 1.3262403404636578, + "grad_norm": 1.3937419652938843, + "learning_rate": 9.903342105263157e-05, + "loss": 0.5069, + "step": 23684 + }, + { + "epoch": 1.3262963377757868, + "grad_norm": 1.3842486143112183, + "learning_rate": 9.903315789473685e-05, + "loss": 0.5955, + "step": 23685 + }, + { + "epoch": 1.3263523350879158, + "grad_norm": 1.3546769618988037, + "learning_rate": 9.903289473684211e-05, + "loss": 0.4685, + "step": 23686 + }, + { + "epoch": 1.3264083324000449, + "grad_norm": 1.254425287246704, + "learning_rate": 9.903263157894738e-05, + "loss": 0.5517, + "step": 23687 + }, + { + "epoch": 1.3264643297121739, + "grad_norm": 1.3651771545410156, + "learning_rate": 9.903236842105264e-05, + "loss": 0.4507, + "step": 23688 + }, + { + "epoch": 1.326520327024303, + "grad_norm": 1.5061709880828857, + "learning_rate": 9.90321052631579e-05, + "loss": 0.5048, + "step": 23689 + }, + { + "epoch": 1.326576324336432, + "grad_norm": 1.3575242757797241, + "learning_rate": 9.903184210526316e-05, + "loss": 0.5553, + "step": 23690 + }, + { + "epoch": 1.326632321648561, + "grad_norm": 1.389123558998108, + "learning_rate": 9.903157894736843e-05, + "loss": 0.5087, + "step": 23691 + }, + { + "epoch": 1.32668831896069, + "grad_norm": 1.5058927536010742, + "learning_rate": 9.90313157894737e-05, + "loss": 0.4397, + "step": 23692 + }, + { + "epoch": 1.326744316272819, + "grad_norm": 1.1221836805343628, + "learning_rate": 9.903105263157895e-05, + "loss": 0.4602, + "step": 23693 + }, + { + "epoch": 1.326800313584948, + "grad_norm": 1.5292891263961792, + "learning_rate": 9.903078947368421e-05, + "loss": 0.6224, + "step": 23694 + }, + { + "epoch": 1.326856310897077, + "grad_norm": 1.6511558294296265, + "learning_rate": 9.903052631578947e-05, + "loss": 0.4988, + 
"step": 23695 + }, + { + "epoch": 1.326912308209206, + "grad_norm": 1.4084031581878662, + "learning_rate": 9.903026315789475e-05, + "loss": 0.4657, + "step": 23696 + }, + { + "epoch": 1.326968305521335, + "grad_norm": 1.500809669494629, + "learning_rate": 9.903e-05, + "loss": 0.5624, + "step": 23697 + }, + { + "epoch": 1.327024302833464, + "grad_norm": 1.824904441833496, + "learning_rate": 9.902973684210527e-05, + "loss": 0.5121, + "step": 23698 + }, + { + "epoch": 1.327080300145593, + "grad_norm": 1.3979873657226562, + "learning_rate": 9.902947368421053e-05, + "loss": 0.3647, + "step": 23699 + }, + { + "epoch": 1.327136297457722, + "grad_norm": 1.4668703079223633, + "learning_rate": 9.90292105263158e-05, + "loss": 0.5255, + "step": 23700 + }, + { + "epoch": 1.3271922947698511, + "grad_norm": 1.708409070968628, + "learning_rate": 9.902894736842106e-05, + "loss": 0.5224, + "step": 23701 + }, + { + "epoch": 1.3272482920819801, + "grad_norm": 1.5218987464904785, + "learning_rate": 9.902868421052632e-05, + "loss": 0.373, + "step": 23702 + }, + { + "epoch": 1.3273042893941092, + "grad_norm": 1.4584457874298096, + "learning_rate": 9.902842105263158e-05, + "loss": 0.4978, + "step": 23703 + }, + { + "epoch": 1.3273602867062382, + "grad_norm": 1.9057399034500122, + "learning_rate": 9.902815789473685e-05, + "loss": 0.4955, + "step": 23704 + }, + { + "epoch": 1.3274162840183672, + "grad_norm": 1.1189838647842407, + "learning_rate": 9.902789473684211e-05, + "loss": 0.3878, + "step": 23705 + }, + { + "epoch": 1.3274722813304962, + "grad_norm": 1.8672751188278198, + "learning_rate": 9.902763157894738e-05, + "loss": 0.4425, + "step": 23706 + }, + { + "epoch": 1.3275282786426252, + "grad_norm": 1.2922204732894897, + "learning_rate": 9.902736842105263e-05, + "loss": 0.4279, + "step": 23707 + }, + { + "epoch": 1.327584275954754, + "grad_norm": 1.1615028381347656, + "learning_rate": 9.90271052631579e-05, + "loss": 0.3908, + "step": 23708 + }, + { + "epoch": 1.327640273266883, + 
"grad_norm": 1.163578748703003, + "learning_rate": 9.902684210526316e-05, + "loss": 0.4818, + "step": 23709 + }, + { + "epoch": 1.327696270579012, + "grad_norm": 1.2902050018310547, + "learning_rate": 9.902657894736844e-05, + "loss": 0.4775, + "step": 23710 + }, + { + "epoch": 1.327752267891141, + "grad_norm": 1.1960757970809937, + "learning_rate": 9.90263157894737e-05, + "loss": 0.3787, + "step": 23711 + }, + { + "epoch": 1.32780826520327, + "grad_norm": 1.5162867307662964, + "learning_rate": 9.902605263157894e-05, + "loss": 0.5849, + "step": 23712 + }, + { + "epoch": 1.3278642625153991, + "grad_norm": 1.2826370000839233, + "learning_rate": 9.902578947368422e-05, + "loss": 0.4882, + "step": 23713 + }, + { + "epoch": 1.3279202598275281, + "grad_norm": 1.2803281545639038, + "learning_rate": 9.902552631578948e-05, + "loss": 0.4433, + "step": 23714 + }, + { + "epoch": 1.3279762571396572, + "grad_norm": 2.0065600872039795, + "learning_rate": 9.902526315789475e-05, + "loss": 0.5693, + "step": 23715 + }, + { + "epoch": 1.3280322544517862, + "grad_norm": 1.1913907527923584, + "learning_rate": 9.9025e-05, + "loss": 0.4042, + "step": 23716 + }, + { + "epoch": 1.3280882517639152, + "grad_norm": 1.5494247674942017, + "learning_rate": 9.902473684210527e-05, + "loss": 0.4673, + "step": 23717 + }, + { + "epoch": 1.3281442490760442, + "grad_norm": 1.230475664138794, + "learning_rate": 9.902447368421053e-05, + "loss": 0.4799, + "step": 23718 + }, + { + "epoch": 1.3282002463881732, + "grad_norm": 1.455629587173462, + "learning_rate": 9.90242105263158e-05, + "loss": 0.5006, + "step": 23719 + }, + { + "epoch": 1.3282562437003023, + "grad_norm": 1.6144088506698608, + "learning_rate": 9.902394736842106e-05, + "loss": 0.4582, + "step": 23720 + }, + { + "epoch": 1.3283122410124313, + "grad_norm": 1.2861231565475464, + "learning_rate": 9.902368421052632e-05, + "loss": 0.4438, + "step": 23721 + }, + { + "epoch": 1.3283682383245603, + "grad_norm": 1.314123272895813, + "learning_rate": 
9.902342105263158e-05, + "loss": 0.4057, + "step": 23722 + }, + { + "epoch": 1.3284242356366893, + "grad_norm": 1.452102780342102, + "learning_rate": 9.902315789473685e-05, + "loss": 0.4223, + "step": 23723 + }, + { + "epoch": 1.3284802329488183, + "grad_norm": 1.2253562211990356, + "learning_rate": 9.902289473684211e-05, + "loss": 0.501, + "step": 23724 + }, + { + "epoch": 1.3285362302609474, + "grad_norm": 1.557931661605835, + "learning_rate": 9.902263157894737e-05, + "loss": 0.4189, + "step": 23725 + }, + { + "epoch": 1.3285922275730764, + "grad_norm": 1.4173012971878052, + "learning_rate": 9.902236842105263e-05, + "loss": 0.4374, + "step": 23726 + }, + { + "epoch": 1.3286482248852054, + "grad_norm": 1.6278053522109985, + "learning_rate": 9.90221052631579e-05, + "loss": 0.4733, + "step": 23727 + }, + { + "epoch": 1.3287042221973344, + "grad_norm": 2.0490424633026123, + "learning_rate": 9.902184210526317e-05, + "loss": 0.4737, + "step": 23728 + }, + { + "epoch": 1.3287602195094634, + "grad_norm": 1.7847868204116821, + "learning_rate": 9.902157894736843e-05, + "loss": 0.4162, + "step": 23729 + }, + { + "epoch": 1.3288162168215925, + "grad_norm": 1.4404356479644775, + "learning_rate": 9.902131578947369e-05, + "loss": 0.3731, + "step": 23730 + }, + { + "epoch": 1.3288722141337215, + "grad_norm": 1.6273163557052612, + "learning_rate": 9.902105263157894e-05, + "loss": 0.5964, + "step": 23731 + }, + { + "epoch": 1.3289282114458505, + "grad_norm": 1.505690097808838, + "learning_rate": 9.902078947368422e-05, + "loss": 0.5087, + "step": 23732 + }, + { + "epoch": 1.3289842087579795, + "grad_norm": 1.3277822732925415, + "learning_rate": 9.902052631578948e-05, + "loss": 0.6378, + "step": 23733 + }, + { + "epoch": 1.3290402060701085, + "grad_norm": 1.4372085332870483, + "learning_rate": 9.902026315789474e-05, + "loss": 0.6282, + "step": 23734 + }, + { + "epoch": 1.3290962033822376, + "grad_norm": 1.8132154941558838, + "learning_rate": 9.902e-05, + "loss": 0.5666, + "step": 
23735 + }, + { + "epoch": 1.3291522006943666, + "grad_norm": 1.291549563407898, + "learning_rate": 9.901973684210527e-05, + "loss": 0.3794, + "step": 23736 + }, + { + "epoch": 1.3292081980064956, + "grad_norm": 1.598890781402588, + "learning_rate": 9.901947368421053e-05, + "loss": 0.5991, + "step": 23737 + }, + { + "epoch": 1.3292641953186246, + "grad_norm": 1.4591436386108398, + "learning_rate": 9.90192105263158e-05, + "loss": 0.4907, + "step": 23738 + }, + { + "epoch": 1.3293201926307536, + "grad_norm": 1.3021881580352783, + "learning_rate": 9.901894736842105e-05, + "loss": 0.458, + "step": 23739 + }, + { + "epoch": 1.3293761899428826, + "grad_norm": 1.9970439672470093, + "learning_rate": 9.901868421052632e-05, + "loss": 0.5475, + "step": 23740 + }, + { + "epoch": 1.3294321872550117, + "grad_norm": 1.3215672969818115, + "learning_rate": 9.901842105263158e-05, + "loss": 0.4985, + "step": 23741 + }, + { + "epoch": 1.3294881845671407, + "grad_norm": 1.1670761108398438, + "learning_rate": 9.901815789473686e-05, + "loss": 0.4236, + "step": 23742 + }, + { + "epoch": 1.3295441818792697, + "grad_norm": 1.6619517803192139, + "learning_rate": 9.901789473684212e-05, + "loss": 0.5797, + "step": 23743 + }, + { + "epoch": 1.3296001791913987, + "grad_norm": 1.4194841384887695, + "learning_rate": 9.901763157894738e-05, + "loss": 0.4307, + "step": 23744 + }, + { + "epoch": 1.3296561765035277, + "grad_norm": 1.455501914024353, + "learning_rate": 9.901736842105264e-05, + "loss": 0.4563, + "step": 23745 + }, + { + "epoch": 1.3297121738156568, + "grad_norm": 1.743912696838379, + "learning_rate": 9.90171052631579e-05, + "loss": 0.5251, + "step": 23746 + }, + { + "epoch": 1.3297681711277858, + "grad_norm": 1.3530521392822266, + "learning_rate": 9.901684210526317e-05, + "loss": 0.412, + "step": 23747 + }, + { + "epoch": 1.3298241684399148, + "grad_norm": 1.2381137609481812, + "learning_rate": 9.901657894736843e-05, + "loss": 0.4692, + "step": 23748 + }, + { + "epoch": 
1.3298801657520438, + "grad_norm": 1.2860863208770752, + "learning_rate": 9.901631578947369e-05, + "loss": 0.396, + "step": 23749 + }, + { + "epoch": 1.3299361630641728, + "grad_norm": 1.500382423400879, + "learning_rate": 9.901605263157895e-05, + "loss": 0.6196, + "step": 23750 + }, + { + "epoch": 1.3299921603763019, + "grad_norm": 1.6573114395141602, + "learning_rate": 9.901578947368422e-05, + "loss": 0.4758, + "step": 23751 + }, + { + "epoch": 1.3300481576884309, + "grad_norm": 1.65608811378479, + "learning_rate": 9.901552631578948e-05, + "loss": 0.5383, + "step": 23752 + }, + { + "epoch": 1.33010415500056, + "grad_norm": 1.4041887521743774, + "learning_rate": 9.901526315789474e-05, + "loss": 0.5479, + "step": 23753 + }, + { + "epoch": 1.330160152312689, + "grad_norm": 1.302161455154419, + "learning_rate": 9.9015e-05, + "loss": 0.4635, + "step": 23754 + }, + { + "epoch": 1.330216149624818, + "grad_norm": 1.4354580640792847, + "learning_rate": 9.901473684210527e-05, + "loss": 0.4113, + "step": 23755 + }, + { + "epoch": 1.330272146936947, + "grad_norm": 1.409231424331665, + "learning_rate": 9.901447368421053e-05, + "loss": 0.4626, + "step": 23756 + }, + { + "epoch": 1.330328144249076, + "grad_norm": 1.9290640354156494, + "learning_rate": 9.901421052631579e-05, + "loss": 0.6919, + "step": 23757 + }, + { + "epoch": 1.330384141561205, + "grad_norm": 1.1375243663787842, + "learning_rate": 9.901394736842105e-05, + "loss": 0.4853, + "step": 23758 + }, + { + "epoch": 1.330440138873334, + "grad_norm": 1.466916561126709, + "learning_rate": 9.901368421052633e-05, + "loss": 0.4337, + "step": 23759 + }, + { + "epoch": 1.330496136185463, + "grad_norm": 1.4821162223815918, + "learning_rate": 9.901342105263159e-05, + "loss": 0.4767, + "step": 23760 + }, + { + "epoch": 1.330552133497592, + "grad_norm": 2.9241535663604736, + "learning_rate": 9.901315789473686e-05, + "loss": 0.4131, + "step": 23761 + }, + { + "epoch": 1.330608130809721, + "grad_norm": 1.6557778120040894, + 
"learning_rate": 9.90128947368421e-05, + "loss": 0.4093, + "step": 23762 + }, + { + "epoch": 1.33066412812185, + "grad_norm": 1.8381612300872803, + "learning_rate": 9.901263157894736e-05, + "loss": 1.0502, + "step": 23763 + }, + { + "epoch": 1.330720125433979, + "grad_norm": 1.4598925113677979, + "learning_rate": 9.901236842105264e-05, + "loss": 0.5094, + "step": 23764 + }, + { + "epoch": 1.3307761227461081, + "grad_norm": 1.5245411396026611, + "learning_rate": 9.90121052631579e-05, + "loss": 0.4248, + "step": 23765 + }, + { + "epoch": 1.3308321200582371, + "grad_norm": 1.3863434791564941, + "learning_rate": 9.901184210526316e-05, + "loss": 0.4848, + "step": 23766 + }, + { + "epoch": 1.3308881173703662, + "grad_norm": 1.2266355752944946, + "learning_rate": 9.901157894736842e-05, + "loss": 0.5014, + "step": 23767 + }, + { + "epoch": 1.3309441146824952, + "grad_norm": 1.5366392135620117, + "learning_rate": 9.901131578947369e-05, + "loss": 0.4406, + "step": 23768 + }, + { + "epoch": 1.3310001119946242, + "grad_norm": 1.3355239629745483, + "learning_rate": 9.901105263157895e-05, + "loss": 0.5142, + "step": 23769 + }, + { + "epoch": 1.3310561093067532, + "grad_norm": 1.3259963989257812, + "learning_rate": 9.901078947368422e-05, + "loss": 0.4757, + "step": 23770 + }, + { + "epoch": 1.3311121066188822, + "grad_norm": 1.2686805725097656, + "learning_rate": 9.901052631578947e-05, + "loss": 0.4744, + "step": 23771 + }, + { + "epoch": 1.3311681039310113, + "grad_norm": 1.6363102197647095, + "learning_rate": 9.901026315789474e-05, + "loss": 0.5022, + "step": 23772 + }, + { + "epoch": 1.3312241012431403, + "grad_norm": 1.3323304653167725, + "learning_rate": 9.901e-05, + "loss": 0.4078, + "step": 23773 + }, + { + "epoch": 1.3312800985552693, + "grad_norm": 1.2837271690368652, + "learning_rate": 9.900973684210528e-05, + "loss": 0.4101, + "step": 23774 + }, + { + "epoch": 1.3313360958673983, + "grad_norm": 1.3309253454208374, + "learning_rate": 9.900947368421054e-05, + "loss": 
0.5137, + "step": 23775 + }, + { + "epoch": 1.3313920931795273, + "grad_norm": 1.3191558122634888, + "learning_rate": 9.90092105263158e-05, + "loss": 0.4689, + "step": 23776 + }, + { + "epoch": 1.3314480904916564, + "grad_norm": 1.7781141996383667, + "learning_rate": 9.900894736842105e-05, + "loss": 0.6415, + "step": 23777 + }, + { + "epoch": 1.3315040878037854, + "grad_norm": 1.3525689840316772, + "learning_rate": 9.900868421052633e-05, + "loss": 0.4296, + "step": 23778 + }, + { + "epoch": 1.3315600851159144, + "grad_norm": 1.1907037496566772, + "learning_rate": 9.900842105263159e-05, + "loss": 0.3696, + "step": 23779 + }, + { + "epoch": 1.3316160824280434, + "grad_norm": 1.3596187829971313, + "learning_rate": 9.900815789473685e-05, + "loss": 0.451, + "step": 23780 + }, + { + "epoch": 1.3316720797401724, + "grad_norm": 1.2794840335845947, + "learning_rate": 9.900789473684211e-05, + "loss": 0.5895, + "step": 23781 + }, + { + "epoch": 1.3317280770523015, + "grad_norm": 1.1582374572753906, + "learning_rate": 9.900763157894737e-05, + "loss": 0.4681, + "step": 23782 + }, + { + "epoch": 1.3317840743644305, + "grad_norm": 1.742173433303833, + "learning_rate": 9.900736842105264e-05, + "loss": 0.5168, + "step": 23783 + }, + { + "epoch": 1.3318400716765595, + "grad_norm": 1.382517695426941, + "learning_rate": 9.90071052631579e-05, + "loss": 0.4484, + "step": 23784 + }, + { + "epoch": 1.3318960689886885, + "grad_norm": 1.4867075681686401, + "learning_rate": 9.900684210526316e-05, + "loss": 0.5246, + "step": 23785 + }, + { + "epoch": 1.3319520663008175, + "grad_norm": 1.6939916610717773, + "learning_rate": 9.900657894736842e-05, + "loss": 0.5886, + "step": 23786 + }, + { + "epoch": 1.3320080636129465, + "grad_norm": 1.1807746887207031, + "learning_rate": 9.900631578947369e-05, + "loss": 0.3459, + "step": 23787 + }, + { + "epoch": 1.3320640609250756, + "grad_norm": 1.394364595413208, + "learning_rate": 9.900605263157895e-05, + "loss": 0.4881, + "step": 23788 + }, + { + 
"epoch": 1.3321200582372046, + "grad_norm": 1.2194969654083252, + "learning_rate": 9.900578947368421e-05, + "loss": 0.4091, + "step": 23789 + }, + { + "epoch": 1.3321760555493336, + "grad_norm": 1.711050033569336, + "learning_rate": 9.900552631578947e-05, + "loss": 0.5224, + "step": 23790 + }, + { + "epoch": 1.3322320528614626, + "grad_norm": 1.814692497253418, + "learning_rate": 9.900526315789475e-05, + "loss": 0.4065, + "step": 23791 + }, + { + "epoch": 1.3322880501735916, + "grad_norm": 1.7624183893203735, + "learning_rate": 9.9005e-05, + "loss": 0.4169, + "step": 23792 + }, + { + "epoch": 1.3323440474857207, + "grad_norm": 1.291270136833191, + "learning_rate": 9.900473684210528e-05, + "loss": 0.4397, + "step": 23793 + }, + { + "epoch": 1.3324000447978497, + "grad_norm": 2.6617846488952637, + "learning_rate": 9.900447368421052e-05, + "loss": 0.681, + "step": 23794 + }, + { + "epoch": 1.3324560421099787, + "grad_norm": 1.305954933166504, + "learning_rate": 9.90042105263158e-05, + "loss": 0.5125, + "step": 23795 + }, + { + "epoch": 1.3325120394221077, + "grad_norm": 1.3788312673568726, + "learning_rate": 9.900394736842106e-05, + "loss": 0.4138, + "step": 23796 + }, + { + "epoch": 1.3325680367342367, + "grad_norm": 1.4607813358306885, + "learning_rate": 9.900368421052632e-05, + "loss": 0.4552, + "step": 23797 + }, + { + "epoch": 1.3326240340463658, + "grad_norm": 1.2364460229873657, + "learning_rate": 9.900342105263159e-05, + "loss": 0.3507, + "step": 23798 + }, + { + "epoch": 1.3326800313584948, + "grad_norm": 1.564463496208191, + "learning_rate": 9.900315789473684e-05, + "loss": 0.4571, + "step": 23799 + }, + { + "epoch": 1.3327360286706238, + "grad_norm": 1.3653931617736816, + "learning_rate": 9.900289473684211e-05, + "loss": 0.4893, + "step": 23800 + }, + { + "epoch": 1.3327920259827528, + "grad_norm": 1.6454015970230103, + "learning_rate": 9.900263157894737e-05, + "loss": 0.7647, + "step": 23801 + }, + { + "epoch": 1.3328480232948818, + "grad_norm": 
2.1272964477539062, + "learning_rate": 9.900236842105264e-05, + "loss": 0.7298, + "step": 23802 + }, + { + "epoch": 1.3329040206070109, + "grad_norm": 1.3992019891738892, + "learning_rate": 9.90021052631579e-05, + "loss": 0.4638, + "step": 23803 + }, + { + "epoch": 1.3329600179191399, + "grad_norm": 1.2260433435440063, + "learning_rate": 9.900184210526316e-05, + "loss": 0.457, + "step": 23804 + }, + { + "epoch": 1.333016015231269, + "grad_norm": 1.2614973783493042, + "learning_rate": 9.900157894736842e-05, + "loss": 0.5395, + "step": 23805 + }, + { + "epoch": 1.333072012543398, + "grad_norm": 1.2293323278427124, + "learning_rate": 9.90013157894737e-05, + "loss": 0.4591, + "step": 23806 + }, + { + "epoch": 1.333128009855527, + "grad_norm": 1.2137826681137085, + "learning_rate": 9.900105263157896e-05, + "loss": 0.4168, + "step": 23807 + }, + { + "epoch": 1.333184007167656, + "grad_norm": 1.2654191255569458, + "learning_rate": 9.900078947368421e-05, + "loss": 0.4625, + "step": 23808 + }, + { + "epoch": 1.333240004479785, + "grad_norm": 1.5558730363845825, + "learning_rate": 9.900052631578947e-05, + "loss": 0.5058, + "step": 23809 + }, + { + "epoch": 1.333296001791914, + "grad_norm": 1.408207893371582, + "learning_rate": 9.900026315789475e-05, + "loss": 0.466, + "step": 23810 + }, + { + "epoch": 1.333351999104043, + "grad_norm": 1.3477212190628052, + "learning_rate": 9.900000000000001e-05, + "loss": 0.4401, + "step": 23811 + }, + { + "epoch": 1.333407996416172, + "grad_norm": 1.4396218061447144, + "learning_rate": 9.899973684210527e-05, + "loss": 0.4345, + "step": 23812 + }, + { + "epoch": 1.333463993728301, + "grad_norm": 1.7757333517074585, + "learning_rate": 9.899947368421053e-05, + "loss": 0.4976, + "step": 23813 + }, + { + "epoch": 1.33351999104043, + "grad_norm": 1.4348074197769165, + "learning_rate": 9.899921052631579e-05, + "loss": 0.5476, + "step": 23814 + }, + { + "epoch": 1.333575988352559, + "grad_norm": 1.3225101232528687, + "learning_rate": 
9.899894736842106e-05, + "loss": 0.371, + "step": 23815 + }, + { + "epoch": 1.333631985664688, + "grad_norm": 1.611476182937622, + "learning_rate": 9.899868421052632e-05, + "loss": 0.6114, + "step": 23816 + }, + { + "epoch": 1.3336879829768171, + "grad_norm": 1.424462080001831, + "learning_rate": 9.899842105263158e-05, + "loss": 0.3367, + "step": 23817 + }, + { + "epoch": 1.3337439802889461, + "grad_norm": 1.14649498462677, + "learning_rate": 9.899815789473684e-05, + "loss": 0.3709, + "step": 23818 + }, + { + "epoch": 1.3337999776010752, + "grad_norm": 1.7224383354187012, + "learning_rate": 9.899789473684211e-05, + "loss": 0.6526, + "step": 23819 + }, + { + "epoch": 1.3338559749132042, + "grad_norm": 1.3974765539169312, + "learning_rate": 9.899763157894737e-05, + "loss": 0.6072, + "step": 23820 + }, + { + "epoch": 1.3339119722253332, + "grad_norm": 1.3082847595214844, + "learning_rate": 9.899736842105263e-05, + "loss": 0.4481, + "step": 23821 + }, + { + "epoch": 1.3339679695374622, + "grad_norm": 1.349003791809082, + "learning_rate": 9.899710526315789e-05, + "loss": 0.5265, + "step": 23822 + }, + { + "epoch": 1.3340239668495912, + "grad_norm": 1.996715784072876, + "learning_rate": 9.899684210526316e-05, + "loss": 0.5606, + "step": 23823 + }, + { + "epoch": 1.3340799641617203, + "grad_norm": 1.4019711017608643, + "learning_rate": 9.899657894736842e-05, + "loss": 0.4384, + "step": 23824 + }, + { + "epoch": 1.3341359614738493, + "grad_norm": 1.392748236656189, + "learning_rate": 9.89963157894737e-05, + "loss": 0.5603, + "step": 23825 + }, + { + "epoch": 1.3341919587859783, + "grad_norm": 1.4841468334197998, + "learning_rate": 9.899605263157894e-05, + "loss": 0.4993, + "step": 23826 + }, + { + "epoch": 1.3342479560981073, + "grad_norm": 1.6232941150665283, + "learning_rate": 9.899578947368422e-05, + "loss": 0.5431, + "step": 23827 + }, + { + "epoch": 1.3343039534102363, + "grad_norm": 1.5400526523590088, + "learning_rate": 9.899552631578948e-05, + "loss": 0.436, + 
"step": 23828 + }, + { + "epoch": 1.3343599507223654, + "grad_norm": 1.1462382078170776, + "learning_rate": 9.899526315789475e-05, + "loss": 0.4438, + "step": 23829 + }, + { + "epoch": 1.3344159480344944, + "grad_norm": 1.3616477251052856, + "learning_rate": 9.899500000000001e-05, + "loss": 0.3233, + "step": 23830 + }, + { + "epoch": 1.3344719453466234, + "grad_norm": 2.2361109256744385, + "learning_rate": 9.899473684210527e-05, + "loss": 0.7127, + "step": 23831 + }, + { + "epoch": 1.3345279426587524, + "grad_norm": 1.9489185810089111, + "learning_rate": 9.899447368421053e-05, + "loss": 0.4546, + "step": 23832 + }, + { + "epoch": 1.3345839399708814, + "grad_norm": 7.203779220581055, + "learning_rate": 9.899421052631579e-05, + "loss": 0.5088, + "step": 23833 + }, + { + "epoch": 1.3346399372830104, + "grad_norm": 1.516080617904663, + "learning_rate": 9.899394736842106e-05, + "loss": 0.4153, + "step": 23834 + }, + { + "epoch": 1.3346959345951395, + "grad_norm": 1.2757153511047363, + "learning_rate": 9.899368421052632e-05, + "loss": 0.4722, + "step": 23835 + }, + { + "epoch": 1.3347519319072685, + "grad_norm": 1.1644984483718872, + "learning_rate": 9.899342105263158e-05, + "loss": 0.4474, + "step": 23836 + }, + { + "epoch": 1.3348079292193975, + "grad_norm": 1.3059358596801758, + "learning_rate": 9.899315789473684e-05, + "loss": 0.5506, + "step": 23837 + }, + { + "epoch": 1.3348639265315265, + "grad_norm": 1.5706517696380615, + "learning_rate": 9.899289473684212e-05, + "loss": 0.4251, + "step": 23838 + }, + { + "epoch": 1.3349199238436555, + "grad_norm": 1.316448450088501, + "learning_rate": 9.899263157894737e-05, + "loss": 0.4417, + "step": 23839 + }, + { + "epoch": 1.3349759211557846, + "grad_norm": 1.5210957527160645, + "learning_rate": 9.899236842105263e-05, + "loss": 0.4646, + "step": 23840 + }, + { + "epoch": 1.3350319184679136, + "grad_norm": 1.292188286781311, + "learning_rate": 9.89921052631579e-05, + "loss": 0.4646, + "step": 23841 + }, + { + "epoch": 
1.3350879157800426, + "grad_norm": 1.5652449131011963, + "learning_rate": 9.899184210526317e-05, + "loss": 0.3949, + "step": 23842 + }, + { + "epoch": 1.3351439130921716, + "grad_norm": 2.441153049468994, + "learning_rate": 9.899157894736843e-05, + "loss": 0.4277, + "step": 23843 + }, + { + "epoch": 1.3351999104043006, + "grad_norm": 1.1771906614303589, + "learning_rate": 9.899131578947369e-05, + "loss": 0.4003, + "step": 23844 + }, + { + "epoch": 1.3352559077164297, + "grad_norm": 1.5029371976852417, + "learning_rate": 9.899105263157895e-05, + "loss": 0.5011, + "step": 23845 + }, + { + "epoch": 1.3353119050285587, + "grad_norm": 1.25845468044281, + "learning_rate": 9.899078947368422e-05, + "loss": 0.4785, + "step": 23846 + }, + { + "epoch": 1.3353679023406877, + "grad_norm": 1.39874267578125, + "learning_rate": 9.899052631578948e-05, + "loss": 0.4808, + "step": 23847 + }, + { + "epoch": 1.3354238996528167, + "grad_norm": 1.8373452425003052, + "learning_rate": 9.899026315789475e-05, + "loss": 0.5285, + "step": 23848 + }, + { + "epoch": 1.3354798969649457, + "grad_norm": 1.3263756036758423, + "learning_rate": 9.899e-05, + "loss": 0.5416, + "step": 23849 + }, + { + "epoch": 1.3355358942770748, + "grad_norm": 1.2071919441223145, + "learning_rate": 9.898973684210526e-05, + "loss": 0.4166, + "step": 23850 + }, + { + "epoch": 1.3355918915892038, + "grad_norm": 1.3608685731887817, + "learning_rate": 9.898947368421053e-05, + "loss": 0.4098, + "step": 23851 + }, + { + "epoch": 1.3356478889013328, + "grad_norm": 1.263219952583313, + "learning_rate": 9.898921052631579e-05, + "loss": 0.4067, + "step": 23852 + }, + { + "epoch": 1.3357038862134618, + "grad_norm": 1.5391923189163208, + "learning_rate": 9.898894736842107e-05, + "loss": 0.51, + "step": 23853 + }, + { + "epoch": 1.3357598835255908, + "grad_norm": 1.4992666244506836, + "learning_rate": 9.898868421052631e-05, + "loss": 0.5237, + "step": 23854 + }, + { + "epoch": 1.3358158808377198, + "grad_norm": 1.4221733808517456, + 
"learning_rate": 9.898842105263158e-05, + "loss": 0.473, + "step": 23855 + }, + { + "epoch": 1.3358718781498489, + "grad_norm": 1.4912809133529663, + "learning_rate": 9.898815789473684e-05, + "loss": 0.4861, + "step": 23856 + }, + { + "epoch": 1.3359278754619779, + "grad_norm": 1.3533376455307007, + "learning_rate": 9.898789473684212e-05, + "loss": 0.5411, + "step": 23857 + }, + { + "epoch": 1.335983872774107, + "grad_norm": 1.2698400020599365, + "learning_rate": 9.898763157894738e-05, + "loss": 0.4958, + "step": 23858 + }, + { + "epoch": 1.336039870086236, + "grad_norm": 1.6540385484695435, + "learning_rate": 9.898736842105264e-05, + "loss": 0.5944, + "step": 23859 + }, + { + "epoch": 1.336095867398365, + "grad_norm": 1.4382905960083008, + "learning_rate": 9.89871052631579e-05, + "loss": 0.4067, + "step": 23860 + }, + { + "epoch": 1.336151864710494, + "grad_norm": 1.4954227209091187, + "learning_rate": 9.898684210526317e-05, + "loss": 0.5319, + "step": 23861 + }, + { + "epoch": 1.336207862022623, + "grad_norm": 1.2915523052215576, + "learning_rate": 9.898657894736843e-05, + "loss": 0.4074, + "step": 23862 + }, + { + "epoch": 1.336263859334752, + "grad_norm": 1.2406599521636963, + "learning_rate": 9.898631578947369e-05, + "loss": 0.4717, + "step": 23863 + }, + { + "epoch": 1.336319856646881, + "grad_norm": 1.2790749073028564, + "learning_rate": 9.898605263157895e-05, + "loss": 0.5137, + "step": 23864 + }, + { + "epoch": 1.33637585395901, + "grad_norm": 1.512863039970398, + "learning_rate": 9.898578947368422e-05, + "loss": 0.4512, + "step": 23865 + }, + { + "epoch": 1.336431851271139, + "grad_norm": 1.4381901025772095, + "learning_rate": 9.898552631578948e-05, + "loss": 0.5711, + "step": 23866 + }, + { + "epoch": 1.336487848583268, + "grad_norm": 1.1895484924316406, + "learning_rate": 9.898526315789474e-05, + "loss": 0.4086, + "step": 23867 + }, + { + "epoch": 1.336543845895397, + "grad_norm": 1.587947964668274, + "learning_rate": 9.8985e-05, + "loss": 0.46, + 
"step": 23868 + }, + { + "epoch": 1.3365998432075261, + "grad_norm": 1.3747241497039795, + "learning_rate": 9.898473684210526e-05, + "loss": 0.549, + "step": 23869 + }, + { + "epoch": 1.3366558405196551, + "grad_norm": 1.4006222486495972, + "learning_rate": 9.898447368421053e-05, + "loss": 0.4107, + "step": 23870 + }, + { + "epoch": 1.3367118378317842, + "grad_norm": 1.5141626596450806, + "learning_rate": 9.89842105263158e-05, + "loss": 0.5305, + "step": 23871 + }, + { + "epoch": 1.3367678351439132, + "grad_norm": 1.4598748683929443, + "learning_rate": 9.898394736842105e-05, + "loss": 0.4646, + "step": 23872 + }, + { + "epoch": 1.3368238324560422, + "grad_norm": 1.4114829301834106, + "learning_rate": 9.898368421052631e-05, + "loss": 0.4784, + "step": 23873 + }, + { + "epoch": 1.3368798297681712, + "grad_norm": 1.3672314882278442, + "learning_rate": 9.898342105263159e-05, + "loss": 0.4479, + "step": 23874 + }, + { + "epoch": 1.3369358270803002, + "grad_norm": 1.234108567237854, + "learning_rate": 9.898315789473685e-05, + "loss": 0.4793, + "step": 23875 + }, + { + "epoch": 1.3369918243924293, + "grad_norm": 1.3132939338684082, + "learning_rate": 9.89828947368421e-05, + "loss": 0.4638, + "step": 23876 + }, + { + "epoch": 1.3370478217045583, + "grad_norm": 1.4558367729187012, + "learning_rate": 9.898263157894737e-05, + "loss": 0.566, + "step": 23877 + }, + { + "epoch": 1.3371038190166873, + "grad_norm": 1.4092020988464355, + "learning_rate": 9.898236842105264e-05, + "loss": 0.3604, + "step": 23878 + }, + { + "epoch": 1.3371598163288163, + "grad_norm": 1.476643443107605, + "learning_rate": 9.89821052631579e-05, + "loss": 0.5447, + "step": 23879 + }, + { + "epoch": 1.3372158136409453, + "grad_norm": 1.3778104782104492, + "learning_rate": 9.898184210526317e-05, + "loss": 0.4364, + "step": 23880 + }, + { + "epoch": 1.3372718109530743, + "grad_norm": 1.4713430404663086, + "learning_rate": 9.898157894736842e-05, + "loss": 0.516, + "step": 23881 + }, + { + "epoch": 
1.3373278082652034, + "grad_norm": 1.4517395496368408, + "learning_rate": 9.898131578947369e-05, + "loss": 0.4876, + "step": 23882 + }, + { + "epoch": 1.3373838055773324, + "grad_norm": 1.3174575567245483, + "learning_rate": 9.898105263157895e-05, + "loss": 0.4194, + "step": 23883 + }, + { + "epoch": 1.3374398028894614, + "grad_norm": 1.451480507850647, + "learning_rate": 9.898078947368421e-05, + "loss": 0.5662, + "step": 23884 + }, + { + "epoch": 1.3374958002015904, + "grad_norm": 1.321994662284851, + "learning_rate": 9.898052631578948e-05, + "loss": 0.3988, + "step": 23885 + }, + { + "epoch": 1.3375517975137194, + "grad_norm": 1.323826789855957, + "learning_rate": 9.898026315789473e-05, + "loss": 0.4865, + "step": 23886 + }, + { + "epoch": 1.3376077948258485, + "grad_norm": 1.6025311946868896, + "learning_rate": 9.898e-05, + "loss": 0.5381, + "step": 23887 + }, + { + "epoch": 1.3376637921379775, + "grad_norm": 1.4213122129440308, + "learning_rate": 9.897973684210526e-05, + "loss": 0.5454, + "step": 23888 + }, + { + "epoch": 1.3377197894501065, + "grad_norm": 1.103776216506958, + "learning_rate": 9.897947368421054e-05, + "loss": 0.4856, + "step": 23889 + }, + { + "epoch": 1.3377757867622355, + "grad_norm": 1.3291431665420532, + "learning_rate": 9.89792105263158e-05, + "loss": 0.4835, + "step": 23890 + }, + { + "epoch": 1.3378317840743645, + "grad_norm": 1.5485930442810059, + "learning_rate": 9.897894736842106e-05, + "loss": 0.6487, + "step": 23891 + }, + { + "epoch": 1.3378877813864936, + "grad_norm": 1.4208502769470215, + "learning_rate": 9.897868421052632e-05, + "loss": 0.4842, + "step": 23892 + }, + { + "epoch": 1.3379437786986226, + "grad_norm": 1.3566259145736694, + "learning_rate": 9.897842105263159e-05, + "loss": 0.3162, + "step": 23893 + }, + { + "epoch": 1.3379997760107516, + "grad_norm": 1.3701115846633911, + "learning_rate": 9.897815789473685e-05, + "loss": 0.4999, + "step": 23894 + }, + { + "epoch": 1.3380557733228806, + "grad_norm": 
1.1461424827575684, + "learning_rate": 9.897789473684211e-05, + "loss": 0.4746, + "step": 23895 + }, + { + "epoch": 1.3381117706350096, + "grad_norm": 1.3429561853408813, + "learning_rate": 9.897763157894737e-05, + "loss": 0.4234, + "step": 23896 + }, + { + "epoch": 1.3381677679471387, + "grad_norm": 1.2542699575424194, + "learning_rate": 9.897736842105264e-05, + "loss": 0.4548, + "step": 23897 + }, + { + "epoch": 1.3382237652592677, + "grad_norm": 1.7728514671325684, + "learning_rate": 9.89771052631579e-05, + "loss": 0.5006, + "step": 23898 + }, + { + "epoch": 1.3382797625713967, + "grad_norm": 2.911759376525879, + "learning_rate": 9.897684210526316e-05, + "loss": 0.4549, + "step": 23899 + }, + { + "epoch": 1.3383357598835257, + "grad_norm": 2.126786708831787, + "learning_rate": 9.897657894736842e-05, + "loss": 0.5309, + "step": 23900 + }, + { + "epoch": 1.3383917571956547, + "grad_norm": 1.303514003753662, + "learning_rate": 9.897631578947368e-05, + "loss": 0.4588, + "step": 23901 + }, + { + "epoch": 1.3384477545077837, + "grad_norm": 1.3816440105438232, + "learning_rate": 9.897605263157895e-05, + "loss": 0.4875, + "step": 23902 + }, + { + "epoch": 1.3385037518199128, + "grad_norm": 1.1241973638534546, + "learning_rate": 9.897578947368421e-05, + "loss": 0.3535, + "step": 23903 + }, + { + "epoch": 1.3385597491320418, + "grad_norm": 1.3934160470962524, + "learning_rate": 9.897552631578947e-05, + "loss": 0.4369, + "step": 23904 + }, + { + "epoch": 1.3386157464441708, + "grad_norm": 1.5238735675811768, + "learning_rate": 9.897526315789473e-05, + "loss": 0.4324, + "step": 23905 + }, + { + "epoch": 1.3386717437562998, + "grad_norm": 1.5582380294799805, + "learning_rate": 9.897500000000001e-05, + "loss": 0.5663, + "step": 23906 + }, + { + "epoch": 1.3387277410684288, + "grad_norm": 1.082220435142517, + "learning_rate": 9.897473684210527e-05, + "loss": 0.4133, + "step": 23907 + }, + { + "epoch": 1.3387837383805579, + "grad_norm": 1.3235070705413818, + "learning_rate": 
9.897447368421054e-05, + "loss": 0.4129, + "step": 23908 + }, + { + "epoch": 1.3388397356926869, + "grad_norm": 1.3532748222351074, + "learning_rate": 9.897421052631579e-05, + "loss": 0.5135, + "step": 23909 + }, + { + "epoch": 1.338895733004816, + "grad_norm": 1.4414713382720947, + "learning_rate": 9.897394736842106e-05, + "loss": 0.3811, + "step": 23910 + }, + { + "epoch": 1.338951730316945, + "grad_norm": 1.3468427658081055, + "learning_rate": 9.897368421052632e-05, + "loss": 0.4218, + "step": 23911 + }, + { + "epoch": 1.339007727629074, + "grad_norm": 1.3484735488891602, + "learning_rate": 9.897342105263159e-05, + "loss": 0.4637, + "step": 23912 + }, + { + "epoch": 1.339063724941203, + "grad_norm": 2.199549436569214, + "learning_rate": 9.897315789473685e-05, + "loss": 0.5371, + "step": 23913 + }, + { + "epoch": 1.339119722253332, + "grad_norm": 1.2956798076629639, + "learning_rate": 9.897289473684211e-05, + "loss": 0.4634, + "step": 23914 + }, + { + "epoch": 1.3391757195654608, + "grad_norm": 1.543900728225708, + "learning_rate": 9.897263157894737e-05, + "loss": 0.5257, + "step": 23915 + }, + { + "epoch": 1.3392317168775898, + "grad_norm": 1.3237605094909668, + "learning_rate": 9.897236842105264e-05, + "loss": 0.4308, + "step": 23916 + }, + { + "epoch": 1.3392877141897188, + "grad_norm": 1.7733278274536133, + "learning_rate": 9.89721052631579e-05, + "loss": 0.4651, + "step": 23917 + }, + { + "epoch": 1.3393437115018478, + "grad_norm": 1.2714046239852905, + "learning_rate": 9.897184210526315e-05, + "loss": 0.4149, + "step": 23918 + }, + { + "epoch": 1.3393997088139769, + "grad_norm": 1.4521942138671875, + "learning_rate": 9.897157894736842e-05, + "loss": 0.3297, + "step": 23919 + }, + { + "epoch": 1.3394557061261059, + "grad_norm": 1.4575254917144775, + "learning_rate": 9.897131578947368e-05, + "loss": 0.6913, + "step": 23920 + }, + { + "epoch": 1.339511703438235, + "grad_norm": 1.6212458610534668, + "learning_rate": 9.897105263157896e-05, + "loss": 0.5082, + 
"step": 23921 + }, + { + "epoch": 1.339567700750364, + "grad_norm": 1.4157074689865112, + "learning_rate": 9.897078947368422e-05, + "loss": 0.4816, + "step": 23922 + }, + { + "epoch": 1.339623698062493, + "grad_norm": 1.3685389757156372, + "learning_rate": 9.897052631578948e-05, + "loss": 0.4676, + "step": 23923 + }, + { + "epoch": 1.339679695374622, + "grad_norm": 1.3349485397338867, + "learning_rate": 9.897026315789474e-05, + "loss": 0.7631, + "step": 23924 + }, + { + "epoch": 1.339735692686751, + "grad_norm": 1.2802698612213135, + "learning_rate": 9.897000000000001e-05, + "loss": 0.5143, + "step": 23925 + }, + { + "epoch": 1.33979168999888, + "grad_norm": 1.3084275722503662, + "learning_rate": 9.896973684210527e-05, + "loss": 0.4478, + "step": 23926 + }, + { + "epoch": 1.339847687311009, + "grad_norm": 1.560441255569458, + "learning_rate": 9.896947368421053e-05, + "loss": 0.5547, + "step": 23927 + }, + { + "epoch": 1.339903684623138, + "grad_norm": 1.327688217163086, + "learning_rate": 9.896921052631579e-05, + "loss": 0.6401, + "step": 23928 + }, + { + "epoch": 1.339959681935267, + "grad_norm": 1.4617291688919067, + "learning_rate": 9.896894736842106e-05, + "loss": 0.4964, + "step": 23929 + }, + { + "epoch": 1.340015679247396, + "grad_norm": 1.5066403150558472, + "learning_rate": 9.896868421052632e-05, + "loss": 0.4309, + "step": 23930 + }, + { + "epoch": 1.340071676559525, + "grad_norm": 1.3324167728424072, + "learning_rate": 9.896842105263158e-05, + "loss": 0.4384, + "step": 23931 + }, + { + "epoch": 1.340127673871654, + "grad_norm": 1.2845227718353271, + "learning_rate": 9.896815789473684e-05, + "loss": 0.4418, + "step": 23932 + }, + { + "epoch": 1.3401836711837831, + "grad_norm": 1.507017731666565, + "learning_rate": 9.896789473684211e-05, + "loss": 0.5666, + "step": 23933 + }, + { + "epoch": 1.3402396684959121, + "grad_norm": 1.5677680969238281, + "learning_rate": 9.896763157894737e-05, + "loss": 0.5223, + "step": 23934 + }, + { + "epoch": 
1.3402956658080412, + "grad_norm": 1.4154253005981445, + "learning_rate": 9.896736842105263e-05, + "loss": 0.5921, + "step": 23935 + }, + { + "epoch": 1.3403516631201702, + "grad_norm": 1.358176589012146, + "learning_rate": 9.89671052631579e-05, + "loss": 0.4748, + "step": 23936 + }, + { + "epoch": 1.3404076604322992, + "grad_norm": 1.3491337299346924, + "learning_rate": 9.896684210526315e-05, + "loss": 0.4248, + "step": 23937 + }, + { + "epoch": 1.3404636577444282, + "grad_norm": 1.1043469905853271, + "learning_rate": 9.896657894736843e-05, + "loss": 0.4779, + "step": 23938 + }, + { + "epoch": 1.3405196550565572, + "grad_norm": 1.4574761390686035, + "learning_rate": 9.896631578947369e-05, + "loss": 0.6477, + "step": 23939 + }, + { + "epoch": 1.3405756523686863, + "grad_norm": 1.283100962638855, + "learning_rate": 9.896605263157896e-05, + "loss": 0.4059, + "step": 23940 + }, + { + "epoch": 1.3406316496808153, + "grad_norm": 1.1675559282302856, + "learning_rate": 9.89657894736842e-05, + "loss": 0.3313, + "step": 23941 + }, + { + "epoch": 1.3406876469929443, + "grad_norm": 1.175600290298462, + "learning_rate": 9.896552631578948e-05, + "loss": 0.3677, + "step": 23942 + }, + { + "epoch": 1.3407436443050733, + "grad_norm": 1.2264043092727661, + "learning_rate": 9.896526315789474e-05, + "loss": 0.4248, + "step": 23943 + }, + { + "epoch": 1.3407996416172023, + "grad_norm": 1.5873838663101196, + "learning_rate": 9.896500000000001e-05, + "loss": 0.487, + "step": 23944 + }, + { + "epoch": 1.3408556389293314, + "grad_norm": 1.2819849252700806, + "learning_rate": 9.896473684210527e-05, + "loss": 0.4605, + "step": 23945 + }, + { + "epoch": 1.3409116362414604, + "grad_norm": 1.4461268186569214, + "learning_rate": 9.896447368421053e-05, + "loss": 0.525, + "step": 23946 + }, + { + "epoch": 1.3409676335535894, + "grad_norm": 1.2202770709991455, + "learning_rate": 9.896421052631579e-05, + "loss": 0.4893, + "step": 23947 + }, + { + "epoch": 1.3410236308657184, + "grad_norm": 
1.4147801399230957, + "learning_rate": 9.896394736842106e-05, + "loss": 0.4573, + "step": 23948 + }, + { + "epoch": 1.3410796281778474, + "grad_norm": 1.9122450351715088, + "learning_rate": 9.896368421052632e-05, + "loss": 0.4832, + "step": 23949 + }, + { + "epoch": 1.3411356254899764, + "grad_norm": 1.5094122886657715, + "learning_rate": 9.896342105263158e-05, + "loss": 0.4196, + "step": 23950 + }, + { + "epoch": 1.3411916228021055, + "grad_norm": 1.4098691940307617, + "learning_rate": 9.896315789473684e-05, + "loss": 0.478, + "step": 23951 + }, + { + "epoch": 1.3412476201142345, + "grad_norm": 1.3345513343811035, + "learning_rate": 9.896289473684212e-05, + "loss": 0.3645, + "step": 23952 + }, + { + "epoch": 1.3413036174263635, + "grad_norm": 1.1226009130477905, + "learning_rate": 9.896263157894738e-05, + "loss": 0.3738, + "step": 23953 + }, + { + "epoch": 1.3413596147384925, + "grad_norm": 1.3263577222824097, + "learning_rate": 9.896236842105264e-05, + "loss": 0.4815, + "step": 23954 + }, + { + "epoch": 1.3414156120506215, + "grad_norm": 1.2852894067764282, + "learning_rate": 9.89621052631579e-05, + "loss": 0.4569, + "step": 23955 + }, + { + "epoch": 1.3414716093627506, + "grad_norm": 1.4982022047042847, + "learning_rate": 9.896184210526316e-05, + "loss": 0.6568, + "step": 23956 + }, + { + "epoch": 1.3415276066748796, + "grad_norm": 1.3840714693069458, + "learning_rate": 9.896157894736843e-05, + "loss": 0.4405, + "step": 23957 + }, + { + "epoch": 1.3415836039870086, + "grad_norm": 1.18630051612854, + "learning_rate": 9.896131578947369e-05, + "loss": 0.5221, + "step": 23958 + }, + { + "epoch": 1.3416396012991376, + "grad_norm": 1.3169230222702026, + "learning_rate": 9.896105263157895e-05, + "loss": 0.5608, + "step": 23959 + }, + { + "epoch": 1.3416955986112666, + "grad_norm": 1.7112146615982056, + "learning_rate": 9.896078947368421e-05, + "loss": 0.5546, + "step": 23960 + }, + { + "epoch": 1.3417515959233957, + "grad_norm": 1.5063049793243408, + "learning_rate": 
9.896052631578948e-05, + "loss": 0.6101, + "step": 23961 + }, + { + "epoch": 1.3418075932355247, + "grad_norm": 1.383370280265808, + "learning_rate": 9.896026315789474e-05, + "loss": 0.4475, + "step": 23962 + }, + { + "epoch": 1.3418635905476537, + "grad_norm": 1.4847168922424316, + "learning_rate": 9.896000000000001e-05, + "loss": 0.5212, + "step": 23963 + }, + { + "epoch": 1.3419195878597827, + "grad_norm": 1.172202229499817, + "learning_rate": 9.895973684210526e-05, + "loss": 0.4173, + "step": 23964 + }, + { + "epoch": 1.3419755851719117, + "grad_norm": 1.6316022872924805, + "learning_rate": 9.895947368421053e-05, + "loss": 0.5424, + "step": 23965 + }, + { + "epoch": 1.3420315824840408, + "grad_norm": 1.3216683864593506, + "learning_rate": 9.89592105263158e-05, + "loss": 0.4224, + "step": 23966 + }, + { + "epoch": 1.3420875797961698, + "grad_norm": 1.0985703468322754, + "learning_rate": 9.895894736842107e-05, + "loss": 0.3667, + "step": 23967 + }, + { + "epoch": 1.3421435771082988, + "grad_norm": 1.6286671161651611, + "learning_rate": 9.895868421052631e-05, + "loss": 0.5541, + "step": 23968 + }, + { + "epoch": 1.3421995744204278, + "grad_norm": 1.5791877508163452, + "learning_rate": 9.895842105263159e-05, + "loss": 0.5442, + "step": 23969 + }, + { + "epoch": 1.3422555717325568, + "grad_norm": 1.5616247653961182, + "learning_rate": 9.895815789473685e-05, + "loss": 0.6061, + "step": 23970 + }, + { + "epoch": 1.3423115690446858, + "grad_norm": 1.5898517370224, + "learning_rate": 9.89578947368421e-05, + "loss": 0.4698, + "step": 23971 + }, + { + "epoch": 1.3423675663568149, + "grad_norm": 1.1105499267578125, + "learning_rate": 9.895763157894738e-05, + "loss": 0.4383, + "step": 23972 + }, + { + "epoch": 1.3424235636689439, + "grad_norm": 1.3896452188491821, + "learning_rate": 9.895736842105263e-05, + "loss": 0.4147, + "step": 23973 + }, + { + "epoch": 1.342479560981073, + "grad_norm": 1.7090681791305542, + "learning_rate": 9.89571052631579e-05, + "loss": 0.4694, + 
"step": 23974 + }, + { + "epoch": 1.342535558293202, + "grad_norm": 2.046135187149048, + "learning_rate": 9.895684210526316e-05, + "loss": 0.7754, + "step": 23975 + }, + { + "epoch": 1.342591555605331, + "grad_norm": 1.567522644996643, + "learning_rate": 9.895657894736843e-05, + "loss": 0.5032, + "step": 23976 + }, + { + "epoch": 1.34264755291746, + "grad_norm": 1.248820424079895, + "learning_rate": 9.895631578947369e-05, + "loss": 0.4866, + "step": 23977 + }, + { + "epoch": 1.342703550229589, + "grad_norm": 1.32659113407135, + "learning_rate": 9.895605263157895e-05, + "loss": 0.5262, + "step": 23978 + }, + { + "epoch": 1.342759547541718, + "grad_norm": 1.36756432056427, + "learning_rate": 9.895578947368421e-05, + "loss": 0.4342, + "step": 23979 + }, + { + "epoch": 1.342815544853847, + "grad_norm": 1.3160245418548584, + "learning_rate": 9.895552631578948e-05, + "loss": 0.4198, + "step": 23980 + }, + { + "epoch": 1.342871542165976, + "grad_norm": 1.4190582036972046, + "learning_rate": 9.895526315789474e-05, + "loss": 0.4506, + "step": 23981 + }, + { + "epoch": 1.342927539478105, + "grad_norm": 1.7953743934631348, + "learning_rate": 9.8955e-05, + "loss": 0.4899, + "step": 23982 + }, + { + "epoch": 1.342983536790234, + "grad_norm": 1.3152244091033936, + "learning_rate": 9.895473684210526e-05, + "loss": 0.4237, + "step": 23983 + }, + { + "epoch": 1.343039534102363, + "grad_norm": 1.426226019859314, + "learning_rate": 9.895447368421054e-05, + "loss": 0.5225, + "step": 23984 + }, + { + "epoch": 1.3430955314144921, + "grad_norm": 1.4641419649124146, + "learning_rate": 9.89542105263158e-05, + "loss": 0.476, + "step": 23985 + }, + { + "epoch": 1.3431515287266211, + "grad_norm": 1.3623632192611694, + "learning_rate": 9.895394736842106e-05, + "loss": 0.5086, + "step": 23986 + }, + { + "epoch": 1.3432075260387502, + "grad_norm": 1.0939357280731201, + "learning_rate": 9.895368421052632e-05, + "loss": 0.5236, + "step": 23987 + }, + { + "epoch": 1.3432635233508792, + "grad_norm": 
1.5585516691207886, + "learning_rate": 9.895342105263158e-05, + "loss": 0.594, + "step": 23988 + }, + { + "epoch": 1.3433195206630082, + "grad_norm": 19.078744888305664, + "learning_rate": 9.895315789473685e-05, + "loss": 0.4286, + "step": 23989 + }, + { + "epoch": 1.3433755179751372, + "grad_norm": 1.1451103687286377, + "learning_rate": 9.895289473684211e-05, + "loss": 0.4013, + "step": 23990 + }, + { + "epoch": 1.3434315152872662, + "grad_norm": 1.6688843965530396, + "learning_rate": 9.895263157894737e-05, + "loss": 0.6271, + "step": 23991 + }, + { + "epoch": 1.3434875125993953, + "grad_norm": 1.4637559652328491, + "learning_rate": 9.895236842105263e-05, + "loss": 0.6666, + "step": 23992 + }, + { + "epoch": 1.3435435099115243, + "grad_norm": 1.3607884645462036, + "learning_rate": 9.89521052631579e-05, + "loss": 0.5131, + "step": 23993 + }, + { + "epoch": 1.3435995072236533, + "grad_norm": 1.3291383981704712, + "learning_rate": 9.895184210526316e-05, + "loss": 0.4581, + "step": 23994 + }, + { + "epoch": 1.3436555045357823, + "grad_norm": 1.298227071762085, + "learning_rate": 9.895157894736843e-05, + "loss": 0.5034, + "step": 23995 + }, + { + "epoch": 1.3437115018479113, + "grad_norm": 1.3066940307617188, + "learning_rate": 9.895131578947368e-05, + "loss": 0.4796, + "step": 23996 + }, + { + "epoch": 1.3437674991600403, + "grad_norm": 1.0873332023620605, + "learning_rate": 9.895105263157895e-05, + "loss": 0.3945, + "step": 23997 + }, + { + "epoch": 1.3438234964721694, + "grad_norm": 1.6476949453353882, + "learning_rate": 9.895078947368421e-05, + "loss": 0.5423, + "step": 23998 + }, + { + "epoch": 1.3438794937842984, + "grad_norm": 1.4215620756149292, + "learning_rate": 9.895052631578949e-05, + "loss": 0.4441, + "step": 23999 + }, + { + "epoch": 1.3439354910964274, + "grad_norm": 1.2197273969650269, + "learning_rate": 9.895026315789475e-05, + "loss": 0.383, + "step": 24000 + }, + { + "epoch": 1.3439914884085564, + "grad_norm": 1.3874455690383911, + "learning_rate": 
9.895e-05, + "loss": 0.5279, + "step": 24001 + }, + { + "epoch": 1.3440474857206854, + "grad_norm": 1.2576227188110352, + "learning_rate": 9.894973684210527e-05, + "loss": 0.4465, + "step": 24002 + }, + { + "epoch": 1.3441034830328145, + "grad_norm": 1.453696846961975, + "learning_rate": 9.894947368421054e-05, + "loss": 0.4785, + "step": 24003 + }, + { + "epoch": 1.3441594803449435, + "grad_norm": 1.446753978729248, + "learning_rate": 9.89492105263158e-05, + "loss": 0.5079, + "step": 24004 + }, + { + "epoch": 1.3442154776570725, + "grad_norm": 1.7142091989517212, + "learning_rate": 9.894894736842106e-05, + "loss": 0.6639, + "step": 24005 + }, + { + "epoch": 1.3442714749692015, + "grad_norm": 1.5219104290008545, + "learning_rate": 9.894868421052632e-05, + "loss": 0.4946, + "step": 24006 + }, + { + "epoch": 1.3443274722813305, + "grad_norm": 1.135257601737976, + "learning_rate": 9.894842105263158e-05, + "loss": 0.3277, + "step": 24007 + }, + { + "epoch": 1.3443834695934596, + "grad_norm": 1.2626960277557373, + "learning_rate": 9.894815789473685e-05, + "loss": 0.4215, + "step": 24008 + }, + { + "epoch": 1.3444394669055886, + "grad_norm": 1.1269794702529907, + "learning_rate": 9.894789473684211e-05, + "loss": 0.4061, + "step": 24009 + }, + { + "epoch": 1.3444954642177176, + "grad_norm": 1.2398040294647217, + "learning_rate": 9.894763157894737e-05, + "loss": 0.5051, + "step": 24010 + }, + { + "epoch": 1.3445514615298466, + "grad_norm": 1.2162896394729614, + "learning_rate": 9.894736842105263e-05, + "loss": 0.3527, + "step": 24011 + }, + { + "epoch": 1.3446074588419756, + "grad_norm": 1.1273400783538818, + "learning_rate": 9.89471052631579e-05, + "loss": 0.3632, + "step": 24012 + }, + { + "epoch": 1.3446634561541047, + "grad_norm": 1.2960906028747559, + "learning_rate": 9.894684210526316e-05, + "loss": 0.4308, + "step": 24013 + }, + { + "epoch": 1.3447194534662337, + "grad_norm": 1.1805799007415771, + "learning_rate": 9.894657894736842e-05, + "loss": 0.3707, + "step": 
24014 + }, + { + "epoch": 1.3447754507783627, + "grad_norm": 1.424858570098877, + "learning_rate": 9.894631578947368e-05, + "loss": 0.4553, + "step": 24015 + }, + { + "epoch": 1.3448314480904917, + "grad_norm": 1.1969478130340576, + "learning_rate": 9.894605263157896e-05, + "loss": 0.5471, + "step": 24016 + }, + { + "epoch": 1.3448874454026207, + "grad_norm": 1.5464682579040527, + "learning_rate": 9.894578947368422e-05, + "loss": 0.4241, + "step": 24017 + }, + { + "epoch": 1.3449434427147497, + "grad_norm": 1.2001961469650269, + "learning_rate": 9.894552631578949e-05, + "loss": 0.4713, + "step": 24018 + }, + { + "epoch": 1.3449994400268788, + "grad_norm": 1.5608412027359009, + "learning_rate": 9.894526315789474e-05, + "loss": 0.5646, + "step": 24019 + }, + { + "epoch": 1.3450554373390078, + "grad_norm": 1.757699728012085, + "learning_rate": 9.894500000000001e-05, + "loss": 0.5386, + "step": 24020 + }, + { + "epoch": 1.3451114346511368, + "grad_norm": 1.6653386354446411, + "learning_rate": 9.894473684210527e-05, + "loss": 0.593, + "step": 24021 + }, + { + "epoch": 1.3451674319632658, + "grad_norm": 1.305739402770996, + "learning_rate": 9.894447368421053e-05, + "loss": 0.4264, + "step": 24022 + }, + { + "epoch": 1.3452234292753948, + "grad_norm": 1.2630366086959839, + "learning_rate": 9.894421052631579e-05, + "loss": 0.5489, + "step": 24023 + }, + { + "epoch": 1.3452794265875239, + "grad_norm": 1.4003286361694336, + "learning_rate": 9.894394736842105e-05, + "loss": 0.5305, + "step": 24024 + }, + { + "epoch": 1.3453354238996529, + "grad_norm": 1.5072417259216309, + "learning_rate": 9.894368421052632e-05, + "loss": 0.5619, + "step": 24025 + }, + { + "epoch": 1.345391421211782, + "grad_norm": 1.4055123329162598, + "learning_rate": 9.894342105263158e-05, + "loss": 0.4955, + "step": 24026 + }, + { + "epoch": 1.345447418523911, + "grad_norm": 1.1914441585540771, + "learning_rate": 9.894315789473685e-05, + "loss": 0.3817, + "step": 24027 + }, + { + "epoch": 
1.34550341583604, + "grad_norm": 1.378165364265442, + "learning_rate": 9.89428947368421e-05, + "loss": 0.4743, + "step": 24028 + }, + { + "epoch": 1.345559413148169, + "grad_norm": 1.399539828300476, + "learning_rate": 9.894263157894737e-05, + "loss": 0.4816, + "step": 24029 + }, + { + "epoch": 1.345615410460298, + "grad_norm": 1.3169493675231934, + "learning_rate": 9.894236842105263e-05, + "loss": 0.4081, + "step": 24030 + }, + { + "epoch": 1.345671407772427, + "grad_norm": 1.3407050371170044, + "learning_rate": 9.89421052631579e-05, + "loss": 0.3549, + "step": 24031 + }, + { + "epoch": 1.345727405084556, + "grad_norm": 1.0616273880004883, + "learning_rate": 9.894184210526317e-05, + "loss": 0.3927, + "step": 24032 + }, + { + "epoch": 1.345783402396685, + "grad_norm": 1.173591136932373, + "learning_rate": 9.894157894736843e-05, + "loss": 0.3702, + "step": 24033 + }, + { + "epoch": 1.345839399708814, + "grad_norm": 1.3523924350738525, + "learning_rate": 9.894131578947369e-05, + "loss": 0.4446, + "step": 24034 + }, + { + "epoch": 1.345895397020943, + "grad_norm": 1.2364277839660645, + "learning_rate": 9.894105263157896e-05, + "loss": 0.4579, + "step": 24035 + }, + { + "epoch": 1.345951394333072, + "grad_norm": 1.3914684057235718, + "learning_rate": 9.894078947368422e-05, + "loss": 0.4652, + "step": 24036 + }, + { + "epoch": 1.3460073916452011, + "grad_norm": 1.5705797672271729, + "learning_rate": 9.894052631578948e-05, + "loss": 0.4735, + "step": 24037 + }, + { + "epoch": 1.3460633889573301, + "grad_norm": 1.4909099340438843, + "learning_rate": 9.894026315789474e-05, + "loss": 0.4696, + "step": 24038 + }, + { + "epoch": 1.346119386269459, + "grad_norm": 1.4098501205444336, + "learning_rate": 9.894e-05, + "loss": 0.5234, + "step": 24039 + }, + { + "epoch": 1.346175383581588, + "grad_norm": 1.753283143043518, + "learning_rate": 9.893973684210527e-05, + "loss": 0.3517, + "step": 24040 + }, + { + "epoch": 1.346231380893717, + "grad_norm": 1.263336420059204, + 
"learning_rate": 9.893947368421053e-05, + "loss": 0.4078, + "step": 24041 + }, + { + "epoch": 1.346287378205846, + "grad_norm": 1.461779236793518, + "learning_rate": 9.893921052631579e-05, + "loss": 0.4651, + "step": 24042 + }, + { + "epoch": 1.346343375517975, + "grad_norm": 1.3641862869262695, + "learning_rate": 9.893894736842105e-05, + "loss": 0.5486, + "step": 24043 + }, + { + "epoch": 1.346399372830104, + "grad_norm": 1.3653966188430786, + "learning_rate": 9.893868421052632e-05, + "loss": 0.4846, + "step": 24044 + }, + { + "epoch": 1.346455370142233, + "grad_norm": 1.9145190715789795, + "learning_rate": 9.893842105263158e-05, + "loss": 0.3525, + "step": 24045 + }, + { + "epoch": 1.346511367454362, + "grad_norm": 1.817022442817688, + "learning_rate": 9.893815789473684e-05, + "loss": 0.4692, + "step": 24046 + }, + { + "epoch": 1.346567364766491, + "grad_norm": 1.4517470598220825, + "learning_rate": 9.89378947368421e-05, + "loss": 0.4655, + "step": 24047 + }, + { + "epoch": 1.34662336207862, + "grad_norm": 1.4838329553604126, + "learning_rate": 9.893763157894738e-05, + "loss": 0.5176, + "step": 24048 + }, + { + "epoch": 1.3466793593907491, + "grad_norm": 1.3544859886169434, + "learning_rate": 9.893736842105264e-05, + "loss": 0.435, + "step": 24049 + }, + { + "epoch": 1.3467353567028781, + "grad_norm": 1.4888906478881836, + "learning_rate": 9.893710526315791e-05, + "loss": 0.4983, + "step": 24050 + }, + { + "epoch": 1.3467913540150072, + "grad_norm": 1.4776861667633057, + "learning_rate": 9.893684210526316e-05, + "loss": 0.3908, + "step": 24051 + }, + { + "epoch": 1.3468473513271362, + "grad_norm": 1.268038034439087, + "learning_rate": 9.893657894736843e-05, + "loss": 0.5198, + "step": 24052 + }, + { + "epoch": 1.3469033486392652, + "grad_norm": 1.2676458358764648, + "learning_rate": 9.893631578947369e-05, + "loss": 0.4383, + "step": 24053 + }, + { + "epoch": 1.3469593459513942, + "grad_norm": 1.3287932872772217, + "learning_rate": 9.893605263157896e-05, + "loss": 
0.6178, + "step": 24054 + }, + { + "epoch": 1.3470153432635232, + "grad_norm": 1.209586501121521, + "learning_rate": 9.893578947368422e-05, + "loss": 0.3452, + "step": 24055 + }, + { + "epoch": 1.3470713405756523, + "grad_norm": 1.239275336265564, + "learning_rate": 9.893552631578947e-05, + "loss": 0.4227, + "step": 24056 + }, + { + "epoch": 1.3471273378877813, + "grad_norm": 1.1617977619171143, + "learning_rate": 9.893526315789474e-05, + "loss": 0.5356, + "step": 24057 + }, + { + "epoch": 1.3471833351999103, + "grad_norm": 1.5607088804244995, + "learning_rate": 9.8935e-05, + "loss": 0.4493, + "step": 24058 + }, + { + "epoch": 1.3472393325120393, + "grad_norm": 1.4072494506835938, + "learning_rate": 9.893473684210527e-05, + "loss": 0.4303, + "step": 24059 + }, + { + "epoch": 1.3472953298241683, + "grad_norm": 1.407301902770996, + "learning_rate": 9.893447368421053e-05, + "loss": 0.4385, + "step": 24060 + }, + { + "epoch": 1.3473513271362973, + "grad_norm": 2.6843373775482178, + "learning_rate": 9.89342105263158e-05, + "loss": 0.5358, + "step": 24061 + }, + { + "epoch": 1.3474073244484264, + "grad_norm": 1.351423978805542, + "learning_rate": 9.893394736842105e-05, + "loss": 0.4161, + "step": 24062 + }, + { + "epoch": 1.3474633217605554, + "grad_norm": 1.351823329925537, + "learning_rate": 9.893368421052633e-05, + "loss": 0.4508, + "step": 24063 + }, + { + "epoch": 1.3475193190726844, + "grad_norm": 1.767401933670044, + "learning_rate": 9.893342105263159e-05, + "loss": 0.7152, + "step": 24064 + }, + { + "epoch": 1.3475753163848134, + "grad_norm": 6.961129188537598, + "learning_rate": 9.893315789473685e-05, + "loss": 0.5912, + "step": 24065 + }, + { + "epoch": 1.3476313136969424, + "grad_norm": 1.4069809913635254, + "learning_rate": 9.89328947368421e-05, + "loss": 0.3986, + "step": 24066 + }, + { + "epoch": 1.3476873110090715, + "grad_norm": 1.523929476737976, + "learning_rate": 9.893263157894738e-05, + "loss": 0.5491, + "step": 24067 + }, + { + "epoch": 
1.3477433083212005, + "grad_norm": 1.4899320602416992, + "learning_rate": 9.893236842105264e-05, + "loss": 0.474, + "step": 24068 + }, + { + "epoch": 1.3477993056333295, + "grad_norm": 1.280630350112915, + "learning_rate": 9.89321052631579e-05, + "loss": 0.4654, + "step": 24069 + }, + { + "epoch": 1.3478553029454585, + "grad_norm": 1.3929500579833984, + "learning_rate": 9.893184210526316e-05, + "loss": 0.3879, + "step": 24070 + }, + { + "epoch": 1.3479113002575875, + "grad_norm": 1.3972034454345703, + "learning_rate": 9.893157894736843e-05, + "loss": 0.4501, + "step": 24071 + }, + { + "epoch": 1.3479672975697166, + "grad_norm": 1.2639726400375366, + "learning_rate": 9.893131578947369e-05, + "loss": 0.4856, + "step": 24072 + }, + { + "epoch": 1.3480232948818456, + "grad_norm": 1.5184447765350342, + "learning_rate": 9.893105263157896e-05, + "loss": 0.4357, + "step": 24073 + }, + { + "epoch": 1.3480792921939746, + "grad_norm": 1.315146565437317, + "learning_rate": 9.893078947368421e-05, + "loss": 0.4788, + "step": 24074 + }, + { + "epoch": 1.3481352895061036, + "grad_norm": 1.240248680114746, + "learning_rate": 9.893052631578947e-05, + "loss": 0.4533, + "step": 24075 + }, + { + "epoch": 1.3481912868182326, + "grad_norm": 1.172282338142395, + "learning_rate": 9.893026315789474e-05, + "loss": 0.4746, + "step": 24076 + }, + { + "epoch": 1.3482472841303617, + "grad_norm": 1.311255931854248, + "learning_rate": 9.893e-05, + "loss": 0.3963, + "step": 24077 + }, + { + "epoch": 1.3483032814424907, + "grad_norm": 1.3184565305709839, + "learning_rate": 9.892973684210526e-05, + "loss": 0.539, + "step": 24078 + }, + { + "epoch": 1.3483592787546197, + "grad_norm": 1.647714376449585, + "learning_rate": 9.892947368421052e-05, + "loss": 0.4814, + "step": 24079 + }, + { + "epoch": 1.3484152760667487, + "grad_norm": 1.3690624237060547, + "learning_rate": 9.89292105263158e-05, + "loss": 0.5514, + "step": 24080 + }, + { + "epoch": 1.3484712733788777, + "grad_norm": 1.2252135276794434, + 
"learning_rate": 9.892894736842106e-05, + "loss": 0.3129, + "step": 24081 + }, + { + "epoch": 1.3485272706910068, + "grad_norm": 1.3502732515335083, + "learning_rate": 9.892868421052633e-05, + "loss": 0.5709, + "step": 24082 + }, + { + "epoch": 1.3485832680031358, + "grad_norm": 1.294970989227295, + "learning_rate": 9.892842105263158e-05, + "loss": 0.4358, + "step": 24083 + }, + { + "epoch": 1.3486392653152648, + "grad_norm": 1.469613790512085, + "learning_rate": 9.892815789473685e-05, + "loss": 0.4896, + "step": 24084 + }, + { + "epoch": 1.3486952626273938, + "grad_norm": 1.8542873859405518, + "learning_rate": 9.892789473684211e-05, + "loss": 0.5623, + "step": 24085 + }, + { + "epoch": 1.3487512599395228, + "grad_norm": 1.658245325088501, + "learning_rate": 9.892763157894738e-05, + "loss": 0.6528, + "step": 24086 + }, + { + "epoch": 1.3488072572516518, + "grad_norm": 1.2902683019638062, + "learning_rate": 9.892736842105264e-05, + "loss": 0.5618, + "step": 24087 + }, + { + "epoch": 1.3488632545637809, + "grad_norm": 1.1421093940734863, + "learning_rate": 9.89271052631579e-05, + "loss": 0.4831, + "step": 24088 + }, + { + "epoch": 1.3489192518759099, + "grad_norm": 1.7525116205215454, + "learning_rate": 9.892684210526316e-05, + "loss": 0.5039, + "step": 24089 + }, + { + "epoch": 1.348975249188039, + "grad_norm": 1.113273024559021, + "learning_rate": 9.892657894736843e-05, + "loss": 0.3923, + "step": 24090 + }, + { + "epoch": 1.349031246500168, + "grad_norm": 1.418257713317871, + "learning_rate": 9.89263157894737e-05, + "loss": 0.5238, + "step": 24091 + }, + { + "epoch": 1.349087243812297, + "grad_norm": 1.840466856956482, + "learning_rate": 9.892605263157895e-05, + "loss": 0.542, + "step": 24092 + }, + { + "epoch": 1.349143241124426, + "grad_norm": 1.3966180086135864, + "learning_rate": 9.892578947368421e-05, + "loss": 0.4315, + "step": 24093 + }, + { + "epoch": 1.349199238436555, + "grad_norm": 1.2987778186798096, + "learning_rate": 9.892552631578947e-05, + "loss": 
0.5709, + "step": 24094 + }, + { + "epoch": 1.349255235748684, + "grad_norm": 1.2208820581436157, + "learning_rate": 9.892526315789475e-05, + "loss": 0.45, + "step": 24095 + }, + { + "epoch": 1.349311233060813, + "grad_norm": 2.0124058723449707, + "learning_rate": 9.8925e-05, + "loss": 0.544, + "step": 24096 + }, + { + "epoch": 1.349367230372942, + "grad_norm": 1.3852250576019287, + "learning_rate": 9.892473684210527e-05, + "loss": 0.4652, + "step": 24097 + }, + { + "epoch": 1.349423227685071, + "grad_norm": 1.2076325416564941, + "learning_rate": 9.892447368421053e-05, + "loss": 0.3717, + "step": 24098 + }, + { + "epoch": 1.3494792249972, + "grad_norm": 1.1834232807159424, + "learning_rate": 9.89242105263158e-05, + "loss": 0.4067, + "step": 24099 + }, + { + "epoch": 1.349535222309329, + "grad_norm": 1.4517765045166016, + "learning_rate": 9.892394736842106e-05, + "loss": 0.5416, + "step": 24100 + }, + { + "epoch": 1.3495912196214581, + "grad_norm": 1.5725260972976685, + "learning_rate": 9.892368421052632e-05, + "loss": 0.607, + "step": 24101 + }, + { + "epoch": 1.3496472169335871, + "grad_norm": 1.9018875360488892, + "learning_rate": 9.892342105263158e-05, + "loss": 0.6389, + "step": 24102 + }, + { + "epoch": 1.3497032142457162, + "grad_norm": 1.2932662963867188, + "learning_rate": 9.892315789473685e-05, + "loss": 0.432, + "step": 24103 + }, + { + "epoch": 1.3497592115578452, + "grad_norm": 1.3422033786773682, + "learning_rate": 9.892289473684211e-05, + "loss": 0.4565, + "step": 24104 + }, + { + "epoch": 1.3498152088699742, + "grad_norm": 1.4826048612594604, + "learning_rate": 9.892263157894738e-05, + "loss": 0.5029, + "step": 24105 + }, + { + "epoch": 1.3498712061821032, + "grad_norm": 1.4096723794937134, + "learning_rate": 9.892236842105263e-05, + "loss": 0.5243, + "step": 24106 + }, + { + "epoch": 1.3499272034942322, + "grad_norm": 1.2899080514907837, + "learning_rate": 9.89221052631579e-05, + "loss": 0.4729, + "step": 24107 + }, + { + "epoch": 
1.3499832008063612, + "grad_norm": 3.5839273929595947, + "learning_rate": 9.892184210526316e-05, + "loss": 0.4041, + "step": 24108 + }, + { + "epoch": 1.3500391981184903, + "grad_norm": 1.2935422658920288, + "learning_rate": 9.892157894736842e-05, + "loss": 0.5591, + "step": 24109 + }, + { + "epoch": 1.3500951954306193, + "grad_norm": 1.571065068244934, + "learning_rate": 9.89213157894737e-05, + "loss": 0.4823, + "step": 24110 + }, + { + "epoch": 1.3501511927427483, + "grad_norm": 1.3770251274108887, + "learning_rate": 9.892105263157894e-05, + "loss": 0.5661, + "step": 24111 + }, + { + "epoch": 1.3502071900548773, + "grad_norm": 1.1110776662826538, + "learning_rate": 9.892078947368422e-05, + "loss": 0.2878, + "step": 24112 + }, + { + "epoch": 1.3502631873670063, + "grad_norm": 1.5924330949783325, + "learning_rate": 9.892052631578948e-05, + "loss": 0.6316, + "step": 24113 + }, + { + "epoch": 1.3503191846791354, + "grad_norm": 1.457471489906311, + "learning_rate": 9.892026315789475e-05, + "loss": 0.4762, + "step": 24114 + }, + { + "epoch": 1.3503751819912644, + "grad_norm": 1.3572285175323486, + "learning_rate": 9.892e-05, + "loss": 0.6146, + "step": 24115 + }, + { + "epoch": 1.3504311793033934, + "grad_norm": 1.2651491165161133, + "learning_rate": 9.891973684210527e-05, + "loss": 0.5161, + "step": 24116 + }, + { + "epoch": 1.3504871766155224, + "grad_norm": 1.3892878293991089, + "learning_rate": 9.891947368421053e-05, + "loss": 0.4709, + "step": 24117 + }, + { + "epoch": 1.3505431739276514, + "grad_norm": 1.5117753744125366, + "learning_rate": 9.89192105263158e-05, + "loss": 0.5472, + "step": 24118 + }, + { + "epoch": 1.3505991712397805, + "grad_norm": 1.2897844314575195, + "learning_rate": 9.891894736842106e-05, + "loss": 0.4222, + "step": 24119 + }, + { + "epoch": 1.3506551685519095, + "grad_norm": 1.2057937383651733, + "learning_rate": 9.891868421052632e-05, + "loss": 0.4167, + "step": 24120 + }, + { + "epoch": 1.3507111658640385, + "grad_norm": 
1.2438232898712158, + "learning_rate": 9.891842105263158e-05, + "loss": 0.3894, + "step": 24121 + }, + { + "epoch": 1.3507671631761675, + "grad_norm": 1.4046286344528198, + "learning_rate": 9.891815789473685e-05, + "loss": 0.5186, + "step": 24122 + }, + { + "epoch": 1.3508231604882965, + "grad_norm": 1.3658525943756104, + "learning_rate": 9.891789473684211e-05, + "loss": 0.4163, + "step": 24123 + }, + { + "epoch": 1.3508791578004256, + "grad_norm": 1.5054686069488525, + "learning_rate": 9.891763157894737e-05, + "loss": 0.5208, + "step": 24124 + }, + { + "epoch": 1.3509351551125546, + "grad_norm": 1.4575799703598022, + "learning_rate": 9.891736842105263e-05, + "loss": 0.439, + "step": 24125 + }, + { + "epoch": 1.3509911524246836, + "grad_norm": 1.4247299432754517, + "learning_rate": 9.891710526315789e-05, + "loss": 0.4459, + "step": 24126 + }, + { + "epoch": 1.3510471497368126, + "grad_norm": 1.237309455871582, + "learning_rate": 9.891684210526317e-05, + "loss": 0.5082, + "step": 24127 + }, + { + "epoch": 1.3511031470489416, + "grad_norm": 1.267808198928833, + "learning_rate": 9.891657894736843e-05, + "loss": 0.3936, + "step": 24128 + }, + { + "epoch": 1.3511591443610707, + "grad_norm": 1.9146208763122559, + "learning_rate": 9.891631578947369e-05, + "loss": 0.5779, + "step": 24129 + }, + { + "epoch": 1.3512151416731997, + "grad_norm": 1.3047025203704834, + "learning_rate": 9.891605263157894e-05, + "loss": 0.5201, + "step": 24130 + }, + { + "epoch": 1.3512711389853287, + "grad_norm": 1.3481351137161255, + "learning_rate": 9.891578947368422e-05, + "loss": 0.5738, + "step": 24131 + }, + { + "epoch": 1.3513271362974577, + "grad_norm": 1.3294408321380615, + "learning_rate": 9.891552631578948e-05, + "loss": 0.4455, + "step": 24132 + }, + { + "epoch": 1.3513831336095867, + "grad_norm": 1.4808603525161743, + "learning_rate": 9.891526315789474e-05, + "loss": 0.454, + "step": 24133 + }, + { + "epoch": 1.3514391309217157, + "grad_norm": 1.704590082168579, + "learning_rate": 
9.8915e-05, + "loss": 0.4827, + "step": 24134 + }, + { + "epoch": 1.3514951282338448, + "grad_norm": 1.5826306343078613, + "learning_rate": 9.891473684210527e-05, + "loss": 0.3863, + "step": 24135 + }, + { + "epoch": 1.3515511255459738, + "grad_norm": 1.5127642154693604, + "learning_rate": 9.891447368421053e-05, + "loss": 0.4407, + "step": 24136 + }, + { + "epoch": 1.3516071228581028, + "grad_norm": 1.5637285709381104, + "learning_rate": 9.89142105263158e-05, + "loss": 0.5563, + "step": 24137 + }, + { + "epoch": 1.3516631201702318, + "grad_norm": 1.279770851135254, + "learning_rate": 9.891394736842105e-05, + "loss": 0.4167, + "step": 24138 + }, + { + "epoch": 1.3517191174823608, + "grad_norm": 1.5844379663467407, + "learning_rate": 9.891368421052632e-05, + "loss": 0.4667, + "step": 24139 + }, + { + "epoch": 1.3517751147944899, + "grad_norm": 1.5427618026733398, + "learning_rate": 9.891342105263158e-05, + "loss": 0.546, + "step": 24140 + }, + { + "epoch": 1.3518311121066189, + "grad_norm": 1.4364628791809082, + "learning_rate": 9.891315789473686e-05, + "loss": 0.5371, + "step": 24141 + }, + { + "epoch": 1.351887109418748, + "grad_norm": 1.5178382396697998, + "learning_rate": 9.891289473684212e-05, + "loss": 0.5804, + "step": 24142 + }, + { + "epoch": 1.351943106730877, + "grad_norm": 1.4384104013442993, + "learning_rate": 9.891263157894736e-05, + "loss": 0.3964, + "step": 24143 + }, + { + "epoch": 1.351999104043006, + "grad_norm": 1.4983294010162354, + "learning_rate": 9.891236842105264e-05, + "loss": 0.4637, + "step": 24144 + }, + { + "epoch": 1.352055101355135, + "grad_norm": 1.268157720565796, + "learning_rate": 9.89121052631579e-05, + "loss": 0.4143, + "step": 24145 + }, + { + "epoch": 1.352111098667264, + "grad_norm": 2.0224101543426514, + "learning_rate": 9.891184210526317e-05, + "loss": 0.5155, + "step": 24146 + }, + { + "epoch": 1.352167095979393, + "grad_norm": 1.149941086769104, + "learning_rate": 9.891157894736843e-05, + "loss": 0.4014, + "step": 24147 + 
}, + { + "epoch": 1.352223093291522, + "grad_norm": 1.277801752090454, + "learning_rate": 9.891131578947369e-05, + "loss": 0.3095, + "step": 24148 + }, + { + "epoch": 1.352279090603651, + "grad_norm": 1.5732860565185547, + "learning_rate": 9.891105263157895e-05, + "loss": 0.5345, + "step": 24149 + }, + { + "epoch": 1.35233508791578, + "grad_norm": 1.5471495389938354, + "learning_rate": 9.891078947368422e-05, + "loss": 0.4654, + "step": 24150 + }, + { + "epoch": 1.352391085227909, + "grad_norm": 1.4263899326324463, + "learning_rate": 9.891052631578948e-05, + "loss": 0.5575, + "step": 24151 + }, + { + "epoch": 1.352447082540038, + "grad_norm": 1.419783115386963, + "learning_rate": 9.891026315789474e-05, + "loss": 0.5694, + "step": 24152 + }, + { + "epoch": 1.3525030798521671, + "grad_norm": 1.1714369058609009, + "learning_rate": 9.891e-05, + "loss": 0.3831, + "step": 24153 + }, + { + "epoch": 1.3525590771642961, + "grad_norm": 1.4590812921524048, + "learning_rate": 9.890973684210527e-05, + "loss": 0.4227, + "step": 24154 + }, + { + "epoch": 1.3526150744764251, + "grad_norm": 1.3871046304702759, + "learning_rate": 9.890947368421053e-05, + "loss": 0.4155, + "step": 24155 + }, + { + "epoch": 1.3526710717885542, + "grad_norm": 1.2001183032989502, + "learning_rate": 9.890921052631579e-05, + "loss": 0.506, + "step": 24156 + }, + { + "epoch": 1.3527270691006832, + "grad_norm": 1.2502472400665283, + "learning_rate": 9.890894736842105e-05, + "loss": 0.4634, + "step": 24157 + }, + { + "epoch": 1.3527830664128122, + "grad_norm": 1.3538122177124023, + "learning_rate": 9.890868421052633e-05, + "loss": 0.5772, + "step": 24158 + }, + { + "epoch": 1.3528390637249412, + "grad_norm": 1.198918104171753, + "learning_rate": 9.890842105263159e-05, + "loss": 0.4198, + "step": 24159 + }, + { + "epoch": 1.3528950610370702, + "grad_norm": 1.3008321523666382, + "learning_rate": 9.890815789473685e-05, + "loss": 0.4081, + "step": 24160 + }, + { + "epoch": 1.3529510583491993, + "grad_norm": 
1.3779500722885132, + "learning_rate": 9.89078947368421e-05, + "loss": 0.3471, + "step": 24161 + }, + { + "epoch": 1.3530070556613283, + "grad_norm": 1.5330551862716675, + "learning_rate": 9.890763157894736e-05, + "loss": 0.5198, + "step": 24162 + }, + { + "epoch": 1.3530630529734573, + "grad_norm": 1.5421743392944336, + "learning_rate": 9.890736842105264e-05, + "loss": 0.5889, + "step": 24163 + }, + { + "epoch": 1.3531190502855863, + "grad_norm": 1.4606456756591797, + "learning_rate": 9.89071052631579e-05, + "loss": 0.5037, + "step": 24164 + }, + { + "epoch": 1.3531750475977153, + "grad_norm": 1.3997637033462524, + "learning_rate": 9.890684210526317e-05, + "loss": 0.4339, + "step": 24165 + }, + { + "epoch": 1.3532310449098444, + "grad_norm": 2.146996259689331, + "learning_rate": 9.890657894736842e-05, + "loss": 0.4112, + "step": 24166 + }, + { + "epoch": 1.3532870422219734, + "grad_norm": 1.3905378580093384, + "learning_rate": 9.890631578947369e-05, + "loss": 0.4845, + "step": 24167 + }, + { + "epoch": 1.3533430395341024, + "grad_norm": 1.1500251293182373, + "learning_rate": 9.890605263157895e-05, + "loss": 0.3876, + "step": 24168 + }, + { + "epoch": 1.3533990368462314, + "grad_norm": 1.3702603578567505, + "learning_rate": 9.890578947368422e-05, + "loss": 0.5397, + "step": 24169 + }, + { + "epoch": 1.3534550341583604, + "grad_norm": 1.3898779153823853, + "learning_rate": 9.890552631578947e-05, + "loss": 0.61, + "step": 24170 + }, + { + "epoch": 1.3535110314704895, + "grad_norm": 1.6539660692214966, + "learning_rate": 9.890526315789474e-05, + "loss": 0.5015, + "step": 24171 + }, + { + "epoch": 1.3535670287826185, + "grad_norm": 1.4206243753433228, + "learning_rate": 9.8905e-05, + "loss": 0.5337, + "step": 24172 + }, + { + "epoch": 1.3536230260947475, + "grad_norm": 1.1216557025909424, + "learning_rate": 9.890473684210528e-05, + "loss": 0.4585, + "step": 24173 + }, + { + "epoch": 1.3536790234068765, + "grad_norm": 1.3988527059555054, + "learning_rate": 
9.890447368421054e-05, + "loss": 0.4261, + "step": 24174 + }, + { + "epoch": 1.3537350207190055, + "grad_norm": 1.198021411895752, + "learning_rate": 9.89042105263158e-05, + "loss": 0.4139, + "step": 24175 + }, + { + "epoch": 1.3537910180311346, + "grad_norm": 1.432989478111267, + "learning_rate": 9.890394736842106e-05, + "loss": 0.5061, + "step": 24176 + }, + { + "epoch": 1.3538470153432636, + "grad_norm": 1.4509515762329102, + "learning_rate": 9.890368421052631e-05, + "loss": 0.4368, + "step": 24177 + }, + { + "epoch": 1.3539030126553926, + "grad_norm": 1.2205407619476318, + "learning_rate": 9.890342105263159e-05, + "loss": 0.4162, + "step": 24178 + }, + { + "epoch": 1.3539590099675216, + "grad_norm": 1.3447471857070923, + "learning_rate": 9.890315789473685e-05, + "loss": 0.4203, + "step": 24179 + }, + { + "epoch": 1.3540150072796506, + "grad_norm": 1.5836881399154663, + "learning_rate": 9.890289473684211e-05, + "loss": 0.4985, + "step": 24180 + }, + { + "epoch": 1.3540710045917796, + "grad_norm": 1.6705793142318726, + "learning_rate": 9.890263157894737e-05, + "loss": 0.4286, + "step": 24181 + }, + { + "epoch": 1.3541270019039087, + "grad_norm": 1.2332497835159302, + "learning_rate": 9.890236842105264e-05, + "loss": 0.3292, + "step": 24182 + }, + { + "epoch": 1.3541829992160377, + "grad_norm": 3.5343668460845947, + "learning_rate": 9.89021052631579e-05, + "loss": 0.5716, + "step": 24183 + }, + { + "epoch": 1.3542389965281667, + "grad_norm": 1.3043307065963745, + "learning_rate": 9.890184210526316e-05, + "loss": 0.4799, + "step": 24184 + }, + { + "epoch": 1.3542949938402957, + "grad_norm": 1.2603365182876587, + "learning_rate": 9.890157894736842e-05, + "loss": 0.4495, + "step": 24185 + }, + { + "epoch": 1.3543509911524247, + "grad_norm": 1.324040412902832, + "learning_rate": 9.890131578947369e-05, + "loss": 0.3862, + "step": 24186 + }, + { + "epoch": 1.3544069884645538, + "grad_norm": 1.3032180070877075, + "learning_rate": 9.890105263157895e-05, + "loss": 0.5301, 
+ "step": 24187 + }, + { + "epoch": 1.3544629857766828, + "grad_norm": 1.455611228942871, + "learning_rate": 9.890078947368421e-05, + "loss": 0.4353, + "step": 24188 + }, + { + "epoch": 1.3545189830888118, + "grad_norm": 1.2120689153671265, + "learning_rate": 9.890052631578947e-05, + "loss": 0.429, + "step": 24189 + }, + { + "epoch": 1.3545749804009408, + "grad_norm": 1.3751366138458252, + "learning_rate": 9.890026315789475e-05, + "loss": 0.4037, + "step": 24190 + }, + { + "epoch": 1.3546309777130698, + "grad_norm": 1.38014817237854, + "learning_rate": 9.89e-05, + "loss": 0.5004, + "step": 24191 + }, + { + "epoch": 1.3546869750251989, + "grad_norm": 1.1251559257507324, + "learning_rate": 9.889973684210528e-05, + "loss": 0.4815, + "step": 24192 + }, + { + "epoch": 1.3547429723373279, + "grad_norm": 1.3140547275543213, + "learning_rate": 9.889947368421052e-05, + "loss": 0.5415, + "step": 24193 + }, + { + "epoch": 1.354798969649457, + "grad_norm": 1.2560465335845947, + "learning_rate": 9.88992105263158e-05, + "loss": 0.4763, + "step": 24194 + }, + { + "epoch": 1.354854966961586, + "grad_norm": 1.240114450454712, + "learning_rate": 9.889894736842106e-05, + "loss": 0.4366, + "step": 24195 + }, + { + "epoch": 1.354910964273715, + "grad_norm": 1.2777663469314575, + "learning_rate": 9.889868421052632e-05, + "loss": 0.3919, + "step": 24196 + }, + { + "epoch": 1.354966961585844, + "grad_norm": 1.5230880975723267, + "learning_rate": 9.889842105263159e-05, + "loss": 0.5446, + "step": 24197 + }, + { + "epoch": 1.355022958897973, + "grad_norm": 1.1917310953140259, + "learning_rate": 9.889815789473684e-05, + "loss": 0.5723, + "step": 24198 + }, + { + "epoch": 1.355078956210102, + "grad_norm": 1.3890283107757568, + "learning_rate": 9.889789473684211e-05, + "loss": 0.6764, + "step": 24199 + }, + { + "epoch": 1.355134953522231, + "grad_norm": 1.5664749145507812, + "learning_rate": 9.889763157894737e-05, + "loss": 0.5056, + "step": 24200 + }, + { + "epoch": 1.35519095083436, + 
"grad_norm": 1.706183671951294, + "learning_rate": 9.889736842105264e-05, + "loss": 0.4927, + "step": 24201 + }, + { + "epoch": 1.355246948146489, + "grad_norm": 1.0862797498703003, + "learning_rate": 9.88971052631579e-05, + "loss": 0.3748, + "step": 24202 + }, + { + "epoch": 1.355302945458618, + "grad_norm": 1.3727495670318604, + "learning_rate": 9.889684210526316e-05, + "loss": 0.4898, + "step": 24203 + }, + { + "epoch": 1.355358942770747, + "grad_norm": 1.635277271270752, + "learning_rate": 9.889657894736842e-05, + "loss": 0.4993, + "step": 24204 + }, + { + "epoch": 1.355414940082876, + "grad_norm": 1.2087209224700928, + "learning_rate": 9.88963157894737e-05, + "loss": 0.3636, + "step": 24205 + }, + { + "epoch": 1.3554709373950051, + "grad_norm": 1.111750602722168, + "learning_rate": 9.889605263157896e-05, + "loss": 0.4543, + "step": 24206 + }, + { + "epoch": 1.3555269347071341, + "grad_norm": 1.292447805404663, + "learning_rate": 9.889578947368422e-05, + "loss": 0.4484, + "step": 24207 + }, + { + "epoch": 1.3555829320192632, + "grad_norm": 1.3316152095794678, + "learning_rate": 9.889552631578947e-05, + "loss": 0.3604, + "step": 24208 + }, + { + "epoch": 1.3556389293313922, + "grad_norm": 1.314098834991455, + "learning_rate": 9.889526315789475e-05, + "loss": 0.3969, + "step": 24209 + }, + { + "epoch": 1.3556949266435212, + "grad_norm": 1.5935890674591064, + "learning_rate": 9.889500000000001e-05, + "loss": 0.4482, + "step": 24210 + }, + { + "epoch": 1.3557509239556502, + "grad_norm": 1.6971734762191772, + "learning_rate": 9.889473684210527e-05, + "loss": 0.4667, + "step": 24211 + }, + { + "epoch": 1.3558069212677792, + "grad_norm": 1.528926134109497, + "learning_rate": 9.889447368421053e-05, + "loss": 0.4533, + "step": 24212 + }, + { + "epoch": 1.3558629185799083, + "grad_norm": 1.1695584058761597, + "learning_rate": 9.889421052631579e-05, + "loss": 0.3628, + "step": 24213 + }, + { + "epoch": 1.3559189158920373, + "grad_norm": 1.1592373847961426, + 
"learning_rate": 9.889394736842106e-05, + "loss": 0.4212, + "step": 24214 + }, + { + "epoch": 1.3559749132041663, + "grad_norm": 1.202193021774292, + "learning_rate": 9.889368421052632e-05, + "loss": 0.4012, + "step": 24215 + }, + { + "epoch": 1.3560309105162953, + "grad_norm": 1.3646345138549805, + "learning_rate": 9.889342105263158e-05, + "loss": 0.3992, + "step": 24216 + }, + { + "epoch": 1.3560869078284243, + "grad_norm": 1.2469518184661865, + "learning_rate": 9.889315789473684e-05, + "loss": 0.5593, + "step": 24217 + }, + { + "epoch": 1.3561429051405534, + "grad_norm": 1.3526692390441895, + "learning_rate": 9.889289473684211e-05, + "loss": 0.434, + "step": 24218 + }, + { + "epoch": 1.3561989024526824, + "grad_norm": 1.6600333452224731, + "learning_rate": 9.889263157894737e-05, + "loss": 0.456, + "step": 24219 + }, + { + "epoch": 1.3562548997648114, + "grad_norm": 1.4503874778747559, + "learning_rate": 9.889236842105265e-05, + "loss": 0.4817, + "step": 24220 + }, + { + "epoch": 1.3563108970769404, + "grad_norm": 1.2051103115081787, + "learning_rate": 9.889210526315789e-05, + "loss": 0.48, + "step": 24221 + }, + { + "epoch": 1.3563668943890694, + "grad_norm": 1.310248613357544, + "learning_rate": 9.889184210526317e-05, + "loss": 0.4754, + "step": 24222 + }, + { + "epoch": 1.3564228917011985, + "grad_norm": 1.2252604961395264, + "learning_rate": 9.889157894736842e-05, + "loss": 0.4161, + "step": 24223 + }, + { + "epoch": 1.3564788890133275, + "grad_norm": 1.4256223440170288, + "learning_rate": 9.88913157894737e-05, + "loss": 0.6532, + "step": 24224 + }, + { + "epoch": 1.3565348863254565, + "grad_norm": 1.3690801858901978, + "learning_rate": 9.889105263157894e-05, + "loss": 0.5856, + "step": 24225 + }, + { + "epoch": 1.3565908836375855, + "grad_norm": 1.3578503131866455, + "learning_rate": 9.889078947368422e-05, + "loss": 0.4091, + "step": 24226 + }, + { + "epoch": 1.3566468809497145, + "grad_norm": 1.3648658990859985, + "learning_rate": 9.889052631578948e-05, + 
"loss": 0.5009, + "step": 24227 + }, + { + "epoch": 1.3567028782618435, + "grad_norm": 1.122082233428955, + "learning_rate": 9.889026315789475e-05, + "loss": 0.4603, + "step": 24228 + }, + { + "epoch": 1.3567588755739726, + "grad_norm": 1.250191330909729, + "learning_rate": 9.889000000000001e-05, + "loss": 0.4719, + "step": 24229 + }, + { + "epoch": 1.3568148728861016, + "grad_norm": 1.492835283279419, + "learning_rate": 9.888973684210526e-05, + "loss": 0.3728, + "step": 24230 + }, + { + "epoch": 1.3568708701982306, + "grad_norm": 1.8283497095108032, + "learning_rate": 9.888947368421053e-05, + "loss": 0.5613, + "step": 24231 + }, + { + "epoch": 1.3569268675103596, + "grad_norm": 1.3256876468658447, + "learning_rate": 9.888921052631579e-05, + "loss": 0.458, + "step": 24232 + }, + { + "epoch": 1.3569828648224886, + "grad_norm": 1.3450658321380615, + "learning_rate": 9.888894736842106e-05, + "loss": 0.5543, + "step": 24233 + }, + { + "epoch": 1.3570388621346177, + "grad_norm": 1.1495815515518188, + "learning_rate": 9.888868421052632e-05, + "loss": 0.3296, + "step": 24234 + }, + { + "epoch": 1.3570948594467467, + "grad_norm": 1.409485936164856, + "learning_rate": 9.888842105263158e-05, + "loss": 0.5735, + "step": 24235 + }, + { + "epoch": 1.3571508567588757, + "grad_norm": 1.1729849576950073, + "learning_rate": 9.888815789473684e-05, + "loss": 0.4933, + "step": 24236 + }, + { + "epoch": 1.3572068540710047, + "grad_norm": 1.3659942150115967, + "learning_rate": 9.888789473684212e-05, + "loss": 0.5493, + "step": 24237 + }, + { + "epoch": 1.3572628513831337, + "grad_norm": 1.3520770072937012, + "learning_rate": 9.888763157894737e-05, + "loss": 0.5127, + "step": 24238 + }, + { + "epoch": 1.3573188486952628, + "grad_norm": 1.4395102262496948, + "learning_rate": 9.888736842105263e-05, + "loss": 0.3862, + "step": 24239 + }, + { + "epoch": 1.3573748460073918, + "grad_norm": 1.3926860094070435, + "learning_rate": 9.88871052631579e-05, + "loss": 0.476, + "step": 24240 + }, + { + 
"epoch": 1.3574308433195208, + "grad_norm": 1.5400936603546143, + "learning_rate": 9.888684210526317e-05, + "loss": 0.415, + "step": 24241 + }, + { + "epoch": 1.3574868406316498, + "grad_norm": 1.6989082098007202, + "learning_rate": 9.888657894736843e-05, + "loss": 0.4737, + "step": 24242 + }, + { + "epoch": 1.3575428379437788, + "grad_norm": 1.441806674003601, + "learning_rate": 9.888631578947369e-05, + "loss": 0.3868, + "step": 24243 + }, + { + "epoch": 1.3575988352559079, + "grad_norm": 1.4132057428359985, + "learning_rate": 9.888605263157895e-05, + "loss": 0.536, + "step": 24244 + }, + { + "epoch": 1.3576548325680367, + "grad_norm": 1.4496694803237915, + "learning_rate": 9.888578947368422e-05, + "loss": 0.4312, + "step": 24245 + }, + { + "epoch": 1.3577108298801657, + "grad_norm": 1.1595571041107178, + "learning_rate": 9.888552631578948e-05, + "loss": 0.3974, + "step": 24246 + }, + { + "epoch": 1.3577668271922947, + "grad_norm": 1.4537824392318726, + "learning_rate": 9.888526315789474e-05, + "loss": 0.5266, + "step": 24247 + }, + { + "epoch": 1.3578228245044237, + "grad_norm": 1.4500279426574707, + "learning_rate": 9.8885e-05, + "loss": 0.5608, + "step": 24248 + }, + { + "epoch": 1.3578788218165527, + "grad_norm": 1.2889269590377808, + "learning_rate": 9.888473684210526e-05, + "loss": 0.4108, + "step": 24249 + }, + { + "epoch": 1.3579348191286817, + "grad_norm": 1.3143959045410156, + "learning_rate": 9.888447368421053e-05, + "loss": 0.4612, + "step": 24250 + }, + { + "epoch": 1.3579908164408108, + "grad_norm": 1.5159499645233154, + "learning_rate": 9.888421052631579e-05, + "loss": 0.5644, + "step": 24251 + }, + { + "epoch": 1.3580468137529398, + "grad_norm": 1.3953338861465454, + "learning_rate": 9.888394736842107e-05, + "loss": 0.4103, + "step": 24252 + }, + { + "epoch": 1.3581028110650688, + "grad_norm": 1.1751587390899658, + "learning_rate": 9.888368421052631e-05, + "loss": 0.4934, + "step": 24253 + }, + { + "epoch": 1.3581588083771978, + "grad_norm": 
1.4721421003341675, + "learning_rate": 9.888342105263158e-05, + "loss": 0.5593, + "step": 24254 + }, + { + "epoch": 1.3582148056893268, + "grad_norm": 1.2335082292556763, + "learning_rate": 9.888315789473684e-05, + "loss": 0.5028, + "step": 24255 + }, + { + "epoch": 1.3582708030014559, + "grad_norm": 1.506395697593689, + "learning_rate": 9.888289473684212e-05, + "loss": 0.4373, + "step": 24256 + }, + { + "epoch": 1.3583268003135849, + "grad_norm": 1.208866000175476, + "learning_rate": 9.888263157894738e-05, + "loss": 0.4574, + "step": 24257 + }, + { + "epoch": 1.358382797625714, + "grad_norm": 1.3434886932373047, + "learning_rate": 9.888236842105264e-05, + "loss": 0.5081, + "step": 24258 + }, + { + "epoch": 1.358438794937843, + "grad_norm": 1.523879051208496, + "learning_rate": 9.88821052631579e-05, + "loss": 0.4654, + "step": 24259 + }, + { + "epoch": 1.358494792249972, + "grad_norm": 1.5192832946777344, + "learning_rate": 9.888184210526317e-05, + "loss": 0.3732, + "step": 24260 + }, + { + "epoch": 1.358550789562101, + "grad_norm": 1.497955560684204, + "learning_rate": 9.888157894736843e-05, + "loss": 0.4321, + "step": 24261 + }, + { + "epoch": 1.35860678687423, + "grad_norm": 1.3706817626953125, + "learning_rate": 9.888131578947369e-05, + "loss": 0.5732, + "step": 24262 + }, + { + "epoch": 1.358662784186359, + "grad_norm": 1.3490325212478638, + "learning_rate": 9.888105263157895e-05, + "loss": 0.4374, + "step": 24263 + }, + { + "epoch": 1.358718781498488, + "grad_norm": 1.118749737739563, + "learning_rate": 9.888078947368421e-05, + "loss": 0.3573, + "step": 24264 + }, + { + "epoch": 1.358774778810617, + "grad_norm": 1.126299262046814, + "learning_rate": 9.888052631578948e-05, + "loss": 0.2995, + "step": 24265 + }, + { + "epoch": 1.358830776122746, + "grad_norm": 1.1920883655548096, + "learning_rate": 9.888026315789474e-05, + "loss": 0.4185, + "step": 24266 + }, + { + "epoch": 1.358886773434875, + "grad_norm": 1.254758358001709, + "learning_rate": 9.888e-05, + 
"loss": 0.4563, + "step": 24267 + }, + { + "epoch": 1.358942770747004, + "grad_norm": 1.0732206106185913, + "learning_rate": 9.887973684210526e-05, + "loss": 0.3767, + "step": 24268 + }, + { + "epoch": 1.358998768059133, + "grad_norm": 1.278638243675232, + "learning_rate": 9.887947368421053e-05, + "loss": 0.4268, + "step": 24269 + }, + { + "epoch": 1.3590547653712621, + "grad_norm": 1.693253517150879, + "learning_rate": 9.88792105263158e-05, + "loss": 0.4124, + "step": 24270 + }, + { + "epoch": 1.3591107626833911, + "grad_norm": 1.4540988206863403, + "learning_rate": 9.887894736842105e-05, + "loss": 0.4139, + "step": 24271 + }, + { + "epoch": 1.3591667599955202, + "grad_norm": 1.2501556873321533, + "learning_rate": 9.887868421052631e-05, + "loss": 0.467, + "step": 24272 + }, + { + "epoch": 1.3592227573076492, + "grad_norm": 1.3515464067459106, + "learning_rate": 9.887842105263159e-05, + "loss": 0.5135, + "step": 24273 + }, + { + "epoch": 1.3592787546197782, + "grad_norm": 1.2665053606033325, + "learning_rate": 9.887815789473685e-05, + "loss": 0.3851, + "step": 24274 + }, + { + "epoch": 1.3593347519319072, + "grad_norm": 1.5510942935943604, + "learning_rate": 9.887789473684212e-05, + "loss": 0.5612, + "step": 24275 + }, + { + "epoch": 1.3593907492440362, + "grad_norm": 1.9409633874893188, + "learning_rate": 9.887763157894737e-05, + "loss": 0.6694, + "step": 24276 + }, + { + "epoch": 1.3594467465561653, + "grad_norm": 1.3130944967269897, + "learning_rate": 9.887736842105264e-05, + "loss": 0.4047, + "step": 24277 + }, + { + "epoch": 1.3595027438682943, + "grad_norm": 1.4870585203170776, + "learning_rate": 9.88771052631579e-05, + "loss": 0.4797, + "step": 24278 + }, + { + "epoch": 1.3595587411804233, + "grad_norm": 1.2149924039840698, + "learning_rate": 9.887684210526317e-05, + "loss": 0.3805, + "step": 24279 + }, + { + "epoch": 1.3596147384925523, + "grad_norm": 1.4774595499038696, + "learning_rate": 9.887657894736842e-05, + "loss": 0.5419, + "step": 24280 + }, + { + 
"epoch": 1.3596707358046813, + "grad_norm": 1.419573187828064, + "learning_rate": 9.887631578947368e-05, + "loss": 0.5009, + "step": 24281 + }, + { + "epoch": 1.3597267331168104, + "grad_norm": 1.3416260480880737, + "learning_rate": 9.887605263157895e-05, + "loss": 0.4692, + "step": 24282 + }, + { + "epoch": 1.3597827304289394, + "grad_norm": 1.3263635635375977, + "learning_rate": 9.887578947368421e-05, + "loss": 0.4656, + "step": 24283 + }, + { + "epoch": 1.3598387277410684, + "grad_norm": 1.96281099319458, + "learning_rate": 9.887552631578949e-05, + "loss": 0.4804, + "step": 24284 + }, + { + "epoch": 1.3598947250531974, + "grad_norm": 1.3477357625961304, + "learning_rate": 9.887526315789473e-05, + "loss": 0.4723, + "step": 24285 + }, + { + "epoch": 1.3599507223653264, + "grad_norm": 1.3089745044708252, + "learning_rate": 9.8875e-05, + "loss": 0.5166, + "step": 24286 + }, + { + "epoch": 1.3600067196774555, + "grad_norm": 1.3642199039459229, + "learning_rate": 9.887473684210526e-05, + "loss": 0.5577, + "step": 24287 + }, + { + "epoch": 1.3600627169895845, + "grad_norm": 1.3563685417175293, + "learning_rate": 9.887447368421054e-05, + "loss": 0.6061, + "step": 24288 + }, + { + "epoch": 1.3601187143017135, + "grad_norm": 1.2317285537719727, + "learning_rate": 9.88742105263158e-05, + "loss": 0.3844, + "step": 24289 + }, + { + "epoch": 1.3601747116138425, + "grad_norm": 1.2608413696289062, + "learning_rate": 9.887394736842106e-05, + "loss": 0.5228, + "step": 24290 + }, + { + "epoch": 1.3602307089259715, + "grad_norm": 1.5067768096923828, + "learning_rate": 9.887368421052632e-05, + "loss": 0.4704, + "step": 24291 + }, + { + "epoch": 1.3602867062381006, + "grad_norm": 1.5356553792953491, + "learning_rate": 9.887342105263159e-05, + "loss": 0.7033, + "step": 24292 + }, + { + "epoch": 1.3603427035502296, + "grad_norm": 1.1460360288619995, + "learning_rate": 9.887315789473685e-05, + "loss": 0.399, + "step": 24293 + }, + { + "epoch": 1.3603987008623586, + "grad_norm": 
1.5921638011932373, + "learning_rate": 9.887289473684211e-05, + "loss": 0.5124, + "step": 24294 + }, + { + "epoch": 1.3604546981744876, + "grad_norm": 1.4019817113876343, + "learning_rate": 9.887263157894737e-05, + "loss": 0.4948, + "step": 24295 + }, + { + "epoch": 1.3605106954866166, + "grad_norm": 1.5591027736663818, + "learning_rate": 9.887236842105264e-05, + "loss": 0.4862, + "step": 24296 + }, + { + "epoch": 1.3605666927987456, + "grad_norm": 1.374293565750122, + "learning_rate": 9.88721052631579e-05, + "loss": 0.5368, + "step": 24297 + }, + { + "epoch": 1.3606226901108747, + "grad_norm": 1.49769926071167, + "learning_rate": 9.887184210526316e-05, + "loss": 0.55, + "step": 24298 + }, + { + "epoch": 1.3606786874230037, + "grad_norm": 1.1991252899169922, + "learning_rate": 9.887157894736842e-05, + "loss": 0.4687, + "step": 24299 + }, + { + "epoch": 1.3607346847351327, + "grad_norm": 1.3198788166046143, + "learning_rate": 9.887131578947368e-05, + "loss": 0.4489, + "step": 24300 + }, + { + "epoch": 1.3607906820472617, + "grad_norm": 1.2617061138153076, + "learning_rate": 9.887105263157895e-05, + "loss": 0.395, + "step": 24301 + }, + { + "epoch": 1.3608466793593907, + "grad_norm": 1.3370567560195923, + "learning_rate": 9.887078947368421e-05, + "loss": 0.3999, + "step": 24302 + }, + { + "epoch": 1.3609026766715198, + "grad_norm": 1.3850817680358887, + "learning_rate": 9.887052631578947e-05, + "loss": 0.5725, + "step": 24303 + }, + { + "epoch": 1.3609586739836488, + "grad_norm": 1.5199081897735596, + "learning_rate": 9.887026315789473e-05, + "loss": 0.4016, + "step": 24304 + }, + { + "epoch": 1.3610146712957778, + "grad_norm": 1.3331574201583862, + "learning_rate": 9.887000000000001e-05, + "loss": 0.4695, + "step": 24305 + }, + { + "epoch": 1.3610706686079068, + "grad_norm": 1.2357096672058105, + "learning_rate": 9.886973684210527e-05, + "loss": 0.4227, + "step": 24306 + }, + { + "epoch": 1.3611266659200358, + "grad_norm": 1.3251912593841553, + "learning_rate": 
9.886947368421054e-05, + "loss": 0.4537, + "step": 24307 + }, + { + "epoch": 1.3611826632321649, + "grad_norm": 1.1950063705444336, + "learning_rate": 9.886921052631579e-05, + "loss": 0.4806, + "step": 24308 + }, + { + "epoch": 1.3612386605442939, + "grad_norm": 1.2474392652511597, + "learning_rate": 9.886894736842106e-05, + "loss": 0.4358, + "step": 24309 + }, + { + "epoch": 1.361294657856423, + "grad_norm": 1.0946778059005737, + "learning_rate": 9.886868421052632e-05, + "loss": 0.3485, + "step": 24310 + }, + { + "epoch": 1.361350655168552, + "grad_norm": 1.3305790424346924, + "learning_rate": 9.886842105263159e-05, + "loss": 0.3922, + "step": 24311 + }, + { + "epoch": 1.361406652480681, + "grad_norm": 1.1710745096206665, + "learning_rate": 9.886815789473685e-05, + "loss": 0.3931, + "step": 24312 + }, + { + "epoch": 1.36146264979281, + "grad_norm": 1.5039604902267456, + "learning_rate": 9.886789473684211e-05, + "loss": 0.4597, + "step": 24313 + }, + { + "epoch": 1.361518647104939, + "grad_norm": 1.3262319564819336, + "learning_rate": 9.886763157894737e-05, + "loss": 0.5, + "step": 24314 + }, + { + "epoch": 1.361574644417068, + "grad_norm": 1.3019124269485474, + "learning_rate": 9.886736842105265e-05, + "loss": 0.5419, + "step": 24315 + }, + { + "epoch": 1.361630641729197, + "grad_norm": 1.5411773920059204, + "learning_rate": 9.88671052631579e-05, + "loss": 0.4171, + "step": 24316 + }, + { + "epoch": 1.361686639041326, + "grad_norm": 1.4485447406768799, + "learning_rate": 9.886684210526315e-05, + "loss": 0.4519, + "step": 24317 + }, + { + "epoch": 1.361742636353455, + "grad_norm": 1.3841382265090942, + "learning_rate": 9.886657894736842e-05, + "loss": 0.4242, + "step": 24318 + }, + { + "epoch": 1.361798633665584, + "grad_norm": 1.1673097610473633, + "learning_rate": 9.886631578947368e-05, + "loss": 0.4415, + "step": 24319 + }, + { + "epoch": 1.361854630977713, + "grad_norm": 1.6136304140090942, + "learning_rate": 9.886605263157896e-05, + "loss": 0.546, + "step": 
24320 + }, + { + "epoch": 1.361910628289842, + "grad_norm": 1.567229151725769, + "learning_rate": 9.886578947368422e-05, + "loss": 0.4276, + "step": 24321 + }, + { + "epoch": 1.3619666256019711, + "grad_norm": 1.3506273031234741, + "learning_rate": 9.886552631578948e-05, + "loss": 0.4994, + "step": 24322 + }, + { + "epoch": 1.3620226229141001, + "grad_norm": 1.3055624961853027, + "learning_rate": 9.886526315789474e-05, + "loss": 0.347, + "step": 24323 + }, + { + "epoch": 1.3620786202262292, + "grad_norm": 1.38615882396698, + "learning_rate": 9.886500000000001e-05, + "loss": 0.4291, + "step": 24324 + }, + { + "epoch": 1.3621346175383582, + "grad_norm": 1.3798707723617554, + "learning_rate": 9.886473684210527e-05, + "loss": 0.4276, + "step": 24325 + }, + { + "epoch": 1.3621906148504872, + "grad_norm": 1.1843464374542236, + "learning_rate": 9.886447368421053e-05, + "loss": 0.4777, + "step": 24326 + }, + { + "epoch": 1.3622466121626162, + "grad_norm": 1.2293568849563599, + "learning_rate": 9.886421052631579e-05, + "loss": 0.448, + "step": 24327 + }, + { + "epoch": 1.3623026094747452, + "grad_norm": 1.2945222854614258, + "learning_rate": 9.886394736842106e-05, + "loss": 0.4568, + "step": 24328 + }, + { + "epoch": 1.3623586067868743, + "grad_norm": 1.490443229675293, + "learning_rate": 9.886368421052632e-05, + "loss": 0.6576, + "step": 24329 + }, + { + "epoch": 1.3624146040990033, + "grad_norm": 1.3081691265106201, + "learning_rate": 9.88634210526316e-05, + "loss": 0.516, + "step": 24330 + }, + { + "epoch": 1.3624706014111323, + "grad_norm": 2.057906150817871, + "learning_rate": 9.886315789473684e-05, + "loss": 0.4726, + "step": 24331 + }, + { + "epoch": 1.3625265987232613, + "grad_norm": 1.1586756706237793, + "learning_rate": 9.886289473684211e-05, + "loss": 0.364, + "step": 24332 + }, + { + "epoch": 1.3625825960353903, + "grad_norm": 1.5228568315505981, + "learning_rate": 9.886263157894737e-05, + "loss": 0.5054, + "step": 24333 + }, + { + "epoch": 1.3626385933475194, + 
"grad_norm": 1.2080132961273193, + "learning_rate": 9.886236842105263e-05, + "loss": 0.5405, + "step": 24334 + }, + { + "epoch": 1.3626945906596484, + "grad_norm": 1.8775131702423096, + "learning_rate": 9.88621052631579e-05, + "loss": 0.5727, + "step": 24335 + }, + { + "epoch": 1.3627505879717774, + "grad_norm": 1.4475866556167603, + "learning_rate": 9.886184210526315e-05, + "loss": 0.4302, + "step": 24336 + }, + { + "epoch": 1.3628065852839064, + "grad_norm": 1.6076793670654297, + "learning_rate": 9.886157894736843e-05, + "loss": 0.5248, + "step": 24337 + }, + { + "epoch": 1.3628625825960354, + "grad_norm": 1.269997000694275, + "learning_rate": 9.886131578947369e-05, + "loss": 0.3485, + "step": 24338 + }, + { + "epoch": 1.3629185799081645, + "grad_norm": 1.2178103923797607, + "learning_rate": 9.886105263157896e-05, + "loss": 0.4373, + "step": 24339 + }, + { + "epoch": 1.3629745772202935, + "grad_norm": 1.101955771446228, + "learning_rate": 9.88607894736842e-05, + "loss": 0.3823, + "step": 24340 + }, + { + "epoch": 1.3630305745324225, + "grad_norm": 1.411412239074707, + "learning_rate": 9.886052631578948e-05, + "loss": 0.5211, + "step": 24341 + }, + { + "epoch": 1.3630865718445515, + "grad_norm": 1.3933584690093994, + "learning_rate": 9.886026315789474e-05, + "loss": 0.5002, + "step": 24342 + }, + { + "epoch": 1.3631425691566805, + "grad_norm": 1.3537825345993042, + "learning_rate": 9.886000000000001e-05, + "loss": 0.5249, + "step": 24343 + }, + { + "epoch": 1.3631985664688095, + "grad_norm": 1.3923225402832031, + "learning_rate": 9.885973684210527e-05, + "loss": 0.4003, + "step": 24344 + }, + { + "epoch": 1.3632545637809386, + "grad_norm": 1.1837642192840576, + "learning_rate": 9.885947368421053e-05, + "loss": 0.4657, + "step": 24345 + }, + { + "epoch": 1.3633105610930676, + "grad_norm": 1.1448791027069092, + "learning_rate": 9.885921052631579e-05, + "loss": 0.4039, + "step": 24346 + }, + { + "epoch": 1.3633665584051966, + "grad_norm": 1.3756505250930786, + 
"learning_rate": 9.885894736842106e-05, + "loss": 0.4797, + "step": 24347 + }, + { + "epoch": 1.3634225557173256, + "grad_norm": 1.3280857801437378, + "learning_rate": 9.885868421052632e-05, + "loss": 0.4247, + "step": 24348 + }, + { + "epoch": 1.3634785530294546, + "grad_norm": 1.4541010856628418, + "learning_rate": 9.885842105263158e-05, + "loss": 0.4844, + "step": 24349 + }, + { + "epoch": 1.3635345503415837, + "grad_norm": 1.3154367208480835, + "learning_rate": 9.885815789473684e-05, + "loss": 0.5767, + "step": 24350 + }, + { + "epoch": 1.3635905476537127, + "grad_norm": 1.5184102058410645, + "learning_rate": 9.88578947368421e-05, + "loss": 0.5135, + "step": 24351 + }, + { + "epoch": 1.3636465449658417, + "grad_norm": 1.2757196426391602, + "learning_rate": 9.885763157894738e-05, + "loss": 0.4336, + "step": 24352 + }, + { + "epoch": 1.3637025422779707, + "grad_norm": 1.3980156183242798, + "learning_rate": 9.885736842105264e-05, + "loss": 0.4931, + "step": 24353 + }, + { + "epoch": 1.3637585395900997, + "grad_norm": 1.666050672531128, + "learning_rate": 9.88571052631579e-05, + "loss": 0.3957, + "step": 24354 + }, + { + "epoch": 1.3638145369022288, + "grad_norm": 1.250495195388794, + "learning_rate": 9.885684210526316e-05, + "loss": 0.4734, + "step": 24355 + }, + { + "epoch": 1.3638705342143578, + "grad_norm": 1.1907325983047485, + "learning_rate": 9.885657894736843e-05, + "loss": 0.4164, + "step": 24356 + }, + { + "epoch": 1.3639265315264868, + "grad_norm": 1.5041922330856323, + "learning_rate": 9.885631578947369e-05, + "loss": 0.5274, + "step": 24357 + }, + { + "epoch": 1.3639825288386158, + "grad_norm": 1.2572085857391357, + "learning_rate": 9.885605263157895e-05, + "loss": 0.4697, + "step": 24358 + }, + { + "epoch": 1.3640385261507448, + "grad_norm": 1.5445501804351807, + "learning_rate": 9.885578947368421e-05, + "loss": 0.5151, + "step": 24359 + }, + { + "epoch": 1.3640945234628739, + "grad_norm": 1.4259023666381836, + "learning_rate": 9.885552631578948e-05, 
+ "loss": 0.5375, + "step": 24360 + }, + { + "epoch": 1.3641505207750029, + "grad_norm": 1.1657230854034424, + "learning_rate": 9.885526315789474e-05, + "loss": 0.4456, + "step": 24361 + }, + { + "epoch": 1.364206518087132, + "grad_norm": 1.5526230335235596, + "learning_rate": 9.885500000000001e-05, + "loss": 0.4723, + "step": 24362 + }, + { + "epoch": 1.364262515399261, + "grad_norm": 1.306484580039978, + "learning_rate": 9.885473684210526e-05, + "loss": 0.4282, + "step": 24363 + }, + { + "epoch": 1.36431851271139, + "grad_norm": 1.1856343746185303, + "learning_rate": 9.885447368421053e-05, + "loss": 0.4246, + "step": 24364 + }, + { + "epoch": 1.364374510023519, + "grad_norm": 1.329923391342163, + "learning_rate": 9.88542105263158e-05, + "loss": 0.502, + "step": 24365 + }, + { + "epoch": 1.364430507335648, + "grad_norm": 1.2879289388656616, + "learning_rate": 9.885394736842107e-05, + "loss": 0.4894, + "step": 24366 + }, + { + "epoch": 1.364486504647777, + "grad_norm": 1.3006231784820557, + "learning_rate": 9.885368421052633e-05, + "loss": 0.462, + "step": 24367 + }, + { + "epoch": 1.364542501959906, + "grad_norm": 1.650084376335144, + "learning_rate": 9.885342105263157e-05, + "loss": 0.387, + "step": 24368 + }, + { + "epoch": 1.3645984992720348, + "grad_norm": 1.4683609008789062, + "learning_rate": 9.885315789473685e-05, + "loss": 0.3361, + "step": 24369 + }, + { + "epoch": 1.3646544965841638, + "grad_norm": 1.5280910730361938, + "learning_rate": 9.88528947368421e-05, + "loss": 0.4098, + "step": 24370 + }, + { + "epoch": 1.3647104938962928, + "grad_norm": 1.3542401790618896, + "learning_rate": 9.885263157894738e-05, + "loss": 0.4193, + "step": 24371 + }, + { + "epoch": 1.3647664912084219, + "grad_norm": 1.2905116081237793, + "learning_rate": 9.885236842105263e-05, + "loss": 0.4167, + "step": 24372 + }, + { + "epoch": 1.3648224885205509, + "grad_norm": 1.6238502264022827, + "learning_rate": 9.88521052631579e-05, + "loss": 0.495, + "step": 24373 + }, + { + "epoch": 
1.36487848583268, + "grad_norm": 1.3055897951126099, + "learning_rate": 9.885184210526316e-05, + "loss": 0.4439, + "step": 24374 + }, + { + "epoch": 1.364934483144809, + "grad_norm": 1.3446067571640015, + "learning_rate": 9.885157894736843e-05, + "loss": 0.3404, + "step": 24375 + }, + { + "epoch": 1.364990480456938, + "grad_norm": 1.355931043624878, + "learning_rate": 9.885131578947369e-05, + "loss": 0.4637, + "step": 24376 + }, + { + "epoch": 1.365046477769067, + "grad_norm": 1.3010797500610352, + "learning_rate": 9.885105263157895e-05, + "loss": 0.457, + "step": 24377 + }, + { + "epoch": 1.365102475081196, + "grad_norm": 1.3977206945419312, + "learning_rate": 9.885078947368421e-05, + "loss": 0.5038, + "step": 24378 + }, + { + "epoch": 1.365158472393325, + "grad_norm": 1.1111963987350464, + "learning_rate": 9.885052631578948e-05, + "loss": 0.3715, + "step": 24379 + }, + { + "epoch": 1.365214469705454, + "grad_norm": 1.1828562021255493, + "learning_rate": 9.885026315789474e-05, + "loss": 0.4013, + "step": 24380 + }, + { + "epoch": 1.365270467017583, + "grad_norm": 1.5111533403396606, + "learning_rate": 9.885e-05, + "loss": 0.4277, + "step": 24381 + }, + { + "epoch": 1.365326464329712, + "grad_norm": 24.50279426574707, + "learning_rate": 9.884973684210526e-05, + "loss": 0.5584, + "step": 24382 + }, + { + "epoch": 1.365382461641841, + "grad_norm": 1.3131940364837646, + "learning_rate": 9.884947368421054e-05, + "loss": 0.3771, + "step": 24383 + }, + { + "epoch": 1.36543845895397, + "grad_norm": 1.5126020908355713, + "learning_rate": 9.88492105263158e-05, + "loss": 0.5313, + "step": 24384 + }, + { + "epoch": 1.365494456266099, + "grad_norm": 1.2867248058319092, + "learning_rate": 9.884894736842106e-05, + "loss": 0.4346, + "step": 24385 + }, + { + "epoch": 1.3655504535782281, + "grad_norm": 1.7722824811935425, + "learning_rate": 9.884868421052632e-05, + "loss": 0.4468, + "step": 24386 + }, + { + "epoch": 1.3656064508903571, + "grad_norm": 1.28387451171875, + 
"learning_rate": 9.884842105263158e-05, + "loss": 0.4015, + "step": 24387 + }, + { + "epoch": 1.3656624482024862, + "grad_norm": 1.6635466814041138, + "learning_rate": 9.884815789473685e-05, + "loss": 0.4431, + "step": 24388 + }, + { + "epoch": 1.3657184455146152, + "grad_norm": 1.243938684463501, + "learning_rate": 9.884789473684211e-05, + "loss": 0.47, + "step": 24389 + }, + { + "epoch": 1.3657744428267442, + "grad_norm": 1.4119457006454468, + "learning_rate": 9.884763157894737e-05, + "loss": 0.5061, + "step": 24390 + }, + { + "epoch": 1.3658304401388732, + "grad_norm": 1.2198487520217896, + "learning_rate": 9.884736842105263e-05, + "loss": 0.3484, + "step": 24391 + }, + { + "epoch": 1.3658864374510022, + "grad_norm": 1.2817981243133545, + "learning_rate": 9.88471052631579e-05, + "loss": 0.5075, + "step": 24392 + }, + { + "epoch": 1.3659424347631313, + "grad_norm": 1.5079666376113892, + "learning_rate": 9.884684210526316e-05, + "loss": 0.5416, + "step": 24393 + }, + { + "epoch": 1.3659984320752603, + "grad_norm": 1.1374844312667847, + "learning_rate": 9.884657894736843e-05, + "loss": 0.3744, + "step": 24394 + }, + { + "epoch": 1.3660544293873893, + "grad_norm": 1.438244342803955, + "learning_rate": 9.884631578947368e-05, + "loss": 0.4617, + "step": 24395 + }, + { + "epoch": 1.3661104266995183, + "grad_norm": 1.3679901361465454, + "learning_rate": 9.884605263157895e-05, + "loss": 0.4839, + "step": 24396 + }, + { + "epoch": 1.3661664240116473, + "grad_norm": 1.2486059665679932, + "learning_rate": 9.884578947368421e-05, + "loss": 0.4152, + "step": 24397 + }, + { + "epoch": 1.3662224213237764, + "grad_norm": 1.423294186592102, + "learning_rate": 9.884552631578949e-05, + "loss": 0.4701, + "step": 24398 + }, + { + "epoch": 1.3662784186359054, + "grad_norm": 1.3104181289672852, + "learning_rate": 9.884526315789475e-05, + "loss": 0.5171, + "step": 24399 + }, + { + "epoch": 1.3663344159480344, + "grad_norm": 1.3045990467071533, + "learning_rate": 9.8845e-05, + "loss": 
0.4298, + "step": 24400 + }, + { + "epoch": 1.3663904132601634, + "grad_norm": 1.4200360774993896, + "learning_rate": 9.884473684210527e-05, + "loss": 0.5425, + "step": 24401 + }, + { + "epoch": 1.3664464105722924, + "grad_norm": 1.6482840776443481, + "learning_rate": 9.884447368421053e-05, + "loss": 0.644, + "step": 24402 + }, + { + "epoch": 1.3665024078844215, + "grad_norm": 1.2006323337554932, + "learning_rate": 9.88442105263158e-05, + "loss": 0.4565, + "step": 24403 + }, + { + "epoch": 1.3665584051965505, + "grad_norm": 1.4098336696624756, + "learning_rate": 9.884394736842106e-05, + "loss": 0.4197, + "step": 24404 + }, + { + "epoch": 1.3666144025086795, + "grad_norm": 1.4508843421936035, + "learning_rate": 9.884368421052632e-05, + "loss": 0.5249, + "step": 24405 + }, + { + "epoch": 1.3666703998208085, + "grad_norm": 1.574019432067871, + "learning_rate": 9.884342105263158e-05, + "loss": 0.6243, + "step": 24406 + }, + { + "epoch": 1.3667263971329375, + "grad_norm": 1.6158201694488525, + "learning_rate": 9.884315789473685e-05, + "loss": 0.5481, + "step": 24407 + }, + { + "epoch": 1.3667823944450666, + "grad_norm": 1.360234260559082, + "learning_rate": 9.884289473684211e-05, + "loss": 0.5952, + "step": 24408 + }, + { + "epoch": 1.3668383917571956, + "grad_norm": 1.18491530418396, + "learning_rate": 9.884263157894737e-05, + "loss": 0.2932, + "step": 24409 + }, + { + "epoch": 1.3668943890693246, + "grad_norm": 1.4678871631622314, + "learning_rate": 9.884236842105263e-05, + "loss": 0.4682, + "step": 24410 + }, + { + "epoch": 1.3669503863814536, + "grad_norm": 1.540473222732544, + "learning_rate": 9.88421052631579e-05, + "loss": 0.4994, + "step": 24411 + }, + { + "epoch": 1.3670063836935826, + "grad_norm": 1.4704207181930542, + "learning_rate": 9.884184210526316e-05, + "loss": 0.4684, + "step": 24412 + }, + { + "epoch": 1.3670623810057116, + "grad_norm": 1.3927764892578125, + "learning_rate": 9.884157894736842e-05, + "loss": 0.4538, + "step": 24413 + }, + { + "epoch": 
1.3671183783178407, + "grad_norm": 1.0648525953292847, + "learning_rate": 9.884131578947368e-05, + "loss": 0.3678, + "step": 24414 + }, + { + "epoch": 1.3671743756299697, + "grad_norm": 1.3720592260360718, + "learning_rate": 9.884105263157896e-05, + "loss": 0.4999, + "step": 24415 + }, + { + "epoch": 1.3672303729420987, + "grad_norm": 1.8324408531188965, + "learning_rate": 9.884078947368422e-05, + "loss": 0.5443, + "step": 24416 + }, + { + "epoch": 1.3672863702542277, + "grad_norm": 1.3010083436965942, + "learning_rate": 9.884052631578949e-05, + "loss": 0.3828, + "step": 24417 + }, + { + "epoch": 1.3673423675663567, + "grad_norm": 1.4595146179199219, + "learning_rate": 9.884026315789474e-05, + "loss": 0.6761, + "step": 24418 + }, + { + "epoch": 1.3673983648784858, + "grad_norm": 1.4448872804641724, + "learning_rate": 9.884e-05, + "loss": 0.4775, + "step": 24419 + }, + { + "epoch": 1.3674543621906148, + "grad_norm": 2.1411805152893066, + "learning_rate": 9.883973684210527e-05, + "loss": 0.7041, + "step": 24420 + }, + { + "epoch": 1.3675103595027438, + "grad_norm": 1.5478935241699219, + "learning_rate": 9.883947368421053e-05, + "loss": 0.503, + "step": 24421 + }, + { + "epoch": 1.3675663568148728, + "grad_norm": 1.2933239936828613, + "learning_rate": 9.88392105263158e-05, + "loss": 0.3763, + "step": 24422 + }, + { + "epoch": 1.3676223541270018, + "grad_norm": 1.6128108501434326, + "learning_rate": 9.883894736842105e-05, + "loss": 0.5932, + "step": 24423 + }, + { + "epoch": 1.3676783514391309, + "grad_norm": 1.8574882745742798, + "learning_rate": 9.883868421052632e-05, + "loss": 0.4911, + "step": 24424 + }, + { + "epoch": 1.3677343487512599, + "grad_norm": 1.3461345434188843, + "learning_rate": 9.883842105263158e-05, + "loss": 0.4497, + "step": 24425 + }, + { + "epoch": 1.367790346063389, + "grad_norm": 1.1728650331497192, + "learning_rate": 9.883815789473685e-05, + "loss": 0.3946, + "step": 24426 + }, + { + "epoch": 1.367846343375518, + "grad_norm": 
1.2892390489578247, + "learning_rate": 9.88378947368421e-05, + "loss": 0.4278, + "step": 24427 + }, + { + "epoch": 1.367902340687647, + "grad_norm": 1.397377848625183, + "learning_rate": 9.883763157894737e-05, + "loss": 0.3746, + "step": 24428 + }, + { + "epoch": 1.367958337999776, + "grad_norm": 1.3317534923553467, + "learning_rate": 9.883736842105263e-05, + "loss": 0.449, + "step": 24429 + }, + { + "epoch": 1.368014335311905, + "grad_norm": 1.370968222618103, + "learning_rate": 9.883710526315791e-05, + "loss": 0.5822, + "step": 24430 + }, + { + "epoch": 1.368070332624034, + "grad_norm": 1.4074839353561401, + "learning_rate": 9.883684210526317e-05, + "loss": 0.4826, + "step": 24431 + }, + { + "epoch": 1.368126329936163, + "grad_norm": 1.3753305673599243, + "learning_rate": 9.883657894736843e-05, + "loss": 0.3796, + "step": 24432 + }, + { + "epoch": 1.368182327248292, + "grad_norm": 1.620547890663147, + "learning_rate": 9.883631578947369e-05, + "loss": 0.5574, + "step": 24433 + }, + { + "epoch": 1.368238324560421, + "grad_norm": 1.3297902345657349, + "learning_rate": 9.883605263157896e-05, + "loss": 0.4992, + "step": 24434 + }, + { + "epoch": 1.36829432187255, + "grad_norm": 1.27519953250885, + "learning_rate": 9.883578947368422e-05, + "loss": 0.4256, + "step": 24435 + }, + { + "epoch": 1.368350319184679, + "grad_norm": 1.8830041885375977, + "learning_rate": 9.883552631578948e-05, + "loss": 0.5545, + "step": 24436 + }, + { + "epoch": 1.368406316496808, + "grad_norm": 1.4125512838363647, + "learning_rate": 9.883526315789474e-05, + "loss": 0.4259, + "step": 24437 + }, + { + "epoch": 1.3684623138089371, + "grad_norm": 1.3391048908233643, + "learning_rate": 9.8835e-05, + "loss": 0.4256, + "step": 24438 + }, + { + "epoch": 1.3685183111210661, + "grad_norm": 1.4954277276992798, + "learning_rate": 9.883473684210527e-05, + "loss": 0.5805, + "step": 24439 + }, + { + "epoch": 1.3685743084331952, + "grad_norm": 1.4033195972442627, + "learning_rate": 9.883447368421053e-05, + 
"loss": 0.4628, + "step": 24440 + }, + { + "epoch": 1.3686303057453242, + "grad_norm": 1.6419473886489868, + "learning_rate": 9.883421052631579e-05, + "loss": 0.4202, + "step": 24441 + }, + { + "epoch": 1.3686863030574532, + "grad_norm": 1.9852774143218994, + "learning_rate": 9.883394736842105e-05, + "loss": 0.5977, + "step": 24442 + }, + { + "epoch": 1.3687423003695822, + "grad_norm": 1.5373849868774414, + "learning_rate": 9.883368421052632e-05, + "loss": 0.7519, + "step": 24443 + }, + { + "epoch": 1.3687982976817112, + "grad_norm": 1.4142190217971802, + "learning_rate": 9.883342105263158e-05, + "loss": 0.5456, + "step": 24444 + }, + { + "epoch": 1.3688542949938403, + "grad_norm": 1.35452139377594, + "learning_rate": 9.883315789473684e-05, + "loss": 0.5445, + "step": 24445 + }, + { + "epoch": 1.3689102923059693, + "grad_norm": 1.2676734924316406, + "learning_rate": 9.88328947368421e-05, + "loss": 0.3916, + "step": 24446 + }, + { + "epoch": 1.3689662896180983, + "grad_norm": 1.472924828529358, + "learning_rate": 9.883263157894738e-05, + "loss": 0.5168, + "step": 24447 + }, + { + "epoch": 1.3690222869302273, + "grad_norm": 1.5486114025115967, + "learning_rate": 9.883236842105264e-05, + "loss": 0.5228, + "step": 24448 + }, + { + "epoch": 1.3690782842423563, + "grad_norm": 1.091369390487671, + "learning_rate": 9.883210526315791e-05, + "loss": 0.3869, + "step": 24449 + }, + { + "epoch": 1.3691342815544854, + "grad_norm": 1.6277631521224976, + "learning_rate": 9.883184210526316e-05, + "loss": 0.4969, + "step": 24450 + }, + { + "epoch": 1.3691902788666144, + "grad_norm": 1.6347336769104004, + "learning_rate": 9.883157894736843e-05, + "loss": 0.5751, + "step": 24451 + }, + { + "epoch": 1.3692462761787434, + "grad_norm": 1.3268325328826904, + "learning_rate": 9.883131578947369e-05, + "loss": 0.3474, + "step": 24452 + }, + { + "epoch": 1.3693022734908724, + "grad_norm": 1.3420542478561401, + "learning_rate": 9.883105263157896e-05, + "loss": 0.4211, + "step": 24453 + }, + { 
+ "epoch": 1.3693582708030014, + "grad_norm": 1.2873342037200928, + "learning_rate": 9.883078947368422e-05, + "loss": 0.5331, + "step": 24454 + }, + { + "epoch": 1.3694142681151305, + "grad_norm": 1.4083830118179321, + "learning_rate": 9.883052631578947e-05, + "loss": 0.4748, + "step": 24455 + }, + { + "epoch": 1.3694702654272595, + "grad_norm": 2.098632335662842, + "learning_rate": 9.883026315789474e-05, + "loss": 0.5454, + "step": 24456 + }, + { + "epoch": 1.3695262627393885, + "grad_norm": 1.594950795173645, + "learning_rate": 9.883e-05, + "loss": 0.5077, + "step": 24457 + }, + { + "epoch": 1.3695822600515175, + "grad_norm": 1.342111349105835, + "learning_rate": 9.882973684210527e-05, + "loss": 0.5786, + "step": 24458 + }, + { + "epoch": 1.3696382573636465, + "grad_norm": 1.401429295539856, + "learning_rate": 9.882947368421053e-05, + "loss": 0.4681, + "step": 24459 + }, + { + "epoch": 1.3696942546757755, + "grad_norm": 2.9357070922851562, + "learning_rate": 9.88292105263158e-05, + "loss": 0.6699, + "step": 24460 + }, + { + "epoch": 1.3697502519879046, + "grad_norm": 1.313418984413147, + "learning_rate": 9.882894736842105e-05, + "loss": 0.4755, + "step": 24461 + }, + { + "epoch": 1.3698062493000336, + "grad_norm": 1.44545316696167, + "learning_rate": 9.882868421052633e-05, + "loss": 0.4548, + "step": 24462 + }, + { + "epoch": 1.3698622466121626, + "grad_norm": 1.3986786603927612, + "learning_rate": 9.882842105263159e-05, + "loss": 0.4006, + "step": 24463 + }, + { + "epoch": 1.3699182439242916, + "grad_norm": 1.4873632192611694, + "learning_rate": 9.882815789473685e-05, + "loss": 0.5852, + "step": 24464 + }, + { + "epoch": 1.3699742412364206, + "grad_norm": 1.25263512134552, + "learning_rate": 9.88278947368421e-05, + "loss": 0.3935, + "step": 24465 + }, + { + "epoch": 1.3700302385485497, + "grad_norm": 1.2605167627334595, + "learning_rate": 9.882763157894738e-05, + "loss": 0.5369, + "step": 24466 + }, + { + "epoch": 1.3700862358606787, + "grad_norm": 
1.7462189197540283, + "learning_rate": 9.882736842105264e-05, + "loss": 0.5562, + "step": 24467 + }, + { + "epoch": 1.3701422331728077, + "grad_norm": 1.29815673828125, + "learning_rate": 9.88271052631579e-05, + "loss": 0.4363, + "step": 24468 + }, + { + "epoch": 1.3701982304849367, + "grad_norm": 1.4782339334487915, + "learning_rate": 9.882684210526316e-05, + "loss": 0.4121, + "step": 24469 + }, + { + "epoch": 1.3702542277970657, + "grad_norm": 1.279172420501709, + "learning_rate": 9.882657894736843e-05, + "loss": 0.4359, + "step": 24470 + }, + { + "epoch": 1.3703102251091948, + "grad_norm": 1.3147929906845093, + "learning_rate": 9.882631578947369e-05, + "loss": 0.4926, + "step": 24471 + }, + { + "epoch": 1.3703662224213238, + "grad_norm": 1.593000888824463, + "learning_rate": 9.882605263157895e-05, + "loss": 0.43, + "step": 24472 + }, + { + "epoch": 1.3704222197334528, + "grad_norm": 1.2197630405426025, + "learning_rate": 9.882578947368421e-05, + "loss": 0.424, + "step": 24473 + }, + { + "epoch": 1.3704782170455818, + "grad_norm": 1.2020893096923828, + "learning_rate": 9.882552631578947e-05, + "loss": 0.4941, + "step": 24474 + }, + { + "epoch": 1.3705342143577108, + "grad_norm": 1.323865532875061, + "learning_rate": 9.882526315789474e-05, + "loss": 0.6674, + "step": 24475 + }, + { + "epoch": 1.3705902116698399, + "grad_norm": 1.4042918682098389, + "learning_rate": 9.8825e-05, + "loss": 0.3982, + "step": 24476 + }, + { + "epoch": 1.3706462089819689, + "grad_norm": 1.2864902019500732, + "learning_rate": 9.882473684210528e-05, + "loss": 0.4238, + "step": 24477 + }, + { + "epoch": 1.370702206294098, + "grad_norm": 1.6968425512313843, + "learning_rate": 9.882447368421052e-05, + "loss": 0.4565, + "step": 24478 + }, + { + "epoch": 1.370758203606227, + "grad_norm": 1.2619678974151611, + "learning_rate": 9.88242105263158e-05, + "loss": 0.4263, + "step": 24479 + }, + { + "epoch": 1.370814200918356, + "grad_norm": 1.5142074823379517, + "learning_rate": 
9.882394736842106e-05, + "loss": 0.4625, + "step": 24480 + }, + { + "epoch": 1.370870198230485, + "grad_norm": 1.8756927251815796, + "learning_rate": 9.882368421052633e-05, + "loss": 0.4626, + "step": 24481 + }, + { + "epoch": 1.370926195542614, + "grad_norm": 1.309545874595642, + "learning_rate": 9.882342105263158e-05, + "loss": 0.4355, + "step": 24482 + }, + { + "epoch": 1.370982192854743, + "grad_norm": 1.2559274435043335, + "learning_rate": 9.882315789473685e-05, + "loss": 0.4347, + "step": 24483 + }, + { + "epoch": 1.371038190166872, + "grad_norm": 1.3520760536193848, + "learning_rate": 9.882289473684211e-05, + "loss": 0.4774, + "step": 24484 + }, + { + "epoch": 1.371094187479001, + "grad_norm": 1.803807258605957, + "learning_rate": 9.882263157894738e-05, + "loss": 0.5191, + "step": 24485 + }, + { + "epoch": 1.37115018479113, + "grad_norm": 1.264050006866455, + "learning_rate": 9.882236842105264e-05, + "loss": 0.42, + "step": 24486 + }, + { + "epoch": 1.371206182103259, + "grad_norm": 1.4715521335601807, + "learning_rate": 9.88221052631579e-05, + "loss": 0.453, + "step": 24487 + }, + { + "epoch": 1.371262179415388, + "grad_norm": 1.6300288438796997, + "learning_rate": 9.882184210526316e-05, + "loss": 0.4245, + "step": 24488 + }, + { + "epoch": 1.371318176727517, + "grad_norm": 1.3610649108886719, + "learning_rate": 9.882157894736842e-05, + "loss": 0.5405, + "step": 24489 + }, + { + "epoch": 1.3713741740396461, + "grad_norm": 1.3018198013305664, + "learning_rate": 9.88213157894737e-05, + "loss": 0.392, + "step": 24490 + }, + { + "epoch": 1.3714301713517751, + "grad_norm": 1.224164366722107, + "learning_rate": 9.882105263157895e-05, + "loss": 0.478, + "step": 24491 + }, + { + "epoch": 1.3714861686639042, + "grad_norm": 1.346011757850647, + "learning_rate": 9.882078947368421e-05, + "loss": 0.5077, + "step": 24492 + }, + { + "epoch": 1.3715421659760332, + "grad_norm": 1.1635698080062866, + "learning_rate": 9.882052631578947e-05, + "loss": 0.4368, + "step": 24493 + 
}, + { + "epoch": 1.3715981632881622, + "grad_norm": 0.9612644910812378, + "learning_rate": 9.882026315789475e-05, + "loss": 0.3163, + "step": 24494 + }, + { + "epoch": 1.3716541606002912, + "grad_norm": 1.6849596500396729, + "learning_rate": 9.882e-05, + "loss": 0.4784, + "step": 24495 + }, + { + "epoch": 1.3717101579124202, + "grad_norm": 1.464515209197998, + "learning_rate": 9.881973684210527e-05, + "loss": 0.5673, + "step": 24496 + }, + { + "epoch": 1.3717661552245493, + "grad_norm": 1.300506830215454, + "learning_rate": 9.881947368421053e-05, + "loss": 0.4394, + "step": 24497 + }, + { + "epoch": 1.3718221525366783, + "grad_norm": 1.3180797100067139, + "learning_rate": 9.88192105263158e-05, + "loss": 0.5123, + "step": 24498 + }, + { + "epoch": 1.3718781498488073, + "grad_norm": 1.2333238124847412, + "learning_rate": 9.881894736842106e-05, + "loss": 0.4677, + "step": 24499 + }, + { + "epoch": 1.3719341471609363, + "grad_norm": 1.2898578643798828, + "learning_rate": 9.881868421052632e-05, + "loss": 0.4185, + "step": 24500 + }, + { + "epoch": 1.3719901444730653, + "grad_norm": 1.4654231071472168, + "learning_rate": 9.881842105263158e-05, + "loss": 0.5147, + "step": 24501 + }, + { + "epoch": 1.3720461417851944, + "grad_norm": 1.352738618850708, + "learning_rate": 9.881815789473685e-05, + "loss": 0.5083, + "step": 24502 + }, + { + "epoch": 1.3721021390973234, + "grad_norm": 1.1011550426483154, + "learning_rate": 9.881789473684211e-05, + "loss": 0.385, + "step": 24503 + }, + { + "epoch": 1.3721581364094524, + "grad_norm": 3.091196060180664, + "learning_rate": 9.881763157894738e-05, + "loss": 0.4052, + "step": 24504 + }, + { + "epoch": 1.3722141337215814, + "grad_norm": 1.392540454864502, + "learning_rate": 9.881736842105263e-05, + "loss": 0.4687, + "step": 24505 + }, + { + "epoch": 1.3722701310337104, + "grad_norm": 1.606001853942871, + "learning_rate": 9.881710526315789e-05, + "loss": 0.4993, + "step": 24506 + }, + { + "epoch": 1.3723261283458394, + "grad_norm": 
1.2238637208938599, + "learning_rate": 9.881684210526316e-05, + "loss": 0.3676, + "step": 24507 + }, + { + "epoch": 1.3723821256579685, + "grad_norm": 1.2948689460754395, + "learning_rate": 9.881657894736842e-05, + "loss": 0.5334, + "step": 24508 + }, + { + "epoch": 1.3724381229700975, + "grad_norm": 1.380072832107544, + "learning_rate": 9.88163157894737e-05, + "loss": 0.4704, + "step": 24509 + }, + { + "epoch": 1.3724941202822265, + "grad_norm": 1.105183482170105, + "learning_rate": 9.881605263157894e-05, + "loss": 0.4189, + "step": 24510 + }, + { + "epoch": 1.3725501175943555, + "grad_norm": 1.1319661140441895, + "learning_rate": 9.881578947368422e-05, + "loss": 0.4072, + "step": 24511 + }, + { + "epoch": 1.3726061149064845, + "grad_norm": 2.206179141998291, + "learning_rate": 9.881552631578948e-05, + "loss": 0.6171, + "step": 24512 + }, + { + "epoch": 1.3726621122186136, + "grad_norm": 1.2761162519454956, + "learning_rate": 9.881526315789475e-05, + "loss": 0.3865, + "step": 24513 + }, + { + "epoch": 1.3727181095307426, + "grad_norm": 1.4989609718322754, + "learning_rate": 9.881500000000001e-05, + "loss": 0.5733, + "step": 24514 + }, + { + "epoch": 1.3727741068428716, + "grad_norm": 1.320183277130127, + "learning_rate": 9.881473684210527e-05, + "loss": 0.475, + "step": 24515 + }, + { + "epoch": 1.3728301041550006, + "grad_norm": 1.1859266757965088, + "learning_rate": 9.881447368421053e-05, + "loss": 0.4993, + "step": 24516 + }, + { + "epoch": 1.3728861014671296, + "grad_norm": 1.1573539972305298, + "learning_rate": 9.88142105263158e-05, + "loss": 0.3842, + "step": 24517 + }, + { + "epoch": 1.3729420987792587, + "grad_norm": 1.4029028415679932, + "learning_rate": 9.881394736842106e-05, + "loss": 0.5271, + "step": 24518 + }, + { + "epoch": 1.3729980960913877, + "grad_norm": 1.2326328754425049, + "learning_rate": 9.881368421052632e-05, + "loss": 0.4335, + "step": 24519 + }, + { + "epoch": 1.3730540934035167, + "grad_norm": 3.316587209701538, + "learning_rate": 
9.881342105263158e-05, + "loss": 0.5095, + "step": 24520 + }, + { + "epoch": 1.3731100907156457, + "grad_norm": 1.5112648010253906, + "learning_rate": 9.881315789473685e-05, + "loss": 0.4672, + "step": 24521 + }, + { + "epoch": 1.3731660880277747, + "grad_norm": 1.3780750036239624, + "learning_rate": 9.881289473684211e-05, + "loss": 0.446, + "step": 24522 + }, + { + "epoch": 1.3732220853399038, + "grad_norm": 1.4295034408569336, + "learning_rate": 9.881263157894737e-05, + "loss": 0.3854, + "step": 24523 + }, + { + "epoch": 1.3732780826520328, + "grad_norm": 1.6485295295715332, + "learning_rate": 9.881236842105263e-05, + "loss": 0.5406, + "step": 24524 + }, + { + "epoch": 1.3733340799641618, + "grad_norm": 1.5437777042388916, + "learning_rate": 9.881210526315789e-05, + "loss": 0.6657, + "step": 24525 + }, + { + "epoch": 1.3733900772762908, + "grad_norm": 1.3259806632995605, + "learning_rate": 9.881184210526317e-05, + "loss": 0.4393, + "step": 24526 + }, + { + "epoch": 1.3734460745884198, + "grad_norm": 1.273875117301941, + "learning_rate": 9.881157894736843e-05, + "loss": 0.5589, + "step": 24527 + }, + { + "epoch": 1.3735020719005488, + "grad_norm": 5.459080696105957, + "learning_rate": 9.881131578947369e-05, + "loss": 0.3946, + "step": 24528 + }, + { + "epoch": 1.3735580692126779, + "grad_norm": 1.2246806621551514, + "learning_rate": 9.881105263157895e-05, + "loss": 0.4394, + "step": 24529 + }, + { + "epoch": 1.3736140665248069, + "grad_norm": 1.2419131994247437, + "learning_rate": 9.881078947368422e-05, + "loss": 0.5603, + "step": 24530 + }, + { + "epoch": 1.373670063836936, + "grad_norm": 1.3171749114990234, + "learning_rate": 9.881052631578948e-05, + "loss": 0.4098, + "step": 24531 + }, + { + "epoch": 1.373726061149065, + "grad_norm": 1.3591086864471436, + "learning_rate": 9.881026315789475e-05, + "loss": 0.3953, + "step": 24532 + }, + { + "epoch": 1.373782058461194, + "grad_norm": 1.4800583124160767, + "learning_rate": 9.881e-05, + "loss": 0.3976, + "step": 
24533 + }, + { + "epoch": 1.373838055773323, + "grad_norm": 1.4704041481018066, + "learning_rate": 9.880973684210527e-05, + "loss": 0.4332, + "step": 24534 + }, + { + "epoch": 1.373894053085452, + "grad_norm": 1.4346450567245483, + "learning_rate": 9.880947368421053e-05, + "loss": 0.6352, + "step": 24535 + }, + { + "epoch": 1.373950050397581, + "grad_norm": 1.7150028944015503, + "learning_rate": 9.88092105263158e-05, + "loss": 0.5394, + "step": 24536 + }, + { + "epoch": 1.37400604770971, + "grad_norm": 1.4494582414627075, + "learning_rate": 9.880894736842105e-05, + "loss": 0.4575, + "step": 24537 + }, + { + "epoch": 1.374062045021839, + "grad_norm": 1.6348971128463745, + "learning_rate": 9.880868421052632e-05, + "loss": 0.5308, + "step": 24538 + }, + { + "epoch": 1.374118042333968, + "grad_norm": 1.4942302703857422, + "learning_rate": 9.880842105263158e-05, + "loss": 0.4096, + "step": 24539 + }, + { + "epoch": 1.374174039646097, + "grad_norm": 1.2634156942367554, + "learning_rate": 9.880815789473684e-05, + "loss": 0.465, + "step": 24540 + }, + { + "epoch": 1.374230036958226, + "grad_norm": 54.6109504699707, + "learning_rate": 9.880789473684212e-05, + "loss": 0.4818, + "step": 24541 + }, + { + "epoch": 1.3742860342703551, + "grad_norm": 1.678976058959961, + "learning_rate": 9.880763157894736e-05, + "loss": 0.4527, + "step": 24542 + }, + { + "epoch": 1.3743420315824841, + "grad_norm": 1.4438743591308594, + "learning_rate": 9.880736842105264e-05, + "loss": 0.4749, + "step": 24543 + }, + { + "epoch": 1.3743980288946132, + "grad_norm": 1.649899959564209, + "learning_rate": 9.88071052631579e-05, + "loss": 0.3697, + "step": 24544 + }, + { + "epoch": 1.3744540262067422, + "grad_norm": 1.311410903930664, + "learning_rate": 9.880684210526317e-05, + "loss": 0.3974, + "step": 24545 + }, + { + "epoch": 1.3745100235188712, + "grad_norm": 1.3957568407058716, + "learning_rate": 9.880657894736843e-05, + "loss": 0.5027, + "step": 24546 + }, + { + "epoch": 1.3745660208310002, + 
"grad_norm": 3.369692802429199, + "learning_rate": 9.880631578947369e-05, + "loss": 0.5027, + "step": 24547 + }, + { + "epoch": 1.3746220181431292, + "grad_norm": 1.3832881450653076, + "learning_rate": 9.880605263157895e-05, + "loss": 0.4475, + "step": 24548 + }, + { + "epoch": 1.3746780154552583, + "grad_norm": 1.0688807964324951, + "learning_rate": 9.880578947368422e-05, + "loss": 0.3709, + "step": 24549 + }, + { + "epoch": 1.3747340127673873, + "grad_norm": 1.4251041412353516, + "learning_rate": 9.880552631578948e-05, + "loss": 0.5787, + "step": 24550 + }, + { + "epoch": 1.3747900100795163, + "grad_norm": 3.9179627895355225, + "learning_rate": 9.880526315789474e-05, + "loss": 0.5285, + "step": 24551 + }, + { + "epoch": 1.3748460073916453, + "grad_norm": 1.3410621881484985, + "learning_rate": 9.8805e-05, + "loss": 0.3814, + "step": 24552 + }, + { + "epoch": 1.3749020047037743, + "grad_norm": 1.1759706735610962, + "learning_rate": 9.880473684210527e-05, + "loss": 0.4728, + "step": 24553 + }, + { + "epoch": 1.3749580020159033, + "grad_norm": 1.222511649131775, + "learning_rate": 9.880447368421053e-05, + "loss": 0.3362, + "step": 24554 + }, + { + "epoch": 1.3750139993280324, + "grad_norm": 1.8133656978607178, + "learning_rate": 9.880421052631579e-05, + "loss": 0.6572, + "step": 24555 + }, + { + "epoch": 1.3750699966401614, + "grad_norm": 1.2357032299041748, + "learning_rate": 9.880394736842105e-05, + "loss": 0.4965, + "step": 24556 + }, + { + "epoch": 1.3751259939522904, + "grad_norm": 1.3727672100067139, + "learning_rate": 9.880368421052633e-05, + "loss": 0.3667, + "step": 24557 + }, + { + "epoch": 1.3751819912644194, + "grad_norm": 1.2672778367996216, + "learning_rate": 9.880342105263159e-05, + "loss": 0.3819, + "step": 24558 + }, + { + "epoch": 1.3752379885765484, + "grad_norm": 1.537623643875122, + "learning_rate": 9.880315789473685e-05, + "loss": 0.4472, + "step": 24559 + }, + { + "epoch": 1.3752939858886775, + "grad_norm": 1.3242238759994507, + 
"learning_rate": 9.88028947368421e-05, + "loss": 0.3838, + "step": 24560 + }, + { + "epoch": 1.3753499832008065, + "grad_norm": 1.2333530187606812, + "learning_rate": 9.880263157894736e-05, + "loss": 0.4078, + "step": 24561 + }, + { + "epoch": 1.3754059805129355, + "grad_norm": 1.4914171695709229, + "learning_rate": 9.880236842105264e-05, + "loss": 0.4885, + "step": 24562 + }, + { + "epoch": 1.3754619778250645, + "grad_norm": 1.398218035697937, + "learning_rate": 9.88021052631579e-05, + "loss": 0.404, + "step": 24563 + }, + { + "epoch": 1.3755179751371935, + "grad_norm": 1.4480798244476318, + "learning_rate": 9.880184210526317e-05, + "loss": 0.4943, + "step": 24564 + }, + { + "epoch": 1.3755739724493226, + "grad_norm": 1.5293318033218384, + "learning_rate": 9.880157894736842e-05, + "loss": 0.5123, + "step": 24565 + }, + { + "epoch": 1.3756299697614516, + "grad_norm": 2.8959763050079346, + "learning_rate": 9.880131578947369e-05, + "loss": 0.5938, + "step": 24566 + }, + { + "epoch": 1.3756859670735806, + "grad_norm": 1.165149211883545, + "learning_rate": 9.880105263157895e-05, + "loss": 0.43, + "step": 24567 + }, + { + "epoch": 1.3757419643857096, + "grad_norm": 1.2796895503997803, + "learning_rate": 9.880078947368422e-05, + "loss": 0.3938, + "step": 24568 + }, + { + "epoch": 1.3757979616978386, + "grad_norm": 1.3704499006271362, + "learning_rate": 9.880052631578948e-05, + "loss": 0.4379, + "step": 24569 + }, + { + "epoch": 1.3758539590099677, + "grad_norm": 1.426423192024231, + "learning_rate": 9.880026315789474e-05, + "loss": 0.4486, + "step": 24570 + }, + { + "epoch": 1.3759099563220967, + "grad_norm": 1.323846459388733, + "learning_rate": 9.88e-05, + "loss": 0.3953, + "step": 24571 + }, + { + "epoch": 1.3759659536342257, + "grad_norm": 1.355147361755371, + "learning_rate": 9.879973684210528e-05, + "loss": 0.4667, + "step": 24572 + }, + { + "epoch": 1.3760219509463547, + "grad_norm": 1.044049620628357, + "learning_rate": 9.879947368421054e-05, + "loss": 0.374, + 
"step": 24573 + }, + { + "epoch": 1.3760779482584837, + "grad_norm": 1.219199538230896, + "learning_rate": 9.87992105263158e-05, + "loss": 0.4622, + "step": 24574 + }, + { + "epoch": 1.3761339455706127, + "grad_norm": 1.918542504310608, + "learning_rate": 9.879894736842106e-05, + "loss": 0.4706, + "step": 24575 + }, + { + "epoch": 1.3761899428827415, + "grad_norm": 1.292521595954895, + "learning_rate": 9.879868421052631e-05, + "loss": 0.3774, + "step": 24576 + }, + { + "epoch": 1.3762459401948706, + "grad_norm": 1.4552044868469238, + "learning_rate": 9.879842105263159e-05, + "loss": 0.5177, + "step": 24577 + }, + { + "epoch": 1.3763019375069996, + "grad_norm": 1.1717822551727295, + "learning_rate": 9.879815789473685e-05, + "loss": 0.5617, + "step": 24578 + }, + { + "epoch": 1.3763579348191286, + "grad_norm": 1.4983909130096436, + "learning_rate": 9.879789473684211e-05, + "loss": 0.4669, + "step": 24579 + }, + { + "epoch": 1.3764139321312576, + "grad_norm": 1.1356526613235474, + "learning_rate": 9.879763157894737e-05, + "loss": 0.4193, + "step": 24580 + }, + { + "epoch": 1.3764699294433866, + "grad_norm": 1.1955333948135376, + "learning_rate": 9.879736842105264e-05, + "loss": 0.3941, + "step": 24581 + }, + { + "epoch": 1.3765259267555157, + "grad_norm": 1.644252896308899, + "learning_rate": 9.87971052631579e-05, + "loss": 0.5471, + "step": 24582 + }, + { + "epoch": 1.3765819240676447, + "grad_norm": 1.354682445526123, + "learning_rate": 9.879684210526316e-05, + "loss": 0.4126, + "step": 24583 + }, + { + "epoch": 1.3766379213797737, + "grad_norm": 1.320406436920166, + "learning_rate": 9.879657894736842e-05, + "loss": 0.4518, + "step": 24584 + }, + { + "epoch": 1.3766939186919027, + "grad_norm": 1.5978227853775024, + "learning_rate": 9.879631578947369e-05, + "loss": 0.5316, + "step": 24585 + }, + { + "epoch": 1.3767499160040317, + "grad_norm": 1.4413739442825317, + "learning_rate": 9.879605263157895e-05, + "loss": 0.5114, + "step": 24586 + }, + { + "epoch": 
1.3768059133161608, + "grad_norm": 1.6403529644012451, + "learning_rate": 9.879578947368421e-05, + "loss": 0.5069, + "step": 24587 + }, + { + "epoch": 1.3768619106282898, + "grad_norm": 2.017141103744507, + "learning_rate": 9.879552631578947e-05, + "loss": 0.4421, + "step": 24588 + }, + { + "epoch": 1.3769179079404188, + "grad_norm": 1.1616919040679932, + "learning_rate": 9.879526315789475e-05, + "loss": 0.3874, + "step": 24589 + }, + { + "epoch": 1.3769739052525478, + "grad_norm": 1.409917950630188, + "learning_rate": 9.8795e-05, + "loss": 0.4899, + "step": 24590 + }, + { + "epoch": 1.3770299025646768, + "grad_norm": 1.1836915016174316, + "learning_rate": 9.879473684210528e-05, + "loss": 0.4048, + "step": 24591 + }, + { + "epoch": 1.3770858998768059, + "grad_norm": 1.5157136917114258, + "learning_rate": 9.879447368421052e-05, + "loss": 0.6242, + "step": 24592 + }, + { + "epoch": 1.3771418971889349, + "grad_norm": 1.304404616355896, + "learning_rate": 9.879421052631578e-05, + "loss": 0.4373, + "step": 24593 + }, + { + "epoch": 1.377197894501064, + "grad_norm": 1.431369423866272, + "learning_rate": 9.879394736842106e-05, + "loss": 0.4589, + "step": 24594 + }, + { + "epoch": 1.377253891813193, + "grad_norm": 1.170019507408142, + "learning_rate": 9.879368421052632e-05, + "loss": 0.456, + "step": 24595 + }, + { + "epoch": 1.377309889125322, + "grad_norm": 1.325610876083374, + "learning_rate": 9.879342105263159e-05, + "loss": 0.5171, + "step": 24596 + }, + { + "epoch": 1.377365886437451, + "grad_norm": 1.6425721645355225, + "learning_rate": 9.879315789473684e-05, + "loss": 0.5185, + "step": 24597 + }, + { + "epoch": 1.37742188374958, + "grad_norm": 1.7873594760894775, + "learning_rate": 9.879289473684211e-05, + "loss": 0.5937, + "step": 24598 + }, + { + "epoch": 1.377477881061709, + "grad_norm": 1.5284775495529175, + "learning_rate": 9.879263157894737e-05, + "loss": 0.411, + "step": 24599 + }, + { + "epoch": 1.377533878373838, + "grad_norm": 1.3774112462997437, + 
"learning_rate": 9.879236842105264e-05, + "loss": 0.4618, + "step": 24600 + }, + { + "epoch": 1.377589875685967, + "grad_norm": 1.327060580253601, + "learning_rate": 9.87921052631579e-05, + "loss": 0.443, + "step": 24601 + }, + { + "epoch": 1.377645872998096, + "grad_norm": 1.7170281410217285, + "learning_rate": 9.879184210526316e-05, + "loss": 0.5056, + "step": 24602 + }, + { + "epoch": 1.377701870310225, + "grad_norm": 1.513201355934143, + "learning_rate": 9.879157894736842e-05, + "loss": 0.5068, + "step": 24603 + }, + { + "epoch": 1.377757867622354, + "grad_norm": 1.3682736158370972, + "learning_rate": 9.87913157894737e-05, + "loss": 0.566, + "step": 24604 + }, + { + "epoch": 1.377813864934483, + "grad_norm": 1.3634577989578247, + "learning_rate": 9.879105263157896e-05, + "loss": 0.4635, + "step": 24605 + }, + { + "epoch": 1.3778698622466121, + "grad_norm": 1.2694344520568848, + "learning_rate": 9.879078947368422e-05, + "loss": 0.5066, + "step": 24606 + }, + { + "epoch": 1.3779258595587411, + "grad_norm": 1.6849581003189087, + "learning_rate": 9.879052631578947e-05, + "loss": 0.5615, + "step": 24607 + }, + { + "epoch": 1.3779818568708702, + "grad_norm": 1.1305550336837769, + "learning_rate": 9.879026315789475e-05, + "loss": 0.5246, + "step": 24608 + }, + { + "epoch": 1.3780378541829992, + "grad_norm": 1.4769785404205322, + "learning_rate": 9.879000000000001e-05, + "loss": 0.4707, + "step": 24609 + }, + { + "epoch": 1.3780938514951282, + "grad_norm": 1.4925611019134521, + "learning_rate": 9.878973684210527e-05, + "loss": 0.8428, + "step": 24610 + }, + { + "epoch": 1.3781498488072572, + "grad_norm": 1.2915689945220947, + "learning_rate": 9.878947368421053e-05, + "loss": 0.4596, + "step": 24611 + }, + { + "epoch": 1.3782058461193862, + "grad_norm": 1.2616046667099, + "learning_rate": 9.878921052631579e-05, + "loss": 0.4247, + "step": 24612 + }, + { + "epoch": 1.3782618434315153, + "grad_norm": 1.432315707206726, + "learning_rate": 9.878894736842106e-05, + "loss": 
0.4922, + "step": 24613 + }, + { + "epoch": 1.3783178407436443, + "grad_norm": 1.3737187385559082, + "learning_rate": 9.878868421052632e-05, + "loss": 0.5139, + "step": 24614 + }, + { + "epoch": 1.3783738380557733, + "grad_norm": 1.4290072917938232, + "learning_rate": 9.878842105263158e-05, + "loss": 0.3874, + "step": 24615 + }, + { + "epoch": 1.3784298353679023, + "grad_norm": 1.3011133670806885, + "learning_rate": 9.878815789473684e-05, + "loss": 0.4629, + "step": 24616 + }, + { + "epoch": 1.3784858326800313, + "grad_norm": 1.2998813390731812, + "learning_rate": 9.878789473684211e-05, + "loss": 0.4393, + "step": 24617 + }, + { + "epoch": 1.3785418299921604, + "grad_norm": 1.4572488069534302, + "learning_rate": 9.878763157894737e-05, + "loss": 0.4486, + "step": 24618 + }, + { + "epoch": 1.3785978273042894, + "grad_norm": 1.1944403648376465, + "learning_rate": 9.878736842105265e-05, + "loss": 0.3721, + "step": 24619 + }, + { + "epoch": 1.3786538246164184, + "grad_norm": 1.4873815774917603, + "learning_rate": 9.878710526315789e-05, + "loss": 0.4823, + "step": 24620 + }, + { + "epoch": 1.3787098219285474, + "grad_norm": 1.6921597719192505, + "learning_rate": 9.878684210526317e-05, + "loss": 0.5813, + "step": 24621 + }, + { + "epoch": 1.3787658192406764, + "grad_norm": 1.540161371231079, + "learning_rate": 9.878657894736843e-05, + "loss": 0.5494, + "step": 24622 + }, + { + "epoch": 1.3788218165528054, + "grad_norm": 1.5844582319259644, + "learning_rate": 9.87863157894737e-05, + "loss": 0.5868, + "step": 24623 + }, + { + "epoch": 1.3788778138649345, + "grad_norm": 1.5227375030517578, + "learning_rate": 9.878605263157896e-05, + "loss": 0.4472, + "step": 24624 + }, + { + "epoch": 1.3789338111770635, + "grad_norm": 1.295911431312561, + "learning_rate": 9.878578947368422e-05, + "loss": 0.434, + "step": 24625 + }, + { + "epoch": 1.3789898084891925, + "grad_norm": 1.7489311695098877, + "learning_rate": 9.878552631578948e-05, + "loss": 0.4214, + "step": 24626 + }, + { + 
"epoch": 1.3790458058013215, + "grad_norm": 1.0726250410079956, + "learning_rate": 9.878526315789474e-05, + "loss": 0.3382, + "step": 24627 + }, + { + "epoch": 1.3791018031134505, + "grad_norm": 1.4631832838058472, + "learning_rate": 9.878500000000001e-05, + "loss": 0.4638, + "step": 24628 + }, + { + "epoch": 1.3791578004255796, + "grad_norm": 1.142659068107605, + "learning_rate": 9.878473684210526e-05, + "loss": 0.351, + "step": 24629 + }, + { + "epoch": 1.3792137977377086, + "grad_norm": 1.292729377746582, + "learning_rate": 9.878447368421053e-05, + "loss": 0.3391, + "step": 24630 + }, + { + "epoch": 1.3792697950498376, + "grad_norm": 1.1582690477371216, + "learning_rate": 9.878421052631579e-05, + "loss": 0.4104, + "step": 24631 + }, + { + "epoch": 1.3793257923619666, + "grad_norm": 1.4168444871902466, + "learning_rate": 9.878394736842106e-05, + "loss": 0.3806, + "step": 24632 + }, + { + "epoch": 1.3793817896740956, + "grad_norm": 1.4440557956695557, + "learning_rate": 9.878368421052632e-05, + "loss": 0.5473, + "step": 24633 + }, + { + "epoch": 1.3794377869862247, + "grad_norm": 1.5434112548828125, + "learning_rate": 9.878342105263158e-05, + "loss": 0.4289, + "step": 24634 + }, + { + "epoch": 1.3794937842983537, + "grad_norm": 1.4226053953170776, + "learning_rate": 9.878315789473684e-05, + "loss": 0.4301, + "step": 24635 + }, + { + "epoch": 1.3795497816104827, + "grad_norm": 1.3752599954605103, + "learning_rate": 9.878289473684212e-05, + "loss": 0.583, + "step": 24636 + }, + { + "epoch": 1.3796057789226117, + "grad_norm": 1.5356531143188477, + "learning_rate": 9.878263157894738e-05, + "loss": 0.4519, + "step": 24637 + }, + { + "epoch": 1.3796617762347407, + "grad_norm": 1.1383914947509766, + "learning_rate": 9.878236842105263e-05, + "loss": 0.4648, + "step": 24638 + }, + { + "epoch": 1.3797177735468698, + "grad_norm": 1.3420947790145874, + "learning_rate": 9.87821052631579e-05, + "loss": 0.4335, + "step": 24639 + }, + { + "epoch": 1.3797737708589988, + 
"grad_norm": 1.1258677244186401, + "learning_rate": 9.878184210526317e-05, + "loss": 0.4219, + "step": 24640 + }, + { + "epoch": 1.3798297681711278, + "grad_norm": 1.1578919887542725, + "learning_rate": 9.878157894736843e-05, + "loss": 0.4617, + "step": 24641 + }, + { + "epoch": 1.3798857654832568, + "grad_norm": 1.464247226715088, + "learning_rate": 9.878131578947369e-05, + "loss": 0.5192, + "step": 24642 + }, + { + "epoch": 1.3799417627953858, + "grad_norm": 1.4965718984603882, + "learning_rate": 9.878105263157895e-05, + "loss": 0.4869, + "step": 24643 + }, + { + "epoch": 1.3799977601075148, + "grad_norm": 2.279975652694702, + "learning_rate": 9.878078947368421e-05, + "loss": 0.4597, + "step": 24644 + }, + { + "epoch": 1.3800537574196439, + "grad_norm": 1.3425018787384033, + "learning_rate": 9.878052631578948e-05, + "loss": 0.3994, + "step": 24645 + }, + { + "epoch": 1.3801097547317729, + "grad_norm": 1.2083648443222046, + "learning_rate": 9.878026315789474e-05, + "loss": 0.3594, + "step": 24646 + }, + { + "epoch": 1.380165752043902, + "grad_norm": 1.3597885370254517, + "learning_rate": 9.878e-05, + "loss": 0.4676, + "step": 24647 + }, + { + "epoch": 1.380221749356031, + "grad_norm": 1.4435182809829712, + "learning_rate": 9.877973684210526e-05, + "loss": 0.4462, + "step": 24648 + }, + { + "epoch": 1.38027774666816, + "grad_norm": 1.4416115283966064, + "learning_rate": 9.877947368421053e-05, + "loss": 0.5033, + "step": 24649 + }, + { + "epoch": 1.380333743980289, + "grad_norm": 1.3227752447128296, + "learning_rate": 9.877921052631579e-05, + "loss": 0.4932, + "step": 24650 + }, + { + "epoch": 1.380389741292418, + "grad_norm": 1.5899603366851807, + "learning_rate": 9.877894736842107e-05, + "loss": 0.6036, + "step": 24651 + }, + { + "epoch": 1.380445738604547, + "grad_norm": 1.2183899879455566, + "learning_rate": 9.877868421052631e-05, + "loss": 0.4057, + "step": 24652 + }, + { + "epoch": 1.380501735916676, + "grad_norm": 1.0418287515640259, + "learning_rate": 
9.877842105263159e-05, + "loss": 0.3262, + "step": 24653 + }, + { + "epoch": 1.380557733228805, + "grad_norm": 1.2916969060897827, + "learning_rate": 9.877815789473684e-05, + "loss": 0.4956, + "step": 24654 + }, + { + "epoch": 1.380613730540934, + "grad_norm": 1.5164707899093628, + "learning_rate": 9.877789473684212e-05, + "loss": 0.46, + "step": 24655 + }, + { + "epoch": 1.380669727853063, + "grad_norm": 1.257965326309204, + "learning_rate": 9.877763157894738e-05, + "loss": 0.5249, + "step": 24656 + }, + { + "epoch": 1.380725725165192, + "grad_norm": 1.462067723274231, + "learning_rate": 9.877736842105264e-05, + "loss": 0.4827, + "step": 24657 + }, + { + "epoch": 1.3807817224773211, + "grad_norm": 1.2253189086914062, + "learning_rate": 9.87771052631579e-05, + "loss": 0.379, + "step": 24658 + }, + { + "epoch": 1.3808377197894501, + "grad_norm": 1.1978322267532349, + "learning_rate": 9.877684210526317e-05, + "loss": 0.3814, + "step": 24659 + }, + { + "epoch": 1.3808937171015792, + "grad_norm": 1.9540668725967407, + "learning_rate": 9.877657894736843e-05, + "loss": 0.4833, + "step": 24660 + }, + { + "epoch": 1.3809497144137082, + "grad_norm": 1.3607112169265747, + "learning_rate": 9.877631578947369e-05, + "loss": 0.5521, + "step": 24661 + }, + { + "epoch": 1.3810057117258372, + "grad_norm": 1.5534237623214722, + "learning_rate": 9.877605263157895e-05, + "loss": 0.385, + "step": 24662 + }, + { + "epoch": 1.3810617090379662, + "grad_norm": 1.330488681793213, + "learning_rate": 9.877578947368421e-05, + "loss": 0.5468, + "step": 24663 + }, + { + "epoch": 1.3811177063500952, + "grad_norm": 1.4170063734054565, + "learning_rate": 9.877552631578948e-05, + "loss": 0.5088, + "step": 24664 + }, + { + "epoch": 1.3811737036622243, + "grad_norm": 1.3981940746307373, + "learning_rate": 9.877526315789474e-05, + "loss": 0.5177, + "step": 24665 + }, + { + "epoch": 1.3812297009743533, + "grad_norm": 1.315423846244812, + "learning_rate": 9.8775e-05, + "loss": 0.4008, + "step": 24666 + 
}, + { + "epoch": 1.3812856982864823, + "grad_norm": 1.325331687927246, + "learning_rate": 9.877473684210526e-05, + "loss": 0.6356, + "step": 24667 + }, + { + "epoch": 1.3813416955986113, + "grad_norm": 1.5587759017944336, + "learning_rate": 9.877447368421054e-05, + "loss": 0.5645, + "step": 24668 + }, + { + "epoch": 1.3813976929107403, + "grad_norm": 1.4653170108795166, + "learning_rate": 9.87742105263158e-05, + "loss": 0.5262, + "step": 24669 + }, + { + "epoch": 1.3814536902228693, + "grad_norm": 1.3540924787521362, + "learning_rate": 9.877394736842105e-05, + "loss": 0.3624, + "step": 24670 + }, + { + "epoch": 1.3815096875349984, + "grad_norm": 1.3128352165222168, + "learning_rate": 9.877368421052631e-05, + "loss": 0.4842, + "step": 24671 + }, + { + "epoch": 1.3815656848471274, + "grad_norm": 1.0599039793014526, + "learning_rate": 9.877342105263159e-05, + "loss": 0.3384, + "step": 24672 + }, + { + "epoch": 1.3816216821592564, + "grad_norm": 1.2542729377746582, + "learning_rate": 9.877315789473685e-05, + "loss": 0.5512, + "step": 24673 + }, + { + "epoch": 1.3816776794713854, + "grad_norm": 1.223557710647583, + "learning_rate": 9.877289473684212e-05, + "loss": 0.4321, + "step": 24674 + }, + { + "epoch": 1.3817336767835144, + "grad_norm": 1.2568310499191284, + "learning_rate": 9.877263157894737e-05, + "loss": 0.5891, + "step": 24675 + }, + { + "epoch": 1.3817896740956435, + "grad_norm": 1.4729034900665283, + "learning_rate": 9.877236842105264e-05, + "loss": 0.4209, + "step": 24676 + }, + { + "epoch": 1.3818456714077725, + "grad_norm": 1.715546727180481, + "learning_rate": 9.87721052631579e-05, + "loss": 0.5552, + "step": 24677 + }, + { + "epoch": 1.3819016687199015, + "grad_norm": 1.522024154663086, + "learning_rate": 9.877184210526317e-05, + "loss": 0.4391, + "step": 24678 + }, + { + "epoch": 1.3819576660320305, + "grad_norm": 1.5103111267089844, + "learning_rate": 9.877157894736843e-05, + "loss": 0.6572, + "step": 24679 + }, + { + "epoch": 1.3820136633441595, + 
"grad_norm": 1.2847040891647339, + "learning_rate": 9.877131578947368e-05, + "loss": 0.3829, + "step": 24680 + }, + { + "epoch": 1.3820696606562886, + "grad_norm": 1.3065253496170044, + "learning_rate": 9.877105263157895e-05, + "loss": 0.364, + "step": 24681 + }, + { + "epoch": 1.3821256579684176, + "grad_norm": 1.60619056224823, + "learning_rate": 9.877078947368421e-05, + "loss": 0.4733, + "step": 24682 + }, + { + "epoch": 1.3821816552805466, + "grad_norm": 1.3917906284332275, + "learning_rate": 9.877052631578949e-05, + "loss": 0.5068, + "step": 24683 + }, + { + "epoch": 1.3822376525926756, + "grad_norm": 1.093767523765564, + "learning_rate": 9.877026315789473e-05, + "loss": 0.3872, + "step": 24684 + }, + { + "epoch": 1.3822936499048046, + "grad_norm": 1.4227591753005981, + "learning_rate": 9.877e-05, + "loss": 0.565, + "step": 24685 + }, + { + "epoch": 1.3823496472169337, + "grad_norm": 1.251604437828064, + "learning_rate": 9.876973684210526e-05, + "loss": 0.5833, + "step": 24686 + }, + { + "epoch": 1.3824056445290627, + "grad_norm": 1.1730982065200806, + "learning_rate": 9.876947368421054e-05, + "loss": 0.4711, + "step": 24687 + }, + { + "epoch": 1.3824616418411917, + "grad_norm": 1.510393500328064, + "learning_rate": 9.87692105263158e-05, + "loss": 0.486, + "step": 24688 + }, + { + "epoch": 1.3825176391533207, + "grad_norm": 1.3401352167129517, + "learning_rate": 9.876894736842106e-05, + "loss": 0.4406, + "step": 24689 + }, + { + "epoch": 1.3825736364654497, + "grad_norm": 1.283668041229248, + "learning_rate": 9.876868421052632e-05, + "loss": 0.5077, + "step": 24690 + }, + { + "epoch": 1.3826296337775787, + "grad_norm": 1.4496607780456543, + "learning_rate": 9.876842105263159e-05, + "loss": 0.5748, + "step": 24691 + }, + { + "epoch": 1.3826856310897078, + "grad_norm": 1.473398208618164, + "learning_rate": 9.876815789473685e-05, + "loss": 0.4832, + "step": 24692 + }, + { + "epoch": 1.3827416284018368, + "grad_norm": 1.3518096208572388, + "learning_rate": 
9.876789473684211e-05, + "loss": 0.3901, + "step": 24693 + }, + { + "epoch": 1.3827976257139658, + "grad_norm": 1.4092508554458618, + "learning_rate": 9.876763157894737e-05, + "loss": 0.4113, + "step": 24694 + }, + { + "epoch": 1.3828536230260948, + "grad_norm": 1.2037180662155151, + "learning_rate": 9.876736842105264e-05, + "loss": 0.435, + "step": 24695 + }, + { + "epoch": 1.3829096203382238, + "grad_norm": 1.2900842428207397, + "learning_rate": 9.87671052631579e-05, + "loss": 0.4582, + "step": 24696 + }, + { + "epoch": 1.3829656176503529, + "grad_norm": 1.3820942640304565, + "learning_rate": 9.876684210526316e-05, + "loss": 0.5405, + "step": 24697 + }, + { + "epoch": 1.3830216149624819, + "grad_norm": 1.4084657430648804, + "learning_rate": 9.876657894736842e-05, + "loss": 0.5001, + "step": 24698 + }, + { + "epoch": 1.383077612274611, + "grad_norm": NaN, + "learning_rate": 9.876657894736842e-05, + "loss": 0.5658, + "step": 24699 + }, + { + "epoch": 1.3831336095867397, + "grad_norm": 1.2814196348190308, + "learning_rate": 9.876631578947368e-05, + "loss": 0.4785, + "step": 24700 + }, + { + "epoch": 1.3831896068988687, + "grad_norm": 1.270787239074707, + "learning_rate": 9.876605263157895e-05, + "loss": 0.4129, + "step": 24701 + }, + { + "epoch": 1.3832456042109977, + "grad_norm": 1.2655234336853027, + "learning_rate": 9.876578947368421e-05, + "loss": 0.5402, + "step": 24702 + }, + { + "epoch": 1.3833016015231268, + "grad_norm": 1.4280146360397339, + "learning_rate": 9.876552631578947e-05, + "loss": 0.5095, + "step": 24703 + }, + { + "epoch": 1.3833575988352558, + "grad_norm": 1.4920822381973267, + "learning_rate": 9.876526315789473e-05, + "loss": 0.6155, + "step": 24704 + }, + { + "epoch": 1.3834135961473848, + "grad_norm": 1.2366554737091064, + "learning_rate": 9.876500000000001e-05, + "loss": 0.5197, + "step": 24705 + }, + { + "epoch": 1.3834695934595138, + "grad_norm": 1.5644608736038208, + "learning_rate": 9.876473684210527e-05, + "loss": 0.5479, + "step": 
24706 + }, + { + "epoch": 1.3835255907716428, + "grad_norm": 1.2335882186889648, + "learning_rate": 9.876447368421054e-05, + "loss": 0.3712, + "step": 24707 + }, + { + "epoch": 1.3835815880837719, + "grad_norm": 1.6657646894454956, + "learning_rate": 9.876421052631579e-05, + "loss": 0.4121, + "step": 24708 + }, + { + "epoch": 1.3836375853959009, + "grad_norm": 1.9175894260406494, + "learning_rate": 9.876394736842106e-05, + "loss": 0.4764, + "step": 24709 + }, + { + "epoch": 1.38369358270803, + "grad_norm": 1.399888515472412, + "learning_rate": 9.876368421052632e-05, + "loss": 0.5589, + "step": 24710 + }, + { + "epoch": 1.383749580020159, + "grad_norm": 1.2780975103378296, + "learning_rate": 9.876342105263159e-05, + "loss": 0.3981, + "step": 24711 + }, + { + "epoch": 1.383805577332288, + "grad_norm": 1.5270181894302368, + "learning_rate": 9.876315789473685e-05, + "loss": 0.3951, + "step": 24712 + }, + { + "epoch": 1.383861574644417, + "grad_norm": 1.413232684135437, + "learning_rate": 9.876289473684211e-05, + "loss": 0.5846, + "step": 24713 + }, + { + "epoch": 1.383917571956546, + "grad_norm": 1.5790107250213623, + "learning_rate": 9.876263157894737e-05, + "loss": 0.5064, + "step": 24714 + }, + { + "epoch": 1.383973569268675, + "grad_norm": 1.382285714149475, + "learning_rate": 9.876236842105263e-05, + "loss": 0.5088, + "step": 24715 + }, + { + "epoch": 1.384029566580804, + "grad_norm": 1.5090510845184326, + "learning_rate": 9.87621052631579e-05, + "loss": 0.5117, + "step": 24716 + }, + { + "epoch": 1.384085563892933, + "grad_norm": 1.3253145217895508, + "learning_rate": 9.876184210526316e-05, + "loss": 0.4963, + "step": 24717 + }, + { + "epoch": 1.384141561205062, + "grad_norm": 1.4267219305038452, + "learning_rate": 9.876157894736842e-05, + "loss": 0.5101, + "step": 24718 + }, + { + "epoch": 1.384197558517191, + "grad_norm": 1.1575123071670532, + "learning_rate": 9.876131578947368e-05, + "loss": 0.3897, + "step": 24719 + }, + { + "epoch": 1.38425355582932, + 
"grad_norm": 1.4060261249542236, + "learning_rate": 9.876105263157896e-05, + "loss": 0.3516, + "step": 24720 + }, + { + "epoch": 1.384309553141449, + "grad_norm": 1.3479305505752563, + "learning_rate": 9.876078947368422e-05, + "loss": 0.3674, + "step": 24721 + }, + { + "epoch": 1.3843655504535781, + "grad_norm": 1.3310964107513428, + "learning_rate": 9.876052631578948e-05, + "loss": 0.435, + "step": 24722 + }, + { + "epoch": 1.3844215477657071, + "grad_norm": 1.4235334396362305, + "learning_rate": 9.876026315789474e-05, + "loss": 0.4469, + "step": 24723 + }, + { + "epoch": 1.3844775450778362, + "grad_norm": 1.3577880859375, + "learning_rate": 9.876000000000001e-05, + "loss": 0.4412, + "step": 24724 + }, + { + "epoch": 1.3845335423899652, + "grad_norm": 1.4240859746932983, + "learning_rate": 9.875973684210527e-05, + "loss": 0.5463, + "step": 24725 + }, + { + "epoch": 1.3845895397020942, + "grad_norm": 1.590160608291626, + "learning_rate": 9.875947368421053e-05, + "loss": 0.5334, + "step": 24726 + }, + { + "epoch": 1.3846455370142232, + "grad_norm": 1.2518236637115479, + "learning_rate": 9.875921052631579e-05, + "loss": 0.4658, + "step": 24727 + }, + { + "epoch": 1.3847015343263522, + "grad_norm": 1.2080436944961548, + "learning_rate": 9.875894736842106e-05, + "loss": 0.4117, + "step": 24728 + }, + { + "epoch": 1.3847575316384813, + "grad_norm": 1.5010343790054321, + "learning_rate": 9.875868421052632e-05, + "loss": 0.4611, + "step": 24729 + }, + { + "epoch": 1.3848135289506103, + "grad_norm": 1.06449294090271, + "learning_rate": 9.87584210526316e-05, + "loss": 0.3236, + "step": 24730 + }, + { + "epoch": 1.3848695262627393, + "grad_norm": 1.2299138307571411, + "learning_rate": 9.875815789473684e-05, + "loss": 0.4756, + "step": 24731 + }, + { + "epoch": 1.3849255235748683, + "grad_norm": 1.4848288297653198, + "learning_rate": 9.87578947368421e-05, + "loss": 0.4313, + "step": 24732 + }, + { + "epoch": 1.3849815208869973, + "grad_norm": 1.2216533422470093, + 
"learning_rate": 9.875763157894737e-05, + "loss": 0.5013, + "step": 24733 + }, + { + "epoch": 1.3850375181991263, + "grad_norm": 1.5379114151000977, + "learning_rate": 9.875736842105263e-05, + "loss": 0.5764, + "step": 24734 + }, + { + "epoch": 1.3850935155112554, + "grad_norm": 1.2929130792617798, + "learning_rate": 9.875710526315791e-05, + "loss": 0.4394, + "step": 24735 + }, + { + "epoch": 1.3851495128233844, + "grad_norm": 1.3166282176971436, + "learning_rate": 9.875684210526315e-05, + "loss": 0.3692, + "step": 24736 + }, + { + "epoch": 1.3852055101355134, + "grad_norm": 1.3195898532867432, + "learning_rate": 9.875657894736843e-05, + "loss": 0.3983, + "step": 24737 + }, + { + "epoch": 1.3852615074476424, + "grad_norm": 1.0964648723602295, + "learning_rate": 9.875631578947369e-05, + "loss": 0.3791, + "step": 24738 + }, + { + "epoch": 1.3853175047597714, + "grad_norm": 1.157021164894104, + "learning_rate": 9.875605263157896e-05, + "loss": 0.4123, + "step": 24739 + }, + { + "epoch": 1.3853735020719005, + "grad_norm": 1.494493842124939, + "learning_rate": 9.87557894736842e-05, + "loss": 0.5978, + "step": 24740 + }, + { + "epoch": 1.3854294993840295, + "grad_norm": 1.2963590621948242, + "learning_rate": 9.875552631578948e-05, + "loss": 0.4444, + "step": 24741 + }, + { + "epoch": 1.3854854966961585, + "grad_norm": 1.553191900253296, + "learning_rate": 9.875526315789474e-05, + "loss": 0.5829, + "step": 24742 + }, + { + "epoch": 1.3855414940082875, + "grad_norm": 1.1810563802719116, + "learning_rate": 9.875500000000001e-05, + "loss": 0.3162, + "step": 24743 + }, + { + "epoch": 1.3855974913204165, + "grad_norm": 1.3519984483718872, + "learning_rate": 9.875473684210527e-05, + "loss": 0.5755, + "step": 24744 + }, + { + "epoch": 1.3856534886325456, + "grad_norm": 1.3369777202606201, + "learning_rate": 9.875447368421053e-05, + "loss": 0.5488, + "step": 24745 + }, + { + "epoch": 1.3857094859446746, + "grad_norm": 1.6594231128692627, + "learning_rate": 9.875421052631579e-05, 
+ "loss": 0.7104, + "step": 24746 + }, + { + "epoch": 1.3857654832568036, + "grad_norm": 1.256836175918579, + "learning_rate": 9.875394736842106e-05, + "loss": 0.403, + "step": 24747 + }, + { + "epoch": 1.3858214805689326, + "grad_norm": 1.3793790340423584, + "learning_rate": 9.875368421052632e-05, + "loss": 0.5349, + "step": 24748 + }, + { + "epoch": 1.3858774778810616, + "grad_norm": 1.877210259437561, + "learning_rate": 9.875342105263158e-05, + "loss": 0.5041, + "step": 24749 + }, + { + "epoch": 1.3859334751931907, + "grad_norm": 1.3058441877365112, + "learning_rate": 9.875315789473684e-05, + "loss": 0.5957, + "step": 24750 + }, + { + "epoch": 1.3859894725053197, + "grad_norm": 1.1621700525283813, + "learning_rate": 9.87528947368421e-05, + "loss": 0.3936, + "step": 24751 + }, + { + "epoch": 1.3860454698174487, + "grad_norm": 1.2263314723968506, + "learning_rate": 9.875263157894738e-05, + "loss": 0.4816, + "step": 24752 + }, + { + "epoch": 1.3861014671295777, + "grad_norm": 1.1986078023910522, + "learning_rate": 9.875236842105264e-05, + "loss": 0.4512, + "step": 24753 + }, + { + "epoch": 1.3861574644417067, + "grad_norm": 1.5553449392318726, + "learning_rate": 9.87521052631579e-05, + "loss": 0.4484, + "step": 24754 + }, + { + "epoch": 1.3862134617538358, + "grad_norm": 1.3770147562026978, + "learning_rate": 9.875184210526316e-05, + "loss": 0.5147, + "step": 24755 + }, + { + "epoch": 1.3862694590659648, + "grad_norm": 1.026626706123352, + "learning_rate": 9.875157894736843e-05, + "loss": 0.323, + "step": 24756 + }, + { + "epoch": 1.3863254563780938, + "grad_norm": 1.424196481704712, + "learning_rate": 9.875131578947369e-05, + "loss": 0.5097, + "step": 24757 + }, + { + "epoch": 1.3863814536902228, + "grad_norm": 1.755611777305603, + "learning_rate": 9.875105263157895e-05, + "loss": 0.52, + "step": 24758 + }, + { + "epoch": 1.3864374510023518, + "grad_norm": 1.2150194644927979, + "learning_rate": 9.875078947368421e-05, + "loss": 0.4028, + "step": 24759 + }, + { + 
"epoch": 1.3864934483144808, + "grad_norm": 1.298742413520813, + "learning_rate": 9.875052631578948e-05, + "loss": 0.4336, + "step": 24760 + }, + { + "epoch": 1.3865494456266099, + "grad_norm": 1.5085780620574951, + "learning_rate": 9.875026315789474e-05, + "loss": 0.3935, + "step": 24761 + }, + { + "epoch": 1.3866054429387389, + "grad_norm": 1.3242181539535522, + "learning_rate": 9.875000000000002e-05, + "loss": 0.51, + "step": 24762 + }, + { + "epoch": 1.386661440250868, + "grad_norm": 1.444621205329895, + "learning_rate": 9.874973684210526e-05, + "loss": 0.4794, + "step": 24763 + }, + { + "epoch": 1.386717437562997, + "grad_norm": 1.2220877408981323, + "learning_rate": 9.874947368421053e-05, + "loss": 0.4074, + "step": 24764 + }, + { + "epoch": 1.386773434875126, + "grad_norm": 1.3091826438903809, + "learning_rate": 9.87492105263158e-05, + "loss": 0.4074, + "step": 24765 + }, + { + "epoch": 1.386829432187255, + "grad_norm": 1.3461570739746094, + "learning_rate": 9.874894736842105e-05, + "loss": 0.4497, + "step": 24766 + }, + { + "epoch": 1.386885429499384, + "grad_norm": 1.4616492986679077, + "learning_rate": 9.874868421052633e-05, + "loss": 0.4912, + "step": 24767 + }, + { + "epoch": 1.386941426811513, + "grad_norm": 1.343125343322754, + "learning_rate": 9.874842105263157e-05, + "loss": 0.5187, + "step": 24768 + }, + { + "epoch": 1.386997424123642, + "grad_norm": 1.45669424533844, + "learning_rate": 9.874815789473685e-05, + "loss": 0.5832, + "step": 24769 + }, + { + "epoch": 1.387053421435771, + "grad_norm": 1.4133449792861938, + "learning_rate": 9.87478947368421e-05, + "loss": 0.4095, + "step": 24770 + }, + { + "epoch": 1.3871094187479, + "grad_norm": 1.8313589096069336, + "learning_rate": 9.874763157894738e-05, + "loss": 0.6705, + "step": 24771 + }, + { + "epoch": 1.387165416060029, + "grad_norm": 1.3852347135543823, + "learning_rate": 9.874736842105264e-05, + "loss": 0.5292, + "step": 24772 + }, + { + "epoch": 1.387221413372158, + "grad_norm": 
1.3958368301391602, + "learning_rate": 9.87471052631579e-05, + "loss": 0.3874, + "step": 24773 + }, + { + "epoch": 1.3872774106842871, + "grad_norm": 1.496346354484558, + "learning_rate": 9.874684210526316e-05, + "loss": 0.5661, + "step": 24774 + }, + { + "epoch": 1.3873334079964161, + "grad_norm": 1.2413619756698608, + "learning_rate": 9.874657894736843e-05, + "loss": 0.5856, + "step": 24775 + }, + { + "epoch": 1.3873894053085452, + "grad_norm": 1.4150477647781372, + "learning_rate": 9.874631578947369e-05, + "loss": 0.4548, + "step": 24776 + }, + { + "epoch": 1.3874454026206742, + "grad_norm": 1.2681676149368286, + "learning_rate": 9.874605263157895e-05, + "loss": 0.5001, + "step": 24777 + }, + { + "epoch": 1.3875013999328032, + "grad_norm": 1.6500073671340942, + "learning_rate": 9.874578947368421e-05, + "loss": 0.5182, + "step": 24778 + }, + { + "epoch": 1.3875573972449322, + "grad_norm": 1.3954724073410034, + "learning_rate": 9.874552631578948e-05, + "loss": 0.3805, + "step": 24779 + }, + { + "epoch": 1.3876133945570612, + "grad_norm": 1.540405035018921, + "learning_rate": 9.874526315789474e-05, + "loss": 0.6096, + "step": 24780 + }, + { + "epoch": 1.3876693918691902, + "grad_norm": 1.313897728919983, + "learning_rate": 9.8745e-05, + "loss": 0.4262, + "step": 24781 + }, + { + "epoch": 1.3877253891813193, + "grad_norm": 1.4382654428482056, + "learning_rate": 9.874473684210526e-05, + "loss": 0.4548, + "step": 24782 + }, + { + "epoch": 1.3877813864934483, + "grad_norm": 1.3728103637695312, + "learning_rate": 9.874447368421052e-05, + "loss": 0.4775, + "step": 24783 + }, + { + "epoch": 1.3878373838055773, + "grad_norm": 1.2177050113677979, + "learning_rate": 9.87442105263158e-05, + "loss": 0.4215, + "step": 24784 + }, + { + "epoch": 1.3878933811177063, + "grad_norm": 1.2666003704071045, + "learning_rate": 9.874394736842106e-05, + "loss": 0.5811, + "step": 24785 + }, + { + "epoch": 1.3879493784298353, + "grad_norm": 1.3011101484298706, + "learning_rate": 
9.874368421052632e-05, + "loss": 0.4707, + "step": 24786 + }, + { + "epoch": 1.3880053757419644, + "grad_norm": 1.4403101205825806, + "learning_rate": 9.874342105263158e-05, + "loss": 0.5231, + "step": 24787 + }, + { + "epoch": 1.3880613730540934, + "grad_norm": 1.5996503829956055, + "learning_rate": 9.874315789473685e-05, + "loss": 0.5047, + "step": 24788 + }, + { + "epoch": 1.3881173703662224, + "grad_norm": 1.383208155632019, + "learning_rate": 9.874289473684211e-05, + "loss": 0.4274, + "step": 24789 + }, + { + "epoch": 1.3881733676783514, + "grad_norm": 1.2859172821044922, + "learning_rate": 9.874263157894737e-05, + "loss": 0.4462, + "step": 24790 + }, + { + "epoch": 1.3882293649904804, + "grad_norm": 1.318159818649292, + "learning_rate": 9.874236842105263e-05, + "loss": 0.4422, + "step": 24791 + }, + { + "epoch": 1.3882853623026095, + "grad_norm": 1.6016712188720703, + "learning_rate": 9.87421052631579e-05, + "loss": 0.4696, + "step": 24792 + }, + { + "epoch": 1.3883413596147385, + "grad_norm": 1.4052642583847046, + "learning_rate": 9.874184210526316e-05, + "loss": 0.534, + "step": 24793 + }, + { + "epoch": 1.3883973569268675, + "grad_norm": 1.2752387523651123, + "learning_rate": 9.874157894736843e-05, + "loss": 0.4008, + "step": 24794 + }, + { + "epoch": 1.3884533542389965, + "grad_norm": 1.1638542413711548, + "learning_rate": 9.874131578947368e-05, + "loss": 0.494, + "step": 24795 + }, + { + "epoch": 1.3885093515511255, + "grad_norm": 1.2391552925109863, + "learning_rate": 9.874105263157895e-05, + "loss": 0.5531, + "step": 24796 + }, + { + "epoch": 1.3885653488632546, + "grad_norm": 1.5980744361877441, + "learning_rate": 9.874078947368421e-05, + "loss": 0.3775, + "step": 24797 + }, + { + "epoch": 1.3886213461753836, + "grad_norm": 1.2987377643585205, + "learning_rate": 9.874052631578949e-05, + "loss": 0.448, + "step": 24798 + }, + { + "epoch": 1.3886773434875126, + "grad_norm": 1.1683251857757568, + "learning_rate": 9.874026315789475e-05, + "loss": 0.3695, + 
"step": 24799 + }, + { + "epoch": 1.3887333407996416, + "grad_norm": 1.540535807609558, + "learning_rate": 9.874e-05, + "loss": 0.3995, + "step": 24800 + }, + { + "epoch": 1.3887893381117706, + "grad_norm": 1.7004040479660034, + "learning_rate": 9.873973684210527e-05, + "loss": 0.6755, + "step": 24801 + }, + { + "epoch": 1.3888453354238997, + "grad_norm": 1.4696658849716187, + "learning_rate": 9.873947368421053e-05, + "loss": 0.5824, + "step": 24802 + }, + { + "epoch": 1.3889013327360287, + "grad_norm": 1.2471377849578857, + "learning_rate": 9.87392105263158e-05, + "loss": 0.4334, + "step": 24803 + }, + { + "epoch": 1.3889573300481577, + "grad_norm": 1.641444444656372, + "learning_rate": 9.873894736842106e-05, + "loss": 0.4461, + "step": 24804 + }, + { + "epoch": 1.3890133273602867, + "grad_norm": 1.4771136045455933, + "learning_rate": 9.873868421052632e-05, + "loss": 0.4074, + "step": 24805 + }, + { + "epoch": 1.3890693246724157, + "grad_norm": 1.2724125385284424, + "learning_rate": 9.873842105263158e-05, + "loss": 0.4007, + "step": 24806 + }, + { + "epoch": 1.3891253219845447, + "grad_norm": 1.336830973625183, + "learning_rate": 9.873815789473685e-05, + "loss": 0.4776, + "step": 24807 + }, + { + "epoch": 1.3891813192966738, + "grad_norm": 1.6851872205734253, + "learning_rate": 9.873789473684211e-05, + "loss": 0.5717, + "step": 24808 + }, + { + "epoch": 1.3892373166088028, + "grad_norm": 1.4466724395751953, + "learning_rate": 9.873763157894737e-05, + "loss": 0.5386, + "step": 24809 + }, + { + "epoch": 1.3892933139209318, + "grad_norm": 1.3866513967514038, + "learning_rate": 9.873736842105263e-05, + "loss": 0.4549, + "step": 24810 + }, + { + "epoch": 1.3893493112330608, + "grad_norm": 1.4376996755599976, + "learning_rate": 9.87371052631579e-05, + "loss": 0.5588, + "step": 24811 + }, + { + "epoch": 1.3894053085451898, + "grad_norm": 1.3783830404281616, + "learning_rate": 9.873684210526316e-05, + "loss": 0.3829, + "step": 24812 + }, + { + "epoch": 1.3894613058573189, 
+ "grad_norm": 1.3257275819778442, + "learning_rate": 9.873657894736842e-05, + "loss": 0.4604, + "step": 24813 + }, + { + "epoch": 1.3895173031694479, + "grad_norm": 1.4820529222488403, + "learning_rate": 9.873631578947368e-05, + "loss": 0.4265, + "step": 24814 + }, + { + "epoch": 1.389573300481577, + "grad_norm": 1.401774287223816, + "learning_rate": 9.873605263157896e-05, + "loss": 0.4687, + "step": 24815 + }, + { + "epoch": 1.389629297793706, + "grad_norm": 1.2722257375717163, + "learning_rate": 9.873578947368422e-05, + "loss": 0.4431, + "step": 24816 + }, + { + "epoch": 1.389685295105835, + "grad_norm": 1.3507345914840698, + "learning_rate": 9.873552631578949e-05, + "loss": 0.499, + "step": 24817 + }, + { + "epoch": 1.389741292417964, + "grad_norm": 1.3066329956054688, + "learning_rate": 9.873526315789474e-05, + "loss": 0.3926, + "step": 24818 + }, + { + "epoch": 1.389797289730093, + "grad_norm": 1.2284746170043945, + "learning_rate": 9.8735e-05, + "loss": 0.4999, + "step": 24819 + }, + { + "epoch": 1.389853287042222, + "grad_norm": 1.3337175846099854, + "learning_rate": 9.873473684210527e-05, + "loss": 0.4306, + "step": 24820 + }, + { + "epoch": 1.389909284354351, + "grad_norm": 1.391411542892456, + "learning_rate": 9.873447368421053e-05, + "loss": 0.5146, + "step": 24821 + }, + { + "epoch": 1.38996528166648, + "grad_norm": 1.3370726108551025, + "learning_rate": 9.87342105263158e-05, + "loss": 0.5088, + "step": 24822 + }, + { + "epoch": 1.390021278978609, + "grad_norm": 1.313614010810852, + "learning_rate": 9.873394736842105e-05, + "loss": 0.545, + "step": 24823 + }, + { + "epoch": 1.390077276290738, + "grad_norm": 1.2633293867111206, + "learning_rate": 9.873368421052632e-05, + "loss": 0.5053, + "step": 24824 + }, + { + "epoch": 1.390133273602867, + "grad_norm": 1.53170907497406, + "learning_rate": 9.873342105263158e-05, + "loss": 0.541, + "step": 24825 + }, + { + "epoch": 1.3901892709149961, + "grad_norm": 1.2162044048309326, + "learning_rate": 
9.873315789473685e-05, + "loss": 0.4113, + "step": 24826 + }, + { + "epoch": 1.3902452682271251, + "grad_norm": 1.4812928438186646, + "learning_rate": 9.873289473684211e-05, + "loss": 0.416, + "step": 24827 + }, + { + "epoch": 1.3903012655392541, + "grad_norm": 1.2285056114196777, + "learning_rate": 9.873263157894737e-05, + "loss": 0.304, + "step": 24828 + }, + { + "epoch": 1.3903572628513832, + "grad_norm": 1.3497986793518066, + "learning_rate": 9.873236842105263e-05, + "loss": 0.5413, + "step": 24829 + }, + { + "epoch": 1.3904132601635122, + "grad_norm": 1.410062551498413, + "learning_rate": 9.873210526315791e-05, + "loss": 0.6767, + "step": 24830 + }, + { + "epoch": 1.3904692574756412, + "grad_norm": 1.5519541501998901, + "learning_rate": 9.873184210526317e-05, + "loss": 0.5966, + "step": 24831 + }, + { + "epoch": 1.3905252547877702, + "grad_norm": 1.116697907447815, + "learning_rate": 9.873157894736843e-05, + "loss": 0.474, + "step": 24832 + }, + { + "epoch": 1.3905812520998992, + "grad_norm": 1.367313265800476, + "learning_rate": 9.873131578947369e-05, + "loss": 0.4658, + "step": 24833 + }, + { + "epoch": 1.3906372494120283, + "grad_norm": 1.3255162239074707, + "learning_rate": 9.873105263157896e-05, + "loss": 0.5173, + "step": 24834 + }, + { + "epoch": 1.3906932467241573, + "grad_norm": 1.3296301364898682, + "learning_rate": 9.873078947368422e-05, + "loss": 0.5345, + "step": 24835 + }, + { + "epoch": 1.3907492440362863, + "grad_norm": 1.52505624294281, + "learning_rate": 9.873052631578948e-05, + "loss": 0.4608, + "step": 24836 + }, + { + "epoch": 1.3908052413484153, + "grad_norm": 1.3161721229553223, + "learning_rate": 9.873026315789474e-05, + "loss": 0.5049, + "step": 24837 + }, + { + "epoch": 1.3908612386605443, + "grad_norm": 1.3209775686264038, + "learning_rate": 9.873e-05, + "loss": 0.397, + "step": 24838 + }, + { + "epoch": 1.3909172359726734, + "grad_norm": 1.4917718172073364, + "learning_rate": 9.872973684210527e-05, + "loss": 0.694, + "step": 24839 + 
}, + { + "epoch": 1.3909732332848024, + "grad_norm": 1.4647622108459473, + "learning_rate": 9.872947368421053e-05, + "loss": 0.6038, + "step": 24840 + }, + { + "epoch": 1.3910292305969314, + "grad_norm": 1.9479498863220215, + "learning_rate": 9.872921052631579e-05, + "loss": 0.5725, + "step": 24841 + }, + { + "epoch": 1.3910852279090604, + "grad_norm": 1.154721736907959, + "learning_rate": 9.872894736842105e-05, + "loss": 0.3818, + "step": 24842 + }, + { + "epoch": 1.3911412252211894, + "grad_norm": 1.3792271614074707, + "learning_rate": 9.872868421052632e-05, + "loss": 0.5345, + "step": 24843 + }, + { + "epoch": 1.3911972225333185, + "grad_norm": 1.2258814573287964, + "learning_rate": 9.872842105263158e-05, + "loss": 0.4447, + "step": 24844 + }, + { + "epoch": 1.3912532198454475, + "grad_norm": 1.2293710708618164, + "learning_rate": 9.872815789473684e-05, + "loss": 0.4907, + "step": 24845 + }, + { + "epoch": 1.3913092171575765, + "grad_norm": 1.2616662979125977, + "learning_rate": 9.87278947368421e-05, + "loss": 0.5773, + "step": 24846 + }, + { + "epoch": 1.3913652144697055, + "grad_norm": 1.1698933839797974, + "learning_rate": 9.872763157894738e-05, + "loss": 0.3963, + "step": 24847 + }, + { + "epoch": 1.3914212117818345, + "grad_norm": 1.3462740182876587, + "learning_rate": 9.872736842105264e-05, + "loss": 0.4663, + "step": 24848 + }, + { + "epoch": 1.3914772090939636, + "grad_norm": 1.2720924615859985, + "learning_rate": 9.872710526315791e-05, + "loss": 0.456, + "step": 24849 + }, + { + "epoch": 1.3915332064060926, + "grad_norm": 1.2319821119308472, + "learning_rate": 9.872684210526316e-05, + "loss": 0.4582, + "step": 24850 + }, + { + "epoch": 1.3915892037182216, + "grad_norm": 1.3817527294158936, + "learning_rate": 9.872657894736843e-05, + "loss": 0.558, + "step": 24851 + }, + { + "epoch": 1.3916452010303506, + "grad_norm": 1.1048712730407715, + "learning_rate": 9.872631578947369e-05, + "loss": 0.405, + "step": 24852 + }, + { + "epoch": 1.3917011983424796, + 
"grad_norm": 1.3013241291046143, + "learning_rate": 9.872605263157895e-05, + "loss": 0.4592, + "step": 24853 + }, + { + "epoch": 1.3917571956546086, + "grad_norm": 1.7271647453308105, + "learning_rate": 9.872578947368422e-05, + "loss": 0.5207, + "step": 24854 + }, + { + "epoch": 1.3918131929667377, + "grad_norm": 1.4949439764022827, + "learning_rate": 9.872552631578947e-05, + "loss": 0.4057, + "step": 24855 + }, + { + "epoch": 1.3918691902788667, + "grad_norm": 1.1679201126098633, + "learning_rate": 9.872526315789474e-05, + "loss": 0.4945, + "step": 24856 + }, + { + "epoch": 1.3919251875909957, + "grad_norm": 1.4353044033050537, + "learning_rate": 9.8725e-05, + "loss": 0.4496, + "step": 24857 + }, + { + "epoch": 1.3919811849031247, + "grad_norm": 1.3244861364364624, + "learning_rate": 9.872473684210527e-05, + "loss": 0.5183, + "step": 24858 + }, + { + "epoch": 1.3920371822152537, + "grad_norm": 1.666304588317871, + "learning_rate": 9.872447368421053e-05, + "loss": 0.4756, + "step": 24859 + }, + { + "epoch": 1.3920931795273828, + "grad_norm": 1.4096368551254272, + "learning_rate": 9.87242105263158e-05, + "loss": 0.5145, + "step": 24860 + }, + { + "epoch": 1.3921491768395118, + "grad_norm": 1.434039831161499, + "learning_rate": 9.872394736842105e-05, + "loss": 0.3844, + "step": 24861 + }, + { + "epoch": 1.3922051741516408, + "grad_norm": 1.3839205503463745, + "learning_rate": 9.872368421052633e-05, + "loss": 0.4218, + "step": 24862 + }, + { + "epoch": 1.3922611714637698, + "grad_norm": 1.1076642274856567, + "learning_rate": 9.872342105263159e-05, + "loss": 0.3004, + "step": 24863 + }, + { + "epoch": 1.3923171687758988, + "grad_norm": 1.3232903480529785, + "learning_rate": 9.872315789473685e-05, + "loss": 0.4578, + "step": 24864 + }, + { + "epoch": 1.3923731660880279, + "grad_norm": 1.181853175163269, + "learning_rate": 9.87228947368421e-05, + "loss": 0.4971, + "step": 24865 + }, + { + "epoch": 1.3924291634001569, + "grad_norm": 1.3774648904800415, + "learning_rate": 
9.872263157894738e-05, + "loss": 0.4604, + "step": 24866 + }, + { + "epoch": 1.392485160712286, + "grad_norm": 1.4425255060195923, + "learning_rate": 9.872236842105264e-05, + "loss": 0.4997, + "step": 24867 + }, + { + "epoch": 1.392541158024415, + "grad_norm": 1.4591766595840454, + "learning_rate": 9.87221052631579e-05, + "loss": 0.5154, + "step": 24868 + }, + { + "epoch": 1.392597155336544, + "grad_norm": 1.1732569932937622, + "learning_rate": 9.872184210526316e-05, + "loss": 0.3848, + "step": 24869 + }, + { + "epoch": 1.392653152648673, + "grad_norm": 1.3114980459213257, + "learning_rate": 9.872157894736842e-05, + "loss": 0.4703, + "step": 24870 + }, + { + "epoch": 1.392709149960802, + "grad_norm": 1.6508270502090454, + "learning_rate": 9.872131578947369e-05, + "loss": 0.4463, + "step": 24871 + }, + { + "epoch": 1.392765147272931, + "grad_norm": 1.3888405561447144, + "learning_rate": 9.872105263157895e-05, + "loss": 0.6197, + "step": 24872 + }, + { + "epoch": 1.39282114458506, + "grad_norm": 1.3268065452575684, + "learning_rate": 9.872078947368421e-05, + "loss": 0.5795, + "step": 24873 + }, + { + "epoch": 1.392877141897189, + "grad_norm": 1.2224912643432617, + "learning_rate": 9.872052631578947e-05, + "loss": 0.3589, + "step": 24874 + }, + { + "epoch": 1.392933139209318, + "grad_norm": 1.2793575525283813, + "learning_rate": 9.872026315789474e-05, + "loss": 0.401, + "step": 24875 + }, + { + "epoch": 1.392989136521447, + "grad_norm": 1.3169201612472534, + "learning_rate": 9.872e-05, + "loss": 0.5786, + "step": 24876 + }, + { + "epoch": 1.393045133833576, + "grad_norm": 1.3333497047424316, + "learning_rate": 9.871973684210528e-05, + "loss": 0.5736, + "step": 24877 + }, + { + "epoch": 1.393101131145705, + "grad_norm": 1.2146955728530884, + "learning_rate": 9.871947368421052e-05, + "loss": 0.4339, + "step": 24878 + }, + { + "epoch": 1.3931571284578341, + "grad_norm": 1.5016287565231323, + "learning_rate": 9.87192105263158e-05, + "loss": 0.4464, + "step": 24879 + }, + 
{ + "epoch": 1.3932131257699631, + "grad_norm": 1.2557700872421265, + "learning_rate": 9.871894736842106e-05, + "loss": 0.437, + "step": 24880 + }, + { + "epoch": 1.3932691230820922, + "grad_norm": 1.499138355255127, + "learning_rate": 9.871868421052633e-05, + "loss": 0.585, + "step": 24881 + }, + { + "epoch": 1.3933251203942212, + "grad_norm": 1.5727580785751343, + "learning_rate": 9.871842105263159e-05, + "loss": 0.4809, + "step": 24882 + }, + { + "epoch": 1.3933811177063502, + "grad_norm": 2.0146894454956055, + "learning_rate": 9.871815789473685e-05, + "loss": 0.6236, + "step": 24883 + }, + { + "epoch": 1.3934371150184792, + "grad_norm": 1.3595463037490845, + "learning_rate": 9.871789473684211e-05, + "loss": 0.498, + "step": 24884 + }, + { + "epoch": 1.3934931123306082, + "grad_norm": 1.3299007415771484, + "learning_rate": 9.871763157894738e-05, + "loss": 0.3855, + "step": 24885 + }, + { + "epoch": 1.3935491096427373, + "grad_norm": 1.214621663093567, + "learning_rate": 9.871736842105264e-05, + "loss": 0.4616, + "step": 24886 + }, + { + "epoch": 1.3936051069548663, + "grad_norm": 1.3083232641220093, + "learning_rate": 9.871710526315789e-05, + "loss": 0.4835, + "step": 24887 + }, + { + "epoch": 1.3936611042669953, + "grad_norm": 1.3820178508758545, + "learning_rate": 9.871684210526316e-05, + "loss": 0.4902, + "step": 24888 + }, + { + "epoch": 1.3937171015791243, + "grad_norm": 1.3684327602386475, + "learning_rate": 9.871657894736842e-05, + "loss": 0.5788, + "step": 24889 + }, + { + "epoch": 1.3937730988912533, + "grad_norm": 1.7087435722351074, + "learning_rate": 9.87163157894737e-05, + "loss": 0.6771, + "step": 24890 + }, + { + "epoch": 1.3938290962033824, + "grad_norm": 1.7902201414108276, + "learning_rate": 9.871605263157895e-05, + "loss": 0.7825, + "step": 24891 + }, + { + "epoch": 1.3938850935155114, + "grad_norm": 1.5060359239578247, + "learning_rate": 9.871578947368421e-05, + "loss": 0.4653, + "step": 24892 + }, + { + "epoch": 1.3939410908276404, + 
"grad_norm": 1.374214768409729, + "learning_rate": 9.871552631578947e-05, + "loss": 0.4788, + "step": 24893 + }, + { + "epoch": 1.3939970881397694, + "grad_norm": 1.6973198652267456, + "learning_rate": 9.871526315789475e-05, + "loss": 0.5671, + "step": 24894 + }, + { + "epoch": 1.3940530854518984, + "grad_norm": 1.3596285581588745, + "learning_rate": 9.8715e-05, + "loss": 0.4583, + "step": 24895 + }, + { + "epoch": 1.3941090827640275, + "grad_norm": 1.5764806270599365, + "learning_rate": 9.871473684210527e-05, + "loss": 0.4715, + "step": 24896 + }, + { + "epoch": 1.3941650800761565, + "grad_norm": 2.0739612579345703, + "learning_rate": 9.871447368421053e-05, + "loss": 0.5395, + "step": 24897 + }, + { + "epoch": 1.3942210773882855, + "grad_norm": 1.426306128501892, + "learning_rate": 9.87142105263158e-05, + "loss": 0.4057, + "step": 24898 + }, + { + "epoch": 1.3942770747004145, + "grad_norm": 1.3731775283813477, + "learning_rate": 9.871394736842106e-05, + "loss": 0.4563, + "step": 24899 + }, + { + "epoch": 1.3943330720125435, + "grad_norm": 1.3683106899261475, + "learning_rate": 9.871368421052632e-05, + "loss": 0.4421, + "step": 24900 + }, + { + "epoch": 1.3943890693246725, + "grad_norm": 1.569212555885315, + "learning_rate": 9.871342105263158e-05, + "loss": 0.5724, + "step": 24901 + }, + { + "epoch": 1.3944450666368016, + "grad_norm": 1.333946704864502, + "learning_rate": 9.871315789473685e-05, + "loss": 0.4599, + "step": 24902 + }, + { + "epoch": 1.3945010639489306, + "grad_norm": 1.504699945449829, + "learning_rate": 9.871289473684211e-05, + "loss": 0.4934, + "step": 24903 + }, + { + "epoch": 1.3945570612610596, + "grad_norm": 1.2900937795639038, + "learning_rate": 9.871263157894737e-05, + "loss": 0.4915, + "step": 24904 + }, + { + "epoch": 1.3946130585731886, + "grad_norm": 1.4753167629241943, + "learning_rate": 9.871236842105263e-05, + "loss": 0.4803, + "step": 24905 + }, + { + "epoch": 1.3946690558853176, + "grad_norm": 2.1940629482269287, + "learning_rate": 
9.871210526315789e-05, + "loss": 0.6225, + "step": 24906 + }, + { + "epoch": 1.3947250531974464, + "grad_norm": 1.283613920211792, + "learning_rate": 9.871184210526316e-05, + "loss": 0.4688, + "step": 24907 + }, + { + "epoch": 1.3947810505095755, + "grad_norm": 1.2940969467163086, + "learning_rate": 9.871157894736842e-05, + "loss": 0.5717, + "step": 24908 + }, + { + "epoch": 1.3948370478217045, + "grad_norm": 1.5835634469985962, + "learning_rate": 9.87113157894737e-05, + "loss": 0.4983, + "step": 24909 + }, + { + "epoch": 1.3948930451338335, + "grad_norm": 1.1694972515106201, + "learning_rate": 9.871105263157894e-05, + "loss": 0.3836, + "step": 24910 + }, + { + "epoch": 1.3949490424459625, + "grad_norm": 1.377196192741394, + "learning_rate": 9.871078947368422e-05, + "loss": 0.4218, + "step": 24911 + }, + { + "epoch": 1.3950050397580915, + "grad_norm": 1.129834771156311, + "learning_rate": 9.871052631578948e-05, + "loss": 0.4985, + "step": 24912 + }, + { + "epoch": 1.3950610370702206, + "grad_norm": 1.351381540298462, + "learning_rate": 9.871026315789475e-05, + "loss": 0.5257, + "step": 24913 + }, + { + "epoch": 1.3951170343823496, + "grad_norm": 1.200209379196167, + "learning_rate": 9.871000000000001e-05, + "loss": 0.4166, + "step": 24914 + }, + { + "epoch": 1.3951730316944786, + "grad_norm": 1.3657712936401367, + "learning_rate": 9.870973684210527e-05, + "loss": 0.5301, + "step": 24915 + }, + { + "epoch": 1.3952290290066076, + "grad_norm": 1.5016694068908691, + "learning_rate": 9.870947368421053e-05, + "loss": 0.4532, + "step": 24916 + }, + { + "epoch": 1.3952850263187366, + "grad_norm": 1.2941175699234009, + "learning_rate": 9.87092105263158e-05, + "loss": 0.4956, + "step": 24917 + }, + { + "epoch": 1.3953410236308657, + "grad_norm": 1.1918156147003174, + "learning_rate": 9.870894736842106e-05, + "loss": 0.427, + "step": 24918 + }, + { + "epoch": 1.3953970209429947, + "grad_norm": 1.270900011062622, + "learning_rate": 9.870868421052632e-05, + "loss": 0.4563, + 
"step": 24919 + }, + { + "epoch": 1.3954530182551237, + "grad_norm": 1.9775011539459229, + "learning_rate": 9.870842105263158e-05, + "loss": 0.5685, + "step": 24920 + }, + { + "epoch": 1.3955090155672527, + "grad_norm": 1.4194166660308838, + "learning_rate": 9.870815789473685e-05, + "loss": 0.5493, + "step": 24921 + }, + { + "epoch": 1.3955650128793817, + "grad_norm": 1.7846382856369019, + "learning_rate": 9.870789473684211e-05, + "loss": 0.4364, + "step": 24922 + }, + { + "epoch": 1.3956210101915107, + "grad_norm": 1.2885242700576782, + "learning_rate": 9.870763157894737e-05, + "loss": 0.4624, + "step": 24923 + }, + { + "epoch": 1.3956770075036398, + "grad_norm": 1.304019570350647, + "learning_rate": 9.870736842105263e-05, + "loss": 0.3475, + "step": 24924 + }, + { + "epoch": 1.3957330048157688, + "grad_norm": 1.1312437057495117, + "learning_rate": 9.870710526315789e-05, + "loss": 0.4062, + "step": 24925 + }, + { + "epoch": 1.3957890021278978, + "grad_norm": 1.23923921585083, + "learning_rate": 9.870684210526317e-05, + "loss": 0.3765, + "step": 24926 + }, + { + "epoch": 1.3958449994400268, + "grad_norm": 1.3791269063949585, + "learning_rate": 9.870657894736843e-05, + "loss": 0.4871, + "step": 24927 + }, + { + "epoch": 1.3959009967521558, + "grad_norm": 1.3312296867370605, + "learning_rate": 9.870631578947369e-05, + "loss": 0.3912, + "step": 24928 + }, + { + "epoch": 1.3959569940642849, + "grad_norm": 1.7769653797149658, + "learning_rate": 9.870605263157895e-05, + "loss": 0.475, + "step": 24929 + }, + { + "epoch": 1.3960129913764139, + "grad_norm": 1.4600893259048462, + "learning_rate": 9.870578947368422e-05, + "loss": 0.5491, + "step": 24930 + }, + { + "epoch": 1.396068988688543, + "grad_norm": 1.404732346534729, + "learning_rate": 9.870552631578948e-05, + "loss": 0.4643, + "step": 24931 + }, + { + "epoch": 1.396124986000672, + "grad_norm": 1.6797943115234375, + "learning_rate": 9.870526315789475e-05, + "loss": 0.7382, + "step": 24932 + }, + { + "epoch": 
1.396180983312801, + "grad_norm": 1.4181331396102905, + "learning_rate": 9.8705e-05, + "loss": 0.5598, + "step": 24933 + }, + { + "epoch": 1.39623698062493, + "grad_norm": 1.5253980159759521, + "learning_rate": 9.870473684210527e-05, + "loss": 0.4718, + "step": 24934 + }, + { + "epoch": 1.396292977937059, + "grad_norm": 1.158437728881836, + "learning_rate": 9.870447368421053e-05, + "loss": 0.3587, + "step": 24935 + }, + { + "epoch": 1.396348975249188, + "grad_norm": 1.0369495153427124, + "learning_rate": 9.87042105263158e-05, + "loss": 0.2864, + "step": 24936 + }, + { + "epoch": 1.396404972561317, + "grad_norm": 1.5150394439697266, + "learning_rate": 9.870394736842105e-05, + "loss": 0.4394, + "step": 24937 + }, + { + "epoch": 1.396460969873446, + "grad_norm": 1.1184009313583374, + "learning_rate": 9.870368421052632e-05, + "loss": 0.3565, + "step": 24938 + }, + { + "epoch": 1.396516967185575, + "grad_norm": 1.5306912660598755, + "learning_rate": 9.870342105263158e-05, + "loss": 0.5793, + "step": 24939 + }, + { + "epoch": 1.396572964497704, + "grad_norm": 1.180704951286316, + "learning_rate": 9.870315789473684e-05, + "loss": 0.4901, + "step": 24940 + }, + { + "epoch": 1.396628961809833, + "grad_norm": 1.214342474937439, + "learning_rate": 9.870289473684212e-05, + "loss": 0.527, + "step": 24941 + }, + { + "epoch": 1.396684959121962, + "grad_norm": 1.1204359531402588, + "learning_rate": 9.870263157894736e-05, + "loss": 0.6086, + "step": 24942 + }, + { + "epoch": 1.3967409564340911, + "grad_norm": 1.6315031051635742, + "learning_rate": 9.870236842105264e-05, + "loss": 0.6003, + "step": 24943 + }, + { + "epoch": 1.3967969537462201, + "grad_norm": 1.6887123584747314, + "learning_rate": 9.87021052631579e-05, + "loss": 0.6169, + "step": 24944 + }, + { + "epoch": 1.3968529510583492, + "grad_norm": 1.3773730993270874, + "learning_rate": 9.870184210526317e-05, + "loss": 0.4426, + "step": 24945 + }, + { + "epoch": 1.3969089483704782, + "grad_norm": 1.2887552976608276, + 
"learning_rate": 9.870157894736843e-05, + "loss": 0.5258, + "step": 24946 + }, + { + "epoch": 1.3969649456826072, + "grad_norm": 1.3308465480804443, + "learning_rate": 9.870131578947369e-05, + "loss": 0.5726, + "step": 24947 + }, + { + "epoch": 1.3970209429947362, + "grad_norm": 1.4514648914337158, + "learning_rate": 9.870105263157895e-05, + "loss": 0.4428, + "step": 24948 + }, + { + "epoch": 1.3970769403068652, + "grad_norm": 1.5108258724212646, + "learning_rate": 9.870078947368422e-05, + "loss": 0.4245, + "step": 24949 + }, + { + "epoch": 1.3971329376189943, + "grad_norm": 1.440600872039795, + "learning_rate": 9.870052631578948e-05, + "loss": 0.4508, + "step": 24950 + }, + { + "epoch": 1.3971889349311233, + "grad_norm": 1.2561041116714478, + "learning_rate": 9.870026315789474e-05, + "loss": 0.4105, + "step": 24951 + }, + { + "epoch": 1.3972449322432523, + "grad_norm": 1.3195409774780273, + "learning_rate": 9.87e-05, + "loss": 0.5741, + "step": 24952 + }, + { + "epoch": 1.3973009295553813, + "grad_norm": 1.2621251344680786, + "learning_rate": 9.869973684210527e-05, + "loss": 0.42, + "step": 24953 + }, + { + "epoch": 1.3973569268675103, + "grad_norm": 1.4393939971923828, + "learning_rate": 9.869947368421053e-05, + "loss": 0.4588, + "step": 24954 + }, + { + "epoch": 1.3974129241796394, + "grad_norm": 1.3184726238250732, + "learning_rate": 9.869921052631579e-05, + "loss": 0.3996, + "step": 24955 + }, + { + "epoch": 1.3974689214917684, + "grad_norm": 1.3019930124282837, + "learning_rate": 9.869894736842105e-05, + "loss": 0.35, + "step": 24956 + }, + { + "epoch": 1.3975249188038974, + "grad_norm": 1.455165982246399, + "learning_rate": 9.869868421052631e-05, + "loss": 0.471, + "step": 24957 + }, + { + "epoch": 1.3975809161160264, + "grad_norm": 1.0577287673950195, + "learning_rate": 9.869842105263159e-05, + "loss": 0.3745, + "step": 24958 + }, + { + "epoch": 1.3976369134281554, + "grad_norm": 1.6369513273239136, + "learning_rate": 9.869815789473685e-05, + "loss": 
0.5191, + "step": 24959 + }, + { + "epoch": 1.3976929107402845, + "grad_norm": 1.1356538534164429, + "learning_rate": 9.86978947368421e-05, + "loss": 0.4098, + "step": 24960 + }, + { + "epoch": 1.3977489080524135, + "grad_norm": 1.5345463752746582, + "learning_rate": 9.869763157894736e-05, + "loss": 0.5791, + "step": 24961 + }, + { + "epoch": 1.3978049053645425, + "grad_norm": 1.3994766473770142, + "learning_rate": 9.869736842105264e-05, + "loss": 0.4134, + "step": 24962 + }, + { + "epoch": 1.3978609026766715, + "grad_norm": 1.4844574928283691, + "learning_rate": 9.86971052631579e-05, + "loss": 0.7353, + "step": 24963 + }, + { + "epoch": 1.3979168999888005, + "grad_norm": 1.3428765535354614, + "learning_rate": 9.869684210526317e-05, + "loss": 0.4492, + "step": 24964 + }, + { + "epoch": 1.3979728973009296, + "grad_norm": 1.1608526706695557, + "learning_rate": 9.869657894736842e-05, + "loss": 0.3797, + "step": 24965 + }, + { + "epoch": 1.3980288946130586, + "grad_norm": 1.1632158756256104, + "learning_rate": 9.869631578947369e-05, + "loss": 0.3693, + "step": 24966 + }, + { + "epoch": 1.3980848919251876, + "grad_norm": 1.0819597244262695, + "learning_rate": 9.869605263157895e-05, + "loss": 0.3702, + "step": 24967 + }, + { + "epoch": 1.3981408892373166, + "grad_norm": 1.1602689027786255, + "learning_rate": 9.869578947368422e-05, + "loss": 0.4737, + "step": 24968 + }, + { + "epoch": 1.3981968865494456, + "grad_norm": 1.334148645401001, + "learning_rate": 9.869552631578948e-05, + "loss": 0.4082, + "step": 24969 + }, + { + "epoch": 1.3982528838615746, + "grad_norm": 1.1840577125549316, + "learning_rate": 9.869526315789474e-05, + "loss": 0.4302, + "step": 24970 + }, + { + "epoch": 1.3983088811737037, + "grad_norm": 1.401780366897583, + "learning_rate": 9.8695e-05, + "loss": 0.4981, + "step": 24971 + }, + { + "epoch": 1.3983648784858327, + "grad_norm": 1.2232701778411865, + "learning_rate": 9.869473684210528e-05, + "loss": 0.3785, + "step": 24972 + }, + { + "epoch": 
1.3984208757979617, + "grad_norm": 1.3001359701156616, + "learning_rate": 9.869447368421054e-05, + "loss": 0.6799, + "step": 24973 + }, + { + "epoch": 1.3984768731100907, + "grad_norm": 1.3686200380325317, + "learning_rate": 9.86942105263158e-05, + "loss": 0.4898, + "step": 24974 + }, + { + "epoch": 1.3985328704222197, + "grad_norm": 1.2565200328826904, + "learning_rate": 9.869394736842106e-05, + "loss": 0.5439, + "step": 24975 + }, + { + "epoch": 1.3985888677343488, + "grad_norm": 1.2554477453231812, + "learning_rate": 9.869368421052632e-05, + "loss": 0.3778, + "step": 24976 + }, + { + "epoch": 1.3986448650464778, + "grad_norm": 1.2617875337600708, + "learning_rate": 9.869342105263159e-05, + "loss": 0.5034, + "step": 24977 + }, + { + "epoch": 1.3987008623586068, + "grad_norm": 1.0165064334869385, + "learning_rate": 9.869315789473685e-05, + "loss": 0.3976, + "step": 24978 + }, + { + "epoch": 1.3987568596707358, + "grad_norm": 1.1564257144927979, + "learning_rate": 9.869289473684211e-05, + "loss": 0.4893, + "step": 24979 + }, + { + "epoch": 1.3988128569828648, + "grad_norm": 1.5127946138381958, + "learning_rate": 9.869263157894737e-05, + "loss": 0.6844, + "step": 24980 + }, + { + "epoch": 1.3988688542949939, + "grad_norm": 1.4281575679779053, + "learning_rate": 9.869236842105264e-05, + "loss": 0.481, + "step": 24981 + }, + { + "epoch": 1.3989248516071229, + "grad_norm": 1.6895654201507568, + "learning_rate": 9.86921052631579e-05, + "loss": 0.5923, + "step": 24982 + }, + { + "epoch": 1.398980848919252, + "grad_norm": 1.4220174551010132, + "learning_rate": 9.869184210526316e-05, + "loss": 0.4944, + "step": 24983 + }, + { + "epoch": 1.399036846231381, + "grad_norm": 1.6604970693588257, + "learning_rate": 9.869157894736842e-05, + "loss": 0.5302, + "step": 24984 + }, + { + "epoch": 1.39909284354351, + "grad_norm": 1.1861413717269897, + "learning_rate": 9.86913157894737e-05, + "loss": 0.4556, + "step": 24985 + }, + { + "epoch": 1.399148840855639, + "grad_norm": 
1.10104238986969, + "learning_rate": 9.869105263157895e-05, + "loss": 0.3691, + "step": 24986 + }, + { + "epoch": 1.399204838167768, + "grad_norm": 1.4999996423721313, + "learning_rate": 9.869078947368423e-05, + "loss": 0.488, + "step": 24987 + }, + { + "epoch": 1.399260835479897, + "grad_norm": 1.4314191341400146, + "learning_rate": 9.869052631578947e-05, + "loss": 0.5801, + "step": 24988 + }, + { + "epoch": 1.399316832792026, + "grad_norm": 1.188765287399292, + "learning_rate": 9.869026315789475e-05, + "loss": 0.4722, + "step": 24989 + }, + { + "epoch": 1.399372830104155, + "grad_norm": 1.447745442390442, + "learning_rate": 9.869e-05, + "loss": 0.4674, + "step": 24990 + }, + { + "epoch": 1.399428827416284, + "grad_norm": 1.2270760536193848, + "learning_rate": 9.868973684210527e-05, + "loss": 0.4121, + "step": 24991 + }, + { + "epoch": 1.399484824728413, + "grad_norm": 1.1734915971755981, + "learning_rate": 9.868947368421052e-05, + "loss": 0.385, + "step": 24992 + }, + { + "epoch": 1.399540822040542, + "grad_norm": 1.1888699531555176, + "learning_rate": 9.868921052631578e-05, + "loss": 0.3492, + "step": 24993 + }, + { + "epoch": 1.399596819352671, + "grad_norm": 1.3761533498764038, + "learning_rate": 9.868894736842106e-05, + "loss": 0.5162, + "step": 24994 + }, + { + "epoch": 1.3996528166648001, + "grad_norm": 1.2712526321411133, + "learning_rate": 9.868868421052632e-05, + "loss": 0.5014, + "step": 24995 + }, + { + "epoch": 1.3997088139769291, + "grad_norm": 1.1590772867202759, + "learning_rate": 9.868842105263159e-05, + "loss": 0.3405, + "step": 24996 + }, + { + "epoch": 1.3997648112890582, + "grad_norm": 1.3925637006759644, + "learning_rate": 9.868815789473684e-05, + "loss": 0.4855, + "step": 24997 + }, + { + "epoch": 1.3998208086011872, + "grad_norm": 1.4906693696975708, + "learning_rate": 9.868789473684211e-05, + "loss": 0.4965, + "step": 24998 + }, + { + "epoch": 1.3998768059133162, + "grad_norm": 1.0911304950714111, + "learning_rate": 9.868763157894737e-05, 
+ "loss": 0.5819, + "step": 24999 + }, + { + "epoch": 1.3999328032254452, + "grad_norm": 1.2345770597457886, + "learning_rate": 9.868736842105264e-05, + "loss": 0.3439, + "step": 25000 + }, + { + "epoch": 1.3999888005375742, + "grad_norm": 1.4910013675689697, + "learning_rate": 9.86871052631579e-05, + "loss": 0.4444, + "step": 25001 + }, + { + "epoch": 1.4000447978497033, + "grad_norm": 1.181877613067627, + "learning_rate": 9.868684210526316e-05, + "loss": 0.4475, + "step": 25002 + }, + { + "epoch": 1.4001007951618323, + "grad_norm": 1.181572675704956, + "learning_rate": 9.868657894736842e-05, + "loss": 0.4454, + "step": 25003 + }, + { + "epoch": 1.4001567924739613, + "grad_norm": 1.3123735189437866, + "learning_rate": 9.86863157894737e-05, + "loss": 0.4238, + "step": 25004 + }, + { + "epoch": 1.4002127897860903, + "grad_norm": 1.2522852420806885, + "learning_rate": 9.868605263157896e-05, + "loss": 0.3972, + "step": 25005 + }, + { + "epoch": 1.4002687870982193, + "grad_norm": 1.5129053592681885, + "learning_rate": 9.868578947368422e-05, + "loss": 0.4779, + "step": 25006 + }, + { + "epoch": 1.4003247844103484, + "grad_norm": 1.5927486419677734, + "learning_rate": 9.868552631578948e-05, + "loss": 0.3409, + "step": 25007 + }, + { + "epoch": 1.4003807817224774, + "grad_norm": 1.570186734199524, + "learning_rate": 9.868526315789473e-05, + "loss": 0.4116, + "step": 25008 + }, + { + "epoch": 1.4004367790346064, + "grad_norm": 1.2253022193908691, + "learning_rate": 9.868500000000001e-05, + "loss": 0.4927, + "step": 25009 + }, + { + "epoch": 1.4004927763467354, + "grad_norm": 1.3859150409698486, + "learning_rate": 9.868473684210527e-05, + "loss": 0.4001, + "step": 25010 + }, + { + "epoch": 1.4005487736588644, + "grad_norm": 1.146719217300415, + "learning_rate": 9.868447368421053e-05, + "loss": 0.4135, + "step": 25011 + }, + { + "epoch": 1.4006047709709935, + "grad_norm": 1.5594269037246704, + "learning_rate": 9.868421052631579e-05, + "loss": 0.4072, + "step": 25012 + }, + { 
+ "epoch": 1.4006607682831225, + "grad_norm": 1.3745996952056885, + "learning_rate": 9.868394736842106e-05, + "loss": 0.4967, + "step": 25013 + }, + { + "epoch": 1.4007167655952515, + "grad_norm": 1.658186912536621, + "learning_rate": 9.868368421052632e-05, + "loss": 0.5801, + "step": 25014 + }, + { + "epoch": 1.4007727629073805, + "grad_norm": 1.9151132106781006, + "learning_rate": 9.868342105263158e-05, + "loss": 0.5393, + "step": 25015 + }, + { + "epoch": 1.4008287602195095, + "grad_norm": 1.4492130279541016, + "learning_rate": 9.868315789473684e-05, + "loss": 0.4651, + "step": 25016 + }, + { + "epoch": 1.4008847575316385, + "grad_norm": 1.218334436416626, + "learning_rate": 9.868289473684211e-05, + "loss": 0.4098, + "step": 25017 + }, + { + "epoch": 1.4009407548437676, + "grad_norm": 1.4384442567825317, + "learning_rate": 9.868263157894737e-05, + "loss": 0.558, + "step": 25018 + }, + { + "epoch": 1.4009967521558966, + "grad_norm": 1.6621073484420776, + "learning_rate": 9.868236842105265e-05, + "loss": 0.438, + "step": 25019 + }, + { + "epoch": 1.4010527494680256, + "grad_norm": 1.2724928855895996, + "learning_rate": 9.868210526315789e-05, + "loss": 0.416, + "step": 25020 + }, + { + "epoch": 1.4011087467801546, + "grad_norm": 1.4699560403823853, + "learning_rate": 9.868184210526317e-05, + "loss": 0.4839, + "step": 25021 + }, + { + "epoch": 1.4011647440922836, + "grad_norm": 1.3664757013320923, + "learning_rate": 9.868157894736843e-05, + "loss": 0.5102, + "step": 25022 + }, + { + "epoch": 1.4012207414044127, + "grad_norm": 1.3542282581329346, + "learning_rate": 9.86813157894737e-05, + "loss": 0.5138, + "step": 25023 + }, + { + "epoch": 1.4012767387165417, + "grad_norm": 1.2294542789459229, + "learning_rate": 9.868105263157896e-05, + "loss": 0.4108, + "step": 25024 + }, + { + "epoch": 1.4013327360286707, + "grad_norm": 1.1522819995880127, + "learning_rate": 9.86807894736842e-05, + "loss": 0.4243, + "step": 25025 + }, + { + "epoch": 1.4013887333407997, + 
"grad_norm": 1.4976519346237183, + "learning_rate": 9.868052631578948e-05, + "loss": 0.6372, + "step": 25026 + }, + { + "epoch": 1.4014447306529287, + "grad_norm": 1.2452642917633057, + "learning_rate": 9.868026315789474e-05, + "loss": 0.5069, + "step": 25027 + }, + { + "epoch": 1.4015007279650578, + "grad_norm": 1.0968621969223022, + "learning_rate": 9.868000000000001e-05, + "loss": 0.4141, + "step": 25028 + }, + { + "epoch": 1.4015567252771868, + "grad_norm": 1.0446137189865112, + "learning_rate": 9.867973684210527e-05, + "loss": 0.3676, + "step": 25029 + }, + { + "epoch": 1.4016127225893158, + "grad_norm": 1.1803224086761475, + "learning_rate": 9.867947368421053e-05, + "loss": 0.5365, + "step": 25030 + }, + { + "epoch": 1.4016687199014446, + "grad_norm": 1.3734524250030518, + "learning_rate": 9.867921052631579e-05, + "loss": 0.542, + "step": 25031 + }, + { + "epoch": 1.4017247172135736, + "grad_norm": 1.3984856605529785, + "learning_rate": 9.867894736842106e-05, + "loss": 0.4396, + "step": 25032 + }, + { + "epoch": 1.4017807145257026, + "grad_norm": 1.6063979864120483, + "learning_rate": 9.867868421052632e-05, + "loss": 0.5612, + "step": 25033 + }, + { + "epoch": 1.4018367118378317, + "grad_norm": 1.452295184135437, + "learning_rate": 9.867842105263158e-05, + "loss": 0.5969, + "step": 25034 + }, + { + "epoch": 1.4018927091499607, + "grad_norm": 1.2554432153701782, + "learning_rate": 9.867815789473684e-05, + "loss": 0.4216, + "step": 25035 + }, + { + "epoch": 1.4019487064620897, + "grad_norm": 1.2694385051727295, + "learning_rate": 9.867789473684212e-05, + "loss": 0.4662, + "step": 25036 + }, + { + "epoch": 1.4020047037742187, + "grad_norm": 1.1114654541015625, + "learning_rate": 9.867763157894738e-05, + "loss": 0.3756, + "step": 25037 + }, + { + "epoch": 1.4020607010863477, + "grad_norm": 3.559643268585205, + "learning_rate": 9.867736842105264e-05, + "loss": 0.4125, + "step": 25038 + }, + { + "epoch": 1.4021166983984767, + "grad_norm": 1.172831416130066, + 
"learning_rate": 9.86771052631579e-05, + "loss": 0.357, + "step": 25039 + }, + { + "epoch": 1.4021726957106058, + "grad_norm": 1.3581202030181885, + "learning_rate": 9.867684210526317e-05, + "loss": 0.5453, + "step": 25040 + }, + { + "epoch": 1.4022286930227348, + "grad_norm": 1.535194993019104, + "learning_rate": 9.867657894736843e-05, + "loss": 0.4958, + "step": 25041 + }, + { + "epoch": 1.4022846903348638, + "grad_norm": 1.432216763496399, + "learning_rate": 9.86763157894737e-05, + "loss": 0.4126, + "step": 25042 + }, + { + "epoch": 1.4023406876469928, + "grad_norm": 1.2078896760940552, + "learning_rate": 9.867605263157895e-05, + "loss": 0.4448, + "step": 25043 + }, + { + "epoch": 1.4023966849591218, + "grad_norm": 1.6733994483947754, + "learning_rate": 9.867578947368421e-05, + "loss": 0.506, + "step": 25044 + }, + { + "epoch": 1.4024526822712509, + "grad_norm": 1.4144052267074585, + "learning_rate": 9.867552631578948e-05, + "loss": 0.4617, + "step": 25045 + }, + { + "epoch": 1.4025086795833799, + "grad_norm": 1.3199208974838257, + "learning_rate": 9.867526315789474e-05, + "loss": 0.3546, + "step": 25046 + }, + { + "epoch": 1.402564676895509, + "grad_norm": 1.1983124017715454, + "learning_rate": 9.8675e-05, + "loss": 0.4099, + "step": 25047 + }, + { + "epoch": 1.402620674207638, + "grad_norm": 1.9280728101730347, + "learning_rate": 9.867473684210526e-05, + "loss": 0.4807, + "step": 25048 + }, + { + "epoch": 1.402676671519767, + "grad_norm": 1.4171066284179688, + "learning_rate": 9.867447368421053e-05, + "loss": 0.4783, + "step": 25049 + }, + { + "epoch": 1.402732668831896, + "grad_norm": 1.6370997428894043, + "learning_rate": 9.867421052631579e-05, + "loss": 0.4487, + "step": 25050 + }, + { + "epoch": 1.402788666144025, + "grad_norm": 1.3617645502090454, + "learning_rate": 9.867394736842107e-05, + "loss": 0.3976, + "step": 25051 + }, + { + "epoch": 1.402844663456154, + "grad_norm": 1.1175732612609863, + "learning_rate": 9.867368421052631e-05, + "loss": 0.3508, + 
"step": 25052 + }, + { + "epoch": 1.402900660768283, + "grad_norm": 5.507682800292969, + "learning_rate": 9.867342105263159e-05, + "loss": 0.3978, + "step": 25053 + }, + { + "epoch": 1.402956658080412, + "grad_norm": 1.6868797540664673, + "learning_rate": 9.867315789473684e-05, + "loss": 0.5949, + "step": 25054 + }, + { + "epoch": 1.403012655392541, + "grad_norm": 1.537165641784668, + "learning_rate": 9.867289473684212e-05, + "loss": 0.4219, + "step": 25055 + }, + { + "epoch": 1.40306865270467, + "grad_norm": 1.173524022102356, + "learning_rate": 9.867263157894738e-05, + "loss": 0.4213, + "step": 25056 + }, + { + "epoch": 1.403124650016799, + "grad_norm": 1.24052095413208, + "learning_rate": 9.867236842105264e-05, + "loss": 0.4199, + "step": 25057 + }, + { + "epoch": 1.403180647328928, + "grad_norm": 1.5992597341537476, + "learning_rate": 9.86721052631579e-05, + "loss": 0.5079, + "step": 25058 + }, + { + "epoch": 1.4032366446410571, + "grad_norm": 1.393551230430603, + "learning_rate": 9.867184210526317e-05, + "loss": 0.4988, + "step": 25059 + }, + { + "epoch": 1.4032926419531861, + "grad_norm": 1.587053894996643, + "learning_rate": 9.867157894736843e-05, + "loss": 0.5091, + "step": 25060 + }, + { + "epoch": 1.4033486392653152, + "grad_norm": 1.4986960887908936, + "learning_rate": 9.867131578947369e-05, + "loss": 0.4199, + "step": 25061 + }, + { + "epoch": 1.4034046365774442, + "grad_norm": 1.3810064792633057, + "learning_rate": 9.867105263157895e-05, + "loss": 0.3867, + "step": 25062 + }, + { + "epoch": 1.4034606338895732, + "grad_norm": 1.9216339588165283, + "learning_rate": 9.867078947368421e-05, + "loss": 0.4258, + "step": 25063 + }, + { + "epoch": 1.4035166312017022, + "grad_norm": 1.329543113708496, + "learning_rate": 9.867052631578948e-05, + "loss": 0.437, + "step": 25064 + }, + { + "epoch": 1.4035726285138312, + "grad_norm": 1.2471803426742554, + "learning_rate": 9.867026315789474e-05, + "loss": 0.4124, + "step": 25065 + }, + { + "epoch": 1.4036286258259603, 
+ "grad_norm": 1.6240177154541016, + "learning_rate": 9.867e-05, + "loss": 0.5176, + "step": 25066 + }, + { + "epoch": 1.4036846231380893, + "grad_norm": 1.3207478523254395, + "learning_rate": 9.866973684210526e-05, + "loss": 0.4693, + "step": 25067 + }, + { + "epoch": 1.4037406204502183, + "grad_norm": 1.6427310705184937, + "learning_rate": 9.866947368421054e-05, + "loss": 0.6914, + "step": 25068 + }, + { + "epoch": 1.4037966177623473, + "grad_norm": 1.5911667346954346, + "learning_rate": 9.86692105263158e-05, + "loss": 0.5381, + "step": 25069 + }, + { + "epoch": 1.4038526150744763, + "grad_norm": 8.670450210571289, + "learning_rate": 9.866894736842105e-05, + "loss": 0.5282, + "step": 25070 + }, + { + "epoch": 1.4039086123866054, + "grad_norm": 1.9461487531661987, + "learning_rate": 9.866868421052631e-05, + "loss": 0.4911, + "step": 25071 + }, + { + "epoch": 1.4039646096987344, + "grad_norm": 1.3057644367218018, + "learning_rate": 9.866842105263159e-05, + "loss": 0.5616, + "step": 25072 + }, + { + "epoch": 1.4040206070108634, + "grad_norm": 1.534982442855835, + "learning_rate": 9.866815789473685e-05, + "loss": 0.4504, + "step": 25073 + }, + { + "epoch": 1.4040766043229924, + "grad_norm": 1.2518433332443237, + "learning_rate": 9.866789473684212e-05, + "loss": 0.5359, + "step": 25074 + }, + { + "epoch": 1.4041326016351214, + "grad_norm": 6.048405170440674, + "learning_rate": 9.866763157894737e-05, + "loss": 0.4873, + "step": 25075 + }, + { + "epoch": 1.4041885989472505, + "grad_norm": 1.257227897644043, + "learning_rate": 9.866736842105264e-05, + "loss": 0.4599, + "step": 25076 + }, + { + "epoch": 1.4042445962593795, + "grad_norm": 1.189857006072998, + "learning_rate": 9.86671052631579e-05, + "loss": 0.4897, + "step": 25077 + }, + { + "epoch": 1.4043005935715085, + "grad_norm": 1.6713621616363525, + "learning_rate": 9.866684210526316e-05, + "loss": 0.5393, + "step": 25078 + }, + { + "epoch": 1.4043565908836375, + "grad_norm": 1.223427176475525, + "learning_rate": 
9.866657894736843e-05, + "loss": 0.4036, + "step": 25079 + }, + { + "epoch": 1.4044125881957665, + "grad_norm": 1.4842644929885864, + "learning_rate": 9.866631578947368e-05, + "loss": 0.4918, + "step": 25080 + }, + { + "epoch": 1.4044685855078956, + "grad_norm": 1.2758557796478271, + "learning_rate": 9.866605263157895e-05, + "loss": 0.417, + "step": 25081 + }, + { + "epoch": 1.4045245828200246, + "grad_norm": 1.3707480430603027, + "learning_rate": 9.866578947368421e-05, + "loss": 0.4824, + "step": 25082 + }, + { + "epoch": 1.4045805801321536, + "grad_norm": 1.6210976839065552, + "learning_rate": 9.866552631578949e-05, + "loss": 0.4343, + "step": 25083 + }, + { + "epoch": 1.4046365774442826, + "grad_norm": 1.2979321479797363, + "learning_rate": 9.866526315789475e-05, + "loss": 0.3599, + "step": 25084 + }, + { + "epoch": 1.4046925747564116, + "grad_norm": 1.4203746318817139, + "learning_rate": 9.8665e-05, + "loss": 0.4211, + "step": 25085 + }, + { + "epoch": 1.4047485720685406, + "grad_norm": 1.2466440200805664, + "learning_rate": 9.866473684210526e-05, + "loss": 0.3942, + "step": 25086 + }, + { + "epoch": 1.4048045693806697, + "grad_norm": 1.479690432548523, + "learning_rate": 9.866447368421054e-05, + "loss": 0.4823, + "step": 25087 + }, + { + "epoch": 1.4048605666927987, + "grad_norm": 1.480630874633789, + "learning_rate": 9.86642105263158e-05, + "loss": 0.4906, + "step": 25088 + }, + { + "epoch": 1.4049165640049277, + "grad_norm": 1.1486139297485352, + "learning_rate": 9.866394736842106e-05, + "loss": 0.4376, + "step": 25089 + }, + { + "epoch": 1.4049725613170567, + "grad_norm": 1.2108925580978394, + "learning_rate": 9.866368421052632e-05, + "loss": 0.4693, + "step": 25090 + }, + { + "epoch": 1.4050285586291857, + "grad_norm": 1.401533603668213, + "learning_rate": 9.866342105263159e-05, + "loss": 0.4328, + "step": 25091 + }, + { + "epoch": 1.4050845559413148, + "grad_norm": 1.4225749969482422, + "learning_rate": 9.866315789473685e-05, + "loss": 0.5919, + "step": 
25092 + }, + { + "epoch": 1.4051405532534438, + "grad_norm": 1.5397193431854248, + "learning_rate": 9.866289473684211e-05, + "loss": 0.5583, + "step": 25093 + }, + { + "epoch": 1.4051965505655728, + "grad_norm": 1.6258461475372314, + "learning_rate": 9.866263157894737e-05, + "loss": 0.5521, + "step": 25094 + }, + { + "epoch": 1.4052525478777018, + "grad_norm": 1.1896156072616577, + "learning_rate": 9.866236842105263e-05, + "loss": 0.3568, + "step": 25095 + }, + { + "epoch": 1.4053085451898308, + "grad_norm": 1.330722689628601, + "learning_rate": 9.86621052631579e-05, + "loss": 0.418, + "step": 25096 + }, + { + "epoch": 1.4053645425019599, + "grad_norm": 1.491829514503479, + "learning_rate": 9.866184210526316e-05, + "loss": 0.4913, + "step": 25097 + }, + { + "epoch": 1.4054205398140889, + "grad_norm": 1.9061870574951172, + "learning_rate": 9.866157894736842e-05, + "loss": 0.6653, + "step": 25098 + }, + { + "epoch": 1.405476537126218, + "grad_norm": 1.0821285247802734, + "learning_rate": 9.866131578947368e-05, + "loss": 0.3845, + "step": 25099 + }, + { + "epoch": 1.405532534438347, + "grad_norm": 1.4515200853347778, + "learning_rate": 9.866105263157895e-05, + "loss": 0.4841, + "step": 25100 + }, + { + "epoch": 1.405588531750476, + "grad_norm": 1.3448429107666016, + "learning_rate": 9.866078947368421e-05, + "loss": 0.3733, + "step": 25101 + }, + { + "epoch": 1.405644529062605, + "grad_norm": 1.5497682094573975, + "learning_rate": 9.866052631578947e-05, + "loss": 0.7214, + "step": 25102 + }, + { + "epoch": 1.405700526374734, + "grad_norm": 1.6154425144195557, + "learning_rate": 9.866026315789473e-05, + "loss": 0.4119, + "step": 25103 + }, + { + "epoch": 1.405756523686863, + "grad_norm": 1.3904379606246948, + "learning_rate": 9.866000000000001e-05, + "loss": 0.3991, + "step": 25104 + }, + { + "epoch": 1.405812520998992, + "grad_norm": 1.2952815294265747, + "learning_rate": 9.865973684210527e-05, + "loss": 0.4578, + "step": 25105 + }, + { + "epoch": 1.405868518311121, + 
"grad_norm": 1.5114688873291016, + "learning_rate": 9.865947368421054e-05, + "loss": 0.4603, + "step": 25106 + }, + { + "epoch": 1.40592451562325, + "grad_norm": 1.2152762413024902, + "learning_rate": 9.865921052631579e-05, + "loss": 0.505, + "step": 25107 + }, + { + "epoch": 1.405980512935379, + "grad_norm": 1.5251792669296265, + "learning_rate": 9.865894736842106e-05, + "loss": 0.4445, + "step": 25108 + }, + { + "epoch": 1.406036510247508, + "grad_norm": 1.2245151996612549, + "learning_rate": 9.865868421052632e-05, + "loss": 0.4804, + "step": 25109 + }, + { + "epoch": 1.406092507559637, + "grad_norm": 1.192751169204712, + "learning_rate": 9.865842105263159e-05, + "loss": 0.4295, + "step": 25110 + }, + { + "epoch": 1.4061485048717661, + "grad_norm": 1.083000659942627, + "learning_rate": 9.865815789473685e-05, + "loss": 0.4174, + "step": 25111 + }, + { + "epoch": 1.4062045021838951, + "grad_norm": 1.5058971643447876, + "learning_rate": 9.86578947368421e-05, + "loss": 0.4594, + "step": 25112 + }, + { + "epoch": 1.4062604994960242, + "grad_norm": 1.3602256774902344, + "learning_rate": 9.865763157894737e-05, + "loss": 0.5681, + "step": 25113 + }, + { + "epoch": 1.4063164968081532, + "grad_norm": 1.3668222427368164, + "learning_rate": 9.865736842105263e-05, + "loss": 0.4684, + "step": 25114 + }, + { + "epoch": 1.4063724941202822, + "grad_norm": 1.3513565063476562, + "learning_rate": 9.86571052631579e-05, + "loss": 0.4067, + "step": 25115 + }, + { + "epoch": 1.4064284914324112, + "grad_norm": 1.42878258228302, + "learning_rate": 9.865684210526316e-05, + "loss": 0.5119, + "step": 25116 + }, + { + "epoch": 1.4064844887445402, + "grad_norm": 1.2328746318817139, + "learning_rate": 9.865657894736842e-05, + "loss": 0.49, + "step": 25117 + }, + { + "epoch": 1.4065404860566693, + "grad_norm": 1.290832281112671, + "learning_rate": 9.865631578947368e-05, + "loss": 0.4261, + "step": 25118 + }, + { + "epoch": 1.4065964833687983, + "grad_norm": 1.6286808252334595, + "learning_rate": 
9.865605263157896e-05, + "loss": 0.548, + "step": 25119 + }, + { + "epoch": 1.4066524806809273, + "grad_norm": 1.1603643894195557, + "learning_rate": 9.865578947368422e-05, + "loss": 0.4343, + "step": 25120 + }, + { + "epoch": 1.4067084779930563, + "grad_norm": 1.461963415145874, + "learning_rate": 9.865552631578948e-05, + "loss": 0.5455, + "step": 25121 + }, + { + "epoch": 1.4067644753051853, + "grad_norm": 1.9852609634399414, + "learning_rate": 9.865526315789474e-05, + "loss": 0.4646, + "step": 25122 + }, + { + "epoch": 1.4068204726173144, + "grad_norm": 1.3366806507110596, + "learning_rate": 9.865500000000001e-05, + "loss": 0.4645, + "step": 25123 + }, + { + "epoch": 1.4068764699294434, + "grad_norm": 1.3001078367233276, + "learning_rate": 9.865473684210527e-05, + "loss": 0.466, + "step": 25124 + }, + { + "epoch": 1.4069324672415724, + "grad_norm": 1.2729926109313965, + "learning_rate": 9.865447368421053e-05, + "loss": 0.4699, + "step": 25125 + }, + { + "epoch": 1.4069884645537014, + "grad_norm": 1.61940336227417, + "learning_rate": 9.865421052631579e-05, + "loss": 0.6571, + "step": 25126 + }, + { + "epoch": 1.4070444618658304, + "grad_norm": 1.4772520065307617, + "learning_rate": 9.865394736842106e-05, + "loss": 0.5843, + "step": 25127 + }, + { + "epoch": 1.4071004591779595, + "grad_norm": 1.5451966524124146, + "learning_rate": 9.865368421052632e-05, + "loss": 0.5362, + "step": 25128 + }, + { + "epoch": 1.4071564564900885, + "grad_norm": 1.266040325164795, + "learning_rate": 9.865342105263158e-05, + "loss": 0.4418, + "step": 25129 + }, + { + "epoch": 1.4072124538022175, + "grad_norm": 1.3463425636291504, + "learning_rate": 9.865315789473684e-05, + "loss": 0.4314, + "step": 25130 + }, + { + "epoch": 1.4072684511143465, + "grad_norm": 1.5727450847625732, + "learning_rate": 9.86528947368421e-05, + "loss": 0.5435, + "step": 25131 + }, + { + "epoch": 1.4073244484264755, + "grad_norm": 1.3411790132522583, + "learning_rate": 9.865263157894737e-05, + "loss": 0.4142, + 
"step": 25132 + }, + { + "epoch": 1.4073804457386045, + "grad_norm": 1.4852484464645386, + "learning_rate": 9.865236842105263e-05, + "loss": 0.4494, + "step": 25133 + }, + { + "epoch": 1.4074364430507336, + "grad_norm": 1.394225001335144, + "learning_rate": 9.865210526315791e-05, + "loss": 0.5541, + "step": 25134 + }, + { + "epoch": 1.4074924403628626, + "grad_norm": 1.3585059642791748, + "learning_rate": 9.865184210526315e-05, + "loss": 0.3974, + "step": 25135 + }, + { + "epoch": 1.4075484376749916, + "grad_norm": 1.5817770957946777, + "learning_rate": 9.865157894736843e-05, + "loss": 0.5899, + "step": 25136 + }, + { + "epoch": 1.4076044349871206, + "grad_norm": 1.5890610218048096, + "learning_rate": 9.865131578947369e-05, + "loss": 0.5236, + "step": 25137 + }, + { + "epoch": 1.4076604322992496, + "grad_norm": 1.1551625728607178, + "learning_rate": 9.865105263157896e-05, + "loss": 0.4185, + "step": 25138 + }, + { + "epoch": 1.4077164296113787, + "grad_norm": 1.1804637908935547, + "learning_rate": 9.86507894736842e-05, + "loss": 0.4683, + "step": 25139 + }, + { + "epoch": 1.4077724269235077, + "grad_norm": 1.4861499071121216, + "learning_rate": 9.865052631578948e-05, + "loss": 0.4231, + "step": 25140 + }, + { + "epoch": 1.4078284242356367, + "grad_norm": 1.6176575422286987, + "learning_rate": 9.865026315789474e-05, + "loss": 0.4366, + "step": 25141 + }, + { + "epoch": 1.4078844215477657, + "grad_norm": 1.3628233671188354, + "learning_rate": 9.865000000000001e-05, + "loss": 0.453, + "step": 25142 + }, + { + "epoch": 1.4079404188598947, + "grad_norm": 1.26882004737854, + "learning_rate": 9.864973684210527e-05, + "loss": 0.4369, + "step": 25143 + }, + { + "epoch": 1.4079964161720238, + "grad_norm": 1.4862089157104492, + "learning_rate": 9.864947368421053e-05, + "loss": 0.3508, + "step": 25144 + }, + { + "epoch": 1.4080524134841528, + "grad_norm": 1.1362391710281372, + "learning_rate": 9.864921052631579e-05, + "loss": 0.3829, + "step": 25145 + }, + { + "epoch": 
1.4081084107962818, + "grad_norm": 1.2971348762512207, + "learning_rate": 9.864894736842105e-05, + "loss": 0.4346, + "step": 25146 + }, + { + "epoch": 1.4081644081084108, + "grad_norm": 1.207412600517273, + "learning_rate": 9.864868421052632e-05, + "loss": 0.4799, + "step": 25147 + }, + { + "epoch": 1.4082204054205398, + "grad_norm": 1.363423466682434, + "learning_rate": 9.864842105263158e-05, + "loss": 0.4393, + "step": 25148 + }, + { + "epoch": 1.4082764027326689, + "grad_norm": 1.479589581489563, + "learning_rate": 9.864815789473684e-05, + "loss": 0.3937, + "step": 25149 + }, + { + "epoch": 1.4083324000447979, + "grad_norm": 1.668657660484314, + "learning_rate": 9.86478947368421e-05, + "loss": 0.5922, + "step": 25150 + }, + { + "epoch": 1.408388397356927, + "grad_norm": 1.1311315298080444, + "learning_rate": 9.864763157894738e-05, + "loss": 0.392, + "step": 25151 + }, + { + "epoch": 1.408444394669056, + "grad_norm": 2.721068859100342, + "learning_rate": 9.864736842105264e-05, + "loss": 0.6064, + "step": 25152 + }, + { + "epoch": 1.408500391981185, + "grad_norm": 1.382392168045044, + "learning_rate": 9.86471052631579e-05, + "loss": 0.4802, + "step": 25153 + }, + { + "epoch": 1.408556389293314, + "grad_norm": 1.329283356666565, + "learning_rate": 9.864684210526316e-05, + "loss": 0.3837, + "step": 25154 + }, + { + "epoch": 1.408612386605443, + "grad_norm": 1.3018931150436401, + "learning_rate": 9.864657894736843e-05, + "loss": 0.3788, + "step": 25155 + }, + { + "epoch": 1.408668383917572, + "grad_norm": 1.6913831233978271, + "learning_rate": 9.864631578947369e-05, + "loss": 0.4156, + "step": 25156 + }, + { + "epoch": 1.408724381229701, + "grad_norm": 1.326263427734375, + "learning_rate": 9.864605263157895e-05, + "loss": 0.4448, + "step": 25157 + }, + { + "epoch": 1.40878037854183, + "grad_norm": 1.895554780960083, + "learning_rate": 9.864578947368421e-05, + "loss": 0.4293, + "step": 25158 + }, + { + "epoch": 1.408836375853959, + "grad_norm": 1.3707953691482544, + 
"learning_rate": 9.864552631578948e-05, + "loss": 0.4818, + "step": 25159 + }, + { + "epoch": 1.408892373166088, + "grad_norm": 1.4448717832565308, + "learning_rate": 9.864526315789474e-05, + "loss": 0.55, + "step": 25160 + }, + { + "epoch": 1.408948370478217, + "grad_norm": 1.3688149452209473, + "learning_rate": 9.864500000000002e-05, + "loss": 0.6002, + "step": 25161 + }, + { + "epoch": 1.409004367790346, + "grad_norm": 1.3279937505722046, + "learning_rate": 9.864473684210526e-05, + "loss": 0.5392, + "step": 25162 + }, + { + "epoch": 1.4090603651024751, + "grad_norm": 1.3392213582992554, + "learning_rate": 9.864447368421053e-05, + "loss": 0.3843, + "step": 25163 + }, + { + "epoch": 1.4091163624146041, + "grad_norm": 1.4783343076705933, + "learning_rate": 9.86442105263158e-05, + "loss": 0.4332, + "step": 25164 + }, + { + "epoch": 1.4091723597267332, + "grad_norm": 1.4156527519226074, + "learning_rate": 9.864394736842105e-05, + "loss": 0.421, + "step": 25165 + }, + { + "epoch": 1.4092283570388622, + "grad_norm": 1.6359426975250244, + "learning_rate": 9.864368421052633e-05, + "loss": 0.5679, + "step": 25166 + }, + { + "epoch": 1.4092843543509912, + "grad_norm": 1.2974162101745605, + "learning_rate": 9.864342105263157e-05, + "loss": 0.3891, + "step": 25167 + }, + { + "epoch": 1.4093403516631202, + "grad_norm": 1.4267315864562988, + "learning_rate": 9.864315789473685e-05, + "loss": 0.4329, + "step": 25168 + }, + { + "epoch": 1.4093963489752492, + "grad_norm": 1.202324390411377, + "learning_rate": 9.86428947368421e-05, + "loss": 0.3673, + "step": 25169 + }, + { + "epoch": 1.4094523462873783, + "grad_norm": 1.3575880527496338, + "learning_rate": 9.864263157894738e-05, + "loss": 0.4658, + "step": 25170 + }, + { + "epoch": 1.4095083435995073, + "grad_norm": 1.1915929317474365, + "learning_rate": 9.864236842105264e-05, + "loss": 0.346, + "step": 25171 + }, + { + "epoch": 1.4095643409116363, + "grad_norm": 1.1185379028320312, + "learning_rate": 9.86421052631579e-05, + 
"loss": 0.3682, + "step": 25172 + }, + { + "epoch": 1.4096203382237653, + "grad_norm": 3.813021183013916, + "learning_rate": 9.864184210526316e-05, + "loss": 0.6769, + "step": 25173 + }, + { + "epoch": 1.4096763355358943, + "grad_norm": 1.7576009035110474, + "learning_rate": 9.864157894736843e-05, + "loss": 0.5629, + "step": 25174 + }, + { + "epoch": 1.4097323328480234, + "grad_norm": 1.261063575744629, + "learning_rate": 9.864131578947369e-05, + "loss": 0.4364, + "step": 25175 + }, + { + "epoch": 1.4097883301601524, + "grad_norm": 1.2424596548080444, + "learning_rate": 9.864105263157895e-05, + "loss": 0.4822, + "step": 25176 + }, + { + "epoch": 1.4098443274722814, + "grad_norm": 1.6005308628082275, + "learning_rate": 9.864078947368421e-05, + "loss": 0.4941, + "step": 25177 + }, + { + "epoch": 1.4099003247844104, + "grad_norm": 1.1510900259017944, + "learning_rate": 9.864052631578948e-05, + "loss": 0.477, + "step": 25178 + }, + { + "epoch": 1.4099563220965394, + "grad_norm": 1.8903448581695557, + "learning_rate": 9.864026315789474e-05, + "loss": 0.4593, + "step": 25179 + }, + { + "epoch": 1.4100123194086684, + "grad_norm": 2.102847099304199, + "learning_rate": 9.864e-05, + "loss": 0.4552, + "step": 25180 + }, + { + "epoch": 1.4100683167207975, + "grad_norm": 1.7312819957733154, + "learning_rate": 9.863973684210526e-05, + "loss": 0.4896, + "step": 25181 + }, + { + "epoch": 1.4101243140329265, + "grad_norm": 1.3706915378570557, + "learning_rate": 9.863947368421052e-05, + "loss": 0.3614, + "step": 25182 + }, + { + "epoch": 1.4101803113450555, + "grad_norm": 1.6786187887191772, + "learning_rate": 9.86392105263158e-05, + "loss": 0.5758, + "step": 25183 + }, + { + "epoch": 1.4102363086571845, + "grad_norm": 1.2910107374191284, + "learning_rate": 9.863894736842106e-05, + "loss": 0.4112, + "step": 25184 + }, + { + "epoch": 1.4102923059693135, + "grad_norm": 1.2806596755981445, + "learning_rate": 9.863868421052632e-05, + "loss": 0.4152, + "step": 25185 + }, + { + "epoch": 
1.4103483032814426, + "grad_norm": 1.6166380643844604, + "learning_rate": 9.863842105263158e-05, + "loss": 0.4378, + "step": 25186 + }, + { + "epoch": 1.4104043005935716, + "grad_norm": 1.2381101846694946, + "learning_rate": 9.863815789473685e-05, + "loss": 0.3784, + "step": 25187 + }, + { + "epoch": 1.4104602979057006, + "grad_norm": 1.633828043937683, + "learning_rate": 9.863789473684211e-05, + "loss": 0.4597, + "step": 25188 + }, + { + "epoch": 1.4105162952178296, + "grad_norm": 1.0757710933685303, + "learning_rate": 9.863763157894738e-05, + "loss": 0.4191, + "step": 25189 + }, + { + "epoch": 1.4105722925299586, + "grad_norm": 1.5068286657333374, + "learning_rate": 9.863736842105263e-05, + "loss": 0.452, + "step": 25190 + }, + { + "epoch": 1.4106282898420877, + "grad_norm": 1.1757184267044067, + "learning_rate": 9.86371052631579e-05, + "loss": 0.3778, + "step": 25191 + }, + { + "epoch": 1.4106842871542167, + "grad_norm": 1.402175784111023, + "learning_rate": 9.863684210526316e-05, + "loss": 0.7289, + "step": 25192 + }, + { + "epoch": 1.4107402844663457, + "grad_norm": 1.231673240661621, + "learning_rate": 9.863657894736843e-05, + "loss": 0.4304, + "step": 25193 + }, + { + "epoch": 1.4107962817784747, + "grad_norm": 1.4890073537826538, + "learning_rate": 9.863631578947368e-05, + "loss": 0.4979, + "step": 25194 + }, + { + "epoch": 1.4108522790906037, + "grad_norm": 1.2157163619995117, + "learning_rate": 9.863605263157895e-05, + "loss": 0.4838, + "step": 25195 + }, + { + "epoch": 1.4109082764027328, + "grad_norm": 1.2294938564300537, + "learning_rate": 9.863578947368421e-05, + "loss": 0.5075, + "step": 25196 + }, + { + "epoch": 1.4109642737148618, + "grad_norm": 2.9420714378356934, + "learning_rate": 9.863552631578949e-05, + "loss": 0.4397, + "step": 25197 + }, + { + "epoch": 1.4110202710269908, + "grad_norm": 1.3563071489334106, + "learning_rate": 9.863526315789475e-05, + "loss": 0.5007, + "step": 25198 + }, + { + "epoch": 1.4110762683391198, + "grad_norm": 
1.259868860244751, + "learning_rate": 9.8635e-05, + "loss": 0.6522, + "step": 25199 + }, + { + "epoch": 1.4111322656512488, + "grad_norm": 1.3337322473526, + "learning_rate": 9.863473684210527e-05, + "loss": 0.4302, + "step": 25200 + }, + { + "epoch": 1.4111882629633778, + "grad_norm": 1.2552626132965088, + "learning_rate": 9.863447368421053e-05, + "loss": 0.4105, + "step": 25201 + }, + { + "epoch": 1.4112442602755069, + "grad_norm": 1.2617647647857666, + "learning_rate": 9.86342105263158e-05, + "loss": 0.3968, + "step": 25202 + }, + { + "epoch": 1.4113002575876359, + "grad_norm": 1.4309154748916626, + "learning_rate": 9.863394736842106e-05, + "loss": 0.4875, + "step": 25203 + }, + { + "epoch": 1.411356254899765, + "grad_norm": 1.6625126600265503, + "learning_rate": 9.863368421052632e-05, + "loss": 0.6625, + "step": 25204 + }, + { + "epoch": 1.411412252211894, + "grad_norm": 1.1364010572433472, + "learning_rate": 9.863342105263158e-05, + "loss": 0.3535, + "step": 25205 + }, + { + "epoch": 1.411468249524023, + "grad_norm": 1.4304853677749634, + "learning_rate": 9.863315789473685e-05, + "loss": 0.5349, + "step": 25206 + }, + { + "epoch": 1.411524246836152, + "grad_norm": 1.3051401376724243, + "learning_rate": 9.863289473684211e-05, + "loss": 0.4308, + "step": 25207 + }, + { + "epoch": 1.411580244148281, + "grad_norm": 1.8065649271011353, + "learning_rate": 9.863263157894737e-05, + "loss": 0.567, + "step": 25208 + }, + { + "epoch": 1.41163624146041, + "grad_norm": 1.2617363929748535, + "learning_rate": 9.863236842105263e-05, + "loss": 0.5085, + "step": 25209 + }, + { + "epoch": 1.411692238772539, + "grad_norm": 1.63270103931427, + "learning_rate": 9.86321052631579e-05, + "loss": 0.5207, + "step": 25210 + }, + { + "epoch": 1.411748236084668, + "grad_norm": 1.464239478111267, + "learning_rate": 9.863184210526316e-05, + "loss": 0.4098, + "step": 25211 + }, + { + "epoch": 1.411804233396797, + "grad_norm": 1.193290114402771, + "learning_rate": 9.863157894736842e-05, + 
"loss": 0.3687, + "step": 25212 + }, + { + "epoch": 1.411860230708926, + "grad_norm": 1.1258714199066162, + "learning_rate": 9.863131578947368e-05, + "loss": 0.4189, + "step": 25213 + }, + { + "epoch": 1.411916228021055, + "grad_norm": 1.2496358156204224, + "learning_rate": 9.863105263157896e-05, + "loss": 0.5213, + "step": 25214 + }, + { + "epoch": 1.4119722253331841, + "grad_norm": 1.1736644506454468, + "learning_rate": 9.863078947368422e-05, + "loss": 0.5512, + "step": 25215 + }, + { + "epoch": 1.4120282226453131, + "grad_norm": 1.31013023853302, + "learning_rate": 9.863052631578948e-05, + "loss": 0.4178, + "step": 25216 + }, + { + "epoch": 1.4120842199574422, + "grad_norm": 1.6401429176330566, + "learning_rate": 9.863026315789474e-05, + "loss": 0.4042, + "step": 25217 + }, + { + "epoch": 1.4121402172695712, + "grad_norm": 1.447288990020752, + "learning_rate": 9.863e-05, + "loss": 0.3885, + "step": 25218 + }, + { + "epoch": 1.4121962145817002, + "grad_norm": 1.3267236948013306, + "learning_rate": 9.862973684210527e-05, + "loss": 0.6762, + "step": 25219 + }, + { + "epoch": 1.4122522118938292, + "grad_norm": 1.245842456817627, + "learning_rate": 9.862947368421053e-05, + "loss": 0.6838, + "step": 25220 + }, + { + "epoch": 1.4123082092059582, + "grad_norm": 1.2799817323684692, + "learning_rate": 9.86292105263158e-05, + "loss": 0.3342, + "step": 25221 + }, + { + "epoch": 1.4123642065180873, + "grad_norm": 1.138277292251587, + "learning_rate": 9.862894736842105e-05, + "loss": 0.4473, + "step": 25222 + }, + { + "epoch": 1.4124202038302163, + "grad_norm": 1.3019230365753174, + "learning_rate": 9.862868421052632e-05, + "loss": 0.4225, + "step": 25223 + }, + { + "epoch": 1.4124762011423453, + "grad_norm": 1.2484922409057617, + "learning_rate": 9.862842105263158e-05, + "loss": 0.5595, + "step": 25224 + }, + { + "epoch": 1.4125321984544743, + "grad_norm": 1.2911787033081055, + "learning_rate": 9.862815789473685e-05, + "loss": 0.5246, + "step": 25225 + }, + { + "epoch": 
1.4125881957666033, + "grad_norm": 1.1501758098602295, + "learning_rate": 9.862789473684211e-05, + "loss": 0.3478, + "step": 25226 + }, + { + "epoch": 1.4126441930787323, + "grad_norm": 1.0365339517593384, + "learning_rate": 9.862763157894737e-05, + "loss": 0.3588, + "step": 25227 + }, + { + "epoch": 1.4127001903908614, + "grad_norm": 1.8831053972244263, + "learning_rate": 9.862736842105263e-05, + "loss": 0.4659, + "step": 25228 + }, + { + "epoch": 1.4127561877029904, + "grad_norm": 1.3875123262405396, + "learning_rate": 9.862710526315791e-05, + "loss": 0.4413, + "step": 25229 + }, + { + "epoch": 1.4128121850151194, + "grad_norm": 1.4171642065048218, + "learning_rate": 9.862684210526317e-05, + "loss": 0.5081, + "step": 25230 + }, + { + "epoch": 1.4128681823272484, + "grad_norm": 1.3800628185272217, + "learning_rate": 9.862657894736843e-05, + "loss": 0.3952, + "step": 25231 + }, + { + "epoch": 1.4129241796393774, + "grad_norm": 1.1604880094528198, + "learning_rate": 9.862631578947369e-05, + "loss": 0.3431, + "step": 25232 + }, + { + "epoch": 1.4129801769515065, + "grad_norm": 1.3972384929656982, + "learning_rate": 9.862605263157895e-05, + "loss": 0.5271, + "step": 25233 + }, + { + "epoch": 1.4130361742636355, + "grad_norm": 1.6486868858337402, + "learning_rate": 9.862578947368422e-05, + "loss": 0.5212, + "step": 25234 + }, + { + "epoch": 1.4130921715757645, + "grad_norm": 1.3751742839813232, + "learning_rate": 9.862552631578948e-05, + "loss": 0.3862, + "step": 25235 + }, + { + "epoch": 1.4131481688878935, + "grad_norm": 1.791991949081421, + "learning_rate": 9.862526315789474e-05, + "loss": 0.5196, + "step": 25236 + }, + { + "epoch": 1.4132041662000223, + "grad_norm": 1.3346039056777954, + "learning_rate": 9.8625e-05, + "loss": 0.5303, + "step": 25237 + }, + { + "epoch": 1.4132601635121513, + "grad_norm": 1.5640456676483154, + "learning_rate": 9.862473684210527e-05, + "loss": 0.4898, + "step": 25238 + }, + { + "epoch": 1.4133161608242804, + "grad_norm": 
1.4504514932632446, + "learning_rate": 9.862447368421053e-05, + "loss": 0.4962, + "step": 25239 + }, + { + "epoch": 1.4133721581364094, + "grad_norm": 1.5084350109100342, + "learning_rate": 9.862421052631579e-05, + "loss": 0.6098, + "step": 25240 + }, + { + "epoch": 1.4134281554485384, + "grad_norm": 1.341499924659729, + "learning_rate": 9.862394736842105e-05, + "loss": 0.6482, + "step": 25241 + }, + { + "epoch": 1.4134841527606674, + "grad_norm": 1.2496647834777832, + "learning_rate": 9.862368421052632e-05, + "loss": 0.3923, + "step": 25242 + }, + { + "epoch": 1.4135401500727964, + "grad_norm": 1.2085115909576416, + "learning_rate": 9.862342105263158e-05, + "loss": 0.4895, + "step": 25243 + }, + { + "epoch": 1.4135961473849254, + "grad_norm": 1.3878856897354126, + "learning_rate": 9.862315789473686e-05, + "loss": 0.4754, + "step": 25244 + }, + { + "epoch": 1.4136521446970545, + "grad_norm": 1.2352678775787354, + "learning_rate": 9.86228947368421e-05, + "loss": 0.4851, + "step": 25245 + }, + { + "epoch": 1.4137081420091835, + "grad_norm": 1.5618391036987305, + "learning_rate": 9.862263157894738e-05, + "loss": 0.5905, + "step": 25246 + }, + { + "epoch": 1.4137641393213125, + "grad_norm": 1.4296767711639404, + "learning_rate": 9.862236842105264e-05, + "loss": 0.4608, + "step": 25247 + }, + { + "epoch": 1.4138201366334415, + "grad_norm": 1.3693063259124756, + "learning_rate": 9.862210526315791e-05, + "loss": 0.3787, + "step": 25248 + }, + { + "epoch": 1.4138761339455705, + "grad_norm": 1.2497797012329102, + "learning_rate": 9.862184210526316e-05, + "loss": 0.4279, + "step": 25249 + }, + { + "epoch": 1.4139321312576996, + "grad_norm": 1.4723483324050903, + "learning_rate": 9.862157894736842e-05, + "loss": 0.5253, + "step": 25250 + }, + { + "epoch": 1.4139881285698286, + "grad_norm": 1.3741096258163452, + "learning_rate": 9.862131578947369e-05, + "loss": 0.5041, + "step": 25251 + }, + { + "epoch": 1.4140441258819576, + "grad_norm": 1.483561396598816, + "learning_rate": 
9.862105263157895e-05, + "loss": 0.4966, + "step": 25252 + }, + { + "epoch": 1.4141001231940866, + "grad_norm": 1.5171470642089844, + "learning_rate": 9.862078947368422e-05, + "loss": 0.4932, + "step": 25253 + }, + { + "epoch": 1.4141561205062156, + "grad_norm": 1.2929694652557373, + "learning_rate": 9.862052631578947e-05, + "loss": 0.5337, + "step": 25254 + }, + { + "epoch": 1.4142121178183447, + "grad_norm": 1.4575672149658203, + "learning_rate": 9.862026315789474e-05, + "loss": 0.4108, + "step": 25255 + }, + { + "epoch": 1.4142681151304737, + "grad_norm": 1.2953664064407349, + "learning_rate": 9.862e-05, + "loss": 0.4451, + "step": 25256 + }, + { + "epoch": 1.4143241124426027, + "grad_norm": 1.1995396614074707, + "learning_rate": 9.861973684210527e-05, + "loss": 0.5422, + "step": 25257 + }, + { + "epoch": 1.4143801097547317, + "grad_norm": 1.1539461612701416, + "learning_rate": 9.861947368421053e-05, + "loss": 0.401, + "step": 25258 + }, + { + "epoch": 1.4144361070668607, + "grad_norm": 1.0791876316070557, + "learning_rate": 9.86192105263158e-05, + "loss": 0.4076, + "step": 25259 + }, + { + "epoch": 1.4144921043789898, + "grad_norm": 1.157410979270935, + "learning_rate": 9.861894736842105e-05, + "loss": 0.4216, + "step": 25260 + }, + { + "epoch": 1.4145481016911188, + "grad_norm": 1.2240326404571533, + "learning_rate": 9.861868421052633e-05, + "loss": 0.3884, + "step": 25261 + }, + { + "epoch": 1.4146040990032478, + "grad_norm": 1.393512487411499, + "learning_rate": 9.861842105263159e-05, + "loss": 0.4577, + "step": 25262 + }, + { + "epoch": 1.4146600963153768, + "grad_norm": 1.1321724653244019, + "learning_rate": 9.861815789473685e-05, + "loss": 0.3056, + "step": 25263 + }, + { + "epoch": 1.4147160936275058, + "grad_norm": 1.144692063331604, + "learning_rate": 9.86178947368421e-05, + "loss": 0.4127, + "step": 25264 + }, + { + "epoch": 1.4147720909396349, + "grad_norm": 1.49162757396698, + "learning_rate": 9.861763157894738e-05, + "loss": 0.5832, + "step": 25265 
+ }, + { + "epoch": 1.4148280882517639, + "grad_norm": 1.2007313966751099, + "learning_rate": 9.861736842105264e-05, + "loss": 0.4345, + "step": 25266 + }, + { + "epoch": 1.414884085563893, + "grad_norm": 1.4854063987731934, + "learning_rate": 9.86171052631579e-05, + "loss": 0.5534, + "step": 25267 + }, + { + "epoch": 1.414940082876022, + "grad_norm": 2.267275094985962, + "learning_rate": 9.861684210526316e-05, + "loss": 0.5159, + "step": 25268 + }, + { + "epoch": 1.414996080188151, + "grad_norm": 1.5217339992523193, + "learning_rate": 9.861657894736842e-05, + "loss": 0.4637, + "step": 25269 + }, + { + "epoch": 1.41505207750028, + "grad_norm": 1.3068915605545044, + "learning_rate": 9.861631578947369e-05, + "loss": 0.4396, + "step": 25270 + }, + { + "epoch": 1.415108074812409, + "grad_norm": 1.3373641967773438, + "learning_rate": 9.861605263157895e-05, + "loss": 0.3759, + "step": 25271 + }, + { + "epoch": 1.415164072124538, + "grad_norm": 1.6124898195266724, + "learning_rate": 9.861578947368421e-05, + "loss": 0.4239, + "step": 25272 + }, + { + "epoch": 1.415220069436667, + "grad_norm": 1.4194447994232178, + "learning_rate": 9.861552631578947e-05, + "loss": 0.4973, + "step": 25273 + }, + { + "epoch": 1.415276066748796, + "grad_norm": 1.24207603931427, + "learning_rate": 9.861526315789474e-05, + "loss": 0.4253, + "step": 25274 + }, + { + "epoch": 1.415332064060925, + "grad_norm": 1.3166199922561646, + "learning_rate": 9.8615e-05, + "loss": 0.4236, + "step": 25275 + }, + { + "epoch": 1.415388061373054, + "grad_norm": 1.2715953588485718, + "learning_rate": 9.861473684210528e-05, + "loss": 0.486, + "step": 25276 + }, + { + "epoch": 1.415444058685183, + "grad_norm": 1.384478211402893, + "learning_rate": 9.861447368421052e-05, + "loss": 0.3994, + "step": 25277 + }, + { + "epoch": 1.415500055997312, + "grad_norm": 1.9101049900054932, + "learning_rate": 9.86142105263158e-05, + "loss": 0.4845, + "step": 25278 + }, + { + "epoch": 1.4155560533094411, + "grad_norm": 
1.2240445613861084, + "learning_rate": 9.861394736842106e-05, + "loss": 0.4528, + "step": 25279 + }, + { + "epoch": 1.4156120506215701, + "grad_norm": 1.328916072845459, + "learning_rate": 9.861368421052633e-05, + "loss": 0.3869, + "step": 25280 + }, + { + "epoch": 1.4156680479336992, + "grad_norm": 1.400500774383545, + "learning_rate": 9.861342105263159e-05, + "loss": 0.5352, + "step": 25281 + }, + { + "epoch": 1.4157240452458282, + "grad_norm": 1.5027445554733276, + "learning_rate": 9.861315789473685e-05, + "loss": 0.5391, + "step": 25282 + }, + { + "epoch": 1.4157800425579572, + "grad_norm": 1.1852748394012451, + "learning_rate": 9.861289473684211e-05, + "loss": 0.4876, + "step": 25283 + }, + { + "epoch": 1.4158360398700862, + "grad_norm": 1.3775509595870972, + "learning_rate": 9.861263157894738e-05, + "loss": 0.4517, + "step": 25284 + }, + { + "epoch": 1.4158920371822152, + "grad_norm": 1.608871340751648, + "learning_rate": 9.861236842105264e-05, + "loss": 0.5324, + "step": 25285 + }, + { + "epoch": 1.4159480344943443, + "grad_norm": 1.3725733757019043, + "learning_rate": 9.86121052631579e-05, + "loss": 0.4773, + "step": 25286 + }, + { + "epoch": 1.4160040318064733, + "grad_norm": 1.2922005653381348, + "learning_rate": 9.861184210526316e-05, + "loss": 0.4044, + "step": 25287 + }, + { + "epoch": 1.4160600291186023, + "grad_norm": 1.2618980407714844, + "learning_rate": 9.861157894736842e-05, + "loss": 0.4802, + "step": 25288 + }, + { + "epoch": 1.4161160264307313, + "grad_norm": 1.3802038431167603, + "learning_rate": 9.86113157894737e-05, + "loss": 0.4104, + "step": 25289 + }, + { + "epoch": 1.4161720237428603, + "grad_norm": 1.4444077014923096, + "learning_rate": 9.861105263157895e-05, + "loss": 0.4617, + "step": 25290 + }, + { + "epoch": 1.4162280210549893, + "grad_norm": 3.483999013900757, + "learning_rate": 9.861078947368421e-05, + "loss": 0.5907, + "step": 25291 + }, + { + "epoch": 1.4162840183671184, + "grad_norm": 1.1073352098464966, + "learning_rate": 
9.861052631578947e-05, + "loss": 0.4187, + "step": 25292 + }, + { + "epoch": 1.4163400156792474, + "grad_norm": 1.1422905921936035, + "learning_rate": 9.861026315789475e-05, + "loss": 0.3392, + "step": 25293 + }, + { + "epoch": 1.4163960129913764, + "grad_norm": 1.6512439250946045, + "learning_rate": 9.861e-05, + "loss": 0.4894, + "step": 25294 + }, + { + "epoch": 1.4164520103035054, + "grad_norm": 1.3519871234893799, + "learning_rate": 9.860973684210527e-05, + "loss": 0.5066, + "step": 25295 + }, + { + "epoch": 1.4165080076156344, + "grad_norm": 1.090065360069275, + "learning_rate": 9.860947368421053e-05, + "loss": 0.3512, + "step": 25296 + }, + { + "epoch": 1.4165640049277635, + "grad_norm": 1.3428910970687866, + "learning_rate": 9.86092105263158e-05, + "loss": 0.4256, + "step": 25297 + }, + { + "epoch": 1.4166200022398925, + "grad_norm": 1.3911185264587402, + "learning_rate": 9.860894736842106e-05, + "loss": 0.4326, + "step": 25298 + }, + { + "epoch": 1.4166759995520215, + "grad_norm": 1.390636920928955, + "learning_rate": 9.860868421052633e-05, + "loss": 0.4339, + "step": 25299 + }, + { + "epoch": 1.4167319968641505, + "grad_norm": 1.4221104383468628, + "learning_rate": 9.860842105263158e-05, + "loss": 0.3952, + "step": 25300 + }, + { + "epoch": 1.4167879941762795, + "grad_norm": 1.3638477325439453, + "learning_rate": 9.860815789473685e-05, + "loss": 0.4597, + "step": 25301 + }, + { + "epoch": 1.4168439914884086, + "grad_norm": 1.2395983934402466, + "learning_rate": 9.860789473684211e-05, + "loss": 0.3628, + "step": 25302 + }, + { + "epoch": 1.4168999888005376, + "grad_norm": 1.2597976922988892, + "learning_rate": 9.860763157894737e-05, + "loss": 0.422, + "step": 25303 + }, + { + "epoch": 1.4169559861126666, + "grad_norm": 1.3745061159133911, + "learning_rate": 9.860736842105263e-05, + "loss": 0.5168, + "step": 25304 + }, + { + "epoch": 1.4170119834247956, + "grad_norm": 1.2840275764465332, + "learning_rate": 9.860710526315789e-05, + "loss": 0.4069, + "step": 
25305 + }, + { + "epoch": 1.4170679807369246, + "grad_norm": 1.353759765625, + "learning_rate": 9.860684210526316e-05, + "loss": 0.4599, + "step": 25306 + }, + { + "epoch": 1.4171239780490537, + "grad_norm": 1.3688899278640747, + "learning_rate": 9.860657894736842e-05, + "loss": 0.4981, + "step": 25307 + }, + { + "epoch": 1.4171799753611827, + "grad_norm": 1.1914405822753906, + "learning_rate": 9.86063157894737e-05, + "loss": 0.3851, + "step": 25308 + }, + { + "epoch": 1.4172359726733117, + "grad_norm": 1.4963513612747192, + "learning_rate": 9.860605263157894e-05, + "loss": 0.4801, + "step": 25309 + }, + { + "epoch": 1.4172919699854407, + "grad_norm": 1.4012556076049805, + "learning_rate": 9.860578947368422e-05, + "loss": 0.4715, + "step": 25310 + }, + { + "epoch": 1.4173479672975697, + "grad_norm": 1.372742772102356, + "learning_rate": 9.860552631578948e-05, + "loss": 0.4207, + "step": 25311 + }, + { + "epoch": 1.4174039646096988, + "grad_norm": 1.3505504131317139, + "learning_rate": 9.860526315789475e-05, + "loss": 0.5519, + "step": 25312 + }, + { + "epoch": 1.4174599619218278, + "grad_norm": 1.4344871044158936, + "learning_rate": 9.860500000000001e-05, + "loss": 0.4624, + "step": 25313 + }, + { + "epoch": 1.4175159592339568, + "grad_norm": 1.3725773096084595, + "learning_rate": 9.860473684210527e-05, + "loss": 0.4659, + "step": 25314 + }, + { + "epoch": 1.4175719565460858, + "grad_norm": 1.265283226966858, + "learning_rate": 9.860447368421053e-05, + "loss": 0.4751, + "step": 25315 + }, + { + "epoch": 1.4176279538582148, + "grad_norm": 1.3512071371078491, + "learning_rate": 9.86042105263158e-05, + "loss": 0.5503, + "step": 25316 + }, + { + "epoch": 1.4176839511703438, + "grad_norm": 1.2943168878555298, + "learning_rate": 9.860394736842106e-05, + "loss": 0.3472, + "step": 25317 + }, + { + "epoch": 1.4177399484824729, + "grad_norm": 1.1657301187515259, + "learning_rate": 9.860368421052632e-05, + "loss": 0.4913, + "step": 25318 + }, + { + "epoch": 
1.4177959457946019, + "grad_norm": 1.530411958694458, + "learning_rate": 9.860342105263158e-05, + "loss": 0.4243, + "step": 25319 + }, + { + "epoch": 1.417851943106731, + "grad_norm": 1.179490089416504, + "learning_rate": 9.860315789473684e-05, + "loss": 0.4927, + "step": 25320 + }, + { + "epoch": 1.41790794041886, + "grad_norm": 1.3139373064041138, + "learning_rate": 9.860289473684211e-05, + "loss": 0.4186, + "step": 25321 + }, + { + "epoch": 1.417963937730989, + "grad_norm": 1.1053187847137451, + "learning_rate": 9.860263157894737e-05, + "loss": 0.4252, + "step": 25322 + }, + { + "epoch": 1.418019935043118, + "grad_norm": 1.361271858215332, + "learning_rate": 9.860236842105263e-05, + "loss": 0.5176, + "step": 25323 + }, + { + "epoch": 1.418075932355247, + "grad_norm": 1.6484142541885376, + "learning_rate": 9.860210526315789e-05, + "loss": 0.5088, + "step": 25324 + }, + { + "epoch": 1.418131929667376, + "grad_norm": 1.2425090074539185, + "learning_rate": 9.860184210526317e-05, + "loss": 0.4417, + "step": 25325 + }, + { + "epoch": 1.418187926979505, + "grad_norm": 1.8077564239501953, + "learning_rate": 9.860157894736843e-05, + "loss": 0.5891, + "step": 25326 + }, + { + "epoch": 1.418243924291634, + "grad_norm": 1.4657273292541504, + "learning_rate": 9.860131578947369e-05, + "loss": 0.5193, + "step": 25327 + }, + { + "epoch": 1.418299921603763, + "grad_norm": 1.2672172784805298, + "learning_rate": 9.860105263157895e-05, + "loss": 0.398, + "step": 25328 + }, + { + "epoch": 1.418355918915892, + "grad_norm": 1.22288978099823, + "learning_rate": 9.860078947368422e-05, + "loss": 0.5111, + "step": 25329 + }, + { + "epoch": 1.418411916228021, + "grad_norm": 1.6638717651367188, + "learning_rate": 9.860052631578948e-05, + "loss": 0.5352, + "step": 25330 + }, + { + "epoch": 1.4184679135401501, + "grad_norm": 1.5996172428131104, + "learning_rate": 9.860026315789475e-05, + "loss": 0.5242, + "step": 25331 + }, + { + "epoch": 1.4185239108522791, + "grad_norm": 1.2739406824111938, 
+ "learning_rate": 9.86e-05, + "loss": 0.4868, + "step": 25332 + }, + { + "epoch": 1.4185799081644082, + "grad_norm": 1.4929602146148682, + "learning_rate": 9.859973684210527e-05, + "loss": 0.4906, + "step": 25333 + }, + { + "epoch": 1.4186359054765372, + "grad_norm": 1.3288265466690063, + "learning_rate": 9.859947368421053e-05, + "loss": 0.4633, + "step": 25334 + }, + { + "epoch": 1.4186919027886662, + "grad_norm": 1.4196847677230835, + "learning_rate": 9.85992105263158e-05, + "loss": 0.5721, + "step": 25335 + }, + { + "epoch": 1.4187479001007952, + "grad_norm": 1.4367882013320923, + "learning_rate": 9.859894736842106e-05, + "loss": 0.5164, + "step": 25336 + }, + { + "epoch": 1.4188038974129242, + "grad_norm": 1.402292013168335, + "learning_rate": 9.859868421052631e-05, + "loss": 0.5308, + "step": 25337 + }, + { + "epoch": 1.4188598947250533, + "grad_norm": 1.1375635862350464, + "learning_rate": 9.859842105263158e-05, + "loss": 0.3699, + "step": 25338 + }, + { + "epoch": 1.4189158920371823, + "grad_norm": 1.2534152269363403, + "learning_rate": 9.859815789473684e-05, + "loss": 0.5699, + "step": 25339 + }, + { + "epoch": 1.4189718893493113, + "grad_norm": 1.2568928003311157, + "learning_rate": 9.859789473684212e-05, + "loss": 0.3924, + "step": 25340 + }, + { + "epoch": 1.4190278866614403, + "grad_norm": 1.2098966836929321, + "learning_rate": 9.859763157894736e-05, + "loss": 0.5723, + "step": 25341 + }, + { + "epoch": 1.4190838839735693, + "grad_norm": 1.161672592163086, + "learning_rate": 9.859736842105264e-05, + "loss": 0.4008, + "step": 25342 + }, + { + "epoch": 1.4191398812856983, + "grad_norm": 1.3864883184432983, + "learning_rate": 9.85971052631579e-05, + "loss": 0.4221, + "step": 25343 + }, + { + "epoch": 1.4191958785978274, + "grad_norm": 1.7194762229919434, + "learning_rate": 9.859684210526317e-05, + "loss": 0.571, + "step": 25344 + }, + { + "epoch": 1.4192518759099564, + "grad_norm": 1.3612377643585205, + "learning_rate": 9.859657894736843e-05, + "loss": 
0.3921, + "step": 25345 + }, + { + "epoch": 1.4193078732220854, + "grad_norm": 1.2218958139419556, + "learning_rate": 9.859631578947369e-05, + "loss": 0.3954, + "step": 25346 + }, + { + "epoch": 1.4193638705342144, + "grad_norm": 1.7550920248031616, + "learning_rate": 9.859605263157895e-05, + "loss": 0.529, + "step": 25347 + }, + { + "epoch": 1.4194198678463434, + "grad_norm": 1.7339459657669067, + "learning_rate": 9.859578947368422e-05, + "loss": 0.5703, + "step": 25348 + }, + { + "epoch": 1.4194758651584725, + "grad_norm": 1.1789673566818237, + "learning_rate": 9.859552631578948e-05, + "loss": 0.3967, + "step": 25349 + }, + { + "epoch": 1.4195318624706015, + "grad_norm": 1.3035659790039062, + "learning_rate": 9.859526315789474e-05, + "loss": 0.416, + "step": 25350 + }, + { + "epoch": 1.4195878597827305, + "grad_norm": 1.4353876113891602, + "learning_rate": 9.8595e-05, + "loss": 0.4244, + "step": 25351 + }, + { + "epoch": 1.4196438570948595, + "grad_norm": 1.650483250617981, + "learning_rate": 9.859473684210527e-05, + "loss": 0.662, + "step": 25352 + }, + { + "epoch": 1.4196998544069885, + "grad_norm": 1.2579351663589478, + "learning_rate": 9.859447368421053e-05, + "loss": 0.3497, + "step": 25353 + }, + { + "epoch": 1.4197558517191176, + "grad_norm": 1.3151016235351562, + "learning_rate": 9.859421052631579e-05, + "loss": 0.4805, + "step": 25354 + }, + { + "epoch": 1.4198118490312466, + "grad_norm": 2.101506471633911, + "learning_rate": 9.859394736842105e-05, + "loss": 0.5463, + "step": 25355 + }, + { + "epoch": 1.4198678463433756, + "grad_norm": 1.4948760271072388, + "learning_rate": 9.859368421052631e-05, + "loss": 0.6012, + "step": 25356 + }, + { + "epoch": 1.4199238436555046, + "grad_norm": 1.2718428373336792, + "learning_rate": 9.859342105263159e-05, + "loss": 0.5115, + "step": 25357 + }, + { + "epoch": 1.4199798409676336, + "grad_norm": 1.3629120588302612, + "learning_rate": 9.859315789473685e-05, + "loss": 0.4204, + "step": 25358 + }, + { + "epoch": 
1.4200358382797627, + "grad_norm": 1.874666452407837, + "learning_rate": 9.85928947368421e-05, + "loss": 0.5246, + "step": 25359 + }, + { + "epoch": 1.4200918355918917, + "grad_norm": 1.4612150192260742, + "learning_rate": 9.859263157894737e-05, + "loss": 0.4192, + "step": 25360 + }, + { + "epoch": 1.4201478329040205, + "grad_norm": 1.6113919019699097, + "learning_rate": 9.859236842105264e-05, + "loss": 0.6654, + "step": 25361 + }, + { + "epoch": 1.4202038302161495, + "grad_norm": 1.3224183320999146, + "learning_rate": 9.85921052631579e-05, + "loss": 0.5592, + "step": 25362 + }, + { + "epoch": 1.4202598275282785, + "grad_norm": 1.343196153640747, + "learning_rate": 9.859184210526317e-05, + "loss": 0.4718, + "step": 25363 + }, + { + "epoch": 1.4203158248404075, + "grad_norm": 1.2355575561523438, + "learning_rate": 9.859157894736842e-05, + "loss": 0.4856, + "step": 25364 + }, + { + "epoch": 1.4203718221525365, + "grad_norm": 1.216900110244751, + "learning_rate": 9.859131578947369e-05, + "loss": 0.4172, + "step": 25365 + }, + { + "epoch": 1.4204278194646656, + "grad_norm": 1.3724902868270874, + "learning_rate": 9.859105263157895e-05, + "loss": 0.4328, + "step": 25366 + }, + { + "epoch": 1.4204838167767946, + "grad_norm": 1.2023346424102783, + "learning_rate": 9.859078947368422e-05, + "loss": 0.493, + "step": 25367 + }, + { + "epoch": 1.4205398140889236, + "grad_norm": 1.1472097635269165, + "learning_rate": 9.859052631578948e-05, + "loss": 0.3719, + "step": 25368 + }, + { + "epoch": 1.4205958114010526, + "grad_norm": 1.615952730178833, + "learning_rate": 9.859026315789474e-05, + "loss": 0.595, + "step": 25369 + }, + { + "epoch": 1.4206518087131816, + "grad_norm": 1.4684040546417236, + "learning_rate": 9.859e-05, + "loss": 0.4789, + "step": 25370 + }, + { + "epoch": 1.4207078060253107, + "grad_norm": 1.437103271484375, + "learning_rate": 9.858973684210526e-05, + "loss": 0.6406, + "step": 25371 + }, + { + "epoch": 1.4207638033374397, + "grad_norm": 1.5414632558822632, + 
"learning_rate": 9.858947368421054e-05, + "loss": 0.3936, + "step": 25372 + }, + { + "epoch": 1.4208198006495687, + "grad_norm": 1.5145574808120728, + "learning_rate": 9.85892105263158e-05, + "loss": 0.4852, + "step": 25373 + }, + { + "epoch": 1.4208757979616977, + "grad_norm": 1.5883476734161377, + "learning_rate": 9.858894736842106e-05, + "loss": 0.5683, + "step": 25374 + }, + { + "epoch": 1.4209317952738267, + "grad_norm": 1.665426254272461, + "learning_rate": 9.858868421052632e-05, + "loss": 0.4271, + "step": 25375 + }, + { + "epoch": 1.4209877925859558, + "grad_norm": 1.6864166259765625, + "learning_rate": 9.858842105263159e-05, + "loss": 0.6829, + "step": 25376 + }, + { + "epoch": 1.4210437898980848, + "grad_norm": 1.3218377828598022, + "learning_rate": 9.858815789473685e-05, + "loss": 0.4846, + "step": 25377 + }, + { + "epoch": 1.4210997872102138, + "grad_norm": 1.1267671585083008, + "learning_rate": 9.858789473684211e-05, + "loss": 0.4406, + "step": 25378 + }, + { + "epoch": 1.4211557845223428, + "grad_norm": 1.2661861181259155, + "learning_rate": 9.858763157894737e-05, + "loss": 0.4269, + "step": 25379 + }, + { + "epoch": 1.4212117818344718, + "grad_norm": 1.487857460975647, + "learning_rate": 9.858736842105264e-05, + "loss": 0.5297, + "step": 25380 + }, + { + "epoch": 1.4212677791466009, + "grad_norm": 1.3049241304397583, + "learning_rate": 9.85871052631579e-05, + "loss": 0.4583, + "step": 25381 + }, + { + "epoch": 1.4213237764587299, + "grad_norm": 1.3917500972747803, + "learning_rate": 9.858684210526316e-05, + "loss": 0.444, + "step": 25382 + }, + { + "epoch": 1.421379773770859, + "grad_norm": 1.7665787935256958, + "learning_rate": 9.858657894736842e-05, + "loss": 0.4787, + "step": 25383 + }, + { + "epoch": 1.421435771082988, + "grad_norm": 2.91767954826355, + "learning_rate": 9.85863157894737e-05, + "loss": 0.471, + "step": 25384 + }, + { + "epoch": 1.421491768395117, + "grad_norm": 1.3150043487548828, + "learning_rate": 9.858605263157895e-05, + 
"loss": 0.4157, + "step": 25385 + }, + { + "epoch": 1.421547765707246, + "grad_norm": 1.415709137916565, + "learning_rate": 9.858578947368423e-05, + "loss": 0.4664, + "step": 25386 + }, + { + "epoch": 1.421603763019375, + "grad_norm": 1.6887582540512085, + "learning_rate": 9.858552631578947e-05, + "loss": 0.5149, + "step": 25387 + }, + { + "epoch": 1.421659760331504, + "grad_norm": 1.4312492609024048, + "learning_rate": 9.858526315789473e-05, + "loss": 0.5076, + "step": 25388 + }, + { + "epoch": 1.421715757643633, + "grad_norm": 1.4411532878875732, + "learning_rate": 9.8585e-05, + "loss": 0.5419, + "step": 25389 + }, + { + "epoch": 1.421771754955762, + "grad_norm": 1.3503779172897339, + "learning_rate": 9.858473684210527e-05, + "loss": 0.4765, + "step": 25390 + }, + { + "epoch": 1.421827752267891, + "grad_norm": 1.2609636783599854, + "learning_rate": 9.858447368421054e-05, + "loss": 0.4416, + "step": 25391 + }, + { + "epoch": 1.42188374958002, + "grad_norm": 1.4397571086883545, + "learning_rate": 9.858421052631578e-05, + "loss": 0.4568, + "step": 25392 + }, + { + "epoch": 1.421939746892149, + "grad_norm": 1.258315086364746, + "learning_rate": 9.858394736842106e-05, + "loss": 0.4831, + "step": 25393 + }, + { + "epoch": 1.421995744204278, + "grad_norm": 1.210988998413086, + "learning_rate": 9.858368421052632e-05, + "loss": 0.5015, + "step": 25394 + }, + { + "epoch": 1.4220517415164071, + "grad_norm": 1.2439945936203003, + "learning_rate": 9.858342105263159e-05, + "loss": 0.3687, + "step": 25395 + }, + { + "epoch": 1.4221077388285361, + "grad_norm": 1.1381407976150513, + "learning_rate": 9.858315789473684e-05, + "loss": 0.4425, + "step": 25396 + }, + { + "epoch": 1.4221637361406652, + "grad_norm": 1.4158564805984497, + "learning_rate": 9.858289473684211e-05, + "loss": 0.4328, + "step": 25397 + }, + { + "epoch": 1.4222197334527942, + "grad_norm": 1.231938123703003, + "learning_rate": 9.858263157894737e-05, + "loss": 0.3726, + "step": 25398 + }, + { + "epoch": 
1.4222757307649232, + "grad_norm": 1.071649193763733, + "learning_rate": 9.858236842105264e-05, + "loss": 0.3096, + "step": 25399 + }, + { + "epoch": 1.4223317280770522, + "grad_norm": 3.990752696990967, + "learning_rate": 9.85821052631579e-05, + "loss": 0.3287, + "step": 25400 + }, + { + "epoch": 1.4223877253891812, + "grad_norm": 1.2647135257720947, + "learning_rate": 9.858184210526316e-05, + "loss": 0.409, + "step": 25401 + }, + { + "epoch": 1.4224437227013103, + "grad_norm": 1.5738040208816528, + "learning_rate": 9.858157894736842e-05, + "loss": 0.5038, + "step": 25402 + }, + { + "epoch": 1.4224997200134393, + "grad_norm": 1.283447265625, + "learning_rate": 9.85813157894737e-05, + "loss": 0.4931, + "step": 25403 + }, + { + "epoch": 1.4225557173255683, + "grad_norm": 1.3840880393981934, + "learning_rate": 9.858105263157896e-05, + "loss": 0.5407, + "step": 25404 + }, + { + "epoch": 1.4226117146376973, + "grad_norm": 1.3599412441253662, + "learning_rate": 9.858078947368422e-05, + "loss": 0.5234, + "step": 25405 + }, + { + "epoch": 1.4226677119498263, + "grad_norm": 1.2359181642532349, + "learning_rate": 9.858052631578948e-05, + "loss": 0.4504, + "step": 25406 + }, + { + "epoch": 1.4227237092619553, + "grad_norm": 1.4266167879104614, + "learning_rate": 9.858026315789473e-05, + "loss": 0.5036, + "step": 25407 + }, + { + "epoch": 1.4227797065740844, + "grad_norm": 1.2063062191009521, + "learning_rate": 9.858000000000001e-05, + "loss": 0.4117, + "step": 25408 + }, + { + "epoch": 1.4228357038862134, + "grad_norm": 1.31067955493927, + "learning_rate": 9.857973684210527e-05, + "loss": 0.3727, + "step": 25409 + }, + { + "epoch": 1.4228917011983424, + "grad_norm": 2.1875691413879395, + "learning_rate": 9.857947368421053e-05, + "loss": 0.4012, + "step": 25410 + }, + { + "epoch": 1.4229476985104714, + "grad_norm": 1.1316533088684082, + "learning_rate": 9.857921052631579e-05, + "loss": 0.4388, + "step": 25411 + }, + { + "epoch": 1.4230036958226004, + "grad_norm": 
1.3008742332458496, + "learning_rate": 9.857894736842106e-05, + "loss": 0.3254, + "step": 25412 + }, + { + "epoch": 1.4230596931347295, + "grad_norm": 1.3735431432724, + "learning_rate": 9.857868421052632e-05, + "loss": 0.4582, + "step": 25413 + }, + { + "epoch": 1.4231156904468585, + "grad_norm": 1.4176539182662964, + "learning_rate": 9.857842105263158e-05, + "loss": 0.4259, + "step": 25414 + }, + { + "epoch": 1.4231716877589875, + "grad_norm": 1.55917489528656, + "learning_rate": 9.857815789473684e-05, + "loss": 0.5155, + "step": 25415 + }, + { + "epoch": 1.4232276850711165, + "grad_norm": 3.2967946529388428, + "learning_rate": 9.857789473684211e-05, + "loss": 0.4553, + "step": 25416 + }, + { + "epoch": 1.4232836823832455, + "grad_norm": 2.0240895748138428, + "learning_rate": 9.857763157894737e-05, + "loss": 0.5449, + "step": 25417 + }, + { + "epoch": 1.4233396796953746, + "grad_norm": 1.7926132678985596, + "learning_rate": 9.857736842105265e-05, + "loss": 0.4438, + "step": 25418 + }, + { + "epoch": 1.4233956770075036, + "grad_norm": 1.3341935873031616, + "learning_rate": 9.857710526315789e-05, + "loss": 0.4905, + "step": 25419 + }, + { + "epoch": 1.4234516743196326, + "grad_norm": 1.5490853786468506, + "learning_rate": 9.857684210526317e-05, + "loss": 0.5582, + "step": 25420 + }, + { + "epoch": 1.4235076716317616, + "grad_norm": 1.5838794708251953, + "learning_rate": 9.857657894736843e-05, + "loss": 0.5194, + "step": 25421 + }, + { + "epoch": 1.4235636689438906, + "grad_norm": 1.493714690208435, + "learning_rate": 9.85763157894737e-05, + "loss": 0.7161, + "step": 25422 + }, + { + "epoch": 1.4236196662560197, + "grad_norm": 1.3017715215682983, + "learning_rate": 9.857605263157896e-05, + "loss": 0.4129, + "step": 25423 + }, + { + "epoch": 1.4236756635681487, + "grad_norm": 2.0874288082122803, + "learning_rate": 9.85757894736842e-05, + "loss": 0.5038, + "step": 25424 + }, + { + "epoch": 1.4237316608802777, + "grad_norm": 1.455629587173462, + "learning_rate": 
9.857552631578948e-05, + "loss": 0.6834, + "step": 25425 + }, + { + "epoch": 1.4237876581924067, + "grad_norm": 1.1289703845977783, + "learning_rate": 9.857526315789474e-05, + "loss": 0.3529, + "step": 25426 + }, + { + "epoch": 1.4238436555045357, + "grad_norm": 1.2799420356750488, + "learning_rate": 9.857500000000001e-05, + "loss": 0.3936, + "step": 25427 + }, + { + "epoch": 1.4238996528166648, + "grad_norm": 1.4523367881774902, + "learning_rate": 9.857473684210527e-05, + "loss": 0.4842, + "step": 25428 + }, + { + "epoch": 1.4239556501287938, + "grad_norm": 1.5793325901031494, + "learning_rate": 9.857447368421053e-05, + "loss": 0.3518, + "step": 25429 + }, + { + "epoch": 1.4240116474409228, + "grad_norm": 1.5430808067321777, + "learning_rate": 9.857421052631579e-05, + "loss": 0.5515, + "step": 25430 + }, + { + "epoch": 1.4240676447530518, + "grad_norm": 1.567837119102478, + "learning_rate": 9.857394736842106e-05, + "loss": 0.4913, + "step": 25431 + }, + { + "epoch": 1.4241236420651808, + "grad_norm": 1.3282862901687622, + "learning_rate": 9.857368421052632e-05, + "loss": 0.5311, + "step": 25432 + }, + { + "epoch": 1.4241796393773098, + "grad_norm": 1.3393051624298096, + "learning_rate": 9.857342105263158e-05, + "loss": 0.4615, + "step": 25433 + }, + { + "epoch": 1.4242356366894389, + "grad_norm": 1.48483145236969, + "learning_rate": 9.857315789473684e-05, + "loss": 0.6161, + "step": 25434 + }, + { + "epoch": 1.4242916340015679, + "grad_norm": 1.5160679817199707, + "learning_rate": 9.857289473684212e-05, + "loss": 0.4744, + "step": 25435 + }, + { + "epoch": 1.424347631313697, + "grad_norm": 1.438240647315979, + "learning_rate": 9.857263157894738e-05, + "loss": 0.5894, + "step": 25436 + }, + { + "epoch": 1.424403628625826, + "grad_norm": 1.6397444009780884, + "learning_rate": 9.857236842105264e-05, + "loss": 0.6918, + "step": 25437 + }, + { + "epoch": 1.424459625937955, + "grad_norm": 1.548452615737915, + "learning_rate": 9.85721052631579e-05, + "loss": 0.4115, + 
"step": 25438 + }, + { + "epoch": 1.424515623250084, + "grad_norm": 1.4626857042312622, + "learning_rate": 9.857184210526317e-05, + "loss": 0.6428, + "step": 25439 + }, + { + "epoch": 1.424571620562213, + "grad_norm": 1.2954314947128296, + "learning_rate": 9.857157894736843e-05, + "loss": 0.4795, + "step": 25440 + }, + { + "epoch": 1.424627617874342, + "grad_norm": 1.250012755393982, + "learning_rate": 9.857131578947369e-05, + "loss": 0.3832, + "step": 25441 + }, + { + "epoch": 1.424683615186471, + "grad_norm": 1.386883020401001, + "learning_rate": 9.857105263157895e-05, + "loss": 0.5218, + "step": 25442 + }, + { + "epoch": 1.4247396124986, + "grad_norm": 1.4339525699615479, + "learning_rate": 9.857078947368421e-05, + "loss": 0.3912, + "step": 25443 + }, + { + "epoch": 1.424795609810729, + "grad_norm": 1.4767746925354004, + "learning_rate": 9.857052631578948e-05, + "loss": 0.5203, + "step": 25444 + }, + { + "epoch": 1.424851607122858, + "grad_norm": 1.441213607788086, + "learning_rate": 9.857026315789474e-05, + "loss": 0.4481, + "step": 25445 + }, + { + "epoch": 1.424907604434987, + "grad_norm": 1.6515579223632812, + "learning_rate": 9.857000000000001e-05, + "loss": 0.4584, + "step": 25446 + }, + { + "epoch": 1.4249636017471161, + "grad_norm": 1.3358708620071411, + "learning_rate": 9.856973684210526e-05, + "loss": 0.6889, + "step": 25447 + }, + { + "epoch": 1.4250195990592451, + "grad_norm": 1.3359014987945557, + "learning_rate": 9.856947368421053e-05, + "loss": 0.4941, + "step": 25448 + }, + { + "epoch": 1.4250755963713742, + "grad_norm": 1.5393649339675903, + "learning_rate": 9.856921052631579e-05, + "loss": 0.477, + "step": 25449 + }, + { + "epoch": 1.4251315936835032, + "grad_norm": 1.1691579818725586, + "learning_rate": 9.856894736842107e-05, + "loss": 0.3377, + "step": 25450 + }, + { + "epoch": 1.4251875909956322, + "grad_norm": 1.5221678018569946, + "learning_rate": 9.856868421052631e-05, + "loss": 0.6133, + "step": 25451 + }, + { + "epoch": 
1.4252435883077612, + "grad_norm": 1.448843002319336, + "learning_rate": 9.856842105263159e-05, + "loss": 0.4993, + "step": 25452 + }, + { + "epoch": 1.4252995856198902, + "grad_norm": 1.2847239971160889, + "learning_rate": 9.856815789473685e-05, + "loss": 0.5198, + "step": 25453 + }, + { + "epoch": 1.4253555829320192, + "grad_norm": 1.0127615928649902, + "learning_rate": 9.856789473684212e-05, + "loss": 0.3715, + "step": 25454 + }, + { + "epoch": 1.4254115802441483, + "grad_norm": 1.3241877555847168, + "learning_rate": 9.856763157894738e-05, + "loss": 0.3097, + "step": 25455 + }, + { + "epoch": 1.4254675775562773, + "grad_norm": 1.3852566480636597, + "learning_rate": 9.856736842105264e-05, + "loss": 0.4012, + "step": 25456 + }, + { + "epoch": 1.4255235748684063, + "grad_norm": 1.2788549661636353, + "learning_rate": 9.85671052631579e-05, + "loss": 0.5921, + "step": 25457 + }, + { + "epoch": 1.4255795721805353, + "grad_norm": 1.4329038858413696, + "learning_rate": 9.856684210526316e-05, + "loss": 0.5277, + "step": 25458 + }, + { + "epoch": 1.4256355694926643, + "grad_norm": 1.3823659420013428, + "learning_rate": 9.856657894736843e-05, + "loss": 0.6759, + "step": 25459 + }, + { + "epoch": 1.4256915668047934, + "grad_norm": 1.3341295719146729, + "learning_rate": 9.856631578947369e-05, + "loss": 0.4195, + "step": 25460 + }, + { + "epoch": 1.4257475641169224, + "grad_norm": 1.3343552350997925, + "learning_rate": 9.856605263157895e-05, + "loss": 0.5346, + "step": 25461 + }, + { + "epoch": 1.4258035614290514, + "grad_norm": 1.2055914402008057, + "learning_rate": 9.856578947368421e-05, + "loss": 0.4703, + "step": 25462 + }, + { + "epoch": 1.4258595587411804, + "grad_norm": 2.759641170501709, + "learning_rate": 9.856552631578948e-05, + "loss": 0.4597, + "step": 25463 + }, + { + "epoch": 1.4259155560533094, + "grad_norm": 1.228317141532898, + "learning_rate": 9.856526315789474e-05, + "loss": 0.427, + "step": 25464 + }, + { + "epoch": 1.4259715533654385, + "grad_norm": 
1.2668286561965942, + "learning_rate": 9.8565e-05, + "loss": 0.6717, + "step": 25465 + }, + { + "epoch": 1.4260275506775675, + "grad_norm": 1.519857406616211, + "learning_rate": 9.856473684210526e-05, + "loss": 0.5231, + "step": 25466 + }, + { + "epoch": 1.4260835479896965, + "grad_norm": 2.9786980152130127, + "learning_rate": 9.856447368421054e-05, + "loss": 0.5596, + "step": 25467 + }, + { + "epoch": 1.4261395453018255, + "grad_norm": 1.5299054384231567, + "learning_rate": 9.85642105263158e-05, + "loss": 0.4107, + "step": 25468 + }, + { + "epoch": 1.4261955426139545, + "grad_norm": 1.3316971063613892, + "learning_rate": 9.856394736842105e-05, + "loss": 0.4201, + "step": 25469 + }, + { + "epoch": 1.4262515399260836, + "grad_norm": 1.4687387943267822, + "learning_rate": 9.856368421052631e-05, + "loss": 0.5388, + "step": 25470 + }, + { + "epoch": 1.4263075372382126, + "grad_norm": 1.4971003532409668, + "learning_rate": 9.856342105263159e-05, + "loss": 0.4766, + "step": 25471 + }, + { + "epoch": 1.4263635345503416, + "grad_norm": 1.244888186454773, + "learning_rate": 9.856315789473685e-05, + "loss": 0.4872, + "step": 25472 + }, + { + "epoch": 1.4264195318624706, + "grad_norm": 1.1782761812210083, + "learning_rate": 9.856289473684212e-05, + "loss": 0.4772, + "step": 25473 + }, + { + "epoch": 1.4264755291745996, + "grad_norm": 1.4044368267059326, + "learning_rate": 9.856263157894737e-05, + "loss": 0.4442, + "step": 25474 + }, + { + "epoch": 1.4265315264867287, + "grad_norm": 1.432761311531067, + "learning_rate": 9.856236842105263e-05, + "loss": 0.4935, + "step": 25475 + }, + { + "epoch": 1.4265875237988577, + "grad_norm": 1.283448338508606, + "learning_rate": 9.85621052631579e-05, + "loss": 0.38, + "step": 25476 + }, + { + "epoch": 1.4266435211109867, + "grad_norm": 1.4212082624435425, + "learning_rate": 9.856184210526316e-05, + "loss": 0.539, + "step": 25477 + }, + { + "epoch": 1.4266995184231157, + "grad_norm": 1.3751952648162842, + "learning_rate": 
9.856157894736843e-05, + "loss": 0.399, + "step": 25478 + }, + { + "epoch": 1.4267555157352447, + "grad_norm": 1.0430189371109009, + "learning_rate": 9.856131578947368e-05, + "loss": 0.3344, + "step": 25479 + }, + { + "epoch": 1.4268115130473737, + "grad_norm": 1.3726401329040527, + "learning_rate": 9.856105263157895e-05, + "loss": 0.5293, + "step": 25480 + }, + { + "epoch": 1.4268675103595028, + "grad_norm": 1.6846280097961426, + "learning_rate": 9.856078947368421e-05, + "loss": 0.5713, + "step": 25481 + }, + { + "epoch": 1.4269235076716318, + "grad_norm": 1.4059313535690308, + "learning_rate": 9.856052631578949e-05, + "loss": 0.4264, + "step": 25482 + }, + { + "epoch": 1.4269795049837608, + "grad_norm": 1.0575387477874756, + "learning_rate": 9.856026315789475e-05, + "loss": 0.3784, + "step": 25483 + }, + { + "epoch": 1.4270355022958898, + "grad_norm": 1.3428961038589478, + "learning_rate": 9.856e-05, + "loss": 0.5346, + "step": 25484 + }, + { + "epoch": 1.4270914996080188, + "grad_norm": 1.1390739679336548, + "learning_rate": 9.855973684210526e-05, + "loss": 0.3727, + "step": 25485 + }, + { + "epoch": 1.4271474969201479, + "grad_norm": 1.6633925437927246, + "learning_rate": 9.855947368421054e-05, + "loss": 0.551, + "step": 25486 + }, + { + "epoch": 1.4272034942322769, + "grad_norm": 1.5276228189468384, + "learning_rate": 9.85592105263158e-05, + "loss": 0.4956, + "step": 25487 + }, + { + "epoch": 1.427259491544406, + "grad_norm": 1.3396916389465332, + "learning_rate": 9.855894736842106e-05, + "loss": 0.484, + "step": 25488 + }, + { + "epoch": 1.427315488856535, + "grad_norm": 1.318367838859558, + "learning_rate": 9.855868421052632e-05, + "loss": 0.5128, + "step": 25489 + }, + { + "epoch": 1.427371486168664, + "grad_norm": 1.217202067375183, + "learning_rate": 9.855842105263159e-05, + "loss": 0.4146, + "step": 25490 + }, + { + "epoch": 1.427427483480793, + "grad_norm": 1.3000566959381104, + "learning_rate": 9.855815789473685e-05, + "loss": 0.5356, + "step": 25491 + 
}, + { + "epoch": 1.427483480792922, + "grad_norm": 1.2982467412948608, + "learning_rate": 9.855789473684211e-05, + "loss": 0.4702, + "step": 25492 + }, + { + "epoch": 1.427539478105051, + "grad_norm": 1.4222043752670288, + "learning_rate": 9.855763157894737e-05, + "loss": 0.4602, + "step": 25493 + }, + { + "epoch": 1.42759547541718, + "grad_norm": 1.2356319427490234, + "learning_rate": 9.855736842105263e-05, + "loss": 0.4027, + "step": 25494 + }, + { + "epoch": 1.427651472729309, + "grad_norm": 1.4177634716033936, + "learning_rate": 9.85571052631579e-05, + "loss": 0.3962, + "step": 25495 + }, + { + "epoch": 1.427707470041438, + "grad_norm": 1.6410006284713745, + "learning_rate": 9.855684210526316e-05, + "loss": 0.4403, + "step": 25496 + }, + { + "epoch": 1.427763467353567, + "grad_norm": 1.3921312093734741, + "learning_rate": 9.855657894736842e-05, + "loss": 0.4249, + "step": 25497 + }, + { + "epoch": 1.427819464665696, + "grad_norm": 1.0755549669265747, + "learning_rate": 9.855631578947368e-05, + "loss": 0.3739, + "step": 25498 + }, + { + "epoch": 1.4278754619778251, + "grad_norm": 1.423352599143982, + "learning_rate": 9.855605263157896e-05, + "loss": 0.514, + "step": 25499 + }, + { + "epoch": 1.4279314592899541, + "grad_norm": 1.5200079679489136, + "learning_rate": 9.855578947368421e-05, + "loss": 0.5753, + "step": 25500 + }, + { + "epoch": 1.4279874566020831, + "grad_norm": 1.4804867506027222, + "learning_rate": 9.855552631578949e-05, + "loss": 0.5534, + "step": 25501 + }, + { + "epoch": 1.4280434539142122, + "grad_norm": 1.2663437128067017, + "learning_rate": 9.855526315789473e-05, + "loss": 0.4266, + "step": 25502 + }, + { + "epoch": 1.4280994512263412, + "grad_norm": 1.291031002998352, + "learning_rate": 9.855500000000001e-05, + "loss": 0.4723, + "step": 25503 + }, + { + "epoch": 1.4281554485384702, + "grad_norm": 1.6855638027191162, + "learning_rate": 9.855473684210527e-05, + "loss": 0.4717, + "step": 25504 + }, + { + "epoch": 1.4282114458505992, + 
"grad_norm": 1.5187420845031738, + "learning_rate": 9.855447368421054e-05, + "loss": 0.4999, + "step": 25505 + }, + { + "epoch": 1.4282674431627282, + "grad_norm": 1.2003096342086792, + "learning_rate": 9.855421052631579e-05, + "loss": 0.4537, + "step": 25506 + }, + { + "epoch": 1.4283234404748573, + "grad_norm": 1.055364966392517, + "learning_rate": 9.855394736842106e-05, + "loss": 0.4704, + "step": 25507 + }, + { + "epoch": 1.4283794377869863, + "grad_norm": 1.474247932434082, + "learning_rate": 9.855368421052632e-05, + "loss": 0.577, + "step": 25508 + }, + { + "epoch": 1.4284354350991153, + "grad_norm": 1.2746540307998657, + "learning_rate": 9.855342105263158e-05, + "loss": 0.436, + "step": 25509 + }, + { + "epoch": 1.4284914324112443, + "grad_norm": 1.1947314739227295, + "learning_rate": 9.855315789473685e-05, + "loss": 0.4319, + "step": 25510 + }, + { + "epoch": 1.4285474297233733, + "grad_norm": 1.2716975212097168, + "learning_rate": 9.85528947368421e-05, + "loss": 0.4281, + "step": 25511 + }, + { + "epoch": 1.4286034270355024, + "grad_norm": 1.1532578468322754, + "learning_rate": 9.855263157894737e-05, + "loss": 0.5804, + "step": 25512 + }, + { + "epoch": 1.4286594243476314, + "grad_norm": 1.9912092685699463, + "learning_rate": 9.855236842105263e-05, + "loss": 0.5872, + "step": 25513 + }, + { + "epoch": 1.4287154216597604, + "grad_norm": 1.3635388612747192, + "learning_rate": 9.85521052631579e-05, + "loss": 0.4668, + "step": 25514 + }, + { + "epoch": 1.4287714189718894, + "grad_norm": 1.347943902015686, + "learning_rate": 9.855184210526317e-05, + "loss": 0.4532, + "step": 25515 + }, + { + "epoch": 1.4288274162840184, + "grad_norm": 1.3383828401565552, + "learning_rate": 9.855157894736842e-05, + "loss": 0.4915, + "step": 25516 + }, + { + "epoch": 1.4288834135961475, + "grad_norm": 1.4385755062103271, + "learning_rate": 9.855131578947368e-05, + "loss": 0.4985, + "step": 25517 + }, + { + "epoch": 1.4289394109082765, + "grad_norm": 1.3665130138397217, + 
"learning_rate": 9.855105263157896e-05, + "loss": 0.6255, + "step": 25518 + }, + { + "epoch": 1.4289954082204055, + "grad_norm": 1.445753574371338, + "learning_rate": 9.855078947368422e-05, + "loss": 0.3899, + "step": 25519 + }, + { + "epoch": 1.4290514055325345, + "grad_norm": 1.3229690790176392, + "learning_rate": 9.855052631578948e-05, + "loss": 0.3873, + "step": 25520 + }, + { + "epoch": 1.4291074028446635, + "grad_norm": 1.4007450342178345, + "learning_rate": 9.855026315789474e-05, + "loss": 0.4926, + "step": 25521 + }, + { + "epoch": 1.4291634001567926, + "grad_norm": 1.5794925689697266, + "learning_rate": 9.855000000000001e-05, + "loss": 0.486, + "step": 25522 + }, + { + "epoch": 1.4292193974689216, + "grad_norm": 2.4457929134368896, + "learning_rate": 9.854973684210527e-05, + "loss": 0.4961, + "step": 25523 + }, + { + "epoch": 1.4292753947810506, + "grad_norm": 1.2906749248504639, + "learning_rate": 9.854947368421053e-05, + "loss": 0.4901, + "step": 25524 + }, + { + "epoch": 1.4293313920931796, + "grad_norm": 1.2493500709533691, + "learning_rate": 9.854921052631579e-05, + "loss": 0.4229, + "step": 25525 + }, + { + "epoch": 1.4293873894053086, + "grad_norm": 1.722590684890747, + "learning_rate": 9.854894736842106e-05, + "loss": 0.8219, + "step": 25526 + }, + { + "epoch": 1.4294433867174376, + "grad_norm": 1.228349208831787, + "learning_rate": 9.854868421052632e-05, + "loss": 0.4558, + "step": 25527 + }, + { + "epoch": 1.4294993840295667, + "grad_norm": 1.3449047803878784, + "learning_rate": 9.854842105263158e-05, + "loss": 0.5169, + "step": 25528 + }, + { + "epoch": 1.4295553813416957, + "grad_norm": 1.2607173919677734, + "learning_rate": 9.854815789473684e-05, + "loss": 0.4035, + "step": 25529 + }, + { + "epoch": 1.4296113786538247, + "grad_norm": 1.19171941280365, + "learning_rate": 9.85478947368421e-05, + "loss": 0.3917, + "step": 25530 + }, + { + "epoch": 1.4296673759659537, + "grad_norm": 1.2546151876449585, + "learning_rate": 9.854763157894737e-05, + 
"loss": 0.4492, + "step": 25531 + }, + { + "epoch": 1.4297233732780827, + "grad_norm": 1.5277622938156128, + "learning_rate": 9.854736842105263e-05, + "loss": 0.5667, + "step": 25532 + }, + { + "epoch": 1.4297793705902118, + "grad_norm": 1.401011347770691, + "learning_rate": 9.854710526315791e-05, + "loss": 0.4184, + "step": 25533 + }, + { + "epoch": 1.4298353679023408, + "grad_norm": 1.8958184719085693, + "learning_rate": 9.854684210526315e-05, + "loss": 0.497, + "step": 25534 + }, + { + "epoch": 1.4298913652144698, + "grad_norm": 1.5363988876342773, + "learning_rate": 9.854657894736843e-05, + "loss": 0.4567, + "step": 25535 + }, + { + "epoch": 1.4299473625265988, + "grad_norm": 1.2325515747070312, + "learning_rate": 9.854631578947369e-05, + "loss": 0.4315, + "step": 25536 + }, + { + "epoch": 1.4300033598387278, + "grad_norm": 1.4724162817001343, + "learning_rate": 9.854605263157896e-05, + "loss": 0.3764, + "step": 25537 + }, + { + "epoch": 1.4300593571508569, + "grad_norm": 1.2884479761123657, + "learning_rate": 9.854578947368422e-05, + "loss": 0.37, + "step": 25538 + }, + { + "epoch": 1.4301153544629859, + "grad_norm": 1.3686155080795288, + "learning_rate": 9.854552631578948e-05, + "loss": 0.5295, + "step": 25539 + }, + { + "epoch": 1.430171351775115, + "grad_norm": 1.3763350248336792, + "learning_rate": 9.854526315789474e-05, + "loss": 0.5457, + "step": 25540 + }, + { + "epoch": 1.430227349087244, + "grad_norm": 2.0514578819274902, + "learning_rate": 9.854500000000001e-05, + "loss": 0.4575, + "step": 25541 + }, + { + "epoch": 1.430283346399373, + "grad_norm": 1.3097364902496338, + "learning_rate": 9.854473684210527e-05, + "loss": 0.4334, + "step": 25542 + }, + { + "epoch": 1.430339343711502, + "grad_norm": 1.1507169008255005, + "learning_rate": 9.854447368421053e-05, + "loss": 0.5702, + "step": 25543 + }, + { + "epoch": 1.430395341023631, + "grad_norm": 1.8972383737564087, + "learning_rate": 9.854421052631579e-05, + "loss": 0.4563, + "step": 25544 + }, + { + 
"epoch": 1.43045133833576, + "grad_norm": 1.3155434131622314, + "learning_rate": 9.854394736842105e-05, + "loss": 0.5349, + "step": 25545 + }, + { + "epoch": 1.430507335647889, + "grad_norm": 5.150670051574707, + "learning_rate": 9.854368421052632e-05, + "loss": 0.5007, + "step": 25546 + }, + { + "epoch": 1.430563332960018, + "grad_norm": 1.504660725593567, + "learning_rate": 9.854342105263158e-05, + "loss": 0.4913, + "step": 25547 + }, + { + "epoch": 1.430619330272147, + "grad_norm": 1.3363568782806396, + "learning_rate": 9.854315789473684e-05, + "loss": 0.405, + "step": 25548 + }, + { + "epoch": 1.430675327584276, + "grad_norm": 1.1625854969024658, + "learning_rate": 9.85428947368421e-05, + "loss": 0.4709, + "step": 25549 + }, + { + "epoch": 1.430731324896405, + "grad_norm": 1.4491811990737915, + "learning_rate": 9.854263157894738e-05, + "loss": 0.3984, + "step": 25550 + }, + { + "epoch": 1.430787322208534, + "grad_norm": 1.5567790269851685, + "learning_rate": 9.854236842105264e-05, + "loss": 0.4872, + "step": 25551 + }, + { + "epoch": 1.4308433195206631, + "grad_norm": 1.5600478649139404, + "learning_rate": 9.85421052631579e-05, + "loss": 0.4427, + "step": 25552 + }, + { + "epoch": 1.4308993168327921, + "grad_norm": 1.2961442470550537, + "learning_rate": 9.854184210526316e-05, + "loss": 0.3906, + "step": 25553 + }, + { + "epoch": 1.4309553141449212, + "grad_norm": 1.2450376749038696, + "learning_rate": 9.854157894736843e-05, + "loss": 0.5177, + "step": 25554 + }, + { + "epoch": 1.4310113114570502, + "grad_norm": 1.8768779039382935, + "learning_rate": 9.854131578947369e-05, + "loss": 0.4998, + "step": 25555 + }, + { + "epoch": 1.4310673087691792, + "grad_norm": 1.3666603565216064, + "learning_rate": 9.854105263157896e-05, + "loss": 0.4617, + "step": 25556 + }, + { + "epoch": 1.4311233060813082, + "grad_norm": 1.6488827466964722, + "learning_rate": 9.854078947368421e-05, + "loss": 0.6511, + "step": 25557 + }, + { + "epoch": 1.4311793033934372, + "grad_norm": 
1.5491153001785278, + "learning_rate": 9.854052631578948e-05, + "loss": 0.4367, + "step": 25558 + }, + { + "epoch": 1.4312353007055663, + "grad_norm": 1.3581222295761108, + "learning_rate": 9.854026315789474e-05, + "loss": 0.5209, + "step": 25559 + }, + { + "epoch": 1.4312912980176953, + "grad_norm": 1.5273687839508057, + "learning_rate": 9.854000000000002e-05, + "loss": 0.4487, + "step": 25560 + }, + { + "epoch": 1.4313472953298243, + "grad_norm": 1.622373342514038, + "learning_rate": 9.853973684210526e-05, + "loss": 0.4901, + "step": 25561 + }, + { + "epoch": 1.4314032926419533, + "grad_norm": 1.276328682899475, + "learning_rate": 9.853947368421052e-05, + "loss": 0.559, + "step": 25562 + }, + { + "epoch": 1.4314592899540823, + "grad_norm": 1.4135698080062866, + "learning_rate": 9.85392105263158e-05, + "loss": 0.4358, + "step": 25563 + }, + { + "epoch": 1.4315152872662114, + "grad_norm": 1.2505730390548706, + "learning_rate": 9.853894736842105e-05, + "loss": 0.494, + "step": 25564 + }, + { + "epoch": 1.4315712845783404, + "grad_norm": 1.347671389579773, + "learning_rate": 9.853868421052633e-05, + "loss": 0.4177, + "step": 25565 + }, + { + "epoch": 1.4316272818904694, + "grad_norm": 1.7066562175750732, + "learning_rate": 9.853842105263157e-05, + "loss": 0.9907, + "step": 25566 + }, + { + "epoch": 1.4316832792025984, + "grad_norm": 1.3701480627059937, + "learning_rate": 9.853815789473685e-05, + "loss": 0.4696, + "step": 25567 + }, + { + "epoch": 1.4317392765147272, + "grad_norm": 1.3462611436843872, + "learning_rate": 9.85378947368421e-05, + "loss": 0.4758, + "step": 25568 + }, + { + "epoch": 1.4317952738268562, + "grad_norm": 1.2861237525939941, + "learning_rate": 9.853763157894738e-05, + "loss": 0.3977, + "step": 25569 + }, + { + "epoch": 1.4318512711389852, + "grad_norm": 1.3685340881347656, + "learning_rate": 9.853736842105264e-05, + "loss": 0.4813, + "step": 25570 + }, + { + "epoch": 1.4319072684511143, + "grad_norm": 1.4137378931045532, + "learning_rate": 
9.85371052631579e-05, + "loss": 0.5633, + "step": 25571 + }, + { + "epoch": 1.4319632657632433, + "grad_norm": 1.2090106010437012, + "learning_rate": 9.853684210526316e-05, + "loss": 0.4796, + "step": 25572 + }, + { + "epoch": 1.4320192630753723, + "grad_norm": 1.4746901988983154, + "learning_rate": 9.853657894736843e-05, + "loss": 0.4088, + "step": 25573 + }, + { + "epoch": 1.4320752603875013, + "grad_norm": 1.1521053314208984, + "learning_rate": 9.853631578947369e-05, + "loss": 0.3851, + "step": 25574 + }, + { + "epoch": 1.4321312576996303, + "grad_norm": 1.3855780363082886, + "learning_rate": 9.853605263157895e-05, + "loss": 0.4598, + "step": 25575 + }, + { + "epoch": 1.4321872550117594, + "grad_norm": 1.3446911573410034, + "learning_rate": 9.853578947368421e-05, + "loss": 0.4721, + "step": 25576 + }, + { + "epoch": 1.4322432523238884, + "grad_norm": 1.3428047895431519, + "learning_rate": 9.853552631578948e-05, + "loss": 0.5576, + "step": 25577 + }, + { + "epoch": 1.4322992496360174, + "grad_norm": 1.3166146278381348, + "learning_rate": 9.853526315789474e-05, + "loss": 0.4094, + "step": 25578 + }, + { + "epoch": 1.4323552469481464, + "grad_norm": 1.4172027111053467, + "learning_rate": 9.8535e-05, + "loss": 0.4247, + "step": 25579 + }, + { + "epoch": 1.4324112442602754, + "grad_norm": 1.9372472763061523, + "learning_rate": 9.853473684210526e-05, + "loss": 0.4461, + "step": 25580 + }, + { + "epoch": 1.4324672415724045, + "grad_norm": 1.5230001211166382, + "learning_rate": 9.853447368421052e-05, + "loss": 0.4819, + "step": 25581 + }, + { + "epoch": 1.4325232388845335, + "grad_norm": 1.5996944904327393, + "learning_rate": 9.85342105263158e-05, + "loss": 0.5124, + "step": 25582 + }, + { + "epoch": 1.4325792361966625, + "grad_norm": 1.3001734018325806, + "learning_rate": 9.853394736842106e-05, + "loss": 0.4613, + "step": 25583 + }, + { + "epoch": 1.4326352335087915, + "grad_norm": 1.233330249786377, + "learning_rate": 9.853368421052632e-05, + "loss": 0.3952, + "step": 
25584 + }, + { + "epoch": 1.4326912308209205, + "grad_norm": 1.220557451248169, + "learning_rate": 9.853342105263158e-05, + "loss": 0.4315, + "step": 25585 + }, + { + "epoch": 1.4327472281330496, + "grad_norm": 1.838141679763794, + "learning_rate": 9.853315789473685e-05, + "loss": 0.5661, + "step": 25586 + }, + { + "epoch": 1.4328032254451786, + "grad_norm": 1.4783620834350586, + "learning_rate": 9.853289473684211e-05, + "loss": 0.5511, + "step": 25587 + }, + { + "epoch": 1.4328592227573076, + "grad_norm": 1.6853631734848022, + "learning_rate": 9.853263157894738e-05, + "loss": 0.666, + "step": 25588 + }, + { + "epoch": 1.4329152200694366, + "grad_norm": 1.6526273488998413, + "learning_rate": 9.853236842105263e-05, + "loss": 0.4457, + "step": 25589 + }, + { + "epoch": 1.4329712173815656, + "grad_norm": 1.7893047332763672, + "learning_rate": 9.85321052631579e-05, + "loss": 0.6248, + "step": 25590 + }, + { + "epoch": 1.4330272146936947, + "grad_norm": 1.2511688470840454, + "learning_rate": 9.853184210526316e-05, + "loss": 0.4484, + "step": 25591 + }, + { + "epoch": 1.4330832120058237, + "grad_norm": 2.7740895748138428, + "learning_rate": 9.853157894736844e-05, + "loss": 0.3603, + "step": 25592 + }, + { + "epoch": 1.4331392093179527, + "grad_norm": 1.4321290254592896, + "learning_rate": 9.85313157894737e-05, + "loss": 0.4985, + "step": 25593 + }, + { + "epoch": 1.4331952066300817, + "grad_norm": 1.5247721672058105, + "learning_rate": 9.853105263157895e-05, + "loss": 0.6196, + "step": 25594 + }, + { + "epoch": 1.4332512039422107, + "grad_norm": 1.2495554685592651, + "learning_rate": 9.853078947368421e-05, + "loss": 0.4114, + "step": 25595 + }, + { + "epoch": 1.4333072012543397, + "grad_norm": 1.3315740823745728, + "learning_rate": 9.853052631578947e-05, + "loss": 0.4687, + "step": 25596 + }, + { + "epoch": 1.4333631985664688, + "grad_norm": 1.3829666376113892, + "learning_rate": 9.853026315789475e-05, + "loss": 0.4416, + "step": 25597 + }, + { + "epoch": 
1.4334191958785978, + "grad_norm": 1.4854471683502197, + "learning_rate": 9.853e-05, + "loss": 0.5246, + "step": 25598 + }, + { + "epoch": 1.4334751931907268, + "grad_norm": 1.2463632822036743, + "learning_rate": 9.852973684210527e-05, + "loss": 0.438, + "step": 25599 + }, + { + "epoch": 1.4335311905028558, + "grad_norm": 1.2956230640411377, + "learning_rate": 9.852947368421053e-05, + "loss": 0.524, + "step": 25600 + }, + { + "epoch": 1.4335871878149848, + "grad_norm": 1.356801152229309, + "learning_rate": 9.85292105263158e-05, + "loss": 0.4658, + "step": 25601 + }, + { + "epoch": 1.4336431851271139, + "grad_norm": 1.358520746231079, + "learning_rate": 9.852894736842106e-05, + "loss": 0.4039, + "step": 25602 + }, + { + "epoch": 1.4336991824392429, + "grad_norm": 1.3137705326080322, + "learning_rate": 9.852868421052632e-05, + "loss": 0.4378, + "step": 25603 + }, + { + "epoch": 1.433755179751372, + "grad_norm": 1.3565123081207275, + "learning_rate": 9.852842105263158e-05, + "loss": 0.4547, + "step": 25604 + }, + { + "epoch": 1.433811177063501, + "grad_norm": 3.7248382568359375, + "learning_rate": 9.852815789473685e-05, + "loss": 0.6575, + "step": 25605 + }, + { + "epoch": 1.43386717437563, + "grad_norm": 1.2829116582870483, + "learning_rate": 9.852789473684211e-05, + "loss": 0.405, + "step": 25606 + }, + { + "epoch": 1.433923171687759, + "grad_norm": 2.3231306076049805, + "learning_rate": 9.852763157894737e-05, + "loss": 0.6263, + "step": 25607 + }, + { + "epoch": 1.433979168999888, + "grad_norm": 1.3743995428085327, + "learning_rate": 9.852736842105263e-05, + "loss": 0.4791, + "step": 25608 + }, + { + "epoch": 1.434035166312017, + "grad_norm": 1.517982840538025, + "learning_rate": 9.85271052631579e-05, + "loss": 0.5326, + "step": 25609 + }, + { + "epoch": 1.434091163624146, + "grad_norm": 1.328944206237793, + "learning_rate": 9.852684210526316e-05, + "loss": 0.4551, + "step": 25610 + }, + { + "epoch": 1.434147160936275, + "grad_norm": 1.4466986656188965, + 
"learning_rate": 9.852657894736842e-05, + "loss": 0.3798, + "step": 25611 + }, + { + "epoch": 1.434203158248404, + "grad_norm": 1.442425012588501, + "learning_rate": 9.852631578947368e-05, + "loss": 0.478, + "step": 25612 + }, + { + "epoch": 1.434259155560533, + "grad_norm": 1.1874006986618042, + "learning_rate": 9.852605263157894e-05, + "loss": 0.3621, + "step": 25613 + }, + { + "epoch": 1.434315152872662, + "grad_norm": 1.4170418977737427, + "learning_rate": 9.852578947368422e-05, + "loss": 0.4213, + "step": 25614 + }, + { + "epoch": 1.434371150184791, + "grad_norm": 1.4486767053604126, + "learning_rate": 9.852552631578948e-05, + "loss": 0.4419, + "step": 25615 + }, + { + "epoch": 1.4344271474969201, + "grad_norm": 1.645767092704773, + "learning_rate": 9.852526315789474e-05, + "loss": 0.4913, + "step": 25616 + }, + { + "epoch": 1.4344831448090491, + "grad_norm": 1.2231667041778564, + "learning_rate": 9.8525e-05, + "loss": 0.4702, + "step": 25617 + }, + { + "epoch": 1.4345391421211782, + "grad_norm": 1.3708308935165405, + "learning_rate": 9.852473684210527e-05, + "loss": 0.4443, + "step": 25618 + }, + { + "epoch": 1.4345951394333072, + "grad_norm": 1.3079771995544434, + "learning_rate": 9.852447368421053e-05, + "loss": 0.4473, + "step": 25619 + }, + { + "epoch": 1.4346511367454362, + "grad_norm": 1.2420459985733032, + "learning_rate": 9.85242105263158e-05, + "loss": 0.4299, + "step": 25620 + }, + { + "epoch": 1.4347071340575652, + "grad_norm": 1.3548718690872192, + "learning_rate": 9.852394736842105e-05, + "loss": 0.3791, + "step": 25621 + }, + { + "epoch": 1.4347631313696942, + "grad_norm": 1.2706632614135742, + "learning_rate": 9.852368421052632e-05, + "loss": 0.4779, + "step": 25622 + }, + { + "epoch": 1.4348191286818233, + "grad_norm": 1.0939626693725586, + "learning_rate": 9.852342105263158e-05, + "loss": 0.4237, + "step": 25623 + }, + { + "epoch": 1.4348751259939523, + "grad_norm": 1.3191313743591309, + "learning_rate": 9.852315789473685e-05, + "loss": 
0.4263, + "step": 25624 + }, + { + "epoch": 1.4349311233060813, + "grad_norm": 1.2442973852157593, + "learning_rate": 9.852289473684211e-05, + "loss": 0.3207, + "step": 25625 + }, + { + "epoch": 1.4349871206182103, + "grad_norm": 1.3126877546310425, + "learning_rate": 9.852263157894737e-05, + "loss": 0.4555, + "step": 25626 + }, + { + "epoch": 1.4350431179303393, + "grad_norm": 1.4616296291351318, + "learning_rate": 9.852236842105263e-05, + "loss": 0.3951, + "step": 25627 + }, + { + "epoch": 1.4350991152424684, + "grad_norm": 1.2831814289093018, + "learning_rate": 9.852210526315791e-05, + "loss": 0.423, + "step": 25628 + }, + { + "epoch": 1.4351551125545974, + "grad_norm": 1.1951026916503906, + "learning_rate": 9.852184210526317e-05, + "loss": 0.3939, + "step": 25629 + }, + { + "epoch": 1.4352111098667264, + "grad_norm": 1.3095088005065918, + "learning_rate": 9.852157894736843e-05, + "loss": 0.5734, + "step": 25630 + }, + { + "epoch": 1.4352671071788554, + "grad_norm": 1.3465334177017212, + "learning_rate": 9.852131578947369e-05, + "loss": 0.4808, + "step": 25631 + }, + { + "epoch": 1.4353231044909844, + "grad_norm": 1.1294889450073242, + "learning_rate": 9.852105263157895e-05, + "loss": 0.5764, + "step": 25632 + }, + { + "epoch": 1.4353791018031135, + "grad_norm": 1.4509919881820679, + "learning_rate": 9.852078947368422e-05, + "loss": 0.543, + "step": 25633 + }, + { + "epoch": 1.4354350991152425, + "grad_norm": 1.4717315435409546, + "learning_rate": 9.852052631578948e-05, + "loss": 0.5028, + "step": 25634 + }, + { + "epoch": 1.4354910964273715, + "grad_norm": 1.5489089488983154, + "learning_rate": 9.852026315789474e-05, + "loss": 0.6137, + "step": 25635 + }, + { + "epoch": 1.4355470937395005, + "grad_norm": 1.3831948041915894, + "learning_rate": 9.852e-05, + "loss": 0.3883, + "step": 25636 + }, + { + "epoch": 1.4356030910516295, + "grad_norm": 1.2413886785507202, + "learning_rate": 9.851973684210527e-05, + "loss": 0.5447, + "step": 25637 + }, + { + "epoch": 
1.4356590883637586, + "grad_norm": 1.3069621324539185, + "learning_rate": 9.851947368421053e-05, + "loss": 0.469, + "step": 25638 + }, + { + "epoch": 1.4357150856758876, + "grad_norm": 1.530605435371399, + "learning_rate": 9.851921052631579e-05, + "loss": 0.5449, + "step": 25639 + }, + { + "epoch": 1.4357710829880166, + "grad_norm": 1.1206198930740356, + "learning_rate": 9.851894736842105e-05, + "loss": 0.41, + "step": 25640 + }, + { + "epoch": 1.4358270803001456, + "grad_norm": 1.5072031021118164, + "learning_rate": 9.851868421052632e-05, + "loss": 0.4225, + "step": 25641 + }, + { + "epoch": 1.4358830776122746, + "grad_norm": 1.2919793128967285, + "learning_rate": 9.851842105263158e-05, + "loss": 0.4009, + "step": 25642 + }, + { + "epoch": 1.4359390749244036, + "grad_norm": 1.3727933168411255, + "learning_rate": 9.851815789473686e-05, + "loss": 0.4062, + "step": 25643 + }, + { + "epoch": 1.4359950722365327, + "grad_norm": 1.3184592723846436, + "learning_rate": 9.85178947368421e-05, + "loss": 0.5137, + "step": 25644 + }, + { + "epoch": 1.4360510695486617, + "grad_norm": 1.1559818983078003, + "learning_rate": 9.851763157894738e-05, + "loss": 0.4556, + "step": 25645 + }, + { + "epoch": 1.4361070668607907, + "grad_norm": 1.5235875844955444, + "learning_rate": 9.851736842105264e-05, + "loss": 0.459, + "step": 25646 + }, + { + "epoch": 1.4361630641729197, + "grad_norm": 1.1688374280929565, + "learning_rate": 9.851710526315791e-05, + "loss": 0.3956, + "step": 25647 + }, + { + "epoch": 1.4362190614850487, + "grad_norm": 1.375508427619934, + "learning_rate": 9.851684210526317e-05, + "loss": 0.43, + "step": 25648 + }, + { + "epoch": 1.4362750587971778, + "grad_norm": 1.9250904321670532, + "learning_rate": 9.851657894736842e-05, + "loss": 0.4892, + "step": 25649 + }, + { + "epoch": 1.4363310561093068, + "grad_norm": 1.5735348463058472, + "learning_rate": 9.851631578947369e-05, + "loss": 0.6569, + "step": 25650 + }, + { + "epoch": 1.4363870534214358, + "grad_norm": 
1.421795129776001, + "learning_rate": 9.851605263157895e-05, + "loss": 0.4173, + "step": 25651 + }, + { + "epoch": 1.4364430507335648, + "grad_norm": 1.58512282371521, + "learning_rate": 9.851578947368422e-05, + "loss": 0.393, + "step": 25652 + }, + { + "epoch": 1.4364990480456938, + "grad_norm": 1.1600414514541626, + "learning_rate": 9.851552631578947e-05, + "loss": 0.4621, + "step": 25653 + }, + { + "epoch": 1.4365550453578229, + "grad_norm": 1.1983907222747803, + "learning_rate": 9.851526315789474e-05, + "loss": 0.4011, + "step": 25654 + }, + { + "epoch": 1.4366110426699519, + "grad_norm": 1.5610764026641846, + "learning_rate": 9.8515e-05, + "loss": 0.6253, + "step": 25655 + }, + { + "epoch": 1.436667039982081, + "grad_norm": 1.3131215572357178, + "learning_rate": 9.851473684210527e-05, + "loss": 0.4615, + "step": 25656 + }, + { + "epoch": 1.43672303729421, + "grad_norm": 1.3105984926223755, + "learning_rate": 9.851447368421053e-05, + "loss": 0.4597, + "step": 25657 + }, + { + "epoch": 1.436779034606339, + "grad_norm": 1.5818188190460205, + "learning_rate": 9.85142105263158e-05, + "loss": 0.4973, + "step": 25658 + }, + { + "epoch": 1.436835031918468, + "grad_norm": 1.244718313217163, + "learning_rate": 9.851394736842105e-05, + "loss": 0.4456, + "step": 25659 + }, + { + "epoch": 1.436891029230597, + "grad_norm": 1.3325386047363281, + "learning_rate": 9.851368421052633e-05, + "loss": 0.4782, + "step": 25660 + }, + { + "epoch": 1.436947026542726, + "grad_norm": 1.2712047100067139, + "learning_rate": 9.851342105263159e-05, + "loss": 0.4101, + "step": 25661 + }, + { + "epoch": 1.437003023854855, + "grad_norm": 1.068215012550354, + "learning_rate": 9.851315789473685e-05, + "loss": 0.343, + "step": 25662 + }, + { + "epoch": 1.437059021166984, + "grad_norm": 1.4893957376480103, + "learning_rate": 9.85128947368421e-05, + "loss": 0.5978, + "step": 25663 + }, + { + "epoch": 1.437115018479113, + "grad_norm": 1.3513212203979492, + "learning_rate": 9.851263157894738e-05, + 
"loss": 0.4153, + "step": 25664 + }, + { + "epoch": 1.437171015791242, + "grad_norm": 1.544857382774353, + "learning_rate": 9.851236842105264e-05, + "loss": 0.5701, + "step": 25665 + }, + { + "epoch": 1.437227013103371, + "grad_norm": 1.2158294916152954, + "learning_rate": 9.85121052631579e-05, + "loss": 0.3734, + "step": 25666 + }, + { + "epoch": 1.4372830104155, + "grad_norm": 2.1777915954589844, + "learning_rate": 9.851184210526316e-05, + "loss": 0.7021, + "step": 25667 + }, + { + "epoch": 1.4373390077276291, + "grad_norm": 4.20610237121582, + "learning_rate": 9.851157894736842e-05, + "loss": 0.5588, + "step": 25668 + }, + { + "epoch": 1.4373950050397581, + "grad_norm": 1.689322829246521, + "learning_rate": 9.851131578947369e-05, + "loss": 0.4976, + "step": 25669 + }, + { + "epoch": 1.4374510023518872, + "grad_norm": 1.2596203088760376, + "learning_rate": 9.851105263157895e-05, + "loss": 0.4639, + "step": 25670 + }, + { + "epoch": 1.4375069996640162, + "grad_norm": 1.1975135803222656, + "learning_rate": 9.851078947368421e-05, + "loss": 0.4729, + "step": 25671 + }, + { + "epoch": 1.4375629969761452, + "grad_norm": 1.586650013923645, + "learning_rate": 9.851052631578947e-05, + "loss": 0.6526, + "step": 25672 + }, + { + "epoch": 1.4376189942882742, + "grad_norm": 1.5360716581344604, + "learning_rate": 9.851026315789474e-05, + "loss": 0.4654, + "step": 25673 + }, + { + "epoch": 1.4376749916004032, + "grad_norm": 1.3076252937316895, + "learning_rate": 9.851e-05, + "loss": 0.4586, + "step": 25674 + }, + { + "epoch": 1.4377309889125323, + "grad_norm": 1.2574946880340576, + "learning_rate": 9.850973684210528e-05, + "loss": 0.394, + "step": 25675 + }, + { + "epoch": 1.4377869862246613, + "grad_norm": 1.2691985368728638, + "learning_rate": 9.850947368421052e-05, + "loss": 0.4306, + "step": 25676 + }, + { + "epoch": 1.4378429835367903, + "grad_norm": 1.185869812965393, + "learning_rate": 9.85092105263158e-05, + "loss": 0.3659, + "step": 25677 + }, + { + "epoch": 
1.4378989808489193, + "grad_norm": 1.2608500719070435, + "learning_rate": 9.850894736842106e-05, + "loss": 0.3736, + "step": 25678 + }, + { + "epoch": 1.4379549781610483, + "grad_norm": 1.475943922996521, + "learning_rate": 9.850868421052633e-05, + "loss": 0.5287, + "step": 25679 + }, + { + "epoch": 1.4380109754731774, + "grad_norm": 1.4908572435379028, + "learning_rate": 9.850842105263159e-05, + "loss": 0.5296, + "step": 25680 + }, + { + "epoch": 1.4380669727853064, + "grad_norm": 1.2638475894927979, + "learning_rate": 9.850815789473685e-05, + "loss": 0.4952, + "step": 25681 + }, + { + "epoch": 1.4381229700974354, + "grad_norm": 1.2012163400650024, + "learning_rate": 9.850789473684211e-05, + "loss": 0.5255, + "step": 25682 + }, + { + "epoch": 1.4381789674095644, + "grad_norm": 2.8469507694244385, + "learning_rate": 9.850763157894737e-05, + "loss": 0.523, + "step": 25683 + }, + { + "epoch": 1.4382349647216934, + "grad_norm": 1.1035187244415283, + "learning_rate": 9.850736842105264e-05, + "loss": 0.3706, + "step": 25684 + }, + { + "epoch": 1.4382909620338225, + "grad_norm": 1.0298693180084229, + "learning_rate": 9.85071052631579e-05, + "loss": 0.3495, + "step": 25685 + }, + { + "epoch": 1.4383469593459515, + "grad_norm": 1.5727527141571045, + "learning_rate": 9.850684210526316e-05, + "loss": 0.6739, + "step": 25686 + }, + { + "epoch": 1.4384029566580805, + "grad_norm": 1.750372052192688, + "learning_rate": 9.850657894736842e-05, + "loss": 0.6113, + "step": 25687 + }, + { + "epoch": 1.4384589539702095, + "grad_norm": 1.5725183486938477, + "learning_rate": 9.85063157894737e-05, + "loss": 0.5526, + "step": 25688 + }, + { + "epoch": 1.4385149512823385, + "grad_norm": 1.3974330425262451, + "learning_rate": 9.850605263157895e-05, + "loss": 0.4032, + "step": 25689 + }, + { + "epoch": 1.4385709485944675, + "grad_norm": 1.3261353969573975, + "learning_rate": 9.850578947368421e-05, + "loss": 0.4921, + "step": 25690 + }, + { + "epoch": 1.4386269459065966, + "grad_norm": 
1.3843958377838135, + "learning_rate": 9.850552631578947e-05, + "loss": 0.4347, + "step": 25691 + }, + { + "epoch": 1.4386829432187254, + "grad_norm": 1.2524871826171875, + "learning_rate": 9.850526315789475e-05, + "loss": 0.4132, + "step": 25692 + }, + { + "epoch": 1.4387389405308544, + "grad_norm": 1.102995753288269, + "learning_rate": 9.8505e-05, + "loss": 0.3919, + "step": 25693 + }, + { + "epoch": 1.4387949378429834, + "grad_norm": 1.183918833732605, + "learning_rate": 9.850473684210527e-05, + "loss": 0.5171, + "step": 25694 + }, + { + "epoch": 1.4388509351551124, + "grad_norm": 1.4520643949508667, + "learning_rate": 9.850447368421053e-05, + "loss": 0.4224, + "step": 25695 + }, + { + "epoch": 1.4389069324672414, + "grad_norm": 1.2692177295684814, + "learning_rate": 9.85042105263158e-05, + "loss": 0.4775, + "step": 25696 + }, + { + "epoch": 1.4389629297793705, + "grad_norm": 1.3434689044952393, + "learning_rate": 9.850394736842106e-05, + "loss": 0.4722, + "step": 25697 + }, + { + "epoch": 1.4390189270914995, + "grad_norm": 1.5888018608093262, + "learning_rate": 9.850368421052633e-05, + "loss": 0.6064, + "step": 25698 + }, + { + "epoch": 1.4390749244036285, + "grad_norm": 1.286224603652954, + "learning_rate": 9.850342105263158e-05, + "loss": 0.4078, + "step": 25699 + }, + { + "epoch": 1.4391309217157575, + "grad_norm": 1.6147180795669556, + "learning_rate": 9.850315789473684e-05, + "loss": 0.6666, + "step": 25700 + }, + { + "epoch": 1.4391869190278865, + "grad_norm": 1.2030236721038818, + "learning_rate": 9.850289473684211e-05, + "loss": 0.421, + "step": 25701 + }, + { + "epoch": 1.4392429163400156, + "grad_norm": 1.427893042564392, + "learning_rate": 9.850263157894737e-05, + "loss": 0.4825, + "step": 25702 + }, + { + "epoch": 1.4392989136521446, + "grad_norm": 1.3892394304275513, + "learning_rate": 9.850236842105264e-05, + "loss": 0.6434, + "step": 25703 + }, + { + "epoch": 1.4393549109642736, + "grad_norm": 1.3688900470733643, + "learning_rate": 
9.850210526315789e-05, + "loss": 0.5591, + "step": 25704 + }, + { + "epoch": 1.4394109082764026, + "grad_norm": 1.286647081375122, + "learning_rate": 9.850184210526316e-05, + "loss": 0.5014, + "step": 25705 + }, + { + "epoch": 1.4394669055885316, + "grad_norm": 1.2540347576141357, + "learning_rate": 9.850157894736842e-05, + "loss": 0.4826, + "step": 25706 + }, + { + "epoch": 1.4395229029006607, + "grad_norm": 1.377650260925293, + "learning_rate": 9.85013157894737e-05, + "loss": 0.4497, + "step": 25707 + }, + { + "epoch": 1.4395789002127897, + "grad_norm": 1.3599501848220825, + "learning_rate": 9.850105263157894e-05, + "loss": 0.4194, + "step": 25708 + }, + { + "epoch": 1.4396348975249187, + "grad_norm": 1.356694221496582, + "learning_rate": 9.850078947368422e-05, + "loss": 0.3896, + "step": 25709 + }, + { + "epoch": 1.4396908948370477, + "grad_norm": 1.2018773555755615, + "learning_rate": 9.850052631578948e-05, + "loss": 0.4454, + "step": 25710 + }, + { + "epoch": 1.4397468921491767, + "grad_norm": 1.5941123962402344, + "learning_rate": 9.850026315789475e-05, + "loss": 0.5182, + "step": 25711 + }, + { + "epoch": 1.4398028894613057, + "grad_norm": 1.4879815578460693, + "learning_rate": 9.850000000000001e-05, + "loss": 0.4818, + "step": 25712 + }, + { + "epoch": 1.4398588867734348, + "grad_norm": 1.2622997760772705, + "learning_rate": 9.849973684210527e-05, + "loss": 0.4667, + "step": 25713 + }, + { + "epoch": 1.4399148840855638, + "grad_norm": 1.4573948383331299, + "learning_rate": 9.849947368421053e-05, + "loss": 0.4054, + "step": 25714 + }, + { + "epoch": 1.4399708813976928, + "grad_norm": 1.443192720413208, + "learning_rate": 9.84992105263158e-05, + "loss": 0.5356, + "step": 25715 + }, + { + "epoch": 1.4400268787098218, + "grad_norm": 1.1048190593719482, + "learning_rate": 9.849894736842106e-05, + "loss": 0.3802, + "step": 25716 + }, + { + "epoch": 1.4400828760219508, + "grad_norm": 1.1638503074645996, + "learning_rate": 9.849868421052632e-05, + "loss": 0.4394, + 
"step": 25717 + }, + { + "epoch": 1.4401388733340799, + "grad_norm": 1.3394076824188232, + "learning_rate": 9.849842105263158e-05, + "loss": 0.5312, + "step": 25718 + }, + { + "epoch": 1.4401948706462089, + "grad_norm": 1.2208266258239746, + "learning_rate": 9.849815789473684e-05, + "loss": 0.3839, + "step": 25719 + }, + { + "epoch": 1.440250867958338, + "grad_norm": 1.4026274681091309, + "learning_rate": 9.849789473684211e-05, + "loss": 0.4958, + "step": 25720 + }, + { + "epoch": 1.440306865270467, + "grad_norm": 1.4235761165618896, + "learning_rate": 9.849763157894737e-05, + "loss": 0.5339, + "step": 25721 + }, + { + "epoch": 1.440362862582596, + "grad_norm": 1.5458778142929077, + "learning_rate": 9.849736842105263e-05, + "loss": 0.5653, + "step": 25722 + }, + { + "epoch": 1.440418859894725, + "grad_norm": 7.980746746063232, + "learning_rate": 9.849710526315789e-05, + "loss": 0.4956, + "step": 25723 + }, + { + "epoch": 1.440474857206854, + "grad_norm": 1.320092797279358, + "learning_rate": 9.849684210526317e-05, + "loss": 0.5172, + "step": 25724 + }, + { + "epoch": 1.440530854518983, + "grad_norm": 1.689001441001892, + "learning_rate": 9.849657894736843e-05, + "loss": 0.5753, + "step": 25725 + }, + { + "epoch": 1.440586851831112, + "grad_norm": 1.5204671621322632, + "learning_rate": 9.849631578947369e-05, + "loss": 0.5524, + "step": 25726 + }, + { + "epoch": 1.440642849143241, + "grad_norm": 1.219130039215088, + "learning_rate": 9.849605263157895e-05, + "loss": 0.4574, + "step": 25727 + }, + { + "epoch": 1.44069884645537, + "grad_norm": 1.1669539213180542, + "learning_rate": 9.849578947368422e-05, + "loss": 0.5025, + "step": 25728 + }, + { + "epoch": 1.440754843767499, + "grad_norm": 1.4158577919006348, + "learning_rate": 9.849552631578948e-05, + "loss": 0.4848, + "step": 25729 + }, + { + "epoch": 1.440810841079628, + "grad_norm": 1.3593709468841553, + "learning_rate": 9.849526315789475e-05, + "loss": 0.4785, + "step": 25730 + }, + { + "epoch": 1.440866838391757, 
+ "grad_norm": 1.3200407028198242, + "learning_rate": 9.8495e-05, + "loss": 0.3671, + "step": 25731 + }, + { + "epoch": 1.4409228357038861, + "grad_norm": 2.0137245655059814, + "learning_rate": 9.849473684210527e-05, + "loss": 0.5586, + "step": 25732 + }, + { + "epoch": 1.4409788330160151, + "grad_norm": 1.4319556951522827, + "learning_rate": 9.849447368421053e-05, + "loss": 0.4586, + "step": 25733 + }, + { + "epoch": 1.4410348303281442, + "grad_norm": 1.4195220470428467, + "learning_rate": 9.849421052631579e-05, + "loss": 0.5305, + "step": 25734 + }, + { + "epoch": 1.4410908276402732, + "grad_norm": 1.761801838874817, + "learning_rate": 9.849394736842106e-05, + "loss": 0.6981, + "step": 25735 + }, + { + "epoch": 1.4411468249524022, + "grad_norm": 1.3044332265853882, + "learning_rate": 9.849368421052631e-05, + "loss": 0.4531, + "step": 25736 + }, + { + "epoch": 1.4412028222645312, + "grad_norm": 1.449478030204773, + "learning_rate": 9.849342105263158e-05, + "loss": 0.5119, + "step": 25737 + }, + { + "epoch": 1.4412588195766602, + "grad_norm": 1.167271375656128, + "learning_rate": 9.849315789473684e-05, + "loss": 0.4048, + "step": 25738 + }, + { + "epoch": 1.4413148168887893, + "grad_norm": 1.1470009088516235, + "learning_rate": 9.849289473684212e-05, + "loss": 0.4402, + "step": 25739 + }, + { + "epoch": 1.4413708142009183, + "grad_norm": 1.1541756391525269, + "learning_rate": 9.849263157894738e-05, + "loss": 0.4063, + "step": 25740 + }, + { + "epoch": 1.4414268115130473, + "grad_norm": 1.43264901638031, + "learning_rate": 9.849236842105264e-05, + "loss": 0.4164, + "step": 25741 + }, + { + "epoch": 1.4414828088251763, + "grad_norm": 1.517122745513916, + "learning_rate": 9.84921052631579e-05, + "loss": 0.6524, + "step": 25742 + }, + { + "epoch": 1.4415388061373053, + "grad_norm": 1.531346082687378, + "learning_rate": 9.849184210526317e-05, + "loss": 0.4592, + "step": 25743 + }, + { + "epoch": 1.4415948034494344, + "grad_norm": 1.2700045108795166, + "learning_rate": 
9.849157894736843e-05, + "loss": 0.4308, + "step": 25744 + }, + { + "epoch": 1.4416508007615634, + "grad_norm": 1.3897801637649536, + "learning_rate": 9.849131578947369e-05, + "loss": 0.5314, + "step": 25745 + }, + { + "epoch": 1.4417067980736924, + "grad_norm": 1.5570229291915894, + "learning_rate": 9.849105263157895e-05, + "loss": 0.4738, + "step": 25746 + }, + { + "epoch": 1.4417627953858214, + "grad_norm": 1.3779877424240112, + "learning_rate": 9.849078947368422e-05, + "loss": 0.6156, + "step": 25747 + }, + { + "epoch": 1.4418187926979504, + "grad_norm": 1.4035085439682007, + "learning_rate": 9.849052631578948e-05, + "loss": 0.4309, + "step": 25748 + }, + { + "epoch": 1.4418747900100795, + "grad_norm": 1.1910206079483032, + "learning_rate": 9.849026315789474e-05, + "loss": 0.4711, + "step": 25749 + }, + { + "epoch": 1.4419307873222085, + "grad_norm": 1.4764854907989502, + "learning_rate": 9.849e-05, + "loss": 0.4317, + "step": 25750 + }, + { + "epoch": 1.4419867846343375, + "grad_norm": 1.3121896982192993, + "learning_rate": 9.848973684210526e-05, + "loss": 0.4307, + "step": 25751 + }, + { + "epoch": 1.4420427819464665, + "grad_norm": 1.616694450378418, + "learning_rate": 9.848947368421053e-05, + "loss": 0.7213, + "step": 25752 + }, + { + "epoch": 1.4420987792585955, + "grad_norm": 1.1158812046051025, + "learning_rate": 9.84892105263158e-05, + "loss": 0.3822, + "step": 25753 + }, + { + "epoch": 1.4421547765707246, + "grad_norm": 1.32943594455719, + "learning_rate": 9.848894736842105e-05, + "loss": 0.4025, + "step": 25754 + }, + { + "epoch": 1.4422107738828536, + "grad_norm": 1.6812101602554321, + "learning_rate": 9.848868421052631e-05, + "loss": 0.557, + "step": 25755 + }, + { + "epoch": 1.4422667711949826, + "grad_norm": 1.5972145795822144, + "learning_rate": 9.848842105263159e-05, + "loss": 0.4719, + "step": 25756 + }, + { + "epoch": 1.4423227685071116, + "grad_norm": 1.3968019485473633, + "learning_rate": 9.848815789473685e-05, + "loss": 0.4173, + "step": 
25757 + }, + { + "epoch": 1.4423787658192406, + "grad_norm": 1.6362229585647583, + "learning_rate": 9.848789473684212e-05, + "loss": 0.5076, + "step": 25758 + }, + { + "epoch": 1.4424347631313696, + "grad_norm": 1.2163110971450806, + "learning_rate": 9.848763157894737e-05, + "loss": 0.4992, + "step": 25759 + }, + { + "epoch": 1.4424907604434987, + "grad_norm": 1.2565122842788696, + "learning_rate": 9.848736842105264e-05, + "loss": 0.4421, + "step": 25760 + }, + { + "epoch": 1.4425467577556277, + "grad_norm": 1.3888131380081177, + "learning_rate": 9.84871052631579e-05, + "loss": 0.5243, + "step": 25761 + }, + { + "epoch": 1.4426027550677567, + "grad_norm": 1.3142640590667725, + "learning_rate": 9.848684210526317e-05, + "loss": 0.3618, + "step": 25762 + }, + { + "epoch": 1.4426587523798857, + "grad_norm": 1.2778666019439697, + "learning_rate": 9.848657894736842e-05, + "loss": 0.4543, + "step": 25763 + }, + { + "epoch": 1.4427147496920147, + "grad_norm": 1.3430832624435425, + "learning_rate": 9.848631578947369e-05, + "loss": 0.5383, + "step": 25764 + }, + { + "epoch": 1.4427707470041438, + "grad_norm": 1.625623106956482, + "learning_rate": 9.848605263157895e-05, + "loss": 0.5379, + "step": 25765 + }, + { + "epoch": 1.4428267443162728, + "grad_norm": 1.27606999874115, + "learning_rate": 9.848578947368422e-05, + "loss": 0.3628, + "step": 25766 + }, + { + "epoch": 1.4428827416284018, + "grad_norm": 1.6553044319152832, + "learning_rate": 9.848552631578948e-05, + "loss": 0.7632, + "step": 25767 + }, + { + "epoch": 1.4429387389405308, + "grad_norm": 1.6357741355895996, + "learning_rate": 9.848526315789474e-05, + "loss": 0.5964, + "step": 25768 + }, + { + "epoch": 1.4429947362526598, + "grad_norm": 1.7604966163635254, + "learning_rate": 9.8485e-05, + "loss": 0.7096, + "step": 25769 + }, + { + "epoch": 1.4430507335647889, + "grad_norm": 1.5702502727508545, + "learning_rate": 9.848473684210526e-05, + "loss": 0.4116, + "step": 25770 + }, + { + "epoch": 1.4431067308769179, + 
"grad_norm": 1.343562364578247, + "learning_rate": 9.848447368421054e-05, + "loss": 0.427, + "step": 25771 + }, + { + "epoch": 1.443162728189047, + "grad_norm": 1.2983551025390625, + "learning_rate": 9.84842105263158e-05, + "loss": 0.38, + "step": 25772 + }, + { + "epoch": 1.443218725501176, + "grad_norm": 1.2825380563735962, + "learning_rate": 9.848394736842106e-05, + "loss": 0.4135, + "step": 25773 + }, + { + "epoch": 1.443274722813305, + "grad_norm": 1.3777891397476196, + "learning_rate": 9.848368421052632e-05, + "loss": 0.3667, + "step": 25774 + }, + { + "epoch": 1.443330720125434, + "grad_norm": 1.2664874792099, + "learning_rate": 9.848342105263159e-05, + "loss": 0.4057, + "step": 25775 + }, + { + "epoch": 1.443386717437563, + "grad_norm": 1.5738770961761475, + "learning_rate": 9.848315789473685e-05, + "loss": 0.511, + "step": 25776 + }, + { + "epoch": 1.443442714749692, + "grad_norm": 1.3449558019638062, + "learning_rate": 9.848289473684211e-05, + "loss": 0.5309, + "step": 25777 + }, + { + "epoch": 1.443498712061821, + "grad_norm": 1.1180827617645264, + "learning_rate": 9.848263157894737e-05, + "loss": 0.4378, + "step": 25778 + }, + { + "epoch": 1.44355470937395, + "grad_norm": 1.8353078365325928, + "learning_rate": 9.848236842105264e-05, + "loss": 0.3912, + "step": 25779 + }, + { + "epoch": 1.443610706686079, + "grad_norm": 2.383636951446533, + "learning_rate": 9.84821052631579e-05, + "loss": 0.6155, + "step": 25780 + }, + { + "epoch": 1.443666703998208, + "grad_norm": 1.5396950244903564, + "learning_rate": 9.848184210526316e-05, + "loss": 0.4052, + "step": 25781 + }, + { + "epoch": 1.443722701310337, + "grad_norm": 34.623844146728516, + "learning_rate": 9.848157894736842e-05, + "loss": 0.4543, + "step": 25782 + }, + { + "epoch": 1.443778698622466, + "grad_norm": 1.4119480848312378, + "learning_rate": 9.84813157894737e-05, + "loss": 0.7073, + "step": 25783 + }, + { + "epoch": 1.4438346959345951, + "grad_norm": 1.2975821495056152, + "learning_rate": 
9.848105263157895e-05, + "loss": 0.4418, + "step": 25784 + }, + { + "epoch": 1.4438906932467241, + "grad_norm": 1.4934360980987549, + "learning_rate": 9.848078947368423e-05, + "loss": 0.4745, + "step": 25785 + }, + { + "epoch": 1.4439466905588532, + "grad_norm": 1.6379145383834839, + "learning_rate": 9.848052631578947e-05, + "loss": 0.4778, + "step": 25786 + }, + { + "epoch": 1.4440026878709822, + "grad_norm": 1.3537036180496216, + "learning_rate": 9.848026315789473e-05, + "loss": 0.4766, + "step": 25787 + }, + { + "epoch": 1.4440586851831112, + "grad_norm": 1.351130723953247, + "learning_rate": 9.848e-05, + "loss": 0.4779, + "step": 25788 + }, + { + "epoch": 1.4441146824952402, + "grad_norm": 1.3500784635543823, + "learning_rate": 9.847973684210527e-05, + "loss": 0.4242, + "step": 25789 + }, + { + "epoch": 1.4441706798073692, + "grad_norm": 1.1697924137115479, + "learning_rate": 9.847947368421054e-05, + "loss": 0.4517, + "step": 25790 + }, + { + "epoch": 1.4442266771194983, + "grad_norm": 1.5843197107315063, + "learning_rate": 9.847921052631578e-05, + "loss": 0.3941, + "step": 25791 + }, + { + "epoch": 1.4442826744316273, + "grad_norm": 1.3092375993728638, + "learning_rate": 9.847894736842106e-05, + "loss": 0.5552, + "step": 25792 + }, + { + "epoch": 1.4443386717437563, + "grad_norm": 1.3153843879699707, + "learning_rate": 9.847868421052632e-05, + "loss": 0.3795, + "step": 25793 + }, + { + "epoch": 1.4443946690558853, + "grad_norm": 1.3666855096817017, + "learning_rate": 9.847842105263159e-05, + "loss": 0.5007, + "step": 25794 + }, + { + "epoch": 1.4444506663680143, + "grad_norm": 1.3877649307250977, + "learning_rate": 9.847815789473685e-05, + "loss": 0.5185, + "step": 25795 + }, + { + "epoch": 1.4445066636801434, + "grad_norm": 3.332524299621582, + "learning_rate": 9.847789473684211e-05, + "loss": 0.4432, + "step": 25796 + }, + { + "epoch": 1.4445626609922724, + "grad_norm": 1.4763113260269165, + "learning_rate": 9.847763157894737e-05, + "loss": 0.6981, + "step": 
25797 + }, + { + "epoch": 1.4446186583044014, + "grad_norm": 1.2583116292953491, + "learning_rate": 9.847736842105264e-05, + "loss": 0.4298, + "step": 25798 + }, + { + "epoch": 1.4446746556165304, + "grad_norm": 1.458219289779663, + "learning_rate": 9.84771052631579e-05, + "loss": 0.5209, + "step": 25799 + }, + { + "epoch": 1.4447306529286594, + "grad_norm": 1.372975468635559, + "learning_rate": 9.847684210526316e-05, + "loss": 0.5491, + "step": 25800 + }, + { + "epoch": 1.4447866502407885, + "grad_norm": 1.1941468715667725, + "learning_rate": 9.847657894736842e-05, + "loss": 0.3895, + "step": 25801 + }, + { + "epoch": 1.4448426475529175, + "grad_norm": 1.404276728630066, + "learning_rate": 9.84763157894737e-05, + "loss": 0.4141, + "step": 25802 + }, + { + "epoch": 1.4448986448650465, + "grad_norm": 1.2524129152297974, + "learning_rate": 9.847605263157896e-05, + "loss": 0.5221, + "step": 25803 + }, + { + "epoch": 1.4449546421771755, + "grad_norm": 1.4511653184890747, + "learning_rate": 9.847578947368422e-05, + "loss": 0.5579, + "step": 25804 + }, + { + "epoch": 1.4450106394893045, + "grad_norm": 1.099122405052185, + "learning_rate": 9.847552631578948e-05, + "loss": 0.3619, + "step": 25805 + }, + { + "epoch": 1.4450666368014335, + "grad_norm": 1.1699548959732056, + "learning_rate": 9.847526315789474e-05, + "loss": 0.3935, + "step": 25806 + }, + { + "epoch": 1.4451226341135626, + "grad_norm": 1.9812945127487183, + "learning_rate": 9.847500000000001e-05, + "loss": 0.4948, + "step": 25807 + }, + { + "epoch": 1.4451786314256916, + "grad_norm": 1.3612101078033447, + "learning_rate": 9.847473684210527e-05, + "loss": 0.5559, + "step": 25808 + }, + { + "epoch": 1.4452346287378206, + "grad_norm": 1.381105899810791, + "learning_rate": 9.847447368421053e-05, + "loss": 0.323, + "step": 25809 + }, + { + "epoch": 1.4452906260499496, + "grad_norm": 1.4454028606414795, + "learning_rate": 9.847421052631579e-05, + "loss": 0.5805, + "step": 25810 + }, + { + "epoch": 
1.4453466233620786, + "grad_norm": 2.1735076904296875, + "learning_rate": 9.847394736842106e-05, + "loss": 0.5439, + "step": 25811 + }, + { + "epoch": 1.4454026206742077, + "grad_norm": 2.0774319171905518, + "learning_rate": 9.847368421052632e-05, + "loss": 0.559, + "step": 25812 + }, + { + "epoch": 1.4454586179863367, + "grad_norm": 1.3191627264022827, + "learning_rate": 9.847342105263158e-05, + "loss": 0.4167, + "step": 25813 + }, + { + "epoch": 1.4455146152984657, + "grad_norm": 1.2631181478500366, + "learning_rate": 9.847315789473684e-05, + "loss": 0.4546, + "step": 25814 + }, + { + "epoch": 1.4455706126105947, + "grad_norm": 1.380804181098938, + "learning_rate": 9.847289473684211e-05, + "loss": 0.3802, + "step": 25815 + }, + { + "epoch": 1.4456266099227237, + "grad_norm": 1.2480416297912598, + "learning_rate": 9.847263157894737e-05, + "loss": 0.5143, + "step": 25816 + }, + { + "epoch": 1.4456826072348528, + "grad_norm": 1.4806478023529053, + "learning_rate": 9.847236842105265e-05, + "loss": 0.5376, + "step": 25817 + }, + { + "epoch": 1.4457386045469818, + "grad_norm": 1.348549246788025, + "learning_rate": 9.847210526315789e-05, + "loss": 0.3942, + "step": 25818 + }, + { + "epoch": 1.4457946018591108, + "grad_norm": 1.1411925554275513, + "learning_rate": 9.847184210526317e-05, + "loss": 0.3391, + "step": 25819 + }, + { + "epoch": 1.4458505991712398, + "grad_norm": 1.7207562923431396, + "learning_rate": 9.847157894736843e-05, + "loss": 0.5623, + "step": 25820 + }, + { + "epoch": 1.4459065964833688, + "grad_norm": 1.4202302694320679, + "learning_rate": 9.847131578947369e-05, + "loss": 0.5446, + "step": 25821 + }, + { + "epoch": 1.4459625937954979, + "grad_norm": 1.5925122499465942, + "learning_rate": 9.847105263157896e-05, + "loss": 0.4418, + "step": 25822 + }, + { + "epoch": 1.4460185911076269, + "grad_norm": 1.2614142894744873, + "learning_rate": 9.84707894736842e-05, + "loss": 0.4257, + "step": 25823 + }, + { + "epoch": 1.446074588419756, + "grad_norm": 
1.0207983255386353, + "learning_rate": 9.847052631578948e-05, + "loss": 0.3638, + "step": 25824 + }, + { + "epoch": 1.446130585731885, + "grad_norm": 1.6025762557983398, + "learning_rate": 9.847026315789474e-05, + "loss": 0.5265, + "step": 25825 + }, + { + "epoch": 1.446186583044014, + "grad_norm": 1.2102876901626587, + "learning_rate": 9.847000000000001e-05, + "loss": 0.4531, + "step": 25826 + }, + { + "epoch": 1.446242580356143, + "grad_norm": 1.18339204788208, + "learning_rate": 9.846973684210527e-05, + "loss": 0.3625, + "step": 25827 + }, + { + "epoch": 1.446298577668272, + "grad_norm": 1.980252981185913, + "learning_rate": 9.846947368421053e-05, + "loss": 0.5182, + "step": 25828 + }, + { + "epoch": 1.446354574980401, + "grad_norm": 1.51128351688385, + "learning_rate": 9.846921052631579e-05, + "loss": 0.4307, + "step": 25829 + }, + { + "epoch": 1.44641057229253, + "grad_norm": 1.2699722051620483, + "learning_rate": 9.846894736842106e-05, + "loss": 0.4264, + "step": 25830 + }, + { + "epoch": 1.446466569604659, + "grad_norm": 1.3692348003387451, + "learning_rate": 9.846868421052632e-05, + "loss": 0.4324, + "step": 25831 + }, + { + "epoch": 1.446522566916788, + "grad_norm": 1.2989691495895386, + "learning_rate": 9.846842105263158e-05, + "loss": 0.5484, + "step": 25832 + }, + { + "epoch": 1.446578564228917, + "grad_norm": 1.4866071939468384, + "learning_rate": 9.846815789473684e-05, + "loss": 0.5034, + "step": 25833 + }, + { + "epoch": 1.446634561541046, + "grad_norm": 1.1898126602172852, + "learning_rate": 9.846789473684212e-05, + "loss": 0.4293, + "step": 25834 + }, + { + "epoch": 1.446690558853175, + "grad_norm": 1.5289809703826904, + "learning_rate": 9.846763157894738e-05, + "loss": 0.6081, + "step": 25835 + }, + { + "epoch": 1.4467465561653041, + "grad_norm": 1.984678030014038, + "learning_rate": 9.846736842105264e-05, + "loss": 0.54, + "step": 25836 + }, + { + "epoch": 1.4468025534774331, + "grad_norm": 1.222996711730957, + "learning_rate": 
9.84671052631579e-05, + "loss": 0.3786, + "step": 25837 + }, + { + "epoch": 1.4468585507895622, + "grad_norm": 1.2271722555160522, + "learning_rate": 9.846684210526315e-05, + "loss": 0.4339, + "step": 25838 + }, + { + "epoch": 1.4469145481016912, + "grad_norm": 1.6566424369812012, + "learning_rate": 9.846657894736843e-05, + "loss": 0.3967, + "step": 25839 + }, + { + "epoch": 1.4469705454138202, + "grad_norm": 1.3022581338882446, + "learning_rate": 9.846631578947369e-05, + "loss": 0.5282, + "step": 25840 + }, + { + "epoch": 1.4470265427259492, + "grad_norm": 1.193652868270874, + "learning_rate": 9.846605263157895e-05, + "loss": 0.3789, + "step": 25841 + }, + { + "epoch": 1.4470825400380782, + "grad_norm": 1.084812045097351, + "learning_rate": 9.846578947368421e-05, + "loss": 0.4475, + "step": 25842 + }, + { + "epoch": 1.4471385373502073, + "grad_norm": 1.7399256229400635, + "learning_rate": 9.846552631578948e-05, + "loss": 0.4837, + "step": 25843 + }, + { + "epoch": 1.4471945346623363, + "grad_norm": 4.695403099060059, + "learning_rate": 9.846526315789474e-05, + "loss": 0.4705, + "step": 25844 + }, + { + "epoch": 1.4472505319744653, + "grad_norm": 7.097339630126953, + "learning_rate": 9.846500000000001e-05, + "loss": 0.4211, + "step": 25845 + }, + { + "epoch": 1.4473065292865943, + "grad_norm": 1.3863807916641235, + "learning_rate": 9.846473684210526e-05, + "loss": 0.5693, + "step": 25846 + }, + { + "epoch": 1.4473625265987233, + "grad_norm": 1.4710978269577026, + "learning_rate": 9.846447368421053e-05, + "loss": 0.4579, + "step": 25847 + }, + { + "epoch": 1.4474185239108524, + "grad_norm": 1.4749094247817993, + "learning_rate": 9.846421052631579e-05, + "loss": 0.5118, + "step": 25848 + }, + { + "epoch": 1.4474745212229814, + "grad_norm": 1.664351463317871, + "learning_rate": 9.846394736842107e-05, + "loss": 0.4544, + "step": 25849 + }, + { + "epoch": 1.4475305185351104, + "grad_norm": 1.4030771255493164, + "learning_rate": 9.846368421052633e-05, + "loss": 0.4978, + 
"step": 25850 + }, + { + "epoch": 1.4475865158472394, + "grad_norm": 1.3160074949264526, + "learning_rate": 9.846342105263159e-05, + "loss": 0.5448, + "step": 25851 + }, + { + "epoch": 1.4476425131593684, + "grad_norm": 1.2401901483535767, + "learning_rate": 9.846315789473685e-05, + "loss": 0.4574, + "step": 25852 + }, + { + "epoch": 1.4476985104714974, + "grad_norm": 1.3144906759262085, + "learning_rate": 9.846289473684212e-05, + "loss": 0.4755, + "step": 25853 + }, + { + "epoch": 1.4477545077836265, + "grad_norm": 1.186714768409729, + "learning_rate": 9.846263157894738e-05, + "loss": 0.4043, + "step": 25854 + }, + { + "epoch": 1.4478105050957555, + "grad_norm": 1.2592949867248535, + "learning_rate": 9.846236842105262e-05, + "loss": 0.377, + "step": 25855 + }, + { + "epoch": 1.4478665024078845, + "grad_norm": 1.4551217555999756, + "learning_rate": 9.84621052631579e-05, + "loss": 0.5906, + "step": 25856 + }, + { + "epoch": 1.4479224997200135, + "grad_norm": 1.4207539558410645, + "learning_rate": 9.846184210526316e-05, + "loss": 0.5401, + "step": 25857 + }, + { + "epoch": 1.4479784970321425, + "grad_norm": 1.1180894374847412, + "learning_rate": 9.846157894736843e-05, + "loss": 0.3977, + "step": 25858 + }, + { + "epoch": 1.4480344943442716, + "grad_norm": 1.841389775276184, + "learning_rate": 9.846131578947369e-05, + "loss": 0.5804, + "step": 25859 + }, + { + "epoch": 1.4480904916564006, + "grad_norm": 1.4709042310714722, + "learning_rate": 9.846105263157895e-05, + "loss": 0.479, + "step": 25860 + }, + { + "epoch": 1.4481464889685296, + "grad_norm": 1.5189865827560425, + "learning_rate": 9.846078947368421e-05, + "loss": 0.4432, + "step": 25861 + }, + { + "epoch": 1.4482024862806586, + "grad_norm": 1.2005620002746582, + "learning_rate": 9.846052631578948e-05, + "loss": 0.3775, + "step": 25862 + }, + { + "epoch": 1.4482584835927876, + "grad_norm": 1.397200584411621, + "learning_rate": 9.846026315789474e-05, + "loss": 0.6105, + "step": 25863 + }, + { + "epoch": 
1.4483144809049167, + "grad_norm": 1.3551275730133057, + "learning_rate": 9.846e-05, + "loss": 0.4215, + "step": 25864 + }, + { + "epoch": 1.4483704782170457, + "grad_norm": 1.2143815755844116, + "learning_rate": 9.845973684210526e-05, + "loss": 0.4264, + "step": 25865 + }, + { + "epoch": 1.4484264755291747, + "grad_norm": 1.278725266456604, + "learning_rate": 9.845947368421054e-05, + "loss": 0.6367, + "step": 25866 + }, + { + "epoch": 1.4484824728413037, + "grad_norm": 1.7548737525939941, + "learning_rate": 9.84592105263158e-05, + "loss": 0.5817, + "step": 25867 + }, + { + "epoch": 1.4485384701534327, + "grad_norm": 1.250089168548584, + "learning_rate": 9.845894736842106e-05, + "loss": 0.4424, + "step": 25868 + }, + { + "epoch": 1.4485944674655618, + "grad_norm": 1.41702401638031, + "learning_rate": 9.845868421052631e-05, + "loss": 0.5236, + "step": 25869 + }, + { + "epoch": 1.4486504647776908, + "grad_norm": 1.1638234853744507, + "learning_rate": 9.845842105263159e-05, + "loss": 0.4341, + "step": 25870 + }, + { + "epoch": 1.4487064620898198, + "grad_norm": 1.0353578329086304, + "learning_rate": 9.845815789473685e-05, + "loss": 0.3386, + "step": 25871 + }, + { + "epoch": 1.4487624594019488, + "grad_norm": 1.5625780820846558, + "learning_rate": 9.845789473684211e-05, + "loss": 0.395, + "step": 25872 + }, + { + "epoch": 1.4488184567140778, + "grad_norm": 7.201679229736328, + "learning_rate": 9.845763157894737e-05, + "loss": 0.5571, + "step": 25873 + }, + { + "epoch": 1.4488744540262068, + "grad_norm": 1.3730390071868896, + "learning_rate": 9.845736842105263e-05, + "loss": 0.4704, + "step": 25874 + }, + { + "epoch": 1.4489304513383359, + "grad_norm": 1.3140690326690674, + "learning_rate": 9.84571052631579e-05, + "loss": 0.4497, + "step": 25875 + }, + { + "epoch": 1.4489864486504649, + "grad_norm": 1.3097856044769287, + "learning_rate": 9.845684210526316e-05, + "loss": 0.452, + "step": 25876 + }, + { + "epoch": 1.449042445962594, + "grad_norm": 1.4853609800338745, + 
"learning_rate": 9.845657894736843e-05, + "loss": 0.5977, + "step": 25877 + }, + { + "epoch": 1.449098443274723, + "grad_norm": 1.4176387786865234, + "learning_rate": 9.845631578947368e-05, + "loss": 0.4417, + "step": 25878 + }, + { + "epoch": 1.449154440586852, + "grad_norm": 1.1323189735412598, + "learning_rate": 9.845605263157895e-05, + "loss": 0.5364, + "step": 25879 + }, + { + "epoch": 1.449210437898981, + "grad_norm": 1.172784686088562, + "learning_rate": 9.845578947368421e-05, + "loss": 0.4925, + "step": 25880 + }, + { + "epoch": 1.44926643521111, + "grad_norm": 1.1581997871398926, + "learning_rate": 9.845552631578949e-05, + "loss": 0.3504, + "step": 25881 + }, + { + "epoch": 1.449322432523239, + "grad_norm": 1.2002604007720947, + "learning_rate": 9.845526315789475e-05, + "loss": 0.4836, + "step": 25882 + }, + { + "epoch": 1.449378429835368, + "grad_norm": 1.1245604753494263, + "learning_rate": 9.8455e-05, + "loss": 0.3703, + "step": 25883 + }, + { + "epoch": 1.449434427147497, + "grad_norm": 1.520147442817688, + "learning_rate": 9.845473684210526e-05, + "loss": 0.5046, + "step": 25884 + }, + { + "epoch": 1.449490424459626, + "grad_norm": 1.5617785453796387, + "learning_rate": 9.845447368421054e-05, + "loss": 0.4901, + "step": 25885 + }, + { + "epoch": 1.449546421771755, + "grad_norm": 1.4862608909606934, + "learning_rate": 9.84542105263158e-05, + "loss": 0.4195, + "step": 25886 + }, + { + "epoch": 1.449602419083884, + "grad_norm": 1.2302963733673096, + "learning_rate": 9.845394736842106e-05, + "loss": 0.4553, + "step": 25887 + }, + { + "epoch": 1.4496584163960131, + "grad_norm": 1.2800395488739014, + "learning_rate": 9.845368421052632e-05, + "loss": 0.3787, + "step": 25888 + }, + { + "epoch": 1.4497144137081421, + "grad_norm": 1.2763700485229492, + "learning_rate": 9.845342105263159e-05, + "loss": 0.3521, + "step": 25889 + }, + { + "epoch": 1.4497704110202712, + "grad_norm": 1.4803738594055176, + "learning_rate": 9.845315789473685e-05, + "loss": 0.5863, + 
"step": 25890 + }, + { + "epoch": 1.4498264083324002, + "grad_norm": 1.222286343574524, + "learning_rate": 9.845289473684211e-05, + "loss": 0.4811, + "step": 25891 + }, + { + "epoch": 1.4498824056445292, + "grad_norm": 1.0914252996444702, + "learning_rate": 9.845263157894737e-05, + "loss": 0.3386, + "step": 25892 + }, + { + "epoch": 1.4499384029566582, + "grad_norm": 1.4120428562164307, + "learning_rate": 9.845236842105263e-05, + "loss": 0.5278, + "step": 25893 + }, + { + "epoch": 1.4499944002687872, + "grad_norm": 1.41368567943573, + "learning_rate": 9.84521052631579e-05, + "loss": 0.4883, + "step": 25894 + }, + { + "epoch": 1.4500503975809163, + "grad_norm": 1.2134027481079102, + "learning_rate": 9.845184210526316e-05, + "loss": 0.4061, + "step": 25895 + }, + { + "epoch": 1.4501063948930453, + "grad_norm": 1.4855021238327026, + "learning_rate": 9.845157894736842e-05, + "loss": 0.4442, + "step": 25896 + }, + { + "epoch": 1.4501623922051743, + "grad_norm": 1.4125449657440186, + "learning_rate": 9.845131578947368e-05, + "loss": 0.4051, + "step": 25897 + }, + { + "epoch": 1.4502183895173033, + "grad_norm": 1.275683879852295, + "learning_rate": 9.845105263157896e-05, + "loss": 0.4895, + "step": 25898 + }, + { + "epoch": 1.450274386829432, + "grad_norm": 1.4906284809112549, + "learning_rate": 9.845078947368422e-05, + "loss": 0.4012, + "step": 25899 + }, + { + "epoch": 1.4503303841415611, + "grad_norm": 1.4855666160583496, + "learning_rate": 9.845052631578949e-05, + "loss": 0.5737, + "step": 25900 + }, + { + "epoch": 1.4503863814536901, + "grad_norm": 1.6058350801467896, + "learning_rate": 9.845026315789473e-05, + "loss": 0.5658, + "step": 25901 + }, + { + "epoch": 1.4504423787658192, + "grad_norm": 1.2203283309936523, + "learning_rate": 9.845000000000001e-05, + "loss": 0.3587, + "step": 25902 + }, + { + "epoch": 1.4504983760779482, + "grad_norm": 1.2490440607070923, + "learning_rate": 9.844973684210527e-05, + "loss": 0.3595, + "step": 25903 + }, + { + "epoch": 
1.4505543733900772, + "grad_norm": 1.5844202041625977, + "learning_rate": 9.844947368421054e-05, + "loss": 0.4712, + "step": 25904 + }, + { + "epoch": 1.4506103707022062, + "grad_norm": 1.4154523611068726, + "learning_rate": 9.84492105263158e-05, + "loss": 0.4897, + "step": 25905 + }, + { + "epoch": 1.4506663680143352, + "grad_norm": 1.3495726585388184, + "learning_rate": 9.844894736842106e-05, + "loss": 0.585, + "step": 25906 + }, + { + "epoch": 1.4507223653264643, + "grad_norm": 1.4050761461257935, + "learning_rate": 9.844868421052632e-05, + "loss": 0.5269, + "step": 25907 + }, + { + "epoch": 1.4507783626385933, + "grad_norm": 1.2094513177871704, + "learning_rate": 9.844842105263158e-05, + "loss": 0.4117, + "step": 25908 + }, + { + "epoch": 1.4508343599507223, + "grad_norm": 1.6439192295074463, + "learning_rate": 9.844815789473685e-05, + "loss": 0.546, + "step": 25909 + }, + { + "epoch": 1.4508903572628513, + "grad_norm": 1.701016902923584, + "learning_rate": 9.84478947368421e-05, + "loss": 0.4767, + "step": 25910 + }, + { + "epoch": 1.4509463545749803, + "grad_norm": 1.4660980701446533, + "learning_rate": 9.844763157894737e-05, + "loss": 0.399, + "step": 25911 + }, + { + "epoch": 1.4510023518871094, + "grad_norm": 1.4048539400100708, + "learning_rate": 9.844736842105263e-05, + "loss": 0.5477, + "step": 25912 + }, + { + "epoch": 1.4510583491992384, + "grad_norm": 1.4888503551483154, + "learning_rate": 9.84471052631579e-05, + "loss": 0.6689, + "step": 25913 + }, + { + "epoch": 1.4511143465113674, + "grad_norm": 1.1039063930511475, + "learning_rate": 9.844684210526317e-05, + "loss": 0.4155, + "step": 25914 + }, + { + "epoch": 1.4511703438234964, + "grad_norm": 1.7937867641448975, + "learning_rate": 9.844657894736842e-05, + "loss": 0.4529, + "step": 25915 + }, + { + "epoch": 1.4512263411356254, + "grad_norm": 1.480340600013733, + "learning_rate": 9.844631578947368e-05, + "loss": 0.4487, + "step": 25916 + }, + { + "epoch": 1.4512823384477544, + "grad_norm": 
1.8020457029342651, + "learning_rate": 9.844605263157896e-05, + "loss": 0.5597, + "step": 25917 + }, + { + "epoch": 1.4513383357598835, + "grad_norm": 3.0687625408172607, + "learning_rate": 9.844578947368422e-05, + "loss": 0.5293, + "step": 25918 + }, + { + "epoch": 1.4513943330720125, + "grad_norm": 1.4975465536117554, + "learning_rate": 9.844552631578948e-05, + "loss": 0.4141, + "step": 25919 + }, + { + "epoch": 1.4514503303841415, + "grad_norm": 1.5277751684188843, + "learning_rate": 9.844526315789474e-05, + "loss": 0.6326, + "step": 25920 + }, + { + "epoch": 1.4515063276962705, + "grad_norm": 1.370906949043274, + "learning_rate": 9.844500000000001e-05, + "loss": 0.468, + "step": 25921 + }, + { + "epoch": 1.4515623250083995, + "grad_norm": 1.2885569334030151, + "learning_rate": 9.844473684210527e-05, + "loss": 0.5034, + "step": 25922 + }, + { + "epoch": 1.4516183223205286, + "grad_norm": 1.2657023668289185, + "learning_rate": 9.844447368421053e-05, + "loss": 0.4651, + "step": 25923 + }, + { + "epoch": 1.4516743196326576, + "grad_norm": 1.3978196382522583, + "learning_rate": 9.844421052631579e-05, + "loss": 0.4269, + "step": 25924 + }, + { + "epoch": 1.4517303169447866, + "grad_norm": 1.3470312356948853, + "learning_rate": 9.844394736842105e-05, + "loss": 0.4954, + "step": 25925 + }, + { + "epoch": 1.4517863142569156, + "grad_norm": 1.8148443698883057, + "learning_rate": 9.844368421052632e-05, + "loss": 0.6209, + "step": 25926 + }, + { + "epoch": 1.4518423115690446, + "grad_norm": 5.657511234283447, + "learning_rate": 9.844342105263158e-05, + "loss": 0.4336, + "step": 25927 + }, + { + "epoch": 1.4518983088811737, + "grad_norm": 1.5702624320983887, + "learning_rate": 9.844315789473684e-05, + "loss": 0.5375, + "step": 25928 + }, + { + "epoch": 1.4519543061933027, + "grad_norm": 1.4084192514419556, + "learning_rate": 9.84428947368421e-05, + "loss": 0.5047, + "step": 25929 + }, + { + "epoch": 1.4520103035054317, + "grad_norm": 1.1949462890625, + "learning_rate": 
9.844263157894738e-05, + "loss": 0.3619, + "step": 25930 + }, + { + "epoch": 1.4520663008175607, + "grad_norm": 1.2923543453216553, + "learning_rate": 9.844236842105263e-05, + "loss": 0.4469, + "step": 25931 + }, + { + "epoch": 1.4521222981296897, + "grad_norm": 1.2760673761367798, + "learning_rate": 9.844210526315791e-05, + "loss": 0.467, + "step": 25932 + }, + { + "epoch": 1.4521782954418188, + "grad_norm": 1.2177743911743164, + "learning_rate": 9.844184210526315e-05, + "loss": 0.406, + "step": 25933 + }, + { + "epoch": 1.4522342927539478, + "grad_norm": 1.1399402618408203, + "learning_rate": 9.844157894736843e-05, + "loss": 0.3858, + "step": 25934 + }, + { + "epoch": 1.4522902900660768, + "grad_norm": 1.6178439855575562, + "learning_rate": 9.844131578947369e-05, + "loss": 0.5365, + "step": 25935 + }, + { + "epoch": 1.4523462873782058, + "grad_norm": 1.4049549102783203, + "learning_rate": 9.844105263157896e-05, + "loss": 0.5515, + "step": 25936 + }, + { + "epoch": 1.4524022846903348, + "grad_norm": 1.2331537008285522, + "learning_rate": 9.844078947368422e-05, + "loss": 0.4917, + "step": 25937 + }, + { + "epoch": 1.4524582820024639, + "grad_norm": 1.2904316186904907, + "learning_rate": 9.844052631578948e-05, + "loss": 0.4256, + "step": 25938 + }, + { + "epoch": 1.4525142793145929, + "grad_norm": 1.261634349822998, + "learning_rate": 9.844026315789474e-05, + "loss": 0.458, + "step": 25939 + }, + { + "epoch": 1.452570276626722, + "grad_norm": 1.430680513381958, + "learning_rate": 9.844000000000001e-05, + "loss": 0.4403, + "step": 25940 + }, + { + "epoch": 1.452626273938851, + "grad_norm": 1.102910041809082, + "learning_rate": 9.843973684210527e-05, + "loss": 0.3499, + "step": 25941 + }, + { + "epoch": 1.45268227125098, + "grad_norm": 1.4364904165267944, + "learning_rate": 9.843947368421053e-05, + "loss": 0.5855, + "step": 25942 + }, + { + "epoch": 1.452738268563109, + "grad_norm": 1.155799388885498, + "learning_rate": 9.843921052631579e-05, + "loss": 0.3681, + 
"step": 25943 + }, + { + "epoch": 1.452794265875238, + "grad_norm": 1.272122859954834, + "learning_rate": 9.843894736842105e-05, + "loss": 0.4879, + "step": 25944 + }, + { + "epoch": 1.452850263187367, + "grad_norm": 1.3616063594818115, + "learning_rate": 9.843868421052633e-05, + "loss": 0.5572, + "step": 25945 + }, + { + "epoch": 1.452906260499496, + "grad_norm": 1.530600905418396, + "learning_rate": 9.843842105263158e-05, + "loss": 0.4332, + "step": 25946 + }, + { + "epoch": 1.452962257811625, + "grad_norm": 1.2354836463928223, + "learning_rate": 9.843815789473684e-05, + "loss": 0.4672, + "step": 25947 + }, + { + "epoch": 1.453018255123754, + "grad_norm": 1.435318112373352, + "learning_rate": 9.84378947368421e-05, + "loss": 0.4326, + "step": 25948 + }, + { + "epoch": 1.453074252435883, + "grad_norm": 1.4048984050750732, + "learning_rate": 9.843763157894738e-05, + "loss": 0.3772, + "step": 25949 + }, + { + "epoch": 1.453130249748012, + "grad_norm": 2.3893043994903564, + "learning_rate": 9.843736842105264e-05, + "loss": 0.5347, + "step": 25950 + }, + { + "epoch": 1.453186247060141, + "grad_norm": 1.403817057609558, + "learning_rate": 9.84371052631579e-05, + "loss": 0.5213, + "step": 25951 + }, + { + "epoch": 1.4532422443722701, + "grad_norm": 1.2591506242752075, + "learning_rate": 9.843684210526316e-05, + "loss": 0.4081, + "step": 25952 + }, + { + "epoch": 1.4532982416843991, + "grad_norm": 1.2045007944107056, + "learning_rate": 9.843657894736843e-05, + "loss": 0.5792, + "step": 25953 + }, + { + "epoch": 1.4533542389965282, + "grad_norm": 1.2742711305618286, + "learning_rate": 9.843631578947369e-05, + "loss": 0.4952, + "step": 25954 + }, + { + "epoch": 1.4534102363086572, + "grad_norm": 1.6678144931793213, + "learning_rate": 9.843605263157896e-05, + "loss": 0.5661, + "step": 25955 + }, + { + "epoch": 1.4534662336207862, + "grad_norm": 1.4130090475082397, + "learning_rate": 9.843578947368421e-05, + "loss": 0.6704, + "step": 25956 + }, + { + "epoch": 
1.4535222309329152, + "grad_norm": 1.37088942527771, + "learning_rate": 9.843552631578948e-05, + "loss": 0.4189, + "step": 25957 + }, + { + "epoch": 1.4535782282450442, + "grad_norm": 1.8538419008255005, + "learning_rate": 9.843526315789474e-05, + "loss": 0.4482, + "step": 25958 + }, + { + "epoch": 1.4536342255571733, + "grad_norm": 1.3719291687011719, + "learning_rate": 9.8435e-05, + "loss": 0.3062, + "step": 25959 + }, + { + "epoch": 1.4536902228693023, + "grad_norm": 1.3360399007797241, + "learning_rate": 9.843473684210526e-05, + "loss": 0.3929, + "step": 25960 + }, + { + "epoch": 1.4537462201814313, + "grad_norm": 1.3749924898147583, + "learning_rate": 9.843447368421052e-05, + "loss": 0.4151, + "step": 25961 + }, + { + "epoch": 1.4538022174935603, + "grad_norm": 1.381787657737732, + "learning_rate": 9.84342105263158e-05, + "loss": 0.4326, + "step": 25962 + }, + { + "epoch": 1.4538582148056893, + "grad_norm": 1.9576159715652466, + "learning_rate": 9.843394736842105e-05, + "loss": 0.4619, + "step": 25963 + }, + { + "epoch": 1.4539142121178183, + "grad_norm": 1.386579155921936, + "learning_rate": 9.843368421052633e-05, + "loss": 0.4705, + "step": 25964 + }, + { + "epoch": 1.4539702094299474, + "grad_norm": 1.1891536712646484, + "learning_rate": 9.843342105263157e-05, + "loss": 0.4118, + "step": 25965 + }, + { + "epoch": 1.4540262067420764, + "grad_norm": 1.2311022281646729, + "learning_rate": 9.843315789473685e-05, + "loss": 0.4656, + "step": 25966 + }, + { + "epoch": 1.4540822040542054, + "grad_norm": 1.3781304359436035, + "learning_rate": 9.843289473684211e-05, + "loss": 0.5193, + "step": 25967 + }, + { + "epoch": 1.4541382013663344, + "grad_norm": 1.1454792022705078, + "learning_rate": 9.843263157894738e-05, + "loss": 0.4095, + "step": 25968 + }, + { + "epoch": 1.4541941986784634, + "grad_norm": 1.28517484664917, + "learning_rate": 9.843236842105264e-05, + "loss": 0.3822, + "step": 25969 + }, + { + "epoch": 1.4542501959905925, + "grad_norm": 1.3925752639770508, 
+ "learning_rate": 9.84321052631579e-05, + "loss": 0.4105, + "step": 25970 + }, + { + "epoch": 1.4543061933027215, + "grad_norm": 3.689382553100586, + "learning_rate": 9.843184210526316e-05, + "loss": 0.603, + "step": 25971 + }, + { + "epoch": 1.4543621906148505, + "grad_norm": 1.3937817811965942, + "learning_rate": 9.843157894736843e-05, + "loss": 0.4476, + "step": 25972 + }, + { + "epoch": 1.4544181879269795, + "grad_norm": 77.36202239990234, + "learning_rate": 9.843131578947369e-05, + "loss": 0.3675, + "step": 25973 + }, + { + "epoch": 1.4544741852391085, + "grad_norm": 1.22329580783844, + "learning_rate": 9.843105263157895e-05, + "loss": 0.3984, + "step": 25974 + }, + { + "epoch": 1.4545301825512376, + "grad_norm": 1.4108954668045044, + "learning_rate": 9.843078947368421e-05, + "loss": 0.4543, + "step": 25975 + }, + { + "epoch": 1.4545861798633666, + "grad_norm": 1.2376995086669922, + "learning_rate": 9.843052631578947e-05, + "loss": 0.4174, + "step": 25976 + }, + { + "epoch": 1.4546421771754956, + "grad_norm": 1.5809969902038574, + "learning_rate": 9.843026315789474e-05, + "loss": 0.7597, + "step": 25977 + }, + { + "epoch": 1.4546981744876246, + "grad_norm": 1.929602861404419, + "learning_rate": 9.843e-05, + "loss": 0.6487, + "step": 25978 + }, + { + "epoch": 1.4547541717997536, + "grad_norm": 1.5038347244262695, + "learning_rate": 9.842973684210526e-05, + "loss": 0.6088, + "step": 25979 + }, + { + "epoch": 1.4548101691118827, + "grad_norm": 1.659993290901184, + "learning_rate": 9.842947368421052e-05, + "loss": 0.4552, + "step": 25980 + }, + { + "epoch": 1.4548661664240117, + "grad_norm": 1.1503784656524658, + "learning_rate": 9.84292105263158e-05, + "loss": 0.4481, + "step": 25981 + }, + { + "epoch": 1.4549221637361407, + "grad_norm": 1.1233444213867188, + "learning_rate": 9.842894736842106e-05, + "loss": 0.3566, + "step": 25982 + }, + { + "epoch": 1.4549781610482697, + "grad_norm": 1.5418497323989868, + "learning_rate": 9.842868421052632e-05, + "loss": 
0.4217, + "step": 25983 + }, + { + "epoch": 1.4550341583603987, + "grad_norm": 1.42060387134552, + "learning_rate": 9.842842105263158e-05, + "loss": 0.5069, + "step": 25984 + }, + { + "epoch": 1.4550901556725278, + "grad_norm": 1.5929495096206665, + "learning_rate": 9.842815789473685e-05, + "loss": 0.4035, + "step": 25985 + }, + { + "epoch": 1.4551461529846568, + "grad_norm": 1.476660966873169, + "learning_rate": 9.842789473684211e-05, + "loss": 0.5383, + "step": 25986 + }, + { + "epoch": 1.4552021502967858, + "grad_norm": 1.4194258451461792, + "learning_rate": 9.842763157894738e-05, + "loss": 0.5295, + "step": 25987 + }, + { + "epoch": 1.4552581476089148, + "grad_norm": 1.1278876066207886, + "learning_rate": 9.842736842105263e-05, + "loss": 0.4407, + "step": 25988 + }, + { + "epoch": 1.4553141449210438, + "grad_norm": 1.4255927801132202, + "learning_rate": 9.84271052631579e-05, + "loss": 0.6305, + "step": 25989 + }, + { + "epoch": 1.4553701422331728, + "grad_norm": 1.2105761766433716, + "learning_rate": 9.842684210526316e-05, + "loss": 0.3823, + "step": 25990 + }, + { + "epoch": 1.4554261395453019, + "grad_norm": 1.2208470106124878, + "learning_rate": 9.842657894736844e-05, + "loss": 0.4616, + "step": 25991 + }, + { + "epoch": 1.4554821368574309, + "grad_norm": 1.30439293384552, + "learning_rate": 9.84263157894737e-05, + "loss": 0.3807, + "step": 25992 + }, + { + "epoch": 1.45553813416956, + "grad_norm": 1.7102527618408203, + "learning_rate": 9.842605263157894e-05, + "loss": 0.4421, + "step": 25993 + }, + { + "epoch": 1.455594131481689, + "grad_norm": 1.3844481706619263, + "learning_rate": 9.842578947368421e-05, + "loss": 0.5318, + "step": 25994 + }, + { + "epoch": 1.455650128793818, + "grad_norm": 1.1847615242004395, + "learning_rate": 9.842552631578947e-05, + "loss": 0.3416, + "step": 25995 + }, + { + "epoch": 1.455706126105947, + "grad_norm": 1.346411943435669, + "learning_rate": 9.842526315789475e-05, + "loss": 0.5071, + "step": 25996 + }, + { + "epoch": 
1.455762123418076, + "grad_norm": 1.2523080110549927, + "learning_rate": 9.842500000000001e-05, + "loss": 0.5055, + "step": 25997 + }, + { + "epoch": 1.455818120730205, + "grad_norm": 1.2922927141189575, + "learning_rate": 9.842473684210527e-05, + "loss": 0.4191, + "step": 25998 + }, + { + "epoch": 1.455874118042334, + "grad_norm": 2.072600841522217, + "learning_rate": 9.842447368421053e-05, + "loss": 0.6591, + "step": 25999 + }, + { + "epoch": 1.455930115354463, + "grad_norm": 1.4252487421035767, + "learning_rate": 9.84242105263158e-05, + "loss": 0.4047, + "step": 26000 + }, + { + "epoch": 1.455986112666592, + "grad_norm": 1.4820553064346313, + "learning_rate": 9.842394736842106e-05, + "loss": 0.4762, + "step": 26001 + }, + { + "epoch": 1.456042109978721, + "grad_norm": 1.4761065244674683, + "learning_rate": 9.842368421052632e-05, + "loss": 0.417, + "step": 26002 + }, + { + "epoch": 1.45609810729085, + "grad_norm": 1.1952581405639648, + "learning_rate": 9.842342105263158e-05, + "loss": 0.5099, + "step": 26003 + }, + { + "epoch": 1.4561541046029791, + "grad_norm": 1.2118871212005615, + "learning_rate": 9.842315789473685e-05, + "loss": 0.4029, + "step": 26004 + }, + { + "epoch": 1.4562101019151081, + "grad_norm": 2.3767006397247314, + "learning_rate": 9.842289473684211e-05, + "loss": 0.4342, + "step": 26005 + }, + { + "epoch": 1.4562660992272372, + "grad_norm": 1.329052448272705, + "learning_rate": 9.842263157894737e-05, + "loss": 0.4584, + "step": 26006 + }, + { + "epoch": 1.4563220965393662, + "grad_norm": 1.3413516283035278, + "learning_rate": 9.842236842105263e-05, + "loss": 0.4386, + "step": 26007 + }, + { + "epoch": 1.4563780938514952, + "grad_norm": 1.215601921081543, + "learning_rate": 9.84221052631579e-05, + "loss": 0.4626, + "step": 26008 + }, + { + "epoch": 1.4564340911636242, + "grad_norm": 2.029501438140869, + "learning_rate": 9.842184210526316e-05, + "loss": 0.5052, + "step": 26009 + }, + { + "epoch": 1.4564900884757532, + "grad_norm": 
1.856543779373169, + "learning_rate": 9.842157894736842e-05, + "loss": 0.4854, + "step": 26010 + }, + { + "epoch": 1.4565460857878822, + "grad_norm": 1.4724719524383545, + "learning_rate": 9.842131578947368e-05, + "loss": 0.5947, + "step": 26011 + }, + { + "epoch": 1.4566020831000113, + "grad_norm": 1.2454043626785278, + "learning_rate": 9.842105263157894e-05, + "loss": 0.4048, + "step": 26012 + }, + { + "epoch": 1.4566580804121403, + "grad_norm": 1.3093109130859375, + "learning_rate": 9.842078947368422e-05, + "loss": 0.4102, + "step": 26013 + }, + { + "epoch": 1.4567140777242693, + "grad_norm": 1.3444128036499023, + "learning_rate": 9.842052631578948e-05, + "loss": 0.4793, + "step": 26014 + }, + { + "epoch": 1.4567700750363983, + "grad_norm": 1.5629818439483643, + "learning_rate": 9.842026315789474e-05, + "loss": 0.4227, + "step": 26015 + }, + { + "epoch": 1.4568260723485273, + "grad_norm": 1.532607078552246, + "learning_rate": 9.842e-05, + "loss": 0.6029, + "step": 26016 + }, + { + "epoch": 1.4568820696606564, + "grad_norm": 1.3916041851043701, + "learning_rate": 9.841973684210527e-05, + "loss": 0.5419, + "step": 26017 + }, + { + "epoch": 1.4569380669727854, + "grad_norm": 1.4412180185317993, + "learning_rate": 9.841947368421053e-05, + "loss": 0.4692, + "step": 26018 + }, + { + "epoch": 1.4569940642849144, + "grad_norm": 1.4216548204421997, + "learning_rate": 9.84192105263158e-05, + "loss": 0.5973, + "step": 26019 + }, + { + "epoch": 1.4570500615970434, + "grad_norm": 1.2250051498413086, + "learning_rate": 9.841894736842105e-05, + "loss": 0.5463, + "step": 26020 + }, + { + "epoch": 1.4571060589091724, + "grad_norm": 1.220533847808838, + "learning_rate": 9.841868421052632e-05, + "loss": 0.4516, + "step": 26021 + }, + { + "epoch": 1.4571620562213015, + "grad_norm": 1.1817001104354858, + "learning_rate": 9.841842105263158e-05, + "loss": 0.5348, + "step": 26022 + }, + { + "epoch": 1.4572180535334303, + "grad_norm": 1.211058497428894, + "learning_rate": 
9.841815789473685e-05, + "loss": 0.4168, + "step": 26023 + }, + { + "epoch": 1.4572740508455593, + "grad_norm": 1.8724970817565918, + "learning_rate": 9.841789473684211e-05, + "loss": 0.4228, + "step": 26024 + }, + { + "epoch": 1.4573300481576883, + "grad_norm": 1.1929121017456055, + "learning_rate": 9.841763157894737e-05, + "loss": 0.4154, + "step": 26025 + }, + { + "epoch": 1.4573860454698173, + "grad_norm": 1.1333788633346558, + "learning_rate": 9.841736842105263e-05, + "loss": 0.3917, + "step": 26026 + }, + { + "epoch": 1.4574420427819463, + "grad_norm": 1.3728362321853638, + "learning_rate": 9.841710526315791e-05, + "loss": 0.6962, + "step": 26027 + }, + { + "epoch": 1.4574980400940754, + "grad_norm": 1.2028093338012695, + "learning_rate": 9.841684210526317e-05, + "loss": 0.3844, + "step": 26028 + }, + { + "epoch": 1.4575540374062044, + "grad_norm": 1.2827948331832886, + "learning_rate": 9.841657894736843e-05, + "loss": 0.6314, + "step": 26029 + }, + { + "epoch": 1.4576100347183334, + "grad_norm": 1.3257192373275757, + "learning_rate": 9.841631578947369e-05, + "loss": 0.5371, + "step": 26030 + }, + { + "epoch": 1.4576660320304624, + "grad_norm": 1.401133418083191, + "learning_rate": 9.841605263157895e-05, + "loss": 0.4718, + "step": 26031 + }, + { + "epoch": 1.4577220293425914, + "grad_norm": 1.27435302734375, + "learning_rate": 9.841578947368422e-05, + "loss": 0.4775, + "step": 26032 + }, + { + "epoch": 1.4577780266547204, + "grad_norm": 1.7541958093643188, + "learning_rate": 9.841552631578948e-05, + "loss": 0.4958, + "step": 26033 + }, + { + "epoch": 1.4578340239668495, + "grad_norm": 1.232792615890503, + "learning_rate": 9.841526315789474e-05, + "loss": 0.4311, + "step": 26034 + }, + { + "epoch": 1.4578900212789785, + "grad_norm": 1.480499029159546, + "learning_rate": 9.8415e-05, + "loss": 0.5128, + "step": 26035 + }, + { + "epoch": 1.4579460185911075, + "grad_norm": 1.412124514579773, + "learning_rate": 9.841473684210527e-05, + "loss": 0.4031, + "step": 
26036 + }, + { + "epoch": 1.4580020159032365, + "grad_norm": 1.420940637588501, + "learning_rate": 9.841447368421053e-05, + "loss": 0.4616, + "step": 26037 + }, + { + "epoch": 1.4580580132153655, + "grad_norm": 1.3835965394973755, + "learning_rate": 9.841421052631579e-05, + "loss": 0.4683, + "step": 26038 + }, + { + "epoch": 1.4581140105274946, + "grad_norm": 1.6787493228912354, + "learning_rate": 9.841394736842105e-05, + "loss": 0.5847, + "step": 26039 + }, + { + "epoch": 1.4581700078396236, + "grad_norm": 1.1006306409835815, + "learning_rate": 9.841368421052632e-05, + "loss": 0.3483, + "step": 26040 + }, + { + "epoch": 1.4582260051517526, + "grad_norm": 1.5119205713272095, + "learning_rate": 9.841342105263158e-05, + "loss": 0.5986, + "step": 26041 + }, + { + "epoch": 1.4582820024638816, + "grad_norm": 1.6595722436904907, + "learning_rate": 9.841315789473686e-05, + "loss": 0.4327, + "step": 26042 + }, + { + "epoch": 1.4583379997760106, + "grad_norm": 1.4033514261245728, + "learning_rate": 9.84128947368421e-05, + "loss": 0.499, + "step": 26043 + }, + { + "epoch": 1.4583939970881397, + "grad_norm": 1.226958155632019, + "learning_rate": 9.841263157894738e-05, + "loss": 0.3799, + "step": 26044 + }, + { + "epoch": 1.4584499944002687, + "grad_norm": 1.127150058746338, + "learning_rate": 9.841236842105264e-05, + "loss": 0.2782, + "step": 26045 + }, + { + "epoch": 1.4585059917123977, + "grad_norm": 1.2230535745620728, + "learning_rate": 9.84121052631579e-05, + "loss": 0.4618, + "step": 26046 + }, + { + "epoch": 1.4585619890245267, + "grad_norm": 1.1991167068481445, + "learning_rate": 9.841184210526317e-05, + "loss": 0.3825, + "step": 26047 + }, + { + "epoch": 1.4586179863366557, + "grad_norm": 1.8252909183502197, + "learning_rate": 9.841157894736842e-05, + "loss": 0.5257, + "step": 26048 + }, + { + "epoch": 1.4586739836487848, + "grad_norm": 1.4885025024414062, + "learning_rate": 9.841131578947369e-05, + "loss": 0.5272, + "step": 26049 + }, + { + "epoch": 
1.4587299809609138, + "grad_norm": 1.4274439811706543, + "learning_rate": 9.841105263157895e-05, + "loss": 0.488, + "step": 26050 + }, + { + "epoch": 1.4587859782730428, + "grad_norm": 1.3002328872680664, + "learning_rate": 9.841078947368422e-05, + "loss": 0.4252, + "step": 26051 + }, + { + "epoch": 1.4588419755851718, + "grad_norm": 1.3177926540374756, + "learning_rate": 9.841052631578948e-05, + "loss": 0.4527, + "step": 26052 + }, + { + "epoch": 1.4588979728973008, + "grad_norm": 1.2957621812820435, + "learning_rate": 9.841026315789474e-05, + "loss": 0.4261, + "step": 26053 + }, + { + "epoch": 1.4589539702094299, + "grad_norm": 1.5503376722335815, + "learning_rate": 9.841e-05, + "loss": 0.4861, + "step": 26054 + }, + { + "epoch": 1.4590099675215589, + "grad_norm": 1.515969157218933, + "learning_rate": 9.840973684210527e-05, + "loss": 0.6048, + "step": 26055 + }, + { + "epoch": 1.459065964833688, + "grad_norm": 1.3552058935165405, + "learning_rate": 9.840947368421053e-05, + "loss": 0.4295, + "step": 26056 + }, + { + "epoch": 1.459121962145817, + "grad_norm": 1.1816718578338623, + "learning_rate": 9.84092105263158e-05, + "loss": 0.4525, + "step": 26057 + }, + { + "epoch": 1.459177959457946, + "grad_norm": 1.6081931591033936, + "learning_rate": 9.840894736842105e-05, + "loss": 0.5457, + "step": 26058 + }, + { + "epoch": 1.459233956770075, + "grad_norm": 1.6812665462493896, + "learning_rate": 9.840868421052633e-05, + "loss": 0.4257, + "step": 26059 + }, + { + "epoch": 1.459289954082204, + "grad_norm": 1.229638695716858, + "learning_rate": 9.840842105263159e-05, + "loss": 0.4609, + "step": 26060 + }, + { + "epoch": 1.459345951394333, + "grad_norm": 1.4026283025741577, + "learning_rate": 9.840815789473685e-05, + "loss": 0.5036, + "step": 26061 + }, + { + "epoch": 1.459401948706462, + "grad_norm": 1.263132095336914, + "learning_rate": 9.84078947368421e-05, + "loss": 0.4865, + "step": 26062 + }, + { + "epoch": 1.459457946018591, + "grad_norm": 1.5632518529891968, + 
"learning_rate": 9.840763157894737e-05, + "loss": 0.6191, + "step": 26063 + }, + { + "epoch": 1.45951394333072, + "grad_norm": 1.5699266195297241, + "learning_rate": 9.840736842105264e-05, + "loss": 0.5325, + "step": 26064 + }, + { + "epoch": 1.459569940642849, + "grad_norm": 1.2045263051986694, + "learning_rate": 9.84071052631579e-05, + "loss": 0.3987, + "step": 26065 + }, + { + "epoch": 1.459625937954978, + "grad_norm": 5.441231727600098, + "learning_rate": 9.840684210526316e-05, + "loss": 0.5344, + "step": 26066 + }, + { + "epoch": 1.459681935267107, + "grad_norm": 1.2703382968902588, + "learning_rate": 9.840657894736842e-05, + "loss": 0.5315, + "step": 26067 + }, + { + "epoch": 1.4597379325792361, + "grad_norm": 1.1619566679000854, + "learning_rate": 9.840631578947369e-05, + "loss": 0.3904, + "step": 26068 + }, + { + "epoch": 1.4597939298913651, + "grad_norm": 1.2699205875396729, + "learning_rate": 9.840605263157895e-05, + "loss": 0.4152, + "step": 26069 + }, + { + "epoch": 1.4598499272034942, + "grad_norm": 1.5068904161453247, + "learning_rate": 9.840578947368421e-05, + "loss": 0.393, + "step": 26070 + }, + { + "epoch": 1.4599059245156232, + "grad_norm": 1.146650791168213, + "learning_rate": 9.840552631578947e-05, + "loss": 0.3644, + "step": 26071 + }, + { + "epoch": 1.4599619218277522, + "grad_norm": 1.2180830240249634, + "learning_rate": 9.840526315789474e-05, + "loss": 0.3651, + "step": 26072 + }, + { + "epoch": 1.4600179191398812, + "grad_norm": 1.6138577461242676, + "learning_rate": 9.8405e-05, + "loss": 0.4821, + "step": 26073 + }, + { + "epoch": 1.4600739164520102, + "grad_norm": 1.2507485151290894, + "learning_rate": 9.840473684210528e-05, + "loss": 0.6241, + "step": 26074 + }, + { + "epoch": 1.4601299137641393, + "grad_norm": 1.2357007265090942, + "learning_rate": 9.840447368421052e-05, + "loss": 0.3541, + "step": 26075 + }, + { + "epoch": 1.4601859110762683, + "grad_norm": 1.3801939487457275, + "learning_rate": 9.84042105263158e-05, + "loss": 0.7115, 
+ "step": 26076 + }, + { + "epoch": 1.4602419083883973, + "grad_norm": 1.485823631286621, + "learning_rate": 9.840394736842106e-05, + "loss": 0.4691, + "step": 26077 + }, + { + "epoch": 1.4602979057005263, + "grad_norm": 8.340812683105469, + "learning_rate": 9.840368421052633e-05, + "loss": 0.3016, + "step": 26078 + }, + { + "epoch": 1.4603539030126553, + "grad_norm": 1.2271965742111206, + "learning_rate": 9.840342105263159e-05, + "loss": 0.4139, + "step": 26079 + }, + { + "epoch": 1.4604099003247843, + "grad_norm": 1.0837594270706177, + "learning_rate": 9.840315789473684e-05, + "loss": 0.4571, + "step": 26080 + }, + { + "epoch": 1.4604658976369134, + "grad_norm": 1.594258427619934, + "learning_rate": 9.840289473684211e-05, + "loss": 0.4861, + "step": 26081 + }, + { + "epoch": 1.4605218949490424, + "grad_norm": 1.315037727355957, + "learning_rate": 9.840263157894737e-05, + "loss": 0.4359, + "step": 26082 + }, + { + "epoch": 1.4605778922611714, + "grad_norm": 1.315635085105896, + "learning_rate": 9.840236842105264e-05, + "loss": 0.5626, + "step": 26083 + }, + { + "epoch": 1.4606338895733004, + "grad_norm": 1.4542587995529175, + "learning_rate": 9.84021052631579e-05, + "loss": 0.4359, + "step": 26084 + }, + { + "epoch": 1.4606898868854294, + "grad_norm": 1.3752509355545044, + "learning_rate": 9.840184210526316e-05, + "loss": 0.4102, + "step": 26085 + }, + { + "epoch": 1.4607458841975585, + "grad_norm": 1.7064623832702637, + "learning_rate": 9.840157894736842e-05, + "loss": 0.4918, + "step": 26086 + }, + { + "epoch": 1.4608018815096875, + "grad_norm": 1.3330295085906982, + "learning_rate": 9.84013157894737e-05, + "loss": 0.4255, + "step": 26087 + }, + { + "epoch": 1.4608578788218165, + "grad_norm": 1.7486516237258911, + "learning_rate": 9.840105263157895e-05, + "loss": 0.5374, + "step": 26088 + }, + { + "epoch": 1.4609138761339455, + "grad_norm": 1.2772011756896973, + "learning_rate": 9.840078947368421e-05, + "loss": 0.4463, + "step": 26089 + }, + { + "epoch": 
1.4609698734460745, + "grad_norm": 1.404482126235962, + "learning_rate": 9.840052631578947e-05, + "loss": 0.3757, + "step": 26090 + }, + { + "epoch": 1.4610258707582036, + "grad_norm": 1.397269368171692, + "learning_rate": 9.840026315789475e-05, + "loss": 0.4529, + "step": 26091 + }, + { + "epoch": 1.4610818680703326, + "grad_norm": 1.510548710823059, + "learning_rate": 9.84e-05, + "loss": 0.5331, + "step": 26092 + }, + { + "epoch": 1.4611378653824616, + "grad_norm": 1.2802950143814087, + "learning_rate": 9.839973684210527e-05, + "loss": 0.4482, + "step": 26093 + }, + { + "epoch": 1.4611938626945906, + "grad_norm": 1.235029935836792, + "learning_rate": 9.839947368421053e-05, + "loss": 0.5423, + "step": 26094 + }, + { + "epoch": 1.4612498600067196, + "grad_norm": 1.1043256521224976, + "learning_rate": 9.83992105263158e-05, + "loss": 0.4166, + "step": 26095 + }, + { + "epoch": 1.4613058573188487, + "grad_norm": 1.2232075929641724, + "learning_rate": 9.839894736842106e-05, + "loss": 0.3702, + "step": 26096 + }, + { + "epoch": 1.4613618546309777, + "grad_norm": 1.1902234554290771, + "learning_rate": 9.839868421052632e-05, + "loss": 0.4973, + "step": 26097 + }, + { + "epoch": 1.4614178519431067, + "grad_norm": 1.5543293952941895, + "learning_rate": 9.839842105263158e-05, + "loss": 0.6131, + "step": 26098 + }, + { + "epoch": 1.4614738492552357, + "grad_norm": 1.4217603206634521, + "learning_rate": 9.839815789473684e-05, + "loss": 0.4294, + "step": 26099 + }, + { + "epoch": 1.4615298465673647, + "grad_norm": 1.4103598594665527, + "learning_rate": 9.839789473684211e-05, + "loss": 0.422, + "step": 26100 + }, + { + "epoch": 1.4615858438794938, + "grad_norm": 1.322417974472046, + "learning_rate": 9.839763157894737e-05, + "loss": 0.6959, + "step": 26101 + }, + { + "epoch": 1.4616418411916228, + "grad_norm": 1.3699936866760254, + "learning_rate": 9.839736842105264e-05, + "loss": 0.6449, + "step": 26102 + }, + { + "epoch": 1.4616978385037518, + "grad_norm": 1.6707324981689453, + 
"learning_rate": 9.839710526315789e-05, + "loss": 0.4287, + "step": 26103 + }, + { + "epoch": 1.4617538358158808, + "grad_norm": 1.2176315784454346, + "learning_rate": 9.839684210526316e-05, + "loss": 0.3065, + "step": 26104 + }, + { + "epoch": 1.4618098331280098, + "grad_norm": 1.8233367204666138, + "learning_rate": 9.839657894736842e-05, + "loss": 0.531, + "step": 26105 + }, + { + "epoch": 1.4618658304401388, + "grad_norm": 1.1831194162368774, + "learning_rate": 9.83963157894737e-05, + "loss": 0.3419, + "step": 26106 + }, + { + "epoch": 1.4619218277522679, + "grad_norm": 1.393722414970398, + "learning_rate": 9.839605263157896e-05, + "loss": 0.522, + "step": 26107 + }, + { + "epoch": 1.4619778250643969, + "grad_norm": 1.2852668762207031, + "learning_rate": 9.839578947368422e-05, + "loss": 0.5382, + "step": 26108 + }, + { + "epoch": 1.462033822376526, + "grad_norm": 1.2282413244247437, + "learning_rate": 9.839552631578948e-05, + "loss": 0.4142, + "step": 26109 + }, + { + "epoch": 1.462089819688655, + "grad_norm": 1.3288555145263672, + "learning_rate": 9.839526315789475e-05, + "loss": 0.3723, + "step": 26110 + }, + { + "epoch": 1.462145817000784, + "grad_norm": 1.4714993238449097, + "learning_rate": 9.839500000000001e-05, + "loss": 0.4981, + "step": 26111 + }, + { + "epoch": 1.462201814312913, + "grad_norm": 1.2625560760498047, + "learning_rate": 9.839473684210527e-05, + "loss": 0.5205, + "step": 26112 + }, + { + "epoch": 1.462257811625042, + "grad_norm": 3.8642263412475586, + "learning_rate": 9.839447368421053e-05, + "loss": 0.5482, + "step": 26113 + }, + { + "epoch": 1.462313808937171, + "grad_norm": 1.502594232559204, + "learning_rate": 9.839421052631579e-05, + "loss": 0.5093, + "step": 26114 + }, + { + "epoch": 1.4623698062493, + "grad_norm": 1.234735131263733, + "learning_rate": 9.839394736842106e-05, + "loss": 0.434, + "step": 26115 + }, + { + "epoch": 1.462425803561429, + "grad_norm": 1.3437854051589966, + "learning_rate": 9.839368421052632e-05, + "loss": 
0.365, + "step": 26116 + }, + { + "epoch": 1.462481800873558, + "grad_norm": 1.4200648069381714, + "learning_rate": 9.839342105263158e-05, + "loss": 0.4288, + "step": 26117 + }, + { + "epoch": 1.462537798185687, + "grad_norm": 1.7690536975860596, + "learning_rate": 9.839315789473684e-05, + "loss": 0.5416, + "step": 26118 + }, + { + "epoch": 1.462593795497816, + "grad_norm": 1.2500172853469849, + "learning_rate": 9.839289473684211e-05, + "loss": 0.374, + "step": 26119 + }, + { + "epoch": 1.4626497928099451, + "grad_norm": 1.230628252029419, + "learning_rate": 9.839263157894737e-05, + "loss": 0.5791, + "step": 26120 + }, + { + "epoch": 1.4627057901220741, + "grad_norm": 1.2279032468795776, + "learning_rate": 9.839236842105263e-05, + "loss": 0.4471, + "step": 26121 + }, + { + "epoch": 1.4627617874342032, + "grad_norm": 1.8044170141220093, + "learning_rate": 9.83921052631579e-05, + "loss": 0.8309, + "step": 26122 + }, + { + "epoch": 1.4628177847463322, + "grad_norm": 1.2737913131713867, + "learning_rate": 9.839184210526317e-05, + "loss": 0.5838, + "step": 26123 + }, + { + "epoch": 1.4628737820584612, + "grad_norm": 1.3217850923538208, + "learning_rate": 9.839157894736843e-05, + "loss": 0.4428, + "step": 26124 + }, + { + "epoch": 1.4629297793705902, + "grad_norm": 1.421263337135315, + "learning_rate": 9.839131578947369e-05, + "loss": 0.457, + "step": 26125 + }, + { + "epoch": 1.4629857766827192, + "grad_norm": 1.8002113103866577, + "learning_rate": 9.839105263157895e-05, + "loss": 0.5559, + "step": 26126 + }, + { + "epoch": 1.4630417739948482, + "grad_norm": 1.7728595733642578, + "learning_rate": 9.839078947368422e-05, + "loss": 0.6489, + "step": 26127 + }, + { + "epoch": 1.4630977713069773, + "grad_norm": 1.3049659729003906, + "learning_rate": 9.839052631578948e-05, + "loss": 0.4105, + "step": 26128 + }, + { + "epoch": 1.4631537686191063, + "grad_norm": 1.3368240594863892, + "learning_rate": 9.839026315789475e-05, + "loss": 0.4986, + "step": 26129 + }, + { + "epoch": 
1.4632097659312353, + "grad_norm": 1.1114985942840576, + "learning_rate": 9.839e-05, + "loss": 0.4899, + "step": 26130 + }, + { + "epoch": 1.4632657632433643, + "grad_norm": 1.4967657327651978, + "learning_rate": 9.838973684210526e-05, + "loss": 0.419, + "step": 26131 + }, + { + "epoch": 1.4633217605554933, + "grad_norm": 1.5984803438186646, + "learning_rate": 9.838947368421053e-05, + "loss": 0.4878, + "step": 26132 + }, + { + "epoch": 1.4633777578676224, + "grad_norm": 1.439347505569458, + "learning_rate": 9.838921052631579e-05, + "loss": 0.4475, + "step": 26133 + }, + { + "epoch": 1.4634337551797514, + "grad_norm": 1.2261884212493896, + "learning_rate": 9.838894736842106e-05, + "loss": 0.434, + "step": 26134 + }, + { + "epoch": 1.4634897524918804, + "grad_norm": 1.3100343942642212, + "learning_rate": 9.838868421052631e-05, + "loss": 0.5021, + "step": 26135 + }, + { + "epoch": 1.4635457498040094, + "grad_norm": 1.1510303020477295, + "learning_rate": 9.838842105263158e-05, + "loss": 0.4006, + "step": 26136 + }, + { + "epoch": 1.4636017471161384, + "grad_norm": 1.1804864406585693, + "learning_rate": 9.838815789473684e-05, + "loss": 0.4323, + "step": 26137 + }, + { + "epoch": 1.4636577444282675, + "grad_norm": 1.1206936836242676, + "learning_rate": 9.838789473684212e-05, + "loss": 0.4375, + "step": 26138 + }, + { + "epoch": 1.4637137417403965, + "grad_norm": 1.62826669216156, + "learning_rate": 9.838763157894738e-05, + "loss": 0.5586, + "step": 26139 + }, + { + "epoch": 1.4637697390525255, + "grad_norm": 1.252765417098999, + "learning_rate": 9.838736842105264e-05, + "loss": 0.4176, + "step": 26140 + }, + { + "epoch": 1.4638257363646545, + "grad_norm": 1.4438307285308838, + "learning_rate": 9.83871052631579e-05, + "loss": 0.5453, + "step": 26141 + }, + { + "epoch": 1.4638817336767835, + "grad_norm": 1.2927666902542114, + "learning_rate": 9.838684210526317e-05, + "loss": 0.4684, + "step": 26142 + }, + { + "epoch": 1.4639377309889126, + "grad_norm": 1.4094184637069702, 
+ "learning_rate": 9.838657894736843e-05, + "loss": 0.4477, + "step": 26143 + }, + { + "epoch": 1.4639937283010416, + "grad_norm": 1.346919059753418, + "learning_rate": 9.838631578947369e-05, + "loss": 0.4416, + "step": 26144 + }, + { + "epoch": 1.4640497256131706, + "grad_norm": 1.358778953552246, + "learning_rate": 9.838605263157895e-05, + "loss": 0.4921, + "step": 26145 + }, + { + "epoch": 1.4641057229252996, + "grad_norm": 1.0951234102249146, + "learning_rate": 9.838578947368422e-05, + "loss": 0.4282, + "step": 26146 + }, + { + "epoch": 1.4641617202374286, + "grad_norm": 1.1184499263763428, + "learning_rate": 9.838552631578948e-05, + "loss": 0.4633, + "step": 26147 + }, + { + "epoch": 1.4642177175495577, + "grad_norm": 2.312195062637329, + "learning_rate": 9.838526315789474e-05, + "loss": 0.4097, + "step": 26148 + }, + { + "epoch": 1.4642737148616867, + "grad_norm": 1.454043984413147, + "learning_rate": 9.8385e-05, + "loss": 0.4175, + "step": 26149 + }, + { + "epoch": 1.4643297121738157, + "grad_norm": 1.3082998991012573, + "learning_rate": 9.838473684210526e-05, + "loss": 0.5793, + "step": 26150 + }, + { + "epoch": 1.4643857094859447, + "grad_norm": 1.2758334875106812, + "learning_rate": 9.838447368421053e-05, + "loss": 0.4464, + "step": 26151 + }, + { + "epoch": 1.4644417067980737, + "grad_norm": 1.262650728225708, + "learning_rate": 9.83842105263158e-05, + "loss": 0.4955, + "step": 26152 + }, + { + "epoch": 1.4644977041102027, + "grad_norm": 1.4613919258117676, + "learning_rate": 9.838394736842105e-05, + "loss": 0.4926, + "step": 26153 + }, + { + "epoch": 1.4645537014223318, + "grad_norm": 1.3332149982452393, + "learning_rate": 9.838368421052631e-05, + "loss": 0.3976, + "step": 26154 + }, + { + "epoch": 1.4646096987344608, + "grad_norm": 2.993288278579712, + "learning_rate": 9.838342105263159e-05, + "loss": 0.5283, + "step": 26155 + }, + { + "epoch": 1.4646656960465898, + "grad_norm": 1.2900582551956177, + "learning_rate": 9.838315789473685e-05, + "loss": 
0.4609, + "step": 26156 + }, + { + "epoch": 1.4647216933587188, + "grad_norm": 1.1441998481750488, + "learning_rate": 9.838289473684212e-05, + "loss": 0.4822, + "step": 26157 + }, + { + "epoch": 1.4647776906708478, + "grad_norm": 2.2140674591064453, + "learning_rate": 9.838263157894737e-05, + "loss": 0.3641, + "step": 26158 + }, + { + "epoch": 1.4648336879829769, + "grad_norm": 1.6417720317840576, + "learning_rate": 9.838236842105264e-05, + "loss": 0.474, + "step": 26159 + }, + { + "epoch": 1.4648896852951059, + "grad_norm": 1.185815691947937, + "learning_rate": 9.83821052631579e-05, + "loss": 0.3657, + "step": 26160 + }, + { + "epoch": 1.464945682607235, + "grad_norm": 1.2073918581008911, + "learning_rate": 9.838184210526317e-05, + "loss": 0.4163, + "step": 26161 + }, + { + "epoch": 1.465001679919364, + "grad_norm": 1.1560301780700684, + "learning_rate": 9.838157894736842e-05, + "loss": 0.4289, + "step": 26162 + }, + { + "epoch": 1.465057677231493, + "grad_norm": 1.4747960567474365, + "learning_rate": 9.838131578947369e-05, + "loss": 0.5165, + "step": 26163 + }, + { + "epoch": 1.465113674543622, + "grad_norm": 1.760401725769043, + "learning_rate": 9.838105263157895e-05, + "loss": 0.5872, + "step": 26164 + }, + { + "epoch": 1.465169671855751, + "grad_norm": 1.168134093284607, + "learning_rate": 9.838078947368422e-05, + "loss": 0.3327, + "step": 26165 + }, + { + "epoch": 1.46522566916788, + "grad_norm": 1.3292665481567383, + "learning_rate": 9.838052631578948e-05, + "loss": 0.5577, + "step": 26166 + }, + { + "epoch": 1.465281666480009, + "grad_norm": 1.2612038850784302, + "learning_rate": 9.838026315789473e-05, + "loss": 0.4686, + "step": 26167 + }, + { + "epoch": 1.465337663792138, + "grad_norm": 1.4005955457687378, + "learning_rate": 9.838e-05, + "loss": 0.4274, + "step": 26168 + }, + { + "epoch": 1.465393661104267, + "grad_norm": 1.398913025856018, + "learning_rate": 9.837973684210526e-05, + "loss": 0.4616, + "step": 26169 + }, + { + "epoch": 1.465449658416396, + 
"grad_norm": 3.524045944213867, + "learning_rate": 9.837947368421054e-05, + "loss": 0.4242, + "step": 26170 + }, + { + "epoch": 1.465505655728525, + "grad_norm": 1.2521146535873413, + "learning_rate": 9.83792105263158e-05, + "loss": 0.3885, + "step": 26171 + }, + { + "epoch": 1.465561653040654, + "grad_norm": 1.2548400163650513, + "learning_rate": 9.837894736842106e-05, + "loss": 0.4356, + "step": 26172 + }, + { + "epoch": 1.4656176503527831, + "grad_norm": 1.3966028690338135, + "learning_rate": 9.837868421052632e-05, + "loss": 0.5344, + "step": 26173 + }, + { + "epoch": 1.4656736476649121, + "grad_norm": 1.1401585340499878, + "learning_rate": 9.837842105263159e-05, + "loss": 0.4329, + "step": 26174 + }, + { + "epoch": 1.4657296449770412, + "grad_norm": 1.516364336013794, + "learning_rate": 9.837815789473685e-05, + "loss": 0.4669, + "step": 26175 + }, + { + "epoch": 1.4657856422891702, + "grad_norm": 1.3908380270004272, + "learning_rate": 9.837789473684211e-05, + "loss": 0.5796, + "step": 26176 + }, + { + "epoch": 1.4658416396012992, + "grad_norm": 1.1535483598709106, + "learning_rate": 9.837763157894737e-05, + "loss": 0.3513, + "step": 26177 + }, + { + "epoch": 1.4658976369134282, + "grad_norm": 1.364871621131897, + "learning_rate": 9.837736842105264e-05, + "loss": 0.6552, + "step": 26178 + }, + { + "epoch": 1.4659536342255572, + "grad_norm": 1.3948826789855957, + "learning_rate": 9.83771052631579e-05, + "loss": 0.5461, + "step": 26179 + }, + { + "epoch": 1.4660096315376863, + "grad_norm": 1.451411485671997, + "learning_rate": 9.837684210526316e-05, + "loss": 0.5771, + "step": 26180 + }, + { + "epoch": 1.4660656288498153, + "grad_norm": 1.616327166557312, + "learning_rate": 9.837657894736842e-05, + "loss": 0.4818, + "step": 26181 + }, + { + "epoch": 1.4661216261619443, + "grad_norm": 1.4954359531402588, + "learning_rate": 9.83763157894737e-05, + "loss": 0.3666, + "step": 26182 + }, + { + "epoch": 1.4661776234740733, + "grad_norm": 2.1145246028900146, + 
"learning_rate": 9.837605263157895e-05, + "loss": 0.5932, + "step": 26183 + }, + { + "epoch": 1.4662336207862023, + "grad_norm": 1.4395397901535034, + "learning_rate": 9.837578947368421e-05, + "loss": 0.3956, + "step": 26184 + }, + { + "epoch": 1.4662896180983314, + "grad_norm": 1.270071268081665, + "learning_rate": 9.837552631578947e-05, + "loss": 0.4866, + "step": 26185 + }, + { + "epoch": 1.4663456154104604, + "grad_norm": 1.183920979499817, + "learning_rate": 9.837526315789473e-05, + "loss": 0.4086, + "step": 26186 + }, + { + "epoch": 1.4664016127225894, + "grad_norm": 2.1491172313690186, + "learning_rate": 9.8375e-05, + "loss": 0.619, + "step": 26187 + }, + { + "epoch": 1.4664576100347184, + "grad_norm": 1.3373596668243408, + "learning_rate": 9.837473684210527e-05, + "loss": 0.425, + "step": 26188 + }, + { + "epoch": 1.4665136073468474, + "grad_norm": 1.3180291652679443, + "learning_rate": 9.837447368421054e-05, + "loss": 0.4633, + "step": 26189 + }, + { + "epoch": 1.4665696046589765, + "grad_norm": 1.2733041048049927, + "learning_rate": 9.837421052631579e-05, + "loss": 0.5698, + "step": 26190 + }, + { + "epoch": 1.4666256019711055, + "grad_norm": 1.4104869365692139, + "learning_rate": 9.837394736842106e-05, + "loss": 0.6728, + "step": 26191 + }, + { + "epoch": 1.4666815992832345, + "grad_norm": 1.3825445175170898, + "learning_rate": 9.837368421052632e-05, + "loss": 0.4538, + "step": 26192 + }, + { + "epoch": 1.4667375965953635, + "grad_norm": 1.294661283493042, + "learning_rate": 9.837342105263159e-05, + "loss": 0.4541, + "step": 26193 + }, + { + "epoch": 1.4667935939074925, + "grad_norm": 1.3568086624145508, + "learning_rate": 9.837315789473685e-05, + "loss": 0.5706, + "step": 26194 + }, + { + "epoch": 1.4668495912196216, + "grad_norm": 2.1935811042785645, + "learning_rate": 9.837289473684211e-05, + "loss": 0.5933, + "step": 26195 + }, + { + "epoch": 1.4669055885317506, + "grad_norm": 1.3590644598007202, + "learning_rate": 9.837263157894737e-05, + "loss": 
0.5384, + "step": 26196 + }, + { + "epoch": 1.4669615858438796, + "grad_norm": 1.1283819675445557, + "learning_rate": 9.837236842105264e-05, + "loss": 0.4398, + "step": 26197 + }, + { + "epoch": 1.4670175831560086, + "grad_norm": 1.3206721544265747, + "learning_rate": 9.83721052631579e-05, + "loss": 0.4794, + "step": 26198 + }, + { + "epoch": 1.4670735804681376, + "grad_norm": 1.4902489185333252, + "learning_rate": 9.837184210526316e-05, + "loss": 0.5192, + "step": 26199 + }, + { + "epoch": 1.4671295777802666, + "grad_norm": 1.4239227771759033, + "learning_rate": 9.837157894736842e-05, + "loss": 0.4689, + "step": 26200 + }, + { + "epoch": 1.4671855750923957, + "grad_norm": 1.2795462608337402, + "learning_rate": 9.837131578947368e-05, + "loss": 0.428, + "step": 26201 + }, + { + "epoch": 1.4672415724045247, + "grad_norm": 1.591731309890747, + "learning_rate": 9.837105263157896e-05, + "loss": 0.4965, + "step": 26202 + }, + { + "epoch": 1.4672975697166537, + "grad_norm": 1.4575308561325073, + "learning_rate": 9.837078947368422e-05, + "loss": 0.4823, + "step": 26203 + }, + { + "epoch": 1.4673535670287827, + "grad_norm": 1.3647435903549194, + "learning_rate": 9.837052631578948e-05, + "loss": 0.3855, + "step": 26204 + }, + { + "epoch": 1.4674095643409117, + "grad_norm": 1.602904200553894, + "learning_rate": 9.837026315789474e-05, + "loss": 0.5449, + "step": 26205 + }, + { + "epoch": 1.4674655616530408, + "grad_norm": 1.2654188871383667, + "learning_rate": 9.837000000000001e-05, + "loss": 0.4398, + "step": 26206 + }, + { + "epoch": 1.4675215589651698, + "grad_norm": 1.1874247789382935, + "learning_rate": 9.836973684210527e-05, + "loss": 0.3172, + "step": 26207 + }, + { + "epoch": 1.4675775562772988, + "grad_norm": 1.2212897539138794, + "learning_rate": 9.836947368421053e-05, + "loss": 0.4092, + "step": 26208 + }, + { + "epoch": 1.4676335535894278, + "grad_norm": 1.4444659948349, + "learning_rate": 9.836921052631579e-05, + "loss": 0.4105, + "step": 26209 + }, + { + "epoch": 
1.4676895509015568, + "grad_norm": 1.3629571199417114, + "learning_rate": 9.836894736842106e-05, + "loss": 0.6394, + "step": 26210 + }, + { + "epoch": 1.4677455482136859, + "grad_norm": 1.2981512546539307, + "learning_rate": 9.836868421052632e-05, + "loss": 0.5366, + "step": 26211 + }, + { + "epoch": 1.4678015455258149, + "grad_norm": 1.6782872676849365, + "learning_rate": 9.83684210526316e-05, + "loss": 0.5427, + "step": 26212 + }, + { + "epoch": 1.467857542837944, + "grad_norm": 1.6365340948104858, + "learning_rate": 9.836815789473684e-05, + "loss": 0.494, + "step": 26213 + }, + { + "epoch": 1.467913540150073, + "grad_norm": 1.3263881206512451, + "learning_rate": 9.836789473684211e-05, + "loss": 0.4234, + "step": 26214 + }, + { + "epoch": 1.467969537462202, + "grad_norm": 1.6355187892913818, + "learning_rate": 9.836763157894737e-05, + "loss": 0.6058, + "step": 26215 + }, + { + "epoch": 1.468025534774331, + "grad_norm": 1.500312328338623, + "learning_rate": 9.836736842105265e-05, + "loss": 0.4829, + "step": 26216 + }, + { + "epoch": 1.46808153208646, + "grad_norm": 1.261370301246643, + "learning_rate": 9.836710526315789e-05, + "loss": 0.5173, + "step": 26217 + }, + { + "epoch": 1.468137529398589, + "grad_norm": 1.103760004043579, + "learning_rate": 9.836684210526315e-05, + "loss": 0.4576, + "step": 26218 + }, + { + "epoch": 1.468193526710718, + "grad_norm": 1.3855925798416138, + "learning_rate": 9.836657894736843e-05, + "loss": 0.4705, + "step": 26219 + }, + { + "epoch": 1.468249524022847, + "grad_norm": 1.7272679805755615, + "learning_rate": 9.836631578947369e-05, + "loss": 0.4376, + "step": 26220 + }, + { + "epoch": 1.468305521334976, + "grad_norm": 1.2779830694198608, + "learning_rate": 9.836605263157896e-05, + "loss": 0.558, + "step": 26221 + }, + { + "epoch": 1.468361518647105, + "grad_norm": 1.3490245342254639, + "learning_rate": 9.83657894736842e-05, + "loss": 0.4422, + "step": 26222 + }, + { + "epoch": 1.468417515959234, + "grad_norm": 1.7253708839416504, 
+ "learning_rate": 9.836552631578948e-05, + "loss": 0.5349, + "step": 26223 + }, + { + "epoch": 1.468473513271363, + "grad_norm": 1.2533903121948242, + "learning_rate": 9.836526315789474e-05, + "loss": 0.489, + "step": 26224 + }, + { + "epoch": 1.4685295105834921, + "grad_norm": 1.0124671459197998, + "learning_rate": 9.836500000000001e-05, + "loss": 0.3777, + "step": 26225 + }, + { + "epoch": 1.4685855078956211, + "grad_norm": 1.3425922393798828, + "learning_rate": 9.836473684210527e-05, + "loss": 0.4844, + "step": 26226 + }, + { + "epoch": 1.4686415052077502, + "grad_norm": 1.3471434116363525, + "learning_rate": 9.836447368421053e-05, + "loss": 0.4589, + "step": 26227 + }, + { + "epoch": 1.4686975025198792, + "grad_norm": 1.4200447797775269, + "learning_rate": 9.836421052631579e-05, + "loss": 0.4118, + "step": 26228 + }, + { + "epoch": 1.468753499832008, + "grad_norm": 1.3445628881454468, + "learning_rate": 9.836394736842106e-05, + "loss": 0.4383, + "step": 26229 + }, + { + "epoch": 1.468809497144137, + "grad_norm": 1.3946771621704102, + "learning_rate": 9.836368421052632e-05, + "loss": 0.4094, + "step": 26230 + }, + { + "epoch": 1.468865494456266, + "grad_norm": 1.3378466367721558, + "learning_rate": 9.836342105263158e-05, + "loss": 0.4681, + "step": 26231 + }, + { + "epoch": 1.468921491768395, + "grad_norm": 1.1821906566619873, + "learning_rate": 9.836315789473684e-05, + "loss": 0.3714, + "step": 26232 + }, + { + "epoch": 1.468977489080524, + "grad_norm": 1.356821894645691, + "learning_rate": 9.836289473684212e-05, + "loss": 0.5454, + "step": 26233 + }, + { + "epoch": 1.469033486392653, + "grad_norm": 1.5115985870361328, + "learning_rate": 9.836263157894738e-05, + "loss": 0.5112, + "step": 26234 + }, + { + "epoch": 1.469089483704782, + "grad_norm": 1.245773196220398, + "learning_rate": 9.836236842105264e-05, + "loss": 0.4485, + "step": 26235 + }, + { + "epoch": 1.4691454810169111, + "grad_norm": 1.9896646738052368, + "learning_rate": 9.83621052631579e-05, + 
"loss": 0.6119, + "step": 26236 + }, + { + "epoch": 1.4692014783290401, + "grad_norm": 1.217750072479248, + "learning_rate": 9.836184210526315e-05, + "loss": 0.4095, + "step": 26237 + }, + { + "epoch": 1.4692574756411692, + "grad_norm": 1.9007140398025513, + "learning_rate": 9.836157894736843e-05, + "loss": 0.4894, + "step": 26238 + }, + { + "epoch": 1.4693134729532982, + "grad_norm": 1.4336644411087036, + "learning_rate": 9.836131578947369e-05, + "loss": 0.4557, + "step": 26239 + }, + { + "epoch": 1.4693694702654272, + "grad_norm": 1.2235615253448486, + "learning_rate": 9.836105263157895e-05, + "loss": 0.4204, + "step": 26240 + }, + { + "epoch": 1.4694254675775562, + "grad_norm": 1.5203443765640259, + "learning_rate": 9.836078947368421e-05, + "loss": 0.6011, + "step": 26241 + }, + { + "epoch": 1.4694814648896852, + "grad_norm": 1.2648966312408447, + "learning_rate": 9.836052631578948e-05, + "loss": 0.5083, + "step": 26242 + }, + { + "epoch": 1.4695374622018142, + "grad_norm": 1.2400827407836914, + "learning_rate": 9.836026315789474e-05, + "loss": 0.5313, + "step": 26243 + }, + { + "epoch": 1.4695934595139433, + "grad_norm": 1.233296513557434, + "learning_rate": 9.836000000000001e-05, + "loss": 0.4499, + "step": 26244 + }, + { + "epoch": 1.4696494568260723, + "grad_norm": 1.275831937789917, + "learning_rate": 9.835973684210526e-05, + "loss": 0.539, + "step": 26245 + }, + { + "epoch": 1.4697054541382013, + "grad_norm": 1.2991141080856323, + "learning_rate": 9.835947368421053e-05, + "loss": 0.502, + "step": 26246 + }, + { + "epoch": 1.4697614514503303, + "grad_norm": 1.430525541305542, + "learning_rate": 9.835921052631579e-05, + "loss": 0.489, + "step": 26247 + }, + { + "epoch": 1.4698174487624593, + "grad_norm": 1.268449068069458, + "learning_rate": 9.835894736842107e-05, + "loss": 0.3601, + "step": 26248 + }, + { + "epoch": 1.4698734460745884, + "grad_norm": 1.5674430131912231, + "learning_rate": 9.835868421052633e-05, + "loss": 0.6267, + "step": 26249 + }, + { + 
"epoch": 1.4699294433867174, + "grad_norm": 1.63973069190979, + "learning_rate": 9.835842105263159e-05, + "loss": 0.5781, + "step": 26250 + }, + { + "epoch": 1.4699854406988464, + "grad_norm": 1.3429597616195679, + "learning_rate": 9.835815789473685e-05, + "loss": 0.3855, + "step": 26251 + }, + { + "epoch": 1.4700414380109754, + "grad_norm": 1.366430640220642, + "learning_rate": 9.83578947368421e-05, + "loss": 0.416, + "step": 26252 + }, + { + "epoch": 1.4700974353231044, + "grad_norm": 1.4533041715621948, + "learning_rate": 9.835763157894738e-05, + "loss": 0.45, + "step": 26253 + }, + { + "epoch": 1.4701534326352335, + "grad_norm": 1.2154712677001953, + "learning_rate": 9.835736842105264e-05, + "loss": 0.4355, + "step": 26254 + }, + { + "epoch": 1.4702094299473625, + "grad_norm": 1.096551537513733, + "learning_rate": 9.83571052631579e-05, + "loss": 0.4278, + "step": 26255 + }, + { + "epoch": 1.4702654272594915, + "grad_norm": 1.5150096416473389, + "learning_rate": 9.835684210526316e-05, + "loss": 0.4331, + "step": 26256 + }, + { + "epoch": 1.4703214245716205, + "grad_norm": 1.4409756660461426, + "learning_rate": 9.835657894736843e-05, + "loss": 0.4369, + "step": 26257 + }, + { + "epoch": 1.4703774218837495, + "grad_norm": 1.529038667678833, + "learning_rate": 9.835631578947369e-05, + "loss": 0.5506, + "step": 26258 + }, + { + "epoch": 1.4704334191958786, + "grad_norm": 1.58342444896698, + "learning_rate": 9.835605263157895e-05, + "loss": 0.4856, + "step": 26259 + }, + { + "epoch": 1.4704894165080076, + "grad_norm": 1.3333591222763062, + "learning_rate": 9.835578947368421e-05, + "loss": 0.4603, + "step": 26260 + }, + { + "epoch": 1.4705454138201366, + "grad_norm": 1.2099251747131348, + "learning_rate": 9.835552631578948e-05, + "loss": 0.402, + "step": 26261 + }, + { + "epoch": 1.4706014111322656, + "grad_norm": 1.2804086208343506, + "learning_rate": 9.835526315789474e-05, + "loss": 0.3806, + "step": 26262 + }, + { + "epoch": 1.4706574084443946, + "grad_norm": 
1.2557204961776733, + "learning_rate": 9.8355e-05, + "loss": 0.383, + "step": 26263 + }, + { + "epoch": 1.4707134057565237, + "grad_norm": 1.328728199005127, + "learning_rate": 9.835473684210526e-05, + "loss": 0.6845, + "step": 26264 + }, + { + "epoch": 1.4707694030686527, + "grad_norm": 1.572045922279358, + "learning_rate": 9.835447368421054e-05, + "loss": 0.4997, + "step": 26265 + }, + { + "epoch": 1.4708254003807817, + "grad_norm": 1.8102408647537231, + "learning_rate": 9.83542105263158e-05, + "loss": 0.5642, + "step": 26266 + }, + { + "epoch": 1.4708813976929107, + "grad_norm": 1.502687692642212, + "learning_rate": 9.835394736842107e-05, + "loss": 0.5193, + "step": 26267 + }, + { + "epoch": 1.4709373950050397, + "grad_norm": 1.3405128717422485, + "learning_rate": 9.835368421052631e-05, + "loss": 0.4186, + "step": 26268 + }, + { + "epoch": 1.4709933923171687, + "grad_norm": 1.430985450744629, + "learning_rate": 9.835342105263159e-05, + "loss": 0.7133, + "step": 26269 + }, + { + "epoch": 1.4710493896292978, + "grad_norm": 1.7925950288772583, + "learning_rate": 9.835315789473685e-05, + "loss": 0.4835, + "step": 26270 + }, + { + "epoch": 1.4711053869414268, + "grad_norm": 1.115112543106079, + "learning_rate": 9.835289473684211e-05, + "loss": 0.4697, + "step": 26271 + }, + { + "epoch": 1.4711613842535558, + "grad_norm": 1.8055635690689087, + "learning_rate": 9.835263157894737e-05, + "loss": 0.6008, + "step": 26272 + }, + { + "epoch": 1.4712173815656848, + "grad_norm": 1.398441195487976, + "learning_rate": 9.835236842105263e-05, + "loss": 0.4506, + "step": 26273 + }, + { + "epoch": 1.4712733788778138, + "grad_norm": 1.3960886001586914, + "learning_rate": 9.83521052631579e-05, + "loss": 0.5928, + "step": 26274 + }, + { + "epoch": 1.4713293761899429, + "grad_norm": 1.3009015321731567, + "learning_rate": 9.835184210526316e-05, + "loss": 0.4612, + "step": 26275 + }, + { + "epoch": 1.4713853735020719, + "grad_norm": 1.52046537399292, + "learning_rate": 
9.835157894736843e-05, + "loss": 0.5243, + "step": 26276 + }, + { + "epoch": 1.471441370814201, + "grad_norm": 1.382170557975769, + "learning_rate": 9.835131578947368e-05, + "loss": 0.5536, + "step": 26277 + }, + { + "epoch": 1.47149736812633, + "grad_norm": 1.4639800786972046, + "learning_rate": 9.835105263157895e-05, + "loss": 0.4572, + "step": 26278 + }, + { + "epoch": 1.471553365438459, + "grad_norm": 1.494409203529358, + "learning_rate": 9.835078947368421e-05, + "loss": 0.4796, + "step": 26279 + }, + { + "epoch": 1.471609362750588, + "grad_norm": 1.181498646736145, + "learning_rate": 9.835052631578949e-05, + "loss": 0.4657, + "step": 26280 + }, + { + "epoch": 1.471665360062717, + "grad_norm": 1.3346881866455078, + "learning_rate": 9.835026315789475e-05, + "loss": 0.5247, + "step": 26281 + }, + { + "epoch": 1.471721357374846, + "grad_norm": 1.2633123397827148, + "learning_rate": 9.835e-05, + "loss": 0.3755, + "step": 26282 + }, + { + "epoch": 1.471777354686975, + "grad_norm": 1.2065980434417725, + "learning_rate": 9.834973684210527e-05, + "loss": 0.5416, + "step": 26283 + }, + { + "epoch": 1.471833351999104, + "grad_norm": 1.511853814125061, + "learning_rate": 9.834947368421054e-05, + "loss": 0.5346, + "step": 26284 + }, + { + "epoch": 1.471889349311233, + "grad_norm": 1.6110639572143555, + "learning_rate": 9.83492105263158e-05, + "loss": 0.5677, + "step": 26285 + }, + { + "epoch": 1.471945346623362, + "grad_norm": 1.2345631122589111, + "learning_rate": 9.834894736842106e-05, + "loss": 0.6015, + "step": 26286 + }, + { + "epoch": 1.472001343935491, + "grad_norm": 1.659584879875183, + "learning_rate": 9.834868421052632e-05, + "loss": 0.4492, + "step": 26287 + }, + { + "epoch": 1.47205734124762, + "grad_norm": 1.4842394590377808, + "learning_rate": 9.834842105263158e-05, + "loss": 0.5161, + "step": 26288 + }, + { + "epoch": 1.4721133385597491, + "grad_norm": 1.5364598035812378, + "learning_rate": 9.834815789473685e-05, + "loss": 0.5089, + "step": 26289 + }, + { + 
"epoch": 1.4721693358718781, + "grad_norm": 1.1319185495376587, + "learning_rate": 9.834789473684211e-05, + "loss": 0.3774, + "step": 26290 + }, + { + "epoch": 1.4722253331840072, + "grad_norm": 1.2564584016799927, + "learning_rate": 9.834763157894737e-05, + "loss": 0.4365, + "step": 26291 + }, + { + "epoch": 1.4722813304961362, + "grad_norm": 1.386336088180542, + "learning_rate": 9.834736842105263e-05, + "loss": 0.4626, + "step": 26292 + }, + { + "epoch": 1.4723373278082652, + "grad_norm": 1.2889857292175293, + "learning_rate": 9.83471052631579e-05, + "loss": 0.469, + "step": 26293 + }, + { + "epoch": 1.4723933251203942, + "grad_norm": 2.604438304901123, + "learning_rate": 9.834684210526316e-05, + "loss": 0.4053, + "step": 26294 + }, + { + "epoch": 1.4724493224325232, + "grad_norm": 1.454038381576538, + "learning_rate": 9.834657894736842e-05, + "loss": 0.6295, + "step": 26295 + }, + { + "epoch": 1.4725053197446523, + "grad_norm": 1.471243143081665, + "learning_rate": 9.834631578947368e-05, + "loss": 0.4956, + "step": 26296 + }, + { + "epoch": 1.4725613170567813, + "grad_norm": 1.1576184034347534, + "learning_rate": 9.834605263157896e-05, + "loss": 0.4578, + "step": 26297 + }, + { + "epoch": 1.4726173143689103, + "grad_norm": 1.247467279434204, + "learning_rate": 9.834578947368422e-05, + "loss": 0.4447, + "step": 26298 + }, + { + "epoch": 1.4726733116810393, + "grad_norm": 1.2596161365509033, + "learning_rate": 9.834552631578949e-05, + "loss": 0.4248, + "step": 26299 + }, + { + "epoch": 1.4727293089931683, + "grad_norm": 1.3921743631362915, + "learning_rate": 9.834526315789473e-05, + "loss": 0.4215, + "step": 26300 + }, + { + "epoch": 1.4727853063052974, + "grad_norm": 1.458272099494934, + "learning_rate": 9.834500000000001e-05, + "loss": 0.4621, + "step": 26301 + }, + { + "epoch": 1.4728413036174264, + "grad_norm": 1.153385043144226, + "learning_rate": 9.834473684210527e-05, + "loss": 0.4612, + "step": 26302 + }, + { + "epoch": 1.4728973009295554, + "grad_norm": 
1.3927161693572998, + "learning_rate": 9.834447368421054e-05, + "loss": 0.4447, + "step": 26303 + }, + { + "epoch": 1.4729532982416844, + "grad_norm": 1.2911595106124878, + "learning_rate": 9.83442105263158e-05, + "loss": 0.4224, + "step": 26304 + }, + { + "epoch": 1.4730092955538134, + "grad_norm": 1.290352702140808, + "learning_rate": 9.834394736842105e-05, + "loss": 0.3922, + "step": 26305 + }, + { + "epoch": 1.4730652928659425, + "grad_norm": 1.5228631496429443, + "learning_rate": 9.834368421052632e-05, + "loss": 0.4679, + "step": 26306 + }, + { + "epoch": 1.4731212901780715, + "grad_norm": 1.3852399587631226, + "learning_rate": 9.834342105263158e-05, + "loss": 0.3971, + "step": 26307 + }, + { + "epoch": 1.4731772874902005, + "grad_norm": 1.2399355173110962, + "learning_rate": 9.834315789473685e-05, + "loss": 0.4984, + "step": 26308 + }, + { + "epoch": 1.4732332848023295, + "grad_norm": 1.2135710716247559, + "learning_rate": 9.83428947368421e-05, + "loss": 0.4384, + "step": 26309 + }, + { + "epoch": 1.4732892821144585, + "grad_norm": 1.644197940826416, + "learning_rate": 9.834263157894737e-05, + "loss": 0.5053, + "step": 26310 + }, + { + "epoch": 1.4733452794265876, + "grad_norm": 1.3441396951675415, + "learning_rate": 9.834236842105263e-05, + "loss": 0.3923, + "step": 26311 + }, + { + "epoch": 1.4734012767387166, + "grad_norm": 1.3883205652236938, + "learning_rate": 9.83421052631579e-05, + "loss": 0.6124, + "step": 26312 + }, + { + "epoch": 1.4734572740508456, + "grad_norm": 1.3514333963394165, + "learning_rate": 9.834184210526317e-05, + "loss": 0.5058, + "step": 26313 + }, + { + "epoch": 1.4735132713629746, + "grad_norm": 1.5079749822616577, + "learning_rate": 9.834157894736843e-05, + "loss": 0.5274, + "step": 26314 + }, + { + "epoch": 1.4735692686751036, + "grad_norm": 1.2684351205825806, + "learning_rate": 9.834131578947368e-05, + "loss": 0.4921, + "step": 26315 + }, + { + "epoch": 1.4736252659872326, + "grad_norm": 1.300086498260498, + "learning_rate": 
9.834105263157896e-05, + "loss": 0.4784, + "step": 26316 + }, + { + "epoch": 1.4736812632993617, + "grad_norm": 1.300095796585083, + "learning_rate": 9.834078947368422e-05, + "loss": 0.4917, + "step": 26317 + }, + { + "epoch": 1.4737372606114907, + "grad_norm": 1.8674501180648804, + "learning_rate": 9.834052631578948e-05, + "loss": 0.5852, + "step": 26318 + }, + { + "epoch": 1.4737932579236197, + "grad_norm": 1.654843807220459, + "learning_rate": 9.834026315789474e-05, + "loss": 0.5459, + "step": 26319 + }, + { + "epoch": 1.4738492552357487, + "grad_norm": 1.3569914102554321, + "learning_rate": 9.834000000000001e-05, + "loss": 0.4422, + "step": 26320 + }, + { + "epoch": 1.4739052525478777, + "grad_norm": 1.52809739112854, + "learning_rate": 9.833973684210527e-05, + "loss": 0.2912, + "step": 26321 + }, + { + "epoch": 1.4739612498600068, + "grad_norm": 1.3407853841781616, + "learning_rate": 9.833947368421053e-05, + "loss": 0.4849, + "step": 26322 + }, + { + "epoch": 1.4740172471721358, + "grad_norm": 1.2122225761413574, + "learning_rate": 9.833921052631579e-05, + "loss": 0.3636, + "step": 26323 + }, + { + "epoch": 1.4740732444842648, + "grad_norm": 1.421341061592102, + "learning_rate": 9.833894736842105e-05, + "loss": 0.4648, + "step": 26324 + }, + { + "epoch": 1.4741292417963938, + "grad_norm": 1.3635461330413818, + "learning_rate": 9.833868421052632e-05, + "loss": 0.3847, + "step": 26325 + }, + { + "epoch": 1.4741852391085228, + "grad_norm": 1.5558712482452393, + "learning_rate": 9.833842105263158e-05, + "loss": 0.7002, + "step": 26326 + }, + { + "epoch": 1.4742412364206519, + "grad_norm": 1.379497766494751, + "learning_rate": 9.833815789473684e-05, + "loss": 0.5231, + "step": 26327 + }, + { + "epoch": 1.4742972337327809, + "grad_norm": 1.337708830833435, + "learning_rate": 9.83378947368421e-05, + "loss": 0.3953, + "step": 26328 + }, + { + "epoch": 1.47435323104491, + "grad_norm": 1.4808558225631714, + "learning_rate": 9.833763157894738e-05, + "loss": 0.4949, + 
"step": 26329 + }, + { + "epoch": 1.474409228357039, + "grad_norm": 1.286215901374817, + "learning_rate": 9.833736842105263e-05, + "loss": 0.352, + "step": 26330 + }, + { + "epoch": 1.474465225669168, + "grad_norm": 1.3624666929244995, + "learning_rate": 9.833710526315791e-05, + "loss": 0.5038, + "step": 26331 + }, + { + "epoch": 1.474521222981297, + "grad_norm": 1.7593283653259277, + "learning_rate": 9.833684210526315e-05, + "loss": 0.5098, + "step": 26332 + }, + { + "epoch": 1.474577220293426, + "grad_norm": 1.313141107559204, + "learning_rate": 9.833657894736843e-05, + "loss": 0.4492, + "step": 26333 + }, + { + "epoch": 1.474633217605555, + "grad_norm": 1.3633440732955933, + "learning_rate": 9.833631578947369e-05, + "loss": 0.3685, + "step": 26334 + }, + { + "epoch": 1.474689214917684, + "grad_norm": 1.2808122634887695, + "learning_rate": 9.833605263157896e-05, + "loss": 0.4479, + "step": 26335 + }, + { + "epoch": 1.474745212229813, + "grad_norm": 1.3971952199935913, + "learning_rate": 9.833578947368422e-05, + "loss": 0.4375, + "step": 26336 + }, + { + "epoch": 1.474801209541942, + "grad_norm": 1.2194265127182007, + "learning_rate": 9.833552631578948e-05, + "loss": 0.4314, + "step": 26337 + }, + { + "epoch": 1.474857206854071, + "grad_norm": 1.1891316175460815, + "learning_rate": 9.833526315789474e-05, + "loss": 0.3928, + "step": 26338 + }, + { + "epoch": 1.4749132041662, + "grad_norm": 1.1536589860916138, + "learning_rate": 9.8335e-05, + "loss": 0.4065, + "step": 26339 + }, + { + "epoch": 1.474969201478329, + "grad_norm": 1.2504948377609253, + "learning_rate": 9.833473684210527e-05, + "loss": 0.4255, + "step": 26340 + }, + { + "epoch": 1.4750251987904581, + "grad_norm": 1.4145692586898804, + "learning_rate": 9.833447368421053e-05, + "loss": 0.4366, + "step": 26341 + }, + { + "epoch": 1.4750811961025871, + "grad_norm": 1.2258678674697876, + "learning_rate": 9.833421052631579e-05, + "loss": 0.5979, + "step": 26342 + }, + { + "epoch": 1.4751371934147162, + 
"grad_norm": 1.4275203943252563, + "learning_rate": 9.833394736842105e-05, + "loss": 0.4396, + "step": 26343 + }, + { + "epoch": 1.4751931907268452, + "grad_norm": 1.2004852294921875, + "learning_rate": 9.833368421052633e-05, + "loss": 0.3964, + "step": 26344 + }, + { + "epoch": 1.4752491880389742, + "grad_norm": 1.274658441543579, + "learning_rate": 9.833342105263159e-05, + "loss": 0.4803, + "step": 26345 + }, + { + "epoch": 1.4753051853511032, + "grad_norm": 1.2332416772842407, + "learning_rate": 9.833315789473684e-05, + "loss": 0.4954, + "step": 26346 + }, + { + "epoch": 1.4753611826632322, + "grad_norm": 1.2086886167526245, + "learning_rate": 9.83328947368421e-05, + "loss": 0.4858, + "step": 26347 + }, + { + "epoch": 1.4754171799753613, + "grad_norm": 1.24374520778656, + "learning_rate": 9.833263157894738e-05, + "loss": 0.4812, + "step": 26348 + }, + { + "epoch": 1.4754731772874903, + "grad_norm": 1.462717890739441, + "learning_rate": 9.833236842105264e-05, + "loss": 0.4936, + "step": 26349 + }, + { + "epoch": 1.4755291745996193, + "grad_norm": 1.3813000917434692, + "learning_rate": 9.83321052631579e-05, + "loss": 0.4134, + "step": 26350 + }, + { + "epoch": 1.4755851719117483, + "grad_norm": 1.4574353694915771, + "learning_rate": 9.833184210526316e-05, + "loss": 0.5203, + "step": 26351 + }, + { + "epoch": 1.4756411692238773, + "grad_norm": 11.261170387268066, + "learning_rate": 9.833157894736843e-05, + "loss": 0.5446, + "step": 26352 + }, + { + "epoch": 1.4756971665360061, + "grad_norm": 1.5375767946243286, + "learning_rate": 9.833131578947369e-05, + "loss": 0.4074, + "step": 26353 + }, + { + "epoch": 1.4757531638481352, + "grad_norm": 1.4761403799057007, + "learning_rate": 9.833105263157896e-05, + "loss": 0.4767, + "step": 26354 + }, + { + "epoch": 1.4758091611602642, + "grad_norm": 1.3679126501083374, + "learning_rate": 9.833078947368421e-05, + "loss": 0.4278, + "step": 26355 + }, + { + "epoch": 1.4758651584723932, + "grad_norm": 1.2875906229019165, + 
"learning_rate": 9.833052631578947e-05, + "loss": 0.4687, + "step": 26356 + }, + { + "epoch": 1.4759211557845222, + "grad_norm": 1.0745713710784912, + "learning_rate": 9.833026315789474e-05, + "loss": 0.3685, + "step": 26357 + }, + { + "epoch": 1.4759771530966512, + "grad_norm": 1.2158112525939941, + "learning_rate": 9.833e-05, + "loss": 0.4848, + "step": 26358 + }, + { + "epoch": 1.4760331504087802, + "grad_norm": 1.1623754501342773, + "learning_rate": 9.832973684210528e-05, + "loss": 0.5242, + "step": 26359 + }, + { + "epoch": 1.4760891477209093, + "grad_norm": 1.2012572288513184, + "learning_rate": 9.832947368421052e-05, + "loss": 0.4513, + "step": 26360 + }, + { + "epoch": 1.4761451450330383, + "grad_norm": 1.6301683187484741, + "learning_rate": 9.83292105263158e-05, + "loss": 0.4982, + "step": 26361 + }, + { + "epoch": 1.4762011423451673, + "grad_norm": 1.2660621404647827, + "learning_rate": 9.832894736842105e-05, + "loss": 0.5017, + "step": 26362 + }, + { + "epoch": 1.4762571396572963, + "grad_norm": 1.179836630821228, + "learning_rate": 9.832868421052633e-05, + "loss": 0.4138, + "step": 26363 + }, + { + "epoch": 1.4763131369694253, + "grad_norm": 1.2577470541000366, + "learning_rate": 9.832842105263157e-05, + "loss": 0.4766, + "step": 26364 + }, + { + "epoch": 1.4763691342815544, + "grad_norm": 1.3299623727798462, + "learning_rate": 9.832815789473685e-05, + "loss": 0.5242, + "step": 26365 + }, + { + "epoch": 1.4764251315936834, + "grad_norm": 1.5384547710418701, + "learning_rate": 9.832789473684211e-05, + "loss": 0.453, + "step": 26366 + }, + { + "epoch": 1.4764811289058124, + "grad_norm": 1.502936840057373, + "learning_rate": 9.832763157894738e-05, + "loss": 0.4914, + "step": 26367 + }, + { + "epoch": 1.4765371262179414, + "grad_norm": 1.1734130382537842, + "learning_rate": 9.832736842105264e-05, + "loss": 0.3136, + "step": 26368 + }, + { + "epoch": 1.4765931235300704, + "grad_norm": 1.788120150566101, + "learning_rate": 9.83271052631579e-05, + "loss": 
0.4253, + "step": 26369 + }, + { + "epoch": 1.4766491208421995, + "grad_norm": 1.5197820663452148, + "learning_rate": 9.832684210526316e-05, + "loss": 0.5413, + "step": 26370 + }, + { + "epoch": 1.4767051181543285, + "grad_norm": 2.077280282974243, + "learning_rate": 9.832657894736843e-05, + "loss": 0.5113, + "step": 26371 + }, + { + "epoch": 1.4767611154664575, + "grad_norm": 1.2023744583129883, + "learning_rate": 9.832631578947369e-05, + "loss": 0.5016, + "step": 26372 + }, + { + "epoch": 1.4768171127785865, + "grad_norm": 1.2389616966247559, + "learning_rate": 9.832605263157895e-05, + "loss": 0.4866, + "step": 26373 + }, + { + "epoch": 1.4768731100907155, + "grad_norm": 1.177794337272644, + "learning_rate": 9.832578947368421e-05, + "loss": 0.4466, + "step": 26374 + }, + { + "epoch": 1.4769291074028446, + "grad_norm": 1.428767442703247, + "learning_rate": 9.832552631578947e-05, + "loss": 0.4747, + "step": 26375 + }, + { + "epoch": 1.4769851047149736, + "grad_norm": 1.4767810106277466, + "learning_rate": 9.832526315789475e-05, + "loss": 0.4311, + "step": 26376 + }, + { + "epoch": 1.4770411020271026, + "grad_norm": 1.1158396005630493, + "learning_rate": 9.8325e-05, + "loss": 0.373, + "step": 26377 + }, + { + "epoch": 1.4770970993392316, + "grad_norm": 1.7182050943374634, + "learning_rate": 9.832473684210526e-05, + "loss": 0.4972, + "step": 26378 + }, + { + "epoch": 1.4771530966513606, + "grad_norm": 1.5233622789382935, + "learning_rate": 9.832447368421052e-05, + "loss": 0.4996, + "step": 26379 + }, + { + "epoch": 1.4772090939634897, + "grad_norm": 1.4264438152313232, + "learning_rate": 9.83242105263158e-05, + "loss": 0.5008, + "step": 26380 + }, + { + "epoch": 1.4772650912756187, + "grad_norm": 1.1224011182785034, + "learning_rate": 9.832394736842106e-05, + "loss": 0.3306, + "step": 26381 + }, + { + "epoch": 1.4773210885877477, + "grad_norm": 1.9449200630187988, + "learning_rate": 9.832368421052632e-05, + "loss": 0.53, + "step": 26382 + }, + { + "epoch": 
1.4773770858998767, + "grad_norm": 1.3462685346603394, + "learning_rate": 9.832342105263158e-05, + "loss": 0.464, + "step": 26383 + }, + { + "epoch": 1.4774330832120057, + "grad_norm": 1.302512526512146, + "learning_rate": 9.832315789473685e-05, + "loss": 0.3426, + "step": 26384 + }, + { + "epoch": 1.4774890805241347, + "grad_norm": 1.703502893447876, + "learning_rate": 9.832289473684211e-05, + "loss": 0.5233, + "step": 26385 + }, + { + "epoch": 1.4775450778362638, + "grad_norm": 1.4388452768325806, + "learning_rate": 9.832263157894738e-05, + "loss": 0.569, + "step": 26386 + }, + { + "epoch": 1.4776010751483928, + "grad_norm": 1.9214638471603394, + "learning_rate": 9.832236842105263e-05, + "loss": 0.495, + "step": 26387 + }, + { + "epoch": 1.4776570724605218, + "grad_norm": 1.1944797039031982, + "learning_rate": 9.83221052631579e-05, + "loss": 0.4455, + "step": 26388 + }, + { + "epoch": 1.4777130697726508, + "grad_norm": 1.778193712234497, + "learning_rate": 9.832184210526316e-05, + "loss": 0.5706, + "step": 26389 + }, + { + "epoch": 1.4777690670847798, + "grad_norm": 1.3621065616607666, + "learning_rate": 9.832157894736844e-05, + "loss": 0.5761, + "step": 26390 + }, + { + "epoch": 1.4778250643969089, + "grad_norm": 1.1708863973617554, + "learning_rate": 9.83213157894737e-05, + "loss": 0.3399, + "step": 26391 + }, + { + "epoch": 1.4778810617090379, + "grad_norm": 1.17164146900177, + "learning_rate": 9.832105263157894e-05, + "loss": 0.326, + "step": 26392 + }, + { + "epoch": 1.477937059021167, + "grad_norm": 1.2393851280212402, + "learning_rate": 9.832078947368421e-05, + "loss": 0.4197, + "step": 26393 + }, + { + "epoch": 1.477993056333296, + "grad_norm": 1.4932615756988525, + "learning_rate": 9.832052631578947e-05, + "loss": 0.5323, + "step": 26394 + }, + { + "epoch": 1.478049053645425, + "grad_norm": 1.2480852603912354, + "learning_rate": 9.832026315789475e-05, + "loss": 0.461, + "step": 26395 + }, + { + "epoch": 1.478105050957554, + "grad_norm": 
1.423901081085205, + "learning_rate": 9.832000000000001e-05, + "loss": 0.3795, + "step": 26396 + }, + { + "epoch": 1.478161048269683, + "grad_norm": 1.1684621572494507, + "learning_rate": 9.831973684210527e-05, + "loss": 0.4044, + "step": 26397 + }, + { + "epoch": 1.478217045581812, + "grad_norm": 1.1378426551818848, + "learning_rate": 9.831947368421053e-05, + "loss": 0.4358, + "step": 26398 + }, + { + "epoch": 1.478273042893941, + "grad_norm": 1.8066250085830688, + "learning_rate": 9.83192105263158e-05, + "loss": 0.6716, + "step": 26399 + }, + { + "epoch": 1.47832904020607, + "grad_norm": 1.414628505706787, + "learning_rate": 9.831894736842106e-05, + "loss": 0.4991, + "step": 26400 + }, + { + "epoch": 1.478385037518199, + "grad_norm": 1.4492090940475464, + "learning_rate": 9.831868421052632e-05, + "loss": 0.4438, + "step": 26401 + }, + { + "epoch": 1.478441034830328, + "grad_norm": 1.5771111249923706, + "learning_rate": 9.831842105263158e-05, + "loss": 0.4732, + "step": 26402 + }, + { + "epoch": 1.478497032142457, + "grad_norm": 1.8152694702148438, + "learning_rate": 9.831815789473685e-05, + "loss": 0.4741, + "step": 26403 + }, + { + "epoch": 1.478553029454586, + "grad_norm": 1.3209187984466553, + "learning_rate": 9.831789473684211e-05, + "loss": 0.3829, + "step": 26404 + }, + { + "epoch": 1.4786090267667151, + "grad_norm": 1.21585214138031, + "learning_rate": 9.831763157894737e-05, + "loss": 0.4164, + "step": 26405 + }, + { + "epoch": 1.4786650240788441, + "grad_norm": 1.0698697566986084, + "learning_rate": 9.831736842105263e-05, + "loss": 0.349, + "step": 26406 + }, + { + "epoch": 1.4787210213909732, + "grad_norm": 1.439136266708374, + "learning_rate": 9.83171052631579e-05, + "loss": 0.3543, + "step": 26407 + }, + { + "epoch": 1.4787770187031022, + "grad_norm": 1.2398730516433716, + "learning_rate": 9.831684210526316e-05, + "loss": 0.5895, + "step": 26408 + }, + { + "epoch": 1.4788330160152312, + "grad_norm": 1.3039520978927612, + "learning_rate": 
9.831657894736842e-05, + "loss": 0.5787, + "step": 26409 + }, + { + "epoch": 1.4788890133273602, + "grad_norm": 1.1072537899017334, + "learning_rate": 9.831631578947368e-05, + "loss": 0.4565, + "step": 26410 + }, + { + "epoch": 1.4789450106394892, + "grad_norm": 1.2011222839355469, + "learning_rate": 9.831605263157894e-05, + "loss": 0.4562, + "step": 26411 + }, + { + "epoch": 1.4790010079516183, + "grad_norm": 1.3780157566070557, + "learning_rate": 9.831578947368422e-05, + "loss": 0.4488, + "step": 26412 + }, + { + "epoch": 1.4790570052637473, + "grad_norm": 1.3226121664047241, + "learning_rate": 9.831552631578948e-05, + "loss": 0.5458, + "step": 26413 + }, + { + "epoch": 1.4791130025758763, + "grad_norm": 1.4769147634506226, + "learning_rate": 9.831526315789475e-05, + "loss": 0.5004, + "step": 26414 + }, + { + "epoch": 1.4791689998880053, + "grad_norm": 1.467332363128662, + "learning_rate": 9.8315e-05, + "loss": 0.6603, + "step": 26415 + }, + { + "epoch": 1.4792249972001343, + "grad_norm": 1.6566319465637207, + "learning_rate": 9.831473684210527e-05, + "loss": 0.5555, + "step": 26416 + }, + { + "epoch": 1.4792809945122634, + "grad_norm": 1.2249643802642822, + "learning_rate": 9.831447368421053e-05, + "loss": 0.4686, + "step": 26417 + }, + { + "epoch": 1.4793369918243924, + "grad_norm": 1.3104848861694336, + "learning_rate": 9.83142105263158e-05, + "loss": 0.4912, + "step": 26418 + }, + { + "epoch": 1.4793929891365214, + "grad_norm": 1.394104242324829, + "learning_rate": 9.831394736842105e-05, + "loss": 0.5911, + "step": 26419 + }, + { + "epoch": 1.4794489864486504, + "grad_norm": 1.2208060026168823, + "learning_rate": 9.831368421052632e-05, + "loss": 0.3924, + "step": 26420 + }, + { + "epoch": 1.4795049837607794, + "grad_norm": 1.481011986732483, + "learning_rate": 9.831342105263158e-05, + "loss": 0.4138, + "step": 26421 + }, + { + "epoch": 1.4795609810729085, + "grad_norm": 1.4936903715133667, + "learning_rate": 9.831315789473686e-05, + "loss": 0.6404, + "step": 
26422 + }, + { + "epoch": 1.4796169783850375, + "grad_norm": 1.253900408744812, + "learning_rate": 9.831289473684211e-05, + "loss": 0.4427, + "step": 26423 + }, + { + "epoch": 1.4796729756971665, + "grad_norm": 1.721214771270752, + "learning_rate": 9.831263157894737e-05, + "loss": 0.4268, + "step": 26424 + }, + { + "epoch": 1.4797289730092955, + "grad_norm": 1.1525635719299316, + "learning_rate": 9.831236842105263e-05, + "loss": 0.4156, + "step": 26425 + }, + { + "epoch": 1.4797849703214245, + "grad_norm": 1.419958233833313, + "learning_rate": 9.83121052631579e-05, + "loss": 0.5837, + "step": 26426 + }, + { + "epoch": 1.4798409676335536, + "grad_norm": 1.310727596282959, + "learning_rate": 9.831184210526317e-05, + "loss": 0.4129, + "step": 26427 + }, + { + "epoch": 1.4798969649456826, + "grad_norm": 1.3370429277420044, + "learning_rate": 9.831157894736843e-05, + "loss": 0.3879, + "step": 26428 + }, + { + "epoch": 1.4799529622578116, + "grad_norm": 1.1163039207458496, + "learning_rate": 9.831131578947369e-05, + "loss": 0.377, + "step": 26429 + }, + { + "epoch": 1.4800089595699406, + "grad_norm": 1.5141627788543701, + "learning_rate": 9.831105263157895e-05, + "loss": 0.7203, + "step": 26430 + }, + { + "epoch": 1.4800649568820696, + "grad_norm": 1.1618309020996094, + "learning_rate": 9.831078947368422e-05, + "loss": 0.4064, + "step": 26431 + }, + { + "epoch": 1.4801209541941986, + "grad_norm": 1.1684426069259644, + "learning_rate": 9.831052631578948e-05, + "loss": 0.4171, + "step": 26432 + }, + { + "epoch": 1.4801769515063277, + "grad_norm": 1.3729358911514282, + "learning_rate": 9.831026315789474e-05, + "loss": 0.4878, + "step": 26433 + }, + { + "epoch": 1.4802329488184567, + "grad_norm": 1.247262716293335, + "learning_rate": 9.831e-05, + "loss": 0.4128, + "step": 26434 + }, + { + "epoch": 1.4802889461305857, + "grad_norm": 1.4308249950408936, + "learning_rate": 9.830973684210527e-05, + "loss": 0.5528, + "step": 26435 + }, + { + "epoch": 1.4803449434427147, + 
"grad_norm": 1.364248275756836, + "learning_rate": 9.830947368421053e-05, + "loss": 0.4859, + "step": 26436 + }, + { + "epoch": 1.4804009407548437, + "grad_norm": 1.3801158666610718, + "learning_rate": 9.830921052631579e-05, + "loss": 0.4584, + "step": 26437 + }, + { + "epoch": 1.4804569380669728, + "grad_norm": 1.2886649370193481, + "learning_rate": 9.830894736842105e-05, + "loss": 0.5631, + "step": 26438 + }, + { + "epoch": 1.4805129353791018, + "grad_norm": 1.2724788188934326, + "learning_rate": 9.830868421052632e-05, + "loss": 0.449, + "step": 26439 + }, + { + "epoch": 1.4805689326912308, + "grad_norm": 1.3946900367736816, + "learning_rate": 9.830842105263158e-05, + "loss": 0.3947, + "step": 26440 + }, + { + "epoch": 1.4806249300033598, + "grad_norm": 1.218000888824463, + "learning_rate": 9.830815789473686e-05, + "loss": 0.3982, + "step": 26441 + }, + { + "epoch": 1.4806809273154888, + "grad_norm": 1.2052074670791626, + "learning_rate": 9.83078947368421e-05, + "loss": 0.4831, + "step": 26442 + }, + { + "epoch": 1.4807369246276179, + "grad_norm": 1.4074491262435913, + "learning_rate": 9.830763157894736e-05, + "loss": 0.4657, + "step": 26443 + }, + { + "epoch": 1.4807929219397469, + "grad_norm": 1.4324088096618652, + "learning_rate": 9.830736842105264e-05, + "loss": 0.4509, + "step": 26444 + }, + { + "epoch": 1.480848919251876, + "grad_norm": 1.564112901687622, + "learning_rate": 9.83071052631579e-05, + "loss": 0.5721, + "step": 26445 + }, + { + "epoch": 1.480904916564005, + "grad_norm": 1.3879790306091309, + "learning_rate": 9.830684210526317e-05, + "loss": 0.4773, + "step": 26446 + }, + { + "epoch": 1.480960913876134, + "grad_norm": 1.097662329673767, + "learning_rate": 9.830657894736842e-05, + "loss": 0.5217, + "step": 26447 + }, + { + "epoch": 1.481016911188263, + "grad_norm": 1.2054959535598755, + "learning_rate": 9.830631578947369e-05, + "loss": 0.4029, + "step": 26448 + }, + { + "epoch": 1.481072908500392, + "grad_norm": 1.091911792755127, + 
"learning_rate": 9.830605263157895e-05, + "loss": 0.4091, + "step": 26449 + }, + { + "epoch": 1.481128905812521, + "grad_norm": 1.366909146308899, + "learning_rate": 9.830578947368422e-05, + "loss": 0.5705, + "step": 26450 + }, + { + "epoch": 1.48118490312465, + "grad_norm": 1.342989206314087, + "learning_rate": 9.830552631578948e-05, + "loss": 0.5374, + "step": 26451 + }, + { + "epoch": 1.481240900436779, + "grad_norm": 1.3994578123092651, + "learning_rate": 9.830526315789474e-05, + "loss": 0.3981, + "step": 26452 + }, + { + "epoch": 1.481296897748908, + "grad_norm": 1.430092215538025, + "learning_rate": 9.8305e-05, + "loss": 0.4674, + "step": 26453 + }, + { + "epoch": 1.481352895061037, + "grad_norm": 1.160904884338379, + "learning_rate": 9.830473684210527e-05, + "loss": 0.3719, + "step": 26454 + }, + { + "epoch": 1.481408892373166, + "grad_norm": 1.5587517023086548, + "learning_rate": 9.830447368421053e-05, + "loss": 0.5502, + "step": 26455 + }, + { + "epoch": 1.481464889685295, + "grad_norm": 1.2606472969055176, + "learning_rate": 9.83042105263158e-05, + "loss": 0.4605, + "step": 26456 + }, + { + "epoch": 1.4815208869974241, + "grad_norm": 1.5438977479934692, + "learning_rate": 9.830394736842105e-05, + "loss": 0.8613, + "step": 26457 + }, + { + "epoch": 1.4815768843095531, + "grad_norm": 1.6708910465240479, + "learning_rate": 9.830368421052633e-05, + "loss": 0.5333, + "step": 26458 + }, + { + "epoch": 1.4816328816216822, + "grad_norm": 1.3151296377182007, + "learning_rate": 9.830342105263159e-05, + "loss": 0.3697, + "step": 26459 + }, + { + "epoch": 1.4816888789338112, + "grad_norm": 1.283218502998352, + "learning_rate": 9.830315789473685e-05, + "loss": 0.4876, + "step": 26460 + }, + { + "epoch": 1.4817448762459402, + "grad_norm": 1.3207824230194092, + "learning_rate": 9.83028947368421e-05, + "loss": 0.4758, + "step": 26461 + }, + { + "epoch": 1.4818008735580692, + "grad_norm": 1.426101803779602, + "learning_rate": 9.830263157894737e-05, + "loss": 0.5537, + 
"step": 26462 + }, + { + "epoch": 1.4818568708701982, + "grad_norm": 1.3909064531326294, + "learning_rate": 9.830236842105264e-05, + "loss": 0.3975, + "step": 26463 + }, + { + "epoch": 1.4819128681823273, + "grad_norm": 1.3793268203735352, + "learning_rate": 9.83021052631579e-05, + "loss": 0.5087, + "step": 26464 + }, + { + "epoch": 1.4819688654944563, + "grad_norm": 1.632776141166687, + "learning_rate": 9.830184210526316e-05, + "loss": 0.4604, + "step": 26465 + }, + { + "epoch": 1.4820248628065853, + "grad_norm": 1.5219885110855103, + "learning_rate": 9.830157894736842e-05, + "loss": 0.6344, + "step": 26466 + }, + { + "epoch": 1.4820808601187143, + "grad_norm": 1.3433325290679932, + "learning_rate": 9.830131578947369e-05, + "loss": 0.4594, + "step": 26467 + }, + { + "epoch": 1.4821368574308433, + "grad_norm": 1.540515661239624, + "learning_rate": 9.830105263157895e-05, + "loss": 0.3933, + "step": 26468 + }, + { + "epoch": 1.4821928547429724, + "grad_norm": 1.087449312210083, + "learning_rate": 9.830078947368422e-05, + "loss": 0.4609, + "step": 26469 + }, + { + "epoch": 1.4822488520551014, + "grad_norm": 1.4942513704299927, + "learning_rate": 9.830052631578947e-05, + "loss": 0.5734, + "step": 26470 + }, + { + "epoch": 1.4823048493672304, + "grad_norm": 1.2333168983459473, + "learning_rate": 9.830026315789474e-05, + "loss": 0.3683, + "step": 26471 + }, + { + "epoch": 1.4823608466793594, + "grad_norm": 1.3990916013717651, + "learning_rate": 9.83e-05, + "loss": 0.4677, + "step": 26472 + }, + { + "epoch": 1.4824168439914884, + "grad_norm": 1.342435359954834, + "learning_rate": 9.829973684210528e-05, + "loss": 0.4344, + "step": 26473 + }, + { + "epoch": 1.4824728413036175, + "grad_norm": 10.466675758361816, + "learning_rate": 9.829947368421052e-05, + "loss": 0.5972, + "step": 26474 + }, + { + "epoch": 1.4825288386157465, + "grad_norm": 1.4302014112472534, + "learning_rate": 9.82992105263158e-05, + "loss": 0.5685, + "step": 26475 + }, + { + "epoch": 1.4825848359278755, + 
"grad_norm": 1.4066733121871948, + "learning_rate": 9.829894736842106e-05, + "loss": 0.5278, + "step": 26476 + }, + { + "epoch": 1.4826408332400045, + "grad_norm": 2.721774101257324, + "learning_rate": 9.829868421052632e-05, + "loss": 0.4604, + "step": 26477 + }, + { + "epoch": 1.4826968305521335, + "grad_norm": 1.548677921295166, + "learning_rate": 9.829842105263159e-05, + "loss": 0.4053, + "step": 26478 + }, + { + "epoch": 1.4827528278642625, + "grad_norm": 1.1114532947540283, + "learning_rate": 9.829815789473684e-05, + "loss": 0.5194, + "step": 26479 + }, + { + "epoch": 1.4828088251763916, + "grad_norm": 1.2839246988296509, + "learning_rate": 9.829789473684211e-05, + "loss": 0.4681, + "step": 26480 + }, + { + "epoch": 1.4828648224885206, + "grad_norm": 1.4902235269546509, + "learning_rate": 9.829763157894737e-05, + "loss": 0.4127, + "step": 26481 + }, + { + "epoch": 1.4829208198006496, + "grad_norm": 1.416769027709961, + "learning_rate": 9.829736842105264e-05, + "loss": 0.5673, + "step": 26482 + }, + { + "epoch": 1.4829768171127786, + "grad_norm": 1.4382866621017456, + "learning_rate": 9.82971052631579e-05, + "loss": 0.4693, + "step": 26483 + }, + { + "epoch": 1.4830328144249076, + "grad_norm": 1.2511255741119385, + "learning_rate": 9.829684210526316e-05, + "loss": 0.439, + "step": 26484 + }, + { + "epoch": 1.4830888117370367, + "grad_norm": 1.4404335021972656, + "learning_rate": 9.829657894736842e-05, + "loss": 0.5361, + "step": 26485 + }, + { + "epoch": 1.4831448090491657, + "grad_norm": 1.4500209093093872, + "learning_rate": 9.82963157894737e-05, + "loss": 0.4741, + "step": 26486 + }, + { + "epoch": 1.4832008063612947, + "grad_norm": 1.334502100944519, + "learning_rate": 9.829605263157895e-05, + "loss": 0.5394, + "step": 26487 + }, + { + "epoch": 1.4832568036734237, + "grad_norm": 1.18729567527771, + "learning_rate": 9.829578947368421e-05, + "loss": 0.3905, + "step": 26488 + }, + { + "epoch": 1.4833128009855527, + "grad_norm": 1.1093251705169678, + 
"learning_rate": 9.829552631578947e-05, + "loss": 0.3999, + "step": 26489 + }, + { + "epoch": 1.4833687982976818, + "grad_norm": 1.262854814529419, + "learning_rate": 9.829526315789475e-05, + "loss": 0.4139, + "step": 26490 + }, + { + "epoch": 1.4834247956098108, + "grad_norm": 1.491692066192627, + "learning_rate": 9.8295e-05, + "loss": 0.5313, + "step": 26491 + }, + { + "epoch": 1.4834807929219398, + "grad_norm": 1.256239652633667, + "learning_rate": 9.829473684210527e-05, + "loss": 0.42, + "step": 26492 + }, + { + "epoch": 1.4835367902340688, + "grad_norm": 1.0862276554107666, + "learning_rate": 9.829447368421053e-05, + "loss": 0.3264, + "step": 26493 + }, + { + "epoch": 1.4835927875461978, + "grad_norm": 1.4838993549346924, + "learning_rate": 9.829421052631579e-05, + "loss": 0.395, + "step": 26494 + }, + { + "epoch": 1.4836487848583269, + "grad_norm": 1.4637272357940674, + "learning_rate": 9.829394736842106e-05, + "loss": 0.4645, + "step": 26495 + }, + { + "epoch": 1.4837047821704559, + "grad_norm": 1.218091368675232, + "learning_rate": 9.829368421052632e-05, + "loss": 0.5034, + "step": 26496 + }, + { + "epoch": 1.483760779482585, + "grad_norm": 1.3952760696411133, + "learning_rate": 9.829342105263158e-05, + "loss": 0.3725, + "step": 26497 + }, + { + "epoch": 1.483816776794714, + "grad_norm": 1.4923816919326782, + "learning_rate": 9.829315789473684e-05, + "loss": 0.4811, + "step": 26498 + }, + { + "epoch": 1.483872774106843, + "grad_norm": 1.4141688346862793, + "learning_rate": 9.829289473684211e-05, + "loss": 0.6126, + "step": 26499 + }, + { + "epoch": 1.483928771418972, + "grad_norm": 1.403441071510315, + "learning_rate": 9.829263157894737e-05, + "loss": 0.4566, + "step": 26500 + }, + { + "epoch": 1.483984768731101, + "grad_norm": 1.1639097929000854, + "learning_rate": 9.829236842105264e-05, + "loss": 0.4863, + "step": 26501 + }, + { + "epoch": 1.48404076604323, + "grad_norm": 1.2145073413848877, + "learning_rate": 9.829210526315789e-05, + "loss": 0.4488, + 
"step": 26502 + }, + { + "epoch": 1.484096763355359, + "grad_norm": 1.349830150604248, + "learning_rate": 9.829184210526316e-05, + "loss": 0.4137, + "step": 26503 + }, + { + "epoch": 1.484152760667488, + "grad_norm": 1.2369256019592285, + "learning_rate": 9.829157894736842e-05, + "loss": 0.3126, + "step": 26504 + }, + { + "epoch": 1.484208757979617, + "grad_norm": 1.1879065036773682, + "learning_rate": 9.82913157894737e-05, + "loss": 0.4301, + "step": 26505 + }, + { + "epoch": 1.484264755291746, + "grad_norm": 1.506127119064331, + "learning_rate": 9.829105263157896e-05, + "loss": 0.7175, + "step": 26506 + }, + { + "epoch": 1.484320752603875, + "grad_norm": 1.4014273881912231, + "learning_rate": 9.829078947368422e-05, + "loss": 0.4257, + "step": 26507 + }, + { + "epoch": 1.484376749916004, + "grad_norm": 1.3636915683746338, + "learning_rate": 9.829052631578948e-05, + "loss": 0.4488, + "step": 26508 + }, + { + "epoch": 1.4844327472281331, + "grad_norm": 1.363595962524414, + "learning_rate": 9.829026315789475e-05, + "loss": 0.3705, + "step": 26509 + }, + { + "epoch": 1.4844887445402621, + "grad_norm": 1.5012990236282349, + "learning_rate": 9.829000000000001e-05, + "loss": 0.412, + "step": 26510 + }, + { + "epoch": 1.4845447418523912, + "grad_norm": 1.3049137592315674, + "learning_rate": 9.828973684210527e-05, + "loss": 0.4308, + "step": 26511 + }, + { + "epoch": 1.4846007391645202, + "grad_norm": 1.3145856857299805, + "learning_rate": 9.828947368421053e-05, + "loss": 0.3557, + "step": 26512 + }, + { + "epoch": 1.4846567364766492, + "grad_norm": 1.1711186170578003, + "learning_rate": 9.828921052631579e-05, + "loss": 0.4307, + "step": 26513 + }, + { + "epoch": 1.4847127337887782, + "grad_norm": 1.9552361965179443, + "learning_rate": 9.828894736842106e-05, + "loss": 0.3842, + "step": 26514 + }, + { + "epoch": 1.4847687311009072, + "grad_norm": 1.2192083597183228, + "learning_rate": 9.828868421052632e-05, + "loss": 0.4244, + "step": 26515 + }, + { + "epoch": 
1.4848247284130363, + "grad_norm": 1.537446141242981, + "learning_rate": 9.828842105263158e-05, + "loss": 0.4177, + "step": 26516 + }, + { + "epoch": 1.4848807257251653, + "grad_norm": 1.4063435792922974, + "learning_rate": 9.828815789473684e-05, + "loss": 0.537, + "step": 26517 + }, + { + "epoch": 1.4849367230372943, + "grad_norm": 1.3753093481063843, + "learning_rate": 9.828789473684211e-05, + "loss": 0.4318, + "step": 26518 + }, + { + "epoch": 1.4849927203494233, + "grad_norm": 1.205532193183899, + "learning_rate": 9.828763157894737e-05, + "loss": 0.4066, + "step": 26519 + }, + { + "epoch": 1.4850487176615523, + "grad_norm": 1.2799038887023926, + "learning_rate": 9.828736842105263e-05, + "loss": 0.4152, + "step": 26520 + }, + { + "epoch": 1.4851047149736814, + "grad_norm": 1.3891645669937134, + "learning_rate": 9.82871052631579e-05, + "loss": 0.4474, + "step": 26521 + }, + { + "epoch": 1.4851607122858104, + "grad_norm": 1.5356215238571167, + "learning_rate": 9.828684210526317e-05, + "loss": 0.5309, + "step": 26522 + }, + { + "epoch": 1.4852167095979394, + "grad_norm": 1.4802745580673218, + "learning_rate": 9.828657894736843e-05, + "loss": 0.5008, + "step": 26523 + }, + { + "epoch": 1.4852727069100684, + "grad_norm": 1.4387753009796143, + "learning_rate": 9.82863157894737e-05, + "loss": 0.5275, + "step": 26524 + }, + { + "epoch": 1.4853287042221974, + "grad_norm": 1.2407060861587524, + "learning_rate": 9.828605263157895e-05, + "loss": 0.4245, + "step": 26525 + }, + { + "epoch": 1.4853847015343264, + "grad_norm": 1.364608883857727, + "learning_rate": 9.828578947368422e-05, + "loss": 0.4472, + "step": 26526 + }, + { + "epoch": 1.4854406988464555, + "grad_norm": 1.448456883430481, + "learning_rate": 9.828552631578948e-05, + "loss": 0.5022, + "step": 26527 + }, + { + "epoch": 1.4854966961585845, + "grad_norm": 1.405077576637268, + "learning_rate": 9.828526315789475e-05, + "loss": 0.401, + "step": 26528 + }, + { + "epoch": 1.4855526934707135, + "grad_norm": 
1.231691598892212, + "learning_rate": 9.8285e-05, + "loss": 0.3612, + "step": 26529 + }, + { + "epoch": 1.4856086907828425, + "grad_norm": 1.1891672611236572, + "learning_rate": 9.828473684210526e-05, + "loss": 0.5706, + "step": 26530 + }, + { + "epoch": 1.4856646880949715, + "grad_norm": 1.8921364545822144, + "learning_rate": 9.828447368421053e-05, + "loss": 0.5376, + "step": 26531 + }, + { + "epoch": 1.4857206854071006, + "grad_norm": 1.308678388595581, + "learning_rate": 9.828421052631579e-05, + "loss": 0.4693, + "step": 26532 + }, + { + "epoch": 1.4857766827192296, + "grad_norm": 1.3780572414398193, + "learning_rate": 9.828394736842106e-05, + "loss": 0.4731, + "step": 26533 + }, + { + "epoch": 1.4858326800313586, + "grad_norm": 1.3991550207138062, + "learning_rate": 9.828368421052631e-05, + "loss": 0.4826, + "step": 26534 + }, + { + "epoch": 1.4858886773434876, + "grad_norm": 6.05579948425293, + "learning_rate": 9.828342105263158e-05, + "loss": 0.5507, + "step": 26535 + }, + { + "epoch": 1.4859446746556166, + "grad_norm": 1.2682628631591797, + "learning_rate": 9.828315789473684e-05, + "loss": 0.4841, + "step": 26536 + }, + { + "epoch": 1.4860006719677457, + "grad_norm": 1.2944921255111694, + "learning_rate": 9.828289473684212e-05, + "loss": 0.4768, + "step": 26537 + }, + { + "epoch": 1.4860566692798747, + "grad_norm": 1.390641689300537, + "learning_rate": 9.828263157894738e-05, + "loss": 0.532, + "step": 26538 + }, + { + "epoch": 1.4861126665920037, + "grad_norm": 1.2902518510818481, + "learning_rate": 9.828236842105264e-05, + "loss": 0.4296, + "step": 26539 + }, + { + "epoch": 1.4861686639041327, + "grad_norm": 1.514411211013794, + "learning_rate": 9.82821052631579e-05, + "loss": 0.4976, + "step": 26540 + }, + { + "epoch": 1.4862246612162617, + "grad_norm": 1.4082057476043701, + "learning_rate": 9.828184210526317e-05, + "loss": 0.4531, + "step": 26541 + }, + { + "epoch": 1.4862806585283908, + "grad_norm": 1.2194361686706543, + "learning_rate": 
9.828157894736843e-05, + "loss": 0.4623, + "step": 26542 + }, + { + "epoch": 1.4863366558405198, + "grad_norm": 1.5673096179962158, + "learning_rate": 9.828131578947369e-05, + "loss": 0.4011, + "step": 26543 + }, + { + "epoch": 1.4863926531526488, + "grad_norm": 1.132565975189209, + "learning_rate": 9.828105263157895e-05, + "loss": 0.3721, + "step": 26544 + }, + { + "epoch": 1.4864486504647778, + "grad_norm": 1.6840846538543701, + "learning_rate": 9.828078947368422e-05, + "loss": 0.5935, + "step": 26545 + }, + { + "epoch": 1.4865046477769068, + "grad_norm": 1.255689263343811, + "learning_rate": 9.828052631578948e-05, + "loss": 0.4315, + "step": 26546 + }, + { + "epoch": 1.4865606450890358, + "grad_norm": 1.2471442222595215, + "learning_rate": 9.828026315789474e-05, + "loss": 0.4162, + "step": 26547 + }, + { + "epoch": 1.4866166424011649, + "grad_norm": 1.4482321739196777, + "learning_rate": 9.828e-05, + "loss": 0.4751, + "step": 26548 + }, + { + "epoch": 1.4866726397132939, + "grad_norm": 1.298810601234436, + "learning_rate": 9.827973684210526e-05, + "loss": 0.5159, + "step": 26549 + }, + { + "epoch": 1.486728637025423, + "grad_norm": 1.200095295906067, + "learning_rate": 9.827947368421053e-05, + "loss": 0.4506, + "step": 26550 + }, + { + "epoch": 1.486784634337552, + "grad_norm": 1.4362943172454834, + "learning_rate": 9.82792105263158e-05, + "loss": 0.475, + "step": 26551 + }, + { + "epoch": 1.486840631649681, + "grad_norm": 1.1563717126846313, + "learning_rate": 9.827894736842105e-05, + "loss": 0.3387, + "step": 26552 + }, + { + "epoch": 1.48689662896181, + "grad_norm": 1.1349667310714722, + "learning_rate": 9.827868421052631e-05, + "loss": 0.3534, + "step": 26553 + }, + { + "epoch": 1.486952626273939, + "grad_norm": 1.1395677328109741, + "learning_rate": 9.827842105263159e-05, + "loss": 0.4657, + "step": 26554 + }, + { + "epoch": 1.487008623586068, + "grad_norm": 1.8083597421646118, + "learning_rate": 9.827815789473685e-05, + "loss": 0.6034, + "step": 26555 + }, 
+ { + "epoch": 1.487064620898197, + "grad_norm": 1.188975214958191, + "learning_rate": 9.827789473684212e-05, + "loss": 0.4013, + "step": 26556 + }, + { + "epoch": 1.487120618210326, + "grad_norm": 1.316445231437683, + "learning_rate": 9.827763157894737e-05, + "loss": 0.5189, + "step": 26557 + }, + { + "epoch": 1.487176615522455, + "grad_norm": 1.5387048721313477, + "learning_rate": 9.827736842105264e-05, + "loss": 0.5348, + "step": 26558 + }, + { + "epoch": 1.487232612834584, + "grad_norm": 1.3718448877334595, + "learning_rate": 9.82771052631579e-05, + "loss": 0.5667, + "step": 26559 + }, + { + "epoch": 1.4872886101467129, + "grad_norm": 5.825066566467285, + "learning_rate": 9.827684210526317e-05, + "loss": 0.3749, + "step": 26560 + }, + { + "epoch": 1.487344607458842, + "grad_norm": 1.566150426864624, + "learning_rate": 9.827657894736843e-05, + "loss": 0.403, + "step": 26561 + }, + { + "epoch": 1.487400604770971, + "grad_norm": 1.1900097131729126, + "learning_rate": 9.827631578947369e-05, + "loss": 0.4285, + "step": 26562 + }, + { + "epoch": 1.4874566020831, + "grad_norm": 1.266889214515686, + "learning_rate": 9.827605263157895e-05, + "loss": 0.4519, + "step": 26563 + }, + { + "epoch": 1.487512599395229, + "grad_norm": 1.1044726371765137, + "learning_rate": 9.827578947368421e-05, + "loss": 0.4364, + "step": 26564 + }, + { + "epoch": 1.487568596707358, + "grad_norm": 1.3785444498062134, + "learning_rate": 9.827552631578948e-05, + "loss": 0.4805, + "step": 26565 + }, + { + "epoch": 1.487624594019487, + "grad_norm": 1.5386810302734375, + "learning_rate": 9.827526315789473e-05, + "loss": 0.5183, + "step": 26566 + }, + { + "epoch": 1.487680591331616, + "grad_norm": 1.3156059980392456, + "learning_rate": 9.8275e-05, + "loss": 0.4641, + "step": 26567 + }, + { + "epoch": 1.487736588643745, + "grad_norm": 1.3793963193893433, + "learning_rate": 9.827473684210526e-05, + "loss": 0.5021, + "step": 26568 + }, + { + "epoch": 1.487792585955874, + "grad_norm": 1.1304057836532593, 
+ "learning_rate": 9.827447368421054e-05, + "loss": 0.4386, + "step": 26569 + }, + { + "epoch": 1.487848583268003, + "grad_norm": 1.4514892101287842, + "learning_rate": 9.82742105263158e-05, + "loss": 0.5732, + "step": 26570 + }, + { + "epoch": 1.487904580580132, + "grad_norm": 1.2929202318191528, + "learning_rate": 9.827394736842106e-05, + "loss": 0.3627, + "step": 26571 + }, + { + "epoch": 1.487960577892261, + "grad_norm": 1.4279794692993164, + "learning_rate": 9.827368421052632e-05, + "loss": 0.4568, + "step": 26572 + }, + { + "epoch": 1.4880165752043901, + "grad_norm": 1.3467376232147217, + "learning_rate": 9.827342105263159e-05, + "loss": 0.4781, + "step": 26573 + }, + { + "epoch": 1.4880725725165191, + "grad_norm": 1.162685751914978, + "learning_rate": 9.827315789473685e-05, + "loss": 0.373, + "step": 26574 + }, + { + "epoch": 1.4881285698286482, + "grad_norm": 1.4191398620605469, + "learning_rate": 9.827289473684211e-05, + "loss": 0.6028, + "step": 26575 + }, + { + "epoch": 1.4881845671407772, + "grad_norm": 1.132226586341858, + "learning_rate": 9.827263157894737e-05, + "loss": 0.3635, + "step": 26576 + }, + { + "epoch": 1.4882405644529062, + "grad_norm": 1.084893822669983, + "learning_rate": 9.827236842105264e-05, + "loss": 0.463, + "step": 26577 + }, + { + "epoch": 1.4882965617650352, + "grad_norm": 1.3243541717529297, + "learning_rate": 9.82721052631579e-05, + "loss": 0.4241, + "step": 26578 + }, + { + "epoch": 1.4883525590771642, + "grad_norm": 1.4677218198776245, + "learning_rate": 9.827184210526317e-05, + "loss": 0.5732, + "step": 26579 + }, + { + "epoch": 1.4884085563892933, + "grad_norm": 1.4174957275390625, + "learning_rate": 9.827157894736842e-05, + "loss": 0.4608, + "step": 26580 + }, + { + "epoch": 1.4884645537014223, + "grad_norm": 1.155500888824463, + "learning_rate": 9.827131578947368e-05, + "loss": 0.358, + "step": 26581 + }, + { + "epoch": 1.4885205510135513, + "grad_norm": 1.275414228439331, + "learning_rate": 9.827105263157895e-05, + 
"loss": 0.4646, + "step": 26582 + }, + { + "epoch": 1.4885765483256803, + "grad_norm": 1.2806388139724731, + "learning_rate": 9.827078947368421e-05, + "loss": 0.4829, + "step": 26583 + }, + { + "epoch": 1.4886325456378093, + "grad_norm": 1.2613698244094849, + "learning_rate": 9.827052631578947e-05, + "loss": 0.4348, + "step": 26584 + }, + { + "epoch": 1.4886885429499384, + "grad_norm": 1.1614763736724854, + "learning_rate": 9.827026315789473e-05, + "loss": 0.4095, + "step": 26585 + }, + { + "epoch": 1.4887445402620674, + "grad_norm": 1.1521501541137695, + "learning_rate": 9.827e-05, + "loss": 0.4157, + "step": 26586 + }, + { + "epoch": 1.4888005375741964, + "grad_norm": 1.5595088005065918, + "learning_rate": 9.826973684210527e-05, + "loss": 0.4135, + "step": 26587 + }, + { + "epoch": 1.4888565348863254, + "grad_norm": 1.1848524808883667, + "learning_rate": 9.826947368421054e-05, + "loss": 0.432, + "step": 26588 + }, + { + "epoch": 1.4889125321984544, + "grad_norm": 2.0190978050231934, + "learning_rate": 9.826921052631579e-05, + "loss": 0.4802, + "step": 26589 + }, + { + "epoch": 1.4889685295105834, + "grad_norm": 1.8962711095809937, + "learning_rate": 9.826894736842106e-05, + "loss": 0.6623, + "step": 26590 + }, + { + "epoch": 1.4890245268227125, + "grad_norm": 1.5094536542892456, + "learning_rate": 9.826868421052632e-05, + "loss": 0.5381, + "step": 26591 + }, + { + "epoch": 1.4890805241348415, + "grad_norm": 1.4123467206954956, + "learning_rate": 9.826842105263159e-05, + "loss": 0.4597, + "step": 26592 + }, + { + "epoch": 1.4891365214469705, + "grad_norm": 1.146776795387268, + "learning_rate": 9.826815789473685e-05, + "loss": 0.4171, + "step": 26593 + }, + { + "epoch": 1.4891925187590995, + "grad_norm": 1.3933452367782593, + "learning_rate": 9.826789473684211e-05, + "loss": 0.5097, + "step": 26594 + }, + { + "epoch": 1.4892485160712285, + "grad_norm": 1.508174180984497, + "learning_rate": 9.826763157894737e-05, + "loss": 0.5648, + "step": 26595 + }, + { + "epoch": 
1.4893045133833576, + "grad_norm": 1.2280765771865845, + "learning_rate": 9.826736842105264e-05, + "loss": 0.4365, + "step": 26596 + }, + { + "epoch": 1.4893605106954866, + "grad_norm": 1.496867299079895, + "learning_rate": 9.82671052631579e-05, + "loss": 0.4005, + "step": 26597 + }, + { + "epoch": 1.4894165080076156, + "grad_norm": 1.4241769313812256, + "learning_rate": 9.826684210526316e-05, + "loss": 0.5276, + "step": 26598 + }, + { + "epoch": 1.4894725053197446, + "grad_norm": 1.3316019773483276, + "learning_rate": 9.826657894736842e-05, + "loss": 0.4393, + "step": 26599 + }, + { + "epoch": 1.4895285026318736, + "grad_norm": 1.1914223432540894, + "learning_rate": 9.826631578947368e-05, + "loss": 0.3833, + "step": 26600 + }, + { + "epoch": 1.4895844999440027, + "grad_norm": 1.281269907951355, + "learning_rate": 9.826605263157896e-05, + "loss": 0.5015, + "step": 26601 + }, + { + "epoch": 1.4896404972561317, + "grad_norm": 1.5901553630828857, + "learning_rate": 9.826578947368422e-05, + "loss": 0.4524, + "step": 26602 + }, + { + "epoch": 1.4896964945682607, + "grad_norm": 1.4138697385787964, + "learning_rate": 9.826552631578948e-05, + "loss": 0.508, + "step": 26603 + }, + { + "epoch": 1.4897524918803897, + "grad_norm": 1.2021832466125488, + "learning_rate": 9.826526315789474e-05, + "loss": 0.5529, + "step": 26604 + }, + { + "epoch": 1.4898084891925187, + "grad_norm": 1.3018269538879395, + "learning_rate": 9.826500000000001e-05, + "loss": 0.4325, + "step": 26605 + }, + { + "epoch": 1.4898644865046478, + "grad_norm": 1.1973603963851929, + "learning_rate": 9.826473684210527e-05, + "loss": 0.3586, + "step": 26606 + }, + { + "epoch": 1.4899204838167768, + "grad_norm": 1.347508192062378, + "learning_rate": 9.826447368421053e-05, + "loss": 0.4202, + "step": 26607 + }, + { + "epoch": 1.4899764811289058, + "grad_norm": 1.521946907043457, + "learning_rate": 9.826421052631579e-05, + "loss": 0.4604, + "step": 26608 + }, + { + "epoch": 1.4900324784410348, + "grad_norm": 
1.5154759883880615, + "learning_rate": 9.826394736842106e-05, + "loss": 0.6821, + "step": 26609 + }, + { + "epoch": 1.4900884757531638, + "grad_norm": 1.2492790222167969, + "learning_rate": 9.826368421052632e-05, + "loss": 0.3793, + "step": 26610 + }, + { + "epoch": 1.4901444730652929, + "grad_norm": 1.5363818407058716, + "learning_rate": 9.82634210526316e-05, + "loss": 0.4483, + "step": 26611 + }, + { + "epoch": 1.4902004703774219, + "grad_norm": 1.3126407861709595, + "learning_rate": 9.826315789473684e-05, + "loss": 0.5083, + "step": 26612 + }, + { + "epoch": 1.490256467689551, + "grad_norm": 1.3964883089065552, + "learning_rate": 9.826289473684211e-05, + "loss": 0.4706, + "step": 26613 + }, + { + "epoch": 1.49031246500168, + "grad_norm": 1.463017463684082, + "learning_rate": 9.826263157894737e-05, + "loss": 0.4911, + "step": 26614 + }, + { + "epoch": 1.490368462313809, + "grad_norm": 1.0788241624832153, + "learning_rate": 9.826236842105263e-05, + "loss": 0.3901, + "step": 26615 + }, + { + "epoch": 1.490424459625938, + "grad_norm": 1.3994767665863037, + "learning_rate": 9.82621052631579e-05, + "loss": 0.5225, + "step": 26616 + }, + { + "epoch": 1.490480456938067, + "grad_norm": 1.4026886224746704, + "learning_rate": 9.826184210526315e-05, + "loss": 0.5064, + "step": 26617 + }, + { + "epoch": 1.490536454250196, + "grad_norm": 1.3137027025222778, + "learning_rate": 9.826157894736843e-05, + "loss": 0.5917, + "step": 26618 + }, + { + "epoch": 1.490592451562325, + "grad_norm": 1.2471706867218018, + "learning_rate": 9.826131578947369e-05, + "loss": 0.5021, + "step": 26619 + }, + { + "epoch": 1.490648448874454, + "grad_norm": 1.3848005533218384, + "learning_rate": 9.826105263157896e-05, + "loss": 0.6732, + "step": 26620 + }, + { + "epoch": 1.490704446186583, + "grad_norm": 1.459981083869934, + "learning_rate": 9.82607894736842e-05, + "loss": 0.4569, + "step": 26621 + }, + { + "epoch": 1.490760443498712, + "grad_norm": 1.3855366706848145, + "learning_rate": 
9.826052631578948e-05, + "loss": 0.5632, + "step": 26622 + }, + { + "epoch": 1.490816440810841, + "grad_norm": 1.3110512495040894, + "learning_rate": 9.826026315789474e-05, + "loss": 0.3802, + "step": 26623 + }, + { + "epoch": 1.49087243812297, + "grad_norm": 1.2619868516921997, + "learning_rate": 9.826000000000001e-05, + "loss": 0.4903, + "step": 26624 + }, + { + "epoch": 1.4909284354350991, + "grad_norm": 1.2366193532943726, + "learning_rate": 9.825973684210527e-05, + "loss": 0.5048, + "step": 26625 + }, + { + "epoch": 1.4909844327472281, + "grad_norm": 1.1477752923965454, + "learning_rate": 9.825947368421053e-05, + "loss": 0.5035, + "step": 26626 + }, + { + "epoch": 1.4910404300593572, + "grad_norm": 1.4129172563552856, + "learning_rate": 9.825921052631579e-05, + "loss": 0.5237, + "step": 26627 + }, + { + "epoch": 1.4910964273714862, + "grad_norm": 1.224112868309021, + "learning_rate": 9.825894736842106e-05, + "loss": 0.3905, + "step": 26628 + }, + { + "epoch": 1.4911524246836152, + "grad_norm": 1.2950512170791626, + "learning_rate": 9.825868421052632e-05, + "loss": 0.4303, + "step": 26629 + }, + { + "epoch": 1.4912084219957442, + "grad_norm": 2.1667160987854004, + "learning_rate": 9.825842105263158e-05, + "loss": 0.4029, + "step": 26630 + }, + { + "epoch": 1.4912644193078732, + "grad_norm": 1.4129606485366821, + "learning_rate": 9.825815789473684e-05, + "loss": 0.4251, + "step": 26631 + }, + { + "epoch": 1.4913204166200023, + "grad_norm": 1.4479670524597168, + "learning_rate": 9.825789473684212e-05, + "loss": 0.4992, + "step": 26632 + }, + { + "epoch": 1.4913764139321313, + "grad_norm": 1.3471579551696777, + "learning_rate": 9.825763157894738e-05, + "loss": 0.4921, + "step": 26633 + }, + { + "epoch": 1.4914324112442603, + "grad_norm": 1.6021217107772827, + "learning_rate": 9.825736842105264e-05, + "loss": 0.4532, + "step": 26634 + }, + { + "epoch": 1.4914884085563893, + "grad_norm": 1.3492587804794312, + "learning_rate": 9.82571052631579e-05, + "loss": 0.482, + 
"step": 26635 + }, + { + "epoch": 1.4915444058685183, + "grad_norm": 1.9318296909332275, + "learning_rate": 9.825684210526316e-05, + "loss": 0.4424, + "step": 26636 + }, + { + "epoch": 1.4916004031806473, + "grad_norm": 1.05913245677948, + "learning_rate": 9.825657894736843e-05, + "loss": 0.4489, + "step": 26637 + }, + { + "epoch": 1.4916564004927764, + "grad_norm": 1.2017278671264648, + "learning_rate": 9.825631578947369e-05, + "loss": 0.4875, + "step": 26638 + }, + { + "epoch": 1.4917123978049054, + "grad_norm": 1.4743341207504272, + "learning_rate": 9.825605263157895e-05, + "loss": 0.5475, + "step": 26639 + }, + { + "epoch": 1.4917683951170344, + "grad_norm": 1.2858116626739502, + "learning_rate": 9.825578947368421e-05, + "loss": 0.4494, + "step": 26640 + }, + { + "epoch": 1.4918243924291634, + "grad_norm": 1.4351807832717896, + "learning_rate": 9.825552631578948e-05, + "loss": 0.3805, + "step": 26641 + }, + { + "epoch": 1.4918803897412924, + "grad_norm": 1.4434479475021362, + "learning_rate": 9.825526315789474e-05, + "loss": 0.5806, + "step": 26642 + }, + { + "epoch": 1.4919363870534215, + "grad_norm": 1.21992027759552, + "learning_rate": 9.825500000000001e-05, + "loss": 0.413, + "step": 26643 + }, + { + "epoch": 1.4919923843655505, + "grad_norm": 1.1825666427612305, + "learning_rate": 9.825473684210526e-05, + "loss": 0.4294, + "step": 26644 + }, + { + "epoch": 1.4920483816776795, + "grad_norm": 1.4497736692428589, + "learning_rate": 9.825447368421053e-05, + "loss": 0.5618, + "step": 26645 + }, + { + "epoch": 1.4921043789898085, + "grad_norm": 1.3041061162948608, + "learning_rate": 9.825421052631579e-05, + "loss": 0.5717, + "step": 26646 + }, + { + "epoch": 1.4921603763019375, + "grad_norm": 1.6015552282333374, + "learning_rate": 9.825394736842107e-05, + "loss": 0.4817, + "step": 26647 + }, + { + "epoch": 1.4922163736140666, + "grad_norm": 1.276550054550171, + "learning_rate": 9.825368421052633e-05, + "loss": 0.4927, + "step": 26648 + }, + { + "epoch": 
1.4922723709261956, + "grad_norm": 1.363005518913269, + "learning_rate": 9.825342105263159e-05, + "loss": 0.4021, + "step": 26649 + }, + { + "epoch": 1.4923283682383246, + "grad_norm": 1.5803815126419067, + "learning_rate": 9.825315789473685e-05, + "loss": 0.6846, + "step": 26650 + }, + { + "epoch": 1.4923843655504536, + "grad_norm": 1.558101773262024, + "learning_rate": 9.82528947368421e-05, + "loss": 0.4083, + "step": 26651 + }, + { + "epoch": 1.4924403628625826, + "grad_norm": 1.2533341646194458, + "learning_rate": 9.825263157894738e-05, + "loss": 0.4739, + "step": 26652 + }, + { + "epoch": 1.4924963601747117, + "grad_norm": 1.1889357566833496, + "learning_rate": 9.825236842105264e-05, + "loss": 0.3874, + "step": 26653 + }, + { + "epoch": 1.4925523574868407, + "grad_norm": 1.213054895401001, + "learning_rate": 9.82521052631579e-05, + "loss": 0.424, + "step": 26654 + }, + { + "epoch": 1.4926083547989697, + "grad_norm": 1.3365778923034668, + "learning_rate": 9.825184210526316e-05, + "loss": 0.3959, + "step": 26655 + }, + { + "epoch": 1.4926643521110987, + "grad_norm": 1.2400928735733032, + "learning_rate": 9.825157894736843e-05, + "loss": 0.4036, + "step": 26656 + }, + { + "epoch": 1.4927203494232277, + "grad_norm": 1.3384134769439697, + "learning_rate": 9.825131578947369e-05, + "loss": 0.4183, + "step": 26657 + }, + { + "epoch": 1.4927763467353568, + "grad_norm": 1.3476084470748901, + "learning_rate": 9.825105263157895e-05, + "loss": 0.5117, + "step": 26658 + }, + { + "epoch": 1.4928323440474858, + "grad_norm": 1.2372089624404907, + "learning_rate": 9.825078947368421e-05, + "loss": 0.4327, + "step": 26659 + }, + { + "epoch": 1.4928883413596148, + "grad_norm": 1.314595341682434, + "learning_rate": 9.825052631578948e-05, + "loss": 0.522, + "step": 26660 + }, + { + "epoch": 1.4929443386717438, + "grad_norm": 1.4243106842041016, + "learning_rate": 9.825026315789474e-05, + "loss": 0.5655, + "step": 26661 + }, + { + "epoch": 1.4930003359838728, + "grad_norm": 
1.6291049718856812, + "learning_rate": 9.825e-05, + "loss": 0.5341, + "step": 26662 + }, + { + "epoch": 1.4930563332960018, + "grad_norm": 1.1582446098327637, + "learning_rate": 9.824973684210526e-05, + "loss": 0.3955, + "step": 26663 + }, + { + "epoch": 1.4931123306081309, + "grad_norm": 1.3123834133148193, + "learning_rate": 9.824947368421054e-05, + "loss": 0.4636, + "step": 26664 + }, + { + "epoch": 1.4931683279202599, + "grad_norm": 46.05010223388672, + "learning_rate": 9.82492105263158e-05, + "loss": 0.5557, + "step": 26665 + }, + { + "epoch": 1.493224325232389, + "grad_norm": 1.4509391784667969, + "learning_rate": 9.824894736842107e-05, + "loss": 0.5353, + "step": 26666 + }, + { + "epoch": 1.493280322544518, + "grad_norm": 1.1491566896438599, + "learning_rate": 9.824868421052632e-05, + "loss": 0.386, + "step": 26667 + }, + { + "epoch": 1.493336319856647, + "grad_norm": 1.6083931922912598, + "learning_rate": 9.824842105263157e-05, + "loss": 0.581, + "step": 26668 + }, + { + "epoch": 1.493392317168776, + "grad_norm": 1.2309685945510864, + "learning_rate": 9.824815789473685e-05, + "loss": 0.366, + "step": 26669 + }, + { + "epoch": 1.493448314480905, + "grad_norm": 1.7661563158035278, + "learning_rate": 9.824789473684211e-05, + "loss": 0.6173, + "step": 26670 + }, + { + "epoch": 1.493504311793034, + "grad_norm": 1.210553526878357, + "learning_rate": 9.824763157894738e-05, + "loss": 0.4531, + "step": 26671 + }, + { + "epoch": 1.493560309105163, + "grad_norm": 1.2885973453521729, + "learning_rate": 9.824736842105263e-05, + "loss": 0.4018, + "step": 26672 + }, + { + "epoch": 1.493616306417292, + "grad_norm": 1.4575316905975342, + "learning_rate": 9.82471052631579e-05, + "loss": 0.4969, + "step": 26673 + }, + { + "epoch": 1.493672303729421, + "grad_norm": 1.6433037519454956, + "learning_rate": 9.824684210526316e-05, + "loss": 0.5275, + "step": 26674 + }, + { + "epoch": 1.49372830104155, + "grad_norm": 1.4665077924728394, + "learning_rate": 9.824657894736843e-05, + 
"loss": 0.5303, + "step": 26675 + }, + { + "epoch": 1.493784298353679, + "grad_norm": 1.18905770778656, + "learning_rate": 9.824631578947368e-05, + "loss": 0.4734, + "step": 26676 + }, + { + "epoch": 1.4938402956658081, + "grad_norm": 1.1709564924240112, + "learning_rate": 9.824605263157895e-05, + "loss": 0.389, + "step": 26677 + }, + { + "epoch": 1.4938962929779371, + "grad_norm": 1.2126034498214722, + "learning_rate": 9.824578947368421e-05, + "loss": 0.3979, + "step": 26678 + }, + { + "epoch": 1.4939522902900662, + "grad_norm": 1.1228069067001343, + "learning_rate": 9.824552631578949e-05, + "loss": 0.3886, + "step": 26679 + }, + { + "epoch": 1.4940082876021952, + "grad_norm": 1.4189866781234741, + "learning_rate": 9.824526315789475e-05, + "loss": 0.4907, + "step": 26680 + }, + { + "epoch": 1.4940642849143242, + "grad_norm": 1.4428449869155884, + "learning_rate": 9.8245e-05, + "loss": 0.4597, + "step": 26681 + }, + { + "epoch": 1.4941202822264532, + "grad_norm": 1.293521523475647, + "learning_rate": 9.824473684210527e-05, + "loss": 0.466, + "step": 26682 + }, + { + "epoch": 1.4941762795385822, + "grad_norm": 1.3009419441223145, + "learning_rate": 9.824447368421054e-05, + "loss": 0.4997, + "step": 26683 + }, + { + "epoch": 1.494232276850711, + "grad_norm": 1.138881802558899, + "learning_rate": 9.82442105263158e-05, + "loss": 0.3961, + "step": 26684 + }, + { + "epoch": 1.49428827416284, + "grad_norm": 1.5244901180267334, + "learning_rate": 9.824394736842106e-05, + "loss": 0.4786, + "step": 26685 + }, + { + "epoch": 1.494344271474969, + "grad_norm": 1.3683289289474487, + "learning_rate": 9.824368421052632e-05, + "loss": 0.5121, + "step": 26686 + }, + { + "epoch": 1.494400268787098, + "grad_norm": 1.4406224489212036, + "learning_rate": 9.824342105263158e-05, + "loss": 0.4976, + "step": 26687 + }, + { + "epoch": 1.494456266099227, + "grad_norm": 1.1860580444335938, + "learning_rate": 9.824315789473685e-05, + "loss": 0.3892, + "step": 26688 + }, + { + "epoch": 
1.4945122634113561, + "grad_norm": 1.1105409860610962, + "learning_rate": 9.824289473684211e-05, + "loss": 0.3432, + "step": 26689 + }, + { + "epoch": 1.4945682607234851, + "grad_norm": 1.489783763885498, + "learning_rate": 9.824263157894737e-05, + "loss": 0.6195, + "step": 26690 + }, + { + "epoch": 1.4946242580356142, + "grad_norm": 1.3480902910232544, + "learning_rate": 9.824236842105263e-05, + "loss": 0.4823, + "step": 26691 + }, + { + "epoch": 1.4946802553477432, + "grad_norm": 1.6353440284729004, + "learning_rate": 9.82421052631579e-05, + "loss": 0.6926, + "step": 26692 + }, + { + "epoch": 1.4947362526598722, + "grad_norm": 1.5540894269943237, + "learning_rate": 9.824184210526316e-05, + "loss": 0.4531, + "step": 26693 + }, + { + "epoch": 1.4947922499720012, + "grad_norm": 1.8608779907226562, + "learning_rate": 9.824157894736842e-05, + "loss": 0.5218, + "step": 26694 + }, + { + "epoch": 1.4948482472841302, + "grad_norm": 1.2090544700622559, + "learning_rate": 9.824131578947368e-05, + "loss": 0.4046, + "step": 26695 + }, + { + "epoch": 1.4949042445962593, + "grad_norm": 1.2432013750076294, + "learning_rate": 9.824105263157896e-05, + "loss": 0.4473, + "step": 26696 + }, + { + "epoch": 1.4949602419083883, + "grad_norm": 1.1945785284042358, + "learning_rate": 9.824078947368422e-05, + "loss": 0.3855, + "step": 26697 + }, + { + "epoch": 1.4950162392205173, + "grad_norm": 1.077295184135437, + "learning_rate": 9.824052631578949e-05, + "loss": 0.3811, + "step": 26698 + }, + { + "epoch": 1.4950722365326463, + "grad_norm": 1.4812551736831665, + "learning_rate": 9.824026315789473e-05, + "loss": 0.6106, + "step": 26699 + }, + { + "epoch": 1.4951282338447753, + "grad_norm": 1.4711402654647827, + "learning_rate": 9.824000000000001e-05, + "loss": 0.3819, + "step": 26700 + }, + { + "epoch": 1.4951842311569044, + "grad_norm": 1.2971477508544922, + "learning_rate": 9.823973684210527e-05, + "loss": 0.4607, + "step": 26701 + }, + { + "epoch": 1.4952402284690334, + "grad_norm": 
1.1944701671600342, + "learning_rate": 9.823947368421053e-05, + "loss": 0.4241, + "step": 26702 + }, + { + "epoch": 1.4952962257811624, + "grad_norm": 1.2420014142990112, + "learning_rate": 9.82392105263158e-05, + "loss": 0.3327, + "step": 26703 + }, + { + "epoch": 1.4953522230932914, + "grad_norm": 1.3738360404968262, + "learning_rate": 9.823894736842105e-05, + "loss": 0.4774, + "step": 26704 + }, + { + "epoch": 1.4954082204054204, + "grad_norm": 1.2295901775360107, + "learning_rate": 9.823868421052632e-05, + "loss": 0.4331, + "step": 26705 + }, + { + "epoch": 1.4954642177175494, + "grad_norm": 1.4630894660949707, + "learning_rate": 9.823842105263158e-05, + "loss": 0.6747, + "step": 26706 + }, + { + "epoch": 1.4955202150296785, + "grad_norm": 1.5961787700653076, + "learning_rate": 9.823815789473685e-05, + "loss": 0.5482, + "step": 26707 + }, + { + "epoch": 1.4955762123418075, + "grad_norm": 1.469846248626709, + "learning_rate": 9.823789473684211e-05, + "loss": 0.62, + "step": 26708 + }, + { + "epoch": 1.4956322096539365, + "grad_norm": 1.65489661693573, + "learning_rate": 9.823763157894737e-05, + "loss": 0.5532, + "step": 26709 + }, + { + "epoch": 1.4956882069660655, + "grad_norm": 1.1136822700500488, + "learning_rate": 9.823736842105263e-05, + "loss": 0.4125, + "step": 26710 + }, + { + "epoch": 1.4957442042781945, + "grad_norm": 1.4769656658172607, + "learning_rate": 9.82371052631579e-05, + "loss": 0.4585, + "step": 26711 + }, + { + "epoch": 1.4958002015903236, + "grad_norm": 1.5494664907455444, + "learning_rate": 9.823684210526317e-05, + "loss": 0.514, + "step": 26712 + }, + { + "epoch": 1.4958561989024526, + "grad_norm": 1.2355320453643799, + "learning_rate": 9.823657894736843e-05, + "loss": 0.4382, + "step": 26713 + }, + { + "epoch": 1.4959121962145816, + "grad_norm": 1.2471064329147339, + "learning_rate": 9.823631578947368e-05, + "loss": 0.4819, + "step": 26714 + }, + { + "epoch": 1.4959681935267106, + "grad_norm": 1.4682189226150513, + "learning_rate": 
9.823605263157896e-05, + "loss": 0.5551, + "step": 26715 + }, + { + "epoch": 1.4960241908388396, + "grad_norm": 1.6817207336425781, + "learning_rate": 9.823578947368422e-05, + "loss": 0.6358, + "step": 26716 + }, + { + "epoch": 1.4960801881509687, + "grad_norm": 1.5784755945205688, + "learning_rate": 9.823552631578948e-05, + "loss": 0.4629, + "step": 26717 + }, + { + "epoch": 1.4961361854630977, + "grad_norm": 1.316011667251587, + "learning_rate": 9.823526315789474e-05, + "loss": 0.481, + "step": 26718 + }, + { + "epoch": 1.4961921827752267, + "grad_norm": 1.3917642831802368, + "learning_rate": 9.8235e-05, + "loss": 0.4046, + "step": 26719 + }, + { + "epoch": 1.4962481800873557, + "grad_norm": 1.3899905681610107, + "learning_rate": 9.823473684210527e-05, + "loss": 0.4627, + "step": 26720 + }, + { + "epoch": 1.4963041773994847, + "grad_norm": 1.5084383487701416, + "learning_rate": 9.823447368421053e-05, + "loss": 0.3227, + "step": 26721 + }, + { + "epoch": 1.4963601747116138, + "grad_norm": 1.2321195602416992, + "learning_rate": 9.823421052631579e-05, + "loss": 0.4581, + "step": 26722 + }, + { + "epoch": 1.4964161720237428, + "grad_norm": 1.4483340978622437, + "learning_rate": 9.823394736842105e-05, + "loss": 0.6091, + "step": 26723 + }, + { + "epoch": 1.4964721693358718, + "grad_norm": 1.0192410945892334, + "learning_rate": 9.823368421052632e-05, + "loss": 0.3762, + "step": 26724 + }, + { + "epoch": 1.4965281666480008, + "grad_norm": 1.1380773782730103, + "learning_rate": 9.823342105263158e-05, + "loss": 0.3941, + "step": 26725 + }, + { + "epoch": 1.4965841639601298, + "grad_norm": 1.5388333797454834, + "learning_rate": 9.823315789473686e-05, + "loss": 0.4075, + "step": 26726 + }, + { + "epoch": 1.4966401612722589, + "grad_norm": 1.4268181324005127, + "learning_rate": 9.82328947368421e-05, + "loss": 0.4967, + "step": 26727 + }, + { + "epoch": 1.4966961585843879, + "grad_norm": 1.0744291543960571, + "learning_rate": 9.823263157894738e-05, + "loss": 0.4012, + "step": 
26728 + }, + { + "epoch": 1.496752155896517, + "grad_norm": 1.3298625946044922, + "learning_rate": 9.823236842105264e-05, + "loss": 0.4907, + "step": 26729 + }, + { + "epoch": 1.496808153208646, + "grad_norm": 1.501968264579773, + "learning_rate": 9.823210526315791e-05, + "loss": 0.6027, + "step": 26730 + }, + { + "epoch": 1.496864150520775, + "grad_norm": 1.2917025089263916, + "learning_rate": 9.823184210526315e-05, + "loss": 0.4129, + "step": 26731 + }, + { + "epoch": 1.496920147832904, + "grad_norm": 1.2158446311950684, + "learning_rate": 9.823157894736843e-05, + "loss": 0.3479, + "step": 26732 + }, + { + "epoch": 1.496976145145033, + "grad_norm": 1.1381868124008179, + "learning_rate": 9.823131578947369e-05, + "loss": 0.4021, + "step": 26733 + }, + { + "epoch": 1.497032142457162, + "grad_norm": 1.2020201683044434, + "learning_rate": 9.823105263157896e-05, + "loss": 0.3444, + "step": 26734 + }, + { + "epoch": 1.497088139769291, + "grad_norm": 1.4698524475097656, + "learning_rate": 9.823078947368422e-05, + "loss": 0.4059, + "step": 26735 + }, + { + "epoch": 1.49714413708142, + "grad_norm": 1.5952394008636475, + "learning_rate": 9.823052631578947e-05, + "loss": 0.4962, + "step": 26736 + }, + { + "epoch": 1.497200134393549, + "grad_norm": 1.15773606300354, + "learning_rate": 9.823026315789474e-05, + "loss": 0.4159, + "step": 26737 + }, + { + "epoch": 1.497256131705678, + "grad_norm": 1.5245182514190674, + "learning_rate": 9.823e-05, + "loss": 0.577, + "step": 26738 + }, + { + "epoch": 1.497312129017807, + "grad_norm": 1.2623963356018066, + "learning_rate": 9.822973684210527e-05, + "loss": 0.3999, + "step": 26739 + }, + { + "epoch": 1.497368126329936, + "grad_norm": 1.3014445304870605, + "learning_rate": 9.822947368421053e-05, + "loss": 0.395, + "step": 26740 + }, + { + "epoch": 1.4974241236420651, + "grad_norm": 1.1897354125976562, + "learning_rate": 9.822921052631579e-05, + "loss": 0.4322, + "step": 26741 + }, + { + "epoch": 1.4974801209541941, + "grad_norm": 
1.2164031267166138, + "learning_rate": 9.822894736842105e-05, + "loss": 0.4825, + "step": 26742 + }, + { + "epoch": 1.4975361182663232, + "grad_norm": 1.5846658945083618, + "learning_rate": 9.822868421052633e-05, + "loss": 0.6695, + "step": 26743 + }, + { + "epoch": 1.4975921155784522, + "grad_norm": 1.182814598083496, + "learning_rate": 9.822842105263159e-05, + "loss": 0.421, + "step": 26744 + }, + { + "epoch": 1.4976481128905812, + "grad_norm": 1.4049961566925049, + "learning_rate": 9.822815789473684e-05, + "loss": 0.3883, + "step": 26745 + }, + { + "epoch": 1.4977041102027102, + "grad_norm": 1.538678765296936, + "learning_rate": 9.82278947368421e-05, + "loss": 0.5299, + "step": 26746 + }, + { + "epoch": 1.4977601075148392, + "grad_norm": 1.246418833732605, + "learning_rate": 9.822763157894738e-05, + "loss": 0.3696, + "step": 26747 + }, + { + "epoch": 1.4978161048269683, + "grad_norm": 1.547446846961975, + "learning_rate": 9.822736842105264e-05, + "loss": 0.5213, + "step": 26748 + }, + { + "epoch": 1.4978721021390973, + "grad_norm": 1.4471827745437622, + "learning_rate": 9.82271052631579e-05, + "loss": 0.3878, + "step": 26749 + }, + { + "epoch": 1.4979280994512263, + "grad_norm": 1.4013447761535645, + "learning_rate": 9.822684210526316e-05, + "loss": 0.5327, + "step": 26750 + }, + { + "epoch": 1.4979840967633553, + "grad_norm": 1.5289849042892456, + "learning_rate": 9.822657894736843e-05, + "loss": 0.488, + "step": 26751 + }, + { + "epoch": 1.4980400940754843, + "grad_norm": 1.0098973512649536, + "learning_rate": 9.822631578947369e-05, + "loss": 0.3345, + "step": 26752 + }, + { + "epoch": 1.4980960913876133, + "grad_norm": 1.0826987028121948, + "learning_rate": 9.822605263157896e-05, + "loss": 0.5749, + "step": 26753 + }, + { + "epoch": 1.4981520886997424, + "grad_norm": 1.5209635496139526, + "learning_rate": 9.822578947368421e-05, + "loss": 0.4544, + "step": 26754 + }, + { + "epoch": 1.4982080860118714, + "grad_norm": 2.081101417541504, + "learning_rate": 
9.822552631578947e-05, + "loss": 0.8189, + "step": 26755 + }, + { + "epoch": 1.4982640833240004, + "grad_norm": 1.442839503288269, + "learning_rate": 9.822526315789474e-05, + "loss": 0.4608, + "step": 26756 + }, + { + "epoch": 1.4983200806361294, + "grad_norm": 1.606909990310669, + "learning_rate": 9.8225e-05, + "loss": 0.4206, + "step": 26757 + }, + { + "epoch": 1.4983760779482584, + "grad_norm": 1.4205514192581177, + "learning_rate": 9.822473684210528e-05, + "loss": 0.4728, + "step": 26758 + }, + { + "epoch": 1.4984320752603875, + "grad_norm": 1.2935292720794678, + "learning_rate": 9.822447368421052e-05, + "loss": 0.4086, + "step": 26759 + }, + { + "epoch": 1.4984880725725165, + "grad_norm": 1.299617052078247, + "learning_rate": 9.82242105263158e-05, + "loss": 0.483, + "step": 26760 + }, + { + "epoch": 1.4985440698846455, + "grad_norm": 1.3414956331253052, + "learning_rate": 9.822394736842105e-05, + "loss": 0.3733, + "step": 26761 + }, + { + "epoch": 1.4986000671967745, + "grad_norm": 1.2331242561340332, + "learning_rate": 9.822368421052633e-05, + "loss": 0.4835, + "step": 26762 + }, + { + "epoch": 1.4986560645089035, + "grad_norm": 1.2244077920913696, + "learning_rate": 9.822342105263159e-05, + "loss": 0.3698, + "step": 26763 + }, + { + "epoch": 1.4987120618210326, + "grad_norm": 1.3659173250198364, + "learning_rate": 9.822315789473685e-05, + "loss": 0.3211, + "step": 26764 + }, + { + "epoch": 1.4987680591331616, + "grad_norm": 1.341306447982788, + "learning_rate": 9.822289473684211e-05, + "loss": 0.4194, + "step": 26765 + }, + { + "epoch": 1.4988240564452906, + "grad_norm": 1.5292794704437256, + "learning_rate": 9.822263157894738e-05, + "loss": 0.464, + "step": 26766 + }, + { + "epoch": 1.4988800537574196, + "grad_norm": 1.1992926597595215, + "learning_rate": 9.822236842105264e-05, + "loss": 0.4708, + "step": 26767 + }, + { + "epoch": 1.4989360510695486, + "grad_norm": 1.3847891092300415, + "learning_rate": 9.82221052631579e-05, + "loss": 0.4973, + "step": 
26768 + }, + { + "epoch": 1.4989920483816777, + "grad_norm": 1.101475715637207, + "learning_rate": 9.822184210526316e-05, + "loss": 0.4568, + "step": 26769 + }, + { + "epoch": 1.4990480456938067, + "grad_norm": 1.3191195726394653, + "learning_rate": 9.822157894736843e-05, + "loss": 0.3405, + "step": 26770 + }, + { + "epoch": 1.4991040430059357, + "grad_norm": 1.291374921798706, + "learning_rate": 9.822131578947369e-05, + "loss": 0.4529, + "step": 26771 + }, + { + "epoch": 1.4991600403180647, + "grad_norm": 1.3544799089431763, + "learning_rate": 9.822105263157895e-05, + "loss": 0.4364, + "step": 26772 + }, + { + "epoch": 1.4992160376301937, + "grad_norm": 1.2974934577941895, + "learning_rate": 9.822078947368421e-05, + "loss": 0.407, + "step": 26773 + }, + { + "epoch": 1.4992720349423228, + "grad_norm": 1.4829468727111816, + "learning_rate": 9.822052631578947e-05, + "loss": 0.4433, + "step": 26774 + }, + { + "epoch": 1.4993280322544518, + "grad_norm": 2.106544017791748, + "learning_rate": 9.822026315789475e-05, + "loss": 0.4644, + "step": 26775 + }, + { + "epoch": 1.4993840295665808, + "grad_norm": 1.395676612854004, + "learning_rate": 9.822e-05, + "loss": 0.4148, + "step": 26776 + }, + { + "epoch": 1.4994400268787098, + "grad_norm": 1.3502802848815918, + "learning_rate": 9.821973684210526e-05, + "loss": 0.6398, + "step": 26777 + }, + { + "epoch": 1.4994960241908388, + "grad_norm": 1.4034233093261719, + "learning_rate": 9.821947368421052e-05, + "loss": 0.6107, + "step": 26778 + }, + { + "epoch": 1.4995520215029678, + "grad_norm": 1.351454257965088, + "learning_rate": 9.82192105263158e-05, + "loss": 0.4908, + "step": 26779 + }, + { + "epoch": 1.4996080188150969, + "grad_norm": 1.7202394008636475, + "learning_rate": 9.821894736842106e-05, + "loss": 0.5555, + "step": 26780 + }, + { + "epoch": 1.4996640161272259, + "grad_norm": 1.1028008460998535, + "learning_rate": 9.821868421052632e-05, + "loss": 0.3848, + "step": 26781 + }, + { + "epoch": 1.499720013439355, + 
"grad_norm": 1.5374046564102173, + "learning_rate": 9.821842105263158e-05, + "loss": 0.4258, + "step": 26782 + }, + { + "epoch": 1.499776010751484, + "grad_norm": 1.2351723909378052, + "learning_rate": 9.821815789473685e-05, + "loss": 0.4447, + "step": 26783 + }, + { + "epoch": 1.499832008063613, + "grad_norm": 1.3446274995803833, + "learning_rate": 9.821789473684211e-05, + "loss": 0.4953, + "step": 26784 + }, + { + "epoch": 1.499888005375742, + "grad_norm": 1.412572979927063, + "learning_rate": 9.821763157894738e-05, + "loss": 0.4591, + "step": 26785 + }, + { + "epoch": 1.499944002687871, + "grad_norm": 1.1691511869430542, + "learning_rate": 9.821736842105263e-05, + "loss": 0.4391, + "step": 26786 + }, + { + "epoch": 1.5, + "grad_norm": 1.2498557567596436, + "learning_rate": 9.82171052631579e-05, + "loss": 0.4797, + "step": 26787 + }, + { + "epoch": 1.500055997312129, + "grad_norm": 1.5805020332336426, + "learning_rate": 9.821684210526316e-05, + "loss": 0.474, + "step": 26788 + }, + { + "epoch": 1.500111994624258, + "grad_norm": 1.2805150747299194, + "learning_rate": 9.821657894736842e-05, + "loss": 0.3752, + "step": 26789 + }, + { + "epoch": 1.500167991936387, + "grad_norm": 1.155444622039795, + "learning_rate": 9.82163157894737e-05, + "loss": 0.382, + "step": 26790 + }, + { + "epoch": 1.500223989248516, + "grad_norm": 1.1691333055496216, + "learning_rate": 9.821605263157894e-05, + "loss": 0.3829, + "step": 26791 + }, + { + "epoch": 1.500279986560645, + "grad_norm": 1.4243505001068115, + "learning_rate": 9.821578947368421e-05, + "loss": 0.5008, + "step": 26792 + }, + { + "epoch": 1.5003359838727741, + "grad_norm": 1.4051177501678467, + "learning_rate": 9.821552631578947e-05, + "loss": 0.622, + "step": 26793 + }, + { + "epoch": 1.5003919811849031, + "grad_norm": 1.3862048387527466, + "learning_rate": 9.821526315789475e-05, + "loss": 0.4147, + "step": 26794 + }, + { + "epoch": 1.5004479784970322, + "grad_norm": 1.3812639713287354, + "learning_rate": 
9.821500000000001e-05, + "loss": 0.5005, + "step": 26795 + }, + { + "epoch": 1.5005039758091612, + "grad_norm": 1.407889485359192, + "learning_rate": 9.821473684210527e-05, + "loss": 0.5874, + "step": 26796 + }, + { + "epoch": 1.5005599731212902, + "grad_norm": 1.387760043144226, + "learning_rate": 9.821447368421053e-05, + "loss": 0.4919, + "step": 26797 + }, + { + "epoch": 1.5006159704334192, + "grad_norm": 1.3240712881088257, + "learning_rate": 9.82142105263158e-05, + "loss": 0.4559, + "step": 26798 + }, + { + "epoch": 1.5006719677455482, + "grad_norm": 2.1726765632629395, + "learning_rate": 9.821394736842106e-05, + "loss": 0.5946, + "step": 26799 + }, + { + "epoch": 1.5007279650576772, + "grad_norm": 1.0934633016586304, + "learning_rate": 9.821368421052632e-05, + "loss": 0.3353, + "step": 26800 + }, + { + "epoch": 1.5007839623698063, + "grad_norm": 1.3092851638793945, + "learning_rate": 9.821342105263158e-05, + "loss": 0.5126, + "step": 26801 + }, + { + "epoch": 1.5008399596819353, + "grad_norm": 1.3932744264602661, + "learning_rate": 9.821315789473685e-05, + "loss": 0.4586, + "step": 26802 + }, + { + "epoch": 1.5008959569940643, + "grad_norm": 1.3432908058166504, + "learning_rate": 9.821289473684211e-05, + "loss": 0.4227, + "step": 26803 + }, + { + "epoch": 1.5009519543061933, + "grad_norm": 1.2111737728118896, + "learning_rate": 9.821263157894737e-05, + "loss": 0.4225, + "step": 26804 + }, + { + "epoch": 1.5010079516183223, + "grad_norm": 1.2058987617492676, + "learning_rate": 9.821236842105263e-05, + "loss": 0.3126, + "step": 26805 + }, + { + "epoch": 1.5010639489304514, + "grad_norm": 1.4938188791275024, + "learning_rate": 9.821210526315789e-05, + "loss": 0.5548, + "step": 26806 + }, + { + "epoch": 1.5011199462425804, + "grad_norm": 1.2865266799926758, + "learning_rate": 9.821184210526316e-05, + "loss": 0.4605, + "step": 26807 + }, + { + "epoch": 1.5011759435547094, + "grad_norm": 1.329338550567627, + "learning_rate": 9.821157894736842e-05, + "loss": 0.4606, 
+ "step": 26808 + }, + { + "epoch": 1.5012319408668384, + "grad_norm": 1.2825850248336792, + "learning_rate": 9.821131578947368e-05, + "loss": 0.4284, + "step": 26809 + }, + { + "epoch": 1.5012879381789674, + "grad_norm": 1.5459418296813965, + "learning_rate": 9.821105263157894e-05, + "loss": 0.6218, + "step": 26810 + }, + { + "epoch": 1.5013439354910965, + "grad_norm": 1.3349864482879639, + "learning_rate": 9.821078947368422e-05, + "loss": 0.3773, + "step": 26811 + }, + { + "epoch": 1.5013999328032255, + "grad_norm": 1.8708873987197876, + "learning_rate": 9.821052631578948e-05, + "loss": 0.4553, + "step": 26812 + }, + { + "epoch": 1.5014559301153545, + "grad_norm": 1.0955673456192017, + "learning_rate": 9.821026315789475e-05, + "loss": 0.3634, + "step": 26813 + }, + { + "epoch": 1.5015119274274835, + "grad_norm": 1.1382811069488525, + "learning_rate": 9.821e-05, + "loss": 0.4818, + "step": 26814 + }, + { + "epoch": 1.5015679247396125, + "grad_norm": 1.322993516921997, + "learning_rate": 9.820973684210527e-05, + "loss": 0.4008, + "step": 26815 + }, + { + "epoch": 1.5016239220517416, + "grad_norm": 1.3150700330734253, + "learning_rate": 9.820947368421053e-05, + "loss": 0.535, + "step": 26816 + }, + { + "epoch": 1.5016799193638706, + "grad_norm": 1.4436053037643433, + "learning_rate": 9.82092105263158e-05, + "loss": 0.4855, + "step": 26817 + }, + { + "epoch": 1.5017359166759996, + "grad_norm": 1.3064913749694824, + "learning_rate": 9.820894736842106e-05, + "loss": 0.4284, + "step": 26818 + }, + { + "epoch": 1.5017919139881286, + "grad_norm": 1.1609876155853271, + "learning_rate": 9.820868421052632e-05, + "loss": 0.3699, + "step": 26819 + }, + { + "epoch": 1.5018479113002576, + "grad_norm": 1.4147918224334717, + "learning_rate": 9.820842105263158e-05, + "loss": 0.5225, + "step": 26820 + }, + { + "epoch": 1.5019039086123867, + "grad_norm": 1.2616310119628906, + "learning_rate": 9.820815789473686e-05, + "loss": 0.3796, + "step": 26821 + }, + { + "epoch": 
1.5019599059245157, + "grad_norm": 2.5908286571502686, + "learning_rate": 9.820789473684211e-05, + "loss": 0.5921, + "step": 26822 + }, + { + "epoch": 1.5020159032366447, + "grad_norm": 1.4717801809310913, + "learning_rate": 9.820763157894736e-05, + "loss": 0.4755, + "step": 26823 + }, + { + "epoch": 1.5020719005487737, + "grad_norm": 1.6030758619308472, + "learning_rate": 9.820736842105263e-05, + "loss": 0.5514, + "step": 26824 + }, + { + "epoch": 1.5021278978609027, + "grad_norm": 1.283334493637085, + "learning_rate": 9.82071052631579e-05, + "loss": 0.5847, + "step": 26825 + }, + { + "epoch": 1.5021838951730317, + "grad_norm": 1.3323065042495728, + "learning_rate": 9.820684210526317e-05, + "loss": 0.4857, + "step": 26826 + }, + { + "epoch": 1.5022398924851608, + "grad_norm": 1.284218192100525, + "learning_rate": 9.820657894736843e-05, + "loss": 0.4279, + "step": 26827 + }, + { + "epoch": 1.5022958897972898, + "grad_norm": 1.5113115310668945, + "learning_rate": 9.820631578947369e-05, + "loss": 0.4652, + "step": 26828 + }, + { + "epoch": 1.5023518871094188, + "grad_norm": 1.7711883783340454, + "learning_rate": 9.820605263157895e-05, + "loss": 0.5026, + "step": 26829 + }, + { + "epoch": 1.5024078844215478, + "grad_norm": 1.3830803632736206, + "learning_rate": 9.820578947368422e-05, + "loss": 0.4605, + "step": 26830 + }, + { + "epoch": 1.5024638817336768, + "grad_norm": 1.2874218225479126, + "learning_rate": 9.820552631578948e-05, + "loss": 0.4895, + "step": 26831 + }, + { + "epoch": 1.5025198790458059, + "grad_norm": 1.2400010824203491, + "learning_rate": 9.820526315789474e-05, + "loss": 0.3865, + "step": 26832 + }, + { + "epoch": 1.5025758763579349, + "grad_norm": 1.394715666770935, + "learning_rate": 9.8205e-05, + "loss": 0.5973, + "step": 26833 + }, + { + "epoch": 1.502631873670064, + "grad_norm": 1.2617863416671753, + "learning_rate": 9.820473684210527e-05, + "loss": 0.4719, + "step": 26834 + }, + { + "epoch": 1.502687870982193, + "grad_norm": NaN, + 
"learning_rate": 9.820473684210527e-05, + "loss": 0.4233, + "step": 26835 + }, + { + "epoch": 1.502743868294322, + "grad_norm": 1.1229867935180664, + "learning_rate": 9.820447368421053e-05, + "loss": 0.4105, + "step": 26836 + }, + { + "epoch": 1.502799865606451, + "grad_norm": 1.460927963256836, + "learning_rate": 9.820421052631579e-05, + "loss": 0.5992, + "step": 26837 + }, + { + "epoch": 1.50285586291858, + "grad_norm": 1.4900665283203125, + "learning_rate": 9.820394736842105e-05, + "loss": 0.5054, + "step": 26838 + }, + { + "epoch": 1.502911860230709, + "grad_norm": 1.3573169708251953, + "learning_rate": 9.820368421052632e-05, + "loss": 0.5846, + "step": 26839 + }, + { + "epoch": 1.502967857542838, + "grad_norm": 1.4125213623046875, + "learning_rate": 9.820342105263158e-05, + "loss": 0.4359, + "step": 26840 + }, + { + "epoch": 1.503023854854967, + "grad_norm": 1.2597665786743164, + "learning_rate": 9.820315789473684e-05, + "loss": 0.4514, + "step": 26841 + }, + { + "epoch": 1.503079852167096, + "grad_norm": 1.8424570560455322, + "learning_rate": 9.82028947368421e-05, + "loss": 0.6751, + "step": 26842 + }, + { + "epoch": 1.503135849479225, + "grad_norm": 1.2865324020385742, + "learning_rate": 9.820263157894736e-05, + "loss": 0.4469, + "step": 26843 + }, + { + "epoch": 1.503191846791354, + "grad_norm": 1.4428001642227173, + "learning_rate": 9.820236842105264e-05, + "loss": 0.5853, + "step": 26844 + }, + { + "epoch": 1.503247844103483, + "grad_norm": 1.3592240810394287, + "learning_rate": 9.82021052631579e-05, + "loss": 0.4209, + "step": 26845 + }, + { + "epoch": 1.5033038414156121, + "grad_norm": 1.2329879999160767, + "learning_rate": 9.820184210526317e-05, + "loss": 0.564, + "step": 26846 + }, + { + "epoch": 1.5033598387277411, + "grad_norm": 1.3990342617034912, + "learning_rate": 9.820157894736842e-05, + "loss": 0.5239, + "step": 26847 + }, + { + "epoch": 1.5034158360398702, + "grad_norm": 1.3297662734985352, + "learning_rate": 9.820131578947369e-05, + "loss": 
0.4364, + "step": 26848 + }, + { + "epoch": 1.5034718333519992, + "grad_norm": 1.827245831489563, + "learning_rate": 9.820105263157895e-05, + "loss": 0.5855, + "step": 26849 + }, + { + "epoch": 1.5035278306641282, + "grad_norm": 1.5831985473632812, + "learning_rate": 9.820078947368422e-05, + "loss": 0.5982, + "step": 26850 + }, + { + "epoch": 1.5035838279762572, + "grad_norm": 1.1328448057174683, + "learning_rate": 9.820052631578948e-05, + "loss": 0.4677, + "step": 26851 + }, + { + "epoch": 1.5036398252883862, + "grad_norm": 1.227616310119629, + "learning_rate": 9.820026315789474e-05, + "loss": 0.5185, + "step": 26852 + }, + { + "epoch": 1.5036958226005153, + "grad_norm": 1.471461296081543, + "learning_rate": 9.82e-05, + "loss": 0.4526, + "step": 26853 + }, + { + "epoch": 1.5037518199126443, + "grad_norm": 1.071116328239441, + "learning_rate": 9.819973684210527e-05, + "loss": 0.3713, + "step": 26854 + }, + { + "epoch": 1.5038078172247733, + "grad_norm": 1.463884949684143, + "learning_rate": 9.819947368421053e-05, + "loss": 0.5586, + "step": 26855 + }, + { + "epoch": 1.5038638145369023, + "grad_norm": 1.183410406112671, + "learning_rate": 9.81992105263158e-05, + "loss": 0.4291, + "step": 26856 + }, + { + "epoch": 1.5039198118490313, + "grad_norm": 1.2246249914169312, + "learning_rate": 9.819894736842105e-05, + "loss": 0.4141, + "step": 26857 + }, + { + "epoch": 1.5039758091611604, + "grad_norm": 1.3548663854599, + "learning_rate": 9.819868421052631e-05, + "loss": 0.6363, + "step": 26858 + }, + { + "epoch": 1.5040318064732894, + "grad_norm": 1.112684726715088, + "learning_rate": 9.819842105263159e-05, + "loss": 0.4017, + "step": 26859 + }, + { + "epoch": 1.5040878037854184, + "grad_norm": 1.1287680864334106, + "learning_rate": 9.819815789473685e-05, + "loss": 0.3189, + "step": 26860 + }, + { + "epoch": 1.5041438010975474, + "grad_norm": 1.3464311361312866, + "learning_rate": 9.81978947368421e-05, + "loss": 0.4012, + "step": 26861 + }, + { + "epoch": 
1.5041997984096764, + "grad_norm": 1.1868360042572021, + "learning_rate": 9.819763157894737e-05, + "loss": 0.3788, + "step": 26862 + }, + { + "epoch": 1.5042557957218055, + "grad_norm": 1.2991905212402344, + "learning_rate": 9.819736842105264e-05, + "loss": 0.4808, + "step": 26863 + }, + { + "epoch": 1.5043117930339345, + "grad_norm": 1.4344667196273804, + "learning_rate": 9.81971052631579e-05, + "loss": 0.5092, + "step": 26864 + }, + { + "epoch": 1.5043677903460635, + "grad_norm": 1.2147564888000488, + "learning_rate": 9.819684210526316e-05, + "loss": 0.4904, + "step": 26865 + }, + { + "epoch": 1.5044237876581925, + "grad_norm": 1.4568026065826416, + "learning_rate": 9.819657894736842e-05, + "loss": 0.5145, + "step": 26866 + }, + { + "epoch": 1.5044797849703215, + "grad_norm": 1.2571946382522583, + "learning_rate": 9.819631578947369e-05, + "loss": 0.4085, + "step": 26867 + }, + { + "epoch": 1.5045357822824506, + "grad_norm": 1.4784843921661377, + "learning_rate": 9.819605263157895e-05, + "loss": 0.5063, + "step": 26868 + }, + { + "epoch": 1.5045917795945796, + "grad_norm": 1.479533314704895, + "learning_rate": 9.819578947368423e-05, + "loss": 0.4835, + "step": 26869 + }, + { + "epoch": 1.5046477769067086, + "grad_norm": 1.1925956010818481, + "learning_rate": 9.819552631578947e-05, + "loss": 0.4318, + "step": 26870 + }, + { + "epoch": 1.5047037742188376, + "grad_norm": 1.4827125072479248, + "learning_rate": 9.819526315789474e-05, + "loss": 0.64, + "step": 26871 + }, + { + "epoch": 1.5047597715309666, + "grad_norm": 1.2384952306747437, + "learning_rate": 9.8195e-05, + "loss": 0.4034, + "step": 26872 + }, + { + "epoch": 1.5048157688430956, + "grad_norm": 1.070129156112671, + "learning_rate": 9.819473684210528e-05, + "loss": 0.3833, + "step": 26873 + }, + { + "epoch": 1.5048717661552247, + "grad_norm": 1.3075287342071533, + "learning_rate": 9.819447368421054e-05, + "loss": 0.4432, + "step": 26874 + }, + { + "epoch": 1.5049277634673537, + "grad_norm": 
1.3838526010513306, + "learning_rate": 9.81942105263158e-05, + "loss": 0.4524, + "step": 26875 + }, + { + "epoch": 1.5049837607794827, + "grad_norm": 1.6466572284698486, + "learning_rate": 9.819394736842106e-05, + "loss": 0.5443, + "step": 26876 + }, + { + "epoch": 1.5050397580916117, + "grad_norm": 1.976815104484558, + "learning_rate": 9.819368421052632e-05, + "loss": 0.5765, + "step": 26877 + }, + { + "epoch": 1.5050957554037407, + "grad_norm": 1.3899425268173218, + "learning_rate": 9.819342105263159e-05, + "loss": 0.4691, + "step": 26878 + }, + { + "epoch": 1.5051517527158698, + "grad_norm": 1.2887729406356812, + "learning_rate": 9.819315789473684e-05, + "loss": 0.5758, + "step": 26879 + }, + { + "epoch": 1.5052077500279988, + "grad_norm": 1.201016902923584, + "learning_rate": 9.819289473684211e-05, + "loss": 0.3569, + "step": 26880 + }, + { + "epoch": 1.5052637473401278, + "grad_norm": 1.3866976499557495, + "learning_rate": 9.819263157894737e-05, + "loss": 0.4409, + "step": 26881 + }, + { + "epoch": 1.5053197446522568, + "grad_norm": 1.2868051528930664, + "learning_rate": 9.819236842105264e-05, + "loss": 0.4047, + "step": 26882 + }, + { + "epoch": 1.5053757419643858, + "grad_norm": 2.1382479667663574, + "learning_rate": 9.81921052631579e-05, + "loss": 0.4798, + "step": 26883 + }, + { + "epoch": 1.5054317392765149, + "grad_norm": 1.374638319015503, + "learning_rate": 9.819184210526316e-05, + "loss": 0.439, + "step": 26884 + }, + { + "epoch": 1.5054877365886439, + "grad_norm": 1.6410070657730103, + "learning_rate": 9.819157894736842e-05, + "loss": 0.5119, + "step": 26885 + }, + { + "epoch": 1.505543733900773, + "grad_norm": 1.246012568473816, + "learning_rate": 9.81913157894737e-05, + "loss": 0.4355, + "step": 26886 + }, + { + "epoch": 1.505599731212902, + "grad_norm": 1.2205312252044678, + "learning_rate": 9.819105263157895e-05, + "loss": 0.4992, + "step": 26887 + }, + { + "epoch": 1.505655728525031, + "grad_norm": 1.5243549346923828, + "learning_rate": 
9.819078947368421e-05, + "loss": 0.5899, + "step": 26888 + }, + { + "epoch": 1.50571172583716, + "grad_norm": 1.6237629652023315, + "learning_rate": 9.819052631578947e-05, + "loss": 0.4594, + "step": 26889 + }, + { + "epoch": 1.505767723149289, + "grad_norm": 1.3966513872146606, + "learning_rate": 9.819026315789475e-05, + "loss": 0.4631, + "step": 26890 + }, + { + "epoch": 1.505823720461418, + "grad_norm": 1.6498286724090576, + "learning_rate": 9.819000000000001e-05, + "loss": 0.4875, + "step": 26891 + }, + { + "epoch": 1.505879717773547, + "grad_norm": 1.6096594333648682, + "learning_rate": 9.818973684210527e-05, + "loss": 0.3638, + "step": 26892 + }, + { + "epoch": 1.505935715085676, + "grad_norm": 1.3800125122070312, + "learning_rate": 9.818947368421053e-05, + "loss": 0.4304, + "step": 26893 + }, + { + "epoch": 1.505991712397805, + "grad_norm": 1.5419855117797852, + "learning_rate": 9.818921052631579e-05, + "loss": 0.4609, + "step": 26894 + }, + { + "epoch": 1.506047709709934, + "grad_norm": 1.3288782835006714, + "learning_rate": 9.818894736842106e-05, + "loss": 0.4868, + "step": 26895 + }, + { + "epoch": 1.506103707022063, + "grad_norm": 1.3769283294677734, + "learning_rate": 9.818868421052632e-05, + "loss": 0.4556, + "step": 26896 + }, + { + "epoch": 1.506159704334192, + "grad_norm": 1.2915641069412231, + "learning_rate": 9.818842105263158e-05, + "loss": 0.513, + "step": 26897 + }, + { + "epoch": 1.5062157016463211, + "grad_norm": 1.331145167350769, + "learning_rate": 9.818815789473684e-05, + "loss": 0.5951, + "step": 26898 + }, + { + "epoch": 1.5062716989584501, + "grad_norm": 1.5310689210891724, + "learning_rate": 9.818789473684211e-05, + "loss": 0.5239, + "step": 26899 + }, + { + "epoch": 1.5063276962705792, + "grad_norm": 1.2608551979064941, + "learning_rate": 9.818763157894737e-05, + "loss": 0.4522, + "step": 26900 + }, + { + "epoch": 1.5063836935827082, + "grad_norm": 1.5724152326583862, + "learning_rate": 9.818736842105264e-05, + "loss": 0.5246, + 
"step": 26901 + }, + { + "epoch": 1.5064396908948372, + "grad_norm": 1.3005868196487427, + "learning_rate": 9.818710526315789e-05, + "loss": 0.379, + "step": 26902 + }, + { + "epoch": 1.5064956882069662, + "grad_norm": 1.192333459854126, + "learning_rate": 9.818684210526316e-05, + "loss": 0.3715, + "step": 26903 + }, + { + "epoch": 1.5065516855190952, + "grad_norm": 1.3528265953063965, + "learning_rate": 9.818657894736842e-05, + "loss": 0.3949, + "step": 26904 + }, + { + "epoch": 1.5066076828312243, + "grad_norm": 1.7287955284118652, + "learning_rate": 9.81863157894737e-05, + "loss": 0.3794, + "step": 26905 + }, + { + "epoch": 1.5066636801433533, + "grad_norm": 2.0118093490600586, + "learning_rate": 9.818605263157896e-05, + "loss": 0.569, + "step": 26906 + }, + { + "epoch": 1.5067196774554823, + "grad_norm": 1.28496515750885, + "learning_rate": 9.818578947368422e-05, + "loss": 0.4252, + "step": 26907 + }, + { + "epoch": 1.5067756747676113, + "grad_norm": 1.237634301185608, + "learning_rate": 9.818552631578948e-05, + "loss": 0.4065, + "step": 26908 + }, + { + "epoch": 1.5068316720797403, + "grad_norm": 1.317826509475708, + "learning_rate": 9.818526315789475e-05, + "loss": 0.4356, + "step": 26909 + }, + { + "epoch": 1.5068876693918694, + "grad_norm": 1.404542326927185, + "learning_rate": 9.818500000000001e-05, + "loss": 0.5111, + "step": 26910 + }, + { + "epoch": 1.5069436667039984, + "grad_norm": 1.5804247856140137, + "learning_rate": 9.818473684210527e-05, + "loss": 0.4187, + "step": 26911 + }, + { + "epoch": 1.5069996640161272, + "grad_norm": 1.4498772621154785, + "learning_rate": 9.818447368421053e-05, + "loss": 0.5616, + "step": 26912 + }, + { + "epoch": 1.5070556613282562, + "grad_norm": 1.2389779090881348, + "learning_rate": 9.818421052631579e-05, + "loss": 0.4663, + "step": 26913 + }, + { + "epoch": 1.5071116586403852, + "grad_norm": 1.4195177555084229, + "learning_rate": 9.818394736842106e-05, + "loss": 0.5183, + "step": 26914 + }, + { + "epoch": 
1.5071676559525142, + "grad_norm": 1.356787919998169, + "learning_rate": 9.818368421052632e-05, + "loss": 0.3849, + "step": 26915 + }, + { + "epoch": 1.5072236532646432, + "grad_norm": 1.1957062482833862, + "learning_rate": 9.818342105263158e-05, + "loss": 0.402, + "step": 26916 + }, + { + "epoch": 1.5072796505767723, + "grad_norm": 1.26849365234375, + "learning_rate": 9.818315789473684e-05, + "loss": 0.538, + "step": 26917 + }, + { + "epoch": 1.5073356478889013, + "grad_norm": 1.2174674272537231, + "learning_rate": 9.818289473684211e-05, + "loss": 0.4348, + "step": 26918 + }, + { + "epoch": 1.5073916452010303, + "grad_norm": 1.4033805131912231, + "learning_rate": 9.818263157894737e-05, + "loss": 0.3808, + "step": 26919 + }, + { + "epoch": 1.5074476425131593, + "grad_norm": 1.1879520416259766, + "learning_rate": 9.818236842105263e-05, + "loss": 0.5138, + "step": 26920 + }, + { + "epoch": 1.5075036398252883, + "grad_norm": 1.3620471954345703, + "learning_rate": 9.81821052631579e-05, + "loss": 0.4985, + "step": 26921 + }, + { + "epoch": 1.5075596371374174, + "grad_norm": 1.1453810930252075, + "learning_rate": 9.818184210526317e-05, + "loss": 0.4237, + "step": 26922 + }, + { + "epoch": 1.5076156344495464, + "grad_norm": 18.394256591796875, + "learning_rate": 9.818157894736843e-05, + "loss": 0.5131, + "step": 26923 + }, + { + "epoch": 1.5076716317616754, + "grad_norm": 1.3331096172332764, + "learning_rate": 9.81813157894737e-05, + "loss": 0.4732, + "step": 26924 + }, + { + "epoch": 1.5077276290738044, + "grad_norm": 1.2102887630462646, + "learning_rate": 9.818105263157895e-05, + "loss": 0.4682, + "step": 26925 + }, + { + "epoch": 1.5077836263859334, + "grad_norm": 1.5214927196502686, + "learning_rate": 9.818078947368422e-05, + "loss": 0.4753, + "step": 26926 + }, + { + "epoch": 1.5078396236980625, + "grad_norm": 1.2080925703048706, + "learning_rate": 9.818052631578948e-05, + "loss": 0.4148, + "step": 26927 + }, + { + "epoch": 1.5078956210101915, + "grad_norm": 
7.116842269897461, + "learning_rate": 9.818026315789474e-05, + "loss": 0.4787, + "step": 26928 + }, + { + "epoch": 1.5079516183223205, + "grad_norm": 1.38553786277771, + "learning_rate": 9.818000000000001e-05, + "loss": 0.4541, + "step": 26929 + }, + { + "epoch": 1.5080076156344495, + "grad_norm": 3.073695659637451, + "learning_rate": 9.817973684210526e-05, + "loss": 0.4254, + "step": 26930 + }, + { + "epoch": 1.5080636129465785, + "grad_norm": 1.1337478160858154, + "learning_rate": 9.817947368421053e-05, + "loss": 0.3835, + "step": 26931 + }, + { + "epoch": 1.5081196102587076, + "grad_norm": 1.1407872438430786, + "learning_rate": 9.817921052631579e-05, + "loss": 0.3578, + "step": 26932 + }, + { + "epoch": 1.5081756075708366, + "grad_norm": 1.1704356670379639, + "learning_rate": 9.817894736842106e-05, + "loss": 0.3821, + "step": 26933 + }, + { + "epoch": 1.5082316048829656, + "grad_norm": 1.244095802307129, + "learning_rate": 9.817868421052631e-05, + "loss": 0.4444, + "step": 26934 + }, + { + "epoch": 1.5082876021950946, + "grad_norm": 1.176902413368225, + "learning_rate": 9.817842105263158e-05, + "loss": 0.389, + "step": 26935 + }, + { + "epoch": 1.5083435995072236, + "grad_norm": 1.5219768285751343, + "learning_rate": 9.817815789473684e-05, + "loss": 0.4255, + "step": 26936 + }, + { + "epoch": 1.5083995968193527, + "grad_norm": 1.2690356969833374, + "learning_rate": 9.817789473684212e-05, + "loss": 0.4926, + "step": 26937 + }, + { + "epoch": 1.5084555941314817, + "grad_norm": 1.209762454032898, + "learning_rate": 9.817763157894738e-05, + "loss": 0.4703, + "step": 26938 + }, + { + "epoch": 1.5085115914436107, + "grad_norm": 1.5581369400024414, + "learning_rate": 9.817736842105264e-05, + "loss": 0.4605, + "step": 26939 + }, + { + "epoch": 1.5085675887557397, + "grad_norm": 1.1573799848556519, + "learning_rate": 9.81771052631579e-05, + "loss": 0.4106, + "step": 26940 + }, + { + "epoch": 1.5086235860678687, + "grad_norm": 1.1588919162750244, + "learning_rate": 
9.817684210526317e-05, + "loss": 0.5242, + "step": 26941 + }, + { + "epoch": 1.5086795833799977, + "grad_norm": 1.4145991802215576, + "learning_rate": 9.817657894736843e-05, + "loss": 0.4363, + "step": 26942 + }, + { + "epoch": 1.5087355806921268, + "grad_norm": 1.0535063743591309, + "learning_rate": 9.817631578947369e-05, + "loss": 0.3595, + "step": 26943 + }, + { + "epoch": 1.5087915780042558, + "grad_norm": 1.281191349029541, + "learning_rate": 9.817605263157895e-05, + "loss": 0.4164, + "step": 26944 + }, + { + "epoch": 1.5088475753163848, + "grad_norm": 1.3760112524032593, + "learning_rate": 9.817578947368421e-05, + "loss": 0.459, + "step": 26945 + }, + { + "epoch": 1.5089035726285138, + "grad_norm": 1.3487932682037354, + "learning_rate": 9.817552631578948e-05, + "loss": 0.4675, + "step": 26946 + }, + { + "epoch": 1.5089595699406428, + "grad_norm": 1.274167776107788, + "learning_rate": 9.817526315789474e-05, + "loss": 0.4668, + "step": 26947 + }, + { + "epoch": 1.5090155672527719, + "grad_norm": 1.2175053358078003, + "learning_rate": 9.8175e-05, + "loss": 0.6446, + "step": 26948 + }, + { + "epoch": 1.5090715645649009, + "grad_norm": 1.1962106227874756, + "learning_rate": 9.817473684210526e-05, + "loss": 0.4663, + "step": 26949 + }, + { + "epoch": 1.50912756187703, + "grad_norm": 7.010622024536133, + "learning_rate": 9.817447368421053e-05, + "loss": 0.2958, + "step": 26950 + }, + { + "epoch": 1.509183559189159, + "grad_norm": 1.4108219146728516, + "learning_rate": 9.81742105263158e-05, + "loss": 0.4266, + "step": 26951 + }, + { + "epoch": 1.509239556501288, + "grad_norm": 1.3778231143951416, + "learning_rate": 9.817394736842105e-05, + "loss": 0.5832, + "step": 26952 + }, + { + "epoch": 1.509295553813417, + "grad_norm": 1.2143008708953857, + "learning_rate": 9.817368421052631e-05, + "loss": 0.49, + "step": 26953 + }, + { + "epoch": 1.509351551125546, + "grad_norm": 1.4713767766952515, + "learning_rate": 9.817342105263159e-05, + "loss": 0.4437, + "step": 26954 + 
}, + { + "epoch": 1.509407548437675, + "grad_norm": 1.3192007541656494, + "learning_rate": 9.817315789473685e-05, + "loss": 0.4068, + "step": 26955 + }, + { + "epoch": 1.509463545749804, + "grad_norm": 1.194394826889038, + "learning_rate": 9.817289473684212e-05, + "loss": 0.4051, + "step": 26956 + }, + { + "epoch": 1.509519543061933, + "grad_norm": 1.1465498208999634, + "learning_rate": 9.817263157894737e-05, + "loss": 0.4345, + "step": 26957 + }, + { + "epoch": 1.509575540374062, + "grad_norm": 1.186548113822937, + "learning_rate": 9.817236842105264e-05, + "loss": 0.4653, + "step": 26958 + }, + { + "epoch": 1.509631537686191, + "grad_norm": 1.673143744468689, + "learning_rate": 9.81721052631579e-05, + "loss": 0.8719, + "step": 26959 + }, + { + "epoch": 1.50968753499832, + "grad_norm": 1.4337735176086426, + "learning_rate": 9.817184210526317e-05, + "loss": 0.4798, + "step": 26960 + }, + { + "epoch": 1.509743532310449, + "grad_norm": 1.67737877368927, + "learning_rate": 9.817157894736843e-05, + "loss": 0.5259, + "step": 26961 + }, + { + "epoch": 1.5097995296225781, + "grad_norm": 1.0275202989578247, + "learning_rate": 9.817131578947368e-05, + "loss": 0.4168, + "step": 26962 + }, + { + "epoch": 1.5098555269347071, + "grad_norm": 1.2809327840805054, + "learning_rate": 9.817105263157895e-05, + "loss": 0.3889, + "step": 26963 + }, + { + "epoch": 1.5099115242468362, + "grad_norm": 1.1116302013397217, + "learning_rate": 9.817078947368421e-05, + "loss": 0.4027, + "step": 26964 + }, + { + "epoch": 1.5099675215589652, + "grad_norm": 1.626907229423523, + "learning_rate": 9.817052631578948e-05, + "loss": 0.5272, + "step": 26965 + }, + { + "epoch": 1.5100235188710942, + "grad_norm": 1.2781729698181152, + "learning_rate": 9.817026315789474e-05, + "loss": 0.5919, + "step": 26966 + }, + { + "epoch": 1.5100795161832232, + "grad_norm": 1.2536100149154663, + "learning_rate": 9.817e-05, + "loss": 0.3703, + "step": 26967 + }, + { + "epoch": 1.5101355134953522, + "grad_norm": 
1.2065856456756592, + "learning_rate": 9.816973684210526e-05, + "loss": 0.4474, + "step": 26968 + }, + { + "epoch": 1.5101915108074813, + "grad_norm": 1.3996970653533936, + "learning_rate": 9.816947368421054e-05, + "loss": 0.4775, + "step": 26969 + }, + { + "epoch": 1.5102475081196103, + "grad_norm": 1.4324966669082642, + "learning_rate": 9.81692105263158e-05, + "loss": 0.4612, + "step": 26970 + }, + { + "epoch": 1.5103035054317393, + "grad_norm": 1.1891260147094727, + "learning_rate": 9.816894736842106e-05, + "loss": 0.5276, + "step": 26971 + }, + { + "epoch": 1.5103595027438683, + "grad_norm": 1.230858564376831, + "learning_rate": 9.816868421052632e-05, + "loss": 0.592, + "step": 26972 + }, + { + "epoch": 1.5104155000559973, + "grad_norm": 1.4232933521270752, + "learning_rate": 9.816842105263159e-05, + "loss": 0.499, + "step": 26973 + }, + { + "epoch": 1.5104714973681264, + "grad_norm": 1.3536614179611206, + "learning_rate": 9.816815789473685e-05, + "loss": 0.3909, + "step": 26974 + }, + { + "epoch": 1.5105274946802554, + "grad_norm": 2.4492759704589844, + "learning_rate": 9.816789473684211e-05, + "loss": 0.5383, + "step": 26975 + }, + { + "epoch": 1.5105834919923844, + "grad_norm": 1.141347885131836, + "learning_rate": 9.816763157894737e-05, + "loss": 0.4357, + "step": 26976 + }, + { + "epoch": 1.5106394893045134, + "grad_norm": 1.2195390462875366, + "learning_rate": 9.816736842105264e-05, + "loss": 0.5351, + "step": 26977 + }, + { + "epoch": 1.5106954866166424, + "grad_norm": 1.432437539100647, + "learning_rate": 9.81671052631579e-05, + "loss": 0.6622, + "step": 26978 + }, + { + "epoch": 1.5107514839287715, + "grad_norm": 1.1447179317474365, + "learning_rate": 9.816684210526316e-05, + "loss": 0.3834, + "step": 26979 + }, + { + "epoch": 1.5108074812409005, + "grad_norm": 1.3678417205810547, + "learning_rate": 9.816657894736842e-05, + "loss": 0.5131, + "step": 26980 + }, + { + "epoch": 1.5108634785530295, + "grad_norm": 1.1029322147369385, + "learning_rate": 
9.816631578947368e-05, + "loss": 0.4077, + "step": 26981 + }, + { + "epoch": 1.5109194758651585, + "grad_norm": 1.260974645614624, + "learning_rate": 9.816605263157895e-05, + "loss": 0.5159, + "step": 26982 + }, + { + "epoch": 1.5109754731772875, + "grad_norm": 1.2160358428955078, + "learning_rate": 9.816578947368421e-05, + "loss": 0.4207, + "step": 26983 + }, + { + "epoch": 1.5110314704894166, + "grad_norm": 1.2865164279937744, + "learning_rate": 9.816552631578947e-05, + "loss": 0.4125, + "step": 26984 + }, + { + "epoch": 1.5110874678015456, + "grad_norm": 1.6537671089172363, + "learning_rate": 9.816526315789473e-05, + "loss": 0.6076, + "step": 26985 + }, + { + "epoch": 1.5111434651136746, + "grad_norm": 1.1508857011795044, + "learning_rate": 9.8165e-05, + "loss": 0.4455, + "step": 26986 + }, + { + "epoch": 1.5111994624258036, + "grad_norm": 1.6039947271347046, + "learning_rate": 9.816473684210527e-05, + "loss": 0.6007, + "step": 26987 + }, + { + "epoch": 1.5112554597379326, + "grad_norm": 1.6592415571212769, + "learning_rate": 9.816447368421054e-05, + "loss": 0.5268, + "step": 26988 + }, + { + "epoch": 1.5113114570500616, + "grad_norm": 1.210771083831787, + "learning_rate": 9.816421052631579e-05, + "loss": 0.4452, + "step": 26989 + }, + { + "epoch": 1.5113674543621907, + "grad_norm": 1.40608549118042, + "learning_rate": 9.816394736842106e-05, + "loss": 0.4988, + "step": 26990 + }, + { + "epoch": 1.5114234516743197, + "grad_norm": 2.0569067001342773, + "learning_rate": 9.816368421052632e-05, + "loss": 0.5539, + "step": 26991 + }, + { + "epoch": 1.5114794489864487, + "grad_norm": 2.3576831817626953, + "learning_rate": 9.816342105263159e-05, + "loss": 0.6229, + "step": 26992 + }, + { + "epoch": 1.5115354462985777, + "grad_norm": 1.2888567447662354, + "learning_rate": 9.816315789473685e-05, + "loss": 0.4938, + "step": 26993 + }, + { + "epoch": 1.5115914436107065, + "grad_norm": 1.8300138711929321, + "learning_rate": 9.816289473684211e-05, + "loss": 0.6033, + "step": 
26994 + }, + { + "epoch": 1.5116474409228355, + "grad_norm": 1.290547251701355, + "learning_rate": 9.816263157894737e-05, + "loss": 0.4041, + "step": 26995 + }, + { + "epoch": 1.5117034382349646, + "grad_norm": 1.5086348056793213, + "learning_rate": 9.816236842105264e-05, + "loss": 0.5601, + "step": 26996 + }, + { + "epoch": 1.5117594355470936, + "grad_norm": 1.2090970277786255, + "learning_rate": 9.81621052631579e-05, + "loss": 0.4098, + "step": 26997 + }, + { + "epoch": 1.5118154328592226, + "grad_norm": 1.125792384147644, + "learning_rate": 9.816184210526316e-05, + "loss": 0.3752, + "step": 26998 + }, + { + "epoch": 1.5118714301713516, + "grad_norm": 1.3590054512023926, + "learning_rate": 9.816157894736842e-05, + "loss": 0.4998, + "step": 26999 + }, + { + "epoch": 1.5119274274834806, + "grad_norm": 1.4590128660202026, + "learning_rate": 9.816131578947368e-05, + "loss": 0.4472, + "step": 27000 + }, + { + "epoch": 1.5119834247956097, + "grad_norm": 1.3467227220535278, + "learning_rate": 9.816105263157896e-05, + "loss": 0.5055, + "step": 27001 + }, + { + "epoch": 1.5120394221077387, + "grad_norm": 1.2745952606201172, + "learning_rate": 9.816078947368422e-05, + "loss": 0.5036, + "step": 27002 + }, + { + "epoch": 1.5120954194198677, + "grad_norm": 1.2922078371047974, + "learning_rate": 9.816052631578948e-05, + "loss": 0.5639, + "step": 27003 + }, + { + "epoch": 1.5121514167319967, + "grad_norm": 1.372616171836853, + "learning_rate": 9.816026315789474e-05, + "loss": 0.5246, + "step": 27004 + }, + { + "epoch": 1.5122074140441257, + "grad_norm": 1.3429547548294067, + "learning_rate": 9.816000000000001e-05, + "loss": 0.419, + "step": 27005 + }, + { + "epoch": 1.5122634113562547, + "grad_norm": 1.0865153074264526, + "learning_rate": 9.815973684210527e-05, + "loss": 0.3538, + "step": 27006 + }, + { + "epoch": 1.5123194086683838, + "grad_norm": 1.3962829113006592, + "learning_rate": 9.815947368421053e-05, + "loss": 0.454, + "step": 27007 + }, + { + "epoch": 
1.5123754059805128, + "grad_norm": 1.2971525192260742, + "learning_rate": 9.815921052631579e-05, + "loss": 0.4568, + "step": 27008 + }, + { + "epoch": 1.5124314032926418, + "grad_norm": 3.0007100105285645, + "learning_rate": 9.815894736842106e-05, + "loss": 0.3183, + "step": 27009 + }, + { + "epoch": 1.5124874006047708, + "grad_norm": 1.372809648513794, + "learning_rate": 9.815868421052632e-05, + "loss": 0.5886, + "step": 27010 + }, + { + "epoch": 1.5125433979168998, + "grad_norm": 1.3756669759750366, + "learning_rate": 9.81584210526316e-05, + "loss": 0.4924, + "step": 27011 + }, + { + "epoch": 1.5125993952290289, + "grad_norm": 1.1371204853057861, + "learning_rate": 9.815815789473684e-05, + "loss": 0.3745, + "step": 27012 + }, + { + "epoch": 1.5126553925411579, + "grad_norm": 1.4544603824615479, + "learning_rate": 9.815789473684211e-05, + "loss": 0.4664, + "step": 27013 + }, + { + "epoch": 1.512711389853287, + "grad_norm": 1.2304860353469849, + "learning_rate": 9.815763157894737e-05, + "loss": 0.4754, + "step": 27014 + }, + { + "epoch": 1.512767387165416, + "grad_norm": 1.2915074825286865, + "learning_rate": 9.815736842105263e-05, + "loss": 0.4477, + "step": 27015 + }, + { + "epoch": 1.512823384477545, + "grad_norm": 2.008554697036743, + "learning_rate": 9.81571052631579e-05, + "loss": 0.7653, + "step": 27016 + }, + { + "epoch": 1.512879381789674, + "grad_norm": 1.2905584573745728, + "learning_rate": 9.815684210526315e-05, + "loss": 0.4139, + "step": 27017 + }, + { + "epoch": 1.512935379101803, + "grad_norm": 1.3069546222686768, + "learning_rate": 9.815657894736843e-05, + "loss": 0.5049, + "step": 27018 + }, + { + "epoch": 1.512991376413932, + "grad_norm": 1.5047800540924072, + "learning_rate": 9.815631578947369e-05, + "loss": 0.4656, + "step": 27019 + }, + { + "epoch": 1.513047373726061, + "grad_norm": 1.1808758974075317, + "learning_rate": 9.815605263157896e-05, + "loss": 0.3925, + "step": 27020 + }, + { + "epoch": 1.51310337103819, + "grad_norm": 
1.5783368349075317, + "learning_rate": 9.815578947368422e-05, + "loss": 0.4801, + "step": 27021 + }, + { + "epoch": 1.513159368350319, + "grad_norm": 1.9102554321289062, + "learning_rate": 9.815552631578948e-05, + "loss": 0.5058, + "step": 27022 + }, + { + "epoch": 1.513215365662448, + "grad_norm": 1.4686520099639893, + "learning_rate": 9.815526315789474e-05, + "loss": 0.5082, + "step": 27023 + }, + { + "epoch": 1.513271362974577, + "grad_norm": 1.7134006023406982, + "learning_rate": 9.815500000000001e-05, + "loss": 0.5512, + "step": 27024 + }, + { + "epoch": 1.5133273602867061, + "grad_norm": 1.4265936613082886, + "learning_rate": 9.815473684210527e-05, + "loss": 0.5363, + "step": 27025 + }, + { + "epoch": 1.5133833575988351, + "grad_norm": 2.0115835666656494, + "learning_rate": 9.815447368421053e-05, + "loss": 0.5573, + "step": 27026 + }, + { + "epoch": 1.5134393549109642, + "grad_norm": 1.341893196105957, + "learning_rate": 9.815421052631579e-05, + "loss": 0.5759, + "step": 27027 + }, + { + "epoch": 1.5134953522230932, + "grad_norm": 4.912156581878662, + "learning_rate": 9.815394736842106e-05, + "loss": 0.5019, + "step": 27028 + }, + { + "epoch": 1.5135513495352222, + "grad_norm": 1.7848292589187622, + "learning_rate": 9.815368421052632e-05, + "loss": 0.5088, + "step": 27029 + }, + { + "epoch": 1.5136073468473512, + "grad_norm": 2.5112762451171875, + "learning_rate": 9.815342105263158e-05, + "loss": 0.5038, + "step": 27030 + }, + { + "epoch": 1.5136633441594802, + "grad_norm": 1.1688530445098877, + "learning_rate": 9.815315789473684e-05, + "loss": 0.437, + "step": 27031 + }, + { + "epoch": 1.5137193414716092, + "grad_norm": 1.1913703680038452, + "learning_rate": 9.81528947368421e-05, + "loss": 0.5217, + "step": 27032 + }, + { + "epoch": 1.5137753387837383, + "grad_norm": 1.3893768787384033, + "learning_rate": 9.815263157894738e-05, + "loss": 0.4392, + "step": 27033 + }, + { + "epoch": 1.5138313360958673, + "grad_norm": 1.5970072746276855, + "learning_rate": 
9.815236842105264e-05, + "loss": 0.5538, + "step": 27034 + }, + { + "epoch": 1.5138873334079963, + "grad_norm": 1.2803162336349487, + "learning_rate": 9.81521052631579e-05, + "loss": 0.4452, + "step": 27035 + }, + { + "epoch": 1.5139433307201253, + "grad_norm": 1.2993170022964478, + "learning_rate": 9.815184210526316e-05, + "loss": 0.5912, + "step": 27036 + }, + { + "epoch": 1.5139993280322543, + "grad_norm": 1.235471487045288, + "learning_rate": 9.815157894736843e-05, + "loss": 0.5795, + "step": 27037 + }, + { + "epoch": 1.5140553253443834, + "grad_norm": 1.541257619857788, + "learning_rate": 9.815131578947369e-05, + "loss": 0.5014, + "step": 27038 + }, + { + "epoch": 1.5141113226565124, + "grad_norm": 1.2752493619918823, + "learning_rate": 9.815105263157895e-05, + "loss": 0.4493, + "step": 27039 + }, + { + "epoch": 1.5141673199686414, + "grad_norm": 1.2818028926849365, + "learning_rate": 9.815078947368421e-05, + "loss": 0.5289, + "step": 27040 + }, + { + "epoch": 1.5142233172807704, + "grad_norm": 1.458085060119629, + "learning_rate": 9.815052631578948e-05, + "loss": 0.5055, + "step": 27041 + }, + { + "epoch": 1.5142793145928994, + "grad_norm": 1.3541269302368164, + "learning_rate": 9.815026315789474e-05, + "loss": 0.447, + "step": 27042 + }, + { + "epoch": 1.5143353119050285, + "grad_norm": 1.884742021560669, + "learning_rate": 9.815000000000001e-05, + "loss": 0.4487, + "step": 27043 + }, + { + "epoch": 1.5143913092171575, + "grad_norm": 1.2877849340438843, + "learning_rate": 9.814973684210526e-05, + "loss": 0.5738, + "step": 27044 + }, + { + "epoch": 1.5144473065292865, + "grad_norm": 1.541459560394287, + "learning_rate": 9.814947368421053e-05, + "loss": 0.5417, + "step": 27045 + }, + { + "epoch": 1.5145033038414155, + "grad_norm": 1.1358022689819336, + "learning_rate": 9.814921052631579e-05, + "loss": 0.5055, + "step": 27046 + }, + { + "epoch": 1.5145593011535445, + "grad_norm": 1.6166036128997803, + "learning_rate": 9.814894736842107e-05, + "loss": 0.3977, + 
"step": 27047 + }, + { + "epoch": 1.5146152984656736, + "grad_norm": 1.3530287742614746, + "learning_rate": 9.814868421052633e-05, + "loss": 0.5103, + "step": 27048 + }, + { + "epoch": 1.5146712957778026, + "grad_norm": 1.3396238088607788, + "learning_rate": 9.814842105263157e-05, + "loss": 0.3918, + "step": 27049 + }, + { + "epoch": 1.5147272930899316, + "grad_norm": 1.286135196685791, + "learning_rate": 9.814815789473685e-05, + "loss": 0.3937, + "step": 27050 + }, + { + "epoch": 1.5147832904020606, + "grad_norm": 1.4488623142242432, + "learning_rate": 9.81478947368421e-05, + "loss": 0.4194, + "step": 27051 + }, + { + "epoch": 1.5148392877141896, + "grad_norm": 1.2895300388336182, + "learning_rate": 9.814763157894738e-05, + "loss": 0.4557, + "step": 27052 + }, + { + "epoch": 1.5148952850263186, + "grad_norm": 1.3724356889724731, + "learning_rate": 9.814736842105264e-05, + "loss": 0.4399, + "step": 27053 + }, + { + "epoch": 1.5149512823384477, + "grad_norm": 1.2225584983825684, + "learning_rate": 9.81471052631579e-05, + "loss": 0.4237, + "step": 27054 + }, + { + "epoch": 1.5150072796505767, + "grad_norm": 1.3517637252807617, + "learning_rate": 9.814684210526316e-05, + "loss": 0.5057, + "step": 27055 + }, + { + "epoch": 1.5150632769627057, + "grad_norm": 1.3071893453598022, + "learning_rate": 9.814657894736843e-05, + "loss": 0.4281, + "step": 27056 + }, + { + "epoch": 1.5151192742748347, + "grad_norm": 1.7707455158233643, + "learning_rate": 9.814631578947369e-05, + "loss": 0.5988, + "step": 27057 + }, + { + "epoch": 1.5151752715869637, + "grad_norm": 1.6087961196899414, + "learning_rate": 9.814605263157895e-05, + "loss": 0.6235, + "step": 27058 + }, + { + "epoch": 1.5152312688990928, + "grad_norm": 1.6496331691741943, + "learning_rate": 9.814578947368421e-05, + "loss": 0.4991, + "step": 27059 + }, + { + "epoch": 1.5152872662112218, + "grad_norm": 1.4258060455322266, + "learning_rate": 9.814552631578948e-05, + "loss": 0.4091, + "step": 27060 + }, + { + "epoch": 
1.5153432635233508, + "grad_norm": 1.502301812171936, + "learning_rate": 9.814526315789474e-05, + "loss": 0.4726, + "step": 27061 + }, + { + "epoch": 1.5153992608354798, + "grad_norm": 1.0531821250915527, + "learning_rate": 9.8145e-05, + "loss": 0.4544, + "step": 27062 + }, + { + "epoch": 1.5154552581476088, + "grad_norm": 1.3646159172058105, + "learning_rate": 9.814473684210526e-05, + "loss": 0.4808, + "step": 27063 + }, + { + "epoch": 1.5155112554597379, + "grad_norm": 1.2903409004211426, + "learning_rate": 9.814447368421054e-05, + "loss": 0.455, + "step": 27064 + }, + { + "epoch": 1.5155672527718669, + "grad_norm": 1.236262559890747, + "learning_rate": 9.81442105263158e-05, + "loss": 0.4925, + "step": 27065 + }, + { + "epoch": 1.515623250083996, + "grad_norm": 1.5894114971160889, + "learning_rate": 9.814394736842106e-05, + "loss": 0.4423, + "step": 27066 + }, + { + "epoch": 1.515679247396125, + "grad_norm": 1.3579305410385132, + "learning_rate": 9.814368421052632e-05, + "loss": 0.3986, + "step": 27067 + }, + { + "epoch": 1.515735244708254, + "grad_norm": 1.482695460319519, + "learning_rate": 9.814342105263157e-05, + "loss": 0.4451, + "step": 27068 + }, + { + "epoch": 1.515791242020383, + "grad_norm": 1.119757056236267, + "learning_rate": 9.814315789473685e-05, + "loss": 0.4088, + "step": 27069 + }, + { + "epoch": 1.515847239332512, + "grad_norm": 1.4656729698181152, + "learning_rate": 9.814289473684211e-05, + "loss": 0.5124, + "step": 27070 + }, + { + "epoch": 1.515903236644641, + "grad_norm": 1.4599612951278687, + "learning_rate": 9.814263157894738e-05, + "loss": 0.5764, + "step": 27071 + }, + { + "epoch": 1.51595923395677, + "grad_norm": 1.2479785680770874, + "learning_rate": 9.814236842105263e-05, + "loss": 0.4453, + "step": 27072 + }, + { + "epoch": 1.516015231268899, + "grad_norm": 1.2437714338302612, + "learning_rate": 9.81421052631579e-05, + "loss": 0.3318, + "step": 27073 + }, + { + "epoch": 1.516071228581028, + "grad_norm": 1.187427043914795, + 
"learning_rate": 9.814184210526316e-05, + "loss": 0.408, + "step": 27074 + }, + { + "epoch": 1.516127225893157, + "grad_norm": 2.3595211505889893, + "learning_rate": 9.814157894736843e-05, + "loss": 0.4369, + "step": 27075 + }, + { + "epoch": 1.516183223205286, + "grad_norm": 1.1709308624267578, + "learning_rate": 9.81413157894737e-05, + "loss": 0.3735, + "step": 27076 + }, + { + "epoch": 1.516239220517415, + "grad_norm": 1.4980545043945312, + "learning_rate": 9.814105263157895e-05, + "loss": 0.558, + "step": 27077 + }, + { + "epoch": 1.5162952178295441, + "grad_norm": 1.3856656551361084, + "learning_rate": 9.814078947368421e-05, + "loss": 0.543, + "step": 27078 + }, + { + "epoch": 1.5163512151416731, + "grad_norm": 1.2116948366165161, + "learning_rate": 9.814052631578949e-05, + "loss": 0.4328, + "step": 27079 + }, + { + "epoch": 1.5164072124538022, + "grad_norm": 1.1827560663223267, + "learning_rate": 9.814026315789475e-05, + "loss": 0.4754, + "step": 27080 + }, + { + "epoch": 1.5164632097659312, + "grad_norm": 1.2897484302520752, + "learning_rate": 9.814e-05, + "loss": 0.3885, + "step": 27081 + }, + { + "epoch": 1.5165192070780602, + "grad_norm": 1.2172999382019043, + "learning_rate": 9.813973684210527e-05, + "loss": 0.4431, + "step": 27082 + }, + { + "epoch": 1.5165752043901892, + "grad_norm": 1.2611318826675415, + "learning_rate": 9.813947368421053e-05, + "loss": 0.4159, + "step": 27083 + }, + { + "epoch": 1.5166312017023182, + "grad_norm": 1.3078234195709229, + "learning_rate": 9.81392105263158e-05, + "loss": 0.4671, + "step": 27084 + }, + { + "epoch": 1.5166871990144473, + "grad_norm": 1.3110862970352173, + "learning_rate": 9.813894736842106e-05, + "loss": 0.4017, + "step": 27085 + }, + { + "epoch": 1.5167431963265763, + "grad_norm": 1.41542649269104, + "learning_rate": 9.813868421052632e-05, + "loss": 0.4007, + "step": 27086 + }, + { + "epoch": 1.5167991936387053, + "grad_norm": 1.227420449256897, + "learning_rate": 9.813842105263158e-05, + "loss": 0.5873, + 
"step": 27087 + }, + { + "epoch": 1.5168551909508343, + "grad_norm": 1.150922417640686, + "learning_rate": 9.813815789473685e-05, + "loss": 0.4562, + "step": 27088 + }, + { + "epoch": 1.5169111882629633, + "grad_norm": 1.364452600479126, + "learning_rate": 9.813789473684211e-05, + "loss": 0.5006, + "step": 27089 + }, + { + "epoch": 1.5169671855750924, + "grad_norm": 1.3885174989700317, + "learning_rate": 9.813763157894737e-05, + "loss": 0.4819, + "step": 27090 + }, + { + "epoch": 1.5170231828872214, + "grad_norm": 1.5120718479156494, + "learning_rate": 9.813736842105263e-05, + "loss": 0.4843, + "step": 27091 + }, + { + "epoch": 1.5170791801993504, + "grad_norm": 1.4497497081756592, + "learning_rate": 9.81371052631579e-05, + "loss": 0.413, + "step": 27092 + }, + { + "epoch": 1.5171351775114794, + "grad_norm": 1.415647029876709, + "learning_rate": 9.813684210526316e-05, + "loss": 0.4852, + "step": 27093 + }, + { + "epoch": 1.5171911748236084, + "grad_norm": 1.0697673559188843, + "learning_rate": 9.813657894736842e-05, + "loss": 0.3982, + "step": 27094 + }, + { + "epoch": 1.5172471721357375, + "grad_norm": 1.2788331508636475, + "learning_rate": 9.813631578947368e-05, + "loss": 0.4982, + "step": 27095 + }, + { + "epoch": 1.5173031694478665, + "grad_norm": 1.8143675327301025, + "learning_rate": 9.813605263157896e-05, + "loss": 0.4638, + "step": 27096 + }, + { + "epoch": 1.5173591667599955, + "grad_norm": 1.411655306816101, + "learning_rate": 9.813578947368422e-05, + "loss": 0.52, + "step": 27097 + }, + { + "epoch": 1.5174151640721245, + "grad_norm": 2.727475166320801, + "learning_rate": 9.813552631578949e-05, + "loss": 0.4718, + "step": 27098 + }, + { + "epoch": 1.5174711613842535, + "grad_norm": 1.9955580234527588, + "learning_rate": 9.813526315789473e-05, + "loss": 0.6964, + "step": 27099 + }, + { + "epoch": 1.5175271586963825, + "grad_norm": 1.6153862476348877, + "learning_rate": 9.8135e-05, + "loss": 0.4741, + "step": 27100 + }, + { + "epoch": 1.5175831560085116, + 
"grad_norm": 1.4560762643814087, + "learning_rate": 9.813473684210527e-05, + "loss": 0.5971, + "step": 27101 + }, + { + "epoch": 1.5176391533206406, + "grad_norm": 1.269814133644104, + "learning_rate": 9.813447368421053e-05, + "loss": 0.5312, + "step": 27102 + }, + { + "epoch": 1.5176951506327696, + "grad_norm": 1.0865706205368042, + "learning_rate": 9.81342105263158e-05, + "loss": 0.3884, + "step": 27103 + }, + { + "epoch": 1.5177511479448986, + "grad_norm": 1.2786933183670044, + "learning_rate": 9.813394736842105e-05, + "loss": 0.3779, + "step": 27104 + }, + { + "epoch": 1.5178071452570276, + "grad_norm": 1.280717134475708, + "learning_rate": 9.813368421052632e-05, + "loss": 0.3973, + "step": 27105 + }, + { + "epoch": 1.5178631425691567, + "grad_norm": 1.2063264846801758, + "learning_rate": 9.813342105263158e-05, + "loss": 0.4159, + "step": 27106 + }, + { + "epoch": 1.5179191398812857, + "grad_norm": 1.1776610612869263, + "learning_rate": 9.813315789473685e-05, + "loss": 0.4355, + "step": 27107 + }, + { + "epoch": 1.5179751371934147, + "grad_norm": 1.3531817197799683, + "learning_rate": 9.813289473684211e-05, + "loss": 0.4998, + "step": 27108 + }, + { + "epoch": 1.5180311345055437, + "grad_norm": 1.4713177680969238, + "learning_rate": 9.813263157894737e-05, + "loss": 0.5086, + "step": 27109 + }, + { + "epoch": 1.5180871318176727, + "grad_norm": 1.3408626317977905, + "learning_rate": 9.813236842105263e-05, + "loss": 0.3993, + "step": 27110 + }, + { + "epoch": 1.5181431291298018, + "grad_norm": 1.6550896167755127, + "learning_rate": 9.81321052631579e-05, + "loss": 0.5474, + "step": 27111 + }, + { + "epoch": 1.5181991264419308, + "grad_norm": 1.925497055053711, + "learning_rate": 9.813184210526317e-05, + "loss": 0.4303, + "step": 27112 + }, + { + "epoch": 1.5182551237540598, + "grad_norm": 1.4134142398834229, + "learning_rate": 9.813157894736843e-05, + "loss": 0.502, + "step": 27113 + }, + { + "epoch": 1.5183111210661888, + "grad_norm": 1.3809471130371094, + 
"learning_rate": 9.813131578947369e-05, + "loss": 0.5189, + "step": 27114 + }, + { + "epoch": 1.5183671183783178, + "grad_norm": 1.0960626602172852, + "learning_rate": 9.813105263157896e-05, + "loss": 0.422, + "step": 27115 + }, + { + "epoch": 1.5184231156904469, + "grad_norm": 1.3033056259155273, + "learning_rate": 9.813078947368422e-05, + "loss": 0.4158, + "step": 27116 + }, + { + "epoch": 1.5184791130025759, + "grad_norm": 1.3114839792251587, + "learning_rate": 9.813052631578948e-05, + "loss": 0.4797, + "step": 27117 + }, + { + "epoch": 1.518535110314705, + "grad_norm": 1.7351293563842773, + "learning_rate": 9.813026315789474e-05, + "loss": 0.4195, + "step": 27118 + }, + { + "epoch": 1.518591107626834, + "grad_norm": 1.5991266965866089, + "learning_rate": 9.813e-05, + "loss": 0.3638, + "step": 27119 + }, + { + "epoch": 1.518647104938963, + "grad_norm": 1.3073616027832031, + "learning_rate": 9.812973684210527e-05, + "loss": 0.3962, + "step": 27120 + }, + { + "epoch": 1.518703102251092, + "grad_norm": 1.023710012435913, + "learning_rate": 9.812947368421053e-05, + "loss": 0.461, + "step": 27121 + }, + { + "epoch": 1.518759099563221, + "grad_norm": 1.4045839309692383, + "learning_rate": 9.812921052631579e-05, + "loss": 0.4891, + "step": 27122 + }, + { + "epoch": 1.51881509687535, + "grad_norm": 1.2008012533187866, + "learning_rate": 9.812894736842105e-05, + "loss": 0.3799, + "step": 27123 + }, + { + "epoch": 1.518871094187479, + "grad_norm": 1.2304240465164185, + "learning_rate": 9.812868421052632e-05, + "loss": 0.4843, + "step": 27124 + }, + { + "epoch": 1.518927091499608, + "grad_norm": 1.461506724357605, + "learning_rate": 9.812842105263158e-05, + "loss": 0.4795, + "step": 27125 + }, + { + "epoch": 1.518983088811737, + "grad_norm": 1.3854840993881226, + "learning_rate": 9.812815789473686e-05, + "loss": 0.4384, + "step": 27126 + }, + { + "epoch": 1.519039086123866, + "grad_norm": 1.8799121379852295, + "learning_rate": 9.81278947368421e-05, + "loss": 0.6213, + 
"step": 27127 + }, + { + "epoch": 1.519095083435995, + "grad_norm": 1.2859798669815063, + "learning_rate": 9.812763157894738e-05, + "loss": 0.4172, + "step": 27128 + }, + { + "epoch": 1.519151080748124, + "grad_norm": 1.3912931680679321, + "learning_rate": 9.812736842105264e-05, + "loss": 0.4523, + "step": 27129 + }, + { + "epoch": 1.5192070780602531, + "grad_norm": 1.075352430343628, + "learning_rate": 9.812710526315791e-05, + "loss": 0.37, + "step": 27130 + }, + { + "epoch": 1.5192630753723821, + "grad_norm": 1.4183050394058228, + "learning_rate": 9.812684210526317e-05, + "loss": 0.408, + "step": 27131 + }, + { + "epoch": 1.5193190726845112, + "grad_norm": 1.3998409509658813, + "learning_rate": 9.812657894736843e-05, + "loss": 0.6337, + "step": 27132 + }, + { + "epoch": 1.5193750699966402, + "grad_norm": 1.930840015411377, + "learning_rate": 9.812631578947369e-05, + "loss": 0.5001, + "step": 27133 + }, + { + "epoch": 1.5194310673087692, + "grad_norm": 1.0706380605697632, + "learning_rate": 9.812605263157896e-05, + "loss": 0.3335, + "step": 27134 + }, + { + "epoch": 1.5194870646208982, + "grad_norm": 2.313913106918335, + "learning_rate": 9.812578947368422e-05, + "loss": 0.6187, + "step": 27135 + }, + { + "epoch": 1.5195430619330272, + "grad_norm": 1.260740041732788, + "learning_rate": 9.812552631578947e-05, + "loss": 0.4128, + "step": 27136 + }, + { + "epoch": 1.5195990592451563, + "grad_norm": 3.1872777938842773, + "learning_rate": 9.812526315789474e-05, + "loss": 0.4337, + "step": 27137 + }, + { + "epoch": 1.5196550565572853, + "grad_norm": 1.3023390769958496, + "learning_rate": 9.8125e-05, + "loss": 0.3654, + "step": 27138 + }, + { + "epoch": 1.5197110538694143, + "grad_norm": 1.215165376663208, + "learning_rate": 9.812473684210527e-05, + "loss": 0.3756, + "step": 27139 + }, + { + "epoch": 1.5197670511815433, + "grad_norm": 1.2889282703399658, + "learning_rate": 9.812447368421053e-05, + "loss": 0.4967, + "step": 27140 + }, + { + "epoch": 1.5198230484936723, + 
"grad_norm": 1.33611261844635, + "learning_rate": 9.812421052631579e-05, + "loss": 0.3644, + "step": 27141 + }, + { + "epoch": 1.5198790458058014, + "grad_norm": 1.5400924682617188, + "learning_rate": 9.812394736842105e-05, + "loss": 0.6058, + "step": 27142 + }, + { + "epoch": 1.5199350431179304, + "grad_norm": 1.2969951629638672, + "learning_rate": 9.812368421052633e-05, + "loss": 0.707, + "step": 27143 + }, + { + "epoch": 1.5199910404300594, + "grad_norm": 1.2953400611877441, + "learning_rate": 9.812342105263159e-05, + "loss": 0.5529, + "step": 27144 + }, + { + "epoch": 1.5200470377421884, + "grad_norm": 1.1095150709152222, + "learning_rate": 9.812315789473685e-05, + "loss": 0.3975, + "step": 27145 + }, + { + "epoch": 1.5201030350543174, + "grad_norm": 1.2372747659683228, + "learning_rate": 9.81228947368421e-05, + "loss": 0.5075, + "step": 27146 + }, + { + "epoch": 1.5201590323664464, + "grad_norm": 1.6907185316085815, + "learning_rate": 9.812263157894738e-05, + "loss": 0.6013, + "step": 27147 + }, + { + "epoch": 1.5202150296785755, + "grad_norm": 1.5320686101913452, + "learning_rate": 9.812236842105264e-05, + "loss": 0.5221, + "step": 27148 + }, + { + "epoch": 1.5202710269907045, + "grad_norm": 1.3951154947280884, + "learning_rate": 9.81221052631579e-05, + "loss": 0.6272, + "step": 27149 + }, + { + "epoch": 1.5203270243028335, + "grad_norm": 1.3134037256240845, + "learning_rate": 9.812184210526316e-05, + "loss": 0.4784, + "step": 27150 + }, + { + "epoch": 1.5203830216149625, + "grad_norm": 1.2420580387115479, + "learning_rate": 9.812157894736843e-05, + "loss": 0.4082, + "step": 27151 + }, + { + "epoch": 1.5204390189270915, + "grad_norm": 1.3023356199264526, + "learning_rate": 9.812131578947369e-05, + "loss": 0.5054, + "step": 27152 + }, + { + "epoch": 1.5204950162392206, + "grad_norm": 4.242269515991211, + "learning_rate": 9.812105263157895e-05, + "loss": 0.3689, + "step": 27153 + }, + { + "epoch": 1.5205510135513496, + "grad_norm": 1.17594575881958, + 
"learning_rate": 9.812078947368421e-05, + "loss": 0.391, + "step": 27154 + }, + { + "epoch": 1.5206070108634786, + "grad_norm": 1.258170247077942, + "learning_rate": 9.812052631578947e-05, + "loss": 0.4083, + "step": 27155 + }, + { + "epoch": 1.5206630081756076, + "grad_norm": 1.5187753438949585, + "learning_rate": 9.812026315789474e-05, + "loss": 0.6817, + "step": 27156 + }, + { + "epoch": 1.5207190054877366, + "grad_norm": 1.2015737295150757, + "learning_rate": 9.812e-05, + "loss": 0.4681, + "step": 27157 + }, + { + "epoch": 1.5207750027998657, + "grad_norm": 1.2637263536453247, + "learning_rate": 9.811973684210528e-05, + "loss": 0.3941, + "step": 27158 + }, + { + "epoch": 1.5208310001119947, + "grad_norm": 1.280678153038025, + "learning_rate": 9.811947368421052e-05, + "loss": 0.4573, + "step": 27159 + }, + { + "epoch": 1.5208869974241237, + "grad_norm": 1.2184869050979614, + "learning_rate": 9.81192105263158e-05, + "loss": 0.4618, + "step": 27160 + }, + { + "epoch": 1.5209429947362527, + "grad_norm": 1.2928537130355835, + "learning_rate": 9.811894736842105e-05, + "loss": 0.3909, + "step": 27161 + }, + { + "epoch": 1.5209989920483817, + "grad_norm": 1.3666313886642456, + "learning_rate": 9.811868421052633e-05, + "loss": 0.4929, + "step": 27162 + }, + { + "epoch": 1.5210549893605108, + "grad_norm": 1.306142807006836, + "learning_rate": 9.811842105263159e-05, + "loss": 0.4229, + "step": 27163 + }, + { + "epoch": 1.5211109866726398, + "grad_norm": 2.1070852279663086, + "learning_rate": 9.811815789473685e-05, + "loss": 0.4423, + "step": 27164 + }, + { + "epoch": 1.5211669839847688, + "grad_norm": 1.2931374311447144, + "learning_rate": 9.811789473684211e-05, + "loss": 0.4635, + "step": 27165 + }, + { + "epoch": 1.5212229812968978, + "grad_norm": 1.4965428113937378, + "learning_rate": 9.811763157894738e-05, + "loss": 0.5256, + "step": 27166 + }, + { + "epoch": 1.5212789786090268, + "grad_norm": 1.2619374990463257, + "learning_rate": 9.811736842105264e-05, + "loss": 
0.4944, + "step": 27167 + }, + { + "epoch": 1.5213349759211559, + "grad_norm": 1.268546462059021, + "learning_rate": 9.81171052631579e-05, + "loss": 0.4044, + "step": 27168 + }, + { + "epoch": 1.5213909732332849, + "grad_norm": 1.2187644243240356, + "learning_rate": 9.811684210526316e-05, + "loss": 0.3879, + "step": 27169 + }, + { + "epoch": 1.521446970545414, + "grad_norm": 1.40371835231781, + "learning_rate": 9.811657894736842e-05, + "loss": 0.4912, + "step": 27170 + }, + { + "epoch": 1.521502967857543, + "grad_norm": 2.225950002670288, + "learning_rate": 9.811631578947369e-05, + "loss": 0.512, + "step": 27171 + }, + { + "epoch": 1.521558965169672, + "grad_norm": 2.1730637550354004, + "learning_rate": 9.811605263157895e-05, + "loss": 0.4606, + "step": 27172 + }, + { + "epoch": 1.521614962481801, + "grad_norm": 1.4855023622512817, + "learning_rate": 9.811578947368421e-05, + "loss": 0.4031, + "step": 27173 + }, + { + "epoch": 1.52167095979393, + "grad_norm": 1.6859471797943115, + "learning_rate": 9.811552631578947e-05, + "loss": 0.5472, + "step": 27174 + }, + { + "epoch": 1.521726957106059, + "grad_norm": 1.3349955081939697, + "learning_rate": 9.811526315789475e-05, + "loss": 0.6914, + "step": 27175 + }, + { + "epoch": 1.521782954418188, + "grad_norm": 1.1871305704116821, + "learning_rate": 9.8115e-05, + "loss": 0.3831, + "step": 27176 + }, + { + "epoch": 1.521838951730317, + "grad_norm": 1.4136004447937012, + "learning_rate": 9.811473684210526e-05, + "loss": 0.4222, + "step": 27177 + }, + { + "epoch": 1.521894949042446, + "grad_norm": 1.245827317237854, + "learning_rate": 9.811447368421052e-05, + "loss": 0.4735, + "step": 27178 + }, + { + "epoch": 1.521950946354575, + "grad_norm": 1.1601252555847168, + "learning_rate": 9.81142105263158e-05, + "loss": 0.3037, + "step": 27179 + }, + { + "epoch": 1.522006943666704, + "grad_norm": 1.2845308780670166, + "learning_rate": 9.811394736842106e-05, + "loss": 0.4207, + "step": 27180 + }, + { + "epoch": 1.522062940978833, + 
"grad_norm": 1.2875018119812012, + "learning_rate": 9.811368421052633e-05, + "loss": 0.5284, + "step": 27181 + }, + { + "epoch": 1.5221189382909621, + "grad_norm": 1.3341776132583618, + "learning_rate": 9.811342105263158e-05, + "loss": 0.4887, + "step": 27182 + }, + { + "epoch": 1.5221749356030911, + "grad_norm": 1.4460558891296387, + "learning_rate": 9.811315789473685e-05, + "loss": 0.4795, + "step": 27183 + }, + { + "epoch": 1.5222309329152202, + "grad_norm": 1.4187291860580444, + "learning_rate": 9.811289473684211e-05, + "loss": 0.6419, + "step": 27184 + }, + { + "epoch": 1.5222869302273492, + "grad_norm": 1.1622968912124634, + "learning_rate": 9.811263157894738e-05, + "loss": 0.3809, + "step": 27185 + }, + { + "epoch": 1.5223429275394782, + "grad_norm": 1.2642873525619507, + "learning_rate": 9.811236842105263e-05, + "loss": 0.4248, + "step": 27186 + }, + { + "epoch": 1.5223989248516072, + "grad_norm": 1.4572975635528564, + "learning_rate": 9.811210526315789e-05, + "loss": 0.7081, + "step": 27187 + }, + { + "epoch": 1.5224549221637362, + "grad_norm": 1.3987679481506348, + "learning_rate": 9.811184210526316e-05, + "loss": 0.606, + "step": 27188 + }, + { + "epoch": 1.5225109194758653, + "grad_norm": 1.2974300384521484, + "learning_rate": 9.811157894736842e-05, + "loss": 0.4245, + "step": 27189 + }, + { + "epoch": 1.5225669167879943, + "grad_norm": 1.381773829460144, + "learning_rate": 9.81113157894737e-05, + "loss": 0.465, + "step": 27190 + }, + { + "epoch": 1.5226229141001233, + "grad_norm": 1.5028408765792847, + "learning_rate": 9.811105263157894e-05, + "loss": 0.4995, + "step": 27191 + }, + { + "epoch": 1.5226789114122523, + "grad_norm": 1.1616239547729492, + "learning_rate": 9.811078947368421e-05, + "loss": 0.4463, + "step": 27192 + }, + { + "epoch": 1.5227349087243813, + "grad_norm": 1.256756067276001, + "learning_rate": 9.811052631578947e-05, + "loss": 0.396, + "step": 27193 + }, + { + "epoch": 1.5227909060365103, + "grad_norm": 1.2919772863388062, + 
"learning_rate": 9.811026315789475e-05, + "loss": 0.5616, + "step": 27194 + }, + { + "epoch": 1.5228469033486394, + "grad_norm": 1.1837810277938843, + "learning_rate": 9.811000000000001e-05, + "loss": 0.4797, + "step": 27195 + }, + { + "epoch": 1.5229029006607684, + "grad_norm": 2.169356346130371, + "learning_rate": 9.810973684210527e-05, + "loss": 0.4727, + "step": 27196 + }, + { + "epoch": 1.5229588979728974, + "grad_norm": 1.384002923965454, + "learning_rate": 9.810947368421053e-05, + "loss": 0.4697, + "step": 27197 + }, + { + "epoch": 1.5230148952850264, + "grad_norm": 1.2022258043289185, + "learning_rate": 9.81092105263158e-05, + "loss": 0.4269, + "step": 27198 + }, + { + "epoch": 1.5230708925971554, + "grad_norm": 1.2444813251495361, + "learning_rate": 9.810894736842106e-05, + "loss": 0.5477, + "step": 27199 + }, + { + "epoch": 1.5231268899092845, + "grad_norm": 1.3995620012283325, + "learning_rate": 9.810868421052632e-05, + "loss": 0.5408, + "step": 27200 + }, + { + "epoch": 1.5231828872214135, + "grad_norm": 1.2903931140899658, + "learning_rate": 9.810842105263158e-05, + "loss": 0.3646, + "step": 27201 + }, + { + "epoch": 1.5232388845335425, + "grad_norm": 1.0688526630401611, + "learning_rate": 9.810815789473685e-05, + "loss": 0.3268, + "step": 27202 + }, + { + "epoch": 1.5232948818456715, + "grad_norm": 1.2227773666381836, + "learning_rate": 9.810789473684211e-05, + "loss": 0.4748, + "step": 27203 + }, + { + "epoch": 1.5233508791578005, + "grad_norm": 1.2780729532241821, + "learning_rate": 9.810763157894737e-05, + "loss": 0.3487, + "step": 27204 + }, + { + "epoch": 1.5234068764699296, + "grad_norm": 1.2940983772277832, + "learning_rate": 9.810736842105263e-05, + "loss": 0.6563, + "step": 27205 + }, + { + "epoch": 1.5234628737820586, + "grad_norm": 1.4650967121124268, + "learning_rate": 9.810710526315789e-05, + "loss": 0.413, + "step": 27206 + }, + { + "epoch": 1.5235188710941876, + "grad_norm": 1.5916510820388794, + "learning_rate": 9.810684210526317e-05, 
+ "loss": 0.4701, + "step": 27207 + }, + { + "epoch": 1.5235748684063166, + "grad_norm": 1.412753939628601, + "learning_rate": 9.810657894736842e-05, + "loss": 0.5446, + "step": 27208 + }, + { + "epoch": 1.5236308657184456, + "grad_norm": 1.569606900215149, + "learning_rate": 9.810631578947368e-05, + "loss": 0.3985, + "step": 27209 + }, + { + "epoch": 1.5236868630305747, + "grad_norm": 1.2182259559631348, + "learning_rate": 9.810605263157894e-05, + "loss": 0.3772, + "step": 27210 + }, + { + "epoch": 1.5237428603427037, + "grad_norm": 1.3352302312850952, + "learning_rate": 9.810578947368422e-05, + "loss": 0.4234, + "step": 27211 + }, + { + "epoch": 1.5237988576548327, + "grad_norm": 1.3450623750686646, + "learning_rate": 9.810552631578948e-05, + "loss": 0.5302, + "step": 27212 + }, + { + "epoch": 1.5238548549669617, + "grad_norm": 1.124396800994873, + "learning_rate": 9.810526315789475e-05, + "loss": 0.4326, + "step": 27213 + }, + { + "epoch": 1.5239108522790907, + "grad_norm": 1.2622078657150269, + "learning_rate": 9.8105e-05, + "loss": 0.4639, + "step": 27214 + }, + { + "epoch": 1.5239668495912198, + "grad_norm": 1.2417024374008179, + "learning_rate": 9.810473684210527e-05, + "loss": 0.4054, + "step": 27215 + }, + { + "epoch": 1.5240228469033488, + "grad_norm": 1.3757787942886353, + "learning_rate": 9.810447368421053e-05, + "loss": 0.656, + "step": 27216 + }, + { + "epoch": 1.5240788442154778, + "grad_norm": 3.607560873031616, + "learning_rate": 9.81042105263158e-05, + "loss": 0.5444, + "step": 27217 + }, + { + "epoch": 1.5241348415276068, + "grad_norm": 1.2746714353561401, + "learning_rate": 9.810394736842106e-05, + "loss": 0.4427, + "step": 27218 + }, + { + "epoch": 1.5241908388397358, + "grad_norm": 1.492066740989685, + "learning_rate": 9.810368421052632e-05, + "loss": 0.5981, + "step": 27219 + }, + { + "epoch": 1.5242468361518648, + "grad_norm": 1.1095901727676392, + "learning_rate": 9.810342105263158e-05, + "loss": 0.3708, + "step": 27220 + }, + { + "epoch": 
1.5243028334639939, + "grad_norm": 1.349563479423523, + "learning_rate": 9.810315789473684e-05, + "loss": 0.4297, + "step": 27221 + }, + { + "epoch": 1.5243588307761229, + "grad_norm": 1.4883623123168945, + "learning_rate": 9.810289473684212e-05, + "loss": 0.3863, + "step": 27222 + }, + { + "epoch": 1.524414828088252, + "grad_norm": 1.3130929470062256, + "learning_rate": 9.810263157894737e-05, + "loss": 0.406, + "step": 27223 + }, + { + "epoch": 1.524470825400381, + "grad_norm": 1.2860057353973389, + "learning_rate": 9.810236842105263e-05, + "loss": 0.4408, + "step": 27224 + }, + { + "epoch": 1.52452682271251, + "grad_norm": 1.1853996515274048, + "learning_rate": 9.81021052631579e-05, + "loss": 0.3751, + "step": 27225 + }, + { + "epoch": 1.524582820024639, + "grad_norm": 1.4100369215011597, + "learning_rate": 9.810184210526317e-05, + "loss": 0.4522, + "step": 27226 + }, + { + "epoch": 1.524638817336768, + "grad_norm": 1.4437059164047241, + "learning_rate": 9.810157894736843e-05, + "loss": 0.5082, + "step": 27227 + }, + { + "epoch": 1.524694814648897, + "grad_norm": 0.9887460470199585, + "learning_rate": 9.810131578947369e-05, + "loss": 0.4007, + "step": 27228 + }, + { + "epoch": 1.524750811961026, + "grad_norm": 1.3373279571533203, + "learning_rate": 9.810105263157895e-05, + "loss": 0.4514, + "step": 27229 + }, + { + "epoch": 1.524806809273155, + "grad_norm": 1.519224762916565, + "learning_rate": 9.810078947368422e-05, + "loss": 0.4418, + "step": 27230 + }, + { + "epoch": 1.524862806585284, + "grad_norm": 1.5326309204101562, + "learning_rate": 9.810052631578948e-05, + "loss": 0.6816, + "step": 27231 + }, + { + "epoch": 1.524918803897413, + "grad_norm": 1.2395530939102173, + "learning_rate": 9.810026315789474e-05, + "loss": 0.4167, + "step": 27232 + }, + { + "epoch": 1.524974801209542, + "grad_norm": 1.1414129734039307, + "learning_rate": 9.81e-05, + "loss": 0.4036, + "step": 27233 + }, + { + "epoch": 1.5250307985216711, + "grad_norm": 1.3156803846359253, + 
"learning_rate": 9.809973684210527e-05, + "loss": 0.4922, + "step": 27234 + }, + { + "epoch": 1.5250867958338001, + "grad_norm": 1.4350786209106445, + "learning_rate": 9.809947368421053e-05, + "loss": 0.5052, + "step": 27235 + }, + { + "epoch": 1.5251427931459292, + "grad_norm": 1.3264518976211548, + "learning_rate": 9.80992105263158e-05, + "loss": 0.3631, + "step": 27236 + }, + { + "epoch": 1.5251987904580582, + "grad_norm": 1.1088902950286865, + "learning_rate": 9.809894736842105e-05, + "loss": 0.3764, + "step": 27237 + }, + { + "epoch": 1.5252547877701872, + "grad_norm": 1.1665536165237427, + "learning_rate": 9.809868421052633e-05, + "loss": 0.5014, + "step": 27238 + }, + { + "epoch": 1.5253107850823162, + "grad_norm": 1.2686662673950195, + "learning_rate": 9.809842105263158e-05, + "loss": 0.4081, + "step": 27239 + }, + { + "epoch": 1.5253667823944452, + "grad_norm": 2.089110851287842, + "learning_rate": 9.809815789473684e-05, + "loss": 0.4076, + "step": 27240 + }, + { + "epoch": 1.5254227797065743, + "grad_norm": 1.2193576097488403, + "learning_rate": 9.80978947368421e-05, + "loss": 0.4736, + "step": 27241 + }, + { + "epoch": 1.525478777018703, + "grad_norm": 1.1142656803131104, + "learning_rate": 9.809763157894736e-05, + "loss": 0.3628, + "step": 27242 + }, + { + "epoch": 1.525534774330832, + "grad_norm": 2.5167746543884277, + "learning_rate": 9.809736842105264e-05, + "loss": 0.4503, + "step": 27243 + }, + { + "epoch": 1.525590771642961, + "grad_norm": 1.2938402891159058, + "learning_rate": 9.80971052631579e-05, + "loss": 0.4976, + "step": 27244 + }, + { + "epoch": 1.52564676895509, + "grad_norm": 1.2518161535263062, + "learning_rate": 9.809684210526317e-05, + "loss": 0.457, + "step": 27245 + }, + { + "epoch": 1.5257027662672191, + "grad_norm": 1.3567074537277222, + "learning_rate": 9.809657894736842e-05, + "loss": 0.4738, + "step": 27246 + }, + { + "epoch": 1.5257587635793481, + "grad_norm": 1.4210426807403564, + "learning_rate": 9.809631578947369e-05, + 
"loss": 0.4596, + "step": 27247 + }, + { + "epoch": 1.5258147608914772, + "grad_norm": 1.4643398523330688, + "learning_rate": 9.809605263157895e-05, + "loss": 0.5329, + "step": 27248 + }, + { + "epoch": 1.5258707582036062, + "grad_norm": 1.2194404602050781, + "learning_rate": 9.809578947368422e-05, + "loss": 0.4716, + "step": 27249 + }, + { + "epoch": 1.5259267555157352, + "grad_norm": 1.2832481861114502, + "learning_rate": 9.809552631578948e-05, + "loss": 0.3971, + "step": 27250 + }, + { + "epoch": 1.5259827528278642, + "grad_norm": 1.1515140533447266, + "learning_rate": 9.809526315789474e-05, + "loss": 0.4872, + "step": 27251 + }, + { + "epoch": 1.5260387501399932, + "grad_norm": 1.1932960748672485, + "learning_rate": 9.8095e-05, + "loss": 0.3725, + "step": 27252 + }, + { + "epoch": 1.5260947474521223, + "grad_norm": 1.0814008712768555, + "learning_rate": 9.809473684210528e-05, + "loss": 0.2995, + "step": 27253 + }, + { + "epoch": 1.5261507447642513, + "grad_norm": 1.4640215635299683, + "learning_rate": 9.809447368421053e-05, + "loss": 0.4988, + "step": 27254 + }, + { + "epoch": 1.5262067420763803, + "grad_norm": 1.5298998355865479, + "learning_rate": 9.80942105263158e-05, + "loss": 0.491, + "step": 27255 + }, + { + "epoch": 1.5262627393885093, + "grad_norm": 1.3268486261367798, + "learning_rate": 9.809394736842105e-05, + "loss": 0.5103, + "step": 27256 + }, + { + "epoch": 1.5263187367006383, + "grad_norm": 1.8547991514205933, + "learning_rate": 9.809368421052631e-05, + "loss": 0.4281, + "step": 27257 + }, + { + "epoch": 1.5263747340127674, + "grad_norm": 1.2123308181762695, + "learning_rate": 9.809342105263159e-05, + "loss": 0.3843, + "step": 27258 + }, + { + "epoch": 1.5264307313248964, + "grad_norm": 1.3726972341537476, + "learning_rate": 9.809315789473685e-05, + "loss": 0.5126, + "step": 27259 + }, + { + "epoch": 1.5264867286370254, + "grad_norm": 1.2992104291915894, + "learning_rate": 9.809289473684211e-05, + "loss": 0.4645, + "step": 27260 + }, + { + 
"epoch": 1.5265427259491544, + "grad_norm": 1.420084834098816, + "learning_rate": 9.809263157894737e-05, + "loss": 0.5628, + "step": 27261 + }, + { + "epoch": 1.5265987232612834, + "grad_norm": 1.237008810043335, + "learning_rate": 9.809236842105264e-05, + "loss": 0.6578, + "step": 27262 + }, + { + "epoch": 1.5266547205734124, + "grad_norm": 1.4518870115280151, + "learning_rate": 9.80921052631579e-05, + "loss": 0.4862, + "step": 27263 + }, + { + "epoch": 1.5267107178855415, + "grad_norm": 1.1755800247192383, + "learning_rate": 9.809184210526316e-05, + "loss": 0.4391, + "step": 27264 + }, + { + "epoch": 1.5267667151976705, + "grad_norm": 1.2907551527023315, + "learning_rate": 9.809157894736842e-05, + "loss": 0.5257, + "step": 27265 + }, + { + "epoch": 1.5268227125097995, + "grad_norm": 1.1906359195709229, + "learning_rate": 9.809131578947369e-05, + "loss": 0.3457, + "step": 27266 + }, + { + "epoch": 1.5268787098219285, + "grad_norm": 1.5713539123535156, + "learning_rate": 9.809105263157895e-05, + "loss": 0.467, + "step": 27267 + }, + { + "epoch": 1.5269347071340575, + "grad_norm": 1.5531158447265625, + "learning_rate": 9.809078947368423e-05, + "loss": 0.5502, + "step": 27268 + }, + { + "epoch": 1.5269907044461866, + "grad_norm": 1.4606424570083618, + "learning_rate": 9.809052631578947e-05, + "loss": 0.2967, + "step": 27269 + }, + { + "epoch": 1.5270467017583156, + "grad_norm": 1.352168083190918, + "learning_rate": 9.809026315789474e-05, + "loss": 0.4274, + "step": 27270 + }, + { + "epoch": 1.5271026990704446, + "grad_norm": 1.477839469909668, + "learning_rate": 9.809e-05, + "loss": 0.3866, + "step": 27271 + }, + { + "epoch": 1.5271586963825736, + "grad_norm": 1.2686796188354492, + "learning_rate": 9.808973684210528e-05, + "loss": 0.4013, + "step": 27272 + }, + { + "epoch": 1.5272146936947026, + "grad_norm": 1.5855923891067505, + "learning_rate": 9.808947368421054e-05, + "loss": 0.5514, + "step": 27273 + }, + { + "epoch": 1.5272706910068317, + "grad_norm": 
1.2596975564956665, + "learning_rate": 9.808921052631578e-05, + "loss": 0.4397, + "step": 27274 + }, + { + "epoch": 1.5273266883189607, + "grad_norm": 1.4806995391845703, + "learning_rate": 9.808894736842106e-05, + "loss": 0.5705, + "step": 27275 + }, + { + "epoch": 1.5273826856310897, + "grad_norm": 1.3230366706848145, + "learning_rate": 9.808868421052632e-05, + "loss": 0.6061, + "step": 27276 + }, + { + "epoch": 1.5274386829432187, + "grad_norm": 1.2527214288711548, + "learning_rate": 9.808842105263159e-05, + "loss": 0.464, + "step": 27277 + }, + { + "epoch": 1.5274946802553477, + "grad_norm": 1.3091689348220825, + "learning_rate": 9.808815789473685e-05, + "loss": 0.3889, + "step": 27278 + }, + { + "epoch": 1.5275506775674768, + "grad_norm": 1.3826245069503784, + "learning_rate": 9.808789473684211e-05, + "loss": 0.4983, + "step": 27279 + }, + { + "epoch": 1.5276066748796058, + "grad_norm": 1.2268179655075073, + "learning_rate": 9.808763157894737e-05, + "loss": 0.3406, + "step": 27280 + }, + { + "epoch": 1.5276626721917348, + "grad_norm": 1.3147023916244507, + "learning_rate": 9.808736842105264e-05, + "loss": 0.6199, + "step": 27281 + }, + { + "epoch": 1.5277186695038638, + "grad_norm": 1.1666747331619263, + "learning_rate": 9.80871052631579e-05, + "loss": 0.4304, + "step": 27282 + }, + { + "epoch": 1.5277746668159928, + "grad_norm": 1.3330376148223877, + "learning_rate": 9.808684210526316e-05, + "loss": 0.4283, + "step": 27283 + }, + { + "epoch": 1.5278306641281219, + "grad_norm": 1.4851858615875244, + "learning_rate": 9.808657894736842e-05, + "loss": 0.3865, + "step": 27284 + }, + { + "epoch": 1.5278866614402509, + "grad_norm": 1.4621042013168335, + "learning_rate": 9.80863157894737e-05, + "loss": 0.5033, + "step": 27285 + }, + { + "epoch": 1.52794265875238, + "grad_norm": 1.2530263662338257, + "learning_rate": 9.808605263157895e-05, + "loss": 0.4445, + "step": 27286 + }, + { + "epoch": 1.527998656064509, + "grad_norm": 1.4377599954605103, + "learning_rate": 
9.808578947368421e-05, + "loss": 0.4676, + "step": 27287 + }, + { + "epoch": 1.528054653376638, + "grad_norm": 1.7416679859161377, + "learning_rate": 9.808552631578947e-05, + "loss": 0.5612, + "step": 27288 + }, + { + "epoch": 1.528110650688767, + "grad_norm": 1.4890049695968628, + "learning_rate": 9.808526315789475e-05, + "loss": 0.4705, + "step": 27289 + }, + { + "epoch": 1.528166648000896, + "grad_norm": 1.6244866847991943, + "learning_rate": 9.808500000000001e-05, + "loss": 0.5147, + "step": 27290 + }, + { + "epoch": 1.528222645313025, + "grad_norm": 1.2908213138580322, + "learning_rate": 9.808473684210527e-05, + "loss": 0.3702, + "step": 27291 + }, + { + "epoch": 1.528278642625154, + "grad_norm": 1.3829395771026611, + "learning_rate": 9.808447368421053e-05, + "loss": 0.4444, + "step": 27292 + }, + { + "epoch": 1.528334639937283, + "grad_norm": 1.6316728591918945, + "learning_rate": 9.808421052631579e-05, + "loss": 0.4936, + "step": 27293 + }, + { + "epoch": 1.528390637249412, + "grad_norm": 1.4428198337554932, + "learning_rate": 9.808394736842106e-05, + "loss": 0.5268, + "step": 27294 + }, + { + "epoch": 1.528446634561541, + "grad_norm": 1.3690496683120728, + "learning_rate": 9.808368421052632e-05, + "loss": 0.4147, + "step": 27295 + }, + { + "epoch": 1.52850263187367, + "grad_norm": 1.5757402181625366, + "learning_rate": 9.808342105263158e-05, + "loss": 0.6325, + "step": 27296 + }, + { + "epoch": 1.528558629185799, + "grad_norm": 1.3949118852615356, + "learning_rate": 9.808315789473684e-05, + "loss": 0.4446, + "step": 27297 + }, + { + "epoch": 1.5286146264979281, + "grad_norm": 1.326460838317871, + "learning_rate": 9.808289473684211e-05, + "loss": 0.5211, + "step": 27298 + }, + { + "epoch": 1.5286706238100571, + "grad_norm": 1.2361222505569458, + "learning_rate": 9.808263157894737e-05, + "loss": 0.4878, + "step": 27299 + }, + { + "epoch": 1.5287266211221862, + "grad_norm": 1.2712743282318115, + "learning_rate": 9.808236842105264e-05, + "loss": 0.3534, + 
"step": 27300 + }, + { + "epoch": 1.5287826184343152, + "grad_norm": 1.9682635068893433, + "learning_rate": 9.808210526315789e-05, + "loss": 0.4473, + "step": 27301 + }, + { + "epoch": 1.5288386157464442, + "grad_norm": 1.5164655447006226, + "learning_rate": 9.808184210526316e-05, + "loss": 0.5343, + "step": 27302 + }, + { + "epoch": 1.5288946130585732, + "grad_norm": 1.496638298034668, + "learning_rate": 9.808157894736842e-05, + "loss": 0.5247, + "step": 27303 + }, + { + "epoch": 1.5289506103707022, + "grad_norm": 1.265756607055664, + "learning_rate": 9.80813157894737e-05, + "loss": 0.3993, + "step": 27304 + }, + { + "epoch": 1.5290066076828313, + "grad_norm": 1.3341575860977173, + "learning_rate": 9.808105263157896e-05, + "loss": 0.4568, + "step": 27305 + }, + { + "epoch": 1.5290626049949603, + "grad_norm": 1.7430514097213745, + "learning_rate": 9.808078947368422e-05, + "loss": 0.4912, + "step": 27306 + }, + { + "epoch": 1.5291186023070893, + "grad_norm": 1.522204041481018, + "learning_rate": 9.808052631578948e-05, + "loss": 0.4835, + "step": 27307 + }, + { + "epoch": 1.5291745996192183, + "grad_norm": 8.616300582885742, + "learning_rate": 9.808026315789474e-05, + "loss": 0.6381, + "step": 27308 + }, + { + "epoch": 1.5292305969313473, + "grad_norm": 1.8359129428863525, + "learning_rate": 9.808000000000001e-05, + "loss": 0.6387, + "step": 27309 + }, + { + "epoch": 1.5292865942434763, + "grad_norm": 1.3288633823394775, + "learning_rate": 9.807973684210527e-05, + "loss": 0.4425, + "step": 27310 + }, + { + "epoch": 1.5293425915556054, + "grad_norm": 1.3993644714355469, + "learning_rate": 9.807947368421053e-05, + "loss": 0.4144, + "step": 27311 + }, + { + "epoch": 1.5293985888677344, + "grad_norm": 1.2721813917160034, + "learning_rate": 9.807921052631579e-05, + "loss": 0.3192, + "step": 27312 + }, + { + "epoch": 1.5294545861798634, + "grad_norm": 1.2544965744018555, + "learning_rate": 9.807894736842106e-05, + "loss": 0.5001, + "step": 27313 + }, + { + "epoch": 
1.5295105834919924, + "grad_norm": 1.2644429206848145, + "learning_rate": 9.807868421052632e-05, + "loss": 0.4599, + "step": 27314 + }, + { + "epoch": 1.5295665808041214, + "grad_norm": 1.3764448165893555, + "learning_rate": 9.807842105263158e-05, + "loss": 0.518, + "step": 27315 + }, + { + "epoch": 1.5296225781162505, + "grad_norm": 1.488978624343872, + "learning_rate": 9.807815789473684e-05, + "loss": 0.5179, + "step": 27316 + }, + { + "epoch": 1.5296785754283795, + "grad_norm": 1.5431971549987793, + "learning_rate": 9.807789473684211e-05, + "loss": 0.5134, + "step": 27317 + }, + { + "epoch": 1.5297345727405085, + "grad_norm": 1.3441526889801025, + "learning_rate": 9.807763157894737e-05, + "loss": 0.3493, + "step": 27318 + }, + { + "epoch": 1.5297905700526375, + "grad_norm": 1.3060107231140137, + "learning_rate": 9.807736842105263e-05, + "loss": 0.4902, + "step": 27319 + }, + { + "epoch": 1.5298465673647665, + "grad_norm": 1.2472405433654785, + "learning_rate": 9.80771052631579e-05, + "loss": 0.3954, + "step": 27320 + }, + { + "epoch": 1.5299025646768956, + "grad_norm": 1.2510164976119995, + "learning_rate": 9.807684210526317e-05, + "loss": 0.4524, + "step": 27321 + }, + { + "epoch": 1.5299585619890246, + "grad_norm": 1.3180891275405884, + "learning_rate": 9.807657894736843e-05, + "loss": 0.3708, + "step": 27322 + }, + { + "epoch": 1.5300145593011536, + "grad_norm": 1.217759370803833, + "learning_rate": 9.80763157894737e-05, + "loss": 0.4818, + "step": 27323 + }, + { + "epoch": 1.5300705566132826, + "grad_norm": 1.3709975481033325, + "learning_rate": 9.807605263157895e-05, + "loss": 0.46, + "step": 27324 + }, + { + "epoch": 1.5301265539254114, + "grad_norm": 1.1192548274993896, + "learning_rate": 9.80757894736842e-05, + "loss": 0.3857, + "step": 27325 + }, + { + "epoch": 1.5301825512375404, + "grad_norm": 1.4618414640426636, + "learning_rate": 9.807552631578948e-05, + "loss": 0.3888, + "step": 27326 + }, + { + "epoch": 1.5302385485496695, + "grad_norm": 
1.6253644227981567, + "learning_rate": 9.807526315789474e-05, + "loss": 0.6086, + "step": 27327 + }, + { + "epoch": 1.5302945458617985, + "grad_norm": 1.1323260068893433, + "learning_rate": 9.807500000000001e-05, + "loss": 0.4307, + "step": 27328 + }, + { + "epoch": 1.5303505431739275, + "grad_norm": 1.3198050260543823, + "learning_rate": 9.807473684210526e-05, + "loss": 0.4646, + "step": 27329 + }, + { + "epoch": 1.5304065404860565, + "grad_norm": 1.1652281284332275, + "learning_rate": 9.807447368421053e-05, + "loss": 0.4897, + "step": 27330 + }, + { + "epoch": 1.5304625377981855, + "grad_norm": 1.1871758699417114, + "learning_rate": 9.807421052631579e-05, + "loss": 0.5071, + "step": 27331 + }, + { + "epoch": 1.5305185351103145, + "grad_norm": 1.3732500076293945, + "learning_rate": 9.807394736842106e-05, + "loss": 0.3617, + "step": 27332 + }, + { + "epoch": 1.5305745324224436, + "grad_norm": 1.1966280937194824, + "learning_rate": 9.807368421052631e-05, + "loss": 0.4051, + "step": 27333 + }, + { + "epoch": 1.5306305297345726, + "grad_norm": 1.1944531202316284, + "learning_rate": 9.807342105263158e-05, + "loss": 0.387, + "step": 27334 + }, + { + "epoch": 1.5306865270467016, + "grad_norm": 1.246779203414917, + "learning_rate": 9.807315789473684e-05, + "loss": 0.3806, + "step": 27335 + }, + { + "epoch": 1.5307425243588306, + "grad_norm": 1.5883923768997192, + "learning_rate": 9.807289473684212e-05, + "loss": 0.5501, + "step": 27336 + }, + { + "epoch": 1.5307985216709596, + "grad_norm": 1.4065109491348267, + "learning_rate": 9.807263157894738e-05, + "loss": 0.461, + "step": 27337 + }, + { + "epoch": 1.5308545189830887, + "grad_norm": 1.3632166385650635, + "learning_rate": 9.807236842105264e-05, + "loss": 0.4515, + "step": 27338 + }, + { + "epoch": 1.5309105162952177, + "grad_norm": 1.1589895486831665, + "learning_rate": 9.80721052631579e-05, + "loss": 0.506, + "step": 27339 + }, + { + "epoch": 1.5309665136073467, + "grad_norm": 1.538962960243225, + "learning_rate": 
9.807184210526317e-05, + "loss": 0.3962, + "step": 27340 + }, + { + "epoch": 1.5310225109194757, + "grad_norm": 1.7433265447616577, + "learning_rate": 9.807157894736843e-05, + "loss": 0.5894, + "step": 27341 + }, + { + "epoch": 1.5310785082316047, + "grad_norm": 1.180971384048462, + "learning_rate": 9.807131578947369e-05, + "loss": 0.2923, + "step": 27342 + }, + { + "epoch": 1.5311345055437338, + "grad_norm": 1.596555471420288, + "learning_rate": 9.807105263157895e-05, + "loss": 0.4668, + "step": 27343 + }, + { + "epoch": 1.5311905028558628, + "grad_norm": 1.2896409034729004, + "learning_rate": 9.807078947368421e-05, + "loss": 0.5266, + "step": 27344 + }, + { + "epoch": 1.5312465001679918, + "grad_norm": 1.253602385520935, + "learning_rate": 9.807052631578948e-05, + "loss": 0.5117, + "step": 27345 + }, + { + "epoch": 1.5313024974801208, + "grad_norm": 1.3735167980194092, + "learning_rate": 9.807026315789474e-05, + "loss": 0.4223, + "step": 27346 + }, + { + "epoch": 1.5313584947922498, + "grad_norm": 1.3457038402557373, + "learning_rate": 9.807e-05, + "loss": 0.4288, + "step": 27347 + }, + { + "epoch": 1.5314144921043789, + "grad_norm": 1.4801067113876343, + "learning_rate": 9.806973684210526e-05, + "loss": 0.4664, + "step": 27348 + }, + { + "epoch": 1.5314704894165079, + "grad_norm": 1.544136643409729, + "learning_rate": 9.806947368421053e-05, + "loss": 0.449, + "step": 27349 + }, + { + "epoch": 1.531526486728637, + "grad_norm": 1.1419053077697754, + "learning_rate": 9.80692105263158e-05, + "loss": 0.4573, + "step": 27350 + }, + { + "epoch": 1.531582484040766, + "grad_norm": 1.8215211629867554, + "learning_rate": 9.806894736842105e-05, + "loss": 0.495, + "step": 27351 + }, + { + "epoch": 1.531638481352895, + "grad_norm": 1.4061273336410522, + "learning_rate": 9.806868421052631e-05, + "loss": 0.5048, + "step": 27352 + }, + { + "epoch": 1.531694478665024, + "grad_norm": 1.3769176006317139, + "learning_rate": 9.806842105263159e-05, + "loss": 0.5012, + "step": 27353 + 
}, + { + "epoch": 1.531750475977153, + "grad_norm": 1.22798490524292, + "learning_rate": 9.806815789473685e-05, + "loss": 0.4844, + "step": 27354 + }, + { + "epoch": 1.531806473289282, + "grad_norm": 1.7953495979309082, + "learning_rate": 9.806789473684212e-05, + "loss": 0.5709, + "step": 27355 + }, + { + "epoch": 1.531862470601411, + "grad_norm": 1.2608802318572998, + "learning_rate": 9.806763157894737e-05, + "loss": 0.3773, + "step": 27356 + }, + { + "epoch": 1.53191846791354, + "grad_norm": 1.4404892921447754, + "learning_rate": 9.806736842105264e-05, + "loss": 0.6066, + "step": 27357 + }, + { + "epoch": 1.531974465225669, + "grad_norm": 1.2526389360427856, + "learning_rate": 9.80671052631579e-05, + "loss": 0.3322, + "step": 27358 + }, + { + "epoch": 1.532030462537798, + "grad_norm": 1.2167657613754272, + "learning_rate": 9.806684210526317e-05, + "loss": 0.3577, + "step": 27359 + }, + { + "epoch": 1.532086459849927, + "grad_norm": 1.2484211921691895, + "learning_rate": 9.806657894736843e-05, + "loss": 0.3917, + "step": 27360 + }, + { + "epoch": 1.532142457162056, + "grad_norm": 1.107147455215454, + "learning_rate": 9.806631578947368e-05, + "loss": 0.4293, + "step": 27361 + }, + { + "epoch": 1.5321984544741851, + "grad_norm": 1.4587103128433228, + "learning_rate": 9.806605263157895e-05, + "loss": 0.6043, + "step": 27362 + }, + { + "epoch": 1.5322544517863141, + "grad_norm": 1.3617078065872192, + "learning_rate": 9.806578947368421e-05, + "loss": 0.5124, + "step": 27363 + }, + { + "epoch": 1.5323104490984432, + "grad_norm": 1.3550008535385132, + "learning_rate": 9.806552631578948e-05, + "loss": 0.4097, + "step": 27364 + }, + { + "epoch": 1.5323664464105722, + "grad_norm": 1.4102470874786377, + "learning_rate": 9.806526315789474e-05, + "loss": 0.4618, + "step": 27365 + }, + { + "epoch": 1.5324224437227012, + "grad_norm": 1.5110503435134888, + "learning_rate": 9.8065e-05, + "loss": 0.5387, + "step": 27366 + }, + { + "epoch": 1.5324784410348302, + "grad_norm": 
1.5743473768234253, + "learning_rate": 9.806473684210526e-05, + "loss": 0.4776, + "step": 27367 + }, + { + "epoch": 1.5325344383469592, + "grad_norm": 1.6093778610229492, + "learning_rate": 9.806447368421054e-05, + "loss": 0.3658, + "step": 27368 + }, + { + "epoch": 1.5325904356590883, + "grad_norm": 1.5215086936950684, + "learning_rate": 9.80642105263158e-05, + "loss": 0.5366, + "step": 27369 + }, + { + "epoch": 1.5326464329712173, + "grad_norm": 1.475719690322876, + "learning_rate": 9.806394736842106e-05, + "loss": 0.6122, + "step": 27370 + }, + { + "epoch": 1.5327024302833463, + "grad_norm": 1.2523468732833862, + "learning_rate": 9.806368421052632e-05, + "loss": 0.398, + "step": 27371 + }, + { + "epoch": 1.5327584275954753, + "grad_norm": 1.1293725967407227, + "learning_rate": 9.806342105263159e-05, + "loss": 0.3936, + "step": 27372 + }, + { + "epoch": 1.5328144249076043, + "grad_norm": 1.5792362689971924, + "learning_rate": 9.806315789473685e-05, + "loss": 0.4651, + "step": 27373 + }, + { + "epoch": 1.5328704222197334, + "grad_norm": 1.2675620317459106, + "learning_rate": 9.806289473684211e-05, + "loss": 0.6451, + "step": 27374 + }, + { + "epoch": 1.5329264195318624, + "grad_norm": 1.1812338829040527, + "learning_rate": 9.806263157894737e-05, + "loss": 0.3571, + "step": 27375 + }, + { + "epoch": 1.5329824168439914, + "grad_norm": 1.276490330696106, + "learning_rate": 9.806236842105264e-05, + "loss": 0.4349, + "step": 27376 + }, + { + "epoch": 1.5330384141561204, + "grad_norm": 1.2429367303848267, + "learning_rate": 9.80621052631579e-05, + "loss": 0.4896, + "step": 27377 + }, + { + "epoch": 1.5330944114682494, + "grad_norm": 1.464216947555542, + "learning_rate": 9.806184210526316e-05, + "loss": 0.4263, + "step": 27378 + }, + { + "epoch": 1.5331504087803784, + "grad_norm": 1.555129051208496, + "learning_rate": 9.806157894736842e-05, + "loss": 0.5057, + "step": 27379 + }, + { + "epoch": 1.5332064060925075, + "grad_norm": 1.5318166017532349, + "learning_rate": 
9.806131578947368e-05, + "loss": 0.5248, + "step": 27380 + }, + { + "epoch": 1.5332624034046365, + "grad_norm": 1.4204380512237549, + "learning_rate": 9.806105263157895e-05, + "loss": 0.5319, + "step": 27381 + }, + { + "epoch": 1.5333184007167655, + "grad_norm": 1.737895131111145, + "learning_rate": 9.806078947368421e-05, + "loss": 0.543, + "step": 27382 + }, + { + "epoch": 1.5333743980288945, + "grad_norm": 1.4844969511032104, + "learning_rate": 9.806052631578949e-05, + "loss": 0.492, + "step": 27383 + }, + { + "epoch": 1.5334303953410235, + "grad_norm": 1.2135682106018066, + "learning_rate": 9.806026315789473e-05, + "loss": 0.4453, + "step": 27384 + }, + { + "epoch": 1.5334863926531526, + "grad_norm": 1.1250629425048828, + "learning_rate": 9.806e-05, + "loss": 0.3666, + "step": 27385 + }, + { + "epoch": 1.5335423899652816, + "grad_norm": 1.3407983779907227, + "learning_rate": 9.805973684210527e-05, + "loss": 0.4875, + "step": 27386 + }, + { + "epoch": 1.5335983872774106, + "grad_norm": 1.3464614152908325, + "learning_rate": 9.805947368421054e-05, + "loss": 0.3705, + "step": 27387 + }, + { + "epoch": 1.5336543845895396, + "grad_norm": 1.4388389587402344, + "learning_rate": 9.805921052631579e-05, + "loss": 0.3942, + "step": 27388 + }, + { + "epoch": 1.5337103819016686, + "grad_norm": 1.7731399536132812, + "learning_rate": 9.805894736842106e-05, + "loss": 0.4749, + "step": 27389 + }, + { + "epoch": 1.5337663792137977, + "grad_norm": 1.9700196981430054, + "learning_rate": 9.805868421052632e-05, + "loss": 0.3996, + "step": 27390 + }, + { + "epoch": 1.5338223765259267, + "grad_norm": 1.5039012432098389, + "learning_rate": 9.805842105263159e-05, + "loss": 0.5317, + "step": 27391 + }, + { + "epoch": 1.5338783738380557, + "grad_norm": 1.7295633554458618, + "learning_rate": 9.805815789473685e-05, + "loss": 0.6416, + "step": 27392 + }, + { + "epoch": 1.5339343711501847, + "grad_norm": 1.3499912023544312, + "learning_rate": 9.805789473684211e-05, + "loss": 0.5603, + "step": 
27393 + }, + { + "epoch": 1.5339903684623137, + "grad_norm": 1.2199500799179077, + "learning_rate": 9.805763157894737e-05, + "loss": 0.4216, + "step": 27394 + }, + { + "epoch": 1.5340463657744428, + "grad_norm": 1.0372475385665894, + "learning_rate": 9.805736842105263e-05, + "loss": 0.3845, + "step": 27395 + }, + { + "epoch": 1.5341023630865718, + "grad_norm": 1.6075316667556763, + "learning_rate": 9.80571052631579e-05, + "loss": 0.7346, + "step": 27396 + }, + { + "epoch": 1.5341583603987008, + "grad_norm": 1.2499035596847534, + "learning_rate": 9.805684210526316e-05, + "loss": 0.5194, + "step": 27397 + }, + { + "epoch": 1.5342143577108298, + "grad_norm": 1.4742541313171387, + "learning_rate": 9.805657894736842e-05, + "loss": 0.424, + "step": 27398 + }, + { + "epoch": 1.5342703550229588, + "grad_norm": 1.3422431945800781, + "learning_rate": 9.805631578947368e-05, + "loss": 0.4758, + "step": 27399 + }, + { + "epoch": 1.5343263523350879, + "grad_norm": 1.3489582538604736, + "learning_rate": 9.805605263157896e-05, + "loss": 0.4106, + "step": 27400 + }, + { + "epoch": 1.5343823496472169, + "grad_norm": 1.3509870767593384, + "learning_rate": 9.805578947368422e-05, + "loss": 0.3976, + "step": 27401 + }, + { + "epoch": 1.534438346959346, + "grad_norm": 2.1054441928863525, + "learning_rate": 9.805552631578948e-05, + "loss": 0.5922, + "step": 27402 + }, + { + "epoch": 1.534494344271475, + "grad_norm": 1.5807559490203857, + "learning_rate": 9.805526315789474e-05, + "loss": 0.4145, + "step": 27403 + }, + { + "epoch": 1.534550341583604, + "grad_norm": 1.226464867591858, + "learning_rate": 9.805500000000001e-05, + "loss": 0.3998, + "step": 27404 + }, + { + "epoch": 1.534606338895733, + "grad_norm": 1.1433545351028442, + "learning_rate": 9.805473684210527e-05, + "loss": 0.3917, + "step": 27405 + }, + { + "epoch": 1.534662336207862, + "grad_norm": 1.6557573080062866, + "learning_rate": 9.805447368421053e-05, + "loss": 0.4927, + "step": 27406 + }, + { + "epoch": 1.534718333519991, 
+ "grad_norm": 1.523334264755249, + "learning_rate": 9.805421052631579e-05, + "loss": 0.3263, + "step": 27407 + }, + { + "epoch": 1.53477433083212, + "grad_norm": 1.2454636096954346, + "learning_rate": 9.805394736842106e-05, + "loss": 0.4938, + "step": 27408 + }, + { + "epoch": 1.534830328144249, + "grad_norm": 1.1688791513442993, + "learning_rate": 9.805368421052632e-05, + "loss": 0.3803, + "step": 27409 + }, + { + "epoch": 1.534886325456378, + "grad_norm": 1.3370659351348877, + "learning_rate": 9.80534210526316e-05, + "loss": 0.3608, + "step": 27410 + }, + { + "epoch": 1.534942322768507, + "grad_norm": 2.0097856521606445, + "learning_rate": 9.805315789473684e-05, + "loss": 0.5683, + "step": 27411 + }, + { + "epoch": 1.534998320080636, + "grad_norm": 1.3716015815734863, + "learning_rate": 9.80528947368421e-05, + "loss": 0.3982, + "step": 27412 + }, + { + "epoch": 1.535054317392765, + "grad_norm": 1.6543197631835938, + "learning_rate": 9.805263157894737e-05, + "loss": 0.4842, + "step": 27413 + }, + { + "epoch": 1.5351103147048941, + "grad_norm": 1.7502961158752441, + "learning_rate": 9.805236842105263e-05, + "loss": 0.5106, + "step": 27414 + }, + { + "epoch": 1.5351663120170231, + "grad_norm": 1.0914019346237183, + "learning_rate": 9.80521052631579e-05, + "loss": 0.3747, + "step": 27415 + }, + { + "epoch": 1.5352223093291522, + "grad_norm": 1.7459182739257812, + "learning_rate": 9.805184210526315e-05, + "loss": 0.4082, + "step": 27416 + }, + { + "epoch": 1.5352783066412812, + "grad_norm": 1.4182765483856201, + "learning_rate": 9.805157894736843e-05, + "loss": 0.4289, + "step": 27417 + }, + { + "epoch": 1.5353343039534102, + "grad_norm": 1.6390981674194336, + "learning_rate": 9.805131578947369e-05, + "loss": 0.5726, + "step": 27418 + }, + { + "epoch": 1.5353903012655392, + "grad_norm": 1.5273953676223755, + "learning_rate": 9.805105263157896e-05, + "loss": 0.4502, + "step": 27419 + }, + { + "epoch": 1.5354462985776682, + "grad_norm": 1.197325587272644, + 
"learning_rate": 9.805078947368422e-05, + "loss": 0.4224, + "step": 27420 + }, + { + "epoch": 1.5355022958897973, + "grad_norm": 1.758510708808899, + "learning_rate": 9.805052631578948e-05, + "loss": 0.5773, + "step": 27421 + }, + { + "epoch": 1.5355582932019263, + "grad_norm": 1.3416218757629395, + "learning_rate": 9.805026315789474e-05, + "loss": 0.4875, + "step": 27422 + }, + { + "epoch": 1.5356142905140553, + "grad_norm": 1.3372999429702759, + "learning_rate": 9.805000000000001e-05, + "loss": 0.3808, + "step": 27423 + }, + { + "epoch": 1.5356702878261843, + "grad_norm": 1.239515781402588, + "learning_rate": 9.804973684210527e-05, + "loss": 0.6036, + "step": 27424 + }, + { + "epoch": 1.5357262851383133, + "grad_norm": 1.2493059635162354, + "learning_rate": 9.804947368421053e-05, + "loss": 0.337, + "step": 27425 + }, + { + "epoch": 1.5357822824504423, + "grad_norm": 1.2406386137008667, + "learning_rate": 9.804921052631579e-05, + "loss": 0.417, + "step": 27426 + }, + { + "epoch": 1.5358382797625714, + "grad_norm": 1.2913198471069336, + "learning_rate": 9.804894736842106e-05, + "loss": 0.4527, + "step": 27427 + }, + { + "epoch": 1.5358942770747004, + "grad_norm": 1.506883978843689, + "learning_rate": 9.804868421052632e-05, + "loss": 0.4886, + "step": 27428 + }, + { + "epoch": 1.5359502743868294, + "grad_norm": 1.262926697731018, + "learning_rate": 9.804842105263158e-05, + "loss": 0.5099, + "step": 27429 + }, + { + "epoch": 1.5360062716989584, + "grad_norm": 1.320450782775879, + "learning_rate": 9.804815789473684e-05, + "loss": 0.6103, + "step": 27430 + }, + { + "epoch": 1.5360622690110874, + "grad_norm": 1.2099217176437378, + "learning_rate": 9.80478947368421e-05, + "loss": 0.3378, + "step": 27431 + }, + { + "epoch": 1.5361182663232165, + "grad_norm": 1.5453951358795166, + "learning_rate": 9.804763157894738e-05, + "loss": 0.4624, + "step": 27432 + }, + { + "epoch": 1.5361742636353455, + "grad_norm": 1.3994524478912354, + "learning_rate": 9.804736842105264e-05, + 
"loss": 0.6331, + "step": 27433 + }, + { + "epoch": 1.5362302609474745, + "grad_norm": 1.4725806713104248, + "learning_rate": 9.80471052631579e-05, + "loss": 0.52, + "step": 27434 + }, + { + "epoch": 1.5362862582596035, + "grad_norm": 1.5783519744873047, + "learning_rate": 9.804684210526316e-05, + "loss": 0.5181, + "step": 27435 + }, + { + "epoch": 1.5363422555717325, + "grad_norm": 1.4294320344924927, + "learning_rate": 9.804657894736843e-05, + "loss": 0.6509, + "step": 27436 + }, + { + "epoch": 1.5363982528838616, + "grad_norm": 1.1189452409744263, + "learning_rate": 9.804631578947369e-05, + "loss": 0.367, + "step": 27437 + }, + { + "epoch": 1.5364542501959906, + "grad_norm": 1.5972261428833008, + "learning_rate": 9.804605263157896e-05, + "loss": 0.5024, + "step": 27438 + }, + { + "epoch": 1.5365102475081196, + "grad_norm": 1.3196470737457275, + "learning_rate": 9.804578947368421e-05, + "loss": 0.4849, + "step": 27439 + }, + { + "epoch": 1.5365662448202486, + "grad_norm": 1.0912482738494873, + "learning_rate": 9.804552631578948e-05, + "loss": 0.3987, + "step": 27440 + }, + { + "epoch": 1.5366222421323776, + "grad_norm": 2.2294299602508545, + "learning_rate": 9.804526315789474e-05, + "loss": 0.4531, + "step": 27441 + }, + { + "epoch": 1.5366782394445067, + "grad_norm": 1.0957279205322266, + "learning_rate": 9.804500000000001e-05, + "loss": 0.4376, + "step": 27442 + }, + { + "epoch": 1.5367342367566357, + "grad_norm": 1.2432575225830078, + "learning_rate": 9.804473684210526e-05, + "loss": 0.4539, + "step": 27443 + }, + { + "epoch": 1.5367902340687647, + "grad_norm": 2.164496898651123, + "learning_rate": 9.804447368421053e-05, + "loss": 0.31, + "step": 27444 + }, + { + "epoch": 1.5368462313808937, + "grad_norm": 1.7686548233032227, + "learning_rate": 9.80442105263158e-05, + "loss": 0.5001, + "step": 27445 + }, + { + "epoch": 1.5369022286930227, + "grad_norm": 1.1818535327911377, + "learning_rate": 9.804394736842105e-05, + "loss": 0.4528, + "step": 27446 + }, + { + 
"epoch": 1.5369582260051518, + "grad_norm": 1.6264586448669434, + "learning_rate": 9.804368421052633e-05, + "loss": 0.4355, + "step": 27447 + }, + { + "epoch": 1.5370142233172808, + "grad_norm": 1.404054045677185, + "learning_rate": 9.804342105263157e-05, + "loss": 0.6259, + "step": 27448 + }, + { + "epoch": 1.5370702206294098, + "grad_norm": 1.3645541667938232, + "learning_rate": 9.804315789473685e-05, + "loss": 0.6429, + "step": 27449 + }, + { + "epoch": 1.5371262179415388, + "grad_norm": 1.200858235359192, + "learning_rate": 9.80428947368421e-05, + "loss": 0.3692, + "step": 27450 + }, + { + "epoch": 1.5371822152536678, + "grad_norm": 1.3735047578811646, + "learning_rate": 9.804263157894738e-05, + "loss": 0.5971, + "step": 27451 + }, + { + "epoch": 1.5372382125657968, + "grad_norm": 1.3319628238677979, + "learning_rate": 9.804236842105264e-05, + "loss": 0.4273, + "step": 27452 + }, + { + "epoch": 1.5372942098779259, + "grad_norm": 1.5300577878952026, + "learning_rate": 9.80421052631579e-05, + "loss": 0.4289, + "step": 27453 + }, + { + "epoch": 1.5373502071900549, + "grad_norm": 1.256272315979004, + "learning_rate": 9.804184210526316e-05, + "loss": 0.4217, + "step": 27454 + }, + { + "epoch": 1.537406204502184, + "grad_norm": 1.1546261310577393, + "learning_rate": 9.804157894736843e-05, + "loss": 0.5448, + "step": 27455 + }, + { + "epoch": 1.537462201814313, + "grad_norm": 1.232764482498169, + "learning_rate": 9.804131578947369e-05, + "loss": 0.4131, + "step": 27456 + }, + { + "epoch": 1.537518199126442, + "grad_norm": 1.5305607318878174, + "learning_rate": 9.804105263157895e-05, + "loss": 0.4478, + "step": 27457 + }, + { + "epoch": 1.537574196438571, + "grad_norm": 1.6348007917404175, + "learning_rate": 9.804078947368421e-05, + "loss": 0.6888, + "step": 27458 + }, + { + "epoch": 1.5376301937507, + "grad_norm": 1.250065565109253, + "learning_rate": 9.804052631578948e-05, + "loss": 0.4321, + "step": 27459 + }, + { + "epoch": 1.537686191062829, + "grad_norm": 
1.3260490894317627, + "learning_rate": 9.804026315789474e-05, + "loss": 0.463, + "step": 27460 + }, + { + "epoch": 1.537742188374958, + "grad_norm": 1.4980632066726685, + "learning_rate": 9.804e-05, + "loss": 0.6361, + "step": 27461 + }, + { + "epoch": 1.537798185687087, + "grad_norm": 1.5763019323349, + "learning_rate": 9.803973684210526e-05, + "loss": 0.8689, + "step": 27462 + }, + { + "epoch": 1.537854182999216, + "grad_norm": 1.4502410888671875, + "learning_rate": 9.803947368421052e-05, + "loss": 0.3842, + "step": 27463 + }, + { + "epoch": 1.537910180311345, + "grad_norm": 1.3471466302871704, + "learning_rate": 9.80392105263158e-05, + "loss": 0.4361, + "step": 27464 + }, + { + "epoch": 1.537966177623474, + "grad_norm": 1.247658610343933, + "learning_rate": 9.803894736842106e-05, + "loss": 0.531, + "step": 27465 + }, + { + "epoch": 1.5380221749356031, + "grad_norm": 1.1767892837524414, + "learning_rate": 9.803868421052632e-05, + "loss": 0.3491, + "step": 27466 + }, + { + "epoch": 1.5380781722477321, + "grad_norm": 1.3079240322113037, + "learning_rate": 9.803842105263158e-05, + "loss": 0.5244, + "step": 27467 + }, + { + "epoch": 1.5381341695598612, + "grad_norm": 1.3583015203475952, + "learning_rate": 9.803815789473685e-05, + "loss": 0.4296, + "step": 27468 + }, + { + "epoch": 1.5381901668719902, + "grad_norm": 1.3113908767700195, + "learning_rate": 9.803789473684211e-05, + "loss": 0.4516, + "step": 27469 + }, + { + "epoch": 1.5382461641841192, + "grad_norm": 1.5479438304901123, + "learning_rate": 9.803763157894738e-05, + "loss": 0.5803, + "step": 27470 + }, + { + "epoch": 1.5383021614962482, + "grad_norm": 1.4623644351959229, + "learning_rate": 9.803736842105263e-05, + "loss": 0.4005, + "step": 27471 + }, + { + "epoch": 1.5383581588083772, + "grad_norm": 1.529299259185791, + "learning_rate": 9.80371052631579e-05, + "loss": 0.4526, + "step": 27472 + }, + { + "epoch": 1.5384141561205062, + "grad_norm": 1.45881986618042, + "learning_rate": 9.803684210526316e-05, + 
"loss": 0.6338, + "step": 27473 + }, + { + "epoch": 1.5384701534326353, + "grad_norm": 1.1933166980743408, + "learning_rate": 9.803657894736843e-05, + "loss": 0.4632, + "step": 27474 + }, + { + "epoch": 1.5385261507447643, + "grad_norm": 1.365078330039978, + "learning_rate": 9.80363157894737e-05, + "loss": 0.4295, + "step": 27475 + }, + { + "epoch": 1.5385821480568933, + "grad_norm": 1.6030665636062622, + "learning_rate": 9.803605263157895e-05, + "loss": 0.5091, + "step": 27476 + }, + { + "epoch": 1.5386381453690223, + "grad_norm": 1.1716169118881226, + "learning_rate": 9.803578947368421e-05, + "loss": 0.4437, + "step": 27477 + }, + { + "epoch": 1.5386941426811513, + "grad_norm": 1.7651629447937012, + "learning_rate": 9.803552631578949e-05, + "loss": 0.5739, + "step": 27478 + }, + { + "epoch": 1.5387501399932804, + "grad_norm": 1.3289557695388794, + "learning_rate": 9.803526315789475e-05, + "loss": 0.4221, + "step": 27479 + }, + { + "epoch": 1.5388061373054094, + "grad_norm": 1.4761066436767578, + "learning_rate": 9.8035e-05, + "loss": 0.4091, + "step": 27480 + }, + { + "epoch": 1.5388621346175384, + "grad_norm": 1.7415555715560913, + "learning_rate": 9.803473684210527e-05, + "loss": 0.465, + "step": 27481 + }, + { + "epoch": 1.5389181319296674, + "grad_norm": 1.260891318321228, + "learning_rate": 9.803447368421053e-05, + "loss": 0.4759, + "step": 27482 + }, + { + "epoch": 1.5389741292417964, + "grad_norm": 1.1866629123687744, + "learning_rate": 9.80342105263158e-05, + "loss": 0.4661, + "step": 27483 + }, + { + "epoch": 1.5390301265539255, + "grad_norm": 1.501732587814331, + "learning_rate": 9.803394736842106e-05, + "loss": 0.5099, + "step": 27484 + }, + { + "epoch": 1.5390861238660545, + "grad_norm": 1.0346893072128296, + "learning_rate": 9.803368421052632e-05, + "loss": 0.3779, + "step": 27485 + }, + { + "epoch": 1.5391421211781835, + "grad_norm": 1.6514413356781006, + "learning_rate": 9.803342105263158e-05, + "loss": 0.4359, + "step": 27486 + }, + { + "epoch": 
1.5391981184903125, + "grad_norm": 1.1998839378356934, + "learning_rate": 9.803315789473685e-05, + "loss": 0.4138, + "step": 27487 + }, + { + "epoch": 1.5392541158024415, + "grad_norm": 1.340802550315857, + "learning_rate": 9.803289473684211e-05, + "loss": 0.5049, + "step": 27488 + }, + { + "epoch": 1.5393101131145706, + "grad_norm": 1.6594573259353638, + "learning_rate": 9.803263157894737e-05, + "loss": 0.5302, + "step": 27489 + }, + { + "epoch": 1.5393661104266996, + "grad_norm": 1.2872958183288574, + "learning_rate": 9.803236842105263e-05, + "loss": 0.5179, + "step": 27490 + }, + { + "epoch": 1.5394221077388286, + "grad_norm": 1.357077717781067, + "learning_rate": 9.80321052631579e-05, + "loss": 0.4203, + "step": 27491 + }, + { + "epoch": 1.5394781050509576, + "grad_norm": 1.2538336515426636, + "learning_rate": 9.803184210526316e-05, + "loss": 0.4895, + "step": 27492 + }, + { + "epoch": 1.5395341023630866, + "grad_norm": 1.4729819297790527, + "learning_rate": 9.803157894736844e-05, + "loss": 0.5603, + "step": 27493 + }, + { + "epoch": 1.5395900996752157, + "grad_norm": 1.2647981643676758, + "learning_rate": 9.803131578947368e-05, + "loss": 0.4322, + "step": 27494 + }, + { + "epoch": 1.5396460969873447, + "grad_norm": 1.2713780403137207, + "learning_rate": 9.803105263157896e-05, + "loss": 0.4704, + "step": 27495 + }, + { + "epoch": 1.5397020942994737, + "grad_norm": 1.1072604656219482, + "learning_rate": 9.803078947368422e-05, + "loss": 0.4073, + "step": 27496 + }, + { + "epoch": 1.5397580916116027, + "grad_norm": 1.9605051279067993, + "learning_rate": 9.803052631578949e-05, + "loss": 0.4553, + "step": 27497 + }, + { + "epoch": 1.5398140889237317, + "grad_norm": 1.4391065835952759, + "learning_rate": 9.803026315789474e-05, + "loss": 0.5179, + "step": 27498 + }, + { + "epoch": 1.5398700862358607, + "grad_norm": 1.2579585313796997, + "learning_rate": 9.803e-05, + "loss": 0.4785, + "step": 27499 + }, + { + "epoch": 1.5399260835479898, + "grad_norm": 
1.3812254667282104, + "learning_rate": 9.802973684210527e-05, + "loss": 0.4546, + "step": 27500 + }, + { + "epoch": 1.5399820808601188, + "grad_norm": 1.4660743474960327, + "learning_rate": 9.802947368421053e-05, + "loss": 0.4336, + "step": 27501 + }, + { + "epoch": 1.5400380781722478, + "grad_norm": 1.2951329946517944, + "learning_rate": 9.80292105263158e-05, + "loss": 0.3834, + "step": 27502 + }, + { + "epoch": 1.5400940754843768, + "grad_norm": 1.3852564096450806, + "learning_rate": 9.802894736842105e-05, + "loss": 0.6174, + "step": 27503 + }, + { + "epoch": 1.5401500727965058, + "grad_norm": 1.3390361070632935, + "learning_rate": 9.802868421052632e-05, + "loss": 0.573, + "step": 27504 + }, + { + "epoch": 1.5402060701086349, + "grad_norm": 1.2705798149108887, + "learning_rate": 9.802842105263158e-05, + "loss": 0.3637, + "step": 27505 + }, + { + "epoch": 1.5402620674207639, + "grad_norm": 1.1765118837356567, + "learning_rate": 9.802815789473685e-05, + "loss": 0.3652, + "step": 27506 + }, + { + "epoch": 1.540318064732893, + "grad_norm": 1.1236757040023804, + "learning_rate": 9.802789473684211e-05, + "loss": 0.4043, + "step": 27507 + }, + { + "epoch": 1.540374062045022, + "grad_norm": 1.32753586769104, + "learning_rate": 9.802763157894737e-05, + "loss": 0.4631, + "step": 27508 + }, + { + "epoch": 1.540430059357151, + "grad_norm": 1.560129165649414, + "learning_rate": 9.802736842105263e-05, + "loss": 0.4467, + "step": 27509 + }, + { + "epoch": 1.54048605666928, + "grad_norm": 1.3294612169265747, + "learning_rate": 9.80271052631579e-05, + "loss": 0.4659, + "step": 27510 + }, + { + "epoch": 1.540542053981409, + "grad_norm": 1.3083581924438477, + "learning_rate": 9.802684210526317e-05, + "loss": 0.489, + "step": 27511 + }, + { + "epoch": 1.540598051293538, + "grad_norm": 1.2318902015686035, + "learning_rate": 9.802657894736843e-05, + "loss": 0.4461, + "step": 27512 + }, + { + "epoch": 1.540654048605667, + "grad_norm": 1.1843339204788208, + "learning_rate": 
9.802631578947369e-05, + "loss": 0.4687, + "step": 27513 + }, + { + "epoch": 1.540710045917796, + "grad_norm": 1.335110068321228, + "learning_rate": 9.802605263157896e-05, + "loss": 0.3646, + "step": 27514 + }, + { + "epoch": 1.540766043229925, + "grad_norm": 1.146899700164795, + "learning_rate": 9.802578947368422e-05, + "loss": 0.4172, + "step": 27515 + }, + { + "epoch": 1.540822040542054, + "grad_norm": 1.4995315074920654, + "learning_rate": 9.802552631578948e-05, + "loss": 0.4947, + "step": 27516 + }, + { + "epoch": 1.540878037854183, + "grad_norm": 1.4839874505996704, + "learning_rate": 9.802526315789474e-05, + "loss": 0.4552, + "step": 27517 + }, + { + "epoch": 1.540934035166312, + "grad_norm": 1.3880366086959839, + "learning_rate": 9.8025e-05, + "loss": 0.5506, + "step": 27518 + }, + { + "epoch": 1.5409900324784411, + "grad_norm": 1.256199598312378, + "learning_rate": 9.802473684210527e-05, + "loss": 0.5615, + "step": 27519 + }, + { + "epoch": 1.5410460297905701, + "grad_norm": 2.4414100646972656, + "learning_rate": 9.802447368421053e-05, + "loss": 0.4661, + "step": 27520 + }, + { + "epoch": 1.5411020271026992, + "grad_norm": 1.535971999168396, + "learning_rate": 9.802421052631579e-05, + "loss": 0.5246, + "step": 27521 + }, + { + "epoch": 1.5411580244148282, + "grad_norm": 1.4283818006515503, + "learning_rate": 9.802394736842105e-05, + "loss": 0.4692, + "step": 27522 + }, + { + "epoch": 1.5412140217269572, + "grad_norm": 1.3877708911895752, + "learning_rate": 9.802368421052632e-05, + "loss": 0.5436, + "step": 27523 + }, + { + "epoch": 1.5412700190390862, + "grad_norm": 1.5361599922180176, + "learning_rate": 9.802342105263158e-05, + "loss": 0.5783, + "step": 27524 + }, + { + "epoch": 1.5413260163512152, + "grad_norm": 1.22494637966156, + "learning_rate": 9.802315789473686e-05, + "loss": 0.4429, + "step": 27525 + }, + { + "epoch": 1.5413820136633443, + "grad_norm": 1.433502435684204, + "learning_rate": 9.80228947368421e-05, + "loss": 0.5215, + "step": 27526 + 
}, + { + "epoch": 1.5414380109754733, + "grad_norm": 1.7901016473770142, + "learning_rate": 9.802263157894738e-05, + "loss": 0.4571, + "step": 27527 + }, + { + "epoch": 1.5414940082876023, + "grad_norm": 1.2573442459106445, + "learning_rate": 9.802236842105264e-05, + "loss": 0.4885, + "step": 27528 + }, + { + "epoch": 1.5415500055997313, + "grad_norm": 1.3599714040756226, + "learning_rate": 9.802210526315791e-05, + "loss": 0.32, + "step": 27529 + }, + { + "epoch": 1.5416060029118603, + "grad_norm": 1.5652059316635132, + "learning_rate": 9.802184210526317e-05, + "loss": 0.4739, + "step": 27530 + }, + { + "epoch": 1.5416620002239894, + "grad_norm": 1.3834737539291382, + "learning_rate": 9.802157894736843e-05, + "loss": 0.4994, + "step": 27531 + }, + { + "epoch": 1.5417179975361184, + "grad_norm": 1.1089963912963867, + "learning_rate": 9.802131578947369e-05, + "loss": 0.4456, + "step": 27532 + }, + { + "epoch": 1.5417739948482474, + "grad_norm": 1.2896775007247925, + "learning_rate": 9.802105263157895e-05, + "loss": 0.4007, + "step": 27533 + }, + { + "epoch": 1.5418299921603764, + "grad_norm": 1.4454684257507324, + "learning_rate": 9.802078947368422e-05, + "loss": 0.4368, + "step": 27534 + }, + { + "epoch": 1.5418859894725054, + "grad_norm": 1.4198540449142456, + "learning_rate": 9.802052631578947e-05, + "loss": 0.4667, + "step": 27535 + }, + { + "epoch": 1.5419419867846345, + "grad_norm": 1.478713035583496, + "learning_rate": 9.802026315789474e-05, + "loss": 0.6273, + "step": 27536 + }, + { + "epoch": 1.5419979840967635, + "grad_norm": 1.2896050214767456, + "learning_rate": 9.802e-05, + "loss": 0.4453, + "step": 27537 + }, + { + "epoch": 1.5420539814088925, + "grad_norm": 1.3719547986984253, + "learning_rate": 9.801973684210527e-05, + "loss": 0.5107, + "step": 27538 + }, + { + "epoch": 1.5421099787210215, + "grad_norm": 1.1835438013076782, + "learning_rate": 9.801947368421053e-05, + "loss": 0.4458, + "step": 27539 + }, + { + "epoch": 1.5421659760331505, + 
"grad_norm": 1.4666827917099, + "learning_rate": 9.801921052631579e-05, + "loss": 0.5352, + "step": 27540 + }, + { + "epoch": 1.5422219733452796, + "grad_norm": 1.3444392681121826, + "learning_rate": 9.801894736842105e-05, + "loss": 0.4657, + "step": 27541 + }, + { + "epoch": 1.5422779706574086, + "grad_norm": 1.0673942565917969, + "learning_rate": 9.801868421052633e-05, + "loss": 0.4011, + "step": 27542 + }, + { + "epoch": 1.5423339679695376, + "grad_norm": 1.4041234254837036, + "learning_rate": 9.801842105263159e-05, + "loss": 0.4219, + "step": 27543 + }, + { + "epoch": 1.5423899652816666, + "grad_norm": 1.3166135549545288, + "learning_rate": 9.801815789473685e-05, + "loss": 0.3867, + "step": 27544 + }, + { + "epoch": 1.5424459625937956, + "grad_norm": 1.516274094581604, + "learning_rate": 9.80178947368421e-05, + "loss": 0.4241, + "step": 27545 + }, + { + "epoch": 1.5425019599059246, + "grad_norm": 1.1992155313491821, + "learning_rate": 9.801763157894738e-05, + "loss": 0.3902, + "step": 27546 + }, + { + "epoch": 1.5425579572180537, + "grad_norm": 1.3640427589416504, + "learning_rate": 9.801736842105264e-05, + "loss": 0.5306, + "step": 27547 + }, + { + "epoch": 1.5426139545301827, + "grad_norm": 1.326397180557251, + "learning_rate": 9.801710526315791e-05, + "loss": 0.4747, + "step": 27548 + }, + { + "epoch": 1.5426699518423117, + "grad_norm": 1.6014254093170166, + "learning_rate": 9.801684210526316e-05, + "loss": 0.4637, + "step": 27549 + }, + { + "epoch": 1.5427259491544407, + "grad_norm": 1.4529451131820679, + "learning_rate": 9.801657894736842e-05, + "loss": 0.4927, + "step": 27550 + }, + { + "epoch": 1.5427819464665697, + "grad_norm": 1.4845919609069824, + "learning_rate": 9.801631578947369e-05, + "loss": 0.4053, + "step": 27551 + }, + { + "epoch": 1.5428379437786988, + "grad_norm": 1.4558218717575073, + "learning_rate": 9.801605263157895e-05, + "loss": 0.572, + "step": 27552 + }, + { + "epoch": 1.5428939410908278, + "grad_norm": 1.2350249290466309, + 
"learning_rate": 9.801578947368421e-05, + "loss": 0.324, + "step": 27553 + }, + { + "epoch": 1.5429499384029568, + "grad_norm": 1.2948768138885498, + "learning_rate": 9.801552631578947e-05, + "loss": 0.6895, + "step": 27554 + }, + { + "epoch": 1.5430059357150858, + "grad_norm": 1.2931283712387085, + "learning_rate": 9.801526315789474e-05, + "loss": 0.4341, + "step": 27555 + }, + { + "epoch": 1.5430619330272148, + "grad_norm": 1.2458338737487793, + "learning_rate": 9.8015e-05, + "loss": 0.4292, + "step": 27556 + }, + { + "epoch": 1.5431179303393439, + "grad_norm": 1.4769642353057861, + "learning_rate": 9.801473684210528e-05, + "loss": 0.7012, + "step": 27557 + }, + { + "epoch": 1.5431739276514729, + "grad_norm": 1.3621057271957397, + "learning_rate": 9.801447368421052e-05, + "loss": 0.5053, + "step": 27558 + }, + { + "epoch": 1.543229924963602, + "grad_norm": 1.2167919874191284, + "learning_rate": 9.80142105263158e-05, + "loss": 0.3819, + "step": 27559 + }, + { + "epoch": 1.543285922275731, + "grad_norm": 1.462602972984314, + "learning_rate": 9.801394736842106e-05, + "loss": 0.474, + "step": 27560 + }, + { + "epoch": 1.54334191958786, + "grad_norm": 1.011000633239746, + "learning_rate": 9.801368421052633e-05, + "loss": 0.3042, + "step": 27561 + }, + { + "epoch": 1.543397916899989, + "grad_norm": 1.2537404298782349, + "learning_rate": 9.801342105263159e-05, + "loss": 0.4026, + "step": 27562 + }, + { + "epoch": 1.543453914212118, + "grad_norm": 1.2451791763305664, + "learning_rate": 9.801315789473685e-05, + "loss": 0.4505, + "step": 27563 + }, + { + "epoch": 1.543509911524247, + "grad_norm": 1.3682466745376587, + "learning_rate": 9.801289473684211e-05, + "loss": 0.4357, + "step": 27564 + }, + { + "epoch": 1.543565908836376, + "grad_norm": 1.4733965396881104, + "learning_rate": 9.801263157894738e-05, + "loss": 0.4874, + "step": 27565 + }, + { + "epoch": 1.543621906148505, + "grad_norm": 1.4038304090499878, + "learning_rate": 9.801236842105264e-05, + "loss": 0.4441, + 
"step": 27566 + }, + { + "epoch": 1.543677903460634, + "grad_norm": 3.1235392093658447, + "learning_rate": 9.80121052631579e-05, + "loss": 0.4708, + "step": 27567 + }, + { + "epoch": 1.543733900772763, + "grad_norm": 1.4425427913665771, + "learning_rate": 9.801184210526316e-05, + "loss": 0.518, + "step": 27568 + }, + { + "epoch": 1.543789898084892, + "grad_norm": 1.268245816230774, + "learning_rate": 9.801157894736842e-05, + "loss": 0.6169, + "step": 27569 + }, + { + "epoch": 1.543845895397021, + "grad_norm": 1.3493518829345703, + "learning_rate": 9.801131578947369e-05, + "loss": 0.4883, + "step": 27570 + }, + { + "epoch": 1.5439018927091501, + "grad_norm": 1.2586694955825806, + "learning_rate": 9.801105263157895e-05, + "loss": 0.5377, + "step": 27571 + }, + { + "epoch": 1.5439578900212791, + "grad_norm": 1.302017092704773, + "learning_rate": 9.801078947368421e-05, + "loss": 0.463, + "step": 27572 + }, + { + "epoch": 1.544013887333408, + "grad_norm": 34.4649658203125, + "learning_rate": 9.801052631578947e-05, + "loss": 0.4456, + "step": 27573 + }, + { + "epoch": 1.544069884645537, + "grad_norm": 1.345123291015625, + "learning_rate": 9.801026315789475e-05, + "loss": 0.4977, + "step": 27574 + }, + { + "epoch": 1.544125881957666, + "grad_norm": 1.382767677307129, + "learning_rate": 9.801e-05, + "loss": 0.4965, + "step": 27575 + }, + { + "epoch": 1.544181879269795, + "grad_norm": 1.341180443763733, + "learning_rate": 9.800973684210526e-05, + "loss": 0.4113, + "step": 27576 + }, + { + "epoch": 1.544237876581924, + "grad_norm": 1.2358497381210327, + "learning_rate": 9.800947368421052e-05, + "loss": 0.5991, + "step": 27577 + }, + { + "epoch": 1.544293873894053, + "grad_norm": 1.458097219467163, + "learning_rate": 9.80092105263158e-05, + "loss": 0.5126, + "step": 27578 + }, + { + "epoch": 1.544349871206182, + "grad_norm": 1.3169153928756714, + "learning_rate": 9.800894736842106e-05, + "loss": 0.4325, + "step": 27579 + }, + { + "epoch": 1.544405868518311, + "grad_norm": 
1.228348970413208, + "learning_rate": 9.800868421052633e-05, + "loss": 0.3367, + "step": 27580 + }, + { + "epoch": 1.54446186583044, + "grad_norm": 1.3079302310943604, + "learning_rate": 9.800842105263158e-05, + "loss": 0.4229, + "step": 27581 + }, + { + "epoch": 1.5445178631425691, + "grad_norm": 1.2360742092132568, + "learning_rate": 9.800815789473685e-05, + "loss": 0.4752, + "step": 27582 + }, + { + "epoch": 1.5445738604546981, + "grad_norm": 1.5535396337509155, + "learning_rate": 9.800789473684211e-05, + "loss": 0.4473, + "step": 27583 + }, + { + "epoch": 1.5446298577668272, + "grad_norm": 1.1982449293136597, + "learning_rate": 9.800763157894737e-05, + "loss": 0.416, + "step": 27584 + }, + { + "epoch": 1.5446858550789562, + "grad_norm": 1.2752082347869873, + "learning_rate": 9.800736842105264e-05, + "loss": 0.6395, + "step": 27585 + }, + { + "epoch": 1.5447418523910852, + "grad_norm": 1.3502033948898315, + "learning_rate": 9.800710526315789e-05, + "loss": 0.4872, + "step": 27586 + }, + { + "epoch": 1.5447978497032142, + "grad_norm": 1.4935429096221924, + "learning_rate": 9.800684210526316e-05, + "loss": 0.441, + "step": 27587 + }, + { + "epoch": 1.5448538470153432, + "grad_norm": 1.4000657796859741, + "learning_rate": 9.800657894736842e-05, + "loss": 0.5606, + "step": 27588 + }, + { + "epoch": 1.5449098443274722, + "grad_norm": 1.6481719017028809, + "learning_rate": 9.80063157894737e-05, + "loss": 0.4268, + "step": 27589 + }, + { + "epoch": 1.5449658416396013, + "grad_norm": 1.4190216064453125, + "learning_rate": 9.800605263157894e-05, + "loss": 0.4317, + "step": 27590 + }, + { + "epoch": 1.5450218389517303, + "grad_norm": 1.8343473672866821, + "learning_rate": 9.800578947368422e-05, + "loss": 0.4894, + "step": 27591 + }, + { + "epoch": 1.5450778362638593, + "grad_norm": 1.4488444328308105, + "learning_rate": 9.800552631578947e-05, + "loss": 0.4326, + "step": 27592 + }, + { + "epoch": 1.5451338335759883, + "grad_norm": 1.3605916500091553, + "learning_rate": 
9.800526315789475e-05, + "loss": 0.4197, + "step": 27593 + }, + { + "epoch": 1.5451898308881173, + "grad_norm": 1.5493638515472412, + "learning_rate": 9.800500000000001e-05, + "loss": 0.4657, + "step": 27594 + }, + { + "epoch": 1.5452458282002464, + "grad_norm": 2.3098866939544678, + "learning_rate": 9.800473684210527e-05, + "loss": 0.437, + "step": 27595 + }, + { + "epoch": 1.5453018255123754, + "grad_norm": 1.4588706493377686, + "learning_rate": 9.800447368421053e-05, + "loss": 0.4434, + "step": 27596 + }, + { + "epoch": 1.5453578228245044, + "grad_norm": 1.5190616846084595, + "learning_rate": 9.80042105263158e-05, + "loss": 0.3889, + "step": 27597 + }, + { + "epoch": 1.5454138201366334, + "grad_norm": 1.4056833982467651, + "learning_rate": 9.800394736842106e-05, + "loss": 0.4166, + "step": 27598 + }, + { + "epoch": 1.5454698174487624, + "grad_norm": 1.3816081285476685, + "learning_rate": 9.800368421052632e-05, + "loss": 0.4344, + "step": 27599 + }, + { + "epoch": 1.5455258147608915, + "grad_norm": 1.176428198814392, + "learning_rate": 9.800342105263158e-05, + "loss": 0.4996, + "step": 27600 + }, + { + "epoch": 1.5455818120730205, + "grad_norm": 1.3347351551055908, + "learning_rate": 9.800315789473685e-05, + "loss": 0.641, + "step": 27601 + }, + { + "epoch": 1.5456378093851495, + "grad_norm": 1.6254616975784302, + "learning_rate": 9.800289473684211e-05, + "loss": 0.567, + "step": 27602 + }, + { + "epoch": 1.5456938066972785, + "grad_norm": 1.2517011165618896, + "learning_rate": 9.800263157894737e-05, + "loss": 0.3868, + "step": 27603 + }, + { + "epoch": 1.5457498040094075, + "grad_norm": 1.9341506958007812, + "learning_rate": 9.800236842105263e-05, + "loss": 0.479, + "step": 27604 + }, + { + "epoch": 1.5458058013215366, + "grad_norm": 1.3913823366165161, + "learning_rate": 9.800210526315789e-05, + "loss": 0.4766, + "step": 27605 + }, + { + "epoch": 1.5458617986336656, + "grad_norm": 1.4235960245132446, + "learning_rate": 9.800184210526317e-05, + "loss": 0.5779, + 
"step": 27606 + }, + { + "epoch": 1.5459177959457946, + "grad_norm": 1.5019274950027466, + "learning_rate": 9.800157894736842e-05, + "loss": 0.3857, + "step": 27607 + }, + { + "epoch": 1.5459737932579236, + "grad_norm": 1.4286653995513916, + "learning_rate": 9.800131578947368e-05, + "loss": 0.4655, + "step": 27608 + }, + { + "epoch": 1.5460297905700526, + "grad_norm": 1.2873685359954834, + "learning_rate": 9.800105263157894e-05, + "loss": 0.4759, + "step": 27609 + }, + { + "epoch": 1.5460857878821817, + "grad_norm": 1.1733746528625488, + "learning_rate": 9.800078947368422e-05, + "loss": 0.4239, + "step": 27610 + }, + { + "epoch": 1.5461417851943107, + "grad_norm": 1.183221697807312, + "learning_rate": 9.800052631578948e-05, + "loss": 0.3502, + "step": 27611 + }, + { + "epoch": 1.5461977825064397, + "grad_norm": 1.1111266613006592, + "learning_rate": 9.800026315789475e-05, + "loss": 0.4377, + "step": 27612 + }, + { + "epoch": 1.5462537798185687, + "grad_norm": 1.2624726295471191, + "learning_rate": 9.8e-05, + "loss": 0.5593, + "step": 27613 + }, + { + "epoch": 1.5463097771306977, + "grad_norm": 1.170574426651001, + "learning_rate": 9.799973684210527e-05, + "loss": 0.5029, + "step": 27614 + }, + { + "epoch": 1.5463657744428267, + "grad_norm": 1.469617247581482, + "learning_rate": 9.799947368421053e-05, + "loss": 0.4209, + "step": 27615 + }, + { + "epoch": 1.5464217717549558, + "grad_norm": 1.3512747287750244, + "learning_rate": 9.79992105263158e-05, + "loss": 0.4437, + "step": 27616 + }, + { + "epoch": 1.5464777690670848, + "grad_norm": 1.5204124450683594, + "learning_rate": 9.799894736842106e-05, + "loss": 0.5534, + "step": 27617 + }, + { + "epoch": 1.5465337663792138, + "grad_norm": 1.490783929824829, + "learning_rate": 9.799868421052632e-05, + "loss": 0.5079, + "step": 27618 + }, + { + "epoch": 1.5465897636913428, + "grad_norm": 1.3356685638427734, + "learning_rate": 9.799842105263158e-05, + "loss": 0.4309, + "step": 27619 + }, + { + "epoch": 1.5466457610034718, + 
"grad_norm": 1.7717808485031128, + "learning_rate": 9.799815789473684e-05, + "loss": 0.4072, + "step": 27620 + }, + { + "epoch": 1.5467017583156009, + "grad_norm": 1.167535662651062, + "learning_rate": 9.799789473684212e-05, + "loss": 0.3819, + "step": 27621 + }, + { + "epoch": 1.5467577556277299, + "grad_norm": 1.2569087743759155, + "learning_rate": 9.799763157894738e-05, + "loss": 0.3743, + "step": 27622 + }, + { + "epoch": 1.546813752939859, + "grad_norm": 1.3656669855117798, + "learning_rate": 9.799736842105263e-05, + "loss": 0.4982, + "step": 27623 + }, + { + "epoch": 1.546869750251988, + "grad_norm": 1.3347703218460083, + "learning_rate": 9.79971052631579e-05, + "loss": 0.4499, + "step": 27624 + }, + { + "epoch": 1.546925747564117, + "grad_norm": 1.5180779695510864, + "learning_rate": 9.799684210526317e-05, + "loss": 0.4667, + "step": 27625 + }, + { + "epoch": 1.546981744876246, + "grad_norm": 1.380881428718567, + "learning_rate": 9.799657894736843e-05, + "loss": 0.3848, + "step": 27626 + }, + { + "epoch": 1.547037742188375, + "grad_norm": 1.9279532432556152, + "learning_rate": 9.799631578947369e-05, + "loss": 0.7991, + "step": 27627 + }, + { + "epoch": 1.547093739500504, + "grad_norm": 2.0329720973968506, + "learning_rate": 9.799605263157895e-05, + "loss": 0.4458, + "step": 27628 + }, + { + "epoch": 1.547149736812633, + "grad_norm": 1.4294487237930298, + "learning_rate": 9.799578947368422e-05, + "loss": 0.4432, + "step": 27629 + }, + { + "epoch": 1.547205734124762, + "grad_norm": 2.325901985168457, + "learning_rate": 9.799552631578948e-05, + "loss": 0.6328, + "step": 27630 + }, + { + "epoch": 1.547261731436891, + "grad_norm": 1.354583978652954, + "learning_rate": 9.799526315789474e-05, + "loss": 0.3689, + "step": 27631 + }, + { + "epoch": 1.54731772874902, + "grad_norm": 1.095689296722412, + "learning_rate": 9.7995e-05, + "loss": 0.3846, + "step": 27632 + }, + { + "epoch": 1.547373726061149, + "grad_norm": 1.3646684885025024, + "learning_rate": 
9.799473684210527e-05, + "loss": 0.5064, + "step": 27633 + }, + { + "epoch": 1.547429723373278, + "grad_norm": 1.3029592037200928, + "learning_rate": 9.799447368421053e-05, + "loss": 0.5023, + "step": 27634 + }, + { + "epoch": 1.5474857206854071, + "grad_norm": 1.477940320968628, + "learning_rate": 9.79942105263158e-05, + "loss": 0.454, + "step": 27635 + }, + { + "epoch": 1.5475417179975361, + "grad_norm": 1.6404883861541748, + "learning_rate": 9.799394736842105e-05, + "loss": 0.4442, + "step": 27636 + }, + { + "epoch": 1.5475977153096652, + "grad_norm": 1.6449779272079468, + "learning_rate": 9.799368421052631e-05, + "loss": 0.5236, + "step": 27637 + }, + { + "epoch": 1.5476537126217942, + "grad_norm": 1.4596631526947021, + "learning_rate": 9.799342105263158e-05, + "loss": 0.3402, + "step": 27638 + }, + { + "epoch": 1.5477097099339232, + "grad_norm": 1.3849345445632935, + "learning_rate": 9.799315789473684e-05, + "loss": 0.3877, + "step": 27639 + }, + { + "epoch": 1.5477657072460522, + "grad_norm": 1.363826870918274, + "learning_rate": 9.799289473684212e-05, + "loss": 0.4869, + "step": 27640 + }, + { + "epoch": 1.5478217045581812, + "grad_norm": 1.2477645874023438, + "learning_rate": 9.799263157894736e-05, + "loss": 0.4483, + "step": 27641 + }, + { + "epoch": 1.5478777018703103, + "grad_norm": 1.1474741697311401, + "learning_rate": 9.799236842105264e-05, + "loss": 0.3734, + "step": 27642 + }, + { + "epoch": 1.5479336991824393, + "grad_norm": 1.4541995525360107, + "learning_rate": 9.79921052631579e-05, + "loss": 0.4592, + "step": 27643 + }, + { + "epoch": 1.5479896964945683, + "grad_norm": 1.234514832496643, + "learning_rate": 9.799184210526317e-05, + "loss": 0.5405, + "step": 27644 + }, + { + "epoch": 1.5480456938066973, + "grad_norm": 1.9639506340026855, + "learning_rate": 9.799157894736842e-05, + "loss": 0.5765, + "step": 27645 + }, + { + "epoch": 1.5481016911188263, + "grad_norm": 1.4586303234100342, + "learning_rate": 9.799131578947369e-05, + "loss": 0.5156, + 
"step": 27646 + }, + { + "epoch": 1.5481576884309554, + "grad_norm": 1.235527515411377, + "learning_rate": 9.799105263157895e-05, + "loss": 0.4629, + "step": 27647 + }, + { + "epoch": 1.5482136857430844, + "grad_norm": 1.2466105222702026, + "learning_rate": 9.799078947368422e-05, + "loss": 0.4021, + "step": 27648 + }, + { + "epoch": 1.5482696830552134, + "grad_norm": 1.3709512948989868, + "learning_rate": 9.799052631578948e-05, + "loss": 0.4173, + "step": 27649 + }, + { + "epoch": 1.5483256803673424, + "grad_norm": 1.4440704584121704, + "learning_rate": 9.799026315789474e-05, + "loss": 0.5999, + "step": 27650 + }, + { + "epoch": 1.5483816776794714, + "grad_norm": 1.4406636953353882, + "learning_rate": 9.799e-05, + "loss": 0.4572, + "step": 27651 + }, + { + "epoch": 1.5484376749916005, + "grad_norm": 1.3229143619537354, + "learning_rate": 9.798973684210528e-05, + "loss": 0.5996, + "step": 27652 + }, + { + "epoch": 1.5484936723037295, + "grad_norm": 1.45196533203125, + "learning_rate": 9.798947368421054e-05, + "loss": 0.4739, + "step": 27653 + }, + { + "epoch": 1.5485496696158585, + "grad_norm": 1.112792730331421, + "learning_rate": 9.79892105263158e-05, + "loss": 0.3605, + "step": 27654 + }, + { + "epoch": 1.5486056669279875, + "grad_norm": 1.547577142715454, + "learning_rate": 9.798894736842105e-05, + "loss": 0.5193, + "step": 27655 + }, + { + "epoch": 1.5486616642401163, + "grad_norm": 1.2999566793441772, + "learning_rate": 9.798868421052631e-05, + "loss": 0.404, + "step": 27656 + }, + { + "epoch": 1.5487176615522453, + "grad_norm": 1.3730603456497192, + "learning_rate": 9.798842105263159e-05, + "loss": 0.442, + "step": 27657 + }, + { + "epoch": 1.5487736588643743, + "grad_norm": 1.4913021326065063, + "learning_rate": 9.798815789473685e-05, + "loss": 0.4299, + "step": 27658 + }, + { + "epoch": 1.5488296561765034, + "grad_norm": 1.3966526985168457, + "learning_rate": 9.798789473684211e-05, + "loss": 0.4235, + "step": 27659 + }, + { + "epoch": 1.5488856534886324, + 
"grad_norm": 1.2151272296905518, + "learning_rate": 9.798763157894737e-05, + "loss": 0.4361, + "step": 27660 + }, + { + "epoch": 1.5489416508007614, + "grad_norm": 1.390045404434204, + "learning_rate": 9.798736842105264e-05, + "loss": 0.4848, + "step": 27661 + }, + { + "epoch": 1.5489976481128904, + "grad_norm": 1.304226279258728, + "learning_rate": 9.79871052631579e-05, + "loss": 0.3626, + "step": 27662 + }, + { + "epoch": 1.5490536454250194, + "grad_norm": 1.1964772939682007, + "learning_rate": 9.798684210526316e-05, + "loss": 0.3906, + "step": 27663 + }, + { + "epoch": 1.5491096427371485, + "grad_norm": 1.7195937633514404, + "learning_rate": 9.798657894736842e-05, + "loss": 0.5034, + "step": 27664 + }, + { + "epoch": 1.5491656400492775, + "grad_norm": 1.2917834520339966, + "learning_rate": 9.798631578947369e-05, + "loss": 0.419, + "step": 27665 + }, + { + "epoch": 1.5492216373614065, + "grad_norm": 1.176645278930664, + "learning_rate": 9.798605263157895e-05, + "loss": 0.4128, + "step": 27666 + }, + { + "epoch": 1.5492776346735355, + "grad_norm": 1.359494924545288, + "learning_rate": 9.798578947368423e-05, + "loss": 0.4341, + "step": 27667 + }, + { + "epoch": 1.5493336319856645, + "grad_norm": 1.2506375312805176, + "learning_rate": 9.798552631578947e-05, + "loss": 0.3823, + "step": 27668 + }, + { + "epoch": 1.5493896292977936, + "grad_norm": 1.3744421005249023, + "learning_rate": 9.798526315789474e-05, + "loss": 0.5909, + "step": 27669 + }, + { + "epoch": 1.5494456266099226, + "grad_norm": 1.2375952005386353, + "learning_rate": 9.7985e-05, + "loss": 0.63, + "step": 27670 + }, + { + "epoch": 1.5495016239220516, + "grad_norm": 1.6736398935317993, + "learning_rate": 9.798473684210526e-05, + "loss": 0.6227, + "step": 27671 + }, + { + "epoch": 1.5495576212341806, + "grad_norm": 1.6351205110549927, + "learning_rate": 9.798447368421054e-05, + "loss": 0.6249, + "step": 27672 + }, + { + "epoch": 1.5496136185463096, + "grad_norm": 1.5866144895553589, + "learning_rate": 
9.798421052631578e-05, + "loss": 0.6634, + "step": 27673 + }, + { + "epoch": 1.5496696158584387, + "grad_norm": 1.447206974029541, + "learning_rate": 9.798394736842106e-05, + "loss": 0.4198, + "step": 27674 + }, + { + "epoch": 1.5497256131705677, + "grad_norm": 1.0121902227401733, + "learning_rate": 9.798368421052632e-05, + "loss": 0.3474, + "step": 27675 + }, + { + "epoch": 1.5497816104826967, + "grad_norm": 1.433874487876892, + "learning_rate": 9.798342105263159e-05, + "loss": 0.4346, + "step": 27676 + }, + { + "epoch": 1.5498376077948257, + "grad_norm": 1.1715593338012695, + "learning_rate": 9.798315789473685e-05, + "loss": 0.4513, + "step": 27677 + }, + { + "epoch": 1.5498936051069547, + "grad_norm": 1.19560968875885, + "learning_rate": 9.798289473684211e-05, + "loss": 0.4297, + "step": 27678 + }, + { + "epoch": 1.5499496024190837, + "grad_norm": 1.3347197771072388, + "learning_rate": 9.798263157894737e-05, + "loss": 0.3472, + "step": 27679 + }, + { + "epoch": 1.5500055997312128, + "grad_norm": 1.138706922531128, + "learning_rate": 9.798236842105264e-05, + "loss": 0.375, + "step": 27680 + }, + { + "epoch": 1.5500615970433418, + "grad_norm": 1.112441062927246, + "learning_rate": 9.79821052631579e-05, + "loss": 0.4377, + "step": 27681 + }, + { + "epoch": 1.5501175943554708, + "grad_norm": 1.169978380203247, + "learning_rate": 9.798184210526316e-05, + "loss": 0.4607, + "step": 27682 + }, + { + "epoch": 1.5501735916675998, + "grad_norm": 1.4009206295013428, + "learning_rate": 9.798157894736842e-05, + "loss": 0.488, + "step": 27683 + }, + { + "epoch": 1.5502295889797288, + "grad_norm": 1.5235466957092285, + "learning_rate": 9.79813157894737e-05, + "loss": 0.3927, + "step": 27684 + }, + { + "epoch": 1.5502855862918579, + "grad_norm": 1.2733972072601318, + "learning_rate": 9.798105263157895e-05, + "loss": 0.4078, + "step": 27685 + }, + { + "epoch": 1.5503415836039869, + "grad_norm": 1.9851861000061035, + "learning_rate": 9.798078947368421e-05, + "loss": 0.4431, + 
"step": 27686 + }, + { + "epoch": 1.550397580916116, + "grad_norm": 1.2623870372772217, + "learning_rate": 9.798052631578947e-05, + "loss": 0.4077, + "step": 27687 + }, + { + "epoch": 1.550453578228245, + "grad_norm": 1.3079915046691895, + "learning_rate": 9.798026315789473e-05, + "loss": 0.3512, + "step": 27688 + }, + { + "epoch": 1.550509575540374, + "grad_norm": 1.4924261569976807, + "learning_rate": 9.798000000000001e-05, + "loss": 0.5004, + "step": 27689 + }, + { + "epoch": 1.550565572852503, + "grad_norm": 1.1912822723388672, + "learning_rate": 9.797973684210527e-05, + "loss": 0.4008, + "step": 27690 + }, + { + "epoch": 1.550621570164632, + "grad_norm": 1.5061736106872559, + "learning_rate": 9.797947368421053e-05, + "loss": 0.4683, + "step": 27691 + }, + { + "epoch": 1.550677567476761, + "grad_norm": 1.206244707107544, + "learning_rate": 9.797921052631579e-05, + "loss": 0.4397, + "step": 27692 + }, + { + "epoch": 1.55073356478889, + "grad_norm": 1.3236454725265503, + "learning_rate": 9.797894736842106e-05, + "loss": 0.416, + "step": 27693 + }, + { + "epoch": 1.550789562101019, + "grad_norm": 1.4528212547302246, + "learning_rate": 9.797868421052632e-05, + "loss": 0.6181, + "step": 27694 + }, + { + "epoch": 1.550845559413148, + "grad_norm": 1.3994133472442627, + "learning_rate": 9.797842105263159e-05, + "loss": 0.4998, + "step": 27695 + }, + { + "epoch": 1.550901556725277, + "grad_norm": 1.1481215953826904, + "learning_rate": 9.797815789473684e-05, + "loss": 0.3476, + "step": 27696 + }, + { + "epoch": 1.550957554037406, + "grad_norm": 1.178155779838562, + "learning_rate": 9.797789473684211e-05, + "loss": 0.3862, + "step": 27697 + }, + { + "epoch": 1.5510135513495351, + "grad_norm": 1.1070185899734497, + "learning_rate": 9.797763157894737e-05, + "loss": 0.5307, + "step": 27698 + }, + { + "epoch": 1.5510695486616641, + "grad_norm": 1.3176507949829102, + "learning_rate": 9.797736842105265e-05, + "loss": 0.3921, + "step": 27699 + }, + { + "epoch": 
1.5511255459737932, + "grad_norm": 1.0525776147842407, + "learning_rate": 9.797710526315789e-05, + "loss": 0.3283, + "step": 27700 + }, + { + "epoch": 1.5511815432859222, + "grad_norm": 1.5223246812820435, + "learning_rate": 9.797684210526316e-05, + "loss": 0.5607, + "step": 27701 + }, + { + "epoch": 1.5512375405980512, + "grad_norm": 1.5738961696624756, + "learning_rate": 9.797657894736842e-05, + "loss": 0.5562, + "step": 27702 + }, + { + "epoch": 1.5512935379101802, + "grad_norm": 1.657384991645813, + "learning_rate": 9.79763157894737e-05, + "loss": 0.4221, + "step": 27703 + }, + { + "epoch": 1.5513495352223092, + "grad_norm": 1.3452095985412598, + "learning_rate": 9.797605263157896e-05, + "loss": 0.483, + "step": 27704 + }, + { + "epoch": 1.5514055325344382, + "grad_norm": 1.2362409830093384, + "learning_rate": 9.79757894736842e-05, + "loss": 0.3704, + "step": 27705 + }, + { + "epoch": 1.5514615298465673, + "grad_norm": 1.1338156461715698, + "learning_rate": 9.797552631578948e-05, + "loss": 0.4091, + "step": 27706 + }, + { + "epoch": 1.5515175271586963, + "grad_norm": 1.4207375049591064, + "learning_rate": 9.797526315789474e-05, + "loss": 0.4474, + "step": 27707 + }, + { + "epoch": 1.5515735244708253, + "grad_norm": 1.397441029548645, + "learning_rate": 9.797500000000001e-05, + "loss": 0.4012, + "step": 27708 + }, + { + "epoch": 1.5516295217829543, + "grad_norm": 1.2236764430999756, + "learning_rate": 9.797473684210527e-05, + "loss": 0.3869, + "step": 27709 + }, + { + "epoch": 1.5516855190950833, + "grad_norm": 1.1548060178756714, + "learning_rate": 9.797447368421053e-05, + "loss": 0.3724, + "step": 27710 + }, + { + "epoch": 1.5517415164072124, + "grad_norm": 1.5421934127807617, + "learning_rate": 9.797421052631579e-05, + "loss": 0.56, + "step": 27711 + }, + { + "epoch": 1.5517975137193414, + "grad_norm": 1.4410067796707153, + "learning_rate": 9.797394736842106e-05, + "loss": 0.4808, + "step": 27712 + }, + { + "epoch": 1.5518535110314704, + "grad_norm": 
1.3346655368804932, + "learning_rate": 9.797368421052632e-05, + "loss": 0.4176, + "step": 27713 + }, + { + "epoch": 1.5519095083435994, + "grad_norm": 1.3503823280334473, + "learning_rate": 9.797342105263158e-05, + "loss": 0.5093, + "step": 27714 + }, + { + "epoch": 1.5519655056557284, + "grad_norm": 1.340509057044983, + "learning_rate": 9.797315789473684e-05, + "loss": 0.4658, + "step": 27715 + }, + { + "epoch": 1.5520215029678575, + "grad_norm": 1.1793127059936523, + "learning_rate": 9.797289473684211e-05, + "loss": 0.4391, + "step": 27716 + }, + { + "epoch": 1.5520775002799865, + "grad_norm": 1.1974103450775146, + "learning_rate": 9.797263157894737e-05, + "loss": 0.3996, + "step": 27717 + }, + { + "epoch": 1.5521334975921155, + "grad_norm": 1.266251564025879, + "learning_rate": 9.797236842105263e-05, + "loss": 0.4152, + "step": 27718 + }, + { + "epoch": 1.5521894949042445, + "grad_norm": 2.003234386444092, + "learning_rate": 9.79721052631579e-05, + "loss": 0.5629, + "step": 27719 + }, + { + "epoch": 1.5522454922163735, + "grad_norm": 1.327091097831726, + "learning_rate": 9.797184210526317e-05, + "loss": 0.3728, + "step": 27720 + }, + { + "epoch": 1.5523014895285026, + "grad_norm": 1.35039484500885, + "learning_rate": 9.797157894736843e-05, + "loss": 0.4403, + "step": 27721 + }, + { + "epoch": 1.5523574868406316, + "grad_norm": 1.0910850763320923, + "learning_rate": 9.79713157894737e-05, + "loss": 0.366, + "step": 27722 + }, + { + "epoch": 1.5524134841527606, + "grad_norm": 1.8816014528274536, + "learning_rate": 9.797105263157895e-05, + "loss": 0.6915, + "step": 27723 + }, + { + "epoch": 1.5524694814648896, + "grad_norm": 1.3539272546768188, + "learning_rate": 9.79707894736842e-05, + "loss": 0.4375, + "step": 27724 + }, + { + "epoch": 1.5525254787770186, + "grad_norm": 1.5180089473724365, + "learning_rate": 9.797052631578948e-05, + "loss": 0.4702, + "step": 27725 + }, + { + "epoch": 1.5525814760891476, + "grad_norm": 1.1890053749084473, + "learning_rate": 
9.797026315789474e-05, + "loss": 0.3834, + "step": 27726 + }, + { + "epoch": 1.5526374734012767, + "grad_norm": 1.3108999729156494, + "learning_rate": 9.797000000000001e-05, + "loss": 0.4571, + "step": 27727 + }, + { + "epoch": 1.5526934707134057, + "grad_norm": 1.4722704887390137, + "learning_rate": 9.796973684210526e-05, + "loss": 0.5284, + "step": 27728 + }, + { + "epoch": 1.5527494680255347, + "grad_norm": 1.3678479194641113, + "learning_rate": 9.796947368421053e-05, + "loss": 0.5655, + "step": 27729 + }, + { + "epoch": 1.5528054653376637, + "grad_norm": 1.1171141862869263, + "learning_rate": 9.796921052631579e-05, + "loss": 0.4052, + "step": 27730 + }, + { + "epoch": 1.5528614626497927, + "grad_norm": 1.676128625869751, + "learning_rate": 9.796894736842106e-05, + "loss": 0.4494, + "step": 27731 + }, + { + "epoch": 1.5529174599619218, + "grad_norm": 1.7069107294082642, + "learning_rate": 9.796868421052632e-05, + "loss": 0.6305, + "step": 27732 + }, + { + "epoch": 1.5529734572740508, + "grad_norm": 1.9045878648757935, + "learning_rate": 9.796842105263158e-05, + "loss": 0.4019, + "step": 27733 + }, + { + "epoch": 1.5530294545861798, + "grad_norm": 1.4582682847976685, + "learning_rate": 9.796815789473684e-05, + "loss": 0.335, + "step": 27734 + }, + { + "epoch": 1.5530854518983088, + "grad_norm": 1.0639840364456177, + "learning_rate": 9.796789473684212e-05, + "loss": 0.4934, + "step": 27735 + }, + { + "epoch": 1.5531414492104378, + "grad_norm": 1.4219121932983398, + "learning_rate": 9.796763157894738e-05, + "loss": 0.4423, + "step": 27736 + }, + { + "epoch": 1.5531974465225669, + "grad_norm": 1.201599359512329, + "learning_rate": 9.796736842105264e-05, + "loss": 0.409, + "step": 27737 + }, + { + "epoch": 1.5532534438346959, + "grad_norm": 1.385581612586975, + "learning_rate": 9.79671052631579e-05, + "loss": 0.4744, + "step": 27738 + }, + { + "epoch": 1.553309441146825, + "grad_norm": 1.4093060493469238, + "learning_rate": 9.796684210526317e-05, + "loss": 0.3263, + 
"step": 27739 + }, + { + "epoch": 1.553365438458954, + "grad_norm": 1.5411500930786133, + "learning_rate": 9.796657894736843e-05, + "loss": 0.4366, + "step": 27740 + }, + { + "epoch": 1.553421435771083, + "grad_norm": 1.2060127258300781, + "learning_rate": 9.796631578947369e-05, + "loss": 0.4044, + "step": 27741 + }, + { + "epoch": 1.553477433083212, + "grad_norm": 1.3991246223449707, + "learning_rate": 9.796605263157895e-05, + "loss": 0.4528, + "step": 27742 + }, + { + "epoch": 1.553533430395341, + "grad_norm": 1.4705504179000854, + "learning_rate": 9.796578947368421e-05, + "loss": 0.549, + "step": 27743 + }, + { + "epoch": 1.55358942770747, + "grad_norm": 1.3193557262420654, + "learning_rate": 9.796552631578948e-05, + "loss": 0.3969, + "step": 27744 + }, + { + "epoch": 1.553645425019599, + "grad_norm": 1.5294543504714966, + "learning_rate": 9.796526315789474e-05, + "loss": 0.4579, + "step": 27745 + }, + { + "epoch": 1.553701422331728, + "grad_norm": 1.4128329753875732, + "learning_rate": 9.7965e-05, + "loss": 0.5688, + "step": 27746 + }, + { + "epoch": 1.553757419643857, + "grad_norm": 1.693296194076538, + "learning_rate": 9.796473684210526e-05, + "loss": 0.6006, + "step": 27747 + }, + { + "epoch": 1.553813416955986, + "grad_norm": 1.4760183095932007, + "learning_rate": 9.796447368421053e-05, + "loss": 0.4203, + "step": 27748 + }, + { + "epoch": 1.553869414268115, + "grad_norm": 1.2221639156341553, + "learning_rate": 9.79642105263158e-05, + "loss": 0.4608, + "step": 27749 + }, + { + "epoch": 1.553925411580244, + "grad_norm": 1.3965953588485718, + "learning_rate": 9.796394736842107e-05, + "loss": 0.5166, + "step": 27750 + }, + { + "epoch": 1.5539814088923731, + "grad_norm": 1.446052074432373, + "learning_rate": 9.796368421052631e-05, + "loss": 0.4521, + "step": 27751 + }, + { + "epoch": 1.5540374062045021, + "grad_norm": 1.1243226528167725, + "learning_rate": 9.796342105263159e-05, + "loss": 0.2749, + "step": 27752 + }, + { + "epoch": 1.5540934035166312, + 
"grad_norm": 1.5954065322875977, + "learning_rate": 9.796315789473685e-05, + "loss": 0.5175, + "step": 27753 + }, + { + "epoch": 1.5541494008287602, + "grad_norm": 1.2738813161849976, + "learning_rate": 9.796289473684212e-05, + "loss": 0.4379, + "step": 27754 + }, + { + "epoch": 1.5542053981408892, + "grad_norm": 1.3528434038162231, + "learning_rate": 9.796263157894737e-05, + "loss": 0.4364, + "step": 27755 + }, + { + "epoch": 1.5542613954530182, + "grad_norm": 1.5497957468032837, + "learning_rate": 9.796236842105264e-05, + "loss": 0.5718, + "step": 27756 + }, + { + "epoch": 1.5543173927651472, + "grad_norm": 1.4113351106643677, + "learning_rate": 9.79621052631579e-05, + "loss": 0.5735, + "step": 27757 + }, + { + "epoch": 1.5543733900772763, + "grad_norm": 1.560281753540039, + "learning_rate": 9.796184210526316e-05, + "loss": 0.4684, + "step": 27758 + }, + { + "epoch": 1.5544293873894053, + "grad_norm": 1.4054566621780396, + "learning_rate": 9.796157894736843e-05, + "loss": 0.3689, + "step": 27759 + }, + { + "epoch": 1.5544853847015343, + "grad_norm": 1.5809017419815063, + "learning_rate": 9.796131578947368e-05, + "loss": 0.4974, + "step": 27760 + }, + { + "epoch": 1.5545413820136633, + "grad_norm": 1.3012982606887817, + "learning_rate": 9.796105263157895e-05, + "loss": 0.5023, + "step": 27761 + }, + { + "epoch": 1.5545973793257923, + "grad_norm": 1.0782115459442139, + "learning_rate": 9.796078947368421e-05, + "loss": 0.3798, + "step": 27762 + }, + { + "epoch": 1.5546533766379214, + "grad_norm": 1.3033592700958252, + "learning_rate": 9.796052631578948e-05, + "loss": 0.4332, + "step": 27763 + }, + { + "epoch": 1.5547093739500504, + "grad_norm": 1.4053047895431519, + "learning_rate": 9.796026315789474e-05, + "loss": 0.4463, + "step": 27764 + }, + { + "epoch": 1.5547653712621794, + "grad_norm": 1.3142244815826416, + "learning_rate": 9.796e-05, + "loss": 0.4889, + "step": 27765 + }, + { + "epoch": 1.5548213685743084, + "grad_norm": 1.018303632736206, + "learning_rate": 
9.795973684210526e-05, + "loss": 0.3926, + "step": 27766 + }, + { + "epoch": 1.5548773658864374, + "grad_norm": 1.1563303470611572, + "learning_rate": 9.795947368421054e-05, + "loss": 0.3991, + "step": 27767 + }, + { + "epoch": 1.5549333631985665, + "grad_norm": 1.3148008584976196, + "learning_rate": 9.79592105263158e-05, + "loss": 0.4882, + "step": 27768 + }, + { + "epoch": 1.5549893605106955, + "grad_norm": 1.119433879852295, + "learning_rate": 9.795894736842106e-05, + "loss": 0.3509, + "step": 27769 + }, + { + "epoch": 1.5550453578228245, + "grad_norm": 1.9987049102783203, + "learning_rate": 9.795868421052632e-05, + "loss": 0.4775, + "step": 27770 + }, + { + "epoch": 1.5551013551349535, + "grad_norm": 1.4459271430969238, + "learning_rate": 9.795842105263159e-05, + "loss": 0.5327, + "step": 27771 + }, + { + "epoch": 1.5551573524470825, + "grad_norm": 22.486413955688477, + "learning_rate": 9.795815789473685e-05, + "loss": 0.6281, + "step": 27772 + }, + { + "epoch": 1.5552133497592115, + "grad_norm": 1.9849474430084229, + "learning_rate": 9.795789473684211e-05, + "loss": 0.6185, + "step": 27773 + }, + { + "epoch": 1.5552693470713406, + "grad_norm": 1.5695642232894897, + "learning_rate": 9.795763157894737e-05, + "loss": 0.3902, + "step": 27774 + }, + { + "epoch": 1.5553253443834696, + "grad_norm": 1.440346121788025, + "learning_rate": 9.795736842105263e-05, + "loss": 0.5583, + "step": 27775 + }, + { + "epoch": 1.5553813416955986, + "grad_norm": 1.3403582572937012, + "learning_rate": 9.79571052631579e-05, + "loss": 0.5714, + "step": 27776 + }, + { + "epoch": 1.5554373390077276, + "grad_norm": 1.3569977283477783, + "learning_rate": 9.795684210526316e-05, + "loss": 0.3281, + "step": 27777 + }, + { + "epoch": 1.5554933363198566, + "grad_norm": 1.1667449474334717, + "learning_rate": 9.795657894736842e-05, + "loss": 0.5078, + "step": 27778 + }, + { + "epoch": 1.5555493336319857, + "grad_norm": 1.2146210670471191, + "learning_rate": 9.795631578947368e-05, + "loss": 0.5043, 
+ "step": 27779 + }, + { + "epoch": 1.5556053309441147, + "grad_norm": 2.4014410972595215, + "learning_rate": 9.795605263157895e-05, + "loss": 0.445, + "step": 27780 + }, + { + "epoch": 1.5556613282562437, + "grad_norm": 1.3136094808578491, + "learning_rate": 9.795578947368421e-05, + "loss": 0.6192, + "step": 27781 + }, + { + "epoch": 1.5557173255683727, + "grad_norm": 1.2894420623779297, + "learning_rate": 9.795552631578949e-05, + "loss": 0.4372, + "step": 27782 + }, + { + "epoch": 1.5557733228805017, + "grad_norm": 1.4806616306304932, + "learning_rate": 9.795526315789473e-05, + "loss": 0.6182, + "step": 27783 + }, + { + "epoch": 1.5558293201926308, + "grad_norm": 1.118759274482727, + "learning_rate": 9.7955e-05, + "loss": 0.4354, + "step": 27784 + }, + { + "epoch": 1.5558853175047598, + "grad_norm": 1.2672228813171387, + "learning_rate": 9.795473684210527e-05, + "loss": 0.3302, + "step": 27785 + }, + { + "epoch": 1.5559413148168888, + "grad_norm": 1.3909382820129395, + "learning_rate": 9.795447368421054e-05, + "loss": 0.619, + "step": 27786 + }, + { + "epoch": 1.5559973121290178, + "grad_norm": 1.2416855096817017, + "learning_rate": 9.79542105263158e-05, + "loss": 0.4477, + "step": 27787 + }, + { + "epoch": 1.5560533094411468, + "grad_norm": 1.565833330154419, + "learning_rate": 9.795394736842106e-05, + "loss": 0.5442, + "step": 27788 + }, + { + "epoch": 1.5561093067532759, + "grad_norm": 1.4561254978179932, + "learning_rate": 9.795368421052632e-05, + "loss": 0.5995, + "step": 27789 + }, + { + "epoch": 1.5561653040654049, + "grad_norm": 1.3448600769042969, + "learning_rate": 9.795342105263159e-05, + "loss": 0.4515, + "step": 27790 + }, + { + "epoch": 1.556221301377534, + "grad_norm": 1.1499367952346802, + "learning_rate": 9.795315789473685e-05, + "loss": 0.445, + "step": 27791 + }, + { + "epoch": 1.556277298689663, + "grad_norm": 1.1299775838851929, + "learning_rate": 9.79528947368421e-05, + "loss": 0.4119, + "step": 27792 + }, + { + "epoch": 1.556333296001792, + 
"grad_norm": 1.1896737813949585, + "learning_rate": 9.795263157894737e-05, + "loss": 0.4403, + "step": 27793 + }, + { + "epoch": 1.556389293313921, + "grad_norm": 1.3264724016189575, + "learning_rate": 9.795236842105263e-05, + "loss": 0.5096, + "step": 27794 + }, + { + "epoch": 1.55644529062605, + "grad_norm": 1.5012086629867554, + "learning_rate": 9.79521052631579e-05, + "loss": 0.4116, + "step": 27795 + }, + { + "epoch": 1.556501287938179, + "grad_norm": 2.2111494541168213, + "learning_rate": 9.795184210526316e-05, + "loss": 0.5572, + "step": 27796 + }, + { + "epoch": 1.556557285250308, + "grad_norm": 1.1106387376785278, + "learning_rate": 9.795157894736842e-05, + "loss": 0.3883, + "step": 27797 + }, + { + "epoch": 1.556613282562437, + "grad_norm": 1.485128402709961, + "learning_rate": 9.795131578947368e-05, + "loss": 0.5082, + "step": 27798 + }, + { + "epoch": 1.556669279874566, + "grad_norm": 1.1329487562179565, + "learning_rate": 9.795105263157896e-05, + "loss": 0.4078, + "step": 27799 + }, + { + "epoch": 1.556725277186695, + "grad_norm": 1.0622822046279907, + "learning_rate": 9.795078947368422e-05, + "loss": 0.473, + "step": 27800 + }, + { + "epoch": 1.556781274498824, + "grad_norm": 1.2985491752624512, + "learning_rate": 9.795052631578948e-05, + "loss": 0.4488, + "step": 27801 + }, + { + "epoch": 1.556837271810953, + "grad_norm": 1.0579886436462402, + "learning_rate": 9.795026315789474e-05, + "loss": 0.3947, + "step": 27802 + }, + { + "epoch": 1.5568932691230821, + "grad_norm": 1.3231867551803589, + "learning_rate": 9.795000000000001e-05, + "loss": 0.4697, + "step": 27803 + }, + { + "epoch": 1.5569492664352111, + "grad_norm": 1.3121984004974365, + "learning_rate": 9.794973684210527e-05, + "loss": 0.4894, + "step": 27804 + }, + { + "epoch": 1.5570052637473402, + "grad_norm": 1.2421481609344482, + "learning_rate": 9.794947368421053e-05, + "loss": 0.4702, + "step": 27805 + }, + { + "epoch": 1.5570612610594692, + "grad_norm": 1.358551025390625, + 
"learning_rate": 9.794921052631579e-05, + "loss": 0.4754, + "step": 27806 + }, + { + "epoch": 1.5571172583715982, + "grad_norm": 1.1165051460266113, + "learning_rate": 9.794894736842106e-05, + "loss": 0.413, + "step": 27807 + }, + { + "epoch": 1.5571732556837272, + "grad_norm": 1.1838672161102295, + "learning_rate": 9.794868421052632e-05, + "loss": 0.4276, + "step": 27808 + }, + { + "epoch": 1.5572292529958562, + "grad_norm": 1.4677737951278687, + "learning_rate": 9.794842105263158e-05, + "loss": 0.5689, + "step": 27809 + }, + { + "epoch": 1.5572852503079853, + "grad_norm": 2.69555926322937, + "learning_rate": 9.794815789473684e-05, + "loss": 0.6513, + "step": 27810 + }, + { + "epoch": 1.5573412476201143, + "grad_norm": 1.8028641939163208, + "learning_rate": 9.79478947368421e-05, + "loss": 0.4972, + "step": 27811 + }, + { + "epoch": 1.5573972449322433, + "grad_norm": 1.2707033157348633, + "learning_rate": 9.794763157894737e-05, + "loss": 0.4286, + "step": 27812 + }, + { + "epoch": 1.5574532422443723, + "grad_norm": 1.3541085720062256, + "learning_rate": 9.794736842105263e-05, + "loss": 0.516, + "step": 27813 + }, + { + "epoch": 1.5575092395565013, + "grad_norm": 45.87267303466797, + "learning_rate": 9.794710526315791e-05, + "loss": 0.4842, + "step": 27814 + }, + { + "epoch": 1.5575652368686304, + "grad_norm": 1.760021686553955, + "learning_rate": 9.794684210526315e-05, + "loss": 0.7113, + "step": 27815 + }, + { + "epoch": 1.5576212341807594, + "grad_norm": 1.8014436960220337, + "learning_rate": 9.794657894736843e-05, + "loss": 0.8422, + "step": 27816 + }, + { + "epoch": 1.5576772314928884, + "grad_norm": 1.4810210466384888, + "learning_rate": 9.794631578947369e-05, + "loss": 0.441, + "step": 27817 + }, + { + "epoch": 1.5577332288050174, + "grad_norm": 1.642518162727356, + "learning_rate": 9.794605263157896e-05, + "loss": 0.386, + "step": 27818 + }, + { + "epoch": 1.5577892261171464, + "grad_norm": 1.0953466892242432, + "learning_rate": 9.794578947368422e-05, + 
"loss": 0.4178, + "step": 27819 + }, + { + "epoch": 1.5578452234292754, + "grad_norm": 1.2690304517745972, + "learning_rate": 9.794552631578948e-05, + "loss": 0.4771, + "step": 27820 + }, + { + "epoch": 1.5579012207414045, + "grad_norm": 1.3168573379516602, + "learning_rate": 9.794526315789474e-05, + "loss": 0.4387, + "step": 27821 + }, + { + "epoch": 1.5579572180535335, + "grad_norm": 1.2579644918441772, + "learning_rate": 9.794500000000001e-05, + "loss": 0.5125, + "step": 27822 + }, + { + "epoch": 1.5580132153656625, + "grad_norm": 1.160528540611267, + "learning_rate": 9.794473684210527e-05, + "loss": 0.4034, + "step": 27823 + }, + { + "epoch": 1.5580692126777915, + "grad_norm": 1.2254694700241089, + "learning_rate": 9.794447368421053e-05, + "loss": 0.4466, + "step": 27824 + }, + { + "epoch": 1.5581252099899205, + "grad_norm": 1.1999738216400146, + "learning_rate": 9.794421052631579e-05, + "loss": 0.441, + "step": 27825 + }, + { + "epoch": 1.5581812073020496, + "grad_norm": 1.1831997632980347, + "learning_rate": 9.794394736842105e-05, + "loss": 0.4673, + "step": 27826 + }, + { + "epoch": 1.5582372046141786, + "grad_norm": 1.330078125, + "learning_rate": 9.794368421052632e-05, + "loss": 0.5133, + "step": 27827 + }, + { + "epoch": 1.5582932019263076, + "grad_norm": 1.326755404472351, + "learning_rate": 9.794342105263158e-05, + "loss": 0.4091, + "step": 27828 + }, + { + "epoch": 1.5583491992384366, + "grad_norm": 1.13228440284729, + "learning_rate": 9.794315789473684e-05, + "loss": 0.3985, + "step": 27829 + }, + { + "epoch": 1.5584051965505656, + "grad_norm": 1.2264310121536255, + "learning_rate": 9.79428947368421e-05, + "loss": 0.4694, + "step": 27830 + }, + { + "epoch": 1.5584611938626947, + "grad_norm": 1.1563001871109009, + "learning_rate": 9.794263157894738e-05, + "loss": 0.3887, + "step": 27831 + }, + { + "epoch": 1.5585171911748237, + "grad_norm": 1.2988418340682983, + "learning_rate": 9.794236842105264e-05, + "loss": 0.5275, + "step": 27832 + }, + { + 
"epoch": 1.5585731884869527, + "grad_norm": 1.272973895072937, + "learning_rate": 9.79421052631579e-05, + "loss": 0.4014, + "step": 27833 + }, + { + "epoch": 1.5586291857990817, + "grad_norm": 1.4969706535339355, + "learning_rate": 9.794184210526316e-05, + "loss": 0.5557, + "step": 27834 + }, + { + "epoch": 1.5586851831112107, + "grad_norm": 1.4358437061309814, + "learning_rate": 9.794157894736843e-05, + "loss": 0.3992, + "step": 27835 + }, + { + "epoch": 1.5587411804233398, + "grad_norm": 1.112850546836853, + "learning_rate": 9.794131578947369e-05, + "loss": 0.4536, + "step": 27836 + }, + { + "epoch": 1.5587971777354688, + "grad_norm": 1.3522948026657104, + "learning_rate": 9.794105263157896e-05, + "loss": 0.4322, + "step": 27837 + }, + { + "epoch": 1.5588531750475978, + "grad_norm": 1.5419671535491943, + "learning_rate": 9.794078947368421e-05, + "loss": 0.4437, + "step": 27838 + }, + { + "epoch": 1.5589091723597268, + "grad_norm": 1.2560789585113525, + "learning_rate": 9.794052631578948e-05, + "loss": 0.414, + "step": 27839 + }, + { + "epoch": 1.5589651696718558, + "grad_norm": 1.4753572940826416, + "learning_rate": 9.794026315789474e-05, + "loss": 0.4246, + "step": 27840 + }, + { + "epoch": 1.5590211669839849, + "grad_norm": 1.5477051734924316, + "learning_rate": 9.794000000000001e-05, + "loss": 0.462, + "step": 27841 + }, + { + "epoch": 1.5590771642961139, + "grad_norm": 1.2439004182815552, + "learning_rate": 9.793973684210527e-05, + "loss": 0.3825, + "step": 27842 + }, + { + "epoch": 1.559133161608243, + "grad_norm": 1.1846262216567993, + "learning_rate": 9.793947368421053e-05, + "loss": 0.3773, + "step": 27843 + }, + { + "epoch": 1.559189158920372, + "grad_norm": 1.2155591249465942, + "learning_rate": 9.79392105263158e-05, + "loss": 0.3896, + "step": 27844 + }, + { + "epoch": 1.559245156232501, + "grad_norm": 1.6876803636550903, + "learning_rate": 9.793894736842105e-05, + "loss": 0.6784, + "step": 27845 + }, + { + "epoch": 1.55930115354463, + "grad_norm": 
1.1184388399124146, + "learning_rate": 9.793868421052633e-05, + "loss": 0.4407, + "step": 27846 + }, + { + "epoch": 1.559357150856759, + "grad_norm": 1.3937392234802246, + "learning_rate": 9.793842105263157e-05, + "loss": 0.6646, + "step": 27847 + }, + { + "epoch": 1.559413148168888, + "grad_norm": 1.4409432411193848, + "learning_rate": 9.793815789473685e-05, + "loss": 0.6223, + "step": 27848 + }, + { + "epoch": 1.559469145481017, + "grad_norm": 2.5783779621124268, + "learning_rate": 9.79378947368421e-05, + "loss": 0.4499, + "step": 27849 + }, + { + "epoch": 1.559525142793146, + "grad_norm": 1.3124620914459229, + "learning_rate": 9.793763157894738e-05, + "loss": 0.4684, + "step": 27850 + }, + { + "epoch": 1.559581140105275, + "grad_norm": 1.2405046224594116, + "learning_rate": 9.793736842105264e-05, + "loss": 0.4707, + "step": 27851 + }, + { + "epoch": 1.559637137417404, + "grad_norm": 1.333479642868042, + "learning_rate": 9.79371052631579e-05, + "loss": 0.4788, + "step": 27852 + }, + { + "epoch": 1.559693134729533, + "grad_norm": 1.2092924118041992, + "learning_rate": 9.793684210526316e-05, + "loss": 0.4107, + "step": 27853 + }, + { + "epoch": 1.559749132041662, + "grad_norm": 1.1981545686721802, + "learning_rate": 9.793657894736843e-05, + "loss": 0.377, + "step": 27854 + }, + { + "epoch": 1.5598051293537911, + "grad_norm": 1.3340747356414795, + "learning_rate": 9.793631578947369e-05, + "loss": 0.4975, + "step": 27855 + }, + { + "epoch": 1.5598611266659201, + "grad_norm": 1.3088123798370361, + "learning_rate": 9.793605263157895e-05, + "loss": 0.5356, + "step": 27856 + }, + { + "epoch": 1.5599171239780492, + "grad_norm": 1.5157161951065063, + "learning_rate": 9.793578947368421e-05, + "loss": 0.5309, + "step": 27857 + }, + { + "epoch": 1.5599731212901782, + "grad_norm": 1.4487133026123047, + "learning_rate": 9.793552631578948e-05, + "loss": 0.469, + "step": 27858 + }, + { + "epoch": 1.5600291186023072, + "grad_norm": 1.2588891983032227, + "learning_rate": 
9.793526315789474e-05, + "loss": 0.399, + "step": 27859 + }, + { + "epoch": 1.5600851159144362, + "grad_norm": 1.49861478805542, + "learning_rate": 9.7935e-05, + "loss": 0.4292, + "step": 27860 + }, + { + "epoch": 1.5601411132265652, + "grad_norm": 1.1538630723953247, + "learning_rate": 9.793473684210526e-05, + "loss": 0.4191, + "step": 27861 + }, + { + "epoch": 1.5601971105386943, + "grad_norm": 1.2881295680999756, + "learning_rate": 9.793447368421052e-05, + "loss": 0.413, + "step": 27862 + }, + { + "epoch": 1.5602531078508233, + "grad_norm": 1.2034615278244019, + "learning_rate": 9.79342105263158e-05, + "loss": 0.4059, + "step": 27863 + }, + { + "epoch": 1.5603091051629523, + "grad_norm": 1.1906850337982178, + "learning_rate": 9.793394736842106e-05, + "loss": 0.4292, + "step": 27864 + }, + { + "epoch": 1.5603651024750813, + "grad_norm": 1.0385959148406982, + "learning_rate": 9.793368421052632e-05, + "loss": 0.4155, + "step": 27865 + }, + { + "epoch": 1.5604210997872103, + "grad_norm": 2.187077283859253, + "learning_rate": 9.793342105263158e-05, + "loss": 0.5637, + "step": 27866 + }, + { + "epoch": 1.5604770970993393, + "grad_norm": 1.3084129095077515, + "learning_rate": 9.793315789473685e-05, + "loss": 0.4163, + "step": 27867 + }, + { + "epoch": 1.5605330944114684, + "grad_norm": 1.4236375093460083, + "learning_rate": 9.793289473684211e-05, + "loss": 0.6847, + "step": 27868 + }, + { + "epoch": 1.5605890917235974, + "grad_norm": 1.2820924520492554, + "learning_rate": 9.793263157894738e-05, + "loss": 0.4831, + "step": 27869 + }, + { + "epoch": 1.5606450890357264, + "grad_norm": 1.1205179691314697, + "learning_rate": 9.793236842105263e-05, + "loss": 0.4814, + "step": 27870 + }, + { + "epoch": 1.5607010863478554, + "grad_norm": 1.2113511562347412, + "learning_rate": 9.79321052631579e-05, + "loss": 0.342, + "step": 27871 + }, + { + "epoch": 1.5607570836599844, + "grad_norm": 1.2579805850982666, + "learning_rate": 9.793184210526316e-05, + "loss": 0.3823, + "step": 
27872 + }, + { + "epoch": 1.5608130809721135, + "grad_norm": 1.8428640365600586, + "learning_rate": 9.793157894736843e-05, + "loss": 0.4281, + "step": 27873 + }, + { + "epoch": 1.5608690782842425, + "grad_norm": 1.8847538232803345, + "learning_rate": 9.79313157894737e-05, + "loss": 0.3794, + "step": 27874 + }, + { + "epoch": 1.5609250755963715, + "grad_norm": 1.2826124429702759, + "learning_rate": 9.793105263157895e-05, + "loss": 0.4482, + "step": 27875 + }, + { + "epoch": 1.5609810729085005, + "grad_norm": 1.134809136390686, + "learning_rate": 9.793078947368421e-05, + "loss": 0.4436, + "step": 27876 + }, + { + "epoch": 1.5610370702206295, + "grad_norm": 1.2939330339431763, + "learning_rate": 9.793052631578949e-05, + "loss": 0.4125, + "step": 27877 + }, + { + "epoch": 1.5610930675327586, + "grad_norm": 1.9151451587677002, + "learning_rate": 9.793026315789475e-05, + "loss": 0.5838, + "step": 27878 + }, + { + "epoch": 1.5611490648448876, + "grad_norm": 1.1742690801620483, + "learning_rate": 9.793e-05, + "loss": 0.3943, + "step": 27879 + }, + { + "epoch": 1.5612050621570166, + "grad_norm": 1.6123944520950317, + "learning_rate": 9.792973684210527e-05, + "loss": 0.5769, + "step": 27880 + }, + { + "epoch": 1.5612610594691456, + "grad_norm": 1.2266395092010498, + "learning_rate": 9.792947368421053e-05, + "loss": 0.3835, + "step": 27881 + }, + { + "epoch": 1.5613170567812746, + "grad_norm": 1.2051206827163696, + "learning_rate": 9.79292105263158e-05, + "loss": 0.4799, + "step": 27882 + }, + { + "epoch": 1.5613730540934037, + "grad_norm": 1.504221796989441, + "learning_rate": 9.792894736842106e-05, + "loss": 0.6255, + "step": 27883 + }, + { + "epoch": 1.5614290514055327, + "grad_norm": 1.3478666543960571, + "learning_rate": 9.792868421052632e-05, + "loss": 0.4994, + "step": 27884 + }, + { + "epoch": 1.5614850487176617, + "grad_norm": 1.4160724878311157, + "learning_rate": 9.792842105263158e-05, + "loss": 0.5038, + "step": 27885 + }, + { + "epoch": 1.5615410460297907, + 
"grad_norm": 1.9105815887451172, + "learning_rate": 9.792815789473685e-05, + "loss": 0.5542, + "step": 27886 + }, + { + "epoch": 1.5615970433419197, + "grad_norm": 1.240626335144043, + "learning_rate": 9.792789473684211e-05, + "loss": 0.3307, + "step": 27887 + }, + { + "epoch": 1.5616530406540488, + "grad_norm": 1.2806501388549805, + "learning_rate": 9.792763157894737e-05, + "loss": 0.35, + "step": 27888 + }, + { + "epoch": 1.5617090379661778, + "grad_norm": 1.2732311487197876, + "learning_rate": 9.792736842105263e-05, + "loss": 0.4266, + "step": 27889 + }, + { + "epoch": 1.5617650352783068, + "grad_norm": 1.423905372619629, + "learning_rate": 9.79271052631579e-05, + "loss": 0.4768, + "step": 27890 + }, + { + "epoch": 1.5618210325904358, + "grad_norm": 1.2721583843231201, + "learning_rate": 9.792684210526316e-05, + "loss": 0.4447, + "step": 27891 + }, + { + "epoch": 1.5618770299025648, + "grad_norm": 1.4054096937179565, + "learning_rate": 9.792657894736844e-05, + "loss": 0.5952, + "step": 27892 + }, + { + "epoch": 1.5619330272146938, + "grad_norm": 1.4433014392852783, + "learning_rate": 9.792631578947368e-05, + "loss": 0.4713, + "step": 27893 + }, + { + "epoch": 1.5619890245268229, + "grad_norm": 1.7865170240402222, + "learning_rate": 9.792605263157896e-05, + "loss": 0.4889, + "step": 27894 + }, + { + "epoch": 1.5620450218389519, + "grad_norm": 1.6442961692810059, + "learning_rate": 9.792578947368422e-05, + "loss": 0.3941, + "step": 27895 + }, + { + "epoch": 1.562101019151081, + "grad_norm": 1.4772812128067017, + "learning_rate": 9.792552631578948e-05, + "loss": 0.4298, + "step": 27896 + }, + { + "epoch": 1.56215701646321, + "grad_norm": 1.2540076971054077, + "learning_rate": 9.792526315789475e-05, + "loss": 0.381, + "step": 27897 + }, + { + "epoch": 1.562213013775339, + "grad_norm": 1.2921191453933716, + "learning_rate": 9.7925e-05, + "loss": 0.4336, + "step": 27898 + }, + { + "epoch": 1.562269011087468, + "grad_norm": 1.384541630744934, + "learning_rate": 
9.792473684210527e-05, + "loss": 0.5008, + "step": 27899 + }, + { + "epoch": 1.562325008399597, + "grad_norm": 1.2399932146072388, + "learning_rate": 9.792447368421053e-05, + "loss": 0.3815, + "step": 27900 + }, + { + "epoch": 1.562381005711726, + "grad_norm": 1.7059587240219116, + "learning_rate": 9.79242105263158e-05, + "loss": 0.4672, + "step": 27901 + }, + { + "epoch": 1.562437003023855, + "grad_norm": 1.260528564453125, + "learning_rate": 9.792394736842105e-05, + "loss": 0.4081, + "step": 27902 + }, + { + "epoch": 1.562493000335984, + "grad_norm": 1.2083096504211426, + "learning_rate": 9.792368421052632e-05, + "loss": 0.4603, + "step": 27903 + }, + { + "epoch": 1.5625489976481128, + "grad_norm": 1.1734106540679932, + "learning_rate": 9.792342105263158e-05, + "loss": 0.4077, + "step": 27904 + }, + { + "epoch": 1.5626049949602419, + "grad_norm": 1.3308879137039185, + "learning_rate": 9.792315789473685e-05, + "loss": 0.3871, + "step": 27905 + }, + { + "epoch": 1.5626609922723709, + "grad_norm": 1.4251302480697632, + "learning_rate": 9.792289473684211e-05, + "loss": 0.4728, + "step": 27906 + }, + { + "epoch": 1.5627169895845, + "grad_norm": 1.292154312133789, + "learning_rate": 9.792263157894737e-05, + "loss": 0.4261, + "step": 27907 + }, + { + "epoch": 1.562772986896629, + "grad_norm": 1.6118751764297485, + "learning_rate": 9.792236842105263e-05, + "loss": 0.462, + "step": 27908 + }, + { + "epoch": 1.562828984208758, + "grad_norm": 1.6858899593353271, + "learning_rate": 9.79221052631579e-05, + "loss": 0.5896, + "step": 27909 + }, + { + "epoch": 1.562884981520887, + "grad_norm": 1.938799262046814, + "learning_rate": 9.792184210526317e-05, + "loss": 0.5382, + "step": 27910 + }, + { + "epoch": 1.562940978833016, + "grad_norm": 1.291366457939148, + "learning_rate": 9.792157894736843e-05, + "loss": 0.4255, + "step": 27911 + }, + { + "epoch": 1.562996976145145, + "grad_norm": 1.2998098134994507, + "learning_rate": 9.792131578947369e-05, + "loss": 0.471, + "step": 27912 
+ }, + { + "epoch": 1.563052973457274, + "grad_norm": 1.3475925922393799, + "learning_rate": 9.792105263157895e-05, + "loss": 0.4834, + "step": 27913 + }, + { + "epoch": 1.563108970769403, + "grad_norm": 1.3017666339874268, + "learning_rate": 9.792078947368422e-05, + "loss": 0.5708, + "step": 27914 + }, + { + "epoch": 1.563164968081532, + "grad_norm": 1.2234461307525635, + "learning_rate": 9.792052631578948e-05, + "loss": 0.5419, + "step": 27915 + }, + { + "epoch": 1.563220965393661, + "grad_norm": 1.4304434061050415, + "learning_rate": 9.792026315789474e-05, + "loss": 0.4753, + "step": 27916 + }, + { + "epoch": 1.56327696270579, + "grad_norm": 1.1569468975067139, + "learning_rate": 9.792e-05, + "loss": 0.3657, + "step": 27917 + }, + { + "epoch": 1.563332960017919, + "grad_norm": 1.090216040611267, + "learning_rate": 9.791973684210527e-05, + "loss": 0.3583, + "step": 27918 + }, + { + "epoch": 1.5633889573300481, + "grad_norm": 1.2005150318145752, + "learning_rate": 9.791947368421053e-05, + "loss": 0.3683, + "step": 27919 + }, + { + "epoch": 1.5634449546421771, + "grad_norm": 1.5770893096923828, + "learning_rate": 9.791921052631579e-05, + "loss": 0.5088, + "step": 27920 + }, + { + "epoch": 1.5635009519543062, + "grad_norm": 1.634976863861084, + "learning_rate": 9.791894736842105e-05, + "loss": 0.5555, + "step": 27921 + }, + { + "epoch": 1.5635569492664352, + "grad_norm": 1.1582673788070679, + "learning_rate": 9.791868421052632e-05, + "loss": 0.4653, + "step": 27922 + }, + { + "epoch": 1.5636129465785642, + "grad_norm": 1.4435118436813354, + "learning_rate": 9.791842105263158e-05, + "loss": 0.4989, + "step": 27923 + }, + { + "epoch": 1.5636689438906932, + "grad_norm": 1.4571359157562256, + "learning_rate": 9.791815789473686e-05, + "loss": 0.4679, + "step": 27924 + }, + { + "epoch": 1.5637249412028222, + "grad_norm": 1.1578705310821533, + "learning_rate": 9.79178947368421e-05, + "loss": 0.4218, + "step": 27925 + }, + { + "epoch": 1.5637809385149513, + "grad_norm": 
1.4565349817276, + "learning_rate": 9.791763157894738e-05, + "loss": 0.4999, + "step": 27926 + }, + { + "epoch": 1.5638369358270803, + "grad_norm": 1.5129117965698242, + "learning_rate": 9.791736842105264e-05, + "loss": 0.4592, + "step": 27927 + }, + { + "epoch": 1.5638929331392093, + "grad_norm": 1.1397032737731934, + "learning_rate": 9.791710526315791e-05, + "loss": 0.447, + "step": 27928 + }, + { + "epoch": 1.5639489304513383, + "grad_norm": 1.3803645372390747, + "learning_rate": 9.791684210526317e-05, + "loss": 0.673, + "step": 27929 + }, + { + "epoch": 1.5640049277634673, + "grad_norm": 1.7738953828811646, + "learning_rate": 9.791657894736841e-05, + "loss": 0.5228, + "step": 27930 + }, + { + "epoch": 1.5640609250755964, + "grad_norm": 1.5509098768234253, + "learning_rate": 9.791631578947369e-05, + "loss": 0.4285, + "step": 27931 + }, + { + "epoch": 1.5641169223877254, + "grad_norm": 1.4631469249725342, + "learning_rate": 9.791605263157895e-05, + "loss": 0.5835, + "step": 27932 + }, + { + "epoch": 1.5641729196998544, + "grad_norm": 1.380316972732544, + "learning_rate": 9.791578947368422e-05, + "loss": 0.4579, + "step": 27933 + }, + { + "epoch": 1.5642289170119834, + "grad_norm": 1.1702483892440796, + "learning_rate": 9.791552631578948e-05, + "loss": 0.5154, + "step": 27934 + }, + { + "epoch": 1.5642849143241124, + "grad_norm": 1.2805495262145996, + "learning_rate": 9.791526315789474e-05, + "loss": 0.446, + "step": 27935 + }, + { + "epoch": 1.5643409116362414, + "grad_norm": 1.351359248161316, + "learning_rate": 9.7915e-05, + "loss": 0.5345, + "step": 27936 + }, + { + "epoch": 1.5643969089483705, + "grad_norm": 1.1455228328704834, + "learning_rate": 9.791473684210527e-05, + "loss": 0.3928, + "step": 27937 + }, + { + "epoch": 1.5644529062604995, + "grad_norm": 1.5278799533843994, + "learning_rate": 9.791447368421053e-05, + "loss": 0.5924, + "step": 27938 + }, + { + "epoch": 1.5645089035726285, + "grad_norm": 1.5728360414505005, + "learning_rate": 
9.791421052631579e-05, + "loss": 0.4548, + "step": 27939 + }, + { + "epoch": 1.5645649008847575, + "grad_norm": 1.5662719011306763, + "learning_rate": 9.791394736842105e-05, + "loss": 0.4648, + "step": 27940 + }, + { + "epoch": 1.5646208981968865, + "grad_norm": 1.1243435144424438, + "learning_rate": 9.791368421052633e-05, + "loss": 0.339, + "step": 27941 + }, + { + "epoch": 1.5646768955090156, + "grad_norm": 1.3633006811141968, + "learning_rate": 9.791342105263159e-05, + "loss": 0.4912, + "step": 27942 + }, + { + "epoch": 1.5647328928211446, + "grad_norm": 1.3593796491622925, + "learning_rate": 9.791315789473685e-05, + "loss": 0.5445, + "step": 27943 + }, + { + "epoch": 1.5647888901332736, + "grad_norm": 1.676698923110962, + "learning_rate": 9.79128947368421e-05, + "loss": 0.4787, + "step": 27944 + }, + { + "epoch": 1.5648448874454026, + "grad_norm": 1.312387228012085, + "learning_rate": 9.791263157894738e-05, + "loss": 0.5799, + "step": 27945 + }, + { + "epoch": 1.5649008847575316, + "grad_norm": 1.405980110168457, + "learning_rate": 9.791236842105264e-05, + "loss": 0.6163, + "step": 27946 + }, + { + "epoch": 1.5649568820696607, + "grad_norm": 1.2081812620162964, + "learning_rate": 9.79121052631579e-05, + "loss": 0.4004, + "step": 27947 + }, + { + "epoch": 1.5650128793817897, + "grad_norm": 1.5452519655227661, + "learning_rate": 9.791184210526316e-05, + "loss": 0.3953, + "step": 27948 + }, + { + "epoch": 1.5650688766939187, + "grad_norm": 1.3677908182144165, + "learning_rate": 9.791157894736842e-05, + "loss": 0.474, + "step": 27949 + }, + { + "epoch": 1.5651248740060477, + "grad_norm": 1.3281514644622803, + "learning_rate": 9.791131578947369e-05, + "loss": 0.3966, + "step": 27950 + }, + { + "epoch": 1.5651808713181767, + "grad_norm": 1.2376335859298706, + "learning_rate": 9.791105263157895e-05, + "loss": 0.4162, + "step": 27951 + }, + { + "epoch": 1.5652368686303058, + "grad_norm": 1.2949895858764648, + "learning_rate": 9.791078947368422e-05, + "loss": 0.4646, + 
"step": 27952 + }, + { + "epoch": 1.5652928659424348, + "grad_norm": 1.4561299085617065, + "learning_rate": 9.791052631578947e-05, + "loss": 0.4796, + "step": 27953 + }, + { + "epoch": 1.5653488632545638, + "grad_norm": 1.5170055627822876, + "learning_rate": 9.791026315789474e-05, + "loss": 0.5016, + "step": 27954 + }, + { + "epoch": 1.5654048605666928, + "grad_norm": 2.710172176361084, + "learning_rate": 9.791e-05, + "loss": 0.3683, + "step": 27955 + }, + { + "epoch": 1.5654608578788218, + "grad_norm": 1.7364965677261353, + "learning_rate": 9.790973684210528e-05, + "loss": 0.541, + "step": 27956 + }, + { + "epoch": 1.5655168551909509, + "grad_norm": 1.2130730152130127, + "learning_rate": 9.790947368421052e-05, + "loss": 0.4467, + "step": 27957 + }, + { + "epoch": 1.5655728525030799, + "grad_norm": 1.2386354207992554, + "learning_rate": 9.79092105263158e-05, + "loss": 0.4314, + "step": 27958 + }, + { + "epoch": 1.565628849815209, + "grad_norm": 1.3954352140426636, + "learning_rate": 9.790894736842106e-05, + "loss": 0.4406, + "step": 27959 + }, + { + "epoch": 1.565684847127338, + "grad_norm": 1.4999171495437622, + "learning_rate": 9.790868421052633e-05, + "loss": 0.505, + "step": 27960 + }, + { + "epoch": 1.565740844439467, + "grad_norm": 1.12770676612854, + "learning_rate": 9.790842105263159e-05, + "loss": 0.4304, + "step": 27961 + }, + { + "epoch": 1.565796841751596, + "grad_norm": 1.4203526973724365, + "learning_rate": 9.790815789473685e-05, + "loss": 0.522, + "step": 27962 + }, + { + "epoch": 1.565852839063725, + "grad_norm": 1.2756706476211548, + "learning_rate": 9.790789473684211e-05, + "loss": 0.4641, + "step": 27963 + }, + { + "epoch": 1.565908836375854, + "grad_norm": 1.3431830406188965, + "learning_rate": 9.790763157894738e-05, + "loss": 0.375, + "step": 27964 + }, + { + "epoch": 1.565964833687983, + "grad_norm": 1.577451467514038, + "learning_rate": 9.790736842105264e-05, + "loss": 0.439, + "step": 27965 + }, + { + "epoch": 1.566020831000112, + 
"grad_norm": 1.377826452255249, + "learning_rate": 9.79071052631579e-05, + "loss": 0.5604, + "step": 27966 + }, + { + "epoch": 1.566076828312241, + "grad_norm": 1.374342679977417, + "learning_rate": 9.790684210526316e-05, + "loss": 0.4538, + "step": 27967 + }, + { + "epoch": 1.56613282562437, + "grad_norm": 1.2720531225204468, + "learning_rate": 9.790657894736842e-05, + "loss": 0.3871, + "step": 27968 + }, + { + "epoch": 1.566188822936499, + "grad_norm": 1.3962714672088623, + "learning_rate": 9.790631578947369e-05, + "loss": 0.5448, + "step": 27969 + }, + { + "epoch": 1.566244820248628, + "grad_norm": 1.2228339910507202, + "learning_rate": 9.790605263157895e-05, + "loss": 0.3319, + "step": 27970 + }, + { + "epoch": 1.5663008175607571, + "grad_norm": 1.1670565605163574, + "learning_rate": 9.790578947368421e-05, + "loss": 0.5092, + "step": 27971 + }, + { + "epoch": 1.5663568148728861, + "grad_norm": 1.0824604034423828, + "learning_rate": 9.790552631578947e-05, + "loss": 0.3979, + "step": 27972 + }, + { + "epoch": 1.5664128121850152, + "grad_norm": 1.3282324075698853, + "learning_rate": 9.790526315789475e-05, + "loss": 0.4293, + "step": 27973 + }, + { + "epoch": 1.5664688094971442, + "grad_norm": 1.1624890565872192, + "learning_rate": 9.7905e-05, + "loss": 0.4407, + "step": 27974 + }, + { + "epoch": 1.5665248068092732, + "grad_norm": 1.3137606382369995, + "learning_rate": 9.790473684210527e-05, + "loss": 0.4876, + "step": 27975 + }, + { + "epoch": 1.5665808041214022, + "grad_norm": 1.2371691465377808, + "learning_rate": 9.790447368421052e-05, + "loss": 0.4415, + "step": 27976 + }, + { + "epoch": 1.5666368014335312, + "grad_norm": 1.12775719165802, + "learning_rate": 9.79042105263158e-05, + "loss": 0.3587, + "step": 27977 + }, + { + "epoch": 1.5666927987456603, + "grad_norm": 1.0950857400894165, + "learning_rate": 9.790394736842106e-05, + "loss": 0.3711, + "step": 27978 + }, + { + "epoch": 1.5667487960577893, + "grad_norm": 1.4122695922851562, + "learning_rate": 
9.790368421052633e-05, + "loss": 0.6856, + "step": 27979 + }, + { + "epoch": 1.5668047933699183, + "grad_norm": 1.40877103805542, + "learning_rate": 9.790342105263158e-05, + "loss": 0.6257, + "step": 27980 + }, + { + "epoch": 1.5668607906820473, + "grad_norm": 1.2427443265914917, + "learning_rate": 9.790315789473685e-05, + "loss": 0.3701, + "step": 27981 + }, + { + "epoch": 1.5669167879941763, + "grad_norm": 1.1560802459716797, + "learning_rate": 9.790289473684211e-05, + "loss": 0.435, + "step": 27982 + }, + { + "epoch": 1.5669727853063053, + "grad_norm": 1.0992628335952759, + "learning_rate": 9.790263157894737e-05, + "loss": 0.2943, + "step": 27983 + }, + { + "epoch": 1.5670287826184344, + "grad_norm": 1.3091014623641968, + "learning_rate": 9.790236842105264e-05, + "loss": 0.4956, + "step": 27984 + }, + { + "epoch": 1.5670847799305634, + "grad_norm": 1.0343049764633179, + "learning_rate": 9.790210526315789e-05, + "loss": 0.3894, + "step": 27985 + }, + { + "epoch": 1.5671407772426922, + "grad_norm": 1.20504891872406, + "learning_rate": 9.790184210526316e-05, + "loss": 0.3722, + "step": 27986 + }, + { + "epoch": 1.5671967745548212, + "grad_norm": 1.3490113019943237, + "learning_rate": 9.790157894736842e-05, + "loss": 0.4699, + "step": 27987 + }, + { + "epoch": 1.5672527718669502, + "grad_norm": 1.3986599445343018, + "learning_rate": 9.79013157894737e-05, + "loss": 0.4752, + "step": 27988 + }, + { + "epoch": 1.5673087691790792, + "grad_norm": 4.4811224937438965, + "learning_rate": 9.790105263157896e-05, + "loss": 0.4955, + "step": 27989 + }, + { + "epoch": 1.5673647664912083, + "grad_norm": 1.4624607563018799, + "learning_rate": 9.790078947368422e-05, + "loss": 0.5015, + "step": 27990 + }, + { + "epoch": 1.5674207638033373, + "grad_norm": 1.1344993114471436, + "learning_rate": 9.790052631578947e-05, + "loss": 0.4068, + "step": 27991 + }, + { + "epoch": 1.5674767611154663, + "grad_norm": 1.3949861526489258, + "learning_rate": 9.790026315789475e-05, + "loss": 0.4201, + 
"step": 27992 + }, + { + "epoch": 1.5675327584275953, + "grad_norm": 1.5104824304580688, + "learning_rate": 9.790000000000001e-05, + "loss": 0.3837, + "step": 27993 + }, + { + "epoch": 1.5675887557397243, + "grad_norm": 1.2982854843139648, + "learning_rate": 9.789973684210527e-05, + "loss": 0.4254, + "step": 27994 + }, + { + "epoch": 1.5676447530518534, + "grad_norm": 1.2567329406738281, + "learning_rate": 9.789947368421053e-05, + "loss": 0.4193, + "step": 27995 + }, + { + "epoch": 1.5677007503639824, + "grad_norm": 1.7564815282821655, + "learning_rate": 9.78992105263158e-05, + "loss": 0.6307, + "step": 27996 + }, + { + "epoch": 1.5677567476761114, + "grad_norm": 1.3201968669891357, + "learning_rate": 9.789894736842106e-05, + "loss": 0.412, + "step": 27997 + }, + { + "epoch": 1.5678127449882404, + "grad_norm": 1.2468838691711426, + "learning_rate": 9.789868421052632e-05, + "loss": 0.3827, + "step": 27998 + }, + { + "epoch": 1.5678687423003694, + "grad_norm": 1.3903758525848389, + "learning_rate": 9.789842105263158e-05, + "loss": 0.3819, + "step": 27999 + }, + { + "epoch": 1.5679247396124985, + "grad_norm": 1.143258810043335, + "learning_rate": 9.789815789473684e-05, + "loss": 0.2986, + "step": 28000 + }, + { + "epoch": 1.5679807369246275, + "grad_norm": 1.330857515335083, + "learning_rate": 9.789789473684211e-05, + "loss": 0.4789, + "step": 28001 + }, + { + "epoch": 1.5680367342367565, + "grad_norm": 1.2843005657196045, + "learning_rate": 9.789763157894737e-05, + "loss": 0.4213, + "step": 28002 + }, + { + "epoch": 1.5680927315488855, + "grad_norm": 1.3903347253799438, + "learning_rate": 9.789736842105263e-05, + "loss": 0.5385, + "step": 28003 + }, + { + "epoch": 1.5681487288610145, + "grad_norm": 1.4101808071136475, + "learning_rate": 9.789710526315789e-05, + "loss": 0.4919, + "step": 28004 + }, + { + "epoch": 1.5682047261731435, + "grad_norm": 1.4195345640182495, + "learning_rate": 9.789684210526317e-05, + "loss": 0.3698, + "step": 28005 + }, + { + "epoch": 
1.5682607234852726, + "grad_norm": 1.22687828540802, + "learning_rate": 9.789657894736843e-05, + "loss": 0.3312, + "step": 28006 + }, + { + "epoch": 1.5683167207974016, + "grad_norm": 1.58999502658844, + "learning_rate": 9.789631578947368e-05, + "loss": 0.521, + "step": 28007 + }, + { + "epoch": 1.5683727181095306, + "grad_norm": 1.1324678659439087, + "learning_rate": 9.789605263157894e-05, + "loss": 0.421, + "step": 28008 + }, + { + "epoch": 1.5684287154216596, + "grad_norm": 1.395676612854004, + "learning_rate": 9.789578947368422e-05, + "loss": 0.5193, + "step": 28009 + }, + { + "epoch": 1.5684847127337886, + "grad_norm": 1.327857255935669, + "learning_rate": 9.789552631578948e-05, + "loss": 0.5392, + "step": 28010 + }, + { + "epoch": 1.5685407100459177, + "grad_norm": 1.1735122203826904, + "learning_rate": 9.789526315789475e-05, + "loss": 0.4637, + "step": 28011 + }, + { + "epoch": 1.5685967073580467, + "grad_norm": 1.3529436588287354, + "learning_rate": 9.7895e-05, + "loss": 0.4668, + "step": 28012 + }, + { + "epoch": 1.5686527046701757, + "grad_norm": 1.3784637451171875, + "learning_rate": 9.789473684210527e-05, + "loss": 0.4643, + "step": 28013 + }, + { + "epoch": 1.5687087019823047, + "grad_norm": 1.3789427280426025, + "learning_rate": 9.789447368421053e-05, + "loss": 0.4272, + "step": 28014 + }, + { + "epoch": 1.5687646992944337, + "grad_norm": 1.415741205215454, + "learning_rate": 9.78942105263158e-05, + "loss": 0.4292, + "step": 28015 + }, + { + "epoch": 1.5688206966065628, + "grad_norm": 1.377379298210144, + "learning_rate": 9.789394736842106e-05, + "loss": 0.4687, + "step": 28016 + }, + { + "epoch": 1.5688766939186918, + "grad_norm": 1.4200210571289062, + "learning_rate": 9.789368421052631e-05, + "loss": 0.4575, + "step": 28017 + }, + { + "epoch": 1.5689326912308208, + "grad_norm": 1.5323363542556763, + "learning_rate": 9.789342105263158e-05, + "loss": 0.4414, + "step": 28018 + }, + { + "epoch": 1.5689886885429498, + "grad_norm": 1.495772123336792, + 
"learning_rate": 9.789315789473684e-05, + "loss": 0.4087, + "step": 28019 + }, + { + "epoch": 1.5690446858550788, + "grad_norm": 1.220541000366211, + "learning_rate": 9.789289473684212e-05, + "loss": 0.4097, + "step": 28020 + }, + { + "epoch": 1.5691006831672079, + "grad_norm": 1.581607460975647, + "learning_rate": 9.789263157894738e-05, + "loss": 0.418, + "step": 28021 + }, + { + "epoch": 1.5691566804793369, + "grad_norm": 1.676621437072754, + "learning_rate": 9.789236842105263e-05, + "loss": 0.4839, + "step": 28022 + }, + { + "epoch": 1.569212677791466, + "grad_norm": 1.236293911933899, + "learning_rate": 9.78921052631579e-05, + "loss": 0.3658, + "step": 28023 + }, + { + "epoch": 1.569268675103595, + "grad_norm": 1.4252465963363647, + "learning_rate": 9.789184210526317e-05, + "loss": 0.5402, + "step": 28024 + }, + { + "epoch": 1.569324672415724, + "grad_norm": 1.4142038822174072, + "learning_rate": 9.789157894736843e-05, + "loss": 0.3963, + "step": 28025 + }, + { + "epoch": 1.569380669727853, + "grad_norm": 1.4917858839035034, + "learning_rate": 9.789131578947369e-05, + "loss": 0.4539, + "step": 28026 + }, + { + "epoch": 1.569436667039982, + "grad_norm": 1.422104835510254, + "learning_rate": 9.789105263157895e-05, + "loss": 0.5467, + "step": 28027 + }, + { + "epoch": 1.569492664352111, + "grad_norm": 1.6085867881774902, + "learning_rate": 9.789078947368422e-05, + "loss": 0.6167, + "step": 28028 + }, + { + "epoch": 1.56954866166424, + "grad_norm": 1.2367597818374634, + "learning_rate": 9.789052631578948e-05, + "loss": 0.4233, + "step": 28029 + }, + { + "epoch": 1.569604658976369, + "grad_norm": 1.2459625005722046, + "learning_rate": 9.789026315789474e-05, + "loss": 0.4643, + "step": 28030 + }, + { + "epoch": 1.569660656288498, + "grad_norm": 1.1102368831634521, + "learning_rate": 9.789e-05, + "loss": 0.4502, + "step": 28031 + }, + { + "epoch": 1.569716653600627, + "grad_norm": 1.150658130645752, + "learning_rate": 9.788973684210527e-05, + "loss": 0.3848, + "step": 
28032 + }, + { + "epoch": 1.569772650912756, + "grad_norm": 1.266994595527649, + "learning_rate": 9.788947368421053e-05, + "loss": 0.3786, + "step": 28033 + }, + { + "epoch": 1.569828648224885, + "grad_norm": 1.4353046417236328, + "learning_rate": 9.788921052631579e-05, + "loss": 0.4771, + "step": 28034 + }, + { + "epoch": 1.5698846455370141, + "grad_norm": 1.238640308380127, + "learning_rate": 9.788894736842105e-05, + "loss": 0.519, + "step": 28035 + }, + { + "epoch": 1.5699406428491431, + "grad_norm": 1.2137643098831177, + "learning_rate": 9.788868421052631e-05, + "loss": 0.4772, + "step": 28036 + }, + { + "epoch": 1.5699966401612722, + "grad_norm": 1.268845558166504, + "learning_rate": 9.788842105263159e-05, + "loss": 0.424, + "step": 28037 + }, + { + "epoch": 1.5700526374734012, + "grad_norm": 1.2281060218811035, + "learning_rate": 9.788815789473684e-05, + "loss": 0.4479, + "step": 28038 + }, + { + "epoch": 1.5701086347855302, + "grad_norm": 4.048982620239258, + "learning_rate": 9.788789473684212e-05, + "loss": 0.4112, + "step": 28039 + }, + { + "epoch": 1.5701646320976592, + "grad_norm": 1.7420209646224976, + "learning_rate": 9.788763157894736e-05, + "loss": 0.7577, + "step": 28040 + }, + { + "epoch": 1.5702206294097882, + "grad_norm": 1.2172189950942993, + "learning_rate": 9.788736842105264e-05, + "loss": 0.3898, + "step": 28041 + }, + { + "epoch": 1.5702766267219173, + "grad_norm": 1.5058587789535522, + "learning_rate": 9.78871052631579e-05, + "loss": 0.4789, + "step": 28042 + }, + { + "epoch": 1.5703326240340463, + "grad_norm": 1.1051394939422607, + "learning_rate": 9.788684210526317e-05, + "loss": 0.3422, + "step": 28043 + }, + { + "epoch": 1.5703886213461753, + "grad_norm": 1.3431580066680908, + "learning_rate": 9.788657894736843e-05, + "loss": 0.5644, + "step": 28044 + }, + { + "epoch": 1.5704446186583043, + "grad_norm": 1.3015185594558716, + "learning_rate": 9.788631578947369e-05, + "loss": 0.4857, + "step": 28045 + }, + { + "epoch": 1.5705006159704333, 
+ "grad_norm": 1.2908891439437866, + "learning_rate": 9.788605263157895e-05, + "loss": 0.3975, + "step": 28046 + }, + { + "epoch": 1.5705566132825624, + "grad_norm": 1.5780909061431885, + "learning_rate": 9.788578947368422e-05, + "loss": 0.4592, + "step": 28047 + }, + { + "epoch": 1.5706126105946914, + "grad_norm": 1.4913671016693115, + "learning_rate": 9.788552631578948e-05, + "loss": 0.6319, + "step": 28048 + }, + { + "epoch": 1.5706686079068204, + "grad_norm": 1.4054170846939087, + "learning_rate": 9.788526315789474e-05, + "loss": 0.5016, + "step": 28049 + }, + { + "epoch": 1.5707246052189494, + "grad_norm": 1.1858580112457275, + "learning_rate": 9.7885e-05, + "loss": 0.4828, + "step": 28050 + }, + { + "epoch": 1.5707806025310784, + "grad_norm": 1.3672168254852295, + "learning_rate": 9.788473684210526e-05, + "loss": 0.3784, + "step": 28051 + }, + { + "epoch": 1.5708365998432074, + "grad_norm": 1.2569587230682373, + "learning_rate": 9.788447368421054e-05, + "loss": 0.4334, + "step": 28052 + }, + { + "epoch": 1.5708925971553365, + "grad_norm": 1.2256430387496948, + "learning_rate": 9.78842105263158e-05, + "loss": 0.4234, + "step": 28053 + }, + { + "epoch": 1.5709485944674655, + "grad_norm": 1.0204970836639404, + "learning_rate": 9.788394736842105e-05, + "loss": 0.3367, + "step": 28054 + }, + { + "epoch": 1.5710045917795945, + "grad_norm": 1.4459164142608643, + "learning_rate": 9.788368421052631e-05, + "loss": 0.4671, + "step": 28055 + }, + { + "epoch": 1.5710605890917235, + "grad_norm": 1.5175567865371704, + "learning_rate": 9.788342105263159e-05, + "loss": 0.5397, + "step": 28056 + }, + { + "epoch": 1.5711165864038525, + "grad_norm": 1.364989161491394, + "learning_rate": 9.788315789473685e-05, + "loss": 0.483, + "step": 28057 + }, + { + "epoch": 1.5711725837159816, + "grad_norm": 1.1792104244232178, + "learning_rate": 9.788289473684211e-05, + "loss": 0.352, + "step": 28058 + }, + { + "epoch": 1.5712285810281106, + "grad_norm": 1.5198942422866821, + 
"learning_rate": 9.788263157894737e-05, + "loss": 0.4844, + "step": 28059 + }, + { + "epoch": 1.5712845783402396, + "grad_norm": 1.224273681640625, + "learning_rate": 9.788236842105264e-05, + "loss": 0.4047, + "step": 28060 + }, + { + "epoch": 1.5713405756523686, + "grad_norm": 1.302113652229309, + "learning_rate": 9.78821052631579e-05, + "loss": 0.4535, + "step": 28061 + }, + { + "epoch": 1.5713965729644976, + "grad_norm": 1.559851050376892, + "learning_rate": 9.788184210526316e-05, + "loss": 0.4636, + "step": 28062 + }, + { + "epoch": 1.5714525702766267, + "grad_norm": 1.35964834690094, + "learning_rate": 9.788157894736842e-05, + "loss": 0.5024, + "step": 28063 + }, + { + "epoch": 1.5715085675887557, + "grad_norm": 1.6735707521438599, + "learning_rate": 9.788131578947369e-05, + "loss": 0.4968, + "step": 28064 + }, + { + "epoch": 1.5715645649008847, + "grad_norm": 1.1186984777450562, + "learning_rate": 9.788105263157895e-05, + "loss": 0.3354, + "step": 28065 + }, + { + "epoch": 1.5716205622130137, + "grad_norm": 1.3679900169372559, + "learning_rate": 9.788078947368423e-05, + "loss": 0.3556, + "step": 28066 + }, + { + "epoch": 1.5716765595251427, + "grad_norm": 1.2580618858337402, + "learning_rate": 9.788052631578947e-05, + "loss": 0.466, + "step": 28067 + }, + { + "epoch": 1.5717325568372718, + "grad_norm": 1.267106056213379, + "learning_rate": 9.788026315789473e-05, + "loss": 0.4127, + "step": 28068 + }, + { + "epoch": 1.5717885541494008, + "grad_norm": 3.595611810684204, + "learning_rate": 9.788e-05, + "loss": 0.5909, + "step": 28069 + }, + { + "epoch": 1.5718445514615298, + "grad_norm": NaN, + "learning_rate": 9.788e-05, + "loss": 0.599, + "step": 28070 + }, + { + "epoch": 1.5719005487736588, + "grad_norm": 1.5328335762023926, + "learning_rate": 9.787973684210526e-05, + "loss": 0.4543, + "step": 28071 + }, + { + "epoch": 1.5719565460857878, + "grad_norm": 1.3158411979675293, + "learning_rate": 9.787947368421054e-05, + "loss": 0.488, + "step": 28072 + }, + { + 
"epoch": 1.5720125433979169, + "grad_norm": 2.418255090713501, + "learning_rate": 9.787921052631578e-05, + "loss": 0.5711, + "step": 28073 + }, + { + "epoch": 1.5720685407100459, + "grad_norm": 1.469887614250183, + "learning_rate": 9.787894736842106e-05, + "loss": 0.5755, + "step": 28074 + }, + { + "epoch": 1.5721245380221749, + "grad_norm": 1.3137468099594116, + "learning_rate": 9.787868421052632e-05, + "loss": 0.3472, + "step": 28075 + }, + { + "epoch": 1.572180535334304, + "grad_norm": 1.1552168130874634, + "learning_rate": 9.787842105263159e-05, + "loss": 0.5308, + "step": 28076 + }, + { + "epoch": 1.572236532646433, + "grad_norm": 1.41217839717865, + "learning_rate": 9.787815789473685e-05, + "loss": 0.3818, + "step": 28077 + }, + { + "epoch": 1.572292529958562, + "grad_norm": 1.1353222131729126, + "learning_rate": 9.787789473684211e-05, + "loss": 0.3699, + "step": 28078 + }, + { + "epoch": 1.572348527270691, + "grad_norm": 1.1848223209381104, + "learning_rate": 9.787763157894737e-05, + "loss": 0.5062, + "step": 28079 + }, + { + "epoch": 1.57240452458282, + "grad_norm": 1.1055452823638916, + "learning_rate": 9.787736842105264e-05, + "loss": 0.3496, + "step": 28080 + }, + { + "epoch": 1.572460521894949, + "grad_norm": 1.4840240478515625, + "learning_rate": 9.78771052631579e-05, + "loss": 0.4124, + "step": 28081 + }, + { + "epoch": 1.572516519207078, + "grad_norm": 1.1247225999832153, + "learning_rate": 9.787684210526316e-05, + "loss": 0.5187, + "step": 28082 + }, + { + "epoch": 1.572572516519207, + "grad_norm": 1.0302581787109375, + "learning_rate": 9.787657894736842e-05, + "loss": 0.412, + "step": 28083 + }, + { + "epoch": 1.572628513831336, + "grad_norm": 1.3668094873428345, + "learning_rate": 9.78763157894737e-05, + "loss": 0.4253, + "step": 28084 + }, + { + "epoch": 1.572684511143465, + "grad_norm": 1.3305730819702148, + "learning_rate": 9.787605263157895e-05, + "loss": 0.5992, + "step": 28085 + }, + { + "epoch": 1.572740508455594, + "grad_norm": 
1.515182375907898, + "learning_rate": 9.787578947368421e-05, + "loss": 0.5052, + "step": 28086 + }, + { + "epoch": 1.5727965057677231, + "grad_norm": 1.3288472890853882, + "learning_rate": 9.787552631578947e-05, + "loss": 0.4967, + "step": 28087 + }, + { + "epoch": 1.5728525030798521, + "grad_norm": 1.110041856765747, + "learning_rate": 9.787526315789473e-05, + "loss": 0.3674, + "step": 28088 + }, + { + "epoch": 1.5729085003919812, + "grad_norm": 1.2525770664215088, + "learning_rate": 9.787500000000001e-05, + "loss": 0.414, + "step": 28089 + }, + { + "epoch": 1.5729644977041102, + "grad_norm": 1.2663404941558838, + "learning_rate": 9.787473684210527e-05, + "loss": 0.3943, + "step": 28090 + }, + { + "epoch": 1.5730204950162392, + "grad_norm": 1.105630874633789, + "learning_rate": 9.787447368421053e-05, + "loss": 0.3995, + "step": 28091 + }, + { + "epoch": 1.5730764923283682, + "grad_norm": 1.1396989822387695, + "learning_rate": 9.787421052631579e-05, + "loss": 0.4185, + "step": 28092 + }, + { + "epoch": 1.5731324896404972, + "grad_norm": 1.2848345041275024, + "learning_rate": 9.787394736842106e-05, + "loss": 0.4938, + "step": 28093 + }, + { + "epoch": 1.5731884869526263, + "grad_norm": 1.0787601470947266, + "learning_rate": 9.787368421052632e-05, + "loss": 0.4144, + "step": 28094 + }, + { + "epoch": 1.5732444842647553, + "grad_norm": 1.9113361835479736, + "learning_rate": 9.787342105263159e-05, + "loss": 0.6329, + "step": 28095 + }, + { + "epoch": 1.5733004815768843, + "grad_norm": 1.2810189723968506, + "learning_rate": 9.787315789473684e-05, + "loss": 0.4292, + "step": 28096 + }, + { + "epoch": 1.5733564788890133, + "grad_norm": 1.7841038703918457, + "learning_rate": 9.787289473684211e-05, + "loss": 0.448, + "step": 28097 + }, + { + "epoch": 1.5734124762011423, + "grad_norm": 1.3965585231781006, + "learning_rate": 9.787263157894737e-05, + "loss": 0.4491, + "step": 28098 + }, + { + "epoch": 1.5734684735132713, + "grad_norm": 1.3444855213165283, + "learning_rate": 
9.787236842105265e-05, + "loss": 0.4838, + "step": 28099 + }, + { + "epoch": 1.5735244708254004, + "grad_norm": 1.3435307741165161, + "learning_rate": 9.78721052631579e-05, + "loss": 0.5611, + "step": 28100 + }, + { + "epoch": 1.5735804681375294, + "grad_norm": 1.0784518718719482, + "learning_rate": 9.787184210526316e-05, + "loss": 0.3621, + "step": 28101 + }, + { + "epoch": 1.5736364654496584, + "grad_norm": 1.5495529174804688, + "learning_rate": 9.787157894736842e-05, + "loss": 0.5277, + "step": 28102 + }, + { + "epoch": 1.5736924627617874, + "grad_norm": 2.4598546028137207, + "learning_rate": 9.78713157894737e-05, + "loss": 0.5536, + "step": 28103 + }, + { + "epoch": 1.5737484600739164, + "grad_norm": 1.3591283559799194, + "learning_rate": 9.787105263157896e-05, + "loss": 0.5098, + "step": 28104 + }, + { + "epoch": 1.5738044573860455, + "grad_norm": 1.3528375625610352, + "learning_rate": 9.78707894736842e-05, + "loss": 0.3944, + "step": 28105 + }, + { + "epoch": 1.5738604546981745, + "grad_norm": 1.5302882194519043, + "learning_rate": 9.787052631578948e-05, + "loss": 0.4248, + "step": 28106 + }, + { + "epoch": 1.5739164520103035, + "grad_norm": 2.221705675125122, + "learning_rate": 9.787026315789474e-05, + "loss": 0.5651, + "step": 28107 + }, + { + "epoch": 1.5739724493224325, + "grad_norm": 1.2829450368881226, + "learning_rate": 9.787000000000001e-05, + "loss": 0.4317, + "step": 28108 + }, + { + "epoch": 1.5740284466345615, + "grad_norm": 1.266735315322876, + "learning_rate": 9.786973684210527e-05, + "loss": 0.4263, + "step": 28109 + }, + { + "epoch": 1.5740844439466906, + "grad_norm": 1.3932968378067017, + "learning_rate": 9.786947368421053e-05, + "loss": 0.4782, + "step": 28110 + }, + { + "epoch": 1.5741404412588196, + "grad_norm": 1.2708779573440552, + "learning_rate": 9.786921052631579e-05, + "loss": 0.4157, + "step": 28111 + }, + { + "epoch": 1.5741964385709486, + "grad_norm": 1.2368707656860352, + "learning_rate": 9.786894736842106e-05, + "loss": 0.4547, 
+ "step": 28112 + }, + { + "epoch": 1.5742524358830776, + "grad_norm": 1.4338250160217285, + "learning_rate": 9.786868421052632e-05, + "loss": 0.3706, + "step": 28113 + }, + { + "epoch": 1.5743084331952066, + "grad_norm": 1.2648639678955078, + "learning_rate": 9.786842105263158e-05, + "loss": 0.3937, + "step": 28114 + }, + { + "epoch": 1.5743644305073357, + "grad_norm": 1.6365704536437988, + "learning_rate": 9.786815789473684e-05, + "loss": 0.5993, + "step": 28115 + }, + { + "epoch": 1.5744204278194647, + "grad_norm": 1.3124196529388428, + "learning_rate": 9.786789473684211e-05, + "loss": 0.4459, + "step": 28116 + }, + { + "epoch": 1.5744764251315937, + "grad_norm": 1.3489229679107666, + "learning_rate": 9.786763157894737e-05, + "loss": 0.4817, + "step": 28117 + }, + { + "epoch": 1.5745324224437227, + "grad_norm": 1.2093262672424316, + "learning_rate": 9.786736842105263e-05, + "loss": 0.413, + "step": 28118 + }, + { + "epoch": 1.5745884197558517, + "grad_norm": 1.512413501739502, + "learning_rate": 9.78671052631579e-05, + "loss": 0.5071, + "step": 28119 + }, + { + "epoch": 1.5746444170679808, + "grad_norm": 1.3144752979278564, + "learning_rate": 9.786684210526317e-05, + "loss": 0.5384, + "step": 28120 + }, + { + "epoch": 1.5747004143801098, + "grad_norm": 1.8020782470703125, + "learning_rate": 9.786657894736843e-05, + "loss": 0.4502, + "step": 28121 + }, + { + "epoch": 1.5747564116922388, + "grad_norm": 1.2363632917404175, + "learning_rate": 9.786631578947369e-05, + "loss": 0.3991, + "step": 28122 + }, + { + "epoch": 1.5748124090043678, + "grad_norm": 1.388232946395874, + "learning_rate": 9.786605263157895e-05, + "loss": 0.5205, + "step": 28123 + }, + { + "epoch": 1.5748684063164968, + "grad_norm": 11.61746883392334, + "learning_rate": 9.78657894736842e-05, + "loss": 0.6034, + "step": 28124 + }, + { + "epoch": 1.5749244036286258, + "grad_norm": 1.5589476823806763, + "learning_rate": 9.786552631578948e-05, + "loss": 0.7174, + "step": 28125 + }, + { + "epoch": 
1.5749804009407549, + "grad_norm": 1.4372823238372803, + "learning_rate": 9.786526315789474e-05, + "loss": 0.5682, + "step": 28126 + }, + { + "epoch": 1.5750363982528839, + "grad_norm": 1.3344392776489258, + "learning_rate": 9.786500000000001e-05, + "loss": 0.485, + "step": 28127 + }, + { + "epoch": 1.575092395565013, + "grad_norm": 1.3974255323410034, + "learning_rate": 9.786473684210526e-05, + "loss": 0.4426, + "step": 28128 + }, + { + "epoch": 1.575148392877142, + "grad_norm": 1.2909753322601318, + "learning_rate": 9.786447368421053e-05, + "loss": 0.4372, + "step": 28129 + }, + { + "epoch": 1.575204390189271, + "grad_norm": 1.6748439073562622, + "learning_rate": 9.786421052631579e-05, + "loss": 0.4385, + "step": 28130 + }, + { + "epoch": 1.5752603875014, + "grad_norm": 1.1811232566833496, + "learning_rate": 9.786394736842106e-05, + "loss": 0.4178, + "step": 28131 + }, + { + "epoch": 1.575316384813529, + "grad_norm": 1.3954200744628906, + "learning_rate": 9.786368421052632e-05, + "loss": 0.4656, + "step": 28132 + }, + { + "epoch": 1.575372382125658, + "grad_norm": 1.1845266819000244, + "learning_rate": 9.786342105263158e-05, + "loss": 0.4493, + "step": 28133 + }, + { + "epoch": 1.575428379437787, + "grad_norm": 1.4437274932861328, + "learning_rate": 9.786315789473684e-05, + "loss": 0.5061, + "step": 28134 + }, + { + "epoch": 1.575484376749916, + "grad_norm": 1.2547589540481567, + "learning_rate": 9.786289473684212e-05, + "loss": 0.3944, + "step": 28135 + }, + { + "epoch": 1.575540374062045, + "grad_norm": 1.269227385520935, + "learning_rate": 9.786263157894738e-05, + "loss": 0.3511, + "step": 28136 + }, + { + "epoch": 1.575596371374174, + "grad_norm": 1.3924602270126343, + "learning_rate": 9.786236842105264e-05, + "loss": 0.4636, + "step": 28137 + }, + { + "epoch": 1.575652368686303, + "grad_norm": 1.0934702157974243, + "learning_rate": 9.78621052631579e-05, + "loss": 0.3141, + "step": 28138 + }, + { + "epoch": 1.5757083659984321, + "grad_norm": 
1.1891965866088867, + "learning_rate": 9.786184210526316e-05, + "loss": 0.5611, + "step": 28139 + }, + { + "epoch": 1.5757643633105611, + "grad_norm": 1.4275307655334473, + "learning_rate": 9.786157894736843e-05, + "loss": 0.4026, + "step": 28140 + }, + { + "epoch": 1.5758203606226902, + "grad_norm": 1.2542580366134644, + "learning_rate": 9.786131578947369e-05, + "loss": 0.4041, + "step": 28141 + }, + { + "epoch": 1.5758763579348192, + "grad_norm": 1.2023520469665527, + "learning_rate": 9.786105263157895e-05, + "loss": 0.5344, + "step": 28142 + }, + { + "epoch": 1.5759323552469482, + "grad_norm": 1.3916306495666504, + "learning_rate": 9.786078947368421e-05, + "loss": 0.5923, + "step": 28143 + }, + { + "epoch": 1.5759883525590772, + "grad_norm": 1.2810475826263428, + "learning_rate": 9.786052631578948e-05, + "loss": 0.4102, + "step": 28144 + }, + { + "epoch": 1.5760443498712062, + "grad_norm": 1.1698429584503174, + "learning_rate": 9.786026315789474e-05, + "loss": 0.3531, + "step": 28145 + }, + { + "epoch": 1.5761003471833352, + "grad_norm": 1.3877931833267212, + "learning_rate": 9.786e-05, + "loss": 0.4117, + "step": 28146 + }, + { + "epoch": 1.5761563444954643, + "grad_norm": 1.289060354232788, + "learning_rate": 9.785973684210526e-05, + "loss": 0.4003, + "step": 28147 + }, + { + "epoch": 1.5762123418075933, + "grad_norm": 1.2534656524658203, + "learning_rate": 9.785947368421053e-05, + "loss": 0.4815, + "step": 28148 + }, + { + "epoch": 1.5762683391197223, + "grad_norm": 1.4801337718963623, + "learning_rate": 9.78592105263158e-05, + "loss": 0.5747, + "step": 28149 + }, + { + "epoch": 1.5763243364318513, + "grad_norm": 1.0988131761550903, + "learning_rate": 9.785894736842107e-05, + "loss": 0.4168, + "step": 28150 + }, + { + "epoch": 1.5763803337439803, + "grad_norm": 1.3799777030944824, + "learning_rate": 9.785868421052631e-05, + "loss": 0.6514, + "step": 28151 + }, + { + "epoch": 1.5764363310561094, + "grad_norm": 1.220173716545105, + "learning_rate": 
9.785842105263159e-05, + "loss": 0.4265, + "step": 28152 + }, + { + "epoch": 1.5764923283682384, + "grad_norm": 1.2092936038970947, + "learning_rate": 9.785815789473685e-05, + "loss": 0.4341, + "step": 28153 + }, + { + "epoch": 1.5765483256803674, + "grad_norm": 1.393599271774292, + "learning_rate": 9.785789473684212e-05, + "loss": 0.3641, + "step": 28154 + }, + { + "epoch": 1.5766043229924964, + "grad_norm": 1.2767813205718994, + "learning_rate": 9.785763157894737e-05, + "loss": 0.5336, + "step": 28155 + }, + { + "epoch": 1.5766603203046254, + "grad_norm": 1.2935755252838135, + "learning_rate": 9.785736842105263e-05, + "loss": 0.5034, + "step": 28156 + }, + { + "epoch": 1.5767163176167545, + "grad_norm": 1.108004093170166, + "learning_rate": 9.78571052631579e-05, + "loss": 0.4588, + "step": 28157 + }, + { + "epoch": 1.5767723149288835, + "grad_norm": 1.0608549118041992, + "learning_rate": 9.785684210526316e-05, + "loss": 0.4156, + "step": 28158 + }, + { + "epoch": 1.5768283122410125, + "grad_norm": 1.2897781133651733, + "learning_rate": 9.785657894736843e-05, + "loss": 0.4719, + "step": 28159 + }, + { + "epoch": 1.5768843095531415, + "grad_norm": 1.090461015701294, + "learning_rate": 9.785631578947368e-05, + "loss": 0.3136, + "step": 28160 + }, + { + "epoch": 1.5769403068652705, + "grad_norm": 1.243940830230713, + "learning_rate": 9.785605263157895e-05, + "loss": 0.3999, + "step": 28161 + }, + { + "epoch": 1.5769963041773996, + "grad_norm": 1.2789543867111206, + "learning_rate": 9.785578947368421e-05, + "loss": 0.478, + "step": 28162 + }, + { + "epoch": 1.5770523014895286, + "grad_norm": 1.2415505647659302, + "learning_rate": 9.785552631578948e-05, + "loss": 0.4176, + "step": 28163 + }, + { + "epoch": 1.5771082988016576, + "grad_norm": 1.3886882066726685, + "learning_rate": 9.785526315789474e-05, + "loss": 0.4695, + "step": 28164 + }, + { + "epoch": 1.5771642961137866, + "grad_norm": 1.1668152809143066, + "learning_rate": 9.7855e-05, + "loss": 0.3876, + "step": 
28165 + }, + { + "epoch": 1.5772202934259156, + "grad_norm": 1.349058747291565, + "learning_rate": 9.785473684210526e-05, + "loss": 0.5779, + "step": 28166 + }, + { + "epoch": 1.5772762907380447, + "grad_norm": 1.3069305419921875, + "learning_rate": 9.785447368421054e-05, + "loss": 0.4907, + "step": 28167 + }, + { + "epoch": 1.5773322880501737, + "grad_norm": 1.1267633438110352, + "learning_rate": 9.78542105263158e-05, + "loss": 0.3592, + "step": 28168 + }, + { + "epoch": 1.5773882853623027, + "grad_norm": 1.205538272857666, + "learning_rate": 9.785394736842106e-05, + "loss": 0.4373, + "step": 28169 + }, + { + "epoch": 1.5774442826744317, + "grad_norm": 1.0698776245117188, + "learning_rate": 9.785368421052632e-05, + "loss": 0.4397, + "step": 28170 + }, + { + "epoch": 1.5775002799865607, + "grad_norm": 1.385589361190796, + "learning_rate": 9.785342105263159e-05, + "loss": 0.433, + "step": 28171 + }, + { + "epoch": 1.5775562772986897, + "grad_norm": 1.3485287427902222, + "learning_rate": 9.785315789473685e-05, + "loss": 0.5212, + "step": 28172 + }, + { + "epoch": 1.5776122746108188, + "grad_norm": 1.1175944805145264, + "learning_rate": 9.785289473684211e-05, + "loss": 0.3932, + "step": 28173 + }, + { + "epoch": 1.5776682719229478, + "grad_norm": 1.5841163396835327, + "learning_rate": 9.785263157894737e-05, + "loss": 0.4892, + "step": 28174 + }, + { + "epoch": 1.5777242692350768, + "grad_norm": 1.2950897216796875, + "learning_rate": 9.785236842105263e-05, + "loss": 0.4751, + "step": 28175 + }, + { + "epoch": 1.5777802665472058, + "grad_norm": 1.267866611480713, + "learning_rate": 9.78521052631579e-05, + "loss": 0.3934, + "step": 28176 + }, + { + "epoch": 1.5778362638593348, + "grad_norm": 1.0617599487304688, + "learning_rate": 9.785184210526316e-05, + "loss": 0.4001, + "step": 28177 + }, + { + "epoch": 1.5778922611714639, + "grad_norm": 1.4428179264068604, + "learning_rate": 9.785157894736842e-05, + "loss": 0.5607, + "step": 28178 + }, + { + "epoch": 
1.5779482584835929, + "grad_norm": 1.1948655843734741, + "learning_rate": 9.785131578947368e-05, + "loss": 0.4326, + "step": 28179 + }, + { + "epoch": 1.578004255795722, + "grad_norm": 1.6858242750167847, + "learning_rate": 9.785105263157895e-05, + "loss": 0.4088, + "step": 28180 + }, + { + "epoch": 1.578060253107851, + "grad_norm": 1.232558250427246, + "learning_rate": 9.785078947368421e-05, + "loss": 0.4657, + "step": 28181 + }, + { + "epoch": 1.57811625041998, + "grad_norm": 1.5232340097427368, + "learning_rate": 9.785052631578949e-05, + "loss": 0.5507, + "step": 28182 + }, + { + "epoch": 1.578172247732109, + "grad_norm": 1.240832805633545, + "learning_rate": 9.785026315789473e-05, + "loss": 0.437, + "step": 28183 + }, + { + "epoch": 1.578228245044238, + "grad_norm": 1.556320071220398, + "learning_rate": 9.785e-05, + "loss": 0.4443, + "step": 28184 + }, + { + "epoch": 1.578284242356367, + "grad_norm": 1.4046788215637207, + "learning_rate": 9.784973684210527e-05, + "loss": 0.4333, + "step": 28185 + }, + { + "epoch": 1.578340239668496, + "grad_norm": 1.3440078496932983, + "learning_rate": 9.784947368421054e-05, + "loss": 0.39, + "step": 28186 + }, + { + "epoch": 1.578396236980625, + "grad_norm": 1.4519109725952148, + "learning_rate": 9.78492105263158e-05, + "loss": 0.5872, + "step": 28187 + }, + { + "epoch": 1.578452234292754, + "grad_norm": 1.3294261693954468, + "learning_rate": 9.784894736842106e-05, + "loss": 0.5281, + "step": 28188 + }, + { + "epoch": 1.578508231604883, + "grad_norm": 1.7994285821914673, + "learning_rate": 9.784868421052632e-05, + "loss": 0.4514, + "step": 28189 + }, + { + "epoch": 1.578564228917012, + "grad_norm": 1.1187090873718262, + "learning_rate": 9.784842105263158e-05, + "loss": 0.4704, + "step": 28190 + }, + { + "epoch": 1.578620226229141, + "grad_norm": 1.0622961521148682, + "learning_rate": 9.784815789473685e-05, + "loss": 0.3963, + "step": 28191 + }, + { + "epoch": 1.5786762235412701, + "grad_norm": 1.4263650178909302, + 
"learning_rate": 9.784789473684211e-05, + "loss": 0.5271, + "step": 28192 + }, + { + "epoch": 1.5787322208533991, + "grad_norm": 1.0818216800689697, + "learning_rate": 9.784763157894737e-05, + "loss": 0.4412, + "step": 28193 + }, + { + "epoch": 1.5787882181655282, + "grad_norm": 1.3329949378967285, + "learning_rate": 9.784736842105263e-05, + "loss": 0.5599, + "step": 28194 + }, + { + "epoch": 1.5788442154776572, + "grad_norm": 1.3518515825271606, + "learning_rate": 9.78471052631579e-05, + "loss": 0.3642, + "step": 28195 + }, + { + "epoch": 1.5789002127897862, + "grad_norm": 1.2396793365478516, + "learning_rate": 9.784684210526316e-05, + "loss": 0.4187, + "step": 28196 + }, + { + "epoch": 1.5789562101019152, + "grad_norm": 1.3476207256317139, + "learning_rate": 9.784657894736842e-05, + "loss": 0.3397, + "step": 28197 + }, + { + "epoch": 1.5790122074140442, + "grad_norm": 1.346340537071228, + "learning_rate": 9.784631578947368e-05, + "loss": 0.4135, + "step": 28198 + }, + { + "epoch": 1.5790682047261733, + "grad_norm": 1.3594069480895996, + "learning_rate": 9.784605263157896e-05, + "loss": 0.3856, + "step": 28199 + }, + { + "epoch": 1.5791242020383023, + "grad_norm": 1.5098919868469238, + "learning_rate": 9.784578947368422e-05, + "loss": 0.3712, + "step": 28200 + }, + { + "epoch": 1.5791801993504313, + "grad_norm": 1.3382796049118042, + "learning_rate": 9.784552631578948e-05, + "loss": 0.4768, + "step": 28201 + }, + { + "epoch": 1.5792361966625603, + "grad_norm": 1.1051304340362549, + "learning_rate": 9.784526315789474e-05, + "loss": 0.3683, + "step": 28202 + }, + { + "epoch": 1.5792921939746893, + "grad_norm": 1.24118971824646, + "learning_rate": 9.784500000000001e-05, + "loss": 0.4999, + "step": 28203 + }, + { + "epoch": 1.5793481912868184, + "grad_norm": 1.661124348640442, + "learning_rate": 9.784473684210527e-05, + "loss": 0.7368, + "step": 28204 + }, + { + "epoch": 1.5794041885989474, + "grad_norm": 1.280503511428833, + "learning_rate": 9.784447368421054e-05, + 
"loss": 0.5091, + "step": 28205 + }, + { + "epoch": 1.5794601859110764, + "grad_norm": 1.2633830308914185, + "learning_rate": 9.784421052631579e-05, + "loss": 0.4254, + "step": 28206 + }, + { + "epoch": 1.5795161832232054, + "grad_norm": 1.272430419921875, + "learning_rate": 9.784394736842106e-05, + "loss": 0.4289, + "step": 28207 + }, + { + "epoch": 1.5795721805353344, + "grad_norm": 1.4057821035385132, + "learning_rate": 9.784368421052632e-05, + "loss": 0.3939, + "step": 28208 + }, + { + "epoch": 1.5796281778474635, + "grad_norm": 1.5706238746643066, + "learning_rate": 9.784342105263158e-05, + "loss": 0.4638, + "step": 28209 + }, + { + "epoch": 1.5796841751595925, + "grad_norm": 1.3825490474700928, + "learning_rate": 9.784315789473684e-05, + "loss": 0.397, + "step": 28210 + }, + { + "epoch": 1.5797401724717215, + "grad_norm": 1.3392679691314697, + "learning_rate": 9.78428947368421e-05, + "loss": 0.4503, + "step": 28211 + }, + { + "epoch": 1.5797961697838505, + "grad_norm": 1.4337555170059204, + "learning_rate": 9.784263157894737e-05, + "loss": 0.5699, + "step": 28212 + }, + { + "epoch": 1.5798521670959795, + "grad_norm": 1.576265811920166, + "learning_rate": 9.784236842105263e-05, + "loss": 0.5449, + "step": 28213 + }, + { + "epoch": 1.5799081644081086, + "grad_norm": 1.4735445976257324, + "learning_rate": 9.784210526315791e-05, + "loss": 0.4527, + "step": 28214 + }, + { + "epoch": 1.5799641617202376, + "grad_norm": 1.4079476594924927, + "learning_rate": 9.784184210526315e-05, + "loss": 0.3421, + "step": 28215 + }, + { + "epoch": 1.5800201590323666, + "grad_norm": 1.2776329517364502, + "learning_rate": 9.784157894736843e-05, + "loss": 0.461, + "step": 28216 + }, + { + "epoch": 1.5800761563444956, + "grad_norm": 1.2142679691314697, + "learning_rate": 9.784131578947369e-05, + "loss": 0.3766, + "step": 28217 + }, + { + "epoch": 1.5801321536566246, + "grad_norm": 1.3883777856826782, + "learning_rate": 9.784105263157896e-05, + "loss": 0.454, + "step": 28218 + }, + { + 
"epoch": 1.5801881509687536, + "grad_norm": 1.1547354459762573, + "learning_rate": 9.784078947368422e-05, + "loss": 0.477, + "step": 28219 + }, + { + "epoch": 1.5802441482808827, + "grad_norm": 1.2915985584259033, + "learning_rate": 9.784052631578948e-05, + "loss": 0.4815, + "step": 28220 + }, + { + "epoch": 1.5803001455930117, + "grad_norm": 1.478183388710022, + "learning_rate": 9.784026315789474e-05, + "loss": 0.4279, + "step": 28221 + }, + { + "epoch": 1.5803561429051407, + "grad_norm": 1.2574313879013062, + "learning_rate": 9.784000000000001e-05, + "loss": 0.508, + "step": 28222 + }, + { + "epoch": 1.5804121402172697, + "grad_norm": 10.788265228271484, + "learning_rate": 9.783973684210527e-05, + "loss": 0.4428, + "step": 28223 + }, + { + "epoch": 1.5804681375293987, + "grad_norm": 1.188437581062317, + "learning_rate": 9.783947368421053e-05, + "loss": 0.3691, + "step": 28224 + }, + { + "epoch": 1.5805241348415278, + "grad_norm": 1.3362503051757812, + "learning_rate": 9.783921052631579e-05, + "loss": 0.4179, + "step": 28225 + }, + { + "epoch": 1.5805801321536568, + "grad_norm": 1.9426575899124146, + "learning_rate": 9.783894736842105e-05, + "loss": 0.5131, + "step": 28226 + }, + { + "epoch": 1.5806361294657858, + "grad_norm": 1.151312232017517, + "learning_rate": 9.783868421052632e-05, + "loss": 0.3808, + "step": 28227 + }, + { + "epoch": 1.5806921267779148, + "grad_norm": 1.3491356372833252, + "learning_rate": 9.783842105263158e-05, + "loss": 0.4749, + "step": 28228 + }, + { + "epoch": 1.5807481240900438, + "grad_norm": 1.3050971031188965, + "learning_rate": 9.783815789473684e-05, + "loss": 0.4173, + "step": 28229 + }, + { + "epoch": 1.5808041214021729, + "grad_norm": 1.3836475610733032, + "learning_rate": 9.78378947368421e-05, + "loss": 0.4225, + "step": 28230 + }, + { + "epoch": 1.5808601187143019, + "grad_norm": 1.2909274101257324, + "learning_rate": 9.783763157894738e-05, + "loss": 0.5205, + "step": 28231 + }, + { + "epoch": 1.580916116026431, + "grad_norm": 
1.6245766878128052, + "learning_rate": 9.783736842105264e-05, + "loss": 0.5249, + "step": 28232 + }, + { + "epoch": 1.58097211333856, + "grad_norm": 1.3622593879699707, + "learning_rate": 9.78371052631579e-05, + "loss": 0.3795, + "step": 28233 + }, + { + "epoch": 1.5810281106506887, + "grad_norm": 1.450543999671936, + "learning_rate": 9.783684210526316e-05, + "loss": 0.4643, + "step": 28234 + }, + { + "epoch": 1.5810841079628177, + "grad_norm": 1.3439433574676514, + "learning_rate": 9.783657894736843e-05, + "loss": 0.6194, + "step": 28235 + }, + { + "epoch": 1.5811401052749467, + "grad_norm": 1.3028042316436768, + "learning_rate": 9.783631578947369e-05, + "loss": 0.4211, + "step": 28236 + }, + { + "epoch": 1.5811961025870758, + "grad_norm": 1.1072126626968384, + "learning_rate": 9.783605263157896e-05, + "loss": 0.3549, + "step": 28237 + }, + { + "epoch": 1.5812520998992048, + "grad_norm": 1.2173089981079102, + "learning_rate": 9.783578947368421e-05, + "loss": 0.4091, + "step": 28238 + }, + { + "epoch": 1.5813080972113338, + "grad_norm": 1.1502578258514404, + "learning_rate": 9.783552631578948e-05, + "loss": 0.4709, + "step": 28239 + }, + { + "epoch": 1.5813640945234628, + "grad_norm": 1.3251762390136719, + "learning_rate": 9.783526315789474e-05, + "loss": 0.5133, + "step": 28240 + }, + { + "epoch": 1.5814200918355918, + "grad_norm": 1.4343903064727783, + "learning_rate": 9.783500000000001e-05, + "loss": 0.4494, + "step": 28241 + }, + { + "epoch": 1.5814760891477209, + "grad_norm": 1.1779705286026, + "learning_rate": 9.783473684210527e-05, + "loss": 0.55, + "step": 28242 + }, + { + "epoch": 1.5815320864598499, + "grad_norm": 1.5313528776168823, + "learning_rate": 9.783447368421052e-05, + "loss": 0.4, + "step": 28243 + }, + { + "epoch": 1.581588083771979, + "grad_norm": 1.4285385608673096, + "learning_rate": 9.78342105263158e-05, + "loss": 0.5774, + "step": 28244 + }, + { + "epoch": 1.581644081084108, + "grad_norm": 1.9072041511535645, + "learning_rate": 
9.783394736842105e-05, + "loss": 0.4054, + "step": 28245 + }, + { + "epoch": 1.581700078396237, + "grad_norm": 1.3752843141555786, + "learning_rate": 9.783368421052633e-05, + "loss": 0.423, + "step": 28246 + }, + { + "epoch": 1.581756075708366, + "grad_norm": 1.5470937490463257, + "learning_rate": 9.783342105263159e-05, + "loss": 0.4605, + "step": 28247 + }, + { + "epoch": 1.581812073020495, + "grad_norm": 1.4123104810714722, + "learning_rate": 9.783315789473685e-05, + "loss": 0.5767, + "step": 28248 + }, + { + "epoch": 1.581868070332624, + "grad_norm": 1.4098743200302124, + "learning_rate": 9.78328947368421e-05, + "loss": 0.4671, + "step": 28249 + }, + { + "epoch": 1.581924067644753, + "grad_norm": 1.2538591623306274, + "learning_rate": 9.783263157894738e-05, + "loss": 0.3655, + "step": 28250 + }, + { + "epoch": 1.581980064956882, + "grad_norm": 1.244978904724121, + "learning_rate": 9.783236842105264e-05, + "loss": 0.3907, + "step": 28251 + }, + { + "epoch": 1.582036062269011, + "grad_norm": 1.2174460887908936, + "learning_rate": 9.78321052631579e-05, + "loss": 0.3742, + "step": 28252 + }, + { + "epoch": 1.58209205958114, + "grad_norm": 16.026084899902344, + "learning_rate": 9.783184210526316e-05, + "loss": 0.5541, + "step": 28253 + }, + { + "epoch": 1.582148056893269, + "grad_norm": 1.2309362888336182, + "learning_rate": 9.783157894736843e-05, + "loss": 0.4567, + "step": 28254 + }, + { + "epoch": 1.5822040542053981, + "grad_norm": 1.18257474899292, + "learning_rate": 9.783131578947369e-05, + "loss": 0.457, + "step": 28255 + }, + { + "epoch": 1.5822600515175271, + "grad_norm": 1.2617989778518677, + "learning_rate": 9.783105263157895e-05, + "loss": 0.4647, + "step": 28256 + }, + { + "epoch": 1.5823160488296562, + "grad_norm": 1.1995033025741577, + "learning_rate": 9.783078947368421e-05, + "loss": 0.5024, + "step": 28257 + }, + { + "epoch": 1.5823720461417852, + "grad_norm": 1.1797460317611694, + "learning_rate": 9.783052631578948e-05, + "loss": 0.5191, + "step": 
28258 + }, + { + "epoch": 1.5824280434539142, + "grad_norm": 1.3838032484054565, + "learning_rate": 9.783026315789474e-05, + "loss": 0.488, + "step": 28259 + }, + { + "epoch": 1.5824840407660432, + "grad_norm": 1.5013700723648071, + "learning_rate": 9.783e-05, + "loss": 0.4724, + "step": 28260 + }, + { + "epoch": 1.5825400380781722, + "grad_norm": 1.1968519687652588, + "learning_rate": 9.782973684210526e-05, + "loss": 0.4432, + "step": 28261 + }, + { + "epoch": 1.5825960353903012, + "grad_norm": 1.2762856483459473, + "learning_rate": 9.782947368421052e-05, + "loss": 0.4813, + "step": 28262 + }, + { + "epoch": 1.5826520327024303, + "grad_norm": 1.4965077638626099, + "learning_rate": 9.78292105263158e-05, + "loss": 0.4058, + "step": 28263 + }, + { + "epoch": 1.5827080300145593, + "grad_norm": 1.2279053926467896, + "learning_rate": 9.782894736842106e-05, + "loss": 0.4609, + "step": 28264 + }, + { + "epoch": 1.5827640273266883, + "grad_norm": 1.1062767505645752, + "learning_rate": 9.782868421052632e-05, + "loss": 0.3519, + "step": 28265 + }, + { + "epoch": 1.5828200246388173, + "grad_norm": 1.3223053216934204, + "learning_rate": 9.782842105263158e-05, + "loss": 0.5096, + "step": 28266 + }, + { + "epoch": 1.5828760219509463, + "grad_norm": 1.2049095630645752, + "learning_rate": 9.782815789473685e-05, + "loss": 0.4288, + "step": 28267 + }, + { + "epoch": 1.5829320192630754, + "grad_norm": 1.3944287300109863, + "learning_rate": 9.782789473684211e-05, + "loss": 0.4298, + "step": 28268 + }, + { + "epoch": 1.5829880165752044, + "grad_norm": 1.3010836839675903, + "learning_rate": 9.782763157894738e-05, + "loss": 0.3869, + "step": 28269 + }, + { + "epoch": 1.5830440138873334, + "grad_norm": 1.4387359619140625, + "learning_rate": 9.782736842105263e-05, + "loss": 0.5562, + "step": 28270 + }, + { + "epoch": 1.5831000111994624, + "grad_norm": 1.1513671875, + "learning_rate": 9.78271052631579e-05, + "loss": 0.4648, + "step": 28271 + }, + { + "epoch": 1.5831560085115914, + 
"grad_norm": 1.276419997215271, + "learning_rate": 9.782684210526316e-05, + "loss": 0.5492, + "step": 28272 + }, + { + "epoch": 1.5832120058237205, + "grad_norm": 1.277443766593933, + "learning_rate": 9.782657894736843e-05, + "loss": 0.4696, + "step": 28273 + }, + { + "epoch": 1.5832680031358495, + "grad_norm": 1.727570652961731, + "learning_rate": 9.78263157894737e-05, + "loss": 0.5006, + "step": 28274 + }, + { + "epoch": 1.5833240004479785, + "grad_norm": 1.3538728952407837, + "learning_rate": 9.782605263157895e-05, + "loss": 0.5115, + "step": 28275 + }, + { + "epoch": 1.5833799977601075, + "grad_norm": 1.189645767211914, + "learning_rate": 9.782578947368421e-05, + "loss": 0.4491, + "step": 28276 + }, + { + "epoch": 1.5834359950722365, + "grad_norm": 1.426240086555481, + "learning_rate": 9.782552631578947e-05, + "loss": 0.5695, + "step": 28277 + }, + { + "epoch": 1.5834919923843656, + "grad_norm": 1.1678203344345093, + "learning_rate": 9.782526315789475e-05, + "loss": 0.4941, + "step": 28278 + }, + { + "epoch": 1.5835479896964946, + "grad_norm": 1.3237804174423218, + "learning_rate": 9.7825e-05, + "loss": 0.4183, + "step": 28279 + }, + { + "epoch": 1.5836039870086236, + "grad_norm": 1.8183512687683105, + "learning_rate": 9.782473684210527e-05, + "loss": 0.485, + "step": 28280 + }, + { + "epoch": 1.5836599843207526, + "grad_norm": 1.3586292266845703, + "learning_rate": 9.782447368421053e-05, + "loss": 0.5133, + "step": 28281 + }, + { + "epoch": 1.5837159816328816, + "grad_norm": 1.332970142364502, + "learning_rate": 9.78242105263158e-05, + "loss": 0.4322, + "step": 28282 + }, + { + "epoch": 1.5837719789450107, + "grad_norm": 1.2447971105575562, + "learning_rate": 9.782394736842106e-05, + "loss": 0.4877, + "step": 28283 + }, + { + "epoch": 1.5838279762571397, + "grad_norm": 1.1528841257095337, + "learning_rate": 9.782368421052632e-05, + "loss": 0.411, + "step": 28284 + }, + { + "epoch": 1.5838839735692687, + "grad_norm": 1.331696629524231, + "learning_rate": 
9.782342105263158e-05, + "loss": 0.5132, + "step": 28285 + }, + { + "epoch": 1.5839399708813977, + "grad_norm": 1.526389718055725, + "learning_rate": 9.782315789473685e-05, + "loss": 0.5063, + "step": 28286 + }, + { + "epoch": 1.5839959681935267, + "grad_norm": 1.220671534538269, + "learning_rate": 9.782289473684211e-05, + "loss": 0.4549, + "step": 28287 + }, + { + "epoch": 1.5840519655056557, + "grad_norm": 1.4934096336364746, + "learning_rate": 9.782263157894737e-05, + "loss": 0.5103, + "step": 28288 + }, + { + "epoch": 1.5841079628177848, + "grad_norm": 1.101176381111145, + "learning_rate": 9.782236842105263e-05, + "loss": 0.3865, + "step": 28289 + }, + { + "epoch": 1.5841639601299138, + "grad_norm": 1.2829971313476562, + "learning_rate": 9.78221052631579e-05, + "loss": 0.3926, + "step": 28290 + }, + { + "epoch": 1.5842199574420428, + "grad_norm": 1.3696972131729126, + "learning_rate": 9.782184210526316e-05, + "loss": 0.6527, + "step": 28291 + }, + { + "epoch": 1.5842759547541718, + "grad_norm": 1.5396428108215332, + "learning_rate": 9.782157894736844e-05, + "loss": 0.5814, + "step": 28292 + }, + { + "epoch": 1.5843319520663008, + "grad_norm": 1.616563081741333, + "learning_rate": 9.782131578947368e-05, + "loss": 0.5543, + "step": 28293 + }, + { + "epoch": 1.5843879493784299, + "grad_norm": 1.4253442287445068, + "learning_rate": 9.782105263157894e-05, + "loss": 0.4605, + "step": 28294 + }, + { + "epoch": 1.5844439466905589, + "grad_norm": 1.382038950920105, + "learning_rate": 9.782078947368422e-05, + "loss": 0.4453, + "step": 28295 + }, + { + "epoch": 1.584499944002688, + "grad_norm": 1.2013399600982666, + "learning_rate": 9.782052631578948e-05, + "loss": 0.4356, + "step": 28296 + }, + { + "epoch": 1.584555941314817, + "grad_norm": 1.2636768817901611, + "learning_rate": 9.782026315789475e-05, + "loss": 0.4449, + "step": 28297 + }, + { + "epoch": 1.584611938626946, + "grad_norm": 1.3658510446548462, + "learning_rate": 9.782e-05, + "loss": 0.5393, + "step": 28298 
+ }, + { + "epoch": 1.584667935939075, + "grad_norm": 1.285614252090454, + "learning_rate": 9.781973684210527e-05, + "loss": 0.4172, + "step": 28299 + }, + { + "epoch": 1.584723933251204, + "grad_norm": 1.1509438753128052, + "learning_rate": 9.781947368421053e-05, + "loss": 0.4589, + "step": 28300 + }, + { + "epoch": 1.584779930563333, + "grad_norm": 1.444385051727295, + "learning_rate": 9.78192105263158e-05, + "loss": 0.4409, + "step": 28301 + }, + { + "epoch": 1.584835927875462, + "grad_norm": 1.872693657875061, + "learning_rate": 9.781894736842106e-05, + "loss": 0.5809, + "step": 28302 + }, + { + "epoch": 1.584891925187591, + "grad_norm": 1.6845189332962036, + "learning_rate": 9.781868421052632e-05, + "loss": 0.4986, + "step": 28303 + }, + { + "epoch": 1.58494792249972, + "grad_norm": 1.6680781841278076, + "learning_rate": 9.781842105263158e-05, + "loss": 0.5092, + "step": 28304 + }, + { + "epoch": 1.585003919811849, + "grad_norm": 1.7148813009262085, + "learning_rate": 9.781815789473685e-05, + "loss": 0.564, + "step": 28305 + }, + { + "epoch": 1.585059917123978, + "grad_norm": 1.5087623596191406, + "learning_rate": 9.781789473684211e-05, + "loss": 0.6449, + "step": 28306 + }, + { + "epoch": 1.585115914436107, + "grad_norm": 1.4534099102020264, + "learning_rate": 9.781763157894737e-05, + "loss": 0.4322, + "step": 28307 + }, + { + "epoch": 1.5851719117482361, + "grad_norm": 1.3164526224136353, + "learning_rate": 9.781736842105263e-05, + "loss": 0.354, + "step": 28308 + }, + { + "epoch": 1.5852279090603651, + "grad_norm": 1.0275143384933472, + "learning_rate": 9.78171052631579e-05, + "loss": 0.2847, + "step": 28309 + }, + { + "epoch": 1.5852839063724942, + "grad_norm": 1.2909725904464722, + "learning_rate": 9.781684210526317e-05, + "loss": 0.4754, + "step": 28310 + }, + { + "epoch": 1.5853399036846232, + "grad_norm": 1.3653693199157715, + "learning_rate": 9.781657894736843e-05, + "loss": 0.3985, + "step": 28311 + }, + { + "epoch": 1.5853959009967522, + 
"grad_norm": 1.3306163549423218, + "learning_rate": 9.781631578947369e-05, + "loss": 0.419, + "step": 28312 + }, + { + "epoch": 1.5854518983088812, + "grad_norm": 1.3650118112564087, + "learning_rate": 9.781605263157895e-05, + "loss": 0.4289, + "step": 28313 + }, + { + "epoch": 1.5855078956210102, + "grad_norm": 1.4015605449676514, + "learning_rate": 9.781578947368422e-05, + "loss": 0.4585, + "step": 28314 + }, + { + "epoch": 1.5855638929331393, + "grad_norm": 1.2568800449371338, + "learning_rate": 9.781552631578948e-05, + "loss": 0.4393, + "step": 28315 + }, + { + "epoch": 1.5856198902452683, + "grad_norm": 1.413943886756897, + "learning_rate": 9.781526315789474e-05, + "loss": 0.4012, + "step": 28316 + }, + { + "epoch": 1.585675887557397, + "grad_norm": 1.41544771194458, + "learning_rate": 9.7815e-05, + "loss": 0.3876, + "step": 28317 + }, + { + "epoch": 1.585731884869526, + "grad_norm": 1.333017349243164, + "learning_rate": 9.781473684210527e-05, + "loss": 0.5692, + "step": 28318 + }, + { + "epoch": 1.5857878821816551, + "grad_norm": 1.1712980270385742, + "learning_rate": 9.781447368421053e-05, + "loss": 0.3676, + "step": 28319 + }, + { + "epoch": 1.5858438794937841, + "grad_norm": 1.17991042137146, + "learning_rate": 9.781421052631579e-05, + "loss": 0.4407, + "step": 28320 + }, + { + "epoch": 1.5858998768059132, + "grad_norm": 1.4239095449447632, + "learning_rate": 9.781394736842105e-05, + "loss": 0.5732, + "step": 28321 + }, + { + "epoch": 1.5859558741180422, + "grad_norm": 2.375763177871704, + "learning_rate": 9.781368421052632e-05, + "loss": 0.54, + "step": 28322 + }, + { + "epoch": 1.5860118714301712, + "grad_norm": 1.3151723146438599, + "learning_rate": 9.781342105263158e-05, + "loss": 0.3928, + "step": 28323 + }, + { + "epoch": 1.5860678687423002, + "grad_norm": 1.134048342704773, + "learning_rate": 9.781315789473686e-05, + "loss": 0.4171, + "step": 28324 + }, + { + "epoch": 1.5861238660544292, + "grad_norm": 1.9114692211151123, + "learning_rate": 
9.78128947368421e-05, + "loss": 0.3269, + "step": 28325 + }, + { + "epoch": 1.5861798633665583, + "grad_norm": 3.0973849296569824, + "learning_rate": 9.781263157894738e-05, + "loss": 0.4508, + "step": 28326 + }, + { + "epoch": 1.5862358606786873, + "grad_norm": 1.5860693454742432, + "learning_rate": 9.781236842105264e-05, + "loss": 0.5947, + "step": 28327 + }, + { + "epoch": 1.5862918579908163, + "grad_norm": 1.4285081624984741, + "learning_rate": 9.781210526315791e-05, + "loss": 0.5213, + "step": 28328 + }, + { + "epoch": 1.5863478553029453, + "grad_norm": 1.274310827255249, + "learning_rate": 9.781184210526317e-05, + "loss": 0.5095, + "step": 28329 + }, + { + "epoch": 1.5864038526150743, + "grad_norm": 1.354880928993225, + "learning_rate": 9.781157894736841e-05, + "loss": 0.4487, + "step": 28330 + }, + { + "epoch": 1.5864598499272033, + "grad_norm": 1.4087774753570557, + "learning_rate": 9.781131578947369e-05, + "loss": 0.4681, + "step": 28331 + }, + { + "epoch": 1.5865158472393324, + "grad_norm": 1.4856915473937988, + "learning_rate": 9.781105263157895e-05, + "loss": 0.4907, + "step": 28332 + }, + { + "epoch": 1.5865718445514614, + "grad_norm": 1.165053129196167, + "learning_rate": 9.781078947368422e-05, + "loss": 0.4071, + "step": 28333 + }, + { + "epoch": 1.5866278418635904, + "grad_norm": 1.8275299072265625, + "learning_rate": 9.781052631578948e-05, + "loss": 0.7704, + "step": 28334 + }, + { + "epoch": 1.5866838391757194, + "grad_norm": 1.1157145500183105, + "learning_rate": 9.781026315789474e-05, + "loss": 0.4555, + "step": 28335 + }, + { + "epoch": 1.5867398364878484, + "grad_norm": 1.1894867420196533, + "learning_rate": 9.781e-05, + "loss": 0.4554, + "step": 28336 + }, + { + "epoch": 1.5867958337999775, + "grad_norm": 1.4696792364120483, + "learning_rate": 9.780973684210527e-05, + "loss": 0.4572, + "step": 28337 + }, + { + "epoch": 1.5868518311121065, + "grad_norm": 1.2457644939422607, + "learning_rate": 9.780947368421053e-05, + "loss": 0.4419, + "step": 
28338 + }, + { + "epoch": 1.5869078284242355, + "grad_norm": 1.1074076890945435, + "learning_rate": 9.780921052631579e-05, + "loss": 0.5484, + "step": 28339 + }, + { + "epoch": 1.5869638257363645, + "grad_norm": 1.3094606399536133, + "learning_rate": 9.780894736842105e-05, + "loss": 0.472, + "step": 28340 + }, + { + "epoch": 1.5870198230484935, + "grad_norm": 1.7311792373657227, + "learning_rate": 9.780868421052633e-05, + "loss": 0.549, + "step": 28341 + }, + { + "epoch": 1.5870758203606226, + "grad_norm": 1.266061782836914, + "learning_rate": 9.780842105263159e-05, + "loss": 0.4031, + "step": 28342 + }, + { + "epoch": 1.5871318176727516, + "grad_norm": 1.1189930438995361, + "learning_rate": 9.780815789473685e-05, + "loss": 0.4839, + "step": 28343 + }, + { + "epoch": 1.5871878149848806, + "grad_norm": 1.2545877695083618, + "learning_rate": 9.78078947368421e-05, + "loss": 0.6219, + "step": 28344 + }, + { + "epoch": 1.5872438122970096, + "grad_norm": 1.3213335275650024, + "learning_rate": 9.780763157894738e-05, + "loss": 0.3883, + "step": 28345 + }, + { + "epoch": 1.5872998096091386, + "grad_norm": 1.3805620670318604, + "learning_rate": 9.780736842105264e-05, + "loss": 0.4589, + "step": 28346 + }, + { + "epoch": 1.5873558069212677, + "grad_norm": 1.2606295347213745, + "learning_rate": 9.78071052631579e-05, + "loss": 0.5068, + "step": 28347 + }, + { + "epoch": 1.5874118042333967, + "grad_norm": 1.353511929512024, + "learning_rate": 9.780684210526316e-05, + "loss": 0.436, + "step": 28348 + }, + { + "epoch": 1.5874678015455257, + "grad_norm": 1.7375603914260864, + "learning_rate": 9.780657894736842e-05, + "loss": 0.5207, + "step": 28349 + }, + { + "epoch": 1.5875237988576547, + "grad_norm": 1.2530337572097778, + "learning_rate": 9.780631578947369e-05, + "loss": 0.3879, + "step": 28350 + }, + { + "epoch": 1.5875797961697837, + "grad_norm": 1.5136656761169434, + "learning_rate": 9.780605263157895e-05, + "loss": 0.5764, + "step": 28351 + }, + { + "epoch": 
1.5876357934819127, + "grad_norm": 1.0596904754638672, + "learning_rate": 9.780578947368422e-05, + "loss": 0.3732, + "step": 28352 + }, + { + "epoch": 1.5876917907940418, + "grad_norm": 1.2544870376586914, + "learning_rate": 9.780552631578947e-05, + "loss": 0.4187, + "step": 28353 + }, + { + "epoch": 1.5877477881061708, + "grad_norm": 1.3218860626220703, + "learning_rate": 9.780526315789474e-05, + "loss": 0.5132, + "step": 28354 + }, + { + "epoch": 1.5878037854182998, + "grad_norm": 1.3503254652023315, + "learning_rate": 9.7805e-05, + "loss": 0.4403, + "step": 28355 + }, + { + "epoch": 1.5878597827304288, + "grad_norm": 1.5628440380096436, + "learning_rate": 9.780473684210528e-05, + "loss": 0.4707, + "step": 28356 + }, + { + "epoch": 1.5879157800425578, + "grad_norm": 1.1598467826843262, + "learning_rate": 9.780447368421052e-05, + "loss": 0.4415, + "step": 28357 + }, + { + "epoch": 1.5879717773546869, + "grad_norm": 1.7052439451217651, + "learning_rate": 9.78042105263158e-05, + "loss": 0.5187, + "step": 28358 + }, + { + "epoch": 1.5880277746668159, + "grad_norm": 1.171898603439331, + "learning_rate": 9.780394736842106e-05, + "loss": 0.3662, + "step": 28359 + }, + { + "epoch": 1.588083771978945, + "grad_norm": 1.285593032836914, + "learning_rate": 9.780368421052633e-05, + "loss": 0.4864, + "step": 28360 + }, + { + "epoch": 1.588139769291074, + "grad_norm": 1.2117118835449219, + "learning_rate": 9.780342105263159e-05, + "loss": 0.3878, + "step": 28361 + }, + { + "epoch": 1.588195766603203, + "grad_norm": 1.504900574684143, + "learning_rate": 9.780315789473685e-05, + "loss": 0.5126, + "step": 28362 + }, + { + "epoch": 1.588251763915332, + "grad_norm": 1.3133801221847534, + "learning_rate": 9.780289473684211e-05, + "loss": 0.4801, + "step": 28363 + }, + { + "epoch": 1.588307761227461, + "grad_norm": 1.1876076459884644, + "learning_rate": 9.780263157894737e-05, + "loss": 0.4214, + "step": 28364 + }, + { + "epoch": 1.58836375853959, + "grad_norm": 1.3989534378051758, + 
"learning_rate": 9.780236842105264e-05, + "loss": 0.6245, + "step": 28365 + }, + { + "epoch": 1.588419755851719, + "grad_norm": 1.4644654989242554, + "learning_rate": 9.78021052631579e-05, + "loss": 0.5663, + "step": 28366 + }, + { + "epoch": 1.588475753163848, + "grad_norm": 1.4771467447280884, + "learning_rate": 9.780184210526316e-05, + "loss": 0.5066, + "step": 28367 + }, + { + "epoch": 1.588531750475977, + "grad_norm": 1.3771470785140991, + "learning_rate": 9.780157894736842e-05, + "loss": 0.7344, + "step": 28368 + }, + { + "epoch": 1.588587747788106, + "grad_norm": 1.4508684873580933, + "learning_rate": 9.78013157894737e-05, + "loss": 0.3952, + "step": 28369 + }, + { + "epoch": 1.588643745100235, + "grad_norm": 1.3457491397857666, + "learning_rate": 9.780105263157895e-05, + "loss": 0.4476, + "step": 28370 + }, + { + "epoch": 1.5886997424123641, + "grad_norm": 1.3605365753173828, + "learning_rate": 9.780078947368421e-05, + "loss": 0.556, + "step": 28371 + }, + { + "epoch": 1.5887557397244931, + "grad_norm": 1.2433078289031982, + "learning_rate": 9.780052631578947e-05, + "loss": 0.4486, + "step": 28372 + }, + { + "epoch": 1.5888117370366222, + "grad_norm": 1.3366619348526, + "learning_rate": 9.780026315789475e-05, + "loss": 0.4229, + "step": 28373 + }, + { + "epoch": 1.5888677343487512, + "grad_norm": 1.091226577758789, + "learning_rate": 9.78e-05, + "loss": 0.4386, + "step": 28374 + }, + { + "epoch": 1.5889237316608802, + "grad_norm": 1.1753994226455688, + "learning_rate": 9.779973684210527e-05, + "loss": 0.3889, + "step": 28375 + }, + { + "epoch": 1.5889797289730092, + "grad_norm": 1.200480341911316, + "learning_rate": 9.779947368421052e-05, + "loss": 0.3553, + "step": 28376 + }, + { + "epoch": 1.5890357262851382, + "grad_norm": 1.3582489490509033, + "learning_rate": 9.77992105263158e-05, + "loss": 0.5353, + "step": 28377 + }, + { + "epoch": 1.5890917235972672, + "grad_norm": 1.1027886867523193, + "learning_rate": 9.779894736842106e-05, + "loss": 0.3692, + 
"step": 28378 + }, + { + "epoch": 1.5891477209093963, + "grad_norm": 1.4722157716751099, + "learning_rate": 9.779868421052633e-05, + "loss": 0.3914, + "step": 28379 + }, + { + "epoch": 1.5892037182215253, + "grad_norm": 1.501230001449585, + "learning_rate": 9.779842105263158e-05, + "loss": 0.594, + "step": 28380 + }, + { + "epoch": 1.5892597155336543, + "grad_norm": 1.2601374387741089, + "learning_rate": 9.779815789473684e-05, + "loss": 0.4311, + "step": 28381 + }, + { + "epoch": 1.5893157128457833, + "grad_norm": 1.4137557744979858, + "learning_rate": 9.779789473684211e-05, + "loss": 0.4823, + "step": 28382 + }, + { + "epoch": 1.5893717101579123, + "grad_norm": 1.4185372591018677, + "learning_rate": 9.779763157894737e-05, + "loss": 0.3598, + "step": 28383 + }, + { + "epoch": 1.5894277074700414, + "grad_norm": 1.58175790309906, + "learning_rate": 9.779736842105264e-05, + "loss": 0.5759, + "step": 28384 + }, + { + "epoch": 1.5894837047821704, + "grad_norm": 1.2684824466705322, + "learning_rate": 9.779710526315789e-05, + "loss": 0.4728, + "step": 28385 + }, + { + "epoch": 1.5895397020942994, + "grad_norm": 1.3118031024932861, + "learning_rate": 9.779684210526316e-05, + "loss": 0.4175, + "step": 28386 + }, + { + "epoch": 1.5895956994064284, + "grad_norm": 1.0539162158966064, + "learning_rate": 9.779657894736842e-05, + "loss": 0.2494, + "step": 28387 + }, + { + "epoch": 1.5896516967185574, + "grad_norm": 1.312360167503357, + "learning_rate": 9.77963157894737e-05, + "loss": 0.4386, + "step": 28388 + }, + { + "epoch": 1.5897076940306865, + "grad_norm": 1.4711054563522339, + "learning_rate": 9.779605263157896e-05, + "loss": 0.3986, + "step": 28389 + }, + { + "epoch": 1.5897636913428155, + "grad_norm": 1.13727605342865, + "learning_rate": 9.779578947368422e-05, + "loss": 0.4532, + "step": 28390 + }, + { + "epoch": 1.5898196886549445, + "grad_norm": 1.7851922512054443, + "learning_rate": 9.779552631578948e-05, + "loss": 0.5963, + "step": 28391 + }, + { + "epoch": 
1.5898756859670735, + "grad_norm": 2.342245101928711, + "learning_rate": 9.779526315789475e-05, + "loss": 0.386, + "step": 28392 + }, + { + "epoch": 1.5899316832792025, + "grad_norm": 1.5506318807601929, + "learning_rate": 9.779500000000001e-05, + "loss": 0.4646, + "step": 28393 + }, + { + "epoch": 1.5899876805913316, + "grad_norm": 1.348465085029602, + "learning_rate": 9.779473684210527e-05, + "loss": 0.4819, + "step": 28394 + }, + { + "epoch": 1.5900436779034606, + "grad_norm": 1.213765263557434, + "learning_rate": 9.779447368421053e-05, + "loss": 0.4695, + "step": 28395 + }, + { + "epoch": 1.5900996752155896, + "grad_norm": 1.4216108322143555, + "learning_rate": 9.77942105263158e-05, + "loss": 0.6269, + "step": 28396 + }, + { + "epoch": 1.5901556725277186, + "grad_norm": 1.424285650253296, + "learning_rate": 9.779394736842106e-05, + "loss": 0.4841, + "step": 28397 + }, + { + "epoch": 1.5902116698398476, + "grad_norm": 1.2293741703033447, + "learning_rate": 9.779368421052632e-05, + "loss": 0.4536, + "step": 28398 + }, + { + "epoch": 1.5902676671519766, + "grad_norm": 1.4259518384933472, + "learning_rate": 9.779342105263158e-05, + "loss": 0.4597, + "step": 28399 + }, + { + "epoch": 1.5903236644641057, + "grad_norm": 1.1571725606918335, + "learning_rate": 9.779315789473684e-05, + "loss": 0.4649, + "step": 28400 + }, + { + "epoch": 1.5903796617762347, + "grad_norm": 1.0336768627166748, + "learning_rate": 9.779289473684211e-05, + "loss": 0.4505, + "step": 28401 + }, + { + "epoch": 1.5904356590883637, + "grad_norm": 1.3144545555114746, + "learning_rate": 9.779263157894737e-05, + "loss": 0.462, + "step": 28402 + }, + { + "epoch": 1.5904916564004927, + "grad_norm": 1.402504801750183, + "learning_rate": 9.779236842105263e-05, + "loss": 0.5251, + "step": 28403 + }, + { + "epoch": 1.5905476537126217, + "grad_norm": 1.2226722240447998, + "learning_rate": 9.779210526315789e-05, + "loss": 0.4761, + "step": 28404 + }, + { + "epoch": 1.5906036510247508, + "grad_norm": 
1.2393138408660889, + "learning_rate": 9.779184210526317e-05, + "loss": 0.3775, + "step": 28405 + }, + { + "epoch": 1.5906596483368798, + "grad_norm": 1.0617609024047852, + "learning_rate": 9.779157894736843e-05, + "loss": 0.3888, + "step": 28406 + }, + { + "epoch": 1.5907156456490088, + "grad_norm": 1.089975118637085, + "learning_rate": 9.77913157894737e-05, + "loss": 0.3558, + "step": 28407 + }, + { + "epoch": 1.5907716429611378, + "grad_norm": 1.3500378131866455, + "learning_rate": 9.779105263157894e-05, + "loss": 0.4809, + "step": 28408 + }, + { + "epoch": 1.5908276402732668, + "grad_norm": 1.1152628660202026, + "learning_rate": 9.779078947368422e-05, + "loss": 0.3475, + "step": 28409 + }, + { + "epoch": 1.5908836375853959, + "grad_norm": 1.1033310890197754, + "learning_rate": 9.779052631578948e-05, + "loss": 0.3902, + "step": 28410 + }, + { + "epoch": 1.5909396348975249, + "grad_norm": 1.1562005281448364, + "learning_rate": 9.779026315789475e-05, + "loss": 0.4295, + "step": 28411 + }, + { + "epoch": 1.590995632209654, + "grad_norm": 1.3591700792312622, + "learning_rate": 9.779e-05, + "loss": 0.4184, + "step": 28412 + }, + { + "epoch": 1.591051629521783, + "grad_norm": 1.2122689485549927, + "learning_rate": 9.778973684210527e-05, + "loss": 0.4766, + "step": 28413 + }, + { + "epoch": 1.591107626833912, + "grad_norm": 1.1197035312652588, + "learning_rate": 9.778947368421053e-05, + "loss": 0.5783, + "step": 28414 + }, + { + "epoch": 1.591163624146041, + "grad_norm": 1.5661566257476807, + "learning_rate": 9.778921052631579e-05, + "loss": 0.4492, + "step": 28415 + }, + { + "epoch": 1.59121962145817, + "grad_norm": 1.2168840169906616, + "learning_rate": 9.778894736842106e-05, + "loss": 0.4883, + "step": 28416 + }, + { + "epoch": 1.591275618770299, + "grad_norm": 1.9509795904159546, + "learning_rate": 9.778868421052631e-05, + "loss": 0.5233, + "step": 28417 + }, + { + "epoch": 1.591331616082428, + "grad_norm": 1.128519058227539, + "learning_rate": 
9.778842105263158e-05, + "loss": 0.373, + "step": 28418 + }, + { + "epoch": 1.591387613394557, + "grad_norm": 1.1893380880355835, + "learning_rate": 9.778815789473684e-05, + "loss": 0.444, + "step": 28419 + }, + { + "epoch": 1.591443610706686, + "grad_norm": 1.22463858127594, + "learning_rate": 9.778789473684212e-05, + "loss": 0.4, + "step": 28420 + }, + { + "epoch": 1.591499608018815, + "grad_norm": 1.1782256364822388, + "learning_rate": 9.778763157894738e-05, + "loss": 0.3917, + "step": 28421 + }, + { + "epoch": 1.591555605330944, + "grad_norm": 1.3224610090255737, + "learning_rate": 9.778736842105264e-05, + "loss": 0.4997, + "step": 28422 + }, + { + "epoch": 1.591611602643073, + "grad_norm": 2.5644896030426025, + "learning_rate": 9.77871052631579e-05, + "loss": 0.6133, + "step": 28423 + }, + { + "epoch": 1.5916675999552021, + "grad_norm": 1.3538347482681274, + "learning_rate": 9.778684210526317e-05, + "loss": 0.4019, + "step": 28424 + }, + { + "epoch": 1.5917235972673311, + "grad_norm": 1.201648473739624, + "learning_rate": 9.778657894736843e-05, + "loss": 0.4126, + "step": 28425 + }, + { + "epoch": 1.5917795945794602, + "grad_norm": 1.3906424045562744, + "learning_rate": 9.778631578947369e-05, + "loss": 0.4754, + "step": 28426 + }, + { + "epoch": 1.5918355918915892, + "grad_norm": 1.3985551595687866, + "learning_rate": 9.778605263157895e-05, + "loss": 0.4963, + "step": 28427 + }, + { + "epoch": 1.5918915892037182, + "grad_norm": 1.138534426689148, + "learning_rate": 9.778578947368422e-05, + "loss": 0.4352, + "step": 28428 + }, + { + "epoch": 1.5919475865158472, + "grad_norm": 1.607399821281433, + "learning_rate": 9.778552631578948e-05, + "loss": 0.4495, + "step": 28429 + }, + { + "epoch": 1.5920035838279762, + "grad_norm": 1.2148011922836304, + "learning_rate": 9.778526315789474e-05, + "loss": 0.4655, + "step": 28430 + }, + { + "epoch": 1.5920595811401053, + "grad_norm": 1.1976314783096313, + "learning_rate": 9.7785e-05, + "loss": 0.4267, + "step": 28431 + }, + 
{ + "epoch": 1.5921155784522343, + "grad_norm": 1.3428587913513184, + "learning_rate": 9.778473684210526e-05, + "loss": 0.4297, + "step": 28432 + }, + { + "epoch": 1.5921715757643633, + "grad_norm": 1.3928537368774414, + "learning_rate": 9.778447368421053e-05, + "loss": 0.4863, + "step": 28433 + }, + { + "epoch": 1.5922275730764923, + "grad_norm": 1.0944451093673706, + "learning_rate": 9.778421052631579e-05, + "loss": 0.3974, + "step": 28434 + }, + { + "epoch": 1.5922835703886213, + "grad_norm": 1.2076185941696167, + "learning_rate": 9.778394736842105e-05, + "loss": 0.5395, + "step": 28435 + }, + { + "epoch": 1.5923395677007504, + "grad_norm": 1.1822516918182373, + "learning_rate": 9.778368421052631e-05, + "loss": 0.405, + "step": 28436 + }, + { + "epoch": 1.5923955650128794, + "grad_norm": 1.5731171369552612, + "learning_rate": 9.778342105263159e-05, + "loss": 0.3564, + "step": 28437 + }, + { + "epoch": 1.5924515623250084, + "grad_norm": 1.2677079439163208, + "learning_rate": 9.778315789473684e-05, + "loss": 0.4636, + "step": 28438 + }, + { + "epoch": 1.5925075596371374, + "grad_norm": 1.1874058246612549, + "learning_rate": 9.778289473684212e-05, + "loss": 0.3787, + "step": 28439 + }, + { + "epoch": 1.5925635569492664, + "grad_norm": 1.4953478574752808, + "learning_rate": 9.778263157894736e-05, + "loss": 0.4372, + "step": 28440 + }, + { + "epoch": 1.5926195542613955, + "grad_norm": 1.2469935417175293, + "learning_rate": 9.778236842105264e-05, + "loss": 0.597, + "step": 28441 + }, + { + "epoch": 1.5926755515735245, + "grad_norm": 1.3644754886627197, + "learning_rate": 9.77821052631579e-05, + "loss": 0.3941, + "step": 28442 + }, + { + "epoch": 1.5927315488856535, + "grad_norm": 1.2577742338180542, + "learning_rate": 9.778184210526317e-05, + "loss": 0.4344, + "step": 28443 + }, + { + "epoch": 1.5927875461977825, + "grad_norm": 1.7004826068878174, + "learning_rate": 9.778157894736843e-05, + "loss": 0.6271, + "step": 28444 + }, + { + "epoch": 1.5928435435099115, + 
"grad_norm": 1.211025357246399, + "learning_rate": 9.778131578947369e-05, + "loss": 0.3728, + "step": 28445 + }, + { + "epoch": 1.5928995408220405, + "grad_norm": 1.2380868196487427, + "learning_rate": 9.778105263157895e-05, + "loss": 0.5776, + "step": 28446 + }, + { + "epoch": 1.5929555381341696, + "grad_norm": 2.2672958374023438, + "learning_rate": 9.778078947368422e-05, + "loss": 0.4709, + "step": 28447 + }, + { + "epoch": 1.5930115354462986, + "grad_norm": 1.4772093296051025, + "learning_rate": 9.778052631578948e-05, + "loss": 0.4569, + "step": 28448 + }, + { + "epoch": 1.5930675327584276, + "grad_norm": 1.0881645679473877, + "learning_rate": 9.778026315789474e-05, + "loss": 0.3936, + "step": 28449 + }, + { + "epoch": 1.5931235300705566, + "grad_norm": 1.9581612348556519, + "learning_rate": 9.778e-05, + "loss": 0.4379, + "step": 28450 + }, + { + "epoch": 1.5931795273826856, + "grad_norm": 1.2579345703125, + "learning_rate": 9.777973684210526e-05, + "loss": 0.4634, + "step": 28451 + }, + { + "epoch": 1.5932355246948147, + "grad_norm": 1.2960530519485474, + "learning_rate": 9.777947368421054e-05, + "loss": 0.4623, + "step": 28452 + }, + { + "epoch": 1.5932915220069437, + "grad_norm": 1.7639278173446655, + "learning_rate": 9.77792105263158e-05, + "loss": 0.3915, + "step": 28453 + }, + { + "epoch": 1.5933475193190727, + "grad_norm": 1.5790356397628784, + "learning_rate": 9.777894736842105e-05, + "loss": 0.4792, + "step": 28454 + }, + { + "epoch": 1.5934035166312017, + "grad_norm": 1.4003068208694458, + "learning_rate": 9.777868421052631e-05, + "loss": 0.4453, + "step": 28455 + }, + { + "epoch": 1.5934595139433307, + "grad_norm": 1.4069652557373047, + "learning_rate": 9.777842105263159e-05, + "loss": 0.4734, + "step": 28456 + }, + { + "epoch": 1.5935155112554598, + "grad_norm": 1.4430841207504272, + "learning_rate": 9.777815789473685e-05, + "loss": 0.4123, + "step": 28457 + }, + { + "epoch": 1.5935715085675888, + "grad_norm": 1.223680019378662, + "learning_rate": 
9.777789473684211e-05, + "loss": 0.3639, + "step": 28458 + }, + { + "epoch": 1.5936275058797178, + "grad_norm": 1.3366825580596924, + "learning_rate": 9.777763157894737e-05, + "loss": 0.3767, + "step": 28459 + }, + { + "epoch": 1.5936835031918468, + "grad_norm": 1.191957712173462, + "learning_rate": 9.777736842105264e-05, + "loss": 0.4411, + "step": 28460 + }, + { + "epoch": 1.5937395005039758, + "grad_norm": 1.339077115058899, + "learning_rate": 9.77771052631579e-05, + "loss": 0.521, + "step": 28461 + }, + { + "epoch": 1.5937954978161049, + "grad_norm": 1.3121421337127686, + "learning_rate": 9.777684210526317e-05, + "loss": 0.5048, + "step": 28462 + }, + { + "epoch": 1.5938514951282339, + "grad_norm": 1.209811806678772, + "learning_rate": 9.777657894736842e-05, + "loss": 0.416, + "step": 28463 + }, + { + "epoch": 1.593907492440363, + "grad_norm": 1.111407995223999, + "learning_rate": 9.777631578947369e-05, + "loss": 0.4355, + "step": 28464 + }, + { + "epoch": 1.593963489752492, + "grad_norm": 1.4819329977035522, + "learning_rate": 9.777605263157895e-05, + "loss": 0.4893, + "step": 28465 + }, + { + "epoch": 1.594019487064621, + "grad_norm": 1.4028327465057373, + "learning_rate": 9.777578947368423e-05, + "loss": 0.417, + "step": 28466 + }, + { + "epoch": 1.59407548437675, + "grad_norm": 1.1550005674362183, + "learning_rate": 9.777552631578947e-05, + "loss": 0.4364, + "step": 28467 + }, + { + "epoch": 1.594131481688879, + "grad_norm": 1.216228723526001, + "learning_rate": 9.777526315789473e-05, + "loss": 0.2954, + "step": 28468 + }, + { + "epoch": 1.594187479001008, + "grad_norm": 1.1248055696487427, + "learning_rate": 9.7775e-05, + "loss": 0.3729, + "step": 28469 + }, + { + "epoch": 1.594243476313137, + "grad_norm": 1.0551655292510986, + "learning_rate": 9.777473684210526e-05, + "loss": 0.3796, + "step": 28470 + }, + { + "epoch": 1.594299473625266, + "grad_norm": 1.4725807905197144, + "learning_rate": 9.777447368421054e-05, + "loss": 0.5342, + "step": 28471 + }, + { 
+ "epoch": 1.594355470937395, + "grad_norm": 1.3018877506256104, + "learning_rate": 9.777421052631578e-05, + "loss": 0.5014, + "step": 28472 + }, + { + "epoch": 1.594411468249524, + "grad_norm": 1.571959376335144, + "learning_rate": 9.777394736842106e-05, + "loss": 0.5121, + "step": 28473 + }, + { + "epoch": 1.594467465561653, + "grad_norm": 1.4337856769561768, + "learning_rate": 9.777368421052632e-05, + "loss": 0.4807, + "step": 28474 + }, + { + "epoch": 1.594523462873782, + "grad_norm": 1.2302309274673462, + "learning_rate": 9.777342105263159e-05, + "loss": 0.4135, + "step": 28475 + }, + { + "epoch": 1.5945794601859111, + "grad_norm": 1.445213794708252, + "learning_rate": 9.777315789473685e-05, + "loss": 0.5878, + "step": 28476 + }, + { + "epoch": 1.5946354574980401, + "grad_norm": 1.50583016872406, + "learning_rate": 9.777289473684211e-05, + "loss": 0.4031, + "step": 28477 + }, + { + "epoch": 1.5946914548101692, + "grad_norm": 1.5212727785110474, + "learning_rate": 9.777263157894737e-05, + "loss": 0.4732, + "step": 28478 + }, + { + "epoch": 1.5947474521222982, + "grad_norm": 1.2776482105255127, + "learning_rate": 9.777236842105264e-05, + "loss": 0.4414, + "step": 28479 + }, + { + "epoch": 1.5948034494344272, + "grad_norm": 1.2676594257354736, + "learning_rate": 9.77721052631579e-05, + "loss": 0.5139, + "step": 28480 + }, + { + "epoch": 1.5948594467465562, + "grad_norm": 1.4428002834320068, + "learning_rate": 9.777184210526316e-05, + "loss": 0.4788, + "step": 28481 + }, + { + "epoch": 1.5949154440586852, + "grad_norm": 1.3846535682678223, + "learning_rate": 9.777157894736842e-05, + "loss": 0.5191, + "step": 28482 + }, + { + "epoch": 1.5949714413708143, + "grad_norm": 1.5444915294647217, + "learning_rate": 9.77713157894737e-05, + "loss": 0.5192, + "step": 28483 + }, + { + "epoch": 1.5950274386829433, + "grad_norm": 1.2789089679718018, + "learning_rate": 9.777105263157896e-05, + "loss": 0.4586, + "step": 28484 + }, + { + "epoch": 1.5950834359950723, + "grad_norm": 
1.6008487939834595, + "learning_rate": 9.777078947368421e-05, + "loss": 0.5281, + "step": 28485 + }, + { + "epoch": 1.5951394333072013, + "grad_norm": 1.1789329051971436, + "learning_rate": 9.777052631578947e-05, + "loss": 0.4533, + "step": 28486 + }, + { + "epoch": 1.5951954306193303, + "grad_norm": 1.3337887525558472, + "learning_rate": 9.777026315789473e-05, + "loss": 0.4568, + "step": 28487 + }, + { + "epoch": 1.5952514279314594, + "grad_norm": 1.1670817136764526, + "learning_rate": 9.777000000000001e-05, + "loss": 0.4196, + "step": 28488 + }, + { + "epoch": 1.5953074252435884, + "grad_norm": 1.2730642557144165, + "learning_rate": 9.776973684210527e-05, + "loss": 0.3497, + "step": 28489 + }, + { + "epoch": 1.5953634225557174, + "grad_norm": 1.3642950057983398, + "learning_rate": 9.776947368421053e-05, + "loss": 0.4985, + "step": 28490 + }, + { + "epoch": 1.5954194198678464, + "grad_norm": 1.2098121643066406, + "learning_rate": 9.776921052631579e-05, + "loss": 0.6366, + "step": 28491 + }, + { + "epoch": 1.5954754171799754, + "grad_norm": 1.8024334907531738, + "learning_rate": 9.776894736842106e-05, + "loss": 0.4126, + "step": 28492 + }, + { + "epoch": 1.5955314144921044, + "grad_norm": 1.2786428928375244, + "learning_rate": 9.776868421052632e-05, + "loss": 0.5756, + "step": 28493 + }, + { + "epoch": 1.5955874118042335, + "grad_norm": 4.292102813720703, + "learning_rate": 9.776842105263159e-05, + "loss": 0.548, + "step": 28494 + }, + { + "epoch": 1.5956434091163625, + "grad_norm": 1.196244239807129, + "learning_rate": 9.776815789473684e-05, + "loss": 0.3955, + "step": 28495 + }, + { + "epoch": 1.5956994064284915, + "grad_norm": 1.4397553205490112, + "learning_rate": 9.776789473684211e-05, + "loss": 0.5208, + "step": 28496 + }, + { + "epoch": 1.5957554037406205, + "grad_norm": 1.1940295696258545, + "learning_rate": 9.776763157894737e-05, + "loss": 0.3456, + "step": 28497 + }, + { + "epoch": 1.5958114010527495, + "grad_norm": 1.2804096937179565, + "learning_rate": 
9.776736842105265e-05, + "loss": 0.4411, + "step": 28498 + }, + { + "epoch": 1.5958673983648786, + "grad_norm": 2.038516044616699, + "learning_rate": 9.77671052631579e-05, + "loss": 0.3775, + "step": 28499 + }, + { + "epoch": 1.5959233956770076, + "grad_norm": 1.3518465757369995, + "learning_rate": 9.776684210526316e-05, + "loss": 0.4528, + "step": 28500 + }, + { + "epoch": 1.5959793929891366, + "grad_norm": 1.3068658113479614, + "learning_rate": 9.776657894736842e-05, + "loss": 0.4894, + "step": 28501 + }, + { + "epoch": 1.5960353903012656, + "grad_norm": 1.3878062963485718, + "learning_rate": 9.776631578947368e-05, + "loss": 0.4538, + "step": 28502 + }, + { + "epoch": 1.5960913876133946, + "grad_norm": 1.7153552770614624, + "learning_rate": 9.776605263157896e-05, + "loss": 0.5061, + "step": 28503 + }, + { + "epoch": 1.5961473849255237, + "grad_norm": 1.4197717905044556, + "learning_rate": 9.776578947368422e-05, + "loss": 0.5573, + "step": 28504 + }, + { + "epoch": 1.5962033822376527, + "grad_norm": 1.4265944957733154, + "learning_rate": 9.776552631578948e-05, + "loss": 0.5851, + "step": 28505 + }, + { + "epoch": 1.5962593795497817, + "grad_norm": 1.3374272584915161, + "learning_rate": 9.776526315789474e-05, + "loss": 0.334, + "step": 28506 + }, + { + "epoch": 1.5963153768619107, + "grad_norm": 1.7477167844772339, + "learning_rate": 9.776500000000001e-05, + "loss": 0.6123, + "step": 28507 + }, + { + "epoch": 1.5963713741740397, + "grad_norm": 1.1946483850479126, + "learning_rate": 9.776473684210527e-05, + "loss": 0.4356, + "step": 28508 + }, + { + "epoch": 1.5964273714861688, + "grad_norm": 1.7405896186828613, + "learning_rate": 9.776447368421053e-05, + "loss": 0.4198, + "step": 28509 + }, + { + "epoch": 1.5964833687982978, + "grad_norm": 1.3562170267105103, + "learning_rate": 9.776421052631579e-05, + "loss": 0.3167, + "step": 28510 + }, + { + "epoch": 1.5965393661104268, + "grad_norm": 1.243408203125, + "learning_rate": 9.776394736842106e-05, + "loss": 0.5274, + 
"step": 28511 + }, + { + "epoch": 1.5965953634225558, + "grad_norm": 2.603712320327759, + "learning_rate": 9.776368421052632e-05, + "loss": 0.4354, + "step": 28512 + }, + { + "epoch": 1.5966513607346848, + "grad_norm": 1.4916447401046753, + "learning_rate": 9.776342105263158e-05, + "loss": 0.4733, + "step": 28513 + }, + { + "epoch": 1.5967073580468139, + "grad_norm": 1.0743104219436646, + "learning_rate": 9.776315789473684e-05, + "loss": 0.3415, + "step": 28514 + }, + { + "epoch": 1.5967633553589429, + "grad_norm": 1.333001971244812, + "learning_rate": 9.776289473684212e-05, + "loss": 0.6249, + "step": 28515 + }, + { + "epoch": 1.596819352671072, + "grad_norm": 1.2960251569747925, + "learning_rate": 9.776263157894737e-05, + "loss": 0.4857, + "step": 28516 + }, + { + "epoch": 1.596875349983201, + "grad_norm": 1.4187594652175903, + "learning_rate": 9.776236842105265e-05, + "loss": 0.5072, + "step": 28517 + }, + { + "epoch": 1.59693134729533, + "grad_norm": 0.9220099449157715, + "learning_rate": 9.77621052631579e-05, + "loss": 0.3494, + "step": 28518 + }, + { + "epoch": 1.596987344607459, + "grad_norm": 1.3434042930603027, + "learning_rate": 9.776184210526315e-05, + "loss": 0.4417, + "step": 28519 + }, + { + "epoch": 1.597043341919588, + "grad_norm": 1.4903477430343628, + "learning_rate": 9.776157894736843e-05, + "loss": 0.5027, + "step": 28520 + }, + { + "epoch": 1.597099339231717, + "grad_norm": 1.5842574834823608, + "learning_rate": 9.776131578947369e-05, + "loss": 0.688, + "step": 28521 + }, + { + "epoch": 1.597155336543846, + "grad_norm": 1.6226140260696411, + "learning_rate": 9.776105263157895e-05, + "loss": 0.4676, + "step": 28522 + }, + { + "epoch": 1.597211333855975, + "grad_norm": 1.1888446807861328, + "learning_rate": 9.77607894736842e-05, + "loss": 0.4191, + "step": 28523 + }, + { + "epoch": 1.597267331168104, + "grad_norm": 1.2289389371871948, + "learning_rate": 9.776052631578948e-05, + "loss": 0.4819, + "step": 28524 + }, + { + "epoch": 
1.597323328480233, + "grad_norm": 1.3530793190002441, + "learning_rate": 9.776026315789474e-05, + "loss": 0.5033, + "step": 28525 + }, + { + "epoch": 1.597379325792362, + "grad_norm": 1.10776948928833, + "learning_rate": 9.776000000000001e-05, + "loss": 0.3494, + "step": 28526 + }, + { + "epoch": 1.597435323104491, + "grad_norm": 1.1916927099227905, + "learning_rate": 9.775973684210526e-05, + "loss": 0.4193, + "step": 28527 + }, + { + "epoch": 1.5974913204166201, + "grad_norm": 1.2298662662506104, + "learning_rate": 9.775947368421053e-05, + "loss": 0.418, + "step": 28528 + }, + { + "epoch": 1.5975473177287491, + "grad_norm": 1.2551629543304443, + "learning_rate": 9.775921052631579e-05, + "loss": 0.3814, + "step": 28529 + }, + { + "epoch": 1.5976033150408782, + "grad_norm": 1.444276213645935, + "learning_rate": 9.775894736842107e-05, + "loss": 0.5472, + "step": 28530 + }, + { + "epoch": 1.5976593123530072, + "grad_norm": 1.3685768842697144, + "learning_rate": 9.775868421052632e-05, + "loss": 0.6566, + "step": 28531 + }, + { + "epoch": 1.5977153096651362, + "grad_norm": 1.209527850151062, + "learning_rate": 9.775842105263158e-05, + "loss": 0.6138, + "step": 28532 + }, + { + "epoch": 1.5977713069772652, + "grad_norm": 1.4719892740249634, + "learning_rate": 9.775815789473684e-05, + "loss": 0.453, + "step": 28533 + }, + { + "epoch": 1.5978273042893942, + "grad_norm": 1.3141635656356812, + "learning_rate": 9.775789473684212e-05, + "loss": 0.4432, + "step": 28534 + }, + { + "epoch": 1.5978833016015233, + "grad_norm": 1.5745213031768799, + "learning_rate": 9.775763157894738e-05, + "loss": 0.5168, + "step": 28535 + }, + { + "epoch": 1.5979392989136523, + "grad_norm": 1.1044379472732544, + "learning_rate": 9.775736842105264e-05, + "loss": 0.4065, + "step": 28536 + }, + { + "epoch": 1.5979952962257813, + "grad_norm": 1.2080252170562744, + "learning_rate": 9.77571052631579e-05, + "loss": 0.4802, + "step": 28537 + }, + { + "epoch": 1.5980512935379103, + "grad_norm": 
1.4356034994125366, + "learning_rate": 9.775684210526316e-05, + "loss": 0.383, + "step": 28538 + }, + { + "epoch": 1.5981072908500393, + "grad_norm": 1.205481767654419, + "learning_rate": 9.775657894736843e-05, + "loss": 0.3951, + "step": 28539 + }, + { + "epoch": 1.5981632881621683, + "grad_norm": 2.1522939205169678, + "learning_rate": 9.775631578947369e-05, + "loss": 0.5754, + "step": 28540 + }, + { + "epoch": 1.5982192854742974, + "grad_norm": 1.5407639741897583, + "learning_rate": 9.775605263157895e-05, + "loss": 0.5231, + "step": 28541 + }, + { + "epoch": 1.5982752827864264, + "grad_norm": 1.5534600019454956, + "learning_rate": 9.775578947368421e-05, + "loss": 0.4517, + "step": 28542 + }, + { + "epoch": 1.5983312800985554, + "grad_norm": 1.257481575012207, + "learning_rate": 9.775552631578948e-05, + "loss": 0.5159, + "step": 28543 + }, + { + "epoch": 1.5983872774106844, + "grad_norm": 1.5003831386566162, + "learning_rate": 9.775526315789474e-05, + "loss": 0.4466, + "step": 28544 + }, + { + "epoch": 1.5984432747228134, + "grad_norm": 1.46065354347229, + "learning_rate": 9.7755e-05, + "loss": 0.517, + "step": 28545 + }, + { + "epoch": 1.5984992720349425, + "grad_norm": 1.3138266801834106, + "learning_rate": 9.775473684210526e-05, + "loss": 0.4196, + "step": 28546 + }, + { + "epoch": 1.5985552693470715, + "grad_norm": 8.69722843170166, + "learning_rate": 9.775447368421053e-05, + "loss": 0.4998, + "step": 28547 + }, + { + "epoch": 1.5986112666592005, + "grad_norm": 1.264536738395691, + "learning_rate": 9.77542105263158e-05, + "loss": 0.4926, + "step": 28548 + }, + { + "epoch": 1.5986672639713295, + "grad_norm": 1.1531795263290405, + "learning_rate": 9.775394736842107e-05, + "loss": 0.4186, + "step": 28549 + }, + { + "epoch": 1.5987232612834585, + "grad_norm": 1.7917656898498535, + "learning_rate": 9.775368421052631e-05, + "loss": 0.4706, + "step": 28550 + }, + { + "epoch": 1.5987792585955876, + "grad_norm": 1.5780102014541626, + "learning_rate": 
9.775342105263159e-05, + "loss": 0.4027, + "step": 28551 + }, + { + "epoch": 1.5988352559077166, + "grad_norm": 1.246495246887207, + "learning_rate": 9.775315789473685e-05, + "loss": 0.5135, + "step": 28552 + }, + { + "epoch": 1.5988912532198456, + "grad_norm": 1.2440063953399658, + "learning_rate": 9.77528947368421e-05, + "loss": 0.4332, + "step": 28553 + }, + { + "epoch": 1.5989472505319746, + "grad_norm": 1.3712800741195679, + "learning_rate": 9.775263157894738e-05, + "loss": 0.4079, + "step": 28554 + }, + { + "epoch": 1.5990032478441036, + "grad_norm": 1.4920446872711182, + "learning_rate": 9.775236842105263e-05, + "loss": 0.3994, + "step": 28555 + }, + { + "epoch": 1.5990592451562327, + "grad_norm": 1.6190557479858398, + "learning_rate": 9.77521052631579e-05, + "loss": 0.4627, + "step": 28556 + }, + { + "epoch": 1.5991152424683617, + "grad_norm": 1.5346342325210571, + "learning_rate": 9.775184210526316e-05, + "loss": 0.4299, + "step": 28557 + }, + { + "epoch": 1.5991712397804907, + "grad_norm": 1.3129130601882935, + "learning_rate": 9.775157894736843e-05, + "loss": 0.5504, + "step": 28558 + }, + { + "epoch": 1.5992272370926197, + "grad_norm": 1.2687749862670898, + "learning_rate": 9.775131578947368e-05, + "loss": 0.5004, + "step": 28559 + }, + { + "epoch": 1.5992832344047487, + "grad_norm": 1.371648907661438, + "learning_rate": 9.775105263157895e-05, + "loss": 0.4729, + "step": 28560 + }, + { + "epoch": 1.5993392317168778, + "grad_norm": 1.2943694591522217, + "learning_rate": 9.775078947368421e-05, + "loss": 0.6238, + "step": 28561 + }, + { + "epoch": 1.5993952290290068, + "grad_norm": 1.118265151977539, + "learning_rate": 9.775052631578948e-05, + "loss": 0.3792, + "step": 28562 + }, + { + "epoch": 1.5994512263411358, + "grad_norm": 1.4891771078109741, + "learning_rate": 9.775026315789474e-05, + "loss": 0.4695, + "step": 28563 + }, + { + "epoch": 1.5995072236532648, + "grad_norm": 1.7308967113494873, + "learning_rate": 9.775e-05, + "loss": 0.4526, + "step": 
28564 + }, + { + "epoch": 1.5995632209653936, + "grad_norm": 1.083701252937317, + "learning_rate": 9.774973684210526e-05, + "loss": 0.3322, + "step": 28565 + }, + { + "epoch": 1.5996192182775226, + "grad_norm": 1.39650559425354, + "learning_rate": 9.774947368421054e-05, + "loss": 0.3853, + "step": 28566 + }, + { + "epoch": 1.5996752155896516, + "grad_norm": 1.2509013414382935, + "learning_rate": 9.77492105263158e-05, + "loss": 0.4567, + "step": 28567 + }, + { + "epoch": 1.5997312129017807, + "grad_norm": 1.6539342403411865, + "learning_rate": 9.774894736842106e-05, + "loss": 0.5734, + "step": 28568 + }, + { + "epoch": 1.5997872102139097, + "grad_norm": 1.1319507360458374, + "learning_rate": 9.774868421052632e-05, + "loss": 0.3607, + "step": 28569 + }, + { + "epoch": 1.5998432075260387, + "grad_norm": 1.6434319019317627, + "learning_rate": 9.774842105263159e-05, + "loss": 0.4695, + "step": 28570 + }, + { + "epoch": 1.5998992048381677, + "grad_norm": 1.5648937225341797, + "learning_rate": 9.774815789473685e-05, + "loss": 0.6171, + "step": 28571 + }, + { + "epoch": 1.5999552021502967, + "grad_norm": 1.357996940612793, + "learning_rate": 9.774789473684211e-05, + "loss": 0.3983, + "step": 28572 + }, + { + "epoch": 1.6000111994624258, + "grad_norm": 1.4389029741287231, + "learning_rate": 9.774763157894737e-05, + "loss": 0.4872, + "step": 28573 + }, + { + "epoch": 1.6000671967745548, + "grad_norm": 1.4677770137786865, + "learning_rate": 9.774736842105263e-05, + "loss": 0.526, + "step": 28574 + }, + { + "epoch": 1.6001231940866838, + "grad_norm": 1.207441806793213, + "learning_rate": 9.77471052631579e-05, + "loss": 0.5071, + "step": 28575 + }, + { + "epoch": 1.6001791913988128, + "grad_norm": 1.2272001504898071, + "learning_rate": 9.774684210526316e-05, + "loss": 0.5347, + "step": 28576 + }, + { + "epoch": 1.6002351887109418, + "grad_norm": 1.299938440322876, + "learning_rate": 9.774657894736842e-05, + "loss": 0.4146, + "step": 28577 + }, + { + "epoch": 1.6002911860230709, 
+ "grad_norm": 1.4660892486572266, + "learning_rate": 9.774631578947368e-05, + "loss": 0.5567, + "step": 28578 + }, + { + "epoch": 1.6003471833351999, + "grad_norm": 1.6837488412857056, + "learning_rate": 9.774605263157895e-05, + "loss": 0.5576, + "step": 28579 + }, + { + "epoch": 1.600403180647329, + "grad_norm": 1.3120943307876587, + "learning_rate": 9.774578947368421e-05, + "loss": 0.4611, + "step": 28580 + }, + { + "epoch": 1.600459177959458, + "grad_norm": 1.2361335754394531, + "learning_rate": 9.774552631578949e-05, + "loss": 0.4284, + "step": 28581 + }, + { + "epoch": 1.600515175271587, + "grad_norm": 1.3941798210144043, + "learning_rate": 9.774526315789473e-05, + "loss": 0.629, + "step": 28582 + }, + { + "epoch": 1.600571172583716, + "grad_norm": 1.0770739316940308, + "learning_rate": 9.774500000000001e-05, + "loss": 0.479, + "step": 28583 + }, + { + "epoch": 1.600627169895845, + "grad_norm": 1.4396823644638062, + "learning_rate": 9.774473684210527e-05, + "loss": 0.5685, + "step": 28584 + }, + { + "epoch": 1.600683167207974, + "grad_norm": 1.361149549484253, + "learning_rate": 9.774447368421054e-05, + "loss": 0.4855, + "step": 28585 + }, + { + "epoch": 1.600739164520103, + "grad_norm": 1.5855399370193481, + "learning_rate": 9.77442105263158e-05, + "loss": 0.5503, + "step": 28586 + }, + { + "epoch": 1.600795161832232, + "grad_norm": 1.39113450050354, + "learning_rate": 9.774394736842106e-05, + "loss": 0.5595, + "step": 28587 + }, + { + "epoch": 1.600851159144361, + "grad_norm": 1.7800246477127075, + "learning_rate": 9.774368421052632e-05, + "loss": 0.4223, + "step": 28588 + }, + { + "epoch": 1.60090715645649, + "grad_norm": 1.1487984657287598, + "learning_rate": 9.774342105263158e-05, + "loss": 0.4232, + "step": 28589 + }, + { + "epoch": 1.600963153768619, + "grad_norm": 2.425628900527954, + "learning_rate": 9.774315789473685e-05, + "loss": 0.5678, + "step": 28590 + }, + { + "epoch": 1.601019151080748, + "grad_norm": 1.4537615776062012, + "learning_rate": 
9.774289473684211e-05, + "loss": 0.5565, + "step": 28591 + }, + { + "epoch": 1.6010751483928771, + "grad_norm": 1.319662094116211, + "learning_rate": 9.774263157894737e-05, + "loss": 0.4901, + "step": 28592 + }, + { + "epoch": 1.6011311457050061, + "grad_norm": 1.9893040657043457, + "learning_rate": 9.774236842105263e-05, + "loss": 0.5555, + "step": 28593 + }, + { + "epoch": 1.6011871430171352, + "grad_norm": 1.6523683071136475, + "learning_rate": 9.77421052631579e-05, + "loss": 0.4821, + "step": 28594 + }, + { + "epoch": 1.6012431403292642, + "grad_norm": 1.3857654333114624, + "learning_rate": 9.774184210526316e-05, + "loss": 0.8286, + "step": 28595 + }, + { + "epoch": 1.6012991376413932, + "grad_norm": 1.2868489027023315, + "learning_rate": 9.774157894736842e-05, + "loss": 0.4676, + "step": 28596 + }, + { + "epoch": 1.6013551349535222, + "grad_norm": 1.4066622257232666, + "learning_rate": 9.774131578947368e-05, + "loss": 0.426, + "step": 28597 + }, + { + "epoch": 1.6014111322656512, + "grad_norm": 1.2741644382476807, + "learning_rate": 9.774105263157896e-05, + "loss": 0.3972, + "step": 28598 + }, + { + "epoch": 1.6014671295777803, + "grad_norm": 1.590599536895752, + "learning_rate": 9.774078947368422e-05, + "loss": 0.5083, + "step": 28599 + }, + { + "epoch": 1.6015231268899093, + "grad_norm": 2.7208175659179688, + "learning_rate": 9.774052631578948e-05, + "loss": 0.6283, + "step": 28600 + }, + { + "epoch": 1.6015791242020383, + "grad_norm": 1.4425791501998901, + "learning_rate": 9.774026315789474e-05, + "loss": 0.4716, + "step": 28601 + }, + { + "epoch": 1.6016351215141673, + "grad_norm": 1.4351321458816528, + "learning_rate": 9.774000000000001e-05, + "loss": 0.445, + "step": 28602 + }, + { + "epoch": 1.6016911188262963, + "grad_norm": 1.2157111167907715, + "learning_rate": 9.773973684210527e-05, + "loss": 0.3545, + "step": 28603 + }, + { + "epoch": 1.6017471161384254, + "grad_norm": 1.4584052562713623, + "learning_rate": 9.773947368421054e-05, + "loss": 0.5592, 
+ "step": 28604 + }, + { + "epoch": 1.6018031134505544, + "grad_norm": 1.518430233001709, + "learning_rate": 9.773921052631579e-05, + "loss": 0.5112, + "step": 28605 + }, + { + "epoch": 1.6018591107626834, + "grad_norm": 1.3541641235351562, + "learning_rate": 9.773894736842105e-05, + "loss": 0.4307, + "step": 28606 + }, + { + "epoch": 1.6019151080748124, + "grad_norm": 1.5336047410964966, + "learning_rate": 9.773868421052632e-05, + "loss": 0.529, + "step": 28607 + }, + { + "epoch": 1.6019711053869414, + "grad_norm": 1.3114652633666992, + "learning_rate": 9.773842105263158e-05, + "loss": 0.4296, + "step": 28608 + }, + { + "epoch": 1.6020271026990704, + "grad_norm": 1.2435579299926758, + "learning_rate": 9.773815789473685e-05, + "loss": 0.435, + "step": 28609 + }, + { + "epoch": 1.6020831000111995, + "grad_norm": 1.3253560066223145, + "learning_rate": 9.77378947368421e-05, + "loss": 0.4692, + "step": 28610 + }, + { + "epoch": 1.6021390973233285, + "grad_norm": 1.359158992767334, + "learning_rate": 9.773763157894737e-05, + "loss": 0.4447, + "step": 28611 + }, + { + "epoch": 1.6021950946354575, + "grad_norm": 1.135666012763977, + "learning_rate": 9.773736842105263e-05, + "loss": 0.4251, + "step": 28612 + }, + { + "epoch": 1.6022510919475865, + "grad_norm": 1.4796688556671143, + "learning_rate": 9.773710526315791e-05, + "loss": 0.6012, + "step": 28613 + }, + { + "epoch": 1.6023070892597155, + "grad_norm": 1.2790898084640503, + "learning_rate": 9.773684210526315e-05, + "loss": 0.5324, + "step": 28614 + }, + { + "epoch": 1.6023630865718446, + "grad_norm": 1.2509945631027222, + "learning_rate": 9.773657894736843e-05, + "loss": 0.5256, + "step": 28615 + }, + { + "epoch": 1.6024190838839736, + "grad_norm": 1.4188028573989868, + "learning_rate": 9.773631578947369e-05, + "loss": 0.5471, + "step": 28616 + }, + { + "epoch": 1.6024750811961026, + "grad_norm": 1.5398504734039307, + "learning_rate": 9.773605263157896e-05, + "loss": 0.6039, + "step": 28617 + }, + { + "epoch": 
1.6025310785082316, + "grad_norm": 1.2628508806228638, + "learning_rate": 9.773578947368422e-05, + "loss": 0.3908, + "step": 28618 + }, + { + "epoch": 1.6025870758203606, + "grad_norm": 1.200680136680603, + "learning_rate": 9.773552631578948e-05, + "loss": 0.3997, + "step": 28619 + }, + { + "epoch": 1.6026430731324897, + "grad_norm": 1.3624876737594604, + "learning_rate": 9.773526315789474e-05, + "loss": 0.4948, + "step": 28620 + }, + { + "epoch": 1.6026990704446187, + "grad_norm": 1.2947224378585815, + "learning_rate": 9.773500000000001e-05, + "loss": 0.4947, + "step": 28621 + }, + { + "epoch": 1.6027550677567477, + "grad_norm": 1.4677554368972778, + "learning_rate": 9.773473684210527e-05, + "loss": 0.4754, + "step": 28622 + }, + { + "epoch": 1.6028110650688767, + "grad_norm": 1.371080756187439, + "learning_rate": 9.773447368421053e-05, + "loss": 0.4582, + "step": 28623 + }, + { + "epoch": 1.6028670623810057, + "grad_norm": 1.522147536277771, + "learning_rate": 9.773421052631579e-05, + "loss": 0.5, + "step": 28624 + }, + { + "epoch": 1.6029230596931348, + "grad_norm": 1.4593405723571777, + "learning_rate": 9.773394736842105e-05, + "loss": 0.4705, + "step": 28625 + }, + { + "epoch": 1.6029790570052638, + "grad_norm": 1.56641685962677, + "learning_rate": 9.773368421052632e-05, + "loss": 0.5664, + "step": 28626 + }, + { + "epoch": 1.6030350543173928, + "grad_norm": 1.2927019596099854, + "learning_rate": 9.773342105263158e-05, + "loss": 0.4037, + "step": 28627 + }, + { + "epoch": 1.6030910516295218, + "grad_norm": 1.29274320602417, + "learning_rate": 9.773315789473684e-05, + "loss": 0.4363, + "step": 28628 + }, + { + "epoch": 1.6031470489416508, + "grad_norm": 1.2716351747512817, + "learning_rate": 9.77328947368421e-05, + "loss": 0.457, + "step": 28629 + }, + { + "epoch": 1.6032030462537799, + "grad_norm": 1.4632830619812012, + "learning_rate": 9.773263157894738e-05, + "loss": 0.4265, + "step": 28630 + }, + { + "epoch": 1.6032590435659089, + "grad_norm": 
1.4021910429000854, + "learning_rate": 9.773236842105264e-05, + "loss": 0.4463, + "step": 28631 + }, + { + "epoch": 1.603315040878038, + "grad_norm": 1.2475794553756714, + "learning_rate": 9.77321052631579e-05, + "loss": 0.5772, + "step": 28632 + }, + { + "epoch": 1.603371038190167, + "grad_norm": 1.1905592679977417, + "learning_rate": 9.773184210526316e-05, + "loss": 0.3985, + "step": 28633 + }, + { + "epoch": 1.603427035502296, + "grad_norm": 3.8043150901794434, + "learning_rate": 9.773157894736843e-05, + "loss": 0.4323, + "step": 28634 + }, + { + "epoch": 1.603483032814425, + "grad_norm": 1.1995283365249634, + "learning_rate": 9.773131578947369e-05, + "loss": 0.3641, + "step": 28635 + }, + { + "epoch": 1.603539030126554, + "grad_norm": 1.4894057512283325, + "learning_rate": 9.773105263157896e-05, + "loss": 0.4722, + "step": 28636 + }, + { + "epoch": 1.603595027438683, + "grad_norm": 1.453940987586975, + "learning_rate": 9.773078947368421e-05, + "loss": 0.4959, + "step": 28637 + }, + { + "epoch": 1.603651024750812, + "grad_norm": 1.3902662992477417, + "learning_rate": 9.773052631578948e-05, + "loss": 0.5107, + "step": 28638 + }, + { + "epoch": 1.603707022062941, + "grad_norm": 1.1786566972732544, + "learning_rate": 9.773026315789474e-05, + "loss": 0.4885, + "step": 28639 + }, + { + "epoch": 1.60376301937507, + "grad_norm": 1.3461360931396484, + "learning_rate": 9.773e-05, + "loss": 0.4654, + "step": 28640 + }, + { + "epoch": 1.603819016687199, + "grad_norm": 1.120482325553894, + "learning_rate": 9.772973684210527e-05, + "loss": 0.4848, + "step": 28641 + }, + { + "epoch": 1.603875013999328, + "grad_norm": 2.156015396118164, + "learning_rate": 9.772947368421052e-05, + "loss": 0.6806, + "step": 28642 + }, + { + "epoch": 1.603931011311457, + "grad_norm": 1.0594418048858643, + "learning_rate": 9.77292105263158e-05, + "loss": 0.3164, + "step": 28643 + }, + { + "epoch": 1.6039870086235861, + "grad_norm": 1.2195169925689697, + "learning_rate": 9.772894736842105e-05, + 
"loss": 0.4343, + "step": 28644 + }, + { + "epoch": 1.6040430059357151, + "grad_norm": 1.37152898311615, + "learning_rate": 9.772868421052633e-05, + "loss": 0.4588, + "step": 28645 + }, + { + "epoch": 1.6040990032478442, + "grad_norm": 1.2625083923339844, + "learning_rate": 9.772842105263159e-05, + "loss": 0.4527, + "step": 28646 + }, + { + "epoch": 1.6041550005599732, + "grad_norm": 1.5454283952713013, + "learning_rate": 9.772815789473685e-05, + "loss": 0.6302, + "step": 28647 + }, + { + "epoch": 1.604210997872102, + "grad_norm": 1.2913886308670044, + "learning_rate": 9.77278947368421e-05, + "loss": 0.5043, + "step": 28648 + }, + { + "epoch": 1.604266995184231, + "grad_norm": 1.4128942489624023, + "learning_rate": 9.772763157894738e-05, + "loss": 0.4755, + "step": 28649 + }, + { + "epoch": 1.60432299249636, + "grad_norm": 1.3026320934295654, + "learning_rate": 9.772736842105264e-05, + "loss": 0.4333, + "step": 28650 + }, + { + "epoch": 1.604378989808489, + "grad_norm": 1.2849268913269043, + "learning_rate": 9.77271052631579e-05, + "loss": 0.5542, + "step": 28651 + }, + { + "epoch": 1.604434987120618, + "grad_norm": 1.3272056579589844, + "learning_rate": 9.772684210526316e-05, + "loss": 0.3359, + "step": 28652 + }, + { + "epoch": 1.604490984432747, + "grad_norm": 1.3442047834396362, + "learning_rate": 9.772657894736843e-05, + "loss": 0.4682, + "step": 28653 + }, + { + "epoch": 1.604546981744876, + "grad_norm": 1.4577223062515259, + "learning_rate": 9.772631578947369e-05, + "loss": 0.4763, + "step": 28654 + }, + { + "epoch": 1.604602979057005, + "grad_norm": 1.1649953126907349, + "learning_rate": 9.772605263157895e-05, + "loss": 0.4263, + "step": 28655 + }, + { + "epoch": 1.6046589763691341, + "grad_norm": 1.4000461101531982, + "learning_rate": 9.772578947368421e-05, + "loss": 0.4431, + "step": 28656 + }, + { + "epoch": 1.6047149736812631, + "grad_norm": 1.293175220489502, + "learning_rate": 9.772552631578947e-05, + "loss": 0.4242, + "step": 28657 + }, + { + 
"epoch": 1.6047709709933922, + "grad_norm": 1.503591537475586, + "learning_rate": 9.772526315789474e-05, + "loss": 0.5596, + "step": 28658 + }, + { + "epoch": 1.6048269683055212, + "grad_norm": 1.1230645179748535, + "learning_rate": 9.7725e-05, + "loss": 0.3241, + "step": 28659 + }, + { + "epoch": 1.6048829656176502, + "grad_norm": 1.2284173965454102, + "learning_rate": 9.772473684210526e-05, + "loss": 0.334, + "step": 28660 + }, + { + "epoch": 1.6049389629297792, + "grad_norm": 1.3745721578598022, + "learning_rate": 9.772447368421052e-05, + "loss": 0.3021, + "step": 28661 + }, + { + "epoch": 1.6049949602419082, + "grad_norm": 1.3472864627838135, + "learning_rate": 9.77242105263158e-05, + "loss": 0.4249, + "step": 28662 + }, + { + "epoch": 1.6050509575540373, + "grad_norm": 1.5999746322631836, + "learning_rate": 9.772394736842106e-05, + "loss": 0.5218, + "step": 28663 + }, + { + "epoch": 1.6051069548661663, + "grad_norm": 2.2409257888793945, + "learning_rate": 9.772368421052633e-05, + "loss": 0.6774, + "step": 28664 + }, + { + "epoch": 1.6051629521782953, + "grad_norm": 1.3826230764389038, + "learning_rate": 9.772342105263158e-05, + "loss": 0.4914, + "step": 28665 + }, + { + "epoch": 1.6052189494904243, + "grad_norm": 1.2145477533340454, + "learning_rate": 9.772315789473685e-05, + "loss": 0.3978, + "step": 28666 + }, + { + "epoch": 1.6052749468025533, + "grad_norm": 1.286562204360962, + "learning_rate": 9.772289473684211e-05, + "loss": 0.577, + "step": 28667 + }, + { + "epoch": 1.6053309441146824, + "grad_norm": 1.5623323917388916, + "learning_rate": 9.772263157894738e-05, + "loss": 0.4105, + "step": 28668 + }, + { + "epoch": 1.6053869414268114, + "grad_norm": 1.4411369562149048, + "learning_rate": 9.772236842105263e-05, + "loss": 0.4962, + "step": 28669 + }, + { + "epoch": 1.6054429387389404, + "grad_norm": 1.319985270500183, + "learning_rate": 9.77221052631579e-05, + "loss": 0.6363, + "step": 28670 + }, + { + "epoch": 1.6054989360510694, + "grad_norm": 
1.4259133338928223, + "learning_rate": 9.772184210526316e-05, + "loss": 0.4932, + "step": 28671 + }, + { + "epoch": 1.6055549333631984, + "grad_norm": 1.8027509450912476, + "learning_rate": 9.772157894736843e-05, + "loss": 0.4294, + "step": 28672 + }, + { + "epoch": 1.6056109306753275, + "grad_norm": 1.3412691354751587, + "learning_rate": 9.77213157894737e-05, + "loss": 0.4014, + "step": 28673 + }, + { + "epoch": 1.6056669279874565, + "grad_norm": 1.2715134620666504, + "learning_rate": 9.772105263157894e-05, + "loss": 0.3943, + "step": 28674 + }, + { + "epoch": 1.6057229252995855, + "grad_norm": 1.5118589401245117, + "learning_rate": 9.772078947368421e-05, + "loss": 0.5511, + "step": 28675 + }, + { + "epoch": 1.6057789226117145, + "grad_norm": 1.4125322103500366, + "learning_rate": 9.772052631578947e-05, + "loss": 0.4525, + "step": 28676 + }, + { + "epoch": 1.6058349199238435, + "grad_norm": 1.3682007789611816, + "learning_rate": 9.772026315789475e-05, + "loss": 0.6081, + "step": 28677 + }, + { + "epoch": 1.6058909172359725, + "grad_norm": 1.3445745706558228, + "learning_rate": 9.772e-05, + "loss": 0.4932, + "step": 28678 + }, + { + "epoch": 1.6059469145481016, + "grad_norm": 3.1188995838165283, + "learning_rate": 9.771973684210527e-05, + "loss": 0.5, + "step": 28679 + }, + { + "epoch": 1.6060029118602306, + "grad_norm": 1.4412713050842285, + "learning_rate": 9.771947368421053e-05, + "loss": 0.4545, + "step": 28680 + }, + { + "epoch": 1.6060589091723596, + "grad_norm": 1.889561653137207, + "learning_rate": 9.77192105263158e-05, + "loss": 0.5177, + "step": 28681 + }, + { + "epoch": 1.6061149064844886, + "grad_norm": 1.3179881572723389, + "learning_rate": 9.771894736842106e-05, + "loss": 0.5502, + "step": 28682 + }, + { + "epoch": 1.6061709037966176, + "grad_norm": 1.4670768976211548, + "learning_rate": 9.771868421052632e-05, + "loss": 0.3974, + "step": 28683 + }, + { + "epoch": 1.6062269011087467, + "grad_norm": 1.2923095226287842, + "learning_rate": 
9.771842105263158e-05, + "loss": 0.6051, + "step": 28684 + }, + { + "epoch": 1.6062828984208757, + "grad_norm": 1.2992335557937622, + "learning_rate": 9.771815789473685e-05, + "loss": 0.5566, + "step": 28685 + }, + { + "epoch": 1.6063388957330047, + "grad_norm": 1.779698371887207, + "learning_rate": 9.771789473684211e-05, + "loss": 0.5703, + "step": 28686 + }, + { + "epoch": 1.6063948930451337, + "grad_norm": 1.2247023582458496, + "learning_rate": 9.771763157894737e-05, + "loss": 0.4534, + "step": 28687 + }, + { + "epoch": 1.6064508903572627, + "grad_norm": 1.4279910326004028, + "learning_rate": 9.771736842105263e-05, + "loss": 0.4967, + "step": 28688 + }, + { + "epoch": 1.6065068876693918, + "grad_norm": 1.3476687669754028, + "learning_rate": 9.77171052631579e-05, + "loss": 0.6584, + "step": 28689 + }, + { + "epoch": 1.6065628849815208, + "grad_norm": 1.3156377077102661, + "learning_rate": 9.771684210526316e-05, + "loss": 0.5392, + "step": 28690 + }, + { + "epoch": 1.6066188822936498, + "grad_norm": 1.3912566900253296, + "learning_rate": 9.771657894736844e-05, + "loss": 0.4671, + "step": 28691 + }, + { + "epoch": 1.6066748796057788, + "grad_norm": 1.198332667350769, + "learning_rate": 9.771631578947368e-05, + "loss": 0.3925, + "step": 28692 + }, + { + "epoch": 1.6067308769179078, + "grad_norm": 1.2896145582199097, + "learning_rate": 9.771605263157894e-05, + "loss": 0.4474, + "step": 28693 + }, + { + "epoch": 1.6067868742300369, + "grad_norm": 1.2734466791152954, + "learning_rate": 9.771578947368422e-05, + "loss": 0.4353, + "step": 28694 + }, + { + "epoch": 1.6068428715421659, + "grad_norm": 1.3854820728302002, + "learning_rate": 9.771552631578948e-05, + "loss": 0.4362, + "step": 28695 + }, + { + "epoch": 1.606898868854295, + "grad_norm": 1.2498865127563477, + "learning_rate": 9.771526315789475e-05, + "loss": 0.4349, + "step": 28696 + }, + { + "epoch": 1.606954866166424, + "grad_norm": 1.5366005897521973, + "learning_rate": 9.7715e-05, + "loss": 0.533, + "step": 
28697 + }, + { + "epoch": 1.607010863478553, + "grad_norm": 1.1614415645599365, + "learning_rate": 9.771473684210527e-05, + "loss": 0.5351, + "step": 28698 + }, + { + "epoch": 1.607066860790682, + "grad_norm": 1.7099281549453735, + "learning_rate": 9.771447368421053e-05, + "loss": 0.6309, + "step": 28699 + }, + { + "epoch": 1.607122858102811, + "grad_norm": 1.2391961812973022, + "learning_rate": 9.77142105263158e-05, + "loss": 0.5167, + "step": 28700 + }, + { + "epoch": 1.60717885541494, + "grad_norm": 1.4294838905334473, + "learning_rate": 9.771394736842106e-05, + "loss": 0.4542, + "step": 28701 + }, + { + "epoch": 1.607234852727069, + "grad_norm": 0.9986810684204102, + "learning_rate": 9.771368421052632e-05, + "loss": 0.318, + "step": 28702 + }, + { + "epoch": 1.607290850039198, + "grad_norm": 1.23465895652771, + "learning_rate": 9.771342105263158e-05, + "loss": 0.5482, + "step": 28703 + }, + { + "epoch": 1.607346847351327, + "grad_norm": 1.420244574546814, + "learning_rate": 9.771315789473685e-05, + "loss": 0.4165, + "step": 28704 + }, + { + "epoch": 1.607402844663456, + "grad_norm": 1.8302278518676758, + "learning_rate": 9.771289473684211e-05, + "loss": 0.5376, + "step": 28705 + }, + { + "epoch": 1.607458841975585, + "grad_norm": 1.2272710800170898, + "learning_rate": 9.771263157894737e-05, + "loss": 0.4202, + "step": 28706 + }, + { + "epoch": 1.607514839287714, + "grad_norm": 1.1919933557510376, + "learning_rate": 9.771236842105263e-05, + "loss": 0.4382, + "step": 28707 + }, + { + "epoch": 1.6075708365998431, + "grad_norm": 1.2344907522201538, + "learning_rate": 9.77121052631579e-05, + "loss": 0.3364, + "step": 28708 + }, + { + "epoch": 1.6076268339119721, + "grad_norm": 1.7100778818130493, + "learning_rate": 9.771184210526317e-05, + "loss": 0.5967, + "step": 28709 + }, + { + "epoch": 1.6076828312241012, + "grad_norm": 1.5475964546203613, + "learning_rate": 9.771157894736843e-05, + "loss": 0.5107, + "step": 28710 + }, + { + "epoch": 1.6077388285362302, + 
"grad_norm": 1.8974214792251587, + "learning_rate": 9.771131578947369e-05, + "loss": 0.7437, + "step": 28711 + }, + { + "epoch": 1.6077948258483592, + "grad_norm": 1.195861577987671, + "learning_rate": 9.771105263157895e-05, + "loss": 0.3528, + "step": 28712 + }, + { + "epoch": 1.6078508231604882, + "grad_norm": 1.2650530338287354, + "learning_rate": 9.771078947368422e-05, + "loss": 0.3817, + "step": 28713 + }, + { + "epoch": 1.6079068204726172, + "grad_norm": 1.4194986820220947, + "learning_rate": 9.771052631578948e-05, + "loss": 0.509, + "step": 28714 + }, + { + "epoch": 1.6079628177847463, + "grad_norm": 1.1955994367599487, + "learning_rate": 9.771026315789474e-05, + "loss": 0.4528, + "step": 28715 + }, + { + "epoch": 1.6080188150968753, + "grad_norm": 1.1781166791915894, + "learning_rate": 9.771e-05, + "loss": 0.5074, + "step": 28716 + }, + { + "epoch": 1.6080748124090043, + "grad_norm": 1.5795073509216309, + "learning_rate": 9.770973684210527e-05, + "loss": 0.3698, + "step": 28717 + }, + { + "epoch": 1.6081308097211333, + "grad_norm": 1.37860107421875, + "learning_rate": 9.770947368421053e-05, + "loss": 0.3894, + "step": 28718 + }, + { + "epoch": 1.6081868070332623, + "grad_norm": 1.3968303203582764, + "learning_rate": 9.77092105263158e-05, + "loss": 0.5085, + "step": 28719 + }, + { + "epoch": 1.6082428043453914, + "grad_norm": 1.8028346300125122, + "learning_rate": 9.770894736842105e-05, + "loss": 0.4945, + "step": 28720 + }, + { + "epoch": 1.6082988016575204, + "grad_norm": 1.4722049236297607, + "learning_rate": 9.770868421052632e-05, + "loss": 0.5267, + "step": 28721 + }, + { + "epoch": 1.6083547989696494, + "grad_norm": 1.2737699747085571, + "learning_rate": 9.770842105263158e-05, + "loss": 0.5085, + "step": 28722 + }, + { + "epoch": 1.6084107962817784, + "grad_norm": 1.464971661567688, + "learning_rate": 9.770815789473686e-05, + "loss": 0.5654, + "step": 28723 + }, + { + "epoch": 1.6084667935939074, + "grad_norm": 1.0730468034744263, + "learning_rate": 
9.77078947368421e-05, + "loss": 0.3335, + "step": 28724 + }, + { + "epoch": 1.6085227909060364, + "grad_norm": 1.6783322095870972, + "learning_rate": 9.770763157894738e-05, + "loss": 0.5438, + "step": 28725 + }, + { + "epoch": 1.6085787882181655, + "grad_norm": 1.5127896070480347, + "learning_rate": 9.770736842105264e-05, + "loss": 0.4101, + "step": 28726 + }, + { + "epoch": 1.6086347855302945, + "grad_norm": 1.3831170797348022, + "learning_rate": 9.77071052631579e-05, + "loss": 0.529, + "step": 28727 + }, + { + "epoch": 1.6086907828424235, + "grad_norm": 1.2516558170318604, + "learning_rate": 9.770684210526317e-05, + "loss": 0.5008, + "step": 28728 + }, + { + "epoch": 1.6087467801545525, + "grad_norm": 1.1753110885620117, + "learning_rate": 9.770657894736842e-05, + "loss": 0.4279, + "step": 28729 + }, + { + "epoch": 1.6088027774666815, + "grad_norm": 1.1439446210861206, + "learning_rate": 9.770631578947369e-05, + "loss": 0.4876, + "step": 28730 + }, + { + "epoch": 1.6088587747788106, + "grad_norm": 1.350986361503601, + "learning_rate": 9.770605263157895e-05, + "loss": 0.4439, + "step": 28731 + }, + { + "epoch": 1.6089147720909396, + "grad_norm": 1.4621875286102295, + "learning_rate": 9.770578947368422e-05, + "loss": 0.4695, + "step": 28732 + }, + { + "epoch": 1.6089707694030686, + "grad_norm": 1.6829105615615845, + "learning_rate": 9.770552631578948e-05, + "loss": 0.4594, + "step": 28733 + }, + { + "epoch": 1.6090267667151976, + "grad_norm": 1.4435217380523682, + "learning_rate": 9.770526315789474e-05, + "loss": 0.4287, + "step": 28734 + }, + { + "epoch": 1.6090827640273266, + "grad_norm": 1.356022834777832, + "learning_rate": 9.7705e-05, + "loss": 0.3338, + "step": 28735 + }, + { + "epoch": 1.6091387613394557, + "grad_norm": 1.1180651187896729, + "learning_rate": 9.770473684210527e-05, + "loss": 0.3526, + "step": 28736 + }, + { + "epoch": 1.6091947586515847, + "grad_norm": 1.066878318786621, + "learning_rate": 9.770447368421053e-05, + "loss": 0.3508, + "step": 
28737 + }, + { + "epoch": 1.6092507559637137, + "grad_norm": 1.2516587972640991, + "learning_rate": 9.77042105263158e-05, + "loss": 0.3417, + "step": 28738 + }, + { + "epoch": 1.6093067532758427, + "grad_norm": 1.305559754371643, + "learning_rate": 9.770394736842105e-05, + "loss": 0.4857, + "step": 28739 + }, + { + "epoch": 1.6093627505879717, + "grad_norm": 3.1608378887176514, + "learning_rate": 9.770368421052633e-05, + "loss": 0.3657, + "step": 28740 + }, + { + "epoch": 1.6094187479001008, + "grad_norm": 1.6207187175750732, + "learning_rate": 9.770342105263159e-05, + "loss": 0.5124, + "step": 28741 + }, + { + "epoch": 1.6094747452122298, + "grad_norm": 1.2724627256393433, + "learning_rate": 9.770315789473685e-05, + "loss": 0.489, + "step": 28742 + }, + { + "epoch": 1.6095307425243588, + "grad_norm": 1.23508620262146, + "learning_rate": 9.77028947368421e-05, + "loss": 0.4922, + "step": 28743 + }, + { + "epoch": 1.6095867398364878, + "grad_norm": 1.3437213897705078, + "learning_rate": 9.770263157894737e-05, + "loss": 0.4651, + "step": 28744 + }, + { + "epoch": 1.6096427371486168, + "grad_norm": 1.3021752834320068, + "learning_rate": 9.770236842105264e-05, + "loss": 0.4908, + "step": 28745 + }, + { + "epoch": 1.6096987344607459, + "grad_norm": 1.061396837234497, + "learning_rate": 9.77021052631579e-05, + "loss": 0.3948, + "step": 28746 + }, + { + "epoch": 1.6097547317728749, + "grad_norm": 1.5615110397338867, + "learning_rate": 9.770184210526316e-05, + "loss": 0.5421, + "step": 28747 + }, + { + "epoch": 1.6098107290850039, + "grad_norm": 1.4910013675689697, + "learning_rate": 9.770157894736842e-05, + "loss": 0.6022, + "step": 28748 + }, + { + "epoch": 1.609866726397133, + "grad_norm": 1.4547667503356934, + "learning_rate": 9.770131578947369e-05, + "loss": 0.5313, + "step": 28749 + }, + { + "epoch": 1.609922723709262, + "grad_norm": 1.4698052406311035, + "learning_rate": 9.770105263157895e-05, + "loss": 0.539, + "step": 28750 + }, + { + "epoch": 1.609978721021391, + 
"grad_norm": 1.3056294918060303, + "learning_rate": 9.770078947368422e-05, + "loss": 0.4185, + "step": 28751 + }, + { + "epoch": 1.61003471833352, + "grad_norm": 1.2521520853042603, + "learning_rate": 9.770052631578947e-05, + "loss": 0.4903, + "step": 28752 + }, + { + "epoch": 1.610090715645649, + "grad_norm": 1.549988031387329, + "learning_rate": 9.770026315789474e-05, + "loss": 0.502, + "step": 28753 + }, + { + "epoch": 1.610146712957778, + "grad_norm": 1.5590424537658691, + "learning_rate": 9.77e-05, + "loss": 0.5469, + "step": 28754 + }, + { + "epoch": 1.610202710269907, + "grad_norm": 1.0338451862335205, + "learning_rate": 9.769973684210528e-05, + "loss": 0.4221, + "step": 28755 + }, + { + "epoch": 1.610258707582036, + "grad_norm": 1.9603238105773926, + "learning_rate": 9.769947368421054e-05, + "loss": 0.5875, + "step": 28756 + }, + { + "epoch": 1.610314704894165, + "grad_norm": 1.5836817026138306, + "learning_rate": 9.76992105263158e-05, + "loss": 0.4774, + "step": 28757 + }, + { + "epoch": 1.610370702206294, + "grad_norm": 1.1883763074874878, + "learning_rate": 9.769894736842106e-05, + "loss": 0.354, + "step": 28758 + }, + { + "epoch": 1.610426699518423, + "grad_norm": 1.2072845697402954, + "learning_rate": 9.769868421052633e-05, + "loss": 0.3925, + "step": 28759 + }, + { + "epoch": 1.6104826968305521, + "grad_norm": 1.1078659296035767, + "learning_rate": 9.769842105263159e-05, + "loss": 0.4503, + "step": 28760 + }, + { + "epoch": 1.6105386941426811, + "grad_norm": 1.453686237335205, + "learning_rate": 9.769815789473683e-05, + "loss": 0.562, + "step": 28761 + }, + { + "epoch": 1.6105946914548102, + "grad_norm": 1.4762941598892212, + "learning_rate": 9.769789473684211e-05, + "loss": 0.6003, + "step": 28762 + }, + { + "epoch": 1.6106506887669392, + "grad_norm": 1.154151439666748, + "learning_rate": 9.769763157894737e-05, + "loss": 0.357, + "step": 28763 + }, + { + "epoch": 1.6107066860790682, + "grad_norm": 1.1706411838531494, + "learning_rate": 
9.769736842105264e-05, + "loss": 0.4488, + "step": 28764 + }, + { + "epoch": 1.6107626833911972, + "grad_norm": 1.2597057819366455, + "learning_rate": 9.76971052631579e-05, + "loss": 0.4386, + "step": 28765 + }, + { + "epoch": 1.6108186807033262, + "grad_norm": 1.1595110893249512, + "learning_rate": 9.769684210526316e-05, + "loss": 0.4152, + "step": 28766 + }, + { + "epoch": 1.6108746780154553, + "grad_norm": 1.3331243991851807, + "learning_rate": 9.769657894736842e-05, + "loss": 0.5759, + "step": 28767 + }, + { + "epoch": 1.6109306753275843, + "grad_norm": 1.391335129737854, + "learning_rate": 9.76963157894737e-05, + "loss": 0.6717, + "step": 28768 + }, + { + "epoch": 1.6109866726397133, + "grad_norm": 1.2932710647583008, + "learning_rate": 9.769605263157895e-05, + "loss": 0.396, + "step": 28769 + }, + { + "epoch": 1.6110426699518423, + "grad_norm": 1.1795724630355835, + "learning_rate": 9.769578947368421e-05, + "loss": 0.501, + "step": 28770 + }, + { + "epoch": 1.6110986672639713, + "grad_norm": 1.2339893579483032, + "learning_rate": 9.769552631578947e-05, + "loss": 0.4543, + "step": 28771 + }, + { + "epoch": 1.6111546645761003, + "grad_norm": 3.830420732498169, + "learning_rate": 9.769526315789475e-05, + "loss": 0.3997, + "step": 28772 + }, + { + "epoch": 1.6112106618882294, + "grad_norm": 1.1677887439727783, + "learning_rate": 9.7695e-05, + "loss": 0.3785, + "step": 28773 + }, + { + "epoch": 1.6112666592003584, + "grad_norm": 1.3042711019515991, + "learning_rate": 9.769473684210528e-05, + "loss": 0.6053, + "step": 28774 + }, + { + "epoch": 1.6113226565124874, + "grad_norm": 1.1209880113601685, + "learning_rate": 9.769447368421053e-05, + "loss": 0.4068, + "step": 28775 + }, + { + "epoch": 1.6113786538246164, + "grad_norm": 1.3490440845489502, + "learning_rate": 9.76942105263158e-05, + "loss": 0.5125, + "step": 28776 + }, + { + "epoch": 1.6114346511367454, + "grad_norm": 1.358336329460144, + "learning_rate": 9.769394736842106e-05, + "loss": 0.5652, + "step": 
28777 + }, + { + "epoch": 1.6114906484488745, + "grad_norm": 1.1783603429794312, + "learning_rate": 9.769368421052632e-05, + "loss": 0.4084, + "step": 28778 + }, + { + "epoch": 1.6115466457610035, + "grad_norm": 1.4164807796478271, + "learning_rate": 9.769342105263158e-05, + "loss": 0.5972, + "step": 28779 + }, + { + "epoch": 1.6116026430731325, + "grad_norm": 1.284678339958191, + "learning_rate": 9.769315789473684e-05, + "loss": 0.4398, + "step": 28780 + }, + { + "epoch": 1.6116586403852615, + "grad_norm": 1.2834359407424927, + "learning_rate": 9.769289473684211e-05, + "loss": 0.5424, + "step": 28781 + }, + { + "epoch": 1.6117146376973905, + "grad_norm": 1.4140981435775757, + "learning_rate": 9.769263157894737e-05, + "loss": 0.4268, + "step": 28782 + }, + { + "epoch": 1.6117706350095196, + "grad_norm": 1.4729633331298828, + "learning_rate": 9.769236842105264e-05, + "loss": 0.4791, + "step": 28783 + }, + { + "epoch": 1.6118266323216486, + "grad_norm": 1.2354273796081543, + "learning_rate": 9.769210526315789e-05, + "loss": 0.489, + "step": 28784 + }, + { + "epoch": 1.6118826296337776, + "grad_norm": 1.4806479215621948, + "learning_rate": 9.769184210526316e-05, + "loss": 0.4625, + "step": 28785 + }, + { + "epoch": 1.6119386269459066, + "grad_norm": 1.2784717082977295, + "learning_rate": 9.769157894736842e-05, + "loss": 0.4417, + "step": 28786 + }, + { + "epoch": 1.6119946242580356, + "grad_norm": 1.5859538316726685, + "learning_rate": 9.76913157894737e-05, + "loss": 0.5529, + "step": 28787 + }, + { + "epoch": 1.6120506215701647, + "grad_norm": 1.3850970268249512, + "learning_rate": 9.769105263157896e-05, + "loss": 0.4521, + "step": 28788 + }, + { + "epoch": 1.6121066188822937, + "grad_norm": 1.3167903423309326, + "learning_rate": 9.769078947368422e-05, + "loss": 0.4672, + "step": 28789 + }, + { + "epoch": 1.6121626161944227, + "grad_norm": 1.0518642663955688, + "learning_rate": 9.769052631578948e-05, + "loss": 0.3206, + "step": 28790 + }, + { + "epoch": 
1.6122186135065517, + "grad_norm": 1.182880163192749, + "learning_rate": 9.769026315789475e-05, + "loss": 0.4477, + "step": 28791 + }, + { + "epoch": 1.6122746108186807, + "grad_norm": 1.206493854522705, + "learning_rate": 9.769000000000001e-05, + "loss": 0.4195, + "step": 28792 + }, + { + "epoch": 1.6123306081308098, + "grad_norm": 1.2070099115371704, + "learning_rate": 9.768973684210527e-05, + "loss": 0.5545, + "step": 28793 + }, + { + "epoch": 1.6123866054429388, + "grad_norm": 1.3531379699707031, + "learning_rate": 9.768947368421053e-05, + "loss": 0.3874, + "step": 28794 + }, + { + "epoch": 1.6124426027550678, + "grad_norm": 1.6257026195526123, + "learning_rate": 9.768921052631579e-05, + "loss": 0.4392, + "step": 28795 + }, + { + "epoch": 1.6124986000671968, + "grad_norm": 1.2899463176727295, + "learning_rate": 9.768894736842106e-05, + "loss": 0.3819, + "step": 28796 + }, + { + "epoch": 1.6125545973793258, + "grad_norm": 1.8360083103179932, + "learning_rate": 9.768868421052632e-05, + "loss": 0.4681, + "step": 28797 + }, + { + "epoch": 1.6126105946914548, + "grad_norm": 1.408347249031067, + "learning_rate": 9.768842105263158e-05, + "loss": 0.4443, + "step": 28798 + }, + { + "epoch": 1.6126665920035839, + "grad_norm": 16.916879653930664, + "learning_rate": 9.768815789473684e-05, + "loss": 0.5161, + "step": 28799 + }, + { + "epoch": 1.6127225893157129, + "grad_norm": 1.405197262763977, + "learning_rate": 9.768789473684211e-05, + "loss": 0.4787, + "step": 28800 + }, + { + "epoch": 1.612778586627842, + "grad_norm": 1.0854650735855103, + "learning_rate": 9.768763157894737e-05, + "loss": 0.4347, + "step": 28801 + }, + { + "epoch": 1.612834583939971, + "grad_norm": 1.2529568672180176, + "learning_rate": 9.768736842105263e-05, + "loss": 0.419, + "step": 28802 + }, + { + "epoch": 1.6128905812521, + "grad_norm": 1.1711766719818115, + "learning_rate": 9.768710526315789e-05, + "loss": 0.3822, + "step": 28803 + }, + { + "epoch": 1.612946578564229, + "grad_norm": 
1.5420488119125366, + "learning_rate": 9.768684210526317e-05, + "loss": 0.4751, + "step": 28804 + }, + { + "epoch": 1.613002575876358, + "grad_norm": 1.1327838897705078, + "learning_rate": 9.768657894736843e-05, + "loss": 0.4577, + "step": 28805 + }, + { + "epoch": 1.613058573188487, + "grad_norm": 1.602009892463684, + "learning_rate": 9.76863157894737e-05, + "loss": 0.6539, + "step": 28806 + }, + { + "epoch": 1.613114570500616, + "grad_norm": 1.3847897052764893, + "learning_rate": 9.768605263157894e-05, + "loss": 0.5669, + "step": 28807 + }, + { + "epoch": 1.613170567812745, + "grad_norm": 1.1550319194793701, + "learning_rate": 9.768578947368422e-05, + "loss": 0.4647, + "step": 28808 + }, + { + "epoch": 1.613226565124874, + "grad_norm": 1.7570668458938599, + "learning_rate": 9.768552631578948e-05, + "loss": 0.4337, + "step": 28809 + }, + { + "epoch": 1.613282562437003, + "grad_norm": 1.6952332258224487, + "learning_rate": 9.768526315789475e-05, + "loss": 0.7457, + "step": 28810 + }, + { + "epoch": 1.613338559749132, + "grad_norm": 1.632666826248169, + "learning_rate": 9.768500000000001e-05, + "loss": 0.7305, + "step": 28811 + }, + { + "epoch": 1.6133945570612611, + "grad_norm": 1.2979798316955566, + "learning_rate": 9.768473684210527e-05, + "loss": 0.5339, + "step": 28812 + }, + { + "epoch": 1.6134505543733901, + "grad_norm": 1.3651719093322754, + "learning_rate": 9.768447368421053e-05, + "loss": 0.5517, + "step": 28813 + }, + { + "epoch": 1.6135065516855192, + "grad_norm": 1.3211630582809448, + "learning_rate": 9.768421052631579e-05, + "loss": 0.48, + "step": 28814 + }, + { + "epoch": 1.6135625489976482, + "grad_norm": 1.4898940324783325, + "learning_rate": 9.768394736842106e-05, + "loss": 0.5233, + "step": 28815 + }, + { + "epoch": 1.6136185463097772, + "grad_norm": 1.2606719732284546, + "learning_rate": 9.768368421052631e-05, + "loss": 0.5858, + "step": 28816 + }, + { + "epoch": 1.6136745436219062, + "grad_norm": 1.2960206270217896, + "learning_rate": 
9.768342105263158e-05, + "loss": 0.4371, + "step": 28817 + }, + { + "epoch": 1.6137305409340352, + "grad_norm": 1.3298046588897705, + "learning_rate": 9.768315789473684e-05, + "loss": 0.3939, + "step": 28818 + }, + { + "epoch": 1.6137865382461642, + "grad_norm": 1.3246947526931763, + "learning_rate": 9.768289473684212e-05, + "loss": 0.4704, + "step": 28819 + }, + { + "epoch": 1.6138425355582933, + "grad_norm": 1.300347924232483, + "learning_rate": 9.768263157894738e-05, + "loss": 0.4161, + "step": 28820 + }, + { + "epoch": 1.6138985328704223, + "grad_norm": 1.2787528038024902, + "learning_rate": 9.768236842105264e-05, + "loss": 0.4934, + "step": 28821 + }, + { + "epoch": 1.6139545301825513, + "grad_norm": 1.212634563446045, + "learning_rate": 9.76821052631579e-05, + "loss": 0.426, + "step": 28822 + }, + { + "epoch": 1.6140105274946803, + "grad_norm": 1.5091456174850464, + "learning_rate": 9.768184210526317e-05, + "loss": 0.4461, + "step": 28823 + }, + { + "epoch": 1.6140665248068093, + "grad_norm": 1.3671553134918213, + "learning_rate": 9.768157894736843e-05, + "loss": 0.4442, + "step": 28824 + }, + { + "epoch": 1.6141225221189384, + "grad_norm": 1.0682740211486816, + "learning_rate": 9.768131578947369e-05, + "loss": 0.3582, + "step": 28825 + }, + { + "epoch": 1.6141785194310674, + "grad_norm": 1.0848997831344604, + "learning_rate": 9.768105263157895e-05, + "loss": 0.3017, + "step": 28826 + }, + { + "epoch": 1.6142345167431964, + "grad_norm": 1.3373178243637085, + "learning_rate": 9.768078947368422e-05, + "loss": 0.4526, + "step": 28827 + }, + { + "epoch": 1.6142905140553254, + "grad_norm": 1.0836713314056396, + "learning_rate": 9.768052631578948e-05, + "loss": 0.3695, + "step": 28828 + }, + { + "epoch": 1.6143465113674544, + "grad_norm": 1.3050196170806885, + "learning_rate": 9.768026315789474e-05, + "loss": 0.5845, + "step": 28829 + }, + { + "epoch": 1.6144025086795835, + "grad_norm": 1.3665285110473633, + "learning_rate": 9.768e-05, + "loss": 0.4255, + "step": 
28830 + }, + { + "epoch": 1.6144585059917125, + "grad_norm": 1.3075628280639648, + "learning_rate": 9.767973684210526e-05, + "loss": 0.5029, + "step": 28831 + }, + { + "epoch": 1.6145145033038415, + "grad_norm": 1.5388578176498413, + "learning_rate": 9.767947368421053e-05, + "loss": 0.5079, + "step": 28832 + }, + { + "epoch": 1.6145705006159705, + "grad_norm": 1.5257887840270996, + "learning_rate": 9.767921052631579e-05, + "loss": 0.3415, + "step": 28833 + }, + { + "epoch": 1.6146264979280995, + "grad_norm": 1.3473721742630005, + "learning_rate": 9.767894736842105e-05, + "loss": 0.5001, + "step": 28834 + }, + { + "epoch": 1.6146824952402286, + "grad_norm": 1.303948163986206, + "learning_rate": 9.767868421052631e-05, + "loss": 0.4365, + "step": 28835 + }, + { + "epoch": 1.6147384925523576, + "grad_norm": 1.313559651374817, + "learning_rate": 9.767842105263159e-05, + "loss": 0.4855, + "step": 28836 + }, + { + "epoch": 1.6147944898644866, + "grad_norm": 1.1499208211898804, + "learning_rate": 9.767815789473685e-05, + "loss": 0.4064, + "step": 28837 + }, + { + "epoch": 1.6148504871766156, + "grad_norm": 1.278723955154419, + "learning_rate": 9.767789473684212e-05, + "loss": 0.4662, + "step": 28838 + }, + { + "epoch": 1.6149064844887446, + "grad_norm": 1.3083597421646118, + "learning_rate": 9.767763157894736e-05, + "loss": 0.534, + "step": 28839 + }, + { + "epoch": 1.6149624818008737, + "grad_norm": 1.1727782487869263, + "learning_rate": 9.767736842105264e-05, + "loss": 0.4407, + "step": 28840 + }, + { + "epoch": 1.6150184791130027, + "grad_norm": 1.8495877981185913, + "learning_rate": 9.76771052631579e-05, + "loss": 0.4931, + "step": 28841 + }, + { + "epoch": 1.6150744764251317, + "grad_norm": 1.2436549663543701, + "learning_rate": 9.767684210526317e-05, + "loss": 0.3709, + "step": 28842 + }, + { + "epoch": 1.6151304737372607, + "grad_norm": 1.3587554693222046, + "learning_rate": 9.767657894736843e-05, + "loss": 0.4508, + "step": 28843 + }, + { + "epoch": 
1.6151864710493897, + "grad_norm": 1.335359811782837, + "learning_rate": 9.767631578947369e-05, + "loss": 0.4305, + "step": 28844 + }, + { + "epoch": 1.6152424683615187, + "grad_norm": 1.2961686849594116, + "learning_rate": 9.767605263157895e-05, + "loss": 0.4625, + "step": 28845 + }, + { + "epoch": 1.6152984656736478, + "grad_norm": 1.3412424325942993, + "learning_rate": 9.767578947368422e-05, + "loss": 0.4367, + "step": 28846 + }, + { + "epoch": 1.6153544629857768, + "grad_norm": 1.342451810836792, + "learning_rate": 9.767552631578948e-05, + "loss": 0.4499, + "step": 28847 + }, + { + "epoch": 1.6154104602979058, + "grad_norm": 1.823162317276001, + "learning_rate": 9.767526315789474e-05, + "loss": 0.4182, + "step": 28848 + }, + { + "epoch": 1.6154664576100348, + "grad_norm": 1.3667343854904175, + "learning_rate": 9.7675e-05, + "loss": 0.4889, + "step": 28849 + }, + { + "epoch": 1.6155224549221638, + "grad_norm": 1.4782923460006714, + "learning_rate": 9.767473684210526e-05, + "loss": 0.5111, + "step": 28850 + }, + { + "epoch": 1.6155784522342929, + "grad_norm": 1.3616951704025269, + "learning_rate": 9.767447368421054e-05, + "loss": 0.4375, + "step": 28851 + }, + { + "epoch": 1.6156344495464219, + "grad_norm": 1.2682764530181885, + "learning_rate": 9.76742105263158e-05, + "loss": 0.4144, + "step": 28852 + }, + { + "epoch": 1.615690446858551, + "grad_norm": 1.182246208190918, + "learning_rate": 9.767394736842105e-05, + "loss": 0.3703, + "step": 28853 + }, + { + "epoch": 1.61574644417068, + "grad_norm": 1.3063621520996094, + "learning_rate": 9.767368421052631e-05, + "loss": 0.4596, + "step": 28854 + }, + { + "epoch": 1.615802441482809, + "grad_norm": 1.2230753898620605, + "learning_rate": 9.767342105263159e-05, + "loss": 0.3699, + "step": 28855 + }, + { + "epoch": 1.615858438794938, + "grad_norm": 2.1222188472747803, + "learning_rate": 9.767315789473685e-05, + "loss": 0.5617, + "step": 28856 + }, + { + "epoch": 1.615914436107067, + "grad_norm": 1.2271397113800049, + 
"learning_rate": 9.767289473684211e-05, + "loss": 0.3465, + "step": 28857 + }, + { + "epoch": 1.615970433419196, + "grad_norm": 2.0450069904327393, + "learning_rate": 9.767263157894737e-05, + "loss": 0.5504, + "step": 28858 + }, + { + "epoch": 1.616026430731325, + "grad_norm": 1.3388069868087769, + "learning_rate": 9.767236842105264e-05, + "loss": 0.4284, + "step": 28859 + }, + { + "epoch": 1.616082428043454, + "grad_norm": 1.4325214624404907, + "learning_rate": 9.76721052631579e-05, + "loss": 0.3864, + "step": 28860 + }, + { + "epoch": 1.616138425355583, + "grad_norm": 1.2629402875900269, + "learning_rate": 9.767184210526317e-05, + "loss": 0.4763, + "step": 28861 + }, + { + "epoch": 1.616194422667712, + "grad_norm": 1.146431803703308, + "learning_rate": 9.767157894736842e-05, + "loss": 0.3929, + "step": 28862 + }, + { + "epoch": 1.616250419979841, + "grad_norm": 1.3569384813308716, + "learning_rate": 9.767131578947369e-05, + "loss": 0.4921, + "step": 28863 + }, + { + "epoch": 1.61630641729197, + "grad_norm": 1.4281699657440186, + "learning_rate": 9.767105263157895e-05, + "loss": 0.454, + "step": 28864 + }, + { + "epoch": 1.6163624146040991, + "grad_norm": 1.4064583778381348, + "learning_rate": 9.767078947368421e-05, + "loss": 0.4045, + "step": 28865 + }, + { + "epoch": 1.6164184119162281, + "grad_norm": 1.4081312417984009, + "learning_rate": 9.767052631578949e-05, + "loss": 0.439, + "step": 28866 + }, + { + "epoch": 1.6164744092283572, + "grad_norm": 1.4261716604232788, + "learning_rate": 9.767026315789473e-05, + "loss": 0.4116, + "step": 28867 + }, + { + "epoch": 1.6165304065404862, + "grad_norm": 1.331688404083252, + "learning_rate": 9.767e-05, + "loss": 0.4301, + "step": 28868 + }, + { + "epoch": 1.6165864038526152, + "grad_norm": 1.3330377340316772, + "learning_rate": 9.766973684210526e-05, + "loss": 0.4386, + "step": 28869 + }, + { + "epoch": 1.6166424011647442, + "grad_norm": 1.7573132514953613, + "learning_rate": 9.766947368421054e-05, + "loss": 0.6382, + 
"step": 28870 + }, + { + "epoch": 1.6166983984768732, + "grad_norm": 1.5500704050064087, + "learning_rate": 9.766921052631578e-05, + "loss": 0.4457, + "step": 28871 + }, + { + "epoch": 1.6167543957890023, + "grad_norm": 1.4348666667938232, + "learning_rate": 9.766894736842106e-05, + "loss": 0.4841, + "step": 28872 + }, + { + "epoch": 1.6168103931011313, + "grad_norm": 1.3392975330352783, + "learning_rate": 9.766868421052632e-05, + "loss": 0.5253, + "step": 28873 + }, + { + "epoch": 1.6168663904132603, + "grad_norm": 1.9987597465515137, + "learning_rate": 9.766842105263159e-05, + "loss": 0.4609, + "step": 28874 + }, + { + "epoch": 1.6169223877253893, + "grad_norm": 2.7324507236480713, + "learning_rate": 9.766815789473685e-05, + "loss": 0.6237, + "step": 28875 + }, + { + "epoch": 1.6169783850375183, + "grad_norm": 1.508131742477417, + "learning_rate": 9.766789473684211e-05, + "loss": 0.5241, + "step": 28876 + }, + { + "epoch": 1.6170343823496474, + "grad_norm": 1.9047346115112305, + "learning_rate": 9.766763157894737e-05, + "loss": 0.4972, + "step": 28877 + }, + { + "epoch": 1.6170903796617764, + "grad_norm": 1.4013547897338867, + "learning_rate": 9.766736842105264e-05, + "loss": 0.51, + "step": 28878 + }, + { + "epoch": 1.6171463769739054, + "grad_norm": 2.806600332260132, + "learning_rate": 9.76671052631579e-05, + "loss": 0.4834, + "step": 28879 + }, + { + "epoch": 1.6172023742860344, + "grad_norm": 1.1944141387939453, + "learning_rate": 9.766684210526316e-05, + "loss": 0.3531, + "step": 28880 + }, + { + "epoch": 1.6172583715981634, + "grad_norm": 1.2647945880889893, + "learning_rate": 9.766657894736842e-05, + "loss": 0.529, + "step": 28881 + }, + { + "epoch": 1.6173143689102925, + "grad_norm": 1.6566176414489746, + "learning_rate": 9.766631578947368e-05, + "loss": 0.4407, + "step": 28882 + }, + { + "epoch": 1.6173703662224215, + "grad_norm": 1.159731388092041, + "learning_rate": 9.766605263157896e-05, + "loss": 0.3763, + "step": 28883 + }, + { + "epoch": 
1.6174263635345505, + "grad_norm": 1.4550575017929077, + "learning_rate": 9.766578947368421e-05, + "loss": 0.5408, + "step": 28884 + }, + { + "epoch": 1.6174823608466795, + "grad_norm": 1.4085232019424438, + "learning_rate": 9.766552631578947e-05, + "loss": 0.4368, + "step": 28885 + }, + { + "epoch": 1.6175383581588085, + "grad_norm": 1.7718677520751953, + "learning_rate": 9.766526315789473e-05, + "loss": 0.4229, + "step": 28886 + }, + { + "epoch": 1.6175943554709376, + "grad_norm": 1.944851040840149, + "learning_rate": 9.766500000000001e-05, + "loss": 0.4829, + "step": 28887 + }, + { + "epoch": 1.6176503527830666, + "grad_norm": 1.1601150035858154, + "learning_rate": 9.766473684210527e-05, + "loss": 0.3789, + "step": 28888 + }, + { + "epoch": 1.6177063500951956, + "grad_norm": 1.2976466417312622, + "learning_rate": 9.766447368421053e-05, + "loss": 0.4909, + "step": 28889 + }, + { + "epoch": 1.6177623474073246, + "grad_norm": 1.249280333518982, + "learning_rate": 9.766421052631579e-05, + "loss": 0.4483, + "step": 28890 + }, + { + "epoch": 1.6178183447194536, + "grad_norm": 1.6155037879943848, + "learning_rate": 9.766394736842106e-05, + "loss": 0.5474, + "step": 28891 + }, + { + "epoch": 1.6178743420315826, + "grad_norm": 1.7790343761444092, + "learning_rate": 9.766368421052632e-05, + "loss": 0.4386, + "step": 28892 + }, + { + "epoch": 1.6179303393437117, + "grad_norm": 1.2534726858139038, + "learning_rate": 9.766342105263159e-05, + "loss": 0.3634, + "step": 28893 + }, + { + "epoch": 1.6179863366558407, + "grad_norm": 1.319286823272705, + "learning_rate": 9.766315789473684e-05, + "loss": 0.5808, + "step": 28894 + }, + { + "epoch": 1.6180423339679697, + "grad_norm": 1.3928951025009155, + "learning_rate": 9.766289473684211e-05, + "loss": 0.3583, + "step": 28895 + }, + { + "epoch": 1.6180983312800985, + "grad_norm": 1.4259158372879028, + "learning_rate": 9.766263157894737e-05, + "loss": 0.4083, + "step": 28896 + }, + { + "epoch": 1.6181543285922275, + "grad_norm": 
1.4068379402160645, + "learning_rate": 9.766236842105265e-05, + "loss": 0.6703, + "step": 28897 + }, + { + "epoch": 1.6182103259043565, + "grad_norm": 1.14223051071167, + "learning_rate": 9.76621052631579e-05, + "loss": 0.4393, + "step": 28898 + }, + { + "epoch": 1.6182663232164856, + "grad_norm": 1.1985951662063599, + "learning_rate": 9.766184210526315e-05, + "loss": 0.3575, + "step": 28899 + }, + { + "epoch": 1.6183223205286146, + "grad_norm": 1.5247827768325806, + "learning_rate": 9.766157894736842e-05, + "loss": 0.5642, + "step": 28900 + }, + { + "epoch": 1.6183783178407436, + "grad_norm": 1.274037480354309, + "learning_rate": 9.766131578947368e-05, + "loss": 0.4636, + "step": 28901 + }, + { + "epoch": 1.6184343151528726, + "grad_norm": 1.0825755596160889, + "learning_rate": 9.766105263157896e-05, + "loss": 0.4752, + "step": 28902 + }, + { + "epoch": 1.6184903124650016, + "grad_norm": 1.1186537742614746, + "learning_rate": 9.766078947368422e-05, + "loss": 0.4092, + "step": 28903 + }, + { + "epoch": 1.6185463097771307, + "grad_norm": 1.2335047721862793, + "learning_rate": 9.766052631578948e-05, + "loss": 0.4991, + "step": 28904 + }, + { + "epoch": 1.6186023070892597, + "grad_norm": 1.4402673244476318, + "learning_rate": 9.766026315789474e-05, + "loss": 0.4952, + "step": 28905 + }, + { + "epoch": 1.6186583044013887, + "grad_norm": 1.5665110349655151, + "learning_rate": 9.766000000000001e-05, + "loss": 0.5344, + "step": 28906 + }, + { + "epoch": 1.6187143017135177, + "grad_norm": 1.2146878242492676, + "learning_rate": 9.765973684210527e-05, + "loss": 0.4731, + "step": 28907 + }, + { + "epoch": 1.6187702990256467, + "grad_norm": 1.4464036226272583, + "learning_rate": 9.765947368421053e-05, + "loss": 0.6648, + "step": 28908 + }, + { + "epoch": 1.6188262963377757, + "grad_norm": 1.3497234582901, + "learning_rate": 9.765921052631579e-05, + "loss": 0.5241, + "step": 28909 + }, + { + "epoch": 1.6188822936499048, + "grad_norm": 1.1629825830459595, + "learning_rate": 
9.765894736842106e-05, + "loss": 0.3955, + "step": 28910 + }, + { + "epoch": 1.6189382909620338, + "grad_norm": 1.1402976512908936, + "learning_rate": 9.765868421052632e-05, + "loss": 0.3949, + "step": 28911 + }, + { + "epoch": 1.6189942882741628, + "grad_norm": 1.6595808267593384, + "learning_rate": 9.765842105263158e-05, + "loss": 0.5269, + "step": 28912 + }, + { + "epoch": 1.6190502855862918, + "grad_norm": 1.1686064004898071, + "learning_rate": 9.765815789473684e-05, + "loss": 0.4283, + "step": 28913 + }, + { + "epoch": 1.6191062828984208, + "grad_norm": 1.1514596939086914, + "learning_rate": 9.765789473684212e-05, + "loss": 0.4714, + "step": 28914 + }, + { + "epoch": 1.6191622802105499, + "grad_norm": 1.1812119483947754, + "learning_rate": 9.765763157894737e-05, + "loss": 0.3457, + "step": 28915 + }, + { + "epoch": 1.6192182775226789, + "grad_norm": 1.2720551490783691, + "learning_rate": 9.765736842105263e-05, + "loss": 0.4326, + "step": 28916 + }, + { + "epoch": 1.619274274834808, + "grad_norm": 2.7686190605163574, + "learning_rate": 9.76571052631579e-05, + "loss": 0.701, + "step": 28917 + }, + { + "epoch": 1.619330272146937, + "grad_norm": 1.5894356966018677, + "learning_rate": 9.765684210526315e-05, + "loss": 0.6887, + "step": 28918 + }, + { + "epoch": 1.619386269459066, + "grad_norm": 1.4989386796951294, + "learning_rate": 9.765657894736843e-05, + "loss": 0.581, + "step": 28919 + }, + { + "epoch": 1.619442266771195, + "grad_norm": 1.2035168409347534, + "learning_rate": 9.765631578947369e-05, + "loss": 0.4001, + "step": 28920 + }, + { + "epoch": 1.619498264083324, + "grad_norm": 1.7772064208984375, + "learning_rate": 9.765605263157896e-05, + "loss": 0.4919, + "step": 28921 + }, + { + "epoch": 1.619554261395453, + "grad_norm": 1.3816946744918823, + "learning_rate": 9.76557894736842e-05, + "loss": 0.4771, + "step": 28922 + }, + { + "epoch": 1.619610258707582, + "grad_norm": 1.5412209033966064, + "learning_rate": 9.765552631578948e-05, + "loss": 0.4784, + 
"step": 28923 + }, + { + "epoch": 1.619666256019711, + "grad_norm": 1.265815019607544, + "learning_rate": 9.765526315789474e-05, + "loss": 0.4187, + "step": 28924 + }, + { + "epoch": 1.61972225333184, + "grad_norm": 1.6642132997512817, + "learning_rate": 9.765500000000001e-05, + "loss": 0.4875, + "step": 28925 + }, + { + "epoch": 1.619778250643969, + "grad_norm": 1.099763035774231, + "learning_rate": 9.765473684210526e-05, + "loss": 0.3481, + "step": 28926 + }, + { + "epoch": 1.619834247956098, + "grad_norm": 1.391595482826233, + "learning_rate": 9.765447368421053e-05, + "loss": 0.5049, + "step": 28927 + }, + { + "epoch": 1.6198902452682271, + "grad_norm": 1.264090895652771, + "learning_rate": 9.765421052631579e-05, + "loss": 0.401, + "step": 28928 + }, + { + "epoch": 1.6199462425803561, + "grad_norm": 1.3754740953445435, + "learning_rate": 9.765394736842107e-05, + "loss": 0.5046, + "step": 28929 + }, + { + "epoch": 1.6200022398924852, + "grad_norm": 1.443232774734497, + "learning_rate": 9.765368421052633e-05, + "loss": 0.5517, + "step": 28930 + }, + { + "epoch": 1.6200582372046142, + "grad_norm": 1.269391417503357, + "learning_rate": 9.765342105263158e-05, + "loss": 0.5542, + "step": 28931 + }, + { + "epoch": 1.6201142345167432, + "grad_norm": 1.2511099576950073, + "learning_rate": 9.765315789473684e-05, + "loss": 0.4431, + "step": 28932 + }, + { + "epoch": 1.6201702318288722, + "grad_norm": 1.5928725004196167, + "learning_rate": 9.765289473684212e-05, + "loss": 0.4087, + "step": 28933 + }, + { + "epoch": 1.6202262291410012, + "grad_norm": 1.8670235872268677, + "learning_rate": 9.765263157894738e-05, + "loss": 0.478, + "step": 28934 + }, + { + "epoch": 1.6202822264531302, + "grad_norm": 1.4178543090820312, + "learning_rate": 9.765236842105264e-05, + "loss": 0.4866, + "step": 28935 + }, + { + "epoch": 1.6203382237652593, + "grad_norm": 1.7292839288711548, + "learning_rate": 9.76521052631579e-05, + "loss": 0.5253, + "step": 28936 + }, + { + "epoch": 
1.6203942210773883, + "grad_norm": 1.3527981042861938, + "learning_rate": 9.765184210526316e-05, + "loss": 0.5333, + "step": 28937 + }, + { + "epoch": 1.6204502183895173, + "grad_norm": 1.0957695245742798, + "learning_rate": 9.765157894736843e-05, + "loss": 0.4332, + "step": 28938 + }, + { + "epoch": 1.6205062157016463, + "grad_norm": 1.1085983514785767, + "learning_rate": 9.765131578947369e-05, + "loss": 0.3908, + "step": 28939 + }, + { + "epoch": 1.6205622130137753, + "grad_norm": 1.4848521947860718, + "learning_rate": 9.765105263157895e-05, + "loss": 0.3817, + "step": 28940 + }, + { + "epoch": 1.6206182103259044, + "grad_norm": 1.2339684963226318, + "learning_rate": 9.765078947368421e-05, + "loss": 0.4563, + "step": 28941 + }, + { + "epoch": 1.6206742076380334, + "grad_norm": 1.0157705545425415, + "learning_rate": 9.765052631578948e-05, + "loss": 0.3975, + "step": 28942 + }, + { + "epoch": 1.6207302049501624, + "grad_norm": 1.3542698621749878, + "learning_rate": 9.765026315789474e-05, + "loss": 0.5248, + "step": 28943 + }, + { + "epoch": 1.6207862022622914, + "grad_norm": 1.5024199485778809, + "learning_rate": 9.765e-05, + "loss": 0.5143, + "step": 28944 + }, + { + "epoch": 1.6208421995744204, + "grad_norm": 1.2035006284713745, + "learning_rate": 9.764973684210526e-05, + "loss": 0.4675, + "step": 28945 + }, + { + "epoch": 1.6208981968865495, + "grad_norm": 4.292696475982666, + "learning_rate": 9.764947368421053e-05, + "loss": 0.7096, + "step": 28946 + }, + { + "epoch": 1.6209541941986785, + "grad_norm": 1.3858609199523926, + "learning_rate": 9.76492105263158e-05, + "loss": 0.45, + "step": 28947 + }, + { + "epoch": 1.6210101915108075, + "grad_norm": 1.3534660339355469, + "learning_rate": 9.764894736842107e-05, + "loss": 0.4051, + "step": 28948 + }, + { + "epoch": 1.6210661888229365, + "grad_norm": 1.4373719692230225, + "learning_rate": 9.764868421052631e-05, + "loss": 0.4589, + "step": 28949 + }, + { + "epoch": 1.6211221861350655, + "grad_norm": 
1.2982500791549683, + "learning_rate": 9.764842105263159e-05, + "loss": 0.4363, + "step": 28950 + }, + { + "epoch": 1.6211781834471946, + "grad_norm": 1.3825939893722534, + "learning_rate": 9.764815789473685e-05, + "loss": 0.5435, + "step": 28951 + }, + { + "epoch": 1.6212341807593236, + "grad_norm": 1.072726845741272, + "learning_rate": 9.764789473684211e-05, + "loss": 0.4476, + "step": 28952 + }, + { + "epoch": 1.6212901780714526, + "grad_norm": 1.3606057167053223, + "learning_rate": 9.764763157894738e-05, + "loss": 0.4894, + "step": 28953 + }, + { + "epoch": 1.6213461753835816, + "grad_norm": 1.4802929162979126, + "learning_rate": 9.764736842105263e-05, + "loss": 0.4803, + "step": 28954 + }, + { + "epoch": 1.6214021726957106, + "grad_norm": 1.1817505359649658, + "learning_rate": 9.76471052631579e-05, + "loss": 0.3624, + "step": 28955 + }, + { + "epoch": 1.6214581700078396, + "grad_norm": 1.829140067100525, + "learning_rate": 9.764684210526316e-05, + "loss": 0.686, + "step": 28956 + }, + { + "epoch": 1.6215141673199687, + "grad_norm": 1.3381738662719727, + "learning_rate": 9.764657894736843e-05, + "loss": 0.4526, + "step": 28957 + }, + { + "epoch": 1.6215701646320977, + "grad_norm": 1.410481572151184, + "learning_rate": 9.764631578947369e-05, + "loss": 0.5079, + "step": 28958 + }, + { + "epoch": 1.6216261619442267, + "grad_norm": 1.2489384412765503, + "learning_rate": 9.764605263157895e-05, + "loss": 0.4669, + "step": 28959 + }, + { + "epoch": 1.6216821592563557, + "grad_norm": 1.709755301475525, + "learning_rate": 9.764578947368421e-05, + "loss": 0.6166, + "step": 28960 + }, + { + "epoch": 1.6217381565684847, + "grad_norm": 1.248163104057312, + "learning_rate": 9.764552631578949e-05, + "loss": 0.491, + "step": 28961 + }, + { + "epoch": 1.6217941538806138, + "grad_norm": 1.2984687089920044, + "learning_rate": 9.764526315789474e-05, + "loss": 0.4477, + "step": 28962 + }, + { + "epoch": 1.6218501511927428, + "grad_norm": 1.3458508253097534, + "learning_rate": 
9.7645e-05, + "loss": 0.4577, + "step": 28963 + }, + { + "epoch": 1.6219061485048718, + "grad_norm": 1.5114479064941406, + "learning_rate": 9.764473684210526e-05, + "loss": 0.704, + "step": 28964 + }, + { + "epoch": 1.6219621458170008, + "grad_norm": 1.4598270654678345, + "learning_rate": 9.764447368421054e-05, + "loss": 0.4764, + "step": 28965 + }, + { + "epoch": 1.6220181431291298, + "grad_norm": 1.3080896139144897, + "learning_rate": 9.76442105263158e-05, + "loss": 0.4064, + "step": 28966 + }, + { + "epoch": 1.6220741404412589, + "grad_norm": 1.2693126201629639, + "learning_rate": 9.764394736842106e-05, + "loss": 0.438, + "step": 28967 + }, + { + "epoch": 1.6221301377533879, + "grad_norm": 1.0561363697052002, + "learning_rate": 9.764368421052632e-05, + "loss": 0.3789, + "step": 28968 + }, + { + "epoch": 1.622186135065517, + "grad_norm": 1.2383800745010376, + "learning_rate": 9.764342105263158e-05, + "loss": 0.4333, + "step": 28969 + }, + { + "epoch": 1.622242132377646, + "grad_norm": 1.2280343770980835, + "learning_rate": 9.764315789473685e-05, + "loss": 0.5021, + "step": 28970 + }, + { + "epoch": 1.622298129689775, + "grad_norm": 1.3351688385009766, + "learning_rate": 9.764289473684211e-05, + "loss": 0.505, + "step": 28971 + }, + { + "epoch": 1.622354127001904, + "grad_norm": 1.3184148073196411, + "learning_rate": 9.764263157894737e-05, + "loss": 0.5307, + "step": 28972 + }, + { + "epoch": 1.622410124314033, + "grad_norm": 1.5563063621520996, + "learning_rate": 9.764236842105263e-05, + "loss": 0.4808, + "step": 28973 + }, + { + "epoch": 1.622466121626162, + "grad_norm": 1.2767109870910645, + "learning_rate": 9.76421052631579e-05, + "loss": 0.5952, + "step": 28974 + }, + { + "epoch": 1.622522118938291, + "grad_norm": 1.2463483810424805, + "learning_rate": 9.764184210526316e-05, + "loss": 0.377, + "step": 28975 + }, + { + "epoch": 1.62257811625042, + "grad_norm": 1.4963417053222656, + "learning_rate": 9.764157894736844e-05, + "loss": 0.4538, + "step": 28976 + }, 
+ { + "epoch": 1.622634113562549, + "grad_norm": 1.3054777383804321, + "learning_rate": 9.764131578947368e-05, + "loss": 0.4315, + "step": 28977 + }, + { + "epoch": 1.6226901108746778, + "grad_norm": 1.5979790687561035, + "learning_rate": 9.764105263157895e-05, + "loss": 0.4133, + "step": 28978 + }, + { + "epoch": 1.6227461081868069, + "grad_norm": 1.4845672845840454, + "learning_rate": 9.764078947368421e-05, + "loss": 0.457, + "step": 28979 + }, + { + "epoch": 1.6228021054989359, + "grad_norm": 1.5465277433395386, + "learning_rate": 9.764052631578949e-05, + "loss": 0.405, + "step": 28980 + }, + { + "epoch": 1.622858102811065, + "grad_norm": 1.4176063537597656, + "learning_rate": 9.764026315789473e-05, + "loss": 0.384, + "step": 28981 + }, + { + "epoch": 1.622914100123194, + "grad_norm": 1.4219379425048828, + "learning_rate": 9.764000000000001e-05, + "loss": 0.4869, + "step": 28982 + }, + { + "epoch": 1.622970097435323, + "grad_norm": 1.5295010805130005, + "learning_rate": 9.763973684210527e-05, + "loss": 0.4806, + "step": 28983 + }, + { + "epoch": 1.623026094747452, + "grad_norm": 1.2485733032226562, + "learning_rate": 9.763947368421054e-05, + "loss": 0.4526, + "step": 28984 + }, + { + "epoch": 1.623082092059581, + "grad_norm": 1.1920751333236694, + "learning_rate": 9.76392105263158e-05, + "loss": 0.4755, + "step": 28985 + }, + { + "epoch": 1.62313808937171, + "grad_norm": 1.9188928604125977, + "learning_rate": 9.763894736842105e-05, + "loss": 0.5597, + "step": 28986 + }, + { + "epoch": 1.623194086683839, + "grad_norm": 1.6881300210952759, + "learning_rate": 9.763868421052632e-05, + "loss": 0.5532, + "step": 28987 + }, + { + "epoch": 1.623250083995968, + "grad_norm": 1.4397168159484863, + "learning_rate": 9.763842105263158e-05, + "loss": 0.5602, + "step": 28988 + }, + { + "epoch": 1.623306081308097, + "grad_norm": 1.4025870561599731, + "learning_rate": 9.763815789473685e-05, + "loss": 0.3857, + "step": 28989 + }, + { + "epoch": 1.623362078620226, + "grad_norm": 
14.822052955627441, + "learning_rate": 9.763789473684211e-05, + "loss": 0.4411, + "step": 28990 + }, + { + "epoch": 1.623418075932355, + "grad_norm": 1.1370784044265747, + "learning_rate": 9.763763157894737e-05, + "loss": 0.3767, + "step": 28991 + }, + { + "epoch": 1.6234740732444841, + "grad_norm": 1.361096978187561, + "learning_rate": 9.763736842105263e-05, + "loss": 0.6225, + "step": 28992 + }, + { + "epoch": 1.6235300705566131, + "grad_norm": 1.27276611328125, + "learning_rate": 9.76371052631579e-05, + "loss": 0.465, + "step": 28993 + }, + { + "epoch": 1.6235860678687422, + "grad_norm": 1.3719966411590576, + "learning_rate": 9.763684210526316e-05, + "loss": 0.6489, + "step": 28994 + }, + { + "epoch": 1.6236420651808712, + "grad_norm": 1.4883934259414673, + "learning_rate": 9.763657894736842e-05, + "loss": 0.4197, + "step": 28995 + }, + { + "epoch": 1.6236980624930002, + "grad_norm": 1.515711784362793, + "learning_rate": 9.763631578947368e-05, + "loss": 0.4197, + "step": 28996 + }, + { + "epoch": 1.6237540598051292, + "grad_norm": 1.0941756963729858, + "learning_rate": 9.763605263157896e-05, + "loss": 0.4487, + "step": 28997 + }, + { + "epoch": 1.6238100571172582, + "grad_norm": 1.2121586799621582, + "learning_rate": 9.763578947368422e-05, + "loss": 0.4212, + "step": 28998 + }, + { + "epoch": 1.6238660544293873, + "grad_norm": 1.2333850860595703, + "learning_rate": 9.763552631578948e-05, + "loss": 0.4775, + "step": 28999 + }, + { + "epoch": 1.6239220517415163, + "grad_norm": 1.2895197868347168, + "learning_rate": 9.763526315789474e-05, + "loss": 0.5964, + "step": 29000 + }, + { + "epoch": 1.6239780490536453, + "grad_norm": 1.435691475868225, + "learning_rate": 9.763500000000001e-05, + "loss": 0.4094, + "step": 29001 + }, + { + "epoch": 1.6240340463657743, + "grad_norm": 2.209624767303467, + "learning_rate": 9.763473684210527e-05, + "loss": 0.4834, + "step": 29002 + }, + { + "epoch": 1.6240900436779033, + "grad_norm": 1.4166560173034668, + "learning_rate": 
9.763447368421053e-05, + "loss": 0.6601, + "step": 29003 + }, + { + "epoch": 1.6241460409900323, + "grad_norm": 1.151734471321106, + "learning_rate": 9.763421052631579e-05, + "loss": 0.4065, + "step": 29004 + }, + { + "epoch": 1.6242020383021614, + "grad_norm": 1.2314025163650513, + "learning_rate": 9.763394736842105e-05, + "loss": 0.511, + "step": 29005 + }, + { + "epoch": 1.6242580356142904, + "grad_norm": 1.1897450685501099, + "learning_rate": 9.763368421052632e-05, + "loss": 0.3494, + "step": 29006 + }, + { + "epoch": 1.6243140329264194, + "grad_norm": 1.5910755395889282, + "learning_rate": 9.763342105263158e-05, + "loss": 0.5022, + "step": 29007 + }, + { + "epoch": 1.6243700302385484, + "grad_norm": 1.2150752544403076, + "learning_rate": 9.763315789473685e-05, + "loss": 0.4724, + "step": 29008 + }, + { + "epoch": 1.6244260275506774, + "grad_norm": 1.533077597618103, + "learning_rate": 9.76328947368421e-05, + "loss": 0.5095, + "step": 29009 + }, + { + "epoch": 1.6244820248628065, + "grad_norm": 1.3829405307769775, + "learning_rate": 9.763263157894737e-05, + "loss": 0.4628, + "step": 29010 + }, + { + "epoch": 1.6245380221749355, + "grad_norm": 1.334855318069458, + "learning_rate": 9.763236842105263e-05, + "loss": 0.3906, + "step": 29011 + }, + { + "epoch": 1.6245940194870645, + "grad_norm": 1.2197556495666504, + "learning_rate": 9.763210526315791e-05, + "loss": 0.5492, + "step": 29012 + }, + { + "epoch": 1.6246500167991935, + "grad_norm": 1.3573681116104126, + "learning_rate": 9.763184210526317e-05, + "loss": 0.5445, + "step": 29013 + }, + { + "epoch": 1.6247060141113225, + "grad_norm": 1.6501281261444092, + "learning_rate": 9.763157894736843e-05, + "loss": 0.4317, + "step": 29014 + }, + { + "epoch": 1.6247620114234516, + "grad_norm": 1.2500278949737549, + "learning_rate": 9.763131578947369e-05, + "loss": 0.385, + "step": 29015 + }, + { + "epoch": 1.6248180087355806, + "grad_norm": 1.5370055437088013, + "learning_rate": 9.763105263157896e-05, + "loss": 0.522, + 
"step": 29016 + }, + { + "epoch": 1.6248740060477096, + "grad_norm": 1.313562273979187, + "learning_rate": 9.763078947368422e-05, + "loss": 0.3867, + "step": 29017 + }, + { + "epoch": 1.6249300033598386, + "grad_norm": 1.272716760635376, + "learning_rate": 9.763052631578948e-05, + "loss": 0.4413, + "step": 29018 + }, + { + "epoch": 1.6249860006719676, + "grad_norm": 1.325479507446289, + "learning_rate": 9.763026315789474e-05, + "loss": 0.4495, + "step": 29019 + }, + { + "epoch": 1.6250419979840967, + "grad_norm": 1.1775710582733154, + "learning_rate": 9.763e-05, + "loss": 0.6156, + "step": 29020 + }, + { + "epoch": 1.6250979952962257, + "grad_norm": 1.4680395126342773, + "learning_rate": 9.762973684210527e-05, + "loss": 0.4178, + "step": 29021 + }, + { + "epoch": 1.6251539926083547, + "grad_norm": 1.3455936908721924, + "learning_rate": 9.762947368421053e-05, + "loss": 0.3931, + "step": 29022 + }, + { + "epoch": 1.6252099899204837, + "grad_norm": 1.1602425575256348, + "learning_rate": 9.762921052631579e-05, + "loss": 0.4051, + "step": 29023 + }, + { + "epoch": 1.6252659872326127, + "grad_norm": 1.522652506828308, + "learning_rate": 9.762894736842105e-05, + "loss": 0.4474, + "step": 29024 + }, + { + "epoch": 1.6253219845447417, + "grad_norm": 1.1786167621612549, + "learning_rate": 9.762868421052632e-05, + "loss": 0.6032, + "step": 29025 + }, + { + "epoch": 1.6253779818568708, + "grad_norm": 1.346570372581482, + "learning_rate": 9.762842105263158e-05, + "loss": 0.4033, + "step": 29026 + }, + { + "epoch": 1.6254339791689998, + "grad_norm": 1.21056067943573, + "learning_rate": 9.762815789473684e-05, + "loss": 0.5341, + "step": 29027 + }, + { + "epoch": 1.6254899764811288, + "grad_norm": 1.5269578695297241, + "learning_rate": 9.76278947368421e-05, + "loss": 0.5068, + "step": 29028 + }, + { + "epoch": 1.6255459737932578, + "grad_norm": 1.4292340278625488, + "learning_rate": 9.762763157894738e-05, + "loss": 0.4894, + "step": 29029 + }, + { + "epoch": 1.6256019711053868, + 
"grad_norm": 1.1982123851776123, + "learning_rate": 9.762736842105264e-05, + "loss": 0.4273, + "step": 29030 + }, + { + "epoch": 1.6256579684175159, + "grad_norm": 1.709243893623352, + "learning_rate": 9.76271052631579e-05, + "loss": 0.5359, + "step": 29031 + }, + { + "epoch": 1.6257139657296449, + "grad_norm": 1.1783006191253662, + "learning_rate": 9.762684210526316e-05, + "loss": 0.3656, + "step": 29032 + }, + { + "epoch": 1.625769963041774, + "grad_norm": 1.061718225479126, + "learning_rate": 9.762657894736843e-05, + "loss": 0.4325, + "step": 29033 + }, + { + "epoch": 1.625825960353903, + "grad_norm": 2.1916258335113525, + "learning_rate": 9.762631578947369e-05, + "loss": 0.4931, + "step": 29034 + }, + { + "epoch": 1.625881957666032, + "grad_norm": 1.1895729303359985, + "learning_rate": 9.762605263157896e-05, + "loss": 0.3862, + "step": 29035 + }, + { + "epoch": 1.625937954978161, + "grad_norm": 1.0891497135162354, + "learning_rate": 9.762578947368421e-05, + "loss": 0.4223, + "step": 29036 + }, + { + "epoch": 1.62599395229029, + "grad_norm": 1.2871347665786743, + "learning_rate": 9.762552631578947e-05, + "loss": 0.4835, + "step": 29037 + }, + { + "epoch": 1.626049949602419, + "grad_norm": 1.1227421760559082, + "learning_rate": 9.762526315789474e-05, + "loss": 0.3586, + "step": 29038 + }, + { + "epoch": 1.626105946914548, + "grad_norm": 1.5562020540237427, + "learning_rate": 9.7625e-05, + "loss": 0.3576, + "step": 29039 + }, + { + "epoch": 1.626161944226677, + "grad_norm": 1.2298904657363892, + "learning_rate": 9.762473684210527e-05, + "loss": 0.5008, + "step": 29040 + }, + { + "epoch": 1.626217941538806, + "grad_norm": 1.3883846998214722, + "learning_rate": 9.762447368421052e-05, + "loss": 0.4788, + "step": 29041 + }, + { + "epoch": 1.626273938850935, + "grad_norm": 1.3229891061782837, + "learning_rate": 9.76242105263158e-05, + "loss": 0.503, + "step": 29042 + }, + { + "epoch": 1.626329936163064, + "grad_norm": 1.4288462400436401, + "learning_rate": 
9.762394736842105e-05, + "loss": 0.5985, + "step": 29043 + }, + { + "epoch": 1.6263859334751931, + "grad_norm": 1.5787339210510254, + "learning_rate": 9.762368421052633e-05, + "loss": 0.4384, + "step": 29044 + }, + { + "epoch": 1.6264419307873221, + "grad_norm": 1.0647637844085693, + "learning_rate": 9.762342105263159e-05, + "loss": 0.3867, + "step": 29045 + }, + { + "epoch": 1.6264979280994512, + "grad_norm": 1.180756688117981, + "learning_rate": 9.762315789473685e-05, + "loss": 0.4342, + "step": 29046 + }, + { + "epoch": 1.6265539254115802, + "grad_norm": 1.1512324810028076, + "learning_rate": 9.76228947368421e-05, + "loss": 0.4566, + "step": 29047 + }, + { + "epoch": 1.6266099227237092, + "grad_norm": 1.1414687633514404, + "learning_rate": 9.762263157894738e-05, + "loss": 0.3501, + "step": 29048 + }, + { + "epoch": 1.6266659200358382, + "grad_norm": 1.374139428138733, + "learning_rate": 9.762236842105264e-05, + "loss": 0.4822, + "step": 29049 + }, + { + "epoch": 1.6267219173479672, + "grad_norm": 1.401052713394165, + "learning_rate": 9.76221052631579e-05, + "loss": 0.4372, + "step": 29050 + }, + { + "epoch": 1.6267779146600962, + "grad_norm": 1.2061361074447632, + "learning_rate": 9.762184210526316e-05, + "loss": 0.3528, + "step": 29051 + }, + { + "epoch": 1.6268339119722253, + "grad_norm": 1.4091142416000366, + "learning_rate": 9.762157894736843e-05, + "loss": 0.5102, + "step": 29052 + }, + { + "epoch": 1.6268899092843543, + "grad_norm": 1.4907644987106323, + "learning_rate": 9.762131578947369e-05, + "loss": 0.6561, + "step": 29053 + }, + { + "epoch": 1.6269459065964833, + "grad_norm": 1.1247915029525757, + "learning_rate": 9.762105263157895e-05, + "loss": 0.4596, + "step": 29054 + }, + { + "epoch": 1.6270019039086123, + "grad_norm": 1.152758002281189, + "learning_rate": 9.762078947368421e-05, + "loss": 0.4631, + "step": 29055 + }, + { + "epoch": 1.6270579012207413, + "grad_norm": 1.2452892065048218, + "learning_rate": 9.762052631578947e-05, + "loss": 0.3635, + 
"step": 29056 + }, + { + "epoch": 1.6271138985328704, + "grad_norm": 1.5018380880355835, + "learning_rate": 9.762026315789474e-05, + "loss": 0.5134, + "step": 29057 + }, + { + "epoch": 1.6271698958449994, + "grad_norm": 1.26292085647583, + "learning_rate": 9.762e-05, + "loss": 0.4954, + "step": 29058 + }, + { + "epoch": 1.6272258931571284, + "grad_norm": 1.3581428527832031, + "learning_rate": 9.761973684210526e-05, + "loss": 0.451, + "step": 29059 + }, + { + "epoch": 1.6272818904692574, + "grad_norm": 1.4662487506866455, + "learning_rate": 9.761947368421052e-05, + "loss": 0.4825, + "step": 29060 + }, + { + "epoch": 1.6273378877813864, + "grad_norm": 1.0710781812667847, + "learning_rate": 9.76192105263158e-05, + "loss": 0.3624, + "step": 29061 + }, + { + "epoch": 1.6273938850935155, + "grad_norm": 1.219622254371643, + "learning_rate": 9.761894736842106e-05, + "loss": 0.43, + "step": 29062 + }, + { + "epoch": 1.6274498824056445, + "grad_norm": 1.3126916885375977, + "learning_rate": 9.761868421052633e-05, + "loss": 0.4779, + "step": 29063 + }, + { + "epoch": 1.6275058797177735, + "grad_norm": 1.0866769552230835, + "learning_rate": 9.761842105263158e-05, + "loss": 0.4377, + "step": 29064 + }, + { + "epoch": 1.6275618770299025, + "grad_norm": 1.2525542974472046, + "learning_rate": 9.761815789473685e-05, + "loss": 0.4171, + "step": 29065 + }, + { + "epoch": 1.6276178743420315, + "grad_norm": 1.3107547760009766, + "learning_rate": 9.761789473684211e-05, + "loss": 0.4772, + "step": 29066 + }, + { + "epoch": 1.6276738716541606, + "grad_norm": 1.738831639289856, + "learning_rate": 9.761763157894738e-05, + "loss": 0.5713, + "step": 29067 + }, + { + "epoch": 1.6277298689662896, + "grad_norm": 1.6793580055236816, + "learning_rate": 9.761736842105264e-05, + "loss": 0.4374, + "step": 29068 + }, + { + "epoch": 1.6277858662784186, + "grad_norm": 1.2854893207550049, + "learning_rate": 9.76171052631579e-05, + "loss": 0.3926, + "step": 29069 + }, + { + "epoch": 1.6278418635905476, + 
"grad_norm": 1.1788318157196045, + "learning_rate": 9.761684210526316e-05, + "loss": 0.4633, + "step": 29070 + }, + { + "epoch": 1.6278978609026766, + "grad_norm": 1.4562842845916748, + "learning_rate": 9.761657894736843e-05, + "loss": 0.3869, + "step": 29071 + }, + { + "epoch": 1.6279538582148056, + "grad_norm": 1.302661657333374, + "learning_rate": 9.76163157894737e-05, + "loss": 0.4805, + "step": 29072 + }, + { + "epoch": 1.6280098555269347, + "grad_norm": 1.3763971328735352, + "learning_rate": 9.761605263157894e-05, + "loss": 0.3982, + "step": 29073 + }, + { + "epoch": 1.6280658528390637, + "grad_norm": 1.9479624032974243, + "learning_rate": 9.761578947368421e-05, + "loss": 0.4567, + "step": 29074 + }, + { + "epoch": 1.6281218501511927, + "grad_norm": 1.659568190574646, + "learning_rate": 9.761552631578947e-05, + "loss": 0.71, + "step": 29075 + }, + { + "epoch": 1.6281778474633217, + "grad_norm": 1.4795762300491333, + "learning_rate": 9.761526315789475e-05, + "loss": 0.4887, + "step": 29076 + }, + { + "epoch": 1.6282338447754507, + "grad_norm": 1.327354073524475, + "learning_rate": 9.7615e-05, + "loss": 0.5031, + "step": 29077 + }, + { + "epoch": 1.6282898420875798, + "grad_norm": 1.9933340549468994, + "learning_rate": 9.761473684210527e-05, + "loss": 0.3958, + "step": 29078 + }, + { + "epoch": 1.6283458393997088, + "grad_norm": 1.9714298248291016, + "learning_rate": 9.761447368421053e-05, + "loss": 0.5657, + "step": 29079 + }, + { + "epoch": 1.6284018367118378, + "grad_norm": 1.4459640979766846, + "learning_rate": 9.76142105263158e-05, + "loss": 0.4235, + "step": 29080 + }, + { + "epoch": 1.6284578340239668, + "grad_norm": 1.193831443786621, + "learning_rate": 9.761394736842106e-05, + "loss": 0.441, + "step": 29081 + }, + { + "epoch": 1.6285138313360958, + "grad_norm": 1.1125028133392334, + "learning_rate": 9.761368421052632e-05, + "loss": 0.4223, + "step": 29082 + }, + { + "epoch": 1.6285698286482249, + "grad_norm": 1.3093454837799072, + "learning_rate": 
9.761342105263158e-05, + "loss": 0.4256, + "step": 29083 + }, + { + "epoch": 1.6286258259603539, + "grad_norm": 2.2786388397216797, + "learning_rate": 9.761315789473685e-05, + "loss": 0.54, + "step": 29084 + }, + { + "epoch": 1.628681823272483, + "grad_norm": 1.323691964149475, + "learning_rate": 9.761289473684211e-05, + "loss": 0.5686, + "step": 29085 + }, + { + "epoch": 1.628737820584612, + "grad_norm": 1.310255765914917, + "learning_rate": 9.761263157894737e-05, + "loss": 0.5394, + "step": 29086 + }, + { + "epoch": 1.628793817896741, + "grad_norm": 1.7696998119354248, + "learning_rate": 9.761236842105263e-05, + "loss": 0.5722, + "step": 29087 + }, + { + "epoch": 1.62884981520887, + "grad_norm": 1.2077407836914062, + "learning_rate": 9.76121052631579e-05, + "loss": 0.3398, + "step": 29088 + }, + { + "epoch": 1.628905812520999, + "grad_norm": 1.19025719165802, + "learning_rate": 9.761184210526316e-05, + "loss": 0.4873, + "step": 29089 + }, + { + "epoch": 1.628961809833128, + "grad_norm": 1.1577588319778442, + "learning_rate": 9.761157894736842e-05, + "loss": 0.3893, + "step": 29090 + }, + { + "epoch": 1.629017807145257, + "grad_norm": 1.648565649986267, + "learning_rate": 9.761131578947368e-05, + "loss": 0.5877, + "step": 29091 + }, + { + "epoch": 1.629073804457386, + "grad_norm": 1.1934082508087158, + "learning_rate": 9.761105263157894e-05, + "loss": 0.4315, + "step": 29092 + }, + { + "epoch": 1.629129801769515, + "grad_norm": 1.4596374034881592, + "learning_rate": 9.761078947368422e-05, + "loss": 0.5484, + "step": 29093 + }, + { + "epoch": 1.629185799081644, + "grad_norm": 1.3889191150665283, + "learning_rate": 9.761052631578948e-05, + "loss": 0.4731, + "step": 29094 + }, + { + "epoch": 1.629241796393773, + "grad_norm": 1.4306739568710327, + "learning_rate": 9.761026315789475e-05, + "loss": 0.6296, + "step": 29095 + }, + { + "epoch": 1.629297793705902, + "grad_norm": 1.3662248849868774, + "learning_rate": 9.761e-05, + "loss": 0.4153, + "step": 29096 + }, + { + 
"epoch": 1.6293537910180311, + "grad_norm": 1.737163782119751, + "learning_rate": 9.760973684210527e-05, + "loss": 0.5097, + "step": 29097 + }, + { + "epoch": 1.6294097883301601, + "grad_norm": 1.5683296918869019, + "learning_rate": 9.760947368421053e-05, + "loss": 0.4128, + "step": 29098 + }, + { + "epoch": 1.6294657856422892, + "grad_norm": 1.4416677951812744, + "learning_rate": 9.76092105263158e-05, + "loss": 0.4875, + "step": 29099 + }, + { + "epoch": 1.6295217829544182, + "grad_norm": 1.2504794597625732, + "learning_rate": 9.760894736842106e-05, + "loss": 0.3317, + "step": 29100 + }, + { + "epoch": 1.6295777802665472, + "grad_norm": 1.3671859502792358, + "learning_rate": 9.760868421052632e-05, + "loss": 0.513, + "step": 29101 + }, + { + "epoch": 1.6296337775786762, + "grad_norm": 1.8179361820220947, + "learning_rate": 9.760842105263158e-05, + "loss": 0.5065, + "step": 29102 + }, + { + "epoch": 1.6296897748908052, + "grad_norm": 1.1108883619308472, + "learning_rate": 9.760815789473685e-05, + "loss": 0.3864, + "step": 29103 + }, + { + "epoch": 1.6297457722029343, + "grad_norm": 15.233407020568848, + "learning_rate": 9.760789473684211e-05, + "loss": 0.3712, + "step": 29104 + }, + { + "epoch": 1.6298017695150633, + "grad_norm": 1.2472684383392334, + "learning_rate": 9.760763157894737e-05, + "loss": 0.3965, + "step": 29105 + }, + { + "epoch": 1.6298577668271923, + "grad_norm": 2.189598560333252, + "learning_rate": 9.760736842105263e-05, + "loss": 0.559, + "step": 29106 + }, + { + "epoch": 1.6299137641393213, + "grad_norm": 1.4176472425460815, + "learning_rate": 9.760710526315789e-05, + "loss": 0.443, + "step": 29107 + }, + { + "epoch": 1.6299697614514503, + "grad_norm": 1.387403964996338, + "learning_rate": 9.760684210526317e-05, + "loss": 0.5454, + "step": 29108 + }, + { + "epoch": 1.6300257587635794, + "grad_norm": 1.6350126266479492, + "learning_rate": 9.760657894736843e-05, + "loss": 0.4206, + "step": 29109 + }, + { + "epoch": 1.6300817560757084, + "grad_norm": 
1.2087743282318115, + "learning_rate": 9.760631578947369e-05, + "loss": 0.4521, + "step": 29110 + }, + { + "epoch": 1.6301377533878374, + "grad_norm": 1.3117601871490479, + "learning_rate": 9.760605263157895e-05, + "loss": 0.5464, + "step": 29111 + }, + { + "epoch": 1.6301937506999664, + "grad_norm": 1.255249261856079, + "learning_rate": 9.760578947368422e-05, + "loss": 0.4812, + "step": 29112 + }, + { + "epoch": 1.6302497480120954, + "grad_norm": 1.740799903869629, + "learning_rate": 9.760552631578948e-05, + "loss": 0.4692, + "step": 29113 + }, + { + "epoch": 1.6303057453242245, + "grad_norm": 1.5625931024551392, + "learning_rate": 9.760526315789474e-05, + "loss": 0.4163, + "step": 29114 + }, + { + "epoch": 1.6303617426363535, + "grad_norm": 1.2539629936218262, + "learning_rate": 9.7605e-05, + "loss": 0.4085, + "step": 29115 + }, + { + "epoch": 1.6304177399484825, + "grad_norm": 1.3470768928527832, + "learning_rate": 9.760473684210527e-05, + "loss": 0.4537, + "step": 29116 + }, + { + "epoch": 1.6304737372606115, + "grad_norm": 1.1457774639129639, + "learning_rate": 9.760447368421053e-05, + "loss": 0.451, + "step": 29117 + }, + { + "epoch": 1.6305297345727405, + "grad_norm": 1.6563763618469238, + "learning_rate": 9.76042105263158e-05, + "loss": 0.6096, + "step": 29118 + }, + { + "epoch": 1.6305857318848695, + "grad_norm": 1.1781911849975586, + "learning_rate": 9.760394736842105e-05, + "loss": 0.4572, + "step": 29119 + }, + { + "epoch": 1.6306417291969986, + "grad_norm": 1.3582149744033813, + "learning_rate": 9.760368421052632e-05, + "loss": 0.5164, + "step": 29120 + }, + { + "epoch": 1.6306977265091276, + "grad_norm": 1.073562502861023, + "learning_rate": 9.760342105263158e-05, + "loss": 0.3497, + "step": 29121 + }, + { + "epoch": 1.6307537238212566, + "grad_norm": 1.3410911560058594, + "learning_rate": 9.760315789473686e-05, + "loss": 0.488, + "step": 29122 + }, + { + "epoch": 1.6308097211333856, + "grad_norm": 1.3580961227416992, + "learning_rate": 
9.760289473684212e-05, + "loss": 0.5321, + "step": 29123 + }, + { + "epoch": 1.6308657184455146, + "grad_norm": 1.290549635887146, + "learning_rate": 9.760263157894736e-05, + "loss": 0.5001, + "step": 29124 + }, + { + "epoch": 1.6309217157576437, + "grad_norm": 1.5435512065887451, + "learning_rate": 9.760236842105264e-05, + "loss": 0.6849, + "step": 29125 + }, + { + "epoch": 1.6309777130697727, + "grad_norm": 1.1113272905349731, + "learning_rate": 9.76021052631579e-05, + "loss": 0.3134, + "step": 29126 + }, + { + "epoch": 1.6310337103819017, + "grad_norm": 1.1967477798461914, + "learning_rate": 9.760184210526317e-05, + "loss": 0.3766, + "step": 29127 + }, + { + "epoch": 1.6310897076940307, + "grad_norm": 1.4602841138839722, + "learning_rate": 9.760157894736842e-05, + "loss": 0.5771, + "step": 29128 + }, + { + "epoch": 1.6311457050061597, + "grad_norm": 1.3339039087295532, + "learning_rate": 9.760131578947369e-05, + "loss": 0.4594, + "step": 29129 + }, + { + "epoch": 1.6312017023182888, + "grad_norm": 1.2210233211517334, + "learning_rate": 9.760105263157895e-05, + "loss": 0.4366, + "step": 29130 + }, + { + "epoch": 1.6312576996304178, + "grad_norm": 1.192659616470337, + "learning_rate": 9.760078947368422e-05, + "loss": 0.5058, + "step": 29131 + }, + { + "epoch": 1.6313136969425468, + "grad_norm": 1.5911970138549805, + "learning_rate": 9.760052631578948e-05, + "loss": 0.4466, + "step": 29132 + }, + { + "epoch": 1.6313696942546758, + "grad_norm": 1.1969664096832275, + "learning_rate": 9.760026315789474e-05, + "loss": 0.4689, + "step": 29133 + }, + { + "epoch": 1.6314256915668048, + "grad_norm": 1.3877958059310913, + "learning_rate": 9.76e-05, + "loss": 0.3713, + "step": 29134 + }, + { + "epoch": 1.6314816888789339, + "grad_norm": 1.6576367616653442, + "learning_rate": 9.759973684210527e-05, + "loss": 0.5546, + "step": 29135 + }, + { + "epoch": 1.6315376861910629, + "grad_norm": 1.4091688394546509, + "learning_rate": 9.759947368421053e-05, + "loss": 0.4724, + "step": 
29136 + }, + { + "epoch": 1.631593683503192, + "grad_norm": 1.2410529851913452, + "learning_rate": 9.75992105263158e-05, + "loss": 0.4976, + "step": 29137 + }, + { + "epoch": 1.631649680815321, + "grad_norm": 1.290068507194519, + "learning_rate": 9.759894736842105e-05, + "loss": 0.5175, + "step": 29138 + }, + { + "epoch": 1.63170567812745, + "grad_norm": 1.3230829238891602, + "learning_rate": 9.759868421052633e-05, + "loss": 0.357, + "step": 29139 + }, + { + "epoch": 1.631761675439579, + "grad_norm": 1.3626868724822998, + "learning_rate": 9.759842105263159e-05, + "loss": 0.6765, + "step": 29140 + }, + { + "epoch": 1.631817672751708, + "grad_norm": 1.5146164894104004, + "learning_rate": 9.759815789473685e-05, + "loss": 0.5233, + "step": 29141 + }, + { + "epoch": 1.631873670063837, + "grad_norm": 1.4057866334915161, + "learning_rate": 9.75978947368421e-05, + "loss": 0.3804, + "step": 29142 + }, + { + "epoch": 1.631929667375966, + "grad_norm": 1.1456196308135986, + "learning_rate": 9.759763157894737e-05, + "loss": 0.4014, + "step": 29143 + }, + { + "epoch": 1.631985664688095, + "grad_norm": 1.3079019784927368, + "learning_rate": 9.759736842105264e-05, + "loss": 0.5138, + "step": 29144 + }, + { + "epoch": 1.632041662000224, + "grad_norm": 1.0659583806991577, + "learning_rate": 9.75971052631579e-05, + "loss": 0.4265, + "step": 29145 + }, + { + "epoch": 1.632097659312353, + "grad_norm": 1.377334713935852, + "learning_rate": 9.759684210526316e-05, + "loss": 0.4598, + "step": 29146 + }, + { + "epoch": 1.632153656624482, + "grad_norm": 1.3103435039520264, + "learning_rate": 9.759657894736842e-05, + "loss": 0.3911, + "step": 29147 + }, + { + "epoch": 1.632209653936611, + "grad_norm": 1.2591474056243896, + "learning_rate": 9.759631578947369e-05, + "loss": 0.4053, + "step": 29148 + }, + { + "epoch": 1.6322656512487401, + "grad_norm": 1.6633414030075073, + "learning_rate": 9.759605263157895e-05, + "loss": 0.4674, + "step": 29149 + }, + { + "epoch": 1.6323216485608691, + 
"grad_norm": 1.1004396677017212, + "learning_rate": 9.759578947368422e-05, + "loss": 0.3968, + "step": 29150 + }, + { + "epoch": 1.6323776458729982, + "grad_norm": 1.4744101762771606, + "learning_rate": 9.759552631578947e-05, + "loss": 0.4244, + "step": 29151 + }, + { + "epoch": 1.6324336431851272, + "grad_norm": 1.2575689554214478, + "learning_rate": 9.759526315789474e-05, + "loss": 0.4432, + "step": 29152 + }, + { + "epoch": 1.6324896404972562, + "grad_norm": 1.2009389400482178, + "learning_rate": 9.7595e-05, + "loss": 0.4098, + "step": 29153 + }, + { + "epoch": 1.6325456378093852, + "grad_norm": 1.4981553554534912, + "learning_rate": 9.759473684210528e-05, + "loss": 0.4769, + "step": 29154 + }, + { + "epoch": 1.6326016351215142, + "grad_norm": 1.6977664232254028, + "learning_rate": 9.759447368421054e-05, + "loss": 0.4882, + "step": 29155 + }, + { + "epoch": 1.6326576324336433, + "grad_norm": 0.9824967384338379, + "learning_rate": 9.75942105263158e-05, + "loss": 0.3928, + "step": 29156 + }, + { + "epoch": 1.6327136297457723, + "grad_norm": 1.3059180974960327, + "learning_rate": 9.759394736842106e-05, + "loss": 0.5373, + "step": 29157 + }, + { + "epoch": 1.6327696270579013, + "grad_norm": 1.1779322624206543, + "learning_rate": 9.759368421052632e-05, + "loss": 0.4314, + "step": 29158 + }, + { + "epoch": 1.6328256243700303, + "grad_norm": 1.2194240093231201, + "learning_rate": 9.759342105263159e-05, + "loss": 0.4762, + "step": 29159 + }, + { + "epoch": 1.6328816216821593, + "grad_norm": 1.175771713256836, + "learning_rate": 9.759315789473685e-05, + "loss": 0.4765, + "step": 29160 + }, + { + "epoch": 1.6329376189942884, + "grad_norm": 1.2643389701843262, + "learning_rate": 9.759289473684211e-05, + "loss": 0.4175, + "step": 29161 + }, + { + "epoch": 1.6329936163064174, + "grad_norm": 1.2959729433059692, + "learning_rate": 9.759263157894737e-05, + "loss": 0.452, + "step": 29162 + }, + { + "epoch": 1.6330496136185464, + "grad_norm": 1.3463983535766602, + 
"learning_rate": 9.759236842105264e-05, + "loss": 0.3897, + "step": 29163 + }, + { + "epoch": 1.6331056109306754, + "grad_norm": 1.2260736227035522, + "learning_rate": 9.75921052631579e-05, + "loss": 0.46, + "step": 29164 + }, + { + "epoch": 1.6331616082428044, + "grad_norm": 1.4508298635482788, + "learning_rate": 9.759184210526316e-05, + "loss": 0.5266, + "step": 29165 + }, + { + "epoch": 1.6332176055549334, + "grad_norm": 1.2655651569366455, + "learning_rate": 9.759157894736842e-05, + "loss": 0.4594, + "step": 29166 + }, + { + "epoch": 1.6332736028670625, + "grad_norm": 1.4045745134353638, + "learning_rate": 9.75913157894737e-05, + "loss": 0.7315, + "step": 29167 + }, + { + "epoch": 1.6333296001791915, + "grad_norm": 1.1083928346633911, + "learning_rate": 9.759105263157895e-05, + "loss": 0.4991, + "step": 29168 + }, + { + "epoch": 1.6333855974913205, + "grad_norm": 1.4445360898971558, + "learning_rate": 9.759078947368421e-05, + "loss": 0.5848, + "step": 29169 + }, + { + "epoch": 1.6334415948034495, + "grad_norm": 1.4294407367706299, + "learning_rate": 9.759052631578947e-05, + "loss": 0.5486, + "step": 29170 + }, + { + "epoch": 1.6334975921155785, + "grad_norm": 1.4833860397338867, + "learning_rate": 9.759026315789475e-05, + "loss": 0.5012, + "step": 29171 + }, + { + "epoch": 1.6335535894277076, + "grad_norm": 1.4099397659301758, + "learning_rate": 9.759e-05, + "loss": 0.3433, + "step": 29172 + }, + { + "epoch": 1.6336095867398366, + "grad_norm": 5.987528324127197, + "learning_rate": 9.758973684210528e-05, + "loss": 0.5535, + "step": 29173 + }, + { + "epoch": 1.6336655840519656, + "grad_norm": 1.8035095930099487, + "learning_rate": 9.758947368421053e-05, + "loss": 0.4982, + "step": 29174 + }, + { + "epoch": 1.6337215813640946, + "grad_norm": 1.0976383686065674, + "learning_rate": 9.75892105263158e-05, + "loss": 0.3637, + "step": 29175 + }, + { + "epoch": 1.6337775786762236, + "grad_norm": 1.1581015586853027, + "learning_rate": 9.758894736842106e-05, + "loss": 
0.4259, + "step": 29176 + }, + { + "epoch": 1.6338335759883527, + "grad_norm": 1.3488553762435913, + "learning_rate": 9.758868421052632e-05, + "loss": 0.409, + "step": 29177 + }, + { + "epoch": 1.6338895733004817, + "grad_norm": 1.1652346849441528, + "learning_rate": 9.758842105263158e-05, + "loss": 0.478, + "step": 29178 + }, + { + "epoch": 1.6339455706126107, + "grad_norm": 1.2191673517227173, + "learning_rate": 9.758815789473684e-05, + "loss": 0.3877, + "step": 29179 + }, + { + "epoch": 1.6340015679247397, + "grad_norm": 1.2604730129241943, + "learning_rate": 9.758789473684211e-05, + "loss": 0.4978, + "step": 29180 + }, + { + "epoch": 1.6340575652368687, + "grad_norm": 1.6360100507736206, + "learning_rate": 9.758763157894737e-05, + "loss": 0.3549, + "step": 29181 + }, + { + "epoch": 1.6341135625489978, + "grad_norm": 1.3707400560379028, + "learning_rate": 9.758736842105264e-05, + "loss": 0.4374, + "step": 29182 + }, + { + "epoch": 1.6341695598611268, + "grad_norm": 1.3481628894805908, + "learning_rate": 9.758710526315789e-05, + "loss": 0.546, + "step": 29183 + }, + { + "epoch": 1.6342255571732558, + "grad_norm": 1.3352290391921997, + "learning_rate": 9.758684210526316e-05, + "loss": 0.5488, + "step": 29184 + }, + { + "epoch": 1.6342815544853848, + "grad_norm": 1.357681393623352, + "learning_rate": 9.758657894736842e-05, + "loss": 0.4132, + "step": 29185 + }, + { + "epoch": 1.6343375517975138, + "grad_norm": 1.2435659170150757, + "learning_rate": 9.75863157894737e-05, + "loss": 0.3325, + "step": 29186 + }, + { + "epoch": 1.6343935491096429, + "grad_norm": 1.1963164806365967, + "learning_rate": 9.758605263157896e-05, + "loss": 0.448, + "step": 29187 + }, + { + "epoch": 1.6344495464217719, + "grad_norm": 1.3451859951019287, + "learning_rate": 9.758578947368422e-05, + "loss": 0.5199, + "step": 29188 + }, + { + "epoch": 1.634505543733901, + "grad_norm": 1.3353017568588257, + "learning_rate": 9.758552631578948e-05, + "loss": 0.6076, + "step": 29189 + }, + { + "epoch": 
1.63456154104603, + "grad_norm": 1.2593022584915161, + "learning_rate": 9.758526315789475e-05, + "loss": 0.53, + "step": 29190 + }, + { + "epoch": 1.634617538358159, + "grad_norm": 1.5316128730773926, + "learning_rate": 9.758500000000001e-05, + "loss": 0.4194, + "step": 29191 + }, + { + "epoch": 1.634673535670288, + "grad_norm": 1.4630426168441772, + "learning_rate": 9.758473684210527e-05, + "loss": 0.472, + "step": 29192 + }, + { + "epoch": 1.634729532982417, + "grad_norm": 1.186896800994873, + "learning_rate": 9.758447368421053e-05, + "loss": 0.4558, + "step": 29193 + }, + { + "epoch": 1.634785530294546, + "grad_norm": 1.35330331325531, + "learning_rate": 9.758421052631579e-05, + "loss": 0.4215, + "step": 29194 + }, + { + "epoch": 1.634841527606675, + "grad_norm": 1.1766036748886108, + "learning_rate": 9.758394736842106e-05, + "loss": 0.3758, + "step": 29195 + }, + { + "epoch": 1.634897524918804, + "grad_norm": 2.030928373336792, + "learning_rate": 9.758368421052632e-05, + "loss": 0.485, + "step": 29196 + }, + { + "epoch": 1.634953522230933, + "grad_norm": 1.5365872383117676, + "learning_rate": 9.758342105263158e-05, + "loss": 0.7332, + "step": 29197 + }, + { + "epoch": 1.635009519543062, + "grad_norm": 1.1595895290374756, + "learning_rate": 9.758315789473684e-05, + "loss": 0.3959, + "step": 29198 + }, + { + "epoch": 1.635065516855191, + "grad_norm": 1.3030779361724854, + "learning_rate": 9.758289473684211e-05, + "loss": 0.4442, + "step": 29199 + }, + { + "epoch": 1.63512151416732, + "grad_norm": 1.7303502559661865, + "learning_rate": 9.758263157894737e-05, + "loss": 0.411, + "step": 29200 + }, + { + "epoch": 1.6351775114794491, + "grad_norm": 1.6569221019744873, + "learning_rate": 9.758236842105263e-05, + "loss": 0.6445, + "step": 29201 + }, + { + "epoch": 1.6352335087915781, + "grad_norm": 1.3060539960861206, + "learning_rate": 9.758210526315789e-05, + "loss": 0.4981, + "step": 29202 + }, + { + "epoch": 1.6352895061037072, + "grad_norm": 1.2461727857589722, + 
"learning_rate": 9.758184210526317e-05, + "loss": 0.5195, + "step": 29203 + }, + { + "epoch": 1.6353455034158362, + "grad_norm": 1.407566785812378, + "learning_rate": 9.758157894736843e-05, + "loss": 0.4992, + "step": 29204 + }, + { + "epoch": 1.6354015007279652, + "grad_norm": 1.8369650840759277, + "learning_rate": 9.75813157894737e-05, + "loss": 0.4087, + "step": 29205 + }, + { + "epoch": 1.6354574980400942, + "grad_norm": 1.3014521598815918, + "learning_rate": 9.758105263157895e-05, + "loss": 0.4605, + "step": 29206 + }, + { + "epoch": 1.6355134953522232, + "grad_norm": 1.6740087270736694, + "learning_rate": 9.758078947368422e-05, + "loss": 0.4178, + "step": 29207 + }, + { + "epoch": 1.6355694926643523, + "grad_norm": 1.2325019836425781, + "learning_rate": 9.758052631578948e-05, + "loss": 0.405, + "step": 29208 + }, + { + "epoch": 1.6356254899764813, + "grad_norm": 1.3273875713348389, + "learning_rate": 9.758026315789475e-05, + "loss": 0.3583, + "step": 29209 + }, + { + "epoch": 1.6356814872886103, + "grad_norm": 1.1015675067901611, + "learning_rate": 9.758000000000001e-05, + "loss": 0.3004, + "step": 29210 + }, + { + "epoch": 1.6357374846007393, + "grad_norm": 1.7572027444839478, + "learning_rate": 9.757973684210526e-05, + "loss": 0.5247, + "step": 29211 + }, + { + "epoch": 1.6357934819128683, + "grad_norm": 1.3972452878952026, + "learning_rate": 9.757947368421053e-05, + "loss": 0.4335, + "step": 29212 + }, + { + "epoch": 1.6358494792249973, + "grad_norm": 1.4027369022369385, + "learning_rate": 9.757921052631579e-05, + "loss": 0.4834, + "step": 29213 + }, + { + "epoch": 1.6359054765371264, + "grad_norm": 1.4487584829330444, + "learning_rate": 9.757894736842106e-05, + "loss": 0.4702, + "step": 29214 + }, + { + "epoch": 1.6359614738492554, + "grad_norm": 1.4246277809143066, + "learning_rate": 9.757868421052632e-05, + "loss": 0.4236, + "step": 29215 + }, + { + "epoch": 1.6360174711613844, + "grad_norm": 1.6195552349090576, + "learning_rate": 9.757842105263158e-05, 
+ "loss": 0.407, + "step": 29216 + }, + { + "epoch": 1.6360734684735134, + "grad_norm": 1.6442829370498657, + "learning_rate": 9.757815789473684e-05, + "loss": 0.4091, + "step": 29217 + }, + { + "epoch": 1.6361294657856424, + "grad_norm": 1.4494110345840454, + "learning_rate": 9.757789473684212e-05, + "loss": 0.4088, + "step": 29218 + }, + { + "epoch": 1.6361854630977715, + "grad_norm": 1.286902666091919, + "learning_rate": 9.757763157894738e-05, + "loss": 0.4466, + "step": 29219 + }, + { + "epoch": 1.6362414604099005, + "grad_norm": 1.5643664598464966, + "learning_rate": 9.757736842105264e-05, + "loss": 0.6074, + "step": 29220 + }, + { + "epoch": 1.6362974577220295, + "grad_norm": 1.476778507232666, + "learning_rate": 9.75771052631579e-05, + "loss": 0.4695, + "step": 29221 + }, + { + "epoch": 1.6363534550341585, + "grad_norm": 1.1702582836151123, + "learning_rate": 9.757684210526317e-05, + "loss": 0.4855, + "step": 29222 + }, + { + "epoch": 1.6364094523462875, + "grad_norm": 1.292647123336792, + "learning_rate": 9.757657894736843e-05, + "loss": 0.4206, + "step": 29223 + }, + { + "epoch": 1.6364654496584166, + "grad_norm": 1.072305679321289, + "learning_rate": 9.757631578947369e-05, + "loss": 0.4266, + "step": 29224 + }, + { + "epoch": 1.6365214469705456, + "grad_norm": 2.024962902069092, + "learning_rate": 9.757605263157895e-05, + "loss": 0.5598, + "step": 29225 + }, + { + "epoch": 1.6365774442826744, + "grad_norm": 1.2907689809799194, + "learning_rate": 9.757578947368422e-05, + "loss": 0.4269, + "step": 29226 + }, + { + "epoch": 1.6366334415948034, + "grad_norm": 1.6650328636169434, + "learning_rate": 9.757552631578948e-05, + "loss": 0.635, + "step": 29227 + }, + { + "epoch": 1.6366894389069324, + "grad_norm": 1.3826417922973633, + "learning_rate": 9.757526315789474e-05, + "loss": 0.4345, + "step": 29228 + }, + { + "epoch": 1.6367454362190614, + "grad_norm": 2.3610739707946777, + "learning_rate": 9.7575e-05, + "loss": 0.5797, + "step": 29229 + }, + { + "epoch": 
1.6368014335311905, + "grad_norm": 1.8455467224121094, + "learning_rate": 9.757473684210526e-05, + "loss": 0.7451, + "step": 29230 + }, + { + "epoch": 1.6368574308433195, + "grad_norm": 1.270052433013916, + "learning_rate": 9.757447368421053e-05, + "loss": 0.4652, + "step": 29231 + }, + { + "epoch": 1.6369134281554485, + "grad_norm": 1.677120566368103, + "learning_rate": 9.757421052631579e-05, + "loss": 0.4804, + "step": 29232 + }, + { + "epoch": 1.6369694254675775, + "grad_norm": 1.7527693510055542, + "learning_rate": 9.757394736842105e-05, + "loss": 0.3979, + "step": 29233 + }, + { + "epoch": 1.6370254227797065, + "grad_norm": 1.2408288717269897, + "learning_rate": 9.757368421052631e-05, + "loss": 0.4502, + "step": 29234 + }, + { + "epoch": 1.6370814200918355, + "grad_norm": 1.2545652389526367, + "learning_rate": 9.757342105263159e-05, + "loss": 0.4549, + "step": 29235 + }, + { + "epoch": 1.6371374174039646, + "grad_norm": 1.446791648864746, + "learning_rate": 9.757315789473685e-05, + "loss": 0.5472, + "step": 29236 + }, + { + "epoch": 1.6371934147160936, + "grad_norm": 1.5149366855621338, + "learning_rate": 9.757289473684212e-05, + "loss": 0.4647, + "step": 29237 + }, + { + "epoch": 1.6372494120282226, + "grad_norm": 1.4112406969070435, + "learning_rate": 9.757263157894736e-05, + "loss": 0.5926, + "step": 29238 + }, + { + "epoch": 1.6373054093403516, + "grad_norm": 1.5062649250030518, + "learning_rate": 9.757236842105264e-05, + "loss": 0.3953, + "step": 29239 + }, + { + "epoch": 1.6373614066524806, + "grad_norm": 1.1949267387390137, + "learning_rate": 9.75721052631579e-05, + "loss": 0.3226, + "step": 29240 + }, + { + "epoch": 1.6374174039646097, + "grad_norm": 1.3616310358047485, + "learning_rate": 9.757184210526317e-05, + "loss": 0.5263, + "step": 29241 + }, + { + "epoch": 1.6374734012767387, + "grad_norm": 1.2105578184127808, + "learning_rate": 9.757157894736843e-05, + "loss": 0.4264, + "step": 29242 + }, + { + "epoch": 1.6375293985888677, + "grad_norm": 
1.3974268436431885, + "learning_rate": 9.757131578947369e-05, + "loss": 0.5389, + "step": 29243 + }, + { + "epoch": 1.6375853959009967, + "grad_norm": 1.3129873275756836, + "learning_rate": 9.757105263157895e-05, + "loss": 0.4387, + "step": 29244 + }, + { + "epoch": 1.6376413932131257, + "grad_norm": 1.1442760229110718, + "learning_rate": 9.757078947368421e-05, + "loss": 0.4952, + "step": 29245 + }, + { + "epoch": 1.6376973905252548, + "grad_norm": 1.4596924781799316, + "learning_rate": 9.757052631578948e-05, + "loss": 0.4531, + "step": 29246 + }, + { + "epoch": 1.6377533878373838, + "grad_norm": 1.195422649383545, + "learning_rate": 9.757026315789474e-05, + "loss": 0.4395, + "step": 29247 + }, + { + "epoch": 1.6378093851495128, + "grad_norm": 1.3700623512268066, + "learning_rate": 9.757e-05, + "loss": 0.3675, + "step": 29248 + }, + { + "epoch": 1.6378653824616418, + "grad_norm": 1.3969826698303223, + "learning_rate": 9.756973684210526e-05, + "loss": 0.4904, + "step": 29249 + }, + { + "epoch": 1.6379213797737708, + "grad_norm": 1.2775415182113647, + "learning_rate": 9.756947368421054e-05, + "loss": 0.528, + "step": 29250 + }, + { + "epoch": 1.6379773770858999, + "grad_norm": 1.1494224071502686, + "learning_rate": 9.75692105263158e-05, + "loss": 0.3905, + "step": 29251 + }, + { + "epoch": 1.6380333743980289, + "grad_norm": 1.369499683380127, + "learning_rate": 9.756894736842106e-05, + "loss": 0.5188, + "step": 29252 + }, + { + "epoch": 1.638089371710158, + "grad_norm": 1.2361911535263062, + "learning_rate": 9.756868421052631e-05, + "loss": 0.6881, + "step": 29253 + }, + { + "epoch": 1.638145369022287, + "grad_norm": 1.3087455034255981, + "learning_rate": 9.756842105263159e-05, + "loss": 0.5997, + "step": 29254 + }, + { + "epoch": 1.638201366334416, + "grad_norm": 1.2577117681503296, + "learning_rate": 9.756815789473685e-05, + "loss": 0.429, + "step": 29255 + }, + { + "epoch": 1.638257363646545, + "grad_norm": 1.2478843927383423, + "learning_rate": 
9.756789473684211e-05, + "loss": 0.5154, + "step": 29256 + }, + { + "epoch": 1.638313360958674, + "grad_norm": 1.2429277896881104, + "learning_rate": 9.756763157894737e-05, + "loss": 0.4897, + "step": 29257 + }, + { + "epoch": 1.638369358270803, + "grad_norm": 1.3642586469650269, + "learning_rate": 9.756736842105264e-05, + "loss": 0.4276, + "step": 29258 + }, + { + "epoch": 1.638425355582932, + "grad_norm": 1.3517820835113525, + "learning_rate": 9.75671052631579e-05, + "loss": 0.4568, + "step": 29259 + }, + { + "epoch": 1.638481352895061, + "grad_norm": 1.2205671072006226, + "learning_rate": 9.756684210526317e-05, + "loss": 0.4527, + "step": 29260 + }, + { + "epoch": 1.63853735020719, + "grad_norm": 1.2526224851608276, + "learning_rate": 9.756657894736842e-05, + "loss": 0.5031, + "step": 29261 + }, + { + "epoch": 1.638593347519319, + "grad_norm": 1.354042649269104, + "learning_rate": 9.756631578947368e-05, + "loss": 0.3161, + "step": 29262 + }, + { + "epoch": 1.638649344831448, + "grad_norm": 1.4771968126296997, + "learning_rate": 9.756605263157895e-05, + "loss": 0.5773, + "step": 29263 + }, + { + "epoch": 1.638705342143577, + "grad_norm": 1.1587164402008057, + "learning_rate": 9.756578947368421e-05, + "loss": 0.3288, + "step": 29264 + }, + { + "epoch": 1.6387613394557061, + "grad_norm": 1.5697356462478638, + "learning_rate": 9.756552631578949e-05, + "loss": 0.5518, + "step": 29265 + }, + { + "epoch": 1.6388173367678351, + "grad_norm": 1.3110226392745972, + "learning_rate": 9.756526315789473e-05, + "loss": 0.4752, + "step": 29266 + }, + { + "epoch": 1.6388733340799642, + "grad_norm": 1.389346718788147, + "learning_rate": 9.7565e-05, + "loss": 0.5271, + "step": 29267 + }, + { + "epoch": 1.6389293313920932, + "grad_norm": 1.3497350215911865, + "learning_rate": 9.756473684210526e-05, + "loss": 0.341, + "step": 29268 + }, + { + "epoch": 1.6389853287042222, + "grad_norm": 1.2732964754104614, + "learning_rate": 9.756447368421054e-05, + "loss": 0.4535, + "step": 29269 + 
}, + { + "epoch": 1.6390413260163512, + "grad_norm": 1.4268783330917358, + "learning_rate": 9.75642105263158e-05, + "loss": 0.5771, + "step": 29270 + }, + { + "epoch": 1.6390973233284802, + "grad_norm": 1.372564435005188, + "learning_rate": 9.756394736842106e-05, + "loss": 0.4146, + "step": 29271 + }, + { + "epoch": 1.6391533206406093, + "grad_norm": 1.3734673261642456, + "learning_rate": 9.756368421052632e-05, + "loss": 0.4694, + "step": 29272 + }, + { + "epoch": 1.6392093179527383, + "grad_norm": 1.1530158519744873, + "learning_rate": 9.756342105263159e-05, + "loss": 0.4468, + "step": 29273 + }, + { + "epoch": 1.6392653152648673, + "grad_norm": 9.442240715026855, + "learning_rate": 9.756315789473685e-05, + "loss": 0.4221, + "step": 29274 + }, + { + "epoch": 1.6393213125769963, + "grad_norm": 1.2248588800430298, + "learning_rate": 9.756289473684211e-05, + "loss": 0.4177, + "step": 29275 + }, + { + "epoch": 1.6393773098891253, + "grad_norm": 1.0724364519119263, + "learning_rate": 9.756263157894737e-05, + "loss": 0.2904, + "step": 29276 + }, + { + "epoch": 1.6394333072012544, + "grad_norm": 1.2486066818237305, + "learning_rate": 9.756236842105264e-05, + "loss": 0.4967, + "step": 29277 + }, + { + "epoch": 1.6394893045133834, + "grad_norm": 1.1921827793121338, + "learning_rate": 9.75621052631579e-05, + "loss": 0.4251, + "step": 29278 + }, + { + "epoch": 1.6395453018255124, + "grad_norm": 1.30080246925354, + "learning_rate": 9.756184210526316e-05, + "loss": 0.4643, + "step": 29279 + }, + { + "epoch": 1.6396012991376414, + "grad_norm": 1.4581222534179688, + "learning_rate": 9.756157894736842e-05, + "loss": 0.5637, + "step": 29280 + }, + { + "epoch": 1.6396572964497704, + "grad_norm": 1.3218766450881958, + "learning_rate": 9.756131578947368e-05, + "loss": 0.416, + "step": 29281 + }, + { + "epoch": 1.6397132937618994, + "grad_norm": 1.4028072357177734, + "learning_rate": 9.756105263157896e-05, + "loss": 0.4125, + "step": 29282 + }, + { + "epoch": 1.6397692910740285, + 
"grad_norm": 1.1699087619781494, + "learning_rate": 9.756078947368422e-05, + "loss": 0.3245, + "step": 29283 + }, + { + "epoch": 1.6398252883861575, + "grad_norm": 1.3882259130477905, + "learning_rate": 9.756052631578947e-05, + "loss": 0.456, + "step": 29284 + }, + { + "epoch": 1.6398812856982865, + "grad_norm": 1.126474142074585, + "learning_rate": 9.756026315789473e-05, + "loss": 0.3478, + "step": 29285 + }, + { + "epoch": 1.6399372830104155, + "grad_norm": 1.2113491296768188, + "learning_rate": 9.756000000000001e-05, + "loss": 0.4296, + "step": 29286 + }, + { + "epoch": 1.6399932803225445, + "grad_norm": 1.5506101846694946, + "learning_rate": 9.755973684210527e-05, + "loss": 0.5387, + "step": 29287 + }, + { + "epoch": 1.6400492776346736, + "grad_norm": 1.283271074295044, + "learning_rate": 9.755947368421053e-05, + "loss": 0.5906, + "step": 29288 + }, + { + "epoch": 1.6401052749468026, + "grad_norm": 1.549760103225708, + "learning_rate": 9.755921052631579e-05, + "loss": 0.7714, + "step": 29289 + }, + { + "epoch": 1.6401612722589316, + "grad_norm": 1.3226968050003052, + "learning_rate": 9.755894736842106e-05, + "loss": 0.3748, + "step": 29290 + }, + { + "epoch": 1.6402172695710606, + "grad_norm": 1.3081238269805908, + "learning_rate": 9.755868421052632e-05, + "loss": 0.409, + "step": 29291 + }, + { + "epoch": 1.6402732668831896, + "grad_norm": 1.3023624420166016, + "learning_rate": 9.75584210526316e-05, + "loss": 0.5124, + "step": 29292 + }, + { + "epoch": 1.6403292641953187, + "grad_norm": 1.170420527458191, + "learning_rate": 9.755815789473684e-05, + "loss": 0.4215, + "step": 29293 + }, + { + "epoch": 1.6403852615074477, + "grad_norm": 1.242762804031372, + "learning_rate": 9.755789473684211e-05, + "loss": 0.4159, + "step": 29294 + }, + { + "epoch": 1.6404412588195767, + "grad_norm": 1.4003139734268188, + "learning_rate": 9.755763157894737e-05, + "loss": 0.4027, + "step": 29295 + }, + { + "epoch": 1.6404972561317057, + "grad_norm": 1.252991795539856, + 
"learning_rate": 9.755736842105265e-05, + "loss": 0.4914, + "step": 29296 + }, + { + "epoch": 1.6405532534438347, + "grad_norm": 1.2703192234039307, + "learning_rate": 9.75571052631579e-05, + "loss": 0.4134, + "step": 29297 + }, + { + "epoch": 1.6406092507559638, + "grad_norm": 1.4573822021484375, + "learning_rate": 9.755684210526315e-05, + "loss": 0.4338, + "step": 29298 + }, + { + "epoch": 1.6406652480680928, + "grad_norm": 1.3850125074386597, + "learning_rate": 9.755657894736842e-05, + "loss": 0.4316, + "step": 29299 + }, + { + "epoch": 1.6407212453802218, + "grad_norm": 1.1426774263381958, + "learning_rate": 9.755631578947368e-05, + "loss": 0.4613, + "step": 29300 + }, + { + "epoch": 1.6407772426923508, + "grad_norm": 1.572794795036316, + "learning_rate": 9.755605263157896e-05, + "loss": 0.5873, + "step": 29301 + }, + { + "epoch": 1.6408332400044798, + "grad_norm": 1.3179900646209717, + "learning_rate": 9.755578947368422e-05, + "loss": 0.4128, + "step": 29302 + }, + { + "epoch": 1.6408892373166089, + "grad_norm": 1.2013174295425415, + "learning_rate": 9.755552631578948e-05, + "loss": 0.4532, + "step": 29303 + }, + { + "epoch": 1.6409452346287379, + "grad_norm": 1.488812804222107, + "learning_rate": 9.755526315789474e-05, + "loss": 0.502, + "step": 29304 + }, + { + "epoch": 1.641001231940867, + "grad_norm": 1.9621952772140503, + "learning_rate": 9.755500000000001e-05, + "loss": 0.434, + "step": 29305 + }, + { + "epoch": 1.641057229252996, + "grad_norm": 1.46525239944458, + "learning_rate": 9.755473684210527e-05, + "loss": 0.5215, + "step": 29306 + }, + { + "epoch": 1.641113226565125, + "grad_norm": 1.2031933069229126, + "learning_rate": 9.755447368421053e-05, + "loss": 0.373, + "step": 29307 + }, + { + "epoch": 1.641169223877254, + "grad_norm": 1.5745117664337158, + "learning_rate": 9.755421052631579e-05, + "loss": 0.6136, + "step": 29308 + }, + { + "epoch": 1.6412252211893827, + "grad_norm": 1.5272430181503296, + "learning_rate": 9.755394736842106e-05, + 
"loss": 0.5901, + "step": 29309 + }, + { + "epoch": 1.6412812185015118, + "grad_norm": 1.1108843088150024, + "learning_rate": 9.755368421052632e-05, + "loss": 0.3861, + "step": 29310 + }, + { + "epoch": 1.6413372158136408, + "grad_norm": 1.524139404296875, + "learning_rate": 9.755342105263158e-05, + "loss": 0.5559, + "step": 29311 + }, + { + "epoch": 1.6413932131257698, + "grad_norm": 1.4420135021209717, + "learning_rate": 9.755315789473684e-05, + "loss": 0.4656, + "step": 29312 + }, + { + "epoch": 1.6414492104378988, + "grad_norm": 1.3306641578674316, + "learning_rate": 9.755289473684212e-05, + "loss": 0.4025, + "step": 29313 + }, + { + "epoch": 1.6415052077500278, + "grad_norm": 1.4485255479812622, + "learning_rate": 9.755263157894738e-05, + "loss": 0.4637, + "step": 29314 + }, + { + "epoch": 1.6415612050621569, + "grad_norm": 1.257196307182312, + "learning_rate": 9.755236842105263e-05, + "loss": 0.4802, + "step": 29315 + }, + { + "epoch": 1.6416172023742859, + "grad_norm": 1.5029897689819336, + "learning_rate": 9.75521052631579e-05, + "loss": 0.4612, + "step": 29316 + }, + { + "epoch": 1.641673199686415, + "grad_norm": 1.9011369943618774, + "learning_rate": 9.755184210526315e-05, + "loss": 0.6143, + "step": 29317 + }, + { + "epoch": 1.641729196998544, + "grad_norm": 1.32305908203125, + "learning_rate": 9.755157894736843e-05, + "loss": 0.4435, + "step": 29318 + }, + { + "epoch": 1.641785194310673, + "grad_norm": 0.9825717806816101, + "learning_rate": 9.755131578947369e-05, + "loss": 0.3063, + "step": 29319 + }, + { + "epoch": 1.641841191622802, + "grad_norm": 1.2988083362579346, + "learning_rate": 9.755105263157896e-05, + "loss": 0.3966, + "step": 29320 + }, + { + "epoch": 1.641897188934931, + "grad_norm": 1.243069052696228, + "learning_rate": 9.75507894736842e-05, + "loss": 0.4338, + "step": 29321 + }, + { + "epoch": 1.64195318624706, + "grad_norm": 1.6195769309997559, + "learning_rate": 9.755052631578948e-05, + "loss": 0.4953, + "step": 29322 + }, + { + 
"epoch": 1.642009183559189, + "grad_norm": 1.1538461446762085, + "learning_rate": 9.755026315789474e-05, + "loss": 0.4261, + "step": 29323 + }, + { + "epoch": 1.642065180871318, + "grad_norm": 1.189353346824646, + "learning_rate": 9.755000000000001e-05, + "loss": 0.3824, + "step": 29324 + }, + { + "epoch": 1.642121178183447, + "grad_norm": 1.3259837627410889, + "learning_rate": 9.754973684210527e-05, + "loss": 0.4563, + "step": 29325 + }, + { + "epoch": 1.642177175495576, + "grad_norm": 1.227063775062561, + "learning_rate": 9.754947368421053e-05, + "loss": 0.3955, + "step": 29326 + }, + { + "epoch": 1.642233172807705, + "grad_norm": 1.155016303062439, + "learning_rate": 9.754921052631579e-05, + "loss": 0.3072, + "step": 29327 + }, + { + "epoch": 1.642289170119834, + "grad_norm": 1.4582222700119019, + "learning_rate": 9.754894736842107e-05, + "loss": 0.5228, + "step": 29328 + }, + { + "epoch": 1.6423451674319631, + "grad_norm": 1.249298095703125, + "learning_rate": 9.754868421052633e-05, + "loss": 0.3776, + "step": 29329 + }, + { + "epoch": 1.6424011647440921, + "grad_norm": 1.5514057874679565, + "learning_rate": 9.754842105263158e-05, + "loss": 0.4551, + "step": 29330 + }, + { + "epoch": 1.6424571620562212, + "grad_norm": 1.3318390846252441, + "learning_rate": 9.754815789473684e-05, + "loss": 0.5298, + "step": 29331 + }, + { + "epoch": 1.6425131593683502, + "grad_norm": 1.2294528484344482, + "learning_rate": 9.75478947368421e-05, + "loss": 0.4337, + "step": 29332 + }, + { + "epoch": 1.6425691566804792, + "grad_norm": 1.2888906002044678, + "learning_rate": 9.754763157894738e-05, + "loss": 0.4862, + "step": 29333 + }, + { + "epoch": 1.6426251539926082, + "grad_norm": 1.1327520608901978, + "learning_rate": 9.754736842105264e-05, + "loss": 0.4729, + "step": 29334 + }, + { + "epoch": 1.6426811513047372, + "grad_norm": 1.1546499729156494, + "learning_rate": 9.75471052631579e-05, + "loss": 0.4146, + "step": 29335 + }, + { + "epoch": 1.6427371486168663, + "grad_norm": 
1.6673691272735596, + "learning_rate": 9.754684210526316e-05, + "loss": 0.6231, + "step": 29336 + }, + { + "epoch": 1.6427931459289953, + "grad_norm": 1.2196277379989624, + "learning_rate": 9.754657894736843e-05, + "loss": 0.4973, + "step": 29337 + }, + { + "epoch": 1.6428491432411243, + "grad_norm": 1.2602683305740356, + "learning_rate": 9.754631578947369e-05, + "loss": 0.5665, + "step": 29338 + }, + { + "epoch": 1.6429051405532533, + "grad_norm": 1.1988778114318848, + "learning_rate": 9.754605263157895e-05, + "loss": 0.4264, + "step": 29339 + }, + { + "epoch": 1.6429611378653823, + "grad_norm": 1.1710362434387207, + "learning_rate": 9.754578947368421e-05, + "loss": 0.3061, + "step": 29340 + }, + { + "epoch": 1.6430171351775114, + "grad_norm": 1.3546621799468994, + "learning_rate": 9.754552631578948e-05, + "loss": 0.6396, + "step": 29341 + }, + { + "epoch": 1.6430731324896404, + "grad_norm": 1.3007144927978516, + "learning_rate": 9.754526315789474e-05, + "loss": 0.435, + "step": 29342 + }, + { + "epoch": 1.6431291298017694, + "grad_norm": 1.340864896774292, + "learning_rate": 9.7545e-05, + "loss": 0.4531, + "step": 29343 + }, + { + "epoch": 1.6431851271138984, + "grad_norm": 1.3586357831954956, + "learning_rate": 9.754473684210526e-05, + "loss": 0.4163, + "step": 29344 + }, + { + "epoch": 1.6432411244260274, + "grad_norm": 1.363587498664856, + "learning_rate": 9.754447368421054e-05, + "loss": 0.4542, + "step": 29345 + }, + { + "epoch": 1.6432971217381565, + "grad_norm": 1.5507924556732178, + "learning_rate": 9.75442105263158e-05, + "loss": 0.4726, + "step": 29346 + }, + { + "epoch": 1.6433531190502855, + "grad_norm": 1.2276865243911743, + "learning_rate": 9.754394736842107e-05, + "loss": 0.5614, + "step": 29347 + }, + { + "epoch": 1.6434091163624145, + "grad_norm": 1.294532060623169, + "learning_rate": 9.754368421052631e-05, + "loss": 0.3719, + "step": 29348 + }, + { + "epoch": 1.6434651136745435, + "grad_norm": 1.558737874031067, + "learning_rate": 
9.754342105263157e-05, + "loss": 0.5353, + "step": 29349 + }, + { + "epoch": 1.6435211109866725, + "grad_norm": 1.4003148078918457, + "learning_rate": 9.754315789473685e-05, + "loss": 0.5072, + "step": 29350 + }, + { + "epoch": 1.6435771082988015, + "grad_norm": 1.0688210725784302, + "learning_rate": 9.754289473684211e-05, + "loss": 0.3453, + "step": 29351 + }, + { + "epoch": 1.6436331056109306, + "grad_norm": 1.2977280616760254, + "learning_rate": 9.754263157894738e-05, + "loss": 0.4658, + "step": 29352 + }, + { + "epoch": 1.6436891029230596, + "grad_norm": 1.1841076612472534, + "learning_rate": 9.754236842105263e-05, + "loss": 0.4808, + "step": 29353 + }, + { + "epoch": 1.6437451002351886, + "grad_norm": 1.3413748741149902, + "learning_rate": 9.75421052631579e-05, + "loss": 0.5303, + "step": 29354 + }, + { + "epoch": 1.6438010975473176, + "grad_norm": 1.2622005939483643, + "learning_rate": 9.754184210526316e-05, + "loss": 0.4256, + "step": 29355 + }, + { + "epoch": 1.6438570948594466, + "grad_norm": 1.2882167100906372, + "learning_rate": 9.754157894736843e-05, + "loss": 0.4495, + "step": 29356 + }, + { + "epoch": 1.6439130921715757, + "grad_norm": 1.2747324705123901, + "learning_rate": 9.754131578947369e-05, + "loss": 0.4351, + "step": 29357 + }, + { + "epoch": 1.6439690894837047, + "grad_norm": 1.3000568151474, + "learning_rate": 9.754105263157895e-05, + "loss": 0.6083, + "step": 29358 + }, + { + "epoch": 1.6440250867958337, + "grad_norm": 1.176451325416565, + "learning_rate": 9.754078947368421e-05, + "loss": 0.463, + "step": 29359 + }, + { + "epoch": 1.6440810841079627, + "grad_norm": 1.1218034029006958, + "learning_rate": 9.754052631578949e-05, + "loss": 0.3896, + "step": 29360 + }, + { + "epoch": 1.6441370814200917, + "grad_norm": 1.5688334703445435, + "learning_rate": 9.754026315789474e-05, + "loss": 0.4835, + "step": 29361 + }, + { + "epoch": 1.6441930787322208, + "grad_norm": 1.240452527999878, + "learning_rate": 9.754e-05, + "loss": 0.359, + "step": 29362 
+ }, + { + "epoch": 1.6442490760443498, + "grad_norm": 1.2389189004898071, + "learning_rate": 9.753973684210526e-05, + "loss": 0.6237, + "step": 29363 + }, + { + "epoch": 1.6443050733564788, + "grad_norm": 1.3271735906600952, + "learning_rate": 9.753947368421054e-05, + "loss": 0.4437, + "step": 29364 + }, + { + "epoch": 1.6443610706686078, + "grad_norm": 1.3391735553741455, + "learning_rate": 9.75392105263158e-05, + "loss": 0.433, + "step": 29365 + }, + { + "epoch": 1.6444170679807368, + "grad_norm": 1.1607493162155151, + "learning_rate": 9.753894736842106e-05, + "loss": 0.3328, + "step": 29366 + }, + { + "epoch": 1.6444730652928659, + "grad_norm": 1.331188678741455, + "learning_rate": 9.753868421052632e-05, + "loss": 0.5072, + "step": 29367 + }, + { + "epoch": 1.6445290626049949, + "grad_norm": 1.3958109617233276, + "learning_rate": 9.753842105263158e-05, + "loss": 0.4293, + "step": 29368 + }, + { + "epoch": 1.644585059917124, + "grad_norm": 1.2311660051345825, + "learning_rate": 9.753815789473685e-05, + "loss": 0.4149, + "step": 29369 + }, + { + "epoch": 1.644641057229253, + "grad_norm": 1.254151463508606, + "learning_rate": 9.753789473684211e-05, + "loss": 0.3842, + "step": 29370 + }, + { + "epoch": 1.644697054541382, + "grad_norm": 1.3861286640167236, + "learning_rate": 9.753763157894737e-05, + "loss": 0.435, + "step": 29371 + }, + { + "epoch": 1.644753051853511, + "grad_norm": 1.5580637454986572, + "learning_rate": 9.753736842105263e-05, + "loss": 0.5089, + "step": 29372 + }, + { + "epoch": 1.64480904916564, + "grad_norm": 1.239936113357544, + "learning_rate": 9.75371052631579e-05, + "loss": 0.3747, + "step": 29373 + }, + { + "epoch": 1.644865046477769, + "grad_norm": 1.2191531658172607, + "learning_rate": 9.753684210526316e-05, + "loss": 0.408, + "step": 29374 + }, + { + "epoch": 1.644921043789898, + "grad_norm": 1.3743925094604492, + "learning_rate": 9.753657894736844e-05, + "loss": 0.4911, + "step": 29375 + }, + { + "epoch": 1.644977041102027, + 
"grad_norm": 1.1639271974563599, + "learning_rate": 9.753631578947368e-05, + "loss": 0.3753, + "step": 29376 + }, + { + "epoch": 1.645033038414156, + "grad_norm": 1.3580440282821655, + "learning_rate": 9.753605263157895e-05, + "loss": 0.4591, + "step": 29377 + }, + { + "epoch": 1.645089035726285, + "grad_norm": 1.3186320066452026, + "learning_rate": 9.753578947368421e-05, + "loss": 0.5989, + "step": 29378 + }, + { + "epoch": 1.645145033038414, + "grad_norm": 1.604811191558838, + "learning_rate": 9.753552631578949e-05, + "loss": 0.6839, + "step": 29379 + }, + { + "epoch": 1.645201030350543, + "grad_norm": 9.71956729888916, + "learning_rate": 9.753526315789473e-05, + "loss": 0.5981, + "step": 29380 + }, + { + "epoch": 1.6452570276626721, + "grad_norm": 1.2815930843353271, + "learning_rate": 9.753500000000001e-05, + "loss": 0.3662, + "step": 29381 + }, + { + "epoch": 1.6453130249748011, + "grad_norm": 1.2173564434051514, + "learning_rate": 9.753473684210527e-05, + "loss": 0.3752, + "step": 29382 + }, + { + "epoch": 1.6453690222869302, + "grad_norm": 1.4138680696487427, + "learning_rate": 9.753447368421053e-05, + "loss": 0.3849, + "step": 29383 + }, + { + "epoch": 1.6454250195990592, + "grad_norm": 1.145095944404602, + "learning_rate": 9.75342105263158e-05, + "loss": 0.3243, + "step": 29384 + }, + { + "epoch": 1.6454810169111882, + "grad_norm": 1.850839614868164, + "learning_rate": 9.753394736842105e-05, + "loss": 0.5586, + "step": 29385 + }, + { + "epoch": 1.6455370142233172, + "grad_norm": 1.1812289953231812, + "learning_rate": 9.753368421052632e-05, + "loss": 0.3993, + "step": 29386 + }, + { + "epoch": 1.6455930115354462, + "grad_norm": 1.272046446800232, + "learning_rate": 9.753342105263158e-05, + "loss": 0.4131, + "step": 29387 + }, + { + "epoch": 1.6456490088475753, + "grad_norm": 1.8550727367401123, + "learning_rate": 9.753315789473685e-05, + "loss": 0.5772, + "step": 29388 + }, + { + "epoch": 1.6457050061597043, + "grad_norm": 1.3328020572662354, + 
"learning_rate": 9.753289473684211e-05, + "loss": 0.5455, + "step": 29389 + }, + { + "epoch": 1.6457610034718333, + "grad_norm": 1.267332673072815, + "learning_rate": 9.753263157894737e-05, + "loss": 0.4384, + "step": 29390 + }, + { + "epoch": 1.6458170007839623, + "grad_norm": 1.1775612831115723, + "learning_rate": 9.753236842105263e-05, + "loss": 0.4278, + "step": 29391 + }, + { + "epoch": 1.6458729980960913, + "grad_norm": 1.369011402130127, + "learning_rate": 9.75321052631579e-05, + "loss": 0.4961, + "step": 29392 + }, + { + "epoch": 1.6459289954082204, + "grad_norm": 1.5223098993301392, + "learning_rate": 9.753184210526316e-05, + "loss": 0.5209, + "step": 29393 + }, + { + "epoch": 1.6459849927203494, + "grad_norm": 1.4535424709320068, + "learning_rate": 9.753157894736842e-05, + "loss": 0.5633, + "step": 29394 + }, + { + "epoch": 1.6460409900324784, + "grad_norm": 1.1435127258300781, + "learning_rate": 9.753131578947368e-05, + "loss": 0.4205, + "step": 29395 + }, + { + "epoch": 1.6460969873446074, + "grad_norm": 1.2056618928909302, + "learning_rate": 9.753105263157896e-05, + "loss": 0.3655, + "step": 29396 + }, + { + "epoch": 1.6461529846567364, + "grad_norm": 1.3217023611068726, + "learning_rate": 9.753078947368422e-05, + "loss": 0.3516, + "step": 29397 + }, + { + "epoch": 1.6462089819688654, + "grad_norm": 1.221899151802063, + "learning_rate": 9.753052631578948e-05, + "loss": 0.4635, + "step": 29398 + }, + { + "epoch": 1.6462649792809945, + "grad_norm": 0.9793908596038818, + "learning_rate": 9.753026315789474e-05, + "loss": 0.3526, + "step": 29399 + }, + { + "epoch": 1.6463209765931235, + "grad_norm": 1.0542079210281372, + "learning_rate": 9.753e-05, + "loss": 0.3677, + "step": 29400 + }, + { + "epoch": 1.6463769739052525, + "grad_norm": 1.3768837451934814, + "learning_rate": 9.752973684210527e-05, + "loss": 0.4844, + "step": 29401 + }, + { + "epoch": 1.6464329712173815, + "grad_norm": 1.3428592681884766, + "learning_rate": 9.752947368421053e-05, + "loss": 
0.661, + "step": 29402 + }, + { + "epoch": 1.6464889685295105, + "grad_norm": 1.7047210931777954, + "learning_rate": 9.752921052631579e-05, + "loss": 0.5118, + "step": 29403 + }, + { + "epoch": 1.6465449658416396, + "grad_norm": 1.5451815128326416, + "learning_rate": 9.752894736842105e-05, + "loss": 0.4501, + "step": 29404 + }, + { + "epoch": 1.6466009631537686, + "grad_norm": 1.2636375427246094, + "learning_rate": 9.752868421052632e-05, + "loss": 0.4475, + "step": 29405 + }, + { + "epoch": 1.6466569604658976, + "grad_norm": 1.2503951787948608, + "learning_rate": 9.752842105263158e-05, + "loss": 0.3915, + "step": 29406 + }, + { + "epoch": 1.6467129577780266, + "grad_norm": 1.2569408416748047, + "learning_rate": 9.752815789473686e-05, + "loss": 0.4041, + "step": 29407 + }, + { + "epoch": 1.6467689550901556, + "grad_norm": 1.645736575126648, + "learning_rate": 9.75278947368421e-05, + "loss": 0.4398, + "step": 29408 + }, + { + "epoch": 1.6468249524022847, + "grad_norm": 1.7948026657104492, + "learning_rate": 9.752763157894737e-05, + "loss": 0.5674, + "step": 29409 + }, + { + "epoch": 1.6468809497144137, + "grad_norm": 1.4449021816253662, + "learning_rate": 9.752736842105263e-05, + "loss": 0.5032, + "step": 29410 + }, + { + "epoch": 1.6469369470265427, + "grad_norm": 1.430108904838562, + "learning_rate": 9.752710526315791e-05, + "loss": 0.4738, + "step": 29411 + }, + { + "epoch": 1.6469929443386717, + "grad_norm": 1.1391552686691284, + "learning_rate": 9.752684210526317e-05, + "loss": 0.4393, + "step": 29412 + }, + { + "epoch": 1.6470489416508007, + "grad_norm": 1.2336457967758179, + "learning_rate": 9.752657894736843e-05, + "loss": 0.4446, + "step": 29413 + }, + { + "epoch": 1.6471049389629298, + "grad_norm": 1.372226357460022, + "learning_rate": 9.752631578947369e-05, + "loss": 0.406, + "step": 29414 + }, + { + "epoch": 1.6471609362750588, + "grad_norm": 1.3093339204788208, + "learning_rate": 9.752605263157896e-05, + "loss": 0.5155, + "step": 29415 + }, + { + 
"epoch": 1.6472169335871878, + "grad_norm": 1.4605350494384766, + "learning_rate": 9.752578947368422e-05, + "loss": 0.4429, + "step": 29416 + }, + { + "epoch": 1.6472729308993168, + "grad_norm": 1.4476892948150635, + "learning_rate": 9.752552631578948e-05, + "loss": 0.415, + "step": 29417 + }, + { + "epoch": 1.6473289282114458, + "grad_norm": 1.3584709167480469, + "learning_rate": 9.752526315789474e-05, + "loss": 0.5558, + "step": 29418 + }, + { + "epoch": 1.6473849255235749, + "grad_norm": 1.3899284601211548, + "learning_rate": 9.7525e-05, + "loss": 0.5136, + "step": 29419 + }, + { + "epoch": 1.6474409228357039, + "grad_norm": 1.0593681335449219, + "learning_rate": 9.752473684210527e-05, + "loss": 0.4007, + "step": 29420 + }, + { + "epoch": 1.6474969201478329, + "grad_norm": 1.3471382856369019, + "learning_rate": 9.752447368421053e-05, + "loss": 0.4306, + "step": 29421 + }, + { + "epoch": 1.647552917459962, + "grad_norm": 1.8453447818756104, + "learning_rate": 9.752421052631579e-05, + "loss": 0.569, + "step": 29422 + }, + { + "epoch": 1.647608914772091, + "grad_norm": 1.2868098020553589, + "learning_rate": 9.752394736842105e-05, + "loss": 0.4398, + "step": 29423 + }, + { + "epoch": 1.64766491208422, + "grad_norm": 1.1600334644317627, + "learning_rate": 9.752368421052632e-05, + "loss": 0.45, + "step": 29424 + }, + { + "epoch": 1.647720909396349, + "grad_norm": 1.535599946975708, + "learning_rate": 9.752342105263158e-05, + "loss": 0.5377, + "step": 29425 + }, + { + "epoch": 1.647776906708478, + "grad_norm": 1.270965814590454, + "learning_rate": 9.752315789473684e-05, + "loss": 0.4662, + "step": 29426 + }, + { + "epoch": 1.647832904020607, + "grad_norm": 1.2857426404953003, + "learning_rate": 9.75228947368421e-05, + "loss": 0.5464, + "step": 29427 + }, + { + "epoch": 1.647888901332736, + "grad_norm": 1.3699164390563965, + "learning_rate": 9.752263157894738e-05, + "loss": 0.3875, + "step": 29428 + }, + { + "epoch": 1.647944898644865, + "grad_norm": 1.0707435607910156, 
+ "learning_rate": 9.752236842105264e-05, + "loss": 0.4614, + "step": 29429 + }, + { + "epoch": 1.648000895956994, + "grad_norm": 1.468240737915039, + "learning_rate": 9.752210526315791e-05, + "loss": 0.4208, + "step": 29430 + }, + { + "epoch": 1.648056893269123, + "grad_norm": 1.38775634765625, + "learning_rate": 9.752184210526316e-05, + "loss": 0.5273, + "step": 29431 + }, + { + "epoch": 1.648112890581252, + "grad_norm": 1.7929494380950928, + "learning_rate": 9.752157894736843e-05, + "loss": 0.4566, + "step": 29432 + }, + { + "epoch": 1.6481688878933811, + "grad_norm": 1.2799029350280762, + "learning_rate": 9.752131578947369e-05, + "loss": 0.5098, + "step": 29433 + }, + { + "epoch": 1.6482248852055101, + "grad_norm": 1.3467371463775635, + "learning_rate": 9.752105263157896e-05, + "loss": 0.4895, + "step": 29434 + }, + { + "epoch": 1.6482808825176392, + "grad_norm": 1.270985722541809, + "learning_rate": 9.752078947368421e-05, + "loss": 0.6181, + "step": 29435 + }, + { + "epoch": 1.6483368798297682, + "grad_norm": 1.092190146446228, + "learning_rate": 9.752052631578947e-05, + "loss": 0.387, + "step": 29436 + }, + { + "epoch": 1.6483928771418972, + "grad_norm": 1.0694005489349365, + "learning_rate": 9.752026315789474e-05, + "loss": 0.4356, + "step": 29437 + }, + { + "epoch": 1.6484488744540262, + "grad_norm": 1.276921272277832, + "learning_rate": 9.752e-05, + "loss": 0.4261, + "step": 29438 + }, + { + "epoch": 1.6485048717661552, + "grad_norm": 1.4633861780166626, + "learning_rate": 9.751973684210527e-05, + "loss": 0.5498, + "step": 29439 + }, + { + "epoch": 1.6485608690782843, + "grad_norm": 1.5083926916122437, + "learning_rate": 9.751947368421052e-05, + "loss": 0.4237, + "step": 29440 + }, + { + "epoch": 1.6486168663904133, + "grad_norm": 1.3734607696533203, + "learning_rate": 9.75192105263158e-05, + "loss": 0.3969, + "step": 29441 + }, + { + "epoch": 1.6486728637025423, + "grad_norm": 1.3441766500473022, + "learning_rate": 9.751894736842105e-05, + "loss": 0.3779, 
+ "step": 29442 + }, + { + "epoch": 1.6487288610146713, + "grad_norm": 1.093099594116211, + "learning_rate": 9.751868421052633e-05, + "loss": 0.3103, + "step": 29443 + }, + { + "epoch": 1.6487848583268003, + "grad_norm": 1.3892213106155396, + "learning_rate": 9.751842105263159e-05, + "loss": 0.4943, + "step": 29444 + }, + { + "epoch": 1.6488408556389293, + "grad_norm": 1.4378942251205444, + "learning_rate": 9.751815789473685e-05, + "loss": 0.4575, + "step": 29445 + }, + { + "epoch": 1.6488968529510584, + "grad_norm": 1.298954725265503, + "learning_rate": 9.75178947368421e-05, + "loss": 0.4933, + "step": 29446 + }, + { + "epoch": 1.6489528502631874, + "grad_norm": 1.314711332321167, + "learning_rate": 9.751763157894738e-05, + "loss": 0.4495, + "step": 29447 + }, + { + "epoch": 1.6490088475753164, + "grad_norm": 1.2707839012145996, + "learning_rate": 9.751736842105264e-05, + "loss": 0.4522, + "step": 29448 + }, + { + "epoch": 1.6490648448874454, + "grad_norm": 1.2846564054489136, + "learning_rate": 9.75171052631579e-05, + "loss": 0.3352, + "step": 29449 + }, + { + "epoch": 1.6491208421995744, + "grad_norm": 1.248152494430542, + "learning_rate": 9.751684210526316e-05, + "loss": 0.4574, + "step": 29450 + }, + { + "epoch": 1.6491768395117035, + "grad_norm": 1.2577346563339233, + "learning_rate": 9.751657894736843e-05, + "loss": 0.4628, + "step": 29451 + }, + { + "epoch": 1.6492328368238325, + "grad_norm": 1.3272029161453247, + "learning_rate": 9.751631578947369e-05, + "loss": 0.4397, + "step": 29452 + }, + { + "epoch": 1.6492888341359615, + "grad_norm": 1.3512473106384277, + "learning_rate": 9.751605263157895e-05, + "loss": 0.3505, + "step": 29453 + }, + { + "epoch": 1.6493448314480905, + "grad_norm": 1.1984714269638062, + "learning_rate": 9.751578947368421e-05, + "loss": 0.6082, + "step": 29454 + }, + { + "epoch": 1.6494008287602195, + "grad_norm": 1.3861230611801147, + "learning_rate": 9.751552631578947e-05, + "loss": 0.4913, + "step": 29455 + }, + { + "epoch": 
1.6494568260723486, + "grad_norm": 1.2818530797958374, + "learning_rate": 9.751526315789474e-05, + "loss": 0.443, + "step": 29456 + }, + { + "epoch": 1.6495128233844776, + "grad_norm": 1.317275881767273, + "learning_rate": 9.7515e-05, + "loss": 0.3658, + "step": 29457 + }, + { + "epoch": 1.6495688206966066, + "grad_norm": 1.3239867687225342, + "learning_rate": 9.751473684210526e-05, + "loss": 0.4324, + "step": 29458 + }, + { + "epoch": 1.6496248180087356, + "grad_norm": 1.4137225151062012, + "learning_rate": 9.751447368421052e-05, + "loss": 0.4373, + "step": 29459 + }, + { + "epoch": 1.6496808153208646, + "grad_norm": 1.5117820501327515, + "learning_rate": 9.75142105263158e-05, + "loss": 0.5513, + "step": 29460 + }, + { + "epoch": 1.6497368126329937, + "grad_norm": 1.5019891262054443, + "learning_rate": 9.751394736842106e-05, + "loss": 0.5148, + "step": 29461 + }, + { + "epoch": 1.6497928099451227, + "grad_norm": 1.271453857421875, + "learning_rate": 9.751368421052633e-05, + "loss": 0.4867, + "step": 29462 + }, + { + "epoch": 1.6498488072572517, + "grad_norm": 1.3284165859222412, + "learning_rate": 9.751342105263158e-05, + "loss": 0.4884, + "step": 29463 + }, + { + "epoch": 1.6499048045693807, + "grad_norm": 1.3907990455627441, + "learning_rate": 9.751315789473685e-05, + "loss": 0.471, + "step": 29464 + }, + { + "epoch": 1.6499608018815097, + "grad_norm": 1.2383745908737183, + "learning_rate": 9.751289473684211e-05, + "loss": 0.318, + "step": 29465 + }, + { + "epoch": 1.6500167991936388, + "grad_norm": 1.1266353130340576, + "learning_rate": 9.751263157894738e-05, + "loss": 0.3896, + "step": 29466 + }, + { + "epoch": 1.6500727965057678, + "grad_norm": 1.3946961164474487, + "learning_rate": 9.751236842105264e-05, + "loss": 0.4474, + "step": 29467 + }, + { + "epoch": 1.6501287938178968, + "grad_norm": 1.186700463294983, + "learning_rate": 9.75121052631579e-05, + "loss": 0.319, + "step": 29468 + }, + { + "epoch": 1.6501847911300258, + "grad_norm": 1.1210553646087646, + 
"learning_rate": 9.751184210526316e-05, + "loss": 0.3125, + "step": 29469 + }, + { + "epoch": 1.6502407884421548, + "grad_norm": 1.0811258554458618, + "learning_rate": 9.751157894736842e-05, + "loss": 0.3655, + "step": 29470 + }, + { + "epoch": 1.6502967857542838, + "grad_norm": 1.7336432933807373, + "learning_rate": 9.75113157894737e-05, + "loss": 0.4085, + "step": 29471 + }, + { + "epoch": 1.6503527830664129, + "grad_norm": 1.5337207317352295, + "learning_rate": 9.751105263157895e-05, + "loss": 0.4996, + "step": 29472 + }, + { + "epoch": 1.6504087803785419, + "grad_norm": 1.4256982803344727, + "learning_rate": 9.751078947368421e-05, + "loss": 0.3553, + "step": 29473 + }, + { + "epoch": 1.650464777690671, + "grad_norm": 1.38419508934021, + "learning_rate": 9.751052631578947e-05, + "loss": 0.4517, + "step": 29474 + }, + { + "epoch": 1.6505207750028, + "grad_norm": 1.5306450128555298, + "learning_rate": 9.751026315789475e-05, + "loss": 0.4817, + "step": 29475 + }, + { + "epoch": 1.650576772314929, + "grad_norm": 1.411915898323059, + "learning_rate": 9.751e-05, + "loss": 0.4983, + "step": 29476 + }, + { + "epoch": 1.650632769627058, + "grad_norm": 1.1324421167373657, + "learning_rate": 9.750973684210527e-05, + "loss": 0.3834, + "step": 29477 + }, + { + "epoch": 1.650688766939187, + "grad_norm": 1.4064148664474487, + "learning_rate": 9.750947368421053e-05, + "loss": 0.5747, + "step": 29478 + }, + { + "epoch": 1.650744764251316, + "grad_norm": 1.3565051555633545, + "learning_rate": 9.75092105263158e-05, + "loss": 0.4658, + "step": 29479 + }, + { + "epoch": 1.650800761563445, + "grad_norm": 1.220345377922058, + "learning_rate": 9.750894736842106e-05, + "loss": 0.4655, + "step": 29480 + }, + { + "epoch": 1.650856758875574, + "grad_norm": 1.3398367166519165, + "learning_rate": 9.750868421052632e-05, + "loss": 0.5441, + "step": 29481 + }, + { + "epoch": 1.650912756187703, + "grad_norm": 3.0499606132507324, + "learning_rate": 9.750842105263158e-05, + "loss": 0.4552, + 
"step": 29482 + }, + { + "epoch": 1.650968753499832, + "grad_norm": 1.3581091165542603, + "learning_rate": 9.750815789473685e-05, + "loss": 0.5208, + "step": 29483 + }, + { + "epoch": 1.651024750811961, + "grad_norm": 1.7729673385620117, + "learning_rate": 9.750789473684211e-05, + "loss": 0.6271, + "step": 29484 + }, + { + "epoch": 1.6510807481240901, + "grad_norm": 1.3497742414474487, + "learning_rate": 9.750763157894738e-05, + "loss": 0.4637, + "step": 29485 + }, + { + "epoch": 1.6511367454362191, + "grad_norm": 1.1094225645065308, + "learning_rate": 9.750736842105263e-05, + "loss": 0.5067, + "step": 29486 + }, + { + "epoch": 1.6511927427483482, + "grad_norm": 1.2953336238861084, + "learning_rate": 9.750710526315789e-05, + "loss": 0.4458, + "step": 29487 + }, + { + "epoch": 1.6512487400604772, + "grad_norm": 1.1549729108810425, + "learning_rate": 9.750684210526316e-05, + "loss": 0.437, + "step": 29488 + }, + { + "epoch": 1.6513047373726062, + "grad_norm": 1.3755946159362793, + "learning_rate": 9.750657894736842e-05, + "loss": 0.4402, + "step": 29489 + }, + { + "epoch": 1.6513607346847352, + "grad_norm": 1.4747800827026367, + "learning_rate": 9.750631578947368e-05, + "loss": 0.4794, + "step": 29490 + }, + { + "epoch": 1.6514167319968642, + "grad_norm": 1.548931360244751, + "learning_rate": 9.750605263157894e-05, + "loss": 0.5268, + "step": 29491 + }, + { + "epoch": 1.6514727293089932, + "grad_norm": 1.3267422914505005, + "learning_rate": 9.750578947368422e-05, + "loss": 0.4324, + "step": 29492 + }, + { + "epoch": 1.6515287266211223, + "grad_norm": 1.1592317819595337, + "learning_rate": 9.750552631578948e-05, + "loss": 0.4064, + "step": 29493 + }, + { + "epoch": 1.6515847239332513, + "grad_norm": 1.4331164360046387, + "learning_rate": 9.750526315789475e-05, + "loss": 0.6338, + "step": 29494 + }, + { + "epoch": 1.6516407212453803, + "grad_norm": 1.419296383857727, + "learning_rate": 9.7505e-05, + "loss": 0.5461, + "step": 29495 + }, + { + "epoch": 
1.6516967185575093, + "grad_norm": 1.5112463235855103, + "learning_rate": 9.750473684210527e-05, + "loss": 0.401, + "step": 29496 + }, + { + "epoch": 1.6517527158696383, + "grad_norm": 1.4628111124038696, + "learning_rate": 9.750447368421053e-05, + "loss": 0.5966, + "step": 29497 + }, + { + "epoch": 1.6518087131817674, + "grad_norm": 1.2177109718322754, + "learning_rate": 9.75042105263158e-05, + "loss": 0.3109, + "step": 29498 + }, + { + "epoch": 1.6518647104938964, + "grad_norm": 1.3104015588760376, + "learning_rate": 9.750394736842106e-05, + "loss": 0.4858, + "step": 29499 + }, + { + "epoch": 1.6519207078060254, + "grad_norm": 1.5395573377609253, + "learning_rate": 9.750368421052632e-05, + "loss": 0.437, + "step": 29500 + }, + { + "epoch": 1.6519767051181544, + "grad_norm": 2.065742254257202, + "learning_rate": 9.750342105263158e-05, + "loss": 0.5739, + "step": 29501 + }, + { + "epoch": 1.6520327024302834, + "grad_norm": 1.1836086511611938, + "learning_rate": 9.750315789473685e-05, + "loss": 0.4297, + "step": 29502 + }, + { + "epoch": 1.6520886997424125, + "grad_norm": 1.1465251445770264, + "learning_rate": 9.750289473684211e-05, + "loss": 0.4416, + "step": 29503 + }, + { + "epoch": 1.6521446970545415, + "grad_norm": 1.2871326208114624, + "learning_rate": 9.750263157894737e-05, + "loss": 0.5569, + "step": 29504 + }, + { + "epoch": 1.6522006943666705, + "grad_norm": 1.2531226873397827, + "learning_rate": 9.750236842105263e-05, + "loss": 0.3886, + "step": 29505 + }, + { + "epoch": 1.6522566916787995, + "grad_norm": 1.8033958673477173, + "learning_rate": 9.75021052631579e-05, + "loss": 0.4523, + "step": 29506 + }, + { + "epoch": 1.6523126889909285, + "grad_norm": 1.0945812463760376, + "learning_rate": 9.750184210526317e-05, + "loss": 0.3309, + "step": 29507 + }, + { + "epoch": 1.6523686863030576, + "grad_norm": 1.440437912940979, + "learning_rate": 9.750157894736843e-05, + "loss": 0.5318, + "step": 29508 + }, + { + "epoch": 1.6524246836151866, + "grad_norm": 
1.3866740465164185, + "learning_rate": 9.750131578947369e-05, + "loss": 0.6054, + "step": 29509 + }, + { + "epoch": 1.6524806809273156, + "grad_norm": 1.3404566049575806, + "learning_rate": 9.750105263157895e-05, + "loss": 0.7541, + "step": 29510 + }, + { + "epoch": 1.6525366782394446, + "grad_norm": 1.138606071472168, + "learning_rate": 9.750078947368422e-05, + "loss": 0.3472, + "step": 29511 + }, + { + "epoch": 1.6525926755515736, + "grad_norm": 1.5618456602096558, + "learning_rate": 9.750052631578948e-05, + "loss": 0.5744, + "step": 29512 + }, + { + "epoch": 1.6526486728637027, + "grad_norm": 1.288644790649414, + "learning_rate": 9.750026315789474e-05, + "loss": 0.5016, + "step": 29513 + }, + { + "epoch": 1.6527046701758317, + "grad_norm": 1.5030759572982788, + "learning_rate": 9.75e-05, + "loss": 0.6377, + "step": 29514 + }, + { + "epoch": 1.6527606674879607, + "grad_norm": 3.824960231781006, + "learning_rate": 9.749973684210527e-05, + "loss": 0.4982, + "step": 29515 + }, + { + "epoch": 1.6528166648000897, + "grad_norm": 1.2412022352218628, + "learning_rate": 9.749947368421053e-05, + "loss": 0.4165, + "step": 29516 + }, + { + "epoch": 1.6528726621122187, + "grad_norm": 1.4901292324066162, + "learning_rate": 9.74992105263158e-05, + "loss": 0.4076, + "step": 29517 + }, + { + "epoch": 1.6529286594243477, + "grad_norm": 1.3059476613998413, + "learning_rate": 9.749894736842105e-05, + "loss": 0.4026, + "step": 29518 + }, + { + "epoch": 1.6529846567364768, + "grad_norm": 1.6507205963134766, + "learning_rate": 9.749868421052632e-05, + "loss": 0.5171, + "step": 29519 + }, + { + "epoch": 1.6530406540486058, + "grad_norm": 1.561182975769043, + "learning_rate": 9.749842105263158e-05, + "loss": 0.5634, + "step": 29520 + }, + { + "epoch": 1.6530966513607348, + "grad_norm": 1.259454607963562, + "learning_rate": 9.749815789473684e-05, + "loss": 0.4905, + "step": 29521 + }, + { + "epoch": 1.6531526486728638, + "grad_norm": 2.1463735103607178, + "learning_rate": 
9.749789473684212e-05, + "loss": 0.6679, + "step": 29522 + }, + { + "epoch": 1.6532086459849928, + "grad_norm": 5.757894515991211, + "learning_rate": 9.749763157894736e-05, + "loss": 0.6395, + "step": 29523 + }, + { + "epoch": 1.6532646432971219, + "grad_norm": 1.4047274589538574, + "learning_rate": 9.749736842105264e-05, + "loss": 0.4141, + "step": 29524 + }, + { + "epoch": 1.6533206406092509, + "grad_norm": 1.2539740800857544, + "learning_rate": 9.74971052631579e-05, + "loss": 0.4272, + "step": 29525 + }, + { + "epoch": 1.65337663792138, + "grad_norm": 1.2732815742492676, + "learning_rate": 9.749684210526317e-05, + "loss": 0.5182, + "step": 29526 + }, + { + "epoch": 1.653432635233509, + "grad_norm": 1.3196169137954712, + "learning_rate": 9.749657894736842e-05, + "loss": 0.3876, + "step": 29527 + }, + { + "epoch": 1.653488632545638, + "grad_norm": 1.3361912965774536, + "learning_rate": 9.749631578947369e-05, + "loss": 0.4089, + "step": 29528 + }, + { + "epoch": 1.653544629857767, + "grad_norm": 1.3709325790405273, + "learning_rate": 9.749605263157895e-05, + "loss": 0.6227, + "step": 29529 + }, + { + "epoch": 1.653600627169896, + "grad_norm": 1.0868281126022339, + "learning_rate": 9.749578947368422e-05, + "loss": 0.3535, + "step": 29530 + }, + { + "epoch": 1.653656624482025, + "grad_norm": 1.2153334617614746, + "learning_rate": 9.749552631578948e-05, + "loss": 0.3779, + "step": 29531 + }, + { + "epoch": 1.653712621794154, + "grad_norm": 1.5079317092895508, + "learning_rate": 9.749526315789474e-05, + "loss": 0.4953, + "step": 29532 + }, + { + "epoch": 1.653768619106283, + "grad_norm": 1.44725501537323, + "learning_rate": 9.7495e-05, + "loss": 0.5007, + "step": 29533 + }, + { + "epoch": 1.653824616418412, + "grad_norm": 1.10235595703125, + "learning_rate": 9.749473684210527e-05, + "loss": 0.4212, + "step": 29534 + }, + { + "epoch": 1.653880613730541, + "grad_norm": 1.637251615524292, + "learning_rate": 9.749447368421053e-05, + "loss": 0.5369, + "step": 29535 + }, + { 
+ "epoch": 1.65393661104267, + "grad_norm": 1.3870211839675903, + "learning_rate": 9.74942105263158e-05, + "loss": 0.4615, + "step": 29536 + }, + { + "epoch": 1.653992608354799, + "grad_norm": 1.8148595094680786, + "learning_rate": 9.749394736842105e-05, + "loss": 0.608, + "step": 29537 + }, + { + "epoch": 1.6540486056669281, + "grad_norm": 1.1375174522399902, + "learning_rate": 9.749368421052633e-05, + "loss": 0.4065, + "step": 29538 + }, + { + "epoch": 1.6541046029790571, + "grad_norm": 1.3817225694656372, + "learning_rate": 9.749342105263159e-05, + "loss": 0.5522, + "step": 29539 + }, + { + "epoch": 1.6541606002911862, + "grad_norm": 1.1288411617279053, + "learning_rate": 9.749315789473685e-05, + "loss": 0.429, + "step": 29540 + }, + { + "epoch": 1.6542165976033152, + "grad_norm": 1.1628162860870361, + "learning_rate": 9.74928947368421e-05, + "loss": 0.3498, + "step": 29541 + }, + { + "epoch": 1.6542725949154442, + "grad_norm": 1.3848342895507812, + "learning_rate": 9.749263157894737e-05, + "loss": 0.4717, + "step": 29542 + }, + { + "epoch": 1.6543285922275732, + "grad_norm": 1.5429258346557617, + "learning_rate": 9.749236842105264e-05, + "loss": 0.458, + "step": 29543 + }, + { + "epoch": 1.6543845895397022, + "grad_norm": 1.1181485652923584, + "learning_rate": 9.74921052631579e-05, + "loss": 0.4494, + "step": 29544 + }, + { + "epoch": 1.6544405868518313, + "grad_norm": 1.3679903745651245, + "learning_rate": 9.749184210526316e-05, + "loss": 0.5011, + "step": 29545 + }, + { + "epoch": 1.6544965841639603, + "grad_norm": 1.237748622894287, + "learning_rate": 9.749157894736842e-05, + "loss": 0.3805, + "step": 29546 + }, + { + "epoch": 1.6545525814760893, + "grad_norm": 1.2433940172195435, + "learning_rate": 9.749131578947369e-05, + "loss": 0.3774, + "step": 29547 + }, + { + "epoch": 1.6546085787882183, + "grad_norm": 1.468341588973999, + "learning_rate": 9.749105263157895e-05, + "loss": 0.4329, + "step": 29548 + }, + { + "epoch": 1.6546645761003473, + "grad_norm": 
1.4150248765945435, + "learning_rate": 9.749078947368422e-05, + "loss": 0.4416, + "step": 29549 + }, + { + "epoch": 1.6547205734124764, + "grad_norm": 1.1630582809448242, + "learning_rate": 9.749052631578947e-05, + "loss": 0.4249, + "step": 29550 + }, + { + "epoch": 1.6547765707246054, + "grad_norm": 1.161422610282898, + "learning_rate": 9.749026315789474e-05, + "loss": 0.3848, + "step": 29551 + }, + { + "epoch": 1.6548325680367344, + "grad_norm": 1.278017520904541, + "learning_rate": 9.749e-05, + "loss": 0.402, + "step": 29552 + }, + { + "epoch": 1.6548885653488634, + "grad_norm": 1.96346914768219, + "learning_rate": 9.748973684210528e-05, + "loss": 0.6422, + "step": 29553 + }, + { + "epoch": 1.6549445626609924, + "grad_norm": 1.2192671298980713, + "learning_rate": 9.748947368421054e-05, + "loss": 0.5708, + "step": 29554 + }, + { + "epoch": 1.6550005599731215, + "grad_norm": 1.2060295343399048, + "learning_rate": 9.74892105263158e-05, + "loss": 0.5261, + "step": 29555 + }, + { + "epoch": 1.6550565572852505, + "grad_norm": 1.8292222023010254, + "learning_rate": 9.748894736842106e-05, + "loss": 0.5154, + "step": 29556 + }, + { + "epoch": 1.6551125545973793, + "grad_norm": 1.2270359992980957, + "learning_rate": 9.748868421052632e-05, + "loss": 0.5626, + "step": 29557 + }, + { + "epoch": 1.6551685519095083, + "grad_norm": 1.1451836824417114, + "learning_rate": 9.748842105263159e-05, + "loss": 0.5148, + "step": 29558 + }, + { + "epoch": 1.6552245492216373, + "grad_norm": 1.3507838249206543, + "learning_rate": 9.748815789473685e-05, + "loss": 0.6799, + "step": 29559 + }, + { + "epoch": 1.6552805465337663, + "grad_norm": 1.8475145101547241, + "learning_rate": 9.748789473684211e-05, + "loss": 0.6658, + "step": 29560 + }, + { + "epoch": 1.6553365438458953, + "grad_norm": 1.3059606552124023, + "learning_rate": 9.748763157894737e-05, + "loss": 0.4534, + "step": 29561 + }, + { + "epoch": 1.6553925411580244, + "grad_norm": 1.292354941368103, + "learning_rate": 
9.748736842105264e-05, + "loss": 0.432, + "step": 29562 + }, + { + "epoch": 1.6554485384701534, + "grad_norm": 1.2703644037246704, + "learning_rate": 9.74871052631579e-05, + "loss": 0.5647, + "step": 29563 + }, + { + "epoch": 1.6555045357822824, + "grad_norm": 1.7154474258422852, + "learning_rate": 9.748684210526316e-05, + "loss": 0.5775, + "step": 29564 + }, + { + "epoch": 1.6555605330944114, + "grad_norm": 1.4255478382110596, + "learning_rate": 9.748657894736842e-05, + "loss": 0.407, + "step": 29565 + }, + { + "epoch": 1.6556165304065404, + "grad_norm": 1.5124664306640625, + "learning_rate": 9.74863157894737e-05, + "loss": 0.4397, + "step": 29566 + }, + { + "epoch": 1.6556725277186695, + "grad_norm": 1.2534633874893188, + "learning_rate": 9.748605263157895e-05, + "loss": 0.4175, + "step": 29567 + }, + { + "epoch": 1.6557285250307985, + "grad_norm": 1.333579182624817, + "learning_rate": 9.748578947368421e-05, + "loss": 0.515, + "step": 29568 + }, + { + "epoch": 1.6557845223429275, + "grad_norm": 1.228956937789917, + "learning_rate": 9.748552631578947e-05, + "loss": 0.518, + "step": 29569 + }, + { + "epoch": 1.6558405196550565, + "grad_norm": 1.1497122049331665, + "learning_rate": 9.748526315789475e-05, + "loss": 0.4025, + "step": 29570 + }, + { + "epoch": 1.6558965169671855, + "grad_norm": 1.0443910360336304, + "learning_rate": 9.7485e-05, + "loss": 0.427, + "step": 29571 + }, + { + "epoch": 1.6559525142793146, + "grad_norm": 1.344438076019287, + "learning_rate": 9.748473684210528e-05, + "loss": 0.45, + "step": 29572 + }, + { + "epoch": 1.6560085115914436, + "grad_norm": 1.1079256534576416, + "learning_rate": 9.748447368421053e-05, + "loss": 0.4042, + "step": 29573 + }, + { + "epoch": 1.6560645089035726, + "grad_norm": 1.3417983055114746, + "learning_rate": 9.748421052631579e-05, + "loss": 0.435, + "step": 29574 + }, + { + "epoch": 1.6561205062157016, + "grad_norm": 1.4069679975509644, + "learning_rate": 9.748394736842106e-05, + "loss": 0.46, + "step": 29575 + }, 
+ { + "epoch": 1.6561765035278306, + "grad_norm": 1.1575227975845337, + "learning_rate": 9.748368421052632e-05, + "loss": 0.4175, + "step": 29576 + }, + { + "epoch": 1.6562325008399597, + "grad_norm": 1.3027757406234741, + "learning_rate": 9.748342105263159e-05, + "loss": 0.4284, + "step": 29577 + }, + { + "epoch": 1.6562884981520887, + "grad_norm": 1.4100687503814697, + "learning_rate": 9.748315789473684e-05, + "loss": 0.8479, + "step": 29578 + }, + { + "epoch": 1.6563444954642177, + "grad_norm": 1.2550665140151978, + "learning_rate": 9.748289473684211e-05, + "loss": 0.5583, + "step": 29579 + }, + { + "epoch": 1.6564004927763467, + "grad_norm": 1.2159197330474854, + "learning_rate": 9.748263157894737e-05, + "loss": 0.5085, + "step": 29580 + }, + { + "epoch": 1.6564564900884757, + "grad_norm": 1.5163697004318237, + "learning_rate": 9.748236842105264e-05, + "loss": 0.4916, + "step": 29581 + }, + { + "epoch": 1.6565124874006047, + "grad_norm": 1.3901549577713013, + "learning_rate": 9.748210526315789e-05, + "loss": 0.4707, + "step": 29582 + }, + { + "epoch": 1.6565684847127338, + "grad_norm": 1.4969396591186523, + "learning_rate": 9.748184210526316e-05, + "loss": 0.5083, + "step": 29583 + }, + { + "epoch": 1.6566244820248628, + "grad_norm": 1.331526279449463, + "learning_rate": 9.748157894736842e-05, + "loss": 0.3855, + "step": 29584 + }, + { + "epoch": 1.6566804793369918, + "grad_norm": 1.305368423461914, + "learning_rate": 9.74813157894737e-05, + "loss": 0.4413, + "step": 29585 + }, + { + "epoch": 1.6567364766491208, + "grad_norm": 1.43300461769104, + "learning_rate": 9.748105263157896e-05, + "loss": 0.5209, + "step": 29586 + }, + { + "epoch": 1.6567924739612498, + "grad_norm": 1.1488232612609863, + "learning_rate": 9.748078947368422e-05, + "loss": 0.4211, + "step": 29587 + }, + { + "epoch": 1.6568484712733789, + "grad_norm": 1.2985970973968506, + "learning_rate": 9.748052631578948e-05, + "loss": 0.531, + "step": 29588 + }, + { + "epoch": 1.6569044685855079, + 
"grad_norm": 1.1614956855773926, + "learning_rate": 9.748026315789475e-05, + "loss": 0.4647, + "step": 29589 + }, + { + "epoch": 1.656960465897637, + "grad_norm": 1.4463664293289185, + "learning_rate": 9.748000000000001e-05, + "loss": 0.5643, + "step": 29590 + }, + { + "epoch": 1.657016463209766, + "grad_norm": 1.2314857244491577, + "learning_rate": 9.747973684210527e-05, + "loss": 0.478, + "step": 29591 + }, + { + "epoch": 1.657072460521895, + "grad_norm": 1.4421050548553467, + "learning_rate": 9.747947368421053e-05, + "loss": 0.4519, + "step": 29592 + }, + { + "epoch": 1.657128457834024, + "grad_norm": 1.3177601099014282, + "learning_rate": 9.747921052631579e-05, + "loss": 0.4995, + "step": 29593 + }, + { + "epoch": 1.657184455146153, + "grad_norm": 1.2215352058410645, + "learning_rate": 9.747894736842106e-05, + "loss": 0.3068, + "step": 29594 + }, + { + "epoch": 1.657240452458282, + "grad_norm": 1.2740132808685303, + "learning_rate": 9.747868421052632e-05, + "loss": 0.5852, + "step": 29595 + }, + { + "epoch": 1.657296449770411, + "grad_norm": 1.2289226055145264, + "learning_rate": 9.747842105263158e-05, + "loss": 0.5125, + "step": 29596 + }, + { + "epoch": 1.65735244708254, + "grad_norm": 1.533870816230774, + "learning_rate": 9.747815789473684e-05, + "loss": 0.7458, + "step": 29597 + }, + { + "epoch": 1.657408444394669, + "grad_norm": 1.202687382698059, + "learning_rate": 9.747789473684211e-05, + "loss": 0.4193, + "step": 29598 + }, + { + "epoch": 1.657464441706798, + "grad_norm": 1.5871564149856567, + "learning_rate": 9.747763157894737e-05, + "loss": 0.6005, + "step": 29599 + }, + { + "epoch": 1.657520439018927, + "grad_norm": 1.3361384868621826, + "learning_rate": 9.747736842105263e-05, + "loss": 0.4297, + "step": 29600 + }, + { + "epoch": 1.6575764363310561, + "grad_norm": 1.468178629875183, + "learning_rate": 9.747710526315789e-05, + "loss": 0.5172, + "step": 29601 + }, + { + "epoch": 1.6576324336431851, + "grad_norm": 1.1791592836380005, + "learning_rate": 
9.747684210526317e-05, + "loss": 0.3931, + "step": 29602 + }, + { + "epoch": 1.6576884309553142, + "grad_norm": 1.1586310863494873, + "learning_rate": 9.747657894736843e-05, + "loss": 0.4085, + "step": 29603 + }, + { + "epoch": 1.6577444282674432, + "grad_norm": 1.1029143333435059, + "learning_rate": 9.74763157894737e-05, + "loss": 0.4226, + "step": 29604 + }, + { + "epoch": 1.6578004255795722, + "grad_norm": 1.5106743574142456, + "learning_rate": 9.747605263157895e-05, + "loss": 0.4974, + "step": 29605 + }, + { + "epoch": 1.6578564228917012, + "grad_norm": 1.2638213634490967, + "learning_rate": 9.747578947368422e-05, + "loss": 0.3985, + "step": 29606 + }, + { + "epoch": 1.6579124202038302, + "grad_norm": 1.1854311227798462, + "learning_rate": 9.747552631578948e-05, + "loss": 0.3951, + "step": 29607 + }, + { + "epoch": 1.6579684175159592, + "grad_norm": 1.3831970691680908, + "learning_rate": 9.747526315789474e-05, + "loss": 0.4718, + "step": 29608 + }, + { + "epoch": 1.6580244148280883, + "grad_norm": 1.2144850492477417, + "learning_rate": 9.747500000000001e-05, + "loss": 0.5548, + "step": 29609 + }, + { + "epoch": 1.6580804121402173, + "grad_norm": 1.3014781475067139, + "learning_rate": 9.747473684210526e-05, + "loss": 0.3956, + "step": 29610 + }, + { + "epoch": 1.6581364094523463, + "grad_norm": 1.3508234024047852, + "learning_rate": 9.747447368421053e-05, + "loss": 0.4022, + "step": 29611 + }, + { + "epoch": 1.6581924067644753, + "grad_norm": 1.2428338527679443, + "learning_rate": 9.747421052631579e-05, + "loss": 0.4778, + "step": 29612 + }, + { + "epoch": 1.6582484040766043, + "grad_norm": 1.5273547172546387, + "learning_rate": 9.747394736842106e-05, + "loss": 0.542, + "step": 29613 + }, + { + "epoch": 1.6583044013887334, + "grad_norm": 1.193739891052246, + "learning_rate": 9.747368421052632e-05, + "loss": 0.4343, + "step": 29614 + }, + { + "epoch": 1.6583603987008624, + "grad_norm": 2.0253703594207764, + "learning_rate": 9.747342105263158e-05, + "loss": 
0.6708, + "step": 29615 + }, + { + "epoch": 1.6584163960129914, + "grad_norm": 1.3252277374267578, + "learning_rate": 9.747315789473684e-05, + "loss": 0.3905, + "step": 29616 + }, + { + "epoch": 1.6584723933251204, + "grad_norm": 1.2533987760543823, + "learning_rate": 9.747289473684212e-05, + "loss": 0.384, + "step": 29617 + }, + { + "epoch": 1.6585283906372494, + "grad_norm": 1.2431951761245728, + "learning_rate": 9.747263157894738e-05, + "loss": 0.579, + "step": 29618 + }, + { + "epoch": 1.6585843879493785, + "grad_norm": 1.0912152528762817, + "learning_rate": 9.747236842105264e-05, + "loss": 0.4041, + "step": 29619 + }, + { + "epoch": 1.6586403852615075, + "grad_norm": 1.4213159084320068, + "learning_rate": 9.74721052631579e-05, + "loss": 0.6027, + "step": 29620 + }, + { + "epoch": 1.6586963825736365, + "grad_norm": 1.2781519889831543, + "learning_rate": 9.747184210526317e-05, + "loss": 0.4213, + "step": 29621 + }, + { + "epoch": 1.6587523798857655, + "grad_norm": 1.4699957370758057, + "learning_rate": 9.747157894736843e-05, + "loss": 0.5215, + "step": 29622 + }, + { + "epoch": 1.6588083771978945, + "grad_norm": 2.326874017715454, + "learning_rate": 9.747131578947369e-05, + "loss": 0.3919, + "step": 29623 + }, + { + "epoch": 1.6588643745100236, + "grad_norm": 1.2619982957839966, + "learning_rate": 9.747105263157895e-05, + "loss": 0.4346, + "step": 29624 + }, + { + "epoch": 1.6589203718221526, + "grad_norm": 1.6012333631515503, + "learning_rate": 9.747078947368421e-05, + "loss": 0.4691, + "step": 29625 + }, + { + "epoch": 1.6589763691342816, + "grad_norm": 1.347254753112793, + "learning_rate": 9.747052631578948e-05, + "loss": 0.3613, + "step": 29626 + }, + { + "epoch": 1.6590323664464106, + "grad_norm": 1.4736576080322266, + "learning_rate": 9.747026315789474e-05, + "loss": 0.4603, + "step": 29627 + }, + { + "epoch": 1.6590883637585396, + "grad_norm": 1.2075624465942383, + "learning_rate": 9.747e-05, + "loss": 0.4067, + "step": 29628 + }, + { + "epoch": 
1.6591443610706686, + "grad_norm": 1.2575129270553589, + "learning_rate": 9.746973684210526e-05, + "loss": 0.4704, + "step": 29629 + }, + { + "epoch": 1.6592003583827977, + "grad_norm": 1.3325769901275635, + "learning_rate": 9.746947368421053e-05, + "loss": 0.5049, + "step": 29630 + }, + { + "epoch": 1.6592563556949267, + "grad_norm": 1.4154554605484009, + "learning_rate": 9.746921052631579e-05, + "loss": 0.5334, + "step": 29631 + }, + { + "epoch": 1.6593123530070557, + "grad_norm": 1.592248558998108, + "learning_rate": 9.746894736842107e-05, + "loss": 0.5429, + "step": 29632 + }, + { + "epoch": 1.6593683503191847, + "grad_norm": 1.3198601007461548, + "learning_rate": 9.746868421052631e-05, + "loss": 0.4146, + "step": 29633 + }, + { + "epoch": 1.6594243476313137, + "grad_norm": 1.1203079223632812, + "learning_rate": 9.746842105263159e-05, + "loss": 0.4375, + "step": 29634 + }, + { + "epoch": 1.6594803449434428, + "grad_norm": 1.3273859024047852, + "learning_rate": 9.746815789473685e-05, + "loss": 0.4493, + "step": 29635 + }, + { + "epoch": 1.6595363422555718, + "grad_norm": 1.3148910999298096, + "learning_rate": 9.746789473684212e-05, + "loss": 0.5294, + "step": 29636 + }, + { + "epoch": 1.6595923395677008, + "grad_norm": 1.3773329257965088, + "learning_rate": 9.746763157894736e-05, + "loss": 0.4416, + "step": 29637 + }, + { + "epoch": 1.6596483368798298, + "grad_norm": 1.2627286911010742, + "learning_rate": 9.746736842105264e-05, + "loss": 0.4194, + "step": 29638 + }, + { + "epoch": 1.6597043341919588, + "grad_norm": 1.3446952104568481, + "learning_rate": 9.74671052631579e-05, + "loss": 0.4122, + "step": 29639 + }, + { + "epoch": 1.6597603315040876, + "grad_norm": 1.124110460281372, + "learning_rate": 9.746684210526317e-05, + "loss": 0.5054, + "step": 29640 + }, + { + "epoch": 1.6598163288162167, + "grad_norm": 1.5453792810440063, + "learning_rate": 9.746657894736843e-05, + "loss": 0.5256, + "step": 29641 + }, + { + "epoch": 1.6598723261283457, + "grad_norm": 
1.4115320444107056, + "learning_rate": 9.746631578947368e-05, + "loss": 0.4691, + "step": 29642 + }, + { + "epoch": 1.6599283234404747, + "grad_norm": 1.2420213222503662, + "learning_rate": 9.746605263157895e-05, + "loss": 0.452, + "step": 29643 + }, + { + "epoch": 1.6599843207526037, + "grad_norm": 1.2624112367630005, + "learning_rate": 9.746578947368421e-05, + "loss": 0.4987, + "step": 29644 + }, + { + "epoch": 1.6600403180647327, + "grad_norm": 1.2748420238494873, + "learning_rate": 9.746552631578948e-05, + "loss": 0.5516, + "step": 29645 + }, + { + "epoch": 1.6600963153768618, + "grad_norm": 1.506902813911438, + "learning_rate": 9.746526315789474e-05, + "loss": 0.4746, + "step": 29646 + }, + { + "epoch": 1.6601523126889908, + "grad_norm": 1.2483278512954712, + "learning_rate": 9.7465e-05, + "loss": 0.4323, + "step": 29647 + }, + { + "epoch": 1.6602083100011198, + "grad_norm": 1.122362732887268, + "learning_rate": 9.746473684210526e-05, + "loss": 0.3422, + "step": 29648 + }, + { + "epoch": 1.6602643073132488, + "grad_norm": 1.3935191631317139, + "learning_rate": 9.746447368421054e-05, + "loss": 0.3623, + "step": 29649 + }, + { + "epoch": 1.6603203046253778, + "grad_norm": 1.3148080110549927, + "learning_rate": 9.74642105263158e-05, + "loss": 0.4853, + "step": 29650 + }, + { + "epoch": 1.6603763019375068, + "grad_norm": 1.0957515239715576, + "learning_rate": 9.746394736842106e-05, + "loss": 0.4409, + "step": 29651 + }, + { + "epoch": 1.6604322992496359, + "grad_norm": 1.3667657375335693, + "learning_rate": 9.746368421052631e-05, + "loss": 0.5785, + "step": 29652 + }, + { + "epoch": 1.6604882965617649, + "grad_norm": 1.3684552907943726, + "learning_rate": 9.746342105263159e-05, + "loss": 0.4376, + "step": 29653 + }, + { + "epoch": 1.660544293873894, + "grad_norm": 1.152071475982666, + "learning_rate": 9.746315789473685e-05, + "loss": 0.4768, + "step": 29654 + }, + { + "epoch": 1.660600291186023, + "grad_norm": 3.4706151485443115, + "learning_rate": 
9.746289473684211e-05, + "loss": 0.5972, + "step": 29655 + }, + { + "epoch": 1.660656288498152, + "grad_norm": 1.246666431427002, + "learning_rate": 9.746263157894737e-05, + "loss": 0.4079, + "step": 29656 + }, + { + "epoch": 1.660712285810281, + "grad_norm": 1.2707175016403198, + "learning_rate": 9.746236842105264e-05, + "loss": 0.3829, + "step": 29657 + }, + { + "epoch": 1.66076828312241, + "grad_norm": 1.4588186740875244, + "learning_rate": 9.74621052631579e-05, + "loss": 0.4542, + "step": 29658 + }, + { + "epoch": 1.660824280434539, + "grad_norm": 1.4935686588287354, + "learning_rate": 9.746184210526317e-05, + "loss": 0.393, + "step": 29659 + }, + { + "epoch": 1.660880277746668, + "grad_norm": 1.3570958375930786, + "learning_rate": 9.746157894736842e-05, + "loss": 0.3965, + "step": 29660 + }, + { + "epoch": 1.660936275058797, + "grad_norm": 1.4252172708511353, + "learning_rate": 9.746131578947368e-05, + "loss": 0.5035, + "step": 29661 + }, + { + "epoch": 1.660992272370926, + "grad_norm": 1.4909636974334717, + "learning_rate": 9.746105263157895e-05, + "loss": 0.4325, + "step": 29662 + }, + { + "epoch": 1.661048269683055, + "grad_norm": 1.0536448955535889, + "learning_rate": 9.746078947368421e-05, + "loss": 0.2782, + "step": 29663 + }, + { + "epoch": 1.661104266995184, + "grad_norm": 1.175025463104248, + "learning_rate": 9.746052631578949e-05, + "loss": 0.3818, + "step": 29664 + }, + { + "epoch": 1.6611602643073131, + "grad_norm": 1.2664695978164673, + "learning_rate": 9.746026315789473e-05, + "loss": 0.399, + "step": 29665 + }, + { + "epoch": 1.6612162616194421, + "grad_norm": 1.3744924068450928, + "learning_rate": 9.746e-05, + "loss": 0.5242, + "step": 29666 + }, + { + "epoch": 1.6612722589315712, + "grad_norm": 1.2288700342178345, + "learning_rate": 9.745973684210527e-05, + "loss": 0.4353, + "step": 29667 + }, + { + "epoch": 1.6613282562437002, + "grad_norm": 1.1383012533187866, + "learning_rate": 9.745947368421054e-05, + "loss": 0.3404, + "step": 29668 + }, + 
{ + "epoch": 1.6613842535558292, + "grad_norm": 1.3235902786254883, + "learning_rate": 9.74592105263158e-05, + "loss": 0.4674, + "step": 29669 + }, + { + "epoch": 1.6614402508679582, + "grad_norm": 1.4621721506118774, + "learning_rate": 9.745894736842106e-05, + "loss": 0.5222, + "step": 29670 + }, + { + "epoch": 1.6614962481800872, + "grad_norm": 1.462956428527832, + "learning_rate": 9.745868421052632e-05, + "loss": 0.4607, + "step": 29671 + }, + { + "epoch": 1.6615522454922163, + "grad_norm": 1.5089340209960938, + "learning_rate": 9.745842105263159e-05, + "loss": 0.4257, + "step": 29672 + }, + { + "epoch": 1.6616082428043453, + "grad_norm": 1.2238178253173828, + "learning_rate": 9.745815789473685e-05, + "loss": 0.5535, + "step": 29673 + }, + { + "epoch": 1.6616642401164743, + "grad_norm": 1.066439151763916, + "learning_rate": 9.745789473684211e-05, + "loss": 0.4257, + "step": 29674 + }, + { + "epoch": 1.6617202374286033, + "grad_norm": 1.2708643674850464, + "learning_rate": 9.745763157894737e-05, + "loss": 0.3578, + "step": 29675 + }, + { + "epoch": 1.6617762347407323, + "grad_norm": 1.596035361289978, + "learning_rate": 9.745736842105264e-05, + "loss": 0.7007, + "step": 29676 + }, + { + "epoch": 1.6618322320528613, + "grad_norm": 1.338188648223877, + "learning_rate": 9.74571052631579e-05, + "loss": 0.4382, + "step": 29677 + }, + { + "epoch": 1.6618882293649904, + "grad_norm": 1.3567324876785278, + "learning_rate": 9.745684210526316e-05, + "loss": 0.5165, + "step": 29678 + }, + { + "epoch": 1.6619442266771194, + "grad_norm": 1.4828284978866577, + "learning_rate": 9.745657894736842e-05, + "loss": 0.693, + "step": 29679 + }, + { + "epoch": 1.6620002239892484, + "grad_norm": 1.2866010665893555, + "learning_rate": 9.745631578947368e-05, + "loss": 0.4845, + "step": 29680 + }, + { + "epoch": 1.6620562213013774, + "grad_norm": 1.3637531995773315, + "learning_rate": 9.745605263157896e-05, + "loss": 0.3814, + "step": 29681 + }, + { + "epoch": 1.6621122186135064, + 
"grad_norm": 1.154879093170166, + "learning_rate": 9.745578947368422e-05, + "loss": 0.4708, + "step": 29682 + }, + { + "epoch": 1.6621682159256355, + "grad_norm": 1.2102686166763306, + "learning_rate": 9.745552631578947e-05, + "loss": 0.543, + "step": 29683 + }, + { + "epoch": 1.6622242132377645, + "grad_norm": 1.2043113708496094, + "learning_rate": 9.745526315789473e-05, + "loss": 0.4621, + "step": 29684 + }, + { + "epoch": 1.6622802105498935, + "grad_norm": 1.490852952003479, + "learning_rate": 9.745500000000001e-05, + "loss": 0.399, + "step": 29685 + }, + { + "epoch": 1.6623362078620225, + "grad_norm": 1.358458161354065, + "learning_rate": 9.745473684210527e-05, + "loss": 0.5194, + "step": 29686 + }, + { + "epoch": 1.6623922051741515, + "grad_norm": 1.3399975299835205, + "learning_rate": 9.745447368421054e-05, + "loss": 0.5748, + "step": 29687 + }, + { + "epoch": 1.6624482024862806, + "grad_norm": 1.2152906656265259, + "learning_rate": 9.745421052631579e-05, + "loss": 0.4133, + "step": 29688 + }, + { + "epoch": 1.6625041997984096, + "grad_norm": 1.3359047174453735, + "learning_rate": 9.745394736842106e-05, + "loss": 0.5731, + "step": 29689 + }, + { + "epoch": 1.6625601971105386, + "grad_norm": 1.3178585767745972, + "learning_rate": 9.745368421052632e-05, + "loss": 0.4571, + "step": 29690 + }, + { + "epoch": 1.6626161944226676, + "grad_norm": 1.138414978981018, + "learning_rate": 9.74534210526316e-05, + "loss": 0.4158, + "step": 29691 + }, + { + "epoch": 1.6626721917347966, + "grad_norm": 1.1555383205413818, + "learning_rate": 9.745315789473684e-05, + "loss": 0.3688, + "step": 29692 + }, + { + "epoch": 1.6627281890469257, + "grad_norm": 1.456110954284668, + "learning_rate": 9.745289473684211e-05, + "loss": 0.4896, + "step": 29693 + }, + { + "epoch": 1.6627841863590547, + "grad_norm": 1.2308326959609985, + "learning_rate": 9.745263157894737e-05, + "loss": 0.3622, + "step": 29694 + }, + { + "epoch": 1.6628401836711837, + "grad_norm": 1.0821744203567505, + 
"learning_rate": 9.745236842105263e-05, + "loss": 0.3717, + "step": 29695 + }, + { + "epoch": 1.6628961809833127, + "grad_norm": 1.3416963815689087, + "learning_rate": 9.74521052631579e-05, + "loss": 0.455, + "step": 29696 + }, + { + "epoch": 1.6629521782954417, + "grad_norm": 1.3744860887527466, + "learning_rate": 9.745184210526315e-05, + "loss": 0.3445, + "step": 29697 + }, + { + "epoch": 1.6630081756075707, + "grad_norm": 1.2955223321914673, + "learning_rate": 9.745157894736843e-05, + "loss": 0.4848, + "step": 29698 + }, + { + "epoch": 1.6630641729196998, + "grad_norm": 1.7026212215423584, + "learning_rate": 9.745131578947368e-05, + "loss": 0.4915, + "step": 29699 + }, + { + "epoch": 1.6631201702318288, + "grad_norm": 1.5713258981704712, + "learning_rate": 9.745105263157896e-05, + "loss": 0.4763, + "step": 29700 + }, + { + "epoch": 1.6631761675439578, + "grad_norm": 1.4540106058120728, + "learning_rate": 9.745078947368422e-05, + "loss": 0.3707, + "step": 29701 + }, + { + "epoch": 1.6632321648560868, + "grad_norm": 1.380915880203247, + "learning_rate": 9.745052631578948e-05, + "loss": 0.477, + "step": 29702 + }, + { + "epoch": 1.6632881621682158, + "grad_norm": 1.3537237644195557, + "learning_rate": 9.745026315789474e-05, + "loss": 0.5506, + "step": 29703 + }, + { + "epoch": 1.6633441594803449, + "grad_norm": 1.5406357049942017, + "learning_rate": 9.745000000000001e-05, + "loss": 0.767, + "step": 29704 + }, + { + "epoch": 1.6634001567924739, + "grad_norm": 1.2187687158584595, + "learning_rate": 9.744973684210527e-05, + "loss": 0.4966, + "step": 29705 + }, + { + "epoch": 1.663456154104603, + "grad_norm": 1.3390312194824219, + "learning_rate": 9.744947368421053e-05, + "loss": 0.4701, + "step": 29706 + }, + { + "epoch": 1.663512151416732, + "grad_norm": 1.160866618156433, + "learning_rate": 9.744921052631579e-05, + "loss": 0.4631, + "step": 29707 + }, + { + "epoch": 1.663568148728861, + "grad_norm": 1.4030683040618896, + "learning_rate": 9.744894736842106e-05, + 
"loss": 0.5906, + "step": 29708 + }, + { + "epoch": 1.66362414604099, + "grad_norm": 1.3373100757598877, + "learning_rate": 9.744868421052632e-05, + "loss": 0.4799, + "step": 29709 + }, + { + "epoch": 1.663680143353119, + "grad_norm": 1.0900112390518188, + "learning_rate": 9.744842105263158e-05, + "loss": 0.3698, + "step": 29710 + }, + { + "epoch": 1.663736140665248, + "grad_norm": 1.1846494674682617, + "learning_rate": 9.744815789473684e-05, + "loss": 0.5321, + "step": 29711 + }, + { + "epoch": 1.663792137977377, + "grad_norm": 1.3239916563034058, + "learning_rate": 9.74478947368421e-05, + "loss": 0.4737, + "step": 29712 + }, + { + "epoch": 1.663848135289506, + "grad_norm": 1.345616102218628, + "learning_rate": 9.744763157894738e-05, + "loss": 0.4982, + "step": 29713 + }, + { + "epoch": 1.663904132601635, + "grad_norm": 1.169311285018921, + "learning_rate": 9.744736842105263e-05, + "loss": 0.3796, + "step": 29714 + }, + { + "epoch": 1.663960129913764, + "grad_norm": 1.1881449222564697, + "learning_rate": 9.74471052631579e-05, + "loss": 0.3696, + "step": 29715 + }, + { + "epoch": 1.664016127225893, + "grad_norm": 1.3634545803070068, + "learning_rate": 9.744684210526315e-05, + "loss": 0.4455, + "step": 29716 + }, + { + "epoch": 1.6640721245380221, + "grad_norm": 1.3136138916015625, + "learning_rate": 9.744657894736843e-05, + "loss": 0.538, + "step": 29717 + }, + { + "epoch": 1.6641281218501511, + "grad_norm": 1.3176722526550293, + "learning_rate": 9.744631578947369e-05, + "loss": 0.5301, + "step": 29718 + }, + { + "epoch": 1.6641841191622802, + "grad_norm": 1.1613926887512207, + "learning_rate": 9.744605263157896e-05, + "loss": 0.4518, + "step": 29719 + }, + { + "epoch": 1.6642401164744092, + "grad_norm": 1.5034033060073853, + "learning_rate": 9.744578947368421e-05, + "loss": 0.5178, + "step": 29720 + }, + { + "epoch": 1.6642961137865382, + "grad_norm": 1.380826711654663, + "learning_rate": 9.744552631578948e-05, + "loss": 0.3322, + "step": 29721 + }, + { + "epoch": 
1.6643521110986672, + "grad_norm": 1.1774296760559082, + "learning_rate": 9.744526315789474e-05, + "loss": 0.4113, + "step": 29722 + }, + { + "epoch": 1.6644081084107962, + "grad_norm": 1.336766004562378, + "learning_rate": 9.744500000000001e-05, + "loss": 0.5364, + "step": 29723 + }, + { + "epoch": 1.6644641057229252, + "grad_norm": 1.1515580415725708, + "learning_rate": 9.744473684210527e-05, + "loss": 0.4323, + "step": 29724 + }, + { + "epoch": 1.6645201030350543, + "grad_norm": 1.148767352104187, + "learning_rate": 9.744447368421053e-05, + "loss": 0.3941, + "step": 29725 + }, + { + "epoch": 1.6645761003471833, + "grad_norm": 1.420762538909912, + "learning_rate": 9.744421052631579e-05, + "loss": 0.4618, + "step": 29726 + }, + { + "epoch": 1.6646320976593123, + "grad_norm": 1.3800610303878784, + "learning_rate": 9.744394736842107e-05, + "loss": 0.5246, + "step": 29727 + }, + { + "epoch": 1.6646880949714413, + "grad_norm": 1.1994104385375977, + "learning_rate": 9.744368421052633e-05, + "loss": 0.4552, + "step": 29728 + }, + { + "epoch": 1.6647440922835703, + "grad_norm": 1.433768391609192, + "learning_rate": 9.744342105263157e-05, + "loss": 0.4113, + "step": 29729 + }, + { + "epoch": 1.6648000895956994, + "grad_norm": 1.3951174020767212, + "learning_rate": 9.744315789473684e-05, + "loss": 0.6001, + "step": 29730 + }, + { + "epoch": 1.6648560869078284, + "grad_norm": 1.1915291547775269, + "learning_rate": 9.74428947368421e-05, + "loss": 0.484, + "step": 29731 + }, + { + "epoch": 1.6649120842199574, + "grad_norm": 4.183198928833008, + "learning_rate": 9.744263157894738e-05, + "loss": 0.4532, + "step": 29732 + }, + { + "epoch": 1.6649680815320864, + "grad_norm": 1.5106087923049927, + "learning_rate": 9.744236842105264e-05, + "loss": 0.4033, + "step": 29733 + }, + { + "epoch": 1.6650240788442154, + "grad_norm": 1.2340867519378662, + "learning_rate": 9.74421052631579e-05, + "loss": 0.3915, + "step": 29734 + }, + { + "epoch": 1.6650800761563445, + "grad_norm": 
1.3168102502822876, + "learning_rate": 9.744184210526316e-05, + "loss": 0.4267, + "step": 29735 + }, + { + "epoch": 1.6651360734684735, + "grad_norm": 1.1634732484817505, + "learning_rate": 9.744157894736843e-05, + "loss": 0.36, + "step": 29736 + }, + { + "epoch": 1.6651920707806025, + "grad_norm": 1.2173901796340942, + "learning_rate": 9.744131578947369e-05, + "loss": 0.5981, + "step": 29737 + }, + { + "epoch": 1.6652480680927315, + "grad_norm": 1.174443244934082, + "learning_rate": 9.744105263157895e-05, + "loss": 0.5204, + "step": 29738 + }, + { + "epoch": 1.6653040654048605, + "grad_norm": 1.5484994649887085, + "learning_rate": 9.744078947368421e-05, + "loss": 0.3388, + "step": 29739 + }, + { + "epoch": 1.6653600627169896, + "grad_norm": 1.8700287342071533, + "learning_rate": 9.744052631578948e-05, + "loss": 0.6348, + "step": 29740 + }, + { + "epoch": 1.6654160600291186, + "grad_norm": 1.558946132659912, + "learning_rate": 9.744026315789474e-05, + "loss": 0.4442, + "step": 29741 + }, + { + "epoch": 1.6654720573412476, + "grad_norm": 1.4027788639068604, + "learning_rate": 9.744000000000002e-05, + "loss": 0.4474, + "step": 29742 + }, + { + "epoch": 1.6655280546533766, + "grad_norm": 1.3163743019104004, + "learning_rate": 9.743973684210526e-05, + "loss": 0.3866, + "step": 29743 + }, + { + "epoch": 1.6655840519655056, + "grad_norm": 1.152384638786316, + "learning_rate": 9.743947368421054e-05, + "loss": 0.4037, + "step": 29744 + }, + { + "epoch": 1.6656400492776346, + "grad_norm": 1.4696251153945923, + "learning_rate": 9.74392105263158e-05, + "loss": 0.5558, + "step": 29745 + }, + { + "epoch": 1.6656960465897637, + "grad_norm": 1.2139456272125244, + "learning_rate": 9.743894736842105e-05, + "loss": 0.3967, + "step": 29746 + }, + { + "epoch": 1.6657520439018927, + "grad_norm": 1.3266783952713013, + "learning_rate": 9.743868421052631e-05, + "loss": 0.4887, + "step": 29747 + }, + { + "epoch": 1.6658080412140217, + "grad_norm": 1.4412658214569092, + "learning_rate": 
9.743842105263157e-05, + "loss": 0.5102, + "step": 29748 + }, + { + "epoch": 1.6658640385261507, + "grad_norm": 1.185046672821045, + "learning_rate": 9.743815789473685e-05, + "loss": 0.341, + "step": 29749 + }, + { + "epoch": 1.6659200358382797, + "grad_norm": 1.3606398105621338, + "learning_rate": 9.743789473684211e-05, + "loss": 0.5179, + "step": 29750 + }, + { + "epoch": 1.6659760331504088, + "grad_norm": 1.2505133152008057, + "learning_rate": 9.743763157894738e-05, + "loss": 0.4396, + "step": 29751 + }, + { + "epoch": 1.6660320304625378, + "grad_norm": 1.2264609336853027, + "learning_rate": 9.743736842105263e-05, + "loss": 0.3688, + "step": 29752 + }, + { + "epoch": 1.6660880277746668, + "grad_norm": 1.205929160118103, + "learning_rate": 9.74371052631579e-05, + "loss": 0.4127, + "step": 29753 + }, + { + "epoch": 1.6661440250867958, + "grad_norm": 1.192292332649231, + "learning_rate": 9.743684210526316e-05, + "loss": 0.4382, + "step": 29754 + }, + { + "epoch": 1.6662000223989248, + "grad_norm": 15.911165237426758, + "learning_rate": 9.743657894736843e-05, + "loss": 0.3512, + "step": 29755 + }, + { + "epoch": 1.6662560197110539, + "grad_norm": 1.7799100875854492, + "learning_rate": 9.743631578947369e-05, + "loss": 0.5259, + "step": 29756 + }, + { + "epoch": 1.6663120170231829, + "grad_norm": 3.8827314376831055, + "learning_rate": 9.743605263157895e-05, + "loss": 0.5644, + "step": 29757 + }, + { + "epoch": 1.666368014335312, + "grad_norm": 1.453129529953003, + "learning_rate": 9.743578947368421e-05, + "loss": 0.6702, + "step": 29758 + }, + { + "epoch": 1.666424011647441, + "grad_norm": 1.8428001403808594, + "learning_rate": 9.743552631578949e-05, + "loss": 0.4429, + "step": 29759 + }, + { + "epoch": 1.66648000895957, + "grad_norm": 1.1079034805297852, + "learning_rate": 9.743526315789475e-05, + "loss": 0.4014, + "step": 29760 + }, + { + "epoch": 1.666536006271699, + "grad_norm": 1.9630281925201416, + "learning_rate": 9.7435e-05, + "loss": 0.5883, + "step": 29761 + 
}, + { + "epoch": 1.666592003583828, + "grad_norm": 2.018019914627075, + "learning_rate": 9.743473684210526e-05, + "loss": 0.6178, + "step": 29762 + }, + { + "epoch": 1.666648000895957, + "grad_norm": 1.1073989868164062, + "learning_rate": 9.743447368421052e-05, + "loss": 0.4937, + "step": 29763 + }, + { + "epoch": 1.666703998208086, + "grad_norm": 1.2717527151107788, + "learning_rate": 9.74342105263158e-05, + "loss": 0.5471, + "step": 29764 + }, + { + "epoch": 1.666759995520215, + "grad_norm": 1.579235315322876, + "learning_rate": 9.743394736842106e-05, + "loss": 0.4843, + "step": 29765 + }, + { + "epoch": 1.666815992832344, + "grad_norm": 1.0931422710418701, + "learning_rate": 9.743368421052632e-05, + "loss": 0.3776, + "step": 29766 + }, + { + "epoch": 1.666871990144473, + "grad_norm": 1.1754850149154663, + "learning_rate": 9.743342105263158e-05, + "loss": 0.3256, + "step": 29767 + }, + { + "epoch": 1.666927987456602, + "grad_norm": 1.2110705375671387, + "learning_rate": 9.743315789473685e-05, + "loss": 0.4423, + "step": 29768 + }, + { + "epoch": 1.666983984768731, + "grad_norm": 1.223608374595642, + "learning_rate": 9.743289473684211e-05, + "loss": 0.4686, + "step": 29769 + }, + { + "epoch": 1.6670399820808601, + "grad_norm": 1.2251781225204468, + "learning_rate": 9.743263157894737e-05, + "loss": 0.432, + "step": 29770 + }, + { + "epoch": 1.6670959793929891, + "grad_norm": 1.308613657951355, + "learning_rate": 9.743236842105263e-05, + "loss": 0.531, + "step": 29771 + }, + { + "epoch": 1.6671519767051182, + "grad_norm": 1.401737928390503, + "learning_rate": 9.74321052631579e-05, + "loss": 0.5821, + "step": 29772 + }, + { + "epoch": 1.6672079740172472, + "grad_norm": 1.3729923963546753, + "learning_rate": 9.743184210526316e-05, + "loss": 0.4739, + "step": 29773 + }, + { + "epoch": 1.6672639713293762, + "grad_norm": 1.2187713384628296, + "learning_rate": 9.743157894736844e-05, + "loss": 0.5502, + "step": 29774 + }, + { + "epoch": 1.6673199686415052, + "grad_norm": 
1.1433850526809692, + "learning_rate": 9.743131578947368e-05, + "loss": 0.443, + "step": 29775 + }, + { + "epoch": 1.6673759659536342, + "grad_norm": 1.6556144952774048, + "learning_rate": 9.743105263157895e-05, + "loss": 0.5669, + "step": 29776 + }, + { + "epoch": 1.6674319632657633, + "grad_norm": 1.1630947589874268, + "learning_rate": 9.743078947368421e-05, + "loss": 0.3854, + "step": 29777 + }, + { + "epoch": 1.6674879605778923, + "grad_norm": 1.1792995929718018, + "learning_rate": 9.743052631578949e-05, + "loss": 0.4185, + "step": 29778 + }, + { + "epoch": 1.6675439578900213, + "grad_norm": 1.5685478448867798, + "learning_rate": 9.743026315789475e-05, + "loss": 0.5321, + "step": 29779 + }, + { + "epoch": 1.6675999552021503, + "grad_norm": 1.180819034576416, + "learning_rate": 9.743000000000001e-05, + "loss": 0.3929, + "step": 29780 + }, + { + "epoch": 1.6676559525142793, + "grad_norm": 1.367296814918518, + "learning_rate": 9.742973684210527e-05, + "loss": 0.4154, + "step": 29781 + }, + { + "epoch": 1.6677119498264084, + "grad_norm": 1.4089455604553223, + "learning_rate": 9.742947368421053e-05, + "loss": 0.4036, + "step": 29782 + }, + { + "epoch": 1.6677679471385374, + "grad_norm": 1.3270117044448853, + "learning_rate": 9.74292105263158e-05, + "loss": 0.5185, + "step": 29783 + }, + { + "epoch": 1.6678239444506664, + "grad_norm": 1.379282832145691, + "learning_rate": 9.742894736842105e-05, + "loss": 0.3957, + "step": 29784 + }, + { + "epoch": 1.6678799417627954, + "grad_norm": 1.2152642011642456, + "learning_rate": 9.742868421052632e-05, + "loss": 0.4632, + "step": 29785 + }, + { + "epoch": 1.6679359390749244, + "grad_norm": 1.2232376337051392, + "learning_rate": 9.742842105263158e-05, + "loss": 0.4157, + "step": 29786 + }, + { + "epoch": 1.6679919363870535, + "grad_norm": 2.0165438652038574, + "learning_rate": 9.742815789473685e-05, + "loss": 0.414, + "step": 29787 + }, + { + "epoch": 1.6680479336991825, + "grad_norm": 44.752079010009766, + "learning_rate": 
9.742789473684211e-05, + "loss": 0.388, + "step": 29788 + }, + { + "epoch": 1.6681039310113115, + "grad_norm": 1.298831820487976, + "learning_rate": 9.742763157894737e-05, + "loss": 0.6055, + "step": 29789 + }, + { + "epoch": 1.6681599283234405, + "grad_norm": 1.1803098917007446, + "learning_rate": 9.742736842105263e-05, + "loss": 0.3863, + "step": 29790 + }, + { + "epoch": 1.6682159256355695, + "grad_norm": 1.267655849456787, + "learning_rate": 9.74271052631579e-05, + "loss": 0.4228, + "step": 29791 + }, + { + "epoch": 1.6682719229476985, + "grad_norm": 1.3102296590805054, + "learning_rate": 9.742684210526316e-05, + "loss": 0.4232, + "step": 29792 + }, + { + "epoch": 1.6683279202598276, + "grad_norm": 1.2054402828216553, + "learning_rate": 9.742657894736842e-05, + "loss": 0.4745, + "step": 29793 + }, + { + "epoch": 1.6683839175719566, + "grad_norm": 1.2526195049285889, + "learning_rate": 9.742631578947368e-05, + "loss": 0.3803, + "step": 29794 + }, + { + "epoch": 1.6684399148840856, + "grad_norm": 1.429464340209961, + "learning_rate": 9.742605263157896e-05, + "loss": 0.5537, + "step": 29795 + }, + { + "epoch": 1.6684959121962146, + "grad_norm": 1.1986188888549805, + "learning_rate": 9.742578947368422e-05, + "loss": 0.3575, + "step": 29796 + }, + { + "epoch": 1.6685519095083436, + "grad_norm": 1.2700387239456177, + "learning_rate": 9.742552631578949e-05, + "loss": 0.4072, + "step": 29797 + }, + { + "epoch": 1.6686079068204727, + "grad_norm": 1.1243160963058472, + "learning_rate": 9.742526315789474e-05, + "loss": 0.453, + "step": 29798 + }, + { + "epoch": 1.6686639041326017, + "grad_norm": 1.2256789207458496, + "learning_rate": 9.7425e-05, + "loss": 0.3341, + "step": 29799 + }, + { + "epoch": 1.6687199014447307, + "grad_norm": 1.7087154388427734, + "learning_rate": 9.742473684210527e-05, + "loss": 0.4214, + "step": 29800 + }, + { + "epoch": 1.6687758987568597, + "grad_norm": 1.2494076490402222, + "learning_rate": 9.742447368421053e-05, + "loss": 0.609, + "step": 
29801 + }, + { + "epoch": 1.6688318960689887, + "grad_norm": 1.8516823053359985, + "learning_rate": 9.742421052631579e-05, + "loss": 0.411, + "step": 29802 + }, + { + "epoch": 1.6688878933811178, + "grad_norm": 1.2978487014770508, + "learning_rate": 9.742394736842105e-05, + "loss": 0.5197, + "step": 29803 + }, + { + "epoch": 1.6689438906932468, + "grad_norm": 1.579337239265442, + "learning_rate": 9.742368421052632e-05, + "loss": 0.5147, + "step": 29804 + }, + { + "epoch": 1.6689998880053758, + "grad_norm": 1.4517598152160645, + "learning_rate": 9.742342105263158e-05, + "loss": 0.5645, + "step": 29805 + }, + { + "epoch": 1.6690558853175048, + "grad_norm": 1.1208311319351196, + "learning_rate": 9.742315789473686e-05, + "loss": 0.4752, + "step": 29806 + }, + { + "epoch": 1.6691118826296338, + "grad_norm": 1.1679915189743042, + "learning_rate": 9.74228947368421e-05, + "loss": 0.4083, + "step": 29807 + }, + { + "epoch": 1.6691678799417629, + "grad_norm": 1.585705041885376, + "learning_rate": 9.742263157894737e-05, + "loss": 0.4569, + "step": 29808 + }, + { + "epoch": 1.6692238772538919, + "grad_norm": 1.0873582363128662, + "learning_rate": 9.742236842105263e-05, + "loss": 0.39, + "step": 29809 + }, + { + "epoch": 1.669279874566021, + "grad_norm": 1.1331446170806885, + "learning_rate": 9.742210526315791e-05, + "loss": 0.4536, + "step": 29810 + }, + { + "epoch": 1.66933587187815, + "grad_norm": 1.2082889080047607, + "learning_rate": 9.742184210526317e-05, + "loss": 0.4531, + "step": 29811 + }, + { + "epoch": 1.669391869190279, + "grad_norm": 1.4118036031723022, + "learning_rate": 9.742157894736843e-05, + "loss": 0.3652, + "step": 29812 + }, + { + "epoch": 1.669447866502408, + "grad_norm": 1.1882736682891846, + "learning_rate": 9.742131578947369e-05, + "loss": 0.3823, + "step": 29813 + }, + { + "epoch": 1.669503863814537, + "grad_norm": 1.3445663452148438, + "learning_rate": 9.742105263157896e-05, + "loss": 0.4056, + "step": 29814 + }, + { + "epoch": 1.669559861126666, + 
"grad_norm": 1.6700886487960815, + "learning_rate": 9.742078947368422e-05, + "loss": 0.7545, + "step": 29815 + }, + { + "epoch": 1.669615858438795, + "grad_norm": 1.2657943964004517, + "learning_rate": 9.742052631578948e-05, + "loss": 0.4648, + "step": 29816 + }, + { + "epoch": 1.669671855750924, + "grad_norm": 1.2922332286834717, + "learning_rate": 9.742026315789474e-05, + "loss": 0.4243, + "step": 29817 + }, + { + "epoch": 1.669727853063053, + "grad_norm": 1.136527419090271, + "learning_rate": 9.742e-05, + "loss": 0.4334, + "step": 29818 + }, + { + "epoch": 1.669783850375182, + "grad_norm": 1.1991740465164185, + "learning_rate": 9.741973684210527e-05, + "loss": 0.4208, + "step": 29819 + }, + { + "epoch": 1.669839847687311, + "grad_norm": 1.3565469980239868, + "learning_rate": 9.741947368421053e-05, + "loss": 0.5002, + "step": 29820 + }, + { + "epoch": 1.66989584499944, + "grad_norm": 1.618970274925232, + "learning_rate": 9.741921052631579e-05, + "loss": 0.446, + "step": 29821 + }, + { + "epoch": 1.6699518423115691, + "grad_norm": 2.0439717769622803, + "learning_rate": 9.741894736842105e-05, + "loss": 0.5478, + "step": 29822 + }, + { + "epoch": 1.6700078396236981, + "grad_norm": 1.4054450988769531, + "learning_rate": 9.741868421052632e-05, + "loss": 0.4755, + "step": 29823 + }, + { + "epoch": 1.6700638369358272, + "grad_norm": 1.5218238830566406, + "learning_rate": 9.741842105263158e-05, + "loss": 0.4832, + "step": 29824 + }, + { + "epoch": 1.6701198342479562, + "grad_norm": 1.2585930824279785, + "learning_rate": 9.741815789473684e-05, + "loss": 0.6058, + "step": 29825 + }, + { + "epoch": 1.6701758315600852, + "grad_norm": 1.4107081890106201, + "learning_rate": 9.74178947368421e-05, + "loss": 0.4306, + "step": 29826 + }, + { + "epoch": 1.6702318288722142, + "grad_norm": 1.1761633157730103, + "learning_rate": 9.741763157894738e-05, + "loss": 0.4319, + "step": 29827 + }, + { + "epoch": 1.6702878261843432, + "grad_norm": 1.3887361288070679, + "learning_rate": 
9.741736842105264e-05, + "loss": 0.6162, + "step": 29828 + }, + { + "epoch": 1.6703438234964723, + "grad_norm": 1.3429697751998901, + "learning_rate": 9.741710526315791e-05, + "loss": 0.4642, + "step": 29829 + }, + { + "epoch": 1.6703998208086013, + "grad_norm": 1.3777918815612793, + "learning_rate": 9.741684210526316e-05, + "loss": 0.5079, + "step": 29830 + }, + { + "epoch": 1.6704558181207303, + "grad_norm": 1.3541017770767212, + "learning_rate": 9.741657894736843e-05, + "loss": 0.513, + "step": 29831 + }, + { + "epoch": 1.6705118154328593, + "grad_norm": 1.4516394138336182, + "learning_rate": 9.741631578947369e-05, + "loss": 0.4262, + "step": 29832 + }, + { + "epoch": 1.6705678127449883, + "grad_norm": 1.2368522882461548, + "learning_rate": 9.741605263157895e-05, + "loss": 0.4667, + "step": 29833 + }, + { + "epoch": 1.6706238100571174, + "grad_norm": 1.223478078842163, + "learning_rate": 9.741578947368422e-05, + "loss": 0.4711, + "step": 29834 + }, + { + "epoch": 1.6706798073692464, + "grad_norm": 1.3523377180099487, + "learning_rate": 9.741552631578947e-05, + "loss": 0.4679, + "step": 29835 + }, + { + "epoch": 1.6707358046813754, + "grad_norm": 1.3053621053695679, + "learning_rate": 9.741526315789474e-05, + "loss": 0.3716, + "step": 29836 + }, + { + "epoch": 1.6707918019935044, + "grad_norm": 2.675323247909546, + "learning_rate": 9.7415e-05, + "loss": 0.3351, + "step": 29837 + }, + { + "epoch": 1.6708477993056334, + "grad_norm": 1.5996848344802856, + "learning_rate": 9.741473684210527e-05, + "loss": 0.3857, + "step": 29838 + }, + { + "epoch": 1.6709037966177624, + "grad_norm": 1.1878904104232788, + "learning_rate": 9.741447368421052e-05, + "loss": 0.4158, + "step": 29839 + }, + { + "epoch": 1.6709597939298915, + "grad_norm": 1.2107948064804077, + "learning_rate": 9.74142105263158e-05, + "loss": 0.4259, + "step": 29840 + }, + { + "epoch": 1.6710157912420205, + "grad_norm": 2.0347845554351807, + "learning_rate": 9.741394736842105e-05, + "loss": 0.4988, + "step": 
29841 + }, + { + "epoch": 1.6710717885541495, + "grad_norm": 1.2873926162719727, + "learning_rate": 9.741368421052633e-05, + "loss": 0.4446, + "step": 29842 + }, + { + "epoch": 1.6711277858662785, + "grad_norm": 1.201278805732727, + "learning_rate": 9.741342105263159e-05, + "loss": 0.4185, + "step": 29843 + }, + { + "epoch": 1.6711837831784075, + "grad_norm": 1.1728787422180176, + "learning_rate": 9.741315789473685e-05, + "loss": 0.3828, + "step": 29844 + }, + { + "epoch": 1.6712397804905366, + "grad_norm": 1.3575496673583984, + "learning_rate": 9.74128947368421e-05, + "loss": 0.4228, + "step": 29845 + }, + { + "epoch": 1.6712957778026656, + "grad_norm": 1.3685311079025269, + "learning_rate": 9.741263157894738e-05, + "loss": 0.4682, + "step": 29846 + }, + { + "epoch": 1.6713517751147946, + "grad_norm": 1.4752414226531982, + "learning_rate": 9.741236842105264e-05, + "loss": 0.3884, + "step": 29847 + }, + { + "epoch": 1.6714077724269236, + "grad_norm": 1.190846562385559, + "learning_rate": 9.74121052631579e-05, + "loss": 0.4626, + "step": 29848 + }, + { + "epoch": 1.6714637697390526, + "grad_norm": 1.1097655296325684, + "learning_rate": 9.741184210526316e-05, + "loss": 0.414, + "step": 29849 + }, + { + "epoch": 1.6715197670511817, + "grad_norm": 1.0734790563583374, + "learning_rate": 9.741157894736842e-05, + "loss": 0.4411, + "step": 29850 + }, + { + "epoch": 1.6715757643633107, + "grad_norm": 1.2061095237731934, + "learning_rate": 9.741131578947369e-05, + "loss": 0.4699, + "step": 29851 + }, + { + "epoch": 1.6716317616754397, + "grad_norm": 2.0681514739990234, + "learning_rate": 9.741105263157895e-05, + "loss": 0.4441, + "step": 29852 + }, + { + "epoch": 1.6716877589875687, + "grad_norm": 1.153032660484314, + "learning_rate": 9.741078947368421e-05, + "loss": 0.4403, + "step": 29853 + }, + { + "epoch": 1.6717437562996977, + "grad_norm": 1.0701205730438232, + "learning_rate": 9.741052631578947e-05, + "loss": 0.3503, + "step": 29854 + }, + { + "epoch": 
1.6717997536118268, + "grad_norm": 1.1991156339645386, + "learning_rate": 9.741026315789474e-05, + "loss": 0.4324, + "step": 29855 + }, + { + "epoch": 1.6718557509239558, + "grad_norm": 1.3591337203979492, + "learning_rate": 9.741e-05, + "loss": 0.5427, + "step": 29856 + }, + { + "epoch": 1.6719117482360848, + "grad_norm": 1.1301558017730713, + "learning_rate": 9.740973684210526e-05, + "loss": 0.4113, + "step": 29857 + }, + { + "epoch": 1.6719677455482138, + "grad_norm": 1.305549144744873, + "learning_rate": 9.740947368421052e-05, + "loss": 0.5043, + "step": 29858 + }, + { + "epoch": 1.6720237428603428, + "grad_norm": 3.242582321166992, + "learning_rate": 9.74092105263158e-05, + "loss": 0.5073, + "step": 29859 + }, + { + "epoch": 1.6720797401724719, + "grad_norm": 1.270106315612793, + "learning_rate": 9.740894736842106e-05, + "loss": 0.3616, + "step": 29860 + }, + { + "epoch": 1.6721357374846009, + "grad_norm": 1.0806063413619995, + "learning_rate": 9.740868421052633e-05, + "loss": 0.4177, + "step": 29861 + }, + { + "epoch": 1.67219173479673, + "grad_norm": 1.2932080030441284, + "learning_rate": 9.740842105263158e-05, + "loss": 0.4994, + "step": 29862 + }, + { + "epoch": 1.672247732108859, + "grad_norm": 1.2522165775299072, + "learning_rate": 9.740815789473685e-05, + "loss": 0.3813, + "step": 29863 + }, + { + "epoch": 1.672303729420988, + "grad_norm": 1.3221081495285034, + "learning_rate": 9.740789473684211e-05, + "loss": 0.5046, + "step": 29864 + }, + { + "epoch": 1.672359726733117, + "grad_norm": 3.6724979877471924, + "learning_rate": 9.740763157894738e-05, + "loss": 0.4389, + "step": 29865 + }, + { + "epoch": 1.672415724045246, + "grad_norm": 1.2772105932235718, + "learning_rate": 9.740736842105264e-05, + "loss": 0.4686, + "step": 29866 + }, + { + "epoch": 1.672471721357375, + "grad_norm": 1.2992759943008423, + "learning_rate": 9.740710526315789e-05, + "loss": 0.5575, + "step": 29867 + }, + { + "epoch": 1.672527718669504, + "grad_norm": 1.3870601654052734, + 
"learning_rate": 9.740684210526316e-05, + "loss": 0.4174, + "step": 29868 + }, + { + "epoch": 1.672583715981633, + "grad_norm": 1.2118297815322876, + "learning_rate": 9.740657894736842e-05, + "loss": 0.4362, + "step": 29869 + }, + { + "epoch": 1.672639713293762, + "grad_norm": 1.3227561712265015, + "learning_rate": 9.74063157894737e-05, + "loss": 0.3491, + "step": 29870 + }, + { + "epoch": 1.672695710605891, + "grad_norm": 1.1811689138412476, + "learning_rate": 9.740605263157895e-05, + "loss": 0.5631, + "step": 29871 + }, + { + "epoch": 1.67275170791802, + "grad_norm": 1.6219584941864014, + "learning_rate": 9.740578947368421e-05, + "loss": 0.6717, + "step": 29872 + }, + { + "epoch": 1.672807705230149, + "grad_norm": 1.3326748609542847, + "learning_rate": 9.740552631578947e-05, + "loss": 0.4784, + "step": 29873 + }, + { + "epoch": 1.6728637025422781, + "grad_norm": 1.8884485960006714, + "learning_rate": 9.740526315789475e-05, + "loss": 0.3688, + "step": 29874 + }, + { + "epoch": 1.6729196998544071, + "grad_norm": 1.4678939580917358, + "learning_rate": 9.7405e-05, + "loss": 0.4771, + "step": 29875 + }, + { + "epoch": 1.6729756971665362, + "grad_norm": 1.3649877309799194, + "learning_rate": 9.740473684210527e-05, + "loss": 0.4531, + "step": 29876 + }, + { + "epoch": 1.6730316944786652, + "grad_norm": 1.416639804840088, + "learning_rate": 9.740447368421053e-05, + "loss": 0.7262, + "step": 29877 + }, + { + "epoch": 1.6730876917907942, + "grad_norm": 1.2157752513885498, + "learning_rate": 9.74042105263158e-05, + "loss": 0.4003, + "step": 29878 + }, + { + "epoch": 1.6731436891029232, + "grad_norm": 1.248354196548462, + "learning_rate": 9.740394736842106e-05, + "loss": 0.5036, + "step": 29879 + }, + { + "epoch": 1.6731996864150522, + "grad_norm": 1.21266770362854, + "learning_rate": 9.740368421052632e-05, + "loss": 0.432, + "step": 29880 + }, + { + "epoch": 1.6732556837271813, + "grad_norm": 1.730705976486206, + "learning_rate": 9.740342105263158e-05, + "loss": 0.4821, + 
"step": 29881 + }, + { + "epoch": 1.6733116810393103, + "grad_norm": 1.1938798427581787, + "learning_rate": 9.740315789473685e-05, + "loss": 0.4414, + "step": 29882 + }, + { + "epoch": 1.6733676783514393, + "grad_norm": 7.105051040649414, + "learning_rate": 9.740289473684211e-05, + "loss": 0.4072, + "step": 29883 + }, + { + "epoch": 1.6734236756635683, + "grad_norm": 1.474406123161316, + "learning_rate": 9.740263157894737e-05, + "loss": 0.5356, + "step": 29884 + }, + { + "epoch": 1.6734796729756973, + "grad_norm": 1.5218783617019653, + "learning_rate": 9.740236842105263e-05, + "loss": 0.5101, + "step": 29885 + }, + { + "epoch": 1.6735356702878263, + "grad_norm": 1.516426682472229, + "learning_rate": 9.740210526315789e-05, + "loss": 0.5078, + "step": 29886 + }, + { + "epoch": 1.6735916675999554, + "grad_norm": 1.0792291164398193, + "learning_rate": 9.740184210526316e-05, + "loss": 0.4085, + "step": 29887 + }, + { + "epoch": 1.6736476649120842, + "grad_norm": 1.207808256149292, + "learning_rate": 9.740157894736842e-05, + "loss": 0.3622, + "step": 29888 + }, + { + "epoch": 1.6737036622242132, + "grad_norm": 1.3278062343597412, + "learning_rate": 9.74013157894737e-05, + "loss": 0.4181, + "step": 29889 + }, + { + "epoch": 1.6737596595363422, + "grad_norm": 1.1830133199691772, + "learning_rate": 9.740105263157894e-05, + "loss": 0.3895, + "step": 29890 + }, + { + "epoch": 1.6738156568484712, + "grad_norm": 1.3510818481445312, + "learning_rate": 9.740078947368422e-05, + "loss": 0.3787, + "step": 29891 + }, + { + "epoch": 1.6738716541606002, + "grad_norm": 1.8418278694152832, + "learning_rate": 9.740052631578948e-05, + "loss": 0.4635, + "step": 29892 + }, + { + "epoch": 1.6739276514727293, + "grad_norm": 1.1231051683425903, + "learning_rate": 9.740026315789475e-05, + "loss": 0.417, + "step": 29893 + }, + { + "epoch": 1.6739836487848583, + "grad_norm": 1.301915168762207, + "learning_rate": 9.74e-05, + "loss": 0.3922, + "step": 29894 + }, + { + "epoch": 1.6740396460969873, + 
"grad_norm": 1.3169394731521606, + "learning_rate": 9.739973684210527e-05, + "loss": 0.372, + "step": 29895 + }, + { + "epoch": 1.6740956434091163, + "grad_norm": 1.2419829368591309, + "learning_rate": 9.739947368421053e-05, + "loss": 0.4359, + "step": 29896 + }, + { + "epoch": 1.6741516407212453, + "grad_norm": 1.370413899421692, + "learning_rate": 9.73992105263158e-05, + "loss": 0.4605, + "step": 29897 + }, + { + "epoch": 1.6742076380333744, + "grad_norm": 1.1843878030776978, + "learning_rate": 9.739894736842106e-05, + "loss": 0.4267, + "step": 29898 + }, + { + "epoch": 1.6742636353455034, + "grad_norm": 1.4774214029312134, + "learning_rate": 9.739868421052632e-05, + "loss": 0.4291, + "step": 29899 + }, + { + "epoch": 1.6743196326576324, + "grad_norm": 1.1830426454544067, + "learning_rate": 9.739842105263158e-05, + "loss": 0.3761, + "step": 29900 + }, + { + "epoch": 1.6743756299697614, + "grad_norm": 1.2707479000091553, + "learning_rate": 9.739815789473685e-05, + "loss": 0.3663, + "step": 29901 + }, + { + "epoch": 1.6744316272818904, + "grad_norm": 1.1479617357254028, + "learning_rate": 9.739789473684211e-05, + "loss": 0.376, + "step": 29902 + }, + { + "epoch": 1.6744876245940195, + "grad_norm": 1.423410415649414, + "learning_rate": 9.739763157894737e-05, + "loss": 0.5771, + "step": 29903 + }, + { + "epoch": 1.6745436219061485, + "grad_norm": 1.1610666513442993, + "learning_rate": 9.739736842105263e-05, + "loss": 0.4341, + "step": 29904 + }, + { + "epoch": 1.6745996192182775, + "grad_norm": 1.1887913942337036, + "learning_rate": 9.73971052631579e-05, + "loss": 0.5333, + "step": 29905 + }, + { + "epoch": 1.6746556165304065, + "grad_norm": 1.1377102136611938, + "learning_rate": 9.739684210526317e-05, + "loss": 0.4063, + "step": 29906 + }, + { + "epoch": 1.6747116138425355, + "grad_norm": 1.2968591451644897, + "learning_rate": 9.739657894736843e-05, + "loss": 0.5483, + "step": 29907 + }, + { + "epoch": 1.6747676111546645, + "grad_norm": 1.3600558042526245, + 
"learning_rate": 9.739631578947369e-05, + "loss": 0.4918, + "step": 29908 + }, + { + "epoch": 1.6748236084667936, + "grad_norm": 1.417197585105896, + "learning_rate": 9.739605263157895e-05, + "loss": 0.4186, + "step": 29909 + }, + { + "epoch": 1.6748796057789226, + "grad_norm": 1.2363240718841553, + "learning_rate": 9.739578947368422e-05, + "loss": 0.452, + "step": 29910 + }, + { + "epoch": 1.6749356030910516, + "grad_norm": 1.2443267107009888, + "learning_rate": 9.739552631578948e-05, + "loss": 0.5973, + "step": 29911 + }, + { + "epoch": 1.6749916004031806, + "grad_norm": 1.374777913093567, + "learning_rate": 9.739526315789474e-05, + "loss": 0.5798, + "step": 29912 + }, + { + "epoch": 1.6750475977153096, + "grad_norm": 1.2771215438842773, + "learning_rate": 9.7395e-05, + "loss": 0.4863, + "step": 29913 + }, + { + "epoch": 1.6751035950274387, + "grad_norm": 1.2088505029678345, + "learning_rate": 9.739473684210527e-05, + "loss": 0.5217, + "step": 29914 + }, + { + "epoch": 1.6751595923395677, + "grad_norm": 1.2444523572921753, + "learning_rate": 9.739447368421053e-05, + "loss": 0.3946, + "step": 29915 + }, + { + "epoch": 1.6752155896516967, + "grad_norm": 1.221674919128418, + "learning_rate": 9.73942105263158e-05, + "loss": 0.4118, + "step": 29916 + }, + { + "epoch": 1.6752715869638257, + "grad_norm": 1.1710373163223267, + "learning_rate": 9.739394736842105e-05, + "loss": 0.4843, + "step": 29917 + }, + { + "epoch": 1.6753275842759547, + "grad_norm": 1.4426542520523071, + "learning_rate": 9.739368421052632e-05, + "loss": 0.3877, + "step": 29918 + }, + { + "epoch": 1.6753835815880838, + "grad_norm": 1.4047588109970093, + "learning_rate": 9.739342105263158e-05, + "loss": 0.5278, + "step": 29919 + }, + { + "epoch": 1.6754395789002128, + "grad_norm": 1.557431697845459, + "learning_rate": 9.739315789473684e-05, + "loss": 0.3851, + "step": 29920 + }, + { + "epoch": 1.6754955762123418, + "grad_norm": 1.5626405477523804, + "learning_rate": 9.739289473684212e-05, + "loss": 
0.5033, + "step": 29921 + }, + { + "epoch": 1.6755515735244708, + "grad_norm": 1.4701859951019287, + "learning_rate": 9.739263157894736e-05, + "loss": 0.4843, + "step": 29922 + }, + { + "epoch": 1.6756075708365998, + "grad_norm": 1.4334521293640137, + "learning_rate": 9.739236842105264e-05, + "loss": 0.5705, + "step": 29923 + }, + { + "epoch": 1.6756635681487289, + "grad_norm": 1.1340359449386597, + "learning_rate": 9.73921052631579e-05, + "loss": 0.3748, + "step": 29924 + }, + { + "epoch": 1.6757195654608579, + "grad_norm": 1.2505853176116943, + "learning_rate": 9.739184210526317e-05, + "loss": 0.4332, + "step": 29925 + }, + { + "epoch": 1.675775562772987, + "grad_norm": 1.6236600875854492, + "learning_rate": 9.739157894736843e-05, + "loss": 0.5216, + "step": 29926 + }, + { + "epoch": 1.675831560085116, + "grad_norm": 1.5575350522994995, + "learning_rate": 9.739131578947369e-05, + "loss": 0.6268, + "step": 29927 + }, + { + "epoch": 1.675887557397245, + "grad_norm": 1.2122241258621216, + "learning_rate": 9.739105263157895e-05, + "loss": 0.3967, + "step": 29928 + }, + { + "epoch": 1.675943554709374, + "grad_norm": 1.1989843845367432, + "learning_rate": 9.739078947368422e-05, + "loss": 0.4996, + "step": 29929 + }, + { + "epoch": 1.675999552021503, + "grad_norm": 1.2929365634918213, + "learning_rate": 9.739052631578948e-05, + "loss": 0.4767, + "step": 29930 + }, + { + "epoch": 1.676055549333632, + "grad_norm": 1.5503559112548828, + "learning_rate": 9.739026315789474e-05, + "loss": 0.4401, + "step": 29931 + }, + { + "epoch": 1.676111546645761, + "grad_norm": 1.1933990716934204, + "learning_rate": 9.739e-05, + "loss": 0.4673, + "step": 29932 + }, + { + "epoch": 1.67616754395789, + "grad_norm": 1.1944936513900757, + "learning_rate": 9.738973684210527e-05, + "loss": 0.3465, + "step": 29933 + }, + { + "epoch": 1.676223541270019, + "grad_norm": 1.291595697402954, + "learning_rate": 9.738947368421053e-05, + "loss": 0.4419, + "step": 29934 + }, + { + "epoch": 
1.676279538582148, + "grad_norm": 1.2692008018493652, + "learning_rate": 9.73892105263158e-05, + "loss": 0.4615, + "step": 29935 + }, + { + "epoch": 1.676335535894277, + "grad_norm": 1.3068135976791382, + "learning_rate": 9.738894736842105e-05, + "loss": 0.5755, + "step": 29936 + }, + { + "epoch": 1.676391533206406, + "grad_norm": 1.5347583293914795, + "learning_rate": 9.738868421052631e-05, + "loss": 0.4634, + "step": 29937 + }, + { + "epoch": 1.6764475305185351, + "grad_norm": 1.3170474767684937, + "learning_rate": 9.738842105263159e-05, + "loss": 0.4509, + "step": 29938 + }, + { + "epoch": 1.6765035278306641, + "grad_norm": 1.2580183744430542, + "learning_rate": 9.738815789473685e-05, + "loss": 0.3957, + "step": 29939 + }, + { + "epoch": 1.6765595251427932, + "grad_norm": 1.0235823392868042, + "learning_rate": 9.73878947368421e-05, + "loss": 0.2971, + "step": 29940 + }, + { + "epoch": 1.6766155224549222, + "grad_norm": 1.274720311164856, + "learning_rate": 9.738763157894737e-05, + "loss": 0.401, + "step": 29941 + }, + { + "epoch": 1.6766715197670512, + "grad_norm": 1.6044840812683105, + "learning_rate": 9.738736842105264e-05, + "loss": 0.7239, + "step": 29942 + }, + { + "epoch": 1.6767275170791802, + "grad_norm": 1.2107619047164917, + "learning_rate": 9.73871052631579e-05, + "loss": 0.4301, + "step": 29943 + }, + { + "epoch": 1.6767835143913092, + "grad_norm": 1.5246989727020264, + "learning_rate": 9.738684210526317e-05, + "loss": 0.5253, + "step": 29944 + }, + { + "epoch": 1.6768395117034383, + "grad_norm": 1.3754037618637085, + "learning_rate": 9.738657894736842e-05, + "loss": 0.4578, + "step": 29945 + }, + { + "epoch": 1.6768955090155673, + "grad_norm": 1.1871083974838257, + "learning_rate": 9.738631578947369e-05, + "loss": 0.3561, + "step": 29946 + }, + { + "epoch": 1.6769515063276963, + "grad_norm": 1.3134440183639526, + "learning_rate": 9.738605263157895e-05, + "loss": 0.5167, + "step": 29947 + }, + { + "epoch": 1.6770075036398253, + "grad_norm": 
1.5911232233047485, + "learning_rate": 9.738578947368422e-05, + "loss": 0.4224, + "step": 29948 + }, + { + "epoch": 1.6770635009519543, + "grad_norm": 1.1805120706558228, + "learning_rate": 9.738552631578947e-05, + "loss": 0.4332, + "step": 29949 + }, + { + "epoch": 1.6771194982640834, + "grad_norm": 1.2180368900299072, + "learning_rate": 9.738526315789474e-05, + "loss": 0.4843, + "step": 29950 + }, + { + "epoch": 1.6771754955762124, + "grad_norm": 0.9737374782562256, + "learning_rate": 9.7385e-05, + "loss": 0.3797, + "step": 29951 + }, + { + "epoch": 1.6772314928883414, + "grad_norm": 1.3471542596817017, + "learning_rate": 9.738473684210528e-05, + "loss": 0.5777, + "step": 29952 + }, + { + "epoch": 1.6772874902004704, + "grad_norm": 1.407422661781311, + "learning_rate": 9.738447368421054e-05, + "loss": 0.4742, + "step": 29953 + }, + { + "epoch": 1.6773434875125994, + "grad_norm": 1.4307019710540771, + "learning_rate": 9.738421052631578e-05, + "loss": 0.4776, + "step": 29954 + }, + { + "epoch": 1.6773994848247284, + "grad_norm": 1.4423656463623047, + "learning_rate": 9.738394736842106e-05, + "loss": 0.6163, + "step": 29955 + }, + { + "epoch": 1.6774554821368575, + "grad_norm": 1.8022441864013672, + "learning_rate": 9.738368421052632e-05, + "loss": 0.4509, + "step": 29956 + }, + { + "epoch": 1.6775114794489865, + "grad_norm": 1.2700351476669312, + "learning_rate": 9.738342105263159e-05, + "loss": 0.4093, + "step": 29957 + }, + { + "epoch": 1.6775674767611155, + "grad_norm": 1.5666680335998535, + "learning_rate": 9.738315789473685e-05, + "loss": 0.5372, + "step": 29958 + }, + { + "epoch": 1.6776234740732445, + "grad_norm": 1.3759901523590088, + "learning_rate": 9.738289473684211e-05, + "loss": 0.4121, + "step": 29959 + }, + { + "epoch": 1.6776794713853735, + "grad_norm": 1.5998016595840454, + "learning_rate": 9.738263157894737e-05, + "loss": 0.5589, + "step": 29960 + }, + { + "epoch": 1.6777354686975026, + "grad_norm": 1.2518067359924316, + "learning_rate": 
9.738236842105264e-05, + "loss": 0.3976, + "step": 29961 + }, + { + "epoch": 1.6777914660096316, + "grad_norm": 1.2326905727386475, + "learning_rate": 9.73821052631579e-05, + "loss": 0.4092, + "step": 29962 + }, + { + "epoch": 1.6778474633217606, + "grad_norm": 1.0324153900146484, + "learning_rate": 9.738184210526316e-05, + "loss": 0.433, + "step": 29963 + }, + { + "epoch": 1.6779034606338896, + "grad_norm": 1.4028562307357788, + "learning_rate": 9.738157894736842e-05, + "loss": 0.5518, + "step": 29964 + }, + { + "epoch": 1.6779594579460186, + "grad_norm": 1.1557691097259521, + "learning_rate": 9.73813157894737e-05, + "loss": 0.4234, + "step": 29965 + }, + { + "epoch": 1.6780154552581477, + "grad_norm": 1.2471976280212402, + "learning_rate": 9.738105263157895e-05, + "loss": 0.4519, + "step": 29966 + }, + { + "epoch": 1.6780714525702767, + "grad_norm": 1.1806623935699463, + "learning_rate": 9.738078947368421e-05, + "loss": 0.4599, + "step": 29967 + }, + { + "epoch": 1.6781274498824057, + "grad_norm": 1.3362605571746826, + "learning_rate": 9.738052631578947e-05, + "loss": 0.6427, + "step": 29968 + }, + { + "epoch": 1.6781834471945347, + "grad_norm": 1.3070813417434692, + "learning_rate": 9.738026315789475e-05, + "loss": 0.5511, + "step": 29969 + }, + { + "epoch": 1.6782394445066637, + "grad_norm": 1.2259621620178223, + "learning_rate": 9.738e-05, + "loss": 0.3998, + "step": 29970 + }, + { + "epoch": 1.6782954418187925, + "grad_norm": 1.2757374048233032, + "learning_rate": 9.737973684210527e-05, + "loss": 0.4778, + "step": 29971 + }, + { + "epoch": 1.6783514391309216, + "grad_norm": 1.2138557434082031, + "learning_rate": 9.737947368421053e-05, + "loss": 0.3277, + "step": 29972 + }, + { + "epoch": 1.6784074364430506, + "grad_norm": 1.3146134614944458, + "learning_rate": 9.737921052631579e-05, + "loss": 0.3642, + "step": 29973 + }, + { + "epoch": 1.6784634337551796, + "grad_norm": 1.1987791061401367, + "learning_rate": 9.737894736842106e-05, + "loss": 0.4592, + "step": 
29974 + }, + { + "epoch": 1.6785194310673086, + "grad_norm": 1.5726855993270874, + "learning_rate": 9.737868421052632e-05, + "loss": 0.6088, + "step": 29975 + }, + { + "epoch": 1.6785754283794376, + "grad_norm": 1.094915509223938, + "learning_rate": 9.737842105263159e-05, + "loss": 0.4098, + "step": 29976 + }, + { + "epoch": 1.6786314256915666, + "grad_norm": 1.1610496044158936, + "learning_rate": 9.737815789473684e-05, + "loss": 0.4652, + "step": 29977 + }, + { + "epoch": 1.6786874230036957, + "grad_norm": 1.342151165008545, + "learning_rate": 9.737789473684211e-05, + "loss": 0.392, + "step": 29978 + }, + { + "epoch": 1.6787434203158247, + "grad_norm": 1.1701411008834839, + "learning_rate": 9.737763157894737e-05, + "loss": 0.4119, + "step": 29979 + }, + { + "epoch": 1.6787994176279537, + "grad_norm": 1.7732369899749756, + "learning_rate": 9.737736842105264e-05, + "loss": 0.4794, + "step": 29980 + }, + { + "epoch": 1.6788554149400827, + "grad_norm": 1.4591057300567627, + "learning_rate": 9.73771052631579e-05, + "loss": 0.4294, + "step": 29981 + }, + { + "epoch": 1.6789114122522117, + "grad_norm": 1.2426187992095947, + "learning_rate": 9.737684210526316e-05, + "loss": 0.5808, + "step": 29982 + }, + { + "epoch": 1.6789674095643408, + "grad_norm": 1.490771770477295, + "learning_rate": 9.737657894736842e-05, + "loss": 0.5741, + "step": 29983 + }, + { + "epoch": 1.6790234068764698, + "grad_norm": 1.3071575164794922, + "learning_rate": 9.73763157894737e-05, + "loss": 0.4386, + "step": 29984 + }, + { + "epoch": 1.6790794041885988, + "grad_norm": 1.275038480758667, + "learning_rate": 9.737605263157896e-05, + "loss": 0.3716, + "step": 29985 + }, + { + "epoch": 1.6791354015007278, + "grad_norm": 1.1046198606491089, + "learning_rate": 9.737578947368422e-05, + "loss": 0.4326, + "step": 29986 + }, + { + "epoch": 1.6791913988128568, + "grad_norm": 1.302770972251892, + "learning_rate": 9.737552631578948e-05, + "loss": 0.4648, + "step": 29987 + }, + { + "epoch": 
1.6792473961249859, + "grad_norm": 1.6733739376068115, + "learning_rate": 9.737526315789474e-05, + "loss": 0.4772, + "step": 29988 + }, + { + "epoch": 1.6793033934371149, + "grad_norm": 1.2860265970230103, + "learning_rate": 9.737500000000001e-05, + "loss": 0.312, + "step": 29989 + }, + { + "epoch": 1.679359390749244, + "grad_norm": 1.3980674743652344, + "learning_rate": 9.737473684210527e-05, + "loss": 0.517, + "step": 29990 + }, + { + "epoch": 1.679415388061373, + "grad_norm": 1.5490573644638062, + "learning_rate": 9.737447368421053e-05, + "loss": 0.6049, + "step": 29991 + }, + { + "epoch": 1.679471385373502, + "grad_norm": 1.5674378871917725, + "learning_rate": 9.737421052631579e-05, + "loss": 0.5021, + "step": 29992 + }, + { + "epoch": 1.679527382685631, + "grad_norm": 1.4125771522521973, + "learning_rate": 9.737394736842106e-05, + "loss": 0.4054, + "step": 29993 + }, + { + "epoch": 1.67958337999776, + "grad_norm": 4.377709865570068, + "learning_rate": 9.737368421052632e-05, + "loss": 0.2836, + "step": 29994 + }, + { + "epoch": 1.679639377309889, + "grad_norm": 1.6075434684753418, + "learning_rate": 9.737342105263158e-05, + "loss": 0.3507, + "step": 29995 + }, + { + "epoch": 1.679695374622018, + "grad_norm": 1.4482122659683228, + "learning_rate": 9.737315789473684e-05, + "loss": 0.4288, + "step": 29996 + }, + { + "epoch": 1.679751371934147, + "grad_norm": 1.3585349321365356, + "learning_rate": 9.737289473684211e-05, + "loss": 0.5288, + "step": 29997 + }, + { + "epoch": 1.679807369246276, + "grad_norm": 1.1839277744293213, + "learning_rate": 9.737263157894737e-05, + "loss": 0.4808, + "step": 29998 + }, + { + "epoch": 1.679863366558405, + "grad_norm": 1.5054590702056885, + "learning_rate": 9.737236842105263e-05, + "loss": 0.4966, + "step": 29999 + }, + { + "epoch": 1.679919363870534, + "grad_norm": 1.5555325746536255, + "learning_rate": 9.737210526315789e-05, + "loss": 0.4613, + "step": 30000 + }, + { + "epoch": 1.679975361182663, + "grad_norm": 
1.2994585037231445, + "learning_rate": 9.737184210526317e-05, + "loss": 0.4076, + "step": 30001 + }, + { + "epoch": 1.6800313584947921, + "grad_norm": 1.3233389854431152, + "learning_rate": 9.737157894736843e-05, + "loss": 0.4217, + "step": 30002 + }, + { + "epoch": 1.6800873558069211, + "grad_norm": 1.224224328994751, + "learning_rate": 9.73713157894737e-05, + "loss": 0.3775, + "step": 30003 + }, + { + "epoch": 1.6801433531190502, + "grad_norm": 1.1796660423278809, + "learning_rate": 9.737105263157895e-05, + "loss": 0.4234, + "step": 30004 + }, + { + "epoch": 1.6801993504311792, + "grad_norm": 1.211627721786499, + "learning_rate": 9.73707894736842e-05, + "loss": 0.4587, + "step": 30005 + }, + { + "epoch": 1.6802553477433082, + "grad_norm": 1.2654528617858887, + "learning_rate": 9.737052631578948e-05, + "loss": 0.4753, + "step": 30006 + }, + { + "epoch": 1.6803113450554372, + "grad_norm": 1.498504400253296, + "learning_rate": 9.737026315789474e-05, + "loss": 0.3996, + "step": 30007 + }, + { + "epoch": 1.6803673423675662, + "grad_norm": 1.0283262729644775, + "learning_rate": 9.737000000000001e-05, + "loss": 0.3684, + "step": 30008 + }, + { + "epoch": 1.6804233396796953, + "grad_norm": 1.4832878112792969, + "learning_rate": 9.736973684210526e-05, + "loss": 0.5157, + "step": 30009 + }, + { + "epoch": 1.6804793369918243, + "grad_norm": 1.3012858629226685, + "learning_rate": 9.736947368421053e-05, + "loss": 0.5147, + "step": 30010 + }, + { + "epoch": 1.6805353343039533, + "grad_norm": 1.2735188007354736, + "learning_rate": 9.736921052631579e-05, + "loss": 0.5182, + "step": 30011 + }, + { + "epoch": 1.6805913316160823, + "grad_norm": 1.273616909980774, + "learning_rate": 9.736894736842106e-05, + "loss": 0.4831, + "step": 30012 + }, + { + "epoch": 1.6806473289282113, + "grad_norm": 1.2145116329193115, + "learning_rate": 9.736868421052632e-05, + "loss": 0.4781, + "step": 30013 + }, + { + "epoch": 1.6807033262403404, + "grad_norm": 1.4690943956375122, + "learning_rate": 
9.736842105263158e-05, + "loss": 0.677, + "step": 30014 + }, + { + "epoch": 1.6807593235524694, + "grad_norm": 1.4068059921264648, + "learning_rate": 9.736815789473684e-05, + "loss": 0.4209, + "step": 30015 + }, + { + "epoch": 1.6808153208645984, + "grad_norm": 1.0696946382522583, + "learning_rate": 9.736789473684212e-05, + "loss": 0.4339, + "step": 30016 + }, + { + "epoch": 1.6808713181767274, + "grad_norm": 1.1998757123947144, + "learning_rate": 9.736763157894738e-05, + "loss": 0.4082, + "step": 30017 + }, + { + "epoch": 1.6809273154888564, + "grad_norm": 1.4615795612335205, + "learning_rate": 9.736736842105264e-05, + "loss": 0.4241, + "step": 30018 + }, + { + "epoch": 1.6809833128009855, + "grad_norm": 1.3013535737991333, + "learning_rate": 9.73671052631579e-05, + "loss": 0.4069, + "step": 30019 + }, + { + "epoch": 1.6810393101131145, + "grad_norm": 1.12428617477417, + "learning_rate": 9.736684210526317e-05, + "loss": 0.4252, + "step": 30020 + }, + { + "epoch": 1.6810953074252435, + "grad_norm": 1.2561031579971313, + "learning_rate": 9.736657894736843e-05, + "loss": 0.4285, + "step": 30021 + }, + { + "epoch": 1.6811513047373725, + "grad_norm": 1.296723484992981, + "learning_rate": 9.736631578947369e-05, + "loss": 0.5986, + "step": 30022 + }, + { + "epoch": 1.6812073020495015, + "grad_norm": 1.1818569898605347, + "learning_rate": 9.736605263157895e-05, + "loss": 0.4809, + "step": 30023 + }, + { + "epoch": 1.6812632993616305, + "grad_norm": 1.256476879119873, + "learning_rate": 9.736578947368421e-05, + "loss": 0.4643, + "step": 30024 + }, + { + "epoch": 1.6813192966737596, + "grad_norm": 1.585464596748352, + "learning_rate": 9.736552631578948e-05, + "loss": 0.4811, + "step": 30025 + }, + { + "epoch": 1.6813752939858886, + "grad_norm": 1.2282109260559082, + "learning_rate": 9.736526315789474e-05, + "loss": 0.4674, + "step": 30026 + }, + { + "epoch": 1.6814312912980176, + "grad_norm": 2.262535810470581, + "learning_rate": 9.7365e-05, + "loss": 0.5656, + "step": 
30027 + }, + { + "epoch": 1.6814872886101466, + "grad_norm": 1.455905795097351, + "learning_rate": 9.736473684210526e-05, + "loss": 0.6624, + "step": 30028 + }, + { + "epoch": 1.6815432859222756, + "grad_norm": 1.2129590511322021, + "learning_rate": 9.736447368421053e-05, + "loss": 0.3836, + "step": 30029 + }, + { + "epoch": 1.6815992832344047, + "grad_norm": 1.4859960079193115, + "learning_rate": 9.736421052631579e-05, + "loss": 0.401, + "step": 30030 + }, + { + "epoch": 1.6816552805465337, + "grad_norm": 1.437681794166565, + "learning_rate": 9.736394736842107e-05, + "loss": 0.4562, + "step": 30031 + }, + { + "epoch": 1.6817112778586627, + "grad_norm": 1.2419911623001099, + "learning_rate": 9.736368421052631e-05, + "loss": 0.5159, + "step": 30032 + }, + { + "epoch": 1.6817672751707917, + "grad_norm": 1.2178417444229126, + "learning_rate": 9.736342105263159e-05, + "loss": 0.4272, + "step": 30033 + }, + { + "epoch": 1.6818232724829207, + "grad_norm": 1.9885791540145874, + "learning_rate": 9.736315789473685e-05, + "loss": 0.7109, + "step": 30034 + }, + { + "epoch": 1.6818792697950498, + "grad_norm": 1.2620553970336914, + "learning_rate": 9.736289473684212e-05, + "loss": 0.4715, + "step": 30035 + }, + { + "epoch": 1.6819352671071788, + "grad_norm": 1.548359990119934, + "learning_rate": 9.736263157894738e-05, + "loss": 0.4702, + "step": 30036 + }, + { + "epoch": 1.6819912644193078, + "grad_norm": 2.5032172203063965, + "learning_rate": 9.736236842105264e-05, + "loss": 0.5865, + "step": 30037 + }, + { + "epoch": 1.6820472617314368, + "grad_norm": 1.3945519924163818, + "learning_rate": 9.73621052631579e-05, + "loss": 0.4448, + "step": 30038 + }, + { + "epoch": 1.6821032590435658, + "grad_norm": 1.1375930309295654, + "learning_rate": 9.736184210526317e-05, + "loss": 0.3363, + "step": 30039 + }, + { + "epoch": 1.6821592563556949, + "grad_norm": 1.2704322338104248, + "learning_rate": 9.736157894736843e-05, + "loss": 0.4223, + "step": 30040 + }, + { + "epoch": 
1.6822152536678239, + "grad_norm": 1.4568498134613037, + "learning_rate": 9.736131578947368e-05, + "loss": 0.5237, + "step": 30041 + }, + { + "epoch": 1.682271250979953, + "grad_norm": 1.1758419275283813, + "learning_rate": 9.736105263157895e-05, + "loss": 0.4233, + "step": 30042 + }, + { + "epoch": 1.682327248292082, + "grad_norm": 1.3888391256332397, + "learning_rate": 9.736078947368421e-05, + "loss": 0.4914, + "step": 30043 + }, + { + "epoch": 1.682383245604211, + "grad_norm": 1.6250340938568115, + "learning_rate": 9.736052631578948e-05, + "loss": 0.5952, + "step": 30044 + }, + { + "epoch": 1.68243924291634, + "grad_norm": 2.2349987030029297, + "learning_rate": 9.736026315789474e-05, + "loss": 0.5461, + "step": 30045 + }, + { + "epoch": 1.682495240228469, + "grad_norm": 1.2714875936508179, + "learning_rate": 9.736e-05, + "loss": 0.301, + "step": 30046 + }, + { + "epoch": 1.682551237540598, + "grad_norm": 1.311805009841919, + "learning_rate": 9.735973684210526e-05, + "loss": 0.4269, + "step": 30047 + }, + { + "epoch": 1.682607234852727, + "grad_norm": 1.1518783569335938, + "learning_rate": 9.735947368421054e-05, + "loss": 0.6915, + "step": 30048 + }, + { + "epoch": 1.682663232164856, + "grad_norm": 1.5244580507278442, + "learning_rate": 9.73592105263158e-05, + "loss": 0.599, + "step": 30049 + }, + { + "epoch": 1.682719229476985, + "grad_norm": 1.5039039850234985, + "learning_rate": 9.735894736842106e-05, + "loss": 0.4116, + "step": 30050 + }, + { + "epoch": 1.682775226789114, + "grad_norm": 1.5512877702713013, + "learning_rate": 9.735868421052632e-05, + "loss": 0.4742, + "step": 30051 + }, + { + "epoch": 1.682831224101243, + "grad_norm": 1.1976828575134277, + "learning_rate": 9.735842105263159e-05, + "loss": 0.4459, + "step": 30052 + }, + { + "epoch": 1.682887221413372, + "grad_norm": 1.203157901763916, + "learning_rate": 9.735815789473685e-05, + "loss": 0.393, + "step": 30053 + }, + { + "epoch": 1.6829432187255011, + "grad_norm": 1.2579879760742188, + 
"learning_rate": 9.735789473684211e-05, + "loss": 0.4515, + "step": 30054 + }, + { + "epoch": 1.6829992160376301, + "grad_norm": 1.4593232870101929, + "learning_rate": 9.735763157894737e-05, + "loss": 0.5174, + "step": 30055 + }, + { + "epoch": 1.6830552133497592, + "grad_norm": 1.5238014459609985, + "learning_rate": 9.735736842105264e-05, + "loss": 0.5091, + "step": 30056 + }, + { + "epoch": 1.6831112106618882, + "grad_norm": 1.6702297925949097, + "learning_rate": 9.73571052631579e-05, + "loss": 0.5301, + "step": 30057 + }, + { + "epoch": 1.6831672079740172, + "grad_norm": 1.409104824066162, + "learning_rate": 9.735684210526316e-05, + "loss": 0.5162, + "step": 30058 + }, + { + "epoch": 1.6832232052861462, + "grad_norm": 1.4003217220306396, + "learning_rate": 9.735657894736842e-05, + "loss": 0.4034, + "step": 30059 + }, + { + "epoch": 1.6832792025982752, + "grad_norm": 1.0628284215927124, + "learning_rate": 9.735631578947368e-05, + "loss": 0.3403, + "step": 30060 + }, + { + "epoch": 1.6833351999104043, + "grad_norm": 1.4191914796829224, + "learning_rate": 9.735605263157895e-05, + "loss": 0.4711, + "step": 30061 + }, + { + "epoch": 1.6833911972225333, + "grad_norm": 1.4225420951843262, + "learning_rate": 9.735578947368421e-05, + "loss": 0.4781, + "step": 30062 + }, + { + "epoch": 1.6834471945346623, + "grad_norm": 1.2523552179336548, + "learning_rate": 9.735552631578949e-05, + "loss": 0.3511, + "step": 30063 + }, + { + "epoch": 1.6835031918467913, + "grad_norm": 1.1969324350357056, + "learning_rate": 9.735526315789473e-05, + "loss": 0.4782, + "step": 30064 + }, + { + "epoch": 1.6835591891589203, + "grad_norm": 1.314456820487976, + "learning_rate": 9.7355e-05, + "loss": 0.3676, + "step": 30065 + }, + { + "epoch": 1.6836151864710494, + "grad_norm": 3.1269609928131104, + "learning_rate": 9.735473684210527e-05, + "loss": 0.4836, + "step": 30066 + }, + { + "epoch": 1.6836711837831784, + "grad_norm": 1.3982963562011719, + "learning_rate": 9.735447368421054e-05, + "loss": 
0.5443, + "step": 30067 + }, + { + "epoch": 1.6837271810953074, + "grad_norm": 1.2770054340362549, + "learning_rate": 9.73542105263158e-05, + "loss": 0.3972, + "step": 30068 + }, + { + "epoch": 1.6837831784074364, + "grad_norm": 1.6281217336654663, + "learning_rate": 9.735394736842106e-05, + "loss": 0.4614, + "step": 30069 + }, + { + "epoch": 1.6838391757195654, + "grad_norm": 1.2229751348495483, + "learning_rate": 9.735368421052632e-05, + "loss": 0.617, + "step": 30070 + }, + { + "epoch": 1.6838951730316944, + "grad_norm": 1.2579398155212402, + "learning_rate": 9.735342105263159e-05, + "loss": 0.5079, + "step": 30071 + }, + { + "epoch": 1.6839511703438235, + "grad_norm": 1.5189852714538574, + "learning_rate": 9.735315789473685e-05, + "loss": 0.4688, + "step": 30072 + }, + { + "epoch": 1.6840071676559525, + "grad_norm": 1.2876662015914917, + "learning_rate": 9.735289473684211e-05, + "loss": 0.4895, + "step": 30073 + }, + { + "epoch": 1.6840631649680815, + "grad_norm": 1.3178309202194214, + "learning_rate": 9.735263157894737e-05, + "loss": 0.5764, + "step": 30074 + }, + { + "epoch": 1.6841191622802105, + "grad_norm": 1.5533462762832642, + "learning_rate": 9.735236842105263e-05, + "loss": 0.5602, + "step": 30075 + }, + { + "epoch": 1.6841751595923395, + "grad_norm": 1.2621676921844482, + "learning_rate": 9.73521052631579e-05, + "loss": 0.4298, + "step": 30076 + }, + { + "epoch": 1.6842311569044686, + "grad_norm": 1.3253891468048096, + "learning_rate": 9.735184210526316e-05, + "loss": 0.6581, + "step": 30077 + }, + { + "epoch": 1.6842871542165976, + "grad_norm": 1.3398269414901733, + "learning_rate": 9.735157894736842e-05, + "loss": 0.5109, + "step": 30078 + }, + { + "epoch": 1.6843431515287266, + "grad_norm": 1.0978977680206299, + "learning_rate": 9.735131578947368e-05, + "loss": 0.3186, + "step": 30079 + }, + { + "epoch": 1.6843991488408556, + "grad_norm": 1.2364259958267212, + "learning_rate": 9.735105263157896e-05, + "loss": 0.4888, + "step": 30080 + }, + { + 
"epoch": 1.6844551461529846, + "grad_norm": 1.2050082683563232, + "learning_rate": 9.735078947368422e-05, + "loss": 0.3848, + "step": 30081 + }, + { + "epoch": 1.6845111434651137, + "grad_norm": 1.363664150238037, + "learning_rate": 9.735052631578948e-05, + "loss": 0.3568, + "step": 30082 + }, + { + "epoch": 1.6845671407772427, + "grad_norm": 1.1327582597732544, + "learning_rate": 9.735026315789473e-05, + "loss": 0.3377, + "step": 30083 + }, + { + "epoch": 1.6846231380893717, + "grad_norm": 1.2636672258377075, + "learning_rate": 9.735000000000001e-05, + "loss": 0.3855, + "step": 30084 + }, + { + "epoch": 1.6846791354015007, + "grad_norm": 1.3640165328979492, + "learning_rate": 9.734973684210527e-05, + "loss": 0.3801, + "step": 30085 + }, + { + "epoch": 1.6847351327136297, + "grad_norm": 1.1210427284240723, + "learning_rate": 9.734947368421054e-05, + "loss": 0.4005, + "step": 30086 + }, + { + "epoch": 1.6847911300257588, + "grad_norm": 1.3407551050186157, + "learning_rate": 9.734921052631579e-05, + "loss": 0.5224, + "step": 30087 + }, + { + "epoch": 1.6848471273378878, + "grad_norm": 1.1256792545318604, + "learning_rate": 9.734894736842106e-05, + "loss": 0.3795, + "step": 30088 + }, + { + "epoch": 1.6849031246500168, + "grad_norm": 1.897878885269165, + "learning_rate": 9.734868421052632e-05, + "loss": 0.7088, + "step": 30089 + }, + { + "epoch": 1.6849591219621458, + "grad_norm": 1.8243905305862427, + "learning_rate": 9.73484210526316e-05, + "loss": 0.6018, + "step": 30090 + }, + { + "epoch": 1.6850151192742748, + "grad_norm": 1.3680564165115356, + "learning_rate": 9.734815789473685e-05, + "loss": 0.4472, + "step": 30091 + }, + { + "epoch": 1.6850711165864038, + "grad_norm": 1.147234320640564, + "learning_rate": 9.73478947368421e-05, + "loss": 0.3787, + "step": 30092 + }, + { + "epoch": 1.6851271138985329, + "grad_norm": 1.3384244441986084, + "learning_rate": 9.734763157894737e-05, + "loss": 0.5763, + "step": 30093 + }, + { + "epoch": 1.6851831112106619, + 
"grad_norm": 1.4484529495239258, + "learning_rate": 9.734736842105263e-05, + "loss": 0.5563, + "step": 30094 + }, + { + "epoch": 1.685239108522791, + "grad_norm": 1.1401987075805664, + "learning_rate": 9.73471052631579e-05, + "loss": 0.3952, + "step": 30095 + }, + { + "epoch": 1.68529510583492, + "grad_norm": 1.1800525188446045, + "learning_rate": 9.734684210526315e-05, + "loss": 0.4631, + "step": 30096 + }, + { + "epoch": 1.685351103147049, + "grad_norm": 1.7282980680465698, + "learning_rate": 9.734657894736843e-05, + "loss": 0.6157, + "step": 30097 + }, + { + "epoch": 1.685407100459178, + "grad_norm": 1.2098634243011475, + "learning_rate": 9.734631578947368e-05, + "loss": 0.49, + "step": 30098 + }, + { + "epoch": 1.685463097771307, + "grad_norm": 1.2357712984085083, + "learning_rate": 9.734605263157896e-05, + "loss": 0.4857, + "step": 30099 + }, + { + "epoch": 1.685519095083436, + "grad_norm": 1.075027585029602, + "learning_rate": 9.734578947368422e-05, + "loss": 0.4665, + "step": 30100 + }, + { + "epoch": 1.685575092395565, + "grad_norm": 1.2004560232162476, + "learning_rate": 9.734552631578948e-05, + "loss": 0.5046, + "step": 30101 + }, + { + "epoch": 1.685631089707694, + "grad_norm": 1.4363986253738403, + "learning_rate": 9.734526315789474e-05, + "loss": 0.4104, + "step": 30102 + }, + { + "epoch": 1.685687087019823, + "grad_norm": 1.3236922025680542, + "learning_rate": 9.734500000000001e-05, + "loss": 0.4833, + "step": 30103 + }, + { + "epoch": 1.685743084331952, + "grad_norm": 1.2907474040985107, + "learning_rate": 9.734473684210527e-05, + "loss": 0.5518, + "step": 30104 + }, + { + "epoch": 1.685799081644081, + "grad_norm": 1.906058430671692, + "learning_rate": 9.734447368421053e-05, + "loss": 0.6542, + "step": 30105 + }, + { + "epoch": 1.6858550789562101, + "grad_norm": 1.234045386314392, + "learning_rate": 9.734421052631579e-05, + "loss": 0.3836, + "step": 30106 + }, + { + "epoch": 1.6859110762683391, + "grad_norm": 1.3779826164245605, + "learning_rate": 
9.734394736842106e-05, + "loss": 0.5039, + "step": 30107 + }, + { + "epoch": 1.6859670735804682, + "grad_norm": 1.2399606704711914, + "learning_rate": 9.734368421052632e-05, + "loss": 0.4581, + "step": 30108 + }, + { + "epoch": 1.6860230708925972, + "grad_norm": 1.3214467763900757, + "learning_rate": 9.734342105263158e-05, + "loss": 0.4162, + "step": 30109 + }, + { + "epoch": 1.6860790682047262, + "grad_norm": 1.1875327825546265, + "learning_rate": 9.734315789473684e-05, + "loss": 0.4735, + "step": 30110 + }, + { + "epoch": 1.6861350655168552, + "grad_norm": 1.963280200958252, + "learning_rate": 9.73428947368421e-05, + "loss": 0.6456, + "step": 30111 + }, + { + "epoch": 1.6861910628289842, + "grad_norm": 1.2360131740570068, + "learning_rate": 9.734263157894738e-05, + "loss": 0.5305, + "step": 30112 + }, + { + "epoch": 1.6862470601411133, + "grad_norm": 1.077210545539856, + "learning_rate": 9.734236842105264e-05, + "loss": 0.3636, + "step": 30113 + }, + { + "epoch": 1.6863030574532423, + "grad_norm": 1.512688398361206, + "learning_rate": 9.73421052631579e-05, + "loss": 0.5859, + "step": 30114 + }, + { + "epoch": 1.6863590547653713, + "grad_norm": 2.2748279571533203, + "learning_rate": 9.734184210526315e-05, + "loss": 0.5212, + "step": 30115 + }, + { + "epoch": 1.6864150520775003, + "grad_norm": 1.3436267375946045, + "learning_rate": 9.734157894736843e-05, + "loss": 0.411, + "step": 30116 + }, + { + "epoch": 1.6864710493896293, + "grad_norm": 1.226438045501709, + "learning_rate": 9.734131578947369e-05, + "loss": 0.385, + "step": 30117 + }, + { + "epoch": 1.6865270467017583, + "grad_norm": 1.1846208572387695, + "learning_rate": 9.734105263157896e-05, + "loss": 0.3664, + "step": 30118 + }, + { + "epoch": 1.6865830440138874, + "grad_norm": 1.4303958415985107, + "learning_rate": 9.734078947368421e-05, + "loss": 0.6189, + "step": 30119 + }, + { + "epoch": 1.6866390413260164, + "grad_norm": 1.2793289422988892, + "learning_rate": 9.734052631578948e-05, + "loss": 0.4675, + 
"step": 30120 + }, + { + "epoch": 1.6866950386381454, + "grad_norm": 1.8236483335494995, + "learning_rate": 9.734026315789474e-05, + "loss": 0.5422, + "step": 30121 + }, + { + "epoch": 1.6867510359502744, + "grad_norm": 1.384316086769104, + "learning_rate": 9.734000000000001e-05, + "loss": 0.4779, + "step": 30122 + }, + { + "epoch": 1.6868070332624034, + "grad_norm": 1.2739804983139038, + "learning_rate": 9.733973684210527e-05, + "loss": 0.5885, + "step": 30123 + }, + { + "epoch": 1.6868630305745325, + "grad_norm": 1.3124525547027588, + "learning_rate": 9.733947368421053e-05, + "loss": 0.3851, + "step": 30124 + }, + { + "epoch": 1.6869190278866615, + "grad_norm": 5.500936031341553, + "learning_rate": 9.733921052631579e-05, + "loss": 0.5684, + "step": 30125 + }, + { + "epoch": 1.6869750251987905, + "grad_norm": 1.1165839433670044, + "learning_rate": 9.733894736842105e-05, + "loss": 0.4695, + "step": 30126 + }, + { + "epoch": 1.6870310225109195, + "grad_norm": 17.776029586791992, + "learning_rate": 9.733868421052633e-05, + "loss": 0.4529, + "step": 30127 + }, + { + "epoch": 1.6870870198230485, + "grad_norm": 14.615177154541016, + "learning_rate": 9.733842105263159e-05, + "loss": 0.6338, + "step": 30128 + }, + { + "epoch": 1.6871430171351776, + "grad_norm": 1.2860603332519531, + "learning_rate": 9.733815789473684e-05, + "loss": 0.5732, + "step": 30129 + }, + { + "epoch": 1.6871990144473066, + "grad_norm": 1.4584449529647827, + "learning_rate": 9.73378947368421e-05, + "loss": 0.5255, + "step": 30130 + }, + { + "epoch": 1.6872550117594356, + "grad_norm": 1.2533122301101685, + "learning_rate": 9.733763157894738e-05, + "loss": 0.4487, + "step": 30131 + }, + { + "epoch": 1.6873110090715646, + "grad_norm": 1.1864601373672485, + "learning_rate": 9.733736842105264e-05, + "loss": 0.3937, + "step": 30132 + }, + { + "epoch": 1.6873670063836936, + "grad_norm": 1.6280063390731812, + "learning_rate": 9.73371052631579e-05, + "loss": 0.4634, + "step": 30133 + }, + { + "epoch": 
1.6874230036958227, + "grad_norm": 1.5736427307128906, + "learning_rate": 9.733684210526316e-05, + "loss": 0.4416, + "step": 30134 + }, + { + "epoch": 1.6874790010079517, + "grad_norm": 3.5148539543151855, + "learning_rate": 9.733657894736843e-05, + "loss": 0.578, + "step": 30135 + }, + { + "epoch": 1.6875349983200807, + "grad_norm": 1.5513710975646973, + "learning_rate": 9.733631578947369e-05, + "loss": 0.4843, + "step": 30136 + }, + { + "epoch": 1.6875909956322097, + "grad_norm": 1.3073911666870117, + "learning_rate": 9.733605263157895e-05, + "loss": 0.3689, + "step": 30137 + }, + { + "epoch": 1.6876469929443387, + "grad_norm": 1.873498558998108, + "learning_rate": 9.733578947368421e-05, + "loss": 0.5152, + "step": 30138 + }, + { + "epoch": 1.6877029902564677, + "grad_norm": 1.4693621397018433, + "learning_rate": 9.733552631578948e-05, + "loss": 0.5006, + "step": 30139 + }, + { + "epoch": 1.6877589875685968, + "grad_norm": 1.352457880973816, + "learning_rate": 9.733526315789474e-05, + "loss": 0.5083, + "step": 30140 + }, + { + "epoch": 1.6878149848807258, + "grad_norm": 1.1852613687515259, + "learning_rate": 9.733500000000002e-05, + "loss": 0.3672, + "step": 30141 + }, + { + "epoch": 1.6878709821928548, + "grad_norm": 1.3586770296096802, + "learning_rate": 9.733473684210526e-05, + "loss": 0.5802, + "step": 30142 + }, + { + "epoch": 1.6879269795049838, + "grad_norm": 1.1743924617767334, + "learning_rate": 9.733447368421054e-05, + "loss": 0.3909, + "step": 30143 + }, + { + "epoch": 1.6879829768171128, + "grad_norm": 1.2252435684204102, + "learning_rate": 9.73342105263158e-05, + "loss": 0.3919, + "step": 30144 + }, + { + "epoch": 1.6880389741292419, + "grad_norm": 1.4737225770950317, + "learning_rate": 9.733394736842105e-05, + "loss": 0.4831, + "step": 30145 + }, + { + "epoch": 1.6880949714413709, + "grad_norm": 1.5884093046188354, + "learning_rate": 9.733368421052633e-05, + "loss": 0.3723, + "step": 30146 + }, + { + "epoch": 1.6881509687535, + "grad_norm": 
1.301109790802002, + "learning_rate": 9.733342105263157e-05, + "loss": 0.5235, + "step": 30147 + }, + { + "epoch": 1.688206966065629, + "grad_norm": 1.361692190170288, + "learning_rate": 9.733315789473685e-05, + "loss": 0.5165, + "step": 30148 + }, + { + "epoch": 1.688262963377758, + "grad_norm": 1.2293809652328491, + "learning_rate": 9.733289473684211e-05, + "loss": 0.4645, + "step": 30149 + }, + { + "epoch": 1.688318960689887, + "grad_norm": 1.2032302618026733, + "learning_rate": 9.733263157894738e-05, + "loss": 0.5969, + "step": 30150 + }, + { + "epoch": 1.688374958002016, + "grad_norm": 1.4731817245483398, + "learning_rate": 9.733236842105263e-05, + "loss": 0.4357, + "step": 30151 + }, + { + "epoch": 1.688430955314145, + "grad_norm": 1.2632147073745728, + "learning_rate": 9.73321052631579e-05, + "loss": 0.4602, + "step": 30152 + }, + { + "epoch": 1.688486952626274, + "grad_norm": 1.2067543268203735, + "learning_rate": 9.733184210526316e-05, + "loss": 0.3663, + "step": 30153 + }, + { + "epoch": 1.688542949938403, + "grad_norm": 1.1950796842575073, + "learning_rate": 9.733157894736843e-05, + "loss": 0.359, + "step": 30154 + }, + { + "epoch": 1.688598947250532, + "grad_norm": 1.176719069480896, + "learning_rate": 9.733131578947369e-05, + "loss": 0.4632, + "step": 30155 + }, + { + "epoch": 1.688654944562661, + "grad_norm": 1.5808255672454834, + "learning_rate": 9.733105263157895e-05, + "loss": 0.4925, + "step": 30156 + }, + { + "epoch": 1.68871094187479, + "grad_norm": 1.0797353982925415, + "learning_rate": 9.733078947368421e-05, + "loss": 0.3291, + "step": 30157 + }, + { + "epoch": 1.6887669391869191, + "grad_norm": 1.3204952478408813, + "learning_rate": 9.733052631578949e-05, + "loss": 0.3987, + "step": 30158 + }, + { + "epoch": 1.6888229364990481, + "grad_norm": 1.4973983764648438, + "learning_rate": 9.733026315789475e-05, + "loss": 0.4919, + "step": 30159 + }, + { + "epoch": 1.6888789338111772, + "grad_norm": 1.1468061208724976, + "learning_rate": 9.733e-05, + 
"loss": 0.394, + "step": 30160 + }, + { + "epoch": 1.6889349311233062, + "grad_norm": 1.3484851121902466, + "learning_rate": 9.732973684210526e-05, + "loss": 0.383, + "step": 30161 + }, + { + "epoch": 1.6889909284354352, + "grad_norm": 1.2262699604034424, + "learning_rate": 9.732947368421052e-05, + "loss": 0.4681, + "step": 30162 + }, + { + "epoch": 1.6890469257475642, + "grad_norm": 1.2382701635360718, + "learning_rate": 9.73292105263158e-05, + "loss": 0.3634, + "step": 30163 + }, + { + "epoch": 1.6891029230596932, + "grad_norm": 1.236656904220581, + "learning_rate": 9.732894736842106e-05, + "loss": 0.5847, + "step": 30164 + }, + { + "epoch": 1.6891589203718222, + "grad_norm": 1.5243562459945679, + "learning_rate": 9.732868421052632e-05, + "loss": 0.4452, + "step": 30165 + }, + { + "epoch": 1.6892149176839513, + "grad_norm": 1.3775279521942139, + "learning_rate": 9.732842105263158e-05, + "loss": 0.466, + "step": 30166 + }, + { + "epoch": 1.6892709149960803, + "grad_norm": 1.2446755170822144, + "learning_rate": 9.732815789473685e-05, + "loss": 0.4259, + "step": 30167 + }, + { + "epoch": 1.6893269123082093, + "grad_norm": 1.2202911376953125, + "learning_rate": 9.732789473684211e-05, + "loss": 0.4717, + "step": 30168 + }, + { + "epoch": 1.6893829096203383, + "grad_norm": 1.2155464887619019, + "learning_rate": 9.732763157894737e-05, + "loss": 0.4708, + "step": 30169 + }, + { + "epoch": 1.6894389069324673, + "grad_norm": 1.3304054737091064, + "learning_rate": 9.732736842105263e-05, + "loss": 0.4863, + "step": 30170 + }, + { + "epoch": 1.6894949042445964, + "grad_norm": 1.3756487369537354, + "learning_rate": 9.73271052631579e-05, + "loss": 0.5803, + "step": 30171 + }, + { + "epoch": 1.6895509015567254, + "grad_norm": 1.2009427547454834, + "learning_rate": 9.732684210526316e-05, + "loss": 0.415, + "step": 30172 + }, + { + "epoch": 1.6896068988688544, + "grad_norm": 1.3241255283355713, + "learning_rate": 9.732657894736844e-05, + "loss": 0.5297, + "step": 30173 + }, + { + 
"epoch": 1.6896628961809834, + "grad_norm": 1.2313463687896729, + "learning_rate": 9.732631578947368e-05, + "loss": 0.4625, + "step": 30174 + }, + { + "epoch": 1.6897188934931124, + "grad_norm": 1.45566725730896, + "learning_rate": 9.732605263157896e-05, + "loss": 0.461, + "step": 30175 + }, + { + "epoch": 1.6897748908052415, + "grad_norm": 1.1152372360229492, + "learning_rate": 9.732578947368421e-05, + "loss": 0.4818, + "step": 30176 + }, + { + "epoch": 1.6898308881173705, + "grad_norm": 1.3938652276992798, + "learning_rate": 9.732552631578949e-05, + "loss": 0.3975, + "step": 30177 + }, + { + "epoch": 1.6898868854294995, + "grad_norm": 1.3016711473464966, + "learning_rate": 9.732526315789475e-05, + "loss": 0.4593, + "step": 30178 + }, + { + "epoch": 1.6899428827416285, + "grad_norm": 1.3438842296600342, + "learning_rate": 9.7325e-05, + "loss": 0.4991, + "step": 30179 + }, + { + "epoch": 1.6899988800537575, + "grad_norm": 1.6723095178604126, + "learning_rate": 9.732473684210527e-05, + "loss": 0.4055, + "step": 30180 + }, + { + "epoch": 1.6900548773658866, + "grad_norm": 1.1403666734695435, + "learning_rate": 9.732447368421053e-05, + "loss": 0.4457, + "step": 30181 + }, + { + "epoch": 1.6901108746780156, + "grad_norm": 1.7278417348861694, + "learning_rate": 9.73242105263158e-05, + "loss": 0.6594, + "step": 30182 + }, + { + "epoch": 1.6901668719901446, + "grad_norm": 1.2462953329086304, + "learning_rate": 9.732394736842106e-05, + "loss": 0.3927, + "step": 30183 + }, + { + "epoch": 1.6902228693022736, + "grad_norm": 1.1908814907073975, + "learning_rate": 9.732368421052632e-05, + "loss": 0.4063, + "step": 30184 + }, + { + "epoch": 1.6902788666144026, + "grad_norm": 1.24465811252594, + "learning_rate": 9.732342105263158e-05, + "loss": 0.4597, + "step": 30185 + }, + { + "epoch": 1.6903348639265316, + "grad_norm": 1.2822506427764893, + "learning_rate": 9.732315789473685e-05, + "loss": 0.3647, + "step": 30186 + }, + { + "epoch": 1.6903908612386607, + "grad_norm": 
1.6926192045211792, + "learning_rate": 9.732289473684211e-05, + "loss": 0.5835, + "step": 30187 + }, + { + "epoch": 1.6904468585507897, + "grad_norm": 1.2964590787887573, + "learning_rate": 9.732263157894737e-05, + "loss": 0.4284, + "step": 30188 + }, + { + "epoch": 1.6905028558629187, + "grad_norm": 1.1388252973556519, + "learning_rate": 9.732236842105263e-05, + "loss": 0.5062, + "step": 30189 + }, + { + "epoch": 1.6905588531750477, + "grad_norm": 1.3133678436279297, + "learning_rate": 9.73221052631579e-05, + "loss": 0.4705, + "step": 30190 + }, + { + "epoch": 1.6906148504871767, + "grad_norm": 1.310320496559143, + "learning_rate": 9.732184210526316e-05, + "loss": 0.5138, + "step": 30191 + }, + { + "epoch": 1.6906708477993058, + "grad_norm": 1.4054476022720337, + "learning_rate": 9.732157894736842e-05, + "loss": 0.5325, + "step": 30192 + }, + { + "epoch": 1.6907268451114348, + "grad_norm": 1.578825831413269, + "learning_rate": 9.732131578947368e-05, + "loss": 0.4885, + "step": 30193 + }, + { + "epoch": 1.6907828424235638, + "grad_norm": 1.1037230491638184, + "learning_rate": 9.732105263157896e-05, + "loss": 0.3731, + "step": 30194 + }, + { + "epoch": 1.6908388397356928, + "grad_norm": 1.074224829673767, + "learning_rate": 9.732078947368422e-05, + "loss": 0.4612, + "step": 30195 + }, + { + "epoch": 1.6908948370478218, + "grad_norm": 1.2621155977249146, + "learning_rate": 9.732052631578948e-05, + "loss": 0.41, + "step": 30196 + }, + { + "epoch": 1.6909508343599509, + "grad_norm": 1.3471922874450684, + "learning_rate": 9.732026315789474e-05, + "loss": 0.4656, + "step": 30197 + }, + { + "epoch": 1.6910068316720799, + "grad_norm": 1.1260102987289429, + "learning_rate": 9.732e-05, + "loss": 0.5334, + "step": 30198 + }, + { + "epoch": 1.691062828984209, + "grad_norm": 1.2001433372497559, + "learning_rate": 9.731973684210527e-05, + "loss": 0.4569, + "step": 30199 + }, + { + "epoch": 1.691118826296338, + "grad_norm": 1.347712516784668, + "learning_rate": 
9.731947368421053e-05, + "loss": 0.553, + "step": 30200 + }, + { + "epoch": 1.691174823608467, + "grad_norm": 1.417059302330017, + "learning_rate": 9.731921052631579e-05, + "loss": 0.6363, + "step": 30201 + }, + { + "epoch": 1.691230820920596, + "grad_norm": 1.2156375646591187, + "learning_rate": 9.731894736842105e-05, + "loss": 0.4198, + "step": 30202 + }, + { + "epoch": 1.691286818232725, + "grad_norm": 1.3916971683502197, + "learning_rate": 9.731868421052632e-05, + "loss": 0.4246, + "step": 30203 + }, + { + "epoch": 1.691342815544854, + "grad_norm": 1.130895972251892, + "learning_rate": 9.731842105263158e-05, + "loss": 0.3977, + "step": 30204 + }, + { + "epoch": 1.691398812856983, + "grad_norm": 1.2875492572784424, + "learning_rate": 9.731815789473686e-05, + "loss": 0.5469, + "step": 30205 + }, + { + "epoch": 1.691454810169112, + "grad_norm": 1.2050881385803223, + "learning_rate": 9.73178947368421e-05, + "loss": 0.4667, + "step": 30206 + }, + { + "epoch": 1.691510807481241, + "grad_norm": 1.1337792873382568, + "learning_rate": 9.731763157894737e-05, + "loss": 0.4197, + "step": 30207 + }, + { + "epoch": 1.69156680479337, + "grad_norm": 1.0673487186431885, + "learning_rate": 9.731736842105263e-05, + "loss": 0.3491, + "step": 30208 + }, + { + "epoch": 1.691622802105499, + "grad_norm": 1.392971158027649, + "learning_rate": 9.731710526315791e-05, + "loss": 0.6227, + "step": 30209 + }, + { + "epoch": 1.691678799417628, + "grad_norm": 1.138738989830017, + "learning_rate": 9.731684210526317e-05, + "loss": 0.4073, + "step": 30210 + }, + { + "epoch": 1.6917347967297571, + "grad_norm": 1.3383424282073975, + "learning_rate": 9.731657894736843e-05, + "loss": 0.4484, + "step": 30211 + }, + { + "epoch": 1.6917907940418861, + "grad_norm": 1.3517189025878906, + "learning_rate": 9.731631578947369e-05, + "loss": 0.3765, + "step": 30212 + }, + { + "epoch": 1.6918467913540152, + "grad_norm": 1.1367801427841187, + "learning_rate": 9.731605263157895e-05, + "loss": 0.3876, + "step": 
30213 + }, + { + "epoch": 1.6919027886661442, + "grad_norm": 1.1613292694091797, + "learning_rate": 9.731578947368422e-05, + "loss": 0.4214, + "step": 30214 + }, + { + "epoch": 1.6919587859782732, + "grad_norm": 1.3283835649490356, + "learning_rate": 9.731552631578948e-05, + "loss": 0.382, + "step": 30215 + }, + { + "epoch": 1.6920147832904022, + "grad_norm": 2.5240118503570557, + "learning_rate": 9.731526315789474e-05, + "loss": 0.4319, + "step": 30216 + }, + { + "epoch": 1.6920707806025312, + "grad_norm": 1.573872685432434, + "learning_rate": 9.7315e-05, + "loss": 0.4483, + "step": 30217 + }, + { + "epoch": 1.6921267779146603, + "grad_norm": 1.1742709875106812, + "learning_rate": 9.731473684210527e-05, + "loss": 0.5286, + "step": 30218 + }, + { + "epoch": 1.692182775226789, + "grad_norm": 1.2130649089813232, + "learning_rate": 9.731447368421053e-05, + "loss": 0.4015, + "step": 30219 + }, + { + "epoch": 1.692238772538918, + "grad_norm": 1.6368238925933838, + "learning_rate": 9.731421052631579e-05, + "loss": 0.6805, + "step": 30220 + }, + { + "epoch": 1.692294769851047, + "grad_norm": 1.2728393077850342, + "learning_rate": 9.731394736842105e-05, + "loss": 0.4743, + "step": 30221 + }, + { + "epoch": 1.6923507671631761, + "grad_norm": 1.1846604347229004, + "learning_rate": 9.731368421052632e-05, + "loss": 0.4199, + "step": 30222 + }, + { + "epoch": 1.6924067644753051, + "grad_norm": 1.1958200931549072, + "learning_rate": 9.731342105263158e-05, + "loss": 0.5363, + "step": 30223 + }, + { + "epoch": 1.6924627617874342, + "grad_norm": 1.304478645324707, + "learning_rate": 9.731315789473684e-05, + "loss": 0.5076, + "step": 30224 + }, + { + "epoch": 1.6925187590995632, + "grad_norm": 1.5974457263946533, + "learning_rate": 9.73128947368421e-05, + "loss": 0.5178, + "step": 30225 + }, + { + "epoch": 1.6925747564116922, + "grad_norm": 0.9744231700897217, + "learning_rate": 9.731263157894738e-05, + "loss": 0.4195, + "step": 30226 + }, + { + "epoch": 1.6926307537238212, + 
"grad_norm": 1.188340663909912, + "learning_rate": 9.731236842105264e-05, + "loss": 0.4446, + "step": 30227 + }, + { + "epoch": 1.6926867510359502, + "grad_norm": 1.4804326295852661, + "learning_rate": 9.731210526315791e-05, + "loss": 0.587, + "step": 30228 + }, + { + "epoch": 1.6927427483480793, + "grad_norm": 1.5981879234313965, + "learning_rate": 9.731184210526316e-05, + "loss": 0.4969, + "step": 30229 + }, + { + "epoch": 1.6927987456602083, + "grad_norm": 1.0407193899154663, + "learning_rate": 9.731157894736842e-05, + "loss": 0.365, + "step": 30230 + }, + { + "epoch": 1.6928547429723373, + "grad_norm": 1.2792677879333496, + "learning_rate": 9.731131578947369e-05, + "loss": 0.584, + "step": 30231 + }, + { + "epoch": 1.6929107402844663, + "grad_norm": 1.2894998788833618, + "learning_rate": 9.731105263157895e-05, + "loss": 0.4905, + "step": 30232 + }, + { + "epoch": 1.6929667375965953, + "grad_norm": 1.29068124294281, + "learning_rate": 9.731078947368422e-05, + "loss": 0.4458, + "step": 30233 + }, + { + "epoch": 1.6930227349087243, + "grad_norm": 1.3974711894989014, + "learning_rate": 9.731052631578947e-05, + "loss": 0.5001, + "step": 30234 + }, + { + "epoch": 1.6930787322208534, + "grad_norm": 1.2442253828048706, + "learning_rate": 9.731026315789474e-05, + "loss": 0.2887, + "step": 30235 + }, + { + "epoch": 1.6931347295329824, + "grad_norm": 1.3117127418518066, + "learning_rate": 9.731e-05, + "loss": 0.4423, + "step": 30236 + }, + { + "epoch": 1.6931907268451114, + "grad_norm": 1.3299121856689453, + "learning_rate": 9.730973684210528e-05, + "loss": 0.5066, + "step": 30237 + }, + { + "epoch": 1.6932467241572404, + "grad_norm": 1.3412389755249023, + "learning_rate": 9.730947368421053e-05, + "loss": 0.3981, + "step": 30238 + }, + { + "epoch": 1.6933027214693694, + "grad_norm": 1.9414891004562378, + "learning_rate": 9.73092105263158e-05, + "loss": 0.5383, + "step": 30239 + }, + { + "epoch": 1.6933587187814985, + "grad_norm": 1.3416924476623535, + "learning_rate": 
9.730894736842105e-05, + "loss": 0.3939, + "step": 30240 + }, + { + "epoch": 1.6934147160936275, + "grad_norm": 1.4639008045196533, + "learning_rate": 9.730868421052633e-05, + "loss": 0.4688, + "step": 30241 + }, + { + "epoch": 1.6934707134057565, + "grad_norm": 1.4314970970153809, + "learning_rate": 9.730842105263159e-05, + "loss": 0.4478, + "step": 30242 + }, + { + "epoch": 1.6935267107178855, + "grad_norm": 1.1324588060379028, + "learning_rate": 9.730815789473685e-05, + "loss": 0.4582, + "step": 30243 + }, + { + "epoch": 1.6935827080300145, + "grad_norm": 1.1943910121917725, + "learning_rate": 9.73078947368421e-05, + "loss": 0.3673, + "step": 30244 + }, + { + "epoch": 1.6936387053421436, + "grad_norm": 1.440931797027588, + "learning_rate": 9.730763157894738e-05, + "loss": 0.5169, + "step": 30245 + }, + { + "epoch": 1.6936947026542726, + "grad_norm": 1.5068817138671875, + "learning_rate": 9.730736842105264e-05, + "loss": 0.4397, + "step": 30246 + }, + { + "epoch": 1.6937506999664016, + "grad_norm": 1.0686894655227661, + "learning_rate": 9.73071052631579e-05, + "loss": 0.4375, + "step": 30247 + }, + { + "epoch": 1.6938066972785306, + "grad_norm": 1.2193092107772827, + "learning_rate": 9.730684210526316e-05, + "loss": 0.4532, + "step": 30248 + }, + { + "epoch": 1.6938626945906596, + "grad_norm": 1.2892290353775024, + "learning_rate": 9.730657894736842e-05, + "loss": 0.5527, + "step": 30249 + }, + { + "epoch": 1.6939186919027887, + "grad_norm": 1.4779232740402222, + "learning_rate": 9.730631578947369e-05, + "loss": 0.4601, + "step": 30250 + }, + { + "epoch": 1.6939746892149177, + "grad_norm": 1.5048413276672363, + "learning_rate": 9.730605263157895e-05, + "loss": 0.4469, + "step": 30251 + }, + { + "epoch": 1.6940306865270467, + "grad_norm": 1.171181082725525, + "learning_rate": 9.730578947368421e-05, + "loss": 0.422, + "step": 30252 + }, + { + "epoch": 1.6940866838391757, + "grad_norm": 1.3899632692337036, + "learning_rate": 9.730552631578947e-05, + "loss": 0.378, + 
"step": 30253 + }, + { + "epoch": 1.6941426811513047, + "grad_norm": 1.1190667152404785, + "learning_rate": 9.730526315789474e-05, + "loss": 0.3367, + "step": 30254 + }, + { + "epoch": 1.6941986784634337, + "grad_norm": 1.3638272285461426, + "learning_rate": 9.7305e-05, + "loss": 0.5276, + "step": 30255 + }, + { + "epoch": 1.6942546757755628, + "grad_norm": 1.307783603668213, + "learning_rate": 9.730473684210526e-05, + "loss": 0.5087, + "step": 30256 + }, + { + "epoch": 1.6943106730876918, + "grad_norm": 1.137140154838562, + "learning_rate": 9.730447368421052e-05, + "loss": 0.4028, + "step": 30257 + }, + { + "epoch": 1.6943666703998208, + "grad_norm": 1.8036515712738037, + "learning_rate": 9.73042105263158e-05, + "loss": 0.5313, + "step": 30258 + }, + { + "epoch": 1.6944226677119498, + "grad_norm": 1.3460479974746704, + "learning_rate": 9.730394736842106e-05, + "loss": 0.4697, + "step": 30259 + }, + { + "epoch": 1.6944786650240788, + "grad_norm": 1.4281642436981201, + "learning_rate": 9.730368421052633e-05, + "loss": 0.5596, + "step": 30260 + }, + { + "epoch": 1.6945346623362079, + "grad_norm": 1.271577000617981, + "learning_rate": 9.730342105263158e-05, + "loss": 0.3794, + "step": 30261 + }, + { + "epoch": 1.6945906596483369, + "grad_norm": 1.4523308277130127, + "learning_rate": 9.730315789473685e-05, + "loss": 0.5573, + "step": 30262 + }, + { + "epoch": 1.694646656960466, + "grad_norm": 1.1428872346878052, + "learning_rate": 9.730289473684211e-05, + "loss": 0.4664, + "step": 30263 + }, + { + "epoch": 1.694702654272595, + "grad_norm": 1.3204830884933472, + "learning_rate": 9.730263157894738e-05, + "loss": 0.611, + "step": 30264 + }, + { + "epoch": 1.694758651584724, + "grad_norm": 1.1934540271759033, + "learning_rate": 9.730236842105264e-05, + "loss": 0.3084, + "step": 30265 + }, + { + "epoch": 1.694814648896853, + "grad_norm": 1.0935078859329224, + "learning_rate": 9.730210526315789e-05, + "loss": 0.41, + "step": 30266 + }, + { + "epoch": 1.694870646208982, + 
"grad_norm": 1.3033454418182373, + "learning_rate": 9.730184210526316e-05, + "loss": 0.5077, + "step": 30267 + }, + { + "epoch": 1.694926643521111, + "grad_norm": 1.4620641469955444, + "learning_rate": 9.730157894736842e-05, + "loss": 0.4244, + "step": 30268 + }, + { + "epoch": 1.69498264083324, + "grad_norm": 1.3887606859207153, + "learning_rate": 9.73013157894737e-05, + "loss": 0.5175, + "step": 30269 + }, + { + "epoch": 1.695038638145369, + "grad_norm": 1.2040472030639648, + "learning_rate": 9.730105263157895e-05, + "loss": 0.4533, + "step": 30270 + }, + { + "epoch": 1.695094635457498, + "grad_norm": 1.8510829210281372, + "learning_rate": 9.730078947368421e-05, + "loss": 0.5017, + "step": 30271 + }, + { + "epoch": 1.695150632769627, + "grad_norm": 1.2853960990905762, + "learning_rate": 9.730052631578947e-05, + "loss": 0.3734, + "step": 30272 + }, + { + "epoch": 1.695206630081756, + "grad_norm": 1.4359183311462402, + "learning_rate": 9.730026315789475e-05, + "loss": 0.4473, + "step": 30273 + }, + { + "epoch": 1.6952626273938851, + "grad_norm": 1.0984580516815186, + "learning_rate": 9.730000000000001e-05, + "loss": 0.3766, + "step": 30274 + }, + { + "epoch": 1.6953186247060141, + "grad_norm": 1.4857829809188843, + "learning_rate": 9.729973684210527e-05, + "loss": 0.5342, + "step": 30275 + }, + { + "epoch": 1.6953746220181432, + "grad_norm": 1.3552742004394531, + "learning_rate": 9.729947368421053e-05, + "loss": 0.3975, + "step": 30276 + }, + { + "epoch": 1.6954306193302722, + "grad_norm": 1.2708369493484497, + "learning_rate": 9.72992105263158e-05, + "loss": 0.4116, + "step": 30277 + }, + { + "epoch": 1.6954866166424012, + "grad_norm": 1.426080584526062, + "learning_rate": 9.729894736842106e-05, + "loss": 0.5349, + "step": 30278 + }, + { + "epoch": 1.6955426139545302, + "grad_norm": 1.2011563777923584, + "learning_rate": 9.729868421052632e-05, + "loss": 0.388, + "step": 30279 + }, + { + "epoch": 1.6955986112666592, + "grad_norm": 1.3876416683197021, + 
"learning_rate": 9.729842105263158e-05, + "loss": 0.3935, + "step": 30280 + }, + { + "epoch": 1.6956546085787882, + "grad_norm": 1.0139832496643066, + "learning_rate": 9.729815789473685e-05, + "loss": 0.4902, + "step": 30281 + }, + { + "epoch": 1.6957106058909173, + "grad_norm": 1.6276042461395264, + "learning_rate": 9.729789473684211e-05, + "loss": 0.388, + "step": 30282 + }, + { + "epoch": 1.6957666032030463, + "grad_norm": 1.4506871700286865, + "learning_rate": 9.729763157894737e-05, + "loss": 0.4478, + "step": 30283 + }, + { + "epoch": 1.6958226005151753, + "grad_norm": 1.4444221258163452, + "learning_rate": 9.729736842105263e-05, + "loss": 0.4424, + "step": 30284 + }, + { + "epoch": 1.6958785978273043, + "grad_norm": 1.1168495416641235, + "learning_rate": 9.729710526315789e-05, + "loss": 0.3262, + "step": 30285 + }, + { + "epoch": 1.6959345951394333, + "grad_norm": 1.3173397779464722, + "learning_rate": 9.729684210526316e-05, + "loss": 0.4328, + "step": 30286 + }, + { + "epoch": 1.6959905924515624, + "grad_norm": 1.8876169919967651, + "learning_rate": 9.729657894736842e-05, + "loss": 0.4391, + "step": 30287 + }, + { + "epoch": 1.6960465897636914, + "grad_norm": 1.3588616847991943, + "learning_rate": 9.72963157894737e-05, + "loss": 0.3272, + "step": 30288 + }, + { + "epoch": 1.6961025870758204, + "grad_norm": 1.0950651168823242, + "learning_rate": 9.729605263157894e-05, + "loss": 0.4377, + "step": 30289 + }, + { + "epoch": 1.6961585843879494, + "grad_norm": 1.5283358097076416, + "learning_rate": 9.729578947368422e-05, + "loss": 0.4147, + "step": 30290 + }, + { + "epoch": 1.6962145817000784, + "grad_norm": 1.4009937047958374, + "learning_rate": 9.729552631578948e-05, + "loss": 0.5298, + "step": 30291 + }, + { + "epoch": 1.6962705790122075, + "grad_norm": 1.4278610944747925, + "learning_rate": 9.729526315789475e-05, + "loss": 0.4376, + "step": 30292 + }, + { + "epoch": 1.6963265763243365, + "grad_norm": 2.084202289581299, + "learning_rate": 9.729500000000001e-05, 
+ "loss": 0.6569, + "step": 30293 + }, + { + "epoch": 1.6963825736364655, + "grad_norm": 1.378486156463623, + "learning_rate": 9.729473684210527e-05, + "loss": 0.4371, + "step": 30294 + }, + { + "epoch": 1.6964385709485945, + "grad_norm": 1.3103843927383423, + "learning_rate": 9.729447368421053e-05, + "loss": 0.4739, + "step": 30295 + }, + { + "epoch": 1.6964945682607235, + "grad_norm": 1.1588494777679443, + "learning_rate": 9.72942105263158e-05, + "loss": 0.3367, + "step": 30296 + }, + { + "epoch": 1.6965505655728526, + "grad_norm": 1.5184040069580078, + "learning_rate": 9.729394736842106e-05, + "loss": 0.7053, + "step": 30297 + }, + { + "epoch": 1.6966065628849816, + "grad_norm": 1.2309514284133911, + "learning_rate": 9.729368421052632e-05, + "loss": 0.3411, + "step": 30298 + }, + { + "epoch": 1.6966625601971106, + "grad_norm": 1.477604866027832, + "learning_rate": 9.729342105263158e-05, + "loss": 0.6026, + "step": 30299 + }, + { + "epoch": 1.6967185575092396, + "grad_norm": 1.1118342876434326, + "learning_rate": 9.729315789473684e-05, + "loss": 0.4385, + "step": 30300 + }, + { + "epoch": 1.6967745548213684, + "grad_norm": 1.553444266319275, + "learning_rate": 9.729289473684211e-05, + "loss": 0.6059, + "step": 30301 + }, + { + "epoch": 1.6968305521334974, + "grad_norm": 1.1063417196273804, + "learning_rate": 9.729263157894737e-05, + "loss": 0.3791, + "step": 30302 + }, + { + "epoch": 1.6968865494456264, + "grad_norm": 1.128703236579895, + "learning_rate": 9.729236842105263e-05, + "loss": 0.3458, + "step": 30303 + }, + { + "epoch": 1.6969425467577555, + "grad_norm": 1.374710202217102, + "learning_rate": 9.72921052631579e-05, + "loss": 0.5214, + "step": 30304 + }, + { + "epoch": 1.6969985440698845, + "grad_norm": 1.659647822380066, + "learning_rate": 9.729184210526317e-05, + "loss": 0.5203, + "step": 30305 + }, + { + "epoch": 1.6970545413820135, + "grad_norm": 1.23285710811615, + "learning_rate": 9.729157894736843e-05, + "loss": 0.588, + "step": 30306 + }, + { + 
"epoch": 1.6971105386941425, + "grad_norm": 1.2525181770324707, + "learning_rate": 9.729131578947369e-05, + "loss": 0.3612, + "step": 30307 + }, + { + "epoch": 1.6971665360062715, + "grad_norm": 1.2933460474014282, + "learning_rate": 9.729105263157895e-05, + "loss": 0.4537, + "step": 30308 + }, + { + "epoch": 1.6972225333184006, + "grad_norm": 1.0990279912948608, + "learning_rate": 9.729078947368422e-05, + "loss": 0.3874, + "step": 30309 + }, + { + "epoch": 1.6972785306305296, + "grad_norm": 1.3627982139587402, + "learning_rate": 9.729052631578948e-05, + "loss": 0.4063, + "step": 30310 + }, + { + "epoch": 1.6973345279426586, + "grad_norm": 1.2585375308990479, + "learning_rate": 9.729026315789474e-05, + "loss": 0.3325, + "step": 30311 + }, + { + "epoch": 1.6973905252547876, + "grad_norm": 1.0967957973480225, + "learning_rate": 9.729e-05, + "loss": 0.3409, + "step": 30312 + }, + { + "epoch": 1.6974465225669166, + "grad_norm": 1.286448359489441, + "learning_rate": 9.728973684210527e-05, + "loss": 0.3923, + "step": 30313 + }, + { + "epoch": 1.6975025198790457, + "grad_norm": 1.4949727058410645, + "learning_rate": 9.728947368421053e-05, + "loss": 0.4739, + "step": 30314 + }, + { + "epoch": 1.6975585171911747, + "grad_norm": 1.621005654335022, + "learning_rate": 9.72892105263158e-05, + "loss": 0.6308, + "step": 30315 + }, + { + "epoch": 1.6976145145033037, + "grad_norm": 1.20448637008667, + "learning_rate": 9.728894736842105e-05, + "loss": 0.4985, + "step": 30316 + }, + { + "epoch": 1.6976705118154327, + "grad_norm": 1.2671122550964355, + "learning_rate": 9.728868421052631e-05, + "loss": 0.387, + "step": 30317 + }, + { + "epoch": 1.6977265091275617, + "grad_norm": 1.430680751800537, + "learning_rate": 9.728842105263158e-05, + "loss": 0.574, + "step": 30318 + }, + { + "epoch": 1.6977825064396908, + "grad_norm": 1.3269802331924438, + "learning_rate": 9.728815789473684e-05, + "loss": 0.5057, + "step": 30319 + }, + { + "epoch": 1.6978385037518198, + "grad_norm": 
2.183607339859009, + "learning_rate": 9.728789473684212e-05, + "loss": 0.5895, + "step": 30320 + }, + { + "epoch": 1.6978945010639488, + "grad_norm": 1.1722875833511353, + "learning_rate": 9.728763157894736e-05, + "loss": 0.3825, + "step": 30321 + }, + { + "epoch": 1.6979504983760778, + "grad_norm": 1.1483943462371826, + "learning_rate": 9.728736842105264e-05, + "loss": 0.3443, + "step": 30322 + }, + { + "epoch": 1.6980064956882068, + "grad_norm": 1.5114575624465942, + "learning_rate": 9.72871052631579e-05, + "loss": 0.4387, + "step": 30323 + }, + { + "epoch": 1.6980624930003358, + "grad_norm": 1.5740960836410522, + "learning_rate": 9.728684210526317e-05, + "loss": 0.4974, + "step": 30324 + }, + { + "epoch": 1.6981184903124649, + "grad_norm": 1.171128273010254, + "learning_rate": 9.728657894736843e-05, + "loss": 0.4277, + "step": 30325 + }, + { + "epoch": 1.6981744876245939, + "grad_norm": 1.138498067855835, + "learning_rate": 9.728631578947369e-05, + "loss": 0.4725, + "step": 30326 + }, + { + "epoch": 1.698230484936723, + "grad_norm": 1.3775825500488281, + "learning_rate": 9.728605263157895e-05, + "loss": 0.5069, + "step": 30327 + }, + { + "epoch": 1.698286482248852, + "grad_norm": 1.1714247465133667, + "learning_rate": 9.728578947368422e-05, + "loss": 0.3434, + "step": 30328 + }, + { + "epoch": 1.698342479560981, + "grad_norm": 1.5268391370773315, + "learning_rate": 9.728552631578948e-05, + "loss": 0.3696, + "step": 30329 + }, + { + "epoch": 1.69839847687311, + "grad_norm": 1.343772292137146, + "learning_rate": 9.728526315789474e-05, + "loss": 0.5687, + "step": 30330 + }, + { + "epoch": 1.698454474185239, + "grad_norm": 64.54393005371094, + "learning_rate": 9.7285e-05, + "loss": 0.4503, + "step": 30331 + }, + { + "epoch": 1.698510471497368, + "grad_norm": 1.3343149423599243, + "learning_rate": 9.728473684210527e-05, + "loss": 0.4719, + "step": 30332 + }, + { + "epoch": 1.698566468809497, + "grad_norm": 1.236086130142212, + "learning_rate": 9.728447368421053e-05, 
+ "loss": 0.4541, + "step": 30333 + }, + { + "epoch": 1.698622466121626, + "grad_norm": 1.2689766883850098, + "learning_rate": 9.72842105263158e-05, + "loss": 0.4627, + "step": 30334 + }, + { + "epoch": 1.698678463433755, + "grad_norm": 1.5581690073013306, + "learning_rate": 9.728394736842105e-05, + "loss": 0.576, + "step": 30335 + }, + { + "epoch": 1.698734460745884, + "grad_norm": 1.436468482017517, + "learning_rate": 9.728368421052631e-05, + "loss": 0.4485, + "step": 30336 + }, + { + "epoch": 1.698790458058013, + "grad_norm": 1.2061339616775513, + "learning_rate": 9.728342105263159e-05, + "loss": 0.4498, + "step": 30337 + }, + { + "epoch": 1.6988464553701421, + "grad_norm": 1.6089482307434082, + "learning_rate": 9.728315789473685e-05, + "loss": 0.5003, + "step": 30338 + }, + { + "epoch": 1.6989024526822711, + "grad_norm": 1.0775550603866577, + "learning_rate": 9.72828947368421e-05, + "loss": 0.3458, + "step": 30339 + }, + { + "epoch": 1.6989584499944002, + "grad_norm": 1.5468053817749023, + "learning_rate": 9.728263157894737e-05, + "loss": 0.7323, + "step": 30340 + }, + { + "epoch": 1.6990144473065292, + "grad_norm": 1.2649682760238647, + "learning_rate": 9.728236842105264e-05, + "loss": 0.4295, + "step": 30341 + }, + { + "epoch": 1.6990704446186582, + "grad_norm": 1.3385552167892456, + "learning_rate": 9.72821052631579e-05, + "loss": 0.5028, + "step": 30342 + }, + { + "epoch": 1.6991264419307872, + "grad_norm": 1.2830957174301147, + "learning_rate": 9.728184210526317e-05, + "loss": 0.4697, + "step": 30343 + }, + { + "epoch": 1.6991824392429162, + "grad_norm": 2.1357758045196533, + "learning_rate": 9.728157894736842e-05, + "loss": 0.4494, + "step": 30344 + }, + { + "epoch": 1.6992384365550453, + "grad_norm": 1.2747297286987305, + "learning_rate": 9.728131578947369e-05, + "loss": 0.5061, + "step": 30345 + }, + { + "epoch": 1.6992944338671743, + "grad_norm": 1.3982797861099243, + "learning_rate": 9.728105263157895e-05, + "loss": 0.4909, + "step": 30346 + }, + { + 
"epoch": 1.6993504311793033, + "grad_norm": 1.4929579496383667, + "learning_rate": 9.728078947368422e-05, + "loss": 0.5826, + "step": 30347 + }, + { + "epoch": 1.6994064284914323, + "grad_norm": 1.6980502605438232, + "learning_rate": 9.728052631578948e-05, + "loss": 0.5291, + "step": 30348 + }, + { + "epoch": 1.6994624258035613, + "grad_norm": 1.1305783987045288, + "learning_rate": 9.728026315789474e-05, + "loss": 0.4975, + "step": 30349 + }, + { + "epoch": 1.6995184231156903, + "grad_norm": 1.2212214469909668, + "learning_rate": 9.728e-05, + "loss": 0.4564, + "step": 30350 + }, + { + "epoch": 1.6995744204278194, + "grad_norm": 1.3931679725646973, + "learning_rate": 9.727973684210526e-05, + "loss": 0.6455, + "step": 30351 + }, + { + "epoch": 1.6996304177399484, + "grad_norm": 1.271008014678955, + "learning_rate": 9.727947368421054e-05, + "loss": 0.4099, + "step": 30352 + }, + { + "epoch": 1.6996864150520774, + "grad_norm": 1.3081786632537842, + "learning_rate": 9.727921052631578e-05, + "loss": 0.4813, + "step": 30353 + }, + { + "epoch": 1.6997424123642064, + "grad_norm": 1.37801194190979, + "learning_rate": 9.727894736842106e-05, + "loss": 0.451, + "step": 30354 + }, + { + "epoch": 1.6997984096763354, + "grad_norm": 1.640711784362793, + "learning_rate": 9.727868421052632e-05, + "loss": 0.5858, + "step": 30355 + }, + { + "epoch": 1.6998544069884645, + "grad_norm": 1.121232032775879, + "learning_rate": 9.727842105263159e-05, + "loss": 0.4415, + "step": 30356 + }, + { + "epoch": 1.6999104043005935, + "grad_norm": 1.3477303981781006, + "learning_rate": 9.727815789473685e-05, + "loss": 0.4505, + "step": 30357 + }, + { + "epoch": 1.6999664016127225, + "grad_norm": 2.0943679809570312, + "learning_rate": 9.727789473684211e-05, + "loss": 0.4611, + "step": 30358 + }, + { + "epoch": 1.7000223989248515, + "grad_norm": 1.7255926132202148, + "learning_rate": 9.727763157894737e-05, + "loss": 0.4871, + "step": 30359 + }, + { + "epoch": 1.7000783962369805, + "grad_norm": 
1.3967081308364868, + "learning_rate": 9.727736842105264e-05, + "loss": 0.4439, + "step": 30360 + }, + { + "epoch": 1.7001343935491096, + "grad_norm": 1.5152515172958374, + "learning_rate": 9.72771052631579e-05, + "loss": 0.4556, + "step": 30361 + }, + { + "epoch": 1.7001903908612386, + "grad_norm": 1.40230131149292, + "learning_rate": 9.727684210526316e-05, + "loss": 0.5182, + "step": 30362 + }, + { + "epoch": 1.7002463881733676, + "grad_norm": 1.2221062183380127, + "learning_rate": 9.727657894736842e-05, + "loss": 0.4226, + "step": 30363 + }, + { + "epoch": 1.7003023854854966, + "grad_norm": 6.97371768951416, + "learning_rate": 9.72763157894737e-05, + "loss": 0.4474, + "step": 30364 + }, + { + "epoch": 1.7003583827976256, + "grad_norm": 1.3779109716415405, + "learning_rate": 9.727605263157895e-05, + "loss": 0.4449, + "step": 30365 + }, + { + "epoch": 1.7004143801097547, + "grad_norm": 1.1985836029052734, + "learning_rate": 9.727578947368421e-05, + "loss": 0.3976, + "step": 30366 + }, + { + "epoch": 1.7004703774218837, + "grad_norm": 1.293669581413269, + "learning_rate": 9.727552631578947e-05, + "loss": 0.4866, + "step": 30367 + }, + { + "epoch": 1.7005263747340127, + "grad_norm": 1.30210280418396, + "learning_rate": 9.727526315789473e-05, + "loss": 0.4353, + "step": 30368 + }, + { + "epoch": 1.7005823720461417, + "grad_norm": 1.4130703210830688, + "learning_rate": 9.7275e-05, + "loss": 0.4157, + "step": 30369 + }, + { + "epoch": 1.7006383693582707, + "grad_norm": 1.3582077026367188, + "learning_rate": 9.727473684210527e-05, + "loss": 0.5089, + "step": 30370 + }, + { + "epoch": 1.7006943666703997, + "grad_norm": 1.5402058362960815, + "learning_rate": 9.727447368421053e-05, + "loss": 0.5706, + "step": 30371 + }, + { + "epoch": 1.7007503639825288, + "grad_norm": 1.088727593421936, + "learning_rate": 9.727421052631579e-05, + "loss": 0.3592, + "step": 30372 + }, + { + "epoch": 1.7008063612946578, + "grad_norm": 1.4988527297973633, + "learning_rate": 
9.727394736842106e-05, + "loss": 0.3488, + "step": 30373 + }, + { + "epoch": 1.7008623586067868, + "grad_norm": 1.0835514068603516, + "learning_rate": 9.727368421052632e-05, + "loss": 0.3906, + "step": 30374 + }, + { + "epoch": 1.7009183559189158, + "grad_norm": 1.2537224292755127, + "learning_rate": 9.727342105263159e-05, + "loss": 0.4103, + "step": 30375 + }, + { + "epoch": 1.7009743532310448, + "grad_norm": 1.2662135362625122, + "learning_rate": 9.727315789473684e-05, + "loss": 0.4934, + "step": 30376 + }, + { + "epoch": 1.7010303505431739, + "grad_norm": 1.1604135036468506, + "learning_rate": 9.727289473684211e-05, + "loss": 0.3871, + "step": 30377 + }, + { + "epoch": 1.7010863478553029, + "grad_norm": 1.2488824129104614, + "learning_rate": 9.727263157894737e-05, + "loss": 0.3867, + "step": 30378 + }, + { + "epoch": 1.701142345167432, + "grad_norm": 1.395504355430603, + "learning_rate": 9.727236842105264e-05, + "loss": 0.4575, + "step": 30379 + }, + { + "epoch": 1.701198342479561, + "grad_norm": 1.2531800270080566, + "learning_rate": 9.72721052631579e-05, + "loss": 0.4608, + "step": 30380 + }, + { + "epoch": 1.70125433979169, + "grad_norm": 1.281958818435669, + "learning_rate": 9.727184210526316e-05, + "loss": 0.4547, + "step": 30381 + }, + { + "epoch": 1.701310337103819, + "grad_norm": 1.3307981491088867, + "learning_rate": 9.727157894736842e-05, + "loss": 0.3593, + "step": 30382 + }, + { + "epoch": 1.701366334415948, + "grad_norm": 1.6306116580963135, + "learning_rate": 9.72713157894737e-05, + "loss": 0.58, + "step": 30383 + }, + { + "epoch": 1.701422331728077, + "grad_norm": 1.287005066871643, + "learning_rate": 9.727105263157896e-05, + "loss": 0.4737, + "step": 30384 + }, + { + "epoch": 1.701478329040206, + "grad_norm": 1.359609603881836, + "learning_rate": 9.727078947368422e-05, + "loss": 0.5492, + "step": 30385 + }, + { + "epoch": 1.701534326352335, + "grad_norm": 1.3711774349212646, + "learning_rate": 9.727052631578948e-05, + "loss": 0.4229, + "step": 
30386 + }, + { + "epoch": 1.701590323664464, + "grad_norm": 1.237109899520874, + "learning_rate": 9.727026315789474e-05, + "loss": 0.4392, + "step": 30387 + }, + { + "epoch": 1.701646320976593, + "grad_norm": 1.3945149183273315, + "learning_rate": 9.727000000000001e-05, + "loss": 0.4245, + "step": 30388 + }, + { + "epoch": 1.701702318288722, + "grad_norm": 1.4769854545593262, + "learning_rate": 9.726973684210527e-05, + "loss": 0.4906, + "step": 30389 + }, + { + "epoch": 1.7017583156008511, + "grad_norm": 1.082001805305481, + "learning_rate": 9.726947368421053e-05, + "loss": 0.3421, + "step": 30390 + }, + { + "epoch": 1.7018143129129801, + "grad_norm": 1.4194084405899048, + "learning_rate": 9.726921052631579e-05, + "loss": 0.4602, + "step": 30391 + }, + { + "epoch": 1.7018703102251092, + "grad_norm": 1.1614925861358643, + "learning_rate": 9.726894736842106e-05, + "loss": 0.5332, + "step": 30392 + }, + { + "epoch": 1.7019263075372382, + "grad_norm": 1.0861917734146118, + "learning_rate": 9.726868421052632e-05, + "loss": 0.377, + "step": 30393 + }, + { + "epoch": 1.7019823048493672, + "grad_norm": 1.1531882286071777, + "learning_rate": 9.726842105263158e-05, + "loss": 0.4217, + "step": 30394 + }, + { + "epoch": 1.7020383021614962, + "grad_norm": 1.6344202756881714, + "learning_rate": 9.726815789473684e-05, + "loss": 0.4903, + "step": 30395 + }, + { + "epoch": 1.7020942994736252, + "grad_norm": 1.258263349533081, + "learning_rate": 9.726789473684211e-05, + "loss": 0.486, + "step": 30396 + }, + { + "epoch": 1.7021502967857542, + "grad_norm": 1.3069782257080078, + "learning_rate": 9.726763157894737e-05, + "loss": 0.4587, + "step": 30397 + }, + { + "epoch": 1.7022062940978833, + "grad_norm": 1.4569857120513916, + "learning_rate": 9.726736842105265e-05, + "loss": 0.4588, + "step": 30398 + }, + { + "epoch": 1.7022622914100123, + "grad_norm": 1.7946926355361938, + "learning_rate": 9.726710526315789e-05, + "loss": 0.6473, + "step": 30399 + }, + { + "epoch": 
1.7023182887221413, + "grad_norm": 1.431936502456665, + "learning_rate": 9.726684210526317e-05, + "loss": 0.457, + "step": 30400 + }, + { + "epoch": 1.7023742860342703, + "grad_norm": 1.1445733308792114, + "learning_rate": 9.726657894736843e-05, + "loss": 0.4208, + "step": 30401 + }, + { + "epoch": 1.7024302833463993, + "grad_norm": 1.2694015502929688, + "learning_rate": 9.72663157894737e-05, + "loss": 0.3792, + "step": 30402 + }, + { + "epoch": 1.7024862806585284, + "grad_norm": 1.0210872888565063, + "learning_rate": 9.726605263157895e-05, + "loss": 0.4022, + "step": 30403 + }, + { + "epoch": 1.7025422779706574, + "grad_norm": 4.105430603027344, + "learning_rate": 9.72657894736842e-05, + "loss": 0.584, + "step": 30404 + }, + { + "epoch": 1.7025982752827864, + "grad_norm": 1.0736076831817627, + "learning_rate": 9.726552631578948e-05, + "loss": 0.4297, + "step": 30405 + }, + { + "epoch": 1.7026542725949154, + "grad_norm": 1.2866970300674438, + "learning_rate": 9.726526315789474e-05, + "loss": 0.4977, + "step": 30406 + }, + { + "epoch": 1.7027102699070444, + "grad_norm": 1.7026922702789307, + "learning_rate": 9.726500000000001e-05, + "loss": 0.5121, + "step": 30407 + }, + { + "epoch": 1.7027662672191735, + "grad_norm": 1.3127936124801636, + "learning_rate": 9.726473684210526e-05, + "loss": 0.4201, + "step": 30408 + }, + { + "epoch": 1.7028222645313025, + "grad_norm": 1.456664800643921, + "learning_rate": 9.726447368421053e-05, + "loss": 0.6082, + "step": 30409 + }, + { + "epoch": 1.7028782618434315, + "grad_norm": 1.3231877088546753, + "learning_rate": 9.726421052631579e-05, + "loss": 0.496, + "step": 30410 + }, + { + "epoch": 1.7029342591555605, + "grad_norm": 1.4809296131134033, + "learning_rate": 9.726394736842106e-05, + "loss": 0.4325, + "step": 30411 + }, + { + "epoch": 1.7029902564676895, + "grad_norm": 1.2596771717071533, + "learning_rate": 9.726368421052632e-05, + "loss": 0.4379, + "step": 30412 + }, + { + "epoch": 1.7030462537798186, + "grad_norm": 
1.4474269151687622, + "learning_rate": 9.726342105263158e-05, + "loss": 0.5362, + "step": 30413 + }, + { + "epoch": 1.7031022510919476, + "grad_norm": 1.1639429330825806, + "learning_rate": 9.726315789473684e-05, + "loss": 0.353, + "step": 30414 + }, + { + "epoch": 1.7031582484040766, + "grad_norm": 1.225325107574463, + "learning_rate": 9.726289473684212e-05, + "loss": 0.3527, + "step": 30415 + }, + { + "epoch": 1.7032142457162056, + "grad_norm": 1.3113058805465698, + "learning_rate": 9.726263157894738e-05, + "loss": 0.5815, + "step": 30416 + }, + { + "epoch": 1.7032702430283346, + "grad_norm": 1.1792465448379517, + "learning_rate": 9.726236842105264e-05, + "loss": 0.5102, + "step": 30417 + }, + { + "epoch": 1.7033262403404636, + "grad_norm": 1.6666264533996582, + "learning_rate": 9.72621052631579e-05, + "loss": 0.5822, + "step": 30418 + }, + { + "epoch": 1.7033822376525927, + "grad_norm": 1.060243010520935, + "learning_rate": 9.726184210526317e-05, + "loss": 0.408, + "step": 30419 + }, + { + "epoch": 1.7034382349647217, + "grad_norm": 1.1443876028060913, + "learning_rate": 9.726157894736843e-05, + "loss": 0.3858, + "step": 30420 + }, + { + "epoch": 1.7034942322768507, + "grad_norm": 1.060678482055664, + "learning_rate": 9.726131578947369e-05, + "loss": 0.399, + "step": 30421 + }, + { + "epoch": 1.7035502295889797, + "grad_norm": 1.4833637475967407, + "learning_rate": 9.726105263157895e-05, + "loss": 0.466, + "step": 30422 + }, + { + "epoch": 1.7036062269011087, + "grad_norm": 1.1668732166290283, + "learning_rate": 9.726078947368421e-05, + "loss": 0.3098, + "step": 30423 + }, + { + "epoch": 1.7036622242132378, + "grad_norm": 1.498699426651001, + "learning_rate": 9.726052631578948e-05, + "loss": 0.543, + "step": 30424 + }, + { + "epoch": 1.7037182215253668, + "grad_norm": 1.2083587646484375, + "learning_rate": 9.726026315789474e-05, + "loss": 0.3934, + "step": 30425 + }, + { + "epoch": 1.7037742188374958, + "grad_norm": 1.2850645780563354, + "learning_rate": 
9.726e-05, + "loss": 0.5364, + "step": 30426 + }, + { + "epoch": 1.7038302161496248, + "grad_norm": 1.451698899269104, + "learning_rate": 9.725973684210526e-05, + "loss": 0.4178, + "step": 30427 + }, + { + "epoch": 1.7038862134617538, + "grad_norm": 1.3936408758163452, + "learning_rate": 9.725947368421053e-05, + "loss": 0.4768, + "step": 30428 + }, + { + "epoch": 1.7039422107738829, + "grad_norm": 1.2527719736099243, + "learning_rate": 9.72592105263158e-05, + "loss": 0.3909, + "step": 30429 + }, + { + "epoch": 1.7039982080860119, + "grad_norm": 1.081490159034729, + "learning_rate": 9.725894736842107e-05, + "loss": 0.4346, + "step": 30430 + }, + { + "epoch": 1.704054205398141, + "grad_norm": 1.1686931848526, + "learning_rate": 9.725868421052631e-05, + "loss": 0.4694, + "step": 30431 + }, + { + "epoch": 1.70411020271027, + "grad_norm": 1.3607361316680908, + "learning_rate": 9.725842105263159e-05, + "loss": 0.4416, + "step": 30432 + }, + { + "epoch": 1.704166200022399, + "grad_norm": 1.2696168422698975, + "learning_rate": 9.725815789473685e-05, + "loss": 0.4423, + "step": 30433 + }, + { + "epoch": 1.704222197334528, + "grad_norm": 1.1625663042068481, + "learning_rate": 9.725789473684212e-05, + "loss": 0.3651, + "step": 30434 + }, + { + "epoch": 1.704278194646657, + "grad_norm": 1.2001128196716309, + "learning_rate": 9.725763157894738e-05, + "loss": 0.3825, + "step": 30435 + }, + { + "epoch": 1.704334191958786, + "grad_norm": 1.3356788158416748, + "learning_rate": 9.725736842105264e-05, + "loss": 0.4432, + "step": 30436 + }, + { + "epoch": 1.704390189270915, + "grad_norm": 1.427314043045044, + "learning_rate": 9.72571052631579e-05, + "loss": 0.4516, + "step": 30437 + }, + { + "epoch": 1.704446186583044, + "grad_norm": 1.3588881492614746, + "learning_rate": 9.725684210526316e-05, + "loss": 0.5084, + "step": 30438 + }, + { + "epoch": 1.704502183895173, + "grad_norm": 1.1967135667800903, + "learning_rate": 9.725657894736843e-05, + "loss": 0.3756, + "step": 30439 + }, + { 
+ "epoch": 1.704558181207302, + "grad_norm": 1.4664340019226074, + "learning_rate": 9.725631578947369e-05, + "loss": 0.5995, + "step": 30440 + }, + { + "epoch": 1.704614178519431, + "grad_norm": 1.5603938102722168, + "learning_rate": 9.725605263157895e-05, + "loss": 0.6902, + "step": 30441 + }, + { + "epoch": 1.70467017583156, + "grad_norm": 1.2841347455978394, + "learning_rate": 9.725578947368421e-05, + "loss": 0.3792, + "step": 30442 + }, + { + "epoch": 1.7047261731436891, + "grad_norm": 1.312217354774475, + "learning_rate": 9.725552631578948e-05, + "loss": 0.4777, + "step": 30443 + }, + { + "epoch": 1.7047821704558181, + "grad_norm": 1.2578442096710205, + "learning_rate": 9.725526315789474e-05, + "loss": 0.4364, + "step": 30444 + }, + { + "epoch": 1.7048381677679472, + "grad_norm": 1.2755889892578125, + "learning_rate": 9.7255e-05, + "loss": 0.4185, + "step": 30445 + }, + { + "epoch": 1.7048941650800762, + "grad_norm": 1.2359391450881958, + "learning_rate": 9.725473684210526e-05, + "loss": 0.4579, + "step": 30446 + }, + { + "epoch": 1.7049501623922052, + "grad_norm": 1.344716191291809, + "learning_rate": 9.725447368421054e-05, + "loss": 0.5177, + "step": 30447 + }, + { + "epoch": 1.7050061597043342, + "grad_norm": 1.423856258392334, + "learning_rate": 9.72542105263158e-05, + "loss": 0.5039, + "step": 30448 + }, + { + "epoch": 1.7050621570164632, + "grad_norm": 1.1566288471221924, + "learning_rate": 9.725394736842106e-05, + "loss": 0.4561, + "step": 30449 + }, + { + "epoch": 1.7051181543285923, + "grad_norm": 1.3783998489379883, + "learning_rate": 9.725368421052632e-05, + "loss": 0.4719, + "step": 30450 + }, + { + "epoch": 1.7051741516407213, + "grad_norm": 1.2949869632720947, + "learning_rate": 9.725342105263159e-05, + "loss": 0.613, + "step": 30451 + }, + { + "epoch": 1.7052301489528503, + "grad_norm": 1.324256181716919, + "learning_rate": 9.725315789473685e-05, + "loss": 0.3451, + "step": 30452 + }, + { + "epoch": 1.7052861462649793, + "grad_norm": 
1.0943481922149658, + "learning_rate": 9.725289473684212e-05, + "loss": 0.3402, + "step": 30453 + }, + { + "epoch": 1.7053421435771083, + "grad_norm": 1.1040737628936768, + "learning_rate": 9.725263157894737e-05, + "loss": 0.4539, + "step": 30454 + }, + { + "epoch": 1.7053981408892374, + "grad_norm": 1.1457338333129883, + "learning_rate": 9.725236842105263e-05, + "loss": 0.4564, + "step": 30455 + }, + { + "epoch": 1.7054541382013664, + "grad_norm": 1.266891360282898, + "learning_rate": 9.72521052631579e-05, + "loss": 0.3382, + "step": 30456 + }, + { + "epoch": 1.7055101355134954, + "grad_norm": 1.2966458797454834, + "learning_rate": 9.725184210526316e-05, + "loss": 0.4611, + "step": 30457 + }, + { + "epoch": 1.7055661328256244, + "grad_norm": 1.471893548965454, + "learning_rate": 9.725157894736842e-05, + "loss": 0.4955, + "step": 30458 + }, + { + "epoch": 1.7056221301377534, + "grad_norm": 1.342434048652649, + "learning_rate": 9.725131578947368e-05, + "loss": 0.4862, + "step": 30459 + }, + { + "epoch": 1.7056781274498825, + "grad_norm": 1.1808223724365234, + "learning_rate": 9.725105263157895e-05, + "loss": 0.3575, + "step": 30460 + }, + { + "epoch": 1.7057341247620115, + "grad_norm": 1.2905265092849731, + "learning_rate": 9.725078947368421e-05, + "loss": 0.5352, + "step": 30461 + }, + { + "epoch": 1.7057901220741405, + "grad_norm": 1.2809385061264038, + "learning_rate": 9.725052631578949e-05, + "loss": 0.5049, + "step": 30462 + }, + { + "epoch": 1.7058461193862695, + "grad_norm": 1.2980560064315796, + "learning_rate": 9.725026315789473e-05, + "loss": 0.3592, + "step": 30463 + }, + { + "epoch": 1.7059021166983985, + "grad_norm": 1.2398675680160522, + "learning_rate": 9.725e-05, + "loss": 0.4474, + "step": 30464 + }, + { + "epoch": 1.7059581140105275, + "grad_norm": 1.3723958730697632, + "learning_rate": 9.724973684210527e-05, + "loss": 0.6417, + "step": 30465 + }, + { + "epoch": 1.7060141113226566, + "grad_norm": 1.1518769264221191, + "learning_rate": 
9.724947368421054e-05, + "loss": 0.3915, + "step": 30466 + }, + { + "epoch": 1.7060701086347856, + "grad_norm": 1.6506885290145874, + "learning_rate": 9.72492105263158e-05, + "loss": 0.4685, + "step": 30467 + }, + { + "epoch": 1.7061261059469146, + "grad_norm": 1.1885184049606323, + "learning_rate": 9.724894736842106e-05, + "loss": 0.4079, + "step": 30468 + }, + { + "epoch": 1.7061821032590436, + "grad_norm": 1.6102256774902344, + "learning_rate": 9.724868421052632e-05, + "loss": 0.4723, + "step": 30469 + }, + { + "epoch": 1.7062381005711726, + "grad_norm": 1.0847893953323364, + "learning_rate": 9.724842105263159e-05, + "loss": 0.4572, + "step": 30470 + }, + { + "epoch": 1.7062940978833017, + "grad_norm": 1.1210451126098633, + "learning_rate": 9.724815789473685e-05, + "loss": 0.3528, + "step": 30471 + }, + { + "epoch": 1.7063500951954307, + "grad_norm": 1.3248612880706787, + "learning_rate": 9.724789473684211e-05, + "loss": 0.5108, + "step": 30472 + }, + { + "epoch": 1.7064060925075597, + "grad_norm": 1.2521291971206665, + "learning_rate": 9.724763157894737e-05, + "loss": 0.4766, + "step": 30473 + }, + { + "epoch": 1.7064620898196887, + "grad_norm": 1.1710253953933716, + "learning_rate": 9.724736842105263e-05, + "loss": 0.5097, + "step": 30474 + }, + { + "epoch": 1.7065180871318177, + "grad_norm": 1.3462587594985962, + "learning_rate": 9.72471052631579e-05, + "loss": 0.5612, + "step": 30475 + }, + { + "epoch": 1.7065740844439468, + "grad_norm": 1.2117505073547363, + "learning_rate": 9.724684210526316e-05, + "loss": 0.4326, + "step": 30476 + }, + { + "epoch": 1.7066300817560758, + "grad_norm": 1.5765436887741089, + "learning_rate": 9.724657894736842e-05, + "loss": 0.7118, + "step": 30477 + }, + { + "epoch": 1.7066860790682048, + "grad_norm": 1.361477017402649, + "learning_rate": 9.724631578947368e-05, + "loss": 0.3543, + "step": 30478 + }, + { + "epoch": 1.7067420763803338, + "grad_norm": 1.6338622570037842, + "learning_rate": 9.724605263157896e-05, + "loss": 0.528, 
+ "step": 30479 + }, + { + "epoch": 1.7067980736924628, + "grad_norm": 1.4930986166000366, + "learning_rate": 9.724578947368422e-05, + "loss": 0.5079, + "step": 30480 + }, + { + "epoch": 1.7068540710045919, + "grad_norm": 1.363598108291626, + "learning_rate": 9.724552631578948e-05, + "loss": 0.4515, + "step": 30481 + }, + { + "epoch": 1.7069100683167209, + "grad_norm": 1.3108935356140137, + "learning_rate": 9.724526315789474e-05, + "loss": 0.5058, + "step": 30482 + }, + { + "epoch": 1.70696606562885, + "grad_norm": 1.2128300666809082, + "learning_rate": 9.724500000000001e-05, + "loss": 0.5126, + "step": 30483 + }, + { + "epoch": 1.707022062940979, + "grad_norm": 1.2338123321533203, + "learning_rate": 9.724473684210527e-05, + "loss": 0.3588, + "step": 30484 + }, + { + "epoch": 1.707078060253108, + "grad_norm": 1.3296399116516113, + "learning_rate": 9.724447368421054e-05, + "loss": 0.4617, + "step": 30485 + }, + { + "epoch": 1.707134057565237, + "grad_norm": 1.0531251430511475, + "learning_rate": 9.724421052631579e-05, + "loss": 0.3619, + "step": 30486 + }, + { + "epoch": 1.707190054877366, + "grad_norm": 1.1226277351379395, + "learning_rate": 9.724394736842106e-05, + "loss": 0.3713, + "step": 30487 + }, + { + "epoch": 1.707246052189495, + "grad_norm": 1.1927179098129272, + "learning_rate": 9.724368421052632e-05, + "loss": 0.3629, + "step": 30488 + }, + { + "epoch": 1.707302049501624, + "grad_norm": 1.546406865119934, + "learning_rate": 9.724342105263158e-05, + "loss": 0.6288, + "step": 30489 + }, + { + "epoch": 1.707358046813753, + "grad_norm": 1.3373161554336548, + "learning_rate": 9.724315789473685e-05, + "loss": 0.3687, + "step": 30490 + }, + { + "epoch": 1.707414044125882, + "grad_norm": 1.1055089235305786, + "learning_rate": 9.72428947368421e-05, + "loss": 0.4005, + "step": 30491 + }, + { + "epoch": 1.707470041438011, + "grad_norm": 1.1816829442977905, + "learning_rate": 9.724263157894737e-05, + "loss": 0.4471, + "step": 30492 + }, + { + "epoch": 
1.70752603875014, + "grad_norm": 1.4974868297576904, + "learning_rate": 9.724236842105263e-05, + "loss": 0.4795, + "step": 30493 + }, + { + "epoch": 1.707582036062269, + "grad_norm": 1.2631924152374268, + "learning_rate": 9.72421052631579e-05, + "loss": 0.4412, + "step": 30494 + }, + { + "epoch": 1.7076380333743981, + "grad_norm": 1.2478843927383423, + "learning_rate": 9.724184210526317e-05, + "loss": 0.5724, + "step": 30495 + }, + { + "epoch": 1.7076940306865271, + "grad_norm": 1.5070134401321411, + "learning_rate": 9.724157894736843e-05, + "loss": 0.5999, + "step": 30496 + }, + { + "epoch": 1.7077500279986562, + "grad_norm": 1.4186972379684448, + "learning_rate": 9.724131578947369e-05, + "loss": 0.4678, + "step": 30497 + }, + { + "epoch": 1.7078060253107852, + "grad_norm": 1.4076356887817383, + "learning_rate": 9.724105263157896e-05, + "loss": 0.471, + "step": 30498 + }, + { + "epoch": 1.7078620226229142, + "grad_norm": 1.4378857612609863, + "learning_rate": 9.724078947368422e-05, + "loss": 0.4323, + "step": 30499 + }, + { + "epoch": 1.7079180199350432, + "grad_norm": 1.23332941532135, + "learning_rate": 9.724052631578948e-05, + "loss": 0.5648, + "step": 30500 + }, + { + "epoch": 1.7079740172471722, + "grad_norm": 1.2035006284713745, + "learning_rate": 9.724026315789474e-05, + "loss": 0.4644, + "step": 30501 + }, + { + "epoch": 1.7080300145593013, + "grad_norm": 1.1639639139175415, + "learning_rate": 9.724000000000001e-05, + "loss": 0.4612, + "step": 30502 + }, + { + "epoch": 1.7080860118714303, + "grad_norm": 1.1637418270111084, + "learning_rate": 9.723973684210527e-05, + "loss": 0.3284, + "step": 30503 + }, + { + "epoch": 1.7081420091835593, + "grad_norm": 1.4097344875335693, + "learning_rate": 9.723947368421053e-05, + "loss": 0.5028, + "step": 30504 + }, + { + "epoch": 1.7081980064956883, + "grad_norm": 1.4470453262329102, + "learning_rate": 9.723921052631579e-05, + "loss": 0.4567, + "step": 30505 + }, + { + "epoch": 1.7082540038078173, + "grad_norm": 
1.3159265518188477, + "learning_rate": 9.723894736842106e-05, + "loss": 0.4034, + "step": 30506 + }, + { + "epoch": 1.7083100011199464, + "grad_norm": 2.2175371646881104, + "learning_rate": 9.723868421052632e-05, + "loss": 0.4138, + "step": 30507 + }, + { + "epoch": 1.7083659984320754, + "grad_norm": 1.3051400184631348, + "learning_rate": 9.723842105263158e-05, + "loss": 0.4557, + "step": 30508 + }, + { + "epoch": 1.7084219957442044, + "grad_norm": 1.4340587854385376, + "learning_rate": 9.723815789473684e-05, + "loss": 0.3551, + "step": 30509 + }, + { + "epoch": 1.7084779930563334, + "grad_norm": 1.2266170978546143, + "learning_rate": 9.72378947368421e-05, + "loss": 0.4332, + "step": 30510 + }, + { + "epoch": 1.7085339903684624, + "grad_norm": 1.3109291791915894, + "learning_rate": 9.723763157894738e-05, + "loss": 0.41, + "step": 30511 + }, + { + "epoch": 1.7085899876805914, + "grad_norm": 1.437774658203125, + "learning_rate": 9.723736842105264e-05, + "loss": 0.4451, + "step": 30512 + }, + { + "epoch": 1.7086459849927205, + "grad_norm": 1.1644784212112427, + "learning_rate": 9.72371052631579e-05, + "loss": 0.2779, + "step": 30513 + }, + { + "epoch": 1.7087019823048495, + "grad_norm": 1.1141005754470825, + "learning_rate": 9.723684210526315e-05, + "loss": 0.4475, + "step": 30514 + }, + { + "epoch": 1.7087579796169785, + "grad_norm": 1.309321403503418, + "learning_rate": 9.723657894736843e-05, + "loss": 0.3756, + "step": 30515 + }, + { + "epoch": 1.7088139769291075, + "grad_norm": 1.527395248413086, + "learning_rate": 9.723631578947369e-05, + "loss": 0.5072, + "step": 30516 + }, + { + "epoch": 1.7088699742412365, + "grad_norm": 1.4535458087921143, + "learning_rate": 9.723605263157896e-05, + "loss": 0.408, + "step": 30517 + }, + { + "epoch": 1.7089259715533656, + "grad_norm": 1.2554166316986084, + "learning_rate": 9.723578947368421e-05, + "loss": 0.5063, + "step": 30518 + }, + { + "epoch": 1.7089819688654946, + "grad_norm": 1.2566369771957397, + "learning_rate": 
9.723552631578948e-05, + "loss": 0.4132, + "step": 30519 + }, + { + "epoch": 1.7090379661776236, + "grad_norm": 1.10805344581604, + "learning_rate": 9.723526315789474e-05, + "loss": 0.4375, + "step": 30520 + }, + { + "epoch": 1.7090939634897526, + "grad_norm": 1.84591543674469, + "learning_rate": 9.723500000000001e-05, + "loss": 0.5896, + "step": 30521 + }, + { + "epoch": 1.7091499608018816, + "grad_norm": 1.3341203927993774, + "learning_rate": 9.723473684210527e-05, + "loss": 0.3631, + "step": 30522 + }, + { + "epoch": 1.7092059581140107, + "grad_norm": 1.2211353778839111, + "learning_rate": 9.723447368421053e-05, + "loss": 0.4661, + "step": 30523 + }, + { + "epoch": 1.7092619554261397, + "grad_norm": 1.682071566581726, + "learning_rate": 9.723421052631579e-05, + "loss": 0.5394, + "step": 30524 + }, + { + "epoch": 1.7093179527382687, + "grad_norm": 1.1894397735595703, + "learning_rate": 9.723394736842105e-05, + "loss": 0.3991, + "step": 30525 + }, + { + "epoch": 1.7093739500503977, + "grad_norm": 1.1776165962219238, + "learning_rate": 9.723368421052633e-05, + "loss": 0.3585, + "step": 30526 + }, + { + "epoch": 1.7094299473625267, + "grad_norm": 1.2605681419372559, + "learning_rate": 9.723342105263159e-05, + "loss": 0.485, + "step": 30527 + }, + { + "epoch": 1.7094859446746558, + "grad_norm": 1.5534437894821167, + "learning_rate": 9.723315789473685e-05, + "loss": 0.5367, + "step": 30528 + }, + { + "epoch": 1.7095419419867848, + "grad_norm": 1.427203893661499, + "learning_rate": 9.72328947368421e-05, + "loss": 0.4518, + "step": 30529 + }, + { + "epoch": 1.7095979392989138, + "grad_norm": 1.098892092704773, + "learning_rate": 9.723263157894738e-05, + "loss": 0.3718, + "step": 30530 + }, + { + "epoch": 1.7096539366110428, + "grad_norm": 1.3872413635253906, + "learning_rate": 9.723236842105264e-05, + "loss": 0.4475, + "step": 30531 + }, + { + "epoch": 1.7097099339231718, + "grad_norm": 1.3392170667648315, + "learning_rate": 9.72321052631579e-05, + "loss": 0.5314, + 
"step": 30532 + }, + { + "epoch": 1.7097659312353009, + "grad_norm": 1.3937572240829468, + "learning_rate": 9.723184210526316e-05, + "loss": 0.4211, + "step": 30533 + }, + { + "epoch": 1.7098219285474299, + "grad_norm": 1.1660832166671753, + "learning_rate": 9.723157894736843e-05, + "loss": 0.4424, + "step": 30534 + }, + { + "epoch": 1.709877925859559, + "grad_norm": 1.8650375604629517, + "learning_rate": 9.723131578947369e-05, + "loss": 0.4601, + "step": 30535 + }, + { + "epoch": 1.709933923171688, + "grad_norm": 1.4749939441680908, + "learning_rate": 9.723105263157895e-05, + "loss": 0.4636, + "step": 30536 + }, + { + "epoch": 1.709989920483817, + "grad_norm": 1.7622720003128052, + "learning_rate": 9.723078947368421e-05, + "loss": 0.5261, + "step": 30537 + }, + { + "epoch": 1.710045917795946, + "grad_norm": 1.921120047569275, + "learning_rate": 9.723052631578948e-05, + "loss": 0.4489, + "step": 30538 + }, + { + "epoch": 1.710101915108075, + "grad_norm": 1.7017356157302856, + "learning_rate": 9.723026315789474e-05, + "loss": 0.4543, + "step": 30539 + }, + { + "epoch": 1.710157912420204, + "grad_norm": 1.1812225580215454, + "learning_rate": 9.723000000000002e-05, + "loss": 0.3206, + "step": 30540 + }, + { + "epoch": 1.710213909732333, + "grad_norm": 1.267765760421753, + "learning_rate": 9.722973684210526e-05, + "loss": 0.4081, + "step": 30541 + }, + { + "epoch": 1.710269907044462, + "grad_norm": 5.9477667808532715, + "learning_rate": 9.722947368421052e-05, + "loss": 0.4174, + "step": 30542 + }, + { + "epoch": 1.710325904356591, + "grad_norm": 1.1593431234359741, + "learning_rate": 9.72292105263158e-05, + "loss": 0.4871, + "step": 30543 + }, + { + "epoch": 1.71038190166872, + "grad_norm": 1.4129647016525269, + "learning_rate": 9.722894736842105e-05, + "loss": 0.5077, + "step": 30544 + }, + { + "epoch": 1.710437898980849, + "grad_norm": 1.1454287767410278, + "learning_rate": 9.722868421052633e-05, + "loss": 0.596, + "step": 30545 + }, + { + "epoch": 1.710493896292978, 
+ "grad_norm": 1.4092313051223755, + "learning_rate": 9.722842105263157e-05, + "loss": 0.5816, + "step": 30546 + }, + { + "epoch": 1.7105498936051071, + "grad_norm": 1.3189082145690918, + "learning_rate": 9.722815789473685e-05, + "loss": 0.4882, + "step": 30547 + }, + { + "epoch": 1.7106058909172361, + "grad_norm": 1.5342934131622314, + "learning_rate": 9.722789473684211e-05, + "loss": 0.3852, + "step": 30548 + }, + { + "epoch": 1.710661888229365, + "grad_norm": 1.220430850982666, + "learning_rate": 9.722763157894738e-05, + "loss": 0.5128, + "step": 30549 + }, + { + "epoch": 1.710717885541494, + "grad_norm": 1.8775285482406616, + "learning_rate": 9.722736842105263e-05, + "loss": 0.5091, + "step": 30550 + }, + { + "epoch": 1.710773882853623, + "grad_norm": 1.3726714849472046, + "learning_rate": 9.72271052631579e-05, + "loss": 0.5295, + "step": 30551 + }, + { + "epoch": 1.710829880165752, + "grad_norm": 1.4516642093658447, + "learning_rate": 9.722684210526316e-05, + "loss": 0.4134, + "step": 30552 + }, + { + "epoch": 1.710885877477881, + "grad_norm": 1.3525302410125732, + "learning_rate": 9.722657894736843e-05, + "loss": 0.4878, + "step": 30553 + }, + { + "epoch": 1.71094187479001, + "grad_norm": 1.3447599411010742, + "learning_rate": 9.722631578947369e-05, + "loss": 0.565, + "step": 30554 + }, + { + "epoch": 1.710997872102139, + "grad_norm": 1.1941272020339966, + "learning_rate": 9.722605263157895e-05, + "loss": 0.3563, + "step": 30555 + }, + { + "epoch": 1.711053869414268, + "grad_norm": 1.3645598888397217, + "learning_rate": 9.722578947368421e-05, + "loss": 0.5775, + "step": 30556 + }, + { + "epoch": 1.711109866726397, + "grad_norm": 1.2135733366012573, + "learning_rate": 9.722552631578949e-05, + "loss": 0.4698, + "step": 30557 + }, + { + "epoch": 1.711165864038526, + "grad_norm": 1.188438057899475, + "learning_rate": 9.722526315789475e-05, + "loss": 0.3848, + "step": 30558 + }, + { + "epoch": 1.7112218613506551, + "grad_norm": 1.270774006843567, + 
"learning_rate": 9.7225e-05, + "loss": 0.4707, + "step": 30559 + }, + { + "epoch": 1.7112778586627841, + "grad_norm": 1.282051920890808, + "learning_rate": 9.722473684210526e-05, + "loss": 0.3673, + "step": 30560 + }, + { + "epoch": 1.7113338559749132, + "grad_norm": 1.2208714485168457, + "learning_rate": 9.722447368421052e-05, + "loss": 0.406, + "step": 30561 + }, + { + "epoch": 1.7113898532870422, + "grad_norm": 1.078134536743164, + "learning_rate": 9.72242105263158e-05, + "loss": 0.4927, + "step": 30562 + }, + { + "epoch": 1.7114458505991712, + "grad_norm": 7.428740978240967, + "learning_rate": 9.722394736842106e-05, + "loss": 0.4742, + "step": 30563 + }, + { + "epoch": 1.7115018479113002, + "grad_norm": 1.3124948740005493, + "learning_rate": 9.722368421052632e-05, + "loss": 0.4801, + "step": 30564 + }, + { + "epoch": 1.7115578452234292, + "grad_norm": 1.4366769790649414, + "learning_rate": 9.722342105263158e-05, + "loss": 0.4579, + "step": 30565 + }, + { + "epoch": 1.7116138425355583, + "grad_norm": 1.1832019090652466, + "learning_rate": 9.722315789473685e-05, + "loss": 0.4563, + "step": 30566 + }, + { + "epoch": 1.7116698398476873, + "grad_norm": 1.219072699546814, + "learning_rate": 9.722289473684211e-05, + "loss": 0.3572, + "step": 30567 + }, + { + "epoch": 1.7117258371598163, + "grad_norm": 1.352976679801941, + "learning_rate": 9.722263157894737e-05, + "loss": 0.4948, + "step": 30568 + }, + { + "epoch": 1.7117818344719453, + "grad_norm": 2.1166634559631348, + "learning_rate": 9.722236842105263e-05, + "loss": 0.3773, + "step": 30569 + }, + { + "epoch": 1.7118378317840743, + "grad_norm": 1.0951555967330933, + "learning_rate": 9.72221052631579e-05, + "loss": 0.4427, + "step": 30570 + }, + { + "epoch": 1.7118938290962034, + "grad_norm": 1.319106936454773, + "learning_rate": 9.722184210526316e-05, + "loss": 0.4357, + "step": 30571 + }, + { + "epoch": 1.7119498264083324, + "grad_norm": 1.6315269470214844, + "learning_rate": 9.722157894736844e-05, + "loss": 
0.4057, + "step": 30572 + }, + { + "epoch": 1.7120058237204614, + "grad_norm": 1.246069073677063, + "learning_rate": 9.722131578947368e-05, + "loss": 0.3893, + "step": 30573 + }, + { + "epoch": 1.7120618210325904, + "grad_norm": 1.2473217248916626, + "learning_rate": 9.722105263157896e-05, + "loss": 0.4203, + "step": 30574 + }, + { + "epoch": 1.7121178183447194, + "grad_norm": 1.3081737756729126, + "learning_rate": 9.722078947368421e-05, + "loss": 0.4838, + "step": 30575 + }, + { + "epoch": 1.7121738156568485, + "grad_norm": 1.5456491708755493, + "learning_rate": 9.722052631578947e-05, + "loss": 0.4271, + "step": 30576 + }, + { + "epoch": 1.7122298129689775, + "grad_norm": 1.4192774295806885, + "learning_rate": 9.722026315789475e-05, + "loss": 0.4473, + "step": 30577 + }, + { + "epoch": 1.7122858102811065, + "grad_norm": 1.259068489074707, + "learning_rate": 9.722e-05, + "loss": 0.4821, + "step": 30578 + }, + { + "epoch": 1.7123418075932355, + "grad_norm": 1.2192338705062866, + "learning_rate": 9.721973684210527e-05, + "loss": 0.4147, + "step": 30579 + }, + { + "epoch": 1.7123978049053645, + "grad_norm": 1.3094992637634277, + "learning_rate": 9.721947368421053e-05, + "loss": 0.4099, + "step": 30580 + }, + { + "epoch": 1.7124538022174935, + "grad_norm": 1.2361643314361572, + "learning_rate": 9.72192105263158e-05, + "loss": 0.4245, + "step": 30581 + }, + { + "epoch": 1.7125097995296226, + "grad_norm": 1.2182124853134155, + "learning_rate": 9.721894736842106e-05, + "loss": 0.5122, + "step": 30582 + }, + { + "epoch": 1.7125657968417516, + "grad_norm": 1.3774763345718384, + "learning_rate": 9.721868421052632e-05, + "loss": 0.4316, + "step": 30583 + }, + { + "epoch": 1.7126217941538806, + "grad_norm": 1.2150533199310303, + "learning_rate": 9.721842105263158e-05, + "loss": 0.4453, + "step": 30584 + }, + { + "epoch": 1.7126777914660096, + "grad_norm": 1.3658932447433472, + "learning_rate": 9.721815789473685e-05, + "loss": 0.5215, + "step": 30585 + }, + { + "epoch": 
1.7127337887781386, + "grad_norm": 1.7840657234191895, + "learning_rate": 9.721789473684211e-05, + "loss": 0.4515, + "step": 30586 + }, + { + "epoch": 1.7127897860902677, + "grad_norm": 12.474908828735352, + "learning_rate": 9.721763157894737e-05, + "loss": 0.397, + "step": 30587 + }, + { + "epoch": 1.7128457834023967, + "grad_norm": 1.279429316520691, + "learning_rate": 9.721736842105263e-05, + "loss": 0.4489, + "step": 30588 + }, + { + "epoch": 1.7129017807145257, + "grad_norm": 1.2631796598434448, + "learning_rate": 9.72171052631579e-05, + "loss": 0.4396, + "step": 30589 + }, + { + "epoch": 1.7129577780266547, + "grad_norm": 1.2091093063354492, + "learning_rate": 9.721684210526317e-05, + "loss": 0.4535, + "step": 30590 + }, + { + "epoch": 1.7130137753387837, + "grad_norm": 1.2190179824829102, + "learning_rate": 9.721657894736842e-05, + "loss": 0.3891, + "step": 30591 + }, + { + "epoch": 1.7130697726509128, + "grad_norm": 4.941834926605225, + "learning_rate": 9.721631578947368e-05, + "loss": 0.5027, + "step": 30592 + }, + { + "epoch": 1.7131257699630418, + "grad_norm": 1.272477388381958, + "learning_rate": 9.721605263157894e-05, + "loss": 0.4147, + "step": 30593 + }, + { + "epoch": 1.7131817672751708, + "grad_norm": 1.1352440118789673, + "learning_rate": 9.721578947368422e-05, + "loss": 0.4997, + "step": 30594 + }, + { + "epoch": 1.7132377645872998, + "grad_norm": 1.238242506980896, + "learning_rate": 9.721552631578948e-05, + "loss": 0.5159, + "step": 30595 + }, + { + "epoch": 1.7132937618994288, + "grad_norm": 1.2218987941741943, + "learning_rate": 9.721526315789474e-05, + "loss": 0.5147, + "step": 30596 + }, + { + "epoch": 1.7133497592115579, + "grad_norm": 1.0865470170974731, + "learning_rate": 9.7215e-05, + "loss": 0.3208, + "step": 30597 + }, + { + "epoch": 1.7134057565236869, + "grad_norm": 1.231279969215393, + "learning_rate": 9.721473684210527e-05, + "loss": 0.5322, + "step": 30598 + }, + { + "epoch": 1.713461753835816, + "grad_norm": 1.792553186416626, + 
"learning_rate": 9.721447368421053e-05, + "loss": 0.6053, + "step": 30599 + }, + { + "epoch": 1.713517751147945, + "grad_norm": 16.398744583129883, + "learning_rate": 9.72142105263158e-05, + "loss": 0.6179, + "step": 30600 + }, + { + "epoch": 1.713573748460074, + "grad_norm": 1.4515624046325684, + "learning_rate": 9.721394736842105e-05, + "loss": 0.4621, + "step": 30601 + }, + { + "epoch": 1.713629745772203, + "grad_norm": 1.3135499954223633, + "learning_rate": 9.721368421052632e-05, + "loss": 0.4713, + "step": 30602 + }, + { + "epoch": 1.713685743084332, + "grad_norm": 1.1159043312072754, + "learning_rate": 9.721342105263158e-05, + "loss": 0.3756, + "step": 30603 + }, + { + "epoch": 1.713741740396461, + "grad_norm": 1.162896752357483, + "learning_rate": 9.721315789473686e-05, + "loss": 0.3915, + "step": 30604 + }, + { + "epoch": 1.71379773770859, + "grad_norm": 1.2234514951705933, + "learning_rate": 9.72128947368421e-05, + "loss": 0.426, + "step": 30605 + }, + { + "epoch": 1.713853735020719, + "grad_norm": 1.1834098100662231, + "learning_rate": 9.721263157894737e-05, + "loss": 0.4264, + "step": 30606 + }, + { + "epoch": 1.713909732332848, + "grad_norm": 1.1783487796783447, + "learning_rate": 9.721236842105263e-05, + "loss": 0.3993, + "step": 30607 + }, + { + "epoch": 1.713965729644977, + "grad_norm": 1.6386569738388062, + "learning_rate": 9.721210526315791e-05, + "loss": 0.5254, + "step": 30608 + }, + { + "epoch": 1.714021726957106, + "grad_norm": 1.2830673456192017, + "learning_rate": 9.721184210526317e-05, + "loss": 0.3077, + "step": 30609 + }, + { + "epoch": 1.714077724269235, + "grad_norm": 1.170202612876892, + "learning_rate": 9.721157894736841e-05, + "loss": 0.3864, + "step": 30610 + }, + { + "epoch": 1.7141337215813641, + "grad_norm": 2.121738910675049, + "learning_rate": 9.721131578947369e-05, + "loss": 0.5497, + "step": 30611 + }, + { + "epoch": 1.7141897188934931, + "grad_norm": 1.3131927251815796, + "learning_rate": 9.721105263157895e-05, + "loss": 
0.461, + "step": 30612 + }, + { + "epoch": 1.7142457162056222, + "grad_norm": 1.1239851713180542, + "learning_rate": 9.721078947368422e-05, + "loss": 0.4226, + "step": 30613 + }, + { + "epoch": 1.7143017135177512, + "grad_norm": 1.1671003103256226, + "learning_rate": 9.721052631578948e-05, + "loss": 0.5343, + "step": 30614 + }, + { + "epoch": 1.7143577108298802, + "grad_norm": 1.367677927017212, + "learning_rate": 9.721026315789474e-05, + "loss": 0.6234, + "step": 30615 + }, + { + "epoch": 1.7144137081420092, + "grad_norm": 1.6681021451950073, + "learning_rate": 9.721e-05, + "loss": 0.3351, + "step": 30616 + }, + { + "epoch": 1.7144697054541382, + "grad_norm": 1.8264193534851074, + "learning_rate": 9.720973684210527e-05, + "loss": 0.5302, + "step": 30617 + }, + { + "epoch": 1.7145257027662673, + "grad_norm": 1.5470703840255737, + "learning_rate": 9.720947368421053e-05, + "loss": 0.594, + "step": 30618 + }, + { + "epoch": 1.7145817000783963, + "grad_norm": 1.2262579202651978, + "learning_rate": 9.720921052631579e-05, + "loss": 0.3708, + "step": 30619 + }, + { + "epoch": 1.7146376973905253, + "grad_norm": 1.1791127920150757, + "learning_rate": 9.720894736842105e-05, + "loss": 0.4498, + "step": 30620 + }, + { + "epoch": 1.7146936947026543, + "grad_norm": 1.3748281002044678, + "learning_rate": 9.720868421052633e-05, + "loss": 0.4342, + "step": 30621 + }, + { + "epoch": 1.7147496920147833, + "grad_norm": 1.1307182312011719, + "learning_rate": 9.720842105263158e-05, + "loss": 0.4498, + "step": 30622 + }, + { + "epoch": 1.7148056893269124, + "grad_norm": 1.339543104171753, + "learning_rate": 9.720815789473684e-05, + "loss": 0.5279, + "step": 30623 + }, + { + "epoch": 1.7148616866390414, + "grad_norm": 1.3946506977081299, + "learning_rate": 9.72078947368421e-05, + "loss": 0.5304, + "step": 30624 + }, + { + "epoch": 1.7149176839511704, + "grad_norm": 2.007319688796997, + "learning_rate": 9.720763157894738e-05, + "loss": 0.4803, + "step": 30625 + }, + { + "epoch": 
1.7149736812632994, + "grad_norm": 1.4103554487228394, + "learning_rate": 9.720736842105264e-05, + "loss": 0.4206, + "step": 30626 + }, + { + "epoch": 1.7150296785754284, + "grad_norm": 1.0439146757125854, + "learning_rate": 9.720710526315791e-05, + "loss": 0.5129, + "step": 30627 + }, + { + "epoch": 1.7150856758875574, + "grad_norm": 1.2654073238372803, + "learning_rate": 9.720684210526316e-05, + "loss": 0.4236, + "step": 30628 + }, + { + "epoch": 1.7151416731996865, + "grad_norm": 1.275828242301941, + "learning_rate": 9.720657894736842e-05, + "loss": 0.4752, + "step": 30629 + }, + { + "epoch": 1.7151976705118155, + "grad_norm": 1.4743921756744385, + "learning_rate": 9.720631578947369e-05, + "loss": 0.4481, + "step": 30630 + }, + { + "epoch": 1.7152536678239445, + "grad_norm": 1.5907094478607178, + "learning_rate": 9.720605263157895e-05, + "loss": 0.4339, + "step": 30631 + }, + { + "epoch": 1.7153096651360733, + "grad_norm": 2.2020251750946045, + "learning_rate": 9.720578947368422e-05, + "loss": 0.4501, + "step": 30632 + }, + { + "epoch": 1.7153656624482023, + "grad_norm": 1.5741947889328003, + "learning_rate": 9.720552631578947e-05, + "loss": 0.5433, + "step": 30633 + }, + { + "epoch": 1.7154216597603313, + "grad_norm": 1.4771791696548462, + "learning_rate": 9.720526315789474e-05, + "loss": 0.4377, + "step": 30634 + }, + { + "epoch": 1.7154776570724604, + "grad_norm": 1.3025338649749756, + "learning_rate": 9.7205e-05, + "loss": 0.4054, + "step": 30635 + }, + { + "epoch": 1.7155336543845894, + "grad_norm": 1.1977874040603638, + "learning_rate": 9.720473684210528e-05, + "loss": 0.4227, + "step": 30636 + }, + { + "epoch": 1.7155896516967184, + "grad_norm": 1.209682583808899, + "learning_rate": 9.720447368421053e-05, + "loss": 0.3793, + "step": 30637 + }, + { + "epoch": 1.7156456490088474, + "grad_norm": 1.4564845561981201, + "learning_rate": 9.72042105263158e-05, + "loss": 0.4768, + "step": 30638 + }, + { + "epoch": 1.7157016463209764, + "grad_norm": 
1.0919175148010254, + "learning_rate": 9.720394736842105e-05, + "loss": 0.4748, + "step": 30639 + }, + { + "epoch": 1.7157576436331055, + "grad_norm": 1.1943644285202026, + "learning_rate": 9.720368421052633e-05, + "loss": 0.3998, + "step": 30640 + }, + { + "epoch": 1.7158136409452345, + "grad_norm": 1.596779704093933, + "learning_rate": 9.720342105263159e-05, + "loss": 0.4699, + "step": 30641 + }, + { + "epoch": 1.7158696382573635, + "grad_norm": 1.8138458728790283, + "learning_rate": 9.720315789473685e-05, + "loss": 0.6418, + "step": 30642 + }, + { + "epoch": 1.7159256355694925, + "grad_norm": 1.606316089630127, + "learning_rate": 9.720289473684211e-05, + "loss": 0.602, + "step": 30643 + }, + { + "epoch": 1.7159816328816215, + "grad_norm": 1.4772661924362183, + "learning_rate": 9.720263157894738e-05, + "loss": 0.4297, + "step": 30644 + }, + { + "epoch": 1.7160376301937506, + "grad_norm": 2.572352409362793, + "learning_rate": 9.720236842105264e-05, + "loss": 0.3914, + "step": 30645 + }, + { + "epoch": 1.7160936275058796, + "grad_norm": 1.6091371774673462, + "learning_rate": 9.72021052631579e-05, + "loss": 0.3716, + "step": 30646 + }, + { + "epoch": 1.7161496248180086, + "grad_norm": 1.4084526300430298, + "learning_rate": 9.720184210526316e-05, + "loss": 0.4196, + "step": 30647 + }, + { + "epoch": 1.7162056221301376, + "grad_norm": 1.1943351030349731, + "learning_rate": 9.720157894736842e-05, + "loss": 0.3734, + "step": 30648 + }, + { + "epoch": 1.7162616194422666, + "grad_norm": 1.3019176721572876, + "learning_rate": 9.720131578947369e-05, + "loss": 0.5291, + "step": 30649 + }, + { + "epoch": 1.7163176167543956, + "grad_norm": 1.2358137369155884, + "learning_rate": 9.720105263157895e-05, + "loss": 0.4952, + "step": 30650 + }, + { + "epoch": 1.7163736140665247, + "grad_norm": 1.4986878633499146, + "learning_rate": 9.720078947368421e-05, + "loss": 0.6412, + "step": 30651 + }, + { + "epoch": 1.7164296113786537, + "grad_norm": 1.4451597929000854, + "learning_rate": 
9.720052631578947e-05, + "loss": 0.479, + "step": 30652 + }, + { + "epoch": 1.7164856086907827, + "grad_norm": 1.2005128860473633, + "learning_rate": 9.720026315789474e-05, + "loss": 0.5046, + "step": 30653 + }, + { + "epoch": 1.7165416060029117, + "grad_norm": 1.1480522155761719, + "learning_rate": 9.72e-05, + "loss": 0.489, + "step": 30654 + }, + { + "epoch": 1.7165976033150407, + "grad_norm": 1.278332233428955, + "learning_rate": 9.719973684210528e-05, + "loss": 0.5082, + "step": 30655 + }, + { + "epoch": 1.7166536006271698, + "grad_norm": 1.2766473293304443, + "learning_rate": 9.719947368421052e-05, + "loss": 0.4704, + "step": 30656 + }, + { + "epoch": 1.7167095979392988, + "grad_norm": 1.597607135772705, + "learning_rate": 9.71992105263158e-05, + "loss": 0.4532, + "step": 30657 + }, + { + "epoch": 1.7167655952514278, + "grad_norm": 1.3771840333938599, + "learning_rate": 9.719894736842106e-05, + "loss": 0.4287, + "step": 30658 + }, + { + "epoch": 1.7168215925635568, + "grad_norm": 1.1253502368927002, + "learning_rate": 9.719868421052633e-05, + "loss": 0.3985, + "step": 30659 + }, + { + "epoch": 1.7168775898756858, + "grad_norm": 1.0468671321868896, + "learning_rate": 9.719842105263158e-05, + "loss": 0.3813, + "step": 30660 + }, + { + "epoch": 1.7169335871878149, + "grad_norm": 1.1901935338974, + "learning_rate": 9.719815789473685e-05, + "loss": 0.4057, + "step": 30661 + }, + { + "epoch": 1.7169895844999439, + "grad_norm": 1.9568490982055664, + "learning_rate": 9.719789473684211e-05, + "loss": 0.3452, + "step": 30662 + }, + { + "epoch": 1.717045581812073, + "grad_norm": 1.33143150806427, + "learning_rate": 9.719763157894737e-05, + "loss": 0.5552, + "step": 30663 + }, + { + "epoch": 1.717101579124202, + "grad_norm": 1.3214884996414185, + "learning_rate": 9.719736842105264e-05, + "loss": 0.4655, + "step": 30664 + }, + { + "epoch": 1.717157576436331, + "grad_norm": 1.1933867931365967, + "learning_rate": 9.719710526315789e-05, + "loss": 0.4986, + "step": 30665 + }, 
+ { + "epoch": 1.71721357374846, + "grad_norm": 1.3275268077850342, + "learning_rate": 9.719684210526316e-05, + "loss": 0.3995, + "step": 30666 + }, + { + "epoch": 1.717269571060589, + "grad_norm": 1.329003930091858, + "learning_rate": 9.719657894736842e-05, + "loss": 0.4352, + "step": 30667 + }, + { + "epoch": 1.717325568372718, + "grad_norm": 1.2409541606903076, + "learning_rate": 9.71963157894737e-05, + "loss": 0.4716, + "step": 30668 + }, + { + "epoch": 1.717381565684847, + "grad_norm": 1.4531842470169067, + "learning_rate": 9.719605263157895e-05, + "loss": 0.4473, + "step": 30669 + }, + { + "epoch": 1.717437562996976, + "grad_norm": 1.2469524145126343, + "learning_rate": 9.719578947368421e-05, + "loss": 0.47, + "step": 30670 + }, + { + "epoch": 1.717493560309105, + "grad_norm": 1.8494130373001099, + "learning_rate": 9.719552631578947e-05, + "loss": 0.42, + "step": 30671 + }, + { + "epoch": 1.717549557621234, + "grad_norm": 1.1138805150985718, + "learning_rate": 9.719526315789475e-05, + "loss": 0.4293, + "step": 30672 + }, + { + "epoch": 1.717605554933363, + "grad_norm": 1.6776938438415527, + "learning_rate": 9.719500000000001e-05, + "loss": 0.4414, + "step": 30673 + }, + { + "epoch": 1.717661552245492, + "grad_norm": 1.2268189191818237, + "learning_rate": 9.719473684210527e-05, + "loss": 0.4444, + "step": 30674 + }, + { + "epoch": 1.7177175495576211, + "grad_norm": 1.4933257102966309, + "learning_rate": 9.719447368421053e-05, + "loss": 0.5096, + "step": 30675 + }, + { + "epoch": 1.7177735468697501, + "grad_norm": 1.1103745698928833, + "learning_rate": 9.71942105263158e-05, + "loss": 0.3784, + "step": 30676 + }, + { + "epoch": 1.7178295441818792, + "grad_norm": 1.309709072113037, + "learning_rate": 9.719394736842106e-05, + "loss": 0.5753, + "step": 30677 + }, + { + "epoch": 1.7178855414940082, + "grad_norm": 1.2489408254623413, + "learning_rate": 9.719368421052632e-05, + "loss": 0.3883, + "step": 30678 + }, + { + "epoch": 1.7179415388061372, + "grad_norm": 
1.180154800415039, + "learning_rate": 9.719342105263158e-05, + "loss": 0.3999, + "step": 30679 + }, + { + "epoch": 1.7179975361182662, + "grad_norm": 1.634352445602417, + "learning_rate": 9.719315789473684e-05, + "loss": 0.4748, + "step": 30680 + }, + { + "epoch": 1.7180535334303952, + "grad_norm": 1.1118837594985962, + "learning_rate": 9.719289473684211e-05, + "loss": 0.4365, + "step": 30681 + }, + { + "epoch": 1.7181095307425243, + "grad_norm": 1.3902416229248047, + "learning_rate": 9.719263157894737e-05, + "loss": 0.5039, + "step": 30682 + }, + { + "epoch": 1.7181655280546533, + "grad_norm": 1.4275425672531128, + "learning_rate": 9.719236842105263e-05, + "loss": 0.4861, + "step": 30683 + }, + { + "epoch": 1.7182215253667823, + "grad_norm": 1.4224351644515991, + "learning_rate": 9.719210526315789e-05, + "loss": 0.583, + "step": 30684 + }, + { + "epoch": 1.7182775226789113, + "grad_norm": 1.3549447059631348, + "learning_rate": 9.719184210526316e-05, + "loss": 0.4253, + "step": 30685 + }, + { + "epoch": 1.7183335199910403, + "grad_norm": 1.197293758392334, + "learning_rate": 9.719157894736842e-05, + "loss": 0.3436, + "step": 30686 + }, + { + "epoch": 1.7183895173031694, + "grad_norm": 1.121922254562378, + "learning_rate": 9.71913157894737e-05, + "loss": 0.426, + "step": 30687 + }, + { + "epoch": 1.7184455146152984, + "grad_norm": 1.2227044105529785, + "learning_rate": 9.719105263157894e-05, + "loss": 0.433, + "step": 30688 + }, + { + "epoch": 1.7185015119274274, + "grad_norm": 1.234965443611145, + "learning_rate": 9.719078947368422e-05, + "loss": 0.4819, + "step": 30689 + }, + { + "epoch": 1.7185575092395564, + "grad_norm": 1.185038685798645, + "learning_rate": 9.719052631578948e-05, + "loss": 0.4845, + "step": 30690 + }, + { + "epoch": 1.7186135065516854, + "grad_norm": 1.0895192623138428, + "learning_rate": 9.719026315789475e-05, + "loss": 0.416, + "step": 30691 + }, + { + "epoch": 1.7186695038638145, + "grad_norm": 1.145519733428955, + "learning_rate": 
9.719000000000001e-05, + "loss": 0.5857, + "step": 30692 + }, + { + "epoch": 1.7187255011759435, + "grad_norm": 1.2973244190216064, + "learning_rate": 9.718973684210527e-05, + "loss": 0.5575, + "step": 30693 + }, + { + "epoch": 1.7187814984880725, + "grad_norm": 1.195210337638855, + "learning_rate": 9.718947368421053e-05, + "loss": 0.3612, + "step": 30694 + }, + { + "epoch": 1.7188374958002015, + "grad_norm": 1.2516976594924927, + "learning_rate": 9.71892105263158e-05, + "loss": 0.5291, + "step": 30695 + }, + { + "epoch": 1.7188934931123305, + "grad_norm": 1.301780104637146, + "learning_rate": 9.718894736842106e-05, + "loss": 0.4843, + "step": 30696 + }, + { + "epoch": 1.7189494904244595, + "grad_norm": 1.1459444761276245, + "learning_rate": 9.718868421052632e-05, + "loss": 0.479, + "step": 30697 + }, + { + "epoch": 1.7190054877365886, + "grad_norm": 1.6155439615249634, + "learning_rate": 9.718842105263158e-05, + "loss": 0.5189, + "step": 30698 + }, + { + "epoch": 1.7190614850487176, + "grad_norm": 1.2038792371749878, + "learning_rate": 9.718815789473684e-05, + "loss": 0.3827, + "step": 30699 + }, + { + "epoch": 1.7191174823608466, + "grad_norm": 1.0903184413909912, + "learning_rate": 9.718789473684211e-05, + "loss": 0.3966, + "step": 30700 + }, + { + "epoch": 1.7191734796729756, + "grad_norm": 1.5096611976623535, + "learning_rate": 9.718763157894737e-05, + "loss": 0.4004, + "step": 30701 + }, + { + "epoch": 1.7192294769851046, + "grad_norm": 1.1890404224395752, + "learning_rate": 9.718736842105263e-05, + "loss": 0.5064, + "step": 30702 + }, + { + "epoch": 1.7192854742972337, + "grad_norm": 1.2429345846176147, + "learning_rate": 9.71871052631579e-05, + "loss": 0.5223, + "step": 30703 + }, + { + "epoch": 1.7193414716093627, + "grad_norm": 1.2582592964172363, + "learning_rate": 9.718684210526317e-05, + "loss": 0.5147, + "step": 30704 + }, + { + "epoch": 1.7193974689214917, + "grad_norm": 1.1780757904052734, + "learning_rate": 9.718657894736843e-05, + "loss": 0.4832, 
+ "step": 30705 + }, + { + "epoch": 1.7194534662336207, + "grad_norm": 1.1705541610717773, + "learning_rate": 9.718631578947369e-05, + "loss": 0.4167, + "step": 30706 + }, + { + "epoch": 1.7195094635457497, + "grad_norm": 1.7118525505065918, + "learning_rate": 9.718605263157895e-05, + "loss": 0.5343, + "step": 30707 + }, + { + "epoch": 1.7195654608578788, + "grad_norm": 1.2762616872787476, + "learning_rate": 9.718578947368422e-05, + "loss": 0.441, + "step": 30708 + }, + { + "epoch": 1.7196214581700078, + "grad_norm": 1.195202350616455, + "learning_rate": 9.718552631578948e-05, + "loss": 0.5421, + "step": 30709 + }, + { + "epoch": 1.7196774554821368, + "grad_norm": 1.2236769199371338, + "learning_rate": 9.718526315789475e-05, + "loss": 0.4745, + "step": 30710 + }, + { + "epoch": 1.7197334527942658, + "grad_norm": 1.5898139476776123, + "learning_rate": 9.7185e-05, + "loss": 0.5193, + "step": 30711 + }, + { + "epoch": 1.7197894501063948, + "grad_norm": 1.3720322847366333, + "learning_rate": 9.718473684210527e-05, + "loss": 0.4336, + "step": 30712 + }, + { + "epoch": 1.7198454474185239, + "grad_norm": 1.2265517711639404, + "learning_rate": 9.718447368421053e-05, + "loss": 0.443, + "step": 30713 + }, + { + "epoch": 1.7199014447306529, + "grad_norm": 1.514292597770691, + "learning_rate": 9.718421052631579e-05, + "loss": 0.4443, + "step": 30714 + }, + { + "epoch": 1.719957442042782, + "grad_norm": 1.1352652311325073, + "learning_rate": 9.718394736842105e-05, + "loss": 0.3963, + "step": 30715 + }, + { + "epoch": 1.720013439354911, + "grad_norm": 1.0488125085830688, + "learning_rate": 9.718368421052631e-05, + "loss": 0.4215, + "step": 30716 + }, + { + "epoch": 1.72006943666704, + "grad_norm": 1.1707981824874878, + "learning_rate": 9.718342105263158e-05, + "loss": 0.4291, + "step": 30717 + }, + { + "epoch": 1.720125433979169, + "grad_norm": 1.696366786956787, + "learning_rate": 9.718315789473684e-05, + "loss": 0.5962, + "step": 30718 + }, + { + "epoch": 1.720181431291298, + 
"grad_norm": 1.5346957445144653, + "learning_rate": 9.718289473684212e-05, + "loss": 0.5974, + "step": 30719 + }, + { + "epoch": 1.720237428603427, + "grad_norm": 1.9283950328826904, + "learning_rate": 9.718263157894736e-05, + "loss": 0.4745, + "step": 30720 + }, + { + "epoch": 1.720293425915556, + "grad_norm": 1.3071030378341675, + "learning_rate": 9.718236842105264e-05, + "loss": 0.5455, + "step": 30721 + }, + { + "epoch": 1.720349423227685, + "grad_norm": 1.199967384338379, + "learning_rate": 9.71821052631579e-05, + "loss": 0.4828, + "step": 30722 + }, + { + "epoch": 1.720405420539814, + "grad_norm": 1.2231509685516357, + "learning_rate": 9.718184210526317e-05, + "loss": 0.4276, + "step": 30723 + }, + { + "epoch": 1.720461417851943, + "grad_norm": 1.6901391744613647, + "learning_rate": 9.718157894736843e-05, + "loss": 0.5318, + "step": 30724 + }, + { + "epoch": 1.720517415164072, + "grad_norm": 1.1251397132873535, + "learning_rate": 9.718131578947369e-05, + "loss": 0.4106, + "step": 30725 + }, + { + "epoch": 1.720573412476201, + "grad_norm": 1.2451826333999634, + "learning_rate": 9.718105263157895e-05, + "loss": 0.4808, + "step": 30726 + }, + { + "epoch": 1.7206294097883301, + "grad_norm": 1.2075666189193726, + "learning_rate": 9.718078947368422e-05, + "loss": 0.4009, + "step": 30727 + }, + { + "epoch": 1.7206854071004591, + "grad_norm": 1.3249578475952148, + "learning_rate": 9.718052631578948e-05, + "loss": 0.4715, + "step": 30728 + }, + { + "epoch": 1.7207414044125882, + "grad_norm": 1.2314748764038086, + "learning_rate": 9.718026315789474e-05, + "loss": 0.6087, + "step": 30729 + }, + { + "epoch": 1.7207974017247172, + "grad_norm": 1.3579349517822266, + "learning_rate": 9.718e-05, + "loss": 0.5412, + "step": 30730 + }, + { + "epoch": 1.7208533990368462, + "grad_norm": 1.256733775138855, + "learning_rate": 9.717973684210526e-05, + "loss": 0.4667, + "step": 30731 + }, + { + "epoch": 1.7209093963489752, + "grad_norm": 1.2888498306274414, + "learning_rate": 
9.717947368421053e-05, + "loss": 0.4215, + "step": 30732 + }, + { + "epoch": 1.7209653936611042, + "grad_norm": 1.2255321741104126, + "learning_rate": 9.71792105263158e-05, + "loss": 0.4204, + "step": 30733 + }, + { + "epoch": 1.7210213909732333, + "grad_norm": 1.2704070806503296, + "learning_rate": 9.717894736842105e-05, + "loss": 0.4192, + "step": 30734 + }, + { + "epoch": 1.7210773882853623, + "grad_norm": 1.1680527925491333, + "learning_rate": 9.717868421052631e-05, + "loss": 0.5187, + "step": 30735 + }, + { + "epoch": 1.7211333855974913, + "grad_norm": 1.2011053562164307, + "learning_rate": 9.717842105263159e-05, + "loss": 0.5848, + "step": 30736 + }, + { + "epoch": 1.7211893829096203, + "grad_norm": 1.4032227993011475, + "learning_rate": 9.717815789473685e-05, + "loss": 0.4313, + "step": 30737 + }, + { + "epoch": 1.7212453802217493, + "grad_norm": 1.2838280200958252, + "learning_rate": 9.71778947368421e-05, + "loss": 0.5346, + "step": 30738 + }, + { + "epoch": 1.7213013775338784, + "grad_norm": 1.3266584873199463, + "learning_rate": 9.717763157894737e-05, + "loss": 0.4865, + "step": 30739 + }, + { + "epoch": 1.7213573748460074, + "grad_norm": 1.2213317155838013, + "learning_rate": 9.717736842105264e-05, + "loss": 0.5428, + "step": 30740 + }, + { + "epoch": 1.7214133721581364, + "grad_norm": 1.1680879592895508, + "learning_rate": 9.71771052631579e-05, + "loss": 0.4599, + "step": 30741 + }, + { + "epoch": 1.7214693694702654, + "grad_norm": 1.1384347677230835, + "learning_rate": 9.717684210526317e-05, + "loss": 0.3859, + "step": 30742 + }, + { + "epoch": 1.7215253667823944, + "grad_norm": 1.2647172212600708, + "learning_rate": 9.717657894736842e-05, + "loss": 0.4213, + "step": 30743 + }, + { + "epoch": 1.7215813640945234, + "grad_norm": 1.1774641275405884, + "learning_rate": 9.717631578947369e-05, + "loss": 0.4956, + "step": 30744 + }, + { + "epoch": 1.7216373614066525, + "grad_norm": 1.4863076210021973, + "learning_rate": 9.717605263157895e-05, + "loss": 
0.4511, + "step": 30745 + }, + { + "epoch": 1.7216933587187815, + "grad_norm": 1.4243805408477783, + "learning_rate": 9.717578947368422e-05, + "loss": 0.4707, + "step": 30746 + }, + { + "epoch": 1.7217493560309105, + "grad_norm": 1.3286114931106567, + "learning_rate": 9.717552631578948e-05, + "loss": 0.4154, + "step": 30747 + }, + { + "epoch": 1.7218053533430395, + "grad_norm": 1.3599361181259155, + "learning_rate": 9.717526315789474e-05, + "loss": 0.5631, + "step": 30748 + }, + { + "epoch": 1.7218613506551685, + "grad_norm": 1.6556535959243774, + "learning_rate": 9.7175e-05, + "loss": 0.4746, + "step": 30749 + }, + { + "epoch": 1.7219173479672976, + "grad_norm": 1.2787960767745972, + "learning_rate": 9.717473684210526e-05, + "loss": 0.4807, + "step": 30750 + }, + { + "epoch": 1.7219733452794266, + "grad_norm": 1.2840867042541504, + "learning_rate": 9.717447368421054e-05, + "loss": 0.3179, + "step": 30751 + }, + { + "epoch": 1.7220293425915556, + "grad_norm": 1.2753773927688599, + "learning_rate": 9.717421052631578e-05, + "loss": 0.389, + "step": 30752 + }, + { + "epoch": 1.7220853399036846, + "grad_norm": 1.3589175939559937, + "learning_rate": 9.717394736842106e-05, + "loss": 0.6011, + "step": 30753 + }, + { + "epoch": 1.7221413372158136, + "grad_norm": 1.447645902633667, + "learning_rate": 9.717368421052632e-05, + "loss": 0.3273, + "step": 30754 + }, + { + "epoch": 1.7221973345279427, + "grad_norm": 1.3061238527297974, + "learning_rate": 9.717342105263159e-05, + "loss": 0.4034, + "step": 30755 + }, + { + "epoch": 1.7222533318400717, + "grad_norm": 1.231913685798645, + "learning_rate": 9.717315789473685e-05, + "loss": 0.4542, + "step": 30756 + }, + { + "epoch": 1.7223093291522007, + "grad_norm": 1.2924394607543945, + "learning_rate": 9.717289473684211e-05, + "loss": 0.5165, + "step": 30757 + }, + { + "epoch": 1.7223653264643297, + "grad_norm": 1.3542739152908325, + "learning_rate": 9.717263157894737e-05, + "loss": 0.3954, + "step": 30758 + }, + { + "epoch": 
1.7224213237764587, + "grad_norm": 1.2190207242965698, + "learning_rate": 9.717236842105264e-05, + "loss": 0.3373, + "step": 30759 + }, + { + "epoch": 1.7224773210885878, + "grad_norm": 1.327471375465393, + "learning_rate": 9.71721052631579e-05, + "loss": 0.5234, + "step": 30760 + }, + { + "epoch": 1.7225333184007168, + "grad_norm": 1.2556498050689697, + "learning_rate": 9.717184210526316e-05, + "loss": 0.3365, + "step": 30761 + }, + { + "epoch": 1.7225893157128458, + "grad_norm": 1.273229956626892, + "learning_rate": 9.717157894736842e-05, + "loss": 0.4334, + "step": 30762 + }, + { + "epoch": 1.7226453130249748, + "grad_norm": 1.2415874004364014, + "learning_rate": 9.71713157894737e-05, + "loss": 0.3701, + "step": 30763 + }, + { + "epoch": 1.7227013103371038, + "grad_norm": 1.1257222890853882, + "learning_rate": 9.717105263157895e-05, + "loss": 0.4524, + "step": 30764 + }, + { + "epoch": 1.7227573076492328, + "grad_norm": 1.111696720123291, + "learning_rate": 9.717078947368423e-05, + "loss": 0.4103, + "step": 30765 + }, + { + "epoch": 1.7228133049613619, + "grad_norm": 1.157478928565979, + "learning_rate": 9.717052631578947e-05, + "loss": 0.4071, + "step": 30766 + }, + { + "epoch": 1.7228693022734909, + "grad_norm": 1.3755488395690918, + "learning_rate": 9.717026315789473e-05, + "loss": 0.441, + "step": 30767 + }, + { + "epoch": 1.72292529958562, + "grad_norm": 1.4772863388061523, + "learning_rate": 9.717e-05, + "loss": 0.5367, + "step": 30768 + }, + { + "epoch": 1.722981296897749, + "grad_norm": 1.3974822759628296, + "learning_rate": 9.716973684210527e-05, + "loss": 0.4845, + "step": 30769 + }, + { + "epoch": 1.723037294209878, + "grad_norm": 1.4198808670043945, + "learning_rate": 9.716947368421053e-05, + "loss": 0.4812, + "step": 30770 + }, + { + "epoch": 1.723093291522007, + "grad_norm": 1.354041337966919, + "learning_rate": 9.716921052631579e-05, + "loss": 0.4878, + "step": 30771 + }, + { + "epoch": 1.723149288834136, + "grad_norm": 1.297965407371521, + 
"learning_rate": 9.716894736842106e-05, + "loss": 0.411, + "step": 30772 + }, + { + "epoch": 1.723205286146265, + "grad_norm": 1.2439676523208618, + "learning_rate": 9.716868421052632e-05, + "loss": 0.4477, + "step": 30773 + }, + { + "epoch": 1.723261283458394, + "grad_norm": 1.2469125986099243, + "learning_rate": 9.716842105263159e-05, + "loss": 0.48, + "step": 30774 + }, + { + "epoch": 1.723317280770523, + "grad_norm": 1.226881980895996, + "learning_rate": 9.716815789473684e-05, + "loss": 0.4352, + "step": 30775 + }, + { + "epoch": 1.723373278082652, + "grad_norm": 1.4902311563491821, + "learning_rate": 9.716789473684211e-05, + "loss": 0.3364, + "step": 30776 + }, + { + "epoch": 1.723429275394781, + "grad_norm": 1.266294002532959, + "learning_rate": 9.716763157894737e-05, + "loss": 0.3488, + "step": 30777 + }, + { + "epoch": 1.72348527270691, + "grad_norm": 1.145354986190796, + "learning_rate": 9.716736842105264e-05, + "loss": 0.3891, + "step": 30778 + }, + { + "epoch": 1.7235412700190391, + "grad_norm": 1.250402569770813, + "learning_rate": 9.71671052631579e-05, + "loss": 0.5013, + "step": 30779 + }, + { + "epoch": 1.7235972673311681, + "grad_norm": 1.360511064529419, + "learning_rate": 9.716684210526316e-05, + "loss": 0.4564, + "step": 30780 + }, + { + "epoch": 1.7236532646432972, + "grad_norm": 1.220169186592102, + "learning_rate": 9.716657894736842e-05, + "loss": 0.4816, + "step": 30781 + }, + { + "epoch": 1.7237092619554262, + "grad_norm": 1.2172590494155884, + "learning_rate": 9.71663157894737e-05, + "loss": 0.4198, + "step": 30782 + }, + { + "epoch": 1.7237652592675552, + "grad_norm": 1.2992833852767944, + "learning_rate": 9.716605263157896e-05, + "loss": 0.5251, + "step": 30783 + }, + { + "epoch": 1.7238212565796842, + "grad_norm": 1.2512496709823608, + "learning_rate": 9.716578947368422e-05, + "loss": 0.4944, + "step": 30784 + }, + { + "epoch": 1.7238772538918132, + "grad_norm": 1.255361557006836, + "learning_rate": 9.716552631578948e-05, + "loss": 
0.4415, + "step": 30785 + }, + { + "epoch": 1.7239332512039423, + "grad_norm": 1.219032883644104, + "learning_rate": 9.716526315789474e-05, + "loss": 0.4437, + "step": 30786 + }, + { + "epoch": 1.7239892485160713, + "grad_norm": 1.262127161026001, + "learning_rate": 9.716500000000001e-05, + "loss": 0.4603, + "step": 30787 + }, + { + "epoch": 1.7240452458282003, + "grad_norm": 1.7772603034973145, + "learning_rate": 9.716473684210527e-05, + "loss": 0.5027, + "step": 30788 + }, + { + "epoch": 1.7241012431403293, + "grad_norm": 1.373513102531433, + "learning_rate": 9.716447368421053e-05, + "loss": 0.4535, + "step": 30789 + }, + { + "epoch": 1.7241572404524583, + "grad_norm": 1.1996124982833862, + "learning_rate": 9.716421052631579e-05, + "loss": 0.429, + "step": 30790 + }, + { + "epoch": 1.7242132377645873, + "grad_norm": 1.2211042642593384, + "learning_rate": 9.716394736842106e-05, + "loss": 0.4429, + "step": 30791 + }, + { + "epoch": 1.7242692350767164, + "grad_norm": 1.1607037782669067, + "learning_rate": 9.716368421052632e-05, + "loss": 0.3886, + "step": 30792 + }, + { + "epoch": 1.7243252323888454, + "grad_norm": 1.408096194267273, + "learning_rate": 9.716342105263158e-05, + "loss": 0.5755, + "step": 30793 + }, + { + "epoch": 1.7243812297009744, + "grad_norm": 1.4828472137451172, + "learning_rate": 9.716315789473684e-05, + "loss": 0.4418, + "step": 30794 + }, + { + "epoch": 1.7244372270131034, + "grad_norm": 1.319539189338684, + "learning_rate": 9.716289473684211e-05, + "loss": 0.4372, + "step": 30795 + }, + { + "epoch": 1.7244932243252324, + "grad_norm": 1.5043408870697021, + "learning_rate": 9.716263157894737e-05, + "loss": 0.4326, + "step": 30796 + }, + { + "epoch": 1.7245492216373615, + "grad_norm": 1.0972951650619507, + "learning_rate": 9.716236842105265e-05, + "loss": 0.5003, + "step": 30797 + }, + { + "epoch": 1.7246052189494905, + "grad_norm": 1.3848352432250977, + "learning_rate": 9.716210526315789e-05, + "loss": 0.4515, + "step": 30798 + }, + { + 
"epoch": 1.7246612162616195, + "grad_norm": 1.3718483448028564, + "learning_rate": 9.716184210526317e-05, + "loss": 0.7014, + "step": 30799 + }, + { + "epoch": 1.7247172135737485, + "grad_norm": 1.258340835571289, + "learning_rate": 9.716157894736843e-05, + "loss": 0.4507, + "step": 30800 + }, + { + "epoch": 1.7247732108858775, + "grad_norm": 1.5159603357315063, + "learning_rate": 9.716131578947369e-05, + "loss": 0.5084, + "step": 30801 + }, + { + "epoch": 1.7248292081980066, + "grad_norm": 1.6316993236541748, + "learning_rate": 9.716105263157896e-05, + "loss": 0.481, + "step": 30802 + }, + { + "epoch": 1.7248852055101356, + "grad_norm": 1.4929754734039307, + "learning_rate": 9.71607894736842e-05, + "loss": 0.4773, + "step": 30803 + }, + { + "epoch": 1.7249412028222646, + "grad_norm": 1.174686312675476, + "learning_rate": 9.716052631578948e-05, + "loss": 0.4631, + "step": 30804 + }, + { + "epoch": 1.7249972001343936, + "grad_norm": 1.2033119201660156, + "learning_rate": 9.716026315789474e-05, + "loss": 0.5065, + "step": 30805 + }, + { + "epoch": 1.7250531974465226, + "grad_norm": 1.6159331798553467, + "learning_rate": 9.716000000000001e-05, + "loss": 0.3934, + "step": 30806 + }, + { + "epoch": 1.7251091947586517, + "grad_norm": 1.3112629652023315, + "learning_rate": 9.715973684210526e-05, + "loss": 0.3701, + "step": 30807 + }, + { + "epoch": 1.7251651920707807, + "grad_norm": 1.2754032611846924, + "learning_rate": 9.715947368421053e-05, + "loss": 0.4287, + "step": 30808 + }, + { + "epoch": 1.7252211893829097, + "grad_norm": 2.0877411365509033, + "learning_rate": 9.715921052631579e-05, + "loss": 0.6157, + "step": 30809 + }, + { + "epoch": 1.7252771866950387, + "grad_norm": 1.2816237211227417, + "learning_rate": 9.715894736842106e-05, + "loss": 0.4074, + "step": 30810 + }, + { + "epoch": 1.7253331840071677, + "grad_norm": 1.1529109477996826, + "learning_rate": 9.715868421052632e-05, + "loss": 0.4474, + "step": 30811 + }, + { + "epoch": 1.7253891813192967, + 
"grad_norm": 1.130661129951477, + "learning_rate": 9.715842105263158e-05, + "loss": 0.5181, + "step": 30812 + }, + { + "epoch": 1.7254451786314258, + "grad_norm": 1.5895882844924927, + "learning_rate": 9.715815789473684e-05, + "loss": 0.4612, + "step": 30813 + }, + { + "epoch": 1.7255011759435548, + "grad_norm": 1.087876796722412, + "learning_rate": 9.715789473684212e-05, + "loss": 0.3793, + "step": 30814 + }, + { + "epoch": 1.7255571732556838, + "grad_norm": 2.8522260189056396, + "learning_rate": 9.715763157894738e-05, + "loss": 0.3504, + "step": 30815 + }, + { + "epoch": 1.7256131705678128, + "grad_norm": 2.2068569660186768, + "learning_rate": 9.715736842105264e-05, + "loss": 0.5478, + "step": 30816 + }, + { + "epoch": 1.7256691678799418, + "grad_norm": 1.260810136795044, + "learning_rate": 9.71571052631579e-05, + "loss": 0.4399, + "step": 30817 + }, + { + "epoch": 1.7257251651920709, + "grad_norm": 1.6022473573684692, + "learning_rate": 9.715684210526316e-05, + "loss": 0.3176, + "step": 30818 + }, + { + "epoch": 1.7257811625041999, + "grad_norm": 1.4457985162734985, + "learning_rate": 9.715657894736843e-05, + "loss": 0.4669, + "step": 30819 + }, + { + "epoch": 1.725837159816329, + "grad_norm": 1.2545794248580933, + "learning_rate": 9.715631578947369e-05, + "loss": 0.4905, + "step": 30820 + }, + { + "epoch": 1.725893157128458, + "grad_norm": 1.3766297101974487, + "learning_rate": 9.715605263157895e-05, + "loss": 0.432, + "step": 30821 + }, + { + "epoch": 1.725949154440587, + "grad_norm": 1.2831355333328247, + "learning_rate": 9.715578947368421e-05, + "loss": 0.419, + "step": 30822 + }, + { + "epoch": 1.726005151752716, + "grad_norm": 1.179022192955017, + "learning_rate": 9.715552631578948e-05, + "loss": 0.4321, + "step": 30823 + }, + { + "epoch": 1.726061149064845, + "grad_norm": 1.3134585618972778, + "learning_rate": 9.715526315789474e-05, + "loss": 0.5058, + "step": 30824 + }, + { + "epoch": 1.726117146376974, + "grad_norm": 1.1801894903182983, + 
"learning_rate": 9.7155e-05, + "loss": 0.5043, + "step": 30825 + }, + { + "epoch": 1.726173143689103, + "grad_norm": 1.549644947052002, + "learning_rate": 9.715473684210526e-05, + "loss": 0.4859, + "step": 30826 + }, + { + "epoch": 1.726229141001232, + "grad_norm": 1.0734951496124268, + "learning_rate": 9.715447368421053e-05, + "loss": 0.414, + "step": 30827 + }, + { + "epoch": 1.726285138313361, + "grad_norm": 1.5300109386444092, + "learning_rate": 9.71542105263158e-05, + "loss": 0.488, + "step": 30828 + }, + { + "epoch": 1.72634113562549, + "grad_norm": 3.6961076259613037, + "learning_rate": 9.715394736842107e-05, + "loss": 0.5406, + "step": 30829 + }, + { + "epoch": 1.726397132937619, + "grad_norm": 1.129594326019287, + "learning_rate": 9.715368421052631e-05, + "loss": 0.3016, + "step": 30830 + }, + { + "epoch": 1.7264531302497481, + "grad_norm": 1.2709170579910278, + "learning_rate": 9.715342105263159e-05, + "loss": 0.452, + "step": 30831 + }, + { + "epoch": 1.7265091275618771, + "grad_norm": 1.4917163848876953, + "learning_rate": 9.715315789473685e-05, + "loss": 0.4523, + "step": 30832 + }, + { + "epoch": 1.7265651248740062, + "grad_norm": 1.2385468482971191, + "learning_rate": 9.715289473684212e-05, + "loss": 0.3755, + "step": 30833 + }, + { + "epoch": 1.7266211221861352, + "grad_norm": 1.2770428657531738, + "learning_rate": 9.715263157894738e-05, + "loss": 0.4445, + "step": 30834 + }, + { + "epoch": 1.7266771194982642, + "grad_norm": 1.444815754890442, + "learning_rate": 9.715236842105263e-05, + "loss": 0.4496, + "step": 30835 + }, + { + "epoch": 1.7267331168103932, + "grad_norm": 1.4520589113235474, + "learning_rate": 9.71521052631579e-05, + "loss": 0.5309, + "step": 30836 + }, + { + "epoch": 1.7267891141225222, + "grad_norm": 1.0482993125915527, + "learning_rate": 9.715184210526316e-05, + "loss": 0.4375, + "step": 30837 + }, + { + "epoch": 1.7268451114346512, + "grad_norm": 1.4906712770462036, + "learning_rate": 9.715157894736843e-05, + "loss": 0.5349, + 
"step": 30838 + }, + { + "epoch": 1.7269011087467803, + "grad_norm": 2.543030261993408, + "learning_rate": 9.715131578947369e-05, + "loss": 0.6079, + "step": 30839 + }, + { + "epoch": 1.7269571060589093, + "grad_norm": 1.0798964500427246, + "learning_rate": 9.715105263157895e-05, + "loss": 0.4067, + "step": 30840 + }, + { + "epoch": 1.7270131033710383, + "grad_norm": 1.2734647989273071, + "learning_rate": 9.715078947368421e-05, + "loss": 0.4286, + "step": 30841 + }, + { + "epoch": 1.7270691006831673, + "grad_norm": 1.318042516708374, + "learning_rate": 9.715052631578948e-05, + "loss": 0.5077, + "step": 30842 + }, + { + "epoch": 1.7271250979952963, + "grad_norm": 1.216072678565979, + "learning_rate": 9.715026315789474e-05, + "loss": 0.5351, + "step": 30843 + }, + { + "epoch": 1.7271810953074254, + "grad_norm": 1.4882640838623047, + "learning_rate": 9.715e-05, + "loss": 0.475, + "step": 30844 + }, + { + "epoch": 1.7272370926195544, + "grad_norm": 1.2969797849655151, + "learning_rate": 9.714973684210526e-05, + "loss": 0.422, + "step": 30845 + }, + { + "epoch": 1.7272930899316834, + "grad_norm": 1.624219536781311, + "learning_rate": 9.714947368421054e-05, + "loss": 0.464, + "step": 30846 + }, + { + "epoch": 1.7273490872438124, + "grad_norm": 1.4371284246444702, + "learning_rate": 9.71492105263158e-05, + "loss": 0.4519, + "step": 30847 + }, + { + "epoch": 1.7274050845559414, + "grad_norm": 1.3296457529067993, + "learning_rate": 9.714894736842106e-05, + "loss": 0.4431, + "step": 30848 + }, + { + "epoch": 1.7274610818680705, + "grad_norm": 1.1049184799194336, + "learning_rate": 9.714868421052632e-05, + "loss": 0.376, + "step": 30849 + }, + { + "epoch": 1.7275170791801995, + "grad_norm": 1.1859382390975952, + "learning_rate": 9.714842105263159e-05, + "loss": 0.4267, + "step": 30850 + }, + { + "epoch": 1.7275730764923285, + "grad_norm": 1.1705151796340942, + "learning_rate": 9.714815789473685e-05, + "loss": 0.3619, + "step": 30851 + }, + { + "epoch": 1.7276290738044575, + 
"grad_norm": 1.3142000436782837, + "learning_rate": 9.714789473684211e-05, + "loss": 0.4021, + "step": 30852 + }, + { + "epoch": 1.7276850711165865, + "grad_norm": 1.1487438678741455, + "learning_rate": 9.714763157894737e-05, + "loss": 0.4081, + "step": 30853 + }, + { + "epoch": 1.7277410684287156, + "grad_norm": 1.276232361793518, + "learning_rate": 9.714736842105263e-05, + "loss": 0.4626, + "step": 30854 + }, + { + "epoch": 1.7277970657408446, + "grad_norm": 1.2493582963943481, + "learning_rate": 9.71471052631579e-05, + "loss": 0.4618, + "step": 30855 + }, + { + "epoch": 1.7278530630529736, + "grad_norm": 1.2609328031539917, + "learning_rate": 9.714684210526316e-05, + "loss": 0.3786, + "step": 30856 + }, + { + "epoch": 1.7279090603651026, + "grad_norm": 1.2448943853378296, + "learning_rate": 9.714657894736843e-05, + "loss": 0.5076, + "step": 30857 + }, + { + "epoch": 1.7279650576772316, + "grad_norm": 1.2144376039505005, + "learning_rate": 9.714631578947368e-05, + "loss": 0.5064, + "step": 30858 + }, + { + "epoch": 1.7280210549893606, + "grad_norm": 1.4060462713241577, + "learning_rate": 9.714605263157895e-05, + "loss": 0.5556, + "step": 30859 + }, + { + "epoch": 1.7280770523014897, + "grad_norm": 1.2787256240844727, + "learning_rate": 9.714578947368421e-05, + "loss": 0.3218, + "step": 30860 + }, + { + "epoch": 1.7281330496136187, + "grad_norm": 1.5079458951950073, + "learning_rate": 9.714552631578949e-05, + "loss": 0.6093, + "step": 30861 + }, + { + "epoch": 1.7281890469257477, + "grad_norm": 1.6231735944747925, + "learning_rate": 9.714526315789473e-05, + "loss": 0.5613, + "step": 30862 + }, + { + "epoch": 1.7282450442378767, + "grad_norm": 1.4514434337615967, + "learning_rate": 9.7145e-05, + "loss": 0.4394, + "step": 30863 + }, + { + "epoch": 1.7283010415500057, + "grad_norm": 1.2759157419204712, + "learning_rate": 9.714473684210527e-05, + "loss": 0.4208, + "step": 30864 + }, + { + "epoch": 1.7283570388621348, + "grad_norm": 1.2782208919525146, + 
"learning_rate": 9.714447368421054e-05, + "loss": 0.5318, + "step": 30865 + }, + { + "epoch": 1.7284130361742638, + "grad_norm": 2.5487778186798096, + "learning_rate": 9.71442105263158e-05, + "loss": 0.4648, + "step": 30866 + }, + { + "epoch": 1.7284690334863928, + "grad_norm": 1.0467078685760498, + "learning_rate": 9.714394736842106e-05, + "loss": 0.3735, + "step": 30867 + }, + { + "epoch": 1.7285250307985218, + "grad_norm": 1.2292864322662354, + "learning_rate": 9.714368421052632e-05, + "loss": 0.3864, + "step": 30868 + }, + { + "epoch": 1.7285810281106508, + "grad_norm": 1.3292547464370728, + "learning_rate": 9.714342105263159e-05, + "loss": 0.6821, + "step": 30869 + }, + { + "epoch": 1.7286370254227799, + "grad_norm": 1.1492819786071777, + "learning_rate": 9.714315789473685e-05, + "loss": 0.4084, + "step": 30870 + }, + { + "epoch": 1.7286930227349089, + "grad_norm": 2.3479971885681152, + "learning_rate": 9.714289473684211e-05, + "loss": 0.4776, + "step": 30871 + }, + { + "epoch": 1.728749020047038, + "grad_norm": 1.6894288063049316, + "learning_rate": 9.714263157894737e-05, + "loss": 0.5442, + "step": 30872 + }, + { + "epoch": 1.728805017359167, + "grad_norm": 1.538828730583191, + "learning_rate": 9.714236842105263e-05, + "loss": 0.5265, + "step": 30873 + }, + { + "epoch": 1.728861014671296, + "grad_norm": 1.340962290763855, + "learning_rate": 9.71421052631579e-05, + "loss": 0.5284, + "step": 30874 + }, + { + "epoch": 1.728917011983425, + "grad_norm": 1.1624521017074585, + "learning_rate": 9.714184210526316e-05, + "loss": 0.2955, + "step": 30875 + }, + { + "epoch": 1.728973009295554, + "grad_norm": 1.3382991552352905, + "learning_rate": 9.714157894736842e-05, + "loss": 0.4681, + "step": 30876 + }, + { + "epoch": 1.729029006607683, + "grad_norm": 1.409085988998413, + "learning_rate": 9.714131578947368e-05, + "loss": 0.4515, + "step": 30877 + }, + { + "epoch": 1.729085003919812, + "grad_norm": 1.0673515796661377, + "learning_rate": 9.714105263157896e-05, + 
"loss": 0.5178, + "step": 30878 + }, + { + "epoch": 1.729141001231941, + "grad_norm": 1.5937429666519165, + "learning_rate": 9.714078947368422e-05, + "loss": 0.4162, + "step": 30879 + }, + { + "epoch": 1.7291969985440698, + "grad_norm": 1.2507046461105347, + "learning_rate": 9.714052631578948e-05, + "loss": 0.392, + "step": 30880 + }, + { + "epoch": 1.7292529958561988, + "grad_norm": 1.7141634225845337, + "learning_rate": 9.714026315789474e-05, + "loss": 0.3829, + "step": 30881 + }, + { + "epoch": 1.7293089931683279, + "grad_norm": 1.924500584602356, + "learning_rate": 9.714000000000001e-05, + "loss": 0.5439, + "step": 30882 + }, + { + "epoch": 1.7293649904804569, + "grad_norm": 1.5594667196273804, + "learning_rate": 9.713973684210527e-05, + "loss": 0.5527, + "step": 30883 + }, + { + "epoch": 1.729420987792586, + "grad_norm": 1.3248144388198853, + "learning_rate": 9.713947368421054e-05, + "loss": 0.4547, + "step": 30884 + }, + { + "epoch": 1.729476985104715, + "grad_norm": 1.5802890062332153, + "learning_rate": 9.713921052631579e-05, + "loss": 0.6615, + "step": 30885 + }, + { + "epoch": 1.729532982416844, + "grad_norm": 1.5077229738235474, + "learning_rate": 9.713894736842106e-05, + "loss": 0.4718, + "step": 30886 + }, + { + "epoch": 1.729588979728973, + "grad_norm": 1.4542040824890137, + "learning_rate": 9.713868421052632e-05, + "loss": 0.6412, + "step": 30887 + }, + { + "epoch": 1.729644977041102, + "grad_norm": 1.3625273704528809, + "learning_rate": 9.713842105263158e-05, + "loss": 0.369, + "step": 30888 + }, + { + "epoch": 1.729700974353231, + "grad_norm": 1.2317105531692505, + "learning_rate": 9.713815789473685e-05, + "loss": 0.326, + "step": 30889 + }, + { + "epoch": 1.72975697166536, + "grad_norm": 1.2325578927993774, + "learning_rate": 9.71378947368421e-05, + "loss": 0.4449, + "step": 30890 + }, + { + "epoch": 1.729812968977489, + "grad_norm": 1.545949935913086, + "learning_rate": 9.713763157894737e-05, + "loss": 0.4058, + "step": 30891 + }, + { + "epoch": 
1.729868966289618, + "grad_norm": 1.4828636646270752, + "learning_rate": 9.713736842105263e-05, + "loss": 0.5084, + "step": 30892 + }, + { + "epoch": 1.729924963601747, + "grad_norm": 1.7670361995697021, + "learning_rate": 9.71371052631579e-05, + "loss": 0.5689, + "step": 30893 + }, + { + "epoch": 1.729980960913876, + "grad_norm": 1.1305797100067139, + "learning_rate": 9.713684210526317e-05, + "loss": 0.4592, + "step": 30894 + }, + { + "epoch": 1.7300369582260051, + "grad_norm": 1.1148333549499512, + "learning_rate": 9.713657894736843e-05, + "loss": 0.5318, + "step": 30895 + }, + { + "epoch": 1.7300929555381341, + "grad_norm": 1.5832405090332031, + "learning_rate": 9.713631578947369e-05, + "loss": 0.4372, + "step": 30896 + }, + { + "epoch": 1.7301489528502632, + "grad_norm": 1.43873929977417, + "learning_rate": 9.713605263157896e-05, + "loss": 0.47, + "step": 30897 + }, + { + "epoch": 1.7302049501623922, + "grad_norm": 1.550362229347229, + "learning_rate": 9.713578947368422e-05, + "loss": 0.5809, + "step": 30898 + }, + { + "epoch": 1.7302609474745212, + "grad_norm": 1.765970230102539, + "learning_rate": 9.713552631578948e-05, + "loss": 0.4819, + "step": 30899 + }, + { + "epoch": 1.7303169447866502, + "grad_norm": 1.1776527166366577, + "learning_rate": 9.713526315789474e-05, + "loss": 0.4204, + "step": 30900 + }, + { + "epoch": 1.7303729420987792, + "grad_norm": 1.1927144527435303, + "learning_rate": 9.713500000000001e-05, + "loss": 0.4343, + "step": 30901 + }, + { + "epoch": 1.7304289394109083, + "grad_norm": 1.1371490955352783, + "learning_rate": 9.713473684210527e-05, + "loss": 0.5609, + "step": 30902 + }, + { + "epoch": 1.7304849367230373, + "grad_norm": 1.316361665725708, + "learning_rate": 9.713447368421053e-05, + "loss": 0.5007, + "step": 30903 + }, + { + "epoch": 1.7305409340351663, + "grad_norm": 1.2518609762191772, + "learning_rate": 9.713421052631579e-05, + "loss": 0.3775, + "step": 30904 + }, + { + "epoch": 1.7305969313472953, + "grad_norm": 
1.2294044494628906, + "learning_rate": 9.713394736842105e-05, + "loss": 0.4788, + "step": 30905 + }, + { + "epoch": 1.7306529286594243, + "grad_norm": 1.3479539155960083, + "learning_rate": 9.713368421052632e-05, + "loss": 0.4313, + "step": 30906 + }, + { + "epoch": 1.7307089259715533, + "grad_norm": 1.2183607816696167, + "learning_rate": 9.713342105263158e-05, + "loss": 0.3896, + "step": 30907 + }, + { + "epoch": 1.7307649232836824, + "grad_norm": 17.11246109008789, + "learning_rate": 9.713315789473684e-05, + "loss": 0.636, + "step": 30908 + }, + { + "epoch": 1.7308209205958114, + "grad_norm": 1.1182912588119507, + "learning_rate": 9.71328947368421e-05, + "loss": 0.3731, + "step": 30909 + }, + { + "epoch": 1.7308769179079404, + "grad_norm": 1.2550162076950073, + "learning_rate": 9.713263157894738e-05, + "loss": 0.4245, + "step": 30910 + }, + { + "epoch": 1.7309329152200694, + "grad_norm": 1.3438735008239746, + "learning_rate": 9.713236842105264e-05, + "loss": 0.4981, + "step": 30911 + }, + { + "epoch": 1.7309889125321984, + "grad_norm": 1.236170768737793, + "learning_rate": 9.713210526315791e-05, + "loss": 0.4917, + "step": 30912 + }, + { + "epoch": 1.7310449098443275, + "grad_norm": 1.380021333694458, + "learning_rate": 9.713184210526315e-05, + "loss": 0.5406, + "step": 30913 + }, + { + "epoch": 1.7311009071564565, + "grad_norm": 1.2908653020858765, + "learning_rate": 9.713157894736843e-05, + "loss": 0.4543, + "step": 30914 + }, + { + "epoch": 1.7311569044685855, + "grad_norm": 2.054720640182495, + "learning_rate": 9.713131578947369e-05, + "loss": 0.4492, + "step": 30915 + }, + { + "epoch": 1.7312129017807145, + "grad_norm": 1.1338237524032593, + "learning_rate": 9.713105263157896e-05, + "loss": 0.3648, + "step": 30916 + }, + { + "epoch": 1.7312688990928435, + "grad_norm": 1.230956792831421, + "learning_rate": 9.713078947368421e-05, + "loss": 0.3829, + "step": 30917 + }, + { + "epoch": 1.7313248964049726, + "grad_norm": 1.3563530445098877, + "learning_rate": 
9.713052631578948e-05, + "loss": 0.4956, + "step": 30918 + }, + { + "epoch": 1.7313808937171016, + "grad_norm": 1.4030046463012695, + "learning_rate": 9.713026315789474e-05, + "loss": 0.4191, + "step": 30919 + }, + { + "epoch": 1.7314368910292306, + "grad_norm": 1.2252717018127441, + "learning_rate": 9.713000000000001e-05, + "loss": 0.4479, + "step": 30920 + }, + { + "epoch": 1.7314928883413596, + "grad_norm": 1.3108750581741333, + "learning_rate": 9.712973684210527e-05, + "loss": 0.5127, + "step": 30921 + }, + { + "epoch": 1.7315488856534886, + "grad_norm": 1.378910779953003, + "learning_rate": 9.712947368421052e-05, + "loss": 0.4461, + "step": 30922 + }, + { + "epoch": 1.7316048829656177, + "grad_norm": 1.148742437362671, + "learning_rate": 9.712921052631579e-05, + "loss": 0.3799, + "step": 30923 + }, + { + "epoch": 1.7316608802777467, + "grad_norm": 1.2744581699371338, + "learning_rate": 9.712894736842105e-05, + "loss": 0.5466, + "step": 30924 + }, + { + "epoch": 1.7317168775898757, + "grad_norm": 1.3952937126159668, + "learning_rate": 9.712868421052633e-05, + "loss": 0.4239, + "step": 30925 + }, + { + "epoch": 1.7317728749020047, + "grad_norm": 1.8824533224105835, + "learning_rate": 9.712842105263159e-05, + "loss": 0.4215, + "step": 30926 + }, + { + "epoch": 1.7318288722141337, + "grad_norm": 1.224982500076294, + "learning_rate": 9.712815789473685e-05, + "loss": 0.5098, + "step": 30927 + }, + { + "epoch": 1.7318848695262627, + "grad_norm": 1.2281935214996338, + "learning_rate": 9.71278947368421e-05, + "loss": 0.5545, + "step": 30928 + }, + { + "epoch": 1.7319408668383918, + "grad_norm": 1.7226656675338745, + "learning_rate": 9.712763157894738e-05, + "loss": 0.4299, + "step": 30929 + }, + { + "epoch": 1.7319968641505208, + "grad_norm": 1.298384189605713, + "learning_rate": 9.712736842105264e-05, + "loss": 0.4545, + "step": 30930 + }, + { + "epoch": 1.7320528614626498, + "grad_norm": 1.3409208059310913, + "learning_rate": 9.71271052631579e-05, + "loss": 0.4533, + 
"step": 30931 + }, + { + "epoch": 1.7321088587747788, + "grad_norm": 1.1323697566986084, + "learning_rate": 9.712684210526316e-05, + "loss": 0.4413, + "step": 30932 + }, + { + "epoch": 1.7321648560869078, + "grad_norm": 1.2897166013717651, + "learning_rate": 9.712657894736843e-05, + "loss": 0.3455, + "step": 30933 + }, + { + "epoch": 1.7322208533990369, + "grad_norm": 1.7649589776992798, + "learning_rate": 9.712631578947369e-05, + "loss": 0.4775, + "step": 30934 + }, + { + "epoch": 1.7322768507111659, + "grad_norm": 1.309889316558838, + "learning_rate": 9.712605263157895e-05, + "loss": 0.5041, + "step": 30935 + }, + { + "epoch": 1.732332848023295, + "grad_norm": 1.0974708795547485, + "learning_rate": 9.712578947368421e-05, + "loss": 0.4333, + "step": 30936 + }, + { + "epoch": 1.732388845335424, + "grad_norm": 1.4677408933639526, + "learning_rate": 9.712552631578948e-05, + "loss": 0.5098, + "step": 30937 + }, + { + "epoch": 1.732444842647553, + "grad_norm": 1.055536150932312, + "learning_rate": 9.712526315789474e-05, + "loss": 0.3425, + "step": 30938 + }, + { + "epoch": 1.732500839959682, + "grad_norm": 1.0858639478683472, + "learning_rate": 9.7125e-05, + "loss": 0.4275, + "step": 30939 + }, + { + "epoch": 1.732556837271811, + "grad_norm": 1.2390285730361938, + "learning_rate": 9.712473684210526e-05, + "loss": 0.5106, + "step": 30940 + }, + { + "epoch": 1.73261283458394, + "grad_norm": 1.2363107204437256, + "learning_rate": 9.712447368421052e-05, + "loss": 0.3923, + "step": 30941 + }, + { + "epoch": 1.732668831896069, + "grad_norm": 1.291670322418213, + "learning_rate": 9.71242105263158e-05, + "loss": 0.4968, + "step": 30942 + }, + { + "epoch": 1.732724829208198, + "grad_norm": 1.3040119409561157, + "learning_rate": 9.712394736842106e-05, + "loss": 0.53, + "step": 30943 + }, + { + "epoch": 1.732780826520327, + "grad_norm": 1.4760606288909912, + "learning_rate": 9.712368421052633e-05, + "loss": 0.6047, + "step": 30944 + }, + { + "epoch": 1.732836823832456, + 
"grad_norm": 1.2323769330978394, + "learning_rate": 9.712342105263157e-05, + "loss": 0.401, + "step": 30945 + }, + { + "epoch": 1.732892821144585, + "grad_norm": 1.1582738161087036, + "learning_rate": 9.712315789473685e-05, + "loss": 0.4747, + "step": 30946 + }, + { + "epoch": 1.7329488184567141, + "grad_norm": 1.2419651746749878, + "learning_rate": 9.712289473684211e-05, + "loss": 0.5276, + "step": 30947 + }, + { + "epoch": 1.7330048157688431, + "grad_norm": 1.2945401668548584, + "learning_rate": 9.712263157894738e-05, + "loss": 0.4313, + "step": 30948 + }, + { + "epoch": 1.7330608130809722, + "grad_norm": 1.580261468887329, + "learning_rate": 9.712236842105264e-05, + "loss": 0.4265, + "step": 30949 + }, + { + "epoch": 1.7331168103931012, + "grad_norm": 1.2586312294006348, + "learning_rate": 9.71221052631579e-05, + "loss": 0.4663, + "step": 30950 + }, + { + "epoch": 1.7331728077052302, + "grad_norm": 2.0549309253692627, + "learning_rate": 9.712184210526316e-05, + "loss": 0.4111, + "step": 30951 + }, + { + "epoch": 1.7332288050173592, + "grad_norm": 1.2690858840942383, + "learning_rate": 9.712157894736843e-05, + "loss": 0.4778, + "step": 30952 + }, + { + "epoch": 1.7332848023294882, + "grad_norm": 1.483779788017273, + "learning_rate": 9.712131578947369e-05, + "loss": 0.5175, + "step": 30953 + }, + { + "epoch": 1.7333407996416172, + "grad_norm": 1.263841152191162, + "learning_rate": 9.712105263157895e-05, + "loss": 0.4895, + "step": 30954 + }, + { + "epoch": 1.7333967969537463, + "grad_norm": 1.1458534002304077, + "learning_rate": 9.712078947368421e-05, + "loss": 0.4272, + "step": 30955 + }, + { + "epoch": 1.7334527942658753, + "grad_norm": 1.2037091255187988, + "learning_rate": 9.712052631578947e-05, + "loss": 0.3797, + "step": 30956 + }, + { + "epoch": 1.7335087915780043, + "grad_norm": 1.631466269493103, + "learning_rate": 9.712026315789475e-05, + "loss": 0.6362, + "step": 30957 + }, + { + "epoch": 1.7335647888901333, + "grad_norm": 1.3013267517089844, + 
"learning_rate": 9.712e-05, + "loss": 0.5013, + "step": 30958 + }, + { + "epoch": 1.7336207862022623, + "grad_norm": 1.4832836389541626, + "learning_rate": 9.711973684210526e-05, + "loss": 0.5935, + "step": 30959 + }, + { + "epoch": 1.7336767835143914, + "grad_norm": 1.1693936586380005, + "learning_rate": 9.711947368421052e-05, + "loss": 0.3655, + "step": 30960 + }, + { + "epoch": 1.7337327808265204, + "grad_norm": 1.3943352699279785, + "learning_rate": 9.71192105263158e-05, + "loss": 0.5045, + "step": 30961 + }, + { + "epoch": 1.7337887781386494, + "grad_norm": 1.4526394605636597, + "learning_rate": 9.711894736842106e-05, + "loss": 0.4856, + "step": 30962 + }, + { + "epoch": 1.7338447754507782, + "grad_norm": 1.68649423122406, + "learning_rate": 9.711868421052632e-05, + "loss": 0.4439, + "step": 30963 + }, + { + "epoch": 1.7339007727629072, + "grad_norm": 1.2721911668777466, + "learning_rate": 9.711842105263158e-05, + "loss": 0.5415, + "step": 30964 + }, + { + "epoch": 1.7339567700750362, + "grad_norm": 1.460144281387329, + "learning_rate": 9.711815789473685e-05, + "loss": 0.552, + "step": 30965 + }, + { + "epoch": 1.7340127673871653, + "grad_norm": 1.6503968238830566, + "learning_rate": 9.711789473684211e-05, + "loss": 0.6655, + "step": 30966 + }, + { + "epoch": 1.7340687646992943, + "grad_norm": 1.4066368341445923, + "learning_rate": 9.711763157894738e-05, + "loss": 0.5155, + "step": 30967 + }, + { + "epoch": 1.7341247620114233, + "grad_norm": 1.1384308338165283, + "learning_rate": 9.711736842105263e-05, + "loss": 0.4905, + "step": 30968 + }, + { + "epoch": 1.7341807593235523, + "grad_norm": 1.749942660331726, + "learning_rate": 9.71171052631579e-05, + "loss": 0.5055, + "step": 30969 + }, + { + "epoch": 1.7342367566356813, + "grad_norm": 1.4464553594589233, + "learning_rate": 9.711684210526316e-05, + "loss": 0.452, + "step": 30970 + }, + { + "epoch": 1.7342927539478104, + "grad_norm": 1.3154737949371338, + "learning_rate": 9.711657894736844e-05, + "loss": 
0.4312, + "step": 30971 + }, + { + "epoch": 1.7343487512599394, + "grad_norm": 1.425906777381897, + "learning_rate": 9.711631578947368e-05, + "loss": 0.615, + "step": 30972 + }, + { + "epoch": 1.7344047485720684, + "grad_norm": 1.1637827157974243, + "learning_rate": 9.711605263157894e-05, + "loss": 0.4332, + "step": 30973 + }, + { + "epoch": 1.7344607458841974, + "grad_norm": 1.2237335443496704, + "learning_rate": 9.711578947368422e-05, + "loss": 0.3171, + "step": 30974 + }, + { + "epoch": 1.7345167431963264, + "grad_norm": 1.2054741382598877, + "learning_rate": 9.711552631578947e-05, + "loss": 0.4881, + "step": 30975 + }, + { + "epoch": 1.7345727405084554, + "grad_norm": 1.2754313945770264, + "learning_rate": 9.711526315789475e-05, + "loss": 0.4256, + "step": 30976 + }, + { + "epoch": 1.7346287378205845, + "grad_norm": 1.0154242515563965, + "learning_rate": 9.7115e-05, + "loss": 0.3056, + "step": 30977 + }, + { + "epoch": 1.7346847351327135, + "grad_norm": 1.235609531402588, + "learning_rate": 9.711473684210527e-05, + "loss": 0.459, + "step": 30978 + }, + { + "epoch": 1.7347407324448425, + "grad_norm": 1.4229402542114258, + "learning_rate": 9.711447368421053e-05, + "loss": 0.4944, + "step": 30979 + }, + { + "epoch": 1.7347967297569715, + "grad_norm": 1.2148220539093018, + "learning_rate": 9.71142105263158e-05, + "loss": 0.5224, + "step": 30980 + }, + { + "epoch": 1.7348527270691005, + "grad_norm": 1.2697978019714355, + "learning_rate": 9.711394736842106e-05, + "loss": 0.3654, + "step": 30981 + }, + { + "epoch": 1.7349087243812296, + "grad_norm": 1.1354392766952515, + "learning_rate": 9.711368421052632e-05, + "loss": 0.4228, + "step": 30982 + }, + { + "epoch": 1.7349647216933586, + "grad_norm": 1.4376211166381836, + "learning_rate": 9.711342105263158e-05, + "loss": 0.4614, + "step": 30983 + }, + { + "epoch": 1.7350207190054876, + "grad_norm": 1.1127361059188843, + "learning_rate": 9.711315789473685e-05, + "loss": 0.3225, + "step": 30984 + }, + { + "epoch": 
1.7350767163176166, + "grad_norm": 1.2308012247085571, + "learning_rate": 9.711289473684211e-05, + "loss": 0.4746, + "step": 30985 + }, + { + "epoch": 1.7351327136297456, + "grad_norm": 1.4758014678955078, + "learning_rate": 9.711263157894737e-05, + "loss": 0.4164, + "step": 30986 + }, + { + "epoch": 1.7351887109418747, + "grad_norm": 6.427015781402588, + "learning_rate": 9.711236842105263e-05, + "loss": 0.5271, + "step": 30987 + }, + { + "epoch": 1.7352447082540037, + "grad_norm": 1.6755577325820923, + "learning_rate": 9.71121052631579e-05, + "loss": 0.5436, + "step": 30988 + }, + { + "epoch": 1.7353007055661327, + "grad_norm": 1.2384088039398193, + "learning_rate": 9.711184210526317e-05, + "loss": 0.3598, + "step": 30989 + }, + { + "epoch": 1.7353567028782617, + "grad_norm": 1.1982392072677612, + "learning_rate": 9.711157894736842e-05, + "loss": 0.3884, + "step": 30990 + }, + { + "epoch": 1.7354127001903907, + "grad_norm": 1.1121225357055664, + "learning_rate": 9.711131578947368e-05, + "loss": 0.4059, + "step": 30991 + }, + { + "epoch": 1.7354686975025198, + "grad_norm": 1.5421463251113892, + "learning_rate": 9.711105263157894e-05, + "loss": 0.395, + "step": 30992 + }, + { + "epoch": 1.7355246948146488, + "grad_norm": 1.4363654851913452, + "learning_rate": 9.711078947368422e-05, + "loss": 0.5026, + "step": 30993 + }, + { + "epoch": 1.7355806921267778, + "grad_norm": 1.8139078617095947, + "learning_rate": 9.711052631578948e-05, + "loss": 0.6181, + "step": 30994 + }, + { + "epoch": 1.7356366894389068, + "grad_norm": 1.1919149160385132, + "learning_rate": 9.711026315789474e-05, + "loss": 0.4262, + "step": 30995 + }, + { + "epoch": 1.7356926867510358, + "grad_norm": 1.5225441455841064, + "learning_rate": 9.711e-05, + "loss": 0.5237, + "step": 30996 + }, + { + "epoch": 1.7357486840631648, + "grad_norm": 1.6076759099960327, + "learning_rate": 9.710973684210527e-05, + "loss": 0.4948, + "step": 30997 + }, + { + "epoch": 1.7358046813752939, + "grad_norm": 
1.1972556114196777, + "learning_rate": 9.710947368421053e-05, + "loss": 0.3819, + "step": 30998 + }, + { + "epoch": 1.7358606786874229, + "grad_norm": 1.541182041168213, + "learning_rate": 9.71092105263158e-05, + "loss": 0.5491, + "step": 30999 + }, + { + "epoch": 1.735916675999552, + "grad_norm": 1.5775507688522339, + "learning_rate": 9.710894736842105e-05, + "loss": 0.5367, + "step": 31000 + }, + { + "epoch": 1.735972673311681, + "grad_norm": 1.3196219205856323, + "learning_rate": 9.710868421052632e-05, + "loss": 0.508, + "step": 31001 + }, + { + "epoch": 1.73602867062381, + "grad_norm": 1.4850828647613525, + "learning_rate": 9.710842105263158e-05, + "loss": 0.5157, + "step": 31002 + }, + { + "epoch": 1.736084667935939, + "grad_norm": 1.3666611909866333, + "learning_rate": 9.710815789473686e-05, + "loss": 0.5597, + "step": 31003 + }, + { + "epoch": 1.736140665248068, + "grad_norm": 1.231253981590271, + "learning_rate": 9.710789473684212e-05, + "loss": 0.4525, + "step": 31004 + }, + { + "epoch": 1.736196662560197, + "grad_norm": 1.0799951553344727, + "learning_rate": 9.710763157894738e-05, + "loss": 0.387, + "step": 31005 + }, + { + "epoch": 1.736252659872326, + "grad_norm": 1.2046945095062256, + "learning_rate": 9.710736842105263e-05, + "loss": 0.4121, + "step": 31006 + }, + { + "epoch": 1.736308657184455, + "grad_norm": 1.1753010749816895, + "learning_rate": 9.710710526315791e-05, + "loss": 0.3987, + "step": 31007 + }, + { + "epoch": 1.736364654496584, + "grad_norm": 1.2389849424362183, + "learning_rate": 9.710684210526317e-05, + "loss": 0.5673, + "step": 31008 + }, + { + "epoch": 1.736420651808713, + "grad_norm": 1.3578776121139526, + "learning_rate": 9.710657894736841e-05, + "loss": 0.4629, + "step": 31009 + }, + { + "epoch": 1.736476649120842, + "grad_norm": 1.1498032808303833, + "learning_rate": 9.710631578947369e-05, + "loss": 0.5048, + "step": 31010 + }, + { + "epoch": 1.7365326464329711, + "grad_norm": 1.311370611190796, + "learning_rate": 
9.710605263157895e-05, + "loss": 0.4367, + "step": 31011 + }, + { + "epoch": 1.7365886437451001, + "grad_norm": 1.094753623008728, + "learning_rate": 9.710578947368422e-05, + "loss": 0.4911, + "step": 31012 + }, + { + "epoch": 1.7366446410572292, + "grad_norm": 1.3457807302474976, + "learning_rate": 9.710552631578948e-05, + "loss": 0.3923, + "step": 31013 + }, + { + "epoch": 1.7367006383693582, + "grad_norm": 1.321081280708313, + "learning_rate": 9.710526315789474e-05, + "loss": 0.402, + "step": 31014 + }, + { + "epoch": 1.7367566356814872, + "grad_norm": 1.2904243469238281, + "learning_rate": 9.7105e-05, + "loss": 0.4171, + "step": 31015 + }, + { + "epoch": 1.7368126329936162, + "grad_norm": 1.4380868673324585, + "learning_rate": 9.710473684210527e-05, + "loss": 0.3509, + "step": 31016 + }, + { + "epoch": 1.7368686303057452, + "grad_norm": 1.2378318309783936, + "learning_rate": 9.710447368421053e-05, + "loss": 0.4268, + "step": 31017 + }, + { + "epoch": 1.7369246276178743, + "grad_norm": 1.30984628200531, + "learning_rate": 9.710421052631579e-05, + "loss": 0.4519, + "step": 31018 + }, + { + "epoch": 1.7369806249300033, + "grad_norm": 1.7147924900054932, + "learning_rate": 9.710394736842105e-05, + "loss": 0.536, + "step": 31019 + }, + { + "epoch": 1.7370366222421323, + "grad_norm": 1.2327755689620972, + "learning_rate": 9.710368421052633e-05, + "loss": 0.4454, + "step": 31020 + }, + { + "epoch": 1.7370926195542613, + "grad_norm": 1.0665234327316284, + "learning_rate": 9.710342105263158e-05, + "loss": 0.3624, + "step": 31021 + }, + { + "epoch": 1.7371486168663903, + "grad_norm": 1.1231725215911865, + "learning_rate": 9.710315789473684e-05, + "loss": 0.398, + "step": 31022 + }, + { + "epoch": 1.7372046141785193, + "grad_norm": 1.2381103038787842, + "learning_rate": 9.71028947368421e-05, + "loss": 0.445, + "step": 31023 + }, + { + "epoch": 1.7372606114906484, + "grad_norm": 1.2365427017211914, + "learning_rate": 9.710263157894738e-05, + "loss": 0.4542, + "step": 31024 
+ }, + { + "epoch": 1.7373166088027774, + "grad_norm": 1.264270305633545, + "learning_rate": 9.710236842105264e-05, + "loss": 0.5108, + "step": 31025 + }, + { + "epoch": 1.7373726061149064, + "grad_norm": 1.1615774631500244, + "learning_rate": 9.71021052631579e-05, + "loss": 0.4658, + "step": 31026 + }, + { + "epoch": 1.7374286034270354, + "grad_norm": 1.5205755233764648, + "learning_rate": 9.710184210526316e-05, + "loss": 0.7027, + "step": 31027 + }, + { + "epoch": 1.7374846007391644, + "grad_norm": 1.4298266172409058, + "learning_rate": 9.710157894736842e-05, + "loss": 0.4847, + "step": 31028 + }, + { + "epoch": 1.7375405980512935, + "grad_norm": 1.3484586477279663, + "learning_rate": 9.710131578947369e-05, + "loss": 0.5279, + "step": 31029 + }, + { + "epoch": 1.7375965953634225, + "grad_norm": 1.2147866487503052, + "learning_rate": 9.710105263157895e-05, + "loss": 0.4323, + "step": 31030 + }, + { + "epoch": 1.7376525926755515, + "grad_norm": 1.224869966506958, + "learning_rate": 9.710078947368422e-05, + "loss": 0.5024, + "step": 31031 + }, + { + "epoch": 1.7377085899876805, + "grad_norm": 1.0645188093185425, + "learning_rate": 9.710052631578947e-05, + "loss": 0.5958, + "step": 31032 + }, + { + "epoch": 1.7377645872998095, + "grad_norm": 1.4227310419082642, + "learning_rate": 9.710026315789474e-05, + "loss": 0.4599, + "step": 31033 + }, + { + "epoch": 1.7378205846119386, + "grad_norm": 1.4048056602478027, + "learning_rate": 9.71e-05, + "loss": 0.4401, + "step": 31034 + }, + { + "epoch": 1.7378765819240676, + "grad_norm": 1.2840206623077393, + "learning_rate": 9.709973684210528e-05, + "loss": 0.4556, + "step": 31035 + }, + { + "epoch": 1.7379325792361966, + "grad_norm": 1.3036516904830933, + "learning_rate": 9.709947368421054e-05, + "loss": 0.4384, + "step": 31036 + }, + { + "epoch": 1.7379885765483256, + "grad_norm": 1.2935075759887695, + "learning_rate": 9.70992105263158e-05, + "loss": 0.4812, + "step": 31037 + }, + { + "epoch": 1.7380445738604546, + 
"grad_norm": 1.361349105834961, + "learning_rate": 9.709894736842105e-05, + "loss": 0.42, + "step": 31038 + }, + { + "epoch": 1.7381005711725837, + "grad_norm": 1.3498167991638184, + "learning_rate": 9.709868421052633e-05, + "loss": 0.458, + "step": 31039 + }, + { + "epoch": 1.7381565684847127, + "grad_norm": 1.1751586198806763, + "learning_rate": 9.709842105263159e-05, + "loss": 0.4148, + "step": 31040 + }, + { + "epoch": 1.7382125657968417, + "grad_norm": 1.1283135414123535, + "learning_rate": 9.709815789473685e-05, + "loss": 0.2819, + "step": 31041 + }, + { + "epoch": 1.7382685631089707, + "grad_norm": 1.1883455514907837, + "learning_rate": 9.709789473684211e-05, + "loss": 0.4182, + "step": 31042 + }, + { + "epoch": 1.7383245604210997, + "grad_norm": 1.396022081375122, + "learning_rate": 9.709763157894737e-05, + "loss": 0.4821, + "step": 31043 + }, + { + "epoch": 1.7383805577332287, + "grad_norm": 1.1936545372009277, + "learning_rate": 9.709736842105264e-05, + "loss": 0.4972, + "step": 31044 + }, + { + "epoch": 1.7384365550453578, + "grad_norm": 1.2382044792175293, + "learning_rate": 9.70971052631579e-05, + "loss": 0.5357, + "step": 31045 + }, + { + "epoch": 1.7384925523574868, + "grad_norm": 1.3015687465667725, + "learning_rate": 9.709684210526316e-05, + "loss": 0.4523, + "step": 31046 + }, + { + "epoch": 1.7385485496696158, + "grad_norm": 1.3992559909820557, + "learning_rate": 9.709657894736842e-05, + "loss": 0.3672, + "step": 31047 + }, + { + "epoch": 1.7386045469817448, + "grad_norm": 1.282843828201294, + "learning_rate": 9.709631578947369e-05, + "loss": 0.4421, + "step": 31048 + }, + { + "epoch": 1.7386605442938738, + "grad_norm": 1.4199289083480835, + "learning_rate": 9.709605263157895e-05, + "loss": 0.5129, + "step": 31049 + }, + { + "epoch": 1.7387165416060029, + "grad_norm": 0.9760697484016418, + "learning_rate": 9.709578947368421e-05, + "loss": 0.3753, + "step": 31050 + }, + { + "epoch": 1.7387725389181319, + "grad_norm": 1.2729848623275757, + 
"learning_rate": 9.709552631578947e-05, + "loss": 0.4728, + "step": 31051 + }, + { + "epoch": 1.738828536230261, + "grad_norm": 1.1899892091751099, + "learning_rate": 9.709526315789474e-05, + "loss": 0.4492, + "step": 31052 + }, + { + "epoch": 1.73888453354239, + "grad_norm": 1.205531358718872, + "learning_rate": 9.7095e-05, + "loss": 0.5223, + "step": 31053 + }, + { + "epoch": 1.738940530854519, + "grad_norm": 1.3818005323410034, + "learning_rate": 9.709473684210528e-05, + "loss": 0.3802, + "step": 31054 + }, + { + "epoch": 1.738996528166648, + "grad_norm": 1.2059881687164307, + "learning_rate": 9.709447368421052e-05, + "loss": 0.4753, + "step": 31055 + }, + { + "epoch": 1.739052525478777, + "grad_norm": 1.1895244121551514, + "learning_rate": 9.70942105263158e-05, + "loss": 0.4158, + "step": 31056 + }, + { + "epoch": 1.739108522790906, + "grad_norm": 1.4219253063201904, + "learning_rate": 9.709394736842106e-05, + "loss": 0.398, + "step": 31057 + }, + { + "epoch": 1.739164520103035, + "grad_norm": 1.3253235816955566, + "learning_rate": 9.709368421052633e-05, + "loss": 0.5903, + "step": 31058 + }, + { + "epoch": 1.739220517415164, + "grad_norm": 1.251280665397644, + "learning_rate": 9.709342105263159e-05, + "loss": 0.484, + "step": 31059 + }, + { + "epoch": 1.739276514727293, + "grad_norm": 1.4518582820892334, + "learning_rate": 9.709315789473684e-05, + "loss": 0.5893, + "step": 31060 + }, + { + "epoch": 1.739332512039422, + "grad_norm": 1.418822169303894, + "learning_rate": 9.709289473684211e-05, + "loss": 0.5994, + "step": 31061 + }, + { + "epoch": 1.739388509351551, + "grad_norm": 1.2382903099060059, + "learning_rate": 9.709263157894737e-05, + "loss": 0.4333, + "step": 31062 + }, + { + "epoch": 1.7394445066636801, + "grad_norm": 1.23009192943573, + "learning_rate": 9.709236842105264e-05, + "loss": 0.5559, + "step": 31063 + }, + { + "epoch": 1.7395005039758091, + "grad_norm": 1.3467671871185303, + "learning_rate": 9.709210526315789e-05, + "loss": 0.441, + "step": 
31064 + }, + { + "epoch": 1.7395565012879382, + "grad_norm": 1.2569304704666138, + "learning_rate": 9.709184210526316e-05, + "loss": 0.4839, + "step": 31065 + }, + { + "epoch": 1.7396124986000672, + "grad_norm": 1.2604433298110962, + "learning_rate": 9.709157894736842e-05, + "loss": 0.4684, + "step": 31066 + }, + { + "epoch": 1.7396684959121962, + "grad_norm": 1.1469084024429321, + "learning_rate": 9.70913157894737e-05, + "loss": 0.4653, + "step": 31067 + }, + { + "epoch": 1.7397244932243252, + "grad_norm": 1.407639503479004, + "learning_rate": 9.709105263157895e-05, + "loss": 0.4579, + "step": 31068 + }, + { + "epoch": 1.7397804905364542, + "grad_norm": 1.30097496509552, + "learning_rate": 9.709078947368421e-05, + "loss": 0.6514, + "step": 31069 + }, + { + "epoch": 1.7398364878485832, + "grad_norm": 1.5303374528884888, + "learning_rate": 9.709052631578947e-05, + "loss": 0.477, + "step": 31070 + }, + { + "epoch": 1.7398924851607123, + "grad_norm": 1.188430905342102, + "learning_rate": 9.709026315789475e-05, + "loss": 0.4511, + "step": 31071 + }, + { + "epoch": 1.7399484824728413, + "grad_norm": 1.0792704820632935, + "learning_rate": 9.709000000000001e-05, + "loss": 0.3328, + "step": 31072 + }, + { + "epoch": 1.7400044797849703, + "grad_norm": 1.2438716888427734, + "learning_rate": 9.708973684210527e-05, + "loss": 0.3821, + "step": 31073 + }, + { + "epoch": 1.7400604770970993, + "grad_norm": 1.2731200456619263, + "learning_rate": 9.708947368421053e-05, + "loss": 0.4448, + "step": 31074 + }, + { + "epoch": 1.7401164744092283, + "grad_norm": 1.2387651205062866, + "learning_rate": 9.70892105263158e-05, + "loss": 0.4412, + "step": 31075 + }, + { + "epoch": 1.7401724717213574, + "grad_norm": 1.2949508428573608, + "learning_rate": 9.708894736842106e-05, + "loss": 0.47, + "step": 31076 + }, + { + "epoch": 1.7402284690334864, + "grad_norm": 1.228320598602295, + "learning_rate": 9.708868421052632e-05, + "loss": 0.408, + "step": 31077 + }, + { + "epoch": 1.7402844663456154, + 
"grad_norm": 1.0673197507858276, + "learning_rate": 9.708842105263158e-05, + "loss": 0.3355, + "step": 31078 + }, + { + "epoch": 1.7403404636577444, + "grad_norm": 1.6339677572250366, + "learning_rate": 9.708815789473684e-05, + "loss": 0.5258, + "step": 31079 + }, + { + "epoch": 1.7403964609698734, + "grad_norm": 1.0141102075576782, + "learning_rate": 9.708789473684211e-05, + "loss": 0.3434, + "step": 31080 + }, + { + "epoch": 1.7404524582820025, + "grad_norm": 1.1632977724075317, + "learning_rate": 9.708763157894737e-05, + "loss": 0.4494, + "step": 31081 + }, + { + "epoch": 1.7405084555941315, + "grad_norm": 1.386582374572754, + "learning_rate": 9.708736842105263e-05, + "loss": 0.5816, + "step": 31082 + }, + { + "epoch": 1.7405644529062605, + "grad_norm": 1.2117961645126343, + "learning_rate": 9.708710526315789e-05, + "loss": 0.3829, + "step": 31083 + }, + { + "epoch": 1.7406204502183895, + "grad_norm": 1.2344613075256348, + "learning_rate": 9.708684210526316e-05, + "loss": 0.6839, + "step": 31084 + }, + { + "epoch": 1.7406764475305185, + "grad_norm": 1.557342529296875, + "learning_rate": 9.708657894736842e-05, + "loss": 0.6371, + "step": 31085 + }, + { + "epoch": 1.7407324448426476, + "grad_norm": 1.478472352027893, + "learning_rate": 9.70863157894737e-05, + "loss": 0.4407, + "step": 31086 + }, + { + "epoch": 1.7407884421547766, + "grad_norm": 1.3969019651412964, + "learning_rate": 9.708605263157894e-05, + "loss": 0.4745, + "step": 31087 + }, + { + "epoch": 1.7408444394669056, + "grad_norm": 1.1291042566299438, + "learning_rate": 9.708578947368422e-05, + "loss": 0.3991, + "step": 31088 + }, + { + "epoch": 1.7409004367790346, + "grad_norm": 1.3396730422973633, + "learning_rate": 9.708552631578948e-05, + "loss": 0.4702, + "step": 31089 + }, + { + "epoch": 1.7409564340911636, + "grad_norm": 1.233237385749817, + "learning_rate": 9.708526315789475e-05, + "loss": 0.4785, + "step": 31090 + }, + { + "epoch": 1.7410124314032926, + "grad_norm": 1.2310079336166382, + 
"learning_rate": 9.708500000000001e-05, + "loss": 0.3986, + "step": 31091 + }, + { + "epoch": 1.7410684287154217, + "grad_norm": 1.2281066179275513, + "learning_rate": 9.708473684210527e-05, + "loss": 0.3578, + "step": 31092 + }, + { + "epoch": 1.7411244260275507, + "grad_norm": 1.6963021755218506, + "learning_rate": 9.708447368421053e-05, + "loss": 0.7621, + "step": 31093 + }, + { + "epoch": 1.7411804233396797, + "grad_norm": 1.2462592124938965, + "learning_rate": 9.708421052631579e-05, + "loss": 0.5078, + "step": 31094 + }, + { + "epoch": 1.7412364206518087, + "grad_norm": 1.1407694816589355, + "learning_rate": 9.708394736842106e-05, + "loss": 0.464, + "step": 31095 + }, + { + "epoch": 1.7412924179639377, + "grad_norm": 1.5024384260177612, + "learning_rate": 9.708368421052632e-05, + "loss": 0.354, + "step": 31096 + }, + { + "epoch": 1.7413484152760668, + "grad_norm": 1.2973623275756836, + "learning_rate": 9.708342105263158e-05, + "loss": 0.3644, + "step": 31097 + }, + { + "epoch": 1.7414044125881958, + "grad_norm": 1.157495141029358, + "learning_rate": 9.708315789473684e-05, + "loss": 0.4756, + "step": 31098 + }, + { + "epoch": 1.7414604099003248, + "grad_norm": 1.5629620552062988, + "learning_rate": 9.708289473684211e-05, + "loss": 0.5234, + "step": 31099 + }, + { + "epoch": 1.7415164072124538, + "grad_norm": 1.3443243503570557, + "learning_rate": 9.708263157894737e-05, + "loss": 0.456, + "step": 31100 + }, + { + "epoch": 1.7415724045245828, + "grad_norm": 1.2040119171142578, + "learning_rate": 9.708236842105263e-05, + "loss": 0.3734, + "step": 31101 + }, + { + "epoch": 1.7416284018367119, + "grad_norm": 1.5180034637451172, + "learning_rate": 9.70821052631579e-05, + "loss": 0.4813, + "step": 31102 + }, + { + "epoch": 1.7416843991488409, + "grad_norm": 1.3192877769470215, + "learning_rate": 9.708184210526317e-05, + "loss": 0.4648, + "step": 31103 + }, + { + "epoch": 1.74174039646097, + "grad_norm": 1.2469027042388916, + "learning_rate": 9.708157894736843e-05, + 
"loss": 0.4622, + "step": 31104 + }, + { + "epoch": 1.741796393773099, + "grad_norm": 1.5461679697036743, + "learning_rate": 9.708131578947369e-05, + "loss": 0.3631, + "step": 31105 + }, + { + "epoch": 1.741852391085228, + "grad_norm": 1.1623367071151733, + "learning_rate": 9.708105263157895e-05, + "loss": 0.5347, + "step": 31106 + }, + { + "epoch": 1.741908388397357, + "grad_norm": 1.2339093685150146, + "learning_rate": 9.708078947368422e-05, + "loss": 0.4605, + "step": 31107 + }, + { + "epoch": 1.741964385709486, + "grad_norm": 1.197686791419983, + "learning_rate": 9.708052631578948e-05, + "loss": 0.5984, + "step": 31108 + }, + { + "epoch": 1.742020383021615, + "grad_norm": 1.1987754106521606, + "learning_rate": 9.708026315789475e-05, + "loss": 0.3691, + "step": 31109 + }, + { + "epoch": 1.742076380333744, + "grad_norm": 1.3697737455368042, + "learning_rate": 9.708e-05, + "loss": 0.446, + "step": 31110 + }, + { + "epoch": 1.742132377645873, + "grad_norm": 1.2985669374465942, + "learning_rate": 9.707973684210527e-05, + "loss": 0.4275, + "step": 31111 + }, + { + "epoch": 1.742188374958002, + "grad_norm": 1.3137775659561157, + "learning_rate": 9.707947368421053e-05, + "loss": 0.4253, + "step": 31112 + }, + { + "epoch": 1.742244372270131, + "grad_norm": 1.1247609853744507, + "learning_rate": 9.707921052631579e-05, + "loss": 0.4312, + "step": 31113 + }, + { + "epoch": 1.74230036958226, + "grad_norm": 1.3682801723480225, + "learning_rate": 9.707894736842106e-05, + "loss": 0.5176, + "step": 31114 + }, + { + "epoch": 1.742356366894389, + "grad_norm": 1.2950327396392822, + "learning_rate": 9.707868421052631e-05, + "loss": 0.4822, + "step": 31115 + }, + { + "epoch": 1.7424123642065181, + "grad_norm": 1.2705243825912476, + "learning_rate": 9.707842105263158e-05, + "loss": 0.4913, + "step": 31116 + }, + { + "epoch": 1.7424683615186471, + "grad_norm": 1.3573381900787354, + "learning_rate": 9.707815789473684e-05, + "loss": 0.4824, + "step": 31117 + }, + { + "epoch": 
1.7425243588307762, + "grad_norm": 1.2273638248443604, + "learning_rate": 9.707789473684212e-05, + "loss": 0.4039, + "step": 31118 + }, + { + "epoch": 1.7425803561429052, + "grad_norm": 1.455743670463562, + "learning_rate": 9.707763157894736e-05, + "loss": 0.4769, + "step": 31119 + }, + { + "epoch": 1.7426363534550342, + "grad_norm": 1.5062799453735352, + "learning_rate": 9.707736842105264e-05, + "loss": 0.3431, + "step": 31120 + }, + { + "epoch": 1.7426923507671632, + "grad_norm": 1.1785157918930054, + "learning_rate": 9.70771052631579e-05, + "loss": 0.6031, + "step": 31121 + }, + { + "epoch": 1.7427483480792922, + "grad_norm": 1.2476261854171753, + "learning_rate": 9.707684210526317e-05, + "loss": 0.3797, + "step": 31122 + }, + { + "epoch": 1.7428043453914213, + "grad_norm": 1.257611870765686, + "learning_rate": 9.707657894736843e-05, + "loss": 0.4104, + "step": 31123 + }, + { + "epoch": 1.7428603427035503, + "grad_norm": 1.2240301370620728, + "learning_rate": 9.707631578947369e-05, + "loss": 0.4853, + "step": 31124 + }, + { + "epoch": 1.7429163400156793, + "grad_norm": 1.4875454902648926, + "learning_rate": 9.707605263157895e-05, + "loss": 0.4447, + "step": 31125 + }, + { + "epoch": 1.7429723373278083, + "grad_norm": 1.1892359256744385, + "learning_rate": 9.707578947368422e-05, + "loss": 0.4327, + "step": 31126 + }, + { + "epoch": 1.7430283346399373, + "grad_norm": 1.1689327955245972, + "learning_rate": 9.707552631578948e-05, + "loss": 0.3766, + "step": 31127 + }, + { + "epoch": 1.7430843319520664, + "grad_norm": 1.5304299592971802, + "learning_rate": 9.707526315789474e-05, + "loss": 0.6333, + "step": 31128 + }, + { + "epoch": 1.7431403292641954, + "grad_norm": 1.2294846773147583, + "learning_rate": 9.7075e-05, + "loss": 0.4744, + "step": 31129 + }, + { + "epoch": 1.7431963265763244, + "grad_norm": 1.062276005744934, + "learning_rate": 9.707473684210526e-05, + "loss": 0.3565, + "step": 31130 + }, + { + "epoch": 1.7432523238884534, + "grad_norm": 
1.1798160076141357, + "learning_rate": 9.707447368421053e-05, + "loss": 0.3753, + "step": 31131 + }, + { + "epoch": 1.7433083212005824, + "grad_norm": 1.4706741571426392, + "learning_rate": 9.70742105263158e-05, + "loss": 0.6998, + "step": 31132 + }, + { + "epoch": 1.7433643185127115, + "grad_norm": 1.335087776184082, + "learning_rate": 9.707394736842105e-05, + "loss": 0.4379, + "step": 31133 + }, + { + "epoch": 1.7434203158248405, + "grad_norm": 1.7656105756759644, + "learning_rate": 9.707368421052631e-05, + "loss": 0.4847, + "step": 31134 + }, + { + "epoch": 1.7434763131369695, + "grad_norm": 1.0609756708145142, + "learning_rate": 9.707342105263159e-05, + "loss": 0.3443, + "step": 31135 + }, + { + "epoch": 1.7435323104490985, + "grad_norm": 1.1438069343566895, + "learning_rate": 9.707315789473685e-05, + "loss": 0.4442, + "step": 31136 + }, + { + "epoch": 1.7435883077612275, + "grad_norm": 1.1084822416305542, + "learning_rate": 9.70728947368421e-05, + "loss": 0.4142, + "step": 31137 + }, + { + "epoch": 1.7436443050733565, + "grad_norm": 1.147504210472107, + "learning_rate": 9.707263157894737e-05, + "loss": 0.3977, + "step": 31138 + }, + { + "epoch": 1.7437003023854856, + "grad_norm": 1.682477355003357, + "learning_rate": 9.707236842105264e-05, + "loss": 0.4326, + "step": 31139 + }, + { + "epoch": 1.7437562996976146, + "grad_norm": 1.200075387954712, + "learning_rate": 9.70721052631579e-05, + "loss": 0.3948, + "step": 31140 + }, + { + "epoch": 1.7438122970097436, + "grad_norm": 1.2038331031799316, + "learning_rate": 9.707184210526317e-05, + "loss": 0.3892, + "step": 31141 + }, + { + "epoch": 1.7438682943218726, + "grad_norm": 1.3430256843566895, + "learning_rate": 9.707157894736842e-05, + "loss": 0.499, + "step": 31142 + }, + { + "epoch": 1.7439242916340016, + "grad_norm": 1.3392972946166992, + "learning_rate": 9.707131578947369e-05, + "loss": 0.5093, + "step": 31143 + }, + { + "epoch": 1.7439802889461307, + "grad_norm": 1.2368052005767822, + "learning_rate": 
9.707105263157895e-05, + "loss": 0.3795, + "step": 31144 + }, + { + "epoch": 1.7440362862582597, + "grad_norm": 1.3254647254943848, + "learning_rate": 9.707078947368422e-05, + "loss": 0.41, + "step": 31145 + }, + { + "epoch": 1.7440922835703887, + "grad_norm": 1.5145714282989502, + "learning_rate": 9.707052631578948e-05, + "loss": 0.4794, + "step": 31146 + }, + { + "epoch": 1.7441482808825177, + "grad_norm": 1.277005910873413, + "learning_rate": 9.707026315789473e-05, + "loss": 0.4421, + "step": 31147 + }, + { + "epoch": 1.7442042781946467, + "grad_norm": 1.0948920249938965, + "learning_rate": 9.707e-05, + "loss": 0.3123, + "step": 31148 + }, + { + "epoch": 1.7442602755067758, + "grad_norm": 1.3008092641830444, + "learning_rate": 9.706973684210526e-05, + "loss": 0.5022, + "step": 31149 + }, + { + "epoch": 1.7443162728189048, + "grad_norm": 1.3784140348434448, + "learning_rate": 9.706947368421054e-05, + "loss": 0.4771, + "step": 31150 + }, + { + "epoch": 1.7443722701310338, + "grad_norm": 1.32255220413208, + "learning_rate": 9.70692105263158e-05, + "loss": 0.4237, + "step": 31151 + }, + { + "epoch": 1.7444282674431628, + "grad_norm": 1.2068743705749512, + "learning_rate": 9.706894736842106e-05, + "loss": 0.4902, + "step": 31152 + }, + { + "epoch": 1.7444842647552918, + "grad_norm": 1.5408138036727905, + "learning_rate": 9.706868421052632e-05, + "loss": 0.5773, + "step": 31153 + }, + { + "epoch": 1.7445402620674209, + "grad_norm": 2.509183883666992, + "learning_rate": 9.706842105263159e-05, + "loss": 0.5349, + "step": 31154 + }, + { + "epoch": 1.7445962593795499, + "grad_norm": 1.1979178190231323, + "learning_rate": 9.706815789473685e-05, + "loss": 0.3273, + "step": 31155 + }, + { + "epoch": 1.744652256691679, + "grad_norm": 1.3609654903411865, + "learning_rate": 9.706789473684211e-05, + "loss": 0.4675, + "step": 31156 + }, + { + "epoch": 1.744708254003808, + "grad_norm": 1.5484447479248047, + "learning_rate": 9.706763157894737e-05, + "loss": 0.5337, + "step": 31157 
+ }, + { + "epoch": 1.744764251315937, + "grad_norm": 1.31448495388031, + "learning_rate": 9.706736842105264e-05, + "loss": 0.4891, + "step": 31158 + }, + { + "epoch": 1.744820248628066, + "grad_norm": 1.3487775325775146, + "learning_rate": 9.70671052631579e-05, + "loss": 0.553, + "step": 31159 + }, + { + "epoch": 1.744876245940195, + "grad_norm": 1.3651710748672485, + "learning_rate": 9.706684210526316e-05, + "loss": 0.6034, + "step": 31160 + }, + { + "epoch": 1.744932243252324, + "grad_norm": 1.2114300727844238, + "learning_rate": 9.706657894736842e-05, + "loss": 0.3901, + "step": 31161 + }, + { + "epoch": 1.744988240564453, + "grad_norm": 1.1772379875183105, + "learning_rate": 9.70663157894737e-05, + "loss": 0.4121, + "step": 31162 + }, + { + "epoch": 1.745044237876582, + "grad_norm": 1.1749367713928223, + "learning_rate": 9.706605263157895e-05, + "loss": 0.4912, + "step": 31163 + }, + { + "epoch": 1.745100235188711, + "grad_norm": 1.1842727661132812, + "learning_rate": 9.706578947368421e-05, + "loss": 0.4274, + "step": 31164 + }, + { + "epoch": 1.74515623250084, + "grad_norm": 1.3142340183258057, + "learning_rate": 9.706552631578947e-05, + "loss": 0.5152, + "step": 31165 + }, + { + "epoch": 1.745212229812969, + "grad_norm": 1.4865119457244873, + "learning_rate": 9.706526315789473e-05, + "loss": 0.486, + "step": 31166 + }, + { + "epoch": 1.745268227125098, + "grad_norm": 1.3705570697784424, + "learning_rate": 9.7065e-05, + "loss": 0.5238, + "step": 31167 + }, + { + "epoch": 1.7453242244372271, + "grad_norm": 1.315129041671753, + "learning_rate": 9.706473684210527e-05, + "loss": 0.5359, + "step": 31168 + }, + { + "epoch": 1.7453802217493561, + "grad_norm": 1.2519727945327759, + "learning_rate": 9.706447368421054e-05, + "loss": 0.3968, + "step": 31169 + }, + { + "epoch": 1.7454362190614852, + "grad_norm": 1.2539435625076294, + "learning_rate": 9.706421052631579e-05, + "loss": 0.3755, + "step": 31170 + }, + { + "epoch": 1.7454922163736142, + "grad_norm": 
1.0977662801742554, + "learning_rate": 9.706394736842106e-05, + "loss": 0.4074, + "step": 31171 + }, + { + "epoch": 1.7455482136857432, + "grad_norm": 1.293164849281311, + "learning_rate": 9.706368421052632e-05, + "loss": 0.3691, + "step": 31172 + }, + { + "epoch": 1.7456042109978722, + "grad_norm": 1.315238356590271, + "learning_rate": 9.706342105263159e-05, + "loss": 0.4713, + "step": 31173 + }, + { + "epoch": 1.7456602083100012, + "grad_norm": 1.282739520072937, + "learning_rate": 9.706315789473684e-05, + "loss": 0.3478, + "step": 31174 + }, + { + "epoch": 1.7457162056221303, + "grad_norm": 1.2086079120635986, + "learning_rate": 9.706289473684211e-05, + "loss": 0.5047, + "step": 31175 + }, + { + "epoch": 1.7457722029342593, + "grad_norm": 2.1796000003814697, + "learning_rate": 9.706263157894737e-05, + "loss": 0.5282, + "step": 31176 + }, + { + "epoch": 1.7458282002463883, + "grad_norm": 1.298432469367981, + "learning_rate": 9.706236842105264e-05, + "loss": 0.4878, + "step": 31177 + }, + { + "epoch": 1.7458841975585173, + "grad_norm": 1.4929630756378174, + "learning_rate": 9.70621052631579e-05, + "loss": 0.505, + "step": 31178 + }, + { + "epoch": 1.7459401948706463, + "grad_norm": 1.1581120491027832, + "learning_rate": 9.706184210526316e-05, + "loss": 0.3978, + "step": 31179 + }, + { + "epoch": 1.7459961921827754, + "grad_norm": 1.0985528230667114, + "learning_rate": 9.706157894736842e-05, + "loss": 0.4282, + "step": 31180 + }, + { + "epoch": 1.7460521894949044, + "grad_norm": 1.657619595527649, + "learning_rate": 9.706131578947368e-05, + "loss": 0.7524, + "step": 31181 + }, + { + "epoch": 1.7461081868070334, + "grad_norm": 1.113402009010315, + "learning_rate": 9.706105263157896e-05, + "loss": 0.3417, + "step": 31182 + }, + { + "epoch": 1.7461641841191624, + "grad_norm": 1.1263939142227173, + "learning_rate": 9.706078947368422e-05, + "loss": 0.4483, + "step": 31183 + }, + { + "epoch": 1.7462201814312914, + "grad_norm": 1.3888018131256104, + "learning_rate": 
9.706052631578948e-05, + "loss": 0.6088, + "step": 31184 + }, + { + "epoch": 1.7462761787434204, + "grad_norm": 1.1755828857421875, + "learning_rate": 9.706026315789474e-05, + "loss": 0.423, + "step": 31185 + }, + { + "epoch": 1.7463321760555495, + "grad_norm": 1.1729487180709839, + "learning_rate": 9.706000000000001e-05, + "loss": 0.4509, + "step": 31186 + }, + { + "epoch": 1.7463881733676785, + "grad_norm": 1.492467999458313, + "learning_rate": 9.705973684210527e-05, + "loss": 0.5417, + "step": 31187 + }, + { + "epoch": 1.7464441706798075, + "grad_norm": 1.4126147031784058, + "learning_rate": 9.705947368421053e-05, + "loss": 0.5848, + "step": 31188 + }, + { + "epoch": 1.7465001679919365, + "grad_norm": 1.270298719406128, + "learning_rate": 9.705921052631579e-05, + "loss": 0.473, + "step": 31189 + }, + { + "epoch": 1.7465561653040655, + "grad_norm": 1.3339388370513916, + "learning_rate": 9.705894736842106e-05, + "loss": 0.5668, + "step": 31190 + }, + { + "epoch": 1.7466121626161946, + "grad_norm": 1.4679417610168457, + "learning_rate": 9.705868421052632e-05, + "loss": 0.6043, + "step": 31191 + }, + { + "epoch": 1.7466681599283236, + "grad_norm": 1.3213615417480469, + "learning_rate": 9.705842105263158e-05, + "loss": 0.4504, + "step": 31192 + }, + { + "epoch": 1.7467241572404526, + "grad_norm": 1.5918020009994507, + "learning_rate": 9.705815789473684e-05, + "loss": 0.5611, + "step": 31193 + }, + { + "epoch": 1.7467801545525816, + "grad_norm": 1.4029831886291504, + "learning_rate": 9.705789473684211e-05, + "loss": 0.3647, + "step": 31194 + }, + { + "epoch": 1.7468361518647106, + "grad_norm": 1.5055946111679077, + "learning_rate": 9.705763157894737e-05, + "loss": 0.5016, + "step": 31195 + }, + { + "epoch": 1.7468921491768397, + "grad_norm": 1.0396732091903687, + "learning_rate": 9.705736842105265e-05, + "loss": 0.3701, + "step": 31196 + }, + { + "epoch": 1.7469481464889687, + "grad_norm": 1.2922356128692627, + "learning_rate": 9.70571052631579e-05, + "loss": 0.5646, 
+ "step": 31197 + }, + { + "epoch": 1.7470041438010977, + "grad_norm": 1.3413194417953491, + "learning_rate": 9.705684210526315e-05, + "loss": 0.4088, + "step": 31198 + }, + { + "epoch": 1.7470601411132267, + "grad_norm": 1.216722846031189, + "learning_rate": 9.705657894736843e-05, + "loss": 0.5008, + "step": 31199 + }, + { + "epoch": 1.7471161384253557, + "grad_norm": 1.3155068159103394, + "learning_rate": 9.705631578947369e-05, + "loss": 0.4174, + "step": 31200 + }, + { + "epoch": 1.7471721357374848, + "grad_norm": 1.207533836364746, + "learning_rate": 9.705605263157896e-05, + "loss": 0.5191, + "step": 31201 + }, + { + "epoch": 1.7472281330496138, + "grad_norm": 1.6117228269577026, + "learning_rate": 9.70557894736842e-05, + "loss": 0.5692, + "step": 31202 + }, + { + "epoch": 1.7472841303617428, + "grad_norm": 1.204028844833374, + "learning_rate": 9.705552631578948e-05, + "loss": 0.4536, + "step": 31203 + }, + { + "epoch": 1.7473401276738718, + "grad_norm": 1.4153550863265991, + "learning_rate": 9.705526315789474e-05, + "loss": 0.438, + "step": 31204 + }, + { + "epoch": 1.7473961249860008, + "grad_norm": 1.072204828262329, + "learning_rate": 9.705500000000001e-05, + "loss": 0.3561, + "step": 31205 + }, + { + "epoch": 1.7474521222981299, + "grad_norm": 1.1738938093185425, + "learning_rate": 9.705473684210527e-05, + "loss": 0.411, + "step": 31206 + }, + { + "epoch": 1.7475081196102589, + "grad_norm": 1.049439549446106, + "learning_rate": 9.705447368421053e-05, + "loss": 0.3586, + "step": 31207 + }, + { + "epoch": 1.747564116922388, + "grad_norm": 1.2313178777694702, + "learning_rate": 9.705421052631579e-05, + "loss": 0.4807, + "step": 31208 + }, + { + "epoch": 1.747620114234517, + "grad_norm": 1.2798324823379517, + "learning_rate": 9.705394736842106e-05, + "loss": 0.3847, + "step": 31209 + }, + { + "epoch": 1.747676111546646, + "grad_norm": 1.2703906297683716, + "learning_rate": 9.705368421052632e-05, + "loss": 0.4997, + "step": 31210 + }, + { + "epoch": 
1.7477321088587747, + "grad_norm": 1.3671642541885376, + "learning_rate": 9.705342105263158e-05, + "loss": 0.4343, + "step": 31211 + }, + { + "epoch": 1.7477881061709037, + "grad_norm": 1.385200023651123, + "learning_rate": 9.705315789473684e-05, + "loss": 0.478, + "step": 31212 + }, + { + "epoch": 1.7478441034830328, + "grad_norm": 1.0814567804336548, + "learning_rate": 9.705289473684212e-05, + "loss": 0.3617, + "step": 31213 + }, + { + "epoch": 1.7479001007951618, + "grad_norm": 1.0792593955993652, + "learning_rate": 9.705263157894738e-05, + "loss": 0.3795, + "step": 31214 + }, + { + "epoch": 1.7479560981072908, + "grad_norm": 1.7391676902770996, + "learning_rate": 9.705236842105264e-05, + "loss": 0.4514, + "step": 31215 + }, + { + "epoch": 1.7480120954194198, + "grad_norm": 1.5550775527954102, + "learning_rate": 9.70521052631579e-05, + "loss": 0.6263, + "step": 31216 + }, + { + "epoch": 1.7480680927315488, + "grad_norm": 1.538326382637024, + "learning_rate": 9.705184210526316e-05, + "loss": 0.5031, + "step": 31217 + }, + { + "epoch": 1.7481240900436779, + "grad_norm": 1.129977822303772, + "learning_rate": 9.705157894736843e-05, + "loss": 0.3993, + "step": 31218 + }, + { + "epoch": 1.7481800873558069, + "grad_norm": 1.1480122804641724, + "learning_rate": 9.705131578947369e-05, + "loss": 0.4013, + "step": 31219 + }, + { + "epoch": 1.748236084667936, + "grad_norm": 1.175819754600525, + "learning_rate": 9.705105263157895e-05, + "loss": 0.3819, + "step": 31220 + }, + { + "epoch": 1.748292081980065, + "grad_norm": 1.3679624795913696, + "learning_rate": 9.705078947368421e-05, + "loss": 0.442, + "step": 31221 + }, + { + "epoch": 1.748348079292194, + "grad_norm": 1.1933364868164062, + "learning_rate": 9.705052631578948e-05, + "loss": 0.4248, + "step": 31222 + }, + { + "epoch": 1.748404076604323, + "grad_norm": 1.593381404876709, + "learning_rate": 9.705026315789474e-05, + "loss": 0.3883, + "step": 31223 + }, + { + "epoch": 1.748460073916452, + "grad_norm": 
1.7808988094329834, + "learning_rate": 9.705e-05, + "loss": 0.517, + "step": 31224 + }, + { + "epoch": 1.748516071228581, + "grad_norm": 1.6892181634902954, + "learning_rate": 9.704973684210526e-05, + "loss": 0.5065, + "step": 31225 + }, + { + "epoch": 1.74857206854071, + "grad_norm": 1.612975001335144, + "learning_rate": 9.704947368421053e-05, + "loss": 0.5153, + "step": 31226 + }, + { + "epoch": 1.748628065852839, + "grad_norm": 1.4767022132873535, + "learning_rate": 9.70492105263158e-05, + "loss": 0.4598, + "step": 31227 + }, + { + "epoch": 1.748684063164968, + "grad_norm": 1.2587836980819702, + "learning_rate": 9.704894736842107e-05, + "loss": 0.4505, + "step": 31228 + }, + { + "epoch": 1.748740060477097, + "grad_norm": 1.2305335998535156, + "learning_rate": 9.704868421052631e-05, + "loss": 0.4602, + "step": 31229 + }, + { + "epoch": 1.748796057789226, + "grad_norm": 1.1922690868377686, + "learning_rate": 9.704842105263159e-05, + "loss": 0.5075, + "step": 31230 + }, + { + "epoch": 1.748852055101355, + "grad_norm": 1.8206361532211304, + "learning_rate": 9.704815789473685e-05, + "loss": 0.482, + "step": 31231 + }, + { + "epoch": 1.7489080524134841, + "grad_norm": 1.375199556350708, + "learning_rate": 9.704789473684212e-05, + "loss": 0.4182, + "step": 31232 + }, + { + "epoch": 1.7489640497256131, + "grad_norm": 1.3915208578109741, + "learning_rate": 9.704763157894738e-05, + "loss": 0.4135, + "step": 31233 + }, + { + "epoch": 1.7490200470377422, + "grad_norm": 1.5195192098617554, + "learning_rate": 9.704736842105263e-05, + "loss": 0.4485, + "step": 31234 + }, + { + "epoch": 1.7490760443498712, + "grad_norm": 1.5032994747161865, + "learning_rate": 9.70471052631579e-05, + "loss": 0.4478, + "step": 31235 + }, + { + "epoch": 1.7491320416620002, + "grad_norm": 1.0254724025726318, + "learning_rate": 9.704684210526316e-05, + "loss": 0.3381, + "step": 31236 + }, + { + "epoch": 1.7491880389741292, + "grad_norm": 1.2206236124038696, + "learning_rate": 9.704657894736843e-05, 
+ "loss": 0.4974, + "step": 31237 + }, + { + "epoch": 1.7492440362862582, + "grad_norm": 1.2313346862792969, + "learning_rate": 9.704631578947369e-05, + "loss": 0.3825, + "step": 31238 + }, + { + "epoch": 1.7493000335983873, + "grad_norm": 1.2586702108383179, + "learning_rate": 9.704605263157895e-05, + "loss": 0.3527, + "step": 31239 + }, + { + "epoch": 1.7493560309105163, + "grad_norm": 1.4547477960586548, + "learning_rate": 9.704578947368421e-05, + "loss": 0.5049, + "step": 31240 + }, + { + "epoch": 1.7494120282226453, + "grad_norm": 1.1846121549606323, + "learning_rate": 9.704552631578948e-05, + "loss": 0.3511, + "step": 31241 + }, + { + "epoch": 1.7494680255347743, + "grad_norm": 1.0702062845230103, + "learning_rate": 9.704526315789474e-05, + "loss": 0.3378, + "step": 31242 + }, + { + "epoch": 1.7495240228469033, + "grad_norm": 1.2872754335403442, + "learning_rate": 9.7045e-05, + "loss": 0.3932, + "step": 31243 + }, + { + "epoch": 1.7495800201590324, + "grad_norm": 1.1747994422912598, + "learning_rate": 9.704473684210526e-05, + "loss": 0.4573, + "step": 31244 + }, + { + "epoch": 1.7496360174711614, + "grad_norm": 1.3304767608642578, + "learning_rate": 9.704447368421054e-05, + "loss": 0.4008, + "step": 31245 + }, + { + "epoch": 1.7496920147832904, + "grad_norm": 1.310982584953308, + "learning_rate": 9.70442105263158e-05, + "loss": 0.4587, + "step": 31246 + }, + { + "epoch": 1.7497480120954194, + "grad_norm": 1.4259971380233765, + "learning_rate": 9.704394736842106e-05, + "loss": 0.4796, + "step": 31247 + }, + { + "epoch": 1.7498040094075484, + "grad_norm": 1.1854883432388306, + "learning_rate": 9.704368421052632e-05, + "loss": 0.5095, + "step": 31248 + }, + { + "epoch": 1.7498600067196775, + "grad_norm": 1.2379854917526245, + "learning_rate": 9.704342105263159e-05, + "loss": 0.4267, + "step": 31249 + }, + { + "epoch": 1.7499160040318065, + "grad_norm": 1.5443087816238403, + "learning_rate": 9.704315789473685e-05, + "loss": 0.6044, + "step": 31250 + }, + { + 
"epoch": 1.7499720013439355, + "grad_norm": 1.7336145639419556, + "learning_rate": 9.704289473684211e-05, + "loss": 0.5171, + "step": 31251 + }, + { + "epoch": 1.7500279986560645, + "grad_norm": 1.352027177810669, + "learning_rate": 9.704263157894737e-05, + "loss": 0.5536, + "step": 31252 + }, + { + "epoch": 1.7500839959681935, + "grad_norm": 1.3359984159469604, + "learning_rate": 9.704236842105263e-05, + "loss": 0.4543, + "step": 31253 + }, + { + "epoch": 1.7501399932803225, + "grad_norm": 1.0503405332565308, + "learning_rate": 9.70421052631579e-05, + "loss": 0.3971, + "step": 31254 + }, + { + "epoch": 1.7501959905924516, + "grad_norm": 1.1917916536331177, + "learning_rate": 9.704184210526316e-05, + "loss": 0.3951, + "step": 31255 + }, + { + "epoch": 1.7502519879045806, + "grad_norm": 1.2114452123641968, + "learning_rate": 9.704157894736843e-05, + "loss": 0.4025, + "step": 31256 + }, + { + "epoch": 1.7503079852167096, + "grad_norm": 1.156714677810669, + "learning_rate": 9.704131578947368e-05, + "loss": 0.4568, + "step": 31257 + }, + { + "epoch": 1.7503639825288386, + "grad_norm": 1.41642165184021, + "learning_rate": 9.704105263157895e-05, + "loss": 0.5325, + "step": 31258 + }, + { + "epoch": 1.7504199798409676, + "grad_norm": 1.2985272407531738, + "learning_rate": 9.704078947368421e-05, + "loss": 0.4317, + "step": 31259 + }, + { + "epoch": 1.7504759771530967, + "grad_norm": 1.1710045337677002, + "learning_rate": 9.704052631578949e-05, + "loss": 0.4385, + "step": 31260 + }, + { + "epoch": 1.7505319744652257, + "grad_norm": 1.4962447881698608, + "learning_rate": 9.704026315789475e-05, + "loss": 0.4705, + "step": 31261 + }, + { + "epoch": 1.7505879717773547, + "grad_norm": 1.2892462015151978, + "learning_rate": 9.704e-05, + "loss": 0.4027, + "step": 31262 + }, + { + "epoch": 1.7506439690894837, + "grad_norm": 1.217913031578064, + "learning_rate": 9.703973684210527e-05, + "loss": 0.5367, + "step": 31263 + }, + { + "epoch": 1.7506999664016127, + "grad_norm": 
1.1747641563415527, + "learning_rate": 9.703947368421054e-05, + "loss": 0.4703, + "step": 31264 + }, + { + "epoch": 1.7507559637137418, + "grad_norm": 1.229805588722229, + "learning_rate": 9.70392105263158e-05, + "loss": 0.3878, + "step": 31265 + }, + { + "epoch": 1.7508119610258708, + "grad_norm": 1.64512038230896, + "learning_rate": 9.703894736842106e-05, + "loss": 0.5059, + "step": 31266 + }, + { + "epoch": 1.7508679583379998, + "grad_norm": 1.3795711994171143, + "learning_rate": 9.703868421052632e-05, + "loss": 0.59, + "step": 31267 + }, + { + "epoch": 1.7509239556501288, + "grad_norm": 1.3502846956253052, + "learning_rate": 9.703842105263158e-05, + "loss": 0.5191, + "step": 31268 + }, + { + "epoch": 1.7509799529622578, + "grad_norm": 1.1269354820251465, + "learning_rate": 9.703815789473685e-05, + "loss": 0.4288, + "step": 31269 + }, + { + "epoch": 1.7510359502743869, + "grad_norm": 1.2391060590744019, + "learning_rate": 9.703789473684211e-05, + "loss": 0.3725, + "step": 31270 + }, + { + "epoch": 1.7510919475865159, + "grad_norm": 1.2322804927825928, + "learning_rate": 9.703763157894737e-05, + "loss": 0.5085, + "step": 31271 + }, + { + "epoch": 1.751147944898645, + "grad_norm": 1.2614835500717163, + "learning_rate": 9.703736842105263e-05, + "loss": 0.4143, + "step": 31272 + }, + { + "epoch": 1.751203942210774, + "grad_norm": 1.2989394664764404, + "learning_rate": 9.70371052631579e-05, + "loss": 0.5407, + "step": 31273 + }, + { + "epoch": 1.751259939522903, + "grad_norm": 1.300110101699829, + "learning_rate": 9.703684210526316e-05, + "loss": 0.5379, + "step": 31274 + }, + { + "epoch": 1.751315936835032, + "grad_norm": 1.4079710245132446, + "learning_rate": 9.703657894736842e-05, + "loss": 0.4106, + "step": 31275 + }, + { + "epoch": 1.751371934147161, + "grad_norm": 1.0841021537780762, + "learning_rate": 9.703631578947368e-05, + "loss": 0.3562, + "step": 31276 + }, + { + "epoch": 1.75142793145929, + "grad_norm": 41.36603927612305, + "learning_rate": 
9.703605263157896e-05, + "loss": 0.4181, + "step": 31277 + }, + { + "epoch": 1.751483928771419, + "grad_norm": 1.1688511371612549, + "learning_rate": 9.703578947368422e-05, + "loss": 0.3864, + "step": 31278 + }, + { + "epoch": 1.751539926083548, + "grad_norm": 1.1154115200042725, + "learning_rate": 9.703552631578948e-05, + "loss": 0.3512, + "step": 31279 + }, + { + "epoch": 1.751595923395677, + "grad_norm": 1.5570732355117798, + "learning_rate": 9.703526315789474e-05, + "loss": 0.5818, + "step": 31280 + }, + { + "epoch": 1.751651920707806, + "grad_norm": 1.2714945077896118, + "learning_rate": 9.703500000000001e-05, + "loss": 0.3954, + "step": 31281 + }, + { + "epoch": 1.751707918019935, + "grad_norm": 1.1105530261993408, + "learning_rate": 9.703473684210527e-05, + "loss": 0.4644, + "step": 31282 + }, + { + "epoch": 1.751763915332064, + "grad_norm": 1.3542370796203613, + "learning_rate": 9.703447368421054e-05, + "loss": 0.4844, + "step": 31283 + }, + { + "epoch": 1.7518199126441931, + "grad_norm": 1.2920011281967163, + "learning_rate": 9.703421052631579e-05, + "loss": 0.3696, + "step": 31284 + }, + { + "epoch": 1.7518759099563221, + "grad_norm": 1.2199699878692627, + "learning_rate": 9.703394736842105e-05, + "loss": 0.3776, + "step": 31285 + }, + { + "epoch": 1.7519319072684512, + "grad_norm": 1.2068777084350586, + "learning_rate": 9.703368421052632e-05, + "loss": 0.4286, + "step": 31286 + }, + { + "epoch": 1.7519879045805802, + "grad_norm": 1.2400809526443481, + "learning_rate": 9.703342105263158e-05, + "loss": 0.4208, + "step": 31287 + }, + { + "epoch": 1.7520439018927092, + "grad_norm": 1.454540491104126, + "learning_rate": 9.703315789473685e-05, + "loss": 0.5424, + "step": 31288 + }, + { + "epoch": 1.7520998992048382, + "grad_norm": 1.419374942779541, + "learning_rate": 9.70328947368421e-05, + "loss": 0.4436, + "step": 31289 + }, + { + "epoch": 1.7521558965169672, + "grad_norm": 1.0190922021865845, + "learning_rate": 9.703263157894737e-05, + "loss": 0.4034, + 
"step": 31290 + }, + { + "epoch": 1.7522118938290963, + "grad_norm": 1.2758339643478394, + "learning_rate": 9.703236842105263e-05, + "loss": 0.5161, + "step": 31291 + }, + { + "epoch": 1.7522678911412253, + "grad_norm": 1.1510674953460693, + "learning_rate": 9.70321052631579e-05, + "loss": 0.4194, + "step": 31292 + }, + { + "epoch": 1.752323888453354, + "grad_norm": 1.3816848993301392, + "learning_rate": 9.703184210526317e-05, + "loss": 0.5782, + "step": 31293 + }, + { + "epoch": 1.752379885765483, + "grad_norm": 1.1880531311035156, + "learning_rate": 9.703157894736843e-05, + "loss": 0.4683, + "step": 31294 + }, + { + "epoch": 1.752435883077612, + "grad_norm": 1.1447632312774658, + "learning_rate": 9.703131578947369e-05, + "loss": 0.5645, + "step": 31295 + }, + { + "epoch": 1.7524918803897411, + "grad_norm": 1.6359468698501587, + "learning_rate": 9.703105263157896e-05, + "loss": 0.6439, + "step": 31296 + }, + { + "epoch": 1.7525478777018701, + "grad_norm": 1.5488758087158203, + "learning_rate": 9.703078947368422e-05, + "loss": 0.4205, + "step": 31297 + }, + { + "epoch": 1.7526038750139992, + "grad_norm": 1.2353792190551758, + "learning_rate": 9.703052631578948e-05, + "loss": 0.3828, + "step": 31298 + }, + { + "epoch": 1.7526598723261282, + "grad_norm": 1.3690663576126099, + "learning_rate": 9.703026315789474e-05, + "loss": 0.4517, + "step": 31299 + }, + { + "epoch": 1.7527158696382572, + "grad_norm": 5.144920825958252, + "learning_rate": 9.703000000000001e-05, + "loss": 0.5015, + "step": 31300 + }, + { + "epoch": 1.7527718669503862, + "grad_norm": 1.2079733610153198, + "learning_rate": 9.702973684210527e-05, + "loss": 0.5603, + "step": 31301 + }, + { + "epoch": 1.7528278642625152, + "grad_norm": 1.635956048965454, + "learning_rate": 9.702947368421053e-05, + "loss": 0.438, + "step": 31302 + }, + { + "epoch": 1.7528838615746443, + "grad_norm": 1.3473219871520996, + "learning_rate": 9.702921052631579e-05, + "loss": 0.4354, + "step": 31303 + }, + { + "epoch": 
1.7529398588867733, + "grad_norm": 1.3026502132415771, + "learning_rate": 9.702894736842105e-05, + "loss": 0.3847, + "step": 31304 + }, + { + "epoch": 1.7529958561989023, + "grad_norm": 1.169182300567627, + "learning_rate": 9.702868421052632e-05, + "loss": 0.4121, + "step": 31305 + }, + { + "epoch": 1.7530518535110313, + "grad_norm": 0.9932965040206909, + "learning_rate": 9.702842105263158e-05, + "loss": 0.3512, + "step": 31306 + }, + { + "epoch": 1.7531078508231603, + "grad_norm": 1.2884613275527954, + "learning_rate": 9.702815789473684e-05, + "loss": 0.373, + "step": 31307 + }, + { + "epoch": 1.7531638481352894, + "grad_norm": 1.0677106380462646, + "learning_rate": 9.70278947368421e-05, + "loss": 0.3906, + "step": 31308 + }, + { + "epoch": 1.7532198454474184, + "grad_norm": 1.265514612197876, + "learning_rate": 9.702763157894738e-05, + "loss": 0.6163, + "step": 31309 + }, + { + "epoch": 1.7532758427595474, + "grad_norm": 1.2068476676940918, + "learning_rate": 9.702736842105264e-05, + "loss": 0.3683, + "step": 31310 + }, + { + "epoch": 1.7533318400716764, + "grad_norm": 1.3675965070724487, + "learning_rate": 9.702710526315791e-05, + "loss": 0.4875, + "step": 31311 + }, + { + "epoch": 1.7533878373838054, + "grad_norm": 1.547469139099121, + "learning_rate": 9.702684210526316e-05, + "loss": 0.4268, + "step": 31312 + }, + { + "epoch": 1.7534438346959345, + "grad_norm": 1.2892094850540161, + "learning_rate": 9.702657894736843e-05, + "loss": 0.6642, + "step": 31313 + }, + { + "epoch": 1.7534998320080635, + "grad_norm": 1.1886186599731445, + "learning_rate": 9.702631578947369e-05, + "loss": 0.4739, + "step": 31314 + }, + { + "epoch": 1.7535558293201925, + "grad_norm": 1.0835310220718384, + "learning_rate": 9.702605263157896e-05, + "loss": 0.4139, + "step": 31315 + }, + { + "epoch": 1.7536118266323215, + "grad_norm": 1.205816388130188, + "learning_rate": 9.702578947368422e-05, + "loss": 0.4162, + "step": 31316 + }, + { + "epoch": 1.7536678239444505, + "grad_norm": 
1.6776663064956665, + "learning_rate": 9.702552631578948e-05, + "loss": 0.4073, + "step": 31317 + }, + { + "epoch": 1.7537238212565796, + "grad_norm": 4.810730934143066, + "learning_rate": 9.702526315789474e-05, + "loss": 0.4364, + "step": 31318 + }, + { + "epoch": 1.7537798185687086, + "grad_norm": 1.4386926889419556, + "learning_rate": 9.7025e-05, + "loss": 0.4643, + "step": 31319 + }, + { + "epoch": 1.7538358158808376, + "grad_norm": 1.1380045413970947, + "learning_rate": 9.702473684210527e-05, + "loss": 0.4737, + "step": 31320 + }, + { + "epoch": 1.7538918131929666, + "grad_norm": 1.463484764099121, + "learning_rate": 9.702447368421052e-05, + "loss": 0.3943, + "step": 31321 + }, + { + "epoch": 1.7539478105050956, + "grad_norm": 1.4343619346618652, + "learning_rate": 9.702421052631579e-05, + "loss": 0.3837, + "step": 31322 + }, + { + "epoch": 1.7540038078172246, + "grad_norm": 1.1751424074172974, + "learning_rate": 9.702394736842105e-05, + "loss": 0.397, + "step": 31323 + }, + { + "epoch": 1.7540598051293537, + "grad_norm": 1.3465496301651, + "learning_rate": 9.702368421052633e-05, + "loss": 0.5806, + "step": 31324 + }, + { + "epoch": 1.7541158024414827, + "grad_norm": 1.3034881353378296, + "learning_rate": 9.702342105263159e-05, + "loss": 0.4874, + "step": 31325 + }, + { + "epoch": 1.7541717997536117, + "grad_norm": 1.4874991178512573, + "learning_rate": 9.702315789473685e-05, + "loss": 0.492, + "step": 31326 + }, + { + "epoch": 1.7542277970657407, + "grad_norm": 1.558430552482605, + "learning_rate": 9.70228947368421e-05, + "loss": 0.5006, + "step": 31327 + }, + { + "epoch": 1.7542837943778697, + "grad_norm": 1.315670132637024, + "learning_rate": 9.702263157894738e-05, + "loss": 0.5163, + "step": 31328 + }, + { + "epoch": 1.7543397916899988, + "grad_norm": 1.3746200799942017, + "learning_rate": 9.702236842105264e-05, + "loss": 0.4011, + "step": 31329 + }, + { + "epoch": 1.7543957890021278, + "grad_norm": 1.3804941177368164, + "learning_rate": 
9.70221052631579e-05, + "loss": 0.5946, + "step": 31330 + }, + { + "epoch": 1.7544517863142568, + "grad_norm": 1.374734878540039, + "learning_rate": 9.702184210526316e-05, + "loss": 0.3677, + "step": 31331 + }, + { + "epoch": 1.7545077836263858, + "grad_norm": 1.3483905792236328, + "learning_rate": 9.702157894736843e-05, + "loss": 0.5115, + "step": 31332 + }, + { + "epoch": 1.7545637809385148, + "grad_norm": 1.217339277267456, + "learning_rate": 9.702131578947369e-05, + "loss": 0.4084, + "step": 31333 + }, + { + "epoch": 1.7546197782506439, + "grad_norm": 1.4455842971801758, + "learning_rate": 9.702105263157895e-05, + "loss": 0.5797, + "step": 31334 + }, + { + "epoch": 1.7546757755627729, + "grad_norm": 1.247218370437622, + "learning_rate": 9.702078947368421e-05, + "loss": 0.3732, + "step": 31335 + }, + { + "epoch": 1.754731772874902, + "grad_norm": 1.3854154348373413, + "learning_rate": 9.702052631578947e-05, + "loss": 0.5129, + "step": 31336 + }, + { + "epoch": 1.754787770187031, + "grad_norm": 1.4090889692306519, + "learning_rate": 9.702026315789474e-05, + "loss": 0.5575, + "step": 31337 + }, + { + "epoch": 1.75484376749916, + "grad_norm": 1.278238296508789, + "learning_rate": 9.702e-05, + "loss": 0.5516, + "step": 31338 + }, + { + "epoch": 1.754899764811289, + "grad_norm": 1.2656220197677612, + "learning_rate": 9.701973684210526e-05, + "loss": 0.4156, + "step": 31339 + }, + { + "epoch": 1.754955762123418, + "grad_norm": 1.3355220556259155, + "learning_rate": 9.701947368421052e-05, + "loss": 0.4433, + "step": 31340 + }, + { + "epoch": 1.755011759435547, + "grad_norm": 1.4254103899002075, + "learning_rate": 9.70192105263158e-05, + "loss": 0.4262, + "step": 31341 + }, + { + "epoch": 1.755067756747676, + "grad_norm": 1.6784453392028809, + "learning_rate": 9.701894736842106e-05, + "loss": 0.5253, + "step": 31342 + }, + { + "epoch": 1.755123754059805, + "grad_norm": 1.319821834564209, + "learning_rate": 9.701868421052633e-05, + "loss": 0.592, + "step": 31343 + }, + { 
+ "epoch": 1.755179751371934, + "grad_norm": 1.1004695892333984, + "learning_rate": 9.701842105263157e-05, + "loss": 0.3536, + "step": 31344 + }, + { + "epoch": 1.755235748684063, + "grad_norm": 1.2576149702072144, + "learning_rate": 9.701815789473685e-05, + "loss": 0.4572, + "step": 31345 + }, + { + "epoch": 1.755291745996192, + "grad_norm": 1.5715019702911377, + "learning_rate": 9.701789473684211e-05, + "loss": 0.4802, + "step": 31346 + }, + { + "epoch": 1.755347743308321, + "grad_norm": 1.1008951663970947, + "learning_rate": 9.701763157894738e-05, + "loss": 0.4025, + "step": 31347 + }, + { + "epoch": 1.7554037406204501, + "grad_norm": 1.357969045639038, + "learning_rate": 9.701736842105264e-05, + "loss": 0.386, + "step": 31348 + }, + { + "epoch": 1.7554597379325791, + "grad_norm": 1.2820796966552734, + "learning_rate": 9.70171052631579e-05, + "loss": 0.4113, + "step": 31349 + }, + { + "epoch": 1.7555157352447082, + "grad_norm": 1.2205705642700195, + "learning_rate": 9.701684210526316e-05, + "loss": 0.4281, + "step": 31350 + }, + { + "epoch": 1.7555717325568372, + "grad_norm": 1.1640247106552124, + "learning_rate": 9.701657894736843e-05, + "loss": 0.4124, + "step": 31351 + }, + { + "epoch": 1.7556277298689662, + "grad_norm": 1.018678903579712, + "learning_rate": 9.70163157894737e-05, + "loss": 0.3908, + "step": 31352 + }, + { + "epoch": 1.7556837271810952, + "grad_norm": 1.4704970121383667, + "learning_rate": 9.701605263157895e-05, + "loss": 0.4808, + "step": 31353 + }, + { + "epoch": 1.7557397244932242, + "grad_norm": 1.1616512537002563, + "learning_rate": 9.701578947368421e-05, + "loss": 0.3945, + "step": 31354 + }, + { + "epoch": 1.7557957218053533, + "grad_norm": 1.2916069030761719, + "learning_rate": 9.701552631578947e-05, + "loss": 0.4441, + "step": 31355 + }, + { + "epoch": 1.7558517191174823, + "grad_norm": 1.2617357969284058, + "learning_rate": 9.701526315789475e-05, + "loss": 0.4459, + "step": 31356 + }, + { + "epoch": 1.7559077164296113, + "grad_norm": 
1.7488200664520264, + "learning_rate": 9.7015e-05, + "loss": 0.5532, + "step": 31357 + }, + { + "epoch": 1.7559637137417403, + "grad_norm": 1.1443363428115845, + "learning_rate": 9.701473684210527e-05, + "loss": 0.3739, + "step": 31358 + }, + { + "epoch": 1.7560197110538693, + "grad_norm": 1.3983464241027832, + "learning_rate": 9.701447368421052e-05, + "loss": 0.4532, + "step": 31359 + }, + { + "epoch": 1.7560757083659984, + "grad_norm": 1.4170891046524048, + "learning_rate": 9.70142105263158e-05, + "loss": 0.4413, + "step": 31360 + }, + { + "epoch": 1.7561317056781274, + "grad_norm": 1.5757806301116943, + "learning_rate": 9.701394736842106e-05, + "loss": 0.5593, + "step": 31361 + }, + { + "epoch": 1.7561877029902564, + "grad_norm": 1.6156803369522095, + "learning_rate": 9.701368421052632e-05, + "loss": 0.456, + "step": 31362 + }, + { + "epoch": 1.7562437003023854, + "grad_norm": 1.2452237606048584, + "learning_rate": 9.701342105263158e-05, + "loss": 0.3817, + "step": 31363 + }, + { + "epoch": 1.7562996976145144, + "grad_norm": 1.4902656078338623, + "learning_rate": 9.701315789473685e-05, + "loss": 0.5332, + "step": 31364 + }, + { + "epoch": 1.7563556949266435, + "grad_norm": 1.3083916902542114, + "learning_rate": 9.701289473684211e-05, + "loss": 0.5033, + "step": 31365 + }, + { + "epoch": 1.7564116922387725, + "grad_norm": 1.261059284210205, + "learning_rate": 9.701263157894738e-05, + "loss": 0.5927, + "step": 31366 + }, + { + "epoch": 1.7564676895509015, + "grad_norm": 1.3278404474258423, + "learning_rate": 9.701236842105263e-05, + "loss": 0.3786, + "step": 31367 + }, + { + "epoch": 1.7565236868630305, + "grad_norm": 1.5977632999420166, + "learning_rate": 9.70121052631579e-05, + "loss": 0.4335, + "step": 31368 + }, + { + "epoch": 1.7565796841751595, + "grad_norm": 1.1443051099777222, + "learning_rate": 9.701184210526316e-05, + "loss": 0.4152, + "step": 31369 + }, + { + "epoch": 1.7566356814872885, + "grad_norm": 1.3411881923675537, + "learning_rate": 
9.701157894736844e-05, + "loss": 0.443, + "step": 31370 + }, + { + "epoch": 1.7566916787994176, + "grad_norm": 1.2317705154418945, + "learning_rate": 9.701131578947368e-05, + "loss": 0.4129, + "step": 31371 + }, + { + "epoch": 1.7567476761115466, + "grad_norm": 1.2970744371414185, + "learning_rate": 9.701105263157894e-05, + "loss": 0.5434, + "step": 31372 + }, + { + "epoch": 1.7568036734236756, + "grad_norm": 1.1788445711135864, + "learning_rate": 9.701078947368422e-05, + "loss": 0.3618, + "step": 31373 + }, + { + "epoch": 1.7568596707358046, + "grad_norm": 1.4097920656204224, + "learning_rate": 9.701052631578948e-05, + "loss": 0.513, + "step": 31374 + }, + { + "epoch": 1.7569156680479336, + "grad_norm": 1.1962306499481201, + "learning_rate": 9.701026315789475e-05, + "loss": 0.5054, + "step": 31375 + }, + { + "epoch": 1.7569716653600627, + "grad_norm": 1.5471967458724976, + "learning_rate": 9.701e-05, + "loss": 0.4983, + "step": 31376 + }, + { + "epoch": 1.7570276626721917, + "grad_norm": 1.1592347621917725, + "learning_rate": 9.700973684210527e-05, + "loss": 0.6716, + "step": 31377 + }, + { + "epoch": 1.7570836599843207, + "grad_norm": 1.21383535861969, + "learning_rate": 9.700947368421053e-05, + "loss": 0.4102, + "step": 31378 + }, + { + "epoch": 1.7571396572964497, + "grad_norm": 1.246001958847046, + "learning_rate": 9.70092105263158e-05, + "loss": 0.4585, + "step": 31379 + }, + { + "epoch": 1.7571956546085787, + "grad_norm": 1.214787244796753, + "learning_rate": 9.700894736842106e-05, + "loss": 0.3326, + "step": 31380 + }, + { + "epoch": 1.7572516519207078, + "grad_norm": 1.2083290815353394, + "learning_rate": 9.700868421052632e-05, + "loss": 0.3861, + "step": 31381 + }, + { + "epoch": 1.7573076492328368, + "grad_norm": 1.724625825881958, + "learning_rate": 9.700842105263158e-05, + "loss": 0.7243, + "step": 31382 + }, + { + "epoch": 1.7573636465449658, + "grad_norm": 1.2969224452972412, + "learning_rate": 9.700815789473685e-05, + "loss": 0.561, + "step": 31383 
+ }, + { + "epoch": 1.7574196438570948, + "grad_norm": 1.1480454206466675, + "learning_rate": 9.700789473684211e-05, + "loss": 0.5084, + "step": 31384 + }, + { + "epoch": 1.7574756411692238, + "grad_norm": 1.3230129480361938, + "learning_rate": 9.700763157894737e-05, + "loss": 0.5435, + "step": 31385 + }, + { + "epoch": 1.7575316384813529, + "grad_norm": 1.4467577934265137, + "learning_rate": 9.700736842105263e-05, + "loss": 0.5748, + "step": 31386 + }, + { + "epoch": 1.7575876357934819, + "grad_norm": 1.3462480306625366, + "learning_rate": 9.70071052631579e-05, + "loss": 0.429, + "step": 31387 + }, + { + "epoch": 1.757643633105611, + "grad_norm": 1.230965495109558, + "learning_rate": 9.700684210526317e-05, + "loss": 0.5786, + "step": 31388 + }, + { + "epoch": 1.75769963041774, + "grad_norm": 1.4086639881134033, + "learning_rate": 9.700657894736843e-05, + "loss": 0.4782, + "step": 31389 + }, + { + "epoch": 1.757755627729869, + "grad_norm": 1.478774905204773, + "learning_rate": 9.700631578947368e-05, + "loss": 0.3529, + "step": 31390 + }, + { + "epoch": 1.757811625041998, + "grad_norm": 1.061188817024231, + "learning_rate": 9.700605263157894e-05, + "loss": 0.3758, + "step": 31391 + }, + { + "epoch": 1.757867622354127, + "grad_norm": 1.043335199356079, + "learning_rate": 9.700578947368422e-05, + "loss": 0.435, + "step": 31392 + }, + { + "epoch": 1.757923619666256, + "grad_norm": 1.258521318435669, + "learning_rate": 9.700552631578948e-05, + "loss": 0.3658, + "step": 31393 + }, + { + "epoch": 1.757979616978385, + "grad_norm": 4.0094757080078125, + "learning_rate": 9.700526315789474e-05, + "loss": 0.3804, + "step": 31394 + }, + { + "epoch": 1.758035614290514, + "grad_norm": 1.100940227508545, + "learning_rate": 9.7005e-05, + "loss": 0.3954, + "step": 31395 + }, + { + "epoch": 1.758091611602643, + "grad_norm": 1.1431269645690918, + "learning_rate": 9.700473684210527e-05, + "loss": 0.3382, + "step": 31396 + }, + { + "epoch": 1.758147608914772, + "grad_norm": 
1.1409684419631958, + "learning_rate": 9.700447368421053e-05, + "loss": 0.5586, + "step": 31397 + }, + { + "epoch": 1.758203606226901, + "grad_norm": 1.3768917322158813, + "learning_rate": 9.70042105263158e-05, + "loss": 0.4479, + "step": 31398 + }, + { + "epoch": 1.75825960353903, + "grad_norm": 1.5339504480361938, + "learning_rate": 9.700394736842105e-05, + "loss": 0.4439, + "step": 31399 + }, + { + "epoch": 1.7583156008511591, + "grad_norm": 1.4344311952590942, + "learning_rate": 9.700368421052632e-05, + "loss": 0.4942, + "step": 31400 + }, + { + "epoch": 1.7583715981632881, + "grad_norm": 1.1855796575546265, + "learning_rate": 9.700342105263158e-05, + "loss": 0.4133, + "step": 31401 + }, + { + "epoch": 1.7584275954754172, + "grad_norm": 1.260161280632019, + "learning_rate": 9.700315789473686e-05, + "loss": 0.3815, + "step": 31402 + }, + { + "epoch": 1.7584835927875462, + "grad_norm": 1.1264280080795288, + "learning_rate": 9.700289473684212e-05, + "loss": 0.5768, + "step": 31403 + }, + { + "epoch": 1.7585395900996752, + "grad_norm": 1.344635248184204, + "learning_rate": 9.700263157894738e-05, + "loss": 0.5187, + "step": 31404 + }, + { + "epoch": 1.7585955874118042, + "grad_norm": 1.3495181798934937, + "learning_rate": 9.700236842105263e-05, + "loss": 0.5547, + "step": 31405 + }, + { + "epoch": 1.7586515847239332, + "grad_norm": 1.0829442739486694, + "learning_rate": 9.70021052631579e-05, + "loss": 0.3704, + "step": 31406 + }, + { + "epoch": 1.7587075820360623, + "grad_norm": 1.1291688680648804, + "learning_rate": 9.700184210526317e-05, + "loss": 0.3036, + "step": 31407 + }, + { + "epoch": 1.7587635793481913, + "grad_norm": 1.5994068384170532, + "learning_rate": 9.700157894736843e-05, + "loss": 0.4838, + "step": 31408 + }, + { + "epoch": 1.7588195766603203, + "grad_norm": 1.3823555707931519, + "learning_rate": 9.700131578947369e-05, + "loss": 0.3572, + "step": 31409 + }, + { + "epoch": 1.7588755739724493, + "grad_norm": 1.4652913808822632, + "learning_rate": 
9.700105263157895e-05, + "loss": 0.7145, + "step": 31410 + }, + { + "epoch": 1.7589315712845783, + "grad_norm": 1.3560162782669067, + "learning_rate": 9.700078947368422e-05, + "loss": 0.4733, + "step": 31411 + }, + { + "epoch": 1.7589875685967074, + "grad_norm": 1.3950577974319458, + "learning_rate": 9.700052631578948e-05, + "loss": 0.5525, + "step": 31412 + }, + { + "epoch": 1.7590435659088364, + "grad_norm": 1.1825885772705078, + "learning_rate": 9.700026315789474e-05, + "loss": 0.3966, + "step": 31413 + }, + { + "epoch": 1.7590995632209654, + "grad_norm": 1.2298526763916016, + "learning_rate": 9.7e-05, + "loss": 0.4035, + "step": 31414 + }, + { + "epoch": 1.7591555605330944, + "grad_norm": 1.301370620727539, + "learning_rate": 9.699973684210527e-05, + "loss": 0.5156, + "step": 31415 + }, + { + "epoch": 1.7592115578452234, + "grad_norm": 1.4733564853668213, + "learning_rate": 9.699947368421053e-05, + "loss": 0.4461, + "step": 31416 + }, + { + "epoch": 1.7592675551573524, + "grad_norm": 1.7607483863830566, + "learning_rate": 9.699921052631579e-05, + "loss": 0.5734, + "step": 31417 + }, + { + "epoch": 1.7593235524694815, + "grad_norm": 1.5247420072555542, + "learning_rate": 9.699894736842105e-05, + "loss": 0.4377, + "step": 31418 + }, + { + "epoch": 1.7593795497816105, + "grad_norm": 1.4009897708892822, + "learning_rate": 9.699868421052633e-05, + "loss": 0.4704, + "step": 31419 + }, + { + "epoch": 1.7594355470937395, + "grad_norm": 1.2151381969451904, + "learning_rate": 9.699842105263159e-05, + "loss": 0.4475, + "step": 31420 + }, + { + "epoch": 1.7594915444058685, + "grad_norm": 1.4483447074890137, + "learning_rate": 9.699815789473686e-05, + "loss": 0.463, + "step": 31421 + }, + { + "epoch": 1.7595475417179975, + "grad_norm": 1.5088638067245483, + "learning_rate": 9.69978947368421e-05, + "loss": 0.4674, + "step": 31422 + }, + { + "epoch": 1.7596035390301266, + "grad_norm": 1.3373827934265137, + "learning_rate": 9.699763157894736e-05, + "loss": 0.4543, + "step": 
31423 + }, + { + "epoch": 1.7596595363422556, + "grad_norm": 1.1299997568130493, + "learning_rate": 9.699736842105264e-05, + "loss": 0.4847, + "step": 31424 + }, + { + "epoch": 1.7597155336543846, + "grad_norm": 1.358533501625061, + "learning_rate": 9.69971052631579e-05, + "loss": 0.3672, + "step": 31425 + }, + { + "epoch": 1.7597715309665136, + "grad_norm": 1.4248813390731812, + "learning_rate": 9.699684210526316e-05, + "loss": 0.3376, + "step": 31426 + }, + { + "epoch": 1.7598275282786426, + "grad_norm": 1.6210975646972656, + "learning_rate": 9.699657894736842e-05, + "loss": 0.602, + "step": 31427 + }, + { + "epoch": 1.7598835255907717, + "grad_norm": 1.2547893524169922, + "learning_rate": 9.699631578947369e-05, + "loss": 0.4421, + "step": 31428 + }, + { + "epoch": 1.7599395229029007, + "grad_norm": 1.2690328359603882, + "learning_rate": 9.699605263157895e-05, + "loss": 0.432, + "step": 31429 + }, + { + "epoch": 1.7599955202150297, + "grad_norm": 1.2691785097122192, + "learning_rate": 9.699578947368422e-05, + "loss": 0.4426, + "step": 31430 + }, + { + "epoch": 1.7600515175271587, + "grad_norm": 1.4508098363876343, + "learning_rate": 9.699552631578947e-05, + "loss": 0.5126, + "step": 31431 + }, + { + "epoch": 1.7601075148392877, + "grad_norm": 1.277583360671997, + "learning_rate": 9.699526315789474e-05, + "loss": 0.4527, + "step": 31432 + }, + { + "epoch": 1.7601635121514168, + "grad_norm": 1.5339820384979248, + "learning_rate": 9.6995e-05, + "loss": 0.459, + "step": 31433 + }, + { + "epoch": 1.7602195094635458, + "grad_norm": 1.3452469110488892, + "learning_rate": 9.699473684210528e-05, + "loss": 0.5097, + "step": 31434 + }, + { + "epoch": 1.7602755067756748, + "grad_norm": 1.2519586086273193, + "learning_rate": 9.699447368421054e-05, + "loss": 0.4339, + "step": 31435 + }, + { + "epoch": 1.7603315040878038, + "grad_norm": 1.7836685180664062, + "learning_rate": 9.69942105263158e-05, + "loss": 0.4599, + "step": 31436 + }, + { + "epoch": 1.7603875013999328, + 
"grad_norm": 1.384204387664795, + "learning_rate": 9.699394736842105e-05, + "loss": 0.5123, + "step": 31437 + }, + { + "epoch": 1.7604434987120618, + "grad_norm": 1.1255943775177002, + "learning_rate": 9.699368421052633e-05, + "loss": 0.4688, + "step": 31438 + }, + { + "epoch": 1.7604994960241909, + "grad_norm": 1.056462287902832, + "learning_rate": 9.699342105263159e-05, + "loss": 0.333, + "step": 31439 + }, + { + "epoch": 1.7605554933363199, + "grad_norm": 1.203423261642456, + "learning_rate": 9.699315789473685e-05, + "loss": 0.4784, + "step": 31440 + }, + { + "epoch": 1.760611490648449, + "grad_norm": 5.010548114776611, + "learning_rate": 9.699289473684211e-05, + "loss": 0.5922, + "step": 31441 + }, + { + "epoch": 1.760667487960578, + "grad_norm": 1.019890546798706, + "learning_rate": 9.699263157894737e-05, + "loss": 0.4258, + "step": 31442 + }, + { + "epoch": 1.760723485272707, + "grad_norm": 1.202873945236206, + "learning_rate": 9.699236842105264e-05, + "loss": 0.539, + "step": 31443 + }, + { + "epoch": 1.760779482584836, + "grad_norm": 1.1808617115020752, + "learning_rate": 9.69921052631579e-05, + "loss": 0.5889, + "step": 31444 + }, + { + "epoch": 1.760835479896965, + "grad_norm": 1.1226969957351685, + "learning_rate": 9.699184210526316e-05, + "loss": 0.374, + "step": 31445 + }, + { + "epoch": 1.760891477209094, + "grad_norm": 1.376338243484497, + "learning_rate": 9.699157894736842e-05, + "loss": 0.3882, + "step": 31446 + }, + { + "epoch": 1.760947474521223, + "grad_norm": 1.1124764680862427, + "learning_rate": 9.699131578947369e-05, + "loss": 0.4151, + "step": 31447 + }, + { + "epoch": 1.761003471833352, + "grad_norm": 1.4831949472427368, + "learning_rate": 9.699105263157895e-05, + "loss": 0.4305, + "step": 31448 + }, + { + "epoch": 1.761059469145481, + "grad_norm": 1.3852840662002563, + "learning_rate": 9.699078947368421e-05, + "loss": 0.3966, + "step": 31449 + }, + { + "epoch": 1.76111546645761, + "grad_norm": 1.6167633533477783, + "learning_rate": 
9.699052631578947e-05, + "loss": 0.411, + "step": 31450 + }, + { + "epoch": 1.761171463769739, + "grad_norm": 1.3998875617980957, + "learning_rate": 9.699026315789475e-05, + "loss": 0.5188, + "step": 31451 + }, + { + "epoch": 1.7612274610818681, + "grad_norm": 1.45525062084198, + "learning_rate": 9.699e-05, + "loss": 0.5463, + "step": 31452 + }, + { + "epoch": 1.7612834583939971, + "grad_norm": 0.9640327095985413, + "learning_rate": 9.698973684210528e-05, + "loss": 0.3042, + "step": 31453 + }, + { + "epoch": 1.7613394557061262, + "grad_norm": 1.1937028169631958, + "learning_rate": 9.698947368421052e-05, + "loss": 0.4544, + "step": 31454 + }, + { + "epoch": 1.7613954530182552, + "grad_norm": 1.2516857385635376, + "learning_rate": 9.69892105263158e-05, + "loss": 0.4241, + "step": 31455 + }, + { + "epoch": 1.7614514503303842, + "grad_norm": 1.1950222253799438, + "learning_rate": 9.698894736842106e-05, + "loss": 0.4083, + "step": 31456 + }, + { + "epoch": 1.7615074476425132, + "grad_norm": 1.199872374534607, + "learning_rate": 9.698868421052632e-05, + "loss": 0.5317, + "step": 31457 + }, + { + "epoch": 1.7615634449546422, + "grad_norm": 1.4942198991775513, + "learning_rate": 9.698842105263159e-05, + "loss": 0.3739, + "step": 31458 + }, + { + "epoch": 1.7616194422667713, + "grad_norm": 2.0371768474578857, + "learning_rate": 9.698815789473684e-05, + "loss": 0.5374, + "step": 31459 + }, + { + "epoch": 1.7616754395789003, + "grad_norm": 1.3608322143554688, + "learning_rate": 9.698789473684211e-05, + "loss": 0.3797, + "step": 31460 + }, + { + "epoch": 1.7617314368910293, + "grad_norm": 1.410219430923462, + "learning_rate": 9.698763157894737e-05, + "loss": 0.3973, + "step": 31461 + }, + { + "epoch": 1.7617874342031583, + "grad_norm": 1.2892036437988281, + "learning_rate": 9.698736842105264e-05, + "loss": 0.5025, + "step": 31462 + }, + { + "epoch": 1.7618434315152873, + "grad_norm": 1.354514241218567, + "learning_rate": 9.69871052631579e-05, + "loss": 0.584, + "step": 31463 + 
}, + { + "epoch": 1.7618994288274163, + "grad_norm": 1.2871614694595337, + "learning_rate": 9.698684210526316e-05, + "loss": 0.4542, + "step": 31464 + }, + { + "epoch": 1.7619554261395454, + "grad_norm": 0.9832156300544739, + "learning_rate": 9.698657894736842e-05, + "loss": 0.3554, + "step": 31465 + }, + { + "epoch": 1.7620114234516744, + "grad_norm": 1.3597266674041748, + "learning_rate": 9.69863157894737e-05, + "loss": 0.5007, + "step": 31466 + }, + { + "epoch": 1.7620674207638034, + "grad_norm": 1.1829832792282104, + "learning_rate": 9.698605263157895e-05, + "loss": 0.4606, + "step": 31467 + }, + { + "epoch": 1.7621234180759324, + "grad_norm": 1.450077772140503, + "learning_rate": 9.698578947368421e-05, + "loss": 0.4254, + "step": 31468 + }, + { + "epoch": 1.7621794153880614, + "grad_norm": 1.1420611143112183, + "learning_rate": 9.698552631578947e-05, + "loss": 0.4703, + "step": 31469 + }, + { + "epoch": 1.7622354127001905, + "grad_norm": 1.348101019859314, + "learning_rate": 9.698526315789475e-05, + "loss": 0.514, + "step": 31470 + }, + { + "epoch": 1.7622914100123195, + "grad_norm": 1.2099101543426514, + "learning_rate": 9.698500000000001e-05, + "loss": 0.4555, + "step": 31471 + }, + { + "epoch": 1.7623474073244485, + "grad_norm": 1.2844172716140747, + "learning_rate": 9.698473684210527e-05, + "loss": 0.5856, + "step": 31472 + }, + { + "epoch": 1.7624034046365775, + "grad_norm": 1.1239879131317139, + "learning_rate": 9.698447368421053e-05, + "loss": 0.3856, + "step": 31473 + }, + { + "epoch": 1.7624594019487065, + "grad_norm": 1.1519743204116821, + "learning_rate": 9.69842105263158e-05, + "loss": 0.3621, + "step": 31474 + }, + { + "epoch": 1.7625153992608356, + "grad_norm": 1.4256031513214111, + "learning_rate": 9.698394736842106e-05, + "loss": 0.4518, + "step": 31475 + }, + { + "epoch": 1.7625713965729646, + "grad_norm": 1.134522557258606, + "learning_rate": 9.698368421052632e-05, + "loss": 0.3928, + "step": 31476 + }, + { + "epoch": 1.7626273938850936, + 
"grad_norm": 1.2252823114395142, + "learning_rate": 9.698342105263158e-05, + "loss": 0.4037, + "step": 31477 + }, + { + "epoch": 1.7626833911972226, + "grad_norm": 1.4766465425491333, + "learning_rate": 9.698315789473684e-05, + "loss": 0.6407, + "step": 31478 + }, + { + "epoch": 1.7627393885093516, + "grad_norm": 1.9454439878463745, + "learning_rate": 9.698289473684211e-05, + "loss": 0.5116, + "step": 31479 + }, + { + "epoch": 1.7627953858214807, + "grad_norm": 1.4040120840072632, + "learning_rate": 9.698263157894737e-05, + "loss": 0.5018, + "step": 31480 + }, + { + "epoch": 1.7628513831336097, + "grad_norm": 1.3570436239242554, + "learning_rate": 9.698236842105263e-05, + "loss": 0.4183, + "step": 31481 + }, + { + "epoch": 1.7629073804457387, + "grad_norm": 1.0709501504898071, + "learning_rate": 9.698210526315789e-05, + "loss": 0.4102, + "step": 31482 + }, + { + "epoch": 1.7629633777578677, + "grad_norm": 1.4571372270584106, + "learning_rate": 9.698184210526316e-05, + "loss": 0.6113, + "step": 31483 + }, + { + "epoch": 1.7630193750699967, + "grad_norm": 1.3944772481918335, + "learning_rate": 9.698157894736842e-05, + "loss": 0.4673, + "step": 31484 + }, + { + "epoch": 1.7630753723821257, + "grad_norm": 1.5022156238555908, + "learning_rate": 9.69813157894737e-05, + "loss": 0.5727, + "step": 31485 + }, + { + "epoch": 1.7631313696942548, + "grad_norm": 1.2592860460281372, + "learning_rate": 9.698105263157894e-05, + "loss": 0.4493, + "step": 31486 + }, + { + "epoch": 1.7631873670063838, + "grad_norm": 1.3246036767959595, + "learning_rate": 9.698078947368422e-05, + "loss": 0.4089, + "step": 31487 + }, + { + "epoch": 1.7632433643185128, + "grad_norm": 1.2681739330291748, + "learning_rate": 9.698052631578948e-05, + "loss": 0.4553, + "step": 31488 + }, + { + "epoch": 1.7632993616306418, + "grad_norm": 1.3387362957000732, + "learning_rate": 9.698026315789475e-05, + "loss": 0.409, + "step": 31489 + }, + { + "epoch": 1.7633553589427708, + "grad_norm": 1.2997803688049316, + 
"learning_rate": 9.698000000000001e-05, + "loss": 0.4797, + "step": 31490 + }, + { + "epoch": 1.7634113562548999, + "grad_norm": 1.271246075630188, + "learning_rate": 9.697973684210527e-05, + "loss": 0.3911, + "step": 31491 + }, + { + "epoch": 1.7634673535670289, + "grad_norm": 1.261385440826416, + "learning_rate": 9.697947368421053e-05, + "loss": 0.4139, + "step": 31492 + }, + { + "epoch": 1.763523350879158, + "grad_norm": 1.2858843803405762, + "learning_rate": 9.697921052631579e-05, + "loss": 0.5789, + "step": 31493 + }, + { + "epoch": 1.763579348191287, + "grad_norm": 1.1422197818756104, + "learning_rate": 9.697894736842106e-05, + "loss": 0.3957, + "step": 31494 + }, + { + "epoch": 1.763635345503416, + "grad_norm": 1.1950825452804565, + "learning_rate": 9.697868421052632e-05, + "loss": 0.3616, + "step": 31495 + }, + { + "epoch": 1.763691342815545, + "grad_norm": 1.162674903869629, + "learning_rate": 9.697842105263158e-05, + "loss": 0.4708, + "step": 31496 + }, + { + "epoch": 1.763747340127674, + "grad_norm": 1.1652240753173828, + "learning_rate": 9.697815789473684e-05, + "loss": 0.4366, + "step": 31497 + }, + { + "epoch": 1.763803337439803, + "grad_norm": 1.3446221351623535, + "learning_rate": 9.697789473684211e-05, + "loss": 0.5055, + "step": 31498 + }, + { + "epoch": 1.763859334751932, + "grad_norm": 1.241191029548645, + "learning_rate": 9.697763157894737e-05, + "loss": 0.4349, + "step": 31499 + }, + { + "epoch": 1.763915332064061, + "grad_norm": 1.277579426765442, + "learning_rate": 9.697736842105263e-05, + "loss": 0.5438, + "step": 31500 + }, + { + "epoch": 1.76397132937619, + "grad_norm": 1.2783100605010986, + "learning_rate": 9.69771052631579e-05, + "loss": 0.4395, + "step": 31501 + }, + { + "epoch": 1.764027326688319, + "grad_norm": 1.133917212486267, + "learning_rate": 9.697684210526317e-05, + "loss": 0.4213, + "step": 31502 + }, + { + "epoch": 1.764083324000448, + "grad_norm": 1.1544266939163208, + "learning_rate": 9.697657894736843e-05, + "loss": 
0.4736, + "step": 31503 + }, + { + "epoch": 1.7641393213125771, + "grad_norm": 1.3574148416519165, + "learning_rate": 9.697631578947369e-05, + "loss": 0.3828, + "step": 31504 + }, + { + "epoch": 1.7641953186247061, + "grad_norm": 1.3751143217086792, + "learning_rate": 9.697605263157895e-05, + "loss": 0.4168, + "step": 31505 + }, + { + "epoch": 1.7642513159368352, + "grad_norm": 1.498292326927185, + "learning_rate": 9.697578947368422e-05, + "loss": 0.5091, + "step": 31506 + }, + { + "epoch": 1.7643073132489642, + "grad_norm": 1.330779790878296, + "learning_rate": 9.697552631578948e-05, + "loss": 0.5447, + "step": 31507 + }, + { + "epoch": 1.7643633105610932, + "grad_norm": 1.4283132553100586, + "learning_rate": 9.697526315789475e-05, + "loss": 0.389, + "step": 31508 + }, + { + "epoch": 1.7644193078732222, + "grad_norm": 1.2465740442276, + "learning_rate": 9.6975e-05, + "loss": 0.379, + "step": 31509 + }, + { + "epoch": 1.7644753051853512, + "grad_norm": 1.3268877267837524, + "learning_rate": 9.697473684210526e-05, + "loss": 0.5558, + "step": 31510 + }, + { + "epoch": 1.7645313024974802, + "grad_norm": 1.5753016471862793, + "learning_rate": 9.697447368421053e-05, + "loss": 0.4791, + "step": 31511 + }, + { + "epoch": 1.7645872998096093, + "grad_norm": 1.1190284490585327, + "learning_rate": 9.697421052631579e-05, + "loss": 0.4404, + "step": 31512 + }, + { + "epoch": 1.7646432971217383, + "grad_norm": 1.1840065717697144, + "learning_rate": 9.697394736842107e-05, + "loss": 0.4407, + "step": 31513 + }, + { + "epoch": 1.7646992944338673, + "grad_norm": 1.1299233436584473, + "learning_rate": 9.697368421052631e-05, + "loss": 0.4273, + "step": 31514 + }, + { + "epoch": 1.7647552917459963, + "grad_norm": 1.1534067392349243, + "learning_rate": 9.697342105263158e-05, + "loss": 0.4088, + "step": 31515 + }, + { + "epoch": 1.7648112890581253, + "grad_norm": 1.4165939092636108, + "learning_rate": 9.697315789473684e-05, + "loss": 0.3707, + "step": 31516 + }, + { + "epoch": 
1.7648672863702544, + "grad_norm": 1.1244193315505981, + "learning_rate": 9.697289473684212e-05, + "loss": 0.5268, + "step": 31517 + }, + { + "epoch": 1.7649232836823834, + "grad_norm": 1.745214819908142, + "learning_rate": 9.697263157894738e-05, + "loss": 0.5335, + "step": 31518 + }, + { + "epoch": 1.7649792809945124, + "grad_norm": 1.2726930379867554, + "learning_rate": 9.697236842105264e-05, + "loss": 0.4248, + "step": 31519 + }, + { + "epoch": 1.7650352783066414, + "grad_norm": 1.469014286994934, + "learning_rate": 9.69721052631579e-05, + "loss": 0.5316, + "step": 31520 + }, + { + "epoch": 1.7650912756187704, + "grad_norm": 1.2842581272125244, + "learning_rate": 9.697184210526317e-05, + "loss": 0.5588, + "step": 31521 + }, + { + "epoch": 1.7651472729308995, + "grad_norm": 1.3293139934539795, + "learning_rate": 9.697157894736843e-05, + "loss": 0.5061, + "step": 31522 + }, + { + "epoch": 1.7652032702430285, + "grad_norm": 1.244208574295044, + "learning_rate": 9.697131578947369e-05, + "loss": 0.4003, + "step": 31523 + }, + { + "epoch": 1.7652592675551575, + "grad_norm": 1.1883312463760376, + "learning_rate": 9.697105263157895e-05, + "loss": 0.4022, + "step": 31524 + }, + { + "epoch": 1.7653152648672865, + "grad_norm": 0.9889773726463318, + "learning_rate": 9.697078947368422e-05, + "loss": 0.2647, + "step": 31525 + }, + { + "epoch": 1.7653712621794155, + "grad_norm": 1.2561534643173218, + "learning_rate": 9.697052631578948e-05, + "loss": 0.4171, + "step": 31526 + }, + { + "epoch": 1.7654272594915446, + "grad_norm": 1.361660361289978, + "learning_rate": 9.697026315789474e-05, + "loss": 0.4491, + "step": 31527 + }, + { + "epoch": 1.7654832568036736, + "grad_norm": 1.2190622091293335, + "learning_rate": 9.697e-05, + "loss": 0.4789, + "step": 31528 + }, + { + "epoch": 1.7655392541158026, + "grad_norm": 1.2869620323181152, + "learning_rate": 9.696973684210526e-05, + "loss": 0.4522, + "step": 31529 + }, + { + "epoch": 1.7655952514279316, + "grad_norm": 
1.0969651937484741, + "learning_rate": 9.696947368421053e-05, + "loss": 0.4566, + "step": 31530 + }, + { + "epoch": 1.7656512487400606, + "grad_norm": 1.3023502826690674, + "learning_rate": 9.69692105263158e-05, + "loss": 0.4476, + "step": 31531 + }, + { + "epoch": 1.7657072460521896, + "grad_norm": 1.3227877616882324, + "learning_rate": 9.696894736842105e-05, + "loss": 0.5335, + "step": 31532 + }, + { + "epoch": 1.7657632433643187, + "grad_norm": 1.197417140007019, + "learning_rate": 9.696868421052631e-05, + "loss": 0.444, + "step": 31533 + }, + { + "epoch": 1.7658192406764477, + "grad_norm": 1.3425732851028442, + "learning_rate": 9.696842105263159e-05, + "loss": 0.5227, + "step": 31534 + }, + { + "epoch": 1.7658752379885767, + "grad_norm": 1.4290789365768433, + "learning_rate": 9.696815789473685e-05, + "loss": 0.5945, + "step": 31535 + }, + { + "epoch": 1.7659312353007057, + "grad_norm": 1.2781115770339966, + "learning_rate": 9.69678947368421e-05, + "loss": 0.3768, + "step": 31536 + }, + { + "epoch": 1.7659872326128347, + "grad_norm": 1.3153653144836426, + "learning_rate": 9.696763157894737e-05, + "loss": 0.4119, + "step": 31537 + }, + { + "epoch": 1.7660432299249638, + "grad_norm": 1.438663363456726, + "learning_rate": 9.696736842105264e-05, + "loss": 0.3812, + "step": 31538 + }, + { + "epoch": 1.7660992272370928, + "grad_norm": 1.6345983743667603, + "learning_rate": 9.69671052631579e-05, + "loss": 0.808, + "step": 31539 + }, + { + "epoch": 1.7661552245492218, + "grad_norm": 1.1660674810409546, + "learning_rate": 9.696684210526317e-05, + "loss": 0.4568, + "step": 31540 + }, + { + "epoch": 1.7662112218613506, + "grad_norm": 1.3210314512252808, + "learning_rate": 9.696657894736842e-05, + "loss": 0.5017, + "step": 31541 + }, + { + "epoch": 1.7662672191734796, + "grad_norm": 1.3592729568481445, + "learning_rate": 9.696631578947369e-05, + "loss": 0.5468, + "step": 31542 + }, + { + "epoch": 1.7663232164856086, + "grad_norm": 1.4642986059188843, + "learning_rate": 
9.696605263157895e-05, + "loss": 0.4787, + "step": 31543 + }, + { + "epoch": 1.7663792137977377, + "grad_norm": 1.237945318222046, + "learning_rate": 9.696578947368421e-05, + "loss": 0.3869, + "step": 31544 + }, + { + "epoch": 1.7664352111098667, + "grad_norm": 1.135084867477417, + "learning_rate": 9.696552631578948e-05, + "loss": 0.3375, + "step": 31545 + }, + { + "epoch": 1.7664912084219957, + "grad_norm": 1.292121171951294, + "learning_rate": 9.696526315789473e-05, + "loss": 0.4826, + "step": 31546 + }, + { + "epoch": 1.7665472057341247, + "grad_norm": 1.1377503871917725, + "learning_rate": 9.6965e-05, + "loss": 0.442, + "step": 31547 + }, + { + "epoch": 1.7666032030462537, + "grad_norm": 1.2419637441635132, + "learning_rate": 9.696473684210526e-05, + "loss": 0.5338, + "step": 31548 + }, + { + "epoch": 1.7666592003583828, + "grad_norm": 1.1220835447311401, + "learning_rate": 9.696447368421054e-05, + "loss": 0.3862, + "step": 31549 + }, + { + "epoch": 1.7667151976705118, + "grad_norm": 1.1769870519638062, + "learning_rate": 9.69642105263158e-05, + "loss": 0.3768, + "step": 31550 + }, + { + "epoch": 1.7667711949826408, + "grad_norm": 1.450187087059021, + "learning_rate": 9.696394736842106e-05, + "loss": 0.4583, + "step": 31551 + }, + { + "epoch": 1.7668271922947698, + "grad_norm": 1.1313263177871704, + "learning_rate": 9.696368421052632e-05, + "loss": 0.4579, + "step": 31552 + }, + { + "epoch": 1.7668831896068988, + "grad_norm": 1.4563243389129639, + "learning_rate": 9.696342105263159e-05, + "loss": 0.5501, + "step": 31553 + }, + { + "epoch": 1.7669391869190278, + "grad_norm": 1.1168112754821777, + "learning_rate": 9.696315789473685e-05, + "loss": 0.359, + "step": 31554 + }, + { + "epoch": 1.7669951842311569, + "grad_norm": 1.2646751403808594, + "learning_rate": 9.696289473684211e-05, + "loss": 0.4124, + "step": 31555 + }, + { + "epoch": 1.7670511815432859, + "grad_norm": 1.4349724054336548, + "learning_rate": 9.696263157894737e-05, + "loss": 0.5039, + "step": 
31556 + }, + { + "epoch": 1.767107178855415, + "grad_norm": 1.2223607301712036, + "learning_rate": 9.696236842105264e-05, + "loss": 0.3452, + "step": 31557 + }, + { + "epoch": 1.767163176167544, + "grad_norm": 1.445101022720337, + "learning_rate": 9.69621052631579e-05, + "loss": 0.5003, + "step": 31558 + }, + { + "epoch": 1.767219173479673, + "grad_norm": 1.5038913488388062, + "learning_rate": 9.696184210526316e-05, + "loss": 0.5452, + "step": 31559 + }, + { + "epoch": 1.767275170791802, + "grad_norm": 1.1562330722808838, + "learning_rate": 9.696157894736842e-05, + "loss": 0.4638, + "step": 31560 + }, + { + "epoch": 1.767331168103931, + "grad_norm": 1.3902146816253662, + "learning_rate": 9.696131578947368e-05, + "loss": 0.4532, + "step": 31561 + }, + { + "epoch": 1.76738716541606, + "grad_norm": 1.4367204904556274, + "learning_rate": 9.696105263157895e-05, + "loss": 0.6274, + "step": 31562 + }, + { + "epoch": 1.767443162728189, + "grad_norm": 1.3518548011779785, + "learning_rate": 9.696078947368421e-05, + "loss": 0.4238, + "step": 31563 + }, + { + "epoch": 1.767499160040318, + "grad_norm": 1.3402560949325562, + "learning_rate": 9.696052631578947e-05, + "loss": 0.4032, + "step": 31564 + }, + { + "epoch": 1.767555157352447, + "grad_norm": 1.5832061767578125, + "learning_rate": 9.696026315789473e-05, + "loss": 0.4415, + "step": 31565 + }, + { + "epoch": 1.767611154664576, + "grad_norm": 1.2894824743270874, + "learning_rate": 9.696000000000001e-05, + "loss": 0.3704, + "step": 31566 + }, + { + "epoch": 1.767667151976705, + "grad_norm": 1.0190800428390503, + "learning_rate": 9.695973684210527e-05, + "loss": 0.3231, + "step": 31567 + }, + { + "epoch": 1.7677231492888341, + "grad_norm": 1.3909094333648682, + "learning_rate": 9.695947368421054e-05, + "loss": 0.5809, + "step": 31568 + }, + { + "epoch": 1.7677791466009631, + "grad_norm": 1.6655607223510742, + "learning_rate": 9.695921052631579e-05, + "loss": 0.7583, + "step": 31569 + }, + { + "epoch": 1.7678351439130922, + 
"grad_norm": 4.349919319152832, + "learning_rate": 9.695894736842106e-05, + "loss": 0.5639, + "step": 31570 + }, + { + "epoch": 1.7678911412252212, + "grad_norm": 9.490988731384277, + "learning_rate": 9.695868421052632e-05, + "loss": 0.3731, + "step": 31571 + }, + { + "epoch": 1.7679471385373502, + "grad_norm": 1.2213760614395142, + "learning_rate": 9.695842105263159e-05, + "loss": 0.4255, + "step": 31572 + }, + { + "epoch": 1.7680031358494792, + "grad_norm": 1.6073302030563354, + "learning_rate": 9.695815789473684e-05, + "loss": 0.4538, + "step": 31573 + }, + { + "epoch": 1.7680591331616082, + "grad_norm": 1.3122426271438599, + "learning_rate": 9.695789473684211e-05, + "loss": 0.3353, + "step": 31574 + }, + { + "epoch": 1.7681151304737373, + "grad_norm": 1.3224055767059326, + "learning_rate": 9.695763157894737e-05, + "loss": 0.3719, + "step": 31575 + }, + { + "epoch": 1.7681711277858663, + "grad_norm": 1.2959257364273071, + "learning_rate": 9.695736842105264e-05, + "loss": 0.5298, + "step": 31576 + }, + { + "epoch": 1.7682271250979953, + "grad_norm": 1.9519927501678467, + "learning_rate": 9.69571052631579e-05, + "loss": 0.5065, + "step": 31577 + }, + { + "epoch": 1.7682831224101243, + "grad_norm": 1.4113036394119263, + "learning_rate": 9.695684210526315e-05, + "loss": 0.5254, + "step": 31578 + }, + { + "epoch": 1.7683391197222533, + "grad_norm": 1.2891052961349487, + "learning_rate": 9.695657894736842e-05, + "loss": 0.6011, + "step": 31579 + }, + { + "epoch": 1.7683951170343823, + "grad_norm": 1.668357491493225, + "learning_rate": 9.695631578947368e-05, + "loss": 0.4809, + "step": 31580 + }, + { + "epoch": 1.7684511143465114, + "grad_norm": 1.218103051185608, + "learning_rate": 9.695605263157896e-05, + "loss": 0.5375, + "step": 31581 + }, + { + "epoch": 1.7685071116586404, + "grad_norm": 1.2777475118637085, + "learning_rate": 9.695578947368422e-05, + "loss": 0.3912, + "step": 31582 + }, + { + "epoch": 1.7685631089707694, + "grad_norm": 1.4177066087722778, + 
"learning_rate": 9.695552631578948e-05, + "loss": 0.4315, + "step": 31583 + }, + { + "epoch": 1.7686191062828984, + "grad_norm": 1.2517859935760498, + "learning_rate": 9.695526315789474e-05, + "loss": 0.4109, + "step": 31584 + }, + { + "epoch": 1.7686751035950274, + "grad_norm": 1.3994783163070679, + "learning_rate": 9.695500000000001e-05, + "loss": 0.3294, + "step": 31585 + }, + { + "epoch": 1.7687311009071565, + "grad_norm": 1.345992088317871, + "learning_rate": 9.695473684210527e-05, + "loss": 0.4645, + "step": 31586 + }, + { + "epoch": 1.7687870982192855, + "grad_norm": 1.1893301010131836, + "learning_rate": 9.695447368421053e-05, + "loss": 0.4279, + "step": 31587 + }, + { + "epoch": 1.7688430955314145, + "grad_norm": 1.310773253440857, + "learning_rate": 9.695421052631579e-05, + "loss": 0.4225, + "step": 31588 + }, + { + "epoch": 1.7688990928435435, + "grad_norm": 1.2024987936019897, + "learning_rate": 9.695394736842106e-05, + "loss": 0.4784, + "step": 31589 + }, + { + "epoch": 1.7689550901556725, + "grad_norm": 1.2498165369033813, + "learning_rate": 9.695368421052632e-05, + "loss": 0.5249, + "step": 31590 + }, + { + "epoch": 1.7690110874678016, + "grad_norm": 1.200305700302124, + "learning_rate": 9.695342105263158e-05, + "loss": 0.3937, + "step": 31591 + }, + { + "epoch": 1.7690670847799306, + "grad_norm": 1.3510410785675049, + "learning_rate": 9.695315789473684e-05, + "loss": 0.606, + "step": 31592 + }, + { + "epoch": 1.7691230820920596, + "grad_norm": 1.2939341068267822, + "learning_rate": 9.695289473684211e-05, + "loss": 0.4131, + "step": 31593 + }, + { + "epoch": 1.7691790794041886, + "grad_norm": 1.2193002700805664, + "learning_rate": 9.695263157894737e-05, + "loss": 0.3932, + "step": 31594 + }, + { + "epoch": 1.7692350767163176, + "grad_norm": 2.1687841415405273, + "learning_rate": 9.695236842105265e-05, + "loss": 0.5081, + "step": 31595 + }, + { + "epoch": 1.7692910740284467, + "grad_norm": 1.2129395008087158, + "learning_rate": 9.69521052631579e-05, + 
"loss": 0.3913, + "step": 31596 + }, + { + "epoch": 1.7693470713405757, + "grad_norm": 1.280544638633728, + "learning_rate": 9.695184210526315e-05, + "loss": 0.4863, + "step": 31597 + }, + { + "epoch": 1.7694030686527047, + "grad_norm": 1.1567386388778687, + "learning_rate": 9.695157894736843e-05, + "loss": 0.4065, + "step": 31598 + }, + { + "epoch": 1.7694590659648337, + "grad_norm": 1.4139891862869263, + "learning_rate": 9.695131578947369e-05, + "loss": 0.3961, + "step": 31599 + }, + { + "epoch": 1.7695150632769627, + "grad_norm": 1.68169367313385, + "learning_rate": 9.695105263157896e-05, + "loss": 0.454, + "step": 31600 + }, + { + "epoch": 1.7695710605890917, + "grad_norm": 1.2753052711486816, + "learning_rate": 9.69507894736842e-05, + "loss": 0.4978, + "step": 31601 + }, + { + "epoch": 1.7696270579012208, + "grad_norm": 1.3094966411590576, + "learning_rate": 9.695052631578948e-05, + "loss": 0.4663, + "step": 31602 + }, + { + "epoch": 1.7696830552133498, + "grad_norm": 1.4657502174377441, + "learning_rate": 9.695026315789474e-05, + "loss": 0.428, + "step": 31603 + }, + { + "epoch": 1.7697390525254788, + "grad_norm": 1.3512753248214722, + "learning_rate": 9.695000000000001e-05, + "loss": 0.4366, + "step": 31604 + }, + { + "epoch": 1.7697950498376078, + "grad_norm": 1.6124919652938843, + "learning_rate": 9.694973684210527e-05, + "loss": 0.4766, + "step": 31605 + }, + { + "epoch": 1.7698510471497368, + "grad_norm": 1.2926450967788696, + "learning_rate": 9.694947368421053e-05, + "loss": 0.5164, + "step": 31606 + }, + { + "epoch": 1.7699070444618659, + "grad_norm": 1.2213493585586548, + "learning_rate": 9.694921052631579e-05, + "loss": 0.3471, + "step": 31607 + }, + { + "epoch": 1.7699630417739949, + "grad_norm": 1.5887097120285034, + "learning_rate": 9.694894736842106e-05, + "loss": 0.4045, + "step": 31608 + }, + { + "epoch": 1.770019039086124, + "grad_norm": 1.4204438924789429, + "learning_rate": 9.694868421052632e-05, + "loss": 0.5412, + "step": 31609 + }, + { + 
"epoch": 1.770075036398253, + "grad_norm": 1.4192122220993042, + "learning_rate": 9.694842105263158e-05, + "loss": 0.3931, + "step": 31610 + }, + { + "epoch": 1.770131033710382, + "grad_norm": 1.5752617120742798, + "learning_rate": 9.694815789473684e-05, + "loss": 0.6127, + "step": 31611 + }, + { + "epoch": 1.770187031022511, + "grad_norm": 1.273874044418335, + "learning_rate": 9.694789473684212e-05, + "loss": 0.3776, + "step": 31612 + }, + { + "epoch": 1.77024302833464, + "grad_norm": 1.2905644178390503, + "learning_rate": 9.694763157894738e-05, + "loss": 0.4609, + "step": 31613 + }, + { + "epoch": 1.770299025646769, + "grad_norm": 1.2650513648986816, + "learning_rate": 9.694736842105264e-05, + "loss": 0.5233, + "step": 31614 + }, + { + "epoch": 1.770355022958898, + "grad_norm": 1.0819251537322998, + "learning_rate": 9.69471052631579e-05, + "loss": 0.4006, + "step": 31615 + }, + { + "epoch": 1.770411020271027, + "grad_norm": 1.757305383682251, + "learning_rate": 9.694684210526316e-05, + "loss": 0.5248, + "step": 31616 + }, + { + "epoch": 1.770467017583156, + "grad_norm": 1.2096786499023438, + "learning_rate": 9.694657894736843e-05, + "loss": 0.4087, + "step": 31617 + }, + { + "epoch": 1.770523014895285, + "grad_norm": 1.274585247039795, + "learning_rate": 9.694631578947369e-05, + "loss": 0.5563, + "step": 31618 + }, + { + "epoch": 1.770579012207414, + "grad_norm": 1.2701423168182373, + "learning_rate": 9.694605263157895e-05, + "loss": 0.4777, + "step": 31619 + }, + { + "epoch": 1.7706350095195431, + "grad_norm": 1.6738379001617432, + "learning_rate": 9.694578947368421e-05, + "loss": 0.5118, + "step": 31620 + }, + { + "epoch": 1.7706910068316721, + "grad_norm": 1.7709014415740967, + "learning_rate": 9.694552631578948e-05, + "loss": 0.4175, + "step": 31621 + }, + { + "epoch": 1.7707470041438012, + "grad_norm": 1.1367491483688354, + "learning_rate": 9.694526315789474e-05, + "loss": 0.4296, + "step": 31622 + }, + { + "epoch": 1.7708030014559302, + "grad_norm": 
1.3250112533569336, + "learning_rate": 9.694500000000001e-05, + "loss": 0.4725, + "step": 31623 + }, + { + "epoch": 1.770858998768059, + "grad_norm": 1.6657941341400146, + "learning_rate": 9.694473684210526e-05, + "loss": 0.4841, + "step": 31624 + }, + { + "epoch": 1.770914996080188, + "grad_norm": 1.638959527015686, + "learning_rate": 9.694447368421053e-05, + "loss": 0.4254, + "step": 31625 + }, + { + "epoch": 1.770970993392317, + "grad_norm": 1.305229663848877, + "learning_rate": 9.69442105263158e-05, + "loss": 0.7032, + "step": 31626 + }, + { + "epoch": 1.771026990704446, + "grad_norm": 1.2624949216842651, + "learning_rate": 9.694394736842107e-05, + "loss": 0.4501, + "step": 31627 + }, + { + "epoch": 1.771082988016575, + "grad_norm": 1.4390772581100464, + "learning_rate": 9.694368421052631e-05, + "loss": 0.4499, + "step": 31628 + }, + { + "epoch": 1.771138985328704, + "grad_norm": 1.0659310817718506, + "learning_rate": 9.694342105263159e-05, + "loss": 0.4712, + "step": 31629 + }, + { + "epoch": 1.771194982640833, + "grad_norm": 1.2920441627502441, + "learning_rate": 9.694315789473685e-05, + "loss": 0.6832, + "step": 31630 + }, + { + "epoch": 1.771250979952962, + "grad_norm": 1.3546836376190186, + "learning_rate": 9.69428947368421e-05, + "loss": 0.4744, + "step": 31631 + }, + { + "epoch": 1.7713069772650911, + "grad_norm": 1.0631438493728638, + "learning_rate": 9.694263157894738e-05, + "loss": 0.3273, + "step": 31632 + }, + { + "epoch": 1.7713629745772201, + "grad_norm": 1.3528141975402832, + "learning_rate": 9.694236842105263e-05, + "loss": 0.5658, + "step": 31633 + }, + { + "epoch": 1.7714189718893492, + "grad_norm": 1.164036750793457, + "learning_rate": 9.69421052631579e-05, + "loss": 0.5891, + "step": 31634 + }, + { + "epoch": 1.7714749692014782, + "grad_norm": 1.2094208002090454, + "learning_rate": 9.694184210526316e-05, + "loss": 0.4431, + "step": 31635 + }, + { + "epoch": 1.7715309665136072, + "grad_norm": 1.2523390054702759, + "learning_rate": 
9.694157894736843e-05, + "loss": 0.4231, + "step": 31636 + }, + { + "epoch": 1.7715869638257362, + "grad_norm": 1.4777216911315918, + "learning_rate": 9.694131578947369e-05, + "loss": 0.5889, + "step": 31637 + }, + { + "epoch": 1.7716429611378652, + "grad_norm": 1.0440623760223389, + "learning_rate": 9.694105263157895e-05, + "loss": 0.4355, + "step": 31638 + }, + { + "epoch": 1.7716989584499943, + "grad_norm": 2.2610154151916504, + "learning_rate": 9.694078947368421e-05, + "loss": 0.5215, + "step": 31639 + }, + { + "epoch": 1.7717549557621233, + "grad_norm": 1.2607110738754272, + "learning_rate": 9.694052631578948e-05, + "loss": 0.4782, + "step": 31640 + }, + { + "epoch": 1.7718109530742523, + "grad_norm": 1.078346848487854, + "learning_rate": 9.694026315789474e-05, + "loss": 0.3725, + "step": 31641 + }, + { + "epoch": 1.7718669503863813, + "grad_norm": 2.2641713619232178, + "learning_rate": 9.694e-05, + "loss": 0.3142, + "step": 31642 + }, + { + "epoch": 1.7719229476985103, + "grad_norm": 1.4688613414764404, + "learning_rate": 9.693973684210526e-05, + "loss": 0.3142, + "step": 31643 + }, + { + "epoch": 1.7719789450106394, + "grad_norm": 1.177298665046692, + "learning_rate": 9.693947368421054e-05, + "loss": 0.3972, + "step": 31644 + }, + { + "epoch": 1.7720349423227684, + "grad_norm": 1.3002843856811523, + "learning_rate": 9.69392105263158e-05, + "loss": 0.439, + "step": 31645 + }, + { + "epoch": 1.7720909396348974, + "grad_norm": 1.273055911064148, + "learning_rate": 9.693894736842106e-05, + "loss": 0.4135, + "step": 31646 + }, + { + "epoch": 1.7721469369470264, + "grad_norm": 1.3855077028274536, + "learning_rate": 9.693868421052632e-05, + "loss": 0.4321, + "step": 31647 + }, + { + "epoch": 1.7722029342591554, + "grad_norm": 1.1521739959716797, + "learning_rate": 9.693842105263158e-05, + "loss": 0.3381, + "step": 31648 + }, + { + "epoch": 1.7722589315712844, + "grad_norm": 1.1715768575668335, + "learning_rate": 9.693815789473685e-05, + "loss": 0.4246, + "step": 
31649 + }, + { + "epoch": 1.7723149288834135, + "grad_norm": 1.096937894821167, + "learning_rate": 9.693789473684211e-05, + "loss": 0.4936, + "step": 31650 + }, + { + "epoch": 1.7723709261955425, + "grad_norm": 1.292404055595398, + "learning_rate": 9.693763157894737e-05, + "loss": 0.4372, + "step": 31651 + }, + { + "epoch": 1.7724269235076715, + "grad_norm": 1.322475790977478, + "learning_rate": 9.693736842105263e-05, + "loss": 0.3626, + "step": 31652 + }, + { + "epoch": 1.7724829208198005, + "grad_norm": 1.5499192476272583, + "learning_rate": 9.69371052631579e-05, + "loss": 0.5435, + "step": 31653 + }, + { + "epoch": 1.7725389181319295, + "grad_norm": 1.8029277324676514, + "learning_rate": 9.693684210526316e-05, + "loss": 0.5153, + "step": 31654 + }, + { + "epoch": 1.7725949154440586, + "grad_norm": 1.4051363468170166, + "learning_rate": 9.693657894736843e-05, + "loss": 0.4287, + "step": 31655 + }, + { + "epoch": 1.7726509127561876, + "grad_norm": 1.1059962511062622, + "learning_rate": 9.693631578947368e-05, + "loss": 0.4496, + "step": 31656 + }, + { + "epoch": 1.7727069100683166, + "grad_norm": 1.195513129234314, + "learning_rate": 9.693605263157895e-05, + "loss": 0.4725, + "step": 31657 + }, + { + "epoch": 1.7727629073804456, + "grad_norm": 1.163993239402771, + "learning_rate": 9.693578947368421e-05, + "loss": 0.4145, + "step": 31658 + }, + { + "epoch": 1.7728189046925746, + "grad_norm": 1.1289546489715576, + "learning_rate": 9.693552631578949e-05, + "loss": 0.3838, + "step": 31659 + }, + { + "epoch": 1.7728749020047037, + "grad_norm": 1.1771734952926636, + "learning_rate": 9.693526315789475e-05, + "loss": 0.5962, + "step": 31660 + }, + { + "epoch": 1.7729308993168327, + "grad_norm": 1.2909501791000366, + "learning_rate": 9.6935e-05, + "loss": 0.4819, + "step": 31661 + }, + { + "epoch": 1.7729868966289617, + "grad_norm": 1.3093260526657104, + "learning_rate": 9.693473684210527e-05, + "loss": 0.4384, + "step": 31662 + }, + { + "epoch": 1.7730428939410907, + 
"grad_norm": 1.278106927871704, + "learning_rate": 9.693447368421054e-05, + "loss": 0.5425, + "step": 31663 + }, + { + "epoch": 1.7730988912532197, + "grad_norm": 1.4459409713745117, + "learning_rate": 9.69342105263158e-05, + "loss": 0.689, + "step": 31664 + }, + { + "epoch": 1.7731548885653488, + "grad_norm": 1.3192808628082275, + "learning_rate": 9.693394736842106e-05, + "loss": 0.4512, + "step": 31665 + }, + { + "epoch": 1.7732108858774778, + "grad_norm": 1.1529093980789185, + "learning_rate": 9.693368421052632e-05, + "loss": 0.4549, + "step": 31666 + }, + { + "epoch": 1.7732668831896068, + "grad_norm": 1.417559266090393, + "learning_rate": 9.693342105263158e-05, + "loss": 0.4787, + "step": 31667 + }, + { + "epoch": 1.7733228805017358, + "grad_norm": 1.1452620029449463, + "learning_rate": 9.693315789473685e-05, + "loss": 0.4548, + "step": 31668 + }, + { + "epoch": 1.7733788778138648, + "grad_norm": 1.2665936946868896, + "learning_rate": 9.693289473684211e-05, + "loss": 0.4287, + "step": 31669 + }, + { + "epoch": 1.7734348751259938, + "grad_norm": 1.3512331247329712, + "learning_rate": 9.693263157894737e-05, + "loss": 0.4704, + "step": 31670 + }, + { + "epoch": 1.7734908724381229, + "grad_norm": 1.2202987670898438, + "learning_rate": 9.693236842105263e-05, + "loss": 0.4467, + "step": 31671 + }, + { + "epoch": 1.7735468697502519, + "grad_norm": 1.2009615898132324, + "learning_rate": 9.69321052631579e-05, + "loss": 0.4032, + "step": 31672 + }, + { + "epoch": 1.773602867062381, + "grad_norm": 1.3964484930038452, + "learning_rate": 9.693184210526316e-05, + "loss": 0.4818, + "step": 31673 + }, + { + "epoch": 1.77365886437451, + "grad_norm": 1.4268285036087036, + "learning_rate": 9.693157894736842e-05, + "loss": 0.4191, + "step": 31674 + }, + { + "epoch": 1.773714861686639, + "grad_norm": 1.3885416984558105, + "learning_rate": 9.693131578947368e-05, + "loss": 0.4025, + "step": 31675 + }, + { + "epoch": 1.773770858998768, + "grad_norm": 1.4444143772125244, + 
"learning_rate": 9.693105263157896e-05, + "loss": 0.4656, + "step": 31676 + }, + { + "epoch": 1.773826856310897, + "grad_norm": 1.0805639028549194, + "learning_rate": 9.693078947368422e-05, + "loss": 0.2582, + "step": 31677 + }, + { + "epoch": 1.773882853623026, + "grad_norm": 1.2887513637542725, + "learning_rate": 9.693052631578949e-05, + "loss": 0.4735, + "step": 31678 + }, + { + "epoch": 1.773938850935155, + "grad_norm": 1.3559648990631104, + "learning_rate": 9.693026315789474e-05, + "loss": 0.4658, + "step": 31679 + }, + { + "epoch": 1.773994848247284, + "grad_norm": 1.340680718421936, + "learning_rate": 9.693000000000001e-05, + "loss": 0.4844, + "step": 31680 + }, + { + "epoch": 1.774050845559413, + "grad_norm": 1.3505420684814453, + "learning_rate": 9.692973684210527e-05, + "loss": 0.4911, + "step": 31681 + }, + { + "epoch": 1.774106842871542, + "grad_norm": 1.1470973491668701, + "learning_rate": 9.692947368421053e-05, + "loss": 0.4863, + "step": 31682 + }, + { + "epoch": 1.774162840183671, + "grad_norm": 1.459671139717102, + "learning_rate": 9.692921052631579e-05, + "loss": 0.6615, + "step": 31683 + }, + { + "epoch": 1.7742188374958001, + "grad_norm": 1.1560617685317993, + "learning_rate": 9.692894736842105e-05, + "loss": 0.3681, + "step": 31684 + }, + { + "epoch": 1.7742748348079291, + "grad_norm": 1.1295814514160156, + "learning_rate": 9.692868421052632e-05, + "loss": 0.4043, + "step": 31685 + }, + { + "epoch": 1.7743308321200582, + "grad_norm": 1.4021835327148438, + "learning_rate": 9.692842105263158e-05, + "loss": 0.5041, + "step": 31686 + }, + { + "epoch": 1.7743868294321872, + "grad_norm": 1.4055055379867554, + "learning_rate": 9.692815789473685e-05, + "loss": 0.41, + "step": 31687 + }, + { + "epoch": 1.7744428267443162, + "grad_norm": 3.6340811252593994, + "learning_rate": 9.69278947368421e-05, + "loss": 0.3583, + "step": 31688 + }, + { + "epoch": 1.7744988240564452, + "grad_norm": 1.1289743185043335, + "learning_rate": 9.692763157894737e-05, + 
"loss": 0.4102, + "step": 31689 + }, + { + "epoch": 1.7745548213685742, + "grad_norm": 1.2529717683792114, + "learning_rate": 9.692736842105263e-05, + "loss": 0.3426, + "step": 31690 + }, + { + "epoch": 1.7746108186807033, + "grad_norm": 1.2105739116668701, + "learning_rate": 9.69271052631579e-05, + "loss": 0.3889, + "step": 31691 + }, + { + "epoch": 1.7746668159928323, + "grad_norm": 1.1601111888885498, + "learning_rate": 9.692684210526317e-05, + "loss": 0.3919, + "step": 31692 + }, + { + "epoch": 1.7747228133049613, + "grad_norm": 1.2139086723327637, + "learning_rate": 9.692657894736843e-05, + "loss": 0.3789, + "step": 31693 + }, + { + "epoch": 1.7747788106170903, + "grad_norm": 1.1330914497375488, + "learning_rate": 9.692631578947369e-05, + "loss": 0.6112, + "step": 31694 + }, + { + "epoch": 1.7748348079292193, + "grad_norm": 1.6613965034484863, + "learning_rate": 9.692605263157896e-05, + "loss": 0.5128, + "step": 31695 + }, + { + "epoch": 1.7748908052413483, + "grad_norm": 1.1898295879364014, + "learning_rate": 9.692578947368422e-05, + "loss": 0.4033, + "step": 31696 + }, + { + "epoch": 1.7749468025534774, + "grad_norm": 1.0605305433273315, + "learning_rate": 9.692552631578948e-05, + "loss": 0.4209, + "step": 31697 + }, + { + "epoch": 1.7750027998656064, + "grad_norm": 1.1873576641082764, + "learning_rate": 9.692526315789474e-05, + "loss": 0.4714, + "step": 31698 + }, + { + "epoch": 1.7750587971777354, + "grad_norm": 1.2370866537094116, + "learning_rate": 9.6925e-05, + "loss": 0.3921, + "step": 31699 + }, + { + "epoch": 1.7751147944898644, + "grad_norm": 1.138181447982788, + "learning_rate": 9.692473684210527e-05, + "loss": 0.4276, + "step": 31700 + }, + { + "epoch": 1.7751707918019934, + "grad_norm": 1.3021535873413086, + "learning_rate": 9.692447368421053e-05, + "loss": 0.3714, + "step": 31701 + }, + { + "epoch": 1.7752267891141225, + "grad_norm": 1.5191811323165894, + "learning_rate": 9.692421052631579e-05, + "loss": 0.4694, + "step": 31702 + }, + { + 
"epoch": 1.7752827864262515, + "grad_norm": 1.4879953861236572, + "learning_rate": 9.692394736842105e-05, + "loss": 0.5514, + "step": 31703 + }, + { + "epoch": 1.7753387837383805, + "grad_norm": 1.4922763109207153, + "learning_rate": 9.692368421052632e-05, + "loss": 0.637, + "step": 31704 + }, + { + "epoch": 1.7753947810505095, + "grad_norm": 1.2755488157272339, + "learning_rate": 9.692342105263158e-05, + "loss": 0.473, + "step": 31705 + }, + { + "epoch": 1.7754507783626385, + "grad_norm": 1.2252918481826782, + "learning_rate": 9.692315789473684e-05, + "loss": 0.3126, + "step": 31706 + }, + { + "epoch": 1.7755067756747676, + "grad_norm": 1.1464699506759644, + "learning_rate": 9.69228947368421e-05, + "loss": 0.4335, + "step": 31707 + }, + { + "epoch": 1.7755627729868966, + "grad_norm": 1.3130724430084229, + "learning_rate": 9.692263157894738e-05, + "loss": 0.3764, + "step": 31708 + }, + { + "epoch": 1.7756187702990256, + "grad_norm": 2.37186861038208, + "learning_rate": 9.692236842105264e-05, + "loss": 0.4217, + "step": 31709 + }, + { + "epoch": 1.7756747676111546, + "grad_norm": 1.2758076190948486, + "learning_rate": 9.692210526315791e-05, + "loss": 0.4001, + "step": 31710 + }, + { + "epoch": 1.7757307649232836, + "grad_norm": 1.1145694255828857, + "learning_rate": 9.692184210526316e-05, + "loss": 0.5205, + "step": 31711 + }, + { + "epoch": 1.7757867622354127, + "grad_norm": 1.9133871793746948, + "learning_rate": 9.692157894736843e-05, + "loss": 0.4757, + "step": 31712 + }, + { + "epoch": 1.7758427595475417, + "grad_norm": 1.3190933465957642, + "learning_rate": 9.692131578947369e-05, + "loss": 0.5608, + "step": 31713 + }, + { + "epoch": 1.7758987568596707, + "grad_norm": 2.1781973838806152, + "learning_rate": 9.692105263157896e-05, + "loss": 0.4955, + "step": 31714 + }, + { + "epoch": 1.7759547541717997, + "grad_norm": 1.279032826423645, + "learning_rate": 9.692078947368422e-05, + "loss": 0.4946, + "step": 31715 + }, + { + "epoch": 1.7760107514839287, + 
"grad_norm": 1.3642264604568481, + "learning_rate": 9.692052631578948e-05, + "loss": 0.6054, + "step": 31716 + }, + { + "epoch": 1.7760667487960577, + "grad_norm": 1.3150238990783691, + "learning_rate": 9.692026315789474e-05, + "loss": 0.3583, + "step": 31717 + }, + { + "epoch": 1.7761227461081868, + "grad_norm": 1.5483708381652832, + "learning_rate": 9.692e-05, + "loss": 0.5425, + "step": 31718 + }, + { + "epoch": 1.7761787434203158, + "grad_norm": 1.29545259475708, + "learning_rate": 9.691973684210527e-05, + "loss": 0.569, + "step": 31719 + }, + { + "epoch": 1.7762347407324448, + "grad_norm": 1.1991771459579468, + "learning_rate": 9.691947368421053e-05, + "loss": 0.5426, + "step": 31720 + }, + { + "epoch": 1.7762907380445738, + "grad_norm": 1.1908551454544067, + "learning_rate": 9.691921052631579e-05, + "loss": 0.4921, + "step": 31721 + }, + { + "epoch": 1.7763467353567028, + "grad_norm": 1.421525001525879, + "learning_rate": 9.691894736842105e-05, + "loss": 0.6218, + "step": 31722 + }, + { + "epoch": 1.7764027326688319, + "grad_norm": 1.1339898109436035, + "learning_rate": 9.691868421052633e-05, + "loss": 0.4113, + "step": 31723 + }, + { + "epoch": 1.7764587299809609, + "grad_norm": 1.093232274055481, + "learning_rate": 9.691842105263159e-05, + "loss": 0.362, + "step": 31724 + }, + { + "epoch": 1.77651472729309, + "grad_norm": 1.2974497079849243, + "learning_rate": 9.691815789473685e-05, + "loss": 0.378, + "step": 31725 + }, + { + "epoch": 1.776570724605219, + "grad_norm": 1.2342931032180786, + "learning_rate": 9.69178947368421e-05, + "loss": 0.4887, + "step": 31726 + }, + { + "epoch": 1.776626721917348, + "grad_norm": 1.3725272417068481, + "learning_rate": 9.691763157894738e-05, + "loss": 0.4088, + "step": 31727 + }, + { + "epoch": 1.776682719229477, + "grad_norm": 1.853804349899292, + "learning_rate": 9.691736842105264e-05, + "loss": 0.4583, + "step": 31728 + }, + { + "epoch": 1.776738716541606, + "grad_norm": 1.1662169694900513, + "learning_rate": 
9.69171052631579e-05, + "loss": 0.3787, + "step": 31729 + }, + { + "epoch": 1.776794713853735, + "grad_norm": 1.2693150043487549, + "learning_rate": 9.691684210526316e-05, + "loss": 0.4017, + "step": 31730 + }, + { + "epoch": 1.776850711165864, + "grad_norm": 1.52790367603302, + "learning_rate": 9.691657894736843e-05, + "loss": 0.5531, + "step": 31731 + }, + { + "epoch": 1.776906708477993, + "grad_norm": 1.3320693969726562, + "learning_rate": 9.691631578947369e-05, + "loss": 0.4455, + "step": 31732 + }, + { + "epoch": 1.776962705790122, + "grad_norm": 1.2238317728042603, + "learning_rate": 9.691605263157896e-05, + "loss": 0.5328, + "step": 31733 + }, + { + "epoch": 1.777018703102251, + "grad_norm": 1.2253999710083008, + "learning_rate": 9.691578947368421e-05, + "loss": 0.4084, + "step": 31734 + }, + { + "epoch": 1.77707470041438, + "grad_norm": 1.1530673503875732, + "learning_rate": 9.691552631578947e-05, + "loss": 0.3968, + "step": 31735 + }, + { + "epoch": 1.7771306977265091, + "grad_norm": 1.1781492233276367, + "learning_rate": 9.691526315789474e-05, + "loss": 0.4335, + "step": 31736 + }, + { + "epoch": 1.7771866950386381, + "grad_norm": 1.2266974449157715, + "learning_rate": 9.6915e-05, + "loss": 0.4273, + "step": 31737 + }, + { + "epoch": 1.7772426923507672, + "grad_norm": 1.2361249923706055, + "learning_rate": 9.691473684210526e-05, + "loss": 0.4069, + "step": 31738 + }, + { + "epoch": 1.7772986896628962, + "grad_norm": 1.9679622650146484, + "learning_rate": 9.691447368421052e-05, + "loss": 0.5161, + "step": 31739 + }, + { + "epoch": 1.7773546869750252, + "grad_norm": 1.1759806871414185, + "learning_rate": 9.69142105263158e-05, + "loss": 0.4516, + "step": 31740 + }, + { + "epoch": 1.7774106842871542, + "grad_norm": 1.8541347980499268, + "learning_rate": 9.691394736842106e-05, + "loss": 0.4834, + "step": 31741 + }, + { + "epoch": 1.7774666815992832, + "grad_norm": 1.1747126579284668, + "learning_rate": 9.691368421052633e-05, + "loss": 0.3733, + "step": 31742 + 
}, + { + "epoch": 1.7775226789114122, + "grad_norm": 1.3951380252838135, + "learning_rate": 9.691342105263157e-05, + "loss": 0.4915, + "step": 31743 + }, + { + "epoch": 1.7775786762235413, + "grad_norm": 1.2836487293243408, + "learning_rate": 9.691315789473685e-05, + "loss": 0.4666, + "step": 31744 + }, + { + "epoch": 1.7776346735356703, + "grad_norm": 1.1611849069595337, + "learning_rate": 9.691289473684211e-05, + "loss": 0.4434, + "step": 31745 + }, + { + "epoch": 1.7776906708477993, + "grad_norm": 1.3393611907958984, + "learning_rate": 9.691263157894738e-05, + "loss": 0.4784, + "step": 31746 + }, + { + "epoch": 1.7777466681599283, + "grad_norm": 1.1358977556228638, + "learning_rate": 9.691236842105264e-05, + "loss": 0.4414, + "step": 31747 + }, + { + "epoch": 1.7778026654720573, + "grad_norm": 1.4379245042800903, + "learning_rate": 9.69121052631579e-05, + "loss": 0.6311, + "step": 31748 + }, + { + "epoch": 1.7778586627841864, + "grad_norm": 1.019580364227295, + "learning_rate": 9.691184210526316e-05, + "loss": 0.3647, + "step": 31749 + }, + { + "epoch": 1.7779146600963154, + "grad_norm": 1.4219930171966553, + "learning_rate": 9.691157894736843e-05, + "loss": 0.5379, + "step": 31750 + }, + { + "epoch": 1.7779706574084444, + "grad_norm": 1.107913851737976, + "learning_rate": 9.69113157894737e-05, + "loss": 0.3647, + "step": 31751 + }, + { + "epoch": 1.7780266547205734, + "grad_norm": 1.3188385963439941, + "learning_rate": 9.691105263157895e-05, + "loss": 0.3607, + "step": 31752 + }, + { + "epoch": 1.7780826520327024, + "grad_norm": 1.370132565498352, + "learning_rate": 9.691078947368421e-05, + "loss": 0.4885, + "step": 31753 + }, + { + "epoch": 1.7781386493448315, + "grad_norm": 1.3688284158706665, + "learning_rate": 9.691052631578947e-05, + "loss": 0.4173, + "step": 31754 + }, + { + "epoch": 1.7781946466569605, + "grad_norm": 1.4062292575836182, + "learning_rate": 9.691026315789475e-05, + "loss": 0.3259, + "step": 31755 + }, + { + "epoch": 1.7782506439690895, + 
"grad_norm": 1.1700189113616943, + "learning_rate": 9.691e-05, + "loss": 0.4681, + "step": 31756 + }, + { + "epoch": 1.7783066412812185, + "grad_norm": 1.588115930557251, + "learning_rate": 9.690973684210527e-05, + "loss": 0.4796, + "step": 31757 + }, + { + "epoch": 1.7783626385933475, + "grad_norm": 1.4379734992980957, + "learning_rate": 9.690947368421053e-05, + "loss": 0.5249, + "step": 31758 + }, + { + "epoch": 1.7784186359054766, + "grad_norm": 1.4565213918685913, + "learning_rate": 9.69092105263158e-05, + "loss": 0.4669, + "step": 31759 + }, + { + "epoch": 1.7784746332176056, + "grad_norm": 1.3071175813674927, + "learning_rate": 9.690894736842106e-05, + "loss": 0.4272, + "step": 31760 + }, + { + "epoch": 1.7785306305297346, + "grad_norm": 1.2956658601760864, + "learning_rate": 9.690868421052632e-05, + "loss": 0.4045, + "step": 31761 + }, + { + "epoch": 1.7785866278418636, + "grad_norm": 1.0321497917175293, + "learning_rate": 9.690842105263158e-05, + "loss": 0.3443, + "step": 31762 + }, + { + "epoch": 1.7786426251539926, + "grad_norm": 1.2439674139022827, + "learning_rate": 9.690815789473685e-05, + "loss": 0.422, + "step": 31763 + }, + { + "epoch": 1.7786986224661216, + "grad_norm": 1.3908679485321045, + "learning_rate": 9.690789473684211e-05, + "loss": 0.5081, + "step": 31764 + }, + { + "epoch": 1.7787546197782507, + "grad_norm": 1.2918809652328491, + "learning_rate": 9.690763157894738e-05, + "loss": 0.5505, + "step": 31765 + }, + { + "epoch": 1.7788106170903797, + "grad_norm": 1.4847381114959717, + "learning_rate": 9.690736842105263e-05, + "loss": 0.6401, + "step": 31766 + }, + { + "epoch": 1.7788666144025087, + "grad_norm": 1.180594801902771, + "learning_rate": 9.69071052631579e-05, + "loss": 0.3993, + "step": 31767 + }, + { + "epoch": 1.7789226117146377, + "grad_norm": 1.1299070119857788, + "learning_rate": 9.690684210526316e-05, + "loss": 0.3554, + "step": 31768 + }, + { + "epoch": 1.7789786090267667, + "grad_norm": 1.3463633060455322, + "learning_rate": 
9.690657894736842e-05, + "loss": 0.3062, + "step": 31769 + }, + { + "epoch": 1.7790346063388958, + "grad_norm": 0.982157826423645, + "learning_rate": 9.69063157894737e-05, + "loss": 0.3706, + "step": 31770 + }, + { + "epoch": 1.7790906036510248, + "grad_norm": 1.5274629592895508, + "learning_rate": 9.690605263157894e-05, + "loss": 0.4789, + "step": 31771 + }, + { + "epoch": 1.7791466009631538, + "grad_norm": 1.5049922466278076, + "learning_rate": 9.690578947368422e-05, + "loss": 0.5137, + "step": 31772 + }, + { + "epoch": 1.7792025982752828, + "grad_norm": 1.152225375175476, + "learning_rate": 9.690552631578948e-05, + "loss": 0.4106, + "step": 31773 + }, + { + "epoch": 1.7792585955874118, + "grad_norm": 1.4260683059692383, + "learning_rate": 9.690526315789475e-05, + "loss": 0.5347, + "step": 31774 + }, + { + "epoch": 1.7793145928995409, + "grad_norm": 2.1656501293182373, + "learning_rate": 9.6905e-05, + "loss": 0.6394, + "step": 31775 + }, + { + "epoch": 1.7793705902116699, + "grad_norm": 1.3388731479644775, + "learning_rate": 9.690473684210527e-05, + "loss": 0.4603, + "step": 31776 + }, + { + "epoch": 1.779426587523799, + "grad_norm": 1.748445749282837, + "learning_rate": 9.690447368421053e-05, + "loss": 0.5135, + "step": 31777 + }, + { + "epoch": 1.779482584835928, + "grad_norm": 1.1211421489715576, + "learning_rate": 9.69042105263158e-05, + "loss": 0.3823, + "step": 31778 + }, + { + "epoch": 1.779538582148057, + "grad_norm": 1.2308905124664307, + "learning_rate": 9.690394736842106e-05, + "loss": 0.6448, + "step": 31779 + }, + { + "epoch": 1.779594579460186, + "grad_norm": 1.1907392740249634, + "learning_rate": 9.690368421052632e-05, + "loss": 0.4445, + "step": 31780 + }, + { + "epoch": 1.779650576772315, + "grad_norm": 1.2433491945266724, + "learning_rate": 9.690342105263158e-05, + "loss": 0.4855, + "step": 31781 + }, + { + "epoch": 1.779706574084444, + "grad_norm": 1.5997979640960693, + "learning_rate": 9.690315789473685e-05, + "loss": 0.5141, + "step": 31782 + 
}, + { + "epoch": 1.779762571396573, + "grad_norm": 1.5376241207122803, + "learning_rate": 9.690289473684211e-05, + "loss": 0.4068, + "step": 31783 + }, + { + "epoch": 1.779818568708702, + "grad_norm": 1.3327856063842773, + "learning_rate": 9.690263157894737e-05, + "loss": 0.4554, + "step": 31784 + }, + { + "epoch": 1.779874566020831, + "grad_norm": 1.2942867279052734, + "learning_rate": 9.690236842105263e-05, + "loss": 0.4102, + "step": 31785 + }, + { + "epoch": 1.77993056333296, + "grad_norm": 1.1606141328811646, + "learning_rate": 9.690210526315789e-05, + "loss": 0.4507, + "step": 31786 + }, + { + "epoch": 1.779986560645089, + "grad_norm": 1.7103277444839478, + "learning_rate": 9.690184210526317e-05, + "loss": 0.6373, + "step": 31787 + }, + { + "epoch": 1.780042557957218, + "grad_norm": 1.3943488597869873, + "learning_rate": 9.690157894736843e-05, + "loss": 0.4965, + "step": 31788 + }, + { + "epoch": 1.7800985552693471, + "grad_norm": 1.1858793497085571, + "learning_rate": 9.690131578947369e-05, + "loss": 0.4665, + "step": 31789 + }, + { + "epoch": 1.7801545525814761, + "grad_norm": 1.5315145254135132, + "learning_rate": 9.690105263157894e-05, + "loss": 0.4587, + "step": 31790 + }, + { + "epoch": 1.7802105498936052, + "grad_norm": 1.3065170049667358, + "learning_rate": 9.690078947368422e-05, + "loss": 0.4388, + "step": 31791 + }, + { + "epoch": 1.7802665472057342, + "grad_norm": 1.2521973848342896, + "learning_rate": 9.690052631578948e-05, + "loss": 0.3834, + "step": 31792 + }, + { + "epoch": 1.7803225445178632, + "grad_norm": 1.2697452306747437, + "learning_rate": 9.690026315789474e-05, + "loss": 0.5289, + "step": 31793 + }, + { + "epoch": 1.7803785418299922, + "grad_norm": 1.2355313301086426, + "learning_rate": 9.69e-05, + "loss": 0.4728, + "step": 31794 + }, + { + "epoch": 1.7804345391421212, + "grad_norm": 1.1875896453857422, + "learning_rate": 9.689973684210527e-05, + "loss": 0.4261, + "step": 31795 + }, + { + "epoch": 1.7804905364542503, + "grad_norm": 
1.2282297611236572, + "learning_rate": 9.689947368421053e-05, + "loss": 0.5512, + "step": 31796 + }, + { + "epoch": 1.7805465337663793, + "grad_norm": 1.7952834367752075, + "learning_rate": 9.68992105263158e-05, + "loss": 0.4245, + "step": 31797 + }, + { + "epoch": 1.7806025310785083, + "grad_norm": 2.2149720191955566, + "learning_rate": 9.689894736842105e-05, + "loss": 0.4552, + "step": 31798 + }, + { + "epoch": 1.7806585283906373, + "grad_norm": 1.2255440950393677, + "learning_rate": 9.689868421052632e-05, + "loss": 0.4927, + "step": 31799 + }, + { + "epoch": 1.7807145257027663, + "grad_norm": 1.1667736768722534, + "learning_rate": 9.689842105263158e-05, + "loss": 0.3893, + "step": 31800 + }, + { + "epoch": 1.7807705230148954, + "grad_norm": 1.3435007333755493, + "learning_rate": 9.689815789473686e-05, + "loss": 0.4568, + "step": 31801 + }, + { + "epoch": 1.7808265203270244, + "grad_norm": 1.2837270498275757, + "learning_rate": 9.689789473684212e-05, + "loss": 0.4608, + "step": 31802 + }, + { + "epoch": 1.7808825176391534, + "grad_norm": 1.213241457939148, + "learning_rate": 9.689763157894736e-05, + "loss": 0.4059, + "step": 31803 + }, + { + "epoch": 1.7809385149512824, + "grad_norm": 1.6467320919036865, + "learning_rate": 9.689736842105264e-05, + "loss": 0.4102, + "step": 31804 + }, + { + "epoch": 1.7809945122634114, + "grad_norm": 1.0240404605865479, + "learning_rate": 9.68971052631579e-05, + "loss": 0.3627, + "step": 31805 + }, + { + "epoch": 1.7810505095755405, + "grad_norm": 1.316407561302185, + "learning_rate": 9.689684210526317e-05, + "loss": 0.4339, + "step": 31806 + }, + { + "epoch": 1.7811065068876695, + "grad_norm": 1.584269404411316, + "learning_rate": 9.689657894736843e-05, + "loss": 0.5503, + "step": 31807 + }, + { + "epoch": 1.7811625041997985, + "grad_norm": 1.0722568035125732, + "learning_rate": 9.689631578947369e-05, + "loss": 0.4073, + "step": 31808 + }, + { + "epoch": 1.7812185015119275, + "grad_norm": 1.6375370025634766, + "learning_rate": 
9.689605263157895e-05, + "loss": 0.6101, + "step": 31809 + }, + { + "epoch": 1.7812744988240565, + "grad_norm": 1.1459671258926392, + "learning_rate": 9.689578947368422e-05, + "loss": 0.4613, + "step": 31810 + }, + { + "epoch": 1.7813304961361855, + "grad_norm": 1.2400115728378296, + "learning_rate": 9.689552631578948e-05, + "loss": 0.4566, + "step": 31811 + }, + { + "epoch": 1.7813864934483146, + "grad_norm": 1.147714614868164, + "learning_rate": 9.689526315789474e-05, + "loss": 0.4565, + "step": 31812 + }, + { + "epoch": 1.7814424907604436, + "grad_norm": 1.325195550918579, + "learning_rate": 9.6895e-05, + "loss": 0.4775, + "step": 31813 + }, + { + "epoch": 1.7814984880725726, + "grad_norm": 1.9836220741271973, + "learning_rate": 9.689473684210527e-05, + "loss": 0.5257, + "step": 31814 + }, + { + "epoch": 1.7815544853847016, + "grad_norm": 1.124781847000122, + "learning_rate": 9.689447368421053e-05, + "loss": 0.391, + "step": 31815 + }, + { + "epoch": 1.7816104826968306, + "grad_norm": 1.200110673904419, + "learning_rate": 9.689421052631579e-05, + "loss": 0.4523, + "step": 31816 + }, + { + "epoch": 1.7816664800089597, + "grad_norm": 1.1694585084915161, + "learning_rate": 9.689394736842105e-05, + "loss": 0.3514, + "step": 31817 + }, + { + "epoch": 1.7817224773210887, + "grad_norm": 1.551188588142395, + "learning_rate": 9.689368421052633e-05, + "loss": 0.5292, + "step": 31818 + }, + { + "epoch": 1.7817784746332177, + "grad_norm": 1.1470431089401245, + "learning_rate": 9.689342105263159e-05, + "loss": 0.5614, + "step": 31819 + }, + { + "epoch": 1.7818344719453467, + "grad_norm": 1.3186626434326172, + "learning_rate": 9.689315789473684e-05, + "loss": 0.4121, + "step": 31820 + }, + { + "epoch": 1.7818904692574757, + "grad_norm": 0.9956709742546082, + "learning_rate": 9.68928947368421e-05, + "loss": 0.3453, + "step": 31821 + }, + { + "epoch": 1.7819464665696048, + "grad_norm": 1.278331995010376, + "learning_rate": 9.689263157894736e-05, + "loss": 0.3733, + "step": 
31822 + }, + { + "epoch": 1.7820024638817338, + "grad_norm": 1.5626790523529053, + "learning_rate": 9.689236842105264e-05, + "loss": 0.477, + "step": 31823 + }, + { + "epoch": 1.7820584611938628, + "grad_norm": 1.3329910039901733, + "learning_rate": 9.68921052631579e-05, + "loss": 0.5075, + "step": 31824 + }, + { + "epoch": 1.7821144585059918, + "grad_norm": 1.240760087966919, + "learning_rate": 9.689184210526317e-05, + "loss": 0.4999, + "step": 31825 + }, + { + "epoch": 1.7821704558181208, + "grad_norm": 1.1359931230545044, + "learning_rate": 9.689157894736842e-05, + "loss": 0.3785, + "step": 31826 + }, + { + "epoch": 1.7822264531302499, + "grad_norm": 12.997991561889648, + "learning_rate": 9.689131578947369e-05, + "loss": 0.5553, + "step": 31827 + }, + { + "epoch": 1.7822824504423789, + "grad_norm": 1.2757093906402588, + "learning_rate": 9.689105263157895e-05, + "loss": 0.4396, + "step": 31828 + }, + { + "epoch": 1.782338447754508, + "grad_norm": 1.2319806814193726, + "learning_rate": 9.689078947368422e-05, + "loss": 0.4742, + "step": 31829 + }, + { + "epoch": 1.782394445066637, + "grad_norm": 1.4098560810089111, + "learning_rate": 9.689052631578947e-05, + "loss": 0.4182, + "step": 31830 + }, + { + "epoch": 1.782450442378766, + "grad_norm": 1.340309500694275, + "learning_rate": 9.689026315789474e-05, + "loss": 0.4386, + "step": 31831 + }, + { + "epoch": 1.782506439690895, + "grad_norm": 1.3434604406356812, + "learning_rate": 9.689e-05, + "loss": 0.4141, + "step": 31832 + }, + { + "epoch": 1.782562437003024, + "grad_norm": 1.1592700481414795, + "learning_rate": 9.688973684210528e-05, + "loss": 0.3565, + "step": 31833 + }, + { + "epoch": 1.782618434315153, + "grad_norm": 1.4475698471069336, + "learning_rate": 9.688947368421054e-05, + "loss": 0.526, + "step": 31834 + }, + { + "epoch": 1.782674431627282, + "grad_norm": 1.2637836933135986, + "learning_rate": 9.68892105263158e-05, + "loss": 0.5198, + "step": 31835 + }, + { + "epoch": 1.782730428939411, + "grad_norm": 
1.7472718954086304, + "learning_rate": 9.688894736842105e-05, + "loss": 0.5239, + "step": 31836 + }, + { + "epoch": 1.78278642625154, + "grad_norm": 1.2594273090362549, + "learning_rate": 9.688868421052633e-05, + "loss": 0.6498, + "step": 31837 + }, + { + "epoch": 1.782842423563669, + "grad_norm": 1.727853775024414, + "learning_rate": 9.688842105263159e-05, + "loss": 0.6166, + "step": 31838 + }, + { + "epoch": 1.782898420875798, + "grad_norm": 1.330910086631775, + "learning_rate": 9.688815789473685e-05, + "loss": 0.5106, + "step": 31839 + }, + { + "epoch": 1.782954418187927, + "grad_norm": 1.4381299018859863, + "learning_rate": 9.688789473684211e-05, + "loss": 0.4625, + "step": 31840 + }, + { + "epoch": 1.7830104155000561, + "grad_norm": 1.1630643606185913, + "learning_rate": 9.688763157894737e-05, + "loss": 0.5307, + "step": 31841 + }, + { + "epoch": 1.7830664128121851, + "grad_norm": 1.0656017065048218, + "learning_rate": 9.688736842105264e-05, + "loss": 0.431, + "step": 31842 + }, + { + "epoch": 1.7831224101243142, + "grad_norm": 1.2093061208724976, + "learning_rate": 9.68871052631579e-05, + "loss": 0.3468, + "step": 31843 + }, + { + "epoch": 1.7831784074364432, + "grad_norm": 1.1307623386383057, + "learning_rate": 9.688684210526316e-05, + "loss": 0.3215, + "step": 31844 + }, + { + "epoch": 1.7832344047485722, + "grad_norm": 1.2576932907104492, + "learning_rate": 9.688657894736842e-05, + "loss": 0.4666, + "step": 31845 + }, + { + "epoch": 1.7832904020607012, + "grad_norm": 1.3892356157302856, + "learning_rate": 9.688631578947369e-05, + "loss": 0.3826, + "step": 31846 + }, + { + "epoch": 1.7833463993728302, + "grad_norm": 1.03448486328125, + "learning_rate": 9.688605263157895e-05, + "loss": 0.3517, + "step": 31847 + }, + { + "epoch": 1.7834023966849593, + "grad_norm": 1.18561851978302, + "learning_rate": 9.688578947368421e-05, + "loss": 0.4066, + "step": 31848 + }, + { + "epoch": 1.7834583939970883, + "grad_norm": 1.6200153827667236, + "learning_rate": 
9.688552631578947e-05, + "loss": 0.4927, + "step": 31849 + }, + { + "epoch": 1.7835143913092173, + "grad_norm": 1.7550350427627563, + "learning_rate": 9.688526315789475e-05, + "loss": 0.4092, + "step": 31850 + }, + { + "epoch": 1.7835703886213463, + "grad_norm": 1.1973174810409546, + "learning_rate": 9.6885e-05, + "loss": 0.4021, + "step": 31851 + }, + { + "epoch": 1.7836263859334753, + "grad_norm": 1.3861836194992065, + "learning_rate": 9.688473684210528e-05, + "loss": 0.5933, + "step": 31852 + }, + { + "epoch": 1.7836823832456044, + "grad_norm": 1.1747161149978638, + "learning_rate": 9.688447368421052e-05, + "loss": 0.5082, + "step": 31853 + }, + { + "epoch": 1.7837383805577334, + "grad_norm": 1.2632472515106201, + "learning_rate": 9.68842105263158e-05, + "loss": 0.4313, + "step": 31854 + }, + { + "epoch": 1.7837943778698624, + "grad_norm": 1.205872893333435, + "learning_rate": 9.688394736842106e-05, + "loss": 0.4356, + "step": 31855 + }, + { + "epoch": 1.7838503751819914, + "grad_norm": 1.2383676767349243, + "learning_rate": 9.688368421052632e-05, + "loss": 0.3702, + "step": 31856 + }, + { + "epoch": 1.7839063724941204, + "grad_norm": 1.3884590864181519, + "learning_rate": 9.688342105263159e-05, + "loss": 0.6019, + "step": 31857 + }, + { + "epoch": 1.7839623698062494, + "grad_norm": 1.1572792530059814, + "learning_rate": 9.688315789473684e-05, + "loss": 0.3724, + "step": 31858 + }, + { + "epoch": 1.7840183671183785, + "grad_norm": 1.1126223802566528, + "learning_rate": 9.688289473684211e-05, + "loss": 0.4468, + "step": 31859 + }, + { + "epoch": 1.7840743644305075, + "grad_norm": 1.447487235069275, + "learning_rate": 9.688263157894737e-05, + "loss": 0.4622, + "step": 31860 + }, + { + "epoch": 1.7841303617426365, + "grad_norm": 1.2140978574752808, + "learning_rate": 9.688236842105264e-05, + "loss": 0.4691, + "step": 31861 + }, + { + "epoch": 1.7841863590547655, + "grad_norm": 1.0958054065704346, + "learning_rate": 9.68821052631579e-05, + "loss": 0.5264, + "step": 
31862 + }, + { + "epoch": 1.7842423563668945, + "grad_norm": 1.5366441011428833, + "learning_rate": 9.688184210526316e-05, + "loss": 0.4639, + "step": 31863 + }, + { + "epoch": 1.7842983536790236, + "grad_norm": 1.129201054573059, + "learning_rate": 9.688157894736842e-05, + "loss": 0.4285, + "step": 31864 + }, + { + "epoch": 1.7843543509911526, + "grad_norm": 1.1247044801712036, + "learning_rate": 9.68813157894737e-05, + "loss": 0.3693, + "step": 31865 + }, + { + "epoch": 1.7844103483032816, + "grad_norm": 1.5763261318206787, + "learning_rate": 9.688105263157896e-05, + "loss": 0.5131, + "step": 31866 + }, + { + "epoch": 1.7844663456154106, + "grad_norm": 1.141681432723999, + "learning_rate": 9.688078947368421e-05, + "loss": 0.5572, + "step": 31867 + }, + { + "epoch": 1.7845223429275396, + "grad_norm": 1.4665062427520752, + "learning_rate": 9.688052631578947e-05, + "loss": 0.5142, + "step": 31868 + }, + { + "epoch": 1.7845783402396687, + "grad_norm": 1.1503149271011353, + "learning_rate": 9.688026315789475e-05, + "loss": 0.3358, + "step": 31869 + }, + { + "epoch": 1.7846343375517977, + "grad_norm": 1.065591812133789, + "learning_rate": 9.688000000000001e-05, + "loss": 0.3363, + "step": 31870 + }, + { + "epoch": 1.7846903348639267, + "grad_norm": 1.360927700996399, + "learning_rate": 9.687973684210527e-05, + "loss": 0.3881, + "step": 31871 + }, + { + "epoch": 1.7847463321760555, + "grad_norm": 1.1576305627822876, + "learning_rate": 9.687947368421053e-05, + "loss": 0.4286, + "step": 31872 + }, + { + "epoch": 1.7848023294881845, + "grad_norm": 1.765459656715393, + "learning_rate": 9.687921052631579e-05, + "loss": 0.6039, + "step": 31873 + }, + { + "epoch": 1.7848583268003135, + "grad_norm": 1.1548362970352173, + "learning_rate": 9.687894736842106e-05, + "loss": 0.5625, + "step": 31874 + }, + { + "epoch": 1.7849143241124426, + "grad_norm": 1.1519672870635986, + "learning_rate": 9.687868421052632e-05, + "loss": 0.4924, + "step": 31875 + }, + { + "epoch": 
1.7849703214245716, + "grad_norm": 1.2811933755874634, + "learning_rate": 9.687842105263158e-05, + "loss": 0.492, + "step": 31876 + }, + { + "epoch": 1.7850263187367006, + "grad_norm": 1.201781153678894, + "learning_rate": 9.687815789473684e-05, + "loss": 0.3948, + "step": 31877 + }, + { + "epoch": 1.7850823160488296, + "grad_norm": 1.2015191316604614, + "learning_rate": 9.687789473684211e-05, + "loss": 0.3957, + "step": 31878 + }, + { + "epoch": 1.7851383133609586, + "grad_norm": 1.3081945180892944, + "learning_rate": 9.687763157894737e-05, + "loss": 0.4933, + "step": 31879 + }, + { + "epoch": 1.7851943106730876, + "grad_norm": 1.3227640390396118, + "learning_rate": 9.687736842105265e-05, + "loss": 0.4365, + "step": 31880 + }, + { + "epoch": 1.7852503079852167, + "grad_norm": 1.1125644445419312, + "learning_rate": 9.687710526315789e-05, + "loss": 0.4648, + "step": 31881 + }, + { + "epoch": 1.7853063052973457, + "grad_norm": 1.2398297786712646, + "learning_rate": 9.687684210526316e-05, + "loss": 0.387, + "step": 31882 + }, + { + "epoch": 1.7853623026094747, + "grad_norm": 1.2380250692367554, + "learning_rate": 9.687657894736842e-05, + "loss": 0.4519, + "step": 31883 + }, + { + "epoch": 1.7854182999216037, + "grad_norm": 1.1946462392807007, + "learning_rate": 9.68763157894737e-05, + "loss": 0.476, + "step": 31884 + }, + { + "epoch": 1.7854742972337327, + "grad_norm": 1.0793193578720093, + "learning_rate": 9.687605263157894e-05, + "loss": 0.4237, + "step": 31885 + }, + { + "epoch": 1.7855302945458618, + "grad_norm": 1.292043685913086, + "learning_rate": 9.687578947368422e-05, + "loss": 0.4286, + "step": 31886 + }, + { + "epoch": 1.7855862918579908, + "grad_norm": 1.3193550109863281, + "learning_rate": 9.687552631578948e-05, + "loss": 0.4822, + "step": 31887 + }, + { + "epoch": 1.7856422891701198, + "grad_norm": 1.0490700006484985, + "learning_rate": 9.687526315789475e-05, + "loss": 0.4018, + "step": 31888 + }, + { + "epoch": 1.7856982864822488, + "grad_norm": 
2.052964210510254, + "learning_rate": 9.687500000000001e-05, + "loss": 0.5584, + "step": 31889 + }, + { + "epoch": 1.7857542837943778, + "grad_norm": 1.2843341827392578, + "learning_rate": 9.687473684210526e-05, + "loss": 0.4075, + "step": 31890 + }, + { + "epoch": 1.7858102811065069, + "grad_norm": 1.354588270187378, + "learning_rate": 9.687447368421053e-05, + "loss": 0.3725, + "step": 31891 + }, + { + "epoch": 1.7858662784186359, + "grad_norm": 1.2183945178985596, + "learning_rate": 9.687421052631579e-05, + "loss": 0.4345, + "step": 31892 + }, + { + "epoch": 1.785922275730765, + "grad_norm": 1.8452256917953491, + "learning_rate": 9.687394736842106e-05, + "loss": 0.4525, + "step": 31893 + }, + { + "epoch": 1.785978273042894, + "grad_norm": 1.266893744468689, + "learning_rate": 9.687368421052632e-05, + "loss": 0.4361, + "step": 31894 + }, + { + "epoch": 1.786034270355023, + "grad_norm": 1.3150335550308228, + "learning_rate": 9.687342105263158e-05, + "loss": 0.4323, + "step": 31895 + }, + { + "epoch": 1.786090267667152, + "grad_norm": 1.535183072090149, + "learning_rate": 9.687315789473684e-05, + "loss": 0.4749, + "step": 31896 + }, + { + "epoch": 1.786146264979281, + "grad_norm": 1.301011085510254, + "learning_rate": 9.687289473684212e-05, + "loss": 0.4234, + "step": 31897 + }, + { + "epoch": 1.78620226229141, + "grad_norm": 1.994390606880188, + "learning_rate": 9.687263157894737e-05, + "loss": 0.4824, + "step": 31898 + }, + { + "epoch": 1.786258259603539, + "grad_norm": 1.3605796098709106, + "learning_rate": 9.687236842105263e-05, + "loss": 0.457, + "step": 31899 + }, + { + "epoch": 1.786314256915668, + "grad_norm": 1.096146583557129, + "learning_rate": 9.68721052631579e-05, + "loss": 0.3761, + "step": 31900 + }, + { + "epoch": 1.786370254227797, + "grad_norm": 1.2055675983428955, + "learning_rate": 9.687184210526317e-05, + "loss": 0.4838, + "step": 31901 + }, + { + "epoch": 1.786426251539926, + "grad_norm": 1.2578544616699219, + "learning_rate": 
9.687157894736843e-05, + "loss": 0.4031, + "step": 31902 + }, + { + "epoch": 1.786482248852055, + "grad_norm": 1.4588252305984497, + "learning_rate": 9.687131578947369e-05, + "loss": 0.4556, + "step": 31903 + }, + { + "epoch": 1.786538246164184, + "grad_norm": 2.3814427852630615, + "learning_rate": 9.687105263157895e-05, + "loss": 0.4625, + "step": 31904 + }, + { + "epoch": 1.7865942434763131, + "grad_norm": 1.379195213317871, + "learning_rate": 9.687078947368422e-05, + "loss": 0.495, + "step": 31905 + }, + { + "epoch": 1.7866502407884421, + "grad_norm": 1.1372191905975342, + "learning_rate": 9.687052631578948e-05, + "loss": 0.3828, + "step": 31906 + }, + { + "epoch": 1.7867062381005712, + "grad_norm": 1.2545710802078247, + "learning_rate": 9.687026315789474e-05, + "loss": 0.6164, + "step": 31907 + }, + { + "epoch": 1.7867622354127002, + "grad_norm": 1.309998631477356, + "learning_rate": 9.687e-05, + "loss": 0.4626, + "step": 31908 + }, + { + "epoch": 1.7868182327248292, + "grad_norm": 1.2679873704910278, + "learning_rate": 9.686973684210526e-05, + "loss": 0.4296, + "step": 31909 + }, + { + "epoch": 1.7868742300369582, + "grad_norm": 1.700130581855774, + "learning_rate": 9.686947368421053e-05, + "loss": 0.5619, + "step": 31910 + }, + { + "epoch": 1.7869302273490872, + "grad_norm": 1.367583155632019, + "learning_rate": 9.686921052631579e-05, + "loss": 0.6026, + "step": 31911 + }, + { + "epoch": 1.7869862246612163, + "grad_norm": 1.5523277521133423, + "learning_rate": 9.686894736842107e-05, + "loss": 0.3854, + "step": 31912 + }, + { + "epoch": 1.7870422219733453, + "grad_norm": 1.2146656513214111, + "learning_rate": 9.686868421052631e-05, + "loss": 0.4527, + "step": 31913 + }, + { + "epoch": 1.7870982192854743, + "grad_norm": 1.5013121366500854, + "learning_rate": 9.686842105263158e-05, + "loss": 0.4224, + "step": 31914 + }, + { + "epoch": 1.7871542165976033, + "grad_norm": 1.2803595066070557, + "learning_rate": 9.686815789473684e-05, + "loss": 0.4494, + "step": 
31915 + }, + { + "epoch": 1.7872102139097323, + "grad_norm": 1.6458550691604614, + "learning_rate": 9.686789473684212e-05, + "loss": 0.4969, + "step": 31916 + }, + { + "epoch": 1.7872662112218614, + "grad_norm": 1.4115514755249023, + "learning_rate": 9.686763157894738e-05, + "loss": 0.4788, + "step": 31917 + }, + { + "epoch": 1.7873222085339904, + "grad_norm": 1.4544378519058228, + "learning_rate": 9.686736842105264e-05, + "loss": 0.5141, + "step": 31918 + }, + { + "epoch": 1.7873782058461194, + "grad_norm": 1.1197162866592407, + "learning_rate": 9.68671052631579e-05, + "loss": 0.4807, + "step": 31919 + }, + { + "epoch": 1.7874342031582484, + "grad_norm": 1.3073194026947021, + "learning_rate": 9.686684210526317e-05, + "loss": 0.5076, + "step": 31920 + }, + { + "epoch": 1.7874902004703774, + "grad_norm": 1.1209121942520142, + "learning_rate": 9.686657894736843e-05, + "loss": 0.4475, + "step": 31921 + }, + { + "epoch": 1.7875461977825065, + "grad_norm": 1.3270071744918823, + "learning_rate": 9.686631578947369e-05, + "loss": 0.5461, + "step": 31922 + }, + { + "epoch": 1.7876021950946355, + "grad_norm": 1.3773369789123535, + "learning_rate": 9.686605263157895e-05, + "loss": 0.3995, + "step": 31923 + }, + { + "epoch": 1.7876581924067645, + "grad_norm": 1.25208580493927, + "learning_rate": 9.686578947368421e-05, + "loss": 0.422, + "step": 31924 + }, + { + "epoch": 1.7877141897188935, + "grad_norm": 1.4375327825546265, + "learning_rate": 9.686552631578948e-05, + "loss": 0.4481, + "step": 31925 + }, + { + "epoch": 1.7877701870310225, + "grad_norm": 1.1717000007629395, + "learning_rate": 9.686526315789474e-05, + "loss": 0.3859, + "step": 31926 + }, + { + "epoch": 1.7878261843431515, + "grad_norm": 1.3534276485443115, + "learning_rate": 9.6865e-05, + "loss": 0.5196, + "step": 31927 + }, + { + "epoch": 1.7878821816552806, + "grad_norm": 1.1850370168685913, + "learning_rate": 9.686473684210526e-05, + "loss": 0.4482, + "step": 31928 + }, + { + "epoch": 1.7879381789674096, + 
"grad_norm": 1.0730195045471191, + "learning_rate": 9.686447368421053e-05, + "loss": 0.3513, + "step": 31929 + }, + { + "epoch": 1.7879941762795386, + "grad_norm": 1.3491233587265015, + "learning_rate": 9.68642105263158e-05, + "loss": 0.466, + "step": 31930 + }, + { + "epoch": 1.7880501735916676, + "grad_norm": 1.3401660919189453, + "learning_rate": 9.686394736842105e-05, + "loss": 0.5137, + "step": 31931 + }, + { + "epoch": 1.7881061709037966, + "grad_norm": 1.0419365167617798, + "learning_rate": 9.686368421052631e-05, + "loss": 0.3629, + "step": 31932 + }, + { + "epoch": 1.7881621682159257, + "grad_norm": 1.366532564163208, + "learning_rate": 9.686342105263159e-05, + "loss": 0.4584, + "step": 31933 + }, + { + "epoch": 1.7882181655280547, + "grad_norm": 1.2646071910858154, + "learning_rate": 9.686315789473685e-05, + "loss": 0.5024, + "step": 31934 + }, + { + "epoch": 1.7882741628401837, + "grad_norm": 1.360987663269043, + "learning_rate": 9.686289473684212e-05, + "loss": 0.3943, + "step": 31935 + }, + { + "epoch": 1.7883301601523127, + "grad_norm": 1.5033934116363525, + "learning_rate": 9.686263157894737e-05, + "loss": 0.4614, + "step": 31936 + }, + { + "epoch": 1.7883861574644417, + "grad_norm": 1.2284682989120483, + "learning_rate": 9.686236842105264e-05, + "loss": 0.4583, + "step": 31937 + }, + { + "epoch": 1.7884421547765708, + "grad_norm": 1.2431849241256714, + "learning_rate": 9.68621052631579e-05, + "loss": 0.4344, + "step": 31938 + }, + { + "epoch": 1.7884981520886998, + "grad_norm": 1.0632922649383545, + "learning_rate": 9.686184210526317e-05, + "loss": 0.3701, + "step": 31939 + }, + { + "epoch": 1.7885541494008288, + "grad_norm": 1.4324793815612793, + "learning_rate": 9.686157894736842e-05, + "loss": 0.5288, + "step": 31940 + }, + { + "epoch": 1.7886101467129578, + "grad_norm": 1.2565683126449585, + "learning_rate": 9.686131578947368e-05, + "loss": 0.4015, + "step": 31941 + }, + { + "epoch": 1.7886661440250868, + "grad_norm": 1.568869709968567, + 
"learning_rate": 9.686105263157895e-05, + "loss": 0.4316, + "step": 31942 + }, + { + "epoch": 1.7887221413372159, + "grad_norm": 1.173990249633789, + "learning_rate": 9.686078947368421e-05, + "loss": 0.4195, + "step": 31943 + }, + { + "epoch": 1.7887781386493449, + "grad_norm": 1.4450441598892212, + "learning_rate": 9.686052631578948e-05, + "loss": 0.4647, + "step": 31944 + }, + { + "epoch": 1.788834135961474, + "grad_norm": 1.2189844846725464, + "learning_rate": 9.686026315789473e-05, + "loss": 0.5044, + "step": 31945 + }, + { + "epoch": 1.788890133273603, + "grad_norm": 1.457152009010315, + "learning_rate": 9.686e-05, + "loss": 0.5644, + "step": 31946 + }, + { + "epoch": 1.788946130585732, + "grad_norm": 1.3943636417388916, + "learning_rate": 9.685973684210526e-05, + "loss": 0.363, + "step": 31947 + }, + { + "epoch": 1.789002127897861, + "grad_norm": 1.5694568157196045, + "learning_rate": 9.685947368421054e-05, + "loss": 0.416, + "step": 31948 + }, + { + "epoch": 1.78905812520999, + "grad_norm": 1.3233616352081299, + "learning_rate": 9.68592105263158e-05, + "loss": 0.5417, + "step": 31949 + }, + { + "epoch": 1.789114122522119, + "grad_norm": 1.9907273054122925, + "learning_rate": 9.685894736842106e-05, + "loss": 0.4553, + "step": 31950 + }, + { + "epoch": 1.789170119834248, + "grad_norm": 1.4094562530517578, + "learning_rate": 9.685868421052632e-05, + "loss": 0.5634, + "step": 31951 + }, + { + "epoch": 1.789226117146377, + "grad_norm": 1.2532178163528442, + "learning_rate": 9.685842105263159e-05, + "loss": 0.4212, + "step": 31952 + }, + { + "epoch": 1.789282114458506, + "grad_norm": 1.1450448036193848, + "learning_rate": 9.685815789473685e-05, + "loss": 0.4174, + "step": 31953 + }, + { + "epoch": 1.789338111770635, + "grad_norm": 5.6791582107543945, + "learning_rate": 9.685789473684211e-05, + "loss": 0.4486, + "step": 31954 + }, + { + "epoch": 1.7893941090827639, + "grad_norm": 1.223522424697876, + "learning_rate": 9.685763157894737e-05, + "loss": 0.5301, + 
"step": 31955 + }, + { + "epoch": 1.7894501063948929, + "grad_norm": 1.2860651016235352, + "learning_rate": 9.685736842105264e-05, + "loss": 0.6105, + "step": 31956 + }, + { + "epoch": 1.789506103707022, + "grad_norm": 1.1884591579437256, + "learning_rate": 9.68571052631579e-05, + "loss": 0.4867, + "step": 31957 + }, + { + "epoch": 1.789562101019151, + "grad_norm": 1.3899544477462769, + "learning_rate": 9.685684210526316e-05, + "loss": 0.5301, + "step": 31958 + }, + { + "epoch": 1.78961809833128, + "grad_norm": 1.0374916791915894, + "learning_rate": 9.685657894736842e-05, + "loss": 0.3354, + "step": 31959 + }, + { + "epoch": 1.789674095643409, + "grad_norm": 1.2505470514297485, + "learning_rate": 9.685631578947368e-05, + "loss": 0.4811, + "step": 31960 + }, + { + "epoch": 1.789730092955538, + "grad_norm": 3.660677671432495, + "learning_rate": 9.685605263157895e-05, + "loss": 0.4431, + "step": 31961 + }, + { + "epoch": 1.789786090267667, + "grad_norm": 1.2718796730041504, + "learning_rate": 9.685578947368421e-05, + "loss": 0.439, + "step": 31962 + }, + { + "epoch": 1.789842087579796, + "grad_norm": 1.0270181894302368, + "learning_rate": 9.685552631578947e-05, + "loss": 0.3657, + "step": 31963 + }, + { + "epoch": 1.789898084891925, + "grad_norm": 1.5077179670333862, + "learning_rate": 9.685526315789473e-05, + "loss": 0.7349, + "step": 31964 + }, + { + "epoch": 1.789954082204054, + "grad_norm": 1.1391503810882568, + "learning_rate": 9.685500000000001e-05, + "loss": 0.3734, + "step": 31965 + }, + { + "epoch": 1.790010079516183, + "grad_norm": 1.2708402872085571, + "learning_rate": 9.685473684210527e-05, + "loss": 0.4743, + "step": 31966 + }, + { + "epoch": 1.790066076828312, + "grad_norm": 1.5185188055038452, + "learning_rate": 9.685447368421054e-05, + "loss": 0.3838, + "step": 31967 + }, + { + "epoch": 1.790122074140441, + "grad_norm": 1.0767401456832886, + "learning_rate": 9.685421052631579e-05, + "loss": 0.332, + "step": 31968 + }, + { + "epoch": 1.7901780714525701, 
+ "grad_norm": 2.1739299297332764, + "learning_rate": 9.685394736842106e-05, + "loss": 0.5035, + "step": 31969 + }, + { + "epoch": 1.7902340687646991, + "grad_norm": 1.1915934085845947, + "learning_rate": 9.685368421052632e-05, + "loss": 0.3914, + "step": 31970 + }, + { + "epoch": 1.7902900660768282, + "grad_norm": 1.3421036005020142, + "learning_rate": 9.685342105263159e-05, + "loss": 0.4314, + "step": 31971 + }, + { + "epoch": 1.7903460633889572, + "grad_norm": 1.072020411491394, + "learning_rate": 9.685315789473685e-05, + "loss": 0.3705, + "step": 31972 + }, + { + "epoch": 1.7904020607010862, + "grad_norm": 1.4651178121566772, + "learning_rate": 9.685289473684211e-05, + "loss": 0.4829, + "step": 31973 + }, + { + "epoch": 1.7904580580132152, + "grad_norm": 1.6425532102584839, + "learning_rate": 9.685263157894737e-05, + "loss": 0.5433, + "step": 31974 + }, + { + "epoch": 1.7905140553253442, + "grad_norm": 1.3565775156021118, + "learning_rate": 9.685236842105264e-05, + "loss": 0.4022, + "step": 31975 + }, + { + "epoch": 1.7905700526374733, + "grad_norm": 1.4639005661010742, + "learning_rate": 9.68521052631579e-05, + "loss": 0.3853, + "step": 31976 + }, + { + "epoch": 1.7906260499496023, + "grad_norm": 1.363292932510376, + "learning_rate": 9.685184210526315e-05, + "loss": 0.5404, + "step": 31977 + }, + { + "epoch": 1.7906820472617313, + "grad_norm": 1.3989355564117432, + "learning_rate": 9.685157894736842e-05, + "loss": 0.439, + "step": 31978 + }, + { + "epoch": 1.7907380445738603, + "grad_norm": 1.1867221593856812, + "learning_rate": 9.685131578947368e-05, + "loss": 0.351, + "step": 31979 + }, + { + "epoch": 1.7907940418859893, + "grad_norm": 1.2818231582641602, + "learning_rate": 9.685105263157896e-05, + "loss": 0.3876, + "step": 31980 + }, + { + "epoch": 1.7908500391981184, + "grad_norm": 1.0945988893508911, + "learning_rate": 9.685078947368422e-05, + "loss": 0.4235, + "step": 31981 + }, + { + "epoch": 1.7909060365102474, + "grad_norm": 1.424734354019165, + 
"learning_rate": 9.685052631578948e-05, + "loss": 0.4813, + "step": 31982 + }, + { + "epoch": 1.7909620338223764, + "grad_norm": 1.2352250814437866, + "learning_rate": 9.685026315789474e-05, + "loss": 0.4418, + "step": 31983 + }, + { + "epoch": 1.7910180311345054, + "grad_norm": 1.2557188272476196, + "learning_rate": 9.685000000000001e-05, + "loss": 0.4996, + "step": 31984 + }, + { + "epoch": 1.7910740284466344, + "grad_norm": 1.4204963445663452, + "learning_rate": 9.684973684210527e-05, + "loss": 0.5375, + "step": 31985 + }, + { + "epoch": 1.7911300257587635, + "grad_norm": 1.0954538583755493, + "learning_rate": 9.684947368421053e-05, + "loss": 0.4255, + "step": 31986 + }, + { + "epoch": 1.7911860230708925, + "grad_norm": 1.6155771017074585, + "learning_rate": 9.684921052631579e-05, + "loss": 0.5792, + "step": 31987 + }, + { + "epoch": 1.7912420203830215, + "grad_norm": 1.3473423719406128, + "learning_rate": 9.684894736842106e-05, + "loss": 0.6393, + "step": 31988 + }, + { + "epoch": 1.7912980176951505, + "grad_norm": 1.2579264640808105, + "learning_rate": 9.684868421052632e-05, + "loss": 0.4093, + "step": 31989 + }, + { + "epoch": 1.7913540150072795, + "grad_norm": 1.2967183589935303, + "learning_rate": 9.68484210526316e-05, + "loss": 0.501, + "step": 31990 + }, + { + "epoch": 1.7914100123194086, + "grad_norm": 1.2125723361968994, + "learning_rate": 9.684815789473684e-05, + "loss": 0.4765, + "step": 31991 + }, + { + "epoch": 1.7914660096315376, + "grad_norm": 1.2356135845184326, + "learning_rate": 9.684789473684211e-05, + "loss": 0.4129, + "step": 31992 + }, + { + "epoch": 1.7915220069436666, + "grad_norm": 1.2769567966461182, + "learning_rate": 9.684763157894737e-05, + "loss": 0.4385, + "step": 31993 + }, + { + "epoch": 1.7915780042557956, + "grad_norm": 1.0065739154815674, + "learning_rate": 9.684736842105263e-05, + "loss": 0.381, + "step": 31994 + }, + { + "epoch": 1.7916340015679246, + "grad_norm": 1.2326743602752686, + "learning_rate": 9.68471052631579e-05, 
+ "loss": 0.5205, + "step": 31995 + }, + { + "epoch": 1.7916899988800536, + "grad_norm": 1.2099952697753906, + "learning_rate": 9.684684210526315e-05, + "loss": 0.4812, + "step": 31996 + }, + { + "epoch": 1.7917459961921827, + "grad_norm": 1.4518769979476929, + "learning_rate": 9.684657894736843e-05, + "loss": 0.5582, + "step": 31997 + }, + { + "epoch": 1.7918019935043117, + "grad_norm": 1.2784314155578613, + "learning_rate": 9.684631578947369e-05, + "loss": 0.4311, + "step": 31998 + }, + { + "epoch": 1.7918579908164407, + "grad_norm": 1.170072317123413, + "learning_rate": 9.684605263157896e-05, + "loss": 0.454, + "step": 31999 + }, + { + "epoch": 1.7919139881285697, + "grad_norm": 1.7834513187408447, + "learning_rate": 9.68457894736842e-05, + "loss": 0.5746, + "step": 32000 + } + ], + "logging_steps": 1, + "max_steps": 400000, + "num_input_tokens_seen": 0, + "num_train_epochs": 23, + "save_steps": 4000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.647152720620421e+17, + "train_batch_size": 28, + "trial_name": null, + "trial_params": null +}