diff --git "a/checkpoint-1200/trainer_state.json" "b/checkpoint-1200/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-1200/trainer_state.json" @@ -0,0 +1,8433 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.37025609379821045, + "eval_steps": 500, + "global_step": 1200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.000308546744831842, + "grad_norm": 3.8197367191314697, + "learning_rate": 1.0000000000000001e-07, + "loss": 4.3327, + "step": 1 + }, + { + "epoch": 0.000617093489663684, + "grad_norm": 3.611845016479492, + "learning_rate": 2.0000000000000002e-07, + "loss": 4.2669, + "step": 2 + }, + { + "epoch": 0.0009256402344955261, + "grad_norm": 5.70314359664917, + "learning_rate": 3.0000000000000004e-07, + "loss": 5.4971, + "step": 3 + }, + { + "epoch": 0.001234186979327368, + "grad_norm": 3.392024517059326, + "learning_rate": 4.0000000000000003e-07, + "loss": 4.1838, + "step": 4 + }, + { + "epoch": 0.0015427337241592102, + "grad_norm": 4.388611316680908, + "learning_rate": 5e-07, + "loss": 4.8521, + "step": 5 + }, + { + "epoch": 0.0018512804689910522, + "grad_norm": 3.260829210281372, + "learning_rate": 6.000000000000001e-07, + "loss": 4.0387, + "step": 6 + }, + { + "epoch": 0.0021598272138228943, + "grad_norm": 6.02992057800293, + "learning_rate": 7.000000000000001e-07, + "loss": 5.9947, + "step": 7 + }, + { + "epoch": 0.002468373958654736, + "grad_norm": 4.1300554275512695, + "learning_rate": 8.000000000000001e-07, + "loss": 4.6317, + "step": 8 + }, + { + "epoch": 0.002776920703486578, + "grad_norm": 4.799147605895996, + "learning_rate": 9e-07, + "loss": 4.9666, + "step": 9 + }, + { + "epoch": 0.0030854674483184203, + "grad_norm": 3.797393798828125, + "learning_rate": 1e-06, + "loss": 4.4269, + "step": 10 + }, + { + "epoch": 0.003394014193150262, + "grad_norm": 3.5223026275634766, + "learning_rate": 1.1e-06, + "loss": 4.1057, + "step": 11 + }, + { + "epoch": 0.0037025609379821045, + "grad_norm": 4.609493255615234, + "learning_rate": 1.2000000000000002e-06, + "loss": 5.0402, + "step": 12 + }, + { + "epoch": 0.004011107682813947, + "grad_norm": 2.8431596755981445, + "learning_rate": 1.3e-06, + "loss": 3.7792, + "step": 13 + }, + { + "epoch": 0.004319654427645789, + "grad_norm": 5.449502944946289, + "learning_rate": 1.4000000000000001e-06, + "loss": 5.3713, + "step": 14 + }, + { + "epoch": 0.0046282011724776305, + "grad_norm": 4.320559501647949, + "learning_rate": 1.5e-06, + "loss": 4.6332, + "step": 15 + }, + { + "epoch": 0.004936747917309472, + "grad_norm": 4.168860912322998, + "learning_rate": 1.6000000000000001e-06, + "loss": 4.5831, + "step": 16 + }, + { + "epoch": 0.005245294662141314, + "grad_norm": 3.160491943359375, + "learning_rate": 1.7e-06, + "loss": 4.0278, + "step": 17 + }, + { + "epoch": 0.005553841406973156, + "grad_norm": 4.066011428833008, + "learning_rate": 1.8e-06, + "loss": 4.7029, + "step": 18 + }, + { + "epoch": 0.005862388151804999, + "grad_norm": 5.592798233032227, + "learning_rate": 1.9000000000000002e-06, + "loss": 5.9086, + "step": 19 + }, + { + "epoch": 0.006170934896636841, + "grad_norm": 2.722200870513916, + "learning_rate": 2e-06, + "loss": 3.5714, + "step": 20 + }, + { + "epoch": 0.0064794816414686825, + "grad_norm": 6.213608741760254, + "learning_rate": 2.1000000000000002e-06, + "loss": 5.5262, + "step": 21 + }, + { + "epoch": 0.006788028386300524, + "grad_norm": 6.304937839508057, + "learning_rate": 2.2e-06, + "loss": 5.9214, + "step": 22 + }, + { + "epoch": 0.007096575131132366, + "grad_norm": 4.180171012878418, + "learning_rate": 2.3e-06, + "loss": 4.5205, + "step": 23 + }, + { + "epoch": 0.007405121875964209, + "grad_norm": 3.755070209503174, + "learning_rate": 2.4000000000000003e-06, + "loss": 4.2664, + "step": 24 + }, + { + "epoch": 0.007713668620796051, + "grad_norm": 2.4497368335723877, + "learning_rate": 2.4999999999999998e-06, + "loss": 3.497, + "step": 25 + }, + { + "epoch": 0.008022215365627893, + "grad_norm": 4.276626110076904, + "learning_rate": 2.6e-06, + "loss": 4.7956, + "step": 26 + }, + { + "epoch": 0.008330762110459735, + "grad_norm": 4.274627208709717, + "learning_rate": 2.7e-06, + "loss": 4.9595, + "step": 27 + }, + { + "epoch": 0.008639308855291577, + "grad_norm": 2.989254951477051, + "learning_rate": 2.8000000000000003e-06, + "loss": 4.0622, + "step": 28 + }, + { + "epoch": 0.008947855600123419, + "grad_norm": 4.647172451019287, + "learning_rate": 2.9e-06, + "loss": 5.7777, + "step": 29 + }, + { + "epoch": 0.009256402344955261, + "grad_norm": 3.569882869720459, + "learning_rate": 3e-06, + "loss": 4.9304, + "step": 30 + }, + { + "epoch": 0.009564949089787103, + "grad_norm": 2.1073899269104004, + "learning_rate": 3.1e-06, + "loss": 3.8024, + "step": 31 + }, + { + "epoch": 0.009873495834618945, + "grad_norm": 1.0976536273956299, + "learning_rate": 3.2000000000000003e-06, + "loss": 2.873, + "step": 32 + }, + { + "epoch": 0.010182042579450786, + "grad_norm": 2.685044527053833, + "learning_rate": 3.3e-06, + "loss": 4.6695, + "step": 33 + }, + { + "epoch": 0.010490589324282628, + "grad_norm": 1.5810956954956055, + "learning_rate": 3.4e-06, + "loss": 3.5091, + "step": 34 + }, + { + "epoch": 0.01079913606911447, + "grad_norm": 1.168277621269226, + "learning_rate": 3.5e-06, + "loss": 3.1201, + "step": 35 + }, + { + "epoch": 0.011107682813946312, + "grad_norm": 1.4001338481903076, + "learning_rate": 3.6e-06, + "loss": 3.4514, + "step": 36 + }, + { + "epoch": 0.011416229558778156, + "grad_norm": 1.0257220268249512, + "learning_rate": 3.7e-06, + "loss": 2.9701, + "step": 37 + }, + { + "epoch": 0.011724776303609998, + "grad_norm": 1.1348458528518677, + "learning_rate": 3.8000000000000005e-06, + "loss": 3.2321, + "step": 38 + }, + { + "epoch": 0.01203332304844184, + "grad_norm": 1.5109179019927979, + "learning_rate": 3.9e-06, + "loss": 3.8248, + "step": 39 + }, + { + "epoch": 0.012341869793273681, + "grad_norm": 1.1383541822433472, + "learning_rate": 4e-06, + "loss": 3.2195, + "step": 40 + }, + { + "epoch": 0.012650416538105523, + "grad_norm": 1.4988147020339966, + "learning_rate": 4.1e-06, + "loss": 3.7383, + "step": 41 + }, + { + "epoch": 0.012958963282937365, + "grad_norm": 0.7626239061355591, + "learning_rate": 4.2000000000000004e-06, + "loss": 2.8485, + "step": 42 + }, + { + "epoch": 0.013267510027769207, + "grad_norm": 1.1359997987747192, + "learning_rate": 4.3e-06, + "loss": 3.3959, + "step": 43 + }, + { + "epoch": 0.013576056772601049, + "grad_norm": 1.991447925567627, + "learning_rate": 4.4e-06, + "loss": 4.5384, + "step": 44 + }, + { + "epoch": 0.01388460351743289, + "grad_norm": 1.2268764972686768, + "learning_rate": 4.5e-06, + "loss": 3.5075, + "step": 45 + }, + { + "epoch": 0.014193150262264732, + "grad_norm": 0.9016637802124023, + "learning_rate": 4.6e-06, + "loss": 3.2113, + "step": 46 + }, + { + "epoch": 0.014501697007096576, + "grad_norm": 1.3222975730895996, + "learning_rate": 4.700000000000001e-06, + "loss": 3.9122, + "step": 47 + }, + { + "epoch": 0.014810243751928418, + "grad_norm": 0.5912073850631714, + "learning_rate": 4.800000000000001e-06, + "loss": 2.8042, + "step": 48 + }, + { + "epoch": 0.01511879049676026, + "grad_norm": 1.091262936592102, + "learning_rate": 4.9e-06, + "loss": 3.5044, + "step": 49 + }, + { + "epoch": 0.015427337241592102, + "grad_norm": 1.0276159048080444, + "learning_rate": 4.9999999999999996e-06, + "loss": 3.5583, + "step": 50 + }, + { + "epoch": 0.01573588398642394, + "grad_norm": 1.0583761930465698, + "learning_rate": 5.1e-06, + "loss": 3.8359, + "step": 51 + }, + { + "epoch": 0.016044430731255787, + "grad_norm": 1.0619677305221558, + "learning_rate": 5.2e-06, + "loss": 3.8607, + "step": 52 + }, + { + "epoch": 0.01635297747608763, + "grad_norm": 0.8387205004692078, + "learning_rate": 5.3e-06, + "loss": 3.232, + "step": 53 + }, + { + "epoch": 0.01666152422091947, + "grad_norm": 1.1671696901321411, + "learning_rate": 5.4e-06, + "loss": 4.1826, + "step": 54 + }, + { + "epoch": 0.016970070965751313, + "grad_norm": 0.8083593249320984, + "learning_rate": 5.5e-06, + "loss": 3.4237, + "step": 55 + }, + { + "epoch": 0.017278617710583154, + "grad_norm": 0.99895179271698, + "learning_rate": 5.600000000000001e-06, + "loss": 3.8469, + "step": 56 + }, + { + "epoch": 0.017587164455414996, + "grad_norm": 0.7100948095321655, + "learning_rate": 5.7000000000000005e-06, + "loss": 3.2944, + "step": 57 + }, + { + "epoch": 0.017895711200246838, + "grad_norm": 0.6428288221359253, + "learning_rate": 5.8e-06, + "loss": 3.0704, + "step": 58 + }, + { + "epoch": 0.01820425794507868, + "grad_norm": 0.8618065118789673, + "learning_rate": 5.899999999999999e-06, + "loss": 3.8168, + "step": 59 + }, + { + "epoch": 0.018512804689910522, + "grad_norm": 0.7153589725494385, + "learning_rate": 6e-06, + "loss": 3.5421, + "step": 60 + }, + { + "epoch": 0.018821351434742364, + "grad_norm": 0.5318284630775452, + "learning_rate": 6.1e-06, + "loss": 3.1603, + "step": 61 + }, + { + "epoch": 0.019129898179574206, + "grad_norm": 0.348753422498703, + "learning_rate": 6.2e-06, + "loss": 2.692, + "step": 62 + }, + { + "epoch": 0.019438444924406047, + "grad_norm": 0.5460414290428162, + "learning_rate": 6.3e-06, + "loss": 3.08, + "step": 63 + }, + { + "epoch": 0.01974699166923789, + "grad_norm": 0.9770793318748474, + "learning_rate": 6.4000000000000006e-06, + "loss": 3.4058, + "step": 64 + }, + { + "epoch": 0.02005553841406973, + "grad_norm": 0.55307936668396, + "learning_rate": 6.5000000000000004e-06, + "loss": 2.9789, + "step": 65 + }, + { + "epoch": 0.020364085158901573, + "grad_norm": 0.7234558463096619, + "learning_rate": 6.6e-06, + "loss": 3.3505, + "step": 66 + }, + { + "epoch": 0.020672631903733415, + "grad_norm": 0.5592244863510132, + "learning_rate": 6.7e-06, + "loss": 3.1349, + "step": 67 + }, + { + "epoch": 0.020981178648565257, + "grad_norm": 0.8060568571090698, + "learning_rate": 6.8e-06, + "loss": 3.7522, + "step": 68 + }, + { + "epoch": 0.0212897253933971, + "grad_norm": 0.6033865809440613, + "learning_rate": 6.900000000000001e-06, + "loss": 3.5573, + "step": 69 + }, + { + "epoch": 0.02159827213822894, + "grad_norm": 0.5775837302207947, + "learning_rate": 7e-06, + "loss": 3.3392, + "step": 70 + }, + { + "epoch": 0.021906818883060782, + "grad_norm": 0.591890811920166, + "learning_rate": 7.1e-06, + "loss": 3.1473, + "step": 71 + }, + { + "epoch": 0.022215365627892624, + "grad_norm": 0.6378189325332642, + "learning_rate": 7.2e-06, + "loss": 3.2234, + "step": 72 + }, + { + "epoch": 0.02252391237272447, + "grad_norm": 0.5335432887077332, + "learning_rate": 7.3e-06, + "loss": 3.0607, + "step": 73 + }, + { + "epoch": 0.02283245911755631, + "grad_norm": 0.4598855674266815, + "learning_rate": 7.4e-06, + "loss": 3.0668, + "step": 74 + }, + { + "epoch": 0.023141005862388153, + "grad_norm": 0.5637675523757935, + "learning_rate": 7.5e-06, + "loss": 3.3607, + "step": 75 + }, + { + "epoch": 0.023449552607219995, + "grad_norm": 0.6915042996406555, + "learning_rate": 7.600000000000001e-06, + "loss": 3.1566, + "step": 76 + }, + { + "epoch": 0.023758099352051837, + "grad_norm": 0.44307759404182434, + "learning_rate": 7.699999999999999e-06, + "loss": 3.1364, + "step": 77 + }, + { + "epoch": 0.02406664609688368, + "grad_norm": 0.43005770444869995, + "learning_rate": 7.8e-06, + "loss": 3.1683, + "step": 78 + }, + { + "epoch": 0.02437519284171552, + "grad_norm": 0.6191526055335999, + "learning_rate": 7.899999999999999e-06, + "loss": 3.3363, + "step": 79 + }, + { + "epoch": 0.024683739586547362, + "grad_norm": 0.35434824228286743, + "learning_rate": 8e-06, + "loss": 2.5193, + "step": 80 + }, + { + "epoch": 0.024992286331379204, + "grad_norm": 0.656644344329834, + "learning_rate": 8.1e-06, + "loss": 3.7158, + "step": 81 + }, + { + "epoch": 0.025300833076211046, + "grad_norm": 0.40206533670425415, + "learning_rate": 8.2e-06, + "loss": 2.7845, + "step": 82 + }, + { + "epoch": 0.025609379821042888, + "grad_norm": 0.5501867532730103, + "learning_rate": 8.3e-06, + "loss": 2.7171, + "step": 83 + }, + { + "epoch": 0.02591792656587473, + "grad_norm": 0.4792475998401642, + "learning_rate": 8.400000000000001e-06, + "loss": 3.0822, + "step": 84 + }, + { + "epoch": 0.02622647331070657, + "grad_norm": 0.3702377676963806, + "learning_rate": 8.5e-06, + "loss": 2.6305, + "step": 85 + }, + { + "epoch": 0.026535020055538414, + "grad_norm": 0.37643131613731384, + "learning_rate": 8.6e-06, + "loss": 2.6588, + "step": 86 + }, + { + "epoch": 0.026843566800370255, + "grad_norm": 0.604988157749176, + "learning_rate": 8.7e-06, + "loss": 3.3621, + "step": 87 + }, + { + "epoch": 0.027152113545202097, + "grad_norm": 1.1603400707244873, + "learning_rate": 8.8e-06, + "loss": 2.8903, + "step": 88 + }, + { + "epoch": 0.02746066029003394, + "grad_norm": 0.6672317981719971, + "learning_rate": 8.900000000000001e-06, + "loss": 3.202, + "step": 89 + }, + { + "epoch": 0.02776920703486578, + "grad_norm": 0.404778391122818, + "learning_rate": 9e-06, + "loss": 3.0092, + "step": 90 + }, + { + "epoch": 0.028077753779697623, + "grad_norm": 0.7324615716934204, + "learning_rate": 9.100000000000001e-06, + "loss": 3.3472, + "step": 91 + }, + { + "epoch": 0.028386300524529465, + "grad_norm": 0.5391788482666016, + "learning_rate": 9.2e-06, + "loss": 2.9677, + "step": 92 + }, + { + "epoch": 0.028694847269361307, + "grad_norm": 0.4934414327144623, + "learning_rate": 9.3e-06, + "loss": 3.1551, + "step": 93 + }, + { + "epoch": 0.029003394014193152, + "grad_norm": 0.6892746686935425, + "learning_rate": 9.400000000000001e-06, + "loss": 3.6399, + "step": 94 + }, + { + "epoch": 0.029311940759024994, + "grad_norm": 0.40343931317329407, + "learning_rate": 9.5e-06, + "loss": 2.6643, + "step": 95 + }, + { + "epoch": 0.029620487503856836, + "grad_norm": 0.37016206979751587, + "learning_rate": 9.600000000000001e-06, + "loss": 2.8084, + "step": 96 + }, + { + "epoch": 0.029929034248688677, + "grad_norm": 0.31692177057266235, + "learning_rate": 9.699999999999999e-06, + "loss": 2.5649, + "step": 97 + }, + { + "epoch": 0.03023758099352052, + "grad_norm": 0.48820602893829346, + "learning_rate": 9.8e-06, + "loss": 2.7648, + "step": 98 + }, + { + "epoch": 0.03054612773835236, + "grad_norm": 0.4864860773086548, + "learning_rate": 9.9e-06, + "loss": 2.778, + "step": 99 + }, + { + "epoch": 0.030854674483184203, + "grad_norm": 0.4217410087585449, + "learning_rate": 9.999999999999999e-06, + "loss": 2.6541, + "step": 100 + }, + { + "epoch": 0.031163221228016045, + "grad_norm": 0.41729119420051575, + "learning_rate": 1.01e-05, + "loss": 2.5602, + "step": 101 + }, + { + "epoch": 0.03147176797284788, + "grad_norm": 0.5560382604598999, + "learning_rate": 1.02e-05, + "loss": 3.1223, + "step": 102 + }, + { + "epoch": 0.031780314717679725, + "grad_norm": 0.3572539985179901, + "learning_rate": 1.03e-05, + "loss": 2.706, + "step": 103 + }, + { + "epoch": 0.032088861462511574, + "grad_norm": 0.49619024991989136, + "learning_rate": 1.04e-05, + "loss": 3.0865, + "step": 104 + }, + { + "epoch": 0.032397408207343416, + "grad_norm": 0.4121740758419037, + "learning_rate": 1.05e-05, + "loss": 2.9064, + "step": 105 + }, + { + "epoch": 0.03270595495217526, + "grad_norm": 0.5392472743988037, + "learning_rate": 1.06e-05, + "loss": 3.0245, + "step": 106 + }, + { + "epoch": 0.0330145016970071, + "grad_norm": 0.37481802701950073, + "learning_rate": 1.0700000000000001e-05, + "loss": 3.0453, + "step": 107 + }, + { + "epoch": 0.03332304844183894, + "grad_norm": 0.6298946142196655, + "learning_rate": 1.08e-05, + "loss": 2.9881, + "step": 108 + }, + { + "epoch": 0.03363159518667078, + "grad_norm": 0.495768666267395, + "learning_rate": 1.09e-05, + "loss": 3.3919, + "step": 109 + }, + { + "epoch": 0.033940141931502625, + "grad_norm": 0.7116994261741638, + "learning_rate": 1.1e-05, + "loss": 3.188, + "step": 110 + }, + { + "epoch": 0.03424868867633447, + "grad_norm": 0.6338837146759033, + "learning_rate": 1.11e-05, + "loss": 3.1988, + "step": 111 + }, + { + "epoch": 0.03455723542116631, + "grad_norm": 0.6400672793388367, + "learning_rate": 1.1200000000000001e-05, + "loss": 3.3323, + "step": 112 + }, + { + "epoch": 0.03486578216599815, + "grad_norm": 0.48958930373191833, + "learning_rate": 1.13e-05, + "loss": 2.7639, + "step": 113 + }, + { + "epoch": 0.03517432891082999, + "grad_norm": 0.7853876948356628, + "learning_rate": 1.1400000000000001e-05, + "loss": 3.216, + "step": 114 + }, + { + "epoch": 0.035482875655661834, + "grad_norm": 0.4167158603668213, + "learning_rate": 1.1500000000000002e-05, + "loss": 2.5009, + "step": 115 + }, + { + "epoch": 0.035791422400493676, + "grad_norm": 0.43414178490638733, + "learning_rate": 1.16e-05, + "loss": 2.9166, + "step": 116 + }, + { + "epoch": 0.03609996914532552, + "grad_norm": 0.5888383984565735, + "learning_rate": 1.1700000000000001e-05, + "loss": 3.4765, + "step": 117 + }, + { + "epoch": 0.03640851589015736, + "grad_norm": 0.4863530099391937, + "learning_rate": 1.1799999999999999e-05, + "loss": 2.6866, + "step": 118 + }, + { + "epoch": 0.0367170626349892, + "grad_norm": 0.5108136534690857, + "learning_rate": 1.19e-05, + "loss": 3.0799, + "step": 119 + }, + { + "epoch": 0.037025609379821044, + "grad_norm": 0.72972172498703, + "learning_rate": 1.2e-05, + "loss": 3.3096, + "step": 120 + }, + { + "epoch": 0.037334156124652886, + "grad_norm": 0.4917832016944885, + "learning_rate": 1.21e-05, + "loss": 3.3209, + "step": 121 + }, + { + "epoch": 0.03764270286948473, + "grad_norm": 0.5651227831840515, + "learning_rate": 1.22e-05, + "loss": 3.2901, + "step": 122 + }, + { + "epoch": 0.03795124961431657, + "grad_norm": 0.4422552287578583, + "learning_rate": 1.2299999999999999e-05, + "loss": 2.7225, + "step": 123 + }, + { + "epoch": 0.03825979635914841, + "grad_norm": 0.4900197982788086, + "learning_rate": 1.24e-05, + "loss": 2.676, + "step": 124 + }, + { + "epoch": 0.03856834310398025, + "grad_norm": 0.5758654475212097, + "learning_rate": 1.25e-05, + "loss": 2.7869, + "step": 125 + }, + { + "epoch": 0.038876889848812095, + "grad_norm": 0.4216962456703186, + "learning_rate": 1.26e-05, + "loss": 2.5913, + "step": 126 + }, + { + "epoch": 0.03918543659364394, + "grad_norm": 0.5779691934585571, + "learning_rate": 1.27e-05, + "loss": 2.93, + "step": 127 + }, + { + "epoch": 0.03949398333847578, + "grad_norm": 0.5078710317611694, + "learning_rate": 1.2800000000000001e-05, + "loss": 2.873, + "step": 128 + }, + { + "epoch": 0.03980253008330762, + "grad_norm": 0.7252984046936035, + "learning_rate": 1.29e-05, + "loss": 3.2651, + "step": 129 + }, + { + "epoch": 0.04011107682813946, + "grad_norm": 0.30763328075408936, + "learning_rate": 1.3000000000000001e-05, + "loss": 2.5202, + "step": 130 + }, + { + "epoch": 0.040419623572971304, + "grad_norm": 0.5326187610626221, + "learning_rate": 1.31e-05, + "loss": 2.7101, + "step": 131 + }, + { + "epoch": 0.040728170317803146, + "grad_norm": 0.5274989604949951, + "learning_rate": 1.32e-05, + "loss": 2.8888, + "step": 132 + }, + { + "epoch": 0.04103671706263499, + "grad_norm": 0.4721854031085968, + "learning_rate": 1.3300000000000001e-05, + "loss": 3.1288, + "step": 133 + }, + { + "epoch": 0.04134526380746683, + "grad_norm": 0.4977727234363556, + "learning_rate": 1.34e-05, + "loss": 2.7822, + "step": 134 + }, + { + "epoch": 0.04165381055229867, + "grad_norm": 0.5609750747680664, + "learning_rate": 1.3500000000000001e-05, + "loss": 2.9943, + "step": 135 + }, + { + "epoch": 0.04196235729713051, + "grad_norm": 0.5416237711906433, + "learning_rate": 1.36e-05, + "loss": 2.9631, + "step": 136 + }, + { + "epoch": 0.042270904041962355, + "grad_norm": 0.7931625247001648, + "learning_rate": 1.3700000000000001e-05, + "loss": 3.2442, + "step": 137 + }, + { + "epoch": 0.0425794507867942, + "grad_norm": 0.41734540462493896, + "learning_rate": 1.3800000000000002e-05, + "loss": 2.7296, + "step": 138 + }, + { + "epoch": 0.04288799753162604, + "grad_norm": 0.5195217132568359, + "learning_rate": 1.39e-05, + "loss": 3.0406, + "step": 139 + }, + { + "epoch": 0.04319654427645788, + "grad_norm": 0.43490850925445557, + "learning_rate": 1.4e-05, + "loss": 2.7431, + "step": 140 + }, + { + "epoch": 0.04350509102128972, + "grad_norm": 0.6502295732498169, + "learning_rate": 1.4099999999999999e-05, + "loss": 3.0238, + "step": 141 + }, + { + "epoch": 0.043813637766121565, + "grad_norm": 0.44002243876457214, + "learning_rate": 1.42e-05, + "loss": 2.9071, + "step": 142 + }, + { + "epoch": 0.044122184510953406, + "grad_norm": 0.578681468963623, + "learning_rate": 1.43e-05, + "loss": 3.1955, + "step": 143 + }, + { + "epoch": 0.04443073125578525, + "grad_norm": 0.5037577152252197, + "learning_rate": 1.44e-05, + "loss": 2.6283, + "step": 144 + }, + { + "epoch": 0.04473927800061709, + "grad_norm": 0.46894946694374084, + "learning_rate": 1.45e-05, + "loss": 2.5679, + "step": 145 + }, + { + "epoch": 0.04504782474544894, + "grad_norm": 0.7387483716011047, + "learning_rate": 1.46e-05, + "loss": 3.3251, + "step": 146 + }, + { + "epoch": 0.04535637149028078, + "grad_norm": 0.4696255624294281, + "learning_rate": 1.47e-05, + "loss": 2.7964, + "step": 147 + }, + { + "epoch": 0.04566491823511262, + "grad_norm": 0.6008668541908264, + "learning_rate": 1.48e-05, + "loss": 2.7697, + "step": 148 + }, + { + "epoch": 0.045973464979944464, + "grad_norm": 0.8156336545944214, + "learning_rate": 1.49e-05, + "loss": 3.2155, + "step": 149 + }, + { + "epoch": 0.046282011724776306, + "grad_norm": 0.37692368030548096, + "learning_rate": 1.5e-05, + "loss": 2.7007, + "step": 150 + }, + { + "epoch": 0.04659055846960815, + "grad_norm": 0.7642585635185242, + "learning_rate": 1.51e-05, + "loss": 3.6619, + "step": 151 + }, + { + "epoch": 0.04689910521443999, + "grad_norm": 0.6754474639892578, + "learning_rate": 1.5200000000000002e-05, + "loss": 3.1602, + "step": 152 + }, + { + "epoch": 0.04720765195927183, + "grad_norm": 0.6416701674461365, + "learning_rate": 1.53e-05, + "loss": 3.3756, + "step": 153 + }, + { + "epoch": 0.047516198704103674, + "grad_norm": 0.654334306716919, + "learning_rate": 1.5399999999999998e-05, + "loss": 3.168, + "step": 154 + }, + { + "epoch": 0.047824745448935516, + "grad_norm": 0.6520740389823914, + "learning_rate": 1.55e-05, + "loss": 3.2226, + "step": 155 + }, + { + "epoch": 0.04813329219376736, + "grad_norm": 0.384902685880661, + "learning_rate": 1.56e-05, + "loss": 2.5777, + "step": 156 + }, + { + "epoch": 0.0484418389385992, + "grad_norm": 0.5706138014793396, + "learning_rate": 1.57e-05, + "loss": 3.2847, + "step": 157 + }, + { + "epoch": 0.04875038568343104, + "grad_norm": 0.5485690832138062, + "learning_rate": 1.5799999999999998e-05, + "loss": 3.1548, + "step": 158 + }, + { + "epoch": 0.04905893242826288, + "grad_norm": 0.5101959705352783, + "learning_rate": 1.59e-05, + "loss": 2.8384, + "step": 159 + }, + { + "epoch": 0.049367479173094725, + "grad_norm": 0.4596351683139801, + "learning_rate": 1.6e-05, + "loss": 2.9464, + "step": 160 + }, + { + "epoch": 0.04967602591792657, + "grad_norm": 0.6397862434387207, + "learning_rate": 1.61e-05, + "loss": 3.2851, + "step": 161 + }, + { + "epoch": 0.04998457266275841, + "grad_norm": 0.3608058989048004, + "learning_rate": 1.62e-05, + "loss": 2.43, + "step": 162 + }, + { + "epoch": 0.05029311940759025, + "grad_norm": 0.7656980752944946, + "learning_rate": 1.63e-05, + "loss": 3.5049, + "step": 163 + }, + { + "epoch": 0.05060166615242209, + "grad_norm": 0.4537852108478546, + "learning_rate": 1.64e-05, + "loss": 2.6858, + "step": 164 + }, + { + "epoch": 0.050910212897253934, + "grad_norm": 0.46467283368110657, + "learning_rate": 1.65e-05, + "loss": 2.8103, + "step": 165 + }, + { + "epoch": 0.051218759642085776, + "grad_norm": 0.6642769575119019, + "learning_rate": 1.66e-05, + "loss": 3.2552, + "step": 166 + }, + { + "epoch": 0.05152730638691762, + "grad_norm": 0.707314133644104, + "learning_rate": 1.67e-05, + "loss": 3.5667, + "step": 167 + }, + { + "epoch": 0.05183585313174946, + "grad_norm": 0.4103851020336151, + "learning_rate": 1.6800000000000002e-05, + "loss": 2.8039, + "step": 168 + }, + { + "epoch": 0.0521443998765813, + "grad_norm": 0.978696882724762, + "learning_rate": 1.69e-05, + "loss": 2.8174, + "step": 169 + }, + { + "epoch": 0.05245294662141314, + "grad_norm": 0.4563823640346527, + "learning_rate": 1.7e-05, + "loss": 3.1365, + "step": 170 + }, + { + "epoch": 0.052761493366244985, + "grad_norm": 0.4271509051322937, + "learning_rate": 1.71e-05, + "loss": 2.6638, + "step": 171 + }, + { + "epoch": 0.05307004011107683, + "grad_norm": 0.570706844329834, + "learning_rate": 1.72e-05, + "loss": 2.9269, + "step": 172 + }, + { + "epoch": 0.05337858685590867, + "grad_norm": 0.38479119539260864, + "learning_rate": 1.73e-05, + "loss": 2.1779, + "step": 173 + }, + { + "epoch": 0.05368713360074051, + "grad_norm": 0.5205680727958679, + "learning_rate": 1.74e-05, + "loss": 2.8916, + "step": 174 + }, + { + "epoch": 0.05399568034557235, + "grad_norm": 0.5419281721115112, + "learning_rate": 1.7500000000000002e-05, + "loss": 2.8412, + "step": 175 + }, + { + "epoch": 0.054304227090404195, + "grad_norm": 0.5400970578193665, + "learning_rate": 1.76e-05, + "loss": 3.1398, + "step": 176 + }, + { + "epoch": 0.054612773835236036, + "grad_norm": 0.4808214604854584, + "learning_rate": 1.77e-05, + "loss": 2.9332, + "step": 177 + }, + { + "epoch": 0.05492132058006788, + "grad_norm": 0.5370422005653381, + "learning_rate": 1.7800000000000002e-05, + "loss": 2.6539, + "step": 178 + }, + { + "epoch": 0.05522986732489972, + "grad_norm": 0.5261011719703674, + "learning_rate": 1.79e-05, + "loss": 2.6934, + "step": 179 + }, + { + "epoch": 0.05553841406973156, + "grad_norm": 0.4394947588443756, + "learning_rate": 1.8e-05, + "loss": 2.6768, + "step": 180 + }, + { + "epoch": 0.055846960814563404, + "grad_norm": 0.3960530459880829, + "learning_rate": 1.8100000000000003e-05, + "loss": 2.7511, + "step": 181 + }, + { + "epoch": 0.056155507559395246, + "grad_norm": 0.4033471941947937, + "learning_rate": 1.8200000000000002e-05, + "loss": 2.6748, + "step": 182 + }, + { + "epoch": 0.05646405430422709, + "grad_norm": 0.46211346983909607, + "learning_rate": 1.83e-05, + "loss": 2.5696, + "step": 183 + }, + { + "epoch": 0.05677260104905893, + "grad_norm": 0.48176342248916626, + "learning_rate": 1.84e-05, + "loss": 2.6858, + "step": 184 + }, + { + "epoch": 0.05708114779389077, + "grad_norm": 0.5655994415283203, + "learning_rate": 1.8500000000000002e-05, + "loss": 3.2977, + "step": 185 + }, + { + "epoch": 0.05738969453872261, + "grad_norm": 0.7159984111785889, + "learning_rate": 1.86e-05, + "loss": 3.7336, + "step": 186 + }, + { + "epoch": 0.05769824128355446, + "grad_norm": 0.5511050224304199, + "learning_rate": 1.87e-05, + "loss": 2.7869, + "step": 187 + }, + { + "epoch": 0.058006788028386304, + "grad_norm": 0.5503979921340942, + "learning_rate": 1.8800000000000003e-05, + "loss": 2.7981, + "step": 188 + }, + { + "epoch": 0.058315334773218146, + "grad_norm": 0.4197019040584564, + "learning_rate": 1.8900000000000002e-05, + "loss": 2.5163, + "step": 189 + }, + { + "epoch": 0.05862388151804999, + "grad_norm": 0.5114262104034424, + "learning_rate": 1.9e-05, + "loss": 2.9493, + "step": 190 + }, + { + "epoch": 0.05893242826288183, + "grad_norm": 0.8843671679496765, + "learning_rate": 1.9100000000000003e-05, + "loss": 3.295, + "step": 191 + }, + { + "epoch": 0.05924097500771367, + "grad_norm": 0.3419288396835327, + "learning_rate": 1.9200000000000003e-05, + "loss": 2.3456, + "step": 192 + }, + { + "epoch": 0.05954952175254551, + "grad_norm": 0.5502027869224548, + "learning_rate": 1.9299999999999998e-05, + "loss": 2.3674, + "step": 193 + }, + { + "epoch": 0.059858068497377355, + "grad_norm": 0.7232564091682434, + "learning_rate": 1.9399999999999997e-05, + "loss": 3.4577, + "step": 194 + }, + { + "epoch": 0.0601666152422092, + "grad_norm": 0.4990410804748535, + "learning_rate": 1.95e-05, + "loss": 2.5966, + "step": 195 + }, + { + "epoch": 0.06047516198704104, + "grad_norm": 0.5583244562149048, + "learning_rate": 1.96e-05, + "loss": 2.58, + "step": 196 + }, + { + "epoch": 0.06078370873187288, + "grad_norm": 0.8695673942565918, + "learning_rate": 1.9699999999999998e-05, + "loss": 3.292, + "step": 197 + }, + { + "epoch": 0.06109225547670472, + "grad_norm": 0.47713199257850647, + "learning_rate": 1.98e-05, + "loss": 2.5254, + "step": 198 + }, + { + "epoch": 0.061400802221536564, + "grad_norm": 0.7606462836265564, + "learning_rate": 1.99e-05, + "loss": 2.9146, + "step": 199 + }, + { + "epoch": 0.061709348966368406, + "grad_norm": 0.6246155500411987, + "learning_rate": 1.9999999999999998e-05, + "loss": 3.1394, + "step": 200 + }, + { + "epoch": 0.06201789571120025, + "grad_norm": 0.741635262966156, + "learning_rate": 2.01e-05, + "loss": 2.8496, + "step": 201 + }, + { + "epoch": 0.06232644245603209, + "grad_norm": 0.5045515894889832, + "learning_rate": 2.02e-05, + "loss": 2.7577, + "step": 202 + }, + { + "epoch": 0.06263498920086392, + "grad_norm": 0.6223952174186707, + "learning_rate": 2.03e-05, + "loss": 2.7214, + "step": 203 + }, + { + "epoch": 0.06294353594569577, + "grad_norm": 0.5237772464752197, + "learning_rate": 2.04e-05, + "loss": 2.9416, + "step": 204 + }, + { + "epoch": 0.06325208269052761, + "grad_norm": 1.014267086982727, + "learning_rate": 2.05e-05, + "loss": 3.0007, + "step": 205 + }, + { + "epoch": 0.06356062943535945, + "grad_norm": 0.6386702060699463, + "learning_rate": 2.06e-05, + "loss": 2.8931, + "step": 206 + }, + { + "epoch": 0.06386917618019129, + "grad_norm": 0.6047970056533813, + "learning_rate": 2.07e-05, + "loss": 2.4792, + "step": 207 + }, + { + "epoch": 0.06417772292502315, + "grad_norm": 0.5924057364463806, + "learning_rate": 2.08e-05, + "loss": 2.9367, + "step": 208 + }, + { + "epoch": 0.06448626966985499, + "grad_norm": 0.5173370242118835, + "learning_rate": 2.09e-05, + "loss": 2.499, + "step": 209 + }, + { + "epoch": 0.06479481641468683, + "grad_norm": 0.9348816275596619, + "learning_rate": 2.1e-05, + "loss": 2.9291, + "step": 210 + }, + { + "epoch": 0.06510336315951867, + "grad_norm": 0.500065267086029, + "learning_rate": 2.11e-05, + "loss": 3.0239, + "step": 211 + }, + { + "epoch": 0.06541190990435052, + "grad_norm": 0.6069537401199341, + "learning_rate": 2.12e-05, + "loss": 2.9213, + "step": 212 + }, + { + "epoch": 0.06572045664918236, + "grad_norm": 0.8060280084609985, + "learning_rate": 2.13e-05, + "loss": 2.9865, + "step": 213 + }, + { + "epoch": 0.0660290033940142, + "grad_norm": 0.5638638138771057, + "learning_rate": 2.1400000000000002e-05, + "loss": 3.1433, + "step": 214 + }, + { + "epoch": 0.06633755013884604, + "grad_norm": 0.9096368551254272, + "learning_rate": 2.15e-05, + "loss": 3.3942, + "step": 215 + }, + { + "epoch": 0.06664609688367788, + "grad_norm": 0.4683953523635864, + "learning_rate": 2.16e-05, + "loss": 2.5365, + "step": 216 + }, + { + "epoch": 0.06695464362850972, + "grad_norm": 0.6851359009742737, + "learning_rate": 2.1700000000000002e-05, + "loss": 2.6351, + "step": 217 + }, + { + "epoch": 0.06726319037334157, + "grad_norm": 0.5846662521362305, + "learning_rate": 2.18e-05, + "loss": 2.8887, + "step": 218 + }, + { + "epoch": 0.06757173711817341, + "grad_norm": 0.42902109026908875, + "learning_rate": 2.19e-05, + "loss": 2.5048, + "step": 219 + }, + { + "epoch": 0.06788028386300525, + "grad_norm": 0.5018395185470581, + "learning_rate": 2.2e-05, + "loss": 2.854, + "step": 220 + }, + { + "epoch": 0.06818883060783709, + "grad_norm": 0.5672473311424255, + "learning_rate": 2.2100000000000002e-05, + "loss": 2.6505, + "step": 221 + }, + { + "epoch": 0.06849737735266893, + "grad_norm": 0.4538055956363678, + "learning_rate": 2.22e-05, + "loss": 2.6871, + "step": 222 + }, + { + "epoch": 0.06880592409750078, + "grad_norm": 0.4177427589893341, + "learning_rate": 2.23e-05, + "loss": 2.5534, + "step": 223 + }, + { + "epoch": 0.06911447084233262, + "grad_norm": 0.5405595898628235, + "learning_rate": 2.2400000000000002e-05, + "loss": 2.8893, + "step": 224 + }, + { + "epoch": 0.06942301758716446, + "grad_norm": 0.5323094725608826, + "learning_rate": 2.25e-05, + "loss": 2.5933, + "step": 225 + }, + { + "epoch": 0.0697315643319963, + "grad_norm": 0.6036468744277954, + "learning_rate": 2.26e-05, + "loss": 2.8331, + "step": 226 + }, + { + "epoch": 0.07004011107682814, + "grad_norm": 0.545752763748169, + "learning_rate": 2.2700000000000003e-05, + "loss": 2.6558, + "step": 227 + }, + { + "epoch": 0.07034865782165999, + "grad_norm": 0.3827911913394928, + "learning_rate": 2.2800000000000002e-05, + "loss": 2.3423, + "step": 228 + }, + { + "epoch": 0.07065720456649183, + "grad_norm": 0.4605464041233063, + "learning_rate": 2.29e-05, + "loss": 2.7205, + "step": 229 + }, + { + "epoch": 0.07096575131132367, + "grad_norm": 0.5113175511360168, + "learning_rate": 2.3000000000000003e-05, + "loss": 2.5991, + "step": 230 + }, + { + "epoch": 0.07127429805615551, + "grad_norm": 1.0119787454605103, + "learning_rate": 2.3100000000000002e-05, + "loss": 3.7089, + "step": 231 + }, + { + "epoch": 0.07158284480098735, + "grad_norm": 0.9721714854240417, + "learning_rate": 2.32e-05, + "loss": 2.7576, + "step": 232 + }, + { + "epoch": 0.0718913915458192, + "grad_norm": 0.7502716779708862, + "learning_rate": 2.33e-05, + "loss": 2.5823, + "step": 233 + }, + { + "epoch": 0.07219993829065104, + "grad_norm": 0.46226513385772705, + "learning_rate": 2.3400000000000003e-05, + "loss": 2.4788, + "step": 234 + }, + { + "epoch": 0.07250848503548288, + "grad_norm": 0.6776145696640015, + "learning_rate": 2.3500000000000002e-05, + "loss": 2.7757, + "step": 235 + }, + { + "epoch": 0.07281703178031472, + "grad_norm": 1.051665186882019, + "learning_rate": 2.3599999999999998e-05, + "loss": 3.2042, + "step": 236 + }, + { + "epoch": 0.07312557852514656, + "grad_norm": 0.4878885746002197, + "learning_rate": 2.37e-05, + "loss": 2.6037, + "step": 237 + }, + { + "epoch": 0.0734341252699784, + "grad_norm": 1.0094560384750366, + "learning_rate": 2.38e-05, + "loss": 3.0565, + "step": 238 + }, + { + "epoch": 0.07374267201481025, + "grad_norm": 0.6273016929626465, + "learning_rate": 2.3899999999999998e-05, + "loss": 2.6155, + "step": 239 + }, + { + "epoch": 0.07405121875964209, + "grad_norm": 0.7550265192985535, + "learning_rate": 2.4e-05, + "loss": 2.7951, + "step": 240 + }, + { + "epoch": 0.07435976550447393, + "grad_norm": 0.9445892572402954, + "learning_rate": 2.41e-05, + "loss": 2.8222, + "step": 241 + }, + { + "epoch": 0.07466831224930577, + "grad_norm": 0.7777389883995056, + "learning_rate": 2.42e-05, + "loss": 3.1079, + "step": 242 + }, + { + "epoch": 0.07497685899413761, + "grad_norm": 0.6355181932449341, + "learning_rate": 2.43e-05, + "loss": 2.6854, + "step": 243 + }, + { + "epoch": 0.07528540573896945, + "grad_norm": 1.261012315750122, + "learning_rate": 2.44e-05, + "loss": 2.6395, + "step": 244 + }, + { + "epoch": 0.0755939524838013, + "grad_norm": 0.659038245677948, + "learning_rate": 2.45e-05, + "loss": 2.5814, + "step": 245 + }, + { + "epoch": 0.07590249922863314, + "grad_norm": 0.873461127281189, + "learning_rate": 2.4599999999999998e-05, + "loss": 3.2166, + "step": 246 + }, + { + "epoch": 0.07621104597346498, + "grad_norm": 0.5640694499015808, + "learning_rate": 2.47e-05, + "loss": 2.6797, + "step": 247 + }, + { + "epoch": 0.07651959271829682, + "grad_norm": 0.8357604742050171, + "learning_rate": 2.48e-05, + "loss": 3.1053, + "step": 248 + }, + { + "epoch": 0.07682813946312866, + "grad_norm": 0.6787241697311401, + "learning_rate": 2.49e-05, + "loss": 3.0971, + "step": 249 + }, + { + "epoch": 0.0771366862079605, + "grad_norm": 1.1458957195281982, + "learning_rate": 2.5e-05, + "loss": 3.6878, + "step": 250 + }, + { + "epoch": 0.07744523295279235, + "grad_norm": 0.5414482355117798, + "learning_rate": 2.51e-05, + "loss": 2.7741, + "step": 251 + }, + { + "epoch": 0.07775377969762419, + "grad_norm": 0.6754476428031921, + "learning_rate": 2.52e-05, + "loss": 2.9239, + "step": 252 + }, + { + "epoch": 0.07806232644245603, + "grad_norm": 0.6492197513580322, + "learning_rate": 2.5300000000000002e-05, + "loss": 2.9217, + "step": 253 + }, + { + "epoch": 0.07837087318728787, + "grad_norm": 0.4921364188194275, + "learning_rate": 2.54e-05, + "loss": 2.4709, + "step": 254 + }, + { + "epoch": 0.07867941993211972, + "grad_norm": 0.535729169845581, + "learning_rate": 2.55e-05, + "loss": 2.4383, + "step": 255 + }, + { + "epoch": 0.07898796667695156, + "grad_norm": 0.6355189085006714, + "learning_rate": 2.5600000000000002e-05, + "loss": 2.8076, + "step": 256 + }, + { + "epoch": 0.0792965134217834, + "grad_norm": 0.5118688344955444, + "learning_rate": 2.57e-05, + "loss": 2.8182, + "step": 257 + }, + { + "epoch": 0.07960506016661524, + "grad_norm": 0.49077171087265015, + "learning_rate": 2.58e-05, + "loss": 2.4566, + "step": 258 + }, + { + "epoch": 0.07991360691144708, + "grad_norm": 0.698340654373169, + "learning_rate": 2.59e-05, + "loss": 2.8407, + "step": 259 + }, + { + "epoch": 0.08022215365627892, + "grad_norm": 0.6589793562889099, + "learning_rate": 2.6000000000000002e-05, + "loss": 2.7079, + "step": 260 + }, + { + "epoch": 0.08053070040111077, + "grad_norm": 0.4942406117916107, + "learning_rate": 2.61e-05, + "loss": 2.7984, + "step": 261 + }, + { + "epoch": 0.08083924714594261, + "grad_norm": 0.5735039710998535, + "learning_rate": 2.62e-05, + "loss": 2.7639, + "step": 262 + }, + { + "epoch": 0.08114779389077445, + "grad_norm": 0.5074241757392883, + "learning_rate": 2.6300000000000002e-05, + "loss": 2.8036, + "step": 263 + }, + { + "epoch": 0.08145634063560629, + "grad_norm": 0.4602759778499603, + "learning_rate": 2.64e-05, + "loss": 2.257, + "step": 264 + }, + { + "epoch": 0.08176488738043813, + "grad_norm": 0.8142499327659607, + "learning_rate": 2.65e-05, + "loss": 3.1813, + "step": 265 + }, + { + "epoch": 0.08207343412526998, + "grad_norm": 0.44730252027511597, + "learning_rate": 2.6600000000000003e-05, + "loss": 2.6948, + "step": 266 + }, + { + "epoch": 0.08238198087010182, + "grad_norm": 0.3990883529186249, + "learning_rate": 2.6700000000000002e-05, + "loss": 2.5447, + "step": 267 + }, + { + "epoch": 0.08269052761493366, + "grad_norm": 0.7322868704795837, + "learning_rate": 2.68e-05, + "loss": 2.9178, + "step": 268 + }, + { + "epoch": 0.0829990743597655, + "grad_norm": 0.7331518530845642, + "learning_rate": 2.69e-05, + "loss": 2.953, + "step": 269 + }, + { + "epoch": 0.08330762110459734, + "grad_norm": 0.44720855355262756, + "learning_rate": 2.7000000000000002e-05, + "loss": 2.6966, + "step": 270 + }, + { + "epoch": 0.08361616784942918, + "grad_norm": 0.4966452717781067, + "learning_rate": 2.71e-05, + "loss": 2.9062, + "step": 271 + }, + { + "epoch": 0.08392471459426103, + "grad_norm": 0.6715177893638611, + "learning_rate": 2.72e-05, + "loss": 3.301, + "step": 272 + }, + { + "epoch": 0.08423326133909287, + "grad_norm": 0.5892104506492615, + "learning_rate": 2.7300000000000003e-05, + "loss": 3.0694, + "step": 273 + }, + { + "epoch": 0.08454180808392471, + "grad_norm": 0.7469586730003357, + "learning_rate": 2.7400000000000002e-05, + "loss": 3.3209, + "step": 274 + }, + { + "epoch": 0.08485035482875655, + "grad_norm": 0.41894596815109253, + "learning_rate": 2.75e-05, + "loss": 2.5125, + "step": 275 + }, + { + "epoch": 0.0851589015735884, + "grad_norm": 0.7170995473861694, + "learning_rate": 2.7600000000000003e-05, + "loss": 2.9459, + "step": 276 + }, + { + "epoch": 0.08546744831842024, + "grad_norm": 0.4706147015094757, + "learning_rate": 2.7700000000000002e-05, + "loss": 2.3327, + "step": 277 + }, + { + "epoch": 0.08577599506325208, + "grad_norm": 0.6144742965698242, + "learning_rate": 2.78e-05, + "loss": 2.6478, + "step": 278 + }, + { + "epoch": 0.08608454180808392, + "grad_norm": 0.5859728455543518, + "learning_rate": 2.79e-05, + "loss": 2.6092, + "step": 279 + }, + { + "epoch": 0.08639308855291576, + "grad_norm": 0.5228575468063354, + "learning_rate": 2.8e-05, + "loss": 2.8229, + "step": 280 + }, + { + "epoch": 0.0867016352977476, + "grad_norm": 0.7771852612495422, + "learning_rate": 2.81e-05, + "loss": 3.0858, + "step": 281 + }, + { + "epoch": 0.08701018204257945, + "grad_norm": 0.6817198395729065, + "learning_rate": 2.8199999999999998e-05, + "loss": 2.5982, + "step": 282 + }, + { + "epoch": 0.08731872878741129, + "grad_norm": 0.8393194079399109, + "learning_rate": 2.83e-05, + "loss": 3.2083, + "step": 283 + }, + { + "epoch": 0.08762727553224313, + "grad_norm": 0.5846584439277649, + "learning_rate": 2.84e-05, + "loss": 2.6578, + "step": 284 + }, + { + "epoch": 0.08793582227707497, + "grad_norm": 0.7105833292007446, + "learning_rate": 2.8499999999999998e-05, + "loss": 2.3854, + "step": 285 + }, + { + "epoch": 0.08824436902190681, + "grad_norm": 0.5590451955795288, + "learning_rate": 2.86e-05, + "loss": 2.7077, + "step": 286 + }, + { + "epoch": 0.08855291576673865, + "grad_norm": 0.3551070988178253, + "learning_rate": 2.87e-05, + "loss": 2.3749, + "step": 287 + }, + { + "epoch": 0.0888614625115705, + "grad_norm": 0.6778650283813477, + "learning_rate": 2.88e-05, + "loss": 2.5384, + "step": 288 + }, + { + "epoch": 0.08917000925640234, + "grad_norm": 0.5005120635032654, + "learning_rate": 2.89e-05, + "loss": 2.4651, + "step": 289 + }, + { + "epoch": 0.08947855600123418, + "grad_norm": 0.7873884439468384, + "learning_rate": 2.9e-05, + "loss": 3.0269, + "step": 290 + }, + { + "epoch": 0.08978710274606604, + "grad_norm": 0.8063921928405762, + "learning_rate": 2.91e-05, + "loss": 2.8679, + "step": 291 + }, + { + "epoch": 0.09009564949089788, + "grad_norm": 0.75018310546875, + "learning_rate": 2.92e-05, + "loss": 2.6179, + "step": 292 + }, + { + "epoch": 0.09040419623572972, + "grad_norm": 0.3952403962612152, + "learning_rate": 2.93e-05, + "loss": 2.3271, + "step": 293 + }, + { + "epoch": 0.09071274298056156, + "grad_norm": 0.49011290073394775, + "learning_rate": 2.94e-05, + "loss": 2.8457, + "step": 294 + }, + { + "epoch": 0.0910212897253934, + "grad_norm": 0.720333456993103, + "learning_rate": 2.95e-05, + "loss": 2.5857, + "step": 295 + }, + { + "epoch": 0.09132983647022525, + "grad_norm": 0.4137563705444336, + "learning_rate": 2.96e-05, + "loss": 2.4882, + "step": 296 + }, + { + "epoch": 0.09163838321505709, + "grad_norm": 0.5216336846351624, + "learning_rate": 2.97e-05, + "loss": 2.7284, + "step": 297 + }, + { + "epoch": 0.09194692995988893, + "grad_norm": 0.7018035054206848, + "learning_rate": 2.98e-05, + "loss": 3.4228, + "step": 298 + }, + { + "epoch": 0.09225547670472077, + "grad_norm": 0.5406361818313599, + "learning_rate": 2.9900000000000002e-05, + "loss": 3.1286, + "step": 299 + }, + { + "epoch": 0.09256402344955261, + "grad_norm": 0.48260602355003357, + "learning_rate": 3e-05, + "loss": 2.9251, + "step": 300 + }, + { + "epoch": 0.09287257019438445, + "grad_norm": 0.599492073059082, + "learning_rate": 3e-05, + "loss": 2.9756, + "step": 301 + }, + { + "epoch": 0.0931811169392163, + "grad_norm": 0.4386523962020874, + "learning_rate": 3e-05, + "loss": 2.8846, + "step": 302 + }, + { + "epoch": 0.09348966368404814, + "grad_norm": 0.45703089237213135, + "learning_rate": 3e-05, + "loss": 2.8327, + "step": 303 + }, + { + "epoch": 0.09379821042887998, + "grad_norm": 0.5618659257888794, + "learning_rate": 3e-05, + "loss": 2.442, + "step": 304 + }, + { + "epoch": 0.09410675717371182, + "grad_norm": 0.5637118220329285, + "learning_rate": 3e-05, + "loss": 3.2367, + "step": 305 + }, + { + "epoch": 0.09441530391854366, + "grad_norm": 0.7520857453346252, + "learning_rate": 3e-05, + "loss": 2.925, + "step": 306 + }, + { + "epoch": 0.0947238506633755, + "grad_norm": 0.5939932465553284, + "learning_rate": 3e-05, + "loss": 3.1811, + "step": 307 + }, + { + "epoch": 0.09503239740820735, + "grad_norm": 0.3925078809261322, + "learning_rate": 3e-05, + "loss": 2.8408, + "step": 308 + }, + { + "epoch": 0.09534094415303919, + "grad_norm": 0.415669322013855, + "learning_rate": 3e-05, + "loss": 2.5195, + "step": 309 + }, + { + "epoch": 0.09564949089787103, + "grad_norm": 0.6630593538284302, + "learning_rate": 3e-05, + "loss": 3.2672, + "step": 310 + }, + { + "epoch": 0.09595803764270287, + "grad_norm": 0.4107421934604645, + "learning_rate": 3e-05, + "loss": 2.4262, + "step": 311 + }, + { + "epoch": 0.09626658438753471, + "grad_norm": 0.5593919157981873, + "learning_rate": 3e-05, + "loss": 2.5541, + "step": 312 + }, + { + "epoch": 0.09657513113236656, + "grad_norm": 0.42898955941200256, + "learning_rate": 3e-05, + "loss": 2.5179, + "step": 313 + }, + { + "epoch": 0.0968836778771984, + "grad_norm": 0.31650015711784363, + "learning_rate": 3e-05, + "loss": 2.3117, + "step": 314 + }, + { + "epoch": 0.09719222462203024, + "grad_norm": 0.6258980631828308, + "learning_rate": 3e-05, + "loss": 2.6524, + "step": 315 + }, + { + "epoch": 0.09750077136686208, + "grad_norm": 0.45299839973449707, + "learning_rate": 3e-05, + "loss": 2.689, + "step": 316 + }, + { + "epoch": 0.09780931811169392, + "grad_norm": 0.4562253952026367, + "learning_rate": 3e-05, + "loss": 2.5171, + "step": 317 + }, + { + "epoch": 0.09811786485652577, + "grad_norm": 0.5021069049835205, + "learning_rate": 3e-05, + "loss": 2.4173, + "step": 318 + }, + { + "epoch": 0.09842641160135761, + "grad_norm": 0.664849042892456, + "learning_rate": 3e-05, + "loss": 3.1166, + "step": 319 + }, + { + "epoch": 0.09873495834618945, + "grad_norm": 0.45309123396873474, + "learning_rate": 3e-05, + "loss": 2.7596, + "step": 320 + }, + { + "epoch": 0.09904350509102129, + "grad_norm": 0.7038612365722656, + "learning_rate": 3e-05, + "loss": 2.7522, + "step": 321 + }, + { + "epoch": 0.09935205183585313, + "grad_norm": 1.059171199798584, + "learning_rate": 3e-05, + "loss": 3.175, + "step": 322 + }, + { + "epoch": 0.09966059858068498, + "grad_norm": 0.5896137356758118, + "learning_rate": 3e-05, + "loss": 2.9216, + "step": 323 + }, + { + "epoch": 0.09996914532551682, + "grad_norm": 0.637228786945343, + "learning_rate": 3e-05, + "loss": 2.8087, + "step": 324 + }, + { + "epoch": 0.10027769207034866, + "grad_norm": 0.8929901123046875, + "learning_rate": 3e-05, + "loss": 2.948, + "step": 325 + }, + { + "epoch": 0.1005862388151805, + "grad_norm": 0.48769569396972656, + "learning_rate": 3e-05, + "loss": 2.7344, + "step": 326 + }, + { + "epoch": 0.10089478556001234, + "grad_norm": 0.507175862789154, + "learning_rate": 3e-05, + "loss": 2.5428, + "step": 327 + }, + { + "epoch": 0.10120333230484418, + "grad_norm": 1.1506534814834595, + "learning_rate": 3e-05, + "loss": 2.9784, + "step": 328 + }, + { + "epoch": 0.10151187904967603, + "grad_norm": 0.8921215534210205, + "learning_rate": 3e-05, + "loss": 3.1315, + "step": 329 + }, + { + "epoch": 0.10182042579450787, + "grad_norm": 0.9674636125564575, + "learning_rate": 3e-05, + "loss": 2.9945, + "step": 330 + }, + { + "epoch": 0.10212897253933971, + "grad_norm": 0.7912929058074951, + "learning_rate": 3e-05, + "loss": 2.7517, + "step": 331 + }, + { + "epoch": 0.10243751928417155, + "grad_norm": 0.6095521450042725, + "learning_rate": 3e-05, + "loss": 3.4603, + "step": 332 + }, + { + "epoch": 0.1027460660290034, + "grad_norm": 0.3913911283016205, + "learning_rate": 3e-05, + "loss": 2.3178, + "step": 333 + }, + { + "epoch": 0.10305461277383524, + "grad_norm": 0.5703157782554626, + "learning_rate": 3e-05, + "loss": 2.7245, + "step": 334 + }, + { + "epoch": 0.10336315951866708, + "grad_norm": 0.4117796719074249, + "learning_rate": 3e-05, + "loss": 2.2675, + "step": 335 + }, + { + "epoch": 0.10367170626349892, + "grad_norm": 0.4819619357585907, + "learning_rate": 3e-05, + "loss": 2.5655, + "step": 336 + }, + { + "epoch": 0.10398025300833076, + "grad_norm": 0.579757809638977, + "learning_rate": 3e-05, + "loss": 2.8346, + "step": 337 + }, + { + "epoch": 0.1042887997531626, + "grad_norm": 0.5951722860336304, + "learning_rate": 3e-05, + "loss": 3.3011, + "step": 338 + }, + { + "epoch": 0.10459734649799445, + "grad_norm": 0.37846288084983826, + "learning_rate": 3e-05, + "loss": 2.4172, + "step": 339 + }, + { + "epoch": 0.10490589324282629, + "grad_norm": 0.5596647262573242, + "learning_rate": 3e-05, + "loss": 2.8481, + "step": 340 + }, + { + "epoch": 0.10521443998765813, + "grad_norm": 0.4432225227355957, + "learning_rate": 3e-05, + "loss": 2.9247, + "step": 341 + }, + { + "epoch": 0.10552298673248997, + "grad_norm": 0.5610328316688538, + "learning_rate": 3e-05, + "loss": 3.0414, + "step": 342 + }, + { + "epoch": 0.10583153347732181, + "grad_norm": 0.6313160061836243, + "learning_rate": 3e-05, + "loss": 2.7949, + "step": 343 + }, + { + "epoch": 0.10614008022215365, + "grad_norm": 0.5100327730178833, + "learning_rate": 3e-05, + "loss": 2.9491, + "step": 344 + }, + { + "epoch": 0.1064486269669855, + "grad_norm": 0.4908035099506378, + "learning_rate": 3e-05, + "loss": 2.727, + "step": 345 + }, + { + "epoch": 0.10675717371181734, + "grad_norm": 0.5060238242149353, + "learning_rate": 3e-05, + "loss": 2.4032, + "step": 346 + }, + { + "epoch": 0.10706572045664918, + "grad_norm": 0.5218526124954224, + "learning_rate": 3e-05, + "loss": 2.8393, + "step": 347 + }, + { + "epoch": 0.10737426720148102, + "grad_norm": 0.45432546734809875, + "learning_rate": 3e-05, + "loss": 2.3492, + "step": 348 + }, + { + "epoch": 0.10768281394631286, + "grad_norm": 0.39769670367240906, + "learning_rate": 3e-05, + "loss": 2.4955, + "step": 349 + }, + { + "epoch": 0.1079913606911447, + "grad_norm": 0.4049716591835022, + "learning_rate": 3e-05, + "loss": 2.5978, + "step": 350 + }, + { + "epoch": 0.10829990743597655, + "grad_norm": 0.43709230422973633, + "learning_rate": 3e-05, + "loss": 2.4538, + "step": 351 + }, + { + "epoch": 0.10860845418080839, + "grad_norm": 0.6582475900650024, + "learning_rate": 3e-05, + "loss": 2.7746, + "step": 352 + }, + { + "epoch": 0.10891700092564023, + "grad_norm": 0.5041898488998413, + "learning_rate": 3e-05, + "loss": 2.8402, + "step": 353 + }, + { + "epoch": 0.10922554767047207, + "grad_norm": 0.4751797318458557, + "learning_rate": 3e-05, + "loss": 2.5371, + "step": 354 + }, + { + "epoch": 0.10953409441530391, + "grad_norm": 0.8427959084510803, + "learning_rate": 3e-05, + "loss": 2.6777, + "step": 355 + }, + { + "epoch": 0.10984264116013576, + "grad_norm": 0.41446399688720703, + "learning_rate": 3e-05, + "loss": 2.5396, + "step": 356 + }, + { + "epoch": 0.1101511879049676, + "grad_norm": 0.6712360978126526, + "learning_rate": 3e-05, + "loss": 3.3444, + "step": 357 + }, + { + "epoch": 0.11045973464979944, + "grad_norm": 0.6713085174560547, + "learning_rate": 3e-05, + "loss": 3.2281, + "step": 358 + }, + { + "epoch": 0.11076828139463128, + "grad_norm": 0.9303755164146423, + "learning_rate": 3e-05, + "loss": 3.2125, + "step": 359 + }, + { + "epoch": 0.11107682813946312, + "grad_norm": 0.4587060511112213, + "learning_rate": 3e-05, + "loss": 2.786, + "step": 360 + }, + { + "epoch": 0.11138537488429497, + "grad_norm": 0.7472328543663025, + "learning_rate": 3e-05, + "loss": 2.8521, + "step": 361 + }, + { + "epoch": 0.11169392162912681, + "grad_norm": 0.502673864364624, + "learning_rate": 3e-05, + "loss": 2.7606, + "step": 362 + }, + { + "epoch": 0.11200246837395865, + "grad_norm": 0.631190836429596, + "learning_rate": 3e-05, + "loss": 2.9696, + "step": 363 + }, + { + "epoch": 0.11231101511879049, + "grad_norm": 0.6717808842658997, + "learning_rate": 3e-05, + "loss": 3.1333, + "step": 364 + }, + { + "epoch": 0.11261956186362233, + "grad_norm": 0.8173141479492188, + "learning_rate": 3e-05, + "loss": 3.5001, + "step": 365 + }, + { + "epoch": 0.11292810860845418, + "grad_norm": 0.5130720138549805, + "learning_rate": 3e-05, + "loss": 3.0127, + "step": 366 + }, + { + "epoch": 0.11323665535328602, + "grad_norm": 0.5507920980453491, + "learning_rate": 3e-05, + "loss": 2.6635, + "step": 367 + }, + { + "epoch": 0.11354520209811786, + "grad_norm": 0.5814347267150879, + "learning_rate": 3e-05, + "loss": 2.5873, + "step": 368 + }, + { + "epoch": 0.1138537488429497, + "grad_norm": 0.4552783668041229, + "learning_rate": 3e-05, + "loss": 2.2747, + "step": 369 + }, + { + "epoch": 0.11416229558778154, + "grad_norm": 0.48348626494407654, + "learning_rate": 3e-05, + "loss": 2.5721, + "step": 370 + }, + { + "epoch": 0.11447084233261338, + "grad_norm": 0.7298465967178345, + "learning_rate": 3e-05, + "loss": 3.0346, + "step": 371 + }, + { + "epoch": 0.11477938907744523, + "grad_norm": 0.4409029483795166, + "learning_rate": 3e-05, + "loss": 2.7849, + "step": 372 + }, + { + "epoch": 0.11508793582227707, + "grad_norm": 0.47219765186309814, + "learning_rate": 3e-05, + "loss": 2.7278, + "step": 373 + }, + { + "epoch": 0.11539648256710892, + "grad_norm": 0.6541340947151184, + "learning_rate": 3e-05, + "loss": 2.9001, + "step": 374 + }, + { + "epoch": 0.11570502931194077, + "grad_norm": 0.696343183517456, + "learning_rate": 3e-05, + "loss": 2.6636, + "step": 375 + }, + { + "epoch": 0.11601357605677261, + "grad_norm": 0.8152115941047668, + "learning_rate": 3e-05, + "loss": 2.7692, + "step": 376 + }, + { + "epoch": 0.11632212280160445, + "grad_norm": 0.5533241033554077, + "learning_rate": 3e-05, + "loss": 2.6548, + "step": 377 + }, + { + "epoch": 0.11663066954643629, + "grad_norm": 0.7412006855010986, + "learning_rate": 3e-05, + "loss": 2.847, + "step": 378 + }, + { + "epoch": 0.11693921629126813, + "grad_norm": 0.3919863700866699, + "learning_rate": 3e-05, + "loss": 2.4736, + "step": 379 + }, + { + "epoch": 0.11724776303609998, + "grad_norm": 1.019760251045227, + "learning_rate": 3e-05, + "loss": 3.2336, + "step": 380 + }, + { + "epoch": 0.11755630978093182, + "grad_norm": 0.6420919895172119, + "learning_rate": 3e-05, + "loss": 2.887, + "step": 381 + }, + { + "epoch": 0.11786485652576366, + "grad_norm": 0.3596365749835968, + "learning_rate": 3e-05, + "loss": 2.2894, + "step": 382 + }, + { + "epoch": 0.1181734032705955, + "grad_norm": 0.5054413676261902, + "learning_rate": 3e-05, + "loss": 2.1737, + "step": 383 + }, + { + "epoch": 0.11848195001542734, + "grad_norm": 0.6912227869033813, + "learning_rate": 3e-05, + "loss": 2.8684, + "step": 384 + }, + { + "epoch": 0.11879049676025918, + "grad_norm": 0.5603222846984863, + "learning_rate": 3e-05, + "loss": 2.9749, + "step": 385 + }, + { + "epoch": 0.11909904350509103, + "grad_norm": 0.4751299321651459, + "learning_rate": 3e-05, + "loss": 2.743, + "step": 386 + }, + { + "epoch": 0.11940759024992287, + "grad_norm": 0.9485656023025513, + "learning_rate": 3e-05, + "loss": 3.1394, + "step": 387 + }, + { + "epoch": 0.11971613699475471, + "grad_norm": 0.528243362903595, + "learning_rate": 3e-05, + "loss": 3.0556, + "step": 388 + }, + { + "epoch": 0.12002468373958655, + "grad_norm": 0.48346731066703796, + "learning_rate": 3e-05, + "loss": 2.92, + "step": 389 + }, + { + "epoch": 0.1203332304844184, + "grad_norm": 0.8272796869277954, + "learning_rate": 3e-05, + "loss": 2.78, + "step": 390 + }, + { + "epoch": 0.12064177722925024, + "grad_norm": 0.5784688591957092, + "learning_rate": 3e-05, + "loss": 2.7566, + "step": 391 + }, + { + "epoch": 0.12095032397408208, + "grad_norm": 0.5071232914924622, + "learning_rate": 3e-05, + "loss": 2.7876, + "step": 392 + }, + { + "epoch": 0.12125887071891392, + "grad_norm": 0.4624871015548706, + "learning_rate": 3e-05, + "loss": 2.6045, + "step": 393 + }, + { + "epoch": 0.12156741746374576, + "grad_norm": 0.7949879169464111, + "learning_rate": 3e-05, + "loss": 3.0567, + "step": 394 + }, + { + "epoch": 0.1218759642085776, + "grad_norm": 0.4112931191921234, + "learning_rate": 3e-05, + "loss": 2.5882, + "step": 395 + }, + { + "epoch": 0.12218451095340944, + "grad_norm": 0.508385956287384, + "learning_rate": 3e-05, + "loss": 2.8354, + "step": 396 + }, + { + "epoch": 0.12249305769824129, + "grad_norm": 0.571725606918335, + "learning_rate": 3e-05, + "loss": 3.1637, + "step": 397 + }, + { + "epoch": 0.12280160444307313, + "grad_norm": 0.4010452628135681, + "learning_rate": 3e-05, + "loss": 2.2202, + "step": 398 + }, + { + "epoch": 0.12311015118790497, + "grad_norm": 0.6162316203117371, + "learning_rate": 3e-05, + "loss": 3.0857, + "step": 399 + }, + { + "epoch": 0.12341869793273681, + "grad_norm": 0.5714832544326782, + "learning_rate": 3e-05, + "loss": 2.6672, + "step": 400 + }, + { + "epoch": 0.12372724467756865, + "grad_norm": 0.7332398295402527, + "learning_rate": 3e-05, + "loss": 2.6827, + "step": 401 + }, + { + "epoch": 0.1240357914224005, + "grad_norm": 0.8438281416893005, + "learning_rate": 3e-05, + "loss": 3.2429, + "step": 402 + }, + { + "epoch": 0.12434433816723234, + "grad_norm": 0.5126581192016602, + "learning_rate": 3e-05, + "loss": 2.7263, + "step": 403 + }, + { + "epoch": 0.12465288491206418, + "grad_norm": 1.201810598373413, + "learning_rate": 3e-05, + "loss": 2.9671, + "step": 404 + }, + { + "epoch": 0.12496143165689602, + "grad_norm": 0.5391373634338379, + "learning_rate": 3e-05, + "loss": 2.596, + "step": 405 + }, + { + "epoch": 0.12526997840172785, + "grad_norm": 0.49124547839164734, + "learning_rate": 3e-05, + "loss": 2.772, + "step": 406 + }, + { + "epoch": 0.1255785251465597, + "grad_norm": 0.6671253442764282, + "learning_rate": 3e-05, + "loss": 2.4786, + "step": 407 + }, + { + "epoch": 0.12588707189139153, + "grad_norm": 0.671453058719635, + "learning_rate": 3e-05, + "loss": 2.7801, + "step": 408 + }, + { + "epoch": 0.1261956186362234, + "grad_norm": 0.5589339137077332, + "learning_rate": 3e-05, + "loss": 2.4786, + "step": 409 + }, + { + "epoch": 0.12650416538105522, + "grad_norm": 0.48147571086883545, + "learning_rate": 3e-05, + "loss": 2.5111, + "step": 410 + }, + { + "epoch": 0.12681271212588707, + "grad_norm": 0.7984548807144165, + "learning_rate": 3e-05, + "loss": 3.3761, + "step": 411 + }, + { + "epoch": 0.1271212588707189, + "grad_norm": 0.4393971264362335, + "learning_rate": 3e-05, + "loss": 2.6791, + "step": 412 + }, + { + "epoch": 0.12742980561555076, + "grad_norm": 0.3642028868198395, + "learning_rate": 3e-05, + "loss": 2.3538, + "step": 413 + }, + { + "epoch": 0.12773835236038258, + "grad_norm": 0.4370051324367523, + "learning_rate": 3e-05, + "loss": 2.6664, + "step": 414 + }, + { + "epoch": 0.12804689910521444, + "grad_norm": 0.4502509534358978, + "learning_rate": 3e-05, + "loss": 2.4552, + "step": 415 + }, + { + "epoch": 0.1283554458500463, + "grad_norm": 0.8213755488395691, + "learning_rate": 3e-05, + "loss": 3.2743, + "step": 416 + }, + { + "epoch": 0.12866399259487812, + "grad_norm": 0.3885883390903473, + "learning_rate": 3e-05, + "loss": 2.625, + "step": 417 + }, + { + "epoch": 0.12897253933970998, + "grad_norm": 0.672049343585968, + "learning_rate": 3e-05, + "loss": 2.5793, + "step": 418 + }, + { + "epoch": 0.1292810860845418, + "grad_norm": 0.8375455141067505, + "learning_rate": 3e-05, + "loss": 3.2976, + "step": 419 + }, + { + "epoch": 0.12958963282937366, + "grad_norm": 0.29910895228385925, + "learning_rate": 3e-05, + "loss": 2.1132, + "step": 420 + }, + { + "epoch": 0.1298981795742055, + "grad_norm": 0.5657820701599121, + "learning_rate": 3e-05, + "loss": 3.0597, + "step": 421 + }, + { + "epoch": 0.13020672631903735, + "grad_norm": 0.5639760494232178, + "learning_rate": 3e-05, + "loss": 2.8884, + "step": 422 + }, + { + "epoch": 0.13051527306386917, + "grad_norm": 0.567143976688385, + "learning_rate": 3e-05, + "loss": 3.3492, + "step": 423 + }, + { + "epoch": 0.13082381980870103, + "grad_norm": 0.42509710788726807, + "learning_rate": 3e-05, + "loss": 2.754, + "step": 424 + }, + { + "epoch": 0.13113236655353286, + "grad_norm": 0.5575029850006104, + "learning_rate": 3e-05, + "loss": 2.7359, + "step": 425 + }, + { + "epoch": 0.13144091329836471, + "grad_norm": 0.577087938785553, + "learning_rate": 3e-05, + "loss": 2.4267, + "step": 426 + }, + { + "epoch": 0.13174946004319654, + "grad_norm": 0.6447242498397827, + "learning_rate": 3e-05, + "loss": 2.7924, + "step": 427 + }, + { + "epoch": 0.1320580067880284, + "grad_norm": 0.5749005675315857, + "learning_rate": 3e-05, + "loss": 2.9875, + "step": 428 + }, + { + "epoch": 0.13236655353286023, + "grad_norm": 0.5711660385131836, + "learning_rate": 3e-05, + "loss": 2.7619, + "step": 429 + }, + { + "epoch": 0.13267510027769208, + "grad_norm": 0.9919552803039551, + "learning_rate": 3e-05, + "loss": 3.5351, + "step": 430 + }, + { + "epoch": 0.1329836470225239, + "grad_norm": 0.5573298335075378, + "learning_rate": 3e-05, + "loss": 2.6573, + "step": 431 + }, + { + "epoch": 0.13329219376735577, + "grad_norm": 0.6087166666984558, + "learning_rate": 3e-05, + "loss": 2.871, + "step": 432 + }, + { + "epoch": 0.1336007405121876, + "grad_norm": 0.9867172241210938, + "learning_rate": 3e-05, + "loss": 2.6612, + "step": 433 + }, + { + "epoch": 0.13390928725701945, + "grad_norm": 0.4709426760673523, + "learning_rate": 3e-05, + "loss": 2.494, + "step": 434 + }, + { + "epoch": 0.13421783400185128, + "grad_norm": 0.8016806244850159, + "learning_rate": 3e-05, + "loss": 3.5046, + "step": 435 + }, + { + "epoch": 0.13452638074668313, + "grad_norm": 0.8725690245628357, + "learning_rate": 3e-05, + "loss": 3.0465, + "step": 436 + }, + { + "epoch": 0.13483492749151496, + "grad_norm": 0.4316865801811218, + "learning_rate": 3e-05, + "loss": 2.5318, + "step": 437 + }, + { + "epoch": 0.13514347423634682, + "grad_norm": 0.5138392448425293, + "learning_rate": 3e-05, + "loss": 2.2298, + "step": 438 + }, + { + "epoch": 0.13545202098117864, + "grad_norm": 0.5295316576957703, + "learning_rate": 3e-05, + "loss": 2.3288, + "step": 439 + }, + { + "epoch": 0.1357605677260105, + "grad_norm": 0.3984488248825073, + "learning_rate": 3e-05, + "loss": 2.3586, + "step": 440 + }, + { + "epoch": 0.13606911447084233, + "grad_norm": 0.8097387552261353, + "learning_rate": 3e-05, + "loss": 3.2978, + "step": 441 + }, + { + "epoch": 0.13637766121567418, + "grad_norm": 0.6348075270652771, + "learning_rate": 3e-05, + "loss": 2.7686, + "step": 442 + }, + { + "epoch": 0.136686207960506, + "grad_norm": 0.6103842854499817, + "learning_rate": 3e-05, + "loss": 2.8613, + "step": 443 + }, + { + "epoch": 0.13699475470533787, + "grad_norm": 0.5461227297782898, + "learning_rate": 3e-05, + "loss": 2.8534, + "step": 444 + }, + { + "epoch": 0.1373033014501697, + "grad_norm": 0.6284303069114685, + "learning_rate": 3e-05, + "loss": 2.6065, + "step": 445 + }, + { + "epoch": 0.13761184819500155, + "grad_norm": 0.3893953561782837, + "learning_rate": 3e-05, + "loss": 2.599, + "step": 446 + }, + { + "epoch": 0.13792039493983338, + "grad_norm": 0.34796831011772156, + "learning_rate": 3e-05, + "loss": 2.2519, + "step": 447 + }, + { + "epoch": 0.13822894168466524, + "grad_norm": 0.4476280212402344, + "learning_rate": 3e-05, + "loss": 2.4558, + "step": 448 + }, + { + "epoch": 0.13853748842949706, + "grad_norm": 0.5699636936187744, + "learning_rate": 3e-05, + "loss": 2.6878, + "step": 449 + }, + { + "epoch": 0.13884603517432892, + "grad_norm": 0.2940067648887634, + "learning_rate": 3e-05, + "loss": 2.281, + "step": 450 + }, + { + "epoch": 0.13915458191916075, + "grad_norm": 0.9075848460197449, + "learning_rate": 3e-05, + "loss": 3.2938, + "step": 451 + }, + { + "epoch": 0.1394631286639926, + "grad_norm": 0.4113154113292694, + "learning_rate": 3e-05, + "loss": 2.5557, + "step": 452 + }, + { + "epoch": 0.13977167540882443, + "grad_norm": 0.5481420755386353, + "learning_rate": 3e-05, + "loss": 2.5868, + "step": 453 + }, + { + "epoch": 0.1400802221536563, + "grad_norm": 0.4203355312347412, + "learning_rate": 3e-05, + "loss": 2.5495, + "step": 454 + }, + { + "epoch": 0.14038876889848811, + "grad_norm": 0.40559303760528564, + "learning_rate": 3e-05, + "loss": 2.5116, + "step": 455 + }, + { + "epoch": 0.14069731564331997, + "grad_norm": 0.5221837162971497, + "learning_rate": 3e-05, + "loss": 2.8499, + "step": 456 + }, + { + "epoch": 0.1410058623881518, + "grad_norm": 0.48368218541145325, + "learning_rate": 3e-05, + "loss": 2.6947, + "step": 457 + }, + { + "epoch": 0.14131440913298365, + "grad_norm": 0.4436626732349396, + "learning_rate": 3e-05, + "loss": 2.5644, + "step": 458 + }, + { + "epoch": 0.14162295587781548, + "grad_norm": 0.5758005380630493, + "learning_rate": 3e-05, + "loss": 2.5727, + "step": 459 + }, + { + "epoch": 0.14193150262264734, + "grad_norm": 0.2992756962776184, + "learning_rate": 3e-05, + "loss": 2.1367, + "step": 460 + }, + { + "epoch": 0.14224004936747917, + "grad_norm": 0.4820541739463806, + "learning_rate": 3e-05, + "loss": 2.8957, + "step": 461 + }, + { + "epoch": 0.14254859611231102, + "grad_norm": 0.6373788118362427, + "learning_rate": 3e-05, + "loss": 2.9102, + "step": 462 + }, + { + "epoch": 0.14285714285714285, + "grad_norm": 0.49387943744659424, + "learning_rate": 3e-05, + "loss": 2.9304, + "step": 463 + }, + { + "epoch": 0.1431656896019747, + "grad_norm": 0.3972255289554596, + "learning_rate": 3e-05, + "loss": 2.5099, + "step": 464 + }, + { + "epoch": 0.14347423634680653, + "grad_norm": 0.49275124073028564, + "learning_rate": 3e-05, + "loss": 2.6418, + "step": 465 + }, + { + "epoch": 0.1437827830916384, + "grad_norm": 0.39669597148895264, + "learning_rate": 3e-05, + "loss": 2.3448, + "step": 466 + }, + { + "epoch": 0.14409132983647022, + "grad_norm": 0.6432366967201233, + "learning_rate": 3e-05, + "loss": 2.9027, + "step": 467 + }, + { + "epoch": 0.14439987658130207, + "grad_norm": 0.43117955327033997, + "learning_rate": 3e-05, + "loss": 2.7841, + "step": 468 + }, + { + "epoch": 0.1447084233261339, + "grad_norm": 0.4256863296031952, + "learning_rate": 3e-05, + "loss": 2.5825, + "step": 469 + }, + { + "epoch": 0.14501697007096576, + "grad_norm": 0.43717724084854126, + "learning_rate": 3e-05, + "loss": 2.6433, + "step": 470 + }, + { + "epoch": 0.14532551681579758, + "grad_norm": 0.759493887424469, + "learning_rate": 3e-05, + "loss": 2.7323, + "step": 471 + }, + { + "epoch": 0.14563406356062944, + "grad_norm": 0.33263617753982544, + "learning_rate": 3e-05, + "loss": 2.417, + "step": 472 + }, + { + "epoch": 0.14594261030546127, + "grad_norm": 0.34977760910987854, + "learning_rate": 3e-05, + "loss": 2.3743, + "step": 473 + }, + { + "epoch": 0.14625115705029312, + "grad_norm": 0.5000836253166199, + "learning_rate": 3e-05, + "loss": 2.5705, + "step": 474 + }, + { + "epoch": 0.14655970379512495, + "grad_norm": 1.2762246131896973, + "learning_rate": 3e-05, + "loss": 3.133, + "step": 475 + }, + { + "epoch": 0.1468682505399568, + "grad_norm": 0.6496342420578003, + "learning_rate": 3e-05, + "loss": 2.9256, + "step": 476 + }, + { + "epoch": 0.14717679728478864, + "grad_norm": 0.716373085975647, + "learning_rate": 3e-05, + "loss": 2.5074, + "step": 477 + }, + { + "epoch": 0.1474853440296205, + "grad_norm": 0.7680580019950867, + "learning_rate": 3e-05, + "loss": 2.8563, + "step": 478 + }, + { + "epoch": 0.14779389077445232, + "grad_norm": 0.4760282337665558, + "learning_rate": 3e-05, + "loss": 2.4567, + "step": 479 + }, + { + "epoch": 0.14810243751928417, + "grad_norm": 0.7085632681846619, + "learning_rate": 3e-05, + "loss": 3.3149, + "step": 480 + }, + { + "epoch": 0.148410984264116, + "grad_norm": 0.48510226607322693, + "learning_rate": 3e-05, + "loss": 2.9825, + "step": 481 + }, + { + "epoch": 0.14871953100894786, + "grad_norm": 0.5194237232208252, + "learning_rate": 3e-05, + "loss": 2.8049, + "step": 482 + }, + { + "epoch": 0.1490280777537797, + "grad_norm": 0.6330525279045105, + "learning_rate": 3e-05, + "loss": 3.0564, + "step": 483 + }, + { + "epoch": 0.14933662449861154, + "grad_norm": 0.38547223806381226, + "learning_rate": 3e-05, + "loss": 2.7878, + "step": 484 + }, + { + "epoch": 0.14964517124344337, + "grad_norm": 0.5610853433609009, + "learning_rate": 3e-05, + "loss": 3.0895, + "step": 485 + }, + { + "epoch": 0.14995371798827523, + "grad_norm": 0.6889973878860474, + "learning_rate": 3e-05, + "loss": 3.396, + "step": 486 + }, + { + "epoch": 0.15026226473310705, + "grad_norm": 0.374765008687973, + "learning_rate": 3e-05, + "loss": 2.2703, + "step": 487 + }, + { + "epoch": 0.1505708114779389, + "grad_norm": 0.5405288338661194, + "learning_rate": 3e-05, + "loss": 2.8522, + "step": 488 + }, + { + "epoch": 0.15087935822277074, + "grad_norm": 0.4556944668292999, + "learning_rate": 3e-05, + "loss": 2.9866, + "step": 489 + }, + { + "epoch": 0.1511879049676026, + "grad_norm": 0.326506108045578, + "learning_rate": 3e-05, + "loss": 2.2067, + "step": 490 + }, + { + "epoch": 0.15149645171243442, + "grad_norm": 0.3182726204395294, + "learning_rate": 3e-05, + "loss": 2.134, + "step": 491 + }, + { + "epoch": 0.15180499845726628, + "grad_norm": 0.3820968568325043, + "learning_rate": 3e-05, + "loss": 2.5072, + "step": 492 + }, + { + "epoch": 0.1521135452020981, + "grad_norm": 0.43182340264320374, + "learning_rate": 3e-05, + "loss": 2.5903, + "step": 493 + }, + { + "epoch": 0.15242209194692996, + "grad_norm": 0.614032506942749, + "learning_rate": 3e-05, + "loss": 3.0159, + "step": 494 + }, + { + "epoch": 0.1527306386917618, + "grad_norm": 0.34464317560195923, + "learning_rate": 3e-05, + "loss": 2.5565, + "step": 495 + }, + { + "epoch": 0.15303918543659364, + "grad_norm": 0.7838470935821533, + "learning_rate": 3e-05, + "loss": 3.299, + "step": 496 + }, + { + "epoch": 0.15334773218142547, + "grad_norm": 0.7292729616165161, + "learning_rate": 3e-05, + "loss": 3.1353, + "step": 497 + }, + { + "epoch": 0.15365627892625733, + "grad_norm": 0.6598738431930542, + "learning_rate": 3e-05, + "loss": 3.0993, + "step": 498 + }, + { + "epoch": 0.15396482567108918, + "grad_norm": 0.3278651535511017, + "learning_rate": 3e-05, + "loss": 2.4751, + "step": 499 + }, + { + "epoch": 0.154273372415921, + "grad_norm": 0.3951056897640228, + "learning_rate": 3e-05, + "loss": 2.3225, + "step": 500 + }, + { + "epoch": 0.15458191916075287, + "grad_norm": 0.5355321764945984, + "learning_rate": 3e-05, + "loss": 2.6751, + "step": 501 + }, + { + "epoch": 0.1548904659055847, + "grad_norm": 0.4398941993713379, + "learning_rate": 3e-05, + "loss": 2.3192, + "step": 502 + }, + { + "epoch": 0.15519901265041655, + "grad_norm": 0.9720609784126282, + "learning_rate": 3e-05, + "loss": 3.1218, + "step": 503 + }, + { + "epoch": 0.15550755939524838, + "grad_norm": 0.5093293190002441, + "learning_rate": 3e-05, + "loss": 2.6208, + "step": 504 + }, + { + "epoch": 0.15581610614008023, + "grad_norm": 0.5958328247070312, + "learning_rate": 3e-05, + "loss": 2.9366, + "step": 505 + }, + { + "epoch": 0.15612465288491206, + "grad_norm": 1.0065058469772339, + "learning_rate": 3e-05, + "loss": 3.086, + "step": 506 + }, + { + "epoch": 0.15643319962974392, + "grad_norm": 0.9630736708641052, + "learning_rate": 3e-05, + "loss": 2.7257, + "step": 507 + }, + { + "epoch": 0.15674174637457575, + "grad_norm": 0.730649471282959, + "learning_rate": 3e-05, + "loss": 2.6617, + "step": 508 + }, + { + "epoch": 0.1570502931194076, + "grad_norm": 0.9365665316581726, + "learning_rate": 3e-05, + "loss": 3.1122, + "step": 509 + }, + { + "epoch": 0.15735883986423943, + "grad_norm": 0.6446232795715332, + "learning_rate": 3e-05, + "loss": 2.7233, + "step": 510 + }, + { + "epoch": 0.15766738660907129, + "grad_norm": 0.37989816069602966, + "learning_rate": 3e-05, + "loss": 2.6314, + "step": 511 + }, + { + "epoch": 0.15797593335390311, + "grad_norm": 0.7095012068748474, + "learning_rate": 3e-05, + "loss": 3.1006, + "step": 512 + }, + { + "epoch": 0.15828448009873497, + "grad_norm": 0.7077836990356445, + "learning_rate": 3e-05, + "loss": 2.7112, + "step": 513 + }, + { + "epoch": 0.1585930268435668, + "grad_norm": 0.6317113637924194, + "learning_rate": 3e-05, + "loss": 2.8544, + "step": 514 + }, + { + "epoch": 0.15890157358839865, + "grad_norm": 0.53568035364151, + "learning_rate": 3e-05, + "loss": 2.8805, + "step": 515 + }, + { + "epoch": 0.15921012033323048, + "grad_norm": 0.42564329504966736, + "learning_rate": 3e-05, + "loss": 2.4129, + "step": 516 + }, + { + "epoch": 0.15951866707806234, + "grad_norm": 0.8857656121253967, + "learning_rate": 3e-05, + "loss": 3.2094, + "step": 517 + }, + { + "epoch": 0.15982721382289417, + "grad_norm": 0.717786431312561, + "learning_rate": 3e-05, + "loss": 3.2728, + "step": 518 + }, + { + "epoch": 0.16013576056772602, + "grad_norm": 0.5533412098884583, + "learning_rate": 3e-05, + "loss": 3.1927, + "step": 519 + }, + { + "epoch": 0.16044430731255785, + "grad_norm": 0.5395249724388123, + "learning_rate": 3e-05, + "loss": 2.2287, + "step": 520 + }, + { + "epoch": 0.1607528540573897, + "grad_norm": 1.1015528440475464, + "learning_rate": 3e-05, + "loss": 3.1084, + "step": 521 + }, + { + "epoch": 0.16106140080222153, + "grad_norm": 0.4234045743942261, + "learning_rate": 3e-05, + "loss": 2.7718, + "step": 522 + }, + { + "epoch": 0.1613699475470534, + "grad_norm": 0.5807662606239319, + "learning_rate": 3e-05, + "loss": 2.5903, + "step": 523 + }, + { + "epoch": 0.16167849429188522, + "grad_norm": 0.6421242356300354, + "learning_rate": 3e-05, + "loss": 2.7322, + "step": 524 + }, + { + "epoch": 0.16198704103671707, + "grad_norm": 0.5576215386390686, + "learning_rate": 3e-05, + "loss": 2.4455, + "step": 525 + }, + { + "epoch": 0.1622955877815489, + "grad_norm": 0.3627455234527588, + "learning_rate": 3e-05, + "loss": 2.6403, + "step": 526 + }, + { + "epoch": 0.16260413452638076, + "grad_norm": 0.6139925122261047, + "learning_rate": 3e-05, + "loss": 2.7791, + "step": 527 + }, + { + "epoch": 0.16291268127121258, + "grad_norm": 0.475038081407547, + "learning_rate": 3e-05, + "loss": 2.3931, + "step": 528 + }, + { + "epoch": 0.16322122801604444, + "grad_norm": 0.5573770999908447, + "learning_rate": 3e-05, + "loss": 3.0576, + "step": 529 + }, + { + "epoch": 0.16352977476087627, + "grad_norm": 0.3340505361557007, + "learning_rate": 3e-05, + "loss": 2.427, + "step": 530 + }, + { + "epoch": 0.16383832150570812, + "grad_norm": 0.3796028792858124, + "learning_rate": 3e-05, + "loss": 2.623, + "step": 531 + }, + { + "epoch": 0.16414686825053995, + "grad_norm": 0.499763160943985, + "learning_rate": 3e-05, + "loss": 2.6086, + "step": 532 + }, + { + "epoch": 0.1644554149953718, + "grad_norm": 0.5351141691207886, + "learning_rate": 3e-05, + "loss": 2.773, + "step": 533 + }, + { + "epoch": 0.16476396174020363, + "grad_norm": 0.5103338956832886, + "learning_rate": 3e-05, + "loss": 2.8165, + "step": 534 + }, + { + "epoch": 0.1650725084850355, + "grad_norm": 0.49195244908332825, + "learning_rate": 3e-05, + "loss": 2.8805, + "step": 535 + }, + { + "epoch": 0.16538105522986732, + "grad_norm": 0.6390337347984314, + "learning_rate": 3e-05, + "loss": 2.556, + "step": 536 + }, + { + "epoch": 0.16568960197469917, + "grad_norm": 0.6301625370979309, + "learning_rate": 3e-05, + "loss": 2.8134, + "step": 537 + }, + { + "epoch": 0.165998148719531, + "grad_norm": 0.30376285314559937, + "learning_rate": 3e-05, + "loss": 2.1669, + "step": 538 + }, + { + "epoch": 0.16630669546436286, + "grad_norm": 0.4260924756526947, + "learning_rate": 3e-05, + "loss": 2.578, + "step": 539 + }, + { + "epoch": 0.16661524220919469, + "grad_norm": 0.9274812936782837, + "learning_rate": 3e-05, + "loss": 3.1846, + "step": 540 + }, + { + "epoch": 0.16692378895402654, + "grad_norm": 0.31101086735725403, + "learning_rate": 3e-05, + "loss": 2.3578, + "step": 541 + }, + { + "epoch": 0.16723233569885837, + "grad_norm": 0.4664568603038788, + "learning_rate": 3e-05, + "loss": 2.6042, + "step": 542 + }, + { + "epoch": 0.16754088244369023, + "grad_norm": 0.41750529408454895, + "learning_rate": 3e-05, + "loss": 2.707, + "step": 543 + }, + { + "epoch": 0.16784942918852205, + "grad_norm": 0.745094895362854, + "learning_rate": 3e-05, + "loss": 2.5031, + "step": 544 + }, + { + "epoch": 0.1681579759333539, + "grad_norm": 0.53835129737854, + "learning_rate": 3e-05, + "loss": 2.6455, + "step": 545 + }, + { + "epoch": 0.16846652267818574, + "grad_norm": 0.47485771775245667, + "learning_rate": 3e-05, + "loss": 2.8771, + "step": 546 + }, + { + "epoch": 0.1687750694230176, + "grad_norm": 0.427839457988739, + "learning_rate": 3e-05, + "loss": 2.4684, + "step": 547 + }, + { + "epoch": 0.16908361616784942, + "grad_norm": 0.459409236907959, + "learning_rate": 3e-05, + "loss": 2.834, + "step": 548 + }, + { + "epoch": 0.16939216291268128, + "grad_norm": 0.594539225101471, + "learning_rate": 3e-05, + "loss": 2.8524, + "step": 549 + }, + { + "epoch": 0.1697007096575131, + "grad_norm": 0.3290991485118866, + "learning_rate": 3e-05, + "loss": 2.4509, + "step": 550 + }, + { + "epoch": 0.17000925640234496, + "grad_norm": 0.6383848786354065, + "learning_rate": 3e-05, + "loss": 3.06, + "step": 551 + }, + { + "epoch": 0.1703178031471768, + "grad_norm": 0.4995775818824768, + "learning_rate": 3e-05, + "loss": 2.6543, + "step": 552 + }, + { + "epoch": 0.17062634989200864, + "grad_norm": 0.2920311391353607, + "learning_rate": 3e-05, + "loss": 2.3851, + "step": 553 + }, + { + "epoch": 0.17093489663684047, + "grad_norm": 0.37657687067985535, + "learning_rate": 3e-05, + "loss": 2.5885, + "step": 554 + }, + { + "epoch": 0.17124344338167233, + "grad_norm": 0.3489648997783661, + "learning_rate": 3e-05, + "loss": 2.3189, + "step": 555 + }, + { + "epoch": 0.17155199012650416, + "grad_norm": 0.5701825618743896, + "learning_rate": 3e-05, + "loss": 2.7273, + "step": 556 + }, + { + "epoch": 0.171860536871336, + "grad_norm": 0.4302188456058502, + "learning_rate": 3e-05, + "loss": 2.7104, + "step": 557 + }, + { + "epoch": 0.17216908361616784, + "grad_norm": 0.39768311381340027, + "learning_rate": 3e-05, + "loss": 2.7275, + "step": 558 + }, + { + "epoch": 0.1724776303609997, + "grad_norm": 0.3526521623134613, + "learning_rate": 3e-05, + "loss": 2.4574, + "step": 559 + }, + { + "epoch": 0.17278617710583152, + "grad_norm": 0.4580765962600708, + "learning_rate": 3e-05, + "loss": 2.6142, + "step": 560 + }, + { + "epoch": 0.17309472385066338, + "grad_norm": 0.6257379055023193, + "learning_rate": 3e-05, + "loss": 2.6212, + "step": 561 + }, + { + "epoch": 0.1734032705954952, + "grad_norm": 0.5225767493247986, + "learning_rate": 3e-05, + "loss": 3.0037, + "step": 562 + }, + { + "epoch": 0.17371181734032706, + "grad_norm": 0.3681781589984894, + "learning_rate": 3e-05, + "loss": 2.4477, + "step": 563 + }, + { + "epoch": 0.1740203640851589, + "grad_norm": 0.3147395849227905, + "learning_rate": 3e-05, + "loss": 2.291, + "step": 564 + }, + { + "epoch": 0.17432891082999075, + "grad_norm": 0.5599997639656067, + "learning_rate": 3e-05, + "loss": 2.761, + "step": 565 + }, + { + "epoch": 0.17463745757482257, + "grad_norm": 0.895828127861023, + "learning_rate": 3e-05, + "loss": 3.2657, + "step": 566 + }, + { + "epoch": 0.17494600431965443, + "grad_norm": 0.48360684514045715, + "learning_rate": 3e-05, + "loss": 3.0389, + "step": 567 + }, + { + "epoch": 0.17525455106448626, + "grad_norm": 0.4734765887260437, + "learning_rate": 3e-05, + "loss": 2.7359, + "step": 568 + }, + { + "epoch": 0.1755630978093181, + "grad_norm": 0.5472472310066223, + "learning_rate": 3e-05, + "loss": 2.5452, + "step": 569 + }, + { + "epoch": 0.17587164455414994, + "grad_norm": 0.5069577693939209, + "learning_rate": 3e-05, + "loss": 3.0763, + "step": 570 + }, + { + "epoch": 0.1761801912989818, + "grad_norm": 0.6229672431945801, + "learning_rate": 3e-05, + "loss": 3.2135, + "step": 571 + }, + { + "epoch": 0.17648873804381363, + "grad_norm": 0.5861048102378845, + "learning_rate": 3e-05, + "loss": 3.4906, + "step": 572 + }, + { + "epoch": 0.17679728478864548, + "grad_norm": 0.4805934429168701, + "learning_rate": 3e-05, + "loss": 2.8969, + "step": 573 + }, + { + "epoch": 0.1771058315334773, + "grad_norm": 0.6094735860824585, + "learning_rate": 3e-05, + "loss": 3.0226, + "step": 574 + }, + { + "epoch": 0.17741437827830916, + "grad_norm": 0.44033339619636536, + "learning_rate": 3e-05, + "loss": 2.4886, + "step": 575 + }, + { + "epoch": 0.177722925023141, + "grad_norm": 1.1240124702453613, + "learning_rate": 3e-05, + "loss": 3.4272, + "step": 576 + }, + { + "epoch": 0.17803147176797285, + "grad_norm": 0.4928964078426361, + "learning_rate": 3e-05, + "loss": 2.3991, + "step": 577 + }, + { + "epoch": 0.17834001851280468, + "grad_norm": 0.45760032534599304, + "learning_rate": 3e-05, + "loss": 2.5567, + "step": 578 + }, + { + "epoch": 0.17864856525763653, + "grad_norm": 0.46122652292251587, + "learning_rate": 3e-05, + "loss": 2.4686, + "step": 579 + }, + { + "epoch": 0.17895711200246836, + "grad_norm": 0.4241897761821747, + "learning_rate": 3e-05, + "loss": 2.3896, + "step": 580 + }, + { + "epoch": 0.17926565874730022, + "grad_norm": 0.7329486012458801, + "learning_rate": 3e-05, + "loss": 2.8377, + "step": 581 + }, + { + "epoch": 0.17957420549213207, + "grad_norm": 0.5408822894096375, + "learning_rate": 3e-05, + "loss": 2.895, + "step": 582 + }, + { + "epoch": 0.1798827522369639, + "grad_norm": 0.4442683756351471, + "learning_rate": 3e-05, + "loss": 2.4398, + "step": 583 + }, + { + "epoch": 0.18019129898179576, + "grad_norm": 0.7230685949325562, + "learning_rate": 3e-05, + "loss": 3.0208, + "step": 584 + }, + { + "epoch": 0.18049984572662758, + "grad_norm": 0.45991250872612, + "learning_rate": 3e-05, + "loss": 2.8429, + "step": 585 + }, + { + "epoch": 0.18080839247145944, + "grad_norm": 0.5567431449890137, + "learning_rate": 3e-05, + "loss": 3.0907, + "step": 586 + }, + { + "epoch": 0.18111693921629127, + "grad_norm": 0.39667779207229614, + "learning_rate": 3e-05, + "loss": 2.5474, + "step": 587 + }, + { + "epoch": 0.18142548596112312, + "grad_norm": 0.4259324371814728, + "learning_rate": 3e-05, + "loss": 2.937, + "step": 588 + }, + { + "epoch": 0.18173403270595495, + "grad_norm": 0.5309762358665466, + "learning_rate": 3e-05, + "loss": 2.904, + "step": 589 + }, + { + "epoch": 0.1820425794507868, + "grad_norm": 0.4646052420139313, + "learning_rate": 3e-05, + "loss": 2.7825, + "step": 590 + }, + { + "epoch": 0.18235112619561863, + "grad_norm": 0.5502670407295227, + "learning_rate": 3e-05, + "loss": 3.3798, + "step": 591 + }, + { + "epoch": 0.1826596729404505, + "grad_norm": 0.4678283929824829, + "learning_rate": 3e-05, + "loss": 2.7709, + "step": 592 + }, + { + "epoch": 0.18296821968528232, + "grad_norm": 0.44272318482398987, + "learning_rate": 3e-05, + "loss": 2.7197, + "step": 593 + }, + { + "epoch": 0.18327676643011417, + "grad_norm": 0.34605056047439575, + "learning_rate": 3e-05, + "loss": 2.6256, + "step": 594 + }, + { + "epoch": 0.183585313174946, + "grad_norm": 0.32907217741012573, + "learning_rate": 3e-05, + "loss": 2.183, + "step": 595 + }, + { + "epoch": 0.18389385991977786, + "grad_norm": 0.5004732608795166, + "learning_rate": 3e-05, + "loss": 2.4808, + "step": 596 + }, + { + "epoch": 0.18420240666460969, + "grad_norm": 0.4207105338573456, + "learning_rate": 3e-05, + "loss": 2.7112, + "step": 597 + }, + { + "epoch": 0.18451095340944154, + "grad_norm": 0.4427211284637451, + "learning_rate": 3e-05, + "loss": 2.7198, + "step": 598 + }, + { + "epoch": 0.18481950015427337, + "grad_norm": 0.3510686159133911, + "learning_rate": 3e-05, + "loss": 2.6319, + "step": 599 + }, + { + "epoch": 0.18512804689910523, + "grad_norm": 0.3347536623477936, + "learning_rate": 3e-05, + "loss": 2.479, + "step": 600 + }, + { + "epoch": 0.18543659364393705, + "grad_norm": 0.4486810266971588, + "learning_rate": 3e-05, + "loss": 2.7633, + "step": 601 + }, + { + "epoch": 0.1857451403887689, + "grad_norm": 0.7267583012580872, + "learning_rate": 3e-05, + "loss": 3.2443, + "step": 602 + }, + { + "epoch": 0.18605368713360074, + "grad_norm": 0.5485150814056396, + "learning_rate": 3e-05, + "loss": 3.0257, + "step": 603 + }, + { + "epoch": 0.1863622338784326, + "grad_norm": 0.6103752255439758, + "learning_rate": 3e-05, + "loss": 2.8336, + "step": 604 + }, + { + "epoch": 0.18667078062326442, + "grad_norm": 0.4755418002605438, + "learning_rate": 3e-05, + "loss": 2.7703, + "step": 605 + }, + { + "epoch": 0.18697932736809628, + "grad_norm": 0.523270308971405, + "learning_rate": 3e-05, + "loss": 2.792, + "step": 606 + }, + { + "epoch": 0.1872878741129281, + "grad_norm": 0.7174893021583557, + "learning_rate": 3e-05, + "loss": 3.1485, + "step": 607 + }, + { + "epoch": 0.18759642085775996, + "grad_norm": 0.4338546097278595, + "learning_rate": 3e-05, + "loss": 2.6676, + "step": 608 + }, + { + "epoch": 0.1879049676025918, + "grad_norm": 0.5212303996086121, + "learning_rate": 3e-05, + "loss": 2.5303, + "step": 609 + }, + { + "epoch": 0.18821351434742364, + "grad_norm": 0.4013713002204895, + "learning_rate": 3e-05, + "loss": 2.5814, + "step": 610 + }, + { + "epoch": 0.18852206109225547, + "grad_norm": 0.38565197587013245, + "learning_rate": 3e-05, + "loss": 2.5641, + "step": 611 + }, + { + "epoch": 0.18883060783708733, + "grad_norm": 0.5289196372032166, + "learning_rate": 3e-05, + "loss": 3.1017, + "step": 612 + }, + { + "epoch": 0.18913915458191916, + "grad_norm": 0.4261009097099304, + "learning_rate": 3e-05, + "loss": 2.7536, + "step": 613 + }, + { + "epoch": 0.189447701326751, + "grad_norm": 0.6289636492729187, + "learning_rate": 3e-05, + "loss": 2.5995, + "step": 614 + }, + { + "epoch": 0.18975624807158284, + "grad_norm": 0.3841557800769806, + "learning_rate": 3e-05, + "loss": 2.9889, + "step": 615 + }, + { + "epoch": 0.1900647948164147, + "grad_norm": 0.37914833426475525, + "learning_rate": 3e-05, + "loss": 2.4147, + "step": 616 + }, + { + "epoch": 0.19037334156124652, + "grad_norm": 0.5321159958839417, + "learning_rate": 3e-05, + "loss": 2.8679, + "step": 617 + }, + { + "epoch": 0.19068188830607838, + "grad_norm": 0.4842563271522522, + "learning_rate": 3e-05, + "loss": 2.7819, + "step": 618 + }, + { + "epoch": 0.1909904350509102, + "grad_norm": 0.35459670424461365, + "learning_rate": 3e-05, + "loss": 2.768, + "step": 619 + }, + { + "epoch": 0.19129898179574206, + "grad_norm": 0.6710373163223267, + "learning_rate": 3e-05, + "loss": 3.077, + "step": 620 + }, + { + "epoch": 0.1916075285405739, + "grad_norm": 0.4304696023464203, + "learning_rate": 3e-05, + "loss": 2.4043, + "step": 621 + }, + { + "epoch": 0.19191607528540575, + "grad_norm": 0.500830352306366, + "learning_rate": 3e-05, + "loss": 2.6387, + "step": 622 + }, + { + "epoch": 0.19222462203023757, + "grad_norm": 0.5076797008514404, + "learning_rate": 3e-05, + "loss": 2.3244, + "step": 623 + }, + { + "epoch": 0.19253316877506943, + "grad_norm": 0.30540212988853455, + "learning_rate": 3e-05, + "loss": 2.0853, + "step": 624 + }, + { + "epoch": 0.19284171551990126, + "grad_norm": 0.6184133887290955, + "learning_rate": 3e-05, + "loss": 2.6062, + "step": 625 + }, + { + "epoch": 0.1931502622647331, + "grad_norm": 0.4118046164512634, + "learning_rate": 3e-05, + "loss": 2.1961, + "step": 626 + }, + { + "epoch": 0.19345880900956494, + "grad_norm": 0.6084356904029846, + "learning_rate": 3e-05, + "loss": 2.8398, + "step": 627 + }, + { + "epoch": 0.1937673557543968, + "grad_norm": 0.44872429966926575, + "learning_rate": 3e-05, + "loss": 2.9109, + "step": 628 + }, + { + "epoch": 0.19407590249922863, + "grad_norm": 0.5909293293952942, + "learning_rate": 3e-05, + "loss": 2.4278, + "step": 629 + }, + { + "epoch": 0.19438444924406048, + "grad_norm": 0.5620918273925781, + "learning_rate": 3e-05, + "loss": 2.8395, + "step": 630 + }, + { + "epoch": 0.1946929959888923, + "grad_norm": 0.5947256088256836, + "learning_rate": 3e-05, + "loss": 3.1204, + "step": 631 + }, + { + "epoch": 0.19500154273372416, + "grad_norm": 0.42888760566711426, + "learning_rate": 3e-05, + "loss": 2.6668, + "step": 632 + }, + { + "epoch": 0.195310089478556, + "grad_norm": 0.8526038527488708, + "learning_rate": 3e-05, + "loss": 2.9337, + "step": 633 + }, + { + "epoch": 0.19561863622338785, + "grad_norm": 0.5663727521896362, + "learning_rate": 3e-05, + "loss": 3.103, + "step": 634 + }, + { + "epoch": 0.19592718296821968, + "grad_norm": 0.5431751012802124, + "learning_rate": 3e-05, + "loss": 2.6397, + "step": 635 + }, + { + "epoch": 0.19623572971305153, + "grad_norm": 0.5891326069831848, + "learning_rate": 3e-05, + "loss": 3.2095, + "step": 636 + }, + { + "epoch": 0.19654427645788336, + "grad_norm": 0.5660468339920044, + "learning_rate": 3e-05, + "loss": 2.9853, + "step": 637 + }, + { + "epoch": 0.19685282320271522, + "grad_norm": 0.3460230529308319, + "learning_rate": 3e-05, + "loss": 2.4755, + "step": 638 + }, + { + "epoch": 0.19716136994754704, + "grad_norm": 0.30621814727783203, + "learning_rate": 3e-05, + "loss": 2.5505, + "step": 639 + }, + { + "epoch": 0.1974699166923789, + "grad_norm": 0.4897528290748596, + "learning_rate": 3e-05, + "loss": 2.8266, + "step": 640 + }, + { + "epoch": 0.19777846343721073, + "grad_norm": 0.7390705347061157, + "learning_rate": 3e-05, + "loss": 3.0495, + "step": 641 + }, + { + "epoch": 0.19808701018204258, + "grad_norm": 0.403690904378891, + "learning_rate": 3e-05, + "loss": 2.6824, + "step": 642 + }, + { + "epoch": 0.1983955569268744, + "grad_norm": 0.6214193105697632, + "learning_rate": 3e-05, + "loss": 2.9677, + "step": 643 + }, + { + "epoch": 0.19870410367170627, + "grad_norm": 0.3992424011230469, + "learning_rate": 3e-05, + "loss": 2.7133, + "step": 644 + }, + { + "epoch": 0.1990126504165381, + "grad_norm": 0.5738241672515869, + "learning_rate": 3e-05, + "loss": 2.7717, + "step": 645 + }, + { + "epoch": 0.19932119716136995, + "grad_norm": 0.5687406659126282, + "learning_rate": 3e-05, + "loss": 3.1489, + "step": 646 + }, + { + "epoch": 0.19962974390620178, + "grad_norm": 0.44407641887664795, + "learning_rate": 3e-05, + "loss": 3.0794, + "step": 647 + }, + { + "epoch": 0.19993829065103363, + "grad_norm": 0.5712667107582092, + "learning_rate": 3e-05, + "loss": 2.7482, + "step": 648 + }, + { + "epoch": 0.20024683739586546, + "grad_norm": 0.3764342963695526, + "learning_rate": 3e-05, + "loss": 2.6002, + "step": 649 + }, + { + "epoch": 0.20055538414069732, + "grad_norm": 0.3133735954761505, + "learning_rate": 3e-05, + "loss": 2.3715, + "step": 650 + }, + { + "epoch": 0.20086393088552915, + "grad_norm": 0.5320349931716919, + "learning_rate": 3e-05, + "loss": 3.0982, + "step": 651 + }, + { + "epoch": 0.201172477630361, + "grad_norm": 0.498043030500412, + "learning_rate": 3e-05, + "loss": 3.0611, + "step": 652 + }, + { + "epoch": 0.20148102437519283, + "grad_norm": 0.611578106880188, + "learning_rate": 3e-05, + "loss": 3.2704, + "step": 653 + }, + { + "epoch": 0.20178957112002469, + "grad_norm": 0.43669798970222473, + "learning_rate": 3e-05, + "loss": 2.5288, + "step": 654 + }, + { + "epoch": 0.2020981178648565, + "grad_norm": 0.4220907688140869, + "learning_rate": 3e-05, + "loss": 2.8483, + "step": 655 + }, + { + "epoch": 0.20240666460968837, + "grad_norm": 0.46355968713760376, + "learning_rate": 3e-05, + "loss": 3.1131, + "step": 656 + }, + { + "epoch": 0.2027152113545202, + "grad_norm": 0.3843071758747101, + "learning_rate": 3e-05, + "loss": 2.6449, + "step": 657 + }, + { + "epoch": 0.20302375809935205, + "grad_norm": 0.3454858660697937, + "learning_rate": 3e-05, + "loss": 2.4922, + "step": 658 + }, + { + "epoch": 0.20333230484418388, + "grad_norm": 0.31287965178489685, + "learning_rate": 3e-05, + "loss": 2.5445, + "step": 659 + }, + { + "epoch": 0.20364085158901574, + "grad_norm": 0.49061527848243713, + "learning_rate": 3e-05, + "loss": 2.4979, + "step": 660 + }, + { + "epoch": 0.20394939833384756, + "grad_norm": 0.5406720638275146, + "learning_rate": 3e-05, + "loss": 2.7902, + "step": 661 + }, + { + "epoch": 0.20425794507867942, + "grad_norm": 0.5429649353027344, + "learning_rate": 3e-05, + "loss": 2.8121, + "step": 662 + }, + { + "epoch": 0.20456649182351125, + "grad_norm": 0.47672000527381897, + "learning_rate": 3e-05, + "loss": 3.1438, + "step": 663 + }, + { + "epoch": 0.2048750385683431, + "grad_norm": 0.3863488435745239, + "learning_rate": 3e-05, + "loss": 2.4956, + "step": 664 + }, + { + "epoch": 0.20518358531317496, + "grad_norm": 0.5311136245727539, + "learning_rate": 3e-05, + "loss": 2.404, + "step": 665 + }, + { + "epoch": 0.2054921320580068, + "grad_norm": 0.30825114250183105, + "learning_rate": 3e-05, + "loss": 2.2225, + "step": 666 + }, + { + "epoch": 0.20580067880283864, + "grad_norm": 0.4122128188610077, + "learning_rate": 3e-05, + "loss": 2.4823, + "step": 667 + }, + { + "epoch": 0.20610922554767047, + "grad_norm": 0.45997029542922974, + "learning_rate": 3e-05, + "loss": 2.456, + "step": 668 + }, + { + "epoch": 0.20641777229250233, + "grad_norm": 0.38074791431427, + "learning_rate": 3e-05, + "loss": 2.4485, + "step": 669 + }, + { + "epoch": 0.20672631903733416, + "grad_norm": 0.7642366886138916, + "learning_rate": 3e-05, + "loss": 3.2876, + "step": 670 + }, + { + "epoch": 0.207034865782166, + "grad_norm": 0.6110647320747375, + "learning_rate": 3e-05, + "loss": 2.901, + "step": 671 + }, + { + "epoch": 0.20734341252699784, + "grad_norm": 0.36458998918533325, + "learning_rate": 3e-05, + "loss": 2.2886, + "step": 672 + }, + { + "epoch": 0.2076519592718297, + "grad_norm": 0.49400243163108826, + "learning_rate": 3e-05, + "loss": 2.7685, + "step": 673 + }, + { + "epoch": 0.20796050601666152, + "grad_norm": 0.4072665274143219, + "learning_rate": 3e-05, + "loss": 2.6262, + "step": 674 + }, + { + "epoch": 0.20826905276149338, + "grad_norm": 0.3744329512119293, + "learning_rate": 3e-05, + "loss": 2.6089, + "step": 675 + }, + { + "epoch": 0.2085775995063252, + "grad_norm": 0.4581242501735687, + "learning_rate": 3e-05, + "loss": 2.6608, + "step": 676 + }, + { + "epoch": 0.20888614625115706, + "grad_norm": 0.46983492374420166, + "learning_rate": 3e-05, + "loss": 2.4597, + "step": 677 + }, + { + "epoch": 0.2091946929959889, + "grad_norm": 0.5508672595024109, + "learning_rate": 3e-05, + "loss": 2.8413, + "step": 678 + }, + { + "epoch": 0.20950323974082075, + "grad_norm": 0.3159502148628235, + "learning_rate": 3e-05, + "loss": 2.3252, + "step": 679 + }, + { + "epoch": 0.20981178648565257, + "grad_norm": 0.5377674102783203, + "learning_rate": 3e-05, + "loss": 2.2233, + "step": 680 + }, + { + "epoch": 0.21012033323048443, + "grad_norm": 0.40905702114105225, + "learning_rate": 3e-05, + "loss": 2.1659, + "step": 681 + }, + { + "epoch": 0.21042887997531626, + "grad_norm": 0.3416579067707062, + "learning_rate": 3e-05, + "loss": 2.0977, + "step": 682 + }, + { + "epoch": 0.2107374267201481, + "grad_norm": 0.5408785939216614, + "learning_rate": 3e-05, + "loss": 3.0337, + "step": 683 + }, + { + "epoch": 0.21104597346497994, + "grad_norm": 0.6323843002319336, + "learning_rate": 3e-05, + "loss": 2.9733, + "step": 684 + }, + { + "epoch": 0.2113545202098118, + "grad_norm": 0.8620406985282898, + "learning_rate": 3e-05, + "loss": 3.3109, + "step": 685 + }, + { + "epoch": 0.21166306695464362, + "grad_norm": 0.38318270444869995, + "learning_rate": 3e-05, + "loss": 2.786, + "step": 686 + }, + { + "epoch": 0.21197161369947548, + "grad_norm": 0.44390928745269775, + "learning_rate": 3e-05, + "loss": 2.9674, + "step": 687 + }, + { + "epoch": 0.2122801604443073, + "grad_norm": 0.6823718547821045, + "learning_rate": 3e-05, + "loss": 3.0883, + "step": 688 + }, + { + "epoch": 0.21258870718913916, + "grad_norm": 0.451790988445282, + "learning_rate": 3e-05, + "loss": 2.5927, + "step": 689 + }, + { + "epoch": 0.212897253933971, + "grad_norm": 0.4029495120048523, + "learning_rate": 3e-05, + "loss": 2.6611, + "step": 690 + }, + { + "epoch": 0.21320580067880285, + "grad_norm": 0.4392080307006836, + "learning_rate": 3e-05, + "loss": 2.7574, + "step": 691 + }, + { + "epoch": 0.21351434742363468, + "grad_norm": 0.6579667925834656, + "learning_rate": 3e-05, + "loss": 3.1202, + "step": 692 + }, + { + "epoch": 0.21382289416846653, + "grad_norm": 0.5804861783981323, + "learning_rate": 3e-05, + "loss": 3.3316, + "step": 693 + }, + { + "epoch": 0.21413144091329836, + "grad_norm": 0.4412527084350586, + "learning_rate": 3e-05, + "loss": 3.7257, + "step": 694 + }, + { + "epoch": 0.21443998765813022, + "grad_norm": 0.6037363409996033, + "learning_rate": 3e-05, + "loss": 2.5479, + "step": 695 + }, + { + "epoch": 0.21474853440296204, + "grad_norm": 0.6885823011398315, + "learning_rate": 3e-05, + "loss": 3.063, + "step": 696 + }, + { + "epoch": 0.2150570811477939, + "grad_norm": 0.5372409224510193, + "learning_rate": 3e-05, + "loss": 2.7169, + "step": 697 + }, + { + "epoch": 0.21536562789262573, + "grad_norm": 0.5530219674110413, + "learning_rate": 3e-05, + "loss": 3.0568, + "step": 698 + }, + { + "epoch": 0.21567417463745758, + "grad_norm": 0.7457128167152405, + "learning_rate": 3e-05, + "loss": 2.5894, + "step": 699 + }, + { + "epoch": 0.2159827213822894, + "grad_norm": 0.6711127161979675, + "learning_rate": 3e-05, + "loss": 2.7789, + "step": 700 + }, + { + "epoch": 0.21629126812712127, + "grad_norm": 0.6075369119644165, + "learning_rate": 3e-05, + "loss": 2.9943, + "step": 701 + }, + { + "epoch": 0.2165998148719531, + "grad_norm": 0.5779369473457336, + "learning_rate": 3e-05, + "loss": 2.5556, + "step": 702 + }, + { + "epoch": 0.21690836161678495, + "grad_norm": 0.9133353233337402, + "learning_rate": 3e-05, + "loss": 2.6227, + "step": 703 + }, + { + "epoch": 0.21721690836161678, + "grad_norm": 0.8465149402618408, + "learning_rate": 3e-05, + "loss": 3.1094, + "step": 704 + }, + { + "epoch": 0.21752545510644863, + "grad_norm": 0.30936139822006226, + "learning_rate": 3e-05, + "loss": 2.7395, + "step": 705 + }, + { + "epoch": 0.21783400185128046, + "grad_norm": 0.7300909757614136, + "learning_rate": 3e-05, + "loss": 2.9833, + "step": 706 + }, + { + "epoch": 0.21814254859611232, + "grad_norm": 0.7368529438972473, + "learning_rate": 3e-05, + "loss": 2.7301, + "step": 707 + }, + { + "epoch": 0.21845109534094415, + "grad_norm": 0.4861367642879486, + "learning_rate": 3e-05, + "loss": 2.5439, + "step": 708 + }, + { + "epoch": 0.218759642085776, + "grad_norm": 0.4448826313018799, + "learning_rate": 3e-05, + "loss": 2.7462, + "step": 709 + }, + { + "epoch": 0.21906818883060783, + "grad_norm": 0.43111348152160645, + "learning_rate": 3e-05, + "loss": 2.2179, + "step": 710 + }, + { + "epoch": 0.21937673557543969, + "grad_norm": 0.6709856390953064, + "learning_rate": 3e-05, + "loss": 2.3648, + "step": 711 + }, + { + "epoch": 0.2196852823202715, + "grad_norm": 0.5811859965324402, + "learning_rate": 3e-05, + "loss": 2.2098, + "step": 712 + }, + { + "epoch": 0.21999382906510337, + "grad_norm": 0.44703492522239685, + "learning_rate": 3e-05, + "loss": 2.4668, + "step": 713 + }, + { + "epoch": 0.2203023758099352, + "grad_norm": 0.32512566447257996, + "learning_rate": 3e-05, + "loss": 2.5741, + "step": 714 + }, + { + "epoch": 0.22061092255476705, + "grad_norm": 0.4460441768169403, + "learning_rate": 3e-05, + "loss": 2.5822, + "step": 715 + }, + { + "epoch": 0.22091946929959888, + "grad_norm": 0.46903368830680847, + "learning_rate": 3e-05, + "loss": 2.3405, + "step": 716 + }, + { + "epoch": 0.22122801604443074, + "grad_norm": 0.4886974096298218, + "learning_rate": 3e-05, + "loss": 2.3086, + "step": 717 + }, + { + "epoch": 0.22153656278926256, + "grad_norm": 0.7623785734176636, + "learning_rate": 3e-05, + "loss": 3.0441, + "step": 718 + }, + { + "epoch": 0.22184510953409442, + "grad_norm": 0.39860019087791443, + "learning_rate": 3e-05, + "loss": 2.7577, + "step": 719 + }, + { + "epoch": 0.22215365627892625, + "grad_norm": 0.5369051098823547, + "learning_rate": 3e-05, + "loss": 2.4445, + "step": 720 + }, + { + "epoch": 0.2224622030237581, + "grad_norm": 0.6205699443817139, + "learning_rate": 3e-05, + "loss": 3.322, + "step": 721 + }, + { + "epoch": 0.22277074976858993, + "grad_norm": 0.4737318456172943, + "learning_rate": 3e-05, + "loss": 2.5368, + "step": 722 + }, + { + "epoch": 0.2230792965134218, + "grad_norm": 0.4158051311969757, + "learning_rate": 3e-05, + "loss": 2.3423, + "step": 723 + }, + { + "epoch": 0.22338784325825362, + "grad_norm": 0.44715094566345215, + "learning_rate": 3e-05, + "loss": 2.9117, + "step": 724 + }, + { + "epoch": 0.22369639000308547, + "grad_norm": 0.40355610847473145, + "learning_rate": 3e-05, + "loss": 2.8261, + "step": 725 + }, + { + "epoch": 0.2240049367479173, + "grad_norm": 0.40845543146133423, + "learning_rate": 3e-05, + "loss": 2.9491, + "step": 726 + }, + { + "epoch": 0.22431348349274915, + "grad_norm": 0.40344637632369995, + "learning_rate": 3e-05, + "loss": 2.6911, + "step": 727 + }, + { + "epoch": 0.22462203023758098, + "grad_norm": 0.44544005393981934, + "learning_rate": 3e-05, + "loss": 2.5146, + "step": 728 + }, + { + "epoch": 0.22493057698241284, + "grad_norm": 0.33173346519470215, + "learning_rate": 3e-05, + "loss": 2.3432, + "step": 729 + }, + { + "epoch": 0.22523912372724467, + "grad_norm": 0.3437408208847046, + "learning_rate": 3e-05, + "loss": 2.642, + "step": 730 + }, + { + "epoch": 0.22554767047207652, + "grad_norm": 0.3419845700263977, + "learning_rate": 3e-05, + "loss": 2.7049, + "step": 731 + }, + { + "epoch": 0.22585621721690835, + "grad_norm": 0.38495057821273804, + "learning_rate": 3e-05, + "loss": 2.7977, + "step": 732 + }, + { + "epoch": 0.2261647639617402, + "grad_norm": 0.46839049458503723, + "learning_rate": 3e-05, + "loss": 2.3997, + "step": 733 + }, + { + "epoch": 0.22647331070657203, + "grad_norm": 0.5214048624038696, + "learning_rate": 3e-05, + "loss": 3.0801, + "step": 734 + }, + { + "epoch": 0.2267818574514039, + "grad_norm": 0.32730308175086975, + "learning_rate": 3e-05, + "loss": 2.6502, + "step": 735 + }, + { + "epoch": 0.22709040419623572, + "grad_norm": 0.459028035402298, + "learning_rate": 3e-05, + "loss": 2.8593, + "step": 736 + }, + { + "epoch": 0.22739895094106757, + "grad_norm": 0.5912216901779175, + "learning_rate": 3e-05, + "loss": 3.0517, + "step": 737 + }, + { + "epoch": 0.2277074976858994, + "grad_norm": 0.5747165083885193, + "learning_rate": 3e-05, + "loss": 3.1073, + "step": 738 + }, + { + "epoch": 0.22801604443073126, + "grad_norm": 0.39477410912513733, + "learning_rate": 3e-05, + "loss": 2.531, + "step": 739 + }, + { + "epoch": 0.22832459117556309, + "grad_norm": 0.47743627429008484, + "learning_rate": 3e-05, + "loss": 2.7878, + "step": 740 + }, + { + "epoch": 0.22863313792039494, + "grad_norm": 0.6934757232666016, + "learning_rate": 3e-05, + "loss": 2.844, + "step": 741 + }, + { + "epoch": 0.22894168466522677, + "grad_norm": 0.39980217814445496, + "learning_rate": 3e-05, + "loss": 2.7462, + "step": 742 + }, + { + "epoch": 0.22925023141005862, + "grad_norm": 0.393174409866333, + "learning_rate": 3e-05, + "loss": 2.6384, + "step": 743 + }, + { + "epoch": 0.22955877815489045, + "grad_norm": 0.5676349997520447, + "learning_rate": 3e-05, + "loss": 3.0358, + "step": 744 + }, + { + "epoch": 0.2298673248997223, + "grad_norm": 0.8145315647125244, + "learning_rate": 3e-05, + "loss": 3.4264, + "step": 745 + }, + { + "epoch": 0.23017587164455414, + "grad_norm": 0.3674360513687134, + "learning_rate": 3e-05, + "loss": 2.6974, + "step": 746 + }, + { + "epoch": 0.230484418389386, + "grad_norm": 0.39227521419525146, + "learning_rate": 3e-05, + "loss": 2.466, + "step": 747 + }, + { + "epoch": 0.23079296513421785, + "grad_norm": 0.6145668625831604, + "learning_rate": 3e-05, + "loss": 2.9016, + "step": 748 + }, + { + "epoch": 0.23110151187904968, + "grad_norm": 0.3767462968826294, + "learning_rate": 3e-05, + "loss": 2.8179, + "step": 749 + }, + { + "epoch": 0.23141005862388153, + "grad_norm": 0.358978807926178, + "learning_rate": 3e-05, + "loss": 2.4825, + "step": 750 + }, + { + "epoch": 0.23171860536871336, + "grad_norm": 0.4695563018321991, + "learning_rate": 3e-05, + "loss": 3.0992, + "step": 751 + }, + { + "epoch": 0.23202715211354522, + "grad_norm": 0.4999081790447235, + "learning_rate": 3e-05, + "loss": 2.8782, + "step": 752 + }, + { + "epoch": 0.23233569885837704, + "grad_norm": 0.4756012558937073, + "learning_rate": 3e-05, + "loss": 3.299, + "step": 753 + }, + { + "epoch": 0.2326442456032089, + "grad_norm": 0.42113542556762695, + "learning_rate": 3e-05, + "loss": 2.9299, + "step": 754 + }, + { + "epoch": 0.23295279234804073, + "grad_norm": 0.4862482249736786, + "learning_rate": 3e-05, + "loss": 2.6376, + "step": 755 + }, + { + "epoch": 0.23326133909287258, + "grad_norm": 0.44577065110206604, + "learning_rate": 3e-05, + "loss": 2.8854, + "step": 756 + }, + { + "epoch": 0.2335698858377044, + "grad_norm": 0.36711904406547546, + "learning_rate": 3e-05, + "loss": 2.7238, + "step": 757 + }, + { + "epoch": 0.23387843258253627, + "grad_norm": 0.45583394169807434, + "learning_rate": 3e-05, + "loss": 2.9171, + "step": 758 + }, + { + "epoch": 0.2341869793273681, + "grad_norm": 0.4282878041267395, + "learning_rate": 3e-05, + "loss": 2.753, + "step": 759 + }, + { + "epoch": 0.23449552607219995, + "grad_norm": 0.3927743136882782, + "learning_rate": 3e-05, + "loss": 2.8325, + "step": 760 + }, + { + "epoch": 0.23480407281703178, + "grad_norm": 0.4064248204231262, + "learning_rate": 3e-05, + "loss": 2.3949, + "step": 761 + }, + { + "epoch": 0.23511261956186363, + "grad_norm": 0.5085972547531128, + "learning_rate": 3e-05, + "loss": 3.3134, + "step": 762 + }, + { + "epoch": 0.23542116630669546, + "grad_norm": 0.43735840916633606, + "learning_rate": 3e-05, + "loss": 2.8876, + "step": 763 + }, + { + "epoch": 0.23572971305152732, + "grad_norm": 0.5270010232925415, + "learning_rate": 3e-05, + "loss": 3.1755, + "step": 764 + }, + { + "epoch": 0.23603825979635915, + "grad_norm": 0.4824548065662384, + "learning_rate": 3e-05, + "loss": 2.8461, + "step": 765 + }, + { + "epoch": 0.236346806541191, + "grad_norm": 0.5891832113265991, + "learning_rate": 3e-05, + "loss": 2.864, + "step": 766 + }, + { + "epoch": 0.23665535328602283, + "grad_norm": 0.3647357225418091, + "learning_rate": 3e-05, + "loss": 3.0185, + "step": 767 + }, + { + "epoch": 0.23696390003085469, + "grad_norm": 0.3849000930786133, + "learning_rate": 3e-05, + "loss": 2.5547, + "step": 768 + }, + { + "epoch": 0.2372724467756865, + "grad_norm": 0.3377460241317749, + "learning_rate": 3e-05, + "loss": 2.3713, + "step": 769 + }, + { + "epoch": 0.23758099352051837, + "grad_norm": 0.4434642493724823, + "learning_rate": 3e-05, + "loss": 2.5315, + "step": 770 + }, + { + "epoch": 0.2378895402653502, + "grad_norm": 0.3554992079734802, + "learning_rate": 3e-05, + "loss": 2.2536, + "step": 771 + }, + { + "epoch": 0.23819808701018205, + "grad_norm": 0.49346596002578735, + "learning_rate": 3e-05, + "loss": 2.6857, + "step": 772 + }, + { + "epoch": 0.23850663375501388, + "grad_norm": 0.5243836641311646, + "learning_rate": 3e-05, + "loss": 2.9841, + "step": 773 + }, + { + "epoch": 0.23881518049984574, + "grad_norm": 0.47306209802627563, + "learning_rate": 3e-05, + "loss": 2.5651, + "step": 774 + }, + { + "epoch": 0.23912372724467756, + "grad_norm": 0.4085429310798645, + "learning_rate": 3e-05, + "loss": 2.7709, + "step": 775 + }, + { + "epoch": 0.23943227398950942, + "grad_norm": 0.5058587789535522, + "learning_rate": 3e-05, + "loss": 2.8431, + "step": 776 + }, + { + "epoch": 0.23974082073434125, + "grad_norm": 0.423749178647995, + "learning_rate": 3e-05, + "loss": 3.1645, + "step": 777 + }, + { + "epoch": 0.2400493674791731, + "grad_norm": 0.44455215334892273, + "learning_rate": 3e-05, + "loss": 2.4221, + "step": 778 + }, + { + "epoch": 0.24035791422400493, + "grad_norm": 0.4022437334060669, + "learning_rate": 3e-05, + "loss": 2.7517, + "step": 779 + }, + { + "epoch": 0.2406664609688368, + "grad_norm": 0.3913156986236572, + "learning_rate": 3e-05, + "loss": 2.8265, + "step": 780 + }, + { + "epoch": 0.24097500771366862, + "grad_norm": 0.5418980717658997, + "learning_rate": 3e-05, + "loss": 3.1478, + "step": 781 + }, + { + "epoch": 0.24128355445850047, + "grad_norm": 0.3950507938861847, + "learning_rate": 3e-05, + "loss": 2.5474, + "step": 782 + }, + { + "epoch": 0.2415921012033323, + "grad_norm": 0.43344080448150635, + "learning_rate": 3e-05, + "loss": 2.5813, + "step": 783 + }, + { + "epoch": 0.24190064794816415, + "grad_norm": 0.32457104325294495, + "learning_rate": 3e-05, + "loss": 2.6936, + "step": 784 + }, + { + "epoch": 0.24220919469299598, + "grad_norm": 0.29046568274497986, + "learning_rate": 3e-05, + "loss": 2.2027, + "step": 785 + }, + { + "epoch": 0.24251774143782784, + "grad_norm": 0.47084319591522217, + "learning_rate": 3e-05, + "loss": 2.7889, + "step": 786 + }, + { + "epoch": 0.24282628818265967, + "grad_norm": 0.4512122571468353, + "learning_rate": 3e-05, + "loss": 2.8724, + "step": 787 + }, + { + "epoch": 0.24313483492749152, + "grad_norm": 0.3429430425167084, + "learning_rate": 3e-05, + "loss": 2.5882, + "step": 788 + }, + { + "epoch": 0.24344338167232335, + "grad_norm": 0.2793130576610565, + "learning_rate": 3e-05, + "loss": 2.3702, + "step": 789 + }, + { + "epoch": 0.2437519284171552, + "grad_norm": 0.6752581000328064, + "learning_rate": 3e-05, + "loss": 2.9438, + "step": 790 + }, + { + "epoch": 0.24406047516198703, + "grad_norm": 0.3320951461791992, + "learning_rate": 3e-05, + "loss": 2.5836, + "step": 791 + }, + { + "epoch": 0.2443690219068189, + "grad_norm": 0.443316787481308, + "learning_rate": 3e-05, + "loss": 2.8967, + "step": 792 + }, + { + "epoch": 0.24467756865165072, + "grad_norm": 0.44384002685546875, + "learning_rate": 3e-05, + "loss": 2.1344, + "step": 793 + }, + { + "epoch": 0.24498611539648257, + "grad_norm": 0.5478724241256714, + "learning_rate": 3e-05, + "loss": 2.8676, + "step": 794 + }, + { + "epoch": 0.2452946621413144, + "grad_norm": 0.4789648950099945, + "learning_rate": 3e-05, + "loss": 2.954, + "step": 795 + }, + { + "epoch": 0.24560320888614626, + "grad_norm": 0.30075979232788086, + "learning_rate": 3e-05, + "loss": 2.2147, + "step": 796 + }, + { + "epoch": 0.24591175563097808, + "grad_norm": 0.8388468027114868, + "learning_rate": 3e-05, + "loss": 3.0801, + "step": 797 + }, + { + "epoch": 0.24622030237580994, + "grad_norm": 0.46896687150001526, + "learning_rate": 3e-05, + "loss": 3.0219, + "step": 798 + }, + { + "epoch": 0.24652884912064177, + "grad_norm": 0.37118545174598694, + "learning_rate": 3e-05, + "loss": 2.7809, + "step": 799 + }, + { + "epoch": 0.24683739586547362, + "grad_norm": 0.3290826380252838, + "learning_rate": 3e-05, + "loss": 2.4913, + "step": 800 + }, + { + "epoch": 0.24714594261030545, + "grad_norm": 0.4458785951137543, + "learning_rate": 3e-05, + "loss": 2.8324, + "step": 801 + }, + { + "epoch": 0.2474544893551373, + "grad_norm": 0.39164820313453674, + "learning_rate": 3e-05, + "loss": 2.8643, + "step": 802 + }, + { + "epoch": 0.24776303609996914, + "grad_norm": 0.37570327520370483, + "learning_rate": 3e-05, + "loss": 2.6302, + "step": 803 + }, + { + "epoch": 0.248071582844801, + "grad_norm": 0.36291664838790894, + "learning_rate": 3e-05, + "loss": 2.3188, + "step": 804 + }, + { + "epoch": 0.24838012958963282, + "grad_norm": 0.7497803568840027, + "learning_rate": 3e-05, + "loss": 3.1793, + "step": 805 + }, + { + "epoch": 0.24868867633446468, + "grad_norm": 0.3503245413303375, + "learning_rate": 3e-05, + "loss": 2.6478, + "step": 806 + }, + { + "epoch": 0.2489972230792965, + "grad_norm": 0.4229944944381714, + "learning_rate": 3e-05, + "loss": 3.166, + "step": 807 + }, + { + "epoch": 0.24930576982412836, + "grad_norm": 0.39964744448661804, + "learning_rate": 3e-05, + "loss": 2.5633, + "step": 808 + }, + { + "epoch": 0.2496143165689602, + "grad_norm": 0.4407540559768677, + "learning_rate": 3e-05, + "loss": 2.8108, + "step": 809 + }, + { + "epoch": 0.24992286331379204, + "grad_norm": 0.5962763428688049, + "learning_rate": 3e-05, + "loss": 2.7762, + "step": 810 + }, + { + "epoch": 0.25023141005862387, + "grad_norm": 0.35750457644462585, + "learning_rate": 3e-05, + "loss": 2.5792, + "step": 811 + }, + { + "epoch": 0.2505399568034557, + "grad_norm": 0.6067323088645935, + "learning_rate": 3e-05, + "loss": 2.9188, + "step": 812 + }, + { + "epoch": 0.2508485035482876, + "grad_norm": 0.5473812818527222, + "learning_rate": 3e-05, + "loss": 2.9941, + "step": 813 + }, + { + "epoch": 0.2511570502931194, + "grad_norm": 0.4247765839099884, + "learning_rate": 3e-05, + "loss": 2.9066, + "step": 814 + }, + { + "epoch": 0.25146559703795124, + "grad_norm": 0.36779630184173584, + "learning_rate": 3e-05, + "loss": 2.1791, + "step": 815 + }, + { + "epoch": 0.25177414378278307, + "grad_norm": 0.4601776599884033, + "learning_rate": 3e-05, + "loss": 2.4513, + "step": 816 + }, + { + "epoch": 0.25208269052761495, + "grad_norm": 0.4455990493297577, + "learning_rate": 3e-05, + "loss": 3.0133, + "step": 817 + }, + { + "epoch": 0.2523912372724468, + "grad_norm": 0.469711035490036, + "learning_rate": 3e-05, + "loss": 2.7601, + "step": 818 + }, + { + "epoch": 0.2526997840172786, + "grad_norm": 0.35248449444770813, + "learning_rate": 3e-05, + "loss": 2.4141, + "step": 819 + }, + { + "epoch": 0.25300833076211043, + "grad_norm": 0.5230732560157776, + "learning_rate": 3e-05, + "loss": 2.7922, + "step": 820 + }, + { + "epoch": 0.2533168775069423, + "grad_norm": 0.4009235203266144, + "learning_rate": 3e-05, + "loss": 2.8466, + "step": 821 + }, + { + "epoch": 0.25362542425177415, + "grad_norm": 0.4277903139591217, + "learning_rate": 3e-05, + "loss": 2.6193, + "step": 822 + }, + { + "epoch": 0.253933970996606, + "grad_norm": 0.35308367013931274, + "learning_rate": 3e-05, + "loss": 2.7359, + "step": 823 + }, + { + "epoch": 0.2542425177414378, + "grad_norm": 0.5506107807159424, + "learning_rate": 3e-05, + "loss": 3.3045, + "step": 824 + }, + { + "epoch": 0.2545510644862697, + "grad_norm": 0.4381810128688812, + "learning_rate": 3e-05, + "loss": 2.5703, + "step": 825 + }, + { + "epoch": 0.2548596112311015, + "grad_norm": 0.6535665392875671, + "learning_rate": 3e-05, + "loss": 2.9752, + "step": 826 + }, + { + "epoch": 0.25516815797593334, + "grad_norm": 0.31507301330566406, + "learning_rate": 3e-05, + "loss": 2.4288, + "step": 827 + }, + { + "epoch": 0.25547670472076517, + "grad_norm": 0.628995418548584, + "learning_rate": 3e-05, + "loss": 2.9277, + "step": 828 + }, + { + "epoch": 0.25578525146559705, + "grad_norm": 0.3475759029388428, + "learning_rate": 3e-05, + "loss": 2.6116, + "step": 829 + }, + { + "epoch": 0.2560937982104289, + "grad_norm": 0.35610201954841614, + "learning_rate": 3e-05, + "loss": 2.5474, + "step": 830 + }, + { + "epoch": 0.2564023449552607, + "grad_norm": 0.3411518335342407, + "learning_rate": 3e-05, + "loss": 2.6045, + "step": 831 + }, + { + "epoch": 0.2567108917000926, + "grad_norm": 0.36465325951576233, + "learning_rate": 3e-05, + "loss": 2.3901, + "step": 832 + }, + { + "epoch": 0.2570194384449244, + "grad_norm": 0.3037632703781128, + "learning_rate": 3e-05, + "loss": 2.3478, + "step": 833 + }, + { + "epoch": 0.25732798518975625, + "grad_norm": 0.44962364435195923, + "learning_rate": 3e-05, + "loss": 2.8154, + "step": 834 + }, + { + "epoch": 0.2576365319345881, + "grad_norm": 0.3245837688446045, + "learning_rate": 3e-05, + "loss": 2.4922, + "step": 835 + }, + { + "epoch": 0.25794507867941996, + "grad_norm": 0.37248003482818604, + "learning_rate": 3e-05, + "loss": 2.6044, + "step": 836 + }, + { + "epoch": 0.2582536254242518, + "grad_norm": 0.3687119483947754, + "learning_rate": 3e-05, + "loss": 2.7271, + "step": 837 + }, + { + "epoch": 0.2585621721690836, + "grad_norm": 0.3978298604488373, + "learning_rate": 3e-05, + "loss": 2.9363, + "step": 838 + }, + { + "epoch": 0.25887071891391544, + "grad_norm": 0.43831732869148254, + "learning_rate": 3e-05, + "loss": 2.633, + "step": 839 + }, + { + "epoch": 0.2591792656587473, + "grad_norm": 0.39611366391181946, + "learning_rate": 3e-05, + "loss": 2.9044, + "step": 840 + }, + { + "epoch": 0.25948781240357915, + "grad_norm": 0.37481510639190674, + "learning_rate": 3e-05, + "loss": 2.7067, + "step": 841 + }, + { + "epoch": 0.259796359148411, + "grad_norm": 0.33353596925735474, + "learning_rate": 3e-05, + "loss": 2.4132, + "step": 842 + }, + { + "epoch": 0.2601049058932428, + "grad_norm": 0.42422613501548767, + "learning_rate": 3e-05, + "loss": 2.4306, + "step": 843 + }, + { + "epoch": 0.2604134526380747, + "grad_norm": 0.7475231289863586, + "learning_rate": 3e-05, + "loss": 3.2851, + "step": 844 + }, + { + "epoch": 0.2607219993829065, + "grad_norm": 0.8938561081886292, + "learning_rate": 3e-05, + "loss": 3.2137, + "step": 845 + }, + { + "epoch": 0.26103054612773835, + "grad_norm": 0.5334686636924744, + "learning_rate": 3e-05, + "loss": 2.3994, + "step": 846 + }, + { + "epoch": 0.2613390928725702, + "grad_norm": 0.6619825959205627, + "learning_rate": 3e-05, + "loss": 2.5175, + "step": 847 + }, + { + "epoch": 0.26164763961740206, + "grad_norm": 0.8206683993339539, + "learning_rate": 3e-05, + "loss": 2.6581, + "step": 848 + }, + { + "epoch": 0.2619561863622339, + "grad_norm": 0.4362182319164276, + "learning_rate": 3e-05, + "loss": 3.1203, + "step": 849 + }, + { + "epoch": 0.2622647331070657, + "grad_norm": 0.5157708525657654, + "learning_rate": 3e-05, + "loss": 2.8433, + "step": 850 + }, + { + "epoch": 0.26257327985189755, + "grad_norm": 0.5386294722557068, + "learning_rate": 3e-05, + "loss": 2.5983, + "step": 851 + }, + { + "epoch": 0.26288182659672943, + "grad_norm": 0.7352175712585449, + "learning_rate": 3e-05, + "loss": 2.8269, + "step": 852 + }, + { + "epoch": 0.26319037334156126, + "grad_norm": 0.4389644265174866, + "learning_rate": 3e-05, + "loss": 2.7614, + "step": 853 + }, + { + "epoch": 0.2634989200863931, + "grad_norm": 0.5747153162956238, + "learning_rate": 3e-05, + "loss": 2.8439, + "step": 854 + }, + { + "epoch": 0.2638074668312249, + "grad_norm": 0.442501962184906, + "learning_rate": 3e-05, + "loss": 2.5673, + "step": 855 + }, + { + "epoch": 0.2641160135760568, + "grad_norm": 0.5041112899780273, + "learning_rate": 3e-05, + "loss": 3.4168, + "step": 856 + }, + { + "epoch": 0.2644245603208886, + "grad_norm": 0.4333510994911194, + "learning_rate": 3e-05, + "loss": 2.5954, + "step": 857 + }, + { + "epoch": 0.26473310706572045, + "grad_norm": 0.2841757535934448, + "learning_rate": 3e-05, + "loss": 2.2267, + "step": 858 + }, + { + "epoch": 0.2650416538105523, + "grad_norm": 0.3240799903869629, + "learning_rate": 3e-05, + "loss": 2.4861, + "step": 859 + }, + { + "epoch": 0.26535020055538416, + "grad_norm": 0.8906437754631042, + "learning_rate": 3e-05, + "loss": 3.1556, + "step": 860 + }, + { + "epoch": 0.265658747300216, + "grad_norm": 0.503738284111023, + "learning_rate": 3e-05, + "loss": 2.8386, + "step": 861 + }, + { + "epoch": 0.2659672940450478, + "grad_norm": 0.3821960985660553, + "learning_rate": 3e-05, + "loss": 2.4592, + "step": 862 + }, + { + "epoch": 0.26627584078987965, + "grad_norm": 0.4790137708187103, + "learning_rate": 3e-05, + "loss": 2.9267, + "step": 863 + }, + { + "epoch": 0.26658438753471153, + "grad_norm": 0.4529775083065033, + "learning_rate": 3e-05, + "loss": 2.7667, + "step": 864 + }, + { + "epoch": 0.26689293427954336, + "grad_norm": 0.38088151812553406, + "learning_rate": 3e-05, + "loss": 2.8161, + "step": 865 + }, + { + "epoch": 0.2672014810243752, + "grad_norm": 0.38208597898483276, + "learning_rate": 3e-05, + "loss": 2.6606, + "step": 866 + }, + { + "epoch": 0.267510027769207, + "grad_norm": 0.3878786861896515, + "learning_rate": 3e-05, + "loss": 2.8936, + "step": 867 + }, + { + "epoch": 0.2678185745140389, + "grad_norm": 0.3223757743835449, + "learning_rate": 3e-05, + "loss": 2.6747, + "step": 868 + }, + { + "epoch": 0.2681271212588707, + "grad_norm": 0.380728155374527, + "learning_rate": 3e-05, + "loss": 2.9688, + "step": 869 + }, + { + "epoch": 0.26843566800370255, + "grad_norm": 0.3616756796836853, + "learning_rate": 3e-05, + "loss": 2.9107, + "step": 870 + }, + { + "epoch": 0.2687442147485344, + "grad_norm": 0.4382908344268799, + "learning_rate": 3e-05, + "loss": 2.7253, + "step": 871 + }, + { + "epoch": 0.26905276149336627, + "grad_norm": 0.6541967988014221, + "learning_rate": 3e-05, + "loss": 2.9171, + "step": 872 + }, + { + "epoch": 0.2693613082381981, + "grad_norm": 0.5041164755821228, + "learning_rate": 3e-05, + "loss": 2.697, + "step": 873 + }, + { + "epoch": 0.2696698549830299, + "grad_norm": 0.40384742617607117, + "learning_rate": 3e-05, + "loss": 2.832, + "step": 874 + }, + { + "epoch": 0.26997840172786175, + "grad_norm": 0.4746786952018738, + "learning_rate": 3e-05, + "loss": 3.0941, + "step": 875 + }, + { + "epoch": 0.27028694847269363, + "grad_norm": 0.49661922454833984, + "learning_rate": 3e-05, + "loss": 2.9883, + "step": 876 + }, + { + "epoch": 0.27059549521752546, + "grad_norm": 0.323425829410553, + "learning_rate": 3e-05, + "loss": 2.4742, + "step": 877 + }, + { + "epoch": 0.2709040419623573, + "grad_norm": 0.383983314037323, + "learning_rate": 3e-05, + "loss": 2.5993, + "step": 878 + }, + { + "epoch": 0.2712125887071891, + "grad_norm": 0.44553110003471375, + "learning_rate": 3e-05, + "loss": 2.6372, + "step": 879 + }, + { + "epoch": 0.271521135452021, + "grad_norm": 0.5241272449493408, + "learning_rate": 3e-05, + "loss": 3.2384, + "step": 880 + }, + { + "epoch": 0.27182968219685283, + "grad_norm": 0.6117975115776062, + "learning_rate": 3e-05, + "loss": 2.8964, + "step": 881 + }, + { + "epoch": 0.27213822894168466, + "grad_norm": 0.3425690233707428, + "learning_rate": 3e-05, + "loss": 2.3628, + "step": 882 + }, + { + "epoch": 0.2724467756865165, + "grad_norm": 0.49667888879776, + "learning_rate": 3e-05, + "loss": 3.1708, + "step": 883 + }, + { + "epoch": 0.27275532243134837, + "grad_norm": 0.3282347321510315, + "learning_rate": 3e-05, + "loss": 2.5675, + "step": 884 + }, + { + "epoch": 0.2730638691761802, + "grad_norm": 0.371405690908432, + "learning_rate": 3e-05, + "loss": 2.3648, + "step": 885 + }, + { + "epoch": 0.273372415921012, + "grad_norm": 0.46397897601127625, + "learning_rate": 3e-05, + "loss": 2.4369, + "step": 886 + }, + { + "epoch": 0.27368096266584385, + "grad_norm": 0.45648089051246643, + "learning_rate": 3e-05, + "loss": 2.5306, + "step": 887 + }, + { + "epoch": 0.27398950941067574, + "grad_norm": 0.3604811728000641, + "learning_rate": 3e-05, + "loss": 2.6633, + "step": 888 + }, + { + "epoch": 0.27429805615550756, + "grad_norm": 0.3597293496131897, + "learning_rate": 3e-05, + "loss": 2.5673, + "step": 889 + }, + { + "epoch": 0.2746066029003394, + "grad_norm": 0.4115181565284729, + "learning_rate": 3e-05, + "loss": 2.6737, + "step": 890 + }, + { + "epoch": 0.2749151496451712, + "grad_norm": 0.5663295388221741, + "learning_rate": 3e-05, + "loss": 3.3189, + "step": 891 + }, + { + "epoch": 0.2752236963900031, + "grad_norm": 0.3877559006214142, + "learning_rate": 3e-05, + "loss": 2.3662, + "step": 892 + }, + { + "epoch": 0.27553224313483493, + "grad_norm": 0.3957102596759796, + "learning_rate": 3e-05, + "loss": 2.6481, + "step": 893 + }, + { + "epoch": 0.27584078987966676, + "grad_norm": 0.4517107307910919, + "learning_rate": 3e-05, + "loss": 2.6274, + "step": 894 + }, + { + "epoch": 0.2761493366244986, + "grad_norm": 0.3920423090457916, + "learning_rate": 3e-05, + "loss": 2.3127, + "step": 895 + }, + { + "epoch": 0.27645788336933047, + "grad_norm": 0.4129326641559601, + "learning_rate": 3e-05, + "loss": 2.8305, + "step": 896 + }, + { + "epoch": 0.2767664301141623, + "grad_norm": 0.434194415807724, + "learning_rate": 3e-05, + "loss": 2.9748, + "step": 897 + }, + { + "epoch": 0.2770749768589941, + "grad_norm": 0.35281941294670105, + "learning_rate": 3e-05, + "loss": 2.6046, + "step": 898 + }, + { + "epoch": 0.27738352360382595, + "grad_norm": 0.3993188142776489, + "learning_rate": 3e-05, + "loss": 3.0656, + "step": 899 + }, + { + "epoch": 0.27769207034865784, + "grad_norm": 0.38514411449432373, + "learning_rate": 3e-05, + "loss": 2.6093, + "step": 900 + }, + { + "epoch": 0.27800061709348967, + "grad_norm": 0.46261972188949585, + "learning_rate": 3e-05, + "loss": 2.7449, + "step": 901 + }, + { + "epoch": 0.2783091638383215, + "grad_norm": 0.35552743077278137, + "learning_rate": 3e-05, + "loss": 2.4989, + "step": 902 + }, + { + "epoch": 0.2786177105831533, + "grad_norm": 0.3849910795688629, + "learning_rate": 3e-05, + "loss": 2.8201, + "step": 903 + }, + { + "epoch": 0.2789262573279852, + "grad_norm": 0.3854105770587921, + "learning_rate": 3e-05, + "loss": 2.866, + "step": 904 + }, + { + "epoch": 0.27923480407281703, + "grad_norm": 0.41260039806365967, + "learning_rate": 3e-05, + "loss": 2.6602, + "step": 905 + }, + { + "epoch": 0.27954335081764886, + "grad_norm": 0.347160667181015, + "learning_rate": 3e-05, + "loss": 2.3522, + "step": 906 + }, + { + "epoch": 0.2798518975624807, + "grad_norm": 0.2681293785572052, + "learning_rate": 3e-05, + "loss": 2.2104, + "step": 907 + }, + { + "epoch": 0.2801604443073126, + "grad_norm": 0.5947923064231873, + "learning_rate": 3e-05, + "loss": 2.9183, + "step": 908 + }, + { + "epoch": 0.2804689910521444, + "grad_norm": 0.3329774737358093, + "learning_rate": 3e-05, + "loss": 2.6277, + "step": 909 + }, + { + "epoch": 0.28077753779697623, + "grad_norm": 0.5521478652954102, + "learning_rate": 3e-05, + "loss": 3.0918, + "step": 910 + }, + { + "epoch": 0.28108608454180806, + "grad_norm": 0.4059542417526245, + "learning_rate": 3e-05, + "loss": 2.7583, + "step": 911 + }, + { + "epoch": 0.28139463128663994, + "grad_norm": 0.406534343957901, + "learning_rate": 3e-05, + "loss": 2.8539, + "step": 912 + }, + { + "epoch": 0.28170317803147177, + "grad_norm": 0.6377788782119751, + "learning_rate": 3e-05, + "loss": 2.7076, + "step": 913 + }, + { + "epoch": 0.2820117247763036, + "grad_norm": 0.3285945653915405, + "learning_rate": 3e-05, + "loss": 2.233, + "step": 914 + }, + { + "epoch": 0.2823202715211355, + "grad_norm": 0.3589547276496887, + "learning_rate": 3e-05, + "loss": 2.6757, + "step": 915 + }, + { + "epoch": 0.2826288182659673, + "grad_norm": 0.5185350775718689, + "learning_rate": 3e-05, + "loss": 2.84, + "step": 916 + }, + { + "epoch": 0.28293736501079914, + "grad_norm": 0.5448761582374573, + "learning_rate": 3e-05, + "loss": 2.6148, + "step": 917 + }, + { + "epoch": 0.28324591175563096, + "grad_norm": 0.40239185094833374, + "learning_rate": 3e-05, + "loss": 2.6744, + "step": 918 + }, + { + "epoch": 0.28355445850046285, + "grad_norm": 0.4547919034957886, + "learning_rate": 3e-05, + "loss": 3.0762, + "step": 919 + }, + { + "epoch": 0.2838630052452947, + "grad_norm": 0.6027969717979431, + "learning_rate": 3e-05, + "loss": 2.5929, + "step": 920 + }, + { + "epoch": 0.2841715519901265, + "grad_norm": 0.4554506242275238, + "learning_rate": 3e-05, + "loss": 2.4028, + "step": 921 + }, + { + "epoch": 0.28448009873495833, + "grad_norm": 0.4088301360607147, + "learning_rate": 3e-05, + "loss": 2.2121, + "step": 922 + }, + { + "epoch": 0.2847886454797902, + "grad_norm": 0.38209256529808044, + "learning_rate": 3e-05, + "loss": 2.8373, + "step": 923 + }, + { + "epoch": 0.28509719222462204, + "grad_norm": 0.3328472673892975, + "learning_rate": 3e-05, + "loss": 2.4444, + "step": 924 + }, + { + "epoch": 0.28540573896945387, + "grad_norm": 0.33774638175964355, + "learning_rate": 3e-05, + "loss": 2.3775, + "step": 925 + }, + { + "epoch": 0.2857142857142857, + "grad_norm": 0.5520322322845459, + "learning_rate": 3e-05, + "loss": 2.6919, + "step": 926 + }, + { + "epoch": 0.2860228324591176, + "grad_norm": 0.42964938282966614, + "learning_rate": 3e-05, + "loss": 2.7063, + "step": 927 + }, + { + "epoch": 0.2863313792039494, + "grad_norm": 0.3997243642807007, + "learning_rate": 3e-05, + "loss": 2.4581, + "step": 928 + }, + { + "epoch": 0.28663992594878124, + "grad_norm": 0.34988248348236084, + "learning_rate": 3e-05, + "loss": 2.6436, + "step": 929 + }, + { + "epoch": 0.28694847269361307, + "grad_norm": 0.5160298347473145, + "learning_rate": 3e-05, + "loss": 2.5583, + "step": 930 + }, + { + "epoch": 0.28725701943844495, + "grad_norm": 0.5120193362236023, + "learning_rate": 3e-05, + "loss": 2.879, + "step": 931 + }, + { + "epoch": 0.2875655661832768, + "grad_norm": 0.380231112241745, + "learning_rate": 3e-05, + "loss": 2.8559, + "step": 932 + }, + { + "epoch": 0.2878741129281086, + "grad_norm": 0.3843991756439209, + "learning_rate": 3e-05, + "loss": 2.547, + "step": 933 + }, + { + "epoch": 0.28818265967294043, + "grad_norm": 0.48810240626335144, + "learning_rate": 3e-05, + "loss": 2.7941, + "step": 934 + }, + { + "epoch": 0.2884912064177723, + "grad_norm": 0.4541907012462616, + "learning_rate": 3e-05, + "loss": 2.6922, + "step": 935 + }, + { + "epoch": 0.28879975316260414, + "grad_norm": 0.45531538128852844, + "learning_rate": 3e-05, + "loss": 2.9196, + "step": 936 + }, + { + "epoch": 0.289108299907436, + "grad_norm": 0.33889535069465637, + "learning_rate": 3e-05, + "loss": 2.6043, + "step": 937 + }, + { + "epoch": 0.2894168466522678, + "grad_norm": 0.41679373383522034, + "learning_rate": 3e-05, + "loss": 2.3954, + "step": 938 + }, + { + "epoch": 0.2897253933970997, + "grad_norm": 0.5239324569702148, + "learning_rate": 3e-05, + "loss": 2.7328, + "step": 939 + }, + { + "epoch": 0.2900339401419315, + "grad_norm": 0.45569634437561035, + "learning_rate": 3e-05, + "loss": 2.8944, + "step": 940 + }, + { + "epoch": 0.29034248688676334, + "grad_norm": 0.29057788848876953, + "learning_rate": 3e-05, + "loss": 2.1483, + "step": 941 + }, + { + "epoch": 0.29065103363159517, + "grad_norm": 0.3790557384490967, + "learning_rate": 3e-05, + "loss": 2.4723, + "step": 942 + }, + { + "epoch": 0.29095958037642705, + "grad_norm": 0.4859466850757599, + "learning_rate": 3e-05, + "loss": 2.8131, + "step": 943 + }, + { + "epoch": 0.2912681271212589, + "grad_norm": 0.5878845453262329, + "learning_rate": 3e-05, + "loss": 2.8616, + "step": 944 + }, + { + "epoch": 0.2915766738660907, + "grad_norm": 0.3621906638145447, + "learning_rate": 3e-05, + "loss": 2.6452, + "step": 945 + }, + { + "epoch": 0.29188522061092254, + "grad_norm": 0.39704960584640503, + "learning_rate": 3e-05, + "loss": 2.688, + "step": 946 + }, + { + "epoch": 0.2921937673557544, + "grad_norm": 0.438273549079895, + "learning_rate": 3e-05, + "loss": 2.6861, + "step": 947 + }, + { + "epoch": 0.29250231410058625, + "grad_norm": 0.42470529675483704, + "learning_rate": 3e-05, + "loss": 2.7117, + "step": 948 + }, + { + "epoch": 0.2928108608454181, + "grad_norm": 0.3004021942615509, + "learning_rate": 3e-05, + "loss": 2.2417, + "step": 949 + }, + { + "epoch": 0.2931194075902499, + "grad_norm": 0.3381553590297699, + "learning_rate": 3e-05, + "loss": 2.7548, + "step": 950 + }, + { + "epoch": 0.2934279543350818, + "grad_norm": 0.32822176814079285, + "learning_rate": 3e-05, + "loss": 2.6632, + "step": 951 + }, + { + "epoch": 0.2937365010799136, + "grad_norm": 0.48720303177833557, + "learning_rate": 3e-05, + "loss": 2.9131, + "step": 952 + }, + { + "epoch": 0.29404504782474544, + "grad_norm": 0.5922092795372009, + "learning_rate": 3e-05, + "loss": 2.6821, + "step": 953 + }, + { + "epoch": 0.29435359456957727, + "grad_norm": 0.36750638484954834, + "learning_rate": 3e-05, + "loss": 2.9744, + "step": 954 + }, + { + "epoch": 0.29466214131440915, + "grad_norm": 0.3950546085834503, + "learning_rate": 3e-05, + "loss": 2.8421, + "step": 955 + }, + { + "epoch": 0.294970688059241, + "grad_norm": 0.33338621258735657, + "learning_rate": 3e-05, + "loss": 2.3283, + "step": 956 + }, + { + "epoch": 0.2952792348040728, + "grad_norm": 0.32808199524879456, + "learning_rate": 3e-05, + "loss": 2.575, + "step": 957 + }, + { + "epoch": 0.29558778154890464, + "grad_norm": 0.40493538975715637, + "learning_rate": 3e-05, + "loss": 2.696, + "step": 958 + }, + { + "epoch": 0.2958963282937365, + "grad_norm": 0.5946307182312012, + "learning_rate": 3e-05, + "loss": 3.0542, + "step": 959 + }, + { + "epoch": 0.29620487503856835, + "grad_norm": 0.49499088525772095, + "learning_rate": 3e-05, + "loss": 2.9535, + "step": 960 + }, + { + "epoch": 0.2965134217834002, + "grad_norm": 0.3947402238845825, + "learning_rate": 3e-05, + "loss": 2.2794, + "step": 961 + }, + { + "epoch": 0.296821968528232, + "grad_norm": 0.6068766713142395, + "learning_rate": 3e-05, + "loss": 2.7137, + "step": 962 + }, + { + "epoch": 0.2971305152730639, + "grad_norm": 0.3848678469657898, + "learning_rate": 3e-05, + "loss": 2.8856, + "step": 963 + }, + { + "epoch": 0.2974390620178957, + "grad_norm": 0.3639110028743744, + "learning_rate": 3e-05, + "loss": 2.6542, + "step": 964 + }, + { + "epoch": 0.29774760876272754, + "grad_norm": 0.4631502330303192, + "learning_rate": 3e-05, + "loss": 2.801, + "step": 965 + }, + { + "epoch": 0.2980561555075594, + "grad_norm": 0.4895254671573639, + "learning_rate": 3e-05, + "loss": 2.5745, + "step": 966 + }, + { + "epoch": 0.29836470225239126, + "grad_norm": 0.32088950276374817, + "learning_rate": 3e-05, + "loss": 2.5099, + "step": 967 + }, + { + "epoch": 0.2986732489972231, + "grad_norm": 0.4241865277290344, + "learning_rate": 3e-05, + "loss": 2.8371, + "step": 968 + }, + { + "epoch": 0.2989817957420549, + "grad_norm": 0.4249207675457001, + "learning_rate": 3e-05, + "loss": 2.786, + "step": 969 + }, + { + "epoch": 0.29929034248688674, + "grad_norm": 0.3566865622997284, + "learning_rate": 3e-05, + "loss": 2.4608, + "step": 970 + }, + { + "epoch": 0.2995988892317186, + "grad_norm": 0.4732634127140045, + "learning_rate": 3e-05, + "loss": 2.4006, + "step": 971 + }, + { + "epoch": 0.29990743597655045, + "grad_norm": 0.31523409485816956, + "learning_rate": 3e-05, + "loss": 2.5125, + "step": 972 + }, + { + "epoch": 0.3002159827213823, + "grad_norm": 0.31227585673332214, + "learning_rate": 3e-05, + "loss": 2.5859, + "step": 973 + }, + { + "epoch": 0.3005245294662141, + "grad_norm": 0.5433022379875183, + "learning_rate": 3e-05, + "loss": 2.7815, + "step": 974 + }, + { + "epoch": 0.300833076211046, + "grad_norm": 0.32463932037353516, + "learning_rate": 3e-05, + "loss": 2.5574, + "step": 975 + }, + { + "epoch": 0.3011416229558778, + "grad_norm": 0.45843788981437683, + "learning_rate": 3e-05, + "loss": 3.3021, + "step": 976 + }, + { + "epoch": 0.30145016970070965, + "grad_norm": 0.43129628896713257, + "learning_rate": 3e-05, + "loss": 2.7665, + "step": 977 + }, + { + "epoch": 0.3017587164455415, + "grad_norm": 0.4639509618282318, + "learning_rate": 3e-05, + "loss": 2.9852, + "step": 978 + }, + { + "epoch": 0.30206726319037336, + "grad_norm": 0.6034327745437622, + "learning_rate": 3e-05, + "loss": 2.7809, + "step": 979 + }, + { + "epoch": 0.3023758099352052, + "grad_norm": 0.4655909538269043, + "learning_rate": 3e-05, + "loss": 2.788, + "step": 980 + }, + { + "epoch": 0.302684356680037, + "grad_norm": 0.43474000692367554, + "learning_rate": 3e-05, + "loss": 2.7155, + "step": 981 + }, + { + "epoch": 0.30299290342486884, + "grad_norm": 0.4621677100658417, + "learning_rate": 3e-05, + "loss": 2.649, + "step": 982 + }, + { + "epoch": 0.3033014501697007, + "grad_norm": 0.39728179574012756, + "learning_rate": 3e-05, + "loss": 2.7499, + "step": 983 + }, + { + "epoch": 0.30360999691453255, + "grad_norm": 0.41515350341796875, + "learning_rate": 3e-05, + "loss": 2.4945, + "step": 984 + }, + { + "epoch": 0.3039185436593644, + "grad_norm": 0.4541874825954437, + "learning_rate": 3e-05, + "loss": 2.9019, + "step": 985 + }, + { + "epoch": 0.3042270904041962, + "grad_norm": 0.4044342339038849, + "learning_rate": 3e-05, + "loss": 2.5939, + "step": 986 + }, + { + "epoch": 0.3045356371490281, + "grad_norm": 0.5499434471130371, + "learning_rate": 3e-05, + "loss": 3.1831, + "step": 987 + }, + { + "epoch": 0.3048441838938599, + "grad_norm": 0.4253259599208832, + "learning_rate": 3e-05, + "loss": 2.6482, + "step": 988 + }, + { + "epoch": 0.30515273063869175, + "grad_norm": 0.5634761452674866, + "learning_rate": 3e-05, + "loss": 3.0694, + "step": 989 + }, + { + "epoch": 0.3054612773835236, + "grad_norm": 0.3745432496070862, + "learning_rate": 3e-05, + "loss": 2.864, + "step": 990 + }, + { + "epoch": 0.30576982412835546, + "grad_norm": 0.31692636013031006, + "learning_rate": 3e-05, + "loss": 2.4637, + "step": 991 + }, + { + "epoch": 0.3060783708731873, + "grad_norm": 0.45177918672561646, + "learning_rate": 3e-05, + "loss": 2.7045, + "step": 992 + }, + { + "epoch": 0.3063869176180191, + "grad_norm": 0.3548758924007416, + "learning_rate": 3e-05, + "loss": 2.4143, + "step": 993 + }, + { + "epoch": 0.30669546436285094, + "grad_norm": 0.4161062240600586, + "learning_rate": 3e-05, + "loss": 3.0351, + "step": 994 + }, + { + "epoch": 0.30700401110768283, + "grad_norm": 0.5312502384185791, + "learning_rate": 3e-05, + "loss": 2.8126, + "step": 995 + }, + { + "epoch": 0.30731255785251466, + "grad_norm": 0.4165649712085724, + "learning_rate": 3e-05, + "loss": 2.311, + "step": 996 + }, + { + "epoch": 0.3076211045973465, + "grad_norm": 0.4763493537902832, + "learning_rate": 3e-05, + "loss": 2.5855, + "step": 997 + }, + { + "epoch": 0.30792965134217837, + "grad_norm": 0.3590225279331207, + "learning_rate": 3e-05, + "loss": 2.4563, + "step": 998 + }, + { + "epoch": 0.3082381980870102, + "grad_norm": 0.43557247519493103, + "learning_rate": 3e-05, + "loss": 2.7756, + "step": 999 + }, + { + "epoch": 0.308546744831842, + "grad_norm": 0.5323253870010376, + "learning_rate": 3e-05, + "loss": 2.9261, + "step": 1000 + }, + { + "epoch": 0.30885529157667385, + "grad_norm": 0.3511796295642853, + "learning_rate": 3e-05, + "loss": 2.7418, + "step": 1001 + }, + { + "epoch": 0.30916383832150574, + "grad_norm": 0.37199896574020386, + "learning_rate": 3e-05, + "loss": 2.6009, + "step": 1002 + }, + { + "epoch": 0.30947238506633756, + "grad_norm": 0.34085142612457275, + "learning_rate": 3e-05, + "loss": 2.6959, + "step": 1003 + }, + { + "epoch": 0.3097809318111694, + "grad_norm": 0.4471310079097748, + "learning_rate": 3e-05, + "loss": 2.8914, + "step": 1004 + }, + { + "epoch": 0.3100894785560012, + "grad_norm": 0.3000938296318054, + "learning_rate": 3e-05, + "loss": 2.3222, + "step": 1005 + }, + { + "epoch": 0.3103980253008331, + "grad_norm": 0.4500029385089874, + "learning_rate": 3e-05, + "loss": 2.6863, + "step": 1006 + }, + { + "epoch": 0.31070657204566493, + "grad_norm": 0.36391642689704895, + "learning_rate": 3e-05, + "loss": 2.4052, + "step": 1007 + }, + { + "epoch": 0.31101511879049676, + "grad_norm": 0.33557793498039246, + "learning_rate": 3e-05, + "loss": 2.4544, + "step": 1008 + }, + { + "epoch": 0.3113236655353286, + "grad_norm": 0.5357667803764343, + "learning_rate": 3e-05, + "loss": 2.9365, + "step": 1009 + }, + { + "epoch": 0.31163221228016047, + "grad_norm": 0.40679875016212463, + "learning_rate": 3e-05, + "loss": 2.6265, + "step": 1010 + }, + { + "epoch": 0.3119407590249923, + "grad_norm": 0.4948636591434479, + "learning_rate": 3e-05, + "loss": 3.1441, + "step": 1011 + }, + { + "epoch": 0.3122493057698241, + "grad_norm": 0.3034040331840515, + "learning_rate": 3e-05, + "loss": 2.331, + "step": 1012 + }, + { + "epoch": 0.31255785251465595, + "grad_norm": 0.4402029812335968, + "learning_rate": 3e-05, + "loss": 2.7928, + "step": 1013 + }, + { + "epoch": 0.31286639925948784, + "grad_norm": 0.40119442343711853, + "learning_rate": 3e-05, + "loss": 2.7245, + "step": 1014 + }, + { + "epoch": 0.31317494600431967, + "grad_norm": 0.4709239900112152, + "learning_rate": 3e-05, + "loss": 2.8281, + "step": 1015 + }, + { + "epoch": 0.3134834927491515, + "grad_norm": 0.43850257992744446, + "learning_rate": 3e-05, + "loss": 2.4768, + "step": 1016 + }, + { + "epoch": 0.3137920394939833, + "grad_norm": 0.3689449429512024, + "learning_rate": 3e-05, + "loss": 2.4916, + "step": 1017 + }, + { + "epoch": 0.3141005862388152, + "grad_norm": 0.3573774993419647, + "learning_rate": 3e-05, + "loss": 2.5842, + "step": 1018 + }, + { + "epoch": 0.31440913298364703, + "grad_norm": 0.5338215231895447, + "learning_rate": 3e-05, + "loss": 3.1707, + "step": 1019 + }, + { + "epoch": 0.31471767972847886, + "grad_norm": 0.47685402631759644, + "learning_rate": 3e-05, + "loss": 2.8287, + "step": 1020 + }, + { + "epoch": 0.3150262264733107, + "grad_norm": 0.29378730058670044, + "learning_rate": 3e-05, + "loss": 2.3566, + "step": 1021 + }, + { + "epoch": 0.31533477321814257, + "grad_norm": 0.3345448076725006, + "learning_rate": 3e-05, + "loss": 2.8883, + "step": 1022 + }, + { + "epoch": 0.3156433199629744, + "grad_norm": 0.3130493760108948, + "learning_rate": 3e-05, + "loss": 2.2636, + "step": 1023 + }, + { + "epoch": 0.31595186670780623, + "grad_norm": 0.4794589877128601, + "learning_rate": 3e-05, + "loss": 2.515, + "step": 1024 + }, + { + "epoch": 0.31626041345263806, + "grad_norm": 0.4308234751224518, + "learning_rate": 3e-05, + "loss": 3.2071, + "step": 1025 + }, + { + "epoch": 0.31656896019746994, + "grad_norm": 0.33414486050605774, + "learning_rate": 3e-05, + "loss": 2.5691, + "step": 1026 + }, + { + "epoch": 0.31687750694230177, + "grad_norm": 0.3229790925979614, + "learning_rate": 3e-05, + "loss": 2.3644, + "step": 1027 + }, + { + "epoch": 0.3171860536871336, + "grad_norm": 0.5011124610900879, + "learning_rate": 3e-05, + "loss": 2.7893, + "step": 1028 + }, + { + "epoch": 0.3174946004319654, + "grad_norm": 0.37093424797058105, + "learning_rate": 3e-05, + "loss": 2.7773, + "step": 1029 + }, + { + "epoch": 0.3178031471767973, + "grad_norm": 0.5820983052253723, + "learning_rate": 3e-05, + "loss": 2.826, + "step": 1030 + }, + { + "epoch": 0.31811169392162914, + "grad_norm": 0.3745529353618622, + "learning_rate": 3e-05, + "loss": 2.678, + "step": 1031 + }, + { + "epoch": 0.31842024066646096, + "grad_norm": 0.4362945854663849, + "learning_rate": 3e-05, + "loss": 2.7813, + "step": 1032 + }, + { + "epoch": 0.3187287874112928, + "grad_norm": 0.45973077416419983, + "learning_rate": 3e-05, + "loss": 3.0007, + "step": 1033 + }, + { + "epoch": 0.3190373341561247, + "grad_norm": 0.3230498731136322, + "learning_rate": 3e-05, + "loss": 2.1809, + "step": 1034 + }, + { + "epoch": 0.3193458809009565, + "grad_norm": 0.3093605041503906, + "learning_rate": 3e-05, + "loss": 2.6337, + "step": 1035 + }, + { + "epoch": 0.31965442764578833, + "grad_norm": 0.33831560611724854, + "learning_rate": 3e-05, + "loss": 2.667, + "step": 1036 + }, + { + "epoch": 0.31996297439062016, + "grad_norm": 0.5394222140312195, + "learning_rate": 3e-05, + "loss": 3.0874, + "step": 1037 + }, + { + "epoch": 0.32027152113545204, + "grad_norm": 0.46193042397499084, + "learning_rate": 3e-05, + "loss": 3.1571, + "step": 1038 + }, + { + "epoch": 0.32058006788028387, + "grad_norm": 0.4071999788284302, + "learning_rate": 3e-05, + "loss": 2.7319, + "step": 1039 + }, + { + "epoch": 0.3208886146251157, + "grad_norm": 0.4034838080406189, + "learning_rate": 3e-05, + "loss": 2.5469, + "step": 1040 + }, + { + "epoch": 0.3211971613699475, + "grad_norm": 0.4690152406692505, + "learning_rate": 3e-05, + "loss": 2.7243, + "step": 1041 + }, + { + "epoch": 0.3215057081147794, + "grad_norm": 0.33456355333328247, + "learning_rate": 3e-05, + "loss": 2.2573, + "step": 1042 + }, + { + "epoch": 0.32181425485961124, + "grad_norm": 0.3276689350605011, + "learning_rate": 3e-05, + "loss": 2.3228, + "step": 1043 + }, + { + "epoch": 0.32212280160444307, + "grad_norm": 0.3934038579463959, + "learning_rate": 3e-05, + "loss": 2.7742, + "step": 1044 + }, + { + "epoch": 0.3224313483492749, + "grad_norm": 0.3162141740322113, + "learning_rate": 3e-05, + "loss": 2.3948, + "step": 1045 + }, + { + "epoch": 0.3227398950941068, + "grad_norm": 0.35283178091049194, + "learning_rate": 3e-05, + "loss": 2.5482, + "step": 1046 + }, + { + "epoch": 0.3230484418389386, + "grad_norm": 0.30801281332969666, + "learning_rate": 3e-05, + "loss": 2.3499, + "step": 1047 + }, + { + "epoch": 0.32335698858377043, + "grad_norm": 0.2911578118801117, + "learning_rate": 3e-05, + "loss": 2.5108, + "step": 1048 + }, + { + "epoch": 0.32366553532860226, + "grad_norm": 0.4680328965187073, + "learning_rate": 3e-05, + "loss": 2.8629, + "step": 1049 + }, + { + "epoch": 0.32397408207343414, + "grad_norm": 0.27995920181274414, + "learning_rate": 3e-05, + "loss": 2.2291, + "step": 1050 + }, + { + "epoch": 0.32428262881826597, + "grad_norm": 0.29879486560821533, + "learning_rate": 3e-05, + "loss": 2.4503, + "step": 1051 + }, + { + "epoch": 0.3245911755630978, + "grad_norm": 0.4487646520137787, + "learning_rate": 3e-05, + "loss": 3.1378, + "step": 1052 + }, + { + "epoch": 0.32489972230792963, + "grad_norm": 0.4540494680404663, + "learning_rate": 3e-05, + "loss": 3.0161, + "step": 1053 + }, + { + "epoch": 0.3252082690527615, + "grad_norm": 0.6287296414375305, + "learning_rate": 3e-05, + "loss": 2.7699, + "step": 1054 + }, + { + "epoch": 0.32551681579759334, + "grad_norm": 0.3549972176551819, + "learning_rate": 3e-05, + "loss": 2.517, + "step": 1055 + }, + { + "epoch": 0.32582536254242517, + "grad_norm": 0.35076603293418884, + "learning_rate": 3e-05, + "loss": 2.5529, + "step": 1056 + }, + { + "epoch": 0.326133909287257, + "grad_norm": 0.5306907296180725, + "learning_rate": 3e-05, + "loss": 2.6375, + "step": 1057 + }, + { + "epoch": 0.3264424560320889, + "grad_norm": 0.6471700072288513, + "learning_rate": 3e-05, + "loss": 3.0257, + "step": 1058 + }, + { + "epoch": 0.3267510027769207, + "grad_norm": 0.4791683256626129, + "learning_rate": 3e-05, + "loss": 2.673, + "step": 1059 + }, + { + "epoch": 0.32705954952175254, + "grad_norm": 0.3479447364807129, + "learning_rate": 3e-05, + "loss": 2.4918, + "step": 1060 + }, + { + "epoch": 0.32736809626658436, + "grad_norm": 0.5895714163780212, + "learning_rate": 3e-05, + "loss": 2.7425, + "step": 1061 + }, + { + "epoch": 0.32767664301141625, + "grad_norm": 0.600107729434967, + "learning_rate": 3e-05, + "loss": 3.1736, + "step": 1062 + }, + { + "epoch": 0.3279851897562481, + "grad_norm": 0.37535980343818665, + "learning_rate": 3e-05, + "loss": 2.5499, + "step": 1063 + }, + { + "epoch": 0.3282937365010799, + "grad_norm": 0.5155684351921082, + "learning_rate": 3e-05, + "loss": 2.6339, + "step": 1064 + }, + { + "epoch": 0.32860228324591173, + "grad_norm": 0.4304593801498413, + "learning_rate": 3e-05, + "loss": 2.9669, + "step": 1065 + }, + { + "epoch": 0.3289108299907436, + "grad_norm": 0.38700851798057556, + "learning_rate": 3e-05, + "loss": 2.6592, + "step": 1066 + }, + { + "epoch": 0.32921937673557544, + "grad_norm": 0.42323800921440125, + "learning_rate": 3e-05, + "loss": 2.468, + "step": 1067 + }, + { + "epoch": 0.32952792348040727, + "grad_norm": 0.4084995687007904, + "learning_rate": 3e-05, + "loss": 2.908, + "step": 1068 + }, + { + "epoch": 0.3298364702252391, + "grad_norm": 0.4313088655471802, + "learning_rate": 3e-05, + "loss": 2.6691, + "step": 1069 + }, + { + "epoch": 0.330145016970071, + "grad_norm": 0.7936158776283264, + "learning_rate": 3e-05, + "loss": 2.912, + "step": 1070 + }, + { + "epoch": 0.3304535637149028, + "grad_norm": 0.300073504447937, + "learning_rate": 3e-05, + "loss": 2.2288, + "step": 1071 + }, + { + "epoch": 0.33076211045973464, + "grad_norm": 0.37451618909835815, + "learning_rate": 3e-05, + "loss": 2.9299, + "step": 1072 + }, + { + "epoch": 0.33107065720456647, + "grad_norm": 0.5415565967559814, + "learning_rate": 3e-05, + "loss": 2.7914, + "step": 1073 + }, + { + "epoch": 0.33137920394939835, + "grad_norm": 0.35867804288864136, + "learning_rate": 3e-05, + "loss": 2.8536, + "step": 1074 + }, + { + "epoch": 0.3316877506942302, + "grad_norm": 0.4286486506462097, + "learning_rate": 3e-05, + "loss": 2.7861, + "step": 1075 + }, + { + "epoch": 0.331996297439062, + "grad_norm": 0.31424498558044434, + "learning_rate": 3e-05, + "loss": 2.5602, + "step": 1076 + }, + { + "epoch": 0.33230484418389383, + "grad_norm": 0.2903349697589874, + "learning_rate": 3e-05, + "loss": 2.2673, + "step": 1077 + }, + { + "epoch": 0.3326133909287257, + "grad_norm": 0.3674512505531311, + "learning_rate": 3e-05, + "loss": 2.7277, + "step": 1078 + }, + { + "epoch": 0.33292193767355754, + "grad_norm": 0.5027512311935425, + "learning_rate": 3e-05, + "loss": 2.6751, + "step": 1079 + }, + { + "epoch": 0.33323048441838937, + "grad_norm": 0.33197149634361267, + "learning_rate": 3e-05, + "loss": 2.2715, + "step": 1080 + }, + { + "epoch": 0.33353903116322126, + "grad_norm": 0.3024391233921051, + "learning_rate": 3e-05, + "loss": 2.5931, + "step": 1081 + }, + { + "epoch": 0.3338475779080531, + "grad_norm": 0.5677501559257507, + "learning_rate": 3e-05, + "loss": 2.7854, + "step": 1082 + }, + { + "epoch": 0.3341561246528849, + "grad_norm": 0.349448025226593, + "learning_rate": 3e-05, + "loss": 2.5758, + "step": 1083 + }, + { + "epoch": 0.33446467139771674, + "grad_norm": 0.4105369746685028, + "learning_rate": 3e-05, + "loss": 2.5552, + "step": 1084 + }, + { + "epoch": 0.3347732181425486, + "grad_norm": 0.423350989818573, + "learning_rate": 3e-05, + "loss": 2.8894, + "step": 1085 + }, + { + "epoch": 0.33508176488738045, + "grad_norm": 0.45168107748031616, + "learning_rate": 3e-05, + "loss": 2.7562, + "step": 1086 + }, + { + "epoch": 0.3353903116322123, + "grad_norm": 0.354754775762558, + "learning_rate": 3e-05, + "loss": 2.8504, + "step": 1087 + }, + { + "epoch": 0.3356988583770441, + "grad_norm": 0.31282752752304077, + "learning_rate": 3e-05, + "loss": 2.6208, + "step": 1088 + }, + { + "epoch": 0.336007405121876, + "grad_norm": 0.3565126359462738, + "learning_rate": 3e-05, + "loss": 2.7854, + "step": 1089 + }, + { + "epoch": 0.3363159518667078, + "grad_norm": 0.4445134401321411, + "learning_rate": 3e-05, + "loss": 2.8022, + "step": 1090 + }, + { + "epoch": 0.33662449861153965, + "grad_norm": 0.682829737663269, + "learning_rate": 3e-05, + "loss": 2.7264, + "step": 1091 + }, + { + "epoch": 0.3369330453563715, + "grad_norm": 0.4143361747264862, + "learning_rate": 3e-05, + "loss": 2.8082, + "step": 1092 + }, + { + "epoch": 0.33724159210120336, + "grad_norm": 0.4276883602142334, + "learning_rate": 3e-05, + "loss": 2.5857, + "step": 1093 + }, + { + "epoch": 0.3375501388460352, + "grad_norm": 0.5904972553253174, + "learning_rate": 3e-05, + "loss": 2.4174, + "step": 1094 + }, + { + "epoch": 0.337858685590867, + "grad_norm": 0.5379177927970886, + "learning_rate": 3e-05, + "loss": 2.9506, + "step": 1095 + }, + { + "epoch": 0.33816723233569884, + "grad_norm": 0.3578889071941376, + "learning_rate": 3e-05, + "loss": 2.6252, + "step": 1096 + }, + { + "epoch": 0.3384757790805307, + "grad_norm": 0.459988534450531, + "learning_rate": 3e-05, + "loss": 2.7379, + "step": 1097 + }, + { + "epoch": 0.33878432582536255, + "grad_norm": 0.5335902571678162, + "learning_rate": 3e-05, + "loss": 2.7689, + "step": 1098 + }, + { + "epoch": 0.3390928725701944, + "grad_norm": 0.427128404378891, + "learning_rate": 3e-05, + "loss": 2.5377, + "step": 1099 + }, + { + "epoch": 0.3394014193150262, + "grad_norm": 0.36371105909347534, + "learning_rate": 3e-05, + "loss": 2.3228, + "step": 1100 + }, + { + "epoch": 0.3397099660598581, + "grad_norm": 0.6602774262428284, + "learning_rate": 3e-05, + "loss": 2.7878, + "step": 1101 + }, + { + "epoch": 0.3400185128046899, + "grad_norm": 0.27885714173316956, + "learning_rate": 3e-05, + "loss": 2.2656, + "step": 1102 + }, + { + "epoch": 0.34032705954952175, + "grad_norm": 0.5975661277770996, + "learning_rate": 3e-05, + "loss": 2.9421, + "step": 1103 + }, + { + "epoch": 0.3406356062943536, + "grad_norm": 0.6099026203155518, + "learning_rate": 3e-05, + "loss": 3.1214, + "step": 1104 + }, + { + "epoch": 0.34094415303918546, + "grad_norm": 0.4762013554573059, + "learning_rate": 3e-05, + "loss": 2.3022, + "step": 1105 + }, + { + "epoch": 0.3412526997840173, + "grad_norm": 0.6812524795532227, + "learning_rate": 3e-05, + "loss": 2.5754, + "step": 1106 + }, + { + "epoch": 0.3415612465288491, + "grad_norm": 0.41604599356651306, + "learning_rate": 3e-05, + "loss": 2.7934, + "step": 1107 + }, + { + "epoch": 0.34186979327368094, + "grad_norm": 0.40861639380455017, + "learning_rate": 3e-05, + "loss": 2.7791, + "step": 1108 + }, + { + "epoch": 0.3421783400185128, + "grad_norm": 0.533877968788147, + "learning_rate": 3e-05, + "loss": 3.0471, + "step": 1109 + }, + { + "epoch": 0.34248688676334466, + "grad_norm": 0.34040915966033936, + "learning_rate": 3e-05, + "loss": 2.1942, + "step": 1110 + }, + { + "epoch": 0.3427954335081765, + "grad_norm": 0.5931209921836853, + "learning_rate": 3e-05, + "loss": 2.6718, + "step": 1111 + }, + { + "epoch": 0.3431039802530083, + "grad_norm": 0.4841914772987366, + "learning_rate": 3e-05, + "loss": 2.5222, + "step": 1112 + }, + { + "epoch": 0.3434125269978402, + "grad_norm": 0.32610583305358887, + "learning_rate": 3e-05, + "loss": 2.5451, + "step": 1113 + }, + { + "epoch": 0.343721073742672, + "grad_norm": 0.42859357595443726, + "learning_rate": 3e-05, + "loss": 2.9902, + "step": 1114 + }, + { + "epoch": 0.34402962048750385, + "grad_norm": 0.4420587122440338, + "learning_rate": 3e-05, + "loss": 2.6019, + "step": 1115 + }, + { + "epoch": 0.3443381672323357, + "grad_norm": 0.4523164927959442, + "learning_rate": 3e-05, + "loss": 2.6256, + "step": 1116 + }, + { + "epoch": 0.34464671397716756, + "grad_norm": 0.4764743447303772, + "learning_rate": 3e-05, + "loss": 2.942, + "step": 1117 + }, + { + "epoch": 0.3449552607219994, + "grad_norm": 0.40522849559783936, + "learning_rate": 3e-05, + "loss": 2.6806, + "step": 1118 + }, + { + "epoch": 0.3452638074668312, + "grad_norm": 0.331582248210907, + "learning_rate": 3e-05, + "loss": 2.3276, + "step": 1119 + }, + { + "epoch": 0.34557235421166305, + "grad_norm": 0.4719706177711487, + "learning_rate": 3e-05, + "loss": 3.0291, + "step": 1120 + }, + { + "epoch": 0.34588090095649493, + "grad_norm": 0.44904038310050964, + "learning_rate": 3e-05, + "loss": 2.8547, + "step": 1121 + }, + { + "epoch": 0.34618944770132676, + "grad_norm": 0.43688273429870605, + "learning_rate": 3e-05, + "loss": 2.4615, + "step": 1122 + }, + { + "epoch": 0.3464979944461586, + "grad_norm": 0.4403668940067291, + "learning_rate": 3e-05, + "loss": 3.0372, + "step": 1123 + }, + { + "epoch": 0.3468065411909904, + "grad_norm": 0.3114491105079651, + "learning_rate": 3e-05, + "loss": 2.5904, + "step": 1124 + }, + { + "epoch": 0.3471150879358223, + "grad_norm": 0.43030625581741333, + "learning_rate": 3e-05, + "loss": 2.5914, + "step": 1125 + }, + { + "epoch": 0.3474236346806541, + "grad_norm": 0.5228371620178223, + "learning_rate": 3e-05, + "loss": 3.269, + "step": 1126 + }, + { + "epoch": 0.34773218142548595, + "grad_norm": 0.35842400789260864, + "learning_rate": 3e-05, + "loss": 2.6338, + "step": 1127 + }, + { + "epoch": 0.3480407281703178, + "grad_norm": 0.33594179153442383, + "learning_rate": 3e-05, + "loss": 2.5405, + "step": 1128 + }, + { + "epoch": 0.34834927491514966, + "grad_norm": 0.34110668301582336, + "learning_rate": 3e-05, + "loss": 2.5228, + "step": 1129 + }, + { + "epoch": 0.3486578216599815, + "grad_norm": 0.42432937026023865, + "learning_rate": 3e-05, + "loss": 2.8293, + "step": 1130 + }, + { + "epoch": 0.3489663684048133, + "grad_norm": 0.42318588495254517, + "learning_rate": 3e-05, + "loss": 2.2771, + "step": 1131 + }, + { + "epoch": 0.34927491514964515, + "grad_norm": 0.34015128016471863, + "learning_rate": 3e-05, + "loss": 2.7245, + "step": 1132 + }, + { + "epoch": 0.34958346189447703, + "grad_norm": 0.3692342936992645, + "learning_rate": 3e-05, + "loss": 2.5157, + "step": 1133 + }, + { + "epoch": 0.34989200863930886, + "grad_norm": 0.5286168456077576, + "learning_rate": 3e-05, + "loss": 3.3151, + "step": 1134 + }, + { + "epoch": 0.3502005553841407, + "grad_norm": 0.4966541528701782, + "learning_rate": 3e-05, + "loss": 3.3489, + "step": 1135 + }, + { + "epoch": 0.3505091021289725, + "grad_norm": 0.5056973695755005, + "learning_rate": 3e-05, + "loss": 2.8628, + "step": 1136 + }, + { + "epoch": 0.3508176488738044, + "grad_norm": 0.40126872062683105, + "learning_rate": 3e-05, + "loss": 3.2355, + "step": 1137 + }, + { + "epoch": 0.3511261956186362, + "grad_norm": 0.4347645938396454, + "learning_rate": 3e-05, + "loss": 2.3129, + "step": 1138 + }, + { + "epoch": 0.35143474236346806, + "grad_norm": 0.5079309940338135, + "learning_rate": 3e-05, + "loss": 2.6779, + "step": 1139 + }, + { + "epoch": 0.3517432891082999, + "grad_norm": 0.34270283579826355, + "learning_rate": 3e-05, + "loss": 2.6367, + "step": 1140 + }, + { + "epoch": 0.35205183585313177, + "grad_norm": 0.3936125636100769, + "learning_rate": 3e-05, + "loss": 2.7157, + "step": 1141 + }, + { + "epoch": 0.3523603825979636, + "grad_norm": 0.5401539206504822, + "learning_rate": 3e-05, + "loss": 3.1242, + "step": 1142 + }, + { + "epoch": 0.3526689293427954, + "grad_norm": 0.3918668329715729, + "learning_rate": 3e-05, + "loss": 2.9379, + "step": 1143 + }, + { + "epoch": 0.35297747608762725, + "grad_norm": 0.3724942207336426, + "learning_rate": 3e-05, + "loss": 2.5844, + "step": 1144 + }, + { + "epoch": 0.35328602283245913, + "grad_norm": 0.4116598963737488, + "learning_rate": 3e-05, + "loss": 2.6892, + "step": 1145 + }, + { + "epoch": 0.35359456957729096, + "grad_norm": 0.5307642817497253, + "learning_rate": 3e-05, + "loss": 2.7121, + "step": 1146 + }, + { + "epoch": 0.3539031163221228, + "grad_norm": 0.34816232323646545, + "learning_rate": 3e-05, + "loss": 2.6197, + "step": 1147 + }, + { + "epoch": 0.3542116630669546, + "grad_norm": 0.3448748290538788, + "learning_rate": 3e-05, + "loss": 2.5077, + "step": 1148 + }, + { + "epoch": 0.3545202098117865, + "grad_norm": 0.39467447996139526, + "learning_rate": 3e-05, + "loss": 2.3641, + "step": 1149 + }, + { + "epoch": 0.35482875655661833, + "grad_norm": 0.39322465658187866, + "learning_rate": 3e-05, + "loss": 2.6104, + "step": 1150 + }, + { + "epoch": 0.35513730330145016, + "grad_norm": 0.34313109517097473, + "learning_rate": 3e-05, + "loss": 2.7748, + "step": 1151 + }, + { + "epoch": 0.355445850046282, + "grad_norm": 0.32297638058662415, + "learning_rate": 3e-05, + "loss": 2.6006, + "step": 1152 + }, + { + "epoch": 0.35575439679111387, + "grad_norm": 0.35026779770851135, + "learning_rate": 3e-05, + "loss": 2.3611, + "step": 1153 + }, + { + "epoch": 0.3560629435359457, + "grad_norm": 0.4179763197898865, + "learning_rate": 3e-05, + "loss": 3.1878, + "step": 1154 + }, + { + "epoch": 0.3563714902807775, + "grad_norm": 0.3217645287513733, + "learning_rate": 3e-05, + "loss": 2.3414, + "step": 1155 + }, + { + "epoch": 0.35668003702560935, + "grad_norm": 0.43228569626808167, + "learning_rate": 3e-05, + "loss": 3.0959, + "step": 1156 + }, + { + "epoch": 0.35698858377044124, + "grad_norm": 0.5505173802375793, + "learning_rate": 3e-05, + "loss": 3.0136, + "step": 1157 + }, + { + "epoch": 0.35729713051527306, + "grad_norm": 0.31909194588661194, + "learning_rate": 3e-05, + "loss": 2.4238, + "step": 1158 + }, + { + "epoch": 0.3576056772601049, + "grad_norm": 0.38182443380355835, + "learning_rate": 3e-05, + "loss": 2.8554, + "step": 1159 + }, + { + "epoch": 0.3579142240049367, + "grad_norm": 0.3917055130004883, + "learning_rate": 3e-05, + "loss": 2.4918, + "step": 1160 + }, + { + "epoch": 0.3582227707497686, + "grad_norm": 0.30668020248413086, + "learning_rate": 3e-05, + "loss": 2.4466, + "step": 1161 + }, + { + "epoch": 0.35853131749460043, + "grad_norm": 0.4843204617500305, + "learning_rate": 3e-05, + "loss": 3.0447, + "step": 1162 + }, + { + "epoch": 0.35883986423943226, + "grad_norm": 0.3819452226161957, + "learning_rate": 3e-05, + "loss": 2.8378, + "step": 1163 + }, + { + "epoch": 0.35914841098426414, + "grad_norm": 0.41383904218673706, + "learning_rate": 3e-05, + "loss": 2.558, + "step": 1164 + }, + { + "epoch": 0.35945695772909597, + "grad_norm": 0.31443139910697937, + "learning_rate": 3e-05, + "loss": 2.7383, + "step": 1165 + }, + { + "epoch": 0.3597655044739278, + "grad_norm": 0.39199626445770264, + "learning_rate": 3e-05, + "loss": 2.7038, + "step": 1166 + }, + { + "epoch": 0.3600740512187596, + "grad_norm": 0.3552488386631012, + "learning_rate": 3e-05, + "loss": 2.3299, + "step": 1167 + }, + { + "epoch": 0.3603825979635915, + "grad_norm": 0.45004716515541077, + "learning_rate": 3e-05, + "loss": 3.0991, + "step": 1168 + }, + { + "epoch": 0.36069114470842334, + "grad_norm": 0.3720468282699585, + "learning_rate": 3e-05, + "loss": 2.5241, + "step": 1169 + }, + { + "epoch": 0.36099969145325517, + "grad_norm": 0.4281458258628845, + "learning_rate": 3e-05, + "loss": 2.9697, + "step": 1170 + }, + { + "epoch": 0.361308238198087, + "grad_norm": 0.4345632791519165, + "learning_rate": 3e-05, + "loss": 2.473, + "step": 1171 + }, + { + "epoch": 0.3616167849429189, + "grad_norm": 0.28444620966911316, + "learning_rate": 3e-05, + "loss": 2.2297, + "step": 1172 + }, + { + "epoch": 0.3619253316877507, + "grad_norm": 0.33514413237571716, + "learning_rate": 3e-05, + "loss": 2.7778, + "step": 1173 + }, + { + "epoch": 0.36223387843258253, + "grad_norm": 0.29754677414894104, + "learning_rate": 3e-05, + "loss": 2.5157, + "step": 1174 + }, + { + "epoch": 0.36254242517741436, + "grad_norm": 0.3521910309791565, + "learning_rate": 3e-05, + "loss": 2.8631, + "step": 1175 + }, + { + "epoch": 0.36285097192224625, + "grad_norm": 0.35081109404563904, + "learning_rate": 3e-05, + "loss": 2.3547, + "step": 1176 + }, + { + "epoch": 0.3631595186670781, + "grad_norm": 0.6115928292274475, + "learning_rate": 3e-05, + "loss": 3.0622, + "step": 1177 + }, + { + "epoch": 0.3634680654119099, + "grad_norm": 0.3421701192855835, + "learning_rate": 3e-05, + "loss": 2.5053, + "step": 1178 + }, + { + "epoch": 0.36377661215674173, + "grad_norm": 0.42282208800315857, + "learning_rate": 3e-05, + "loss": 3.1358, + "step": 1179 + }, + { + "epoch": 0.3640851589015736, + "grad_norm": 0.3527933359146118, + "learning_rate": 3e-05, + "loss": 2.7091, + "step": 1180 + }, + { + "epoch": 0.36439370564640544, + "grad_norm": 0.40788233280181885, + "learning_rate": 3e-05, + "loss": 2.7765, + "step": 1181 + }, + { + "epoch": 0.36470225239123727, + "grad_norm": 0.39160051941871643, + "learning_rate": 3e-05, + "loss": 2.8549, + "step": 1182 + }, + { + "epoch": 0.3650107991360691, + "grad_norm": 0.4997316598892212, + "learning_rate": 3e-05, + "loss": 2.7178, + "step": 1183 + }, + { + "epoch": 0.365319345880901, + "grad_norm": 0.376682847738266, + "learning_rate": 3e-05, + "loss": 2.7221, + "step": 1184 + }, + { + "epoch": 0.3656278926257328, + "grad_norm": 0.5439577102661133, + "learning_rate": 3e-05, + "loss": 2.7612, + "step": 1185 + }, + { + "epoch": 0.36593643937056464, + "grad_norm": 0.375169962644577, + "learning_rate": 3e-05, + "loss": 2.8623, + "step": 1186 + }, + { + "epoch": 0.36624498611539646, + "grad_norm": 0.3876233994960785, + "learning_rate": 3e-05, + "loss": 2.7473, + "step": 1187 + }, + { + "epoch": 0.36655353286022835, + "grad_norm": 0.36275288462638855, + "learning_rate": 3e-05, + "loss": 2.7514, + "step": 1188 + }, + { + "epoch": 0.3668620796050602, + "grad_norm": 0.36678746342658997, + "learning_rate": 3e-05, + "loss": 2.8159, + "step": 1189 + }, + { + "epoch": 0.367170626349892, + "grad_norm": 0.4482974410057068, + "learning_rate": 3e-05, + "loss": 2.6262, + "step": 1190 + }, + { + "epoch": 0.36747917309472383, + "grad_norm": 0.5268934369087219, + "learning_rate": 3e-05, + "loss": 3.0075, + "step": 1191 + }, + { + "epoch": 0.3677877198395557, + "grad_norm": 0.5856001377105713, + "learning_rate": 3e-05, + "loss": 3.1768, + "step": 1192 + }, + { + "epoch": 0.36809626658438754, + "grad_norm": 0.36955198645591736, + "learning_rate": 3e-05, + "loss": 2.2928, + "step": 1193 + }, + { + "epoch": 0.36840481332921937, + "grad_norm": 0.4243110120296478, + "learning_rate": 3e-05, + "loss": 2.8757, + "step": 1194 + }, + { + "epoch": 0.3687133600740512, + "grad_norm": 0.5759037733078003, + "learning_rate": 3e-05, + "loss": 3.1536, + "step": 1195 + }, + { + "epoch": 0.3690219068188831, + "grad_norm": 0.39592766761779785, + "learning_rate": 3e-05, + "loss": 2.6084, + "step": 1196 + }, + { + "epoch": 0.3693304535637149, + "grad_norm": 0.4080641567707062, + "learning_rate": 3e-05, + "loss": 2.5055, + "step": 1197 + }, + { + "epoch": 0.36963900030854674, + "grad_norm": 0.46009865403175354, + "learning_rate": 3e-05, + "loss": 2.9701, + "step": 1198 + }, + { + "epoch": 0.36994754705337857, + "grad_norm": 0.2832425832748413, + "learning_rate": 3e-05, + "loss": 2.3101, + "step": 1199 + }, + { + "epoch": 0.37025609379821045, + "grad_norm": 0.3997393548488617, + "learning_rate": 3e-05, + "loss": 2.5356, + "step": 1200 + } + ], + "logging_steps": 1, + "max_steps": 3241, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 300, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 9.248993565474816e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}