| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 6.906474820143885, |
| "eval_steps": 500, |
| "global_step": 483, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.014388489208633094, |
| "grad_norm": 4.076230049133301, |
| "learning_rate": 0.0, |
| "loss": 0.921, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.02877697841726619, |
| "grad_norm": 3.67684268951416, |
| "learning_rate": 4.306765580733931e-06, |
| "loss": 0.8454, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.04316546762589928, |
| "grad_norm": 4.125087261199951, |
| "learning_rate": 6.826061944859854e-06, |
| "loss": 0.8948, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.05755395683453238, |
| "grad_norm": 3.6336395740509033, |
| "learning_rate": 8.613531161467863e-06, |
| "loss": 0.863, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.07194244604316546, |
| "grad_norm": 3.8533761501312256, |
| "learning_rate": 1e-05, |
| "loss": 0.8857, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.08633093525179857, |
| "grad_norm": 3.875922441482544, |
| "learning_rate": 1e-05, |
| "loss": 0.9125, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.10071942446043165, |
| "grad_norm": 3.7486376762390137, |
| "learning_rate": 1e-05, |
| "loss": 0.8707, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.11510791366906475, |
| "grad_norm": 3.658844470977783, |
| "learning_rate": 1e-05, |
| "loss": 0.8578, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.12949640287769784, |
| "grad_norm": 3.4762141704559326, |
| "learning_rate": 1e-05, |
| "loss": 0.8535, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.14388489208633093, |
| "grad_norm": 3.545123815536499, |
| "learning_rate": 1e-05, |
| "loss": 0.8528, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.15827338129496402, |
| "grad_norm": 3.4922280311584473, |
| "learning_rate": 1e-05, |
| "loss": 0.8331, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.17266187050359713, |
| "grad_norm": 3.3063366413116455, |
| "learning_rate": 1e-05, |
| "loss": 0.808, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.18705035971223022, |
| "grad_norm": 3.1859560012817383, |
| "learning_rate": 1e-05, |
| "loss": 0.8333, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.2014388489208633, |
| "grad_norm": 2.746596097946167, |
| "learning_rate": 1e-05, |
| "loss": 0.7443, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.2158273381294964, |
| "grad_norm": 2.932962417602539, |
| "learning_rate": 1e-05, |
| "loss": 0.8005, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.2302158273381295, |
| "grad_norm": 3.172394275665283, |
| "learning_rate": 1e-05, |
| "loss": 0.7817, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.2446043165467626, |
| "grad_norm": 3.240671157836914, |
| "learning_rate": 1e-05, |
| "loss": 0.8046, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.2589928057553957, |
| "grad_norm": 3.0684902667999268, |
| "learning_rate": 1e-05, |
| "loss": 0.824, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.2733812949640288, |
| "grad_norm": 2.8672099113464355, |
| "learning_rate": 1e-05, |
| "loss": 0.763, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.28776978417266186, |
| "grad_norm": 2.709623336791992, |
| "learning_rate": 1e-05, |
| "loss": 0.7257, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.302158273381295, |
| "grad_norm": 2.6737098693847656, |
| "learning_rate": 1e-05, |
| "loss": 0.7494, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.31654676258992803, |
| "grad_norm": 2.6766083240509033, |
| "learning_rate": 1e-05, |
| "loss": 0.7645, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.33093525179856115, |
| "grad_norm": 2.478818416595459, |
| "learning_rate": 1e-05, |
| "loss": 0.7308, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.34532374100719426, |
| "grad_norm": 2.2721455097198486, |
| "learning_rate": 1e-05, |
| "loss": 0.7185, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.3597122302158273, |
| "grad_norm": 2.28304386138916, |
| "learning_rate": 1e-05, |
| "loss": 0.7122, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.37410071942446044, |
| "grad_norm": 2.481429100036621, |
| "learning_rate": 1e-05, |
| "loss": 0.7487, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.38848920863309355, |
| "grad_norm": 2.129704475402832, |
| "learning_rate": 1e-05, |
| "loss": 0.6795, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.4028776978417266, |
| "grad_norm": 2.2383480072021484, |
| "learning_rate": 1e-05, |
| "loss": 0.7063, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.4172661870503597, |
| "grad_norm": 2.1663718223571777, |
| "learning_rate": 1e-05, |
| "loss": 0.7156, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.4316546762589928, |
| "grad_norm": 2.1527349948883057, |
| "learning_rate": 1e-05, |
| "loss": 0.7308, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.4460431654676259, |
| "grad_norm": 2.1730828285217285, |
| "learning_rate": 1e-05, |
| "loss": 0.709, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.460431654676259, |
| "grad_norm": 1.9204847812652588, |
| "learning_rate": 1e-05, |
| "loss": 0.6463, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.4748201438848921, |
| "grad_norm": 2.027853488922119, |
| "learning_rate": 1e-05, |
| "loss": 0.6651, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.4892086330935252, |
| "grad_norm": 1.85696542263031, |
| "learning_rate": 1e-05, |
| "loss": 0.6683, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.5035971223021583, |
| "grad_norm": 1.8172882795333862, |
| "learning_rate": 1e-05, |
| "loss": 0.6673, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.5179856115107914, |
| "grad_norm": 1.975711703300476, |
| "learning_rate": 1e-05, |
| "loss": 0.7042, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.5323741007194245, |
| "grad_norm": 1.663536787033081, |
| "learning_rate": 1e-05, |
| "loss": 0.604, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.5467625899280576, |
| "grad_norm": 1.6366227865219116, |
| "learning_rate": 1e-05, |
| "loss": 0.6293, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.5611510791366906, |
| "grad_norm": 1.649810552597046, |
| "learning_rate": 1e-05, |
| "loss": 0.6724, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.5755395683453237, |
| "grad_norm": 1.7814414501190186, |
| "learning_rate": 1e-05, |
| "loss": 0.6427, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.5899280575539568, |
| "grad_norm": 1.7005351781845093, |
| "learning_rate": 1e-05, |
| "loss": 0.635, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.60431654676259, |
| "grad_norm": 1.7959794998168945, |
| "learning_rate": 1e-05, |
| "loss": 0.6269, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.6187050359712231, |
| "grad_norm": 1.7720296382904053, |
| "learning_rate": 1e-05, |
| "loss": 0.6109, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.6330935251798561, |
| "grad_norm": 1.6320812702178955, |
| "learning_rate": 1e-05, |
| "loss": 0.62, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.6474820143884892, |
| "grad_norm": 1.5529130697250366, |
| "learning_rate": 1e-05, |
| "loss": 0.6236, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.6618705035971223, |
| "grad_norm": 1.5511460304260254, |
| "learning_rate": 1e-05, |
| "loss": 0.6303, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.6762589928057554, |
| "grad_norm": 1.5195655822753906, |
| "learning_rate": 1e-05, |
| "loss": 0.6311, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.6906474820143885, |
| "grad_norm": 1.4825903177261353, |
| "learning_rate": 1e-05, |
| "loss": 0.6388, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.7050359712230215, |
| "grad_norm": 1.4872578382492065, |
| "learning_rate": 1e-05, |
| "loss": 0.5893, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.7194244604316546, |
| "grad_norm": 1.4323009252548218, |
| "learning_rate": 1e-05, |
| "loss": 0.5866, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.7338129496402878, |
| "grad_norm": 1.3664767742156982, |
| "learning_rate": 1e-05, |
| "loss": 0.5898, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.7482014388489209, |
| "grad_norm": 1.3744959831237793, |
| "learning_rate": 1e-05, |
| "loss": 0.6132, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.762589928057554, |
| "grad_norm": 1.4650039672851562, |
| "learning_rate": 1e-05, |
| "loss": 0.5864, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.7769784172661871, |
| "grad_norm": 1.3167423009872437, |
| "learning_rate": 1e-05, |
| "loss": 0.594, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.7913669064748201, |
| "grad_norm": 1.3216445446014404, |
| "learning_rate": 1e-05, |
| "loss": 0.585, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.8057553956834532, |
| "grad_norm": 1.3380017280578613, |
| "learning_rate": 1e-05, |
| "loss": 0.6012, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.8201438848920863, |
| "grad_norm": 1.2493780851364136, |
| "learning_rate": 1e-05, |
| "loss": 0.5894, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.8345323741007195, |
| "grad_norm": 1.2462879419326782, |
| "learning_rate": 1e-05, |
| "loss": 0.5909, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.8489208633093526, |
| "grad_norm": 1.196331262588501, |
| "learning_rate": 1e-05, |
| "loss": 0.5701, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.8633093525179856, |
| "grad_norm": 1.1711316108703613, |
| "learning_rate": 1e-05, |
| "loss": 0.5928, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.8776978417266187, |
| "grad_norm": 1.1381713151931763, |
| "learning_rate": 1e-05, |
| "loss": 0.5489, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.8920863309352518, |
| "grad_norm": 1.047317385673523, |
| "learning_rate": 1e-05, |
| "loss": 0.5523, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.9064748201438849, |
| "grad_norm": 1.0775929689407349, |
| "learning_rate": 1e-05, |
| "loss": 0.5977, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.920863309352518, |
| "grad_norm": 1.031957983970642, |
| "learning_rate": 1e-05, |
| "loss": 0.5786, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.935251798561151, |
| "grad_norm": 1.0746288299560547, |
| "learning_rate": 1e-05, |
| "loss": 0.5461, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.9496402877697842, |
| "grad_norm": 1.0490261316299438, |
| "learning_rate": 1e-05, |
| "loss": 0.5597, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.9640287769784173, |
| "grad_norm": 1.0236090421676636, |
| "learning_rate": 1e-05, |
| "loss": 0.5681, |
| "step": 67 |
| }, |
| { |
| "epoch": 0.9784172661870504, |
| "grad_norm": 0.9030091762542725, |
| "learning_rate": 1e-05, |
| "loss": 0.5484, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.9928057553956835, |
| "grad_norm": 0.9361598491668701, |
| "learning_rate": 1e-05, |
| "loss": 0.5388, |
| "step": 69 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 0.9361598491668701, |
| "learning_rate": 1e-05, |
| "loss": 0.2987, |
| "step": 70 |
| }, |
| { |
| "epoch": 1.014388489208633, |
| "grad_norm": 0.9445173740386963, |
| "learning_rate": 1e-05, |
| "loss": 0.5295, |
| "step": 71 |
| }, |
| { |
| "epoch": 1.0287769784172662, |
| "grad_norm": 0.8961930871009827, |
| "learning_rate": 1e-05, |
| "loss": 0.5482, |
| "step": 72 |
| }, |
| { |
| "epoch": 1.0431654676258992, |
| "grad_norm": 0.8979491591453552, |
| "learning_rate": 1e-05, |
| "loss": 0.551, |
| "step": 73 |
| }, |
| { |
| "epoch": 1.0575539568345325, |
| "grad_norm": 0.9740926623344421, |
| "learning_rate": 1e-05, |
| "loss": 0.5461, |
| "step": 74 |
| }, |
| { |
| "epoch": 1.0719424460431655, |
| "grad_norm": 0.872488260269165, |
| "learning_rate": 1e-05, |
| "loss": 0.5468, |
| "step": 75 |
| }, |
| { |
| "epoch": 1.0863309352517985, |
| "grad_norm": 0.8811886310577393, |
| "learning_rate": 1e-05, |
| "loss": 0.5305, |
| "step": 76 |
| }, |
| { |
| "epoch": 1.1007194244604317, |
| "grad_norm": 0.8566317558288574, |
| "learning_rate": 1e-05, |
| "loss": 0.5478, |
| "step": 77 |
| }, |
| { |
| "epoch": 1.1151079136690647, |
| "grad_norm": 0.8720346093177795, |
| "learning_rate": 1e-05, |
| "loss": 0.5336, |
| "step": 78 |
| }, |
| { |
| "epoch": 1.129496402877698, |
| "grad_norm": 0.888973593711853, |
| "learning_rate": 1e-05, |
| "loss": 0.5652, |
| "step": 79 |
| }, |
| { |
| "epoch": 1.143884892086331, |
| "grad_norm": 0.8784003853797913, |
| "learning_rate": 1e-05, |
| "loss": 0.5437, |
| "step": 80 |
| }, |
| { |
| "epoch": 1.158273381294964, |
| "grad_norm": 0.8504276275634766, |
| "learning_rate": 1e-05, |
| "loss": 0.5565, |
| "step": 81 |
| }, |
| { |
| "epoch": 1.1726618705035972, |
| "grad_norm": 0.846744954586029, |
| "learning_rate": 1e-05, |
| "loss": 0.5255, |
| "step": 82 |
| }, |
| { |
| "epoch": 1.1870503597122302, |
| "grad_norm": 0.8063694834709167, |
| "learning_rate": 1e-05, |
| "loss": 0.5265, |
| "step": 83 |
| }, |
| { |
| "epoch": 1.2014388489208634, |
| "grad_norm": 0.7846396565437317, |
| "learning_rate": 1e-05, |
| "loss": 0.5022, |
| "step": 84 |
| }, |
| { |
| "epoch": 1.2158273381294964, |
| "grad_norm": 0.7946937680244446, |
| "learning_rate": 1e-05, |
| "loss": 0.5274, |
| "step": 85 |
| }, |
| { |
| "epoch": 1.2302158273381294, |
| "grad_norm": 0.7619222402572632, |
| "learning_rate": 1e-05, |
| "loss": 0.4833, |
| "step": 86 |
| }, |
| { |
| "epoch": 1.2446043165467626, |
| "grad_norm": 0.9151326417922974, |
| "learning_rate": 1e-05, |
| "loss": 0.5245, |
| "step": 87 |
| }, |
| { |
| "epoch": 1.2589928057553956, |
| "grad_norm": 0.773263692855835, |
| "learning_rate": 1e-05, |
| "loss": 0.4793, |
| "step": 88 |
| }, |
| { |
| "epoch": 1.2733812949640289, |
| "grad_norm": 0.780850350856781, |
| "learning_rate": 1e-05, |
| "loss": 0.5233, |
| "step": 89 |
| }, |
| { |
| "epoch": 1.2877697841726619, |
| "grad_norm": 0.8392658233642578, |
| "learning_rate": 1e-05, |
| "loss": 0.5239, |
| "step": 90 |
| }, |
| { |
| "epoch": 1.3021582733812949, |
| "grad_norm": 0.7620977163314819, |
| "learning_rate": 1e-05, |
| "loss": 0.4928, |
| "step": 91 |
| }, |
| { |
| "epoch": 1.316546762589928, |
| "grad_norm": 0.7995700240135193, |
| "learning_rate": 1e-05, |
| "loss": 0.5069, |
| "step": 92 |
| }, |
| { |
| "epoch": 1.330935251798561, |
| "grad_norm": 0.7457262277603149, |
| "learning_rate": 1e-05, |
| "loss": 0.5026, |
| "step": 93 |
| }, |
| { |
| "epoch": 1.3453237410071943, |
| "grad_norm": 0.7410624027252197, |
| "learning_rate": 1e-05, |
| "loss": 0.5132, |
| "step": 94 |
| }, |
| { |
| "epoch": 1.3597122302158273, |
| "grad_norm": 0.7126360535621643, |
| "learning_rate": 1e-05, |
| "loss": 0.5226, |
| "step": 95 |
| }, |
| { |
| "epoch": 1.3741007194244603, |
| "grad_norm": 0.673366129398346, |
| "learning_rate": 1e-05, |
| "loss": 0.5099, |
| "step": 96 |
| }, |
| { |
| "epoch": 1.3884892086330936, |
| "grad_norm": 0.6633723974227905, |
| "learning_rate": 1e-05, |
| "loss": 0.4757, |
| "step": 97 |
| }, |
| { |
| "epoch": 1.4028776978417266, |
| "grad_norm": 0.6487114429473877, |
| "learning_rate": 1e-05, |
| "loss": 0.5028, |
| "step": 98 |
| }, |
| { |
| "epoch": 1.4172661870503598, |
| "grad_norm": 0.6453342437744141, |
| "learning_rate": 1e-05, |
| "loss": 0.5221, |
| "step": 99 |
| }, |
| { |
| "epoch": 1.4316546762589928, |
| "grad_norm": 0.6909589767456055, |
| "learning_rate": 1e-05, |
| "loss": 0.5162, |
| "step": 100 |
| }, |
| { |
| "epoch": 1.4460431654676258, |
| "grad_norm": 0.6747501492500305, |
| "learning_rate": 1e-05, |
| "loss": 0.4911, |
| "step": 101 |
| }, |
| { |
| "epoch": 1.460431654676259, |
| "grad_norm": 0.6311056613922119, |
| "learning_rate": 1e-05, |
| "loss": 0.522, |
| "step": 102 |
| }, |
| { |
| "epoch": 1.474820143884892, |
| "grad_norm": 0.6742913126945496, |
| "learning_rate": 1e-05, |
| "loss": 0.4902, |
| "step": 103 |
| }, |
| { |
| "epoch": 1.4892086330935252, |
| "grad_norm": 0.6874509453773499, |
| "learning_rate": 1e-05, |
| "loss": 0.4913, |
| "step": 104 |
| }, |
| { |
| "epoch": 1.5035971223021583, |
| "grad_norm": 0.643722653388977, |
| "learning_rate": 1e-05, |
| "loss": 0.5285, |
| "step": 105 |
| }, |
| { |
| "epoch": 1.5179856115107913, |
| "grad_norm": 0.5956730842590332, |
| "learning_rate": 1e-05, |
| "loss": 0.4861, |
| "step": 106 |
| }, |
| { |
| "epoch": 1.5323741007194245, |
| "grad_norm": 0.6326804161071777, |
| "learning_rate": 1e-05, |
| "loss": 0.4954, |
| "step": 107 |
| }, |
| { |
| "epoch": 1.5467625899280577, |
| "grad_norm": 0.6249592304229736, |
| "learning_rate": 1e-05, |
| "loss": 0.4946, |
| "step": 108 |
| }, |
| { |
| "epoch": 1.5611510791366907, |
| "grad_norm": 0.6768006086349487, |
| "learning_rate": 1e-05, |
| "loss": 0.5081, |
| "step": 109 |
| }, |
| { |
| "epoch": 1.5755395683453237, |
| "grad_norm": 0.5977171659469604, |
| "learning_rate": 1e-05, |
| "loss": 0.478, |
| "step": 110 |
| }, |
| { |
| "epoch": 1.5899280575539567, |
| "grad_norm": 0.6536011099815369, |
| "learning_rate": 1e-05, |
| "loss": 0.4947, |
| "step": 111 |
| }, |
| { |
| "epoch": 1.60431654676259, |
| "grad_norm": 0.6527348756790161, |
| "learning_rate": 1e-05, |
| "loss": 0.5137, |
| "step": 112 |
| }, |
| { |
| "epoch": 1.6187050359712232, |
| "grad_norm": 0.6480767130851746, |
| "learning_rate": 1e-05, |
| "loss": 0.4685, |
| "step": 113 |
| }, |
| { |
| "epoch": 1.6330935251798562, |
| "grad_norm": 0.5723071098327637, |
| "learning_rate": 1e-05, |
| "loss": 0.5056, |
| "step": 114 |
| }, |
| { |
| "epoch": 1.6474820143884892, |
| "grad_norm": 0.6576047539710999, |
| "learning_rate": 1e-05, |
| "loss": 0.4963, |
| "step": 115 |
| }, |
| { |
| "epoch": 1.6618705035971222, |
| "grad_norm": 0.6563637256622314, |
| "learning_rate": 1e-05, |
| "loss": 0.4973, |
| "step": 116 |
| }, |
| { |
| "epoch": 1.6762589928057554, |
| "grad_norm": 0.5932542085647583, |
| "learning_rate": 1e-05, |
| "loss": 0.5001, |
| "step": 117 |
| }, |
| { |
| "epoch": 1.6906474820143886, |
| "grad_norm": 0.6055029630661011, |
| "learning_rate": 1e-05, |
| "loss": 0.4854, |
| "step": 118 |
| }, |
| { |
| "epoch": 1.7050359712230216, |
| "grad_norm": 0.6184179186820984, |
| "learning_rate": 1e-05, |
| "loss": 0.5105, |
| "step": 119 |
| }, |
| { |
| "epoch": 1.7194244604316546, |
| "grad_norm": 0.6108628511428833, |
| "learning_rate": 1e-05, |
| "loss": 0.4625, |
| "step": 120 |
| }, |
| { |
| "epoch": 1.7338129496402876, |
| "grad_norm": 0.601538896560669, |
| "learning_rate": 1e-05, |
| "loss": 0.4877, |
| "step": 121 |
| }, |
| { |
| "epoch": 1.7482014388489209, |
| "grad_norm": 0.5939962267875671, |
| "learning_rate": 1e-05, |
| "loss": 0.4701, |
| "step": 122 |
| }, |
| { |
| "epoch": 1.762589928057554, |
| "grad_norm": 0.5988656878471375, |
| "learning_rate": 1e-05, |
| "loss": 0.4958, |
| "step": 123 |
| }, |
| { |
| "epoch": 1.776978417266187, |
| "grad_norm": 0.5639538764953613, |
| "learning_rate": 1e-05, |
| "loss": 0.4777, |
| "step": 124 |
| }, |
| { |
| "epoch": 1.79136690647482, |
| "grad_norm": 0.617649257183075, |
| "learning_rate": 1e-05, |
| "loss": 0.5032, |
| "step": 125 |
| }, |
| { |
| "epoch": 1.8057553956834531, |
| "grad_norm": 0.5906999707221985, |
| "learning_rate": 1e-05, |
| "loss": 0.4663, |
| "step": 126 |
| }, |
| { |
| "epoch": 1.8201438848920863, |
| "grad_norm": 0.5897840857505798, |
| "learning_rate": 1e-05, |
| "loss": 0.4531, |
| "step": 127 |
| }, |
| { |
| "epoch": 1.8345323741007196, |
| "grad_norm": 0.5925832390785217, |
| "learning_rate": 1e-05, |
| "loss": 0.4817, |
| "step": 128 |
| }, |
| { |
| "epoch": 1.8489208633093526, |
| "grad_norm": 0.5942744612693787, |
| "learning_rate": 1e-05, |
| "loss": 0.467, |
| "step": 129 |
| }, |
| { |
| "epoch": 1.8633093525179856, |
| "grad_norm": 0.5901297330856323, |
| "learning_rate": 1e-05, |
| "loss": 0.4863, |
| "step": 130 |
| }, |
| { |
| "epoch": 1.8776978417266186, |
| "grad_norm": 0.592784583568573, |
| "learning_rate": 1e-05, |
| "loss": 0.502, |
| "step": 131 |
| }, |
| { |
| "epoch": 1.8920863309352518, |
| "grad_norm": 0.5842126607894897, |
| "learning_rate": 1e-05, |
| "loss": 0.4796, |
| "step": 132 |
| }, |
| { |
| "epoch": 1.906474820143885, |
| "grad_norm": 0.6174241304397583, |
| "learning_rate": 1e-05, |
| "loss": 0.5029, |
| "step": 133 |
| }, |
| { |
| "epoch": 1.920863309352518, |
| "grad_norm": 0.5956327319145203, |
| "learning_rate": 1e-05, |
| "loss": 0.475, |
| "step": 134 |
| }, |
| { |
| "epoch": 1.935251798561151, |
| "grad_norm": 0.5690869092941284, |
| "learning_rate": 1e-05, |
| "loss": 0.4554, |
| "step": 135 |
| }, |
| { |
| "epoch": 1.949640287769784, |
| "grad_norm": 0.5758855938911438, |
| "learning_rate": 1e-05, |
| "loss": 0.475, |
| "step": 136 |
| }, |
| { |
| "epoch": 1.9640287769784173, |
| "grad_norm": 0.6002164483070374, |
| "learning_rate": 1e-05, |
| "loss": 0.4802, |
| "step": 137 |
| }, |
| { |
| "epoch": 1.9784172661870505, |
| "grad_norm": 0.6135545372962952, |
| "learning_rate": 1e-05, |
| "loss": 0.472, |
| "step": 138 |
| }, |
| { |
| "epoch": 1.9928057553956835, |
| "grad_norm": 0.6322203278541565, |
| "learning_rate": 1e-05, |
| "loss": 0.4835, |
| "step": 139 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 0.6329755783081055, |
| "learning_rate": 1e-05, |
| "loss": 0.2255, |
| "step": 140 |
| }, |
| { |
| "epoch": 2.014388489208633, |
| "grad_norm": 0.5560349822044373, |
| "learning_rate": 1e-05, |
| "loss": 0.4591, |
| "step": 141 |
| }, |
| { |
| "epoch": 2.028776978417266, |
| "grad_norm": 0.5675270557403564, |
| "learning_rate": 1e-05, |
| "loss": 0.4741, |
| "step": 142 |
| }, |
| { |
| "epoch": 2.0431654676258995, |
| "grad_norm": 0.6017403602600098, |
| "learning_rate": 1e-05, |
| "loss": 0.4545, |
| "step": 143 |
| }, |
| { |
| "epoch": 2.0575539568345325, |
| "grad_norm": 0.63222736120224, |
| "learning_rate": 1e-05, |
| "loss": 0.4829, |
| "step": 144 |
| }, |
| { |
| "epoch": 2.0719424460431655, |
| "grad_norm": 0.5998539328575134, |
| "learning_rate": 1e-05, |
| "loss": 0.4564, |
| "step": 145 |
| }, |
| { |
| "epoch": 2.0863309352517985, |
| "grad_norm": 0.5533970594406128, |
| "learning_rate": 1e-05, |
| "loss": 0.4533, |
| "step": 146 |
| }, |
| { |
| "epoch": 2.1007194244604315, |
| "grad_norm": 0.5739572048187256, |
| "learning_rate": 1e-05, |
| "loss": 0.4632, |
| "step": 147 |
| }, |
| { |
| "epoch": 2.115107913669065, |
| "grad_norm": 0.5614170432090759, |
| "learning_rate": 1e-05, |
| "loss": 0.4509, |
| "step": 148 |
| }, |
| { |
| "epoch": 2.129496402877698, |
| "grad_norm": 0.5687401294708252, |
| "learning_rate": 1e-05, |
| "loss": 0.4588, |
| "step": 149 |
| }, |
| { |
| "epoch": 2.143884892086331, |
| "grad_norm": 0.5750886797904968, |
| "learning_rate": 1e-05, |
| "loss": 0.4713, |
| "step": 150 |
| }, |
| { |
| "epoch": 2.158273381294964, |
| "grad_norm": 0.5580982565879822, |
| "learning_rate": 1e-05, |
| "loss": 0.4525, |
| "step": 151 |
| }, |
| { |
| "epoch": 2.172661870503597, |
| "grad_norm": 0.6040503978729248, |
| "learning_rate": 1e-05, |
| "loss": 0.4681, |
| "step": 152 |
| }, |
| { |
| "epoch": 2.1870503597122304, |
| "grad_norm": 0.5956637263298035, |
| "learning_rate": 1e-05, |
| "loss": 0.4569, |
| "step": 153 |
| }, |
| { |
| "epoch": 2.2014388489208634, |
| "grad_norm": 0.6189143061637878, |
| "learning_rate": 1e-05, |
| "loss": 0.465, |
| "step": 154 |
| }, |
| { |
| "epoch": 2.2158273381294964, |
| "grad_norm": 0.6555837988853455, |
| "learning_rate": 1e-05, |
| "loss": 0.4735, |
| "step": 155 |
| }, |
| { |
| "epoch": 2.2302158273381294, |
| "grad_norm": 0.6033856868743896, |
| "learning_rate": 1e-05, |
| "loss": 0.4728, |
| "step": 156 |
| }, |
| { |
| "epoch": 2.2446043165467624, |
| "grad_norm": 0.6196707487106323, |
| "learning_rate": 1e-05, |
| "loss": 0.4695, |
| "step": 157 |
| }, |
| { |
| "epoch": 2.258992805755396, |
| "grad_norm": 0.6161644458770752, |
| "learning_rate": 1e-05, |
| "loss": 0.4811, |
| "step": 158 |
| }, |
| { |
| "epoch": 2.273381294964029, |
| "grad_norm": 0.530752956867218, |
| "learning_rate": 1e-05, |
| "loss": 0.4427, |
| "step": 159 |
| }, |
| { |
| "epoch": 2.287769784172662, |
| "grad_norm": 0.6008985638618469, |
| "learning_rate": 1e-05, |
| "loss": 0.4471, |
| "step": 160 |
| }, |
| { |
| "epoch": 2.302158273381295, |
| "grad_norm": 0.5926287770271301, |
| "learning_rate": 1e-05, |
| "loss": 0.4628, |
| "step": 161 |
| }, |
| { |
| "epoch": 2.316546762589928, |
| "grad_norm": 0.5955716967582703, |
| "learning_rate": 1e-05, |
| "loss": 0.437, |
| "step": 162 |
| }, |
| { |
| "epoch": 2.3309352517985613, |
| "grad_norm": 0.5975937247276306, |
| "learning_rate": 1e-05, |
| "loss": 0.4768, |
| "step": 163 |
| }, |
| { |
| "epoch": 2.3453237410071943, |
| "grad_norm": 0.5534577369689941, |
| "learning_rate": 1e-05, |
| "loss": 0.4348, |
| "step": 164 |
| }, |
| { |
| "epoch": 2.3597122302158273, |
| "grad_norm": 0.5710106492042542, |
| "learning_rate": 1e-05, |
| "loss": 0.4488, |
| "step": 165 |
| }, |
| { |
| "epoch": 2.3741007194244603, |
| "grad_norm": 0.5983866453170776, |
| "learning_rate": 1e-05, |
| "loss": 0.4794, |
| "step": 166 |
| }, |
| { |
| "epoch": 2.3884892086330938, |
| "grad_norm": 0.5854120850563049, |
| "learning_rate": 1e-05, |
| "loss": 0.4532, |
| "step": 167 |
| }, |
| { |
| "epoch": 2.402877697841727, |
| "grad_norm": 0.5704571008682251, |
| "learning_rate": 1e-05, |
| "loss": 0.4595, |
| "step": 168 |
| }, |
| { |
| "epoch": 2.41726618705036, |
| "grad_norm": 0.584438145160675, |
| "learning_rate": 1e-05, |
| "loss": 0.4346, |
| "step": 169 |
| }, |
| { |
| "epoch": 2.431654676258993, |
| "grad_norm": 0.5847723484039307, |
| "learning_rate": 1e-05, |
| "loss": 0.4317, |
| "step": 170 |
| }, |
| { |
| "epoch": 2.446043165467626, |
| "grad_norm": 0.6004253029823303, |
| "learning_rate": 1e-05, |
| "loss": 0.4635, |
| "step": 171 |
| }, |
| { |
| "epoch": 2.460431654676259, |
| "grad_norm": 0.5731106400489807, |
| "learning_rate": 1e-05, |
| "loss": 0.4736, |
| "step": 172 |
| }, |
| { |
| "epoch": 2.4748201438848922, |
| "grad_norm": 0.5508257150650024, |
| "learning_rate": 1e-05, |
| "loss": 0.4472, |
| "step": 173 |
| }, |
| { |
| "epoch": 2.4892086330935252, |
| "grad_norm": 0.5863555669784546, |
| "learning_rate": 1e-05, |
| "loss": 0.4559, |
| "step": 174 |
| }, |
| { |
| "epoch": 2.5035971223021583, |
| "grad_norm": 0.6082853078842163, |
| "learning_rate": 1e-05, |
| "loss": 0.4406, |
| "step": 175 |
| }, |
| { |
| "epoch": 2.5179856115107913, |
| "grad_norm": 0.6021372079849243, |
| "learning_rate": 1e-05, |
| "loss": 0.4498, |
| "step": 176 |
| }, |
| { |
| "epoch": 2.5323741007194247, |
| "grad_norm": 0.578254222869873, |
| "learning_rate": 1e-05, |
| "loss": 0.4765, |
| "step": 177 |
| }, |
| { |
| "epoch": 2.5467625899280577, |
| "grad_norm": 0.6056034564971924, |
| "learning_rate": 1e-05, |
| "loss": 0.4749, |
| "step": 178 |
| }, |
| { |
| "epoch": 2.5611510791366907, |
| "grad_norm": 0.6150136590003967, |
| "learning_rate": 1e-05, |
| "loss": 0.4437, |
| "step": 179 |
| }, |
| { |
| "epoch": 2.5755395683453237, |
| "grad_norm": 0.5927571654319763, |
| "learning_rate": 1e-05, |
| "loss": 0.4498, |
| "step": 180 |
| }, |
| { |
| "epoch": 2.5899280575539567, |
| "grad_norm": 0.587335467338562, |
| "learning_rate": 1e-05, |
| "loss": 0.4813, |
| "step": 181 |
| }, |
| { |
| "epoch": 2.6043165467625897, |
| "grad_norm": 0.5799887776374817, |
| "learning_rate": 1e-05, |
| "loss": 0.4541, |
| "step": 182 |
| }, |
| { |
| "epoch": 2.618705035971223, |
| "grad_norm": 0.575372576713562, |
| "learning_rate": 1e-05, |
| "loss": 0.4581, |
| "step": 183 |
| }, |
| { |
| "epoch": 2.633093525179856, |
| "grad_norm": 0.5604987740516663, |
| "learning_rate": 1e-05, |
| "loss": 0.4724, |
| "step": 184 |
| }, |
| { |
| "epoch": 2.647482014388489, |
| "grad_norm": 0.5532740950584412, |
| "learning_rate": 1e-05, |
| "loss": 0.4473, |
| "step": 185 |
| }, |
| { |
| "epoch": 2.661870503597122, |
| "grad_norm": 0.6173651814460754, |
| "learning_rate": 1e-05, |
| "loss": 0.4208, |
| "step": 186 |
| }, |
| { |
| "epoch": 2.6762589928057556, |
| "grad_norm": 0.5891541838645935, |
| "learning_rate": 1e-05, |
| "loss": 0.4341, |
| "step": 187 |
| }, |
| { |
| "epoch": 2.6906474820143886, |
| "grad_norm": 0.6045330762863159, |
| "learning_rate": 1e-05, |
| "loss": 0.4832, |
| "step": 188 |
| }, |
| { |
| "epoch": 2.7050359712230216, |
| "grad_norm": 0.6483354568481445, |
| "learning_rate": 1e-05, |
| "loss": 0.4557, |
| "step": 189 |
| }, |
| { |
| "epoch": 2.7194244604316546, |
| "grad_norm": 0.5970315933227539, |
| "learning_rate": 1e-05, |
| "loss": 0.4428, |
| "step": 190 |
| }, |
| { |
| "epoch": 2.7338129496402876, |
| "grad_norm": 0.5684317946434021, |
| "learning_rate": 1e-05, |
| "loss": 0.4379, |
| "step": 191 |
| }, |
| { |
| "epoch": 2.7482014388489207, |
| "grad_norm": 0.5632618069648743, |
| "learning_rate": 1e-05, |
| "loss": 0.4334, |
| "step": 192 |
| }, |
| { |
| "epoch": 2.762589928057554, |
| "grad_norm": 0.6029064059257507, |
| "learning_rate": 1e-05, |
| "loss": 0.42, |
| "step": 193 |
| }, |
| { |
| "epoch": 2.776978417266187, |
| "grad_norm": 0.5825706124305725, |
| "learning_rate": 1e-05, |
| "loss": 0.4366, |
| "step": 194 |
| }, |
| { |
| "epoch": 2.79136690647482, |
| "grad_norm": 0.5910109281539917, |
| "learning_rate": 1e-05, |
| "loss": 0.4752, |
| "step": 195 |
| }, |
| { |
| "epoch": 2.805755395683453, |
| "grad_norm": 0.5682767033576965, |
| "learning_rate": 1e-05, |
| "loss": 0.4394, |
| "step": 196 |
| }, |
| { |
| "epoch": 2.8201438848920866, |
| "grad_norm": 0.5666372179985046, |
| "learning_rate": 1e-05, |
| "loss": 0.4456, |
| "step": 197 |
| }, |
| { |
| "epoch": 2.8345323741007196, |
| "grad_norm": 0.6003570556640625, |
| "learning_rate": 1e-05, |
| "loss": 0.4332, |
| "step": 198 |
| }, |
| { |
| "epoch": 2.8489208633093526, |
| "grad_norm": 0.612468957901001, |
| "learning_rate": 1e-05, |
| "loss": 0.4486, |
| "step": 199 |
| }, |
| { |
| "epoch": 2.8633093525179856, |
| "grad_norm": 0.6046081185340881, |
| "learning_rate": 1e-05, |
| "loss": 0.4566, |
| "step": 200 |
| }, |
| { |
| "epoch": 2.8776978417266186, |
| "grad_norm": 0.5841667056083679, |
| "learning_rate": 1e-05, |
| "loss": 0.4612, |
| "step": 201 |
| }, |
| { |
| "epoch": 2.8920863309352516, |
| "grad_norm": 0.6191533207893372, |
| "learning_rate": 1e-05, |
| "loss": 0.4438, |
| "step": 202 |
| }, |
| { |
| "epoch": 2.906474820143885, |
| "grad_norm": 0.5575750470161438, |
| "learning_rate": 1e-05, |
| "loss": 0.4268, |
| "step": 203 |
| }, |
| { |
| "epoch": 2.920863309352518, |
| "grad_norm": 0.5611399412155151, |
| "learning_rate": 1e-05, |
| "loss": 0.4775, |
| "step": 204 |
| }, |
| { |
| "epoch": 2.935251798561151, |
| "grad_norm": 0.5896828770637512, |
| "learning_rate": 1e-05, |
| "loss": 0.4332, |
| "step": 205 |
| }, |
| { |
| "epoch": 2.949640287769784, |
| "grad_norm": 0.6095684170722961, |
| "learning_rate": 1e-05, |
| "loss": 0.4747, |
| "step": 206 |
| }, |
| { |
| "epoch": 2.9640287769784175, |
| "grad_norm": 0.567354142665863, |
| "learning_rate": 1e-05, |
| "loss": 0.447, |
| "step": 207 |
| }, |
| { |
| "epoch": 2.9784172661870505, |
| "grad_norm": 0.5809361934661865, |
| "learning_rate": 1e-05, |
| "loss": 0.474, |
| "step": 208 |
| }, |
| { |
| "epoch": 2.9928057553956835, |
| "grad_norm": 0.5782424807548523, |
| "learning_rate": 1e-05, |
| "loss": 0.4429, |
| "step": 209 |
| }, |
| { |
| "epoch": 3.0, |
| "grad_norm": 0.5782424807548523, |
| "learning_rate": 1e-05, |
| "loss": 0.2245, |
| "step": 210 |
| }, |
| { |
| "epoch": 3.014388489208633, |
| "grad_norm": 0.582120954990387, |
| "learning_rate": 1e-05, |
| "loss": 0.4354, |
| "step": 211 |
| }, |
| { |
| "epoch": 3.028776978417266, |
| "grad_norm": 0.5369551181793213, |
| "learning_rate": 1e-05, |
| "loss": 0.396, |
| "step": 212 |
| }, |
| { |
| "epoch": 3.0431654676258995, |
| "grad_norm": 0.5849142670631409, |
| "learning_rate": 1e-05, |
| "loss": 0.4632, |
| "step": 213 |
| }, |
| { |
| "epoch": 3.0575539568345325, |
| "grad_norm": 0.5897305607795715, |
| "learning_rate": 1e-05, |
| "loss": 0.4055, |
| "step": 214 |
| }, |
| { |
| "epoch": 3.0719424460431655, |
| "grad_norm": 0.5784671306610107, |
| "learning_rate": 1e-05, |
| "loss": 0.4391, |
| "step": 215 |
| }, |
| { |
| "epoch": 3.0863309352517985, |
| "grad_norm": 0.5891516208648682, |
| "learning_rate": 1e-05, |
| "loss": 0.4493, |
| "step": 216 |
| }, |
| { |
| "epoch": 3.1007194244604315, |
| "grad_norm": 0.6021131277084351, |
| "learning_rate": 1e-05, |
| "loss": 0.4329, |
| "step": 217 |
| }, |
| { |
| "epoch": 3.115107913669065, |
| "grad_norm": 0.5639146566390991, |
| "learning_rate": 1e-05, |
| "loss": 0.4403, |
| "step": 218 |
| }, |
| { |
| "epoch": 3.129496402877698, |
| "grad_norm": 0.5492843985557556, |
| "learning_rate": 1e-05, |
| "loss": 0.4745, |
| "step": 219 |
| }, |
| { |
| "epoch": 3.143884892086331, |
| "grad_norm": 0.582566499710083, |
| "learning_rate": 1e-05, |
| "loss": 0.4324, |
| "step": 220 |
| }, |
| { |
| "epoch": 3.158273381294964, |
| "grad_norm": 0.5682006478309631, |
| "learning_rate": 1e-05, |
| "loss": 0.4375, |
| "step": 221 |
| }, |
| { |
| "epoch": 3.172661870503597, |
| "grad_norm": 0.5832618474960327, |
| "learning_rate": 1e-05, |
| "loss": 0.4256, |
| "step": 222 |
| }, |
| { |
| "epoch": 3.1870503597122304, |
| "grad_norm": 0.5585988759994507, |
| "learning_rate": 1e-05, |
| "loss": 0.4476, |
| "step": 223 |
| }, |
| { |
| "epoch": 3.2014388489208634, |
| "grad_norm": 0.5622900724411011, |
| "learning_rate": 1e-05, |
| "loss": 0.415, |
| "step": 224 |
| }, |
| { |
| "epoch": 3.2158273381294964, |
| "grad_norm": 0.5588183999061584, |
| "learning_rate": 1e-05, |
| "loss": 0.4291, |
| "step": 225 |
| }, |
| { |
| "epoch": 3.2302158273381294, |
| "grad_norm": 0.5371769666671753, |
| "learning_rate": 1e-05, |
| "loss": 0.4267, |
| "step": 226 |
| }, |
| { |
| "epoch": 3.2446043165467624, |
| "grad_norm": 0.5771037340164185, |
| "learning_rate": 1e-05, |
| "loss": 0.4301, |
| "step": 227 |
| }, |
| { |
| "epoch": 3.258992805755396, |
| "grad_norm": 0.5530989766120911, |
| "learning_rate": 1e-05, |
| "loss": 0.4246, |
| "step": 228 |
| }, |
| { |
| "epoch": 3.273381294964029, |
| "grad_norm": 0.5538605451583862, |
| "learning_rate": 1e-05, |
| "loss": 0.4185, |
| "step": 229 |
| }, |
| { |
| "epoch": 3.287769784172662, |
| "grad_norm": 0.6077790856361389, |
| "learning_rate": 1e-05, |
| "loss": 0.4455, |
| "step": 230 |
| }, |
| { |
| "epoch": 3.302158273381295, |
| "grad_norm": 0.5410200953483582, |
| "learning_rate": 1e-05, |
| "loss": 0.4434, |
| "step": 231 |
| }, |
| { |
| "epoch": 3.316546762589928, |
| "grad_norm": 0.5968116521835327, |
| "learning_rate": 1e-05, |
| "loss": 0.4317, |
| "step": 232 |
| }, |
| { |
| "epoch": 3.3309352517985613, |
| "grad_norm": 0.6267459392547607, |
| "learning_rate": 1e-05, |
| "loss": 0.4431, |
| "step": 233 |
| }, |
| { |
| "epoch": 3.3453237410071943, |
| "grad_norm": 0.5526189804077148, |
| "learning_rate": 1e-05, |
| "loss": 0.4552, |
| "step": 234 |
| }, |
| { |
| "epoch": 3.3597122302158273, |
| "grad_norm": 0.5781810283660889, |
| "learning_rate": 1e-05, |
| "loss": 0.4406, |
| "step": 235 |
| }, |
| { |
| "epoch": 3.3741007194244603, |
| "grad_norm": 0.581947922706604, |
| "learning_rate": 1e-05, |
| "loss": 0.4334, |
| "step": 236 |
| }, |
| { |
| "epoch": 3.3884892086330938, |
| "grad_norm": 0.5779341459274292, |
| "learning_rate": 1e-05, |
| "loss": 0.4148, |
| "step": 237 |
| }, |
| { |
| "epoch": 3.402877697841727, |
| "grad_norm": 0.6320657134056091, |
| "learning_rate": 1e-05, |
| "loss": 0.45, |
| "step": 238 |
| }, |
| { |
| "epoch": 3.41726618705036, |
| "grad_norm": 0.5703683495521545, |
| "learning_rate": 1e-05, |
| "loss": 0.4283, |
| "step": 239 |
| }, |
| { |
| "epoch": 3.431654676258993, |
| "grad_norm": 0.6058019399642944, |
| "learning_rate": 1e-05, |
| "loss": 0.4511, |
| "step": 240 |
| }, |
| { |
| "epoch": 3.446043165467626, |
| "grad_norm": 0.5477652549743652, |
| "learning_rate": 1e-05, |
| "loss": 0.4043, |
| "step": 241 |
| }, |
| { |
| "epoch": 3.460431654676259, |
| "grad_norm": 0.5975249409675598, |
| "learning_rate": 1e-05, |
| "loss": 0.4197, |
| "step": 242 |
| }, |
| { |
| "epoch": 3.4748201438848922, |
| "grad_norm": 0.5762581825256348, |
| "learning_rate": 1e-05, |
| "loss": 0.4353, |
| "step": 243 |
| }, |
| { |
| "epoch": 3.4892086330935252, |
| "grad_norm": 0.5558130741119385, |
| "learning_rate": 1e-05, |
| "loss": 0.4388, |
| "step": 244 |
| }, |
| { |
| "epoch": 3.5035971223021583, |
| "grad_norm": 0.5559906363487244, |
| "learning_rate": 1e-05, |
| "loss": 0.4243, |
| "step": 245 |
| }, |
| { |
| "epoch": 3.5179856115107913, |
| "grad_norm": 0.5471354722976685, |
| "learning_rate": 1e-05, |
| "loss": 0.4299, |
| "step": 246 |
| }, |
| { |
| "epoch": 3.5323741007194247, |
| "grad_norm": 0.5834676623344421, |
| "learning_rate": 1e-05, |
| "loss": 0.4605, |
| "step": 247 |
| }, |
| { |
| "epoch": 3.5467625899280577, |
| "grad_norm": 0.5565128922462463, |
| "learning_rate": 1e-05, |
| "loss": 0.443, |
| "step": 248 |
| }, |
| { |
| "epoch": 3.5611510791366907, |
| "grad_norm": 0.5589417219161987, |
| "learning_rate": 1e-05, |
| "loss": 0.4353, |
| "step": 249 |
| }, |
| { |
| "epoch": 3.5755395683453237, |
| "grad_norm": 0.5215206146240234, |
| "learning_rate": 1e-05, |
| "loss": 0.4088, |
| "step": 250 |
| }, |
| { |
| "epoch": 3.5899280575539567, |
| "grad_norm": 0.5717970132827759, |
| "learning_rate": 1e-05, |
| "loss": 0.4429, |
| "step": 251 |
| }, |
| { |
| "epoch": 3.6043165467625897, |
| "grad_norm": 0.583791196346283, |
| "learning_rate": 1e-05, |
| "loss": 0.4416, |
| "step": 252 |
| }, |
| { |
| "epoch": 3.618705035971223, |
| "grad_norm": 0.529136598110199, |
| "learning_rate": 1e-05, |
| "loss": 0.4169, |
| "step": 253 |
| }, |
| { |
| "epoch": 3.633093525179856, |
| "grad_norm": 0.5891124606132507, |
| "learning_rate": 1e-05, |
| "loss": 0.4218, |
| "step": 254 |
| }, |
| { |
| "epoch": 3.647482014388489, |
| "grad_norm": 0.5782115459442139, |
| "learning_rate": 1e-05, |
| "loss": 0.4169, |
| "step": 255 |
| }, |
| { |
| "epoch": 3.661870503597122, |
| "grad_norm": 0.533729076385498, |
| "learning_rate": 1e-05, |
| "loss": 0.4452, |
| "step": 256 |
| }, |
| { |
| "epoch": 3.6762589928057556, |
| "grad_norm": 0.590743899345398, |
| "learning_rate": 1e-05, |
| "loss": 0.4296, |
| "step": 257 |
| }, |
| { |
| "epoch": 3.6906474820143886, |
| "grad_norm": 0.6053430438041687, |
| "learning_rate": 1e-05, |
| "loss": 0.4602, |
| "step": 258 |
| }, |
| { |
| "epoch": 3.7050359712230216, |
| "grad_norm": 0.5711214542388916, |
| "learning_rate": 1e-05, |
| "loss": 0.4444, |
| "step": 259 |
| }, |
| { |
| "epoch": 3.7194244604316546, |
| "grad_norm": 0.5764939785003662, |
| "learning_rate": 1e-05, |
| "loss": 0.4108, |
| "step": 260 |
| }, |
| { |
| "epoch": 3.7338129496402876, |
| "grad_norm": 0.5643983483314514, |
| "learning_rate": 1e-05, |
| "loss": 0.4324, |
| "step": 261 |
| }, |
| { |
| "epoch": 3.7482014388489207, |
| "grad_norm": 0.5655452609062195, |
| "learning_rate": 1e-05, |
| "loss": 0.4332, |
| "step": 262 |
| }, |
| { |
| "epoch": 3.762589928057554, |
| "grad_norm": 0.5671420097351074, |
| "learning_rate": 1e-05, |
| "loss": 0.4082, |
| "step": 263 |
| }, |
| { |
| "epoch": 3.776978417266187, |
| "grad_norm": 0.5838333964347839, |
| "learning_rate": 1e-05, |
| "loss": 0.4209, |
| "step": 264 |
| }, |
| { |
| "epoch": 3.79136690647482, |
| "grad_norm": 0.5561381578445435, |
| "learning_rate": 1e-05, |
| "loss": 0.4461, |
| "step": 265 |
| }, |
| { |
| "epoch": 3.805755395683453, |
| "grad_norm": 0.5730894207954407, |
| "learning_rate": 1e-05, |
| "loss": 0.4361, |
| "step": 266 |
| }, |
| { |
| "epoch": 3.8201438848920866, |
| "grad_norm": 0.5769463181495667, |
| "learning_rate": 1e-05, |
| "loss": 0.4194, |
| "step": 267 |
| }, |
| { |
| "epoch": 3.8345323741007196, |
| "grad_norm": 0.5742232799530029, |
| "learning_rate": 1e-05, |
| "loss": 0.4386, |
| "step": 268 |
| }, |
| { |
| "epoch": 3.8489208633093526, |
| "grad_norm": 0.6530919671058655, |
| "learning_rate": 1e-05, |
| "loss": 0.4308, |
| "step": 269 |
| }, |
| { |
| "epoch": 3.8633093525179856, |
| "grad_norm": 0.6178688406944275, |
| "learning_rate": 1e-05, |
| "loss": 0.4319, |
| "step": 270 |
| }, |
| { |
| "epoch": 3.8776978417266186, |
| "grad_norm": 0.61128169298172, |
| "learning_rate": 1e-05, |
| "loss": 0.4397, |
| "step": 271 |
| }, |
| { |
| "epoch": 3.8920863309352516, |
| "grad_norm": 0.6222262382507324, |
| "learning_rate": 1e-05, |
| "loss": 0.461, |
| "step": 272 |
| }, |
| { |
| "epoch": 3.906474820143885, |
| "grad_norm": 0.5616589784622192, |
| "learning_rate": 1e-05, |
| "loss": 0.4297, |
| "step": 273 |
| }, |
| { |
| "epoch": 3.920863309352518, |
| "grad_norm": 0.5833615064620972, |
| "learning_rate": 1e-05, |
| "loss": 0.4175, |
| "step": 274 |
| }, |
| { |
| "epoch": 3.935251798561151, |
| "grad_norm": 0.5919567346572876, |
| "learning_rate": 1e-05, |
| "loss": 0.443, |
| "step": 275 |
| }, |
| { |
| "epoch": 3.949640287769784, |
| "grad_norm": 0.6515227556228638, |
| "learning_rate": 1e-05, |
| "loss": 0.3965, |
| "step": 276 |
| }, |
| { |
| "epoch": 3.9640287769784175, |
| "grad_norm": 0.5902924537658691, |
| "learning_rate": 1e-05, |
| "loss": 0.4358, |
| "step": 277 |
| }, |
| { |
| "epoch": 3.9784172661870505, |
| "grad_norm": 0.555015504360199, |
| "learning_rate": 1e-05, |
| "loss": 0.414, |
| "step": 278 |
| }, |
| { |
| "epoch": 3.9928057553956835, |
| "grad_norm": 0.5393680334091187, |
| "learning_rate": 1e-05, |
| "loss": 0.4246, |
| "step": 279 |
| }, |
| { |
| "epoch": 4.0, |
| "grad_norm": 0.609384298324585, |
| "learning_rate": 1e-05, |
| "loss": 0.2305, |
| "step": 280 |
| }, |
| { |
| "epoch": 4.014388489208633, |
| "grad_norm": 0.5638090372085571, |
| "learning_rate": 1e-05, |
| "loss": 0.4217, |
| "step": 281 |
| }, |
| { |
| "epoch": 4.028776978417266, |
| "grad_norm": 0.5361935496330261, |
| "learning_rate": 1e-05, |
| "loss": 0.4237, |
| "step": 282 |
| }, |
| { |
| "epoch": 4.043165467625899, |
| "grad_norm": 0.6069725751876831, |
| "learning_rate": 1e-05, |
| "loss": 0.4246, |
| "step": 283 |
| }, |
| { |
| "epoch": 4.057553956834532, |
| "grad_norm": 0.5956954956054688, |
| "learning_rate": 1e-05, |
| "loss": 0.4365, |
| "step": 284 |
| }, |
| { |
| "epoch": 4.071942446043165, |
| "grad_norm": 0.5414948463439941, |
| "learning_rate": 1e-05, |
| "loss": 0.4147, |
| "step": 285 |
| }, |
| { |
| "epoch": 4.086330935251799, |
| "grad_norm": 0.5541677474975586, |
| "learning_rate": 1e-05, |
| "loss": 0.4021, |
| "step": 286 |
| }, |
| { |
| "epoch": 4.100719424460432, |
| "grad_norm": 0.5846985578536987, |
| "learning_rate": 1e-05, |
| "loss": 0.4087, |
| "step": 287 |
| }, |
| { |
| "epoch": 4.115107913669065, |
| "grad_norm": 0.5740228295326233, |
| "learning_rate": 1e-05, |
| "loss": 0.4141, |
| "step": 288 |
| }, |
| { |
| "epoch": 4.129496402877698, |
| "grad_norm": 0.5534329414367676, |
| "learning_rate": 1e-05, |
| "loss": 0.3993, |
| "step": 289 |
| }, |
| { |
| "epoch": 4.143884892086331, |
| "grad_norm": 0.5441880822181702, |
| "learning_rate": 1e-05, |
| "loss": 0.4097, |
| "step": 290 |
| }, |
| { |
| "epoch": 4.158273381294964, |
| "grad_norm": 0.6086047887802124, |
| "learning_rate": 1e-05, |
| "loss": 0.4356, |
| "step": 291 |
| }, |
| { |
| "epoch": 4.172661870503597, |
| "grad_norm": 0.5841561555862427, |
| "learning_rate": 1e-05, |
| "loss": 0.4384, |
| "step": 292 |
| }, |
| { |
| "epoch": 4.18705035971223, |
| "grad_norm": 0.5554783344268799, |
| "learning_rate": 1e-05, |
| "loss": 0.4301, |
| "step": 293 |
| }, |
| { |
| "epoch": 4.201438848920863, |
| "grad_norm": 0.5936840176582336, |
| "learning_rate": 1e-05, |
| "loss": 0.4194, |
| "step": 294 |
| }, |
| { |
| "epoch": 4.215827338129497, |
| "grad_norm": 0.5931441783905029, |
| "learning_rate": 1e-05, |
| "loss": 0.4354, |
| "step": 295 |
| }, |
| { |
| "epoch": 4.23021582733813, |
| "grad_norm": 0.6169295907020569, |
| "learning_rate": 1e-05, |
| "loss": 0.4368, |
| "step": 296 |
| }, |
| { |
| "epoch": 4.244604316546763, |
| "grad_norm": 0.5904280543327332, |
| "learning_rate": 1e-05, |
| "loss": 0.3923, |
| "step": 297 |
| }, |
| { |
| "epoch": 4.258992805755396, |
| "grad_norm": 0.6404329538345337, |
| "learning_rate": 1e-05, |
| "loss": 0.4012, |
| "step": 298 |
| }, |
| { |
| "epoch": 4.273381294964029, |
| "grad_norm": 0.5566670894622803, |
| "learning_rate": 1e-05, |
| "loss": 0.4097, |
| "step": 299 |
| }, |
| { |
| "epoch": 4.287769784172662, |
| "grad_norm": 0.542718768119812, |
| "learning_rate": 1e-05, |
| "loss": 0.419, |
| "step": 300 |
| }, |
| { |
| "epoch": 4.302158273381295, |
| "grad_norm": 0.5549004077911377, |
| "learning_rate": 1e-05, |
| "loss": 0.4256, |
| "step": 301 |
| }, |
| { |
| "epoch": 4.316546762589928, |
| "grad_norm": 0.5786470770835876, |
| "learning_rate": 1e-05, |
| "loss": 0.3886, |
| "step": 302 |
| }, |
| { |
| "epoch": 4.330935251798561, |
| "grad_norm": 0.5484781265258789, |
| "learning_rate": 1e-05, |
| "loss": 0.4163, |
| "step": 303 |
| }, |
| { |
| "epoch": 4.345323741007194, |
| "grad_norm": 0.5796488523483276, |
| "learning_rate": 1e-05, |
| "loss": 0.4031, |
| "step": 304 |
| }, |
| { |
| "epoch": 4.359712230215827, |
| "grad_norm": 0.612375795841217, |
| "learning_rate": 1e-05, |
| "loss": 0.4064, |
| "step": 305 |
| }, |
| { |
| "epoch": 4.374100719424461, |
| "grad_norm": 0.562300443649292, |
| "learning_rate": 1e-05, |
| "loss": 0.3964, |
| "step": 306 |
| }, |
| { |
| "epoch": 4.388489208633094, |
| "grad_norm": 0.5949317216873169, |
| "learning_rate": 1e-05, |
| "loss": 0.4215, |
| "step": 307 |
| }, |
| { |
| "epoch": 4.402877697841727, |
| "grad_norm": 0.6002021431922913, |
| "learning_rate": 1e-05, |
| "loss": 0.4292, |
| "step": 308 |
| }, |
| { |
| "epoch": 4.41726618705036, |
| "grad_norm": 0.5166479349136353, |
| "learning_rate": 1e-05, |
| "loss": 0.4106, |
| "step": 309 |
| }, |
| { |
| "epoch": 4.431654676258993, |
| "grad_norm": 0.538162350654602, |
| "learning_rate": 1e-05, |
| "loss": 0.3994, |
| "step": 310 |
| }, |
| { |
| "epoch": 4.446043165467626, |
| "grad_norm": 0.5681014657020569, |
| "learning_rate": 1e-05, |
| "loss": 0.4193, |
| "step": 311 |
| }, |
| { |
| "epoch": 4.460431654676259, |
| "grad_norm": 0.5828492045402527, |
| "learning_rate": 1e-05, |
| "loss": 0.4291, |
| "step": 312 |
| }, |
| { |
| "epoch": 4.474820143884892, |
| "grad_norm": 0.6029539108276367, |
| "learning_rate": 1e-05, |
| "loss": 0.4155, |
| "step": 313 |
| }, |
| { |
| "epoch": 4.489208633093525, |
| "grad_norm": 0.5848509669303894, |
| "learning_rate": 1e-05, |
| "loss": 0.4301, |
| "step": 314 |
| }, |
| { |
| "epoch": 4.503597122302159, |
| "grad_norm": 0.553223192691803, |
| "learning_rate": 1e-05, |
| "loss": 0.4087, |
| "step": 315 |
| }, |
| { |
| "epoch": 4.517985611510792, |
| "grad_norm": 0.520281195640564, |
| "learning_rate": 1e-05, |
| "loss": 0.4148, |
| "step": 316 |
| }, |
| { |
| "epoch": 4.532374100719425, |
| "grad_norm": 0.546238899230957, |
| "learning_rate": 1e-05, |
| "loss": 0.3904, |
| "step": 317 |
| }, |
| { |
| "epoch": 4.546762589928058, |
| "grad_norm": 0.6016330718994141, |
| "learning_rate": 1e-05, |
| "loss": 0.4311, |
| "step": 318 |
| }, |
| { |
| "epoch": 4.561151079136691, |
| "grad_norm": 0.5516873598098755, |
| "learning_rate": 1e-05, |
| "loss": 0.4049, |
| "step": 319 |
| }, |
| { |
| "epoch": 4.575539568345324, |
| "grad_norm": 0.5393434166908264, |
| "learning_rate": 1e-05, |
| "loss": 0.4189, |
| "step": 320 |
| }, |
| { |
| "epoch": 4.589928057553957, |
| "grad_norm": 0.548603892326355, |
| "learning_rate": 1e-05, |
| "loss": 0.4195, |
| "step": 321 |
| }, |
| { |
| "epoch": 4.60431654676259, |
| "grad_norm": 0.5740833878517151, |
| "learning_rate": 1e-05, |
| "loss": 0.4107, |
| "step": 322 |
| }, |
| { |
| "epoch": 4.618705035971223, |
| "grad_norm": 0.5864909887313843, |
| "learning_rate": 1e-05, |
| "loss": 0.4271, |
| "step": 323 |
| }, |
| { |
| "epoch": 4.633093525179856, |
| "grad_norm": 0.5860151052474976, |
| "learning_rate": 1e-05, |
| "loss": 0.4402, |
| "step": 324 |
| }, |
| { |
| "epoch": 4.647482014388489, |
| "grad_norm": 0.5694665908813477, |
| "learning_rate": 1e-05, |
| "loss": 0.414, |
| "step": 325 |
| }, |
| { |
| "epoch": 4.661870503597123, |
| "grad_norm": 0.5657534003257751, |
| "learning_rate": 1e-05, |
| "loss": 0.4316, |
| "step": 326 |
| }, |
| { |
| "epoch": 4.676258992805756, |
| "grad_norm": 0.555111825466156, |
| "learning_rate": 1e-05, |
| "loss": 0.3999, |
| "step": 327 |
| }, |
| { |
| "epoch": 4.690647482014389, |
| "grad_norm": 0.5448899865150452, |
| "learning_rate": 1e-05, |
| "loss": 0.4322, |
| "step": 328 |
| }, |
| { |
| "epoch": 4.705035971223022, |
| "grad_norm": 0.584388792514801, |
| "learning_rate": 1e-05, |
| "loss": 0.414, |
| "step": 329 |
| }, |
| { |
| "epoch": 4.719424460431655, |
| "grad_norm": 0.6015664339065552, |
| "learning_rate": 1e-05, |
| "loss": 0.4249, |
| "step": 330 |
| }, |
| { |
| "epoch": 4.733812949640288, |
| "grad_norm": 0.5766135454177856, |
| "learning_rate": 1e-05, |
| "loss": 0.4131, |
| "step": 331 |
| }, |
| { |
| "epoch": 4.748201438848921, |
| "grad_norm": 0.5562160015106201, |
| "learning_rate": 1e-05, |
| "loss": 0.431, |
| "step": 332 |
| }, |
| { |
| "epoch": 4.762589928057554, |
| "grad_norm": 0.5359159708023071, |
| "learning_rate": 1e-05, |
| "loss": 0.4243, |
| "step": 333 |
| }, |
| { |
| "epoch": 4.7769784172661875, |
| "grad_norm": 0.5550519227981567, |
| "learning_rate": 1e-05, |
| "loss": 0.3906, |
| "step": 334 |
| }, |
| { |
| "epoch": 4.7913669064748206, |
| "grad_norm": 0.541607677936554, |
| "learning_rate": 1e-05, |
| "loss": 0.4137, |
| "step": 335 |
| }, |
| { |
| "epoch": 4.805755395683454, |
| "grad_norm": 0.5710314512252808, |
| "learning_rate": 1e-05, |
| "loss": 0.4317, |
| "step": 336 |
| }, |
| { |
| "epoch": 4.820143884892087, |
| "grad_norm": 0.5825111865997314, |
| "learning_rate": 1e-05, |
| "loss": 0.428, |
| "step": 337 |
| }, |
| { |
| "epoch": 4.83453237410072, |
| "grad_norm": 0.5356572866439819, |
| "learning_rate": 1e-05, |
| "loss": 0.4104, |
| "step": 338 |
| }, |
| { |
| "epoch": 4.848920863309353, |
| "grad_norm": 0.5596460700035095, |
| "learning_rate": 1e-05, |
| "loss": 0.4273, |
| "step": 339 |
| }, |
| { |
| "epoch": 4.863309352517986, |
| "grad_norm": 0.5650801658630371, |
| "learning_rate": 1e-05, |
| "loss": 0.4229, |
| "step": 340 |
| }, |
| { |
| "epoch": 4.877697841726619, |
| "grad_norm": 0.5750253200531006, |
| "learning_rate": 1e-05, |
| "loss": 0.4181, |
| "step": 341 |
| }, |
| { |
| "epoch": 4.892086330935252, |
| "grad_norm": 0.5675643682479858, |
| "learning_rate": 1e-05, |
| "loss": 0.4372, |
| "step": 342 |
| }, |
| { |
| "epoch": 4.906474820143885, |
| "grad_norm": 0.6039531230926514, |
| "learning_rate": 1e-05, |
| "loss": 0.418, |
| "step": 343 |
| }, |
| { |
| "epoch": 4.920863309352518, |
| "grad_norm": 0.5539713501930237, |
| "learning_rate": 1e-05, |
| "loss": 0.3937, |
| "step": 344 |
| }, |
| { |
| "epoch": 4.935251798561151, |
| "grad_norm": 0.5814469456672668, |
| "learning_rate": 1e-05, |
| "loss": 0.4476, |
| "step": 345 |
| }, |
| { |
| "epoch": 4.9496402877697845, |
| "grad_norm": 0.5315111875534058, |
| "learning_rate": 1e-05, |
| "loss": 0.4004, |
| "step": 346 |
| }, |
| { |
| "epoch": 4.9640287769784175, |
| "grad_norm": 0.5901346802711487, |
| "learning_rate": 1e-05, |
| "loss": 0.4179, |
| "step": 347 |
| }, |
| { |
| "epoch": 4.9784172661870505, |
| "grad_norm": 0.5380561947822571, |
| "learning_rate": 1e-05, |
| "loss": 0.4021, |
| "step": 348 |
| }, |
| { |
| "epoch": 4.9928057553956835, |
| "grad_norm": 0.5694971680641174, |
| "learning_rate": 1e-05, |
| "loss": 0.4103, |
| "step": 349 |
| }, |
| { |
| "epoch": 5.0, |
| "grad_norm": 0.5694971680641174, |
| "learning_rate": 1e-05, |
| "loss": 0.1957, |
| "step": 350 |
| }, |
| { |
| "epoch": 5.014388489208633, |
| "grad_norm": 0.5580344200134277, |
| "learning_rate": 1e-05, |
| "loss": 0.3912, |
| "step": 351 |
| }, |
| { |
| "epoch": 5.028776978417266, |
| "grad_norm": 0.5583423376083374, |
| "learning_rate": 1e-05, |
| "loss": 0.4192, |
| "step": 352 |
| }, |
| { |
| "epoch": 5.043165467625899, |
| "grad_norm": 0.5418814420700073, |
| "learning_rate": 1e-05, |
| "loss": 0.3996, |
| "step": 353 |
| }, |
| { |
| "epoch": 5.057553956834532, |
| "grad_norm": 0.5449907779693604, |
| "learning_rate": 1e-05, |
| "loss": 0.3862, |
| "step": 354 |
| }, |
| { |
| "epoch": 5.071942446043165, |
| "grad_norm": 0.5734506249427795, |
| "learning_rate": 1e-05, |
| "loss": 0.3984, |
| "step": 355 |
| }, |
| { |
| "epoch": 5.086330935251799, |
| "grad_norm": 0.5431396961212158, |
| "learning_rate": 1e-05, |
| "loss": 0.4044, |
| "step": 356 |
| }, |
| { |
| "epoch": 5.100719424460432, |
| "grad_norm": 0.5451382994651794, |
| "learning_rate": 1e-05, |
| "loss": 0.3857, |
| "step": 357 |
| }, |
| { |
| "epoch": 5.115107913669065, |
| "grad_norm": 0.6146738529205322, |
| "learning_rate": 1e-05, |
| "loss": 0.4073, |
| "step": 358 |
| }, |
| { |
| "epoch": 5.129496402877698, |
| "grad_norm": 0.5780620574951172, |
| "learning_rate": 1e-05, |
| "loss": 0.4065, |
| "step": 359 |
| }, |
| { |
| "epoch": 5.143884892086331, |
| "grad_norm": 0.6047171950340271, |
| "learning_rate": 1e-05, |
| "loss": 0.3894, |
| "step": 360 |
| }, |
| { |
| "epoch": 5.158273381294964, |
| "grad_norm": 0.5551854968070984, |
| "learning_rate": 1e-05, |
| "loss": 0.4182, |
| "step": 361 |
| }, |
| { |
| "epoch": 5.172661870503597, |
| "grad_norm": 0.5953412055969238, |
| "learning_rate": 1e-05, |
| "loss": 0.3954, |
| "step": 362 |
| }, |
| { |
| "epoch": 5.18705035971223, |
| "grad_norm": 0.5963872671127319, |
| "learning_rate": 1e-05, |
| "loss": 0.4046, |
| "step": 363 |
| }, |
| { |
| "epoch": 5.201438848920863, |
| "grad_norm": 0.5667490363121033, |
| "learning_rate": 1e-05, |
| "loss": 0.4233, |
| "step": 364 |
| }, |
| { |
| "epoch": 5.215827338129497, |
| "grad_norm": 0.5441871285438538, |
| "learning_rate": 1e-05, |
| "loss": 0.3837, |
| "step": 365 |
| }, |
| { |
| "epoch": 5.23021582733813, |
| "grad_norm": 0.6119952201843262, |
| "learning_rate": 1e-05, |
| "loss": 0.4019, |
| "step": 366 |
| }, |
| { |
| "epoch": 5.244604316546763, |
| "grad_norm": 0.6040645241737366, |
| "learning_rate": 1e-05, |
| "loss": 0.4168, |
| "step": 367 |
| }, |
| { |
| "epoch": 5.258992805755396, |
| "grad_norm": 0.5911147594451904, |
| "learning_rate": 1e-05, |
| "loss": 0.3639, |
| "step": 368 |
| }, |
| { |
| "epoch": 5.273381294964029, |
| "grad_norm": 0.559020459651947, |
| "learning_rate": 1e-05, |
| "loss": 0.403, |
| "step": 369 |
| }, |
| { |
| "epoch": 5.287769784172662, |
| "grad_norm": 0.5709179043769836, |
| "learning_rate": 1e-05, |
| "loss": 0.4056, |
| "step": 370 |
| }, |
| { |
| "epoch": 5.302158273381295, |
| "grad_norm": 0.5694558024406433, |
| "learning_rate": 1e-05, |
| "loss": 0.4206, |
| "step": 371 |
| }, |
| { |
| "epoch": 5.316546762589928, |
| "grad_norm": 0.5538829565048218, |
| "learning_rate": 1e-05, |
| "loss": 0.4084, |
| "step": 372 |
| }, |
| { |
| "epoch": 5.330935251798561, |
| "grad_norm": 0.5424035787582397, |
| "learning_rate": 1e-05, |
| "loss": 0.4286, |
| "step": 373 |
| }, |
| { |
| "epoch": 5.345323741007194, |
| "grad_norm": 0.6042952537536621, |
| "learning_rate": 1e-05, |
| "loss": 0.4194, |
| "step": 374 |
| }, |
| { |
| "epoch": 5.359712230215827, |
| "grad_norm": 0.5840588808059692, |
| "learning_rate": 1e-05, |
| "loss": 0.4018, |
| "step": 375 |
| }, |
| { |
| "epoch": 5.374100719424461, |
| "grad_norm": 0.5363032221794128, |
| "learning_rate": 1e-05, |
| "loss": 0.413, |
| "step": 376 |
| }, |
| { |
| "epoch": 5.388489208633094, |
| "grad_norm": 0.5760980248451233, |
| "learning_rate": 1e-05, |
| "loss": 0.3918, |
| "step": 377 |
| }, |
| { |
| "epoch": 5.402877697841727, |
| "grad_norm": 0.5672991275787354, |
| "learning_rate": 1e-05, |
| "loss": 0.4085, |
| "step": 378 |
| }, |
| { |
| "epoch": 5.41726618705036, |
| "grad_norm": 0.5906910300254822, |
| "learning_rate": 1e-05, |
| "loss": 0.4205, |
| "step": 379 |
| }, |
| { |
| "epoch": 5.431654676258993, |
| "grad_norm": 0.5852826833724976, |
| "learning_rate": 1e-05, |
| "loss": 0.4056, |
| "step": 380 |
| }, |
| { |
| "epoch": 5.446043165467626, |
| "grad_norm": 0.5464699864387512, |
| "learning_rate": 1e-05, |
| "loss": 0.4117, |
| "step": 381 |
| }, |
| { |
| "epoch": 5.460431654676259, |
| "grad_norm": 0.5488054156303406, |
| "learning_rate": 1e-05, |
| "loss": 0.4169, |
| "step": 382 |
| }, |
| { |
| "epoch": 5.474820143884892, |
| "grad_norm": 0.5442216992378235, |
| "learning_rate": 1e-05, |
| "loss": 0.4144, |
| "step": 383 |
| }, |
| { |
| "epoch": 5.489208633093525, |
| "grad_norm": 0.5626085996627808, |
| "learning_rate": 1e-05, |
| "loss": 0.4056, |
| "step": 384 |
| }, |
| { |
| "epoch": 5.503597122302159, |
| "grad_norm": 0.544670045375824, |
| "learning_rate": 1e-05, |
| "loss": 0.4055, |
| "step": 385 |
| }, |
| { |
| "epoch": 5.517985611510792, |
| "grad_norm": 0.5305049419403076, |
| "learning_rate": 1e-05, |
| "loss": 0.4107, |
| "step": 386 |
| }, |
| { |
| "epoch": 5.532374100719425, |
| "grad_norm": 0.5415787100791931, |
| "learning_rate": 1e-05, |
| "loss": 0.3992, |
| "step": 387 |
| }, |
| { |
| "epoch": 5.546762589928058, |
| "grad_norm": 0.5582872033119202, |
| "learning_rate": 1e-05, |
| "loss": 0.4256, |
| "step": 388 |
| }, |
| { |
| "epoch": 5.561151079136691, |
| "grad_norm": 0.5652105212211609, |
| "learning_rate": 1e-05, |
| "loss": 0.4074, |
| "step": 389 |
| }, |
| { |
| "epoch": 5.575539568345324, |
| "grad_norm": 0.5543160438537598, |
| "learning_rate": 1e-05, |
| "loss": 0.3895, |
| "step": 390 |
| }, |
| { |
| "epoch": 5.589928057553957, |
| "grad_norm": 0.5894019603729248, |
| "learning_rate": 1e-05, |
| "loss": 0.4186, |
| "step": 391 |
| }, |
| { |
| "epoch": 5.60431654676259, |
| "grad_norm": 0.5580301880836487, |
| "learning_rate": 1e-05, |
| "loss": 0.3899, |
| "step": 392 |
| }, |
| { |
| "epoch": 5.618705035971223, |
| "grad_norm": 0.6336041688919067, |
| "learning_rate": 1e-05, |
| "loss": 0.4123, |
| "step": 393 |
| }, |
| { |
| "epoch": 5.633093525179856, |
| "grad_norm": 0.539900004863739, |
| "learning_rate": 1e-05, |
| "loss": 0.4189, |
| "step": 394 |
| }, |
| { |
| "epoch": 5.647482014388489, |
| "grad_norm": 0.5859925746917725, |
| "learning_rate": 1e-05, |
| "loss": 0.4256, |
| "step": 395 |
| }, |
| { |
| "epoch": 5.661870503597123, |
| "grad_norm": 0.5636417269706726, |
| "learning_rate": 1e-05, |
| "loss": 0.3834, |
| "step": 396 |
| }, |
| { |
| "epoch": 5.676258992805756, |
| "grad_norm": 0.6011795997619629, |
| "learning_rate": 1e-05, |
| "loss": 0.407, |
| "step": 397 |
| }, |
| { |
| "epoch": 5.690647482014389, |
| "grad_norm": 0.5949985980987549, |
| "learning_rate": 1e-05, |
| "loss": 0.4507, |
| "step": 398 |
| }, |
| { |
| "epoch": 5.705035971223022, |
| "grad_norm": 0.6050012111663818, |
| "learning_rate": 1e-05, |
| "loss": 0.4257, |
| "step": 399 |
| }, |
| { |
| "epoch": 5.719424460431655, |
| "grad_norm": 0.5403311252593994, |
| "learning_rate": 1e-05, |
| "loss": 0.4184, |
| "step": 400 |
| }, |
| { |
| "epoch": 5.733812949640288, |
| "grad_norm": 0.5509918332099915, |
| "learning_rate": 1e-05, |
| "loss": 0.3943, |
| "step": 401 |
| }, |
| { |
| "epoch": 5.748201438848921, |
| "grad_norm": 0.5679648518562317, |
| "learning_rate": 1e-05, |
| "loss": 0.4089, |
| "step": 402 |
| }, |
| { |
| "epoch": 5.762589928057554, |
| "grad_norm": 0.5924245715141296, |
| "learning_rate": 1e-05, |
| "loss": 0.4139, |
| "step": 403 |
| }, |
| { |
| "epoch": 5.7769784172661875, |
| "grad_norm": 0.6032975912094116, |
| "learning_rate": 1e-05, |
| "loss": 0.4254, |
| "step": 404 |
| }, |
| { |
| "epoch": 5.7913669064748206, |
| "grad_norm": 0.6058276891708374, |
| "learning_rate": 1e-05, |
| "loss": 0.3954, |
| "step": 405 |
| }, |
| { |
| "epoch": 5.805755395683454, |
| "grad_norm": 0.559075117111206, |
| "learning_rate": 1e-05, |
| "loss": 0.4197, |
| "step": 406 |
| }, |
| { |
| "epoch": 5.820143884892087, |
| "grad_norm": 0.559196949005127, |
| "learning_rate": 1e-05, |
| "loss": 0.3966, |
| "step": 407 |
| }, |
| { |
| "epoch": 5.83453237410072, |
| "grad_norm": 0.5982166528701782, |
| "learning_rate": 1e-05, |
| "loss": 0.3851, |
| "step": 408 |
| }, |
| { |
| "epoch": 5.848920863309353, |
| "grad_norm": 0.5499190092086792, |
| "learning_rate": 1e-05, |
| "loss": 0.4253, |
| "step": 409 |
| }, |
| { |
| "epoch": 5.863309352517986, |
| "grad_norm": 0.5379040241241455, |
| "learning_rate": 1e-05, |
| "loss": 0.3881, |
| "step": 410 |
| }, |
| { |
| "epoch": 5.877697841726619, |
| "grad_norm": 0.5733298063278198, |
| "learning_rate": 1e-05, |
| "loss": 0.3765, |
| "step": 411 |
| }, |
| { |
| "epoch": 5.892086330935252, |
| "grad_norm": 0.5757260322570801, |
| "learning_rate": 1e-05, |
| "loss": 0.4089, |
| "step": 412 |
| }, |
| { |
| "epoch": 5.906474820143885, |
| "grad_norm": 0.5678840279579163, |
| "learning_rate": 1e-05, |
| "loss": 0.4006, |
| "step": 413 |
| }, |
| { |
| "epoch": 5.920863309352518, |
| "grad_norm": 0.540944516658783, |
| "learning_rate": 1e-05, |
| "loss": 0.4012, |
| "step": 414 |
| }, |
| { |
| "epoch": 5.935251798561151, |
| "grad_norm": 0.5221725106239319, |
| "learning_rate": 1e-05, |
| "loss": 0.3859, |
| "step": 415 |
| }, |
| { |
| "epoch": 5.9496402877697845, |
| "grad_norm": 0.5563617944717407, |
| "learning_rate": 1e-05, |
| "loss": 0.4138, |
| "step": 416 |
| }, |
| { |
| "epoch": 5.9640287769784175, |
| "grad_norm": 0.5671050548553467, |
| "learning_rate": 1e-05, |
| "loss": 0.4097, |
| "step": 417 |
| }, |
| { |
| "epoch": 5.9784172661870505, |
| "grad_norm": 0.539849579334259, |
| "learning_rate": 1e-05, |
| "loss": 0.4119, |
| "step": 418 |
| }, |
| { |
| "epoch": 5.9928057553956835, |
| "grad_norm": 0.5199679732322693, |
| "learning_rate": 1e-05, |
| "loss": 0.3781, |
| "step": 419 |
| }, |
| { |
| "epoch": 6.0, |
| "grad_norm": 0.534730076789856, |
| "learning_rate": 1e-05, |
| "loss": 0.1785, |
| "step": 420 |
| }, |
| { |
| "epoch": 6.014388489208633, |
| "grad_norm": 0.5909907817840576, |
| "learning_rate": 1e-05, |
| "loss": 0.402, |
| "step": 421 |
| }, |
| { |
| "epoch": 6.028776978417266, |
| "grad_norm": 0.539289116859436, |
| "learning_rate": 1e-05, |
| "loss": 0.3905, |
| "step": 422 |
| }, |
| { |
| "epoch": 6.043165467625899, |
| "grad_norm": 0.5408132076263428, |
| "learning_rate": 1e-05, |
| "loss": 0.3796, |
| "step": 423 |
| }, |
| { |
| "epoch": 6.057553956834532, |
| "grad_norm": 0.5502517819404602, |
| "learning_rate": 1e-05, |
| "loss": 0.3903, |
| "step": 424 |
| }, |
| { |
| "epoch": 6.071942446043165, |
| "grad_norm": 0.5412729382514954, |
| "learning_rate": 1e-05, |
| "loss": 0.3795, |
| "step": 425 |
| }, |
| { |
| "epoch": 6.086330935251799, |
| "grad_norm": 0.5389096140861511, |
| "learning_rate": 1e-05, |
| "loss": 0.3791, |
| "step": 426 |
| }, |
| { |
| "epoch": 6.100719424460432, |
| "grad_norm": 0.5934701561927795, |
| "learning_rate": 1e-05, |
| "loss": 0.3977, |
| "step": 427 |
| }, |
| { |
| "epoch": 6.115107913669065, |
| "grad_norm": 0.5387526750564575, |
| "learning_rate": 1e-05, |
| "loss": 0.4024, |
| "step": 428 |
| }, |
| { |
| "epoch": 6.129496402877698, |
| "grad_norm": 0.5928563475608826, |
| "learning_rate": 1e-05, |
| "loss": 0.3912, |
| "step": 429 |
| }, |
| { |
| "epoch": 6.143884892086331, |
| "grad_norm": 0.547256350517273, |
| "learning_rate": 1e-05, |
| "loss": 0.422, |
| "step": 430 |
| }, |
| { |
| "epoch": 6.158273381294964, |
| "grad_norm": 0.5635731220245361, |
| "learning_rate": 1e-05, |
| "loss": 0.3895, |
| "step": 431 |
| }, |
| { |
| "epoch": 6.172661870503597, |
| "grad_norm": 0.5396108627319336, |
| "learning_rate": 1e-05, |
| "loss": 0.3843, |
| "step": 432 |
| }, |
| { |
| "epoch": 6.18705035971223, |
| "grad_norm": 0.5764632225036621, |
| "learning_rate": 1e-05, |
| "loss": 0.3798, |
| "step": 433 |
| }, |
| { |
| "epoch": 6.201438848920863, |
| "grad_norm": 0.49766188859939575, |
| "learning_rate": 1e-05, |
| "loss": 0.3793, |
| "step": 434 |
| }, |
| { |
| "epoch": 6.215827338129497, |
| "grad_norm": 0.5574280023574829, |
| "learning_rate": 1e-05, |
| "loss": 0.3868, |
| "step": 435 |
| }, |
| { |
| "epoch": 6.23021582733813, |
| "grad_norm": 0.5793395638465881, |
| "learning_rate": 1e-05, |
| "loss": 0.3916, |
| "step": 436 |
| }, |
| { |
| "epoch": 6.244604316546763, |
| "grad_norm": 0.6094810962677002, |
| "learning_rate": 1e-05, |
| "loss": 0.4137, |
| "step": 437 |
| }, |
| { |
| "epoch": 6.258992805755396, |
| "grad_norm": 0.5817932486534119, |
| "learning_rate": 1e-05, |
| "loss": 0.3947, |
| "step": 438 |
| }, |
| { |
| "epoch": 6.273381294964029, |
| "grad_norm": 0.5829856991767883, |
| "learning_rate": 1e-05, |
| "loss": 0.411, |
| "step": 439 |
| }, |
| { |
| "epoch": 6.287769784172662, |
| "grad_norm": 0.5500520467758179, |
| "learning_rate": 1e-05, |
| "loss": 0.3811, |
| "step": 440 |
| }, |
| { |
| "epoch": 6.302158273381295, |
| "grad_norm": 0.5310588479042053, |
| "learning_rate": 1e-05, |
| "loss": 0.3785, |
| "step": 441 |
| }, |
| { |
| "epoch": 6.316546762589928, |
| "grad_norm": 0.5661012530326843, |
| "learning_rate": 1e-05, |
| "loss": 0.4055, |
| "step": 442 |
| }, |
| { |
| "epoch": 6.330935251798561, |
| "grad_norm": 0.5334185361862183, |
| "learning_rate": 1e-05, |
| "loss": 0.4005, |
| "step": 443 |
| }, |
| { |
| "epoch": 6.345323741007194, |
| "grad_norm": 0.5455158352851868, |
| "learning_rate": 1e-05, |
| "loss": 0.3983, |
| "step": 444 |
| }, |
| { |
| "epoch": 6.359712230215827, |
| "grad_norm": 0.5767692923545837, |
| "learning_rate": 1e-05, |
| "loss": 0.4211, |
| "step": 445 |
| }, |
| { |
| "epoch": 6.374100719424461, |
| "grad_norm": 0.5657313466072083, |
| "learning_rate": 1e-05, |
| "loss": 0.3917, |
| "step": 446 |
| }, |
| { |
| "epoch": 6.388489208633094, |
| "grad_norm": 0.5818557143211365, |
| "learning_rate": 1e-05, |
| "loss": 0.4046, |
| "step": 447 |
| }, |
| { |
| "epoch": 6.402877697841727, |
| "grad_norm": 0.5562762022018433, |
| "learning_rate": 1e-05, |
| "loss": 0.4076, |
| "step": 448 |
| }, |
| { |
| "epoch": 6.41726618705036, |
| "grad_norm": 0.5309922695159912, |
| "learning_rate": 1e-05, |
| "loss": 0.3988, |
| "step": 449 |
| }, |
| { |
| "epoch": 6.431654676258993, |
| "grad_norm": 0.5536403059959412, |
| "learning_rate": 1e-05, |
| "loss": 0.3967, |
| "step": 450 |
| }, |
| { |
| "epoch": 6.446043165467626, |
| "grad_norm": 0.5385812520980835, |
| "learning_rate": 1e-05, |
| "loss": 0.3821, |
| "step": 451 |
| }, |
| { |
| "epoch": 6.460431654676259, |
| "grad_norm": 0.5951981544494629, |
| "learning_rate": 1e-05, |
| "loss": 0.4109, |
| "step": 452 |
| }, |
| { |
| "epoch": 6.474820143884892, |
| "grad_norm": 0.5818688869476318, |
| "learning_rate": 1e-05, |
| "loss": 0.4054, |
| "step": 453 |
| }, |
| { |
| "epoch": 6.489208633093525, |
| "grad_norm": 0.5380371809005737, |
| "learning_rate": 1e-05, |
| "loss": 0.3975, |
| "step": 454 |
| }, |
| { |
| "epoch": 6.503597122302159, |
| "grad_norm": 0.5604183077812195, |
| "learning_rate": 1e-05, |
| "loss": 0.4014, |
| "step": 455 |
| }, |
| { |
| "epoch": 6.517985611510792, |
| "grad_norm": 0.5463094711303711, |
| "learning_rate": 1e-05, |
| "loss": 0.3972, |
| "step": 456 |
| }, |
| { |
| "epoch": 6.532374100719425, |
| "grad_norm": 0.5421119332313538, |
| "learning_rate": 1e-05, |
| "loss": 0.3992, |
| "step": 457 |
| }, |
| { |
| "epoch": 6.546762589928058, |
| "grad_norm": 0.5993166565895081, |
| "learning_rate": 1e-05, |
| "loss": 0.3972, |
| "step": 458 |
| }, |
| { |
| "epoch": 6.561151079136691, |
| "grad_norm": 0.6011131405830383, |
| "learning_rate": 1e-05, |
| "loss": 0.4188, |
| "step": 459 |
| }, |
| { |
| "epoch": 6.575539568345324, |
| "grad_norm": 0.5480906367301941, |
| "learning_rate": 1e-05, |
| "loss": 0.3915, |
| "step": 460 |
| }, |
| { |
| "epoch": 6.589928057553957, |
| "grad_norm": 0.5569404363632202, |
| "learning_rate": 1e-05, |
| "loss": 0.3877, |
| "step": 461 |
| }, |
| { |
| "epoch": 6.60431654676259, |
| "grad_norm": 0.5606852769851685, |
| "learning_rate": 1e-05, |
| "loss": 0.4023, |
| "step": 462 |
| }, |
| { |
| "epoch": 6.618705035971223, |
| "grad_norm": 0.5609032511711121, |
| "learning_rate": 1e-05, |
| "loss": 0.4142, |
| "step": 463 |
| }, |
| { |
| "epoch": 6.633093525179856, |
| "grad_norm": 0.6198331117630005, |
| "learning_rate": 1e-05, |
| "loss": 0.4239, |
| "step": 464 |
| }, |
| { |
| "epoch": 6.647482014388489, |
| "grad_norm": 0.5522207021713257, |
| "learning_rate": 1e-05, |
| "loss": 0.373, |
| "step": 465 |
| }, |
| { |
| "epoch": 6.661870503597123, |
| "grad_norm": 0.5408725738525391, |
| "learning_rate": 1e-05, |
| "loss": 0.372, |
| "step": 466 |
| }, |
| { |
| "epoch": 6.676258992805756, |
| "grad_norm": 0.561644434928894, |
| "learning_rate": 1e-05, |
| "loss": 0.3928, |
| "step": 467 |
| }, |
| { |
| "epoch": 6.690647482014389, |
| "grad_norm": 0.5675777792930603, |
| "learning_rate": 1e-05, |
| "loss": 0.3884, |
| "step": 468 |
| }, |
| { |
| "epoch": 6.705035971223022, |
| "grad_norm": 0.5949506163597107, |
| "learning_rate": 1e-05, |
| "loss": 0.3803, |
| "step": 469 |
| }, |
| { |
| "epoch": 6.719424460431655, |
| "grad_norm": 0.5625572204589844, |
| "learning_rate": 1e-05, |
| "loss": 0.3888, |
| "step": 470 |
| }, |
| { |
| "epoch": 6.733812949640288, |
| "grad_norm": 0.5368185639381409, |
| "learning_rate": 1e-05, |
| "loss": 0.4097, |
| "step": 471 |
| }, |
| { |
| "epoch": 6.748201438848921, |
| "grad_norm": 0.5578059554100037, |
| "learning_rate": 1e-05, |
| "loss": 0.3842, |
| "step": 472 |
| }, |
| { |
| "epoch": 6.762589928057554, |
| "grad_norm": 0.5098408460617065, |
| "learning_rate": 1e-05, |
| "loss": 0.3842, |
| "step": 473 |
| }, |
| { |
| "epoch": 6.7769784172661875, |
| "grad_norm": 0.5511586666107178, |
| "learning_rate": 1e-05, |
| "loss": 0.4178, |
| "step": 474 |
| }, |
| { |
| "epoch": 6.7913669064748206, |
| "grad_norm": 0.5386340618133545, |
| "learning_rate": 1e-05, |
| "loss": 0.4094, |
| "step": 475 |
| }, |
| { |
| "epoch": 6.805755395683454, |
| "grad_norm": 0.5731641054153442, |
| "learning_rate": 1e-05, |
| "loss": 0.4132, |
| "step": 476 |
| }, |
| { |
| "epoch": 6.820143884892087, |
| "grad_norm": 0.5776832103729248, |
| "learning_rate": 1e-05, |
| "loss": 0.3838, |
| "step": 477 |
| }, |
| { |
| "epoch": 6.83453237410072, |
| "grad_norm": 0.5477452278137207, |
| "learning_rate": 1e-05, |
| "loss": 0.3722, |
| "step": 478 |
| }, |
| { |
| "epoch": 6.848920863309353, |
| "grad_norm": 0.5660219788551331, |
| "learning_rate": 1e-05, |
| "loss": 0.3941, |
| "step": 479 |
| }, |
| { |
| "epoch": 6.863309352517986, |
| "grad_norm": 0.5811092853546143, |
| "learning_rate": 1e-05, |
| "loss": 0.4127, |
| "step": 480 |
| }, |
| { |
| "epoch": 6.877697841726619, |
| "grad_norm": 0.5366553664207458, |
| "learning_rate": 1e-05, |
| "loss": 0.3689, |
| "step": 481 |
| }, |
| { |
| "epoch": 6.892086330935252, |
| "grad_norm": 0.6058823466300964, |
| "learning_rate": 1e-05, |
| "loss": 0.3974, |
| "step": 482 |
| }, |
| { |
| "epoch": 6.906474820143885, |
| "grad_norm": 0.5690235495567322, |
| "learning_rate": 1e-05, |
| "loss": 0.3996, |
| "step": 483 |
| }, |
| { |
| "epoch": 6.906474820143885, |
| "step": 483, |
| "total_flos": 3.1451286845584835e+19, |
| "train_loss": 0.46938293981009144, |
| "train_runtime": 37591.0516, |
| "train_samples_per_second": 1.655, |
| "train_steps_per_second": 0.013 |
| } |
| ], |
| "logging_steps": 1.0, |
| "max_steps": 483, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 7, |
| "save_steps": 1000000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 3.1451286845584835e+19, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|