| { |
| "best_global_step": 500, |
| "best_metric": 0.9080732464790344, |
| "best_model_checkpoint": "task2file/sft_devstral_24B_v2/checkpoints/checkpoint-500", |
| "epoch": 0.2109704641350211, |
| "eval_steps": 100, |
| "global_step": 500, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0008438818565400844, |
| "grad_norm": 1.597854733467102, |
| "learning_rate": 8.787346221441124e-08, |
| "loss": 1.3927901983261108, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.0016877637130801688, |
| "grad_norm": 1.6547431945800781, |
| "learning_rate": 2.6362038664323375e-07, |
| "loss": 1.407160758972168, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.002531645569620253, |
| "grad_norm": 1.8221601247787476, |
| "learning_rate": 4.393673110720563e-07, |
| "loss": 1.376656174659729, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.0033755274261603376, |
| "grad_norm": 1.4831048250198364, |
| "learning_rate": 6.151142355008788e-07, |
| "loss": 1.247712254524231, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.004219409282700422, |
| "grad_norm": 1.668201208114624, |
| "learning_rate": 7.908611599297013e-07, |
| "loss": 1.2685163021087646, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.005063291139240506, |
| "grad_norm": 1.67417311668396, |
| "learning_rate": 9.666080843585237e-07, |
| "loss": 1.2942761182785034, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.00590717299578059, |
| "grad_norm": 1.7154079675674438, |
| "learning_rate": 1.1423550087873463e-06, |
| "loss": 1.3638604879379272, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.006751054852320675, |
| "grad_norm": 1.729427456855774, |
| "learning_rate": 1.3181019332161688e-06, |
| "loss": 1.3476728200912476, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.007594936708860759, |
| "grad_norm": 1.3813447952270508, |
| "learning_rate": 1.4938488576449913e-06, |
| "loss": 1.3476393222808838, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.008438818565400843, |
| "grad_norm": 1.557220458984375, |
| "learning_rate": 1.6695957820738139e-06, |
| "loss": 1.2449309825897217, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.009282700421940928, |
| "grad_norm": 1.1883500814437866, |
| "learning_rate": 1.8453427065026362e-06, |
| "loss": 1.3125361204147339, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.010126582278481013, |
| "grad_norm": 1.7290029525756836, |
| "learning_rate": 2.0210896309314587e-06, |
| "loss": 1.3724769353866577, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.010970464135021098, |
| "grad_norm": 1.5627557039260864, |
| "learning_rate": 2.1968365553602812e-06, |
| "loss": 1.3401387929916382, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.01181434599156118, |
| "grad_norm": 1.796866774559021, |
| "learning_rate": 2.3725834797891038e-06, |
| "loss": 1.365437388420105, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.012658227848101266, |
| "grad_norm": 1.7030404806137085, |
| "learning_rate": 2.5483304042179263e-06, |
| "loss": 1.2706533670425415, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.01350210970464135, |
| "grad_norm": 1.3186293840408325, |
| "learning_rate": 2.724077328646749e-06, |
| "loss": 1.3084994554519653, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.014345991561181435, |
| "grad_norm": 1.5762513875961304, |
| "learning_rate": 2.8998242530755714e-06, |
| "loss": 1.3259696960449219, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.015189873417721518, |
| "grad_norm": 1.422295331954956, |
| "learning_rate": 3.075571177504394e-06, |
| "loss": 1.3205676078796387, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.016033755274261603, |
| "grad_norm": 1.495523452758789, |
| "learning_rate": 3.2513181019332165e-06, |
| "loss": 1.3740568161010742, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.016877637130801686, |
| "grad_norm": 1.5112254619598389, |
| "learning_rate": 3.427065026362039e-06, |
| "loss": 1.321828842163086, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.017721518987341773, |
| "grad_norm": 1.4667807817459106, |
| "learning_rate": 3.602811950790861e-06, |
| "loss": 1.3673173189163208, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.018565400843881856, |
| "grad_norm": 1.6609723567962646, |
| "learning_rate": 3.7785588752196836e-06, |
| "loss": 1.3968093395233154, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.019409282700421943, |
| "grad_norm": 1.59381103515625, |
| "learning_rate": 3.954305799648506e-06, |
| "loss": 1.4295302629470825, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.020253164556962026, |
| "grad_norm": 1.1470608711242676, |
| "learning_rate": 4.130052724077329e-06, |
| "loss": 1.2536572217941284, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.02109704641350211, |
| "grad_norm": 1.2014588117599487, |
| "learning_rate": 4.305799648506151e-06, |
| "loss": 1.242217779159546, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.021940928270042195, |
| "grad_norm": 1.2327464818954468, |
| "learning_rate": 4.481546572934974e-06, |
| "loss": 1.2166963815689087, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.02278481012658228, |
| "grad_norm": 1.9708983898162842, |
| "learning_rate": 4.657293497363796e-06, |
| "loss": 1.25709867477417, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.02362869198312236, |
| "grad_norm": 1.180569052696228, |
| "learning_rate": 4.833040421792619e-06, |
| "loss": 1.2886158227920532, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.024472573839662448, |
| "grad_norm": 1.5029548406600952, |
| "learning_rate": 5.008787346221441e-06, |
| "loss": 1.29886794090271, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.02531645569620253, |
| "grad_norm": 1.5380216836929321, |
| "learning_rate": 5.184534270650264e-06, |
| "loss": 1.2387628555297852, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.026160337552742614, |
| "grad_norm": 1.572144865989685, |
| "learning_rate": 5.3602811950790864e-06, |
| "loss": 1.2177000045776367, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.0270042194092827, |
| "grad_norm": 1.4882780313491821, |
| "learning_rate": 5.536028119507909e-06, |
| "loss": 1.181516170501709, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.027848101265822784, |
| "grad_norm": 1.2982488870620728, |
| "learning_rate": 5.7117750439367315e-06, |
| "loss": 1.2101733684539795, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.02869198312236287, |
| "grad_norm": 1.5236955881118774, |
| "learning_rate": 5.887521968365554e-06, |
| "loss": 1.2277681827545166, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.029535864978902954, |
| "grad_norm": 1.4521006345748901, |
| "learning_rate": 6.0632688927943766e-06, |
| "loss": 1.1688424348831177, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.030379746835443037, |
| "grad_norm": 1.2352311611175537, |
| "learning_rate": 6.239015817223199e-06, |
| "loss": 1.273059368133545, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.031223628691983123, |
| "grad_norm": 1.3438209295272827, |
| "learning_rate": 6.414762741652021e-06, |
| "loss": 1.1609034538269043, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.032067510548523206, |
| "grad_norm": 1.9009398221969604, |
| "learning_rate": 6.590509666080843e-06, |
| "loss": 1.2508260011672974, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.03291139240506329, |
| "grad_norm": 1.6718412637710571, |
| "learning_rate": 6.766256590509666e-06, |
| "loss": 1.2524956464767456, |
| "step": 78 |
| }, |
| { |
| "epoch": 0.03375527426160337, |
| "grad_norm": 1.249891757965088, |
| "learning_rate": 6.942003514938488e-06, |
| "loss": 1.1472493410110474, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.03459915611814346, |
| "grad_norm": 1.4398653507232666, |
| "learning_rate": 7.117750439367312e-06, |
| "loss": 1.0845389366149902, |
| "step": 82 |
| }, |
| { |
| "epoch": 0.035443037974683546, |
| "grad_norm": 1.3701167106628418, |
| "learning_rate": 7.293497363796134e-06, |
| "loss": 1.1088868379592896, |
| "step": 84 |
| }, |
| { |
| "epoch": 0.036286919831223625, |
| "grad_norm": 1.277998924255371, |
| "learning_rate": 7.469244288224957e-06, |
| "loss": 1.1513772010803223, |
| "step": 86 |
| }, |
| { |
| "epoch": 0.03713080168776371, |
| "grad_norm": 1.4970002174377441, |
| "learning_rate": 7.644991212653779e-06, |
| "loss": 1.1385771036148071, |
| "step": 88 |
| }, |
| { |
| "epoch": 0.0379746835443038, |
| "grad_norm": 1.3384218215942383, |
| "learning_rate": 7.820738137082601e-06, |
| "loss": 1.1632680892944336, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.038818565400843885, |
| "grad_norm": 1.4317446947097778, |
| "learning_rate": 7.996485061511425e-06, |
| "loss": 1.2256064414978027, |
| "step": 92 |
| }, |
| { |
| "epoch": 0.039662447257383965, |
| "grad_norm": 1.8743640184402466, |
| "learning_rate": 8.172231985940246e-06, |
| "loss": 1.1935789585113525, |
| "step": 94 |
| }, |
| { |
| "epoch": 0.04050632911392405, |
| "grad_norm": 1.4789546728134155, |
| "learning_rate": 8.347978910369069e-06, |
| "loss": 1.1429362297058105, |
| "step": 96 |
| }, |
| { |
| "epoch": 0.04135021097046414, |
| "grad_norm": 1.658605694770813, |
| "learning_rate": 8.523725834797891e-06, |
| "loss": 1.1831508874893188, |
| "step": 98 |
| }, |
| { |
| "epoch": 0.04219409282700422, |
| "grad_norm": 1.5077892541885376, |
| "learning_rate": 8.699472759226714e-06, |
| "loss": 1.0539867877960205, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.04219409282700422, |
| "eval_loss": 1.138856053352356, |
| "eval_runtime": 859.7128, |
| "eval_samples_per_second": 2.451, |
| "eval_steps_per_second": 2.451, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.043037974683544304, |
| "grad_norm": 1.4335681200027466, |
| "learning_rate": 8.875219683655536e-06, |
| "loss": 1.0719901323318481, |
| "step": 102 |
| }, |
| { |
| "epoch": 0.04388185654008439, |
| "grad_norm": 1.7387681007385254, |
| "learning_rate": 9.050966608084359e-06, |
| "loss": 1.0654313564300537, |
| "step": 104 |
| }, |
| { |
| "epoch": 0.04472573839662447, |
| "grad_norm": 1.6071950197219849, |
| "learning_rate": 9.226713532513181e-06, |
| "loss": 1.0752698183059692, |
| "step": 106 |
| }, |
| { |
| "epoch": 0.04556962025316456, |
| "grad_norm": 1.40005362033844, |
| "learning_rate": 9.402460456942004e-06, |
| "loss": 1.1029763221740723, |
| "step": 108 |
| }, |
| { |
| "epoch": 0.046413502109704644, |
| "grad_norm": 2.2338669300079346, |
| "learning_rate": 9.578207381370826e-06, |
| "loss": 1.1157960891723633, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.04725738396624472, |
| "grad_norm": 1.4972727298736572, |
| "learning_rate": 9.753954305799649e-06, |
| "loss": 1.1095420122146606, |
| "step": 112 |
| }, |
| { |
| "epoch": 0.04810126582278481, |
| "grad_norm": 1.317979097366333, |
| "learning_rate": 9.929701230228471e-06, |
| "loss": 1.109113097190857, |
| "step": 114 |
| }, |
| { |
| "epoch": 0.048945147679324896, |
| "grad_norm": 1.496346116065979, |
| "learning_rate": 1.0105448154657294e-05, |
| "loss": 1.1055104732513428, |
| "step": 116 |
| }, |
| { |
| "epoch": 0.049789029535864976, |
| "grad_norm": 1.385406732559204, |
| "learning_rate": 1.0281195079086117e-05, |
| "loss": 1.118395209312439, |
| "step": 118 |
| }, |
| { |
| "epoch": 0.05063291139240506, |
| "grad_norm": 1.524222731590271, |
| "learning_rate": 1.0456942003514939e-05, |
| "loss": 1.1008446216583252, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.05147679324894515, |
| "grad_norm": 1.6308200359344482, |
| "learning_rate": 1.0632688927943762e-05, |
| "loss": 1.0891425609588623, |
| "step": 122 |
| }, |
| { |
| "epoch": 0.05232067510548523, |
| "grad_norm": 1.3681106567382812, |
| "learning_rate": 1.0808435852372584e-05, |
| "loss": 0.9080473184585571, |
| "step": 124 |
| }, |
| { |
| "epoch": 0.053164556962025315, |
| "grad_norm": 1.9429908990859985, |
| "learning_rate": 1.0984182776801407e-05, |
| "loss": 1.0337369441986084, |
| "step": 126 |
| }, |
| { |
| "epoch": 0.0540084388185654, |
| "grad_norm": 1.5830830335617065, |
| "learning_rate": 1.115992970123023e-05, |
| "loss": 1.0703333616256714, |
| "step": 128 |
| }, |
| { |
| "epoch": 0.05485232067510549, |
| "grad_norm": 1.4792555570602417, |
| "learning_rate": 1.1335676625659052e-05, |
| "loss": 1.004652738571167, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.05569620253164557, |
| "grad_norm": 1.7196226119995117, |
| "learning_rate": 1.1511423550087874e-05, |
| "loss": 0.9798293709754944, |
| "step": 132 |
| }, |
| { |
| "epoch": 0.056540084388185655, |
| "grad_norm": 1.8733659982681274, |
| "learning_rate": 1.1687170474516697e-05, |
| "loss": 1.0213249921798706, |
| "step": 134 |
| }, |
| { |
| "epoch": 0.05738396624472574, |
| "grad_norm": 1.3431142568588257, |
| "learning_rate": 1.186291739894552e-05, |
| "loss": 1.0358591079711914, |
| "step": 136 |
| }, |
| { |
| "epoch": 0.05822784810126582, |
| "grad_norm": 1.527864933013916, |
| "learning_rate": 1.2038664323374342e-05, |
| "loss": 0.9372249841690063, |
| "step": 138 |
| }, |
| { |
| "epoch": 0.05907172995780591, |
| "grad_norm": 1.5495563745498657, |
| "learning_rate": 1.2214411247803164e-05, |
| "loss": 1.0277758836746216, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.059915611814345994, |
| "grad_norm": 1.6792418956756592, |
| "learning_rate": 1.2390158172231985e-05, |
| "loss": 1.0349801778793335, |
| "step": 142 |
| }, |
| { |
| "epoch": 0.060759493670886074, |
| "grad_norm": 1.6468945741653442, |
| "learning_rate": 1.256590509666081e-05, |
| "loss": 0.9578297734260559, |
| "step": 144 |
| }, |
| { |
| "epoch": 0.06160337552742616, |
| "grad_norm": 1.7243824005126953, |
| "learning_rate": 1.2741652021089632e-05, |
| "loss": 1.0628854036331177, |
| "step": 146 |
| }, |
| { |
| "epoch": 0.06244725738396625, |
| "grad_norm": 1.7286981344223022, |
| "learning_rate": 1.2917398945518455e-05, |
| "loss": 0.9336449503898621, |
| "step": 148 |
| }, |
| { |
| "epoch": 0.06329113924050633, |
| "grad_norm": 1.6411832571029663, |
| "learning_rate": 1.3093145869947277e-05, |
| "loss": 0.953730583190918, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.06413502109704641, |
| "grad_norm": 1.8297001123428345, |
| "learning_rate": 1.3268892794376098e-05, |
| "loss": 1.051239013671875, |
| "step": 152 |
| }, |
| { |
| "epoch": 0.06497890295358649, |
| "grad_norm": 1.9660519361495972, |
| "learning_rate": 1.3444639718804922e-05, |
| "loss": 0.9955035448074341, |
| "step": 154 |
| }, |
| { |
| "epoch": 0.06582278481012659, |
| "grad_norm": 1.8423733711242676, |
| "learning_rate": 1.3620386643233743e-05, |
| "loss": 0.913300096988678, |
| "step": 156 |
| }, |
| { |
| "epoch": 0.06666666666666667, |
| "grad_norm": 1.9146347045898438, |
| "learning_rate": 1.3796133567662567e-05, |
| "loss": 1.0429846048355103, |
| "step": 158 |
| }, |
| { |
| "epoch": 0.06751054852320675, |
| "grad_norm": 1.6221821308135986, |
| "learning_rate": 1.3971880492091388e-05, |
| "loss": 1.0360238552093506, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.06835443037974684, |
| "grad_norm": 2.173283338546753, |
| "learning_rate": 1.4147627416520212e-05, |
| "loss": 1.0227266550064087, |
| "step": 162 |
| }, |
| { |
| "epoch": 0.06919831223628692, |
| "grad_norm": 1.7091665267944336, |
| "learning_rate": 1.4323374340949033e-05, |
| "loss": 1.0075194835662842, |
| "step": 164 |
| }, |
| { |
| "epoch": 0.070042194092827, |
| "grad_norm": 1.7219135761260986, |
| "learning_rate": 1.4499121265377857e-05, |
| "loss": 1.0044782161712646, |
| "step": 166 |
| }, |
| { |
| "epoch": 0.07088607594936709, |
| "grad_norm": 1.6558159589767456, |
| "learning_rate": 1.4674868189806678e-05, |
| "loss": 0.9393973350524902, |
| "step": 168 |
| }, |
| { |
| "epoch": 0.07172995780590717, |
| "grad_norm": 1.9362739324569702, |
| "learning_rate": 1.4850615114235502e-05, |
| "loss": 0.9955337643623352, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.07257383966244725, |
| "grad_norm": 1.7792853116989136, |
| "learning_rate": 1.5026362038664323e-05, |
| "loss": 0.9659126400947571, |
| "step": 172 |
| }, |
| { |
| "epoch": 0.07341772151898734, |
| "grad_norm": 1.7184511423110962, |
| "learning_rate": 1.5202108963093147e-05, |
| "loss": 0.9077855348587036, |
| "step": 174 |
| }, |
| { |
| "epoch": 0.07426160337552742, |
| "grad_norm": 1.5701428651809692, |
| "learning_rate": 1.537785588752197e-05, |
| "loss": 0.9305018782615662, |
| "step": 176 |
| }, |
| { |
| "epoch": 0.0751054852320675, |
| "grad_norm": 1.970229148864746, |
| "learning_rate": 1.555360281195079e-05, |
| "loss": 1.0211774110794067, |
| "step": 178 |
| }, |
| { |
| "epoch": 0.0759493670886076, |
| "grad_norm": 1.8410269021987915, |
| "learning_rate": 1.5729349736379615e-05, |
| "loss": 0.9479315876960754, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.07679324894514768, |
| "grad_norm": 1.8991246223449707, |
| "learning_rate": 1.5905096660808434e-05, |
| "loss": 1.0629050731658936, |
| "step": 182 |
| }, |
| { |
| "epoch": 0.07763713080168777, |
| "grad_norm": 1.8052008152008057, |
| "learning_rate": 1.608084358523726e-05, |
| "loss": 0.946983814239502, |
| "step": 184 |
| }, |
| { |
| "epoch": 0.07848101265822785, |
| "grad_norm": 1.547108769416809, |
| "learning_rate": 1.625659050966608e-05, |
| "loss": 0.9413356184959412, |
| "step": 186 |
| }, |
| { |
| "epoch": 0.07932489451476793, |
| "grad_norm": 1.9713538885116577, |
| "learning_rate": 1.6432337434094905e-05, |
| "loss": 0.9337888956069946, |
| "step": 188 |
| }, |
| { |
| "epoch": 0.08016877637130802, |
| "grad_norm": 1.708789348602295, |
| "learning_rate": 1.6608084358523728e-05, |
| "loss": 0.9816337823867798, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.0810126582278481, |
| "grad_norm": 1.815292477607727, |
| "learning_rate": 1.678383128295255e-05, |
| "loss": 1.017122507095337, |
| "step": 192 |
| }, |
| { |
| "epoch": 0.08185654008438818, |
| "grad_norm": 1.7950682640075684, |
| "learning_rate": 1.6959578207381373e-05, |
| "loss": 0.991599440574646, |
| "step": 194 |
| }, |
| { |
| "epoch": 0.08270042194092828, |
| "grad_norm": 1.692512035369873, |
| "learning_rate": 1.7135325131810195e-05, |
| "loss": 0.9570834040641785, |
| "step": 196 |
| }, |
| { |
| "epoch": 0.08354430379746836, |
| "grad_norm": 2.056089162826538, |
| "learning_rate": 1.7311072056239018e-05, |
| "loss": 1.035754919052124, |
| "step": 198 |
| }, |
| { |
| "epoch": 0.08438818565400844, |
| "grad_norm": 1.7022203207015991, |
| "learning_rate": 1.7486818980667837e-05, |
| "loss": 1.0124205350875854, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.08438818565400844, |
| "eval_loss": 0.995743453502655, |
| "eval_runtime": 846.8257, |
| "eval_samples_per_second": 2.488, |
| "eval_steps_per_second": 2.488, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.08523206751054853, |
| "grad_norm": 1.6088604927062988, |
| "learning_rate": 1.7662565905096663e-05, |
| "loss": 0.8946985006332397, |
| "step": 202 |
| }, |
| { |
| "epoch": 0.08607594936708861, |
| "grad_norm": 2.02270770072937, |
| "learning_rate": 1.7838312829525482e-05, |
| "loss": 0.976133406162262, |
| "step": 204 |
| }, |
| { |
| "epoch": 0.08691983122362869, |
| "grad_norm": 1.7832789421081543, |
| "learning_rate": 1.8014059753954308e-05, |
| "loss": 0.9079383611679077, |
| "step": 206 |
| }, |
| { |
| "epoch": 0.08776371308016878, |
| "grad_norm": 1.9793545007705688, |
| "learning_rate": 1.8189806678383127e-05, |
| "loss": 0.8650367856025696, |
| "step": 208 |
| }, |
| { |
| "epoch": 0.08860759493670886, |
| "grad_norm": 1.8124271631240845, |
| "learning_rate": 1.8365553602811953e-05, |
| "loss": 0.9327266812324524, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.08945147679324894, |
| "grad_norm": 1.8581212759017944, |
| "learning_rate": 1.8541300527240772e-05, |
| "loss": 0.9811079502105713, |
| "step": 212 |
| }, |
| { |
| "epoch": 0.09029535864978903, |
| "grad_norm": 2.001699447631836, |
| "learning_rate": 1.8717047451669598e-05, |
| "loss": 0.9546971321105957, |
| "step": 214 |
| }, |
| { |
| "epoch": 0.09113924050632911, |
| "grad_norm": 1.6994978189468384, |
| "learning_rate": 1.8892794376098417e-05, |
| "loss": 0.9611319899559021, |
| "step": 216 |
| }, |
| { |
| "epoch": 0.0919831223628692, |
| "grad_norm": 2.1379497051239014, |
| "learning_rate": 1.9068541300527243e-05, |
| "loss": 0.9781531095504761, |
| "step": 218 |
| }, |
| { |
| "epoch": 0.09282700421940929, |
| "grad_norm": 1.8961224555969238, |
| "learning_rate": 1.9244288224956066e-05, |
| "loss": 0.9374833106994629, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.09367088607594937, |
| "grad_norm": 1.851464033126831, |
| "learning_rate": 1.9420035149384885e-05, |
| "loss": 0.9681299328804016, |
| "step": 222 |
| }, |
| { |
| "epoch": 0.09451476793248945, |
| "grad_norm": 2.0642266273498535, |
| "learning_rate": 1.959578207381371e-05, |
| "loss": 1.0086225271224976, |
| "step": 224 |
| }, |
| { |
| "epoch": 0.09535864978902954, |
| "grad_norm": 1.8658756017684937, |
| "learning_rate": 1.977152899824253e-05, |
| "loss": 0.9190312623977661, |
| "step": 226 |
| }, |
| { |
| "epoch": 0.09620253164556962, |
| "grad_norm": 2.4398674964904785, |
| "learning_rate": 1.9947275922671356e-05, |
| "loss": 0.9740874171257019, |
| "step": 228 |
| }, |
| { |
| "epoch": 0.0970464135021097, |
| "grad_norm": 1.849183440208435, |
| "learning_rate": 2.0123022847100175e-05, |
| "loss": 0.884376049041748, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.09789029535864979, |
| "grad_norm": 2.027320384979248, |
| "learning_rate": 2.0298769771529e-05, |
| "loss": 0.9116487503051758, |
| "step": 232 |
| }, |
| { |
| "epoch": 0.09873417721518987, |
| "grad_norm": 1.6800135374069214, |
| "learning_rate": 2.047451669595782e-05, |
| "loss": 0.9035115242004395, |
| "step": 234 |
| }, |
| { |
| "epoch": 0.09957805907172995, |
| "grad_norm": 2.2362256050109863, |
| "learning_rate": 2.0650263620386646e-05, |
| "loss": 0.9043796062469482, |
| "step": 236 |
| }, |
| { |
| "epoch": 0.10042194092827005, |
| "grad_norm": 1.938215970993042, |
| "learning_rate": 2.0826010544815465e-05, |
| "loss": 1.0888828039169312, |
| "step": 238 |
| }, |
| { |
| "epoch": 0.10126582278481013, |
| "grad_norm": 1.890328049659729, |
| "learning_rate": 2.100175746924429e-05, |
| "loss": 0.9960280656814575, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.1021097046413502, |
| "grad_norm": 2.021235227584839, |
| "learning_rate": 2.117750439367311e-05, |
| "loss": 0.9848901629447937, |
| "step": 242 |
| }, |
| { |
| "epoch": 0.1029535864978903, |
| "grad_norm": 2.023920774459839, |
| "learning_rate": 2.1353251318101936e-05, |
| "loss": 0.891694188117981, |
| "step": 244 |
| }, |
| { |
| "epoch": 0.10379746835443038, |
| "grad_norm": 1.8061069250106812, |
| "learning_rate": 2.1528998242530755e-05, |
| "loss": 0.9059976935386658, |
| "step": 246 |
| }, |
| { |
| "epoch": 0.10464135021097046, |
| "grad_norm": 2.176302194595337, |
| "learning_rate": 2.1704745166959578e-05, |
| "loss": 1.0056109428405762, |
| "step": 248 |
| }, |
| { |
| "epoch": 0.10548523206751055, |
| "grad_norm": 1.9820969104766846, |
| "learning_rate": 2.18804920913884e-05, |
| "loss": 0.9645357728004456, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.10632911392405063, |
| "grad_norm": 1.8764572143554688, |
| "learning_rate": 2.2056239015817223e-05, |
| "loss": 1.0178182125091553, |
| "step": 252 |
| }, |
| { |
| "epoch": 0.10717299578059072, |
| "grad_norm": 2.56221342086792, |
| "learning_rate": 2.223198594024605e-05, |
| "loss": 0.9546761512756348, |
| "step": 254 |
| }, |
| { |
| "epoch": 0.1080168776371308, |
| "grad_norm": 2.6779074668884277, |
| "learning_rate": 2.2407732864674868e-05, |
| "loss": 0.9300968647003174, |
| "step": 256 |
| }, |
| { |
| "epoch": 0.10886075949367088, |
| "grad_norm": 2.140897512435913, |
| "learning_rate": 2.2583479789103694e-05, |
| "loss": 0.926638662815094, |
| "step": 258 |
| }, |
| { |
| "epoch": 0.10970464135021098, |
| "grad_norm": 2.0880508422851562, |
| "learning_rate": 2.2759226713532513e-05, |
| "loss": 1.0681840181350708, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.11054852320675106, |
| "grad_norm": 2.7273616790771484, |
| "learning_rate": 2.293497363796134e-05, |
| "loss": 1.0840941667556763, |
| "step": 262 |
| }, |
| { |
| "epoch": 0.11139240506329114, |
| "grad_norm": 1.6723874807357788, |
| "learning_rate": 2.3110720562390158e-05, |
| "loss": 0.8637182116508484, |
| "step": 264 |
| }, |
| { |
| "epoch": 0.11223628691983123, |
| "grad_norm": 1.806243896484375, |
| "learning_rate": 2.3286467486818984e-05, |
| "loss": 0.9554686546325684, |
| "step": 266 |
| }, |
| { |
| "epoch": 0.11308016877637131, |
| "grad_norm": 1.9086743593215942, |
| "learning_rate": 2.3462214411247803e-05, |
| "loss": 0.9556593894958496, |
| "step": 268 |
| }, |
| { |
| "epoch": 0.11392405063291139, |
| "grad_norm": 2.1822304725646973, |
| "learning_rate": 2.3637961335676626e-05, |
| "loss": 0.9177709817886353, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.11476793248945148, |
| "grad_norm": 2.1009039878845215, |
| "learning_rate": 2.3813708260105448e-05, |
| "loss": 0.9288759827613831, |
| "step": 272 |
| }, |
| { |
| "epoch": 0.11561181434599156, |
| "grad_norm": 1.9814810752868652, |
| "learning_rate": 2.398945518453427e-05, |
| "loss": 0.9881691932678223, |
| "step": 274 |
| }, |
| { |
| "epoch": 0.11645569620253164, |
| "grad_norm": 1.9946284294128418, |
| "learning_rate": 2.4165202108963093e-05, |
| "loss": 0.9390727281570435, |
| "step": 276 |
| }, |
| { |
| "epoch": 0.11729957805907174, |
| "grad_norm": 2.4489169120788574, |
| "learning_rate": 2.4340949033391916e-05, |
| "loss": 0.9625692963600159, |
| "step": 278 |
| }, |
| { |
| "epoch": 0.11814345991561181, |
| "grad_norm": 2.0919103622436523, |
| "learning_rate": 2.451669595782074e-05, |
| "loss": 0.9304702877998352, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.1189873417721519, |
| "grad_norm": 1.912914752960205, |
| "learning_rate": 2.469244288224956e-05, |
| "loss": 0.9313994646072388, |
| "step": 282 |
| }, |
| { |
| "epoch": 0.11983122362869199, |
| "grad_norm": 2.1553256511688232, |
| "learning_rate": 2.4868189806678387e-05, |
| "loss": 1.004011869430542, |
| "step": 284 |
| }, |
| { |
| "epoch": 0.12067510548523207, |
| "grad_norm": 2.0129058361053467, |
| "learning_rate": 2.504393673110721e-05, |
| "loss": 0.9092531204223633, |
| "step": 286 |
| }, |
| { |
| "epoch": 0.12151898734177215, |
| "grad_norm": 2.1632325649261475, |
| "learning_rate": 2.5219683655536032e-05, |
| "loss": 0.993347704410553, |
| "step": 288 |
| }, |
| { |
| "epoch": 0.12236286919831224, |
| "grad_norm": 2.3072738647460938, |
| "learning_rate": 2.539543057996485e-05, |
| "loss": 0.978348433971405, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.12320675105485232, |
| "grad_norm": 2.056560516357422, |
| "learning_rate": 2.5571177504393674e-05, |
| "loss": 1.0018101930618286, |
| "step": 292 |
| }, |
| { |
| "epoch": 0.1240506329113924, |
| "grad_norm": 1.8906747102737427, |
| "learning_rate": 2.5746924428822493e-05, |
| "loss": 0.9607775211334229, |
| "step": 294 |
| }, |
| { |
| "epoch": 0.1248945147679325, |
| "grad_norm": 2.1375651359558105, |
| "learning_rate": 2.5922671353251322e-05, |
| "loss": 0.9259153008460999, |
| "step": 296 |
| }, |
| { |
| "epoch": 0.1257383966244726, |
| "grad_norm": 1.9994823932647705, |
| "learning_rate": 2.609841827768014e-05, |
| "loss": 0.8524524569511414, |
| "step": 298 |
| }, |
| { |
| "epoch": 0.12658227848101267, |
| "grad_norm": 2.2421181201934814, |
| "learning_rate": 2.6274165202108964e-05, |
| "loss": 1.0047069787979126, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.12658227848101267, |
| "eval_loss": 0.9517185688018799, |
| "eval_runtime": 860.0287, |
| "eval_samples_per_second": 2.45, |
| "eval_steps_per_second": 2.45, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.12742616033755275, |
| "grad_norm": 2.1206254959106445, |
| "learning_rate": 2.6449912126537786e-05, |
| "loss": 0.8475471138954163, |
| "step": 302 |
| }, |
| { |
| "epoch": 0.12827004219409283, |
| "grad_norm": 1.885161280632019, |
| "learning_rate": 2.6625659050966612e-05, |
| "loss": 0.8643121123313904, |
| "step": 304 |
| }, |
| { |
| "epoch": 0.1291139240506329, |
| "grad_norm": 3.1441781520843506, |
| "learning_rate": 2.680140597539543e-05, |
| "loss": 0.8804612159729004, |
| "step": 306 |
| }, |
| { |
| "epoch": 0.12995780590717299, |
| "grad_norm": 1.953133225440979, |
| "learning_rate": 2.6977152899824254e-05, |
| "loss": 0.8348029255867004, |
| "step": 308 |
| }, |
| { |
| "epoch": 0.1308016877637131, |
| "grad_norm": 2.3762667179107666, |
| "learning_rate": 2.7152899824253076e-05, |
| "loss": 0.8889057040214539, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.13164556962025317, |
| "grad_norm": 2.4651103019714355, |
| "learning_rate": 2.7328646748681902e-05, |
| "loss": 1.025565505027771, |
| "step": 312 |
| }, |
| { |
| "epoch": 0.13248945147679325, |
| "grad_norm": 1.8522284030914307, |
| "learning_rate": 2.7504393673110725e-05, |
| "loss": 0.868915855884552, |
| "step": 314 |
| }, |
| { |
| "epoch": 0.13333333333333333, |
| "grad_norm": 1.8048083782196045, |
| "learning_rate": 2.7680140597539544e-05, |
| "loss": 0.8821638226509094, |
| "step": 316 |
| }, |
| { |
| "epoch": 0.1341772151898734, |
| "grad_norm": 1.9933605194091797, |
| "learning_rate": 2.7855887521968367e-05, |
| "loss": 0.8735360503196716, |
| "step": 318 |
| }, |
| { |
| "epoch": 0.1350210970464135, |
| "grad_norm": 2.044337034225464, |
| "learning_rate": 2.8031634446397186e-05, |
| "loss": 0.8288834691047668, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.1358649789029536, |
| "grad_norm": 2.416067361831665, |
| "learning_rate": 2.8207381370826015e-05, |
| "loss": 0.9104969501495361, |
| "step": 322 |
| }, |
| { |
| "epoch": 0.13670886075949368, |
| "grad_norm": 2.0731265544891357, |
| "learning_rate": 2.8383128295254834e-05, |
| "loss": 0.8689924478530884, |
| "step": 324 |
| }, |
| { |
| "epoch": 0.13755274261603376, |
| "grad_norm": 2.049126386642456, |
| "learning_rate": 2.8558875219683657e-05, |
| "loss": 0.9312222003936768, |
| "step": 326 |
| }, |
| { |
| "epoch": 0.13839662447257384, |
| "grad_norm": 2.131026268005371, |
| "learning_rate": 2.8734622144112476e-05, |
| "loss": 0.8933501839637756, |
| "step": 328 |
| }, |
| { |
| "epoch": 0.13924050632911392, |
| "grad_norm": 1.766754150390625, |
| "learning_rate": 2.8910369068541305e-05, |
| "loss": 0.8998261094093323, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.140084388185654, |
| "grad_norm": 2.197706460952759, |
| "learning_rate": 2.9086115992970124e-05, |
| "loss": 0.8826426267623901, |
| "step": 332 |
| }, |
| { |
| "epoch": 0.1409282700421941, |
| "grad_norm": 1.953715443611145, |
| "learning_rate": 2.9261862917398947e-05, |
| "loss": 0.8590307831764221, |
| "step": 334 |
| }, |
| { |
| "epoch": 0.14177215189873418, |
| "grad_norm": 2.200929880142212, |
| "learning_rate": 2.943760984182777e-05, |
| "loss": 0.9317060708999634, |
| "step": 336 |
| }, |
| { |
| "epoch": 0.14261603375527426, |
| "grad_norm": 2.1195082664489746, |
| "learning_rate": 2.961335676625659e-05, |
| "loss": 0.9965578317642212, |
| "step": 338 |
| }, |
| { |
| "epoch": 0.14345991561181434, |
| "grad_norm": 2.3449771404266357, |
| "learning_rate": 2.9789103690685414e-05, |
| "loss": 0.8353848457336426, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.14430379746835442, |
| "grad_norm": 2.000497579574585, |
| "learning_rate": 2.9964850615114237e-05, |
| "loss": 0.9154735803604126, |
| "step": 342 |
| }, |
| { |
| "epoch": 0.1451476793248945, |
| "grad_norm": 2.141890525817871, |
| "learning_rate": 3.014059753954306e-05, |
| "loss": 0.9530655741691589, |
| "step": 344 |
| }, |
| { |
| "epoch": 0.1459915611814346, |
| "grad_norm": 1.7717392444610596, |
| "learning_rate": 3.031634446397188e-05, |
| "loss": 0.896998405456543, |
| "step": 346 |
| }, |
| { |
| "epoch": 0.1468354430379747, |
| "grad_norm": 1.8796685934066772, |
| "learning_rate": 3.0492091388400708e-05, |
| "loss": 0.9084208011627197, |
| "step": 348 |
| }, |
| { |
| "epoch": 0.14767932489451477, |
| "grad_norm": 2.0298709869384766, |
| "learning_rate": 3.066783831282953e-05, |
| "loss": 0.9183387756347656, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.14852320675105485, |
| "grad_norm": 1.9245645999908447, |
| "learning_rate": 3.084358523725835e-05, |
| "loss": 0.8624772429466248, |
| "step": 352 |
| }, |
| { |
| "epoch": 0.14936708860759493, |
| "grad_norm": 2.325681209564209, |
| "learning_rate": 3.101933216168717e-05, |
| "loss": 0.9142400026321411, |
| "step": 354 |
| }, |
| { |
| "epoch": 0.150210970464135, |
| "grad_norm": 2.1200530529022217, |
| "learning_rate": 3.1195079086115995e-05, |
| "loss": 0.9064018130302429, |
| "step": 356 |
| }, |
| { |
| "epoch": 0.15105485232067511, |
| "grad_norm": 1.979314923286438, |
| "learning_rate": 3.137082601054482e-05, |
| "loss": 0.9199238419532776, |
| "step": 358 |
| }, |
| { |
| "epoch": 0.1518987341772152, |
| "grad_norm": 2.1122689247131348, |
| "learning_rate": 3.154657293497364e-05, |
| "loss": 0.8030132055282593, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.15274261603375527, |
| "grad_norm": 2.105767250061035, |
| "learning_rate": 3.172231985940246e-05, |
| "loss": 0.9185854196548462, |
| "step": 362 |
| }, |
| { |
| "epoch": 0.15358649789029535, |
| "grad_norm": 2.179471015930176, |
| "learning_rate": 3.1898066783831285e-05, |
| "loss": 0.9365083575248718, |
| "step": 364 |
| }, |
| { |
| "epoch": 0.15443037974683543, |
| "grad_norm": 2.1444311141967773, |
| "learning_rate": 3.207381370826011e-05, |
| "loss": 0.8965140581130981, |
| "step": 366 |
| }, |
| { |
| "epoch": 0.15527426160337554, |
| "grad_norm": 2.4171674251556396, |
| "learning_rate": 3.224956063268893e-05, |
| "loss": 0.8787504434585571, |
| "step": 368 |
| }, |
| { |
| "epoch": 0.15611814345991562, |
| "grad_norm": 2.418628215789795, |
| "learning_rate": 3.242530755711775e-05, |
| "loss": 0.8925284147262573, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.1569620253164557, |
| "grad_norm": 2.2228314876556396, |
| "learning_rate": 3.2601054481546575e-05, |
| "loss": 0.876179039478302, |
| "step": 372 |
| }, |
| { |
| "epoch": 0.15780590717299578, |
| "grad_norm": 2.324237108230591, |
| "learning_rate": 3.27768014059754e-05, |
| "loss": 0.8365707993507385, |
| "step": 374 |
| }, |
| { |
| "epoch": 0.15864978902953586, |
| "grad_norm": 2.6344552040100098, |
| "learning_rate": 3.295254833040422e-05, |
| "loss": 0.7864399552345276, |
| "step": 376 |
| }, |
| { |
| "epoch": 0.15949367088607594, |
| "grad_norm": 2.047536611557007, |
| "learning_rate": 3.312829525483304e-05, |
| "loss": 0.9271875023841858, |
| "step": 378 |
| }, |
| { |
| "epoch": 0.16033755274261605, |
| "grad_norm": 2.120025157928467, |
| "learning_rate": 3.3304042179261865e-05, |
| "loss": 0.8799133896827698, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.16118143459915613, |
| "grad_norm": 2.363692045211792, |
| "learning_rate": 3.347978910369069e-05, |
| "loss": 0.8973530530929565, |
| "step": 382 |
| }, |
| { |
| "epoch": 0.1620253164556962, |
| "grad_norm": 2.1796772480010986, |
| "learning_rate": 3.365553602811951e-05, |
| "loss": 1.0277652740478516, |
| "step": 384 |
| }, |
| { |
| "epoch": 0.16286919831223629, |
| "grad_norm": 1.9192595481872559, |
| "learning_rate": 3.383128295254833e-05, |
| "loss": 0.8909643888473511, |
| "step": 386 |
| }, |
| { |
| "epoch": 0.16371308016877636, |
| "grad_norm": 1.7874376773834229, |
| "learning_rate": 3.4007029876977155e-05, |
| "loss": 0.837049663066864, |
| "step": 388 |
| }, |
| { |
| "epoch": 0.16455696202531644, |
| "grad_norm": 2.3402366638183594, |
| "learning_rate": 3.4182776801405974e-05, |
| "loss": 0.8625202775001526, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.16540084388185655, |
| "grad_norm": 2.1137185096740723, |
| "learning_rate": 3.43585237258348e-05, |
| "loss": 0.9288321137428284, |
| "step": 392 |
| }, |
| { |
| "epoch": 0.16624472573839663, |
| "grad_norm": 2.3776895999908447, |
| "learning_rate": 3.453427065026362e-05, |
| "loss": 0.9328726530075073, |
| "step": 394 |
| }, |
| { |
| "epoch": 0.1670886075949367, |
| "grad_norm": 2.34941029548645, |
| "learning_rate": 3.4710017574692445e-05, |
| "loss": 0.9273309707641602, |
| "step": 396 |
| }, |
| { |
| "epoch": 0.1679324894514768, |
| "grad_norm": 2.1272573471069336, |
| "learning_rate": 3.4885764499121264e-05, |
| "loss": 0.8703887462615967, |
| "step": 398 |
| }, |
| { |
| "epoch": 0.16877637130801687, |
| "grad_norm": 2.047290802001953, |
| "learning_rate": 3.506151142355009e-05, |
| "loss": 0.8808165788650513, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.16877637130801687, |
| "eval_loss": 0.9282881617546082, |
| "eval_runtime": 869.6867, |
| "eval_samples_per_second": 2.423, |
| "eval_steps_per_second": 2.423, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.16962025316455695, |
| "grad_norm": 1.9874159097671509, |
| "learning_rate": 3.5237258347978916e-05, |
| "loss": 0.9643645286560059, |
| "step": 402 |
| }, |
| { |
| "epoch": 0.17046413502109706, |
| "grad_norm": 1.9299919605255127, |
| "learning_rate": 3.5413005272407735e-05, |
| "loss": 0.9173495769500732, |
| "step": 404 |
| }, |
| { |
| "epoch": 0.17130801687763714, |
| "grad_norm": 2.3379697799682617, |
| "learning_rate": 3.5588752196836555e-05, |
| "loss": 0.8998411893844604, |
| "step": 406 |
| }, |
| { |
| "epoch": 0.17215189873417722, |
| "grad_norm": 2.241370916366577, |
| "learning_rate": 3.5764499121265374e-05, |
| "loss": 0.9310802221298218, |
| "step": 408 |
| }, |
| { |
| "epoch": 0.1729957805907173, |
| "grad_norm": 2.4490108489990234, |
| "learning_rate": 3.5940246045694206e-05, |
| "loss": 0.9605053067207336, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.17383966244725738, |
| "grad_norm": 1.8247230052947998, |
| "learning_rate": 3.6115992970123026e-05, |
| "loss": 0.8485683798789978, |
| "step": 412 |
| }, |
| { |
| "epoch": 0.17468354430379746, |
| "grad_norm": 2.4608843326568604, |
| "learning_rate": 3.6291739894551845e-05, |
| "loss": 0.9325968623161316, |
| "step": 414 |
| }, |
| { |
| "epoch": 0.17552742616033756, |
| "grad_norm": 1.8923161029815674, |
| "learning_rate": 3.646748681898067e-05, |
| "loss": 0.9125096201896667, |
| "step": 416 |
| }, |
| { |
| "epoch": 0.17637130801687764, |
| "grad_norm": 1.8502769470214844, |
| "learning_rate": 3.6643233743409497e-05, |
| "loss": 0.8852217197418213, |
| "step": 418 |
| }, |
| { |
| "epoch": 0.17721518987341772, |
| "grad_norm": 1.9155100584030151, |
| "learning_rate": 3.6818980667838316e-05, |
| "loss": 0.9192792773246765, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.1780590717299578, |
| "grad_norm": 2.181476593017578, |
| "learning_rate": 3.6994727592267135e-05, |
| "loss": 0.8787404298782349, |
| "step": 422 |
| }, |
| { |
| "epoch": 0.17890295358649788, |
| "grad_norm": 2.2469847202301025, |
| "learning_rate": 3.717047451669596e-05, |
| "loss": 0.9109582901000977, |
| "step": 424 |
| }, |
| { |
| "epoch": 0.17974683544303796, |
| "grad_norm": 2.08145809173584, |
| "learning_rate": 3.734622144112479e-05, |
| "loss": 0.8560389280319214, |
| "step": 426 |
| }, |
| { |
| "epoch": 0.18059071729957807, |
| "grad_norm": 4.121932506561279, |
| "learning_rate": 3.7521968365553606e-05, |
| "loss": 0.9456104040145874, |
| "step": 428 |
| }, |
| { |
| "epoch": 0.18143459915611815, |
| "grad_norm": 2.177459478378296, |
| "learning_rate": 3.7697715289982425e-05, |
| "loss": 0.8421300649642944, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.18227848101265823, |
| "grad_norm": 2.324970245361328, |
| "learning_rate": 3.787346221441125e-05, |
| "loss": 0.9199858903884888, |
| "step": 432 |
| }, |
| { |
| "epoch": 0.1831223628691983, |
| "grad_norm": 2.133718490600586, |
| "learning_rate": 3.804920913884007e-05, |
| "loss": 0.8953126668930054, |
| "step": 434 |
| }, |
| { |
| "epoch": 0.1839662447257384, |
| "grad_norm": 1.8527995347976685, |
| "learning_rate": 3.8224956063268896e-05, |
| "loss": 0.8732239007949829, |
| "step": 436 |
| }, |
| { |
| "epoch": 0.1848101265822785, |
| "grad_norm": 1.95817232131958, |
| "learning_rate": 3.8400702987697715e-05, |
| "loss": 0.8818746209144592, |
| "step": 438 |
| }, |
| { |
| "epoch": 0.18565400843881857, |
| "grad_norm": 2.2107293605804443, |
| "learning_rate": 3.857644991212654e-05, |
| "loss": 0.9153507947921753, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.18649789029535865, |
| "grad_norm": 2.004754066467285, |
| "learning_rate": 3.875219683655536e-05, |
| "loss": 0.8960154056549072, |
| "step": 442 |
| }, |
| { |
| "epoch": 0.18734177215189873, |
| "grad_norm": 2.1851706504821777, |
| "learning_rate": 3.8927943760984186e-05, |
| "loss": 0.909011721611023, |
| "step": 444 |
| }, |
| { |
| "epoch": 0.1881856540084388, |
| "grad_norm": 2.4492485523223877, |
| "learning_rate": 3.9103690685413005e-05, |
| "loss": 0.8880158066749573, |
| "step": 446 |
| }, |
| { |
| "epoch": 0.1890295358649789, |
| "grad_norm": 2.745453119277954, |
| "learning_rate": 3.927943760984183e-05, |
| "loss": 0.8500842452049255, |
| "step": 448 |
| }, |
| { |
| "epoch": 0.189873417721519, |
| "grad_norm": 2.1924264430999756, |
| "learning_rate": 3.945518453427065e-05, |
| "loss": 0.9004045724868774, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.19071729957805908, |
| "grad_norm": 2.4051687717437744, |
| "learning_rate": 3.9630931458699476e-05, |
| "loss": 0.9020664095878601, |
| "step": 452 |
| }, |
| { |
| "epoch": 0.19156118143459916, |
| "grad_norm": 1.8077667951583862, |
| "learning_rate": 3.9806678383128295e-05, |
| "loss": 0.8639500737190247, |
| "step": 454 |
| }, |
| { |
| "epoch": 0.19240506329113924, |
| "grad_norm": 2.089043378829956, |
| "learning_rate": 3.998242530755712e-05, |
| "loss": 0.8642048239707947, |
| "step": 456 |
| }, |
| { |
| "epoch": 0.19324894514767932, |
| "grad_norm": 2.029578447341919, |
| "learning_rate": 4.015817223198594e-05, |
| "loss": 0.9371927380561829, |
| "step": 458 |
| }, |
| { |
| "epoch": 0.1940928270042194, |
| "grad_norm": 2.26582407951355, |
| "learning_rate": 4.033391915641476e-05, |
| "loss": 0.9120588302612305, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.1949367088607595, |
| "grad_norm": 1.8671411275863647, |
| "learning_rate": 4.050966608084359e-05, |
| "loss": 0.8758644461631775, |
| "step": 462 |
| }, |
| { |
| "epoch": 0.19578059071729959, |
| "grad_norm": 1.9403492212295532, |
| "learning_rate": 4.068541300527241e-05, |
| "loss": 0.914577305316925, |
| "step": 464 |
| }, |
| { |
| "epoch": 0.19662447257383966, |
| "grad_norm": 1.9939641952514648, |
| "learning_rate": 4.086115992970123e-05, |
| "loss": 0.8592531681060791, |
| "step": 466 |
| }, |
| { |
| "epoch": 0.19746835443037974, |
| "grad_norm": 2.1511380672454834, |
| "learning_rate": 4.103690685413005e-05, |
| "loss": 0.9251965880393982, |
| "step": 468 |
| }, |
| { |
| "epoch": 0.19831223628691982, |
| "grad_norm": 2.2260982990264893, |
| "learning_rate": 4.121265377855888e-05, |
| "loss": 0.8465172052383423, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.1991561181434599, |
| "grad_norm": 2.0510010719299316, |
| "learning_rate": 4.13884007029877e-05, |
| "loss": 0.8943672180175781, |
| "step": 472 |
| }, |
| { |
| "epoch": 0.2, |
| "grad_norm": 2.2040133476257324, |
| "learning_rate": 4.156414762741652e-05, |
| "loss": 0.9594319462776184, |
| "step": 474 |
| }, |
| { |
| "epoch": 0.2008438818565401, |
| "grad_norm": 2.355181932449341, |
| "learning_rate": 4.173989455184534e-05, |
| "loss": 0.9031813144683838, |
| "step": 476 |
| }, |
| { |
| "epoch": 0.20168776371308017, |
| "grad_norm": 2.8434665203094482, |
| "learning_rate": 4.1915641476274166e-05, |
| "loss": 0.9225798845291138, |
| "step": 478 |
| }, |
| { |
| "epoch": 0.20253164556962025, |
| "grad_norm": 2.1715340614318848, |
| "learning_rate": 4.209138840070299e-05, |
| "loss": 0.894163966178894, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.20337552742616033, |
| "grad_norm": 2.078916072845459, |
| "learning_rate": 4.226713532513181e-05, |
| "loss": 0.8424109816551208, |
| "step": 482 |
| }, |
| { |
| "epoch": 0.2042194092827004, |
| "grad_norm": 1.9760961532592773, |
| "learning_rate": 4.244288224956064e-05, |
| "loss": 0.9102715849876404, |
| "step": 484 |
| }, |
| { |
| "epoch": 0.20506329113924052, |
| "grad_norm": 1.9684507846832275, |
| "learning_rate": 4.2618629173989456e-05, |
| "loss": 0.8693854808807373, |
| "step": 486 |
| }, |
| { |
| "epoch": 0.2059071729957806, |
| "grad_norm": 2.1633450984954834, |
| "learning_rate": 4.279437609841828e-05, |
| "loss": 0.8617543578147888, |
| "step": 488 |
| }, |
| { |
| "epoch": 0.20675105485232068, |
| "grad_norm": 2.2695257663726807, |
| "learning_rate": 4.29701230228471e-05, |
| "loss": 0.9167086482048035, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.20759493670886076, |
| "grad_norm": 2.4180049896240234, |
| "learning_rate": 4.314586994727593e-05, |
| "loss": 0.8333520889282227, |
| "step": 492 |
| }, |
| { |
| "epoch": 0.20843881856540084, |
| "grad_norm": 2.2942769527435303, |
| "learning_rate": 4.3321616871704746e-05, |
| "loss": 0.918351411819458, |
| "step": 494 |
| }, |
| { |
| "epoch": 0.20928270042194091, |
| "grad_norm": 1.826458215713501, |
| "learning_rate": 4.349736379613357e-05, |
| "loss": 0.8565171957015991, |
| "step": 496 |
| }, |
| { |
| "epoch": 0.21012658227848102, |
| "grad_norm": 1.9694055318832397, |
| "learning_rate": 4.367311072056239e-05, |
| "loss": 0.8684167861938477, |
| "step": 498 |
| }, |
| { |
| "epoch": 0.2109704641350211, |
| "grad_norm": 1.892659306526184, |
| "learning_rate": 4.384885764499122e-05, |
| "loss": 0.7752788662910461, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.2109704641350211, |
| "eval_loss": 0.9080732464790344, |
| "eval_runtime": 857.0753, |
| "eval_samples_per_second": 2.458, |
| "eval_steps_per_second": 2.458, |
| "step": 500 |
| } |
| ], |
| "logging_steps": 2, |
| "max_steps": 14220, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 6, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "EarlyStoppingCallback": { |
| "args": { |
| "early_stopping_patience": 5, |
| "early_stopping_threshold": 0.001 |
| }, |
| "attributes": { |
| "early_stopping_patience_counter": 0 |
| } |
| }, |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 5.1928835720736154e+17, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|