| { |
| "best_global_step": 24000, |
| "best_metric": 0.5216008424758911, |
| "best_model_checkpoint": "/content/drive/MyDrive/fyp-2025/ModelFinetuniningData/Mistral-7b-instruct-customerservice/outputs/checkpoint-24000", |
| "epoch": 3.0, |
| "eval_steps": 2000, |
| "global_step": 24063, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.006233636703652911, |
| "grad_norm": 7.343134880065918, |
| "learning_rate": 8.139534883720931e-07, |
| "loss": 2.5324, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.012467273407305822, |
| "grad_norm": 4.1025919914245605, |
| "learning_rate": 1.6445182724252492e-06, |
| "loss": 2.1604, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.018700910110958733, |
| "grad_norm": 1.4959216117858887, |
| "learning_rate": 2.4750830564784057e-06, |
| "loss": 1.2872, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.024934546814611644, |
| "grad_norm": 1.0798927545547485, |
| "learning_rate": 3.305647840531562e-06, |
| "loss": 0.9336, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.031168183518264555, |
| "grad_norm": 1.264250636100769, |
| "learning_rate": 4.136212624584718e-06, |
| "loss": 0.8438, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.037401820221917466, |
| "grad_norm": 1.5219762325286865, |
| "learning_rate": 4.966777408637874e-06, |
| "loss": 0.8018, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.04363545692557038, |
| "grad_norm": 1.3620986938476562, |
| "learning_rate": 5.79734219269103e-06, |
| "loss": 0.7756, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.04986909362922329, |
| "grad_norm": 1.436906099319458, |
| "learning_rate": 6.627906976744186e-06, |
| "loss": 0.7417, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.0561027303328762, |
| "grad_norm": 1.5860514640808105, |
| "learning_rate": 7.4584717607973425e-06, |
| "loss": 0.7307, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.06233636703652911, |
| "grad_norm": 1.6227495670318604, |
| "learning_rate": 8.2890365448505e-06, |
| "loss": 0.7104, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.06857000374018202, |
| "grad_norm": 1.5297900438308716, |
| "learning_rate": 9.119601328903655e-06, |
| "loss": 0.6905, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.07480364044383493, |
| "grad_norm": 1.5481727123260498, |
| "learning_rate": 9.950166112956811e-06, |
| "loss": 0.675, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.08103727714748785, |
| "grad_norm": 1.565331220626831, |
| "learning_rate": 1.0780730897009968e-05, |
| "loss": 0.6703, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.08727091385114076, |
| "grad_norm": 1.5457186698913574, |
| "learning_rate": 1.1611295681063124e-05, |
| "loss": 0.6644, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.09350455055479366, |
| "grad_norm": 1.7019542455673218, |
| "learning_rate": 1.244186046511628e-05, |
| "loss": 0.6624, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.09973818725844658, |
| "grad_norm": 1.5325978994369507, |
| "learning_rate": 1.3272425249169436e-05, |
| "loss": 0.6505, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.10597182396209949, |
| "grad_norm": 1.5614399909973145, |
| "learning_rate": 1.4102990033222592e-05, |
| "loss": 0.6345, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.1122054606657524, |
| "grad_norm": 1.5235605239868164, |
| "learning_rate": 1.4933554817275748e-05, |
| "loss": 0.6409, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.1184390973694053, |
| "grad_norm": 1.5380282402038574, |
| "learning_rate": 1.5764119601328905e-05, |
| "loss": 0.6352, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.12467273407305822, |
| "grad_norm": 1.4291930198669434, |
| "learning_rate": 1.659468438538206e-05, |
| "loss": 0.6402, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.13090637077671113, |
| "grad_norm": 1.4465759992599487, |
| "learning_rate": 1.7425249169435217e-05, |
| "loss": 0.6301, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.13714000748036403, |
| "grad_norm": 1.5303502082824707, |
| "learning_rate": 1.825581395348837e-05, |
| "loss": 0.6277, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.14337364418401696, |
| "grad_norm": 1.3553271293640137, |
| "learning_rate": 1.908637873754153e-05, |
| "loss": 0.6183, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.14960728088766986, |
| "grad_norm": 1.3562167882919312, |
| "learning_rate": 1.9916943521594686e-05, |
| "loss": 0.6284, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.15584091759132276, |
| "grad_norm": 1.3724236488342285, |
| "learning_rate": 1.999980875991073e-05, |
| "loss": 0.6299, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.1620745542949757, |
| "grad_norm": 1.3770631551742554, |
| "learning_rate": 1.9999147692449697e-05, |
| "loss": 0.6206, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.1683081909986286, |
| "grad_norm": 1.3478386402130127, |
| "learning_rate": 1.999801446783615e-05, |
| "loss": 0.6145, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.17454182770228152, |
| "grad_norm": 1.395735740661621, |
| "learning_rate": 1.9996409139580664e-05, |
| "loss": 0.6173, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.18077546440593442, |
| "grad_norm": 1.3741596937179565, |
| "learning_rate": 1.9994331783486415e-05, |
| "loss": 0.6139, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.18700910110958732, |
| "grad_norm": 1.259312629699707, |
| "learning_rate": 1.9991782497645624e-05, |
| "loss": 0.6039, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.19324273781324025, |
| "grad_norm": 1.2414480447769165, |
| "learning_rate": 1.99887614024349e-05, |
| "loss": 0.6081, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.19947637451689315, |
| "grad_norm": 1.2906510829925537, |
| "learning_rate": 1.9985268640509576e-05, |
| "loss": 0.6048, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.20571001122054608, |
| "grad_norm": 1.2173662185668945, |
| "learning_rate": 1.998130437679696e-05, |
| "loss": 0.6133, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.21194364792419898, |
| "grad_norm": 1.2505919933319092, |
| "learning_rate": 1.997686879848855e-05, |
| "loss": 0.6012, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.21817728462785188, |
| "grad_norm": 1.214409589767456, |
| "learning_rate": 1.997196211503121e-05, |
| "loss": 0.6105, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.2244109213315048, |
| "grad_norm": 1.2314575910568237, |
| "learning_rate": 1.9966584558117242e-05, |
| "loss": 0.6, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.2306445580351577, |
| "grad_norm": 1.1521075963974, |
| "learning_rate": 1.9960736381673492e-05, |
| "loss": 0.6025, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.2368781947388106, |
| "grad_norm": 1.131106972694397, |
| "learning_rate": 1.9954417861849332e-05, |
| "loss": 0.6036, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.24311183144246354, |
| "grad_norm": 1.252383828163147, |
| "learning_rate": 1.994762929700362e-05, |
| "loss": 0.5986, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.24934546814611644, |
| "grad_norm": 1.1381412744522095, |
| "learning_rate": 1.9940371007690626e-05, |
| "loss": 0.5949, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.24934546814611644, |
| "eval_loss": 0.5942733287811279, |
| "eval_runtime": 346.2523, |
| "eval_samples_per_second": 52.947, |
| "eval_steps_per_second": 6.619, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.25557910484976937, |
| "grad_norm": 1.1564639806747437, |
| "learning_rate": 1.9932643336644877e-05, |
| "loss": 0.5945, |
| "step": 2050 |
| }, |
| { |
| "epoch": 0.26181274155342227, |
| "grad_norm": 1.1378273963928223, |
| "learning_rate": 1.9924446648764995e-05, |
| "loss": 0.5905, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.26804637825707517, |
| "grad_norm": 1.1501036882400513, |
| "learning_rate": 1.991578133109645e-05, |
| "loss": 0.589, |
| "step": 2150 |
| }, |
| { |
| "epoch": 0.27428001496072807, |
| "grad_norm": 1.1266804933547974, |
| "learning_rate": 1.990664779281328e-05, |
| "loss": 0.5849, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.280513651664381, |
| "grad_norm": 1.1410062313079834, |
| "learning_rate": 1.9897046465198794e-05, |
| "loss": 0.5864, |
| "step": 2250 |
| }, |
| { |
| "epoch": 0.2867472883680339, |
| "grad_norm": 1.0691190958023071, |
| "learning_rate": 1.9886977801625176e-05, |
| "loss": 0.5846, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.2929809250716868, |
| "grad_norm": 1.2022607326507568, |
| "learning_rate": 1.987644227753211e-05, |
| "loss": 0.5928, |
| "step": 2350 |
| }, |
| { |
| "epoch": 0.2992145617753397, |
| "grad_norm": 1.1218560934066772, |
| "learning_rate": 1.98654403904043e-05, |
| "loss": 0.591, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.3054481984789926, |
| "grad_norm": 1.0694878101348877, |
| "learning_rate": 1.9853972659747986e-05, |
| "loss": 0.5748, |
| "step": 2450 |
| }, |
| { |
| "epoch": 0.31168183518264553, |
| "grad_norm": 1.09988534450531, |
| "learning_rate": 1.9842039627066433e-05, |
| "loss": 0.5794, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.3179154718862985, |
| "grad_norm": 1.0877939462661743, |
| "learning_rate": 1.982964185583434e-05, |
| "loss": 0.5842, |
| "step": 2550 |
| }, |
| { |
| "epoch": 0.3241491085899514, |
| "grad_norm": 1.0780099630355835, |
| "learning_rate": 1.9816779931471238e-05, |
| "loss": 0.5893, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.3303827452936043, |
| "grad_norm": 1.0458643436431885, |
| "learning_rate": 1.980345446131385e-05, |
| "loss": 0.5866, |
| "step": 2650 |
| }, |
| { |
| "epoch": 0.3366163819972572, |
| "grad_norm": 1.0329034328460693, |
| "learning_rate": 1.9789666074587405e-05, |
| "loss": 0.5882, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.3428500187009101, |
| "grad_norm": 1.116700530052185, |
| "learning_rate": 1.9775415422375942e-05, |
| "loss": 0.5853, |
| "step": 2750 |
| }, |
| { |
| "epoch": 0.34908365540456304, |
| "grad_norm": 1.059531807899475, |
| "learning_rate": 1.9760703177591547e-05, |
| "loss": 0.5842, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.35531729210821594, |
| "grad_norm": 1.053161859512329, |
| "learning_rate": 1.9745530034942594e-05, |
| "loss": 0.5884, |
| "step": 2850 |
| }, |
| { |
| "epoch": 0.36155092881186884, |
| "grad_norm": 1.052735686302185, |
| "learning_rate": 1.9729896710900927e-05, |
| "loss": 0.5783, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.36778456551552174, |
| "grad_norm": 1.0767582654953003, |
| "learning_rate": 1.9713803943668045e-05, |
| "loss": 0.5813, |
| "step": 2950 |
| }, |
| { |
| "epoch": 0.37401820221917464, |
| "grad_norm": 1.0745905637741089, |
| "learning_rate": 1.9697252493140228e-05, |
| "loss": 0.5725, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.3802518389228276, |
| "grad_norm": 1.0551797151565552, |
| "learning_rate": 1.9680243140872664e-05, |
| "loss": 0.5772, |
| "step": 3050 |
| }, |
| { |
| "epoch": 0.3864854756264805, |
| "grad_norm": 1.0367578268051147, |
| "learning_rate": 1.966277669004254e-05, |
| "loss": 0.5843, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.3927191123301334, |
| "grad_norm": 1.0583189725875854, |
| "learning_rate": 1.9644853965411125e-05, |
| "loss": 0.57, |
| "step": 3150 |
| }, |
| { |
| "epoch": 0.3989527490337863, |
| "grad_norm": 1.012893557548523, |
| "learning_rate": 1.96264758132848e-05, |
| "loss": 0.5798, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.4051863857374392, |
| "grad_norm": 1.0265179872512817, |
| "learning_rate": 1.9607643101475146e-05, |
| "loss": 0.5714, |
| "step": 3250 |
| }, |
| { |
| "epoch": 0.41142002244109216, |
| "grad_norm": 1.0228148698806763, |
| "learning_rate": 1.95883567192579e-05, |
| "loss": 0.5754, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.41765365914474506, |
| "grad_norm": 1.0399527549743652, |
| "learning_rate": 1.9568617577331014e-05, |
| "loss": 0.5758, |
| "step": 3350 |
| }, |
| { |
| "epoch": 0.42388729584839796, |
| "grad_norm": 0.999904453754425, |
| "learning_rate": 1.954842660777164e-05, |
| "loss": 0.571, |
| "step": 3400 |
| }, |
| { |
| "epoch": 0.43012093255205086, |
| "grad_norm": 1.0446258783340454, |
| "learning_rate": 1.9527784763992106e-05, |
| "loss": 0.5751, |
| "step": 3450 |
| }, |
| { |
| "epoch": 0.43635456925570376, |
| "grad_norm": 0.9872069358825684, |
| "learning_rate": 1.9506693020694904e-05, |
| "loss": 0.5754, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.44258820595935666, |
| "grad_norm": 1.0320584774017334, |
| "learning_rate": 1.948515237382666e-05, |
| "loss": 0.573, |
| "step": 3550 |
| }, |
| { |
| "epoch": 0.4488218426630096, |
| "grad_norm": 0.9758449792861938, |
| "learning_rate": 1.9463163840531125e-05, |
| "loss": 0.5741, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.4550554793666625, |
| "grad_norm": 0.9836538434028625, |
| "learning_rate": 1.9440728459101112e-05, |
| "loss": 0.571, |
| "step": 3650 |
| }, |
| { |
| "epoch": 0.4612891160703154, |
| "grad_norm": 0.9504910111427307, |
| "learning_rate": 1.9417847288929495e-05, |
| "loss": 0.5725, |
| "step": 3700 |
| }, |
| { |
| "epoch": 0.4675227527739683, |
| "grad_norm": 1.0080476999282837, |
| "learning_rate": 1.9394521410459182e-05, |
| "loss": 0.5669, |
| "step": 3750 |
| }, |
| { |
| "epoch": 0.4737563894776212, |
| "grad_norm": 1.0220303535461426, |
| "learning_rate": 1.9370751925132082e-05, |
| "loss": 0.5738, |
| "step": 3800 |
| }, |
| { |
| "epoch": 0.4799900261812742, |
| "grad_norm": 0.9933121204376221, |
| "learning_rate": 1.9346539955337113e-05, |
| "loss": 0.5613, |
| "step": 3850 |
| }, |
| { |
| "epoch": 0.4862236628849271, |
| "grad_norm": 0.9892274141311646, |
| "learning_rate": 1.9321886644357178e-05, |
| "loss": 0.5624, |
| "step": 3900 |
| }, |
| { |
| "epoch": 0.49245729958858, |
| "grad_norm": 1.0384228229522705, |
| "learning_rate": 1.9296793156315216e-05, |
| "loss": 0.5649, |
| "step": 3950 |
| }, |
| { |
| "epoch": 0.4986909362922329, |
| "grad_norm": 1.04031240940094, |
| "learning_rate": 1.9271260676119205e-05, |
| "loss": 0.5648, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.4986909362922329, |
| "eval_loss": 0.5658419728279114, |
| "eval_runtime": 345.5066, |
| "eval_samples_per_second": 53.061, |
| "eval_steps_per_second": 6.634, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.5049245729958858, |
| "grad_norm": 1.0244519710540771, |
| "learning_rate": 1.924529040940621e-05, |
| "loss": 0.5726, |
| "step": 4050 |
| }, |
| { |
| "epoch": 0.5111582096995387, |
| "grad_norm": 1.0034270286560059, |
| "learning_rate": 1.9218883582485476e-05, |
| "loss": 0.5654, |
| "step": 4100 |
| }, |
| { |
| "epoch": 0.5173918464031916, |
| "grad_norm": 0.9908378720283508, |
| "learning_rate": 1.9192041442280494e-05, |
| "loss": 0.5658, |
| "step": 4150 |
| }, |
| { |
| "epoch": 0.5236254831068445, |
| "grad_norm": 0.9905117750167847, |
| "learning_rate": 1.916476525627014e-05, |
| "loss": 0.5645, |
| "step": 4200 |
| }, |
| { |
| "epoch": 0.5298591198104975, |
| "grad_norm": 0.9901898503303528, |
| "learning_rate": 1.9137056312428827e-05, |
| "loss": 0.5647, |
| "step": 4250 |
| }, |
| { |
| "epoch": 0.5360927565141503, |
| "grad_norm": 0.9914993643760681, |
| "learning_rate": 1.910891591916567e-05, |
| "loss": 0.5721, |
| "step": 4300 |
| }, |
| { |
| "epoch": 0.5423263932178033, |
| "grad_norm": 0.9866006970405579, |
| "learning_rate": 1.908034540526272e-05, |
| "loss": 0.5657, |
| "step": 4350 |
| }, |
| { |
| "epoch": 0.5485600299214561, |
| "grad_norm": 0.9906086921691895, |
| "learning_rate": 1.9051346119812208e-05, |
| "loss": 0.5632, |
| "step": 4400 |
| }, |
| { |
| "epoch": 0.5547936666251091, |
| "grad_norm": 0.8937438726425171, |
| "learning_rate": 1.902191943215285e-05, |
| "loss": 0.5642, |
| "step": 4450 |
| }, |
| { |
| "epoch": 0.561027303328762, |
| "grad_norm": 0.9465627074241638, |
| "learning_rate": 1.8992066731805175e-05, |
| "loss": 0.5677, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.5672609400324149, |
| "grad_norm": 0.9565852284431458, |
| "learning_rate": 1.8961789428405933e-05, |
| "loss": 0.5674, |
| "step": 4550 |
| }, |
| { |
| "epoch": 0.5734945767360679, |
| "grad_norm": 0.949268639087677, |
| "learning_rate": 1.8931088951641512e-05, |
| "loss": 0.5619, |
| "step": 4600 |
| }, |
| { |
| "epoch": 0.5797282134397207, |
| "grad_norm": 0.9713729023933411, |
| "learning_rate": 1.8899966751180435e-05, |
| "loss": 0.5625, |
| "step": 4650 |
| }, |
| { |
| "epoch": 0.5859618501433737, |
| "grad_norm": 0.9643019437789917, |
| "learning_rate": 1.8868424296604913e-05, |
| "loss": 0.5548, |
| "step": 4700 |
| }, |
| { |
| "epoch": 0.5921954868470266, |
| "grad_norm": 1.0023061037063599, |
| "learning_rate": 1.8836463077341447e-05, |
| "loss": 0.5649, |
| "step": 4750 |
| }, |
| { |
| "epoch": 0.5984291235506795, |
| "grad_norm": 0.9449487924575806, |
| "learning_rate": 1.880408460259049e-05, |
| "loss": 0.5605, |
| "step": 4800 |
| }, |
| { |
| "epoch": 0.6046627602543324, |
| "grad_norm": 0.9681399464607239, |
| "learning_rate": 1.8771290401255194e-05, |
| "loss": 0.5569, |
| "step": 4850 |
| }, |
| { |
| "epoch": 0.6108963969579853, |
| "grad_norm": 1.015436053276062, |
| "learning_rate": 1.873808202186922e-05, |
| "loss": 0.5593, |
| "step": 4900 |
| }, |
| { |
| "epoch": 0.6171300336616382, |
| "grad_norm": 0.9706466197967529, |
| "learning_rate": 1.87044610325236e-05, |
| "loss": 0.5576, |
| "step": 4950 |
| }, |
| { |
| "epoch": 0.6233636703652911, |
| "grad_norm": 0.9654951691627502, |
| "learning_rate": 1.8670429020792703e-05, |
| "loss": 0.5588, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.629597307068944, |
| "grad_norm": 0.9507623910903931, |
| "learning_rate": 1.8635987593659274e-05, |
| "loss": 0.5573, |
| "step": 5050 |
| }, |
| { |
| "epoch": 0.635830943772597, |
| "grad_norm": 0.9596995115280151, |
| "learning_rate": 1.860113837743853e-05, |
| "loss": 0.5581, |
| "step": 5100 |
| }, |
| { |
| "epoch": 0.6420645804762498, |
| "grad_norm": 0.956089973449707, |
| "learning_rate": 1.8565883017701404e-05, |
| "loss": 0.5549, |
| "step": 5150 |
| }, |
| { |
| "epoch": 0.6482982171799028, |
| "grad_norm": 0.9539560675621033, |
| "learning_rate": 1.8530223179196807e-05, |
| "loss": 0.5514, |
| "step": 5200 |
| }, |
| { |
| "epoch": 0.6545318538835556, |
| "grad_norm": 0.9579432010650635, |
| "learning_rate": 1.8494160545773036e-05, |
| "loss": 0.562, |
| "step": 5250 |
| }, |
| { |
| "epoch": 0.6607654905872086, |
| "grad_norm": 0.921850860118866, |
| "learning_rate": 1.8457696820298253e-05, |
| "loss": 0.5572, |
| "step": 5300 |
| }, |
| { |
| "epoch": 0.6669991272908615, |
| "grad_norm": 0.9380226731300354, |
| "learning_rate": 1.84208337245801e-05, |
| "loss": 0.5538, |
| "step": 5350 |
| }, |
| { |
| "epoch": 0.6732327639945144, |
| "grad_norm": 0.8936309218406677, |
| "learning_rate": 1.8383572999284353e-05, |
| "loss": 0.5513, |
| "step": 5400 |
| }, |
| { |
| "epoch": 0.6794664006981673, |
| "grad_norm": 0.9373565912246704, |
| "learning_rate": 1.8345916403852777e-05, |
| "loss": 0.5623, |
| "step": 5450 |
| }, |
| { |
| "epoch": 0.6857000374018202, |
| "grad_norm": 0.9267479777336121, |
| "learning_rate": 1.8307865716420005e-05, |
| "loss": 0.5535, |
| "step": 5500 |
| }, |
| { |
| "epoch": 0.6919336741054731, |
| "grad_norm": 0.9392421245574951, |
| "learning_rate": 1.8269422733729597e-05, |
| "loss": 0.5546, |
| "step": 5550 |
| }, |
| { |
| "epoch": 0.6981673108091261, |
| "grad_norm": 0.9371375441551208, |
| "learning_rate": 1.8230589271049196e-05, |
| "loss": 0.5573, |
| "step": 5600 |
| }, |
| { |
| "epoch": 0.7044009475127789, |
| "grad_norm": 0.9446937441825867, |
| "learning_rate": 1.819136716208481e-05, |
| "loss": 0.5523, |
| "step": 5650 |
| }, |
| { |
| "epoch": 0.7106345842164319, |
| "grad_norm": 0.915303111076355, |
| "learning_rate": 1.815175825889421e-05, |
| "loss": 0.5566, |
| "step": 5700 |
| }, |
| { |
| "epoch": 0.7168682209200847, |
| "grad_norm": 0.9039698839187622, |
| "learning_rate": 1.811176443179951e-05, |
| "loss": 0.5509, |
| "step": 5750 |
| }, |
| { |
| "epoch": 0.7231018576237377, |
| "grad_norm": 0.9758985638618469, |
| "learning_rate": 1.807138756929881e-05, |
| "loss": 0.5571, |
| "step": 5800 |
| }, |
| { |
| "epoch": 0.7293354943273906, |
| "grad_norm": 0.9203677773475647, |
| "learning_rate": 1.8030629577977064e-05, |
| "loss": 0.5467, |
| "step": 5850 |
| }, |
| { |
| "epoch": 0.7355691310310435, |
| "grad_norm": 0.9455954432487488, |
| "learning_rate": 1.798949238241601e-05, |
| "loss": 0.5539, |
| "step": 5900 |
| }, |
| { |
| "epoch": 0.7418027677346964, |
| "grad_norm": 0.9083104729652405, |
| "learning_rate": 1.7947977925103315e-05, |
| "loss": 0.5497, |
| "step": 5950 |
| }, |
| { |
| "epoch": 0.7480364044383493, |
| "grad_norm": 0.8917014598846436, |
| "learning_rate": 1.7906088166340864e-05, |
| "loss": 0.5544, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.7480364044383493, |
| "eval_loss": 0.5512064099311829, |
| "eval_runtime": 341.5157, |
| "eval_samples_per_second": 53.681, |
| "eval_steps_per_second": 6.711, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.7542700411420022, |
| "grad_norm": 0.9733327031135559, |
| "learning_rate": 1.786382508415216e-05, |
| "loss": 0.5557, |
| "step": 6050 |
| }, |
| { |
| "epoch": 0.7605036778456552, |
| "grad_norm": 0.9236845970153809, |
| "learning_rate": 1.7821190674188953e-05, |
| "loss": 0.5549, |
| "step": 6100 |
| }, |
| { |
| "epoch": 0.766737314549308, |
| "grad_norm": 0.9276295304298401, |
| "learning_rate": 1.7778186949636983e-05, |
| "loss": 0.5491, |
| "step": 6150 |
| }, |
| { |
| "epoch": 0.772970951252961, |
| "grad_norm": 0.9669515490531921, |
| "learning_rate": 1.7734815941120933e-05, |
| "loss": 0.548, |
| "step": 6200 |
| }, |
| { |
| "epoch": 0.7792045879566138, |
| "grad_norm": 0.9030852913856506, |
| "learning_rate": 1.769107969660855e-05, |
| "loss": 0.5588, |
| "step": 6250 |
| }, |
| { |
| "epoch": 0.7854382246602668, |
| "grad_norm": 0.9322410225868225, |
| "learning_rate": 1.7646980281313917e-05, |
| "loss": 0.5463, |
| "step": 6300 |
| }, |
| { |
| "epoch": 0.7916718613639198, |
| "grad_norm": 0.9386406540870667, |
| "learning_rate": 1.760251977759995e-05, |
| "loss": 0.539, |
| "step": 6350 |
| }, |
| { |
| "epoch": 0.7979054980675726, |
| "grad_norm": 0.9279153347015381, |
| "learning_rate": 1.7557700284880063e-05, |
| "loss": 0.5508, |
| "step": 6400 |
| }, |
| { |
| "epoch": 0.8041391347712256, |
| "grad_norm": 0.9351046681404114, |
| "learning_rate": 1.751252391951905e-05, |
| "loss": 0.5495, |
| "step": 6450 |
| }, |
| { |
| "epoch": 0.8103727714748784, |
| "grad_norm": 0.9292935729026794, |
| "learning_rate": 1.7466992814733123e-05, |
| "loss": 0.5558, |
| "step": 6500 |
| }, |
| { |
| "epoch": 0.8166064081785314, |
| "grad_norm": 0.9247336983680725, |
| "learning_rate": 1.7421109120489206e-05, |
| "loss": 0.5582, |
| "step": 6550 |
| }, |
| { |
| "epoch": 0.8228400448821843, |
| "grad_norm": 0.9447380900382996, |
| "learning_rate": 1.7374875003403402e-05, |
| "loss": 0.5404, |
| "step": 6600 |
| }, |
| { |
| "epoch": 0.8290736815858372, |
| "grad_norm": 0.8972445726394653, |
| "learning_rate": 1.7328292646638694e-05, |
| "loss": 0.5486, |
| "step": 6650 |
| }, |
| { |
| "epoch": 0.8353073182894901, |
| "grad_norm": 0.9727171659469604, |
| "learning_rate": 1.7281364249801846e-05, |
| "loss": 0.5443, |
| "step": 6700 |
| }, |
| { |
| "epoch": 0.841540954993143, |
| "grad_norm": 0.9603117108345032, |
| "learning_rate": 1.723409202883955e-05, |
| "loss": 0.5418, |
| "step": 6750 |
| }, |
| { |
| "epoch": 0.8477745916967959, |
| "grad_norm": 0.9171497225761414, |
| "learning_rate": 1.7186478215933776e-05, |
| "loss": 0.5487, |
| "step": 6800 |
| }, |
| { |
| "epoch": 0.8540082284004489, |
| "grad_norm": 0.9400027990341187, |
| "learning_rate": 1.713852505939639e-05, |
| "loss": 0.5476, |
| "step": 6850 |
| }, |
| { |
| "epoch": 0.8602418651041017, |
| "grad_norm": 0.9132832288742065, |
| "learning_rate": 1.7090234823562956e-05, |
| "loss": 0.5502, |
| "step": 6900 |
| }, |
| { |
| "epoch": 0.8664755018077547, |
| "grad_norm": 0.9160686135292053, |
| "learning_rate": 1.7041609788685853e-05, |
| "loss": 0.5407, |
| "step": 6950 |
| }, |
| { |
| "epoch": 0.8727091385114075, |
| "grad_norm": 0.9070727825164795, |
| "learning_rate": 1.699265225082658e-05, |
| "loss": 0.5477, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.8789427752150605, |
| "grad_norm": 0.9333738684654236, |
| "learning_rate": 1.694336452174733e-05, |
| "loss": 0.5486, |
| "step": 7050 |
| }, |
| { |
| "epoch": 0.8851764119187133, |
| "grad_norm": 0.9226990938186646, |
| "learning_rate": 1.689374892880185e-05, |
| "loss": 0.5494, |
| "step": 7100 |
| }, |
| { |
| "epoch": 0.8914100486223663, |
| "grad_norm": 0.9787217974662781, |
| "learning_rate": 1.684380781482553e-05, |
| "loss": 0.5415, |
| "step": 7150 |
| }, |
| { |
| "epoch": 0.8976436853260192, |
| "grad_norm": 0.8988795876502991, |
| "learning_rate": 1.679354353802478e-05, |
| "loss": 0.5481, |
| "step": 7200 |
| }, |
| { |
| "epoch": 0.9038773220296721, |
| "grad_norm": 0.8944117426872253, |
| "learning_rate": 1.674295847186567e-05, |
| "loss": 0.5469, |
| "step": 7250 |
| }, |
| { |
| "epoch": 0.910110958733325, |
| "grad_norm": 0.8955491185188293, |
| "learning_rate": 1.6692055004961867e-05, |
| "loss": 0.547, |
| "step": 7300 |
| }, |
| { |
| "epoch": 0.9163445954369779, |
| "grad_norm": 0.938137412071228, |
| "learning_rate": 1.664083554096183e-05, |
| "loss": 0.5505, |
| "step": 7350 |
| }, |
| { |
| "epoch": 0.9225782321406308, |
| "grad_norm": 0.940488874912262, |
| "learning_rate": 1.6589302498435324e-05, |
| "loss": 0.5427, |
| "step": 7400 |
| }, |
| { |
| "epoch": 0.9288118688442838, |
| "grad_norm": 0.8848482966423035, |
| "learning_rate": 1.6537458310759215e-05, |
| "loss": 0.5407, |
| "step": 7450 |
| }, |
| { |
| "epoch": 0.9350455055479366, |
| "grad_norm": 0.8875752687454224, |
| "learning_rate": 1.648530542600255e-05, |
| "loss": 0.5414, |
| "step": 7500 |
| }, |
| { |
| "epoch": 0.9412791422515896, |
| "grad_norm": 0.9346476197242737, |
| "learning_rate": 1.6432846306810982e-05, |
| "loss": 0.5407, |
| "step": 7550 |
| }, |
| { |
| "epoch": 0.9475127789552424, |
| "grad_norm": 0.8926489949226379, |
| "learning_rate": 1.6380083430290467e-05, |
| "loss": 0.5399, |
| "step": 7600 |
| }, |
| { |
| "epoch": 0.9537464156588954, |
| "grad_norm": 0.9111624360084534, |
| "learning_rate": 1.632701928789031e-05, |
| "loss": 0.5462, |
| "step": 7650 |
| }, |
| { |
| "epoch": 0.9599800523625484, |
| "grad_norm": 0.9109166264533997, |
| "learning_rate": 1.627365638528551e-05, |
| "loss": 0.545, |
| "step": 7700 |
| }, |
| { |
| "epoch": 0.9662136890662012, |
| "grad_norm": 0.9245206117630005, |
| "learning_rate": 1.621999724225844e-05, |
| "loss": 0.5414, |
| "step": 7750 |
| }, |
| { |
| "epoch": 0.9724473257698542, |
| "grad_norm": 1.0112046003341675, |
| "learning_rate": 1.6166044392579877e-05, |
| "loss": 0.5397, |
| "step": 7800 |
| }, |
| { |
| "epoch": 0.978680962473507, |
| "grad_norm": 0.9774346351623535, |
| "learning_rate": 1.6111800383889345e-05, |
| "loss": 0.5466, |
| "step": 7850 |
| }, |
| { |
| "epoch": 0.98491459917716, |
| "grad_norm": 0.9566652774810791, |
| "learning_rate": 1.605726777757482e-05, |
| "loss": 0.5442, |
| "step": 7900 |
| }, |
| { |
| "epoch": 0.9911482358808129, |
| "grad_norm": 0.9225612878799438, |
| "learning_rate": 1.6002449148651784e-05, |
| "loss": 0.5395, |
| "step": 7950 |
| }, |
| { |
| "epoch": 0.9973818725844658, |
| "grad_norm": 0.8973495364189148, |
| "learning_rate": 1.5947347085641632e-05, |
| "loss": 0.5447, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.9973818725844658, |
| "eval_loss": 0.5422428250312805, |
| "eval_runtime": 342.2432, |
| "eval_samples_per_second": 53.567, |
| "eval_steps_per_second": 6.697, |
| "step": 8000 |
| }, |
| { |
| "epoch": 1.0036155092881187, |
| "grad_norm": 0.9412503838539124, |
| "learning_rate": 1.5891964190449447e-05, |
| "loss": 0.5338, |
| "step": 8050 |
| }, |
| { |
| "epoch": 1.0098491459917716, |
| "grad_norm": 0.9484879374504089, |
| "learning_rate": 1.583630307824113e-05, |
| "loss": 0.5168, |
| "step": 8100 |
| }, |
| { |
| "epoch": 1.0160827826954244, |
| "grad_norm": 0.9464488625526428, |
| "learning_rate": 1.5780366377319913e-05, |
| "loss": 0.5243, |
| "step": 8150 |
| }, |
| { |
| "epoch": 1.0223164193990775, |
| "grad_norm": 0.92261803150177, |
| "learning_rate": 1.572415672900226e-05, |
| "loss": 0.5313, |
| "step": 8200 |
| }, |
| { |
| "epoch": 1.0285500561027303, |
| "grad_norm": 0.9324407577514648, |
| "learning_rate": 1.5667676787493148e-05, |
| "loss": 0.5255, |
| "step": 8250 |
| }, |
| { |
| "epoch": 1.0347836928063832, |
| "grad_norm": 0.9365196824073792, |
| "learning_rate": 1.5610929219760715e-05, |
| "loss": 0.5232, |
| "step": 8300 |
| }, |
| { |
| "epoch": 1.0410173295100362, |
| "grad_norm": 0.9687336683273315, |
| "learning_rate": 1.5553916705410347e-05, |
| "loss": 0.5255, |
| "step": 8350 |
| }, |
| { |
| "epoch": 1.047250966213689, |
| "grad_norm": 0.9781963229179382, |
| "learning_rate": 1.5496641936558135e-05, |
| "loss": 0.5205, |
| "step": 8400 |
| }, |
| { |
| "epoch": 1.053484602917342, |
| "grad_norm": 0.969921350479126, |
| "learning_rate": 1.543910761770377e-05, |
| "loss": 0.528, |
| "step": 8450 |
| }, |
| { |
| "epoch": 1.059718239620995, |
| "grad_norm": 0.988711416721344, |
| "learning_rate": 1.5381316465602808e-05, |
| "loss": 0.5226, |
| "step": 8500 |
| }, |
| { |
| "epoch": 1.0659518763246478, |
| "grad_norm": 0.9715408682823181, |
| "learning_rate": 1.532327120913843e-05, |
| "loss": 0.5292, |
| "step": 8550 |
| }, |
| { |
| "epoch": 1.0721855130283007, |
| "grad_norm": 1.0321426391601562, |
| "learning_rate": 1.526497458919253e-05, |
| "loss": 0.5285, |
| "step": 8600 |
| }, |
| { |
| "epoch": 1.0784191497319535, |
| "grad_norm": 0.9939231872558594, |
| "learning_rate": 1.5206429358516341e-05, |
| "loss": 0.5247, |
| "step": 8650 |
| }, |
| { |
| "epoch": 1.0846527864356066, |
| "grad_norm": 0.9716272950172424, |
| "learning_rate": 1.5147638281600423e-05, |
| "loss": 0.5215, |
| "step": 8700 |
| }, |
| { |
| "epoch": 1.0908864231392594, |
| "grad_norm": 1.0110036134719849, |
| "learning_rate": 1.5088604134544135e-05, |
| "loss": 0.5234, |
| "step": 8750 |
| }, |
| { |
| "epoch": 1.0971200598429123, |
| "grad_norm": 0.9568915367126465, |
| "learning_rate": 1.502932970492454e-05, |
| "loss": 0.5157, |
| "step": 8800 |
| }, |
| { |
| "epoch": 1.1033536965465653, |
| "grad_norm": 0.9463698267936707, |
| "learning_rate": 1.4969817791664779e-05, |
| "loss": 0.5174, |
| "step": 8850 |
| }, |
| { |
| "epoch": 1.1095873332502182, |
| "grad_norm": 0.9600862264633179, |
| "learning_rate": 1.4910071204901916e-05, |
| "loss": 0.5202, |
| "step": 8900 |
| }, |
| { |
| "epoch": 1.115820969953871, |
| "grad_norm": 0.9758629202842712, |
| "learning_rate": 1.4850092765854233e-05, |
| "loss": 0.5244, |
| "step": 8950 |
| }, |
| { |
| "epoch": 1.122054606657524, |
| "grad_norm": 0.9561620950698853, |
| "learning_rate": 1.4789885306688019e-05, |
| "loss": 0.5201, |
| "step": 9000 |
| }, |
| { |
| "epoch": 1.128288243361177, |
| "grad_norm": 0.9549373388290405, |
| "learning_rate": 1.4729451670383829e-05, |
| "loss": 0.5298, |
| "step": 9050 |
| }, |
| { |
| "epoch": 1.1345218800648298, |
| "grad_norm": 0.9686883687973022, |
| "learning_rate": 1.4668794710602248e-05, |
| "loss": 0.5193, |
| "step": 9100 |
| }, |
| { |
| "epoch": 1.1407555167684826, |
| "grad_norm": 0.9705237150192261, |
| "learning_rate": 1.4607917291549131e-05, |
| "loss": 0.5195, |
| "step": 9150 |
| }, |
| { |
| "epoch": 1.1469891534721357, |
| "grad_norm": 0.9853324890136719, |
| "learning_rate": 1.4546822287840372e-05, |
| "loss": 0.5271, |
| "step": 9200 |
| }, |
| { |
| "epoch": 1.1532227901757885, |
| "grad_norm": 0.9839003086090088, |
| "learning_rate": 1.4485512584366146e-05, |
| "loss": 0.5287, |
| "step": 9250 |
| }, |
| { |
| "epoch": 1.1594564268794414, |
| "grad_norm": 0.9832494258880615, |
| "learning_rate": 1.4423991076154704e-05, |
| "loss": 0.5193, |
| "step": 9300 |
| }, |
| { |
| "epoch": 1.1656900635830945, |
| "grad_norm": 0.9991750121116638, |
| "learning_rate": 1.436226066823566e-05, |
| "loss": 0.5248, |
| "step": 9350 |
| }, |
| { |
| "epoch": 1.1719237002867473, |
| "grad_norm": 1.0099903345108032, |
| "learning_rate": 1.4300324275502806e-05, |
| "loss": 0.525, |
| "step": 9400 |
| }, |
| { |
| "epoch": 1.1781573369904002, |
| "grad_norm": 0.9502778053283691, |
| "learning_rate": 1.4238184822576499e-05, |
| "loss": 0.5237, |
| "step": 9450 |
| }, |
| { |
| "epoch": 1.184390973694053, |
| "grad_norm": 0.9819866418838501, |
| "learning_rate": 1.4175845243665536e-05, |
| "loss": 0.5212, |
| "step": 9500 |
| }, |
| { |
| "epoch": 1.190624610397706, |
| "grad_norm": 0.9730864763259888, |
| "learning_rate": 1.4113308482428617e-05, |
| "loss": 0.5244, |
| "step": 9550 |
| }, |
| { |
| "epoch": 1.196858247101359, |
| "grad_norm": 0.9329859614372253, |
| "learning_rate": 1.4050577491835338e-05, |
| "loss": 0.5319, |
| "step": 9600 |
| }, |
| { |
| "epoch": 1.2030918838050118, |
| "grad_norm": 0.9817786812782288, |
| "learning_rate": 1.3987655234026752e-05, |
| "loss": 0.5186, |
| "step": 9650 |
| }, |
| { |
| "epoch": 1.2093255205086648, |
| "grad_norm": 1.0095436573028564, |
| "learning_rate": 1.39245446801755e-05, |
| "loss": 0.5207, |
| "step": 9700 |
| }, |
| { |
| "epoch": 1.2155591572123177, |
| "grad_norm": 0.9972400069236755, |
| "learning_rate": 1.3861248810345516e-05, |
| "loss": 0.5216, |
| "step": 9750 |
| }, |
| { |
| "epoch": 1.2217927939159705, |
| "grad_norm": 0.9834696650505066, |
| "learning_rate": 1.3797770613351307e-05, |
| "loss": 0.5234, |
| "step": 9800 |
| }, |
| { |
| "epoch": 1.2280264306196236, |
| "grad_norm": 0.9960749745368958, |
| "learning_rate": 1.373411308661682e-05, |
| "loss": 0.5165, |
| "step": 9850 |
| }, |
| { |
| "epoch": 1.2342600673232764, |
| "grad_norm": 0.9419467449188232, |
| "learning_rate": 1.3670279236033902e-05, |
| "loss": 0.5222, |
| "step": 9900 |
| }, |
| { |
| "epoch": 1.2404937040269293, |
| "grad_norm": 0.962372899055481, |
| "learning_rate": 1.3606272075820374e-05, |
| "loss": 0.5258, |
| "step": 9950 |
| }, |
| { |
| "epoch": 1.2467273407305823, |
| "grad_norm": 0.9631684422492981, |
| "learning_rate": 1.3542094628377686e-05, |
| "loss": 0.5216, |
| "step": 10000 |
| }, |
| { |
| "epoch": 1.2467273407305823, |
| "eval_loss": 0.5370460748672485, |
| "eval_runtime": 342.3384, |
| "eval_samples_per_second": 53.552, |
| "eval_steps_per_second": 6.695, |
| "step": 10000 |
| }, |
| { |
| "epoch": 1.2529609774342352, |
| "grad_norm": 0.9372801780700684, |
| "learning_rate": 1.3477749924148206e-05, |
| "loss": 0.5237, |
| "step": 10050 |
| }, |
| { |
| "epoch": 1.259194614137888, |
| "grad_norm": 0.985061764717102, |
| "learning_rate": 1.3413241001472132e-05, |
| "loss": 0.5257, |
| "step": 10100 |
| }, |
| { |
| "epoch": 1.2654282508415409, |
| "grad_norm": 0.9807039499282837, |
| "learning_rate": 1.334857090644401e-05, |
| "loss": 0.524, |
| "step": 10150 |
| }, |
| { |
| "epoch": 1.271661887545194, |
| "grad_norm": 0.9594578146934509, |
| "learning_rate": 1.3283742692768892e-05, |
| "loss": 0.52, |
| "step": 10200 |
| }, |
| { |
| "epoch": 1.2778955242488468, |
| "grad_norm": 0.963344156742096, |
| "learning_rate": 1.3218759421618172e-05, |
| "loss": 0.5212, |
| "step": 10250 |
| }, |
| { |
| "epoch": 1.2841291609524996, |
| "grad_norm": 0.9528459906578064, |
| "learning_rate": 1.3153624161485001e-05, |
| "loss": 0.5106, |
| "step": 10300 |
| }, |
| { |
| "epoch": 1.2903627976561527, |
| "grad_norm": 0.9546725749969482, |
| "learning_rate": 1.308833998803942e-05, |
| "loss": 0.5236, |
| "step": 10350 |
| }, |
| { |
| "epoch": 1.2965964343598055, |
| "grad_norm": 0.9785650372505188, |
| "learning_rate": 1.302290998398311e-05, |
| "loss": 0.5207, |
| "step": 10400 |
| }, |
| { |
| "epoch": 1.3028300710634584, |
| "grad_norm": 0.9910630583763123, |
| "learning_rate": 1.295733723890384e-05, |
| "loss": 0.5171, |
| "step": 10450 |
| }, |
| { |
| "epoch": 1.3090637077671112, |
| "grad_norm": 0.9939322471618652, |
| "learning_rate": 1.2891624849129572e-05, |
| "loss": 0.528, |
| "step": 10500 |
| }, |
| { |
| "epoch": 1.3152973444707643, |
| "grad_norm": 1.005935549736023, |
| "learning_rate": 1.2825775917582257e-05, |
| "loss": 0.5208, |
| "step": 10550 |
| }, |
| { |
| "epoch": 1.3215309811744171, |
| "grad_norm": 0.9852765202522278, |
| "learning_rate": 1.2759793553631307e-05, |
| "loss": 0.5147, |
| "step": 10600 |
| }, |
| { |
| "epoch": 1.32776461787807, |
| "grad_norm": 0.9535738229751587, |
| "learning_rate": 1.2693680872946787e-05, |
| "loss": 0.5169, |
| "step": 10650 |
| }, |
| { |
| "epoch": 1.333998254581723, |
| "grad_norm": 0.992779552936554, |
| "learning_rate": 1.2627440997352269e-05, |
| "loss": 0.5195, |
| "step": 10700 |
| }, |
| { |
| "epoch": 1.340231891285376, |
| "grad_norm": 1.0103819370269775, |
| "learning_rate": 1.256107705467745e-05, |
| "loss": 0.517, |
| "step": 10750 |
| }, |
| { |
| "epoch": 1.3464655279890287, |
| "grad_norm": 1.0152695178985596, |
| "learning_rate": 1.2494592178610438e-05, |
| "loss": 0.5175, |
| "step": 10800 |
| }, |
| { |
| "epoch": 1.3526991646926816, |
| "grad_norm": 0.989223837852478, |
| "learning_rate": 1.2427989508549781e-05, |
| "loss": 0.5213, |
| "step": 10850 |
| }, |
| { |
| "epoch": 1.3589328013963347, |
| "grad_norm": 0.9971133470535278, |
| "learning_rate": 1.236127218945623e-05, |
| "loss": 0.5175, |
| "step": 10900 |
| }, |
| { |
| "epoch": 1.3651664380999875, |
| "grad_norm": 0.9640889763832092, |
| "learning_rate": 1.2294443371704237e-05, |
| "loss": 0.5238, |
| "step": 10950 |
| }, |
| { |
| "epoch": 1.3714000748036406, |
| "grad_norm": 0.994499921798706, |
| "learning_rate": 1.2227506210933187e-05, |
| "loss": 0.5192, |
| "step": 11000 |
| }, |
| { |
| "epoch": 1.3776337115072934, |
| "grad_norm": 1.0173231363296509, |
| "learning_rate": 1.2160463867898398e-05, |
| "loss": 0.5111, |
| "step": 11050 |
| }, |
| { |
| "epoch": 1.3838673482109463, |
| "grad_norm": 1.0041266679763794, |
| "learning_rate": 1.2093319508321863e-05, |
| "loss": 0.5188, |
| "step": 11100 |
| }, |
| { |
| "epoch": 1.390100984914599, |
| "grad_norm": 0.9839298725128174, |
| "learning_rate": 1.2026076302742778e-05, |
| "loss": 0.5194, |
| "step": 11150 |
| }, |
| { |
| "epoch": 1.3963346216182522, |
| "grad_norm": 0.9892042875289917, |
| "learning_rate": 1.1958737426367806e-05, |
| "loss": 0.5186, |
| "step": 11200 |
| }, |
| { |
| "epoch": 1.402568258321905, |
| "grad_norm": 1.0081249475479126, |
| "learning_rate": 1.1891306058921178e-05, |
| "loss": 0.5153, |
| "step": 11250 |
| }, |
| { |
| "epoch": 1.4088018950255579, |
| "grad_norm": 0.9584305286407471, |
| "learning_rate": 1.1823785384494515e-05, |
| "loss": 0.5219, |
| "step": 11300 |
| }, |
| { |
| "epoch": 1.415035531729211, |
| "grad_norm": 0.9678763747215271, |
| "learning_rate": 1.1756178591396499e-05, |
| "loss": 0.5182, |
| "step": 11350 |
| }, |
| { |
| "epoch": 1.4212691684328638, |
| "grad_norm": 0.9715519547462463, |
| "learning_rate": 1.168848887200231e-05, |
| "loss": 0.5153, |
| "step": 11400 |
| }, |
| { |
| "epoch": 1.4275028051365166, |
| "grad_norm": 1.041542410850525, |
| "learning_rate": 1.162071942260289e-05, |
| "loss": 0.5177, |
| "step": 11450 |
| }, |
| { |
| "epoch": 1.4337364418401695, |
| "grad_norm": 0.9940893650054932, |
| "learning_rate": 1.1552873443254002e-05, |
| "loss": 0.5253, |
| "step": 11500 |
| }, |
| { |
| "epoch": 1.4399700785438225, |
| "grad_norm": 1.0045311450958252, |
| "learning_rate": 1.1484954137625141e-05, |
| "loss": 0.5138, |
| "step": 11550 |
| }, |
| { |
| "epoch": 1.4462037152474754, |
| "grad_norm": 1.021355152130127, |
| "learning_rate": 1.1416964712848248e-05, |
| "loss": 0.5158, |
| "step": 11600 |
| }, |
| { |
| "epoch": 1.4524373519511282, |
| "grad_norm": 0.9734411835670471, |
| "learning_rate": 1.1348908379366275e-05, |
| "loss": 0.5155, |
| "step": 11650 |
| }, |
| { |
| "epoch": 1.4586709886547813, |
| "grad_norm": 0.9819545745849609, |
| "learning_rate": 1.1280788350781583e-05, |
| "loss": 0.5189, |
| "step": 11700 |
| }, |
| { |
| "epoch": 1.4649046253584341, |
| "grad_norm": 1.0048792362213135, |
| "learning_rate": 1.121260784370419e-05, |
| "loss": 0.5189, |
| "step": 11750 |
| }, |
| { |
| "epoch": 1.471138262062087, |
| "grad_norm": 0.9754243493080139, |
| "learning_rate": 1.1144370077599908e-05, |
| "loss": 0.5165, |
| "step": 11800 |
| }, |
| { |
| "epoch": 1.4773718987657398, |
| "grad_norm": 1.0066121816635132, |
| "learning_rate": 1.10760782746383e-05, |
| "loss": 0.5215, |
| "step": 11850 |
| }, |
| { |
| "epoch": 1.4836055354693929, |
| "grad_norm": 1.0248697996139526, |
| "learning_rate": 1.1007735659540531e-05, |
| "loss": 0.5172, |
| "step": 11900 |
| }, |
| { |
| "epoch": 1.4898391721730457, |
| "grad_norm": 0.9748446941375732, |
| "learning_rate": 1.0939345459427106e-05, |
| "loss": 0.513, |
| "step": 11950 |
| }, |
| { |
| "epoch": 1.4960728088766988, |
| "grad_norm": 1.0014723539352417, |
| "learning_rate": 1.0870910903665479e-05, |
| "loss": 0.5141, |
| "step": 12000 |
| }, |
| { |
| "epoch": 1.4960728088766988, |
| "eval_loss": 0.5317554473876953, |
| "eval_runtime": 343.396, |
| "eval_samples_per_second": 53.387, |
| "eval_steps_per_second": 6.675, |
| "step": 12000 |
| }, |
| { |
| "epoch": 1.5023064455803516, |
| "grad_norm": 1.040360450744629, |
| "learning_rate": 1.080243522371757e-05, |
| "loss": 0.5132, |
| "step": 12050 |
| }, |
| { |
| "epoch": 1.5085400822840045, |
| "grad_norm": 0.9970097541809082, |
| "learning_rate": 1.0733921652987172e-05, |
| "loss": 0.5183, |
| "step": 12100 |
| }, |
| { |
| "epoch": 1.5147737189876573, |
| "grad_norm": 1.002963900566101, |
| "learning_rate": 1.0665373426667264e-05, |
| "loss": 0.5196, |
| "step": 12150 |
| }, |
| { |
| "epoch": 1.5210073556913102, |
| "grad_norm": 0.9906361103057861, |
| "learning_rate": 1.0596793781587264e-05, |
| "loss": 0.5127, |
| "step": 12200 |
| }, |
| { |
| "epoch": 1.5272409923949632, |
| "grad_norm": 0.9885673522949219, |
| "learning_rate": 1.0528185956060173e-05, |
| "loss": 0.5124, |
| "step": 12250 |
| }, |
| { |
| "epoch": 1.533474629098616, |
| "grad_norm": 0.9601041674613953, |
| "learning_rate": 1.045955318972965e-05, |
| "loss": 0.5146, |
| "step": 12300 |
| }, |
| { |
| "epoch": 1.5397082658022692, |
| "grad_norm": 1.070622205734253, |
| "learning_rate": 1.0390898723417073e-05, |
| "loss": 0.5138, |
| "step": 12350 |
| }, |
| { |
| "epoch": 1.545941902505922, |
| "grad_norm": 1.0367426872253418, |
| "learning_rate": 1.0322225798968482e-05, |
| "loss": 0.5111, |
| "step": 12400 |
| }, |
| { |
| "epoch": 1.5521755392095749, |
| "grad_norm": 0.990876317024231, |
| "learning_rate": 1.0253537659101495e-05, |
| "loss": 0.5183, |
| "step": 12450 |
| }, |
| { |
| "epoch": 1.5584091759132277, |
| "grad_norm": 0.9871241450309753, |
| "learning_rate": 1.0184837547252213e-05, |
| "loss": 0.5123, |
| "step": 12500 |
| }, |
| { |
| "epoch": 1.5646428126168805, |
| "grad_norm": 1.015692114830017, |
| "learning_rate": 1.0116128707422039e-05, |
| "loss": 0.5137, |
| "step": 12550 |
| }, |
| { |
| "epoch": 1.5708764493205336, |
| "grad_norm": 1.0144940614700317, |
| "learning_rate": 1.0047414384024513e-05, |
| "loss": 0.5143, |
| "step": 12600 |
| }, |
| { |
| "epoch": 1.5771100860241867, |
| "grad_norm": 1.0433917045593262, |
| "learning_rate": 9.978697821732109e-06, |
| "loss": 0.5179, |
| "step": 12650 |
| }, |
| { |
| "epoch": 1.5833437227278395, |
| "grad_norm": 1.0353728532791138, |
| "learning_rate": 9.90998226532302e-06, |
| "loss": 0.5167, |
| "step": 12700 |
| }, |
| { |
| "epoch": 1.5895773594314924, |
| "grad_norm": 1.0195094347000122, |
| "learning_rate": 9.841270959527945e-06, |
| "loss": 0.5161, |
| "step": 12750 |
| }, |
| { |
| "epoch": 1.5958109961351452, |
| "grad_norm": 0.9943532347679138, |
| "learning_rate": 9.772567148876859e-06, |
| "loss": 0.5103, |
| "step": 12800 |
| }, |
| { |
| "epoch": 1.602044632838798, |
| "grad_norm": 0.9999650716781616, |
| "learning_rate": 9.703874077545837e-06, |
| "loss": 0.5049, |
| "step": 12850 |
| }, |
| { |
| "epoch": 1.608278269542451, |
| "grad_norm": 1.030554175376892, |
| "learning_rate": 9.635194989203822e-06, |
| "loss": 0.5195, |
| "step": 12900 |
| }, |
| { |
| "epoch": 1.614511906246104, |
| "grad_norm": 0.9951714873313904, |
| "learning_rate": 9.566533126859509e-06, |
| "loss": 0.5154, |
| "step": 12950 |
| }, |
| { |
| "epoch": 1.620745542949757, |
| "grad_norm": 1.039230465888977, |
| "learning_rate": 9.497891732708168e-06, |
| "loss": 0.5079, |
| "step": 13000 |
| }, |
| { |
| "epoch": 1.6269791796534099, |
| "grad_norm": 0.9834098815917969, |
| "learning_rate": 9.429274047978574e-06, |
| "loss": 0.5169, |
| "step": 13050 |
| }, |
| { |
| "epoch": 1.6332128163570627, |
| "grad_norm": 1.0020710229873657, |
| "learning_rate": 9.360683312779942e-06, |
| "loss": 0.5144, |
| "step": 13100 |
| }, |
| { |
| "epoch": 1.6394464530607156, |
| "grad_norm": 0.9868305921554565, |
| "learning_rate": 9.29212276594895e-06, |
| "loss": 0.5197, |
| "step": 13150 |
| }, |
| { |
| "epoch": 1.6456800897643684, |
| "grad_norm": 0.9840037822723389, |
| "learning_rate": 9.223595644896773e-06, |
| "loss": 0.5161, |
| "step": 13200 |
| }, |
| { |
| "epoch": 1.6519137264680215, |
| "grad_norm": 1.0117952823638916, |
| "learning_rate": 9.15510518545625e-06, |
| "loss": 0.5098, |
| "step": 13250 |
| }, |
| { |
| "epoch": 1.6581473631716743, |
| "grad_norm": 1.0590097904205322, |
| "learning_rate": 9.086654621729046e-06, |
| "loss": 0.5113, |
| "step": 13300 |
| }, |
| { |
| "epoch": 1.6643809998753274, |
| "grad_norm": 1.0118435621261597, |
| "learning_rate": 9.018247185932973e-06, |
| "loss": 0.5089, |
| "step": 13350 |
| }, |
| { |
| "epoch": 1.6706146365789802, |
| "grad_norm": 0.9989214539527893, |
| "learning_rate": 8.949886108249358e-06, |
| "loss": 0.5191, |
| "step": 13400 |
| }, |
| { |
| "epoch": 1.676848273282633, |
| "grad_norm": 0.9902826547622681, |
| "learning_rate": 8.881574616670493e-06, |
| "loss": 0.5061, |
| "step": 13450 |
| }, |
| { |
| "epoch": 1.683081909986286, |
| "grad_norm": 1.0481892824172974, |
| "learning_rate": 8.813315936847247e-06, |
| "loss": 0.5173, |
| "step": 13500 |
| }, |
| { |
| "epoch": 1.6893155466899388, |
| "grad_norm": 1.0011539459228516, |
| "learning_rate": 8.745113291936718e-06, |
| "loss": 0.5107, |
| "step": 13550 |
| }, |
| { |
| "epoch": 1.6955491833935918, |
| "grad_norm": 1.0297352075576782, |
| "learning_rate": 8.676969902450054e-06, |
| "loss": 0.5151, |
| "step": 13600 |
| }, |
| { |
| "epoch": 1.7017828200972447, |
| "grad_norm": 0.9961735010147095, |
| "learning_rate": 8.608888986100374e-06, |
| "loss": 0.5152, |
| "step": 13650 |
| }, |
| { |
| "epoch": 1.7080164568008978, |
| "grad_norm": 1.0135923624038696, |
| "learning_rate": 8.540873757650845e-06, |
| "loss": 0.5121, |
| "step": 13700 |
| }, |
| { |
| "epoch": 1.7142500935045506, |
| "grad_norm": 1.0060099363327026, |
| "learning_rate": 8.472927428762845e-06, |
| "loss": 0.5141, |
| "step": 13750 |
| }, |
| { |
| "epoch": 1.7204837302082034, |
| "grad_norm": 1.02751886844635, |
| "learning_rate": 8.405053207844358e-06, |
| "loss": 0.5208, |
| "step": 13800 |
| }, |
| { |
| "epoch": 1.7267173669118563, |
| "grad_norm": 0.9756371378898621, |
| "learning_rate": 8.337254299898432e-06, |
| "loss": 0.5168, |
| "step": 13850 |
| }, |
| { |
| "epoch": 1.7329510036155091, |
| "grad_norm": 1.016017198562622, |
| "learning_rate": 8.269533906371862e-06, |
| "loss": 0.5128, |
| "step": 13900 |
| }, |
| { |
| "epoch": 1.7391846403191622, |
| "grad_norm": 1.0274937152862549, |
| "learning_rate": 8.201895225004004e-06, |
| "loss": 0.5114, |
| "step": 13950 |
| }, |
| { |
| "epoch": 1.7454182770228153, |
| "grad_norm": 1.0203967094421387, |
| "learning_rate": 8.134341449675802e-06, |
| "loss": 0.5111, |
| "step": 14000 |
| }, |
| { |
| "epoch": 1.7454182770228153, |
| "eval_loss": 0.5270397067070007, |
| "eval_runtime": 340.6696, |
| "eval_samples_per_second": 53.815, |
| "eval_steps_per_second": 6.728, |
| "step": 14000 |
| }, |
| { |
| "epoch": 1.7516519137264681, |
| "grad_norm": 1.0635205507278442, |
| "learning_rate": 8.066875770258952e-06, |
| "loss": 0.51, |
| "step": 14050 |
| }, |
| { |
| "epoch": 1.757885550430121, |
| "grad_norm": 1.030682921409607, |
| "learning_rate": 7.99950137246528e-06, |
| "loss": 0.5156, |
| "step": 14100 |
| }, |
| { |
| "epoch": 1.7641191871337738, |
| "grad_norm": 0.9917548298835754, |
| "learning_rate": 7.932221437696324e-06, |
| "loss": 0.5101, |
| "step": 14150 |
| }, |
| { |
| "epoch": 1.7703528238374266, |
| "grad_norm": 1.0255886316299438, |
| "learning_rate": 7.865039142893091e-06, |
| "loss": 0.5155, |
| "step": 14200 |
| }, |
| { |
| "epoch": 1.7765864605410797, |
| "grad_norm": 1.0358428955078125, |
| "learning_rate": 7.797957660386072e-06, |
| "loss": 0.5125, |
| "step": 14250 |
| }, |
| { |
| "epoch": 1.7828200972447326, |
| "grad_norm": 1.0858733654022217, |
| "learning_rate": 7.730980157745414e-06, |
| "loss": 0.5198, |
| "step": 14300 |
| }, |
| { |
| "epoch": 1.7890537339483856, |
| "grad_norm": 1.0054926872253418, |
| "learning_rate": 7.664109797631365e-06, |
| "loss": 0.5058, |
| "step": 14350 |
| }, |
| { |
| "epoch": 1.7952873706520385, |
| "grad_norm": 1.0060533285140991, |
| "learning_rate": 7.5973497376449304e-06, |
| "loss": 0.5126, |
| "step": 14400 |
| }, |
| { |
| "epoch": 1.8015210073556913, |
| "grad_norm": 1.0656771659851074, |
| "learning_rate": 7.530703130178781e-06, |
| "loss": 0.5149, |
| "step": 14450 |
| }, |
| { |
| "epoch": 1.8077546440593442, |
| "grad_norm": 1.032486081123352, |
| "learning_rate": 7.46417312226837e-06, |
| "loss": 0.5174, |
| "step": 14500 |
| }, |
| { |
| "epoch": 1.813988280762997, |
| "grad_norm": 0.996094286441803, |
| "learning_rate": 7.397762855443374e-06, |
| "loss": 0.5112, |
| "step": 14550 |
| }, |
| { |
| "epoch": 1.82022191746665, |
| "grad_norm": 1.0176514387130737, |
| "learning_rate": 7.331475465579303e-06, |
| "loss": 0.5063, |
| "step": 14600 |
| }, |
| { |
| "epoch": 1.826455554170303, |
| "grad_norm": 1.049377202987671, |
| "learning_rate": 7.265314082749471e-06, |
| "loss": 0.5082, |
| "step": 14650 |
| }, |
| { |
| "epoch": 1.832689190873956, |
| "grad_norm": 1.0229572057724, |
| "learning_rate": 7.199281831077148e-06, |
| "loss": 0.513, |
| "step": 14700 |
| }, |
| { |
| "epoch": 1.8389228275776088, |
| "grad_norm": 0.9690967202186584, |
| "learning_rate": 7.133381828588088e-06, |
| "loss": 0.5052, |
| "step": 14750 |
| }, |
| { |
| "epoch": 1.8451564642812617, |
| "grad_norm": 1.0039163827896118, |
| "learning_rate": 7.0676171870632646e-06, |
| "loss": 0.5082, |
| "step": 14800 |
| }, |
| { |
| "epoch": 1.8513901009849145, |
| "grad_norm": 1.0232114791870117, |
| "learning_rate": 7.001991011891936e-06, |
| "loss": 0.5158, |
| "step": 14850 |
| }, |
| { |
| "epoch": 1.8576237376885674, |
| "grad_norm": 0.964314341545105, |
| "learning_rate": 6.93650640192502e-06, |
| "loss": 0.5056, |
| "step": 14900 |
| }, |
| { |
| "epoch": 1.8638573743922204, |
| "grad_norm": 1.1142281293869019, |
| "learning_rate": 6.871166449328759e-06, |
| "loss": 0.5137, |
| "step": 14950 |
| }, |
| { |
| "epoch": 1.8700910110958733, |
| "grad_norm": 1.028795838356018, |
| "learning_rate": 6.8059742394387215e-06, |
| "loss": 0.509, |
| "step": 15000 |
| }, |
| { |
| "epoch": 1.8763246477995263, |
| "grad_norm": 1.0885367393493652, |
| "learning_rate": 6.74093285061409e-06, |
| "loss": 0.5091, |
| "step": 15050 |
| }, |
| { |
| "epoch": 1.8825582845031792, |
| "grad_norm": 1.0302170515060425, |
| "learning_rate": 6.67604535409233e-06, |
| "loss": 0.5134, |
| "step": 15100 |
| }, |
| { |
| "epoch": 1.888791921206832, |
| "grad_norm": 1.0290040969848633, |
| "learning_rate": 6.611314813844139e-06, |
| "loss": 0.5103, |
| "step": 15150 |
| }, |
| { |
| "epoch": 1.8950255579104849, |
| "grad_norm": 1.0670026540756226, |
| "learning_rate": 6.54674428642879e-06, |
| "loss": 0.5117, |
| "step": 15200 |
| }, |
| { |
| "epoch": 1.9012591946141377, |
| "grad_norm": 1.023722767829895, |
| "learning_rate": 6.482336820849784e-06, |
| "loss": 0.5097, |
| "step": 15250 |
| }, |
| { |
| "epoch": 1.9074928313177908, |
| "grad_norm": 0.9924819469451904, |
| "learning_rate": 6.418095458410894e-06, |
| "loss": 0.5135, |
| "step": 15300 |
| }, |
| { |
| "epoch": 1.9137264680214439, |
| "grad_norm": 1.0372419357299805, |
| "learning_rate": 6.3540232325725325e-06, |
| "loss": 0.51, |
| "step": 15350 |
| }, |
| { |
| "epoch": 1.9199601047250967, |
| "grad_norm": 1.0652357339859009, |
| "learning_rate": 6.2901231688085416e-06, |
| "loss": 0.5093, |
| "step": 15400 |
| }, |
| { |
| "epoch": 1.9261937414287496, |
| "grad_norm": 0.9893951416015625, |
| "learning_rate": 6.226398284463306e-06, |
| "loss": 0.5089, |
| "step": 15450 |
| }, |
| { |
| "epoch": 1.9324273781324024, |
| "grad_norm": 1.0378330945968628, |
| "learning_rate": 6.16285158860928e-06, |
| "loss": 0.5072, |
| "step": 15500 |
| }, |
| { |
| "epoch": 1.9386610148360552, |
| "grad_norm": 1.0170749425888062, |
| "learning_rate": 6.099486081904914e-06, |
| "loss": 0.5108, |
| "step": 15550 |
| }, |
| { |
| "epoch": 1.9448946515397083, |
| "grad_norm": 1.0508410930633545, |
| "learning_rate": 6.036304756452942e-06, |
| "loss": 0.5063, |
| "step": 15600 |
| }, |
| { |
| "epoch": 1.9511282882433612, |
| "grad_norm": 0.998593270778656, |
| "learning_rate": 5.973310595659123e-06, |
| "loss": 0.5129, |
| "step": 15650 |
| }, |
| { |
| "epoch": 1.9573619249470142, |
| "grad_norm": 1.011422872543335, |
| "learning_rate": 5.91050657409133e-06, |
| "loss": 0.5116, |
| "step": 15700 |
| }, |
| { |
| "epoch": 1.963595561650667, |
| "grad_norm": 1.0654633045196533, |
| "learning_rate": 5.847895657339131e-06, |
| "loss": 0.5098, |
| "step": 15750 |
| }, |
| { |
| "epoch": 1.96982919835432, |
| "grad_norm": 1.0089906454086304, |
| "learning_rate": 5.785480801873717e-06, |
| "loss": 0.5094, |
| "step": 15800 |
| }, |
| { |
| "epoch": 1.9760628350579728, |
| "grad_norm": 0.9850724339485168, |
| "learning_rate": 5.723264954908329e-06, |
| "loss": 0.5108, |
| "step": 15850 |
| }, |
| { |
| "epoch": 1.9822964717616256, |
| "grad_norm": 1.0490802526474, |
| "learning_rate": 5.661251054259072e-06, |
| "loss": 0.5102, |
| "step": 15900 |
| }, |
| { |
| "epoch": 1.9885301084652787, |
| "grad_norm": 1.0309854745864868, |
| "learning_rate": 5.5994420282062e-06, |
| "loss": 0.5049, |
| "step": 15950 |
| }, |
| { |
| "epoch": 1.9947637451689315, |
| "grad_norm": 1.0320066213607788, |
| "learning_rate": 5.537840795355844e-06, |
| "loss": 0.5085, |
| "step": 16000 |
| }, |
| { |
| "epoch": 1.9947637451689315, |
| "eval_loss": 0.523627519607544, |
| "eval_runtime": 342.119, |
| "eval_samples_per_second": 53.587, |
| "eval_steps_per_second": 6.699, |
| "step": 16000 |
| }, |
| { |
| "epoch": 2.0009973818725846, |
| "grad_norm": 1.0371078252792358, |
| "learning_rate": 5.47645026450219e-06, |
| "loss": 0.5068, |
| "step": 16050 |
| }, |
| { |
| "epoch": 2.0072310185762374, |
| "grad_norm": 1.0706619024276733, |
| "learning_rate": 5.4152733344901344e-06, |
| "loss": 0.4852, |
| "step": 16100 |
| }, |
| { |
| "epoch": 2.0134646552798903, |
| "grad_norm": 1.088426113128662, |
| "learning_rate": 5.354312894078395e-06, |
| "loss": 0.4924, |
| "step": 16150 |
| }, |
| { |
| "epoch": 2.019698291983543, |
| "grad_norm": 1.0751584768295288, |
| "learning_rate": 5.293571821803107e-06, |
| "loss": 0.4915, |
| "step": 16200 |
| }, |
| { |
| "epoch": 2.025931928687196, |
| "grad_norm": 1.0685985088348389, |
| "learning_rate": 5.2330529858419e-06, |
| "loss": 0.4848, |
| "step": 16250 |
| }, |
| { |
| "epoch": 2.032165565390849, |
| "grad_norm": 1.0188616514205933, |
| "learning_rate": 5.172759243878465e-06, |
| "loss": 0.4916, |
| "step": 16300 |
| }, |
| { |
| "epoch": 2.038399202094502, |
| "grad_norm": 1.0863087177276611, |
| "learning_rate": 5.112693442967606e-06, |
| "loss": 0.4966, |
| "step": 16350 |
| }, |
| { |
| "epoch": 2.044632838798155, |
| "grad_norm": 1.0906935930252075, |
| "learning_rate": 5.052858419400823e-06, |
| "loss": 0.4912, |
| "step": 16400 |
| }, |
| { |
| "epoch": 2.050866475501808, |
| "grad_norm": 1.1264746189117432, |
| "learning_rate": 4.993256998572349e-06, |
| "loss": 0.4926, |
| "step": 16450 |
| }, |
| { |
| "epoch": 2.0571001122054606, |
| "grad_norm": 1.0933997631072998, |
| "learning_rate": 4.933891994845781e-06, |
| "loss": 0.4861, |
| "step": 16500 |
| }, |
| { |
| "epoch": 2.0633337489091135, |
| "grad_norm": 1.0643893480300903, |
| "learning_rate": 4.874766211421137e-06, |
| "loss": 0.4886, |
| "step": 16550 |
| }, |
| { |
| "epoch": 2.0695673856127663, |
| "grad_norm": 1.1163561344146729, |
| "learning_rate": 4.815882440202541e-06, |
| "loss": 0.4931, |
| "step": 16600 |
| }, |
| { |
| "epoch": 2.0758010223164196, |
| "grad_norm": 1.1243481636047363, |
| "learning_rate": 4.757243461666341e-06, |
| "loss": 0.4927, |
| "step": 16650 |
| }, |
| { |
| "epoch": 2.0820346590200725, |
| "grad_norm": 1.0660301446914673, |
| "learning_rate": 4.698852044729848e-06, |
| "loss": 0.4932, |
| "step": 16700 |
| }, |
| { |
| "epoch": 2.0882682957237253, |
| "grad_norm": 1.1082499027252197, |
| "learning_rate": 4.640710946620579e-06, |
| "loss": 0.4914, |
| "step": 16750 |
| }, |
| { |
| "epoch": 2.094501932427378, |
| "grad_norm": 1.1362559795379639, |
| "learning_rate": 4.58282291274606e-06, |
| "loss": 0.4886, |
| "step": 16800 |
| }, |
| { |
| "epoch": 2.100735569131031, |
| "grad_norm": 1.0977436304092407, |
| "learning_rate": 4.525190676564189e-06, |
| "loss": 0.4927, |
| "step": 16850 |
| }, |
| { |
| "epoch": 2.106969205834684, |
| "grad_norm": 1.1538190841674805, |
| "learning_rate": 4.467816959454166e-06, |
| "loss": 0.4934, |
| "step": 16900 |
| }, |
| { |
| "epoch": 2.1132028425383367, |
| "grad_norm": 1.1539462804794312, |
| "learning_rate": 4.4107044705879835e-06, |
| "loss": 0.491, |
| "step": 16950 |
| }, |
| { |
| "epoch": 2.11943647924199, |
| "grad_norm": 1.1588478088378906, |
| "learning_rate": 4.353855906802508e-06, |
| "loss": 0.4867, |
| "step": 17000 |
| }, |
| { |
| "epoch": 2.125670115945643, |
| "grad_norm": 1.1863791942596436, |
| "learning_rate": 4.297273952472128e-06, |
| "loss": 0.4897, |
| "step": 17050 |
| }, |
| { |
| "epoch": 2.1319037526492957, |
| "grad_norm": 1.0877330303192139, |
| "learning_rate": 4.24096127938201e-06, |
| "loss": 0.4822, |
| "step": 17100 |
| }, |
| { |
| "epoch": 2.1381373893529485, |
| "grad_norm": 1.1087538003921509, |
| "learning_rate": 4.184920546601927e-06, |
| "loss": 0.4859, |
| "step": 17150 |
| }, |
| { |
| "epoch": 2.1443710260566013, |
| "grad_norm": 1.1509010791778564, |
| "learning_rate": 4.129154400360691e-06, |
| "loss": 0.4891, |
| "step": 17200 |
| }, |
| { |
| "epoch": 2.150604662760254, |
| "grad_norm": 1.1167641878128052, |
| "learning_rate": 4.073665473921232e-06, |
| "loss": 0.4915, |
| "step": 17250 |
| }, |
| { |
| "epoch": 2.156838299463907, |
| "grad_norm": 1.1157797574996948, |
| "learning_rate": 4.018456387456207e-06, |
| "loss": 0.4824, |
| "step": 17300 |
| }, |
| { |
| "epoch": 2.1630719361675603, |
| "grad_norm": 1.1385409832000732, |
| "learning_rate": 3.963529747924326e-06, |
| "loss": 0.4865, |
| "step": 17350 |
| }, |
| { |
| "epoch": 2.169305572871213, |
| "grad_norm": 1.1239948272705078, |
| "learning_rate": 3.90888814894721e-06, |
| "loss": 0.4904, |
| "step": 17400 |
| }, |
| { |
| "epoch": 2.175539209574866, |
| "grad_norm": 1.0895740985870361, |
| "learning_rate": 3.854534170686943e-06, |
| "loss": 0.4876, |
| "step": 17450 |
| }, |
| { |
| "epoch": 2.181772846278519, |
| "grad_norm": 1.1307170391082764, |
| "learning_rate": 3.8004703797242514e-06, |
| "loss": 0.4866, |
| "step": 17500 |
| }, |
| { |
| "epoch": 2.1880064829821717, |
| "grad_norm": 1.1071311235427856, |
| "learning_rate": 3.746699328937261e-06, |
| "loss": 0.4899, |
| "step": 17550 |
| }, |
| { |
| "epoch": 2.1942401196858246, |
| "grad_norm": 1.146642804145813, |
| "learning_rate": 3.693223557381016e-06, |
| "loss": 0.4859, |
| "step": 17600 |
| }, |
| { |
| "epoch": 2.200473756389478, |
| "grad_norm": 1.14662504196167, |
| "learning_rate": 3.6400455901675248e-06, |
| "loss": 0.4866, |
| "step": 17650 |
| }, |
| { |
| "epoch": 2.2067073930931307, |
| "grad_norm": 1.1178981065750122, |
| "learning_rate": 3.5871679383465687e-06, |
| "loss": 0.4919, |
| "step": 17700 |
| }, |
| { |
| "epoch": 2.2129410297967835, |
| "grad_norm": 1.1385935544967651, |
| "learning_rate": 3.534593098787107e-06, |
| "loss": 0.4907, |
| "step": 17750 |
| }, |
| { |
| "epoch": 2.2191746665004364, |
| "grad_norm": 1.1478828191757202, |
| "learning_rate": 3.4823235540593857e-06, |
| "loss": 0.4923, |
| "step": 17800 |
| }, |
| { |
| "epoch": 2.2254083032040892, |
| "grad_norm": 1.1568485498428345, |
| "learning_rate": 3.4303617723177085e-06, |
| "loss": 0.4955, |
| "step": 17850 |
| }, |
| { |
| "epoch": 2.231641939907742, |
| "grad_norm": 1.1410853862762451, |
| "learning_rate": 3.3787102071838907e-06, |
| "loss": 0.4895, |
| "step": 17900 |
| }, |
| { |
| "epoch": 2.237875576611395, |
| "grad_norm": 1.1371911764144897, |
| "learning_rate": 3.3273712976313966e-06, |
| "loss": 0.4952, |
| "step": 17950 |
| }, |
| { |
| "epoch": 2.244109213315048, |
| "grad_norm": 1.1390269994735718, |
| "learning_rate": 3.2763474678701847e-06, |
| "loss": 0.4942, |
| "step": 18000 |
| }, |
| { |
| "epoch": 2.244109213315048, |
| "eval_loss": 0.5241288542747498, |
| "eval_runtime": 340.6458, |
| "eval_samples_per_second": 53.818, |
| "eval_steps_per_second": 6.728, |
| "step": 18000 |
| }, |
| { |
| "epoch": 2.250342850018701, |
| "grad_norm": 1.1387605667114258, |
| "learning_rate": 3.2256411272322097e-06, |
| "loss": 0.4917, |
| "step": 18050 |
| }, |
| { |
| "epoch": 2.256576486722354, |
| "grad_norm": 1.1434866189956665, |
| "learning_rate": 3.175254670057698e-06, |
| "loss": 0.4891, |
| "step": 18100 |
| }, |
| { |
| "epoch": 2.2628101234260067, |
| "grad_norm": 1.196864128112793, |
| "learning_rate": 3.125190475582034e-06, |
| "loss": 0.4881, |
| "step": 18150 |
| }, |
| { |
| "epoch": 2.2690437601296596, |
| "grad_norm": 1.175753116607666, |
| "learning_rate": 3.0754509078234663e-06, |
| "loss": 0.4874, |
| "step": 18200 |
| }, |
| { |
| "epoch": 2.2752773968333124, |
| "grad_norm": 1.183666706085205, |
| "learning_rate": 3.0260383154714425e-06, |
| "loss": 0.487, |
| "step": 18250 |
| }, |
| { |
| "epoch": 2.2815110335369653, |
| "grad_norm": 1.1405208110809326, |
| "learning_rate": 2.9769550317757078e-06, |
| "loss": 0.497, |
| "step": 18300 |
| }, |
| { |
| "epoch": 2.2877446702406186, |
| "grad_norm": 1.082384467124939, |
| "learning_rate": 2.9282033744361613e-06, |
| "loss": 0.4868, |
| "step": 18350 |
| }, |
| { |
| "epoch": 2.2939783069442714, |
| "grad_norm": 1.2056792974472046, |
| "learning_rate": 2.8797856454933694e-06, |
| "loss": 0.4912, |
| "step": 18400 |
| }, |
| { |
| "epoch": 2.3002119436479243, |
| "grad_norm": 1.1480531692504883, |
| "learning_rate": 2.831704131219899e-06, |
| "loss": 0.4899, |
| "step": 18450 |
| }, |
| { |
| "epoch": 2.306445580351577, |
| "grad_norm": 1.1439813375473022, |
| "learning_rate": 2.7839611020123447e-06, |
| "loss": 0.4959, |
| "step": 18500 |
| }, |
| { |
| "epoch": 2.31267921705523, |
| "grad_norm": 1.1246402263641357, |
| "learning_rate": 2.7365588122841227e-06, |
| "loss": 0.4917, |
| "step": 18550 |
| }, |
| { |
| "epoch": 2.318912853758883, |
| "grad_norm": 1.1865195035934448, |
| "learning_rate": 2.689499500359022e-06, |
| "loss": 0.4861, |
| "step": 18600 |
| }, |
| { |
| "epoch": 2.3251464904625356, |
| "grad_norm": 1.115790605545044, |
| "learning_rate": 2.6427853883655085e-06, |
| "loss": 0.4971, |
| "step": 18650 |
| }, |
| { |
| "epoch": 2.331380127166189, |
| "grad_norm": 1.1578739881515503, |
| "learning_rate": 2.5964186821317963e-06, |
| "loss": 0.4878, |
| "step": 18700 |
| }, |
| { |
| "epoch": 2.3376137638698418, |
| "grad_norm": 1.16416597366333, |
| "learning_rate": 2.550401571081692e-06, |
| "loss": 0.4893, |
| "step": 18750 |
| }, |
| { |
| "epoch": 2.3438474005734946, |
| "grad_norm": 1.160459041595459, |
| "learning_rate": 2.5047362281312004e-06, |
| "loss": 0.4893, |
| "step": 18800 |
| }, |
| { |
| "epoch": 2.3500810372771475, |
| "grad_norm": 1.1183321475982666, |
| "learning_rate": 2.459424809585943e-06, |
| "loss": 0.4863, |
| "step": 18850 |
| }, |
| { |
| "epoch": 2.3563146739808003, |
| "grad_norm": 1.1559134721755981, |
| "learning_rate": 2.41446945503931e-06, |
| "loss": 0.4906, |
| "step": 18900 |
| }, |
| { |
| "epoch": 2.362548310684453, |
| "grad_norm": 1.1574698686599731, |
| "learning_rate": 2.3698722872714486e-06, |
| "loss": 0.4891, |
| "step": 18950 |
| }, |
| { |
| "epoch": 2.368781947388106, |
| "grad_norm": 1.1323516368865967, |
| "learning_rate": 2.3256354121490197e-06, |
| "loss": 0.4904, |
| "step": 19000 |
| }, |
| { |
| "epoch": 2.3750155840917593, |
| "grad_norm": 1.1418200731277466, |
| "learning_rate": 2.2817609185257493e-06, |
| "loss": 0.4847, |
| "step": 19050 |
| }, |
| { |
| "epoch": 2.381249220795412, |
| "grad_norm": 1.1572823524475098, |
| "learning_rate": 2.2382508781438217e-06, |
| "loss": 0.4907, |
| "step": 19100 |
| }, |
| { |
| "epoch": 2.387482857499065, |
| "grad_norm": 1.1278266906738281, |
| "learning_rate": 2.195107345536013e-06, |
| "loss": 0.4954, |
| "step": 19150 |
| }, |
| { |
| "epoch": 2.393716494202718, |
| "grad_norm": 1.1290051937103271, |
| "learning_rate": 2.152332357928719e-06, |
| "loss": 0.4822, |
| "step": 19200 |
| }, |
| { |
| "epoch": 2.3999501309063707, |
| "grad_norm": 1.236810564994812, |
| "learning_rate": 2.109927935145718e-06, |
| "loss": 0.4883, |
| "step": 19250 |
| }, |
| { |
| "epoch": 2.4061837676100235, |
| "grad_norm": 1.193284511566162, |
| "learning_rate": 2.0678960795128234e-06, |
| "loss": 0.4781, |
| "step": 19300 |
| }, |
| { |
| "epoch": 2.4124174043136764, |
| "grad_norm": 1.1248869895935059, |
| "learning_rate": 2.026238775763322e-06, |
| "loss": 0.4822, |
| "step": 19350 |
| }, |
| { |
| "epoch": 2.4186510410173296, |
| "grad_norm": 1.124065637588501, |
| "learning_rate": 1.9849579909442595e-06, |
| "loss": 0.4928, |
| "step": 19400 |
| }, |
| { |
| "epoch": 2.4248846777209825, |
| "grad_norm": 1.1190842390060425, |
| "learning_rate": 1.944055674323554e-06, |
| "loss": 0.484, |
| "step": 19450 |
| }, |
| { |
| "epoch": 2.4311183144246353, |
| "grad_norm": 1.1386178731918335, |
| "learning_rate": 1.9035337572979561e-06, |
| "loss": 0.4901, |
| "step": 19500 |
| }, |
| { |
| "epoch": 2.437351951128288, |
| "grad_norm": 1.2505484819412231, |
| "learning_rate": 1.8633941533018428e-06, |
| "loss": 0.4939, |
| "step": 19550 |
| }, |
| { |
| "epoch": 2.443585587831941, |
| "grad_norm": 1.1106178760528564, |
| "learning_rate": 1.8236387577168735e-06, |
| "loss": 0.4882, |
| "step": 19600 |
| }, |
| { |
| "epoch": 2.4498192245355943, |
| "grad_norm": 1.1336780786514282, |
| "learning_rate": 1.784269447782484e-06, |
| "loss": 0.4853, |
| "step": 19650 |
| }, |
| { |
| "epoch": 2.456052861239247, |
| "grad_norm": 1.2004270553588867, |
| "learning_rate": 1.7452880825072448e-06, |
| "loss": 0.4825, |
| "step": 19700 |
| }, |
| { |
| "epoch": 2.4622864979429, |
| "grad_norm": 1.18308687210083, |
| "learning_rate": 1.7066965025810844e-06, |
| "loss": 0.4862, |
| "step": 19750 |
| }, |
| { |
| "epoch": 2.468520134646553, |
| "grad_norm": 1.110257863998413, |
| "learning_rate": 1.668496530288366e-06, |
| "loss": 0.4874, |
| "step": 19800 |
| }, |
| { |
| "epoch": 2.4747537713502057, |
| "grad_norm": 1.1742489337921143, |
| "learning_rate": 1.6306899694218436e-06, |
| "loss": 0.4829, |
| "step": 19850 |
| }, |
| { |
| "epoch": 2.4809874080538585, |
| "grad_norm": 1.2101446390151978, |
| "learning_rate": 1.5932786051974792e-06, |
| "loss": 0.4898, |
| "step": 19900 |
| }, |
| { |
| "epoch": 2.4872210447575114, |
| "grad_norm": 1.154740571975708, |
| "learning_rate": 1.556264204170167e-06, |
| "loss": 0.4927, |
| "step": 19950 |
| }, |
| { |
| "epoch": 2.4934546814611647, |
| "grad_norm": 1.1110808849334717, |
| "learning_rate": 1.519648514150286e-06, |
| "loss": 0.4821, |
| "step": 20000 |
| }, |
| { |
| "epoch": 2.4934546814611647, |
| "eval_loss": 0.5227229595184326, |
| "eval_runtime": 342.2762, |
| "eval_samples_per_second": 53.562, |
| "eval_steps_per_second": 6.696, |
| "step": 20000 |
| }, |
| { |
| "epoch": 2.4996883181648175, |
| "grad_norm": 1.1265305280685425, |
| "learning_rate": 1.4834332641211956e-06, |
| "loss": 0.4924, |
| "step": 20050 |
| }, |
| { |
| "epoch": 2.5059219548684704, |
| "grad_norm": 1.1256088018417358, |
| "learning_rate": 1.4476201641575793e-06, |
| "loss": 0.4873, |
| "step": 20100 |
| }, |
| { |
| "epoch": 2.512155591572123, |
| "grad_norm": 1.1528393030166626, |
| "learning_rate": 1.4122109053446997e-06, |
| "loss": 0.4799, |
| "step": 20150 |
| }, |
| { |
| "epoch": 2.518389228275776, |
| "grad_norm": 1.158719539642334, |
| "learning_rate": 1.3772071596985448e-06, |
| "loss": 0.4809, |
| "step": 20200 |
| }, |
| { |
| "epoch": 2.524622864979429, |
| "grad_norm": 1.1396048069000244, |
| "learning_rate": 1.3426105800868782e-06, |
| "loss": 0.4954, |
| "step": 20250 |
| }, |
| { |
| "epoch": 2.5308565016830817, |
| "grad_norm": 1.1778205633163452, |
| "learning_rate": 1.3084228001511867e-06, |
| "loss": 0.4886, |
| "step": 20300 |
| }, |
| { |
| "epoch": 2.537090138386735, |
| "grad_norm": 1.1746467351913452, |
| "learning_rate": 1.2746454342295456e-06, |
| "loss": 0.4903, |
| "step": 20350 |
| }, |
| { |
| "epoch": 2.543323775090388, |
| "grad_norm": 1.1871678829193115, |
| "learning_rate": 1.2412800772803846e-06, |
| "loss": 0.485, |
| "step": 20400 |
| }, |
| { |
| "epoch": 2.5495574117940407, |
| "grad_norm": 1.1898722648620605, |
| "learning_rate": 1.208328304807178e-06, |
| "loss": 0.487, |
| "step": 20450 |
| }, |
| { |
| "epoch": 2.5557910484976936, |
| "grad_norm": 1.1466889381408691, |
| "learning_rate": 1.1757916727840502e-06, |
| "loss": 0.4884, |
| "step": 20500 |
| }, |
| { |
| "epoch": 2.5620246852013464, |
| "grad_norm": 1.174965500831604, |
| "learning_rate": 1.1436717175822976e-06, |
| "loss": 0.4919, |
| "step": 20550 |
| }, |
| { |
| "epoch": 2.5682583219049993, |
| "grad_norm": 1.2029653787612915, |
| "learning_rate": 1.1119699558978525e-06, |
| "loss": 0.4908, |
| "step": 20600 |
| }, |
| { |
| "epoch": 2.574491958608652, |
| "grad_norm": 1.1164456605911255, |
| "learning_rate": 1.0806878846796454e-06, |
| "loss": 0.4903, |
| "step": 20650 |
| }, |
| { |
| "epoch": 2.5807255953123054, |
| "grad_norm": 1.139102816581726, |
| "learning_rate": 1.0498269810589501e-06, |
| "loss": 0.4904, |
| "step": 20700 |
| }, |
| { |
| "epoch": 2.5869592320159582, |
| "grad_norm": 1.1943669319152832, |
| "learning_rate": 1.019388702279599e-06, |
| "loss": 0.4883, |
| "step": 20750 |
| }, |
| { |
| "epoch": 2.593192868719611, |
| "grad_norm": 1.1451510190963745, |
| "learning_rate": 9.89374485629202e-07, |
| "loss": 0.4911, |
| "step": 20800 |
| }, |
| { |
| "epoch": 2.599426505423264, |
| "grad_norm": 1.1352810859680176, |
| "learning_rate": 9.59785748371257e-07, |
| "loss": 0.481, |
| "step": 20850 |
| }, |
| { |
| "epoch": 2.6056601421269168, |
| "grad_norm": 1.1264628171920776, |
| "learning_rate": 9.306238876782381e-07, |
| "loss": 0.4878, |
| "step": 20900 |
| }, |
| { |
| "epoch": 2.6118937788305696, |
| "grad_norm": 1.1740506887435913, |
| "learning_rate": 9.018902805656249e-07, |
| "loss": 0.4843, |
| "step": 20950 |
| }, |
| { |
| "epoch": 2.6181274155342225, |
| "grad_norm": 1.189261794090271, |
| "learning_rate": 8.735862838268638e-07, |
| "loss": 0.486, |
| "step": 21000 |
| }, |
| { |
| "epoch": 2.6243610522378757, |
| "grad_norm": 1.1971431970596313, |
| "learning_rate": 8.457132339693231e-07, |
| "loss": 0.4808, |
| "step": 21050 |
| }, |
| { |
| "epoch": 2.6305946889415286, |
| "grad_norm": 1.1666436195373535, |
| "learning_rate": 8.182724471511605e-07, |
| "loss": 0.4933, |
| "step": 21100 |
| }, |
| { |
| "epoch": 2.6368283256451814, |
| "grad_norm": 1.1445931196212769, |
| "learning_rate": 7.912652191191905e-07, |
| "loss": 0.4884, |
| "step": 21150 |
| }, |
| { |
| "epoch": 2.6430619623488343, |
| "grad_norm": 1.2019484043121338, |
| "learning_rate": 7.64692825147696e-07, |
| "loss": 0.4892, |
| "step": 21200 |
| }, |
| { |
| "epoch": 2.649295599052487, |
| "grad_norm": 1.184685468673706, |
| "learning_rate": 7.385565199782063e-07, |
| "loss": 0.4874, |
| "step": 21250 |
| }, |
| { |
| "epoch": 2.65552923575614, |
| "grad_norm": 1.1623753309249878, |
| "learning_rate": 7.128575377602509e-07, |
| "loss": 0.4862, |
| "step": 21300 |
| }, |
| { |
| "epoch": 2.661762872459793, |
| "grad_norm": 1.17208993434906, |
| "learning_rate": 6.87597091993083e-07, |
| "loss": 0.4841, |
| "step": 21350 |
| }, |
| { |
| "epoch": 2.667996509163446, |
| "grad_norm": 1.1734015941619873, |
| "learning_rate": 6.627763754683824e-07, |
| "loss": 0.4904, |
| "step": 21400 |
| }, |
| { |
| "epoch": 2.674230145867099, |
| "grad_norm": 1.1537295579910278, |
| "learning_rate": 6.383965602139253e-07, |
| "loss": 0.49, |
| "step": 21450 |
| }, |
| { |
| "epoch": 2.680463782570752, |
| "grad_norm": 1.1923189163208008, |
| "learning_rate": 6.144587974382399e-07, |
| "loss": 0.4856, |
| "step": 21500 |
| }, |
| { |
| "epoch": 2.6866974192744046, |
| "grad_norm": 1.1346355676651, |
| "learning_rate": 5.909642174762642e-07, |
| "loss": 0.4838, |
| "step": 21550 |
| }, |
| { |
| "epoch": 2.6929310559780575, |
| "grad_norm": 1.2050766944885254, |
| "learning_rate": 5.679139297359448e-07, |
| "loss": 0.4884, |
| "step": 21600 |
| }, |
| { |
| "epoch": 2.6991646926817108, |
| "grad_norm": 1.2148991823196411, |
| "learning_rate": 5.453090226458758e-07, |
| "loss": 0.4857, |
| "step": 21650 |
| }, |
| { |
| "epoch": 2.705398329385363, |
| "grad_norm": 1.1881086826324463, |
| "learning_rate": 5.231505636038881e-07, |
| "loss": 0.4927, |
| "step": 21700 |
| }, |
| { |
| "epoch": 2.7116319660890165, |
| "grad_norm": 1.219338297843933, |
| "learning_rate": 5.014395989266496e-07, |
| "loss": 0.4894, |
| "step": 21750 |
| }, |
| { |
| "epoch": 2.7178656027926693, |
| "grad_norm": 1.163949728012085, |
| "learning_rate": 4.801771538002687e-07, |
| "loss": 0.4859, |
| "step": 21800 |
| }, |
| { |
| "epoch": 2.724099239496322, |
| "grad_norm": 1.17982816696167, |
| "learning_rate": 4.59364232231867e-07, |
| "loss": 0.4896, |
| "step": 21850 |
| }, |
| { |
| "epoch": 2.730332876199975, |
| "grad_norm": 1.1526880264282227, |
| "learning_rate": 4.3900181700219035e-07, |
| "loss": 0.4896, |
| "step": 21900 |
| }, |
| { |
| "epoch": 2.736566512903628, |
| "grad_norm": 1.204048991203308, |
| "learning_rate": 4.190908696191853e-07, |
| "loss": 0.4908, |
| "step": 21950 |
| }, |
| { |
| "epoch": 2.742800149607281, |
| "grad_norm": 1.1354132890701294, |
| "learning_rate": 3.9963233027260794e-07, |
| "loss": 0.4833, |
| "step": 22000 |
| }, |
| { |
| "epoch": 2.742800149607281, |
| "eval_loss": 0.5217877626419067, |
| "eval_runtime": 342.2156, |
| "eval_samples_per_second": 53.571, |
| "eval_steps_per_second": 6.698, |
| "step": 22000 |
| }, |
| { |
| "epoch": 2.7490337863109335, |
| "grad_norm": 1.1562778949737549, |
| "learning_rate": 3.806271177896248e-07, |
| "loss": 0.494, |
| "step": 22050 |
| }, |
| { |
| "epoch": 2.755267423014587, |
| "grad_norm": 1.1327722072601318, |
| "learning_rate": 3.6207612959142213e-07, |
| "loss": 0.4829, |
| "step": 22100 |
| }, |
| { |
| "epoch": 2.7615010597182397, |
| "grad_norm": 1.1516318321228027, |
| "learning_rate": 3.4398024165083864e-07, |
| "loss": 0.4876, |
| "step": 22150 |
| }, |
| { |
| "epoch": 2.7677346964218925, |
| "grad_norm": 1.1770250797271729, |
| "learning_rate": 3.2634030845099417e-07, |
| "loss": 0.4858, |
| "step": 22200 |
| }, |
| { |
| "epoch": 2.7739683331255454, |
| "grad_norm": 1.1872680187225342, |
| "learning_rate": 3.0915716294494193e-07, |
| "loss": 0.4914, |
| "step": 22250 |
| }, |
| { |
| "epoch": 2.780201969829198, |
| "grad_norm": 1.1848747730255127, |
| "learning_rate": 2.9243161651634654e-07, |
| "loss": 0.4866, |
| "step": 22300 |
| }, |
| { |
| "epoch": 2.7864356065328515, |
| "grad_norm": 1.0984443426132202, |
| "learning_rate": 2.7616445894115607e-07, |
| "loss": 0.4914, |
| "step": 22350 |
| }, |
| { |
| "epoch": 2.7926692432365043, |
| "grad_norm": 1.195947289466858, |
| "learning_rate": 2.6035645835032044e-07, |
| "loss": 0.4889, |
| "step": 22400 |
| }, |
| { |
| "epoch": 2.798902879940157, |
| "grad_norm": 1.1779003143310547, |
| "learning_rate": 2.4500836119351503e-07, |
| "loss": 0.4878, |
| "step": 22450 |
| }, |
| { |
| "epoch": 2.80513651664381, |
| "grad_norm": 1.1525861024856567, |
| "learning_rate": 2.301208922038911e-07, |
| "loss": 0.4827, |
| "step": 22500 |
| }, |
| { |
| "epoch": 2.811370153347463, |
| "grad_norm": 1.1666817665100098, |
| "learning_rate": 2.1569475436386546e-07, |
| "loss": 0.4849, |
| "step": 22550 |
| }, |
| { |
| "epoch": 2.8176037900511157, |
| "grad_norm": 1.1665741205215454, |
| "learning_rate": 2.0173062887190898e-07, |
| "loss": 0.4873, |
| "step": 22600 |
| }, |
| { |
| "epoch": 2.8238374267547686, |
| "grad_norm": 1.2046200037002563, |
| "learning_rate": 1.8822917511039818e-07, |
| "loss": 0.4843, |
| "step": 22650 |
| }, |
| { |
| "epoch": 2.830071063458422, |
| "grad_norm": 1.1819406747817993, |
| "learning_rate": 1.7519103061446552e-07, |
| "loss": 0.4925, |
| "step": 22700 |
| }, |
| { |
| "epoch": 2.8363047001620747, |
| "grad_norm": 1.1489243507385254, |
| "learning_rate": 1.626168110419013e-07, |
| "loss": 0.493, |
| "step": 22750 |
| }, |
| { |
| "epoch": 2.8425383368657275, |
| "grad_norm": 1.1299190521240234, |
| "learning_rate": 1.505071101440836e-07, |
| "loss": 0.4878, |
| "step": 22800 |
| }, |
| { |
| "epoch": 2.8487719735693804, |
| "grad_norm": 1.1881701946258545, |
| "learning_rate": 1.388624997379373e-07, |
| "loss": 0.4894, |
| "step": 22850 |
| }, |
| { |
| "epoch": 2.8550056102730332, |
| "grad_norm": 1.1636464595794678, |
| "learning_rate": 1.2768352967893582e-07, |
| "loss": 0.4855, |
| "step": 22900 |
| }, |
| { |
| "epoch": 2.861239246976686, |
| "grad_norm": 1.117014765739441, |
| "learning_rate": 1.169707278351373e-07, |
| "loss": 0.4886, |
| "step": 22950 |
| }, |
| { |
| "epoch": 2.867472883680339, |
| "grad_norm": 1.1891297101974487, |
| "learning_rate": 1.0672460006225682e-07, |
| "loss": 0.4942, |
| "step": 23000 |
| }, |
| { |
| "epoch": 2.873706520383992, |
| "grad_norm": 1.1670392751693726, |
| "learning_rate": 9.694563017978331e-08, |
| "loss": 0.4835, |
| "step": 23050 |
| }, |
| { |
| "epoch": 2.879940157087645, |
| "grad_norm": 1.1889266967773438, |
| "learning_rate": 8.763427994813112e-08, |
| "loss": 0.4951, |
| "step": 23100 |
| }, |
| { |
| "epoch": 2.886173793791298, |
| "grad_norm": 1.2196145057678223, |
| "learning_rate": 7.879098904683303e-08, |
| "loss": 0.4861, |
| "step": 23150 |
| }, |
| { |
| "epoch": 2.8924074304949507, |
| "grad_norm": 1.0960079431533813, |
| "learning_rate": 7.041617505378573e-08, |
| "loss": 0.4834, |
| "step": 23200 |
| }, |
| { |
| "epoch": 2.8986410671986036, |
| "grad_norm": 1.1753352880477905, |
| "learning_rate": 6.251023342552787e-08, |
| "loss": 0.486, |
| "step": 23250 |
| }, |
| { |
| "epoch": 2.9048747039022564, |
| "grad_norm": 1.19828462600708, |
| "learning_rate": 5.5073537478566034e-08, |
| "loss": 0.4873, |
| "step": 23300 |
| }, |
| { |
| "epoch": 2.9111083406059093, |
| "grad_norm": 1.1725274324417114, |
| "learning_rate": 4.810643837174667e-08, |
| "loss": 0.4869, |
| "step": 23350 |
| }, |
| { |
| "epoch": 2.9173419773095626, |
| "grad_norm": 1.1990926265716553, |
| "learning_rate": 4.160926508967822e-08, |
| "loss": 0.4897, |
| "step": 23400 |
| }, |
| { |
| "epoch": 2.9235756140132154, |
| "grad_norm": 1.2078295946121216, |
| "learning_rate": 3.558232442719245e-08, |
| "loss": 0.484, |
| "step": 23450 |
| }, |
| { |
| "epoch": 2.9298092507168683, |
| "grad_norm": 1.1486384868621826, |
| "learning_rate": 3.002590097485936e-08, |
| "loss": 0.4801, |
| "step": 23500 |
| }, |
| { |
| "epoch": 2.936042887420521, |
| "grad_norm": 1.1498249769210815, |
| "learning_rate": 2.4940257105547928e-08, |
| "loss": 0.4861, |
| "step": 23550 |
| }, |
| { |
| "epoch": 2.942276524124174, |
| "grad_norm": 1.2030309438705444, |
| "learning_rate": 2.0325632962039376e-08, |
| "loss": 0.4938, |
| "step": 23600 |
| }, |
| { |
| "epoch": 2.948510160827827, |
| "grad_norm": 1.2331132888793945, |
| "learning_rate": 1.6182246445685114e-08, |
| "loss": 0.4882, |
| "step": 23650 |
| }, |
| { |
| "epoch": 2.9547437975314796, |
| "grad_norm": 1.1595110893249512, |
| "learning_rate": 1.2510293206118296e-08, |
| "loss": 0.4888, |
| "step": 23700 |
| }, |
| { |
| "epoch": 2.960977434235133, |
| "grad_norm": 1.1479148864746094, |
| "learning_rate": 9.309946632015676e-09, |
| "loss": 0.4864, |
| "step": 23750 |
| }, |
| { |
| "epoch": 2.9672110709387858, |
| "grad_norm": 1.2049928903579712, |
| "learning_rate": 6.581357842909697e-09, |
| "loss": 0.4834, |
| "step": 23800 |
| }, |
| { |
| "epoch": 2.9734447076424386, |
| "grad_norm": 1.1694363355636597, |
| "learning_rate": 4.324655682051982e-09, |
| "loss": 0.4882, |
| "step": 23850 |
| }, |
| { |
| "epoch": 2.9796783443460915, |
| "grad_norm": 1.132237434387207, |
| "learning_rate": 2.5399467103337518e-09, |
| "loss": 0.4906, |
| "step": 23900 |
| }, |
| { |
| "epoch": 2.9859119810497443, |
| "grad_norm": 1.1664302349090576, |
| "learning_rate": 1.2273152012465262e-09, |
| "loss": 0.4884, |
| "step": 23950 |
| }, |
| { |
| "epoch": 2.9921456177533976, |
| "grad_norm": 1.1637336015701294, |
| "learning_rate": 3.8682313690974194e-10, |
| "loss": 0.4896, |
| "step": 24000 |
| }, |
| { |
| "epoch": 2.9921456177533976, |
| "eval_loss": 0.5216008424758911, |
| "eval_runtime": 342.4337, |
| "eval_samples_per_second": 53.537, |
| "eval_steps_per_second": 6.693, |
| "step": 24000 |
| }, |
| { |
| "epoch": 2.99837925445705, |
| "grad_norm": 1.1456950902938843, |
| "learning_rate": 1.8510205138655424e-11, |
| "loss": 0.4816, |
| "step": 24050 |
| }, |
| { |
| "epoch": 3.0, |
| "step": 24063, |
| "total_flos": 8.463357208732631e+18, |
| "train_loss": 0.0, |
| "train_runtime": 21.1751, |
| "train_samples_per_second": 18181.955, |
| "train_steps_per_second": 1136.381 |
| } |
| ], |
| "logging_steps": 50, |
| "max_steps": 24063, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 2000, |
| "stateful_callbacks": { |
| "EarlyStoppingCallback": { |
| "args": { |
| "early_stopping_patience": 2, |
| "early_stopping_threshold": 0.0 |
| }, |
| "attributes": { |
| "early_stopping_patience_counter": 0 |
| } |
| }, |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 8.463357208732631e+18, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
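
The record above is the complete `trainer_state.json` that Hugging Face `Trainer` saves alongside each checkpoint. As a minimal sketch of how to inspect a log like this one (the filename is the one `Trainer` uses, but the local path is a placeholder assumption; nothing here is part of the trainer output itself):

```python
import json

# Placeholder path: Trainer writes this file as
# <output_dir>/checkpoint-<step>/trainer_state.json.
with open("trainer_state.json") as f:
    state = json.load(f)

# Training records carry a "loss" key and evaluation records an "eval_loss"
# key; the final summary record ("train_loss", "train_runtime", ...) matches
# neither filter, so it is excluded automatically.
train = [(r["step"], r["loss"]) for r in state["log_history"] if "loss" in r]
evals = [(r["step"], r["eval_loss"]) for r in state["log_history"] if "eval_loss" in r]

for step, loss in evals:
    print(f"step {step:>6}: eval_loss = {loss:.4f}")

best_step, best_loss = min(evals, key=lambda p: p[1])
print(f"best eval_loss {best_loss:.4f} at step {best_step}")
```

Over the evaluations shown in this tail of the log, `eval_loss` bottoms out at 0.5216 on step 24000. The one regression (0.5241 at step 18000, up from 0.5236 at step 16000) would have incremented the `EarlyStoppingCallback` patience counter once; the improvement at step 20000 would have reset it, consistent with `early_stopping_patience_counter: 0` above, so training ran to `max_steps` (24063) rather than stopping early. The near-zero `train_loss` and 21-second `train_runtime` in the final summary record are likely artifacts of resuming from a late checkpoint, since `Trainer` aggregates those summary figures only over the portion of the run executed after the resume.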