| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 3.0, |
| "eval_steps": 500, |
| "global_step": 24651, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.012169891687963978, |
| "grad_norm": 937035.375, |
| "learning_rate": 1.9919678714859437e-05, |
| "loss": 3.204, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.024339783375927956, |
| "grad_norm": 1048863.375, |
| "learning_rate": 1.9838546103606345e-05, |
| "loss": 1.7829, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.03650967506389193, |
| "grad_norm": 1297520.75, |
| "learning_rate": 1.9757413492353253e-05, |
| "loss": 1.5258, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.04867956675185591, |
| "grad_norm": 876169.625, |
| "learning_rate": 1.967628088110016e-05, |
| "loss": 1.3856, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.060849458439819884, |
| "grad_norm": 1111828.5, |
| "learning_rate": 1.9595148269847068e-05, |
| "loss": 1.2483, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.07301935012778386, |
| "grad_norm": 876964.1875, |
| "learning_rate": 1.9514015658593976e-05, |
| "loss": 1.2228, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.08518924181574784, |
| "grad_norm": 667343.125, |
| "learning_rate": 1.943288304734088e-05, |
| "loss": 1.2008, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.09735913350371182, |
| "grad_norm": 1427273.75, |
| "learning_rate": 1.9351750436087787e-05, |
| "loss": 1.1185, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.10952902519167579, |
| "grad_norm": 1462503.125, |
| "learning_rate": 1.9270617824834695e-05, |
| "loss": 1.0793, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.12169891687963977, |
| "grad_norm": 704485.8125, |
| "learning_rate": 1.91894852135816e-05, |
| "loss": 1.0943, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.13386880856760375, |
| "grad_norm": 1196744.0, |
| "learning_rate": 1.9108352602328507e-05, |
| "loss": 1.0702, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.14603870025556773, |
| "grad_norm": 883526.5, |
| "learning_rate": 1.9027219991075415e-05, |
| "loss": 1.0606, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.1582085919435317, |
| "grad_norm": 857247.3125, |
| "learning_rate": 1.8946087379822322e-05, |
| "loss": 1.0503, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.1703784836314957, |
| "grad_norm": 877640.125, |
| "learning_rate": 1.8864954768569227e-05, |
| "loss": 0.9371, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.18254837531945967, |
| "grad_norm": 1136452.625, |
| "learning_rate": 1.8783822157316134e-05, |
| "loss": 0.9607, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.19471826700742365, |
| "grad_norm": 776925.5625, |
| "learning_rate": 1.8702689546063042e-05, |
| "loss": 1.0074, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.2068881586953876, |
| "grad_norm": 700356.4375, |
| "learning_rate": 1.862155693480995e-05, |
| "loss": 0.9649, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.21905805038335158, |
| "grad_norm": 889704.0, |
| "learning_rate": 1.8540424323556857e-05, |
| "loss": 0.9464, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.23122794207131556, |
| "grad_norm": 480890.5, |
| "learning_rate": 1.845929171230376e-05, |
| "loss": 0.9489, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.24339783375927954, |
| "grad_norm": 622537.625, |
| "learning_rate": 1.837815910105067e-05, |
| "loss": 0.9185, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.25556772544724354, |
| "grad_norm": 1142430.625, |
| "learning_rate": 1.8297026489797577e-05, |
| "loss": 0.9657, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.2677376171352075, |
| "grad_norm": 418717.90625, |
| "learning_rate": 1.821589387854448e-05, |
| "loss": 0.9109, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.27990750882317145, |
| "grad_norm": 1248165.75, |
| "learning_rate": 1.813476126729139e-05, |
| "loss": 0.9193, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.29207740051113545, |
| "grad_norm": 794509.5, |
| "learning_rate": 1.8053628656038296e-05, |
| "loss": 0.8797, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.3042472921990994, |
| "grad_norm": 610876.1875, |
| "learning_rate": 1.7972496044785204e-05, |
| "loss": 0.8851, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.3164171838870634, |
| "grad_norm": 1630106.5, |
| "learning_rate": 1.7891363433532108e-05, |
| "loss": 0.8487, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.32858707557502737, |
| "grad_norm": 1035289.8125, |
| "learning_rate": 1.7810230822279016e-05, |
| "loss": 0.8658, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.3407569672629914, |
| "grad_norm": 1528019.375, |
| "learning_rate": 1.7729098211025923e-05, |
| "loss": 0.8819, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.3529268589509553, |
| "grad_norm": 1408853.375, |
| "learning_rate": 1.764796559977283e-05, |
| "loss": 0.952, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.36509675063891933, |
| "grad_norm": 754760.5, |
| "learning_rate": 1.756683298851974e-05, |
| "loss": 0.928, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.3772666423268833, |
| "grad_norm": 890047.3125, |
| "learning_rate": 1.7485700377266643e-05, |
| "loss": 0.8566, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.3894365340148473, |
| "grad_norm": 564804.375, |
| "learning_rate": 1.740456776601355e-05, |
| "loss": 0.8467, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.40160642570281124, |
| "grad_norm": 540386.1875, |
| "learning_rate": 1.7323435154760458e-05, |
| "loss": 0.8995, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.4137763173907752, |
| "grad_norm": 733578.0, |
| "learning_rate": 1.7242302543507362e-05, |
| "loss": 0.8114, |
| "step": 3400 |
| }, |
| { |
| "epoch": 0.4259462090787392, |
| "grad_norm": 815934.875, |
| "learning_rate": 1.716116993225427e-05, |
| "loss": 0.856, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.43811610076670315, |
| "grad_norm": 1008148.0, |
| "learning_rate": 1.7080037321001177e-05, |
| "loss": 0.7677, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.45028599245466716, |
| "grad_norm": 708381.8125, |
| "learning_rate": 1.6998904709748085e-05, |
| "loss": 0.796, |
| "step": 3700 |
| }, |
| { |
| "epoch": 0.4624558841426311, |
| "grad_norm": 949816.8125, |
| "learning_rate": 1.6917772098494993e-05, |
| "loss": 0.8212, |
| "step": 3800 |
| }, |
| { |
| "epoch": 0.4746257758305951, |
| "grad_norm": 720175.4375, |
| "learning_rate": 1.68366394872419e-05, |
| "loss": 0.8439, |
| "step": 3900 |
| }, |
| { |
| "epoch": 0.48679566751855907, |
| "grad_norm": 582352.25, |
| "learning_rate": 1.6755506875988805e-05, |
| "loss": 0.8144, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.4989655592065231, |
| "grad_norm": 823707.625, |
| "learning_rate": 1.6674374264735712e-05, |
| "loss": 0.7497, |
| "step": 4100 |
| }, |
| { |
| "epoch": 0.5111354508944871, |
| "grad_norm": 732386.0625, |
| "learning_rate": 1.659324165348262e-05, |
| "loss": 0.8184, |
| "step": 4200 |
| }, |
| { |
| "epoch": 0.523305342582451, |
| "grad_norm": 770159.0, |
| "learning_rate": 1.6512109042229524e-05, |
| "loss": 0.7994, |
| "step": 4300 |
| }, |
| { |
| "epoch": 0.535475234270415, |
| "grad_norm": 752353.4375, |
| "learning_rate": 1.6430976430976432e-05, |
| "loss": 0.8176, |
| "step": 4400 |
| }, |
| { |
| "epoch": 0.547645125958379, |
| "grad_norm": 1363265.25, |
| "learning_rate": 1.634984381972334e-05, |
| "loss": 0.7828, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.5598150176463429, |
| "grad_norm": 1718103.0, |
| "learning_rate": 1.6268711208470247e-05, |
| "loss": 0.8077, |
| "step": 4600 |
| }, |
| { |
| "epoch": 0.5719849093343069, |
| "grad_norm": 930578.8125, |
| "learning_rate": 1.618757859721715e-05, |
| "loss": 0.7249, |
| "step": 4700 |
| }, |
| { |
| "epoch": 0.5841548010222709, |
| "grad_norm": 677129.1875, |
| "learning_rate": 1.610644598596406e-05, |
| "loss": 0.7853, |
| "step": 4800 |
| }, |
| { |
| "epoch": 0.5963246927102349, |
| "grad_norm": 619335.1875, |
| "learning_rate": 1.6025313374710967e-05, |
| "loss": 0.7623, |
| "step": 4900 |
| }, |
| { |
| "epoch": 0.6084945843981988, |
| "grad_norm": 793190.875, |
| "learning_rate": 1.5944180763457874e-05, |
| "loss": 0.7554, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.6206644760861628, |
| "grad_norm": 676459.5, |
| "learning_rate": 1.5863048152204782e-05, |
| "loss": 0.7342, |
| "step": 5100 |
| }, |
| { |
| "epoch": 0.6328343677741268, |
| "grad_norm": 1202478.25, |
| "learning_rate": 1.5781915540951686e-05, |
| "loss": 0.7866, |
| "step": 5200 |
| }, |
| { |
| "epoch": 0.6450042594620908, |
| "grad_norm": 958839.5, |
| "learning_rate": 1.5700782929698594e-05, |
| "loss": 0.8038, |
| "step": 5300 |
| }, |
| { |
| "epoch": 0.6571741511500547, |
| "grad_norm": 781309.25, |
| "learning_rate": 1.56196503184455e-05, |
| "loss": 0.7764, |
| "step": 5400 |
| }, |
| { |
| "epoch": 0.6693440428380187, |
| "grad_norm": 1317343.625, |
| "learning_rate": 1.5538517707192406e-05, |
| "loss": 0.7965, |
| "step": 5500 |
| }, |
| { |
| "epoch": 0.6815139345259827, |
| "grad_norm": 928345.4375, |
| "learning_rate": 1.5457385095939313e-05, |
| "loss": 0.7097, |
| "step": 5600 |
| }, |
| { |
| "epoch": 0.6936838262139466, |
| "grad_norm": 1027113.25, |
| "learning_rate": 1.537625248468622e-05, |
| "loss": 0.8198, |
| "step": 5700 |
| }, |
| { |
| "epoch": 0.7058537179019106, |
| "grad_norm": 698121.9375, |
| "learning_rate": 1.529511987343313e-05, |
| "loss": 0.783, |
| "step": 5800 |
| }, |
| { |
| "epoch": 0.7180236095898747, |
| "grad_norm": 835533.125, |
| "learning_rate": 1.5213987262180034e-05, |
| "loss": 0.7083, |
| "step": 5900 |
| }, |
| { |
| "epoch": 0.7301935012778387, |
| "grad_norm": 444278.4375, |
| "learning_rate": 1.513285465092694e-05, |
| "loss": 0.7802, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.7423633929658026, |
| "grad_norm": 673798.875, |
| "learning_rate": 1.5051722039673848e-05, |
| "loss": 0.797, |
| "step": 6100 |
| }, |
| { |
| "epoch": 0.7545332846537666, |
| "grad_norm": 886280.0, |
| "learning_rate": 1.4970589428420754e-05, |
| "loss": 0.7827, |
| "step": 6200 |
| }, |
| { |
| "epoch": 0.7667031763417306, |
| "grad_norm": 492442.40625, |
| "learning_rate": 1.4889456817167662e-05, |
| "loss": 0.7156, |
| "step": 6300 |
| }, |
| { |
| "epoch": 0.7788730680296946, |
| "grad_norm": 1017422.6875, |
| "learning_rate": 1.4808324205914568e-05, |
| "loss": 0.6945, |
| "step": 6400 |
| }, |
| { |
| "epoch": 0.7910429597176585, |
| "grad_norm": 737922.875, |
| "learning_rate": 1.4727191594661477e-05, |
| "loss": 0.7559, |
| "step": 6500 |
| }, |
| { |
| "epoch": 0.8032128514056225, |
| "grad_norm": 696342.0, |
| "learning_rate": 1.4646058983408383e-05, |
| "loss": 0.7982, |
| "step": 6600 |
| }, |
| { |
| "epoch": 0.8153827430935865, |
| "grad_norm": 676595.75, |
| "learning_rate": 1.4564926372155289e-05, |
| "loss": 0.7437, |
| "step": 6700 |
| }, |
| { |
| "epoch": 0.8275526347815504, |
| "grad_norm": 931158.625, |
| "learning_rate": 1.4483793760902196e-05, |
| "loss": 0.7817, |
| "step": 6800 |
| }, |
| { |
| "epoch": 0.8397225264695144, |
| "grad_norm": 510165.03125, |
| "learning_rate": 1.4402661149649102e-05, |
| "loss": 0.7618, |
| "step": 6900 |
| }, |
| { |
| "epoch": 0.8518924181574784, |
| "grad_norm": 660383.375, |
| "learning_rate": 1.432152853839601e-05, |
| "loss": 0.7658, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.8640623098454424, |
| "grad_norm": 528319.4375, |
| "learning_rate": 1.4240395927142916e-05, |
| "loss": 0.7113, |
| "step": 7100 |
| }, |
| { |
| "epoch": 0.8762322015334063, |
| "grad_norm": 1037643.125, |
| "learning_rate": 1.4159263315889823e-05, |
| "loss": 0.7364, |
| "step": 7200 |
| }, |
| { |
| "epoch": 0.8884020932213703, |
| "grad_norm": 249296.375, |
| "learning_rate": 1.407813070463673e-05, |
| "loss": 0.7371, |
| "step": 7300 |
| }, |
| { |
| "epoch": 0.9005719849093343, |
| "grad_norm": 259082.21875, |
| "learning_rate": 1.3996998093383635e-05, |
| "loss": 0.7758, |
| "step": 7400 |
| }, |
| { |
| "epoch": 0.9127418765972983, |
| "grad_norm": 603928.8125, |
| "learning_rate": 1.3915865482130545e-05, |
| "loss": 0.7161, |
| "step": 7500 |
| }, |
| { |
| "epoch": 0.9249117682852622, |
| "grad_norm": 722792.625, |
| "learning_rate": 1.3834732870877449e-05, |
| "loss": 0.7751, |
| "step": 7600 |
| }, |
| { |
| "epoch": 0.9370816599732262, |
| "grad_norm": 660232.3125, |
| "learning_rate": 1.3753600259624358e-05, |
| "loss": 0.7112, |
| "step": 7700 |
| }, |
| { |
| "epoch": 0.9492515516611902, |
| "grad_norm": 500458.6875, |
| "learning_rate": 1.3672467648371264e-05, |
| "loss": 0.768, |
| "step": 7800 |
| }, |
| { |
| "epoch": 0.9614214433491542, |
| "grad_norm": 604712.9375, |
| "learning_rate": 1.3591335037118172e-05, |
| "loss": 0.7293, |
| "step": 7900 |
| }, |
| { |
| "epoch": 0.9735913350371181, |
| "grad_norm": 373637.78125, |
| "learning_rate": 1.3510202425865078e-05, |
| "loss": 0.7558, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.9857612267250822, |
| "grad_norm": 681396.625, |
| "learning_rate": 1.3429069814611984e-05, |
| "loss": 0.746, |
| "step": 8100 |
| }, |
| { |
| "epoch": 0.9979311184130462, |
| "grad_norm": 448930.375, |
| "learning_rate": 1.3347937203358891e-05, |
| "loss": 0.6882, |
| "step": 8200 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_loss": 0.6668452620506287, |
| "eval_runtime": 263.7204, |
| "eval_samples_per_second": 45.886, |
| "eval_steps_per_second": 2.87, |
| "step": 8217 |
| }, |
| { |
| "epoch": 1.0101010101010102, |
| "grad_norm": 526495.75, |
| "learning_rate": 1.3266804592105797e-05, |
| "loss": 0.6339, |
| "step": 8300 |
| }, |
| { |
| "epoch": 1.0222709017889742, |
| "grad_norm": 562899.25, |
| "learning_rate": 1.3185671980852705e-05, |
| "loss": 0.5614, |
| "step": 8400 |
| }, |
| { |
| "epoch": 1.034440793476938, |
| "grad_norm": 724823.875, |
| "learning_rate": 1.3104539369599611e-05, |
| "loss": 0.6053, |
| "step": 8500 |
| }, |
| { |
| "epoch": 1.046610685164902, |
| "grad_norm": 585095.625, |
| "learning_rate": 1.3023406758346517e-05, |
| "loss": 0.6184, |
| "step": 8600 |
| }, |
| { |
| "epoch": 1.058780576852866, |
| "grad_norm": 1030241.4375, |
| "learning_rate": 1.2942274147093426e-05, |
| "loss": 0.5891, |
| "step": 8700 |
| }, |
| { |
| "epoch": 1.07095046854083, |
| "grad_norm": 380056.78125, |
| "learning_rate": 1.2861141535840332e-05, |
| "loss": 0.5728, |
| "step": 8800 |
| }, |
| { |
| "epoch": 1.083120360228794, |
| "grad_norm": 639405.0, |
| "learning_rate": 1.278000892458724e-05, |
| "loss": 0.5113, |
| "step": 8900 |
| }, |
| { |
| "epoch": 1.095290251916758, |
| "grad_norm": 532266.0625, |
| "learning_rate": 1.2698876313334146e-05, |
| "loss": 0.5679, |
| "step": 9000 |
| }, |
| { |
| "epoch": 1.107460143604722, |
| "grad_norm": 739227.4375, |
| "learning_rate": 1.2617743702081053e-05, |
| "loss": 0.5704, |
| "step": 9100 |
| }, |
| { |
| "epoch": 1.119630035292686, |
| "grad_norm": 610794.125, |
| "learning_rate": 1.253661109082796e-05, |
| "loss": 0.5849, |
| "step": 9200 |
| }, |
| { |
| "epoch": 1.1317999269806498, |
| "grad_norm": 1094627.0, |
| "learning_rate": 1.2455478479574865e-05, |
| "loss": 0.5622, |
| "step": 9300 |
| }, |
| { |
| "epoch": 1.1439698186686138, |
| "grad_norm": 484573.78125, |
| "learning_rate": 1.2374345868321773e-05, |
| "loss": 0.5225, |
| "step": 9400 |
| }, |
| { |
| "epoch": 1.1561397103565778, |
| "grad_norm": 616542.0, |
| "learning_rate": 1.2293213257068679e-05, |
| "loss": 0.6138, |
| "step": 9500 |
| }, |
| { |
| "epoch": 1.1683096020445418, |
| "grad_norm": 774001.75, |
| "learning_rate": 1.2212080645815586e-05, |
| "loss": 0.5751, |
| "step": 9600 |
| }, |
| { |
| "epoch": 1.1804794937325058, |
| "grad_norm": 426144.40625, |
| "learning_rate": 1.2130948034562492e-05, |
| "loss": 0.5364, |
| "step": 9700 |
| }, |
| { |
| "epoch": 1.1926493854204698, |
| "grad_norm": 837338.25, |
| "learning_rate": 1.2049815423309402e-05, |
| "loss": 0.5749, |
| "step": 9800 |
| }, |
| { |
| "epoch": 1.2048192771084336, |
| "grad_norm": 1166716.5, |
| "learning_rate": 1.1968682812056308e-05, |
| "loss": 0.5591, |
| "step": 9900 |
| }, |
| { |
| "epoch": 1.2169891687963976, |
| "grad_norm": 1038052.8125, |
| "learning_rate": 1.1887550200803213e-05, |
| "loss": 0.5873, |
| "step": 10000 |
| }, |
| { |
| "epoch": 1.2291590604843616, |
| "grad_norm": 720955.0625, |
| "learning_rate": 1.1806417589550121e-05, |
| "loss": 0.5493, |
| "step": 10100 |
| }, |
| { |
| "epoch": 1.2413289521723256, |
| "grad_norm": 1045855.625, |
| "learning_rate": 1.1725284978297027e-05, |
| "loss": 0.5848, |
| "step": 10200 |
| }, |
| { |
| "epoch": 1.2534988438602896, |
| "grad_norm": 464351.90625, |
| "learning_rate": 1.1644152367043935e-05, |
| "loss": 0.5139, |
| "step": 10300 |
| }, |
| { |
| "epoch": 1.2656687355482537, |
| "grad_norm": 761802.375, |
| "learning_rate": 1.156301975579084e-05, |
| "loss": 0.5724, |
| "step": 10400 |
| }, |
| { |
| "epoch": 1.2778386272362177, |
| "grad_norm": 348438.75, |
| "learning_rate": 1.1481887144537748e-05, |
| "loss": 0.5744, |
| "step": 10500 |
| }, |
| { |
| "epoch": 1.2900085189241817, |
| "grad_norm": 918051.5, |
| "learning_rate": 1.1400754533284654e-05, |
| "loss": 0.5343, |
| "step": 10600 |
| }, |
| { |
| "epoch": 1.3021784106121457, |
| "grad_norm": 787824.0625, |
| "learning_rate": 1.131962192203156e-05, |
| "loss": 0.5321, |
| "step": 10700 |
| }, |
| { |
| "epoch": 1.3143483023001095, |
| "grad_norm": 448052.0, |
| "learning_rate": 1.123848931077847e-05, |
| "loss": 0.5562, |
| "step": 10800 |
| }, |
| { |
| "epoch": 1.3265181939880735, |
| "grad_norm": 1413978.625, |
| "learning_rate": 1.1157356699525375e-05, |
| "loss": 0.5763, |
| "step": 10900 |
| }, |
| { |
| "epoch": 1.3386880856760375, |
| "grad_norm": 366529.03125, |
| "learning_rate": 1.1076224088272283e-05, |
| "loss": 0.6262, |
| "step": 11000 |
| }, |
| { |
| "epoch": 1.3508579773640015, |
| "grad_norm": 875949.5, |
| "learning_rate": 1.0995091477019189e-05, |
| "loss": 0.5719, |
| "step": 11100 |
| }, |
| { |
| "epoch": 1.3630278690519655, |
| "grad_norm": 646206.9375, |
| "learning_rate": 1.0913958865766097e-05, |
| "loss": 0.5938, |
| "step": 11200 |
| }, |
| { |
| "epoch": 1.3751977607399295, |
| "grad_norm": 1301331.5, |
| "learning_rate": 1.0832826254513003e-05, |
| "loss": 0.5525, |
| "step": 11300 |
| }, |
| { |
| "epoch": 1.3873676524278933, |
| "grad_norm": 575320.25, |
| "learning_rate": 1.0751693643259908e-05, |
| "loss": 0.5473, |
| "step": 11400 |
| }, |
| { |
| "epoch": 1.3995375441158573, |
| "grad_norm": 1271057.875, |
| "learning_rate": 1.0670561032006816e-05, |
| "loss": 0.5698, |
| "step": 11500 |
| }, |
| { |
| "epoch": 1.4117074358038213, |
| "grad_norm": 296549.71875, |
| "learning_rate": 1.0589428420753722e-05, |
| "loss": 0.5891, |
| "step": 11600 |
| }, |
| { |
| "epoch": 1.4238773274917853, |
| "grad_norm": 843789.0, |
| "learning_rate": 1.050829580950063e-05, |
| "loss": 0.5645, |
| "step": 11700 |
| }, |
| { |
| "epoch": 1.4360472191797493, |
| "grad_norm": 1364688.875, |
| "learning_rate": 1.0427163198247536e-05, |
| "loss": 0.5928, |
| "step": 11800 |
| }, |
| { |
| "epoch": 1.4482171108677133, |
| "grad_norm": 949730.625, |
| "learning_rate": 1.0346030586994442e-05, |
| "loss": 0.5808, |
| "step": 11900 |
| }, |
| { |
| "epoch": 1.4603870025556773, |
| "grad_norm": 1451533.5, |
| "learning_rate": 1.0264897975741351e-05, |
| "loss": 0.5724, |
| "step": 12000 |
| }, |
| { |
| "epoch": 1.4725568942436413, |
| "grad_norm": 765235.5, |
| "learning_rate": 1.0183765364488257e-05, |
| "loss": 0.5717, |
| "step": 12100 |
| }, |
| { |
| "epoch": 1.4847267859316053, |
| "grad_norm": 678600.6875, |
| "learning_rate": 1.0102632753235164e-05, |
| "loss": 0.5869, |
| "step": 12200 |
| }, |
| { |
| "epoch": 1.4968966776195691, |
| "grad_norm": 968140.4375, |
| "learning_rate": 1.002150014198207e-05, |
| "loss": 0.5524, |
| "step": 12300 |
| }, |
| { |
| "epoch": 1.5090665693075331, |
| "grad_norm": 997379.0625, |
| "learning_rate": 9.940367530728976e-06, |
| "loss": 0.5984, |
| "step": 12400 |
| }, |
| { |
| "epoch": 1.5212364609954971, |
| "grad_norm": 553474.9375, |
| "learning_rate": 9.859234919475884e-06, |
| "loss": 0.5907, |
| "step": 12500 |
| }, |
| { |
| "epoch": 1.5334063526834611, |
| "grad_norm": 764892.25, |
| "learning_rate": 9.778102308222792e-06, |
| "loss": 0.5059, |
| "step": 12600 |
| }, |
| { |
| "epoch": 1.545576244371425, |
| "grad_norm": 807238.5, |
| "learning_rate": 9.696969696969698e-06, |
| "loss": 0.5545, |
| "step": 12700 |
| }, |
| { |
| "epoch": 1.557746136059389, |
| "grad_norm": 732502.3125, |
| "learning_rate": 9.615837085716605e-06, |
| "loss": 0.5859, |
| "step": 12800 |
| }, |
| { |
| "epoch": 1.569916027747353, |
| "grad_norm": 346234.3125, |
| "learning_rate": 9.534704474463511e-06, |
| "loss": 0.5527, |
| "step": 12900 |
| }, |
| { |
| "epoch": 1.582085919435317, |
| "grad_norm": 959444.1875, |
| "learning_rate": 9.453571863210417e-06, |
| "loss": 0.5873, |
| "step": 13000 |
| }, |
| { |
| "epoch": 1.594255811123281, |
| "grad_norm": 506975.46875, |
| "learning_rate": 9.372439251957325e-06, |
| "loss": 0.5604, |
| "step": 13100 |
| }, |
| { |
| "epoch": 1.606425702811245, |
| "grad_norm": 452580.34375, |
| "learning_rate": 9.291306640704232e-06, |
| "loss": 0.5076, |
| "step": 13200 |
| }, |
| { |
| "epoch": 1.618595594499209, |
| "grad_norm": 889885.8125, |
| "learning_rate": 9.210174029451138e-06, |
| "loss": 0.5833, |
| "step": 13300 |
| }, |
| { |
| "epoch": 1.630765486187173, |
| "grad_norm": 423025.90625, |
| "learning_rate": 9.129041418198046e-06, |
| "loss": 0.6075, |
| "step": 13400 |
| }, |
| { |
| "epoch": 1.642935377875137, |
| "grad_norm": 948548.1875, |
| "learning_rate": 9.047908806944954e-06, |
| "loss": 0.5736, |
| "step": 13500 |
| }, |
| { |
| "epoch": 1.655105269563101, |
| "grad_norm": 917991.125, |
| "learning_rate": 8.96677619569186e-06, |
| "loss": 0.5497, |
| "step": 13600 |
| }, |
| { |
| "epoch": 1.667275161251065, |
| "grad_norm": 352658.0625, |
| "learning_rate": 8.885643584438765e-06, |
| "loss": 0.5475, |
| "step": 13700 |
| }, |
| { |
| "epoch": 1.6794450529390288, |
| "grad_norm": 807333.25, |
| "learning_rate": 8.804510973185673e-06, |
| "loss": 0.56, |
| "step": 13800 |
| }, |
| { |
| "epoch": 1.6916149446269928, |
| "grad_norm": 530145.875, |
| "learning_rate": 8.723378361932579e-06, |
| "loss": 0.5408, |
| "step": 13900 |
| }, |
| { |
| "epoch": 1.7037848363149568, |
| "grad_norm": 630776.3125, |
| "learning_rate": 8.642245750679487e-06, |
| "loss": 0.5707, |
| "step": 14000 |
| }, |
| { |
| "epoch": 1.7159547280029208, |
| "grad_norm": 859784.0625, |
| "learning_rate": 8.561113139426394e-06, |
| "loss": 0.5526, |
| "step": 14100 |
| }, |
| { |
| "epoch": 1.7281246196908846, |
| "grad_norm": 383244.90625, |
| "learning_rate": 8.4799805281733e-06, |
| "loss": 0.5415, |
| "step": 14200 |
| }, |
| { |
| "epoch": 1.7402945113788486, |
| "grad_norm": 183796.140625, |
| "learning_rate": 8.398847916920206e-06, |
| "loss": 0.5929, |
| "step": 14300 |
| }, |
| { |
| "epoch": 1.7524644030668126, |
| "grad_norm": 834435.0625, |
| "learning_rate": 8.317715305667114e-06, |
| "loss": 0.5811, |
| "step": 14400 |
| }, |
| { |
| "epoch": 1.7646342947547766, |
| "grad_norm": 1005049.8125, |
| "learning_rate": 8.23658269441402e-06, |
| "loss": 0.4738, |
| "step": 14500 |
| }, |
| { |
| "epoch": 1.7768041864427406, |
| "grad_norm": 1233122.5, |
| "learning_rate": 8.155450083160927e-06, |
| "loss": 0.5355, |
| "step": 14600 |
| }, |
| { |
| "epoch": 1.7889740781307046, |
| "grad_norm": 1253067.5, |
| "learning_rate": 8.074317471907835e-06, |
| "loss": 0.5409, |
| "step": 14700 |
| }, |
| { |
| "epoch": 1.8011439698186686, |
| "grad_norm": 286456.0, |
| "learning_rate": 7.993184860654741e-06, |
| "loss": 0.5088, |
| "step": 14800 |
| }, |
| { |
| "epoch": 1.8133138615066327, |
| "grad_norm": 776422.375, |
| "learning_rate": 7.912052249401647e-06, |
| "loss": 0.5306, |
| "step": 14900 |
| }, |
| { |
| "epoch": 1.8254837531945967, |
| "grad_norm": 764024.75, |
| "learning_rate": 7.830919638148554e-06, |
| "loss": 0.5295, |
| "step": 15000 |
| }, |
| { |
| "epoch": 1.8376536448825607, |
| "grad_norm": 1123857.125, |
| "learning_rate": 7.74978702689546e-06, |
| "loss": 0.5476, |
| "step": 15100 |
| }, |
| { |
| "epoch": 1.8498235365705247, |
| "grad_norm": 606708.5, |
| "learning_rate": 7.668654415642368e-06, |
| "loss": 0.5496, |
| "step": 15200 |
| }, |
| { |
| "epoch": 1.8619934282584885, |
| "grad_norm": 632422.75, |
| "learning_rate": 7.587521804389275e-06, |
| "loss": 0.5469, |
| "step": 15300 |
| }, |
| { |
| "epoch": 1.8741633199464525, |
| "grad_norm": 354093.15625, |
| "learning_rate": 7.5063891931361825e-06, |
| "loss": 0.5983, |
| "step": 15400 |
| }, |
| { |
| "epoch": 1.8863332116344165, |
| "grad_norm": 285148.65625, |
| "learning_rate": 7.425256581883089e-06, |
| "loss": 0.5253, |
| "step": 15500 |
| }, |
| { |
| "epoch": 1.8985031033223805, |
| "grad_norm": 1560757.125, |
| "learning_rate": 7.344123970629995e-06, |
| "loss": 0.5476, |
| "step": 15600 |
| }, |
| { |
| "epoch": 1.9106729950103443, |
| "grad_norm": 608123.4375, |
| "learning_rate": 7.262991359376902e-06, |
| "loss": 0.5116, |
| "step": 15700 |
| }, |
| { |
| "epoch": 1.9228428866983083, |
| "grad_norm": 1332108.375, |
| "learning_rate": 7.181858748123809e-06, |
| "loss": 0.516, |
| "step": 15800 |
| }, |
| { |
| "epoch": 1.9350127783862723, |
| "grad_norm": 376606.625, |
| "learning_rate": 7.1007261368707156e-06, |
| "loss": 0.5317, |
| "step": 15900 |
| }, |
| { |
| "epoch": 1.9471826700742363, |
| "grad_norm": 913279.125, |
| "learning_rate": 7.019593525617623e-06, |
| "loss": 0.5573, |
| "step": 16000 |
| }, |
| { |
| "epoch": 1.9593525617622003, |
| "grad_norm": 290156.71875, |
| "learning_rate": 6.93846091436453e-06, |
| "loss": 0.5275, |
| "step": 16100 |
| }, |
| { |
| "epoch": 1.9715224534501643, |
| "grad_norm": 760140.5625, |
| "learning_rate": 6.857328303111436e-06, |
| "loss": 0.5982, |
| "step": 16200 |
| }, |
| { |
| "epoch": 1.9836923451381283, |
| "grad_norm": 312559.34375, |
| "learning_rate": 6.776195691858343e-06, |
| "loss": 0.5991, |
| "step": 16300 |
| }, |
| { |
| "epoch": 1.9958622368260923, |
| "grad_norm": 860316.0, |
| "learning_rate": 6.6950630806052495e-06, |
| "loss": 0.5509, |
| "step": 16400 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_loss": 0.7069754004478455, |
| "eval_runtime": 263.7582, |
| "eval_samples_per_second": 45.879, |
| "eval_steps_per_second": 2.87, |
| "step": 16434 |
| }, |
| { |
| "epoch": 2.0080321285140563, |
| "grad_norm": 1111842.375, |
| "learning_rate": 6.613930469352156e-06, |
| "loss": 0.4853, |
| "step": 16500 |
| }, |
| { |
| "epoch": 2.0202020202020203, |
| "grad_norm": 734144.25, |
| "learning_rate": 6.532797858099064e-06, |
| "loss": 0.3957, |
| "step": 16600 |
| }, |
| { |
| "epoch": 2.0323719118899843, |
| "grad_norm": 915932.9375, |
| "learning_rate": 6.451665246845971e-06, |
| "loss": 0.4254, |
| "step": 16700 |
| }, |
| { |
| "epoch": 2.0445418035779483, |
| "grad_norm": 282760.53125, |
| "learning_rate": 6.3705326355928775e-06, |
| "loss": 0.4188, |
| "step": 16800 |
| }, |
| { |
| "epoch": 2.0567116952659124, |
| "grad_norm": 248830.09375, |
| "learning_rate": 6.289400024339783e-06, |
| "loss": 0.444, |
| "step": 16900 |
| }, |
| { |
| "epoch": 2.068881586953876, |
| "grad_norm": 1877179.75, |
| "learning_rate": 6.20826741308669e-06, |
| "loss": 0.4139, |
| "step": 17000 |
| }, |
| { |
| "epoch": 2.08105147864184, |
| "grad_norm": 1332360.75, |
| "learning_rate": 6.127134801833598e-06, |
| "loss": 0.4682, |
| "step": 17100 |
| }, |
| { |
| "epoch": 2.093221370329804, |
| "grad_norm": 1522515.625, |
| "learning_rate": 6.046002190580505e-06, |
| "loss": 0.4316, |
| "step": 17200 |
| }, |
| { |
| "epoch": 2.105391262017768, |
| "grad_norm": 904717.9375, |
| "learning_rate": 5.964869579327411e-06, |
| "loss": 0.424, |
| "step": 17300 |
| }, |
| { |
| "epoch": 2.117561153705732, |
| "grad_norm": 585235.875, |
| "learning_rate": 5.883736968074318e-06, |
| "loss": 0.4258, |
| "step": 17400 |
| }, |
| { |
| "epoch": 2.129731045393696, |
| "grad_norm": 938551.75, |
| "learning_rate": 5.802604356821224e-06, |
| "loss": 0.388, |
| "step": 17500 |
| }, |
| { |
| "epoch": 2.14190093708166, |
| "grad_norm": 274852.875, |
| "learning_rate": 5.721471745568131e-06, |
| "loss": 0.4287, |
| "step": 17600 |
| }, |
| { |
| "epoch": 2.154070828769624, |
| "grad_norm": 832885.125, |
| "learning_rate": 5.6403391343150385e-06, |
| "loss": 0.4446, |
| "step": 17700 |
| }, |
| { |
| "epoch": 2.166240720457588, |
| "grad_norm": 729798.5625, |
| "learning_rate": 5.559206523061945e-06, |
| "loss": 0.3869, |
| "step": 17800 |
| }, |
| { |
| "epoch": 2.178410612145552, |
| "grad_norm": 690389.625, |
| "learning_rate": 5.478073911808852e-06, |
| "loss": 0.3944, |
| "step": 17900 |
| }, |
| { |
| "epoch": 2.190580503833516, |
| "grad_norm": 337837.53125, |
| "learning_rate": 5.396941300555759e-06, |
| "loss": 0.398, |
| "step": 18000 |
| }, |
| { |
| "epoch": 2.20275039552148, |
| "grad_norm": 1882286.5, |
| "learning_rate": 5.3158086893026665e-06, |
| "loss": 0.3846, |
| "step": 18100 |
| }, |
| { |
| "epoch": 2.214920287209444, |
| "grad_norm": 662988.8125, |
| "learning_rate": 5.234676078049572e-06, |
| "loss": 0.4342, |
| "step": 18200 |
| }, |
| { |
| "epoch": 2.227090178897408, |
| "grad_norm": 207260.828125, |
| "learning_rate": 5.153543466796479e-06, |
| "loss": 0.4045, |
| "step": 18300 |
| }, |
| { |
| "epoch": 2.239260070585372, |
| "grad_norm": 1592776.0, |
| "learning_rate": 5.072410855543386e-06, |
| "loss": 0.4304, |
| "step": 18400 |
| }, |
| { |
| "epoch": 2.2514299622733356, |
| "grad_norm": 273817.09375, |
| "learning_rate": 4.991278244290293e-06, |
| "loss": 0.4213, |
| "step": 18500 |
| }, |
| { |
| "epoch": 2.2635998539612996, |
| "grad_norm": 735677.5, |
| "learning_rate": 4.9101456330372e-06, |
| "loss": 0.4479, |
| "step": 18600 |
| }, |
| { |
| "epoch": 2.2757697456492636, |
| "grad_norm": 848157.9375, |
| "learning_rate": 4.829013021784106e-06, |
| "loss": 0.4594, |
| "step": 18700 |
| }, |
| { |
| "epoch": 2.2879396373372276, |
| "grad_norm": 587701.5625, |
| "learning_rate": 4.747880410531013e-06, |
| "loss": 0.4424, |
| "step": 18800 |
| }, |
| { |
| "epoch": 2.3001095290251916, |
| "grad_norm": 1382945.0, |
| "learning_rate": 4.66674779927792e-06, |
| "loss": 0.4185, |
| "step": 18900 |
| }, |
| { |
| "epoch": 2.3122794207131556, |
| "grad_norm": 712288.0625, |
| "learning_rate": 4.585615188024827e-06, |
| "loss": 0.4261, |
| "step": 19000 |
| }, |
| { |
| "epoch": 2.3244493124011196, |
| "grad_norm": 1519380.375, |
| "learning_rate": 4.5044825767717336e-06, |
| "loss": 0.4388, |
| "step": 19100 |
| }, |
| { |
| "epoch": 2.3366192040890836, |
| "grad_norm": 1865403.5, |
| "learning_rate": 4.42334996551864e-06, |
| "loss": 0.4399, |
| "step": 19200 |
| }, |
| { |
| "epoch": 2.3487890957770476, |
| "grad_norm": 1357723.375, |
| "learning_rate": 4.342217354265547e-06, |
| "loss": 0.4483, |
| "step": 19300 |
| }, |
| { |
| "epoch": 2.3609589874650116, |
| "grad_norm": 249520.984375, |
| "learning_rate": 4.261084743012454e-06, |
| "loss": 0.4371, |
| "step": 19400 |
| }, |
| { |
| "epoch": 2.3731288791529757, |
| "grad_norm": 1260475.25, |
| "learning_rate": 4.179952131759361e-06, |
| "loss": 0.4268, |
| "step": 19500 |
| }, |
| { |
| "epoch": 2.3852987708409397, |
| "grad_norm": 1328097.5, |
| "learning_rate": 4.098819520506268e-06, |
| "loss": 0.4109, |
| "step": 19600 |
| }, |
| { |
| "epoch": 2.3974686625289037, |
| "grad_norm": 538742.125, |
| "learning_rate": 4.017686909253174e-06, |
| "loss": 0.4352, |
| "step": 19700 |
| }, |
| { |
| "epoch": 2.4096385542168672, |
| "grad_norm": 950668.5, |
| "learning_rate": 3.936554298000082e-06, |
| "loss": 0.4105, |
| "step": 19800 |
| }, |
| { |
| "epoch": 2.4218084459048317, |
| "grad_norm": 881245.6875, |
| "learning_rate": 3.855421686746989e-06, |
| "loss": 0.4359, |
| "step": 19900 |
| }, |
| { |
| "epoch": 2.4339783375927952, |
| "grad_norm": 354214.0, |
| "learning_rate": 3.774289075493895e-06, |
| "loss": 0.4111, |
| "step": 20000 |
| }, |
| { |
| "epoch": 2.4461482292807593, |
| "grad_norm": 1019421.125, |
| "learning_rate": 3.693156464240802e-06, |
| "loss": 0.4191, |
| "step": 20100 |
| }, |
| { |
| "epoch": 2.4583181209687233, |
| "grad_norm": 600526.75, |
| "learning_rate": 3.612023852987709e-06, |
| "loss": 0.4142, |
| "step": 20200 |
| }, |
| { |
| "epoch": 2.4704880126566873, |
| "grad_norm": 633427.0625, |
| "learning_rate": 3.5308912417346154e-06, |
| "loss": 0.4216, |
| "step": 20300 |
| }, |
| { |
| "epoch": 2.4826579043446513, |
| "grad_norm": 451850.40625, |
| "learning_rate": 3.449758630481522e-06, |
| "loss": 0.4172, |
| "step": 20400 |
| }, |
| { |
| "epoch": 2.4948277960326153, |
| "grad_norm": 1016081.8125, |
| "learning_rate": 3.3686260192284294e-06, |
| "loss": 0.4243, |
| "step": 20500 |
| }, |
| { |
| "epoch": 2.5069976877205793, |
| "grad_norm": 1442025.375, |
| "learning_rate": 3.287493407975336e-06, |
| "loss": 0.3936, |
| "step": 20600 |
| }, |
| { |
| "epoch": 2.5191675794085433, |
| "grad_norm": 1038331.9375, |
| "learning_rate": 3.2063607967222425e-06, |
| "loss": 0.453, |
| "step": 20700 |
| }, |
| { |
| "epoch": 2.5313374710965073, |
| "grad_norm": 1017244.0, |
| "learning_rate": 3.1252281854691498e-06, |
| "loss": 0.4306, |
| "step": 20800 |
| }, |
| { |
| "epoch": 2.5435073627844713, |
| "grad_norm": 3449184.25, |
| "learning_rate": 3.0440955742160565e-06, |
| "loss": 0.4218, |
| "step": 20900 |
| }, |
| { |
| "epoch": 2.5556772544724353, |
| "grad_norm": 1605587.875, |
| "learning_rate": 2.962962962962963e-06, |
| "loss": 0.4492, |
| "step": 21000 |
| }, |
| { |
| "epoch": 2.5678471461603993, |
| "grad_norm": 386683.15625, |
| "learning_rate": 2.88183035170987e-06, |
| "loss": 0.3956, |
| "step": 21100 |
| }, |
| { |
| "epoch": 2.5800170378483633, |
| "grad_norm": 997831.125, |
| "learning_rate": 2.800697740456777e-06, |
| "loss": 0.4212, |
| "step": 21200 |
| }, |
| { |
| "epoch": 2.592186929536327, |
| "grad_norm": 746647.625, |
| "learning_rate": 2.7195651292036833e-06, |
| "loss": 0.4191, |
| "step": 21300 |
| }, |
| { |
| "epoch": 2.6043568212242914, |
| "grad_norm": 1180154.625, |
| "learning_rate": 2.6384325179505905e-06, |
| "loss": 0.4352, |
| "step": 21400 |
| }, |
| { |
| "epoch": 2.616526712912255, |
| "grad_norm": 262904.28125, |
| "learning_rate": 2.5572999066974973e-06, |
| "loss": 0.4068, |
| "step": 21500 |
| }, |
| { |
| "epoch": 2.628696604600219, |
| "grad_norm": 846819.8125, |
| "learning_rate": 2.476167295444404e-06, |
| "loss": 0.4141, |
| "step": 21600 |
| }, |
| { |
| "epoch": 2.640866496288183, |
| "grad_norm": 891045.8125, |
| "learning_rate": 2.395034684191311e-06, |
| "loss": 0.4611, |
| "step": 21700 |
| }, |
| { |
| "epoch": 2.653036387976147, |
| "grad_norm": 345922.375, |
| "learning_rate": 2.3139020729382176e-06, |
| "loss": 0.4244, |
| "step": 21800 |
| }, |
| { |
| "epoch": 2.665206279664111, |
| "grad_norm": 1789013.0, |
| "learning_rate": 2.2327694616851244e-06, |
| "loss": 0.4097, |
| "step": 21900 |
| }, |
| { |
| "epoch": 2.677376171352075, |
| "grad_norm": 295374.0625, |
| "learning_rate": 2.1516368504320316e-06, |
| "loss": 0.386, |
| "step": 22000 |
| }, |
| { |
| "epoch": 2.689546063040039, |
| "grad_norm": 625661.4375, |
| "learning_rate": 2.070504239178938e-06, |
| "loss": 0.4457, |
| "step": 22100 |
| }, |
| { |
| "epoch": 2.701715954728003, |
| "grad_norm": 1791765.5, |
| "learning_rate": 1.9893716279258448e-06, |
| "loss": 0.4024, |
| "step": 22200 |
| }, |
| { |
| "epoch": 2.713885846415967, |
| "grad_norm": 307518.59375, |
| "learning_rate": 1.908239016672752e-06, |
| "loss": 0.3637, |
| "step": 22300 |
| }, |
| { |
| "epoch": 2.726055738103931, |
| "grad_norm": 1208860.25, |
| "learning_rate": 1.8271064054196585e-06, |
| "loss": 0.3669, |
| "step": 22400 |
| }, |
| { |
| "epoch": 2.738225629791895, |
| "grad_norm": 728501.125, |
| "learning_rate": 1.7459737941665653e-06, |
| "loss": 0.4139, |
| "step": 22500 |
| }, |
| { |
| "epoch": 2.750395521479859, |
| "grad_norm": 1411384.125, |
| "learning_rate": 1.6648411829134723e-06, |
| "loss": 0.4193, |
| "step": 22600 |
| }, |
| { |
| "epoch": 2.762565413167823, |
| "grad_norm": 672326.3125, |
| "learning_rate": 1.583708571660379e-06, |
| "loss": 0.4063, |
| "step": 22700 |
| }, |
| { |
| "epoch": 2.7747353048557866, |
| "grad_norm": 634662.1875, |
| "learning_rate": 1.502575960407286e-06, |
| "loss": 0.4427, |
| "step": 22800 |
| }, |
| { |
| "epoch": 2.786905196543751, |
| "grad_norm": 1894593.625, |
| "learning_rate": 1.4214433491541927e-06, |
| "loss": 0.427, |
| "step": 22900 |
| }, |
| { |
| "epoch": 2.7990750882317146, |
| "grad_norm": 1145494.5, |
| "learning_rate": 1.3403107379010993e-06, |
| "loss": 0.4209, |
| "step": 23000 |
| }, |
| { |
| "epoch": 2.8112449799196786, |
| "grad_norm": 661343.0625, |
| "learning_rate": 1.2591781266480063e-06, |
| "loss": 0.4445, |
| "step": 23100 |
| }, |
| { |
| "epoch": 2.8234148716076426, |
| "grad_norm": 380578.34375, |
| "learning_rate": 1.178045515394913e-06, |
| "loss": 0.4197, |
| "step": 23200 |
| }, |
| { |
| "epoch": 2.8355847632956066, |
| "grad_norm": 1426323.625, |
| "learning_rate": 1.0969129041418198e-06, |
| "loss": 0.4222, |
| "step": 23300 |
| }, |
| { |
| "epoch": 2.8477546549835706, |
| "grad_norm": 931245.875, |
| "learning_rate": 1.0157802928887266e-06, |
| "loss": 0.3771, |
| "step": 23400 |
| }, |
| { |
| "epoch": 2.8599245466715346, |
| "grad_norm": 725616.1875, |
| "learning_rate": 9.346476816356335e-07, |
| "loss": 0.4225, |
| "step": 23500 |
| }, |
| { |
| "epoch": 2.8720944383594986, |
| "grad_norm": 2420769.25, |
| "learning_rate": 8.535150703825404e-07, |
| "loss": 0.4444, |
| "step": 23600 |
| }, |
| { |
| "epoch": 2.8842643300474626, |
| "grad_norm": 716980.3125, |
| "learning_rate": 7.723824591294471e-07, |
| "loss": 0.3999, |
| "step": 23700 |
| }, |
| { |
| "epoch": 2.8964342217354266, |
| "grad_norm": 770046.9375, |
| "learning_rate": 6.91249847876354e-07, |
| "loss": 0.4085, |
| "step": 23800 |
| }, |
| { |
| "epoch": 2.9086041134233906, |
| "grad_norm": 723776.625, |
| "learning_rate": 6.101172366232608e-07, |
| "loss": 0.4564, |
| "step": 23900 |
| }, |
| { |
| "epoch": 2.9207740051113547, |
| "grad_norm": 1597413.125, |
| "learning_rate": 5.289846253701675e-07, |
| "loss": 0.3898, |
| "step": 24000 |
| }, |
| { |
| "epoch": 2.932943896799318, |
| "grad_norm": 1187440.875, |
| "learning_rate": 4.478520141170744e-07, |
| "loss": 0.4256, |
| "step": 24100 |
| }, |
| { |
| "epoch": 2.9451137884872827, |
| "grad_norm": 780778.4375, |
| "learning_rate": 3.667194028639812e-07, |
| "loss": 0.392, |
| "step": 24200 |
| }, |
| { |
| "epoch": 2.9572836801752462, |
| "grad_norm": 1588163.625, |
| "learning_rate": 2.85586791610888e-07, |
| "loss": 0.4521, |
| "step": 24300 |
| }, |
| { |
| "epoch": 2.9694535718632107, |
| "grad_norm": 766974.1875, |
| "learning_rate": 2.0445418035779482e-07, |
| "loss": 0.4102, |
| "step": 24400 |
| }, |
| { |
| "epoch": 2.9816234635511742, |
| "grad_norm": 393891.59375, |
| "learning_rate": 1.2332156910470163e-07, |
| "loss": 0.4191, |
| "step": 24500 |
| }, |
| { |
| "epoch": 2.9937933552391383, |
| "grad_norm": 637999.0, |
| "learning_rate": 4.218895785160846e-08, |
| "loss": 0.389, |
| "step": 24600 |
| }, |
| { |
| "epoch": 3.0, |
| "eval_loss": 0.7935315370559692, |
| "eval_runtime": 263.7684, |
| "eval_samples_per_second": 45.877, |
| "eval_steps_per_second": 2.87, |
| "step": 24651 |
| }, |
| { |
| "epoch": 3.0, |
| "step": 24651, |
| "total_flos": 7.72931723781289e+16, |
| "train_loss": 0.6282787045998813, |
| "train_runtime": 25250.2136, |
| "train_samples_per_second": 15.62, |
| "train_steps_per_second": 0.976 |
| } |
| ], |
| "logging_steps": 100, |
| "max_steps": 24651, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 7.72931723781289e+16, |
| "train_batch_size": 16, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|