| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0, | |
| "eval_steps": 500, | |
| "global_step": 5393, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "entropy": 1.612574777007103, | |
| "epoch": 0.0018544704327870373, | |
| "grad_norm": 0.7109375, | |
| "learning_rate": 4.991655850176154e-05, | |
| "loss": 1.0597, | |
| "mean_token_accuracy": 0.7317381598055362, | |
| "num_tokens": 1026412.0, | |
| "step": 10 | |
| }, | |
| { | |
| "entropy": 1.5644195660948754, | |
| "epoch": 0.0037089408655740746, | |
| "grad_norm": 0.61328125, | |
| "learning_rate": 4.9823845725941034e-05, | |
| "loss": 1.089, | |
| "mean_token_accuracy": 0.7293614901602268, | |
| "num_tokens": 2080952.0, | |
| "step": 20 | |
| }, | |
| { | |
| "entropy": 1.5576038718223573, | |
| "epoch": 0.005563411298361112, | |
| "grad_norm": 0.625, | |
| "learning_rate": 4.9731132950120526e-05, | |
| "loss": 1.0469, | |
| "mean_token_accuracy": 0.7326322324573994, | |
| "num_tokens": 3137628.0, | |
| "step": 30 | |
| }, | |
| { | |
| "entropy": 1.5587382912635803, | |
| "epoch": 0.007417881731148149, | |
| "grad_norm": 0.625, | |
| "learning_rate": 4.963842017430002e-05, | |
| "loss": 1.0605, | |
| "mean_token_accuracy": 0.7342366375029087, | |
| "num_tokens": 4193558.0, | |
| "step": 40 | |
| }, | |
| { | |
| "entropy": 1.5575660154223443, | |
| "epoch": 0.009272352163935186, | |
| "grad_norm": 0.71484375, | |
| "learning_rate": 4.9545707398479516e-05, | |
| "loss": 1.0476, | |
| "mean_token_accuracy": 0.7343964815139771, | |
| "num_tokens": 5267795.0, | |
| "step": 50 | |
| }, | |
| { | |
| "entropy": 1.546275109052658, | |
| "epoch": 0.011126822596722223, | |
| "grad_norm": 0.65234375, | |
| "learning_rate": 4.945299462265901e-05, | |
| "loss": 1.0752, | |
| "mean_token_accuracy": 0.728316018730402, | |
| "num_tokens": 6297705.0, | |
| "step": 60 | |
| }, | |
| { | |
| "entropy": 1.5558656230568886, | |
| "epoch": 0.012981293029509261, | |
| "grad_norm": 0.62109375, | |
| "learning_rate": 4.93602818468385e-05, | |
| "loss": 1.0122, | |
| "mean_token_accuracy": 0.7419391557574272, | |
| "num_tokens": 7355308.0, | |
| "step": 70 | |
| }, | |
| { | |
| "entropy": 1.5574305072426795, | |
| "epoch": 0.014835763462296299, | |
| "grad_norm": 0.62890625, | |
| "learning_rate": 4.926756907101799e-05, | |
| "loss": 1.0532, | |
| "mean_token_accuracy": 0.7330756388604641, | |
| "num_tokens": 8391755.0, | |
| "step": 80 | |
| }, | |
| { | |
| "entropy": 1.5481105983257293, | |
| "epoch": 0.016690233895083336, | |
| "grad_norm": 0.6484375, | |
| "learning_rate": 4.917485629519748e-05, | |
| "loss": 1.0572, | |
| "mean_token_accuracy": 0.7318947434425354, | |
| "num_tokens": 9441626.0, | |
| "step": 90 | |
| }, | |
| { | |
| "entropy": 1.551408438384533, | |
| "epoch": 0.018544704327870372, | |
| "grad_norm": 0.58984375, | |
| "learning_rate": 4.9082143519376974e-05, | |
| "loss": 1.0383, | |
| "mean_token_accuracy": 0.7377031579613685, | |
| "num_tokens": 10497952.0, | |
| "step": 100 | |
| }, | |
| { | |
| "entropy": 1.5357420697808266, | |
| "epoch": 0.02039917476065741, | |
| "grad_norm": 0.61328125, | |
| "learning_rate": 4.8989430743556466e-05, | |
| "loss": 1.0164, | |
| "mean_token_accuracy": 0.7395447820425034, | |
| "num_tokens": 11554625.0, | |
| "step": 110 | |
| }, | |
| { | |
| "entropy": 1.556403924524784, | |
| "epoch": 0.022253645193444447, | |
| "grad_norm": 0.62109375, | |
| "learning_rate": 4.889671796773596e-05, | |
| "loss": 1.0536, | |
| "mean_token_accuracy": 0.7335591673851013, | |
| "num_tokens": 12606977.0, | |
| "step": 120 | |
| }, | |
| { | |
| "entropy": 1.5795823469758035, | |
| "epoch": 0.024108115626231483, | |
| "grad_norm": 0.65234375, | |
| "learning_rate": 4.880400519191545e-05, | |
| "loss": 1.0717, | |
| "mean_token_accuracy": 0.7308689467608929, | |
| "num_tokens": 13659604.0, | |
| "step": 130 | |
| }, | |
| { | |
| "entropy": 1.5415235817432404, | |
| "epoch": 0.025962586059018522, | |
| "grad_norm": 0.59765625, | |
| "learning_rate": 4.871129241609494e-05, | |
| "loss": 1.0354, | |
| "mean_token_accuracy": 0.7350596480071545, | |
| "num_tokens": 14718857.0, | |
| "step": 140 | |
| }, | |
| { | |
| "entropy": 1.5156891584396361, | |
| "epoch": 0.027817056491805558, | |
| "grad_norm": 0.6328125, | |
| "learning_rate": 4.861857964027443e-05, | |
| "loss": 1.0342, | |
| "mean_token_accuracy": 0.736380535364151, | |
| "num_tokens": 15791072.0, | |
| "step": 150 | |
| }, | |
| { | |
| "entropy": 1.550842559337616, | |
| "epoch": 0.029671526924592597, | |
| "grad_norm": 0.703125, | |
| "learning_rate": 4.8525866864453924e-05, | |
| "loss": 1.0327, | |
| "mean_token_accuracy": 0.7376597911119461, | |
| "num_tokens": 16841237.0, | |
| "step": 160 | |
| }, | |
| { | |
| "entropy": 1.5402591615915298, | |
| "epoch": 0.03152599735737963, | |
| "grad_norm": 0.6328125, | |
| "learning_rate": 4.8433154088633416e-05, | |
| "loss": 1.0554, | |
| "mean_token_accuracy": 0.7302822440862655, | |
| "num_tokens": 17877179.0, | |
| "step": 170 | |
| }, | |
| { | |
| "entropy": 1.5491653725504875, | |
| "epoch": 0.03338046779016667, | |
| "grad_norm": 0.64453125, | |
| "learning_rate": 4.834044131281291e-05, | |
| "loss": 1.0231, | |
| "mean_token_accuracy": 0.7375590361654758, | |
| "num_tokens": 18961179.0, | |
| "step": 180 | |
| }, | |
| { | |
| "entropy": 1.5091556265950203, | |
| "epoch": 0.035234938222953704, | |
| "grad_norm": 0.60546875, | |
| "learning_rate": 4.82477285369924e-05, | |
| "loss": 1.0179, | |
| "mean_token_accuracy": 0.7398052349686622, | |
| "num_tokens": 20014811.0, | |
| "step": 190 | |
| }, | |
| { | |
| "entropy": 1.5289593398571015, | |
| "epoch": 0.037089408655740744, | |
| "grad_norm": 1.1328125, | |
| "learning_rate": 4.815501576117189e-05, | |
| "loss": 1.0503, | |
| "mean_token_accuracy": 0.7322728201746941, | |
| "num_tokens": 21089433.0, | |
| "step": 200 | |
| }, | |
| { | |
| "entropy": 1.5411681681871414, | |
| "epoch": 0.03894387908852778, | |
| "grad_norm": 0.63671875, | |
| "learning_rate": 4.806230298535138e-05, | |
| "loss": 1.037, | |
| "mean_token_accuracy": 0.7373237326741219, | |
| "num_tokens": 22168111.0, | |
| "step": 210 | |
| }, | |
| { | |
| "entropy": 1.516893781721592, | |
| "epoch": 0.04079834952131482, | |
| "grad_norm": 0.60546875, | |
| "learning_rate": 4.7969590209530874e-05, | |
| "loss": 1.0389, | |
| "mean_token_accuracy": 0.7351879917085171, | |
| "num_tokens": 23236197.0, | |
| "step": 220 | |
| }, | |
| { | |
| "entropy": 1.524234440922737, | |
| "epoch": 0.042652819954101855, | |
| "grad_norm": 0.6015625, | |
| "learning_rate": 4.7876877433710365e-05, | |
| "loss": 1.0539, | |
| "mean_token_accuracy": 0.734955656528473, | |
| "num_tokens": 24267939.0, | |
| "step": 230 | |
| }, | |
| { | |
| "entropy": 1.5440999507904052, | |
| "epoch": 0.044507290386888894, | |
| "grad_norm": 0.62109375, | |
| "learning_rate": 4.778416465788986e-05, | |
| "loss": 1.0575, | |
| "mean_token_accuracy": 0.7326981708407402, | |
| "num_tokens": 25312892.0, | |
| "step": 240 | |
| }, | |
| { | |
| "entropy": 1.4938133835792542, | |
| "epoch": 0.04636176081967593, | |
| "grad_norm": 0.63671875, | |
| "learning_rate": 4.769145188206935e-05, | |
| "loss": 0.9767, | |
| "mean_token_accuracy": 0.7476728469133377, | |
| "num_tokens": 26411392.0, | |
| "step": 250 | |
| }, | |
| { | |
| "entropy": 1.516909132897854, | |
| "epoch": 0.048216231252462965, | |
| "grad_norm": 0.6328125, | |
| "learning_rate": 4.759873910624884e-05, | |
| "loss": 1.0163, | |
| "mean_token_accuracy": 0.7412026561796665, | |
| "num_tokens": 27469188.0, | |
| "step": 260 | |
| }, | |
| { | |
| "entropy": 1.5302777588367462, | |
| "epoch": 0.050070701685250005, | |
| "grad_norm": 0.6171875, | |
| "learning_rate": 4.750602633042833e-05, | |
| "loss": 1.031, | |
| "mean_token_accuracy": 0.7391552790999413, | |
| "num_tokens": 28560611.0, | |
| "step": 270 | |
| }, | |
| { | |
| "entropy": 1.4983129650354385, | |
| "epoch": 0.051925172118037044, | |
| "grad_norm": 0.625, | |
| "learning_rate": 4.741331355460783e-05, | |
| "loss": 1.0019, | |
| "mean_token_accuracy": 0.7446308046579361, | |
| "num_tokens": 29604794.0, | |
| "step": 280 | |
| }, | |
| { | |
| "entropy": 1.5453171014785767, | |
| "epoch": 0.05377964255082408, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 4.732060077878732e-05, | |
| "loss": 1.0335, | |
| "mean_token_accuracy": 0.7375391863286496, | |
| "num_tokens": 30661934.0, | |
| "step": 290 | |
| }, | |
| { | |
| "entropy": 1.5289832100272178, | |
| "epoch": 0.055634112983611116, | |
| "grad_norm": 0.60546875, | |
| "learning_rate": 4.7227888002966814e-05, | |
| "loss": 1.0276, | |
| "mean_token_accuracy": 0.7369511865079403, | |
| "num_tokens": 31728346.0, | |
| "step": 300 | |
| }, | |
| { | |
| "entropy": 1.4998823776841164, | |
| "epoch": 0.057488583416398155, | |
| "grad_norm": 0.5859375, | |
| "learning_rate": 4.7135175227146305e-05, | |
| "loss": 1.0343, | |
| "mean_token_accuracy": 0.7386834770441055, | |
| "num_tokens": 32777762.0, | |
| "step": 310 | |
| }, | |
| { | |
| "entropy": 1.5244294270873069, | |
| "epoch": 0.059343053849185194, | |
| "grad_norm": 0.59375, | |
| "learning_rate": 4.70424624513258e-05, | |
| "loss": 1.0417, | |
| "mean_token_accuracy": 0.7354931272566319, | |
| "num_tokens": 33829813.0, | |
| "step": 320 | |
| }, | |
| { | |
| "entropy": 1.5431315779685975, | |
| "epoch": 0.061197524281972226, | |
| "grad_norm": 0.6328125, | |
| "learning_rate": 4.694974967550529e-05, | |
| "loss": 1.0416, | |
| "mean_token_accuracy": 0.7328706897795201, | |
| "num_tokens": 34884334.0, | |
| "step": 330 | |
| }, | |
| { | |
| "entropy": 1.5122175738215446, | |
| "epoch": 0.06305199471475927, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 4.685703689968478e-05, | |
| "loss": 1.0273, | |
| "mean_token_accuracy": 0.7385680362582207, | |
| "num_tokens": 35958882.0, | |
| "step": 340 | |
| }, | |
| { | |
| "entropy": 1.5036789387464524, | |
| "epoch": 0.0649064651475463, | |
| "grad_norm": 0.66796875, | |
| "learning_rate": 4.676432412386427e-05, | |
| "loss": 0.9769, | |
| "mean_token_accuracy": 0.7481208391487598, | |
| "num_tokens": 36993229.0, | |
| "step": 350 | |
| }, | |
| { | |
| "entropy": 1.5510423615574838, | |
| "epoch": 0.06676093558033334, | |
| "grad_norm": 0.625, | |
| "learning_rate": 4.6671611348043763e-05, | |
| "loss": 1.0223, | |
| "mean_token_accuracy": 0.7393691584467887, | |
| "num_tokens": 38042508.0, | |
| "step": 360 | |
| }, | |
| { | |
| "entropy": 1.515822634100914, | |
| "epoch": 0.06861540601312038, | |
| "grad_norm": 0.609375, | |
| "learning_rate": 4.6578898572223255e-05, | |
| "loss": 0.998, | |
| "mean_token_accuracy": 0.7431743949651718, | |
| "num_tokens": 39118086.0, | |
| "step": 370 | |
| }, | |
| { | |
| "entropy": 1.503666016459465, | |
| "epoch": 0.07046987644590741, | |
| "grad_norm": 0.62890625, | |
| "learning_rate": 4.648618579640275e-05, | |
| "loss": 1.0047, | |
| "mean_token_accuracy": 0.7415208213031292, | |
| "num_tokens": 40187466.0, | |
| "step": 380 | |
| }, | |
| { | |
| "entropy": 1.5302211076021195, | |
| "epoch": 0.07232434687869445, | |
| "grad_norm": 0.625, | |
| "learning_rate": 4.639347302058224e-05, | |
| "loss": 1.015, | |
| "mean_token_accuracy": 0.7381173759698868, | |
| "num_tokens": 41224132.0, | |
| "step": 390 | |
| }, | |
| { | |
| "entropy": 1.5342731535434724, | |
| "epoch": 0.07417881731148149, | |
| "grad_norm": 0.6015625, | |
| "learning_rate": 4.630076024476173e-05, | |
| "loss": 1.0367, | |
| "mean_token_accuracy": 0.7346886761486531, | |
| "num_tokens": 42290229.0, | |
| "step": 400 | |
| }, | |
| { | |
| "entropy": 1.5021552205085755, | |
| "epoch": 0.07603328774426853, | |
| "grad_norm": 0.57421875, | |
| "learning_rate": 4.620804746894122e-05, | |
| "loss": 0.9984, | |
| "mean_token_accuracy": 0.74442353323102, | |
| "num_tokens": 43332195.0, | |
| "step": 410 | |
| }, | |
| { | |
| "entropy": 1.5242351979017257, | |
| "epoch": 0.07788775817705557, | |
| "grad_norm": 0.625, | |
| "learning_rate": 4.611533469312071e-05, | |
| "loss": 1.0272, | |
| "mean_token_accuracy": 0.7385884746909142, | |
| "num_tokens": 44378105.0, | |
| "step": 420 | |
| }, | |
| { | |
| "entropy": 1.5297550037503242, | |
| "epoch": 0.0797422286098426, | |
| "grad_norm": 0.640625, | |
| "learning_rate": 4.6022621917300205e-05, | |
| "loss": 1.0215, | |
| "mean_token_accuracy": 0.7384527139365673, | |
| "num_tokens": 45419106.0, | |
| "step": 430 | |
| }, | |
| { | |
| "entropy": 1.5240361213684082, | |
| "epoch": 0.08159669904262964, | |
| "grad_norm": 0.6640625, | |
| "learning_rate": 4.5929909141479696e-05, | |
| "loss": 1.0387, | |
| "mean_token_accuracy": 0.7316593080759048, | |
| "num_tokens": 46480238.0, | |
| "step": 440 | |
| }, | |
| { | |
| "entropy": 1.4925807863473892, | |
| "epoch": 0.08345116947541667, | |
| "grad_norm": 0.60546875, | |
| "learning_rate": 4.583719636565919e-05, | |
| "loss": 0.9994, | |
| "mean_token_accuracy": 0.7405316449701786, | |
| "num_tokens": 47544033.0, | |
| "step": 450 | |
| }, | |
| { | |
| "entropy": 1.4952182561159133, | |
| "epoch": 0.08530563990820371, | |
| "grad_norm": 0.609375, | |
| "learning_rate": 4.574448358983868e-05, | |
| "loss": 0.9968, | |
| "mean_token_accuracy": 0.7426001988351345, | |
| "num_tokens": 48603539.0, | |
| "step": 460 | |
| }, | |
| { | |
| "entropy": 1.5336338073015212, | |
| "epoch": 0.08716011034099075, | |
| "grad_norm": 0.578125, | |
| "learning_rate": 4.565177081401817e-05, | |
| "loss": 1.0077, | |
| "mean_token_accuracy": 0.7403936669230461, | |
| "num_tokens": 49684457.0, | |
| "step": 470 | |
| }, | |
| { | |
| "entropy": 1.5218110263347626, | |
| "epoch": 0.08901458077377779, | |
| "grad_norm": 0.59375, | |
| "learning_rate": 4.555905803819766e-05, | |
| "loss": 1.0594, | |
| "mean_token_accuracy": 0.7292280294001102, | |
| "num_tokens": 50764443.0, | |
| "step": 480 | |
| }, | |
| { | |
| "entropy": 1.4603750854730606, | |
| "epoch": 0.09086905120656483, | |
| "grad_norm": 0.5390625, | |
| "learning_rate": 4.5466345262377155e-05, | |
| "loss": 0.9411, | |
| "mean_token_accuracy": 0.7545233644545078, | |
| "num_tokens": 51861103.0, | |
| "step": 490 | |
| }, | |
| { | |
| "entropy": 1.5292219877243043, | |
| "epoch": 0.09272352163935187, | |
| "grad_norm": 0.5703125, | |
| "learning_rate": 4.537363248655665e-05, | |
| "loss": 1.0463, | |
| "mean_token_accuracy": 0.7310812592506408, | |
| "num_tokens": 52939631.0, | |
| "step": 500 | |
| }, | |
| { | |
| "entropy": 1.4591013997793199, | |
| "epoch": 0.0945779920721389, | |
| "grad_norm": 0.5859375, | |
| "learning_rate": 4.5280919710736145e-05, | |
| "loss": 0.966, | |
| "mean_token_accuracy": 0.7504445768892765, | |
| "num_tokens": 54014127.0, | |
| "step": 510 | |
| }, | |
| { | |
| "entropy": 1.5337822422385217, | |
| "epoch": 0.09643246250492593, | |
| "grad_norm": 0.578125, | |
| "learning_rate": 4.5188206934915636e-05, | |
| "loss": 0.9849, | |
| "mean_token_accuracy": 0.7473617434501648, | |
| "num_tokens": 55098552.0, | |
| "step": 520 | |
| }, | |
| { | |
| "entropy": 1.4903638631105423, | |
| "epoch": 0.09828693293771297, | |
| "grad_norm": 0.55078125, | |
| "learning_rate": 4.509549415909513e-05, | |
| "loss": 0.9802, | |
| "mean_token_accuracy": 0.7482960596680641, | |
| "num_tokens": 56178827.0, | |
| "step": 530 | |
| }, | |
| { | |
| "entropy": 1.5502985909581184, | |
| "epoch": 0.10014140337050001, | |
| "grad_norm": 0.61328125, | |
| "learning_rate": 4.500278138327462e-05, | |
| "loss": 1.0414, | |
| "mean_token_accuracy": 0.7344899229705334, | |
| "num_tokens": 57236189.0, | |
| "step": 540 | |
| }, | |
| { | |
| "entropy": 1.523072436451912, | |
| "epoch": 0.10199587380328705, | |
| "grad_norm": 0.6484375, | |
| "learning_rate": 4.491006860745411e-05, | |
| "loss": 1.0081, | |
| "mean_token_accuracy": 0.741492023319006, | |
| "num_tokens": 58313037.0, | |
| "step": 550 | |
| }, | |
| { | |
| "entropy": 1.5185051456093788, | |
| "epoch": 0.10385034423607409, | |
| "grad_norm": 0.61328125, | |
| "learning_rate": 4.48173558316336e-05, | |
| "loss": 1.0159, | |
| "mean_token_accuracy": 0.7368122689425946, | |
| "num_tokens": 59390151.0, | |
| "step": 560 | |
| }, | |
| { | |
| "entropy": 1.4947978630661964, | |
| "epoch": 0.10570481466886113, | |
| "grad_norm": 0.65234375, | |
| "learning_rate": 4.4724643055813094e-05, | |
| "loss": 0.9832, | |
| "mean_token_accuracy": 0.7445899307727813, | |
| "num_tokens": 60443977.0, | |
| "step": 570 | |
| }, | |
| { | |
| "entropy": 1.4771181985735893, | |
| "epoch": 0.10755928510164817, | |
| "grad_norm": 0.62890625, | |
| "learning_rate": 4.4631930279992586e-05, | |
| "loss": 0.9639, | |
| "mean_token_accuracy": 0.7501099176704884, | |
| "num_tokens": 61518541.0, | |
| "step": 580 | |
| }, | |
| { | |
| "entropy": 1.5381808206439018, | |
| "epoch": 0.10941375553443519, | |
| "grad_norm": 0.625, | |
| "learning_rate": 4.453921750417208e-05, | |
| "loss": 0.9962, | |
| "mean_token_accuracy": 0.7437006868422031, | |
| "num_tokens": 62553343.0, | |
| "step": 590 | |
| }, | |
| { | |
| "entropy": 1.5086082085967063, | |
| "epoch": 0.11126822596722223, | |
| "grad_norm": 0.609375, | |
| "learning_rate": 4.444650472835157e-05, | |
| "loss": 1.0159, | |
| "mean_token_accuracy": 0.7395556323230267, | |
| "num_tokens": 63584621.0, | |
| "step": 600 | |
| }, | |
| { | |
| "entropy": 1.528603471815586, | |
| "epoch": 0.11312269640000927, | |
| "grad_norm": 0.63671875, | |
| "learning_rate": 4.435379195253106e-05, | |
| "loss": 1.0195, | |
| "mean_token_accuracy": 0.7399595864117146, | |
| "num_tokens": 64622816.0, | |
| "step": 610 | |
| }, | |
| { | |
| "entropy": 1.5517917990684509, | |
| "epoch": 0.11497716683279631, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 4.426107917671055e-05, | |
| "loss": 1.0368, | |
| "mean_token_accuracy": 0.7350802809000015, | |
| "num_tokens": 65689744.0, | |
| "step": 620 | |
| }, | |
| { | |
| "entropy": 1.5145041599869729, | |
| "epoch": 0.11683163726558335, | |
| "grad_norm": 0.61328125, | |
| "learning_rate": 4.4168366400890044e-05, | |
| "loss": 1.0162, | |
| "mean_token_accuracy": 0.7422668196260929, | |
| "num_tokens": 66751805.0, | |
| "step": 630 | |
| }, | |
| { | |
| "entropy": 1.5180778324604034, | |
| "epoch": 0.11868610769837039, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 4.4075653625069536e-05, | |
| "loss": 1.0128, | |
| "mean_token_accuracy": 0.7411701023578644, | |
| "num_tokens": 67801386.0, | |
| "step": 640 | |
| }, | |
| { | |
| "entropy": 1.466907811164856, | |
| "epoch": 0.12054057813115743, | |
| "grad_norm": 0.5703125, | |
| "learning_rate": 4.398294084924903e-05, | |
| "loss": 0.9628, | |
| "mean_token_accuracy": 0.748704032599926, | |
| "num_tokens": 68851715.0, | |
| "step": 650 | |
| }, | |
| { | |
| "entropy": 1.4689898312091827, | |
| "epoch": 0.12239504856394445, | |
| "grad_norm": 0.58984375, | |
| "learning_rate": 4.389022807342852e-05, | |
| "loss": 0.943, | |
| "mean_token_accuracy": 0.7534093335270882, | |
| "num_tokens": 69902577.0, | |
| "step": 660 | |
| }, | |
| { | |
| "entropy": 1.5318989798426628, | |
| "epoch": 0.12424951899673149, | |
| "grad_norm": 0.6015625, | |
| "learning_rate": 4.379751529760801e-05, | |
| "loss": 1.0131, | |
| "mean_token_accuracy": 0.7399744331836701, | |
| "num_tokens": 70978987.0, | |
| "step": 670 | |
| }, | |
| { | |
| "entropy": 1.531233723461628, | |
| "epoch": 0.12610398942951853, | |
| "grad_norm": 0.578125, | |
| "learning_rate": 4.37048025217875e-05, | |
| "loss": 1.0024, | |
| "mean_token_accuracy": 0.7424432031810284, | |
| "num_tokens": 72041161.0, | |
| "step": 680 | |
| }, | |
| { | |
| "entropy": 1.4848274305462836, | |
| "epoch": 0.12795845986230556, | |
| "grad_norm": 1.2578125, | |
| "learning_rate": 4.3612089745966994e-05, | |
| "loss": 0.964, | |
| "mean_token_accuracy": 0.7487917475402355, | |
| "num_tokens": 73077596.0, | |
| "step": 690 | |
| }, | |
| { | |
| "entropy": 1.4835757672786714, | |
| "epoch": 0.1298129302950926, | |
| "grad_norm": 0.625, | |
| "learning_rate": 4.3519376970146486e-05, | |
| "loss": 0.9687, | |
| "mean_token_accuracy": 0.7482482351362705, | |
| "num_tokens": 74115021.0, | |
| "step": 700 | |
| }, | |
| { | |
| "entropy": 1.5183353215456008, | |
| "epoch": 0.13166740072787964, | |
| "grad_norm": 0.58984375, | |
| "learning_rate": 4.342666419432598e-05, | |
| "loss": 1.0006, | |
| "mean_token_accuracy": 0.7437005870044231, | |
| "num_tokens": 75187471.0, | |
| "step": 710 | |
| }, | |
| { | |
| "entropy": 1.501795943081379, | |
| "epoch": 0.1335218711606667, | |
| "grad_norm": 0.56640625, | |
| "learning_rate": 4.333395141850547e-05, | |
| "loss": 0.9968, | |
| "mean_token_accuracy": 0.7440856315195561, | |
| "num_tokens": 76245992.0, | |
| "step": 720 | |
| }, | |
| { | |
| "entropy": 1.5099968641996384, | |
| "epoch": 0.13537634159345371, | |
| "grad_norm": 0.6171875, | |
| "learning_rate": 4.324123864268496e-05, | |
| "loss": 1.0042, | |
| "mean_token_accuracy": 0.7434575214982033, | |
| "num_tokens": 77301129.0, | |
| "step": 730 | |
| }, | |
| { | |
| "entropy": 1.5205446511507035, | |
| "epoch": 0.13723081202624077, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 4.314852586686446e-05, | |
| "loss": 1.0117, | |
| "mean_token_accuracy": 0.7393212102353572, | |
| "num_tokens": 78339584.0, | |
| "step": 740 | |
| }, | |
| { | |
| "entropy": 1.5229708418250083, | |
| "epoch": 0.1390852824590278, | |
| "grad_norm": 0.59375, | |
| "learning_rate": 4.305581309104395e-05, | |
| "loss": 1.0003, | |
| "mean_token_accuracy": 0.7429314963519573, | |
| "num_tokens": 79407714.0, | |
| "step": 750 | |
| }, | |
| { | |
| "entropy": 1.4949795320630073, | |
| "epoch": 0.14093975289181482, | |
| "grad_norm": 0.61328125, | |
| "learning_rate": 4.296310031522344e-05, | |
| "loss": 0.9999, | |
| "mean_token_accuracy": 0.7443600594997406, | |
| "num_tokens": 80492225.0, | |
| "step": 760 | |
| }, | |
| { | |
| "entropy": 1.520840474963188, | |
| "epoch": 0.14279422332460187, | |
| "grad_norm": 0.60546875, | |
| "learning_rate": 4.2870387539402934e-05, | |
| "loss": 1.0014, | |
| "mean_token_accuracy": 0.744787298887968, | |
| "num_tokens": 81549110.0, | |
| "step": 770 | |
| }, | |
| { | |
| "entropy": 1.486540971696377, | |
| "epoch": 0.1446486937573889, | |
| "grad_norm": 0.6015625, | |
| "learning_rate": 4.2777674763582425e-05, | |
| "loss": 0.9894, | |
| "mean_token_accuracy": 0.7451410122215748, | |
| "num_tokens": 82614221.0, | |
| "step": 780 | |
| }, | |
| { | |
| "entropy": 1.5056209504604339, | |
| "epoch": 0.14650316419017595, | |
| "grad_norm": 0.62109375, | |
| "learning_rate": 4.268496198776192e-05, | |
| "loss": 0.9869, | |
| "mean_token_accuracy": 0.7462142176926136, | |
| "num_tokens": 83655235.0, | |
| "step": 790 | |
| }, | |
| { | |
| "entropy": 1.5422687619924544, | |
| "epoch": 0.14835763462296297, | |
| "grad_norm": 0.578125, | |
| "learning_rate": 4.259224921194141e-05, | |
| "loss": 1.0109, | |
| "mean_token_accuracy": 0.740646617859602, | |
| "num_tokens": 84714057.0, | |
| "step": 800 | |
| }, | |
| { | |
| "entropy": 1.520611123740673, | |
| "epoch": 0.15021210505575003, | |
| "grad_norm": 0.59375, | |
| "learning_rate": 4.24995364361209e-05, | |
| "loss": 1.0428, | |
| "mean_token_accuracy": 0.7361539803445339, | |
| "num_tokens": 85772585.0, | |
| "step": 810 | |
| }, | |
| { | |
| "entropy": 1.4799606420099736, | |
| "epoch": 0.15206657548853705, | |
| "grad_norm": 0.578125, | |
| "learning_rate": 4.240682366030039e-05, | |
| "loss": 0.9771, | |
| "mean_token_accuracy": 0.7459641695022583, | |
| "num_tokens": 86849690.0, | |
| "step": 820 | |
| }, | |
| { | |
| "entropy": 1.4758174255490304, | |
| "epoch": 0.15392104592132408, | |
| "grad_norm": 0.56640625, | |
| "learning_rate": 4.2314110884479884e-05, | |
| "loss": 0.9595, | |
| "mean_token_accuracy": 0.7533868886530399, | |
| "num_tokens": 87915136.0, | |
| "step": 830 | |
| }, | |
| { | |
| "entropy": 1.5151072204113007, | |
| "epoch": 0.15577551635411113, | |
| "grad_norm": 0.62109375, | |
| "learning_rate": 4.2221398108659375e-05, | |
| "loss": 0.9949, | |
| "mean_token_accuracy": 0.7444874249398709, | |
| "num_tokens": 88990018.0, | |
| "step": 840 | |
| }, | |
| { | |
| "entropy": 1.5166364774107932, | |
| "epoch": 0.15762998678689816, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 4.212868533283887e-05, | |
| "loss": 1.0098, | |
| "mean_token_accuracy": 0.7433667846024037, | |
| "num_tokens": 90114083.0, | |
| "step": 850 | |
| }, | |
| { | |
| "entropy": 1.5378105223178864, | |
| "epoch": 0.1594844572196852, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 4.203597255701836e-05, | |
| "loss": 0.9871, | |
| "mean_token_accuracy": 0.7433856271207333, | |
| "num_tokens": 91178482.0, | |
| "step": 860 | |
| }, | |
| { | |
| "entropy": 1.498812098801136, | |
| "epoch": 0.16133892765247224, | |
| "grad_norm": 0.5703125, | |
| "learning_rate": 4.194325978119785e-05, | |
| "loss": 0.9887, | |
| "mean_token_accuracy": 0.746368583291769, | |
| "num_tokens": 92239235.0, | |
| "step": 870 | |
| }, | |
| { | |
| "entropy": 1.5183847174048424, | |
| "epoch": 0.1631933980852593, | |
| "grad_norm": 0.58984375, | |
| "learning_rate": 4.185054700537734e-05, | |
| "loss": 1.0047, | |
| "mean_token_accuracy": 0.7415020234882832, | |
| "num_tokens": 93284913.0, | |
| "step": 880 | |
| }, | |
| { | |
| "entropy": 1.4750665351748466, | |
| "epoch": 0.16504786851804631, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 4.175783422955683e-05, | |
| "loss": 0.9878, | |
| "mean_token_accuracy": 0.7462855473160743, | |
| "num_tokens": 94333429.0, | |
| "step": 890 | |
| }, | |
| { | |
| "entropy": 1.5011137649416924, | |
| "epoch": 0.16690233895083334, | |
| "grad_norm": 0.5859375, | |
| "learning_rate": 4.1665121453736325e-05, | |
| "loss": 1.0089, | |
| "mean_token_accuracy": 0.7396205604076386, | |
| "num_tokens": 95381587.0, | |
| "step": 900 | |
| }, | |
| { | |
| "entropy": 1.4897055223584175, | |
| "epoch": 0.1687568093836204, | |
| "grad_norm": 0.55078125, | |
| "learning_rate": 4.1572408677915817e-05, | |
| "loss": 1.006, | |
| "mean_token_accuracy": 0.7384402513504028, | |
| "num_tokens": 96462074.0, | |
| "step": 910 | |
| }, | |
| { | |
| "entropy": 1.4897184401750565, | |
| "epoch": 0.17061127981640742, | |
| "grad_norm": 0.61328125, | |
| "learning_rate": 4.147969590209531e-05, | |
| "loss": 0.9703, | |
| "mean_token_accuracy": 0.7500183641910553, | |
| "num_tokens": 97511351.0, | |
| "step": 920 | |
| }, | |
| { | |
| "entropy": 1.4779999554157257, | |
| "epoch": 0.17246575024919447, | |
| "grad_norm": 0.5859375, | |
| "learning_rate": 4.13869831262748e-05, | |
| "loss": 0.9877, | |
| "mean_token_accuracy": 0.7460473470389843, | |
| "num_tokens": 98560916.0, | |
| "step": 930 | |
| }, | |
| { | |
| "entropy": 1.4812358170747757, | |
| "epoch": 0.1743202206819815, | |
| "grad_norm": 0.5859375, | |
| "learning_rate": 4.12942703504543e-05, | |
| "loss": 0.9649, | |
| "mean_token_accuracy": 0.7487124249339103, | |
| "num_tokens": 99613348.0, | |
| "step": 940 | |
| }, | |
| { | |
| "entropy": 1.5195035934448242, | |
| "epoch": 0.17617469111476855, | |
| "grad_norm": 0.57421875, | |
| "learning_rate": 4.120155757463379e-05, | |
| "loss": 0.9794, | |
| "mean_token_accuracy": 0.7441080920398235, | |
| "num_tokens": 100662327.0, | |
| "step": 950 | |
| }, | |
| { | |
| "entropy": 1.4909209072589875, | |
| "epoch": 0.17802916154755558, | |
| "grad_norm": 0.6171875, | |
| "learning_rate": 4.110884479881328e-05, | |
| "loss": 0.9726, | |
| "mean_token_accuracy": 0.7496254183351994, | |
| "num_tokens": 101747208.0, | |
| "step": 960 | |
| }, | |
| { | |
| "entropy": 1.4799237817525863, | |
| "epoch": 0.1798836319803426, | |
| "grad_norm": 0.5859375, | |
| "learning_rate": 4.101613202299277e-05, | |
| "loss": 0.962, | |
| "mean_token_accuracy": 0.7510715290904045, | |
| "num_tokens": 102813412.0, | |
| "step": 970 | |
| }, | |
| { | |
| "entropy": 1.5013772562146186, | |
| "epoch": 0.18173810241312965, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 4.0923419247172265e-05, | |
| "loss": 0.9857, | |
| "mean_token_accuracy": 0.7471497923135757, | |
| "num_tokens": 103867765.0, | |
| "step": 980 | |
| }, | |
| { | |
| "entropy": 1.474899211525917, | |
| "epoch": 0.18359257284591668, | |
| "grad_norm": 0.609375, | |
| "learning_rate": 4.0830706471351756e-05, | |
| "loss": 0.9807, | |
| "mean_token_accuracy": 0.7472601749002934, | |
| "num_tokens": 104960748.0, | |
| "step": 990 | |
| }, | |
| { | |
| "entropy": 1.4903257325291634, | |
| "epoch": 0.18544704327870373, | |
| "grad_norm": 0.6015625, | |
| "learning_rate": 4.073799369553125e-05, | |
| "loss": 0.9782, | |
| "mean_token_accuracy": 0.7442355081439018, | |
| "num_tokens": 106029310.0, | |
| "step": 1000 | |
| }, | |
| { | |
| "entropy": 1.5116073459386825, | |
| "epoch": 0.18730151371149076, | |
| "grad_norm": 0.62890625, | |
| "learning_rate": 4.064528091971074e-05, | |
| "loss": 1.0134, | |
| "mean_token_accuracy": 0.7415754482150078, | |
| "num_tokens": 107088537.0, | |
| "step": 1010 | |
| }, | |
| { | |
| "entropy": 1.4644516192376613, | |
| "epoch": 0.1891559841442778, | |
| "grad_norm": 0.63671875, | |
| "learning_rate": 4.055256814389023e-05, | |
| "loss": 0.9485, | |
| "mean_token_accuracy": 0.7534352712333202, | |
| "num_tokens": 108146436.0, | |
| "step": 1020 | |
| }, | |
| { | |
| "entropy": 1.532536183297634, | |
| "epoch": 0.19101045457706484, | |
| "grad_norm": 0.6015625, | |
| "learning_rate": 4.045985536806972e-05, | |
| "loss": 1.0094, | |
| "mean_token_accuracy": 0.7411122426390648, | |
| "num_tokens": 109199689.0, | |
| "step": 1030 | |
| }, | |
| { | |
| "entropy": 1.514697441458702, | |
| "epoch": 0.19286492500985186, | |
| "grad_norm": 0.58984375, | |
| "learning_rate": 4.0367142592249215e-05, | |
| "loss": 1.0167, | |
| "mean_token_accuracy": 0.7425492964684963, | |
| "num_tokens": 110261314.0, | |
| "step": 1040 | |
| }, | |
| { | |
| "entropy": 1.497881819307804, | |
| "epoch": 0.19471939544263892, | |
| "grad_norm": 0.63671875, | |
| "learning_rate": 4.0274429816428706e-05, | |
| "loss": 0.9608, | |
| "mean_token_accuracy": 0.7492932274937629, | |
| "num_tokens": 111348480.0, | |
| "step": 1050 | |
| }, | |
| { | |
| "entropy": 1.5163912072777748, | |
| "epoch": 0.19657386587542594, | |
| "grad_norm": 0.56640625, | |
| "learning_rate": 4.01817170406082e-05, | |
| "loss": 0.9872, | |
| "mean_token_accuracy": 0.743515857309103, | |
| "num_tokens": 112407457.0, | |
| "step": 1060 | |
| }, | |
| { | |
| "entropy": 1.512566539645195, | |
| "epoch": 0.198428336308213, | |
| "grad_norm": 0.58984375, | |
| "learning_rate": 4.008900426478769e-05, | |
| "loss": 0.9805, | |
| "mean_token_accuracy": 0.7439120054244995, | |
| "num_tokens": 113482290.0, | |
| "step": 1070 | |
| }, | |
| { | |
| "entropy": 1.4773399017751216, | |
| "epoch": 0.20028280674100002, | |
| "grad_norm": 0.62109375, | |
| "learning_rate": 3.999629148896718e-05, | |
| "loss": 0.9407, | |
| "mean_token_accuracy": 0.7565898634493351, | |
| "num_tokens": 114561740.0, | |
| "step": 1080 | |
| }, | |
| { | |
| "entropy": 1.4678212240338326, | |
| "epoch": 0.20213727717378707, | |
| "grad_norm": 0.59765625, | |
| "learning_rate": 3.990357871314667e-05, | |
| "loss": 0.9462, | |
| "mean_token_accuracy": 0.7542161449790001, | |
| "num_tokens": 115614610.0, | |
| "step": 1090 | |
| }, | |
| { | |
| "entropy": 1.513384173810482, | |
| "epoch": 0.2039917476065741, | |
| "grad_norm": 0.63671875, | |
| "learning_rate": 3.9810865937326164e-05, | |
| "loss": 0.9928, | |
| "mean_token_accuracy": 0.7445978745818138, | |
| "num_tokens": 116681453.0, | |
| "step": 1100 | |
| }, | |
| { | |
| "entropy": 1.49861471503973, | |
| "epoch": 0.20584621803936112, | |
| "grad_norm": 0.57421875, | |
| "learning_rate": 3.9718153161505656e-05, | |
| "loss": 1.0014, | |
| "mean_token_accuracy": 0.7394719265401364, | |
| "num_tokens": 117716368.0, | |
| "step": 1110 | |
| }, | |
| { | |
| "entropy": 1.477181686460972, | |
| "epoch": 0.20770068847214818, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 3.962544038568515e-05, | |
| "loss": 0.9584, | |
| "mean_token_accuracy": 0.7510823294520378, | |
| "num_tokens": 118776307.0, | |
| "step": 1120 | |
| }, | |
| { | |
| "entropy": 1.5219816774129868, | |
| "epoch": 0.2095551589049352, | |
| "grad_norm": 0.6015625, | |
| "learning_rate": 3.953272760986464e-05, | |
| "loss": 0.9801, | |
| "mean_token_accuracy": 0.7465045012533664, | |
| "num_tokens": 119839344.0, | |
| "step": 1130 | |
| }, | |
| { | |
| "entropy": 1.5113320291042327, | |
| "epoch": 0.21140962933772225, | |
| "grad_norm": 0.60546875, | |
| "learning_rate": 3.944001483404413e-05, | |
| "loss": 0.9885, | |
| "mean_token_accuracy": 0.7440638624131679, | |
| "num_tokens": 120895258.0, | |
| "step": 1140 | |
| }, | |
| { | |
| "entropy": 1.5221231281757355, | |
| "epoch": 0.21326409977050928, | |
| "grad_norm": 0.546875, | |
| "learning_rate": 3.934730205822362e-05, | |
| "loss": 0.9994, | |
| "mean_token_accuracy": 0.743220105022192, | |
| "num_tokens": 121958257.0, | |
| "step": 1150 | |
| }, | |
| { | |
| "entropy": 1.509648172557354, | |
| "epoch": 0.21511857020329633, | |
| "grad_norm": 0.6015625, | |
| "learning_rate": 3.9254589282403114e-05, | |
| "loss": 0.9724, | |
| "mean_token_accuracy": 0.749625925719738, | |
| "num_tokens": 123007006.0, | |
| "step": 1160 | |
| }, | |
| { | |
| "entropy": 1.4795884430408477, | |
| "epoch": 0.21697304063608336, | |
| "grad_norm": 0.55859375, | |
| "learning_rate": 3.9161876506582606e-05, | |
| "loss": 0.9797, | |
| "mean_token_accuracy": 0.746769518405199, | |
| "num_tokens": 124081635.0, | |
| "step": 1170 | |
| }, | |
| { | |
| "entropy": 1.5002055689692497, | |
| "epoch": 0.21882751106887038, | |
| "grad_norm": 0.59765625, | |
| "learning_rate": 3.90691637307621e-05, | |
| "loss": 1.0061, | |
| "mean_token_accuracy": 0.7404768541455269, | |
| "num_tokens": 125122475.0, | |
| "step": 1180 | |
| }, | |
| { | |
| "entropy": 1.5008674770593644, | |
| "epoch": 0.22068198150165744, | |
| "grad_norm": 0.57421875, | |
| "learning_rate": 3.897645095494159e-05, | |
| "loss": 0.9994, | |
| "mean_token_accuracy": 0.7393870085477829, | |
| "num_tokens": 126193751.0, | |
| "step": 1190 | |
| }, | |
| { | |
| "entropy": 1.482962355017662, | |
| "epoch": 0.22253645193444446, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 3.888373817912109e-05, | |
| "loss": 0.9687, | |
| "mean_token_accuracy": 0.752807629853487, | |
| "num_tokens": 127268622.0, | |
| "step": 1200 | |
| }, | |
| { | |
| "entropy": 1.4758433610200883, | |
| "epoch": 0.22439092236723152, | |
| "grad_norm": 0.578125, | |
| "learning_rate": 3.879102540330058e-05, | |
| "loss": 0.9752, | |
| "mean_token_accuracy": 0.7505301244556903, | |
| "num_tokens": 128326581.0, | |
| "step": 1210 | |
| }, | |
| { | |
| "entropy": 1.500441959500313, | |
| "epoch": 0.22624539280001854, | |
| "grad_norm": 0.5703125, | |
| "learning_rate": 3.869831262748007e-05, | |
| "loss": 1.017, | |
| "mean_token_accuracy": 0.7375108540058136, | |
| "num_tokens": 129376850.0, | |
| "step": 1220 | |
| }, | |
| { | |
| "entropy": 1.4893997624516486, | |
| "epoch": 0.2280998632328056, | |
| "grad_norm": 0.58984375, | |
| "learning_rate": 3.860559985165956e-05, | |
| "loss": 1.0042, | |
| "mean_token_accuracy": 0.741396465152502, | |
| "num_tokens": 130412564.0, | |
| "step": 1230 | |
| }, | |
| { | |
| "entropy": 1.505383348464966, | |
| "epoch": 0.22995433366559262, | |
| "grad_norm": 0.61328125, | |
| "learning_rate": 3.8512887075839054e-05, | |
| "loss": 0.9524, | |
| "mean_token_accuracy": 0.7526863746345043, | |
| "num_tokens": 131457427.0, | |
| "step": 1240 | |
| }, | |
| { | |
| "entropy": 1.5027053102850914, | |
| "epoch": 0.23180880409837964, | |
| "grad_norm": 0.6328125, | |
| "learning_rate": 3.8420174300018546e-05, | |
| "loss": 0.9584, | |
| "mean_token_accuracy": 0.7506951965391636, | |
| "num_tokens": 132510274.0, | |
| "step": 1250 | |
| }, | |
| { | |
| "entropy": 1.53069761544466, | |
| "epoch": 0.2336632745311667, | |
| "grad_norm": 0.61328125, | |
| "learning_rate": 3.832746152419804e-05, | |
| "loss": 0.9875, | |
| "mean_token_accuracy": 0.7463310994207859, | |
| "num_tokens": 133593109.0, | |
| "step": 1260 | |
| }, | |
| { | |
| "entropy": 1.5053236052393912, | |
| "epoch": 0.23551774496395372, | |
| "grad_norm": 0.578125, | |
| "learning_rate": 3.823474874837753e-05, | |
| "loss": 0.9813, | |
| "mean_token_accuracy": 0.7454123921692372, | |
| "num_tokens": 134658291.0, | |
| "step": 1270 | |
| }, | |
| { | |
| "entropy": 1.4890306636691093, | |
| "epoch": 0.23737221539674078, | |
| "grad_norm": 0.625, | |
| "learning_rate": 3.814203597255702e-05, | |
| "loss": 0.987, | |
| "mean_token_accuracy": 0.7441953986883163, | |
| "num_tokens": 135686637.0, | |
| "step": 1280 | |
| }, | |
| { | |
| "entropy": 1.4929763719439506, | |
| "epoch": 0.2392266858295278, | |
| "grad_norm": 0.609375, | |
| "learning_rate": 3.804932319673651e-05, | |
| "loss": 0.9688, | |
| "mean_token_accuracy": 0.7510817535221577, | |
| "num_tokens": 136739191.0, | |
| "step": 1290 | |
| }, | |
| { | |
| "entropy": 1.5134969413280488, | |
| "epoch": 0.24108115626231486, | |
| "grad_norm": 0.6171875, | |
| "learning_rate": 3.7956610420916004e-05, | |
| "loss": 1.0046, | |
| "mean_token_accuracy": 0.741378902643919, | |
| "num_tokens": 137780168.0, | |
| "step": 1300 | |
| }, | |
| { | |
| "entropy": 1.47280178591609, | |
| "epoch": 0.24293562669510188, | |
| "grad_norm": 0.5859375, | |
| "learning_rate": 3.7863897645095495e-05, | |
| "loss": 0.967, | |
| "mean_token_accuracy": 0.7471256792545319, | |
| "num_tokens": 138834613.0, | |
| "step": 1310 | |
| }, | |
| { | |
| "entropy": 1.4809684470295905, | |
| "epoch": 0.2447900971278889, | |
| "grad_norm": 0.5703125, | |
| "learning_rate": 3.777118486927499e-05, | |
| "loss": 0.9643, | |
| "mean_token_accuracy": 0.7508305542171001, | |
| "num_tokens": 139912411.0, | |
| "step": 1320 | |
| }, | |
| { | |
| "entropy": 1.468713527917862, | |
| "epoch": 0.24664456756067596, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 3.767847209345448e-05, | |
| "loss": 0.9404, | |
| "mean_token_accuracy": 0.7560968786478043, | |
| "num_tokens": 140966807.0, | |
| "step": 1330 | |
| }, | |
| { | |
| "entropy": 1.5095023021101952, | |
| "epoch": 0.24849903799346298, | |
| "grad_norm": 0.609375, | |
| "learning_rate": 3.758575931763397e-05, | |
| "loss": 1.0102, | |
| "mean_token_accuracy": 0.7405634470283985, | |
| "num_tokens": 142046360.0, | |
| "step": 1340 | |
| }, | |
| { | |
| "entropy": 1.5112322628498078, | |
| "epoch": 0.25035350842625004, | |
| "grad_norm": 0.59375, | |
| "learning_rate": 3.749304654181346e-05, | |
| "loss": 1.0183, | |
| "mean_token_accuracy": 0.7409091025590897, | |
| "num_tokens": 143080934.0, | |
| "step": 1350 | |
| }, | |
| { | |
| "entropy": 1.521301081776619, | |
| "epoch": 0.25220797885903706, | |
| "grad_norm": 0.578125, | |
| "learning_rate": 3.7400333765992953e-05, | |
| "loss": 0.9798, | |
| "mean_token_accuracy": 0.7482521809637547, | |
| "num_tokens": 144128012.0, | |
| "step": 1360 | |
| }, | |
| { | |
| "entropy": 1.4606486022472382, | |
| "epoch": 0.2540624492918241, | |
| "grad_norm": 0.6484375, | |
| "learning_rate": 3.7307620990172445e-05, | |
| "loss": 0.9558, | |
| "mean_token_accuracy": 0.7522253535687924, | |
| "num_tokens": 145186509.0, | |
| "step": 1370 | |
| }, | |
| { | |
| "entropy": 1.4678262144327163, | |
| "epoch": 0.2559169197246111, | |
| "grad_norm": 0.59765625, | |
| "learning_rate": 3.721490821435194e-05, | |
| "loss": 0.9641, | |
| "mean_token_accuracy": 0.7514731995761395, | |
| "num_tokens": 146238399.0, | |
| "step": 1380 | |
| }, | |
| { | |
| "entropy": 1.5264454185962677, | |
| "epoch": 0.2577713901573982, | |
| "grad_norm": 0.59765625, | |
| "learning_rate": 3.7122195438531435e-05, | |
| "loss": 0.9914, | |
| "mean_token_accuracy": 0.7449414484202862, | |
| "num_tokens": 147294065.0, | |
| "step": 1390 | |
| }, | |
| { | |
| "entropy": 1.4994682416319847, | |
| "epoch": 0.2596258605901852, | |
| "grad_norm": 0.64453125, | |
| "learning_rate": 3.702948266271093e-05, | |
| "loss": 0.9968, | |
| "mean_token_accuracy": 0.7422976262867451, | |
| "num_tokens": 148377295.0, | |
| "step": 1400 | |
| }, | |
| { | |
| "entropy": 1.4982701525092126, | |
| "epoch": 0.26148033102297225, | |
| "grad_norm": 0.59765625, | |
| "learning_rate": 3.693676988689042e-05, | |
| "loss": 0.9463, | |
| "mean_token_accuracy": 0.7523621313273907, | |
| "num_tokens": 149426310.0, | |
| "step": 1410 | |
| }, | |
| { | |
| "entropy": 1.47211245149374, | |
| "epoch": 0.26333480145575927, | |
| "grad_norm": 0.578125, | |
| "learning_rate": 3.684405711106991e-05, | |
| "loss": 0.9609, | |
| "mean_token_accuracy": 0.7525891557335853, | |
| "num_tokens": 150501686.0, | |
| "step": 1420 | |
| }, | |
| { | |
| "entropy": 1.494711072742939, | |
| "epoch": 0.26518927188854635, | |
| "grad_norm": 0.62109375, | |
| "learning_rate": 3.67513443352494e-05, | |
| "loss": 0.9851, | |
| "mean_token_accuracy": 0.7467999920248986, | |
| "num_tokens": 151559601.0, | |
| "step": 1430 | |
| }, | |
| { | |
| "entropy": 1.4969945684075356, | |
| "epoch": 0.2670437423213334, | |
| "grad_norm": 0.5859375, | |
| "learning_rate": 3.665863155942889e-05, | |
| "loss": 0.988, | |
| "mean_token_accuracy": 0.744317352026701, | |
| "num_tokens": 152597095.0, | |
| "step": 1440 | |
| }, | |
| { | |
| "entropy": 1.504962182044983, | |
| "epoch": 0.2688982127541204, | |
| "grad_norm": 0.61328125, | |
| "learning_rate": 3.6565918783608385e-05, | |
| "loss": 0.9703, | |
| "mean_token_accuracy": 0.7496489994227886, | |
| "num_tokens": 153637105.0, | |
| "step": 1450 | |
| }, | |
| { | |
| "entropy": 1.5350501209497451, | |
| "epoch": 0.27075268318690743, | |
| "grad_norm": 0.578125, | |
| "learning_rate": 3.6473206007787877e-05, | |
| "loss": 0.9836, | |
| "mean_token_accuracy": 0.7430909477174282, | |
| "num_tokens": 154669656.0, | |
| "step": 1460 | |
| }, | |
| { | |
| "entropy": 1.4891722574830055, | |
| "epoch": 0.27260715361969445, | |
| "grad_norm": 0.57421875, | |
| "learning_rate": 3.638049323196737e-05, | |
| "loss": 0.9551, | |
| "mean_token_accuracy": 0.7513696648180485, | |
| "num_tokens": 155729440.0, | |
| "step": 1470 | |
| }, | |
| { | |
| "entropy": 1.4993504837155343, | |
| "epoch": 0.27446162405248153, | |
| "grad_norm": 0.60546875, | |
| "learning_rate": 3.628778045614686e-05, | |
| "loss": 0.9512, | |
| "mean_token_accuracy": 0.7535622462630271, | |
| "num_tokens": 156797340.0, | |
| "step": 1480 | |
| }, | |
| { | |
| "entropy": 1.5004150480031968, | |
| "epoch": 0.27631609448526856, | |
| "grad_norm": 0.6015625, | |
| "learning_rate": 3.619506768032635e-05, | |
| "loss": 0.9907, | |
| "mean_token_accuracy": 0.7451579593122005, | |
| "num_tokens": 157845176.0, | |
| "step": 1490 | |
| }, | |
| { | |
| "entropy": 1.4831660851836204, | |
| "epoch": 0.2781705649180556, | |
| "grad_norm": 0.625, | |
| "learning_rate": 3.610235490450584e-05, | |
| "loss": 0.9825, | |
| "mean_token_accuracy": 0.7440999209880829, | |
| "num_tokens": 158883545.0, | |
| "step": 1500 | |
| }, | |
| { | |
| "entropy": 1.4968566060066224, | |
| "epoch": 0.2800250353508426, | |
| "grad_norm": 0.61328125, | |
| "learning_rate": 3.6009642128685335e-05, | |
| "loss": 0.9741, | |
| "mean_token_accuracy": 0.7467134885489941, | |
| "num_tokens": 159924615.0, | |
| "step": 1510 | |
| }, | |
| { | |
| "entropy": 1.4774213805794716, | |
| "epoch": 0.28187950578362964, | |
| "grad_norm": 0.5703125, | |
| "learning_rate": 3.5916929352864826e-05, | |
| "loss": 0.9695, | |
| "mean_token_accuracy": 0.7494573444128036, | |
| "num_tokens": 160966123.0, | |
| "step": 1520 | |
| }, | |
| { | |
| "entropy": 1.4931775212287903, | |
| "epoch": 0.2837339762164167, | |
| "grad_norm": 0.59375, | |
| "learning_rate": 3.582421657704432e-05, | |
| "loss": 0.9583, | |
| "mean_token_accuracy": 0.7511327356100083, | |
| "num_tokens": 162040108.0, | |
| "step": 1530 | |
| }, | |
| { | |
| "entropy": 1.5106368839740754, | |
| "epoch": 0.28558844664920374, | |
| "grad_norm": 0.60546875, | |
| "learning_rate": 3.573150380122381e-05, | |
| "loss": 1.0109, | |
| "mean_token_accuracy": 0.7396759584546089, | |
| "num_tokens": 163114195.0, | |
| "step": 1540 | |
| }, | |
| { | |
| "entropy": 1.4995526686310767, | |
| "epoch": 0.28744291708199077, | |
| "grad_norm": 0.625, | |
| "learning_rate": 3.56387910254033e-05, | |
| "loss": 1.0012, | |
| "mean_token_accuracy": 0.7397923909127713, | |
| "num_tokens": 164176306.0, | |
| "step": 1550 | |
| }, | |
| { | |
| "entropy": 1.4953254774212836, | |
| "epoch": 0.2892973875147778, | |
| "grad_norm": 0.6328125, | |
| "learning_rate": 3.554607824958279e-05, | |
| "loss": 0.9878, | |
| "mean_token_accuracy": 0.7479016840457916, | |
| "num_tokens": 165223454.0, | |
| "step": 1560 | |
| }, | |
| { | |
| "entropy": 1.5092918753623963, | |
| "epoch": 0.2911518579475649, | |
| "grad_norm": 0.55078125, | |
| "learning_rate": 3.5453365473762284e-05, | |
| "loss": 0.9576, | |
| "mean_token_accuracy": 0.7502336800098419, | |
| "num_tokens": 166308635.0, | |
| "step": 1570 | |
| }, | |
| { | |
| "entropy": 1.5152800485491753, | |
| "epoch": 0.2930063283803519, | |
| "grad_norm": 0.58984375, | |
| "learning_rate": 3.5360652697941776e-05, | |
| "loss": 0.9817, | |
| "mean_token_accuracy": 0.7467466108500957, | |
| "num_tokens": 167357475.0, | |
| "step": 1580 | |
| }, | |
| { | |
| "entropy": 1.5347634688019753, | |
| "epoch": 0.2948607988131389, | |
| "grad_norm": 0.56640625, | |
| "learning_rate": 3.526793992212127e-05, | |
| "loss": 0.9901, | |
| "mean_token_accuracy": 0.7459274418652058, | |
| "num_tokens": 168413479.0, | |
| "step": 1590 | |
| }, | |
| { | |
| "entropy": 1.5061669379472733, | |
| "epoch": 0.29671526924592595, | |
| "grad_norm": 0.61328125, | |
| "learning_rate": 3.517522714630076e-05, | |
| "loss": 0.989, | |
| "mean_token_accuracy": 0.7424930393695831, | |
| "num_tokens": 169468304.0, | |
| "step": 1600 | |
| }, | |
| { | |
| "entropy": 1.4844420589506626, | |
| "epoch": 0.298569739678713, | |
| "grad_norm": 0.5859375, | |
| "learning_rate": 3.508251437048025e-05, | |
| "loss": 0.9882, | |
| "mean_token_accuracy": 0.7433069244027137, | |
| "num_tokens": 170559283.0, | |
| "step": 1610 | |
| }, | |
| { | |
| "entropy": 1.4843671947717667, | |
| "epoch": 0.30042421011150006, | |
| "grad_norm": 0.6015625, | |
| "learning_rate": 3.498980159465974e-05, | |
| "loss": 0.9713, | |
| "mean_token_accuracy": 0.7474540777504444, | |
| "num_tokens": 171595438.0, | |
| "step": 1620 | |
| }, | |
| { | |
| "entropy": 1.4851327747106553, | |
| "epoch": 0.3022786805442871, | |
| "grad_norm": 0.66015625, | |
| "learning_rate": 3.4897088818839234e-05, | |
| "loss": 0.9658, | |
| "mean_token_accuracy": 0.7487363338470459, | |
| "num_tokens": 172700560.0, | |
| "step": 1630 | |
| }, | |
| { | |
| "entropy": 1.493249311298132, | |
| "epoch": 0.3041331509770741, | |
| "grad_norm": 0.56640625, | |
| "learning_rate": 3.4804376043018726e-05, | |
| "loss": 0.9949, | |
| "mean_token_accuracy": 0.7439596347510815, | |
| "num_tokens": 173741924.0, | |
| "step": 1640 | |
| }, | |
| { | |
| "entropy": 1.523690328001976, | |
| "epoch": 0.30598762140986113, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 3.471166326719822e-05, | |
| "loss": 0.9937, | |
| "mean_token_accuracy": 0.7441366732120513, | |
| "num_tokens": 174772510.0, | |
| "step": 1650 | |
| }, | |
| { | |
| "entropy": 1.4831879079341888, | |
| "epoch": 0.30784209184264816, | |
| "grad_norm": 0.6015625, | |
| "learning_rate": 3.461895049137771e-05, | |
| "loss": 0.9667, | |
| "mean_token_accuracy": 0.7515099786221981, | |
| "num_tokens": 175812474.0, | |
| "step": 1660 | |
| }, | |
| { | |
| "entropy": 1.520809744298458, | |
| "epoch": 0.30969656227543524, | |
| "grad_norm": 0.5703125, | |
| "learning_rate": 3.452623771555721e-05, | |
| "loss": 0.9632, | |
| "mean_token_accuracy": 0.751189862191677, | |
| "num_tokens": 176888248.0, | |
| "step": 1670 | |
| }, | |
| { | |
| "entropy": 1.4896851450204849, | |
| "epoch": 0.31155103270822226, | |
| "grad_norm": 0.60546875, | |
| "learning_rate": 3.44335249397367e-05, | |
| "loss": 0.962, | |
| "mean_token_accuracy": 0.7493222691118717, | |
| "num_tokens": 177955268.0, | |
| "step": 1680 | |
| }, | |
| { | |
| "entropy": 1.4986834585666657, | |
| "epoch": 0.3134055031410093, | |
| "grad_norm": 0.5859375, | |
| "learning_rate": 3.434081216391619e-05, | |
| "loss": 0.9713, | |
| "mean_token_accuracy": 0.7483300775289535, | |
| "num_tokens": 179026897.0, | |
| "step": 1690 | |
| }, | |
| { | |
| "entropy": 1.4932934895157814, | |
| "epoch": 0.3152599735737963, | |
| "grad_norm": 0.578125, | |
| "learning_rate": 3.424809938809568e-05, | |
| "loss": 0.9734, | |
| "mean_token_accuracy": 0.7462395824491977, | |
| "num_tokens": 180093781.0, | |
| "step": 1700 | |
| }, | |
| { | |
| "entropy": 1.497939896583557, | |
| "epoch": 0.3171144440065834, | |
| "grad_norm": 0.5546875, | |
| "learning_rate": 3.4155386612275174e-05, | |
| "loss": 0.9927, | |
| "mean_token_accuracy": 0.7445473819971085, | |
| "num_tokens": 181140087.0, | |
| "step": 1710 | |
| }, | |
| { | |
| "entropy": 1.4972395211458207, | |
| "epoch": 0.3189689144393704, | |
| "grad_norm": 0.60546875, | |
| "learning_rate": 3.4062673836454666e-05, | |
| "loss": 0.9879, | |
| "mean_token_accuracy": 0.7440039031207561, | |
| "num_tokens": 182201728.0, | |
| "step": 1720 | |
| }, | |
| { | |
| "entropy": 1.4789512246847152, | |
| "epoch": 0.32082338487215745, | |
| "grad_norm": 0.6015625, | |
| "learning_rate": 3.396996106063416e-05, | |
| "loss": 0.9614, | |
| "mean_token_accuracy": 0.7514682106673718, | |
| "num_tokens": 183265211.0, | |
| "step": 1730 | |
| }, | |
| { | |
| "entropy": 1.4693973064422607, | |
| "epoch": 0.32267785530494447, | |
| "grad_norm": 0.61328125, | |
| "learning_rate": 3.387724828481365e-05, | |
| "loss": 0.9645, | |
| "mean_token_accuracy": 0.7500231243669987, | |
| "num_tokens": 184347041.0, | |
| "step": 1740 | |
| }, | |
| { | |
| "entropy": 1.4899096086621284, | |
| "epoch": 0.3245323257377315, | |
| "grad_norm": 0.58984375, | |
| "learning_rate": 3.378453550899314e-05, | |
| "loss": 0.9601, | |
| "mean_token_accuracy": 0.749112206697464, | |
| "num_tokens": 185411788.0, | |
| "step": 1750 | |
| }, | |
| { | |
| "entropy": 1.514881856739521, | |
| "epoch": 0.3263867961705186, | |
| "grad_norm": 0.609375, | |
| "learning_rate": 3.369182273317263e-05, | |
| "loss": 0.9808, | |
| "mean_token_accuracy": 0.745984373241663, | |
| "num_tokens": 186482339.0, | |
| "step": 1760 | |
| }, | |
| { | |
| "entropy": 1.459674072265625, | |
| "epoch": 0.3282412666033056, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 3.3599109957352124e-05, | |
| "loss": 0.9168, | |
| "mean_token_accuracy": 0.7601988978683949, | |
| "num_tokens": 187526267.0, | |
| "step": 1770 | |
| }, | |
| { | |
| "entropy": 1.5338073745369911, | |
| "epoch": 0.33009573703609263, | |
| "grad_norm": 0.6015625, | |
| "learning_rate": 3.3506397181531615e-05, | |
| "loss": 1.0065, | |
| "mean_token_accuracy": 0.739216773211956, | |
| "num_tokens": 188582544.0, | |
| "step": 1780 | |
| }, | |
| { | |
| "entropy": 1.492903406918049, | |
| "epoch": 0.33195020746887965, | |
| "grad_norm": 0.546875, | |
| "learning_rate": 3.341368440571111e-05, | |
| "loss": 0.9633, | |
| "mean_token_accuracy": 0.7488138407468796, | |
| "num_tokens": 189627447.0, | |
| "step": 1790 | |
| }, | |
| { | |
| "entropy": 1.5170182392001152, | |
| "epoch": 0.3338046779016667, | |
| "grad_norm": 0.58984375, | |
| "learning_rate": 3.33209716298906e-05, | |
| "loss": 0.9682, | |
| "mean_token_accuracy": 0.7511688120663166, | |
| "num_tokens": 190672216.0, | |
| "step": 1800 | |
| }, | |
| { | |
| "entropy": 1.5092007040977478, | |
| "epoch": 0.33565914833445376, | |
| "grad_norm": 0.578125, | |
| "learning_rate": 3.322825885407009e-05, | |
| "loss": 1.0091, | |
| "mean_token_accuracy": 0.7405895851552486, | |
| "num_tokens": 191725828.0, | |
| "step": 1810 | |
| }, | |
| { | |
| "entropy": 1.5075189456343652, | |
| "epoch": 0.3375136187672408, | |
| "grad_norm": 0.5703125, | |
| "learning_rate": 3.313554607824958e-05, | |
| "loss": 0.9983, | |
| "mean_token_accuracy": 0.7437210775911808, | |
| "num_tokens": 192780642.0, | |
| "step": 1820 | |
| }, | |
| { | |
| "entropy": 1.5275956213474273, | |
| "epoch": 0.3393680892000278, | |
| "grad_norm": 0.61328125, | |
| "learning_rate": 3.304283330242908e-05, | |
| "loss": 0.9815, | |
| "mean_token_accuracy": 0.7441540375351906, | |
| "num_tokens": 193829443.0, | |
| "step": 1830 | |
| }, | |
| { | |
| "entropy": 1.5041967332363129, | |
| "epoch": 0.34122255963281484, | |
| "grad_norm": 0.57421875, | |
| "learning_rate": 3.295012052660857e-05, | |
| "loss": 0.9678, | |
| "mean_token_accuracy": 0.7466503635048867, | |
| "num_tokens": 194889098.0, | |
| "step": 1840 | |
| }, | |
| { | |
| "entropy": 1.5120372980833054, | |
| "epoch": 0.3430770300656019, | |
| "grad_norm": 0.6484375, | |
| "learning_rate": 3.2857407750788064e-05, | |
| "loss": 1.0067, | |
| "mean_token_accuracy": 0.7406684674322606, | |
| "num_tokens": 195920081.0, | |
| "step": 1850 | |
| }, | |
| { | |
| "entropy": 1.5008864670991897, | |
| "epoch": 0.34493150049838894, | |
| "grad_norm": 0.64453125, | |
| "learning_rate": 3.2764694974967555e-05, | |
| "loss": 0.9604, | |
| "mean_token_accuracy": 0.750801931321621, | |
| "num_tokens": 196972010.0, | |
| "step": 1860 | |
| }, | |
| { | |
| "entropy": 1.4698647096753121, | |
| "epoch": 0.34678597093117597, | |
| "grad_norm": 0.6015625, | |
| "learning_rate": 3.267198219914705e-05, | |
| "loss": 0.9683, | |
| "mean_token_accuracy": 0.7496237128973007, | |
| "num_tokens": 198032653.0, | |
| "step": 1870 | |
| }, | |
| { | |
| "entropy": 1.57385745793581, | |
| "epoch": 0.348640441363963, | |
| "grad_norm": 0.62109375, | |
| "learning_rate": 3.257926942332654e-05, | |
| "loss": 1.0244, | |
| "mean_token_accuracy": 0.738067239522934, | |
| "num_tokens": 199076250.0, | |
| "step": 1880 | |
| }, | |
| { | |
| "entropy": 1.5126190572977065, | |
| "epoch": 0.35049491179675, | |
| "grad_norm": 0.59765625, | |
| "learning_rate": 3.248655664750603e-05, | |
| "loss": 0.995, | |
| "mean_token_accuracy": 0.741955729573965, | |
| "num_tokens": 200137212.0, | |
| "step": 1890 | |
| }, | |
| { | |
| "entropy": 1.5020122960209847, | |
| "epoch": 0.3523493822295371, | |
| "grad_norm": 0.59765625, | |
| "learning_rate": 3.239384387168552e-05, | |
| "loss": 0.9853, | |
| "mean_token_accuracy": 0.7463338769972324, | |
| "num_tokens": 201216761.0, | |
| "step": 1900 | |
| }, | |
| { | |
| "entropy": 1.4905213937163353, | |
| "epoch": 0.3542038526623241, | |
| "grad_norm": 0.57421875, | |
| "learning_rate": 3.230113109586501e-05, | |
| "loss": 0.9425, | |
| "mean_token_accuracy": 0.7550199687480926, | |
| "num_tokens": 202268167.0, | |
| "step": 1910 | |
| }, | |
| { | |
| "entropy": 1.4862460494041443, | |
| "epoch": 0.35605832309511115, | |
| "grad_norm": 0.5703125, | |
| "learning_rate": 3.2208418320044505e-05, | |
| "loss": 0.9788, | |
| "mean_token_accuracy": 0.745230408012867, | |
| "num_tokens": 203314805.0, | |
| "step": 1920 | |
| }, | |
| { | |
| "entropy": 1.4856702387332916, | |
| "epoch": 0.3579127935278982, | |
| "grad_norm": 0.5546875, | |
| "learning_rate": 3.2115705544224e-05, | |
| "loss": 0.9742, | |
| "mean_token_accuracy": 0.7479361824691295, | |
| "num_tokens": 204393323.0, | |
| "step": 1930 | |
| }, | |
| { | |
| "entropy": 1.4965133637189865, | |
| "epoch": 0.3597672639606852, | |
| "grad_norm": 0.5703125, | |
| "learning_rate": 3.202299276840349e-05, | |
| "loss": 0.9926, | |
| "mean_token_accuracy": 0.7441742211580277, | |
| "num_tokens": 205446845.0, | |
| "step": 1940 | |
| }, | |
| { | |
| "entropy": 1.477495039999485, | |
| "epoch": 0.3616217343934723, | |
| "grad_norm": 0.5859375, | |
| "learning_rate": 3.193027999258298e-05, | |
| "loss": 0.9485, | |
| "mean_token_accuracy": 0.7558086968958377, | |
| "num_tokens": 206508930.0, | |
| "step": 1950 | |
| }, | |
| { | |
| "entropy": 1.4898368567228317, | |
| "epoch": 0.3634762048262593, | |
| "grad_norm": 0.578125, | |
| "learning_rate": 3.183756721676247e-05, | |
| "loss": 0.9436, | |
| "mean_token_accuracy": 0.75498091802001, | |
| "num_tokens": 207579206.0, | |
| "step": 1960 | |
| }, | |
| { | |
| "entropy": 1.4964845299720764, | |
| "epoch": 0.36533067525904633, | |
| "grad_norm": 0.625, | |
| "learning_rate": 3.174485444094196e-05, | |
| "loss": 0.9594, | |
| "mean_token_accuracy": 0.7514640748500824, | |
| "num_tokens": 208652674.0, | |
| "step": 1970 | |
| }, | |
| { | |
| "entropy": 1.525666716694832, | |
| "epoch": 0.36718514569183336, | |
| "grad_norm": 0.57421875, | |
| "learning_rate": 3.1652141665121455e-05, | |
| "loss": 1.0205, | |
| "mean_token_accuracy": 0.7403210617601872, | |
| "num_tokens": 209702217.0, | |
| "step": 1980 | |
| }, | |
| { | |
| "entropy": 1.4894435659050942, | |
| "epoch": 0.36903961612462044, | |
| "grad_norm": 0.578125, | |
| "learning_rate": 3.1559428889300946e-05, | |
| "loss": 0.9834, | |
| "mean_token_accuracy": 0.7469839192926884, | |
| "num_tokens": 210754519.0, | |
| "step": 1990 | |
| }, | |
| { | |
| "entropy": 1.4560721650719644, | |
| "epoch": 0.37089408655740747, | |
| "grad_norm": 0.5859375, | |
| "learning_rate": 3.146671611348044e-05, | |
| "loss": 0.9091, | |
| "mean_token_accuracy": 0.7607064582407475, | |
| "num_tokens": 211830747.0, | |
| "step": 2000 | |
| }, | |
| { | |
| "entropy": 1.487524376809597, | |
| "epoch": 0.3727485569901945, | |
| "grad_norm": 0.66015625, | |
| "learning_rate": 3.137400333765993e-05, | |
| "loss": 0.9462, | |
| "mean_token_accuracy": 0.7553953982889652, | |
| "num_tokens": 212900832.0, | |
| "step": 2010 | |
| }, | |
| { | |
| "entropy": 1.4863699212670327, | |
| "epoch": 0.3746030274229815, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 3.128129056183942e-05, | |
| "loss": 0.9824, | |
| "mean_token_accuracy": 0.7463753134012222, | |
| "num_tokens": 213986813.0, | |
| "step": 2020 | |
| }, | |
| { | |
| "entropy": 1.4884080365300179, | |
| "epoch": 0.37645749785576854, | |
| "grad_norm": 0.578125, | |
| "learning_rate": 3.118857778601891e-05, | |
| "loss": 0.9541, | |
| "mean_token_accuracy": 0.7518541663885117, | |
| "num_tokens": 215034477.0, | |
| "step": 2030 | |
| }, | |
| { | |
| "entropy": 1.4824556156992912, | |
| "epoch": 0.3783119682885556, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 3.1095865010198405e-05, | |
| "loss": 0.9297, | |
| "mean_token_accuracy": 0.757309564948082, | |
| "num_tokens": 216107587.0, | |
| "step": 2040 | |
| }, | |
| { | |
| "entropy": 1.5091084837913513, | |
| "epoch": 0.38016643872134265, | |
| "grad_norm": 0.59765625, | |
| "learning_rate": 3.1003152234377896e-05, | |
| "loss": 0.9881, | |
| "mean_token_accuracy": 0.7450352385640144, | |
| "num_tokens": 217164463.0, | |
| "step": 2050 | |
| }, | |
| { | |
| "entropy": 1.4799865633249283, | |
| "epoch": 0.3820209091541297, | |
| "grad_norm": 0.6015625, | |
| "learning_rate": 3.091043945855739e-05, | |
| "loss": 0.9328, | |
| "mean_token_accuracy": 0.7579915829002857, | |
| "num_tokens": 218211418.0, | |
| "step": 2060 | |
| }, | |
| { | |
| "entropy": 1.495280773937702, | |
| "epoch": 0.3838753795869167, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 3.081772668273688e-05, | |
| "loss": 0.9865, | |
| "mean_token_accuracy": 0.7481582410633564, | |
| "num_tokens": 219283144.0, | |
| "step": 2070 | |
| }, | |
| { | |
| "entropy": 1.4717520207166672, | |
| "epoch": 0.3857298500197037, | |
| "grad_norm": 0.58984375, | |
| "learning_rate": 3.072501390691637e-05, | |
| "loss": 0.9638, | |
| "mean_token_accuracy": 0.7489688910543919, | |
| "num_tokens": 220346088.0, | |
| "step": 2080 | |
| }, | |
| { | |
| "entropy": 1.5372625097632409, | |
| "epoch": 0.3875843204524908, | |
| "grad_norm": 0.64453125, | |
| "learning_rate": 3.063230113109586e-05, | |
| "loss": 1.0178, | |
| "mean_token_accuracy": 0.7383482500910759, | |
| "num_tokens": 221384371.0, | |
| "step": 2090 | |
| }, | |
| { | |
| "entropy": 1.513966853916645, | |
| "epoch": 0.38943879088527783, | |
| "grad_norm": 0.5859375, | |
| "learning_rate": 3.0539588355275354e-05, | |
| "loss": 0.9712, | |
| "mean_token_accuracy": 0.7481911785900592, | |
| "num_tokens": 222437994.0, | |
| "step": 2100 | |
| }, | |
| { | |
| "entropy": 1.5028332889080047, | |
| "epoch": 0.39129326131806486, | |
| "grad_norm": 0.60546875, | |
| "learning_rate": 3.044687557945485e-05, | |
| "loss": 0.9774, | |
| "mean_token_accuracy": 0.7485150240361691, | |
| "num_tokens": 223495329.0, | |
| "step": 2110 | |
| }, | |
| { | |
| "entropy": 1.499116560816765, | |
| "epoch": 0.3931477317508519, | |
| "grad_norm": 0.61328125, | |
| "learning_rate": 3.035416280363434e-05, | |
| "loss": 0.9651, | |
| "mean_token_accuracy": 0.7515447452664376, | |
| "num_tokens": 224569139.0, | |
| "step": 2120 | |
| }, | |
| { | |
| "entropy": 1.4938910588622094, | |
| "epoch": 0.39500220218363896, | |
| "grad_norm": 0.60546875, | |
| "learning_rate": 3.0261450027813836e-05, | |
| "loss": 0.9865, | |
| "mean_token_accuracy": 0.745938503742218, | |
| "num_tokens": 225635075.0, | |
| "step": 2130 | |
| }, | |
| { | |
| "entropy": 1.5413131147623063, | |
| "epoch": 0.396856672616426, | |
| "grad_norm": 0.625, | |
| "learning_rate": 3.0168737251993328e-05, | |
| "loss": 1.0066, | |
| "mean_token_accuracy": 0.7391817659139633, | |
| "num_tokens": 226672304.0, | |
| "step": 2140 | |
| }, | |
| { | |
| "entropy": 1.4767320200800895, | |
| "epoch": 0.398711143049213, | |
| "grad_norm": 0.890625, | |
| "learning_rate": 3.007602447617282e-05, | |
| "loss": 0.9462, | |
| "mean_token_accuracy": 0.7535664431750775, | |
| "num_tokens": 227735942.0, | |
| "step": 2150 | |
| }, | |
| { | |
| "entropy": 1.4989089101552964, | |
| "epoch": 0.40056561348200004, | |
| "grad_norm": 0.609375, | |
| "learning_rate": 2.998331170035231e-05, | |
| "loss": 0.9566, | |
| "mean_token_accuracy": 0.7526751324534416, | |
| "num_tokens": 228790369.0, | |
| "step": 2160 | |
| }, | |
| { | |
| "entropy": 1.5012748405337333, | |
| "epoch": 0.40242008391478706, | |
| "grad_norm": 0.60546875, | |
| "learning_rate": 2.9890598924531803e-05, | |
| "loss": 0.9661, | |
| "mean_token_accuracy": 0.7478947170078755, | |
| "num_tokens": 229850971.0, | |
| "step": 2170 | |
| }, | |
| { | |
| "entropy": 1.4950363628566266, | |
| "epoch": 0.40427455434757414, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 2.9797886148711298e-05, | |
| "loss": 0.9851, | |
| "mean_token_accuracy": 0.7478720583021641, | |
| "num_tokens": 230907089.0, | |
| "step": 2180 | |
| }, | |
| { | |
| "entropy": 1.5064437121152878, | |
| "epoch": 0.40612902478036117, | |
| "grad_norm": 0.578125, | |
| "learning_rate": 2.970517337289079e-05, | |
| "loss": 0.9872, | |
| "mean_token_accuracy": 0.7438164681196213, | |
| "num_tokens": 231960587.0, | |
| "step": 2190 | |
| }, | |
| { | |
| "entropy": 1.4914230838418008, | |
| "epoch": 0.4079834952131482, | |
| "grad_norm": 0.61328125, | |
| "learning_rate": 2.961246059707028e-05, | |
| "loss": 0.9675, | |
| "mean_token_accuracy": 0.74920649304986, | |
| "num_tokens": 233042527.0, | |
| "step": 2200 | |
| }, | |
| { | |
| "entropy": 1.5093663066625596, | |
| "epoch": 0.4098379656459352, | |
| "grad_norm": 0.640625, | |
| "learning_rate": 2.9519747821249772e-05, | |
| "loss": 0.9678, | |
| "mean_token_accuracy": 0.7494742095470428, | |
| "num_tokens": 234104443.0, | |
| "step": 2210 | |
| }, | |
| { | |
| "entropy": 1.4405021965503693, | |
| "epoch": 0.41169243607872225, | |
| "grad_norm": 0.546875, | |
| "learning_rate": 2.9427035045429264e-05, | |
| "loss": 0.9488, | |
| "mean_token_accuracy": 0.754053720831871, | |
| "num_tokens": 235212312.0, | |
| "step": 2220 | |
| }, | |
| { | |
| "entropy": 1.5070395275950432, | |
| "epoch": 0.4135469065115093, | |
| "grad_norm": 0.625, | |
| "learning_rate": 2.9334322269608756e-05, | |
| "loss": 1.0017, | |
| "mean_token_accuracy": 0.7438942410051823, | |
| "num_tokens": 236272812.0, | |
| "step": 2230 | |
| }, | |
| { | |
| "entropy": 1.4942486226558684, | |
| "epoch": 0.41540137694429635, | |
| "grad_norm": 0.6328125, | |
| "learning_rate": 2.9241609493788247e-05, | |
| "loss": 0.9668, | |
| "mean_token_accuracy": 0.7500807568430901, | |
| "num_tokens": 237342813.0, | |
| "step": 2240 | |
| }, | |
| { | |
| "entropy": 1.478409792482853, | |
| "epoch": 0.4172558473770834, | |
| "grad_norm": 0.62109375, | |
| "learning_rate": 2.914889671796774e-05, | |
| "loss": 0.9608, | |
| "mean_token_accuracy": 0.7491750225424767, | |
| "num_tokens": 238432371.0, | |
| "step": 2250 | |
| }, | |
| { | |
| "entropy": 1.473175536096096, | |
| "epoch": 0.4191103178098704, | |
| "grad_norm": 0.609375, | |
| "learning_rate": 2.905618394214723e-05, | |
| "loss": 0.9479, | |
| "mean_token_accuracy": 0.7542769216001034, | |
| "num_tokens": 239500279.0, | |
| "step": 2260 | |
| }, | |
| { | |
| "entropy": 1.481423157453537, | |
| "epoch": 0.4209647882426575, | |
| "grad_norm": 0.625, | |
| "learning_rate": 2.8963471166326722e-05, | |
| "loss": 0.9839, | |
| "mean_token_accuracy": 0.7434468001127243, | |
| "num_tokens": 240555389.0, | |
| "step": 2270 | |
| }, | |
| { | |
| "entropy": 1.4694676980376244, | |
| "epoch": 0.4228192586754445, | |
| "grad_norm": 0.6015625, | |
| "learning_rate": 2.8870758390506214e-05, | |
| "loss": 0.9543, | |
| "mean_token_accuracy": 0.7546335682272911, | |
| "num_tokens": 241597038.0, | |
| "step": 2280 | |
| }, | |
| { | |
| "entropy": 1.4982939019799233, | |
| "epoch": 0.42467372910823153, | |
| "grad_norm": 0.5859375, | |
| "learning_rate": 2.8778045614685705e-05, | |
| "loss": 0.9902, | |
| "mean_token_accuracy": 0.7457614749670028, | |
| "num_tokens": 242684521.0, | |
| "step": 2290 | |
| }, | |
| { | |
| "entropy": 1.521801333129406, | |
| "epoch": 0.42652819954101856, | |
| "grad_norm": 0.6484375, | |
| "learning_rate": 2.8685332838865197e-05, | |
| "loss": 0.9603, | |
| "mean_token_accuracy": 0.7483425281941891, | |
| "num_tokens": 243748375.0, | |
| "step": 2300 | |
| }, | |
| { | |
| "entropy": 1.4592545449733734, | |
| "epoch": 0.4283826699738056, | |
| "grad_norm": 0.515625, | |
| "learning_rate": 2.859262006304469e-05, | |
| "loss": 0.9493, | |
| "mean_token_accuracy": 0.7528668269515038, | |
| "num_tokens": 244813166.0, | |
| "step": 2310 | |
| }, | |
| { | |
| "entropy": 1.496128936111927, | |
| "epoch": 0.43023714040659267, | |
| "grad_norm": 0.55859375, | |
| "learning_rate": 2.849990728722418e-05, | |
| "loss": 0.9696, | |
| "mean_token_accuracy": 0.7490008033812046, | |
| "num_tokens": 245887016.0, | |
| "step": 2320 | |
| }, | |
| { | |
| "entropy": 1.4878642790019512, | |
| "epoch": 0.4320916108393797, | |
| "grad_norm": 1.1328125, | |
| "learning_rate": 2.8407194511403672e-05, | |
| "loss": 0.9878, | |
| "mean_token_accuracy": 0.7439650490880012, | |
| "num_tokens": 246926092.0, | |
| "step": 2330 | |
| }, | |
| { | |
| "entropy": 1.4560546413064004, | |
| "epoch": 0.4339460812721667, | |
| "grad_norm": 0.55859375, | |
| "learning_rate": 2.8314481735583164e-05, | |
| "loss": 0.9455, | |
| "mean_token_accuracy": 0.7542134150862694, | |
| "num_tokens": 247986019.0, | |
| "step": 2340 | |
| }, | |
| { | |
| "entropy": 1.5113477945327758, | |
| "epoch": 0.43580055170495374, | |
| "grad_norm": 0.6015625, | |
| "learning_rate": 2.8221768959762655e-05, | |
| "loss": 0.9715, | |
| "mean_token_accuracy": 0.749272059649229, | |
| "num_tokens": 249051241.0, | |
| "step": 2350 | |
| }, | |
| { | |
| "entropy": 1.4563384518027305, | |
| "epoch": 0.43765502213774077, | |
| "grad_norm": 0.5703125, | |
| "learning_rate": 2.8129056183942147e-05, | |
| "loss": 0.9464, | |
| "mean_token_accuracy": 0.7528363347053528, | |
| "num_tokens": 250105280.0, | |
| "step": 2360 | |
| }, | |
| { | |
| "entropy": 1.4920080468058585, | |
| "epoch": 0.43950949257052785, | |
| "grad_norm": 0.55859375, | |
| "learning_rate": 2.8036343408121642e-05, | |
| "loss": 0.9825, | |
| "mean_token_accuracy": 0.74784589484334, | |
| "num_tokens": 251169434.0, | |
| "step": 2370 | |
| }, | |
| { | |
| "entropy": 1.4813191577792169, | |
| "epoch": 0.4413639630033149, | |
| "grad_norm": 0.609375, | |
| "learning_rate": 2.7943630632301133e-05, | |
| "loss": 0.9475, | |
| "mean_token_accuracy": 0.752784612774849, | |
| "num_tokens": 252211794.0, | |
| "step": 2380 | |
| }, | |
| { | |
| "entropy": 1.492856302857399, | |
| "epoch": 0.4432184334361019, | |
| "grad_norm": 0.64453125, | |
| "learning_rate": 2.7850917856480625e-05, | |
| "loss": 0.959, | |
| "mean_token_accuracy": 0.7523039512336254, | |
| "num_tokens": 253259848.0, | |
| "step": 2390 | |
| }, | |
| { | |
| "entropy": 1.5127600625157356, | |
| "epoch": 0.4450729038688889, | |
| "grad_norm": 0.58984375, | |
| "learning_rate": 2.7758205080660117e-05, | |
| "loss": 1.0054, | |
| "mean_token_accuracy": 0.7410042941570282, | |
| "num_tokens": 254328601.0, | |
| "step": 2400 | |
| }, | |
| { | |
| "entropy": 1.454398836195469, | |
| "epoch": 0.44692737430167595, | |
| "grad_norm": 0.62890625, | |
| "learning_rate": 2.766549230483961e-05, | |
| "loss": 0.9416, | |
| "mean_token_accuracy": 0.7552081108093261, | |
| "num_tokens": 255348940.0, | |
| "step": 2410 | |
| }, | |
| { | |
| "entropy": 1.5135699644684792, | |
| "epoch": 0.44878184473446303, | |
| "grad_norm": 0.6328125, | |
| "learning_rate": 2.75727795290191e-05, | |
| "loss": 0.9744, | |
| "mean_token_accuracy": 0.7477516449987889, | |
| "num_tokens": 256398405.0, | |
| "step": 2420 | |
| }, | |
| { | |
| "entropy": 1.4741832569241524, | |
| "epoch": 0.45063631516725006, | |
| "grad_norm": 0.546875, | |
| "learning_rate": 2.748006675319859e-05, | |
| "loss": 0.9375, | |
| "mean_token_accuracy": 0.753797996789217, | |
| "num_tokens": 257444530.0, | |
| "step": 2430 | |
| }, | |
| { | |
| "entropy": 1.48117203861475, | |
| "epoch": 0.4524907856000371, | |
| "grad_norm": 0.55859375, | |
| "learning_rate": 2.7387353977378083e-05, | |
| "loss": 0.9791, | |
| "mean_token_accuracy": 0.7464524216949939, | |
| "num_tokens": 258537377.0, | |
| "step": 2440 | |
| }, | |
| { | |
| "entropy": 1.524972130358219, | |
| "epoch": 0.4543452560328241, | |
| "grad_norm": 0.62109375, | |
| "learning_rate": 2.7294641201557575e-05, | |
| "loss": 1.0074, | |
| "mean_token_accuracy": 0.7406359665095806, | |
| "num_tokens": 259586462.0, | |
| "step": 2450 | |
| }, | |
| { | |
| "entropy": 1.5012261882424354, | |
| "epoch": 0.4561997264656112, | |
| "grad_norm": 0.63671875, | |
| "learning_rate": 2.7201928425737067e-05, | |
| "loss": 0.9615, | |
| "mean_token_accuracy": 0.7501843862235547, | |
| "num_tokens": 260632871.0, | |
| "step": 2460 | |
| }, | |
| { | |
| "entropy": 1.4786252811551095, | |
| "epoch": 0.4580541968983982, | |
| "grad_norm": 0.5546875, | |
| "learning_rate": 2.7109215649916558e-05, | |
| "loss": 0.96, | |
| "mean_token_accuracy": 0.7490208201110363, | |
| "num_tokens": 261691442.0, | |
| "step": 2470 | |
| }, | |
| { | |
| "entropy": 1.4770388960838319, | |
| "epoch": 0.45990866733118524, | |
| "grad_norm": 0.56640625, | |
| "learning_rate": 2.701650287409605e-05, | |
| "loss": 0.9707, | |
| "mean_token_accuracy": 0.7508393302559853, | |
| "num_tokens": 262751076.0, | |
| "step": 2480 | |
| }, | |
| { | |
| "entropy": 1.5058487862348557, | |
| "epoch": 0.46176313776397226, | |
| "grad_norm": 0.5859375, | |
| "learning_rate": 2.692379009827554e-05, | |
| "loss": 0.9618, | |
| "mean_token_accuracy": 0.7508329167962075, | |
| "num_tokens": 263815315.0, | |
| "step": 2490 | |
| }, | |
| { | |
| "entropy": 1.5378570273518561, | |
| "epoch": 0.4636176081967593, | |
| "grad_norm": 0.58984375, | |
| "learning_rate": 2.6831077322455033e-05, | |
| "loss": 0.995, | |
| "mean_token_accuracy": 0.7431889310479164, | |
| "num_tokens": 264880145.0, | |
| "step": 2500 | |
| }, | |
| { | |
| "entropy": 1.4923908486962318, | |
| "epoch": 0.46547207862954637, | |
| "grad_norm": 0.625, | |
| "learning_rate": 2.6738364546634525e-05, | |
| "loss": 0.9708, | |
| "mean_token_accuracy": 0.7466483362019062, | |
| "num_tokens": 265940636.0, | |
| "step": 2510 | |
| }, | |
| { | |
| "entropy": 1.4968330979347229, | |
| "epoch": 0.4673265490623334, | |
| "grad_norm": 0.59375, | |
| "learning_rate": 2.6645651770814016e-05, | |
| "loss": 0.9674, | |
| "mean_token_accuracy": 0.7497738540172577, | |
| "num_tokens": 267012320.0, | |
| "step": 2520 | |
| }, | |
| { | |
| "entropy": 1.4725614503026008, | |
| "epoch": 0.4691810194951204, | |
| "grad_norm": 0.5703125, | |
| "learning_rate": 2.6552938994993508e-05, | |
| "loss": 0.9507, | |
| "mean_token_accuracy": 0.7529752597212791, | |
| "num_tokens": 268063135.0, | |
| "step": 2530 | |
| }, | |
| { | |
| "entropy": 1.4885497689247131, | |
| "epoch": 0.47103548992790745, | |
| "grad_norm": 0.625, | |
| "learning_rate": 2.6460226219173e-05, | |
| "loss": 0.9361, | |
| "mean_token_accuracy": 0.7578341595828533, | |
| "num_tokens": 269105968.0, | |
| "step": 2540 | |
| }, | |
| { | |
| "entropy": 1.5032660901546477, | |
| "epoch": 0.4728899603606945, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 2.636751344335249e-05, | |
| "loss": 0.9802, | |
| "mean_token_accuracy": 0.7475208982825279, | |
| "num_tokens": 270150663.0, | |
| "step": 2550 | |
| }, | |
| { | |
| "entropy": 1.4990951001644135, | |
| "epoch": 0.47474443079348155, | |
| "grad_norm": 0.57421875, | |
| "learning_rate": 2.6274800667531986e-05, | |
| "loss": 0.9973, | |
| "mean_token_accuracy": 0.7451814636588097, | |
| "num_tokens": 271216839.0, | |
| "step": 2560 | |
| }, | |
| { | |
| "entropy": 1.485686069726944, | |
| "epoch": 0.4765989012262686, | |
| "grad_norm": 0.60546875, | |
| "learning_rate": 2.6182087891711478e-05, | |
| "loss": 0.9651, | |
| "mean_token_accuracy": 0.7499825455248356, | |
| "num_tokens": 272292596.0, | |
| "step": 2570 | |
| }, | |
| { | |
| "entropy": 1.4842319995164872, | |
| "epoch": 0.4784533716590556, | |
| "grad_norm": 0.66015625, | |
| "learning_rate": 2.608937511589097e-05, | |
| "loss": 0.9805, | |
| "mean_token_accuracy": 0.7453496947884559, | |
| "num_tokens": 273336790.0, | |
| "step": 2580 | |
| }, | |
| { | |
| "entropy": 1.5063641920685769, | |
| "epoch": 0.48030784209184263, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 2.5996662340070464e-05, | |
| "loss": 0.9756, | |
| "mean_token_accuracy": 0.7470906026661396, | |
| "num_tokens": 274408187.0, | |
| "step": 2590 | |
| }, | |
| { | |
| "entropy": 1.4787015438079834, | |
| "epoch": 0.4821623125246297, | |
| "grad_norm": 0.578125, | |
| "learning_rate": 2.5903949564249956e-05, | |
| "loss": 0.9387, | |
| "mean_token_accuracy": 0.7545268073678016, | |
| "num_tokens": 275477280.0, | |
| "step": 2600 | |
| }, | |
| { | |
| "entropy": 1.4689789846539498, | |
| "epoch": 0.48401678295741674, | |
| "grad_norm": 0.578125, | |
| "learning_rate": 2.5811236788429448e-05, | |
| "loss": 0.9599, | |
| "mean_token_accuracy": 0.7509848773479462, | |
| "num_tokens": 276535000.0, | |
| "step": 2610 | |
| }, | |
| { | |
| "entropy": 1.4680343508720397, | |
| "epoch": 0.48587125339020376, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 2.571852401260894e-05, | |
| "loss": 0.9426, | |
| "mean_token_accuracy": 0.7553624622523785, | |
| "num_tokens": 277580274.0, | |
| "step": 2620 | |
| }, | |
| { | |
| "entropy": 1.4680645748972894, | |
| "epoch": 0.4877257238229908, | |
| "grad_norm": 0.5703125, | |
| "learning_rate": 2.5625811236788434e-05, | |
| "loss": 0.9673, | |
| "mean_token_accuracy": 0.7474786549806595, | |
| "num_tokens": 278646253.0, | |
| "step": 2630 | |
| }, | |
| { | |
| "entropy": 1.4596327722072602, | |
| "epoch": 0.4895801942557778, | |
| "grad_norm": 0.60546875, | |
| "learning_rate": 2.5533098460967926e-05, | |
| "loss": 0.966, | |
| "mean_token_accuracy": 0.7492500238120556, | |
| "num_tokens": 279699537.0, | |
| "step": 2640 | |
| }, | |
| { | |
| "entropy": 1.498211531341076, | |
| "epoch": 0.4914346646885649, | |
| "grad_norm": 0.6015625, | |
| "learning_rate": 2.5440385685147418e-05, | |
| "loss": 0.9724, | |
| "mean_token_accuracy": 0.7491111300885678, | |
| "num_tokens": 280774526.0, | |
| "step": 2650 | |
| }, | |
| { | |
| "entropy": 1.4900087475776673, | |
| "epoch": 0.4932891351213519, | |
| "grad_norm": 0.58984375, | |
| "learning_rate": 2.534767290932691e-05, | |
| "loss": 0.9564, | |
| "mean_token_accuracy": 0.752580750733614, | |
| "num_tokens": 281826301.0, | |
| "step": 2660 | |
| }, | |
| { | |
| "entropy": 1.4648251444101335, | |
| "epoch": 0.49514360555413894, | |
| "grad_norm": 0.5390625, | |
| "learning_rate": 2.52549601335064e-05, | |
| "loss": 0.9348, | |
| "mean_token_accuracy": 0.7561902843415738, | |
| "num_tokens": 282910944.0, | |
| "step": 2670 | |
| }, | |
| { | |
| "entropy": 1.4814371794462204, | |
| "epoch": 0.49699807598692597, | |
| "grad_norm": 0.59375, | |
| "learning_rate": 2.5162247357685893e-05, | |
| "loss": 0.9848, | |
| "mean_token_accuracy": 0.7451262950897217, | |
| "num_tokens": 283971901.0, | |
| "step": 2680 | |
| }, | |
| { | |
| "entropy": 1.4791611835360527, | |
| "epoch": 0.498852546419713, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 2.5069534581865384e-05, | |
| "loss": 0.9268, | |
| "mean_token_accuracy": 0.7565168611705303, | |
| "num_tokens": 285014033.0, | |
| "step": 2690 | |
| }, | |
| { | |
| "entropy": 1.4891420349478721, | |
| "epoch": 0.5007070168525001, | |
| "grad_norm": 0.578125, | |
| "learning_rate": 2.4976821806044876e-05, | |
| "loss": 0.9735, | |
| "mean_token_accuracy": 0.7464899241924285, | |
| "num_tokens": 286070419.0, | |
| "step": 2700 | |
| }, | |
| { | |
| "entropy": 1.4785126805305482, | |
| "epoch": 0.502561487285287, | |
| "grad_norm": 0.55078125, | |
| "learning_rate": 2.4884109030224367e-05, | |
| "loss": 0.936, | |
| "mean_token_accuracy": 0.7564310371875763, | |
| "num_tokens": 287115304.0, | |
| "step": 2710 | |
| }, | |
| { | |
| "entropy": 1.4757246285676957, | |
| "epoch": 0.5044159577180741, | |
| "grad_norm": 0.62109375, | |
| "learning_rate": 2.479139625440386e-05, | |
| "loss": 0.9784, | |
| "mean_token_accuracy": 0.7486630484461785, | |
| "num_tokens": 288157812.0, | |
| "step": 2720 | |
| }, | |
| { | |
| "entropy": 1.4749191030859947, | |
| "epoch": 0.5062704281508612, | |
| "grad_norm": 0.5703125, | |
| "learning_rate": 2.469868347858335e-05, | |
| "loss": 0.9481, | |
| "mean_token_accuracy": 0.7532638981938362, | |
| "num_tokens": 289240520.0, | |
| "step": 2730 | |
| }, | |
| { | |
| "entropy": 1.5210166841745376, | |
| "epoch": 0.5081248985836482, | |
| "grad_norm": 0.5703125, | |
| "learning_rate": 2.4605970702762842e-05, | |
| "loss": 0.9694, | |
| "mean_token_accuracy": 0.7475578837096691, | |
| "num_tokens": 290299565.0, | |
| "step": 2740 | |
| }, | |
| { | |
| "entropy": 1.4943155393004417, | |
| "epoch": 0.5099793690164353, | |
| "grad_norm": 0.56640625, | |
| "learning_rate": 2.4513257926942334e-05, | |
| "loss": 0.9542, | |
| "mean_token_accuracy": 0.7541745394468308, | |
| "num_tokens": 291402079.0, | |
| "step": 2750 | |
| }, | |
| { | |
| "entropy": 1.4766088038682939, | |
| "epoch": 0.5118338394492222, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 2.4420545151121826e-05, | |
| "loss": 0.9803, | |
| "mean_token_accuracy": 0.7470546357333661, | |
| "num_tokens": 292453179.0, | |
| "step": 2760 | |
| }, | |
| { | |
| "entropy": 1.4746219411492347, | |
| "epoch": 0.5136883098820093, | |
| "grad_norm": 0.625, | |
| "learning_rate": 2.4327832375301317e-05, | |
| "loss": 0.9636, | |
| "mean_token_accuracy": 0.7486713968217373, | |
| "num_tokens": 293513612.0, | |
| "step": 2770 | |
| }, | |
| { | |
| "entropy": 1.4913909077644347, | |
| "epoch": 0.5155427803147964, | |
| "grad_norm": 0.625, | |
| "learning_rate": 2.423511959948081e-05, | |
| "loss": 0.9805, | |
| "mean_token_accuracy": 0.7459882214665413, | |
| "num_tokens": 294583601.0, | |
| "step": 2780 | |
| }, | |
| { | |
| "entropy": 1.4834832921624184, | |
| "epoch": 0.5173972507475834, | |
| "grad_norm": 0.578125, | |
| "learning_rate": 2.41424068236603e-05, | |
| "loss": 0.965, | |
| "mean_token_accuracy": 0.7506721802055836, | |
| "num_tokens": 295635275.0, | |
| "step": 2790 | |
| }, | |
| { | |
| "entropy": 1.5102943912148477, | |
| "epoch": 0.5192517211803704, | |
| "grad_norm": 0.5859375, | |
| "learning_rate": 2.4049694047839792e-05, | |
| "loss": 0.9881, | |
| "mean_token_accuracy": 0.7445619903504849, | |
| "num_tokens": 296696345.0, | |
| "step": 2800 | |
| }, | |
| { | |
| "entropy": 1.4816748037934304, | |
| "epoch": 0.5211061916131575, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 2.3956981272019287e-05, | |
| "loss": 0.9413, | |
| "mean_token_accuracy": 0.7549451880156994, | |
| "num_tokens": 297730143.0, | |
| "step": 2810 | |
| }, | |
| { | |
| "entropy": 1.4684258542954922, | |
| "epoch": 0.5229606620459445, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 2.386426849619878e-05, | |
| "loss": 0.9508, | |
| "mean_token_accuracy": 0.7554137200117111, | |
| "num_tokens": 298823200.0, | |
| "step": 2820 | |
| }, | |
| { | |
| "entropy": 1.4529480740427971, | |
| "epoch": 0.5248151324787316, | |
| "grad_norm": 0.57421875, | |
| "learning_rate": 2.377155572037827e-05, | |
| "loss": 0.9117, | |
| "mean_token_accuracy": 0.7614098109304905, | |
| "num_tokens": 299909022.0, | |
| "step": 2830 | |
| }, | |
| { | |
| "entropy": 1.5192692294716834, | |
| "epoch": 0.5266696029115185, | |
| "grad_norm": 0.57421875, | |
| "learning_rate": 2.3678842944557762e-05, | |
| "loss": 1.0104, | |
| "mean_token_accuracy": 0.7408441670238972, | |
| "num_tokens": 300965402.0, | |
| "step": 2840 | |
| }, | |
| { | |
| "entropy": 1.5039107143878936, | |
| "epoch": 0.5285240733443056, | |
| "grad_norm": 0.65625, | |
| "learning_rate": 2.3586130168737254e-05, | |
| "loss": 0.9589, | |
| "mean_token_accuracy": 0.7500914677977562, | |
| "num_tokens": 302032279.0, | |
| "step": 2850 | |
| }, | |
| { | |
| "entropy": 1.4741551741957664, | |
| "epoch": 0.5303785437770927, | |
| "grad_norm": 0.80859375, | |
| "learning_rate": 2.3493417392916745e-05, | |
| "loss": 0.9946, | |
| "mean_token_accuracy": 0.7439890936017036, | |
| "num_tokens": 303087076.0, | |
| "step": 2860 | |
| }, | |
| { | |
| "entropy": 1.4996902346611023, | |
| "epoch": 0.5322330142098797, | |
| "grad_norm": 0.66796875, | |
| "learning_rate": 2.3400704617096237e-05, | |
| "loss": 0.9642, | |
| "mean_token_accuracy": 0.7501103632152081, | |
| "num_tokens": 304135591.0, | |
| "step": 2870 | |
| }, | |
| { | |
| "entropy": 1.4848915442824364, | |
| "epoch": 0.5340874846426668, | |
| "grad_norm": 0.609375, | |
| "learning_rate": 2.330799184127573e-05, | |
| "loss": 0.9851, | |
| "mean_token_accuracy": 0.7483362331986427, | |
| "num_tokens": 305198236.0, | |
| "step": 2880 | |
| }, | |
| { | |
| "entropy": 1.5109708786010743, | |
| "epoch": 0.5359419550754537, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 2.321527906545522e-05, | |
| "loss": 0.979, | |
| "mean_token_accuracy": 0.7481191471219063, | |
| "num_tokens": 306264580.0, | |
| "step": 2890 | |
| }, | |
| { | |
| "entropy": 1.4944818690419197, | |
| "epoch": 0.5377964255082408, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 2.3122566289634712e-05, | |
| "loss": 0.9697, | |
| "mean_token_accuracy": 0.7479268446564674, | |
| "num_tokens": 307330523.0, | |
| "step": 2900 | |
| }, | |
| { | |
| "entropy": 1.463561537861824, | |
| "epoch": 0.5396508959410279, | |
| "grad_norm": 0.6015625, | |
| "learning_rate": 2.3029853513814203e-05, | |
| "loss": 0.9272, | |
| "mean_token_accuracy": 0.757349643856287, | |
| "num_tokens": 308404021.0, | |
| "step": 2910 | |
| }, | |
| { | |
| "entropy": 1.5021937012672424, | |
| "epoch": 0.5415053663738149, | |
| "grad_norm": 0.61328125, | |
| "learning_rate": 2.2937140737993695e-05, | |
| "loss": 0.9997, | |
| "mean_token_accuracy": 0.7431203842163085, | |
| "num_tokens": 309470134.0, | |
| "step": 2920 | |
| }, | |
| { | |
| "entropy": 1.4963906899094581, | |
| "epoch": 0.5433598368066019, | |
| "grad_norm": 0.58984375, | |
| "learning_rate": 2.284442796217319e-05, | |
| "loss": 1.001, | |
| "mean_token_accuracy": 0.7428174577653408, | |
| "num_tokens": 310528655.0, | |
| "step": 2930 | |
| }, | |
| { | |
| "entropy": 1.5017639353871346, | |
| "epoch": 0.5452143072393889, | |
| "grad_norm": 0.6015625, | |
| "learning_rate": 2.275171518635268e-05, | |
| "loss": 0.9866, | |
| "mean_token_accuracy": 0.745575213432312, | |
| "num_tokens": 311626877.0, | |
| "step": 2940 | |
| }, | |
| { | |
| "entropy": 1.5059634268283844, | |
| "epoch": 0.547068777672176, | |
| "grad_norm": 0.54296875, | |
| "learning_rate": 2.2659002410532173e-05, | |
| "loss": 0.9784, | |
| "mean_token_accuracy": 0.7477149114012718, | |
| "num_tokens": 312694756.0, | |
| "step": 2950 | |
| }, | |
| { | |
| "entropy": 1.4771594122052192, | |
| "epoch": 0.5489232481049631, | |
| "grad_norm": 0.5546875, | |
| "learning_rate": 2.2566289634711665e-05, | |
| "loss": 0.9768, | |
| "mean_token_accuracy": 0.7453847780823708, | |
| "num_tokens": 313732408.0, | |
| "step": 2960 | |
| }, | |
| { | |
| "entropy": 1.5033201217651366, | |
| "epoch": 0.55077771853775, | |
| "grad_norm": 0.58984375, | |
| "learning_rate": 2.2473576858891157e-05, | |
| "loss": 0.9826, | |
| "mean_token_accuracy": 0.7462266206741333, | |
| "num_tokens": 314792581.0, | |
| "step": 2970 | |
| }, | |
| { | |
| "entropy": 1.500457538664341, | |
| "epoch": 0.5526321889705371, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 2.2380864083070648e-05, | |
| "loss": 1.0032, | |
| "mean_token_accuracy": 0.7431966498494148, | |
| "num_tokens": 315840781.0, | |
| "step": 2980 | |
| }, | |
| { | |
| "entropy": 1.492275284230709, | |
| "epoch": 0.5544866594033241, | |
| "grad_norm": 0.546875, | |
| "learning_rate": 2.228815130725014e-05, | |
| "loss": 0.9862, | |
| "mean_token_accuracy": 0.7439711920917034, | |
| "num_tokens": 316882397.0, | |
| "step": 2990 | |
| }, | |
| { | |
| "entropy": 1.5086371764540671, | |
| "epoch": 0.5563411298361112, | |
| "grad_norm": 0.60546875, | |
| "learning_rate": 2.219543853142963e-05, | |
| "loss": 0.97, | |
| "mean_token_accuracy": 0.7470115289092064, | |
| "num_tokens": 317929537.0, | |
| "step": 3000 | |
| }, | |
| { | |
| "entropy": 1.5073087349534036, | |
| "epoch": 0.5581956002688983, | |
| "grad_norm": 0.59765625, | |
| "learning_rate": 2.2102725755609123e-05, | |
| "loss": 0.9942, | |
| "mean_token_accuracy": 0.7448109589517117, | |
| "num_tokens": 319000305.0, | |
| "step": 3010 | |
| }, | |
| { | |
| "entropy": 1.5109971106052398, | |
| "epoch": 0.5600500707016852, | |
| "grad_norm": 0.5703125, | |
| "learning_rate": 2.2010012979788615e-05, | |
| "loss": 0.9914, | |
| "mean_token_accuracy": 0.7461639195680618, | |
| "num_tokens": 320064134.0, | |
| "step": 3020 | |
| }, | |
| { | |
| "entropy": 1.4933106511831284, | |
| "epoch": 0.5619045411344723, | |
| "grad_norm": 0.59375, | |
| "learning_rate": 2.1917300203968106e-05, | |
| "loss": 0.9611, | |
| "mean_token_accuracy": 0.7532710127532483, | |
| "num_tokens": 321124200.0, | |
| "step": 3030 | |
| }, | |
| { | |
| "entropy": 1.5030161023139954, | |
| "epoch": 0.5637590115672593, | |
| "grad_norm": 0.61328125, | |
| "learning_rate": 2.18245874281476e-05, | |
| "loss": 1.0027, | |
| "mean_token_accuracy": 0.742536649107933, | |
| "num_tokens": 322198783.0, | |
| "step": 3040 | |
| }, | |
| { | |
| "entropy": 1.4719531655311584, | |
| "epoch": 0.5656134820000464, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 2.1731874652327093e-05, | |
| "loss": 0.9503, | |
| "mean_token_accuracy": 0.7514505423605442, | |
| "num_tokens": 323300829.0, | |
| "step": 3050 | |
| }, | |
| { | |
| "entropy": 1.492191916704178, | |
| "epoch": 0.5674679524328334, | |
| "grad_norm": 0.5546875, | |
| "learning_rate": 2.1639161876506585e-05, | |
| "loss": 0.9477, | |
| "mean_token_accuracy": 0.753364497423172, | |
| "num_tokens": 324377836.0, | |
| "step": 3060 | |
| }, | |
| { | |
| "entropy": 1.4711406081914902, | |
| "epoch": 0.5693224228656204, | |
| "grad_norm": 0.6328125, | |
| "learning_rate": 2.1546449100686076e-05, | |
| "loss": 0.9752, | |
| "mean_token_accuracy": 0.7476269051432609, | |
| "num_tokens": 325446116.0, | |
| "step": 3070 | |
| }, | |
| { | |
| "entropy": 1.4610206931829453, | |
| "epoch": 0.5711768932984075, | |
| "grad_norm": 0.5859375, | |
| "learning_rate": 2.1453736324865568e-05, | |
| "loss": 0.9439, | |
| "mean_token_accuracy": 0.755161764472723, | |
| "num_tokens": 326486752.0, | |
| "step": 3080 | |
| }, | |
| { | |
| "entropy": 1.4874543443322181, | |
| "epoch": 0.5730313637311946, | |
| "grad_norm": 0.58984375, | |
| "learning_rate": 2.136102354904506e-05, | |
| "loss": 0.9497, | |
| "mean_token_accuracy": 0.7532952725887299, | |
| "num_tokens": 327553081.0, | |
| "step": 3090 | |
| }, | |
| { | |
| "entropy": 1.5083105087280273, | |
| "epoch": 0.5748858341639815, | |
| "grad_norm": 0.640625, | |
| "learning_rate": 2.126831077322455e-05, | |
| "loss": 1.0124, | |
| "mean_token_accuracy": 0.7425495445728302, | |
| "num_tokens": 328593686.0, | |
| "step": 3100 | |
| }, | |
| { | |
| "entropy": 1.4853515386581422, | |
| "epoch": 0.5767403045967686, | |
| "grad_norm": 0.62890625, | |
| "learning_rate": 2.1175597997404043e-05, | |
| "loss": 0.927, | |
| "mean_token_accuracy": 0.7558034144341945, | |
| "num_tokens": 329656622.0, | |
| "step": 3110 | |
| }, | |
| { | |
| "entropy": 1.4809951201081275, | |
| "epoch": 0.5785947750295556, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 2.1082885221583534e-05, | |
| "loss": 0.9728, | |
| "mean_token_accuracy": 0.7493963360786438, | |
| "num_tokens": 330707482.0, | |
| "step": 3120 | |
| }, | |
| { | |
| "entropy": 1.4996532037854196, | |
| "epoch": 0.5804492454623427, | |
| "grad_norm": 0.62109375, | |
| "learning_rate": 2.0990172445763026e-05, | |
| "loss": 0.9428, | |
| "mean_token_accuracy": 0.7534454755485058, | |
| "num_tokens": 331768437.0, | |
| "step": 3130 | |
| }, | |
| { | |
| "entropy": 1.5152997568249702, | |
| "epoch": 0.5823037158951297, | |
| "grad_norm": 0.59765625, | |
| "learning_rate": 2.0897459669942518e-05, | |
| "loss": 0.9728, | |
| "mean_token_accuracy": 0.7480122342705726, | |
| "num_tokens": 332802757.0, | |
| "step": 3140 | |
| }, | |
| { | |
| "entropy": 1.4685736253857613, | |
| "epoch": 0.5841581863279167, | |
| "grad_norm": 0.5859375, | |
| "learning_rate": 2.080474689412201e-05, | |
| "loss": 0.935, | |
| "mean_token_accuracy": 0.7560048542916775, | |
| "num_tokens": 333888594.0, | |
| "step": 3150 | |
| }, | |
| { | |
| "entropy": 1.4684315398335457, | |
| "epoch": 0.5860126567607038, | |
| "grad_norm": 0.546875, | |
| "learning_rate": 2.0712034118301504e-05, | |
| "loss": 0.9575, | |
| "mean_token_accuracy": 0.751179576665163, | |
| "num_tokens": 334927100.0, | |
| "step": 3160 | |
| }, | |
| { | |
| "entropy": 1.4995916575193404, | |
| "epoch": 0.5878671271934908, | |
| "grad_norm": 0.58984375, | |
| "learning_rate": 2.0619321342480996e-05, | |
| "loss": 0.9474, | |
| "mean_token_accuracy": 0.7539438650012016, | |
| "num_tokens": 335989790.0, | |
| "step": 3170 | |
| }, | |
| { | |
| "entropy": 1.4737093344330787, | |
| "epoch": 0.5897215976262778, | |
| "grad_norm": 0.6171875, | |
| "learning_rate": 2.0526608566660488e-05, | |
| "loss": 0.9411, | |
| "mean_token_accuracy": 0.7552854031324386, | |
| "num_tokens": 337047619.0, | |
| "step": 3180 | |
| }, | |
| { | |
| "entropy": 1.5014163866639136, | |
| "epoch": 0.5915760680590649, | |
| "grad_norm": 0.6484375, | |
| "learning_rate": 2.043389579083998e-05, | |
| "loss": 0.9919, | |
| "mean_token_accuracy": 0.7403368279337883, | |
| "num_tokens": 338076390.0, | |
| "step": 3190 | |
| }, | |
| { | |
| "entropy": 1.509169700741768, | |
| "epoch": 0.5934305384918519, | |
| "grad_norm": 0.6171875, | |
| "learning_rate": 2.034118301501947e-05, | |
| "loss": 0.9648, | |
| "mean_token_accuracy": 0.7509156338870525, | |
| "num_tokens": 339135038.0, | |
| "step": 3200 | |
| }, | |
| { | |
| "entropy": 1.4819589957594872, | |
| "epoch": 0.595285008924639, | |
| "grad_norm": 0.625, | |
| "learning_rate": 2.0248470239198962e-05, | |
| "loss": 0.9461, | |
| "mean_token_accuracy": 0.7544753670692443, | |
| "num_tokens": 340206702.0, | |
| "step": 3210 | |
| }, | |
| { | |
| "entropy": 1.5020015835762024, | |
| "epoch": 0.597139479357426, | |
| "grad_norm": 0.65234375, | |
| "learning_rate": 2.0155757463378454e-05, | |
| "loss": 0.9664, | |
| "mean_token_accuracy": 0.750271425396204, | |
| "num_tokens": 341278133.0, | |
| "step": 3220 | |
| }, | |
| { | |
| "entropy": 1.496476447582245, | |
| "epoch": 0.598993949790213, | |
| "grad_norm": 0.578125, | |
| "learning_rate": 2.0063044687557946e-05, | |
| "loss": 0.9793, | |
| "mean_token_accuracy": 0.7465926133096218, | |
| "num_tokens": 342346590.0, | |
| "step": 3230 | |
| }, | |
| { | |
| "entropy": 1.4929743528366088, | |
| "epoch": 0.6008484202230001, | |
| "grad_norm": 0.62109375, | |
| "learning_rate": 1.9970331911737437e-05, | |
| "loss": 0.9786, | |
| "mean_token_accuracy": 0.7477845475077629, | |
| "num_tokens": 343409959.0, | |
| "step": 3240 | |
| }, | |
| { | |
| "entropy": 1.489282250404358, | |
| "epoch": 0.6027028906557871, | |
| "grad_norm": 0.59765625, | |
| "learning_rate": 1.987761913591693e-05, | |
| "loss": 0.9839, | |
| "mean_token_accuracy": 0.746171435713768, | |
| "num_tokens": 344472251.0, | |
| "step": 3250 | |
| }, | |
| { | |
| "entropy": 1.46794663220644, | |
| "epoch": 0.6045573610885742, | |
| "grad_norm": 0.5546875, | |
| "learning_rate": 1.9784906360096424e-05, | |
| "loss": 0.9864, | |
| "mean_token_accuracy": 0.7453903436660767, | |
| "num_tokens": 345523269.0, | |
| "step": 3260 | |
| }, | |
| { | |
| "entropy": 1.451069823652506, | |
| "epoch": 0.6064118315213611, | |
| "grad_norm": 0.578125, | |
| "learning_rate": 1.9692193584275916e-05, | |
| "loss": 0.9068, | |
| "mean_token_accuracy": 0.763156121224165, | |
| "num_tokens": 346593676.0, | |
| "step": 3270 | |
| }, | |
| { | |
| "entropy": 1.5116953164339066, | |
| "epoch": 0.6082663019541482, | |
| "grad_norm": 0.66015625, | |
| "learning_rate": 1.9599480808455407e-05, | |
| "loss": 0.9689, | |
| "mean_token_accuracy": 0.7481848277151585, | |
| "num_tokens": 347631200.0, | |
| "step": 3280 | |
| }, | |
| { | |
| "entropy": 1.492818634212017, | |
| "epoch": 0.6101207723869353, | |
| "grad_norm": 0.59375, | |
| "learning_rate": 1.95067680326349e-05, | |
| "loss": 0.98, | |
| "mean_token_accuracy": 0.7490465089678764, | |
| "num_tokens": 348691705.0, | |
| "step": 3290 | |
| }, | |
| { | |
| "entropy": 1.452520215511322, | |
| "epoch": 0.6119752428197223, | |
| "grad_norm": 0.5703125, | |
| "learning_rate": 1.941405525681439e-05, | |
| "loss": 0.9176, | |
| "mean_token_accuracy": 0.7612765401601791, | |
| "num_tokens": 349759178.0, | |
| "step": 3300 | |
| }, | |
| { | |
| "entropy": 1.498873221874237, | |
| "epoch": 0.6138297132525093, | |
| "grad_norm": 0.6171875, | |
| "learning_rate": 1.9321342480993882e-05, | |
| "loss": 0.9976, | |
| "mean_token_accuracy": 0.7423401869833469, | |
| "num_tokens": 350806373.0, | |
| "step": 3310 | |
| }, | |
| { | |
| "entropy": 1.470912192761898, | |
| "epoch": 0.6156841836852963, | |
| "grad_norm": 0.53515625, | |
| "learning_rate": 1.9228629705173374e-05, | |
| "loss": 0.9561, | |
| "mean_token_accuracy": 0.7500808753073216, | |
| "num_tokens": 351865350.0, | |
| "step": 3320 | |
| }, | |
| { | |
| "entropy": 1.4916383937001227, | |
| "epoch": 0.6175386541180834, | |
| "grad_norm": 0.546875, | |
| "learning_rate": 1.9135916929352865e-05, | |
| "loss": 0.955, | |
| "mean_token_accuracy": 0.7531688518822193, | |
| "num_tokens": 352950751.0, | |
| "step": 3330 | |
| }, | |
| { | |
| "entropy": 1.4598018139600755, | |
| "epoch": 0.6193931245508705, | |
| "grad_norm": 0.55859375, | |
| "learning_rate": 1.9043204153532357e-05, | |
| "loss": 0.9551, | |
| "mean_token_accuracy": 0.7517839275300503, | |
| "num_tokens": 353996465.0, | |
| "step": 3340 | |
| }, | |
| { | |
| "entropy": 1.4958252064883708, | |
| "epoch": 0.6212475949836574, | |
| "grad_norm": 0.5546875, | |
| "learning_rate": 1.895049137771185e-05, | |
| "loss": 0.9408, | |
| "mean_token_accuracy": 0.7561353132128715, | |
| "num_tokens": 355078338.0, | |
| "step": 3350 | |
| }, | |
| { | |
| "entropy": 1.5032134041190148, | |
| "epoch": 0.6231020654164445, | |
| "grad_norm": 0.59375, | |
| "learning_rate": 1.885777860189134e-05, | |
| "loss": 0.933, | |
| "mean_token_accuracy": 0.7582812868058681, | |
| "num_tokens": 356131684.0, | |
| "step": 3360 | |
| }, | |
| { | |
| "entropy": 1.5012152999639512, | |
| "epoch": 0.6249565358492316, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 1.8765065826070832e-05, | |
| "loss": 0.9295, | |
| "mean_token_accuracy": 0.7572812087833881, | |
| "num_tokens": 357226073.0, | |
| "step": 3370 | |
| }, | |
| { | |
| "entropy": 1.4986595943570138, | |
| "epoch": 0.6268110062820186, | |
| "grad_norm": 0.5859375, | |
| "learning_rate": 1.8672353050250324e-05, | |
| "loss": 0.9656, | |
| "mean_token_accuracy": 0.7492790259420872, | |
| "num_tokens": 358286834.0, | |
| "step": 3380 | |
| }, | |
| { | |
| "entropy": 1.4919024623930455, | |
| "epoch": 0.6286654767148057, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 1.857964027442982e-05, | |
| "loss": 0.9879, | |
| "mean_token_accuracy": 0.7474049381911755, | |
| "num_tokens": 359369248.0, | |
| "step": 3390 | |
| }, | |
| { | |
| "entropy": 1.4804231166839599, | |
| "epoch": 0.6305199471475926, | |
| "grad_norm": 0.609375, | |
| "learning_rate": 1.848692749860931e-05, | |
| "loss": 0.9343, | |
| "mean_token_accuracy": 0.7546016350388527, | |
| "num_tokens": 360440098.0, | |
| "step": 3400 | |
| }, | |
| { | |
| "entropy": 1.4833512425422668, | |
| "epoch": 0.6323744175803797, | |
| "grad_norm": 0.59375, | |
| "learning_rate": 1.8394214722788802e-05, | |
| "loss": 0.9529, | |
| "mean_token_accuracy": 0.7520769476890564, | |
| "num_tokens": 361521377.0, | |
| "step": 3410 | |
| }, | |
| { | |
| "entropy": 1.5190963119268417, | |
| "epoch": 0.6342288880131668, | |
| "grad_norm": 0.55078125, | |
| "learning_rate": 1.8301501946968293e-05, | |
| "loss": 1.0118, | |
| "mean_token_accuracy": 0.7398212127387523, | |
| "num_tokens": 362586843.0, | |
| "step": 3420 | |
| }, | |
| { | |
| "entropy": 1.4864340499043465, | |
| "epoch": 0.6360833584459538, | |
| "grad_norm": 0.56640625, | |
| "learning_rate": 1.8208789171147785e-05, | |
| "loss": 0.9496, | |
| "mean_token_accuracy": 0.7533850878477096, | |
| "num_tokens": 363665374.0, | |
| "step": 3430 | |
| }, | |
| { | |
| "entropy": 1.502338708937168, | |
| "epoch": 0.6379378288787408, | |
| "grad_norm": 0.57421875, | |
| "learning_rate": 1.8116076395327277e-05, | |
| "loss": 0.9712, | |
| "mean_token_accuracy": 0.7485703691840172, | |
| "num_tokens": 364737586.0, | |
| "step": 3440 | |
| }, | |
| { | |
| "entropy": 1.4999055236577987, | |
| "epoch": 0.6397922993115278, | |
| "grad_norm": 0.640625, | |
| "learning_rate": 1.802336361950677e-05, | |
| "loss": 0.9707, | |
| "mean_token_accuracy": 0.7463179692626, | |
| "num_tokens": 365800440.0, | |
| "step": 3450 | |
| }, | |
| { | |
| "entropy": 1.4834224879741669, | |
| "epoch": 0.6416467697443149, | |
| "grad_norm": 0.6015625, | |
| "learning_rate": 1.793065084368626e-05, | |
| "loss": 0.9637, | |
| "mean_token_accuracy": 0.7493973925709725, | |
| "num_tokens": 366877573.0, | |
| "step": 3460 | |
| }, | |
| { | |
| "entropy": 1.4893981993198395, | |
| "epoch": 0.643501240177102, | |
| "grad_norm": 0.578125, | |
| "learning_rate": 1.783793806786575e-05, | |
| "loss": 0.9818, | |
| "mean_token_accuracy": 0.7473383404314518, | |
| "num_tokens": 367928733.0, | |
| "step": 3470 | |
| }, | |
| { | |
| "entropy": 1.4837512448430061, | |
| "epoch": 0.6453557106098889, | |
| "grad_norm": 0.56640625, | |
| "learning_rate": 1.7745225292045247e-05, | |
| "loss": 0.9522, | |
| "mean_token_accuracy": 0.7507466569542884, | |
| "num_tokens": 368987327.0, | |
| "step": 3480 | |
| }, | |
| { | |
| "entropy": 1.4967763364315032, | |
| "epoch": 0.647210181042676, | |
| "grad_norm": 0.5859375, | |
| "learning_rate": 1.7652512516224738e-05, | |
| "loss": 0.9989, | |
| "mean_token_accuracy": 0.7445353485643864, | |
| "num_tokens": 370043783.0, | |
| "step": 3490 | |
| }, | |
| { | |
| "entropy": 1.4885783538222312, | |
| "epoch": 0.649064651475463, | |
| "grad_norm": 0.640625, | |
| "learning_rate": 1.755979974040423e-05, | |
| "loss": 0.9638, | |
| "mean_token_accuracy": 0.7516114957630634, | |
| "num_tokens": 371090558.0, | |
| "step": 3500 | |
| }, | |
| { | |
| "entropy": 1.4688554242253304, | |
| "epoch": 0.6509191219082501, | |
| "grad_norm": 0.640625, | |
| "learning_rate": 1.746708696458372e-05, | |
| "loss": 0.9643, | |
| "mean_token_accuracy": 0.7488493897020817, | |
| "num_tokens": 372160561.0, | |
| "step": 3510 | |
| }, | |
| { | |
| "entropy": 1.4996945574879645, | |
| "epoch": 0.6527735923410372, | |
| "grad_norm": 0.546875, | |
| "learning_rate": 1.7374374188763213e-05, | |
| "loss": 0.9925, | |
| "mean_token_accuracy": 0.7454711809754372, | |
| "num_tokens": 373225734.0, | |
| "step": 3520 | |
| }, | |
| { | |
| "entropy": 1.5062290534377099, | |
| "epoch": 0.6546280627738241, | |
| "grad_norm": 0.55078125, | |
| "learning_rate": 1.7281661412942705e-05, | |
| "loss": 0.9363, | |
| "mean_token_accuracy": 0.7560000211000443, | |
| "num_tokens": 374318316.0, | |
| "step": 3530 | |
| }, | |
| { | |
| "entropy": 1.4643490687012672, | |
| "epoch": 0.6564825332066112, | |
| "grad_norm": 0.609375, | |
| "learning_rate": 1.7188948637122196e-05, | |
| "loss": 0.9649, | |
| "mean_token_accuracy": 0.7522344447672367, | |
| "num_tokens": 375370562.0, | |
| "step": 3540 | |
| }, | |
| { | |
| "entropy": 1.4726564817130565, | |
| "epoch": 0.6583370036393982, | |
| "grad_norm": 0.59375, | |
| "learning_rate": 1.7096235861301688e-05, | |
| "loss": 0.9354, | |
| "mean_token_accuracy": 0.7573029786348343, | |
| "num_tokens": 376426378.0, | |
| "step": 3550 | |
| }, | |
| { | |
| "entropy": 1.4889881744980813, | |
| "epoch": 0.6601914740721853, | |
| "grad_norm": 0.57421875, | |
| "learning_rate": 1.700352308548118e-05, | |
| "loss": 0.9629, | |
| "mean_token_accuracy": 0.7513776242733001, | |
| "num_tokens": 377531558.0, | |
| "step": 3560 | |
| }, | |
| { | |
| "entropy": 1.4645411357283593, | |
| "epoch": 0.6620459445049723, | |
| "grad_norm": 0.625, | |
| "learning_rate": 1.691081030966067e-05, | |
| "loss": 0.914, | |
| "mean_token_accuracy": 0.7601237520575523, | |
| "num_tokens": 378607556.0, | |
| "step": 3570 | |
| }, | |
| { | |
| "entropy": 1.4682490974664688, | |
| "epoch": 0.6639004149377593, | |
| "grad_norm": 0.578125, | |
| "learning_rate": 1.6818097533840163e-05, | |
| "loss": 0.9821, | |
| "mean_token_accuracy": 0.7478334411978722, | |
| "num_tokens": 379667530.0, | |
| "step": 3580 | |
| }, | |
| { | |
| "entropy": 1.4883572086691856, | |
| "epoch": 0.6657548853705464, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 1.6725384758019655e-05, | |
| "loss": 0.9792, | |
| "mean_token_accuracy": 0.7484480515122414, | |
| "num_tokens": 380719586.0, | |
| "step": 3590 | |
| }, | |
| { | |
| "entropy": 1.4914529219269752, | |
| "epoch": 0.6676093558033334, | |
| "grad_norm": 0.54296875, | |
| "learning_rate": 1.6632671982199146e-05, | |
| "loss": 0.9662, | |
| "mean_token_accuracy": 0.7495297357439995, | |
| "num_tokens": 381790507.0, | |
| "step": 3600 | |
| }, | |
| { | |
| "entropy": 1.4941829800605775, | |
| "epoch": 0.6694638262361204, | |
| "grad_norm": 0.6015625, | |
| "learning_rate": 1.6539959206378638e-05, | |
| "loss": 0.9627, | |
| "mean_token_accuracy": 0.752415356785059, | |
| "num_tokens": 382833962.0, | |
| "step": 3610 | |
| }, | |
| { | |
| "entropy": 1.486792764812708, | |
| "epoch": 0.6713182966689075, | |
| "grad_norm": 0.53125, | |
| "learning_rate": 1.6447246430558133e-05, | |
| "loss": 0.976, | |
| "mean_token_accuracy": 0.7484071709215641, | |
| "num_tokens": 383896821.0, | |
| "step": 3620 | |
| }, | |
| { | |
| "entropy": 1.4851623505353928, | |
| "epoch": 0.6731727671016945, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 1.6354533654737624e-05, | |
| "loss": 0.9495, | |
| "mean_token_accuracy": 0.7565177015960216, | |
| "num_tokens": 384976424.0, | |
| "step": 3630 | |
| }, | |
| { | |
| "entropy": 1.4791170462965966, | |
| "epoch": 0.6750272375344816, | |
| "grad_norm": 0.6171875, | |
| "learning_rate": 1.6261820878917116e-05, | |
| "loss": 0.9566, | |
| "mean_token_accuracy": 0.7526618793606759, | |
| "num_tokens": 386040603.0, | |
| "step": 3640 | |
| }, | |
| { | |
| "entropy": 1.4501399517059326, | |
| "epoch": 0.6768817079672685, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 1.6169108103096608e-05, | |
| "loss": 0.9108, | |
| "mean_token_accuracy": 0.7589847564697265, | |
| "num_tokens": 387108397.0, | |
| "step": 3650 | |
| }, | |
| { | |
| "entropy": 1.464148323237896, | |
| "epoch": 0.6787361784000556, | |
| "grad_norm": 0.58984375, | |
| "learning_rate": 1.60763953272761e-05, | |
| "loss": 0.9485, | |
| "mean_token_accuracy": 0.7533691465854645, | |
| "num_tokens": 388181249.0, | |
| "step": 3660 | |
| }, | |
| { | |
| "entropy": 1.4938798516988754, | |
| "epoch": 0.6805906488328427, | |
| "grad_norm": 0.546875, | |
| "learning_rate": 1.598368255145559e-05, | |
| "loss": 0.9702, | |
| "mean_token_accuracy": 0.7443335071206093, | |
| "num_tokens": 389241327.0, | |
| "step": 3670 | |
| }, | |
| { | |
| "entropy": 1.4977298349142074, | |
| "epoch": 0.6824451192656297, | |
| "grad_norm": 0.56640625, | |
| "learning_rate": 1.5890969775635083e-05, | |
| "loss": 0.9782, | |
| "mean_token_accuracy": 0.7493897087872028, | |
| "num_tokens": 390333051.0, | |
| "step": 3680 | |
| }, | |
| { | |
| "entropy": 1.4779933258891105, | |
| "epoch": 0.6842995896984168, | |
| "grad_norm": 0.58984375, | |
| "learning_rate": 1.5798256999814574e-05, | |
| "loss": 0.9387, | |
| "mean_token_accuracy": 0.756196653097868, | |
| "num_tokens": 391409337.0, | |
| "step": 3690 | |
| }, | |
| { | |
| "entropy": 1.4538267716765403, | |
| "epoch": 0.6861540601312038, | |
| "grad_norm": 0.57421875, | |
| "learning_rate": 1.570554422399407e-05, | |
| "loss": 0.938, | |
| "mean_token_accuracy": 0.7556686386466026, | |
| "num_tokens": 392463946.0, | |
| "step": 3700 | |
| }, | |
| { | |
| "entropy": 1.4732056334614754, | |
| "epoch": 0.6880085305639908, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 1.561283144817356e-05, | |
| "loss": 0.9408, | |
| "mean_token_accuracy": 0.7548085488379002, | |
| "num_tokens": 393523175.0, | |
| "step": 3710 | |
| }, | |
| { | |
| "entropy": 1.4476398959755898, | |
| "epoch": 0.6898630009967779, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 1.5520118672353052e-05, | |
| "loss": 0.9329, | |
| "mean_token_accuracy": 0.7587984256446362, | |
| "num_tokens": 394585544.0, | |
| "step": 3720 | |
| }, | |
| { | |
| "entropy": 1.4833741009235382, | |
| "epoch": 0.6917174714295649, | |
| "grad_norm": 0.6015625, | |
| "learning_rate": 1.5427405896532544e-05, | |
| "loss": 0.9644, | |
| "mean_token_accuracy": 0.750736665725708, | |
| "num_tokens": 395664360.0, | |
| "step": 3730 | |
| }, | |
| { | |
| "entropy": 1.4941645920276643, | |
| "epoch": 0.6935719418623519, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 1.5334693120712036e-05, | |
| "loss": 0.9641, | |
| "mean_token_accuracy": 0.7502763777971267, | |
| "num_tokens": 396731223.0, | |
| "step": 3740 | |
| }, | |
| { | |
| "entropy": 1.4608838826417923, | |
| "epoch": 0.695426412295139, | |
| "grad_norm": 0.55859375, | |
| "learning_rate": 1.5241980344891527e-05, | |
| "loss": 0.933, | |
| "mean_token_accuracy": 0.7558045491576195, | |
| "num_tokens": 397809040.0, | |
| "step": 3750 | |
| }, | |
| { | |
| "entropy": 1.4908844530582428, | |
| "epoch": 0.697280882727926, | |
| "grad_norm": 0.57421875, | |
| "learning_rate": 1.5149267569071019e-05, | |
| "loss": 0.9474, | |
| "mean_token_accuracy": 0.7545847944915295, | |
| "num_tokens": 398875483.0, | |
| "step": 3760 | |
| }, | |
| { | |
| "entropy": 1.4797591805458068, | |
| "epoch": 0.6991353531607131, | |
| "grad_norm": 0.60546875, | |
| "learning_rate": 1.505655479325051e-05, | |
| "loss": 0.9657, | |
| "mean_token_accuracy": 0.7486329339444637, | |
| "num_tokens": 399923099.0, | |
| "step": 3770 | |
| }, | |
| { | |
| "entropy": 1.5088838368654252, | |
| "epoch": 0.7009898235935, | |
| "grad_norm": 0.62109375, | |
| "learning_rate": 1.4963842017430002e-05, | |
| "loss": 0.9652, | |
| "mean_token_accuracy": 0.7505823604762554, | |
| "num_tokens": 400961448.0, | |
| "step": 3780 | |
| }, | |
| { | |
| "entropy": 1.4767660841345787, | |
| "epoch": 0.7028442940262871, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 1.4871129241609494e-05, | |
| "loss": 0.9297, | |
| "mean_token_accuracy": 0.7565016031265259, | |
| "num_tokens": 402006078.0, | |
| "step": 3790 | |
| }, | |
| { | |
| "entropy": 1.5171352908015252, | |
| "epoch": 0.7046987644590742, | |
| "grad_norm": 0.59765625, | |
| "learning_rate": 1.4778416465788986e-05, | |
| "loss": 0.9829, | |
| "mean_token_accuracy": 0.7484120242297649, | |
| "num_tokens": 403052668.0, | |
| "step": 3800 | |
| }, | |
| { | |
| "entropy": 1.4764634355902673, | |
| "epoch": 0.7065532348918612, | |
| "grad_norm": 0.546875, | |
| "learning_rate": 1.4685703689968477e-05, | |
| "loss": 0.9614, | |
| "mean_token_accuracy": 0.7503684118390084, | |
| "num_tokens": 404108938.0, | |
| "step": 3810 | |
| }, | |
| { | |
| "entropy": 1.4879567474126816, | |
| "epoch": 0.7084077053246483, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 1.4592990914147969e-05, | |
| "loss": 0.9605, | |
| "mean_token_accuracy": 0.7502434328198433, | |
| "num_tokens": 405156625.0, | |
| "step": 3820 | |
| }, | |
| { | |
| "entropy": 1.4794864535331727, | |
| "epoch": 0.7102621757574352, | |
| "grad_norm": 0.625, | |
| "learning_rate": 1.450027813832746e-05, | |
| "loss": 0.9461, | |
| "mean_token_accuracy": 0.7536133147776127, | |
| "num_tokens": 406205334.0, | |
| "step": 3830 | |
| }, | |
| { | |
| "entropy": 1.4959635689854622, | |
| "epoch": 0.7121166461902223, | |
| "grad_norm": 0.6171875, | |
| "learning_rate": 1.4407565362506954e-05, | |
| "loss": 0.9428, | |
| "mean_token_accuracy": 0.7550864294171333, | |
| "num_tokens": 407251695.0, | |
| "step": 3840 | |
| }, | |
| { | |
| "entropy": 1.4515886753797531, | |
| "epoch": 0.7139711166230094, | |
| "grad_norm": 0.6015625, | |
| "learning_rate": 1.4314852586686447e-05, | |
| "loss": 0.9194, | |
| "mean_token_accuracy": 0.7594753108918667, | |
| "num_tokens": 408316949.0, | |
| "step": 3850 | |
| }, | |
| { | |
| "entropy": 1.5141615658998488, | |
| "epoch": 0.7158255870557964, | |
| "grad_norm": 0.60546875, | |
| "learning_rate": 1.4222139810865939e-05, | |
| "loss": 0.9763, | |
| "mean_token_accuracy": 0.7471587590873241, | |
| "num_tokens": 409392891.0, | |
| "step": 3860 | |
| }, | |
| { | |
| "entropy": 1.4961694732308388, | |
| "epoch": 0.7176800574885834, | |
| "grad_norm": 0.59765625, | |
| "learning_rate": 1.412942703504543e-05, | |
| "loss": 0.9859, | |
| "mean_token_accuracy": 0.7446932211518288, | |
| "num_tokens": 410451258.0, | |
| "step": 3870 | |
| }, | |
| { | |
| "entropy": 1.5210398465394974, | |
| "epoch": 0.7195345279213704, | |
| "grad_norm": 0.67578125, | |
| "learning_rate": 1.4036714259224924e-05, | |
| "loss": 0.999, | |
| "mean_token_accuracy": 0.7432930871844292, | |
| "num_tokens": 411511619.0, | |
| "step": 3880 | |
| }, | |
| { | |
| "entropy": 1.484496508538723, | |
| "epoch": 0.7213889983541575, | |
| "grad_norm": 0.5703125, | |
| "learning_rate": 1.3944001483404415e-05, | |
| "loss": 0.9415, | |
| "mean_token_accuracy": 0.7557766437530518, | |
| "num_tokens": 412581185.0, | |
| "step": 3890 | |
| }, | |
| { | |
| "entropy": 1.4914864346385002, | |
| "epoch": 0.7232434687869446, | |
| "grad_norm": 0.5703125, | |
| "learning_rate": 1.3851288707583907e-05, | |
| "loss": 0.9958, | |
| "mean_token_accuracy": 0.7432549558579922, | |
| "num_tokens": 413678335.0, | |
| "step": 3900 | |
| }, | |
| { | |
| "entropy": 1.493837621808052, | |
| "epoch": 0.7250979392197315, | |
| "grad_norm": 0.68359375, | |
| "learning_rate": 1.3758575931763399e-05, | |
| "loss": 0.9727, | |
| "mean_token_accuracy": 0.7480196230113506, | |
| "num_tokens": 414716445.0, | |
| "step": 3910 | |
| }, | |
| { | |
| "entropy": 1.5062513753771782, | |
| "epoch": 0.7269524096525186, | |
| "grad_norm": 0.6171875, | |
| "learning_rate": 1.366586315594289e-05, | |
| "loss": 0.9538, | |
| "mean_token_accuracy": 0.7520207174122333, | |
| "num_tokens": 415777634.0, | |
| "step": 3920 | |
| }, | |
| { | |
| "entropy": 1.498290665447712, | |
| "epoch": 0.7288068800853056, | |
| "grad_norm": 0.5859375, | |
| "learning_rate": 1.3573150380122382e-05, | |
| "loss": 0.9621, | |
| "mean_token_accuracy": 0.7493651449680329, | |
| "num_tokens": 416832630.0, | |
| "step": 3930 | |
| }, | |
| { | |
| "entropy": 1.4917837798595428, | |
| "epoch": 0.7306613505180927, | |
| "grad_norm": 0.609375, | |
| "learning_rate": 1.3480437604301873e-05, | |
| "loss": 0.9494, | |
| "mean_token_accuracy": 0.7559087119996548, | |
| "num_tokens": 417904657.0, | |
| "step": 3940 | |
| }, | |
| { | |
| "entropy": 1.505025325715542, | |
| "epoch": 0.7325158209508797, | |
| "grad_norm": 0.5703125, | |
| "learning_rate": 1.3387724828481365e-05, | |
| "loss": 0.9641, | |
| "mean_token_accuracy": 0.7477701410651207, | |
| "num_tokens": 418970254.0, | |
| "step": 3950 | |
| }, | |
| { | |
| "entropy": 1.5279014244675637, | |
| "epoch": 0.7343702913836667, | |
| "grad_norm": 0.59375, | |
| "learning_rate": 1.3295012052660857e-05, | |
| "loss": 1.0064, | |
| "mean_token_accuracy": 0.7403814144432544, | |
| "num_tokens": 420024726.0, | |
| "step": 3960 | |
| }, | |
| { | |
| "entropy": 1.483470357954502, | |
| "epoch": 0.7362247618164538, | |
| "grad_norm": 0.5859375, | |
| "learning_rate": 1.320229927684035e-05, | |
| "loss": 0.9493, | |
| "mean_token_accuracy": 0.7545876823365688, | |
| "num_tokens": 421116511.0, | |
| "step": 3970 | |
| }, | |
| { | |
| "entropy": 1.5245489403605461, | |
| "epoch": 0.7380792322492409, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 1.3109586501019842e-05, | |
| "loss": 1.0158, | |
| "mean_token_accuracy": 0.736254769563675, | |
| "num_tokens": 422159541.0, | |
| "step": 3980 | |
| }, | |
| { | |
| "entropy": 1.5138882473111153, | |
| "epoch": 0.7399337026820278, | |
| "grad_norm": 0.625, | |
| "learning_rate": 1.3016873725199333e-05, | |
| "loss": 0.9761, | |
| "mean_token_accuracy": 0.7460170738399029, | |
| "num_tokens": 423218690.0, | |
| "step": 3990 | |
| }, | |
| { | |
| "entropy": 1.4659401252865791, | |
| "epoch": 0.7417881731148149, | |
| "grad_norm": 0.53515625, | |
| "learning_rate": 1.2924160949378825e-05, | |
| "loss": 0.9523, | |
| "mean_token_accuracy": 0.753886741399765, | |
| "num_tokens": 424291397.0, | |
| "step": 4000 | |
| }, | |
| { | |
| "entropy": 1.450306124985218, | |
| "epoch": 0.7436426435476019, | |
| "grad_norm": 0.5703125, | |
| "learning_rate": 1.2831448173558316e-05, | |
| "loss": 0.9319, | |
| "mean_token_accuracy": 0.7554542690515518, | |
| "num_tokens": 425368218.0, | |
| "step": 4010 | |
| }, | |
| { | |
| "entropy": 1.494744434952736, | |
| "epoch": 0.745497113980389, | |
| "grad_norm": 0.52734375, | |
| "learning_rate": 1.2738735397737808e-05, | |
| "loss": 0.9705, | |
| "mean_token_accuracy": 0.7492655225098133, | |
| "num_tokens": 426452863.0, | |
| "step": 4020 | |
| }, | |
| { | |
| "entropy": 1.4815644845366478, | |
| "epoch": 0.7473515844131761, | |
| "grad_norm": 0.55859375, | |
| "learning_rate": 1.26460226219173e-05, | |
| "loss": 0.9791, | |
| "mean_token_accuracy": 0.7470330819487572, | |
| "num_tokens": 427540425.0, | |
| "step": 4030 | |
| }, | |
| { | |
| "entropy": 1.4723653048276901, | |
| "epoch": 0.749206054845963, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 1.2553309846096791e-05, | |
| "loss": 0.9647, | |
| "mean_token_accuracy": 0.7512865029275417, | |
| "num_tokens": 428606863.0, | |
| "step": 4040 | |
| }, | |
| { | |
| "entropy": 1.487472727894783, | |
| "epoch": 0.7510605252787501, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 1.2460597070276285e-05, | |
| "loss": 0.955, | |
| "mean_token_accuracy": 0.7529655545949936, | |
| "num_tokens": 429685129.0, | |
| "step": 4050 | |
| }, | |
| { | |
| "entropy": 1.4989902302622795, | |
| "epoch": 0.7529149957115371, | |
| "grad_norm": 0.59765625, | |
| "learning_rate": 1.2367884294455776e-05, | |
| "loss": 0.946, | |
| "mean_token_accuracy": 0.7536524005234242, | |
| "num_tokens": 430719719.0, | |
| "step": 4060 | |
| }, | |
| { | |
| "entropy": 1.4869430616497994, | |
| "epoch": 0.7547694661443242, | |
| "grad_norm": 0.57421875, | |
| "learning_rate": 1.2275171518635268e-05, | |
| "loss": 0.9538, | |
| "mean_token_accuracy": 0.7513498216867447, | |
| "num_tokens": 431802510.0, | |
| "step": 4070 | |
| }, | |
| { | |
| "entropy": 1.4747512489557266, | |
| "epoch": 0.7566239365771112, | |
| "grad_norm": 0.59765625, | |
| "learning_rate": 1.2182458742814761e-05, | |
| "loss": 1.0073, | |
| "mean_token_accuracy": 0.7413670666515827, | |
| "num_tokens": 432851747.0, | |
| "step": 4080 | |
| }, | |
| { | |
| "entropy": 1.5243913546204566, | |
| "epoch": 0.7584784070098982, | |
| "grad_norm": 0.64453125, | |
| "learning_rate": 1.2089745966994253e-05, | |
| "loss": 0.9909, | |
| "mean_token_accuracy": 0.7449499808251858, | |
| "num_tokens": 433895784.0, | |
| "step": 4090 | |
| }, | |
| { | |
| "entropy": 1.504411555826664, | |
| "epoch": 0.7603328774426853, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 1.1997033191173745e-05, | |
| "loss": 0.955, | |
| "mean_token_accuracy": 0.7530606985092163, | |
| "num_tokens": 434974159.0, | |
| "step": 4100 | |
| }, | |
| { | |
| "entropy": 1.4855747856199741, | |
| "epoch": 0.7621873478754723, | |
| "grad_norm": 0.60546875, | |
| "learning_rate": 1.1904320415353236e-05, | |
| "loss": 0.9437, | |
| "mean_token_accuracy": 0.7568481601774693, | |
| "num_tokens": 436025090.0, | |
| "step": 4110 | |
| }, | |
| { | |
| "entropy": 1.4951087072491647, | |
| "epoch": 0.7640418183082593, | |
| "grad_norm": 0.59375, | |
| "learning_rate": 1.1811607639532728e-05, | |
| "loss": 0.9768, | |
| "mean_token_accuracy": 0.7467369221150875, | |
| "num_tokens": 437118270.0, | |
| "step": 4120 | |
| }, | |
| { | |
| "entropy": 1.4841661915183066, | |
| "epoch": 0.7658962887410464, | |
| "grad_norm": 0.61328125, | |
| "learning_rate": 1.171889486371222e-05, | |
| "loss": 0.9643, | |
| "mean_token_accuracy": 0.7506843000650406, | |
| "num_tokens": 438183727.0, | |
| "step": 4130 | |
| }, | |
| { | |
| "entropy": 1.4763000190258027, | |
| "epoch": 0.7677507591738334, | |
| "grad_norm": 0.56640625, | |
| "learning_rate": 1.1626182087891713e-05, | |
| "loss": 0.9303, | |
| "mean_token_accuracy": 0.7582049675285816, | |
| "num_tokens": 439268510.0, | |
| "step": 4140 | |
| }, | |
| { | |
| "entropy": 1.5008054867386817, | |
| "epoch": 0.7696052296066205, | |
| "grad_norm": 0.5546875, | |
| "learning_rate": 1.1533469312071204e-05, | |
| "loss": 0.967, | |
| "mean_token_accuracy": 0.7497551701962948, | |
| "num_tokens": 440345674.0, | |
| "step": 4150 | |
| }, | |
| { | |
| "entropy": 1.485274675488472, | |
| "epoch": 0.7714597000394074, | |
| "grad_norm": 0.61328125, | |
| "learning_rate": 1.1440756536250696e-05, | |
| "loss": 0.9672, | |
| "mean_token_accuracy": 0.7495969615876674, | |
| "num_tokens": 441398908.0, | |
| "step": 4160 | |
| }, | |
| { | |
| "entropy": 1.4768414929509164, | |
| "epoch": 0.7733141704721945, | |
| "grad_norm": 0.58984375, | |
| "learning_rate": 1.1348043760430188e-05, | |
| "loss": 0.9713, | |
| "mean_token_accuracy": 0.7479192078113556, | |
| "num_tokens": 442470473.0, | |
| "step": 4170 | |
| }, | |
| { | |
| "entropy": 1.4754391759634018, | |
| "epoch": 0.7751686409049816, | |
| "grad_norm": 0.5703125, | |
| "learning_rate": 1.125533098460968e-05, | |
| "loss": 0.9534, | |
| "mean_token_accuracy": 0.7518132753670216, | |
| "num_tokens": 443541296.0, | |
| "step": 4180 | |
| }, | |
| { | |
| "entropy": 1.4812248334288598, | |
| "epoch": 0.7770231113377686, | |
| "grad_norm": 0.59375, | |
| "learning_rate": 1.1162618208789171e-05, | |
| "loss": 0.9599, | |
| "mean_token_accuracy": 0.7512252993881703, | |
| "num_tokens": 444585794.0, | |
| "step": 4190 | |
| }, | |
| { | |
| "entropy": 1.4488537713885308, | |
| "epoch": 0.7788775817705557, | |
| "grad_norm": 0.5703125, | |
| "learning_rate": 1.1069905432968664e-05, | |
| "loss": 0.9144, | |
| "mean_token_accuracy": 0.7594800740480423, | |
| "num_tokens": 445628110.0, | |
| "step": 4200 | |
| }, | |
| { | |
| "entropy": 1.4917938500642776, | |
| "epoch": 0.7807320522033426, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 1.0977192657148156e-05, | |
| "loss": 0.9711, | |
| "mean_token_accuracy": 0.7509251795709133, | |
| "num_tokens": 446695715.0, | |
| "step": 4210 | |
| }, | |
| { | |
| "entropy": 1.5019617602229118, | |
| "epoch": 0.7825865226361297, | |
| "grad_norm": 0.609375, | |
| "learning_rate": 1.0884479881327647e-05, | |
| "loss": 1.0043, | |
| "mean_token_accuracy": 0.7404218807816505, | |
| "num_tokens": 447723973.0, | |
| "step": 4220 | |
| }, | |
| { | |
| "entropy": 1.5044475719332695, | |
| "epoch": 0.7844409930689168, | |
| "grad_norm": 0.57421875, | |
| "learning_rate": 1.0791767105507139e-05, | |
| "loss": 0.9739, | |
| "mean_token_accuracy": 0.7469606220722198, | |
| "num_tokens": 448769856.0, | |
| "step": 4230 | |
| }, | |
| { | |
| "entropy": 1.458498264849186, | |
| "epoch": 0.7862954635017038, | |
| "grad_norm": 0.57421875, | |
| "learning_rate": 1.069905432968663e-05, | |
| "loss": 0.9617, | |
| "mean_token_accuracy": 0.7501473240554333, | |
| "num_tokens": 449801247.0, | |
| "step": 4240 | |
| }, | |
| { | |
| "entropy": 1.4931430265307426, | |
| "epoch": 0.7881499339344908, | |
| "grad_norm": 0.56640625, | |
| "learning_rate": 1.0606341553866122e-05, | |
| "loss": 0.9898, | |
| "mean_token_accuracy": 0.7454059682786465, | |
| "num_tokens": 450850271.0, | |
| "step": 4250 | |
| }, | |
| { | |
| "entropy": 1.4930280596017838, | |
| "epoch": 0.7900044043672779, | |
| "grad_norm": 0.60546875, | |
| "learning_rate": 1.0513628778045616e-05, | |
| "loss": 0.9881, | |
| "mean_token_accuracy": 0.7425899639725685, | |
| "num_tokens": 451896099.0, | |
| "step": 4260 | |
| }, | |
| { | |
| "entropy": 1.5165223598480224, | |
| "epoch": 0.7918588748000649, | |
| "grad_norm": 0.5703125, | |
| "learning_rate": 1.0420916002225107e-05, | |
| "loss": 0.9757, | |
| "mean_token_accuracy": 0.7508614018559456, | |
| "num_tokens": 452975189.0, | |
| "step": 4270 | |
| }, | |
| { | |
| "entropy": 1.4934850245714189, | |
| "epoch": 0.793713345232852, | |
| "grad_norm": 0.5859375, | |
| "learning_rate": 1.0328203226404599e-05, | |
| "loss": 0.9854, | |
| "mean_token_accuracy": 0.7447437711060048, | |
| "num_tokens": 454021880.0, | |
| "step": 4280 | |
| }, | |
| { | |
| "entropy": 1.504435983300209, | |
| "epoch": 0.7955678156656389, | |
| "grad_norm": 0.61328125, | |
| "learning_rate": 1.023549045058409e-05, | |
| "loss": 1.004, | |
| "mean_token_accuracy": 0.7430733263492584, | |
| "num_tokens": 455066251.0, | |
| "step": 4290 | |
| }, | |
| { | |
| "entropy": 1.4768665179610252, | |
| "epoch": 0.797422286098426, | |
| "grad_norm": 0.546875, | |
| "learning_rate": 1.0142777674763582e-05, | |
| "loss": 0.9499, | |
| "mean_token_accuracy": 0.7518707752227783, | |
| "num_tokens": 456103788.0, | |
| "step": 4300 | |
| }, | |
| { | |
| "entropy": 1.4671560034155846, | |
| "epoch": 0.7992767565312131, | |
| "grad_norm": 0.61328125, | |
| "learning_rate": 1.0050064898943076e-05, | |
| "loss": 0.9655, | |
| "mean_token_accuracy": 0.7501025825738907, | |
| "num_tokens": 457167675.0, | |
| "step": 4310 | |
| }, | |
| { | |
| "entropy": 1.5254767999053, | |
| "epoch": 0.8011312269640001, | |
| "grad_norm": 0.625, | |
| "learning_rate": 9.957352123122567e-06, | |
| "loss": 0.9962, | |
| "mean_token_accuracy": 0.7426642939448357, | |
| "num_tokens": 458202815.0, | |
| "step": 4320 | |
| }, | |
| { | |
| "entropy": 1.5128788575530052, | |
| "epoch": 0.8029856973967872, | |
| "grad_norm": 0.609375, | |
| "learning_rate": 9.864639347302059e-06, | |
| "loss": 0.9749, | |
| "mean_token_accuracy": 0.7444698058068753, | |
| "num_tokens": 459240557.0, | |
| "step": 4330 | |
| }, | |
| { | |
| "entropy": 1.500946070253849, | |
| "epoch": 0.8048401678295741, | |
| "grad_norm": 0.59375, | |
| "learning_rate": 9.77192657148155e-06, | |
| "loss": 0.9782, | |
| "mean_token_accuracy": 0.7480835281312466, | |
| "num_tokens": 460301313.0, | |
| "step": 4340 | |
| }, | |
| { | |
| "entropy": 1.4815599471330643, | |
| "epoch": 0.8066946382623612, | |
| "grad_norm": 0.57421875, | |
| "learning_rate": 9.679213795661042e-06, | |
| "loss": 0.989, | |
| "mean_token_accuracy": 0.7446364529430867, | |
| "num_tokens": 461363186.0, | |
| "step": 4350 | |
| }, | |
| { | |
| "entropy": 1.460189698636532, | |
| "epoch": 0.8085491086951483, | |
| "grad_norm": 0.59375, | |
| "learning_rate": 9.586501019840534e-06, | |
| "loss": 0.9226, | |
| "mean_token_accuracy": 0.7591656059026718, | |
| "num_tokens": 462394061.0, | |
| "step": 4360 | |
| }, | |
| { | |
| "entropy": 1.46696138381958, | |
| "epoch": 0.8104035791279353, | |
| "grad_norm": 0.6171875, | |
| "learning_rate": 9.493788244020027e-06, | |
| "loss": 0.919, | |
| "mean_token_accuracy": 0.7599939718842507, | |
| "num_tokens": 463453093.0, | |
| "step": 4370 | |
| }, | |
| { | |
| "entropy": 1.5002279579639435, | |
| "epoch": 0.8122580495607223, | |
| "grad_norm": 0.6171875, | |
| "learning_rate": 9.401075468199519e-06, | |
| "loss": 0.9534, | |
| "mean_token_accuracy": 0.752779620885849, | |
| "num_tokens": 464507158.0, | |
| "step": 4380 | |
| }, | |
| { | |
| "entropy": 1.4883873075246812, | |
| "epoch": 0.8141125199935093, | |
| "grad_norm": 0.53515625, | |
| "learning_rate": 9.30836269237901e-06, | |
| "loss": 0.9293, | |
| "mean_token_accuracy": 0.7580308496952057, | |
| "num_tokens": 465593305.0, | |
| "step": 4390 | |
| }, | |
| { | |
| "entropy": 1.4969554737210273, | |
| "epoch": 0.8159669904262964, | |
| "grad_norm": 0.63671875, | |
| "learning_rate": 9.215649916558502e-06, | |
| "loss": 0.9807, | |
| "mean_token_accuracy": 0.749377204477787, | |
| "num_tokens": 466637907.0, | |
| "step": 4400 | |
| }, | |
| { | |
| "entropy": 1.4888636887073516, | |
| "epoch": 0.8178214608590835, | |
| "grad_norm": 0.59765625, | |
| "learning_rate": 9.122937140737994e-06, | |
| "loss": 0.9776, | |
| "mean_token_accuracy": 0.7479160398244857, | |
| "num_tokens": 467707886.0, | |
| "step": 4410 | |
| }, | |
| { | |
| "entropy": 1.4951920330524444, | |
| "epoch": 0.8196759312918704, | |
| "grad_norm": 0.58984375, | |
| "learning_rate": 9.030224364917487e-06, | |
| "loss": 0.986, | |
| "mean_token_accuracy": 0.7464122280478478, | |
| "num_tokens": 468769214.0, | |
| "step": 4420 | |
| }, | |
| { | |
| "entropy": 1.471150666475296, | |
| "epoch": 0.8215304017246575, | |
| "grad_norm": 0.55859375, | |
| "learning_rate": 8.937511589096978e-06, | |
| "loss": 0.9592, | |
| "mean_token_accuracy": 0.75238643810153, | |
| "num_tokens": 469813760.0, | |
| "step": 4430 | |
| }, | |
| { | |
| "entropy": 1.4797371938824653, | |
| "epoch": 0.8233848721574445, | |
| "grad_norm": 0.61328125, | |
| "learning_rate": 8.84479881327647e-06, | |
| "loss": 0.9628, | |
| "mean_token_accuracy": 0.7495879493653774, | |
| "num_tokens": 470883023.0, | |
| "step": 4440 | |
| }, | |
| { | |
| "entropy": 1.5028559476137162, | |
| "epoch": 0.8252393425902316, | |
| "grad_norm": 0.62890625, | |
| "learning_rate": 8.752086037455962e-06, | |
| "loss": 0.953, | |
| "mean_token_accuracy": 0.7539477720856667, | |
| "num_tokens": 471942972.0, | |
| "step": 4450 | |
| }, | |
| { | |
| "entropy": 1.4695367753505706, | |
| "epoch": 0.8270938130230187, | |
| "grad_norm": 0.62109375, | |
| "learning_rate": 8.659373261635453e-06, | |
| "loss": 0.9503, | |
| "mean_token_accuracy": 0.7525465279817581, | |
| "num_tokens": 473022352.0, | |
| "step": 4460 | |
| }, | |
| { | |
| "entropy": 1.471600492298603, | |
| "epoch": 0.8289482834558056, | |
| "grad_norm": 0.5859375, | |
| "learning_rate": 8.566660485814945e-06, | |
| "loss": 0.9395, | |
| "mean_token_accuracy": 0.7543421745300293, | |
| "num_tokens": 474071367.0, | |
| "step": 4470 | |
| }, | |
| { | |
| "entropy": 1.4563724800944329, | |
| "epoch": 0.8308027538885927, | |
| "grad_norm": 0.57421875, | |
| "learning_rate": 8.473947709994437e-06, | |
| "loss": 0.939, | |
| "mean_token_accuracy": 0.7556210935115815, | |
| "num_tokens": 475129809.0, | |
| "step": 4480 | |
| }, | |
| { | |
| "entropy": 1.4725616209208965, | |
| "epoch": 0.8326572243213797, | |
| "grad_norm": 0.5859375, | |
| "learning_rate": 8.38123493417393e-06, | |
| "loss": 0.9467, | |
| "mean_token_accuracy": 0.7529217921197414, | |
| "num_tokens": 476192120.0, | |
| "step": 4490 | |
| }, | |
| { | |
| "entropy": 1.4959609627723693, | |
| "epoch": 0.8345116947541668, | |
| "grad_norm": 0.6015625, | |
| "learning_rate": 8.288522158353422e-06, | |
| "loss": 0.986, | |
| "mean_token_accuracy": 0.7469360053539276, | |
| "num_tokens": 477270913.0, | |
| "step": 4500 | |
| }, | |
| { | |
| "entropy": 1.5043706215918065, | |
| "epoch": 0.8363661651869538, | |
| "grad_norm": 0.578125, | |
| "learning_rate": 8.195809382532913e-06, | |
| "loss": 0.9942, | |
| "mean_token_accuracy": 0.7434839032590389, | |
| "num_tokens": 478336794.0, | |
| "step": 4510 | |
| }, | |
| { | |
| "entropy": 1.4592665687203408, | |
| "epoch": 0.8382206356197408, | |
| "grad_norm": 0.5546875, | |
| "learning_rate": 8.103096606712405e-06, | |
| "loss": 0.9578, | |
| "mean_token_accuracy": 0.7528082154691219, | |
| "num_tokens": 479380108.0, | |
| "step": 4520 | |
| }, | |
| { | |
| "entropy": 1.4847895950078964, | |
| "epoch": 0.8400751060525279, | |
| "grad_norm": 0.6015625, | |
| "learning_rate": 8.010383830891898e-06, | |
| "loss": 0.9442, | |
| "mean_token_accuracy": 0.7523408196866512, | |
| "num_tokens": 480462623.0, | |
| "step": 4530 | |
| }, | |
| { | |
| "entropy": 1.457891409099102, | |
| "epoch": 0.841929576485315, | |
| "grad_norm": 0.5390625, | |
| "learning_rate": 7.91767105507139e-06, | |
| "loss": 0.9152, | |
| "mean_token_accuracy": 0.7595615901052952, | |
| "num_tokens": 481527124.0, | |
| "step": 4540 | |
| }, | |
| { | |
| "entropy": 1.4442754536867142, | |
| "epoch": 0.8437840469181019, | |
| "grad_norm": 0.546875, | |
| "learning_rate": 7.824958279250881e-06, | |
| "loss": 0.9522, | |
| "mean_token_accuracy": 0.7508052803575993, | |
| "num_tokens": 482574513.0, | |
| "step": 4550 | |
| }, | |
| { | |
| "entropy": 1.4993748903274535, | |
| "epoch": 0.845638517350889, | |
| "grad_norm": 0.55859375, | |
| "learning_rate": 7.732245503430373e-06, | |
| "loss": 0.9752, | |
| "mean_token_accuracy": 0.7478637613356114, | |
| "num_tokens": 483617910.0, | |
| "step": 4560 | |
| }, | |
| { | |
| "entropy": 1.4834584772586823, | |
| "epoch": 0.847492987783676, | |
| "grad_norm": 0.65234375, | |
| "learning_rate": 7.639532727609865e-06, | |
| "loss": 0.9532, | |
| "mean_token_accuracy": 0.7517675705254078, | |
| "num_tokens": 484697320.0, | |
| "step": 4570 | |
| }, | |
| { | |
| "entropy": 1.497210255265236, | |
| "epoch": 0.8493474582164631, | |
| "grad_norm": 0.6328125, | |
| "learning_rate": 7.546819951789356e-06, | |
| "loss": 0.9734, | |
| "mean_token_accuracy": 0.748406857252121, | |
| "num_tokens": 485743420.0, | |
| "step": 4580 | |
| }, | |
| { | |
| "entropy": 1.4931289002299308, | |
| "epoch": 0.8512019286492502, | |
| "grad_norm": 0.57421875, | |
| "learning_rate": 7.454107175968848e-06, | |
| "loss": 0.9569, | |
| "mean_token_accuracy": 0.7529506534337997, | |
| "num_tokens": 486829183.0, | |
| "step": 4590 | |
| }, | |
| { | |
| "entropy": 1.4777323231101036, | |
| "epoch": 0.8530563990820371, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 7.361394400148341e-06, | |
| "loss": 0.9453, | |
| "mean_token_accuracy": 0.7546572618186473, | |
| "num_tokens": 487907679.0, | |
| "step": 4600 | |
| }, | |
| { | |
| "entropy": 1.4949002787470818, | |
| "epoch": 0.8549108695148242, | |
| "grad_norm": 0.58984375, | |
| "learning_rate": 7.268681624327833e-06, | |
| "loss": 0.9681, | |
| "mean_token_accuracy": 0.7518519349396229, | |
| "num_tokens": 488932937.0, | |
| "step": 4610 | |
| }, | |
| { | |
| "entropy": 1.4767025902867317, | |
| "epoch": 0.8567653399476112, | |
| "grad_norm": 0.5859375, | |
| "learning_rate": 7.175968848507325e-06, | |
| "loss": 0.945, | |
| "mean_token_accuracy": 0.7544593453407288, | |
| "num_tokens": 489993145.0, | |
| "step": 4620 | |
| }, | |
| { | |
| "entropy": 1.490280681848526, | |
| "epoch": 0.8586198103803983, | |
| "grad_norm": 0.61328125, | |
| "learning_rate": 7.083256072686817e-06, | |
| "loss": 0.9726, | |
| "mean_token_accuracy": 0.7484356872737408, | |
| "num_tokens": 491049130.0, | |
| "step": 4630 | |
| }, | |
| { | |
| "entropy": 1.444966398179531, | |
| "epoch": 0.8604742808131853, | |
| "grad_norm": 0.63671875, | |
| "learning_rate": 6.990543296866309e-06, | |
| "loss": 0.9671, | |
| "mean_token_accuracy": 0.7509759172797204, | |
| "num_tokens": 492113185.0, | |
| "step": 4640 | |
| }, | |
| { | |
| "entropy": 1.4867697358131409, | |
| "epoch": 0.8623287512459723, | |
| "grad_norm": 0.6015625, | |
| "learning_rate": 6.8978305210458e-06, | |
| "loss": 0.9891, | |
| "mean_token_accuracy": 0.7448230169713497, | |
| "num_tokens": 493165668.0, | |
| "step": 4650 | |
| }, | |
| { | |
| "entropy": 1.5139823421835898, | |
| "epoch": 0.8641832216787594, | |
| "grad_norm": 0.57421875, | |
| "learning_rate": 6.805117745225293e-06, | |
| "loss": 0.9563, | |
| "mean_token_accuracy": 0.749880088120699, | |
| "num_tokens": 494211850.0, | |
| "step": 4660 | |
| }, | |
| { | |
| "entropy": 1.4759756982326508, | |
| "epoch": 0.8660376921115464, | |
| "grad_norm": 0.6171875, | |
| "learning_rate": 6.712404969404784e-06, | |
| "loss": 0.9669, | |
| "mean_token_accuracy": 0.7497481346130371, | |
| "num_tokens": 495255030.0, | |
| "step": 4670 | |
| }, | |
| { | |
| "entropy": 1.4659351661801339, | |
| "epoch": 0.8678921625443334, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 6.619692193584276e-06, | |
| "loss": 0.9603, | |
| "mean_token_accuracy": 0.7508170709013939, | |
| "num_tokens": 496299253.0, | |
| "step": 4680 | |
| }, | |
| { | |
| "entropy": 1.5076110348105431, | |
| "epoch": 0.8697466329771205, | |
| "grad_norm": 0.59765625, | |
| "learning_rate": 6.526979417763768e-06, | |
| "loss": 0.9694, | |
| "mean_token_accuracy": 0.7490433253347873, | |
| "num_tokens": 497347663.0, | |
| "step": 4690 | |
| }, | |
| { | |
| "entropy": 1.447922769188881, | |
| "epoch": 0.8716011034099075, | |
| "grad_norm": 0.60546875, | |
| "learning_rate": 6.434266641943259e-06, | |
| "loss": 0.9155, | |
| "mean_token_accuracy": 0.7609165765345096, | |
| "num_tokens": 498413931.0, | |
| "step": 4700 | |
| }, | |
| { | |
| "entropy": 1.48031188249588, | |
| "epoch": 0.8734555738426946, | |
| "grad_norm": 0.5859375, | |
| "learning_rate": 6.341553866122752e-06, | |
| "loss": 0.9898, | |
| "mean_token_accuracy": 0.7449010275304317, | |
| "num_tokens": 499447555.0, | |
| "step": 4710 | |
| }, | |
| { | |
| "entropy": 1.4997192814946174, | |
| "epoch": 0.8753100442754815, | |
| "grad_norm": 0.63671875, | |
| "learning_rate": 6.248841090302244e-06, | |
| "loss": 0.9429, | |
| "mean_token_accuracy": 0.7547892071306705, | |
| "num_tokens": 500518374.0, | |
| "step": 4720 | |
| }, | |
| { | |
| "entropy": 1.5000934675335884, | |
| "epoch": 0.8771645147082686, | |
| "grad_norm": 0.53125, | |
| "learning_rate": 6.156128314481736e-06, | |
| "loss": 0.9856, | |
| "mean_token_accuracy": 0.745396564155817, | |
| "num_tokens": 501579792.0, | |
| "step": 4730 | |
| }, | |
| { | |
| "entropy": 1.5113018915057181, | |
| "epoch": 0.8790189851410557, | |
| "grad_norm": 0.609375, | |
| "learning_rate": 6.0634155386612275e-06, | |
| "loss": 0.9743, | |
| "mean_token_accuracy": 0.750109052658081, | |
| "num_tokens": 502663374.0, | |
| "step": 4740 | |
| }, | |
| { | |
| "entropy": 1.49411730915308, | |
| "epoch": 0.8808734555738427, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 5.97070276284072e-06, | |
| "loss": 0.9455, | |
| "mean_token_accuracy": 0.7551098071038723, | |
| "num_tokens": 503699582.0, | |
| "step": 4750 | |
| }, | |
| { | |
| "entropy": 1.5184060662984848, | |
| "epoch": 0.8827279260066297, | |
| "grad_norm": 0.609375, | |
| "learning_rate": 5.8779899870202116e-06, | |
| "loss": 1.0052, | |
| "mean_token_accuracy": 0.7431839637458324, | |
| "num_tokens": 504771657.0, | |
| "step": 4760 | |
| }, | |
| { | |
| "entropy": 1.4596238881349564, | |
| "epoch": 0.8845823964394167, | |
| "grad_norm": 0.5859375, | |
| "learning_rate": 5.785277211199704e-06, | |
| "loss": 0.9394, | |
| "mean_token_accuracy": 0.7539000272750854, | |
| "num_tokens": 505833985.0, | |
| "step": 4770 | |
| }, | |
| { | |
| "entropy": 1.4942304596304894, | |
| "epoch": 0.8864368668722038, | |
| "grad_norm": 0.62109375, | |
| "learning_rate": 5.692564435379196e-06, | |
| "loss": 0.9458, | |
| "mean_token_accuracy": 0.7517348796129226, | |
| "num_tokens": 506899192.0, | |
| "step": 4780 | |
| }, | |
| { | |
| "entropy": 1.4776636615395546, | |
| "epoch": 0.8882913373049909, | |
| "grad_norm": 0.61328125, | |
| "learning_rate": 5.599851659558687e-06, | |
| "loss": 0.9686, | |
| "mean_token_accuracy": 0.749453266710043, | |
| "num_tokens": 507962997.0, | |
| "step": 4790 | |
| }, | |
| { | |
| "entropy": 1.5113597124814988, | |
| "epoch": 0.8901458077377778, | |
| "grad_norm": 0.58984375, | |
| "learning_rate": 5.507138883738179e-06, | |
| "loss": 0.9927, | |
| "mean_token_accuracy": 0.7455503046512604, | |
| "num_tokens": 509022628.0, | |
| "step": 4800 | |
| }, | |
| { | |
| "entropy": 1.522510838508606, | |
| "epoch": 0.8920002781705649, | |
| "grad_norm": 0.62890625, | |
| "learning_rate": 5.414426107917671e-06, | |
| "loss": 0.9769, | |
| "mean_token_accuracy": 0.7477643340826035, | |
| "num_tokens": 510091086.0, | |
| "step": 4810 | |
| }, | |
| { | |
| "entropy": 1.5017392978072166, | |
| "epoch": 0.8938547486033519, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 5.321713332097163e-06, | |
| "loss": 0.9656, | |
| "mean_token_accuracy": 0.7480161942541599, | |
| "num_tokens": 511144514.0, | |
| "step": 4820 | |
| }, | |
| { | |
| "entropy": 1.4844235144555569, | |
| "epoch": 0.895709219036139, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 5.2290005562766555e-06, | |
| "loss": 0.9673, | |
| "mean_token_accuracy": 0.7491404734551906, | |
| "num_tokens": 512205294.0, | |
| "step": 4830 | |
| }, | |
| { | |
| "entropy": 1.5136632181704044, | |
| "epoch": 0.8975636894689261, | |
| "grad_norm": 0.6171875, | |
| "learning_rate": 5.136287780456147e-06, | |
| "loss": 0.9652, | |
| "mean_token_accuracy": 0.7480661280453205, | |
| "num_tokens": 513232686.0, | |
| "step": 4840 | |
| }, | |
| { | |
| "entropy": 1.4878306895494462, | |
| "epoch": 0.899418159901713, | |
| "grad_norm": 0.54296875, | |
| "learning_rate": 5.043575004635639e-06, | |
| "loss": 0.9377, | |
| "mean_token_accuracy": 0.7544656574726105, | |
| "num_tokens": 514295562.0, | |
| "step": 4850 | |
| }, | |
| { | |
| "entropy": 1.4805591061711312, | |
| "epoch": 0.9012726303345001, | |
| "grad_norm": 0.58984375, | |
| "learning_rate": 4.950862228815131e-06, | |
| "loss": 0.9583, | |
| "mean_token_accuracy": 0.7537032641470433, | |
| "num_tokens": 515349765.0, | |
| "step": 4860 | |
| }, | |
| { | |
| "entropy": 1.5317182704806327, | |
| "epoch": 0.9031271007672872, | |
| "grad_norm": 0.62109375, | |
| "learning_rate": 4.858149452994623e-06, | |
| "loss": 1.0149, | |
| "mean_token_accuracy": 0.7421710386872291, | |
| "num_tokens": 516380200.0, | |
| "step": 4870 | |
| }, | |
| { | |
| "entropy": 1.4920184709131719, | |
| "epoch": 0.9049815712000742, | |
| "grad_norm": 0.59375, | |
| "learning_rate": 4.7654366771741145e-06, | |
| "loss": 0.9818, | |
| "mean_token_accuracy": 0.7498961910605431, | |
| "num_tokens": 517450183.0, | |
| "step": 4880 | |
| }, | |
| { | |
| "entropy": 1.4551678031682969, | |
| "epoch": 0.9068360416328612, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 4.672723901353607e-06, | |
| "loss": 0.9402, | |
| "mean_token_accuracy": 0.7537682630121708, | |
| "num_tokens": 518518495.0, | |
| "step": 4890 | |
| }, | |
| { | |
| "entropy": 1.4773698151111603, | |
| "epoch": 0.9086905120656482, | |
| "grad_norm": 0.546875, | |
| "learning_rate": 4.580011125533099e-06, | |
| "loss": 0.9435, | |
| "mean_token_accuracy": 0.7546230234205723, | |
| "num_tokens": 519595131.0, | |
| "step": 4900 | |
| }, | |
| { | |
| "entropy": 1.4985239595174789, | |
| "epoch": 0.9105449824984353, | |
| "grad_norm": 0.58984375, | |
| "learning_rate": 4.48729834971259e-06, | |
| "loss": 0.963, | |
| "mean_token_accuracy": 0.7507427543401718, | |
| "num_tokens": 520656548.0, | |
| "step": 4910 | |
| }, | |
| { | |
| "entropy": 1.4794475808739662, | |
| "epoch": 0.9123994529312224, | |
| "grad_norm": 0.52734375, | |
| "learning_rate": 4.394585573892083e-06, | |
| "loss": 0.9589, | |
| "mean_token_accuracy": 0.7512788020074368, | |
| "num_tokens": 521694522.0, | |
| "step": 4920 | |
| }, | |
| { | |
| "entropy": 1.4926507860422134, | |
| "epoch": 0.9142539233640093, | |
| "grad_norm": 0.62109375, | |
| "learning_rate": 4.301872798071574e-06, | |
| "loss": 0.9593, | |
| "mean_token_accuracy": 0.7528703108429908, | |
| "num_tokens": 522756988.0, | |
| "step": 4930 | |
| }, | |
| { | |
| "entropy": 1.473665152490139, | |
| "epoch": 0.9161083937967964, | |
| "grad_norm": 0.5703125, | |
| "learning_rate": 4.209160022251067e-06, | |
| "loss": 0.9217, | |
| "mean_token_accuracy": 0.7571736149489879, | |
| "num_tokens": 523827072.0, | |
| "step": 4940 | |
| }, | |
| { | |
| "entropy": 1.4985759064555169, | |
| "epoch": 0.9179628642295834, | |
| "grad_norm": 0.62890625, | |
| "learning_rate": 4.1164472464305584e-06, | |
| "loss": 0.9765, | |
| "mean_token_accuracy": 0.7496349468827248, | |
| "num_tokens": 524893580.0, | |
| "step": 4950 | |
| }, | |
| { | |
| "entropy": 1.4969960138201714, | |
| "epoch": 0.9198173346623705, | |
| "grad_norm": 0.56640625, | |
| "learning_rate": 4.02373447061005e-06, | |
| "loss": 0.9614, | |
| "mean_token_accuracy": 0.7546924255788326, | |
| "num_tokens": 525948091.0, | |
| "step": 4960 | |
| }, | |
| { | |
| "entropy": 1.488446156680584, | |
| "epoch": 0.9216718050951576, | |
| "grad_norm": 0.64453125, | |
| "learning_rate": 3.931021694789542e-06, | |
| "loss": 0.9652, | |
| "mean_token_accuracy": 0.7508613869547844, | |
| "num_tokens": 527029405.0, | |
| "step": 4970 | |
| }, | |
| { | |
| "entropy": 1.4901771679520608, | |
| "epoch": 0.9235262755279445, | |
| "grad_norm": 0.60546875, | |
| "learning_rate": 3.838308918969034e-06, | |
| "loss": 0.9464, | |
| "mean_token_accuracy": 0.7563538022339344, | |
| "num_tokens": 528100070.0, | |
| "step": 4980 | |
| }, | |
| { | |
| "entropy": 1.505025638639927, | |
| "epoch": 0.9253807459607316, | |
| "grad_norm": 0.56640625, | |
| "learning_rate": 3.7455961431485262e-06, | |
| "loss": 0.9891, | |
| "mean_token_accuracy": 0.7449918761849403, | |
| "num_tokens": 529157682.0, | |
| "step": 4990 | |
| }, | |
| { | |
| "entropy": 1.4832553416490555, | |
| "epoch": 0.9272352163935186, | |
| "grad_norm": 0.55078125, | |
| "learning_rate": 3.652883367328018e-06, | |
| "loss": 0.9425, | |
| "mean_token_accuracy": 0.7574765972793103, | |
| "num_tokens": 530207090.0, | |
| "step": 5000 | |
| }, | |
| { | |
| "entropy": 1.5145885661244392, | |
| "epoch": 0.9290896868263057, | |
| "grad_norm": 0.57421875, | |
| "learning_rate": 3.56017059150751e-06, | |
| "loss": 0.9529, | |
| "mean_token_accuracy": 0.7519233793020248, | |
| "num_tokens": 531289042.0, | |
| "step": 5010 | |
| }, | |
| { | |
| "entropy": 1.4676032707095146, | |
| "epoch": 0.9309441572590927, | |
| "grad_norm": 0.5703125, | |
| "learning_rate": 3.4674578156870015e-06, | |
| "loss": 0.9675, | |
| "mean_token_accuracy": 0.7508913226425648, | |
| "num_tokens": 532332779.0, | |
| "step": 5020 | |
| }, | |
| { | |
| "entropy": 1.526107704639435, | |
| "epoch": 0.9327986276918797, | |
| "grad_norm": 0.65234375, | |
| "learning_rate": 3.3747450398664936e-06, | |
| "loss": 1.0244, | |
| "mean_token_accuracy": 0.7377512022852898, | |
| "num_tokens": 533379621.0, | |
| "step": 5030 | |
| }, | |
| { | |
| "entropy": 1.4882061570882796, | |
| "epoch": 0.9346530981246668, | |
| "grad_norm": 0.609375, | |
| "learning_rate": 3.282032264045986e-06, | |
| "loss": 0.9556, | |
| "mean_token_accuracy": 0.7504644826054573, | |
| "num_tokens": 534442588.0, | |
| "step": 5040 | |
| }, | |
| { | |
| "entropy": 1.4703703567385673, | |
| "epoch": 0.9365075685574538, | |
| "grad_norm": 0.5859375, | |
| "learning_rate": 3.1893194882254777e-06, | |
| "loss": 0.9546, | |
| "mean_token_accuracy": 0.7510296694934369, | |
| "num_tokens": 535476065.0, | |
| "step": 5050 | |
| }, | |
| { | |
| "entropy": 1.5207170024514198, | |
| "epoch": 0.9383620389902408, | |
| "grad_norm": 0.5859375, | |
| "learning_rate": 3.0966067124049698e-06, | |
| "loss": 0.9837, | |
| "mean_token_accuracy": 0.744877602159977, | |
| "num_tokens": 536536854.0, | |
| "step": 5060 | |
| }, | |
| { | |
| "entropy": 1.5004464775323867, | |
| "epoch": 0.9402165094230279, | |
| "grad_norm": 0.55078125, | |
| "learning_rate": 3.0038939365844614e-06, | |
| "loss": 1.0054, | |
| "mean_token_accuracy": 0.7421227216720581, | |
| "num_tokens": 537579141.0, | |
| "step": 5070 | |
| }, | |
| { | |
| "entropy": 1.4758819058537482, | |
| "epoch": 0.9420709798558149, | |
| "grad_norm": 0.5859375, | |
| "learning_rate": 2.9111811607639534e-06, | |
| "loss": 0.9529, | |
| "mean_token_accuracy": 0.7511624835431576, | |
| "num_tokens": 538602192.0, | |
| "step": 5080 | |
| }, | |
| { | |
| "entropy": 1.4848722845315934, | |
| "epoch": 0.943925450288602, | |
| "grad_norm": 0.59375, | |
| "learning_rate": 2.8184683849434455e-06, | |
| "loss": 0.9586, | |
| "mean_token_accuracy": 0.7529581107199192, | |
| "num_tokens": 539640883.0, | |
| "step": 5090 | |
| }, | |
| { | |
| "entropy": 1.481642335653305, | |
| "epoch": 0.945779920721389, | |
| "grad_norm": 0.6015625, | |
| "learning_rate": 2.725755609122937e-06, | |
| "loss": 0.9536, | |
| "mean_token_accuracy": 0.7543608367443084, | |
| "num_tokens": 540702969.0, | |
| "step": 5100 | |
| }, | |
| { | |
| "entropy": 1.5358406245708465, | |
| "epoch": 0.947634391154176, | |
| "grad_norm": 0.5859375, | |
| "learning_rate": 2.633042833302429e-06, | |
| "loss": 0.9988, | |
| "mean_token_accuracy": 0.7425594002008438, | |
| "num_tokens": 541770757.0, | |
| "step": 5110 | |
| }, | |
| { | |
| "entropy": 1.4897444009780885, | |
| "epoch": 0.9494888615869631, | |
| "grad_norm": 0.53515625, | |
| "learning_rate": 2.5403300574819212e-06, | |
| "loss": 0.9348, | |
| "mean_token_accuracy": 0.7558170072734356, | |
| "num_tokens": 542814005.0, | |
| "step": 5120 | |
| }, | |
| { | |
| "entropy": 1.506165799498558, | |
| "epoch": 0.9513433320197501, | |
| "grad_norm": 0.5703125, | |
| "learning_rate": 2.447617281661413e-06, | |
| "loss": 0.9654, | |
| "mean_token_accuracy": 0.7477248691022396, | |
| "num_tokens": 543849282.0, | |
| "step": 5130 | |
| }, | |
| { | |
| "entropy": 1.5137786835432052, | |
| "epoch": 0.9531978024525372, | |
| "grad_norm": 0.6875, | |
| "learning_rate": 2.354904505840905e-06, | |
| "loss": 0.9474, | |
| "mean_token_accuracy": 0.7553800746798516, | |
| "num_tokens": 544867971.0, | |
| "step": 5140 | |
| }, | |
| { | |
| "entropy": 1.4783537894487382, | |
| "epoch": 0.9550522728853242, | |
| "grad_norm": 0.6171875, | |
| "learning_rate": 2.262191730020397e-06, | |
| "loss": 0.962, | |
| "mean_token_accuracy": 0.7512776233255863, | |
| "num_tokens": 545928864.0, | |
| "step": 5150 | |
| }, | |
| { | |
| "entropy": 1.518006481230259, | |
| "epoch": 0.9569067433181112, | |
| "grad_norm": 0.61328125, | |
| "learning_rate": 2.1694789541998886e-06, | |
| "loss": 0.9263, | |
| "mean_token_accuracy": 0.7559382028877735, | |
| "num_tokens": 547015699.0, | |
| "step": 5160 | |
| }, | |
| { | |
| "entropy": 1.452364021539688, | |
| "epoch": 0.9587612137508983, | |
| "grad_norm": 0.5859375, | |
| "learning_rate": 2.0767661783793806e-06, | |
| "loss": 0.9226, | |
| "mean_token_accuracy": 0.7577701672911644, | |
| "num_tokens": 548082426.0, | |
| "step": 5170 | |
| }, | |
| { | |
| "entropy": 1.502952517569065, | |
| "epoch": 0.9606156841836853, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 1.9840534025588727e-06, | |
| "loss": 0.9668, | |
| "mean_token_accuracy": 0.7505509339272975, | |
| "num_tokens": 549139639.0, | |
| "step": 5180 | |
| }, | |
| { | |
| "entropy": 1.492076651006937, | |
| "epoch": 0.9624701546164723, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 1.8913406267383647e-06, | |
| "loss": 0.972, | |
| "mean_token_accuracy": 0.7481108136475086, | |
| "num_tokens": 550208398.0, | |
| "step": 5190 | |
| }, | |
| { | |
| "entropy": 1.5004539504647254, | |
| "epoch": 0.9643246250492594, | |
| "grad_norm": 0.58984375, | |
| "learning_rate": 1.7986278509178566e-06, | |
| "loss": 0.9848, | |
| "mean_token_accuracy": 0.7464699812233448, | |
| "num_tokens": 551284570.0, | |
| "step": 5200 | |
| }, | |
| { | |
| "entropy": 1.48575449436903, | |
| "epoch": 0.9661790954820464, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 1.7059150750973484e-06, | |
| "loss": 0.9616, | |
| "mean_token_accuracy": 0.7523142404854297, | |
| "num_tokens": 552360156.0, | |
| "step": 5210 | |
| }, | |
| { | |
| "entropy": 1.4977748692035675, | |
| "epoch": 0.9680335659148335, | |
| "grad_norm": 0.58984375, | |
| "learning_rate": 1.6132022992768405e-06, | |
| "loss": 0.9781, | |
| "mean_token_accuracy": 0.7472750477492809, | |
| "num_tokens": 553396119.0, | |
| "step": 5220 | |
| }, | |
| { | |
| "entropy": 1.5222928524017334, | |
| "epoch": 0.9698880363476204, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 1.5204895234563323e-06, | |
| "loss": 0.9637, | |
| "mean_token_accuracy": 0.749769189953804, | |
| "num_tokens": 554469233.0, | |
| "step": 5230 | |
| }, | |
| { | |
| "entropy": 1.4755815327167512, | |
| "epoch": 0.9717425067804075, | |
| "grad_norm": 0.60546875, | |
| "learning_rate": 1.4277767476358244e-06, | |
| "loss": 0.9251, | |
| "mean_token_accuracy": 0.7568557754158973, | |
| "num_tokens": 555561027.0, | |
| "step": 5240 | |
| }, | |
| { | |
| "entropy": 1.4556399047374726, | |
| "epoch": 0.9735969772131946, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 1.3350639718153162e-06, | |
| "loss": 0.9428, | |
| "mean_token_accuracy": 0.7547634690999985, | |
| "num_tokens": 556626461.0, | |
| "step": 5250 | |
| }, | |
| { | |
| "entropy": 1.5005512908101082, | |
| "epoch": 0.9754514476459816, | |
| "grad_norm": 0.57421875, | |
| "learning_rate": 1.242351195994808e-06, | |
| "loss": 0.9492, | |
| "mean_token_accuracy": 0.7519237779080867, | |
| "num_tokens": 557717999.0, | |
| "step": 5260 | |
| }, | |
| { | |
| "entropy": 1.48595682233572, | |
| "epoch": 0.9773059180787687, | |
| "grad_norm": 0.55859375, | |
| "learning_rate": 1.1496384201743001e-06, | |
| "loss": 0.9717, | |
| "mean_token_accuracy": 0.747084479033947, | |
| "num_tokens": 558782308.0, | |
| "step": 5270 | |
| }, | |
| { | |
| "entropy": 1.5278505682945251, | |
| "epoch": 0.9791603885115556, | |
| "grad_norm": 0.6328125, | |
| "learning_rate": 1.056925644353792e-06, | |
| "loss": 0.9913, | |
| "mean_token_accuracy": 0.7438154302537441, | |
| "num_tokens": 559841825.0, | |
| "step": 5280 | |
| }, | |
| { | |
| "entropy": 1.470687361061573, | |
| "epoch": 0.9810148589443427, | |
| "grad_norm": 0.59765625, | |
| "learning_rate": 9.64212868533284e-07, | |
| "loss": 0.9738, | |
| "mean_token_accuracy": 0.7483159713447094, | |
| "num_tokens": 560888844.0, | |
| "step": 5290 | |
| }, | |
| { | |
| "entropy": 1.4721205845475196, | |
| "epoch": 0.9828693293771298, | |
| "grad_norm": 0.578125, | |
| "learning_rate": 8.71500092712776e-07, | |
| "loss": 0.9606, | |
| "mean_token_accuracy": 0.7524561606347561, | |
| "num_tokens": 561956928.0, | |
| "step": 5300 | |
| }, | |
| { | |
| "entropy": 1.48909293115139, | |
| "epoch": 0.9847237998099168, | |
| "grad_norm": 0.5390625, | |
| "learning_rate": 7.787873168922678e-07, | |
| "loss": 0.9737, | |
| "mean_token_accuracy": 0.7490025669336319, | |
| "num_tokens": 563027994.0, | |
| "step": 5310 | |
| }, | |
| { | |
| "entropy": 1.5337512582540511, | |
| "epoch": 0.9865782702427038, | |
| "grad_norm": 0.63671875, | |
| "learning_rate": 6.860745410717597e-07, | |
| "loss": 0.9991, | |
| "mean_token_accuracy": 0.74337690025568, | |
| "num_tokens": 564115664.0, | |
| "step": 5320 | |
| }, | |
| { | |
| "entropy": 1.4832084849476814, | |
| "epoch": 0.9884327406754908, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 5.933617652512517e-07, | |
| "loss": 0.9518, | |
| "mean_token_accuracy": 0.7524187438189983, | |
| "num_tokens": 565176091.0, | |
| "step": 5330 | |
| }, | |
| { | |
| "entropy": 1.4854494094848634, | |
| "epoch": 0.9902872111082779, | |
| "grad_norm": 0.59765625, | |
| "learning_rate": 5.006489894307436e-07, | |
| "loss": 0.9526, | |
| "mean_token_accuracy": 0.7515230976045132, | |
| "num_tokens": 566218058.0, | |
| "step": 5340 | |
| }, | |
| { | |
| "entropy": 1.4775687217712403, | |
| "epoch": 0.992141681541065, | |
| "grad_norm": 0.54296875, | |
| "learning_rate": 4.079362136102355e-07, | |
| "loss": 0.9616, | |
| "mean_token_accuracy": 0.7508705228567123, | |
| "num_tokens": 567284029.0, | |
| "step": 5350 | |
| }, | |
| { | |
| "entropy": 1.4860795721411706, | |
| "epoch": 0.9939961519738519, | |
| "grad_norm": 0.609375, | |
| "learning_rate": 3.152234377897274e-07, | |
| "loss": 0.9462, | |
| "mean_token_accuracy": 0.7553019225597382, | |
| "num_tokens": 568350062.0, | |
| "step": 5360 | |
| }, | |
| { | |
| "entropy": 1.5090449512004853, | |
| "epoch": 0.995850622406639, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 2.2251066196921937e-07, | |
| "loss": 0.9625, | |
| "mean_token_accuracy": 0.7491112671792507, | |
| "num_tokens": 569445914.0, | |
| "step": 5370 | |
| }, | |
| { | |
| "entropy": 1.4674130707979203, | |
| "epoch": 0.997705092839426, | |
| "grad_norm": 0.55859375, | |
| "learning_rate": 1.297978861487113e-07, | |
| "loss": 0.9667, | |
| "mean_token_accuracy": 0.7501638606190681, | |
| "num_tokens": 570512526.0, | |
| "step": 5380 | |
| }, | |
| { | |
| "entropy": 1.4686502367258072, | |
| "epoch": 0.9995595632722131, | |
| "grad_norm": 0.55859375, | |
| "learning_rate": 3.708511032820323e-08, | |
| "loss": 0.9287, | |
| "mean_token_accuracy": 0.7583843737840652, | |
| "num_tokens": 571567532.0, | |
| "step": 5390 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 5393, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 7.298444105476997e+18, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
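
For reference, a minimal sketch of how a state file like the one above might be inspected once training finishes. It assumes the file is saved as `trainer_state.json` (the name the Hugging Face Trainer writes inside a checkpoint directory); the field names (`log_history`, `loss`, `step`, `global_step`) mirror the entries shown here, while the file path itself is an assumption.

```python
# Hypothetical sketch: load a trainer_state.json like the one above and pull out
# the logged loss curve. Field names match the file shown; the path is assumed.
import json

with open("trainer_state.json") as f:
    state = json.load(f)

# Each log_history entry holds the metrics logged every `logging_steps` (10) steps.
logged = [entry for entry in state["log_history"] if "loss" in entry]
steps = [entry["step"] for entry in logged]
losses = [entry["loss"] for entry in logged]

print(f"global_step={state['global_step']}, logged points={len(steps)}")
print(f"final loss at step {steps[-1]}: {losses[-1]}")
```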