| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.999081207152449, |
| "eval_steps": 500, |
| "global_step": 1767, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.016962329493250407, |
| "grad_norm": 3.53125, |
| "learning_rate": 1.6666666666666667e-05, |
| "loss": 2.1453, |
| "mean_token_accuracy": 0.5800132380177577, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.033924658986500815, |
| "grad_norm": 2.53125, |
| "learning_rate": 3.518518518518519e-05, |
| "loss": 1.713, |
| "mean_token_accuracy": 0.6173386509219806, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.05088698847975122, |
| "grad_norm": 1.671875, |
| "learning_rate": 5.370370370370371e-05, |
| "loss": 1.5855, |
| "mean_token_accuracy": 0.6299800969660282, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.06784931797300163, |
| "grad_norm": 1.90625, |
| "learning_rate": 7.222222222222222e-05, |
| "loss": 1.5199, |
| "mean_token_accuracy": 0.6394588612020016, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.08481164746625203, |
| "grad_norm": 1.5546875, |
| "learning_rate": 9.074074074074075e-05, |
| "loss": 1.5192, |
| "mean_token_accuracy": 0.6397132301082213, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.10177397695950244, |
| "grad_norm": 1.6875, |
| "learning_rate": 9.999789785826972e-05, |
| "loss": 1.5276, |
| "mean_token_accuracy": 0.6391662692030271, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.11873630645275285, |
| "grad_norm": 1.7734375, |
| "learning_rate": 9.998108178497258e-05, |
| "loss": 1.5183, |
| "mean_token_accuracy": 0.6404511784513791, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.13569863594600326, |
| "grad_norm": 6.3125, |
| "learning_rate": 9.99474552942225e-05, |
| "loss": 1.5112, |
| "mean_token_accuracy": 0.6418155938386917, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.15266096543925367, |
| "grad_norm": 1.4765625, |
| "learning_rate": 9.989702969580565e-05, |
| "loss": 1.5161, |
| "mean_token_accuracy": 0.6407449401915073, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.16962329493250405, |
| "grad_norm": 1.421875, |
| "learning_rate": 9.982982194964625e-05, |
| "loss": 1.4945, |
| "mean_token_accuracy": 0.6452280322710673, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.18658562442575446, |
| "grad_norm": 1.3203125, |
| "learning_rate": 9.974585466010236e-05, |
| "loss": 1.4927, |
| "mean_token_accuracy": 0.6472473913182815, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.20354795391900488, |
| "grad_norm": 1.390625, |
| "learning_rate": 9.964515606836321e-05, |
| "loss": 1.4876, |
| "mean_token_accuracy": 0.646234019100666, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.2205102834122553, |
| "grad_norm": 1.390625, |
| "learning_rate": 9.952776004295077e-05, |
| "loss": 1.4768, |
| "mean_token_accuracy": 0.6493511145313581, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.2374726129055057, |
| "grad_norm": 1.4375, |
| "learning_rate": 9.939370606832841e-05, |
| "loss": 1.4627, |
| "mean_token_accuracy": 0.6511956502993902, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.2544349423987561, |
| "grad_norm": 1.3515625, |
| "learning_rate": 9.924303923162097e-05, |
| "loss": 1.4855, |
| "mean_token_accuracy": 0.650639555354913, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.2713972718920065, |
| "grad_norm": 1.1875, |
| "learning_rate": 9.907581020745037e-05, |
| "loss": 1.4492, |
| "mean_token_accuracy": 0.6547389343380928, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.2883596013852569, |
| "grad_norm": 1.2734375, |
| "learning_rate": 9.889207524089187e-05, |
| "loss": 1.4298, |
| "mean_token_accuracy": 0.65942026724418, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.30532193087850734, |
| "grad_norm": 1.2109375, |
| "learning_rate": 9.869189612855685e-05, |
| "loss": 1.4457, |
| "mean_token_accuracy": 0.6557608020802339, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.3222842603717577, |
| "grad_norm": 1.1328125, |
| "learning_rate": 9.847534019780848e-05, |
| "loss": 1.4136, |
| "mean_token_accuracy": 0.6603276548286279, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.3392465898650081, |
| "grad_norm": 1.2421875, |
| "learning_rate": 9.824248028411703e-05, |
| "loss": 1.4263, |
| "mean_token_accuracy": 0.6607817115883032, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.35620891935825855, |
| "grad_norm": 1.1015625, |
| "learning_rate": 9.79933947065628e-05, |
| "loss": 1.4148, |
| "mean_token_accuracy": 0.6610926086703937, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.37317124885150893, |
| "grad_norm": 1.046875, |
| "learning_rate": 9.772816724149459e-05, |
| "loss": 1.3812, |
| "mean_token_accuracy": 0.6686675310134887, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.39013357834475937, |
| "grad_norm": 1.1328125, |
| "learning_rate": 9.744688709435268e-05, |
| "loss": 1.4174, |
| "mean_token_accuracy": 0.6620935648679733, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.40709590783800975, |
| "grad_norm": 1.125, |
| "learning_rate": 9.714964886966598e-05, |
| "loss": 1.4102, |
| "mean_token_accuracy": 0.6628513303895791, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.42405823733126013, |
| "grad_norm": 1.1484375, |
| "learning_rate": 9.6836552539233e-05, |
| "loss": 1.3909, |
| "mean_token_accuracy": 0.6663404104610284, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.4410205668245106, |
| "grad_norm": 1.03125, |
| "learning_rate": 9.650770340849796e-05, |
| "loss": 1.4062, |
| "mean_token_accuracy": 0.6653375633060932, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.45798289631776096, |
| "grad_norm": 1.265625, |
| "learning_rate": 9.616321208113262e-05, |
| "loss": 1.4091, |
| "mean_token_accuracy": 0.6641857360800107, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.4749452258110114, |
| "grad_norm": 1.078125, |
| "learning_rate": 9.580319442183654e-05, |
| "loss": 1.3828, |
| "mean_token_accuracy": 0.6680692491432031, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.4919075553042618, |
| "grad_norm": 1.1484375, |
| "learning_rate": 9.542777151736746e-05, |
| "loss": 1.3675, |
| "mean_token_accuracy": 0.6707451656460762, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.5088698847975122, |
| "grad_norm": 1.0625, |
| "learning_rate": 9.503706963581562e-05, |
| "loss": 1.3782, |
| "mean_token_accuracy": 0.6665912042061488, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.5258322142907625, |
| "grad_norm": 1.171875, |
| "learning_rate": 9.463122018413532e-05, |
| "loss": 1.3536, |
| "mean_token_accuracy": 0.6734739691019058, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.542794543784013, |
| "grad_norm": 1.015625, |
| "learning_rate": 9.4210359663948e-05, |
| "loss": 1.3887, |
| "mean_token_accuracy": 0.668233826259772, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.5597568732772634, |
| "grad_norm": 1.0078125, |
| "learning_rate": 9.377462962563195e-05, |
| "loss": 1.3819, |
| "mean_token_accuracy": 0.6673479390641054, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.5767192027705138, |
| "grad_norm": 1.015625, |
| "learning_rate": 9.332417662071385e-05, |
| "loss": 1.3679, |
| "mean_token_accuracy": 0.6706842251121998, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.5936815322637642, |
| "grad_norm": 0.9609375, |
| "learning_rate": 9.285915215257828e-05, |
| "loss": 1.3615, |
| "mean_token_accuracy": 0.6692039887110393, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.6106438617570147, |
| "grad_norm": 1.078125, |
| "learning_rate": 9.237971262551175e-05, |
| "loss": 1.3566, |
| "mean_token_accuracy": 0.6739495868484179, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.6276061912502651, |
| "grad_norm": 1.1875, |
| "learning_rate": 9.188601929209835e-05, |
| "loss": 1.3103, |
| "mean_token_accuracy": 0.6818199207385381, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.6445685207435154, |
| "grad_norm": 1.0, |
| "learning_rate": 9.137823819898477e-05, |
| "loss": 1.3527, |
| "mean_token_accuracy": 0.6732211743791898, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.6615308502367658, |
| "grad_norm": 0.9453125, |
| "learning_rate": 9.08565401310329e-05, |
| "loss": 1.3332, |
| "mean_token_accuracy": 0.6759632855653763, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.6784931797300162, |
| "grad_norm": 0.98046875, |
| "learning_rate": 9.03211005538788e-05, |
| "loss": 1.3457, |
| "mean_token_accuracy": 0.6737116026381652, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.6954555092232667, |
| "grad_norm": 1.078125, |
| "learning_rate": 8.977209955491739e-05, |
| "loss": 1.3713, |
| "mean_token_accuracy": 0.6700817617277305, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.7124178387165171, |
| "grad_norm": 1.03125, |
| "learning_rate": 8.920972178273257e-05, |
| "loss": 1.3243, |
| "mean_token_accuracy": 0.6765588760375977, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.7293801682097675, |
| "grad_norm": 0.98046875, |
| "learning_rate": 8.863415638499341e-05, |
| "loss": 1.3272, |
| "mean_token_accuracy": 0.6768454472223918, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.7463424977030179, |
| "grad_norm": 1.015625, |
| "learning_rate": 8.8045596944837e-05, |
| "loss": 1.3411, |
| "mean_token_accuracy": 0.6759745722015699, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.7633048271962682, |
| "grad_norm": 1.0, |
| "learning_rate": 8.744424141575959e-05, |
| "loss": 1.3138, |
| "mean_token_accuracy": 0.6793065622448922, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.7802671566895187, |
| "grad_norm": 0.9765625, |
| "learning_rate": 8.683029205503773e-05, |
| "loss": 1.3562, |
| "mean_token_accuracy": 0.6727576293051243, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.7972294861827691, |
| "grad_norm": 1.0546875, |
| "learning_rate": 8.620395535570198e-05, |
| "loss": 1.347, |
| "mean_token_accuracy": 0.6732823781669139, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.8141918156760195, |
| "grad_norm": 0.93359375, |
| "learning_rate": 8.556544197708596e-05, |
| "loss": 1.3011, |
| "mean_token_accuracy": 0.6819205803175767, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.8311541451692699, |
| "grad_norm": 0.96484375, |
| "learning_rate": 8.491496667397408e-05, |
| "loss": 1.3051, |
| "mean_token_accuracy": 0.6827595402797063, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.8481164746625203, |
| "grad_norm": 1.1015625, |
| "learning_rate": 8.42527482243719e-05, |
| "loss": 1.2991, |
| "mean_token_accuracy": 0.6841210166613261, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.8650788041557708, |
| "grad_norm": 0.9375, |
| "learning_rate": 8.357900935592327e-05, |
| "loss": 1.3055, |
| "mean_token_accuracy": 0.679613892485698, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.8820411336490211, |
| "grad_norm": 0.95703125, |
| "learning_rate": 8.289397667099909e-05, |
| "loss": 1.3155, |
| "mean_token_accuracy": 0.6812887417773406, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.8990034631422715, |
| "grad_norm": 0.9453125, |
| "learning_rate": 8.219788057048286e-05, |
| "loss": 1.3154, |
| "mean_token_accuracy": 0.6792417210837205, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.9159657926355219, |
| "grad_norm": 0.9375, |
| "learning_rate": 8.149095517627871e-05, |
| "loss": 1.3034, |
| "mean_token_accuracy": 0.6806264075140158, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.9329281221287723, |
| "grad_norm": 0.8984375, |
| "learning_rate": 8.077343825256783e-05, |
| "loss": 1.3126, |
| "mean_token_accuracy": 0.6810529338816802, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.9498904516220228, |
| "grad_norm": 0.9609375, |
| "learning_rate": 8.004557112583986e-05, |
| "loss": 1.3134, |
| "mean_token_accuracy": 0.6798362337052822, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.9668527811152732, |
| "grad_norm": 0.921875, |
| "learning_rate": 7.930759860372628e-05, |
| "loss": 1.2856, |
| "mean_token_accuracy": 0.683349988112847, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.9838151106085236, |
| "grad_norm": 0.88671875, |
| "learning_rate": 7.855976889266288e-05, |
| "loss": 1.2901, |
| "mean_token_accuracy": 0.6849393486976624, |
| "step": 580 |
| }, |
| { |
| "epoch": 1.001696232949325, |
| "grad_norm": 2.609375, |
| "learning_rate": 7.780233351440903e-05, |
| "loss": 1.395, |
| "mean_token_accuracy": 0.6899135421733467, |
| "step": 590 |
| }, |
| { |
| "epoch": 1.0186585624425755, |
| "grad_norm": 0.93359375, |
| "learning_rate": 7.703554722145201e-05, |
| "loss": 1.1017, |
| "mean_token_accuracy": 0.7215727421144644, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.0356208919358258, |
| "grad_norm": 0.9609375, |
| "learning_rate": 7.625966791132468e-05, |
| "loss": 1.0907, |
| "mean_token_accuracy": 0.7208627772827944, |
| "step": 610 |
| }, |
| { |
| "epoch": 1.0525832214290762, |
| "grad_norm": 0.99609375, |
| "learning_rate": 7.547495653986536e-05, |
| "loss": 1.0893, |
| "mean_token_accuracy": 0.7245491112271945, |
| "step": 620 |
| }, |
| { |
| "epoch": 1.0695455509223266, |
| "grad_norm": 1.015625, |
| "learning_rate": 7.468167703344902e-05, |
| "loss": 1.0853, |
| "mean_token_accuracy": 0.7234922610223293, |
| "step": 630 |
| }, |
| { |
| "epoch": 1.086507880415577, |
| "grad_norm": 0.92578125, |
| "learning_rate": 7.388009620021959e-05, |
| "loss": 1.1004, |
| "mean_token_accuracy": 0.7199652560055256, |
| "step": 640 |
| }, |
| { |
| "epoch": 1.1034702099088274, |
| "grad_norm": 0.94921875, |
| "learning_rate": 7.307048364035266e-05, |
| "loss": 1.1206, |
| "mean_token_accuracy": 0.719177692135175, |
| "step": 650 |
| }, |
| { |
| "epoch": 1.120432539402078, |
| "grad_norm": 0.9375, |
| "learning_rate": 7.225311165537956e-05, |
| "loss": 1.0905, |
| "mean_token_accuracy": 0.7229047452410062, |
| "step": 660 |
| }, |
| { |
| "epoch": 1.1373948688953284, |
| "grad_norm": 0.90234375, |
| "learning_rate": 7.142825515660259e-05, |
| "loss": 1.1184, |
| "mean_token_accuracy": 0.7195753792921702, |
| "step": 670 |
| }, |
| { |
| "epoch": 1.1543571983885788, |
| "grad_norm": 0.9453125, |
| "learning_rate": 7.059619157263245e-05, |
| "loss": 1.1152, |
| "mean_token_accuracy": 0.7191205089290936, |
| "step": 680 |
| }, |
| { |
| "epoch": 1.1713195278818291, |
| "grad_norm": 0.984375, |
| "learning_rate": 6.975720075607927e-05, |
| "loss": 1.1029, |
| "mean_token_accuracy": 0.7206906000773112, |
| "step": 690 |
| }, |
| { |
| "epoch": 1.1882818573750795, |
| "grad_norm": 0.953125, |
| "learning_rate": 6.891156488942811e-05, |
| "loss": 1.0929, |
| "mean_token_accuracy": 0.7221428496142228, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.20524418686833, |
| "grad_norm": 1.03125, |
| "learning_rate": 6.805956839013107e-05, |
| "loss": 1.1047, |
| "mean_token_accuracy": 0.7189865835011006, |
| "step": 710 |
| }, |
| { |
| "epoch": 1.2222065163615803, |
| "grad_norm": 0.97265625, |
| "learning_rate": 6.720149781494738e-05, |
| "loss": 1.1151, |
| "mean_token_accuracy": 0.7195135744909446, |
| "step": 720 |
| }, |
| { |
| "epoch": 1.2391688458548307, |
| "grad_norm": 1.0, |
| "learning_rate": 6.633764176356434e-05, |
| "loss": 1.0717, |
| "mean_token_accuracy": 0.7264206613103549, |
| "step": 730 |
| }, |
| { |
| "epoch": 1.2561311753480813, |
| "grad_norm": 1.0625, |
| "learning_rate": 6.546829078153086e-05, |
| "loss": 1.109, |
| "mean_token_accuracy": 0.718661529570818, |
| "step": 740 |
| }, |
| { |
| "epoch": 1.2730935048413317, |
| "grad_norm": 0.94140625, |
| "learning_rate": 6.459373726253672e-05, |
| "loss": 1.0936, |
| "mean_token_accuracy": 0.7220857585469882, |
| "step": 750 |
| }, |
| { |
| "epoch": 1.290055834334582, |
| "grad_norm": 0.9453125, |
| "learning_rate": 6.371427535007008e-05, |
| "loss": 1.0909, |
| "mean_token_accuracy": 0.7229609449704488, |
| "step": 760 |
| }, |
| { |
| "epoch": 1.3070181638278324, |
| "grad_norm": 0.8984375, |
| "learning_rate": 6.283020083848661e-05, |
| "loss": 1.1011, |
| "mean_token_accuracy": 0.7228824739654859, |
| "step": 770 |
| }, |
| { |
| "epoch": 1.3239804933210828, |
| "grad_norm": 0.91015625, |
| "learning_rate": 6.194181107352331e-05, |
| "loss": 1.0762, |
| "mean_token_accuracy": 0.7260710549851258, |
| "step": 780 |
| }, |
| { |
| "epoch": 1.3409428228143332, |
| "grad_norm": 0.88671875, |
| "learning_rate": 6.104940485229054e-05, |
| "loss": 1.097, |
| "mean_token_accuracy": 0.7203553736209869, |
| "step": 790 |
| }, |
| { |
| "epoch": 1.3579051523075836, |
| "grad_norm": 1.0234375, |
| "learning_rate": 6.015328232277593e-05, |
| "loss": 1.1041, |
| "mean_token_accuracy": 0.722166525820891, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.374867481800834, |
| "grad_norm": 0.97265625, |
| "learning_rate": 5.925374488289388e-05, |
| "loss": 1.096, |
| "mean_token_accuracy": 0.7233567799131075, |
| "step": 810 |
| }, |
| { |
| "epoch": 1.3918298112940843, |
| "grad_norm": 0.96875, |
| "learning_rate": 5.8351095079114745e-05, |
| "loss": 1.103, |
| "mean_token_accuracy": 0.721570813159148, |
| "step": 820 |
| }, |
| { |
| "epoch": 1.4087921407873347, |
| "grad_norm": 0.95703125, |
| "learning_rate": 5.74456365047077e-05, |
| "loss": 1.1058, |
| "mean_token_accuracy": 0.7204289863506953, |
| "step": 830 |
| }, |
| { |
| "epoch": 1.425754470280585, |
| "grad_norm": 1.0859375, |
| "learning_rate": 5.653767369763148e-05, |
| "loss": 1.0895, |
| "mean_token_accuracy": 0.7256171715756258, |
| "step": 840 |
| }, |
| { |
| "epoch": 1.4427167997738355, |
| "grad_norm": 0.953125, |
| "learning_rate": 5.562751203810742e-05, |
| "loss": 1.0696, |
| "mean_token_accuracy": 0.72752467567722, |
| "step": 850 |
| }, |
| { |
| "epoch": 1.459679129267086, |
| "grad_norm": 0.95703125, |
| "learning_rate": 5.471545764590924e-05, |
| "loss": 1.0586, |
| "mean_token_accuracy": 0.7292891172071297, |
| "step": 860 |
| }, |
| { |
| "epoch": 1.4766414587603365, |
| "grad_norm": 0.92578125, |
| "learning_rate": 5.3801817277404066e-05, |
| "loss": 1.0876, |
| "mean_token_accuracy": 0.7224949277937412, |
| "step": 870 |
| }, |
| { |
| "epoch": 1.4936037882535869, |
| "grad_norm": 0.921875, |
| "learning_rate": 5.28868982223793e-05, |
| "loss": 1.0675, |
| "mean_token_accuracy": 0.7286781263848146, |
| "step": 880 |
| }, |
| { |
| "epoch": 1.5105661177468372, |
| "grad_norm": 0.94140625, |
| "learning_rate": 5.197100820069016e-05, |
| "loss": 1.0845, |
| "mean_token_accuracy": 0.7255125172436238, |
| "step": 890 |
| }, |
| { |
| "epoch": 1.5275284472400876, |
| "grad_norm": 0.94140625, |
| "learning_rate": 5.1054455258762535e-05, |
| "loss": 1.0998, |
| "mean_token_accuracy": 0.7214603280027707, |
| "step": 900 |
| }, |
| { |
| "epoch": 1.544490776733338, |
| "grad_norm": 0.99609375, |
| "learning_rate": 5.0137547665985985e-05, |
| "loss": 1.1018, |
| "mean_token_accuracy": 0.7225766807794571, |
| "step": 910 |
| }, |
| { |
| "epoch": 1.5614531062265886, |
| "grad_norm": 0.9453125, |
| "learning_rate": 4.9220593811031786e-05, |
| "loss": 1.0482, |
| "mean_token_accuracy": 0.7321401789784432, |
| "step": 920 |
| }, |
| { |
| "epoch": 1.578415435719839, |
| "grad_norm": 0.9921875, |
| "learning_rate": 4.83039020981308e-05, |
| "loss": 1.0618, |
| "mean_token_accuracy": 0.7308160757025083, |
| "step": 930 |
| }, |
| { |
| "epoch": 1.5953777652130894, |
| "grad_norm": 0.9453125, |
| "learning_rate": 4.738778084334625e-05, |
| "loss": 1.079, |
| "mean_token_accuracy": 0.7259436552723249, |
| "step": 940 |
| }, |
| { |
| "epoch": 1.6123400947063398, |
| "grad_norm": 0.9921875, |
| "learning_rate": 4.6472538170875924e-05, |
| "loss": 1.054, |
| "mean_token_accuracy": 0.7302558933695157, |
| "step": 950 |
| }, |
| { |
| "epoch": 1.6293024241995901, |
| "grad_norm": 0.984375, |
| "learning_rate": 4.5558481909419095e-05, |
| "loss": 1.0522, |
| "mean_token_accuracy": 0.7308388692637284, |
| "step": 960 |
| }, |
| { |
| "epoch": 1.6462647536928405, |
| "grad_norm": 0.93359375, |
| "learning_rate": 4.46459194886428e-05, |
| "loss": 1.0886, |
| "mean_token_accuracy": 0.7265370438496271, |
| "step": 970 |
| }, |
| { |
| "epoch": 1.663227083186091, |
| "grad_norm": 0.9375, |
| "learning_rate": 4.373515783578226e-05, |
| "loss": 1.0657, |
| "mean_token_accuracy": 0.7278412433962027, |
| "step": 980 |
| }, |
| { |
| "epoch": 1.6801894126793413, |
| "grad_norm": 0.90234375, |
| "learning_rate": 4.2826503272410304e-05, |
| "loss": 1.0636, |
| "mean_token_accuracy": 0.7297646810611089, |
| "step": 990 |
| }, |
| { |
| "epoch": 1.6971517421725917, |
| "grad_norm": 0.90625, |
| "learning_rate": 4.1920261411410536e-05, |
| "loss": 1.0802, |
| "mean_token_accuracy": 0.7267571208377679, |
| "step": 1000 |
| }, |
| { |
| "epoch": 1.714114071665842, |
| "grad_norm": 0.9921875, |
| "learning_rate": 4.101673705418888e-05, |
| "loss": 1.0609, |
| "mean_token_accuracy": 0.7287999058763186, |
| "step": 1010 |
| }, |
| { |
| "epoch": 1.7310764011590924, |
| "grad_norm": 1.046875, |
| "learning_rate": 4.011623408815799e-05, |
| "loss": 1.0515, |
| "mean_token_accuracy": 0.7311748243868351, |
| "step": 1020 |
| }, |
| { |
| "epoch": 1.7480387306523428, |
| "grad_norm": 0.8984375, |
| "learning_rate": 3.9219055384529e-05, |
| "loss": 1.0815, |
| "mean_token_accuracy": 0.7260699895521004, |
| "step": 1030 |
| }, |
| { |
| "epoch": 1.7650010601455932, |
| "grad_norm": 0.95703125, |
| "learning_rate": 3.83255026964453e-05, |
| "loss": 1.0436, |
| "mean_token_accuracy": 0.7335223399102688, |
| "step": 1040 |
| }, |
| { |
| "epoch": 1.7819633896388436, |
| "grad_norm": 0.89453125, |
| "learning_rate": 3.7435876557492156e-05, |
| "loss": 1.0713, |
| "mean_token_accuracy": 0.7298086928824584, |
| "step": 1050 |
| }, |
| { |
| "epoch": 1.798925719132094, |
| "grad_norm": 1.0, |
| "learning_rate": 3.655047618061648e-05, |
| "loss": 1.0633, |
| "mean_token_accuracy": 0.7291242313881715, |
| "step": 1060 |
| }, |
| { |
| "epoch": 1.8158880486253446, |
| "grad_norm": 0.96875, |
| "learning_rate": 3.566959935749101e-05, |
| "loss": 1.062, |
| "mean_token_accuracy": 0.7298740123709043, |
| "step": 1070 |
| }, |
| { |
| "epoch": 1.832850378118595, |
| "grad_norm": 0.94921875, |
| "learning_rate": 3.479354235835622e-05, |
| "loss": 1.041, |
| "mean_token_accuracy": 0.7348680111269156, |
| "step": 1080 |
| }, |
| { |
| "epoch": 1.8498127076118454, |
| "grad_norm": 0.9375, |
| "learning_rate": 3.3922599832374226e-05, |
| "loss": 1.0601, |
| "mean_token_accuracy": 0.7297768058876196, |
| "step": 1090 |
| }, |
| { |
| "epoch": 1.8667750371050957, |
| "grad_norm": 0.9296875, |
| "learning_rate": 3.3057064708527686e-05, |
| "loss": 1.0516, |
| "mean_token_accuracy": 0.7326766779025395, |
| "step": 1100 |
| }, |
| { |
| "epoch": 1.8837373665983461, |
| "grad_norm": 0.90234375, |
| "learning_rate": 3.2197228097097346e-05, |
| "loss": 1.0737, |
| "mean_token_accuracy": 0.7286487720906735, |
| "step": 1110 |
| }, |
| { |
| "epoch": 1.9006996960915967, |
| "grad_norm": 0.90625, |
| "learning_rate": 3.1343379191751364e-05, |
| "loss": 1.0685, |
| "mean_token_accuracy": 0.7287176544467608, |
| "step": 1120 |
| }, |
| { |
| "epoch": 1.917662025584847, |
| "grad_norm": 0.83203125, |
| "learning_rate": 3.0495805172279167e-05, |
| "loss": 1.0279, |
| "mean_token_accuracy": 0.7362240366637707, |
| "step": 1130 |
| }, |
| { |
| "epoch": 1.9346243550780975, |
| "grad_norm": 0.9765625, |
| "learning_rate": 2.9654791108002567e-05, |
| "loss": 1.0658, |
| "mean_token_accuracy": 0.7271889204780261, |
| "step": 1140 |
| }, |
| { |
| "epoch": 1.9515866845713479, |
| "grad_norm": 0.96875, |
| "learning_rate": 2.8820619861896907e-05, |
| "loss": 1.0521, |
| "mean_token_accuracy": 0.7311085325976213, |
| "step": 1150 |
| }, |
| { |
| "epoch": 1.9685490140645983, |
| "grad_norm": 0.875, |
| "learning_rate": 2.7993571995454126e-05, |
| "loss": 1.0457, |
| "mean_token_accuracy": 0.7344075481096903, |
| "step": 1160 |
| }, |
| { |
| "epoch": 1.9855113435578486, |
| "grad_norm": 0.92578125, |
| "learning_rate": 2.7173925674319957e-05, |
| "loss": 1.0498, |
| "mean_token_accuracy": 0.7325132201115291, |
| "step": 1170 |
| }, |
| { |
| "epoch": 2.00339246589865, |
| "grad_norm": 1.40625, |
| "learning_rate": 2.6361956574736868e-05, |
| "loss": 1.139, |
| "mean_token_accuracy": 0.7362643013195116, |
| "step": 1180 |
| }, |
| { |
| "epoch": 2.0203547953919005, |
| "grad_norm": 1.0546875, |
| "learning_rate": 2.5557937790824382e-05, |
| "loss": 0.8593, |
| "mean_token_accuracy": 0.774844840914011, |
| "step": 1190 |
| }, |
| { |
| "epoch": 2.037317124885151, |
| "grad_norm": 1.0234375, |
| "learning_rate": 2.4762139742727797e-05, |
| "loss": 0.8657, |
| "mean_token_accuracy": 0.7745203738411267, |
| "step": 1200 |
| }, |
| { |
| "epoch": 2.0542794543784013, |
| "grad_norm": 0.9765625, |
| "learning_rate": 2.3974830085666237e-05, |
| "loss": 0.8389, |
| "mean_token_accuracy": 0.7786116595069568, |
| "step": 1210 |
| }, |
| { |
| "epoch": 2.0712417838716517, |
| "grad_norm": 1.0234375, |
| "learning_rate": 2.3196273619910637e-05, |
| "loss": 0.8757, |
| "mean_token_accuracy": 0.7714692994952201, |
| "step": 1220 |
| }, |
| { |
| "epoch": 2.088204113364902, |
| "grad_norm": 1.03125, |
| "learning_rate": 2.242673220172209e-05, |
| "loss": 0.869, |
| "mean_token_accuracy": 0.7752320093413194, |
| "step": 1230 |
| }, |
| { |
| "epoch": 2.1051664428581525, |
| "grad_norm": 1.0078125, |
| "learning_rate": 2.1666464655280133e-05, |
| "loss": 0.8593, |
| "mean_token_accuracy": 0.7770325655738513, |
| "step": 1240 |
| }, |
| { |
| "epoch": 2.122128772351403, |
| "grad_norm": 0.9765625, |
| "learning_rate": 2.0915726685631075e-05, |
| "loss": 0.8809, |
| "mean_token_accuracy": 0.7723904815812905, |
| "step": 1250 |
| }, |
| { |
| "epoch": 2.1390911018446532, |
| "grad_norm": 1.0625, |
| "learning_rate": 2.0174770792685243e-05, |
| "loss": 0.8523, |
| "mean_token_accuracy": 0.7762084101637204, |
| "step": 1260 |
| }, |
| { |
| "epoch": 2.1560534313379036, |
| "grad_norm": 1.015625, |
| "learning_rate": 1.9443846186292202e-05, |
| "loss": 0.8619, |
| "mean_token_accuracy": 0.7747002402941386, |
| "step": 1270 |
| }, |
| { |
| "epoch": 2.173015760831154, |
| "grad_norm": 0.9453125, |
| "learning_rate": 1.8723198702422724e-05, |
| "loss": 0.8542, |
| "mean_token_accuracy": 0.7765256710350513, |
| "step": 1280 |
| }, |
| { |
| "epoch": 2.1899780903244044, |
| "grad_norm": 0.921875, |
| "learning_rate": 1.8013070720485354e-05, |
| "loss": 0.8496, |
| "mean_token_accuracy": 0.7781426074604193, |
| "step": 1290 |
| }, |
| { |
| "epoch": 2.2069404198176548, |
| "grad_norm": 0.96875, |
| "learning_rate": 1.7313701081805506e-05, |
| "loss": 0.8945, |
| "mean_token_accuracy": 0.7692260307570299, |
| "step": 1300 |
| }, |
| { |
| "epoch": 2.2239027493109056, |
| "grad_norm": 0.984375, |
| "learning_rate": 1.6625325009294774e-05, |
| "loss": 0.8587, |
| "mean_token_accuracy": 0.7756107933819294, |
| "step": 1310 |
| }, |
| { |
| "epoch": 2.240865078804156, |
| "grad_norm": 1.0078125, |
| "learning_rate": 1.594817402833693e-05, |
| "loss": 0.8394, |
| "mean_token_accuracy": 0.7796109855175019, |
| "step": 1320 |
| }, |
| { |
| "epoch": 2.2578274082974064, |
| "grad_norm": 1.0078125, |
| "learning_rate": 1.5282475888917835e-05, |
| "loss": 0.8728, |
| "mean_token_accuracy": 0.7744500560065111, |
| "step": 1330 |
| }, |
| { |
| "epoch": 2.2747897377906567, |
| "grad_norm": 1.0, |
| "learning_rate": 1.4628454489024934e-05, |
| "loss": 0.8518, |
| "mean_token_accuracy": 0.777750201523304, |
| "step": 1340 |
| }, |
| { |
| "epoch": 2.291752067283907, |
| "grad_norm": 1.0859375, |
| "learning_rate": 1.398632979934235e-05, |
| "loss": 0.8548, |
| "mean_token_accuracy": 0.7758344347278278, |
| "step": 1350 |
| }, |
| { |
| "epoch": 2.3087143967771575, |
| "grad_norm": 1.0390625, |
| "learning_rate": 1.335631778926702e-05, |
| "loss": 0.8651, |
| "mean_token_accuracy": 0.7755650137861569, |
| "step": 1360 |
| }, |
| { |
| "epoch": 2.325676726270408, |
| "grad_norm": 1.0078125, |
| "learning_rate": 1.2738630354270437e-05, |
| "loss": 0.8697, |
| "mean_token_accuracy": 0.7738447397947311, |
| "step": 1370 |
| }, |
| { |
| "epoch": 2.3426390557636583, |
| "grad_norm": 0.984375, |
| "learning_rate": 1.2133475244630615e-05, |
| "loss": 0.8657, |
| "mean_token_accuracy": 0.7746792284150918, |
| "step": 1380 |
| }, |
| { |
| "epoch": 2.3596013852569087, |
| "grad_norm": 0.984375, |
| "learning_rate": 1.154105599555837e-05, |
| "loss": 0.8817, |
| "mean_token_accuracy": 0.7732328993578752, |
| "step": 1390 |
| }, |
| { |
| "epoch": 2.376563714750159, |
| "grad_norm": 0.9921875, |
| "learning_rate": 1.0961571858741088e-05, |
| "loss": 0.861, |
| "mean_token_accuracy": 0.7760703690350056, |
| "step": 1400 |
| }, |
| { |
| "epoch": 2.3935260442434094, |
| "grad_norm": 1.0625, |
| "learning_rate": 1.0395217735327362e-05, |
| "loss": 0.8615, |
| "mean_token_accuracy": 0.775514493137598, |
| "step": 1410 |
| }, |
| { |
| "epoch": 2.41048837373666, |
| "grad_norm": 0.984375, |
| "learning_rate": 9.84218411037477e-06, |
| "loss": 0.874, |
| "mean_token_accuracy": 0.7736226240793864, |
| "step": 1420 |
| }, |
| { |
| "epoch": 2.42745070322991, |
| "grad_norm": 1.0, |
| "learning_rate": 9.30265698878291e-06, |
| "loss": 0.8676, |
| "mean_token_accuracy": 0.7738830464581649, |
| "step": 1430 |
| }, |
| { |
| "epoch": 2.4444130327231606, |
| "grad_norm": 1.5, |
| "learning_rate": 8.776817832733436e-06, |
| "loss": 0.8733, |
| "mean_token_accuracy": 0.7751010999083519, |
| "step": 1440 |
| }, |
| { |
| "epoch": 2.461375362216411, |
| "grad_norm": 0.98046875, |
| "learning_rate": 8.264843500657799e-06, |
| "loss": 0.8521, |
| "mean_token_accuracy": 0.7777772823969523, |
| "step": 1450 |
| }, |
| { |
| "epoch": 2.4783376917096613, |
| "grad_norm": 0.9765625, |
| "learning_rate": 7.766906187753442e-06, |
| "loss": 0.8635, |
| "mean_token_accuracy": 0.773188495139281, |
| "step": 1460 |
| }, |
| { |
| "epoch": 2.4953000212029117, |
| "grad_norm": 0.96875, |
| "learning_rate": 7.283173368068497e-06, |
| "loss": 0.844, |
| "mean_token_accuracy": 0.7778890219827493, |
| "step": 1470 |
| }, |
| { |
| "epoch": 2.5122623506961625, |
| "grad_norm": 1.0546875, |
| "learning_rate": 6.8138077381742e-06, |
| "loss": 0.8645, |
| "mean_token_accuracy": 0.7747706746061643, |
| "step": 1480 |
| }, |
| { |
| "epoch": 2.5292246801894125, |
| "grad_norm": 1.015625, |
| "learning_rate": 6.3589671624443405e-06, |
| "loss": 0.868, |
| "mean_token_accuracy": 0.774104047069947, |
| "step": 1490 |
| }, |
| { |
| "epoch": 2.5461870096826633, |
| "grad_norm": 1.0234375, |
| "learning_rate": 5.918804619959806e-06, |
| "loss": 0.8732, |
| "mean_token_accuracy": 0.7745006288091342, |
| "step": 1500 |
| }, |
| { |
| "epoch": 2.5631493391759133, |
| "grad_norm": 0.98828125, |
| "learning_rate": 5.493468153056236e-06, |
| "loss": 0.8811, |
| "mean_token_accuracy": 0.7711024150252342, |
| "step": 1510 |
| }, |
| { |
| "epoch": 2.580111668669164, |
| "grad_norm": 0.96484375, |
| "learning_rate": 5.083100817532177e-06, |
| "loss": 0.8709, |
| "mean_token_accuracy": 0.7753738241891066, |
| "step": 1520 |
| }, |
| { |
| "epoch": 2.5970739981624145, |
| "grad_norm": 0.98046875, |
| "learning_rate": 4.687840634534302e-06, |
| "loss": 0.8596, |
| "mean_token_accuracy": 0.7756442760427793, |
| "step": 1530 |
| }, |
| { |
| "epoch": 2.614036327655665, |
| "grad_norm": 1.0234375, |
| "learning_rate": 4.307820544135937e-06, |
| "loss": 0.8553, |
| "mean_token_accuracy": 0.7786166049540043, |
| "step": 1540 |
| }, |
| { |
| "epoch": 2.6309986571489152, |
| "grad_norm": 1.03125, |
| "learning_rate": 3.943168360624672e-06, |
| "loss": 0.8614, |
| "mean_token_accuracy": 0.7758901623388131, |
| "step": 1550 |
| }, |
| { |
| "epoch": 2.6479609866421656, |
| "grad_norm": 1.0234375, |
| "learning_rate": 3.594006729513771e-06, |
| "loss": 0.8507, |
| "mean_token_accuracy": 0.7781767211854458, |
| "step": 1560 |
| }, |
| { |
| "epoch": 2.664923316135416, |
| "grad_norm": 1.0078125, |
| "learning_rate": 3.260453086292187e-06, |
| "loss": 0.8734, |
| "mean_token_accuracy": 0.7742785550653934, |
| "step": 1570 |
| }, |
| { |
| "epoch": 2.6818856456286664, |
| "grad_norm": 1.078125, |
| "learning_rate": 2.942619616926806e-06, |
| "loss": 0.8792, |
| "mean_token_accuracy": 0.772778149942557, |
| "step": 1580 |
| }, |
| { |
| "epoch": 2.6988479751219168, |
| "grad_norm": 1.0, |
| "learning_rate": 2.640613220130278e-06, |
| "loss": 0.8412, |
| "mean_token_accuracy": 0.7816825255751609, |
| "step": 1590 |
| }, |
| { |
| "epoch": 2.715810304615167, |
| "grad_norm": 0.96875, |
| "learning_rate": 2.3545354714072264e-06, |
| "loss": 0.8622, |
| "mean_token_accuracy": 0.7764831451078256, |
| "step": 1600 |
| }, |
| { |
| "epoch": 2.7327726341084175, |
| "grad_norm": 0.9296875, |
| "learning_rate": 2.0844825888907738e-06, |
| "loss": 0.8662, |
| "mean_token_accuracy": 0.7740148122111956, |
| "step": 1610 |
| }, |
| { |
| "epoch": 2.749734963601668, |
| "grad_norm": 1.0078125, |
| "learning_rate": 1.8305454009809097e-06, |
| "loss": 0.8801, |
| "mean_token_accuracy": 0.7726842557390531, |
| "step": 1620 |
| }, |
| { |
| "epoch": 2.7666972930949183, |
| "grad_norm": 1.0234375, |
| "learning_rate": 1.5928093157957403e-06, |
| "loss": 0.8601, |
| "mean_token_accuracy": 0.7756263218820095, |
| "step": 1630 |
| }, |
| { |
| "epoch": 2.7836596225881687, |
| "grad_norm": 0.97265625, |
| "learning_rate": 1.3713542924456479e-06, |
| "loss": 0.8507, |
| "mean_token_accuracy": 0.777658429245154, |
| "step": 1640 |
| }, |
| { |
| "epoch": 2.800621952081419, |
| "grad_norm": 1.03125, |
| "learning_rate": 1.1662548141402163e-06, |
| "loss": 0.8511, |
| "mean_token_accuracy": 0.7770614944398403, |
| "step": 1650 |
| }, |
| { |
| "epoch": 2.8175842815746694, |
| "grad_norm": 1.015625, |
| "learning_rate": 9.775798631368626e-07, |
| "loss": 0.8463, |
| "mean_token_accuracy": 0.7784749428431194, |
| "step": 1660 |
| }, |
| { |
| "epoch": 2.83454661106792, |
| "grad_norm": 1.03125, |
| "learning_rate": 8.053928975396418e-07, |
| "loss": 0.8785, |
| "mean_token_accuracy": 0.7732983765502771, |
| "step": 1670 |
| }, |
| { |
| "epoch": 2.85150894056117, |
| "grad_norm": 1.03125, |
| "learning_rate": 6.497518299560634e-07, |
| "loss": 0.8695, |
| "mean_token_accuracy": 0.7760962655146917, |
| "step": 1680 |
| }, |
| { |
| "epoch": 2.868471270054421, |
| "grad_norm": 0.9453125, |
| "learning_rate": 5.107090080189725e-07, |
| "loss": 0.844, |
| "mean_token_accuracy": 0.7783816205958526, |
| "step": 1690 |
| }, |
| { |
| "epoch": 2.885433599547671, |
| "grad_norm": 1.0234375, |
| "learning_rate": 3.883111967802111e-07, |
| "loss": 0.8782, |
| "mean_token_accuracy": 0.7730122750004133, |
| "step": 1700 |
| }, |
| { |
| "epoch": 2.902395929040922, |
| "grad_norm": 1.046875, |
| "learning_rate": 2.8259956298185566e-07, |
| "loss": 0.8752, |
| "mean_token_accuracy": 0.7732014996310075, |
| "step": 1710 |
| }, |
| { |
| "epoch": 2.919358258534172, |
| "grad_norm": 1.0078125, |
| "learning_rate": 1.9360966121035329e-07, |
| "loss": 0.8592, |
| "mean_token_accuracy": 0.7777578723927339, |
| "step": 1720 |
| }, |
| { |
| "epoch": 2.9363205880274226, |
| "grad_norm": 1.015625, |
| "learning_rate": 1.213714219382933e-07, |
| "loss": 0.8657, |
| "mean_token_accuracy": 0.7750948662559192, |
| "step": 1730 |
| }, |
| { |
| "epoch": 2.953282917520673, |
| "grad_norm": 1.015625, |
| "learning_rate": 6.59091414576929e-08, |
| "loss": 0.8642, |
| "mean_token_accuracy": 0.7749369906882445, |
| "step": 1740 |
| }, |
| { |
| "epoch": 2.9702452470139233, |
| "grad_norm": 1.03125, |
| "learning_rate": 2.7241473708283783e-08, |
| "loss": 0.8584, |
| "mean_token_accuracy": 0.7779290979107221, |
| "step": 1750 |
| }, |
| { |
| "epoch": 2.9872075765071737, |
| "grad_norm": 0.94921875, |
| "learning_rate": 5.381424003553237e-09, |
| "loss": 0.8599, |
| "mean_token_accuracy": 0.7751329804460207, |
| "step": 1760 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 1767, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 5.981901778352865e+17, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |