{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.8566756864731733,
  "eval_steps": 500,
  "global_step": 39000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.001098302162145094,
      "grad_norm": 1.8242720365524292,
      "learning_rate": 1e-05,
      "loss": 10.2008,
      "num_input_tokens_seen": 52428800,
      "step": 50
    },
    {
      "epoch": 0.002196604324290188,
      "grad_norm": 1.7588739395141602,
      "learning_rate": 2e-05,
      "loss": 9.6579,
      "num_input_tokens_seen": 104857600,
      "step": 100
    },
    {
      "epoch": 0.003294906486435282,
      "grad_norm": 1.4990028142929077,
      "learning_rate": 3e-05,
      "loss": 8.9277,
      "num_input_tokens_seen": 157286400,
      "step": 150
    },
    {
      "epoch": 0.004393208648580376,
      "grad_norm": 0.9206030368804932,
      "learning_rate": 4e-05,
      "loss": 8.0739,
      "num_input_tokens_seen": 209715200,
      "step": 200
    },
    {
      "epoch": 0.00549151081072547,
      "grad_norm": 0.4887239933013916,
      "learning_rate": 5e-05,
      "loss": 7.406,
      "num_input_tokens_seen": 262144000,
      "step": 250
    },
    {
      "epoch": 0.006589812972870564,
      "grad_norm": 0.7044657468795776,
      "learning_rate": 6e-05,
      "loss": 6.9708,
      "num_input_tokens_seen": 314572800,
      "step": 300
    },
    {
      "epoch": 0.007688115135015658,
      "grad_norm": 0.9420009255409241,
      "learning_rate": 7.000000000000001e-05,
      "loss": 6.6177,
      "num_input_tokens_seen": 367001600,
      "step": 350
    },
    {
      "epoch": 0.008786417297160752,
      "grad_norm": 1.4098254442214966,
      "learning_rate": 8e-05,
      "loss": 6.3285,
      "num_input_tokens_seen": 419430400,
      "step": 400
    },
    {
      "epoch": 0.009884719459305845,
      "grad_norm": 0.5596774220466614,
      "learning_rate": 8.999999999999999e-05,
      "loss": 6.0918,
      "num_input_tokens_seen": 471859200,
      "step": 450
    },
    {
      "epoch": 0.01098302162145094,
      "grad_norm": 0.5934723615646362,
      "learning_rate": 0.0001,
      "loss": 5.8926,
      "num_input_tokens_seen": 524288000,
      "step": 500
    },
    {
      "epoch": 0.01098302162145094,
      "eval_loss": 5.72648286819458,
      "eval_runtime": 65.255,
      "eval_samples_per_second": 76.622,
      "eval_steps_per_second": 19.156,
      "num_input_tokens_seen": 524288000,
      "step": 500
    },
    {
      "epoch": 0.012081323783596033,
      "grad_norm": 0.6180132627487183,
      "learning_rate": 0.00011,
      "loss": 5.713,
      "num_input_tokens_seen": 576716800,
      "step": 550
    },
    {
      "epoch": 0.013179625945741128,
      "grad_norm": 0.7194430232048035,
      "learning_rate": 0.00012,
      "loss": 5.5604,
      "num_input_tokens_seen": 629145600,
      "step": 600
    },
    {
      "epoch": 0.014277928107886221,
      "grad_norm": 0.7763974070549011,
      "learning_rate": 0.00013000000000000002,
      "loss": 5.4212,
      "num_input_tokens_seen": 681574400,
      "step": 650
    },
    {
      "epoch": 0.015376230270031316,
      "grad_norm": 0.7948254942893982,
      "learning_rate": 0.00014000000000000001,
      "loss": 5.2875,
      "num_input_tokens_seen": 734003200,
      "step": 700
    },
    {
      "epoch": 0.01647453243217641,
      "grad_norm": 0.7185749411582947,
      "learning_rate": 0.00015,
      "loss": 5.1765,
      "num_input_tokens_seen": 786432000,
      "step": 750
    },
    {
      "epoch": 0.017572834594321504,
      "grad_norm": 0.673218846321106,
      "learning_rate": 0.00016,
      "loss": 5.0599,
      "num_input_tokens_seen": 838860800,
      "step": 800
    },
    {
      "epoch": 0.018671136756466596,
      "grad_norm": 0.6499584317207336,
      "learning_rate": 0.00017,
      "loss": 4.9475,
      "num_input_tokens_seen": 891289600,
      "step": 850
    },
    {
      "epoch": 0.01976943891861169,
      "grad_norm": 0.9044798016548157,
      "learning_rate": 0.00017999999999999998,
      "loss": 4.8334,
      "num_input_tokens_seen": 943718400,
      "step": 900
    },
    {
      "epoch": 0.020867741080756785,
      "grad_norm": 0.886431872844696,
      "learning_rate": 0.00019,
      "loss": 4.723,
      "num_input_tokens_seen": 996147200,
      "step": 950
    },
    {
      "epoch": 0.02196604324290188,
      "grad_norm": 0.6721145510673523,
      "learning_rate": 0.0002,
      "loss": 4.6106,
      "num_input_tokens_seen": 1048576000,
      "step": 1000
    },
    {
      "epoch": 0.02196604324290188,
      "eval_loss": 4.456684589385986,
      "eval_runtime": 66.2606,
      "eval_samples_per_second": 75.46,
      "eval_steps_per_second": 18.865,
      "num_input_tokens_seen": 1048576000,
      "step": 1000
    },
    {
      "epoch": 0.02306434540504697,
      "grad_norm": 0.6067565083503723,
      "learning_rate": 0.00021,
      "loss": 4.5355,
      "num_input_tokens_seen": 1101004800,
      "step": 1050
    },
    {
      "epoch": 0.024162647567192067,
      "grad_norm": 0.6668316721916199,
      "learning_rate": 0.00022,
      "loss": 4.4383,
      "num_input_tokens_seen": 1153433600,
      "step": 1100
    },
    {
      "epoch": 0.02526094972933716,
      "grad_norm": 0.3714616000652313,
      "learning_rate": 0.00023,
      "loss": 4.3538,
      "num_input_tokens_seen": 1205862400,
      "step": 1150
    },
    {
      "epoch": 0.026359251891482256,
      "grad_norm": 0.439012348651886,
      "learning_rate": 0.00024,
      "loss": 4.2848,
      "num_input_tokens_seen": 1258291200,
      "step": 1200
    },
    {
      "epoch": 0.027457554053627348,
      "grad_norm": 0.5026286840438843,
      "learning_rate": 0.00025,
      "loss": 4.2181,
      "num_input_tokens_seen": 1310720000,
      "step": 1250
    },
    {
      "epoch": 0.028555856215772443,
      "grad_norm": 0.4865541160106659,
      "learning_rate": 0.00026000000000000003,
      "loss": 4.1495,
      "num_input_tokens_seen": 1363148800,
      "step": 1300
    },
    {
      "epoch": 0.029654158377917537,
      "grad_norm": 0.5259677767753601,
      "learning_rate": 0.00027,
      "loss": 4.0873,
      "num_input_tokens_seen": 1415577600,
      "step": 1350
    },
    {
      "epoch": 0.030752460540062632,
      "grad_norm": 0.4151704013347626,
      "learning_rate": 0.00028000000000000003,
      "loss": 4.0369,
      "num_input_tokens_seen": 1468006400,
      "step": 1400
    },
    {
      "epoch": 0.03185076270220773,
      "grad_norm": 0.5806245803833008,
      "learning_rate": 0.00029,
      "loss": 3.9881,
      "num_input_tokens_seen": 1520435200,
      "step": 1450
    },
    {
      "epoch": 0.03294906486435282,
      "grad_norm": 0.46140730381011963,
      "learning_rate": 0.0003,
      "loss": 3.9311,
      "num_input_tokens_seen": 1572864000,
      "step": 1500
    },
    {
      "epoch": 0.03294906486435282,
      "eval_loss": 3.8112432956695557,
      "eval_runtime": 65.8947,
      "eval_samples_per_second": 75.879,
      "eval_steps_per_second": 18.97,
      "num_input_tokens_seen": 1572864000,
      "step": 1500
    },
    {
      "epoch": 0.03404736702649791,
      "grad_norm": 0.4219188392162323,
      "learning_rate": 0.00031,
      "loss": 3.8972,
      "num_input_tokens_seen": 1625292800,
      "step": 1550
    },
    {
      "epoch": 0.03514566918864301,
      "grad_norm": 0.3506027162075043,
      "learning_rate": 0.00032,
      "loss": 3.8596,
      "num_input_tokens_seen": 1677721600,
      "step": 1600
    },
    {
      "epoch": 0.0362439713507881,
      "grad_norm": 0.5210819840431213,
      "learning_rate": 0.00033,
      "loss": 3.8182,
      "num_input_tokens_seen": 1730150400,
      "step": 1650
    },
    {
      "epoch": 0.03734227351293319,
      "grad_norm": 0.5830159783363342,
      "learning_rate": 0.00034,
      "loss": 3.7766,
      "num_input_tokens_seen": 1782579200,
      "step": 1700
    },
    {
      "epoch": 0.03844057567507829,
      "grad_norm": 0.4602348804473877,
      "learning_rate": 0.00035,
      "loss": 3.7362,
      "num_input_tokens_seen": 1835008000,
      "step": 1750
    },
    {
      "epoch": 0.03953887783722338,
      "grad_norm": 0.40075036883354187,
      "learning_rate": 0.00035999999999999997,
      "loss": 3.7136,
      "num_input_tokens_seen": 1887436800,
      "step": 1800
    },
    {
      "epoch": 0.04063717999936848,
      "grad_norm": 0.3893415629863739,
      "learning_rate": 0.00037,
      "loss": 3.6809,
      "num_input_tokens_seen": 1939865600,
      "step": 1850
    },
    {
      "epoch": 0.04173548216151357,
      "grad_norm": 0.2921469211578369,
      "learning_rate": 0.00038,
      "loss": 3.6565,
      "num_input_tokens_seen": 1992294400,
      "step": 1900
    },
    {
      "epoch": 0.04283378432365866,
      "grad_norm": 0.49007460474967957,
      "learning_rate": 0.00039000000000000005,
      "loss": 3.6215,
      "num_input_tokens_seen": 2044723200,
      "step": 1950
    },
    {
      "epoch": 0.04393208648580376,
      "grad_norm": 0.2980474531650543,
      "learning_rate": 0.0004,
      "loss": 3.591,
      "num_input_tokens_seen": 2097152000,
      "step": 2000
    },
    {
      "epoch": 0.04393208648580376,
      "eval_loss": 3.4769670963287354,
      "eval_runtime": 62.8853,
      "eval_samples_per_second": 79.51,
      "eval_steps_per_second": 19.877,
      "num_input_tokens_seen": 2097152000,
      "step": 2000
    },
    {
      "epoch": 0.04503038864794885,
      "grad_norm": 0.33002936840057373,
      "learning_rate": 0.00041,
      "loss": 3.5684,
      "num_input_tokens_seen": 2149580800,
      "step": 2050
    },
    {
      "epoch": 0.04612869081009394,
      "grad_norm": 0.43806758522987366,
      "learning_rate": 0.00042,
      "loss": 3.5436,
      "num_input_tokens_seen": 2202009600,
      "step": 2100
    },
    {
      "epoch": 0.04722699297223904,
      "grad_norm": 0.32842758297920227,
      "learning_rate": 0.00043,
      "loss": 3.5191,
      "num_input_tokens_seen": 2254438400,
      "step": 2150
    },
    {
      "epoch": 0.04832529513438413,
      "grad_norm": 0.3068505525588989,
      "learning_rate": 0.00044,
      "loss": 3.5009,
      "num_input_tokens_seen": 2306867200,
      "step": 2200
    },
    {
      "epoch": 0.049423597296529224,
      "grad_norm": 0.2950410544872284,
      "learning_rate": 0.00045000000000000004,
      "loss": 3.4796,
      "num_input_tokens_seen": 2359296000,
      "step": 2250
    },
    {
      "epoch": 0.05052189945867432,
      "grad_norm": 0.29731425642967224,
      "learning_rate": 0.00046,
      "loss": 3.4583,
      "num_input_tokens_seen": 2411724800,
      "step": 2300
    },
    {
      "epoch": 0.051620201620819414,
      "grad_norm": 0.2702693045139313,
      "learning_rate": 0.00047,
      "loss": 3.4385,
      "num_input_tokens_seen": 2464153600,
      "step": 2350
    },
    {
      "epoch": 0.05271850378296451,
      "grad_norm": 0.2418452948331833,
      "learning_rate": 0.00048,
      "loss": 3.4244,
      "num_input_tokens_seen": 2516582400,
      "step": 2400
    },
    {
      "epoch": 0.053816805945109604,
      "grad_norm": 0.28668686747550964,
      "learning_rate": 0.00049,
      "loss": 3.3977,
      "num_input_tokens_seen": 2569011200,
      "step": 2450
    },
    {
      "epoch": 0.054915108107254695,
      "grad_norm": 0.3115544319152832,
      "learning_rate": 0.0005,
      "loss": 3.3881,
      "num_input_tokens_seen": 2621440000,
      "step": 2500
    },
    {
      "epoch": 0.054915108107254695,
      "eval_loss": 3.2789928913116455,
      "eval_runtime": 62.6749,
      "eval_samples_per_second": 79.777,
      "eval_steps_per_second": 19.944,
      "num_input_tokens_seen": 2621440000,
      "step": 2500
    },
    {
      "epoch": 0.056013410269399794,
      "grad_norm": 0.32340022921562195,
      "learning_rate": 0.00051,
      "loss": 3.3667,
      "num_input_tokens_seen": 2673868800,
      "step": 2550
    },
    {
      "epoch": 0.057111712431544885,
      "grad_norm": 0.2612442970275879,
      "learning_rate": 0.0005200000000000001,
      "loss": 3.3612,
      "num_input_tokens_seen": 2726297600,
      "step": 2600
    },
    {
      "epoch": 0.05821001459368998,
      "grad_norm": 0.29934820532798767,
      "learning_rate": 0.0005300000000000001,
      "loss": 3.3386,
      "num_input_tokens_seen": 2778726400,
      "step": 2650
    },
    {
      "epoch": 0.059308316755835075,
      "grad_norm": 0.2737022042274475,
      "learning_rate": 0.00054,
      "loss": 3.3274,
      "num_input_tokens_seen": 2831155200,
      "step": 2700
    },
    {
      "epoch": 0.060406618917980166,
      "grad_norm": 0.2101408988237381,
      "learning_rate": 0.00055,
      "loss": 3.3153,
      "num_input_tokens_seen": 2883584000,
      "step": 2750
    },
    {
      "epoch": 0.061504921080125265,
      "grad_norm": 0.3240911066532135,
      "learning_rate": 0.0005600000000000001,
      "loss": 3.2978,
      "num_input_tokens_seen": 2936012800,
      "step": 2800
    },
    {
      "epoch": 0.06260322324227036,
      "grad_norm": 0.20592735707759857,
      "learning_rate": 0.00057,
      "loss": 3.2984,
      "num_input_tokens_seen": 2988441600,
      "step": 2850
    },
    {
      "epoch": 0.06370152540441545,
      "grad_norm": 0.263443261384964,
      "learning_rate": 0.00058,
      "loss": 3.2706,
      "num_input_tokens_seen": 3040870400,
      "step": 2900
    },
    {
      "epoch": 0.06479982756656054,
      "grad_norm": 0.24249990284442902,
      "learning_rate": 0.00059,
      "loss": 3.2673,
      "num_input_tokens_seen": 3093299200,
      "step": 2950
    },
    {
      "epoch": 0.06589812972870564,
      "grad_norm": 0.25961214303970337,
      "learning_rate": 0.0006,
      "loss": 3.2512,
      "num_input_tokens_seen": 3145728000,
      "step": 3000
    },
    {
      "epoch": 0.06589812972870564,
      "eval_loss": 3.150442600250244,
      "eval_runtime": 65.9549,
      "eval_samples_per_second": 75.809,
      "eval_steps_per_second": 18.952,
      "num_input_tokens_seen": 3145728000,
      "step": 3000
    },
    {
      "epoch": 0.06699643189085074,
      "grad_norm": 0.21884848177433014,
      "learning_rate": 0.00061,
      "loss": 3.2437,
      "num_input_tokens_seen": 3198156800,
      "step": 3050
    },
    {
      "epoch": 0.06809473405299582,
      "grad_norm": 0.2534893751144409,
      "learning_rate": 0.00062,
      "loss": 3.2366,
      "num_input_tokens_seen": 3250585600,
      "step": 3100
    },
    {
      "epoch": 0.06919303621514092,
      "grad_norm": 0.2408875823020935,
      "learning_rate": 0.00063,
      "loss": 3.2264,
      "num_input_tokens_seen": 3303014400,
      "step": 3150
    },
    {
      "epoch": 0.07029133837728602,
      "grad_norm": 0.22240856289863586,
      "learning_rate": 0.00064,
      "loss": 3.2102,
      "num_input_tokens_seen": 3355443200,
      "step": 3200
    },
    {
      "epoch": 0.0713896405394311,
      "grad_norm": 0.21527299284934998,
      "learning_rate": 0.0006500000000000001,
      "loss": 3.1985,
      "num_input_tokens_seen": 3407872000,
      "step": 3250
    },
    {
      "epoch": 0.0724879427015762,
      "grad_norm": 0.26642242074012756,
      "learning_rate": 0.00066,
      "loss": 3.1923,
      "num_input_tokens_seen": 3460300800,
      "step": 3300
    },
    {
      "epoch": 0.0735862448637213,
      "grad_norm": 0.22164040803909302,
      "learning_rate": 0.00067,
      "loss": 3.1848,
      "num_input_tokens_seen": 3512729600,
      "step": 3350
    },
    {
      "epoch": 0.07468454702586638,
      "grad_norm": 0.21594341099262238,
      "learning_rate": 0.00068,
      "loss": 3.1764,
      "num_input_tokens_seen": 3565158400,
      "step": 3400
    },
    {
      "epoch": 0.07578284918801148,
      "grad_norm": 0.1921539604663849,
      "learning_rate": 0.00069,
      "loss": 3.1643,
      "num_input_tokens_seen": 3617587200,
      "step": 3450
    },
    {
      "epoch": 0.07688115135015658,
      "grad_norm": 0.2266080528497696,
      "learning_rate": 0.0007,
      "loss": 3.1647,
      "num_input_tokens_seen": 3670016000,
      "step": 3500
    },
    {
      "epoch": 0.07688115135015658,
      "eval_loss": 3.061373472213745,
      "eval_runtime": 63.388,
      "eval_samples_per_second": 78.879,
      "eval_steps_per_second": 19.72,
      "num_input_tokens_seen": 3670016000,
      "step": 3500
    },
    {
      "epoch": 0.07797945351230168,
      "grad_norm": 0.19900226593017578,
      "learning_rate": 0.00071,
      "loss": 3.1557,
      "num_input_tokens_seen": 3722444800,
      "step": 3550
    },
    {
      "epoch": 0.07907775567444676,
      "grad_norm": 0.20299012959003448,
      "learning_rate": 0.0007199999999999999,
      "loss": 3.1503,
      "num_input_tokens_seen": 3774873600,
      "step": 3600
    },
    {
      "epoch": 0.08017605783659186,
      "grad_norm": 0.232399120926857,
      "learning_rate": 0.00073,
      "loss": 3.1387,
      "num_input_tokens_seen": 3827302400,
      "step": 3650
    },
    {
      "epoch": 0.08127435999873696,
      "grad_norm": 0.2127719670534134,
      "learning_rate": 0.00074,
      "loss": 3.1388,
      "num_input_tokens_seen": 3879731200,
      "step": 3700
    },
    {
      "epoch": 0.08237266216088204,
      "grad_norm": 0.22336533665657043,
      "learning_rate": 0.00075,
      "loss": 3.1247,
      "num_input_tokens_seen": 3932160000,
      "step": 3750
    },
    {
      "epoch": 0.08347096432302714,
      "grad_norm": 0.18270662426948547,
      "learning_rate": 0.00076,
      "loss": 3.1192,
      "num_input_tokens_seen": 3984588800,
      "step": 3800
    },
    {
      "epoch": 0.08456926648517224,
      "grad_norm": 0.16843897104263306,
      "learning_rate": 0.0007700000000000001,
      "loss": 3.1153,
      "num_input_tokens_seen": 4037017600,
      "step": 3850
    },
    {
      "epoch": 0.08566756864731732,
      "grad_norm": 0.19947747886180878,
      "learning_rate": 0.0007800000000000001,
      "loss": 3.1048,
      "num_input_tokens_seen": 4089446400,
      "step": 3900
    },
    {
      "epoch": 0.08676587080946242,
      "grad_norm": 0.17078733444213867,
      "learning_rate": 0.00079,
      "loss": 3.1014,
      "num_input_tokens_seen": 4141875200,
      "step": 3950
    },
    {
      "epoch": 0.08786417297160752,
      "grad_norm": 0.22091113030910492,
      "learning_rate": 0.0008,
      "loss": 3.0982,
      "num_input_tokens_seen": 4194304000,
      "step": 4000
    },
    {
      "epoch": 0.08786417297160752,
      "eval_loss": 2.9978296756744385,
      "eval_runtime": 65.6064,
      "eval_samples_per_second": 76.212,
      "eval_steps_per_second": 19.053,
      "num_input_tokens_seen": 4194304000,
      "step": 4000
    },
    {
      "epoch": 0.0889624751337526,
      "grad_norm": 0.1839856207370758,
      "learning_rate": 0.0008100000000000001,
      "loss": 3.0862,
      "num_input_tokens_seen": 4246732800,
      "step": 4050
    },
    {
      "epoch": 0.0900607772958977,
      "grad_norm": 0.17331145703792572,
      "learning_rate": 0.00082,
      "loss": 3.087,
      "num_input_tokens_seen": 4299161600,
      "step": 4100
    },
    {
      "epoch": 0.0911590794580428,
      "grad_norm": 0.18384258449077606,
      "learning_rate": 0.00083,
      "loss": 3.076,
      "num_input_tokens_seen": 4351590400,
      "step": 4150
    },
    {
      "epoch": 0.09225738162018789,
      "grad_norm": 0.17061170935630798,
      "learning_rate": 0.00084,
      "loss": 3.0693,
      "num_input_tokens_seen": 4404019200,
      "step": 4200
    },
    {
      "epoch": 0.09335568378233298,
      "grad_norm": 0.18157647550106049,
      "learning_rate": 0.00085,
      "loss": 3.0698,
      "num_input_tokens_seen": 4456448000,
      "step": 4250
    },
    {
      "epoch": 0.09445398594447808,
      "grad_norm": 0.15678547322750092,
      "learning_rate": 0.00086,
      "loss": 3.064,
      "num_input_tokens_seen": 4508876800,
      "step": 4300
    },
    {
      "epoch": 0.09555228810662317,
      "grad_norm": 0.19118325412273407,
      "learning_rate": 0.00087,
      "loss": 3.0541,
      "num_input_tokens_seen": 4561305600,
      "step": 4350
    },
    {
      "epoch": 0.09665059026876827,
      "grad_norm": 0.17620691657066345,
      "learning_rate": 0.00088,
      "loss": 3.0532,
      "num_input_tokens_seen": 4613734400,
      "step": 4400
    },
    {
      "epoch": 0.09774889243091336,
      "grad_norm": 0.17351101338863373,
      "learning_rate": 0.0008900000000000001,
      "loss": 3.0549,
      "num_input_tokens_seen": 4666163200,
      "step": 4450
    },
    {
      "epoch": 0.09884719459305845,
      "grad_norm": 0.15183581411838531,
      "learning_rate": 0.0009000000000000001,
      "loss": 3.0485,
      "num_input_tokens_seen": 4718592000,
      "step": 4500
    },
    {
      "epoch": 0.09884719459305845,
      "eval_loss": 2.9479379653930664,
      "eval_runtime": 66.5611,
      "eval_samples_per_second": 75.119,
      "eval_steps_per_second": 18.78,
      "num_input_tokens_seen": 4718592000,
      "step": 4500
    },
    {
      "epoch": 0.09994549675520355,
      "grad_norm": 0.1681961864233017,
      "learning_rate": 0.00091,
      "loss": 3.0395,
      "num_input_tokens_seen": 4771020800,
      "step": 4550
    },
    {
      "epoch": 0.10104379891734865,
      "grad_norm": 0.17382557690143585,
      "learning_rate": 0.00092,
      "loss": 3.0371,
      "num_input_tokens_seen": 4823449600,
      "step": 4600
    },
    {
      "epoch": 0.10214210107949374,
      "grad_norm": 0.14377906918525696,
      "learning_rate": 0.00093,
      "loss": 3.0377,
      "num_input_tokens_seen": 4875878400,
      "step": 4650
    },
    {
      "epoch": 0.10324040324163883,
      "grad_norm": 0.1590214967727661,
      "learning_rate": 0.00094,
      "loss": 3.0305,
      "num_input_tokens_seen": 4928307200,
      "step": 4700
    },
    {
      "epoch": 0.10433870540378393,
      "grad_norm": 0.15563353896141052,
      "learning_rate": 0.00095,
      "loss": 3.0254,
      "num_input_tokens_seen": 4980736000,
      "step": 4750
    },
    {
      "epoch": 0.10543700756592903,
      "grad_norm": 0.16002103686332703,
      "learning_rate": 0.00096,
      "loss": 3.0222,
      "num_input_tokens_seen": 5033164800,
      "step": 4800
    },
    {
      "epoch": 0.10653530972807411,
      "grad_norm": 0.1406039148569107,
      "learning_rate": 0.0009699999999999999,
      "loss": 3.0185,
      "num_input_tokens_seen": 5085593600,
      "step": 4850
    },
    {
      "epoch": 0.10763361189021921,
      "grad_norm": 0.14609627425670624,
      "learning_rate": 0.00098,
      "loss": 3.0177,
      "num_input_tokens_seen": 5138022400,
      "step": 4900
    },
    {
      "epoch": 0.1087319140523643,
      "grad_norm": 0.16061657667160034,
      "learning_rate": 0.00099,
      "loss": 3.0137,
      "num_input_tokens_seen": 5190451200,
      "step": 4950
    },
    {
      "epoch": 0.10983021621450939,
      "grad_norm": 0.18423974514007568,
      "learning_rate": 0.001,
      "loss": 3.016,
      "num_input_tokens_seen": 5242880000,
      "step": 5000
    },
    {
      "epoch": 0.10983021621450939,
      "eval_loss": 2.9132862091064453,
      "eval_runtime": 65.7163,
      "eval_samples_per_second": 76.085,
      "eval_steps_per_second": 19.021,
      "num_input_tokens_seen": 5242880000,
      "step": 5000
    },
    {
      "epoch": 0.11092851837665449,
      "grad_norm": 0.15302155911922455,
      "learning_rate": 0.001,
      "loss": 3.0037,
      "num_input_tokens_seen": 5295308800,
      "step": 5050
    },
    {
      "epoch": 0.11202682053879959,
      "grad_norm": 0.1474563181400299,
      "learning_rate": 0.001,
      "loss": 3.0063,
      "num_input_tokens_seen": 5347737600,
      "step": 5100
    },
    {
      "epoch": 0.11312512270094467,
      "grad_norm": 0.14318443834781647,
      "learning_rate": 0.001,
      "loss": 3.0011,
      "num_input_tokens_seen": 5400166400,
      "step": 5150
    },
    {
      "epoch": 0.11422342486308977,
      "grad_norm": 0.1521013379096985,
      "learning_rate": 0.001,
      "loss": 2.9946,
      "num_input_tokens_seen": 5452595200,
      "step": 5200
    },
    {
      "epoch": 0.11532172702523487,
      "grad_norm": 0.14434175193309784,
      "learning_rate": 0.001,
      "loss": 2.9909,
      "num_input_tokens_seen": 5505024000,
      "step": 5250
    },
    {
      "epoch": 0.11642002918737995,
      "grad_norm": 0.16284991800785065,
      "learning_rate": 0.001,
      "loss": 2.9846,
      "num_input_tokens_seen": 5557452800,
      "step": 5300
    },
    {
      "epoch": 0.11751833134952505,
      "grad_norm": 0.15281164646148682,
      "learning_rate": 0.001,
      "loss": 2.9843,
      "num_input_tokens_seen": 5609881600,
      "step": 5350
    },
    {
      "epoch": 0.11861663351167015,
      "grad_norm": 0.1227719634771347,
      "learning_rate": 0.001,
      "loss": 2.9778,
      "num_input_tokens_seen": 5662310400,
      "step": 5400
    },
    {
      "epoch": 0.11971493567381523,
      "grad_norm": 0.1346055269241333,
      "learning_rate": 0.001,
      "loss": 2.9745,
      "num_input_tokens_seen": 5714739200,
      "step": 5450
    },
    {
      "epoch": 0.12081323783596033,
      "grad_norm": 0.15828204154968262,
      "learning_rate": 0.001,
      "loss": 2.9723,
      "num_input_tokens_seen": 5767168000,
      "step": 5500
    },
    {
      "epoch": 0.12081323783596033,
      "eval_loss": 2.8801000118255615,
      "eval_runtime": 65.3935,
      "eval_samples_per_second": 76.46,
      "eval_steps_per_second": 19.115,
      "num_input_tokens_seen": 5767168000,
      "step": 5500
    },
    {
      "epoch": 0.12191153999810543,
      "grad_norm": 0.1391400694847107,
      "learning_rate": 0.001,
      "loss": 2.9609,
      "num_input_tokens_seen": 5819596800,
      "step": 5550
    },
    {
      "epoch": 0.12300984216025053,
      "grad_norm": 0.14347107708454132,
      "learning_rate": 0.001,
      "loss": 2.9697,
      "num_input_tokens_seen": 5872025600,
      "step": 5600
    },
    {
      "epoch": 0.12410814432239561,
      "grad_norm": 0.13779127597808838,
      "learning_rate": 0.001,
      "loss": 2.9609,
      "num_input_tokens_seen": 5924454400,
      "step": 5650
    },
    {
      "epoch": 0.1252064464845407,
      "grad_norm": 0.13017955422401428,
      "learning_rate": 0.001,
      "loss": 2.9545,
      "num_input_tokens_seen": 5976883200,
      "step": 5700
    },
    {
      "epoch": 0.1263047486466858,
      "grad_norm": 0.12697578966617584,
      "learning_rate": 0.001,
      "loss": 2.9563,
      "num_input_tokens_seen": 6029312000,
      "step": 5750
    },
    {
      "epoch": 0.1274030508088309,
      "grad_norm": 0.15175020694732666,
      "learning_rate": 0.001,
      "loss": 2.9502,
      "num_input_tokens_seen": 6081740800,
      "step": 5800
    },
    {
      "epoch": 0.12850135297097598,
      "grad_norm": 0.1209852397441864,
      "learning_rate": 0.001,
      "loss": 2.9516,
      "num_input_tokens_seen": 6134169600,
      "step": 5850
    },
    {
      "epoch": 0.12959965513312108,
      "grad_norm": 0.16521666944026947,
      "learning_rate": 0.001,
      "loss": 2.9528,
      "num_input_tokens_seen": 6186598400,
      "step": 5900
    },
    {
      "epoch": 0.13069795729526618,
      "grad_norm": 0.12271756678819656,
      "learning_rate": 0.001,
      "loss": 2.9382,
      "num_input_tokens_seen": 6239027200,
      "step": 5950
    },
    {
      "epoch": 0.13179625945741127,
      "grad_norm": 0.1376461535692215,
      "learning_rate": 0.001,
      "loss": 2.9464,
      "num_input_tokens_seen": 6291456000,
      "step": 6000
    },
    {
      "epoch": 0.13179625945741127,
      "eval_loss": 2.84769606590271,
      "eval_runtime": 65.8814,
      "eval_samples_per_second": 75.894,
      "eval_steps_per_second": 18.973,
      "num_input_tokens_seen": 6291456000,
      "step": 6000
    },
    {
      "epoch": 0.13289456161955637,
      "grad_norm": 0.11629872024059296,
      "learning_rate": 0.001,
      "loss": 2.9406,
      "num_input_tokens_seen": 6343884800,
      "step": 6050
    },
    {
      "epoch": 0.13399286378170147,
      "grad_norm": 0.13740529119968414,
      "learning_rate": 0.001,
      "loss": 2.9343,
      "num_input_tokens_seen": 6396313600,
      "step": 6100
    },
    {
      "epoch": 0.13509116594384657,
      "grad_norm": 0.11548039317131042,
      "learning_rate": 0.001,
      "loss": 2.9374,
      "num_input_tokens_seen": 6448742400,
      "step": 6150
    },
    {
      "epoch": 0.13618946810599164,
      "grad_norm": 0.11710146814584732,
      "learning_rate": 0.001,
      "loss": 2.9376,
      "num_input_tokens_seen": 6501171200,
      "step": 6200
    },
    {
      "epoch": 0.13728777026813674,
      "grad_norm": 0.11223472654819489,
      "learning_rate": 0.001,
      "loss": 2.9284,
      "num_input_tokens_seen": 6553600000,
      "step": 6250
    },
    {
      "epoch": 0.13838607243028184,
      "grad_norm": 0.12880656123161316,
      "learning_rate": 0.001,
      "loss": 2.9303,
      "num_input_tokens_seen": 6606028800,
      "step": 6300
    },
    {
      "epoch": 0.13948437459242694,
      "grad_norm": 0.11898139119148254,
      "learning_rate": 0.001,
      "loss": 2.9246,
      "num_input_tokens_seen": 6658457600,
      "step": 6350
    },
    {
      "epoch": 0.14058267675457203,
      "grad_norm": 0.11154898256063461,
      "learning_rate": 0.001,
      "loss": 2.9254,
      "num_input_tokens_seen": 6710886400,
      "step": 6400
    },
    {
      "epoch": 0.14168097891671713,
      "grad_norm": 0.12669232487678528,
      "learning_rate": 0.001,
      "loss": 2.9162,
      "num_input_tokens_seen": 6763315200,
      "step": 6450
    },
    {
      "epoch": 0.1427792810788622,
      "grad_norm": 0.12259842455387115,
      "learning_rate": 0.001,
      "loss": 2.9179,
      "num_input_tokens_seen": 6815744000,
      "step": 6500
    },
    {
      "epoch": 0.1427792810788622,
      "eval_loss": 2.8220207691192627,
      "eval_runtime": 65.2868,
      "eval_samples_per_second": 76.585,
      "eval_steps_per_second": 19.146,
      "num_input_tokens_seen": 6815744000,
      "step": 6500
    },
    {
      "epoch": 0.1438775832410073,
      "grad_norm": 0.13403092324733734,
      "learning_rate": 0.001,
      "loss": 2.9102,
      "num_input_tokens_seen": 6868172800,
      "step": 6550
    },
    {
      "epoch": 0.1449758854031524,
      "grad_norm": 0.13063696026802063,
      "learning_rate": 0.001,
      "loss": 2.9112,
      "num_input_tokens_seen": 6920601600,
      "step": 6600
    },
    {
      "epoch": 0.1460741875652975,
      "grad_norm": 0.11871635168790817,
      "learning_rate": 0.001,
      "loss": 2.9085,
      "num_input_tokens_seen": 6973030400,
      "step": 6650
    },
    {
      "epoch": 0.1471724897274426,
      "grad_norm": 0.11007633060216904,
      "learning_rate": 0.001,
      "loss": 2.9098,
      "num_input_tokens_seen": 7025459200,
      "step": 6700
    },
    {
      "epoch": 0.1482707918895877,
      "grad_norm": 0.10521857440471649,
      "learning_rate": 0.001,
      "loss": 2.9086,
      "num_input_tokens_seen": 7077888000,
      "step": 6750
    },
    {
      "epoch": 0.14936909405173276,
      "grad_norm": 0.11179310083389282,
      "learning_rate": 0.001,
      "loss": 2.9066,
      "num_input_tokens_seen": 7130316800,
      "step": 6800
    },
    {
      "epoch": 0.15046739621387786,
      "grad_norm": 0.1192353144288063,
      "learning_rate": 0.001,
      "loss": 2.9135,
      "num_input_tokens_seen": 7182745600,
      "step": 6850
    },
    {
      "epoch": 0.15156569837602296,
      "grad_norm": 0.11084350198507309,
      "learning_rate": 0.001,
      "loss": 2.9054,
      "num_input_tokens_seen": 7235174400,
      "step": 6900
    },
    {
      "epoch": 0.15266400053816806,
      "grad_norm": 0.11826325207948685,
      "learning_rate": 0.001,
      "loss": 2.9054,
      "num_input_tokens_seen": 7287603200,
      "step": 6950
    },
    {
      "epoch": 0.15376230270031316,
      "grad_norm": 0.12597590684890747,
      "learning_rate": 0.001,
      "loss": 2.8945,
      "num_input_tokens_seen": 7340032000,
      "step": 7000
    },
    {
      "epoch": 0.15376230270031316,
      "eval_loss": 2.802734851837158,
      "eval_runtime": 65.3332,
      "eval_samples_per_second": 76.531,
      "eval_steps_per_second": 19.133,
      "num_input_tokens_seen": 7340032000,
      "step": 7000
    },
    {
      "epoch": 0.15486060486245826,
      "grad_norm": 0.11222469806671143,
      "learning_rate": 0.001,
      "loss": 2.8997,
      "num_input_tokens_seen": 7392460800,
      "step": 7050
    },
    {
      "epoch": 0.15595890702460335,
      "grad_norm": 0.11488104611635208,
      "learning_rate": 0.001,
      "loss": 2.8965,
      "num_input_tokens_seen": 7444889600,
      "step": 7100
    },
    {
      "epoch": 0.15705720918674843,
      "grad_norm": 0.1285555213689804,
      "learning_rate": 0.001,
      "loss": 2.8909,
      "num_input_tokens_seen": 7497318400,
      "step": 7150
    },
    {
      "epoch": 0.15815551134889352,
      "grad_norm": 0.12659265100955963,
      "learning_rate": 0.001,
      "loss": 2.8833,
      "num_input_tokens_seen": 7549747200,
      "step": 7200
    },
    {
      "epoch": 0.15925381351103862,
      "grad_norm": 0.10823842883110046,
      "learning_rate": 0.001,
      "loss": 2.9031,
      "num_input_tokens_seen": 7602176000,
      "step": 7250
    },
    {
      "epoch": 0.16035211567318372,
      "grad_norm": 0.12597811222076416,
      "learning_rate": 0.001,
      "loss": 2.8831,
      "num_input_tokens_seen": 7654604800,
      "step": 7300
    },
    {
      "epoch": 0.16145041783532882,
      "grad_norm": 0.1285410374403,
      "learning_rate": 0.001,
      "loss": 2.8931,
      "num_input_tokens_seen": 7707033600,
      "step": 7350
    },
    {
      "epoch": 0.16254871999747392,
      "grad_norm": 0.11170299351215363,
      "learning_rate": 0.001,
      "loss": 2.8861,
      "num_input_tokens_seen": 7759462400,
      "step": 7400
    },
    {
      "epoch": 0.163647022159619,
      "grad_norm": 0.11146055907011032,
      "learning_rate": 0.001,
      "loss": 2.8756,
      "num_input_tokens_seen": 7811891200,
      "step": 7450
    },
    {
      "epoch": 0.1647453243217641,
      "grad_norm": 0.10750412940979004,
      "learning_rate": 0.001,
      "loss": 2.8808,
      "num_input_tokens_seen": 7864320000,
      "step": 7500
    },
    {
      "epoch": 0.1647453243217641,
      "eval_loss": 2.785506248474121,
      "eval_runtime": 65.0661,
      "eval_samples_per_second": 76.845,
      "eval_steps_per_second": 19.211,
      "num_input_tokens_seen": 7864320000,
      "step": 7500
    },
    {
      "epoch": 0.16584362648390918,
      "grad_norm": 0.11221355944871902,
      "learning_rate": 0.001,
      "loss": 2.8834,
      "num_input_tokens_seen": 7916748800,
      "step": 7550
    },
    {
      "epoch": 0.16694192864605428,
      "grad_norm": 0.1089220717549324,
      "learning_rate": 0.001,
      "loss": 2.8796,
      "num_input_tokens_seen": 7969177600,
      "step": 7600
    },
    {
      "epoch": 0.16804023080819938,
      "grad_norm": 0.11125486344099045,
      "learning_rate": 0.001,
      "loss": 2.8836,
      "num_input_tokens_seen": 8021606400,
      "step": 7650
    },
    {
      "epoch": 0.16913853297034448,
      "grad_norm": 0.12804660201072693,
      "learning_rate": 0.001,
      "loss": 2.8754,
      "num_input_tokens_seen": 8074035200,
      "step": 7700
    },
    {
      "epoch": 0.17023683513248955,
      "grad_norm": 0.11395713686943054,
      "learning_rate": 0.001,
      "loss": 2.8736,
      "num_input_tokens_seen": 8126464000,
      "step": 7750
    },
    {
      "epoch": 0.17133513729463465,
      "grad_norm": 0.1095738559961319,
      "learning_rate": 0.001,
      "loss": 2.8743,
      "num_input_tokens_seen": 8178892800,
      "step": 7800
    },
    {
      "epoch": 0.17243343945677975,
      "grad_norm": 0.10545111447572708,
      "learning_rate": 0.001,
      "loss": 2.8718,
      "num_input_tokens_seen": 8231321600,
      "step": 7850
    },
    {
      "epoch": 0.17353174161892485,
      "grad_norm": 0.13135021924972534,
      "learning_rate": 0.001,
      "loss": 2.8648,
      "num_input_tokens_seen": 8283750400,
      "step": 7900
    },
    {
      "epoch": 0.17463004378106994,
      "grad_norm": 0.12348899990320206,
      "learning_rate": 0.001,
      "loss": 2.8628,
      "num_input_tokens_seen": 8336179200,
      "step": 7950
    },
    {
      "epoch": 0.17572834594321504,
      "grad_norm": 0.10604492574930191,
      "learning_rate": 0.001,
      "loss": 2.8676,
      "num_input_tokens_seen": 8388608000,
      "step": 8000
    },
    {
      "epoch": 0.17572834594321504,
      "eval_loss": 2.7698919773101807,
      "eval_runtime": 65.5096,
      "eval_samples_per_second": 76.325,
      "eval_steps_per_second": 19.081,
      "num_input_tokens_seen": 8388608000,
      "step": 8000
    },
    {
      "epoch": 0.17682664810536014,
      "grad_norm": 0.12299258261919022,
      "learning_rate": 0.001,
      "loss": 2.8626,
      "num_input_tokens_seen": 8441036800,
      "step": 8050
    },
    {
      "epoch": 0.1779249502675052,
      "grad_norm": 0.11638012528419495,
      "learning_rate": 0.001,
      "loss": 2.864,
      "num_input_tokens_seen": 8493465600,
      "step": 8100
    },
    {
      "epoch": 0.1790232524296503,
      "grad_norm": 0.10978250205516815,
      "learning_rate": 0.001,
      "loss": 2.8589,
      "num_input_tokens_seen": 8545894400,
      "step": 8150
    },
    {
      "epoch": 0.1801215545917954,
      "grad_norm": 0.11229872703552246,
      "learning_rate": 0.001,
      "loss": 2.8671,
      "num_input_tokens_seen": 8598323200,
      "step": 8200
    },
    {
      "epoch": 0.1812198567539405,
      "grad_norm": 0.13177119195461273,
      "learning_rate": 0.001,
      "loss": 2.8524,
      "num_input_tokens_seen": 8650752000,
      "step": 8250
    },
    {
      "epoch": 0.1823181589160856,
      "grad_norm": 0.11021032929420471,
      "learning_rate": 0.001,
      "loss": 2.8552,
      "num_input_tokens_seen": 8703180800,
      "step": 8300
    },
    {
      "epoch": 0.1834164610782307,
      "grad_norm": 0.11381058394908905,
      "learning_rate": 0.001,
      "loss": 2.8529,
      "num_input_tokens_seen": 8755609600,
      "step": 8350
    },
    {
      "epoch": 0.18451476324037577,
      "grad_norm": 0.10889217257499695,
      "learning_rate": 0.001,
      "loss": 2.8581,
      "num_input_tokens_seen": 8808038400,
      "step": 8400
    },
    {
      "epoch": 0.18561306540252087,
      "grad_norm": 0.13519708812236786,
      "learning_rate": 0.001,
      "loss": 2.8518,
      "num_input_tokens_seen": 8860467200,
      "step": 8450
    },
    {
      "epoch": 0.18671136756466597,
      "grad_norm": 0.1265636533498764,
      "learning_rate": 0.001,
      "loss": 2.8452,
      "num_input_tokens_seen": 8912896000,
      "step": 8500
    },
    {
      "epoch": 0.18671136756466597,
      "eval_loss": 2.754452705383301,
      "eval_runtime": 65.4439,
      "eval_samples_per_second": 76.401,
      "eval_steps_per_second": 19.1,
      "num_input_tokens_seen": 8912896000,
      "step": 8500
    },
    {
      "epoch": 0.18780966972681107,
      "grad_norm": 0.12250006198883057,
      "learning_rate": 0.001,
      "loss": 2.8506,
      "num_input_tokens_seen": 8965324800,
      "step": 8550
    },
    {
      "epoch": 0.18890797188895617,
      "grad_norm": 0.1371607929468155,
      "learning_rate": 0.001,
      "loss": 2.8472,
      "num_input_tokens_seen": 9017753600,
      "step": 8600
    },
    {
      "epoch": 0.19000627405110126,
      "grad_norm": 0.11844755709171295,
      "learning_rate": 0.001,
      "loss": 2.8492,
      "num_input_tokens_seen": 9070182400,
      "step": 8650
    },
    {
      "epoch": 0.19110457621324634,
      "grad_norm": 0.38294216990470886,
      "learning_rate": 0.001,
      "loss": 6.3226,
      "num_input_tokens_seen": 9122611200,
      "step": 8700
    },
    {
      "epoch": 0.19220287837539143,
      "grad_norm": 0.44077590107917786,
      "learning_rate": 0.001,
      "loss": 6.7001,
      "num_input_tokens_seen": 9175040000,
      "step": 8750
    },
    {
      "epoch": 0.19330118053753653,
      "grad_norm": 0.4238772392272949,
      "learning_rate": 0.001,
      "loss": 5.8714,
      "num_input_tokens_seen": 9227468800,
      "step": 8800
    },
    {
      "epoch": 0.19439948269968163,
      "grad_norm": 0.2830688953399658,
      "learning_rate": 0.001,
      "loss": 4.8951,
      "num_input_tokens_seen": 9279897600,
      "step": 8850
    },
    {
      "epoch": 0.19549778486182673,
      "grad_norm": 0.2485039383172989,
      "learning_rate": 0.001,
      "loss": 3.928,
      "num_input_tokens_seen": 9332326400,
      "step": 8900
    },
    {
      "epoch": 0.19659608702397183,
      "grad_norm": 0.20515842735767365,
      "learning_rate": 0.001,
      "loss": 3.4277,
      "num_input_tokens_seen": 9384755200,
      "step": 8950
    },
    {
      "epoch": 0.1976943891861169,
      "grad_norm": 0.13605651259422302,
      "learning_rate": 0.001,
      "loss": 3.2263,
      "num_input_tokens_seen": 9437184000,
      "step": 9000
    },
    {
      "epoch": 0.1976943891861169,
      "eval_loss": 3.014314889907837,
      "eval_runtime": 65.8851,
      "eval_samples_per_second": 75.89,
      "eval_steps_per_second": 18.972,
      "num_input_tokens_seen": 9437184000,
      "step": 9000
    },
    {
      "epoch": 0.198792691348262,
      "grad_norm": 0.17666102945804596,
      "learning_rate": 0.001,
      "loss": 3.0728,
      "num_input_tokens_seen": 9489612800,
      "step": 9050
    },
    {
      "epoch": 0.1998909935104071,
      "grad_norm": 0.202484592795372,
      "learning_rate": 0.001,
      "loss": 2.9818,
      "num_input_tokens_seen": 9542041600,
      "step": 9100
    },
    {
      "epoch": 0.2009892956725522,
      "grad_norm": 0.15095236897468567,
      "learning_rate": 0.001,
      "loss": 2.9423,
      "num_input_tokens_seen": 9594470400,
      "step": 9150
    },
    {
      "epoch": 0.2020875978346973,
      "grad_norm": 0.13089850544929504,
      "learning_rate": 0.001,
      "loss": 2.9227,
      "num_input_tokens_seen": 9646899200,
      "step": 9200
    },
    {
      "epoch": 0.2031858999968424,
      "grad_norm": 0.14022304117679596,
      "learning_rate": 0.001,
      "loss": 2.8988,
      "num_input_tokens_seen": 9699328000,
      "step": 9250
    },
    {
      "epoch": 0.2042842021589875,
      "grad_norm": 0.13116785883903503,
      "learning_rate": 0.001,
      "loss": 2.8716,
      "num_input_tokens_seen": 9751756800,
      "step": 9300
    },
    {
      "epoch": 0.20538250432113256,
      "grad_norm": 0.1395471841096878,
      "learning_rate": 0.001,
      "loss": 2.8727,
      "num_input_tokens_seen": 9804185600,
      "step": 9350
    },
    {
      "epoch": 0.20648080648327766,
      "grad_norm": 0.1271878033876419,
      "learning_rate": 0.001,
      "loss": 2.864,
      "num_input_tokens_seen": 9856614400,
      "step": 9400
    },
    {
      "epoch": 0.20757910864542276,
      "grad_norm": 0.14148685336112976,
      "learning_rate": 0.001,
      "loss": 2.8604,
      "num_input_tokens_seen": 9909043200,
      "step": 9450
    },
    {
      "epoch": 0.20867741080756785,
      "grad_norm": 0.1292584091424942,
      "learning_rate": 0.001,
      "loss": 2.8547,
      "num_input_tokens_seen": 9961472000,
      "step": 9500
    },
    {
      "epoch": 0.20867741080756785,
      "eval_loss": 2.756131649017334,
      "eval_runtime": 65.0495,
      "eval_samples_per_second": 76.865,
      "eval_steps_per_second": 19.216,
      "num_input_tokens_seen": 9961472000,
      "step": 9500
    },
    {
      "epoch": 0.20977571296971295,
      "grad_norm": 0.10929372161626816,
      "learning_rate": 0.001,
      "loss": 2.8467,
      "num_input_tokens_seen": 10013900800,
      "step": 9550
    },
    {
      "epoch": 0.21087401513185805,
      "grad_norm": 0.1180899515748024,
      "learning_rate": 0.001,
      "loss": 2.8501,
      "num_input_tokens_seen": 10066329600,
      "step": 9600
    },
    {
      "epoch": 0.21197231729400312,
      "grad_norm": 0.12041448056697845,
      "learning_rate": 0.001,
      "loss": 2.8438,
      "num_input_tokens_seen": 10118758400,
      "step": 9650
    },
    {
      "epoch": 0.21307061945614822,
      "grad_norm": 0.13195224106311798,
      "learning_rate": 0.001,
      "loss": 2.8341,
      "num_input_tokens_seen": 10171187200,
      "step": 9700
    },
    {
      "epoch": 0.21416892161829332,
      "grad_norm": 0.11887054890394211,
      "learning_rate": 0.001,
      "loss": 2.8349,
      "num_input_tokens_seen": 10223616000,
      "step": 9750
    },
    {
      "epoch": 0.21526722378043842,
      "grad_norm": 0.1044996827840805,
      "learning_rate": 0.001,
      "loss": 2.8428,
      "num_input_tokens_seen": 10276044800,
      "step": 9800
    },
    {
      "epoch": 0.21636552594258351,
      "grad_norm": 0.11951665580272675,
      "learning_rate": 0.001,
      "loss": 2.8323,
      "num_input_tokens_seen": 10328473600,
      "step": 9850
    },
    {
      "epoch": 0.2174638281047286,
      "grad_norm": 0.11673793941736221,
      "learning_rate": 0.001,
      "loss": 2.8271,
      "num_input_tokens_seen": 10380902400,
      "step": 9900
    },
    {
      "epoch": 0.21856213026687368,
      "grad_norm": 0.1178969219326973,
      "learning_rate": 0.001,
      "loss": 2.8328,
      "num_input_tokens_seen": 10433331200,
      "step": 9950
    },
    {
      "epoch": 0.21966043242901878,
      "grad_norm": 0.11995361745357513,
      "learning_rate": 0.001,
      "loss": 2.8182,
      "num_input_tokens_seen": 10485760000,
      "step": 10000
    },
    {
      "epoch": 0.21966043242901878,
      "eval_loss": 2.732673168182373,
      "eval_runtime": 66.3377,
      "eval_samples_per_second": 75.372,
      "eval_steps_per_second": 18.843,
      "num_input_tokens_seen": 10485760000,
      "step": 10000
    },
    {
      "epoch": 0.22075873459116388,
      "grad_norm": 0.13463908433914185,
      "learning_rate": 0.001,
      "loss": 2.8242,
      "num_input_tokens_seen": 10538188800,
      "step": 10050
    },
    {
      "epoch": 0.22185703675330898,
      "grad_norm": 0.11778156459331512,
      "learning_rate": 0.001,
      "loss": 2.8234,
      "num_input_tokens_seen": 10590617600,
      "step": 10100
    },
    {
      "epoch": 0.22295533891545408,
      "grad_norm": 0.11393869668245316,
      "learning_rate": 0.001,
      "loss": 2.8204,
      "num_input_tokens_seen": 10643046400,
      "step": 10150
    },
    {
      "epoch": 0.22405364107759917,
      "grad_norm": 0.12454303354024887,
      "learning_rate": 0.001,
      "loss": 2.8185,
      "num_input_tokens_seen": 10695475200,
      "step": 10200
    },
    {
      "epoch": 0.22515194323974427,
      "grad_norm": 0.1148439347743988,
      "learning_rate": 0.001,
      "loss": 2.8219,
      "num_input_tokens_seen": 10747904000,
      "step": 10250
    },
    {
      "epoch": 0.22625024540188934,
      "grad_norm": 0.13888292014598846,
      "learning_rate": 0.001,
      "loss": 2.8157,
      "num_input_tokens_seen": 10800332800,
      "step": 10300
    },
    {
      "epoch": 0.22734854756403444,
      "grad_norm": 0.12242749333381653,
      "learning_rate": 0.001,
      "loss": 2.8165,
      "num_input_tokens_seen": 10852761600,
      "step": 10350
    },
    {
      "epoch": 0.22844684972617954,
      "grad_norm": 0.13651017844676971,
      "learning_rate": 0.001,
      "loss": 2.8165,
      "num_input_tokens_seen": 10905190400,
      "step": 10400
    },
    {
      "epoch": 0.22954515188832464,
      "grad_norm": 0.12349703162908554,
      "learning_rate": 0.001,
      "loss": 2.8126,
      "num_input_tokens_seen": 10957619200,
      "step": 10450
    },
    {
      "epoch": 0.23064345405046974,
      "grad_norm": 0.13448943197727203,
      "learning_rate": 0.001,
      "loss": 2.8162,
      "num_input_tokens_seen": 11010048000,
      "step": 10500
    },
    {
      "epoch": 0.23064345405046974,
      "eval_loss": 2.720102071762085,
      "eval_runtime": 65.0663,
      "eval_samples_per_second": 76.845,
      "eval_steps_per_second": 19.211,
      "num_input_tokens_seen": 11010048000,
      "step": 10500
    },
    {
      "epoch": 0.23174175621261484,
      "grad_norm": 0.1171165183186531,
      "learning_rate": 0.001,
      "loss": 2.817,
      "num_input_tokens_seen": 11062476800,
      "step": 10550
    },
    {
      "epoch": 0.2328400583747599,
      "grad_norm": 0.1417781263589859,
      "learning_rate": 0.001,
      "loss": 2.8159,
      "num_input_tokens_seen": 11114905600,
      "step": 10600
    },
    {
      "epoch": 0.233938360536905,
      "grad_norm": 0.13051685690879822,
      "learning_rate": 0.001,
      "loss": 2.8062,
      "num_input_tokens_seen": 11167334400,
      "step": 10650
    },
    {
      "epoch": 0.2350366626990501,
      "grad_norm": 0.12536808848381042,
      "learning_rate": 0.001,
      "loss": 2.8166,
      "num_input_tokens_seen": 11219763200,
      "step": 10700
    },
    {
      "epoch": 0.2361349648611952,
      "grad_norm": 0.11859289556741714,
      "learning_rate": 0.001,
      "loss": 2.8075,
      "num_input_tokens_seen": 11272192000,
      "step": 10750
    },
    {
      "epoch": 0.2372332670233403,
      "grad_norm": 0.14844287931919098,
      "learning_rate": 0.001,
      "loss": 2.8139,
      "num_input_tokens_seen": 11324620800,
      "step": 10800
    },
    {
      "epoch": 0.2383315691854854,
      "grad_norm": 0.12877844274044037,
      "learning_rate": 0.001,
      "loss": 2.8031,
      "num_input_tokens_seen": 11377049600,
      "step": 10850
    },
    {
      "epoch": 0.23942987134763047,
      "grad_norm": 0.13911722600460052,
      "learning_rate": 0.001,
      "loss": 2.7992,
      "num_input_tokens_seen": 11429478400,
      "step": 10900
    },
    {
      "epoch": 0.24052817350977557,
      "grad_norm": 0.156200110912323,
      "learning_rate": 0.001,
      "loss": 2.8059,
      "num_input_tokens_seen": 11481907200,
      "step": 10950
    },
    {
      "epoch": 0.24162647567192067,
      "grad_norm": 0.12990960478782654,
      "learning_rate": 0.001,
      "loss": 2.7984,
      "num_input_tokens_seen": 11534336000,
      "step": 11000
    },
    {
      "epoch": 0.24162647567192067,
      "eval_loss": 2.7103493213653564,
      "eval_runtime": 65.6611,
      "eval_samples_per_second": 76.149,
      "eval_steps_per_second": 19.037,
      "num_input_tokens_seen": 11534336000,
      "step": 11000
    },
    {
      "epoch": 0.24272477783406576,
      "grad_norm": 0.1190350204706192,
      "learning_rate": 0.001,
      "loss": 2.7994,
      "num_input_tokens_seen": 11586764800,
      "step": 11050
    },
    {
      "epoch": 0.24382307999621086,
      "grad_norm": 0.12825961410999298,
      "learning_rate": 0.001,
      "loss": 2.7992,
      "num_input_tokens_seen": 11639193600,
      "step": 11100
    },
    {
      "epoch": 0.24492138215835596,
      "grad_norm": 0.12561525404453278,
      "learning_rate": 0.001,
      "loss": 2.8009,
      "num_input_tokens_seen": 11691622400,
      "step": 11150
    },
    {
      "epoch": 0.24601968432050106,
      "grad_norm": 0.12596049904823303,
      "learning_rate": 0.001,
      "loss": 2.8002,
      "num_input_tokens_seen": 11744051200,
      "step": 11200
    },
    {
      "epoch": 0.24711798648264613,
      "grad_norm": 0.1415141373872757,
      "learning_rate": 0.001,
      "loss": 2.8004,
      "num_input_tokens_seen": 11796480000,
      "step": 11250
    },
    {
      "epoch": 0.24821628864479123,
      "grad_norm": 0.1359766125679016,
      "learning_rate": 0.001,
      "loss": 2.7988,
      "num_input_tokens_seen": 11848908800,
      "step": 11300
    },
    {
      "epoch": 0.24931459080693633,
      "grad_norm": 0.13459013402462006,
      "learning_rate": 0.001,
      "loss": 2.7991,
      "num_input_tokens_seen": 11901337600,
      "step": 11350
    },
    {
      "epoch": 0.2504128929690814,
      "grad_norm": 0.1344253420829773,
      "learning_rate": 0.001,
      "loss": 2.805,
      "num_input_tokens_seen": 11953766400,
      "step": 11400
    },
    {
      "epoch": 0.2515111951312265,
      "grad_norm": 0.13629016280174255,
      "learning_rate": 0.001,
      "loss": 2.7954,
      "num_input_tokens_seen": 12006195200,
      "step": 11450
    },
    {
      "epoch": 0.2526094972933716,
      "grad_norm": 0.12940892577171326,
      "learning_rate": 0.001,
      "loss": 2.8009,
      "num_input_tokens_seen": 12058624000,
      "step": 11500
    },
    {
      "epoch": 0.2526094972933716,
      "eval_loss": 2.7012581825256348,
      "eval_runtime": 65.7039,
      "eval_samples_per_second": 76.099,
      "eval_steps_per_second": 19.025,
      "num_input_tokens_seen": 12058624000,
      "step": 11500
    },
    {
      "epoch": 0.2537077994555167,
      "grad_norm": 0.15021966397762299,
      "learning_rate": 0.001,
      "loss": 2.7963,
      "num_input_tokens_seen": 12111052800,
      "step": 11550
    },
    {
      "epoch": 0.2548061016176618,
      "grad_norm": 0.12381847202777863,
      "learning_rate": 0.001,
      "loss": 2.7954,
      "num_input_tokens_seen": 12163481600,
      "step": 11600
    },
    {
      "epoch": 0.2559044037798069,
      "grad_norm": 0.14849607646465302,
      "learning_rate": 0.001,
      "loss": 2.7837,
      "num_input_tokens_seen": 12215910400,
      "step": 11650
    },
    {
      "epoch": 0.25700270594195196,
      "grad_norm": 0.1286240816116333,
      "learning_rate": 0.001,
      "loss": 2.7999,
      "num_input_tokens_seen": 12268339200,
      "step": 11700
    },
    {
      "epoch": 0.2581010081040971,
      "grad_norm": 0.11861539632081985,
      "learning_rate": 0.001,
      "loss": 2.7979,
      "num_input_tokens_seen": 12320768000,
      "step": 11750
    },
    {
      "epoch": 0.25919931026624216,
      "grad_norm": 0.11512617021799088,
      "learning_rate": 0.001,
      "loss": 2.7926,
      "num_input_tokens_seen": 12373196800,
      "step": 11800
    },
    {
      "epoch": 0.2602976124283873,
      "grad_norm": 0.13469178974628448,
      "learning_rate": 0.001,
      "loss": 2.7881,
      "num_input_tokens_seen": 12425625600,
      "step": 11850
    },
    {
      "epoch": 0.26139591459053235,
      "grad_norm": 0.15504290163516998,
      "learning_rate": 0.001,
      "loss": 2.7917,
      "num_input_tokens_seen": 12478054400,
      "step": 11900
    },
    {
      "epoch": 0.2624942167526775,
      "grad_norm": 0.1363905370235443,
      "learning_rate": 0.001,
      "loss": 2.7869,
      "num_input_tokens_seen": 12530483200,
      "step": 11950
    },
    {
      "epoch": 0.26359251891482255,
      "grad_norm": 0.11095720529556274,
      "learning_rate": 0.001,
      "loss": 2.7883,
      "num_input_tokens_seen": 12582912000,
      "step": 12000
    },
    {
      "epoch": 0.26359251891482255,
      "eval_loss": 2.6911227703094482,
      "eval_runtime": 65.4928,
      "eval_samples_per_second": 76.344,
      "eval_steps_per_second": 19.086,
      "num_input_tokens_seen": 12582912000,
      "step": 12000
    },
    {
      "epoch": 0.2646908210769676,
      "grad_norm": 0.1443321257829666,
      "learning_rate": 0.001,
      "loss": 2.7866,
      "num_input_tokens_seen": 12635340800,
      "step": 12050
    },
    {
      "epoch": 0.26578912323911275,
      "grad_norm": 0.12249191850423813,
      "learning_rate": 0.001,
      "loss": 2.8,
      "num_input_tokens_seen": 12687769600,
      "step": 12100
    },
    {
      "epoch": 0.2668874254012578,
      "grad_norm": 0.1505623608827591,
      "learning_rate": 0.001,
      "loss": 2.7934,
      "num_input_tokens_seen": 12740198400,
      "step": 12150
    },
    {
      "epoch": 0.26798572756340294,
      "grad_norm": 0.17367833852767944,
      "learning_rate": 0.001,
      "loss": 2.7905,
      "num_input_tokens_seen": 12792627200,
      "step": 12200
    },
    {
      "epoch": 0.269084029725548,
      "grad_norm": 0.12189670652151108,
      "learning_rate": 0.001,
      "loss": 2.7878,
      "num_input_tokens_seen": 12845056000,
      "step": 12250
    },
    {
      "epoch": 0.27018233188769314,
      "grad_norm": 0.12834201753139496,
      "learning_rate": 0.001,
      "loss": 2.7822,
      "num_input_tokens_seen": 12897484800,
      "step": 12300
    },
    {
      "epoch": 0.2712806340498382,
      "grad_norm": 0.1277332305908203,
      "learning_rate": 0.001,
      "loss": 2.7846,
      "num_input_tokens_seen": 12949913600,
      "step": 12350
    },
    {
      "epoch": 0.2723789362119833,
      "grad_norm": 0.14190761744976044,
      "learning_rate": 0.001,
      "loss": 2.7845,
      "num_input_tokens_seen": 13002342400,
      "step": 12400
    },
    {
      "epoch": 0.2734772383741284,
      "grad_norm": 0.14843693375587463,
      "learning_rate": 0.001,
      "loss": 2.7847,
      "num_input_tokens_seen": 13054771200,
      "step": 12450
    },
    {
      "epoch": 0.2745755405362735,
      "grad_norm": 0.14427120983600616,
      "learning_rate": 0.001,
      "loss": 2.78,
      "num_input_tokens_seen": 13107200000,
      "step": 12500
    },
    {
      "epoch": 0.2745755405362735,
      "eval_loss": 2.6847124099731445,
      "eval_runtime": 65.0448,
      "eval_samples_per_second": 76.87,
      "eval_steps_per_second": 19.218,
      "num_input_tokens_seen": 13107200000,
      "step": 12500
    },
    {
      "epoch": 0.2756738426984186,
      "grad_norm": 0.14408434927463531,
      "learning_rate": 0.001,
      "loss": 2.7794,
      "num_input_tokens_seen": 13159628800,
      "step": 12550
    },
    {
      "epoch": 0.2767721448605637,
      "grad_norm": 0.1557396501302719,
      "learning_rate": 0.001,
      "loss": 2.7754,
      "num_input_tokens_seen": 13212057600,
      "step": 12600
    },
    {
      "epoch": 0.27787044702270874,
      "grad_norm": 0.11494632810354233,
      "learning_rate": 0.001,
      "loss": 2.7839,
      "num_input_tokens_seen": 13264486400,
      "step": 12650
    },
    {
      "epoch": 0.27896874918485387,
      "grad_norm": 0.12402207404375076,
      "learning_rate": 0.001,
      "loss": 2.7773,
      "num_input_tokens_seen": 13316915200,
      "step": 12700
    },
    {
      "epoch": 0.28006705134699894,
      "grad_norm": 0.1308801770210266,
      "learning_rate": 0.001,
      "loss": 2.7864,
      "num_input_tokens_seen": 13369344000,
      "step": 12750
    },
    {
      "epoch": 0.28116535350914407,
      "grad_norm": 0.13596223294734955,
      "learning_rate": 0.001,
      "loss": 2.7763,
      "num_input_tokens_seen": 13421772800,
      "step": 12800
    },
    {
      "epoch": 0.28226365567128914,
      "grad_norm": 0.13256165385246277,
      "learning_rate": 0.001,
      "loss": 2.7762,
      "num_input_tokens_seen": 13474201600,
| "step": 12850 | |
| }, | |
| { | |
| "epoch": 0.28336195783343426, | |
| "grad_norm": 0.12955094873905182, | |
| "learning_rate": 0.001, | |
| "loss": 2.7823, | |
| "num_input_tokens_seen": 13526630400, | |
| "step": 12900 | |
| }, | |
| { | |
| "epoch": 0.28446025999557933, | |
| "grad_norm": 0.13506431877613068, | |
| "learning_rate": 0.001, | |
| "loss": 2.774, | |
| "num_input_tokens_seen": 13579059200, | |
| "step": 12950 | |
| }, | |
| { | |
| "epoch": 0.2855585621577244, | |
| "grad_norm": 0.14323291182518005, | |
| "learning_rate": 0.001, | |
| "loss": 2.7755, | |
| "num_input_tokens_seen": 13631488000, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 0.2855585621577244, | |
| "eval_loss": 2.6779518127441406, | |
| "eval_runtime": 66.0334, | |
| "eval_samples_per_second": 75.719, | |
| "eval_steps_per_second": 18.93, | |
| "num_input_tokens_seen": 13631488000, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 0.28665686431986953, | |
| "grad_norm": 0.13635839521884918, | |
| "learning_rate": 0.001, | |
| "loss": 2.7705, | |
| "num_input_tokens_seen": 13683916800, | |
| "step": 13050 | |
| }, | |
| { | |
| "epoch": 0.2877551664820146, | |
| "grad_norm": 0.1449163854122162, | |
| "learning_rate": 0.001, | |
| "loss": 2.775, | |
| "num_input_tokens_seen": 13736345600, | |
| "step": 13100 | |
| }, | |
| { | |
| "epoch": 0.2888534686441597, | |
| "grad_norm": 0.1385536640882492, | |
| "learning_rate": 0.001, | |
| "loss": 2.7705, | |
| "num_input_tokens_seen": 13788774400, | |
| "step": 13150 | |
| }, | |
| { | |
| "epoch": 0.2899517708063048, | |
| "grad_norm": 0.14647842943668365, | |
| "learning_rate": 0.001, | |
| "loss": 2.7709, | |
| "num_input_tokens_seen": 13841203200, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 0.2910500729684499, | |
| "grad_norm": 0.14193060994148254, | |
| "learning_rate": 0.001, | |
| "loss": 2.7753, | |
| "num_input_tokens_seen": 13893632000, | |
| "step": 13250 | |
| }, | |
| { | |
| "epoch": 0.292148375130595, | |
| "grad_norm": 0.15065765380859375, | |
| "learning_rate": 0.001, | |
| "loss": 2.7725, | |
| "num_input_tokens_seen": 13946060800, | |
| "step": 13300 | |
| }, | |
| { | |
| "epoch": 0.29324667729274007, | |
| "grad_norm": 0.1726570725440979, | |
| "learning_rate": 0.001, | |
| "loss": 2.7677, | |
| "num_input_tokens_seen": 13998489600, | |
| "step": 13350 | |
| }, | |
| { | |
| "epoch": 0.2943449794548852, | |
| "grad_norm": 0.13577735424041748, | |
| "learning_rate": 0.001, | |
| "loss": 2.7661, | |
| "num_input_tokens_seen": 14050918400, | |
| "step": 13400 | |
| }, | |
| { | |
| "epoch": 0.29544328161703026, | |
| "grad_norm": 0.1286347657442093, | |
| "learning_rate": 0.001, | |
| "loss": 2.7642, | |
| "num_input_tokens_seen": 14103347200, | |
| "step": 13450 | |
| }, | |
| { | |
| "epoch": 0.2965415837791754, | |
| "grad_norm": 0.12374001741409302, | |
| "learning_rate": 0.001, | |
| "loss": 2.7651, | |
| "num_input_tokens_seen": 14155776000, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 0.2965415837791754, | |
| "eval_loss": 2.6711983680725098, | |
| "eval_runtime": 65.6737, | |
| "eval_samples_per_second": 76.134, | |
| "eval_steps_per_second": 19.033, | |
| "num_input_tokens_seen": 14155776000, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 0.29763988594132046, | |
| "grad_norm": 0.1733749508857727, | |
| "learning_rate": 0.001, | |
| "loss": 2.765, | |
| "num_input_tokens_seen": 14208204800, | |
| "step": 13550 | |
| }, | |
| { | |
| "epoch": 0.29873818810346553, | |
| "grad_norm": 0.1459003984928131, | |
| "learning_rate": 0.001, | |
| "loss": 2.7683, | |
| "num_input_tokens_seen": 14260633600, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 0.29983649026561066, | |
| "grad_norm": 0.1527784913778305, | |
| "learning_rate": 0.001, | |
| "loss": 2.7678, | |
| "num_input_tokens_seen": 14313062400, | |
| "step": 13650 | |
| }, | |
| { | |
| "epoch": 0.3009347924277557, | |
| "grad_norm": 0.1344996690750122, | |
| "learning_rate": 0.001, | |
| "loss": 2.7613, | |
| "num_input_tokens_seen": 14365491200, | |
| "step": 13700 | |
| }, | |
| { | |
| "epoch": 0.30203309458990085, | |
| "grad_norm": 0.1291748583316803, | |
| "learning_rate": 0.001, | |
| "loss": 2.7682, | |
| "num_input_tokens_seen": 14417920000, | |
| "step": 13750 | |
| }, | |
| { | |
| "epoch": 0.3031313967520459, | |
| "grad_norm": 0.1352360099554062, | |
| "learning_rate": 0.001, | |
| "loss": 2.764, | |
| "num_input_tokens_seen": 14470348800, | |
| "step": 13800 | |
| }, | |
| { | |
| "epoch": 0.30422969891419105, | |
| "grad_norm": 0.13686618208885193, | |
| "learning_rate": 0.001, | |
| "loss": 2.7638, | |
| "num_input_tokens_seen": 14522777600, | |
| "step": 13850 | |
| }, | |
| { | |
| "epoch": 0.3053280010763361, | |
| "grad_norm": 0.15377116203308105, | |
| "learning_rate": 0.001, | |
| "loss": 2.7639, | |
| "num_input_tokens_seen": 14575206400, | |
| "step": 13900 | |
| }, | |
| { | |
| "epoch": 0.3064263032384812, | |
| "grad_norm": 0.13904446363449097, | |
| "learning_rate": 0.001, | |
| "loss": 2.7666, | |
| "num_input_tokens_seen": 14627635200, | |
| "step": 13950 | |
| }, | |
| { | |
| "epoch": 0.3075246054006263, | |
| "grad_norm": 0.12402611970901489, | |
| "learning_rate": 0.001, | |
| "loss": 2.759, | |
| "num_input_tokens_seen": 14680064000, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 0.3075246054006263, | |
| "eval_loss": 2.6654388904571533, | |
| "eval_runtime": 65.2775, | |
| "eval_samples_per_second": 76.596, | |
| "eval_steps_per_second": 19.149, | |
| "num_input_tokens_seen": 14680064000, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 0.3086229075627714, | |
| "grad_norm": 0.13326038420200348, | |
| "learning_rate": 0.001, | |
| "loss": 2.7622, | |
| "num_input_tokens_seen": 14732492800, | |
| "step": 14050 | |
| }, | |
| { | |
| "epoch": 0.3097212097249165, | |
| "grad_norm": 0.14305976033210754, | |
| "learning_rate": 0.001, | |
| "loss": 2.7597, | |
| "num_input_tokens_seen": 14784921600, | |
| "step": 14100 | |
| }, | |
| { | |
| "epoch": 0.3108195118870616, | |
| "grad_norm": 0.1182415783405304, | |
| "learning_rate": 0.001, | |
| "loss": 2.758, | |
| "num_input_tokens_seen": 14837350400, | |
| "step": 14150 | |
| }, | |
| { | |
| "epoch": 0.3119178140492067, | |
| "grad_norm": 0.12919387221336365, | |
| "learning_rate": 0.001, | |
| "loss": 2.759, | |
| "num_input_tokens_seen": 14889779200, | |
| "step": 14200 | |
| }, | |
| { | |
| "epoch": 0.3130161162113518, | |
| "grad_norm": 0.1420537382364273, | |
| "learning_rate": 0.001, | |
| "loss": 2.7519, | |
| "num_input_tokens_seen": 14942208000, | |
| "step": 14250 | |
| }, | |
| { | |
| "epoch": 0.31411441837349685, | |
| "grad_norm": 0.14349806308746338, | |
| "learning_rate": 0.001, | |
| "loss": 2.7653, | |
| "num_input_tokens_seen": 14994636800, | |
| "step": 14300 | |
| }, | |
| { | |
| "epoch": 0.315212720535642, | |
| "grad_norm": 0.16453324258327484, | |
| "learning_rate": 0.001, | |
| "loss": 2.7642, | |
| "num_input_tokens_seen": 15047065600, | |
| "step": 14350 | |
| }, | |
| { | |
| "epoch": 0.31631102269778705, | |
| "grad_norm": 0.11806487292051315, | |
| "learning_rate": 0.001, | |
| "loss": 2.7605, | |
| "num_input_tokens_seen": 15099494400, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 0.3174093248599322, | |
| "grad_norm": 0.12850746512413025, | |
| "learning_rate": 0.001, | |
| "loss": 2.7539, | |
| "num_input_tokens_seen": 15151923200, | |
| "step": 14450 | |
| }, | |
| { | |
| "epoch": 0.31850762702207724, | |
| "grad_norm": 0.1480904221534729, | |
| "learning_rate": 0.001, | |
| "loss": 2.7574, | |
| "num_input_tokens_seen": 15204352000, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 0.31850762702207724, | |
| "eval_loss": 2.6607398986816406, | |
| "eval_runtime": 65.6281, | |
| "eval_samples_per_second": 76.187, | |
| "eval_steps_per_second": 19.047, | |
| "num_input_tokens_seen": 15204352000, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 0.3196059291842223, | |
| "grad_norm": 0.13606210052967072, | |
| "learning_rate": 0.001, | |
| "loss": 2.763, | |
| "num_input_tokens_seen": 15256780800, | |
| "step": 14550 | |
| }, | |
| { | |
| "epoch": 0.32070423134636744, | |
| "grad_norm": 0.12546846270561218, | |
| "learning_rate": 0.001, | |
| "loss": 2.7556, | |
| "num_input_tokens_seen": 15309209600, | |
| "step": 14600 | |
| }, | |
| { | |
| "epoch": 0.3218025335085125, | |
| "grad_norm": 0.1267230361700058, | |
| "learning_rate": 0.001, | |
| "loss": 2.7617, | |
| "num_input_tokens_seen": 15361638400, | |
| "step": 14650 | |
| }, | |
| { | |
| "epoch": 0.32290083567065764, | |
| "grad_norm": 0.13812699913978577, | |
| "learning_rate": 0.001, | |
| "loss": 2.7533, | |
| "num_input_tokens_seen": 15414067200, | |
| "step": 14700 | |
| }, | |
| { | |
| "epoch": 0.3239991378328027, | |
| "grad_norm": 0.12577973306179047, | |
| "learning_rate": 0.001, | |
| "loss": 2.7519, | |
| "num_input_tokens_seen": 15466496000, | |
| "step": 14750 | |
| }, | |
| { | |
| "epoch": 0.32509743999494783, | |
| "grad_norm": 0.14296036958694458, | |
| "learning_rate": 0.001, | |
| "loss": 2.7479, | |
| "num_input_tokens_seen": 15518924800, | |
| "step": 14800 | |
| }, | |
| { | |
| "epoch": 0.3261957421570929, | |
| "grad_norm": 0.12737593054771423, | |
| "learning_rate": 0.001, | |
| "loss": 2.7546, | |
| "num_input_tokens_seen": 15571353600, | |
| "step": 14850 | |
| }, | |
| { | |
| "epoch": 0.327294044319238, | |
| "grad_norm": 0.1349722445011139, | |
| "learning_rate": 0.001, | |
| "loss": 2.7477, | |
| "num_input_tokens_seen": 15623782400, | |
| "step": 14900 | |
| }, | |
| { | |
| "epoch": 0.3283923464813831, | |
| "grad_norm": 0.12827487289905548, | |
| "learning_rate": 0.001, | |
| "loss": 2.7492, | |
| "num_input_tokens_seen": 15676211200, | |
| "step": 14950 | |
| }, | |
| { | |
| "epoch": 0.3294906486435282, | |
| "grad_norm": 0.13282813131809235, | |
| "learning_rate": 0.001, | |
| "loss": 2.7466, | |
| "num_input_tokens_seen": 15728640000, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 0.3294906486435282, | |
| "eval_loss": 2.6524744033813477, | |
| "eval_runtime": 65.8996, | |
| "eval_samples_per_second": 75.873, | |
| "eval_steps_per_second": 18.968, | |
| "num_input_tokens_seen": 15728640000, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 0.3305889508056733, | |
| "grad_norm": 0.11965218186378479, | |
| "learning_rate": 0.001, | |
| "loss": 2.7443, | |
| "num_input_tokens_seen": 15781068800, | |
| "step": 15050 | |
| }, | |
| { | |
| "epoch": 0.33168725296781837, | |
| "grad_norm": 0.14668309688568115, | |
| "learning_rate": 0.001, | |
| "loss": 2.7496, | |
| "num_input_tokens_seen": 15833497600, | |
| "step": 15100 | |
| }, | |
| { | |
| "epoch": 0.3327855551299635, | |
| "grad_norm": 0.12492749840021133, | |
| "learning_rate": 0.001, | |
| "loss": 2.7485, | |
| "num_input_tokens_seen": 15885926400, | |
| "step": 15150 | |
| }, | |
| { | |
| "epoch": 0.33388385729210857, | |
| "grad_norm": 0.1333470493555069, | |
| "learning_rate": 0.001, | |
| "loss": 2.7511, | |
| "num_input_tokens_seen": 15938355200, | |
| "step": 15200 | |
| }, | |
| { | |
| "epoch": 0.33498215945425364, | |
| "grad_norm": 0.14136457443237305, | |
| "learning_rate": 0.001, | |
| "loss": 2.74, | |
| "num_input_tokens_seen": 15990784000, | |
| "step": 15250 | |
| }, | |
| { | |
| "epoch": 0.33608046161639876, | |
| "grad_norm": 0.14975622296333313, | |
| "learning_rate": 0.001, | |
| "loss": 2.7543, | |
| "num_input_tokens_seen": 16043212800, | |
| "step": 15300 | |
| }, | |
| { | |
| "epoch": 0.33717876377854383, | |
| "grad_norm": 0.1193549856543541, | |
| "learning_rate": 0.001, | |
| "loss": 2.7497, | |
| "num_input_tokens_seen": 16095641600, | |
| "step": 15350 | |
| }, | |
| { | |
| "epoch": 0.33827706594068896, | |
| "grad_norm": 0.1429223120212555, | |
| "learning_rate": 0.001, | |
| "loss": 2.7463, | |
| "num_input_tokens_seen": 16148070400, | |
| "step": 15400 | |
| }, | |
| { | |
| "epoch": 0.33937536810283403, | |
| "grad_norm": 0.16827304661273956, | |
| "learning_rate": 0.001, | |
| "loss": 2.7415, | |
| "num_input_tokens_seen": 16200499200, | |
| "step": 15450 | |
| }, | |
| { | |
| "epoch": 0.3404736702649791, | |
| "grad_norm": 0.13952937722206116, | |
| "learning_rate": 0.001, | |
| "loss": 2.7388, | |
| "num_input_tokens_seen": 16252928000, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 0.3404736702649791, | |
| "eval_loss": 2.6472089290618896, | |
| "eval_runtime": 65.4943, | |
| "eval_samples_per_second": 76.343, | |
| "eval_steps_per_second": 19.086, | |
| "num_input_tokens_seen": 16252928000, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 0.3415719724271242, | |
| "grad_norm": 0.13359376788139343, | |
| "learning_rate": 0.001, | |
| "loss": 2.7522, | |
| "num_input_tokens_seen": 16305356800, | |
| "step": 15550 | |
| }, | |
| { | |
| "epoch": 0.3426702745892693, | |
| "grad_norm": 0.13101224601268768, | |
| "learning_rate": 0.001, | |
| "loss": 2.7483, | |
| "num_input_tokens_seen": 16357785600, | |
| "step": 15600 | |
| }, | |
| { | |
| "epoch": 0.3437685767514144, | |
| "grad_norm": 0.14006133377552032, | |
| "learning_rate": 0.001, | |
| "loss": 2.7439, | |
| "num_input_tokens_seen": 16410214400, | |
| "step": 15650 | |
| }, | |
| { | |
| "epoch": 0.3448668789135595, | |
| "grad_norm": 0.15062059462070465, | |
| "learning_rate": 0.001, | |
| "loss": 2.7454, | |
| "num_input_tokens_seen": 16462643200, | |
| "step": 15700 | |
| }, | |
| { | |
| "epoch": 0.3459651810757046, | |
| "grad_norm": 0.13822610676288605, | |
| "learning_rate": 0.001, | |
| "loss": 2.74, | |
| "num_input_tokens_seen": 16515072000, | |
| "step": 15750 | |
| }, | |
| { | |
| "epoch": 0.3470634832378497, | |
| "grad_norm": 0.1368207335472107, | |
| "learning_rate": 0.001, | |
| "loss": 2.745, | |
| "num_input_tokens_seen": 16567500800, | |
| "step": 15800 | |
| }, | |
| { | |
| "epoch": 0.34816178539999476, | |
| "grad_norm": 0.14573991298675537, | |
| "learning_rate": 0.001, | |
| "loss": 2.742, | |
| "num_input_tokens_seen": 16619929600, | |
| "step": 15850 | |
| }, | |
| { | |
| "epoch": 0.3492600875621399, | |
| "grad_norm": 12.025542259216309, | |
| "learning_rate": 0.001, | |
| "loss": 3.3278, | |
| "num_input_tokens_seen": 16672358400, | |
| "step": 15900 | |
| }, | |
| { | |
| "epoch": 0.35035838972428496, | |
| "grad_norm": 0.15699023008346558, | |
| "learning_rate": 0.001, | |
| "loss": 4.04, | |
| "num_input_tokens_seen": 16724787200, | |
| "step": 15950 | |
| }, | |
| { | |
| "epoch": 0.3514566918864301, | |
| "grad_norm": 0.13041897118091583, | |
| "learning_rate": 0.001, | |
| "loss": 2.8233, | |
| "num_input_tokens_seen": 16777216000, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 0.3514566918864301, | |
| "eval_loss": 2.689638614654541, | |
| "eval_runtime": 66.0949, | |
| "eval_samples_per_second": 75.649, | |
| "eval_steps_per_second": 18.912, | |
| "num_input_tokens_seen": 16777216000, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 0.35255499404857515, | |
| "grad_norm": 0.1446143537759781, | |
| "learning_rate": 0.001, | |
| "loss": 2.7837, | |
| "num_input_tokens_seen": 16829644800, | |
| "step": 16050 | |
| }, | |
| { | |
| "epoch": 0.3536532962107203, | |
| "grad_norm": 0.12466421723365784, | |
| "learning_rate": 0.001, | |
| "loss": 2.7808, | |
| "num_input_tokens_seen": 16882073600, | |
| "step": 16100 | |
| }, | |
| { | |
| "epoch": 0.35475159837286535, | |
| "grad_norm": 0.13154324889183044, | |
| "learning_rate": 0.001, | |
| "loss": 2.7608, | |
| "num_input_tokens_seen": 16934502400, | |
| "step": 16150 | |
| }, | |
| { | |
| "epoch": 0.3558499005350104, | |
| "grad_norm": 0.12929347157478333, | |
| "learning_rate": 0.001, | |
| "loss": 2.7599, | |
| "num_input_tokens_seen": 16986931200, | |
| "step": 16200 | |
| }, | |
| { | |
| "epoch": 0.35694820269715555, | |
| "grad_norm": 0.12805528938770294, | |
| "learning_rate": 0.001, | |
| "loss": 2.7562, | |
| "num_input_tokens_seen": 17039360000, | |
| "step": 16250 | |
| }, | |
| { | |
| "epoch": 0.3580465048593006, | |
| "grad_norm": 0.12885579466819763, | |
| "learning_rate": 0.001, | |
| "loss": 2.7498, | |
| "num_input_tokens_seen": 17091788800, | |
| "step": 16300 | |
| }, | |
| { | |
| "epoch": 0.35914480702144574, | |
| "grad_norm": 0.14422497153282166, | |
| "learning_rate": 0.001, | |
| "loss": 2.7518, | |
| "num_input_tokens_seen": 17144217600, | |
| "step": 16350 | |
| }, | |
| { | |
| "epoch": 0.3602431091835908, | |
| "grad_norm": 0.13284224271774292, | |
| "learning_rate": 0.001, | |
| "loss": 2.7453, | |
| "num_input_tokens_seen": 17196646400, | |
| "step": 16400 | |
| }, | |
| { | |
| "epoch": 0.3613414113457359, | |
| "grad_norm": 0.1408185362815857, | |
| "learning_rate": 0.001, | |
| "loss": 2.7422, | |
| "num_input_tokens_seen": 17249075200, | |
| "step": 16450 | |
| }, | |
| { | |
| "epoch": 0.362439713507881, | |
| "grad_norm": 0.1295713484287262, | |
| "learning_rate": 0.001, | |
| "loss": 2.7394, | |
| "num_input_tokens_seen": 17301504000, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 0.362439713507881, | |
| "eval_loss": 2.6431446075439453, | |
| "eval_runtime": 65.9239, | |
| "eval_samples_per_second": 75.845, | |
| "eval_steps_per_second": 18.961, | |
| "num_input_tokens_seen": 17301504000, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 0.3635380156700261, | |
| "grad_norm": 0.1245918869972229, | |
| "learning_rate": 0.001, | |
| "loss": 2.7434, | |
| "num_input_tokens_seen": 17353932800, | |
| "step": 16550 | |
| }, | |
| { | |
| "epoch": 0.3646363178321712, | |
| "grad_norm": 0.15865615010261536, | |
| "learning_rate": 0.001, | |
| "loss": 2.7378, | |
| "num_input_tokens_seen": 17406361600, | |
| "step": 16600 | |
| }, | |
| { | |
| "epoch": 0.3657346199943163, | |
| "grad_norm": 0.1391313523054123, | |
| "learning_rate": 0.001, | |
| "loss": 2.7415, | |
| "num_input_tokens_seen": 17458790400, | |
| "step": 16650 | |
| }, | |
| { | |
| "epoch": 0.3668329221564614, | |
| "grad_norm": 0.13604389131069183, | |
| "learning_rate": 0.001, | |
| "loss": 2.7394, | |
| "num_input_tokens_seen": 17511219200, | |
| "step": 16700 | |
| }, | |
| { | |
| "epoch": 0.3679312243186065, | |
| "grad_norm": 0.14926299452781677, | |
| "learning_rate": 0.001, | |
| "loss": 2.732, | |
| "num_input_tokens_seen": 17563648000, | |
| "step": 16750 | |
| }, | |
| { | |
| "epoch": 0.36902952648075155, | |
| "grad_norm": 0.12619628012180328, | |
| "learning_rate": 0.001, | |
| "loss": 2.7275, | |
| "num_input_tokens_seen": 17616076800, | |
| "step": 16800 | |
| }, | |
| { | |
| "epoch": 0.3701278286428967, | |
| "grad_norm": 0.1268402636051178, | |
| "learning_rate": 0.001, | |
| "loss": 2.7309, | |
| "num_input_tokens_seen": 17668505600, | |
| "step": 16850 | |
| }, | |
| { | |
| "epoch": 0.37122613080504174, | |
| "grad_norm": 0.1379624754190445, | |
| "learning_rate": 0.001, | |
| "loss": 2.7266, | |
| "num_input_tokens_seen": 17720934400, | |
| "step": 16900 | |
| }, | |
| { | |
| "epoch": 0.37232443296718687, | |
| "grad_norm": 0.1443478763103485, | |
| "learning_rate": 0.001, | |
| "loss": 2.7321, | |
| "num_input_tokens_seen": 17773363200, | |
| "step": 16950 | |
| }, | |
| { | |
| "epoch": 0.37342273512933194, | |
| "grad_norm": 0.15214091539382935, | |
| "learning_rate": 0.001, | |
| "loss": 2.7284, | |
| "num_input_tokens_seen": 17825792000, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 0.37342273512933194, | |
| "eval_loss": 2.63478946685791, | |
| "eval_runtime": 65.141, | |
| "eval_samples_per_second": 76.757, | |
| "eval_steps_per_second": 19.189, | |
| "num_input_tokens_seen": 17825792000, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 0.374521037291477, | |
| "grad_norm": 0.1361106038093567, | |
| "learning_rate": 0.001, | |
| "loss": 2.7342, | |
| "num_input_tokens_seen": 17878220800, | |
| "step": 17050 | |
| }, | |
| { | |
| "epoch": 0.37561933945362214, | |
| "grad_norm": 0.13839572668075562, | |
| "learning_rate": 0.001, | |
| "loss": 2.7259, | |
| "num_input_tokens_seen": 17930649600, | |
| "step": 17100 | |
| }, | |
| { | |
| "epoch": 0.3767176416157672, | |
| "grad_norm": 0.13055244088172913, | |
| "learning_rate": 0.001, | |
| "loss": 2.7306, | |
| "num_input_tokens_seen": 17983078400, | |
| "step": 17150 | |
| }, | |
| { | |
| "epoch": 0.37781594377791233, | |
| "grad_norm": 0.1444411724805832, | |
| "learning_rate": 0.001, | |
| "loss": 2.7315, | |
| "num_input_tokens_seen": 18035507200, | |
| "step": 17200 | |
| }, | |
| { | |
| "epoch": 0.3789142459400574, | |
| "grad_norm": 0.151028573513031, | |
| "learning_rate": 0.001, | |
| "loss": 2.7211, | |
| "num_input_tokens_seen": 18087936000, | |
| "step": 17250 | |
| }, | |
| { | |
| "epoch": 0.38001254810220253, | |
| "grad_norm": 0.15638011693954468, | |
| "learning_rate": 0.001, | |
| "loss": 2.7269, | |
| "num_input_tokens_seen": 18140364800, | |
| "step": 17300 | |
| }, | |
| { | |
| "epoch": 0.3811108502643476, | |
| "grad_norm": 0.1508658230304718, | |
| "learning_rate": 0.001, | |
| "loss": 2.7263, | |
| "num_input_tokens_seen": 18192793600, | |
| "step": 17350 | |
| }, | |
| { | |
| "epoch": 0.38220915242649267, | |
| "grad_norm": 0.13167701661586761, | |
| "learning_rate": 0.001, | |
| "loss": 2.7296, | |
| "num_input_tokens_seen": 18245222400, | |
| "step": 17400 | |
| }, | |
| { | |
| "epoch": 0.3833074545886378, | |
| "grad_norm": 0.14609253406524658, | |
| "learning_rate": 0.001, | |
| "loss": 2.7249, | |
| "num_input_tokens_seen": 18297651200, | |
| "step": 17450 | |
| }, | |
| { | |
| "epoch": 0.38440575675078287, | |
| "grad_norm": 0.13172782957553864, | |
| "learning_rate": 0.001, | |
| "loss": 2.7252, | |
| "num_input_tokens_seen": 18350080000, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 0.38440575675078287, | |
| "eval_loss": 2.630176544189453, | |
| "eval_runtime": 66.0667, | |
| "eval_samples_per_second": 75.681, | |
| "eval_steps_per_second": 18.92, | |
| "num_input_tokens_seen": 18350080000, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 0.385504058912928, | |
| "grad_norm": 0.149306520819664, | |
| "learning_rate": 0.001, | |
| "loss": 2.7245, | |
| "num_input_tokens_seen": 18402508800, | |
| "step": 17550 | |
| }, | |
| { | |
| "epoch": 0.38660236107507306, | |
| "grad_norm": 0.14191772043704987, | |
| "learning_rate": 0.001, | |
| "loss": 2.7204, | |
| "num_input_tokens_seen": 18454937600, | |
| "step": 17600 | |
| }, | |
| { | |
| "epoch": 0.3877006632372182, | |
| "grad_norm": 0.13731072843074799, | |
| "learning_rate": 0.001, | |
| "loss": 2.7243, | |
| "num_input_tokens_seen": 18507366400, | |
| "step": 17650 | |
| }, | |
| { | |
| "epoch": 0.38879896539936326, | |
| "grad_norm": 0.1466369777917862, | |
| "learning_rate": 0.001, | |
| "loss": 2.7262, | |
| "num_input_tokens_seen": 18559795200, | |
| "step": 17700 | |
| }, | |
| { | |
| "epoch": 0.38989726756150833, | |
| "grad_norm": 0.13290658593177795, | |
| "learning_rate": 0.001, | |
| "loss": 2.7314, | |
| "num_input_tokens_seen": 18612224000, | |
| "step": 17750 | |
| }, | |
| { | |
| "epoch": 0.39099556972365346, | |
| "grad_norm": 0.13785040378570557, | |
| "learning_rate": 0.001, | |
| "loss": 2.7252, | |
| "num_input_tokens_seen": 18664652800, | |
| "step": 17800 | |
| }, | |
| { | |
| "epoch": 0.39209387188579853, | |
| "grad_norm": 0.13384000957012177, | |
| "learning_rate": 0.001, | |
| "loss": 2.7321, | |
| "num_input_tokens_seen": 18717081600, | |
| "step": 17850 | |
| }, | |
| { | |
| "epoch": 0.39319217404794365, | |
| "grad_norm": 0.14927875995635986, | |
| "learning_rate": 0.001, | |
| "loss": 2.7236, | |
| "num_input_tokens_seen": 18769510400, | |
| "step": 17900 | |
| }, | |
| { | |
| "epoch": 0.3942904762100887, | |
| "grad_norm": 0.13494938611984253, | |
| "learning_rate": 0.001, | |
| "loss": 2.7234, | |
| "num_input_tokens_seen": 18821939200, | |
| "step": 17950 | |
| }, | |
| { | |
| "epoch": 0.3953887783722338, | |
| "grad_norm": 0.15054813027381897, | |
| "learning_rate": 0.001, | |
| "loss": 2.7236, | |
| "num_input_tokens_seen": 18874368000, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 0.3953887783722338, | |
| "eval_loss": 2.62626051902771, | |
| "eval_runtime": 65.3965, | |
| "eval_samples_per_second": 76.457, | |
| "eval_steps_per_second": 19.114, | |
| "num_input_tokens_seen": 18874368000, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 0.3964870805343789, | |
| "grad_norm": 0.1353403478860855, | |
| "learning_rate": 0.001, | |
| "loss": 2.724, | |
| "num_input_tokens_seen": 18926796800, | |
| "step": 18050 | |
| }, | |
| { | |
| "epoch": 0.397585382696524, | |
| "grad_norm": 0.15004459023475647, | |
| "learning_rate": 0.001, | |
| "loss": 2.717, | |
| "num_input_tokens_seen": 18979225600, | |
| "step": 18100 | |
| }, | |
| { | |
| "epoch": 0.3986836848586691, | |
| "grad_norm": 0.1293007880449295, | |
| "learning_rate": 0.001, | |
| "loss": 2.7187, | |
| "num_input_tokens_seen": 19031654400, | |
| "step": 18150 | |
| }, | |
| { | |
| "epoch": 0.3997819870208142, | |
| "grad_norm": 0.16373878717422485, | |
| "learning_rate": 0.001, | |
| "loss": 2.7217, | |
| "num_input_tokens_seen": 19084083200, | |
| "step": 18200 | |
| }, | |
| { | |
| "epoch": 0.4008802891829593, | |
| "grad_norm": 0.1529611349105835, | |
| "learning_rate": 0.001, | |
| "loss": 2.722, | |
| "num_input_tokens_seen": 19136512000, | |
| "step": 18250 | |
| }, | |
| { | |
| "epoch": 0.4019785913451044, | |
| "grad_norm": 0.14109951257705688, | |
| "learning_rate": 0.001, | |
| "loss": 2.7232, | |
| "num_input_tokens_seen": 19188940800, | |
| "step": 18300 | |
| }, | |
| { | |
| "epoch": 0.40307689350724946, | |
| "grad_norm": 0.13841493427753448, | |
| "learning_rate": 0.001, | |
| "loss": 2.7195, | |
| "num_input_tokens_seen": 19241369600, | |
| "step": 18350 | |
| }, | |
| { | |
| "epoch": 0.4041751956693946, | |
| "grad_norm": 0.13508476316928864, | |
| "learning_rate": 0.001, | |
| "loss": 2.7166, | |
| "num_input_tokens_seen": 19293798400, | |
| "step": 18400 | |
| }, | |
| { | |
| "epoch": 0.40527349783153965, | |
| "grad_norm": 0.1372646540403366, | |
| "learning_rate": 0.001, | |
| "loss": 2.7212, | |
| "num_input_tokens_seen": 19346227200, | |
| "step": 18450 | |
| }, | |
| { | |
| "epoch": 0.4063717999936848, | |
| "grad_norm": 0.1485033482313156, | |
| "learning_rate": 0.001, | |
| "loss": 2.7186, | |
| "num_input_tokens_seen": 19398656000, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 0.4063717999936848, | |
| "eval_loss": 2.622330904006958, | |
| "eval_runtime": 66.3601, | |
| "eval_samples_per_second": 75.346, | |
| "eval_steps_per_second": 18.837, | |
| "num_input_tokens_seen": 19398656000, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 0.40747010215582985, | |
| "grad_norm": 0.1484711617231369, | |
| "learning_rate": 0.001, | |
| "loss": 2.7235, | |
| "num_input_tokens_seen": 19451084800, | |
| "step": 18550 | |
| }, | |
| { | |
| "epoch": 0.408568404317975, | |
| "grad_norm": 0.141770601272583, | |
| "learning_rate": 0.001, | |
| "loss": 2.7225, | |
| "num_input_tokens_seen": 19503513600, | |
| "step": 18600 | |
| }, | |
| { | |
| "epoch": 0.40966670648012005, | |
| "grad_norm": 0.1213323250412941, | |
| "learning_rate": 0.001, | |
| "loss": 2.7212, | |
| "num_input_tokens_seen": 19555942400, | |
| "step": 18650 | |
| }, | |
| { | |
| "epoch": 0.4107650086422651, | |
| "grad_norm": 0.14149373769760132, | |
| "learning_rate": 0.001, | |
| "loss": 2.7181, | |
| "num_input_tokens_seen": 19608371200, | |
| "step": 18700 | |
| }, | |
| { | |
| "epoch": 0.41186331080441024, | |
| "grad_norm": 0.13964049518108368, | |
| "learning_rate": 0.001, | |
| "loss": 2.7147, | |
| "num_input_tokens_seen": 19660800000, | |
| "step": 18750 | |
| }, | |
| { | |
| "epoch": 0.4129616129665553, | |
| "grad_norm": 0.1384592205286026, | |
| "learning_rate": 0.001, | |
| "loss": 2.7141, | |
| "num_input_tokens_seen": 19713228800, | |
| "step": 18800 | |
| }, | |
| { | |
| "epoch": 0.41405991512870044, | |
| "grad_norm": 0.15027381479740143, | |
| "learning_rate": 0.001, | |
| "loss": 2.7185, | |
| "num_input_tokens_seen": 19765657600, | |
| "step": 18850 | |
| }, | |
| { | |
| "epoch": 0.4151582172908455, | |
| "grad_norm": 0.15221597254276276, | |
| "learning_rate": 0.001, | |
| "loss": 2.7206, | |
| "num_input_tokens_seen": 19818086400, | |
| "step": 18900 | |
| }, | |
| { | |
| "epoch": 0.4162565194529906, | |
| "grad_norm": 0.1272735893726349, | |
| "learning_rate": 0.001, | |
| "loss": 2.7183, | |
| "num_input_tokens_seen": 19870515200, | |
| "step": 18950 | |
| }, | |
| { | |
| "epoch": 0.4173548216151357, | |
| "grad_norm": 0.1258268654346466, | |
| "learning_rate": 0.001, | |
| "loss": 2.7117, | |
| "num_input_tokens_seen": 19922944000, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 0.4173548216151357, | |
| "eval_loss": 2.619187116622925, | |
| "eval_runtime": 65.7537, | |
| "eval_samples_per_second": 76.041, | |
| "eval_steps_per_second": 19.01, | |
| "num_input_tokens_seen": 19922944000, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 0.4184531237772808, | |
| "grad_norm": 0.12389284372329712, | |
| "learning_rate": 0.001, | |
| "loss": 2.7222, | |
| "num_input_tokens_seen": 19975372800, | |
| "step": 19050 | |
| }, | |
| { | |
| "epoch": 0.4195514259394259, | |
| "grad_norm": 0.14157339930534363, | |
| "learning_rate": 0.001, | |
| "loss": 2.7178, | |
| "num_input_tokens_seen": 20027801600, | |
| "step": 19100 | |
| }, | |
| { | |
| "epoch": 0.420649728101571, | |
| "grad_norm": 0.1490466445684433, | |
| "learning_rate": 0.001, | |
| "loss": 2.7185, | |
| "num_input_tokens_seen": 20080230400, | |
| "step": 19150 | |
| }, | |
| { | |
| "epoch": 0.4217480302637161, | |
| "grad_norm": 0.14112494885921478, | |
| "learning_rate": 0.001, | |
| "loss": 2.7166, | |
| "num_input_tokens_seen": 20132659200, | |
| "step": 19200 | |
| }, | |
| { | |
| "epoch": 0.42284633242586117, | |
| "grad_norm": 0.13986504077911377, | |
| "learning_rate": 0.001, | |
| "loss": 2.7201, | |
| "num_input_tokens_seen": 20185088000, | |
| "step": 19250 | |
| }, | |
| { | |
| "epoch": 0.42394463458800624, | |
| "grad_norm": 0.14087803661823273, | |
| "learning_rate": 0.001, | |
| "loss": 2.7175, | |
| "num_input_tokens_seen": 20237516800, | |
| "step": 19300 | |
| }, | |
| { | |
| "epoch": 0.42504293675015137, | |
| "grad_norm": 0.165438711643219, | |
| "learning_rate": 0.001, | |
| "loss": 2.7155, | |
| "num_input_tokens_seen": 20289945600, | |
| "step": 19350 | |
| }, | |
| { | |
| "epoch": 0.42614123891229644, | |
| "grad_norm": 0.132109135389328, | |
| "learning_rate": 0.001, | |
| "loss": 2.7116, | |
| "num_input_tokens_seen": 20342374400, | |
| "step": 19400 | |
| }, | |
| { | |
| "epoch": 0.42723954107444156, | |
| "grad_norm": 0.1372772753238678, | |
| "learning_rate": 0.001, | |
| "loss": 2.7137, | |
| "num_input_tokens_seen": 20394803200, | |
| "step": 19450 | |
| }, | |
| { | |
| "epoch": 0.42833784323658664, | |
| "grad_norm": 0.1470147669315338, | |
| "learning_rate": 0.001, | |
| "loss": 2.7081, | |
| "num_input_tokens_seen": 20447232000, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 0.42833784323658664, | |
| "eval_loss": 2.615947961807251, | |
| "eval_runtime": 65.588, | |
| "eval_samples_per_second": 76.233, | |
| "eval_steps_per_second": 19.058, | |
| "num_input_tokens_seen": 20447232000, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 0.42943614539873176, | |
| "grad_norm": 0.15671676397323608, | |
| "learning_rate": 0.001, | |
| "loss": 2.7176, | |
| "num_input_tokens_seen": 20499660800, | |
| "step": 19550 | |
| }, | |
| { | |
| "epoch": 0.43053444756087683, | |
| "grad_norm": 0.13104794919490814, | |
| "learning_rate": 0.001, | |
| "loss": 2.7108, | |
| "num_input_tokens_seen": 20552089600, | |
| "step": 19600 | |
| }, | |
| { | |
| "epoch": 0.4316327497230219, | |
| "grad_norm": 0.14532406628131866, | |
| "learning_rate": 0.001, | |
| "loss": 2.7087, | |
| "num_input_tokens_seen": 20604518400, | |
| "step": 19650 | |
| }, | |
| { | |
| "epoch": 0.43273105188516703, | |
| "grad_norm": 0.16199354827404022, | |
| "learning_rate": 0.001, | |
| "loss": 2.7178, | |
| "num_input_tokens_seen": 20656947200, | |
| "step": 19700 | |
| }, | |
| { | |
| "epoch": 0.4338293540473121, | |
| "grad_norm": 0.13537316024303436, | |
| "learning_rate": 0.001, | |
| "loss": 2.7124, | |
| "num_input_tokens_seen": 20709376000, | |
| "step": 19750 | |
| }, | |
| { | |
| "epoch": 0.4349276562094572, | |
| "grad_norm": 0.15098537504673004, | |
| "learning_rate": 0.001, | |
| "loss": 2.7119, | |
| "num_input_tokens_seen": 20761804800, | |
| "step": 19800 | |
| }, | |
| { | |
| "epoch": 0.4360259583716023, | |
| "grad_norm": 0.21563659608364105, | |
| "learning_rate": 0.001, | |
| "loss": 2.7118, | |
| "num_input_tokens_seen": 20814233600, | |
| "step": 19850 | |
| }, | |
| { | |
| "epoch": 0.43712426053374737, | |
| "grad_norm": 0.15981121361255646, | |
| "learning_rate": 0.001, | |
| "loss": 2.7043, | |
| "num_input_tokens_seen": 20866662400, | |
| "step": 19900 | |
| }, | |
| { | |
| "epoch": 0.4382225626958925, | |
| "grad_norm": 0.15192069113254547, | |
| "learning_rate": 0.001, | |
| "loss": 2.7137, | |
| "num_input_tokens_seen": 20919091200, | |
| "step": 19950 | |
| }, | |
| { | |
| "epoch": 0.43932086485803756, | |
| "grad_norm": 0.14211437106132507, | |
| "learning_rate": 0.001, | |
| "loss": 2.7128, | |
| "num_input_tokens_seen": 20971520000, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 0.43932086485803756, | |
| "eval_loss": 2.611689567565918, | |
| "eval_runtime": 66.3456, | |
| "eval_samples_per_second": 75.363, | |
| "eval_steps_per_second": 18.841, | |
| "num_input_tokens_seen": 20971520000, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 0.4404191670201827, | |
| "grad_norm": 0.14489957690238953, | |
| "learning_rate": 0.001, | |
| "loss": 2.7139, | |
| "num_input_tokens_seen": 21023948800, | |
| "step": 20050 | |
| }, | |
| { | |
| "epoch": 0.44151746918232776, | |
| "grad_norm": 0.13994646072387695, | |
| "learning_rate": 0.001, | |
| "loss": 2.7091, | |
| "num_input_tokens_seen": 21076377600, | |
| "step": 20100 | |
| }, | |
| { | |
| "epoch": 0.4426157713444729, | |
| "grad_norm": 0.17211903631687164, | |
| "learning_rate": 0.001, | |
| "loss": 2.7176, | |
| "num_input_tokens_seen": 21128806400, | |
| "step": 20150 | |
| }, | |
| { | |
| "epoch": 0.44371407350661796, | |
| "grad_norm": 0.16364862024784088, | |
| "learning_rate": 0.001, | |
| "loss": 2.7181, | |
| "num_input_tokens_seen": 21181235200, | |
| "step": 20200 | |
| }, | |
| { | |
| "epoch": 0.444812375668763, | |
| "grad_norm": 0.14166216552257538, | |
| "learning_rate": 0.001, | |
| "loss": 2.7127, | |
| "num_input_tokens_seen": 21233664000, | |
| "step": 20250 | |
| }, | |
| { | |
| "epoch": 0.44591067783090815, | |
| "grad_norm": 0.12995755672454834, | |
| "learning_rate": 0.001, | |
| "loss": 2.7085, | |
| "num_input_tokens_seen": 21286092800, | |
| "step": 20300 | |
| }, | |
| { | |
| "epoch": 0.4470089799930532, | |
| "grad_norm": 0.15717202425003052, | |
| "learning_rate": 0.001, | |
| "loss": 2.7071, | |
| "num_input_tokens_seen": 21338521600, | |
| "step": 20350 | |
| }, | |
| { | |
| "epoch": 0.44810728215519835, | |
| "grad_norm": 0.13354860246181488, | |
| "learning_rate": 0.001, | |
| "loss": 2.7094, | |
| "num_input_tokens_seen": 21390950400, | |
| "step": 20400 | |
| }, | |
| { | |
| "epoch": 0.4492055843173434, | |
| "grad_norm": 0.16004188358783722, | |
| "learning_rate": 0.001, | |
| "loss": 2.7109, | |
| "num_input_tokens_seen": 21443379200, | |
| "step": 20450 | |
| }, | |
| { | |
| "epoch": 0.45030388647948855, | |
| "grad_norm": 0.148077592253685, | |
| "learning_rate": 0.001, | |
| "loss": 2.7058, | |
| "num_input_tokens_seen": 21495808000, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 0.45030388647948855, | |
| "eval_loss": 2.6089115142822266, | |
| "eval_runtime": 65.5589, | |
| "eval_samples_per_second": 76.267, | |
| "eval_steps_per_second": 19.067, | |
| "num_input_tokens_seen": 21495808000, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 0.4514021886416336, | |
| "grad_norm": 0.16992634534835815, | |
| "learning_rate": 0.001, | |
| "loss": 2.7026, | |
| "num_input_tokens_seen": 21548236800, | |
| "step": 20550 | |
| }, | |
| { | |
| "epoch": 0.4525004908037787, | |
| "grad_norm": 0.14876551926136017, | |
| "learning_rate": 0.001, | |
| "loss": 2.7105, | |
| "num_input_tokens_seen": 21600665600, | |
| "step": 20600 | |
| }, | |
| { | |
| "epoch": 0.4535987929659238, | |
| "grad_norm": 0.16025613248348236, | |
| "learning_rate": 0.001, | |
| "loss": 2.707, | |
| "num_input_tokens_seen": 21653094400, | |
| "step": 20650 | |
| }, | |
| { | |
| "epoch": 0.4546970951280689, | |
| "grad_norm": 0.14609012007713318, | |
| "learning_rate": 0.001, | |
| "loss": 2.7086, | |
| "num_input_tokens_seen": 21705523200, | |
| "step": 20700 | |
| }, | |
| { | |
| "epoch": 0.455795397290214, | |
| "grad_norm": 0.14725832641124725, | |
| "learning_rate": 0.001, | |
| "loss": 2.7075, | |
| "num_input_tokens_seen": 21757952000, | |
| "step": 20750 | |
| }, | |
| { | |
| "epoch": 0.4568936994523591, | |
| "grad_norm": 0.1736454963684082, | |
| "learning_rate": 0.001, | |
| "loss": 2.7033, | |
| "num_input_tokens_seen": 21810380800, | |
| "step": 20800 | |
| }, | |
| { | |
| "epoch": 0.45799200161450415, | |
| "grad_norm": 0.14904257655143738, | |
| "learning_rate": 0.001, | |
| "loss": 2.7012, | |
| "num_input_tokens_seen": 21862809600, | |
| "step": 20850 | |
| }, | |
| { | |
| "epoch": 0.4590903037766493, | |
| "grad_norm": 0.14407765865325928, | |
| "learning_rate": 0.001, | |
| "loss": 2.7055, | |
| "num_input_tokens_seen": 21915238400, | |
| "step": 20900 | |
| }, | |
| { | |
| "epoch": 0.46018860593879435, | |
| "grad_norm": 0.13943473994731903, | |
| "learning_rate": 0.001, | |
| "loss": 2.6999, | |
| "num_input_tokens_seen": 21967667200, | |
| "step": 20950 | |
| }, | |
| { | |
| "epoch": 0.4612869081009395, | |
| "grad_norm": 0.1592896729707718, | |
| "learning_rate": 0.001, | |
| "loss": 2.7072, | |
| "num_input_tokens_seen": 22020096000, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 0.4612869081009395, | |
| "eval_loss": 2.605719566345215, | |
| "eval_runtime": 65.6879, | |
| "eval_samples_per_second": 76.117, | |
| "eval_steps_per_second": 19.029, | |
| "num_input_tokens_seen": 22020096000, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 0.46238521026308455, | |
| "grad_norm": 0.1428702473640442, | |
| "learning_rate": 0.001, | |
| "loss": 2.7042, | |
| "num_input_tokens_seen": 22072524800, | |
| "step": 21050 | |
| }, | |
| { | |
| "epoch": 0.46348351242522967, | |
| "grad_norm": 0.13529072701931, | |
| "learning_rate": 0.001, | |
| "loss": 2.7093, | |
| "num_input_tokens_seen": 22124953600, | |
| "step": 21100 | |
| }, | |
| { | |
| "epoch": 0.46458181458737474, | |
| "grad_norm": 0.17529748380184174, | |
| "learning_rate": 0.001, | |
| "loss": 2.713, | |
| "num_input_tokens_seen": 22177382400, | |
| "step": 21150 | |
| }, | |
| { | |
| "epoch": 0.4656801167495198, | |
| "grad_norm": 0.1479254513978958, | |
| "learning_rate": 0.001, | |
| "loss": 2.6984, | |
| "num_input_tokens_seen": 22229811200, | |
| "step": 21200 | |
| }, | |
| { | |
| "epoch": 0.46677841891166494, | |
| "grad_norm": 0.15110637247562408, | |
| "learning_rate": 0.001, | |
| "loss": 2.7128, | |
| "num_input_tokens_seen": 22282240000, | |
| "step": 21250 | |
| }, | |
| { | |
| "epoch": 0.46787672107381, | |
| "grad_norm": 0.13746944069862366, | |
| "learning_rate": 0.001, | |
| "loss": 2.7036, | |
| "num_input_tokens_seen": 22334668800, | |
| "step": 21300 | |
| }, | |
| { | |
| "epoch": 0.46897502323595514, | |
| "grad_norm": 0.17940136790275574, | |
| "learning_rate": 0.001, | |
| "loss": 2.7048, | |
| "num_input_tokens_seen": 22387097600, | |
| "step": 21350 | |
| }, | |
| { | |
| "epoch": 0.4700733253981002, | |
| "grad_norm": 0.14203256368637085, | |
| "learning_rate": 0.001, | |
| "loss": 2.6997, | |
| "num_input_tokens_seen": 22439526400, | |
| "step": 21400 | |
| }, | |
| { | |
| "epoch": 0.47117162756024533, | |
| "grad_norm": 0.14260704815387726, | |
| "learning_rate": 0.001, | |
| "loss": 2.7092, | |
| "num_input_tokens_seen": 22491955200, | |
| "step": 21450 | |
| }, | |
| { | |
| "epoch": 0.4722699297223904, | |
| "grad_norm": 0.16455897688865662, | |
| "learning_rate": 0.001, | |
| "loss": 2.6969, | |
| "num_input_tokens_seen": 22544384000, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 0.4722699297223904, | |
| "eval_loss": 2.60367751121521, | |
| "eval_runtime": 65.4304, | |
| "eval_samples_per_second": 76.417, | |
| "eval_steps_per_second": 19.104, | |
| "num_input_tokens_seen": 22544384000, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 0.4733682318845355, | |
| "grad_norm": 0.1529170274734497, | |
| "learning_rate": 0.001, | |
| "loss": 2.7003, | |
| "num_input_tokens_seen": 22596812800, | |
| "step": 21550 | |
| }, | |
| { | |
| "epoch": 0.4744665340466806, | |
| "grad_norm": 0.1921636164188385, | |
| "learning_rate": 0.001, | |
| "loss": 2.7014, | |
| "num_input_tokens_seen": 22649241600, | |
| "step": 21600 | |
| }, | |
| { | |
| "epoch": 0.47556483620882567, | |
| "grad_norm": 0.16029173135757446, | |
| "learning_rate": 0.001, | |
| "loss": 2.7028, | |
| "num_input_tokens_seen": 22701670400, | |
| "step": 21650 | |
| }, | |
| { | |
| "epoch": 0.4766631383709708, | |
| "grad_norm": 0.14740578830242157, | |
| "learning_rate": 0.001, | |
| "loss": 2.7019, | |
| "num_input_tokens_seen": 22754099200, | |
| "step": 21700 | |
| }, | |
| { | |
| "epoch": 0.47776144053311587, | |
| "grad_norm": 0.1734548658132553, | |
| "learning_rate": 0.001, | |
| "loss": 2.6985, | |
| "num_input_tokens_seen": 22806528000, | |
| "step": 21750 | |
| }, | |
| { | |
| "epoch": 0.47885974269526094, | |
| "grad_norm": 0.15502890944480896, | |
| "learning_rate": 0.001, | |
| "loss": 2.6973, | |
| "num_input_tokens_seen": 22858956800, | |
| "step": 21800 | |
| }, | |
| { | |
| "epoch": 0.47995804485740606, | |
| "grad_norm": 0.16783900558948517, | |
| "learning_rate": 0.001, | |
| "loss": 2.7003, | |
| "num_input_tokens_seen": 22911385600, | |
| "step": 21850 | |
| }, | |
| { | |
| "epoch": 0.48105634701955113, | |
| "grad_norm": 0.14911381900310516, | |
| "learning_rate": 0.001, | |
| "loss": 2.6992, | |
| "num_input_tokens_seen": 22963814400, | |
| "step": 21900 | |
| }, | |
| { | |
| "epoch": 0.48215464918169626, | |
| "grad_norm": 0.15027394890785217, | |
| "learning_rate": 0.001, | |
| "loss": 2.6957, | |
| "num_input_tokens_seen": 23016243200, | |
| "step": 21950 | |
| }, | |
| { | |
| "epoch": 0.48325295134384133, | |
| "grad_norm": 0.1261301189661026, | |
| "learning_rate": 0.001, | |
| "loss": 2.7064, | |
| "num_input_tokens_seen": 23068672000, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 0.48325295134384133, | |
| "eval_loss": 2.6012015342712402, | |
| "eval_runtime": 64.9701, | |
| "eval_samples_per_second": 76.958, | |
| "eval_steps_per_second": 19.24, | |
| "num_input_tokens_seen": 23068672000, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 0.48435125350598646, | |
| "grad_norm": 0.15728288888931274, | |
| "learning_rate": 0.001, | |
| "loss": 2.703, | |
| "num_input_tokens_seen": 23121100800, | |
| "step": 22050 | |
| }, | |
| { | |
| "epoch": 0.4854495556681315, | |
| "grad_norm": 0.13599443435668945, | |
| "learning_rate": 0.001, | |
| "loss": 2.6984, | |
| "num_input_tokens_seen": 23173529600, | |
| "step": 22100 | |
| }, | |
| { | |
| "epoch": 0.4865478578302766, | |
| "grad_norm": 0.25702551007270813, | |
| "learning_rate": 0.001, | |
| "loss": 2.9388, | |
| "num_input_tokens_seen": 23225958400, | |
| "step": 22150 | |
| }, | |
| { | |
| "epoch": 0.4876461599924217, | |
| "grad_norm": 0.12942279875278473, | |
| "learning_rate": 0.001, | |
| "loss": 2.7568, | |
| "num_input_tokens_seen": 23278383360, | |
| "step": 22200 | |
| }, | |
| { | |
| "epoch": 0.4887444621545668, | |
| "grad_norm": 0.12908817827701569, | |
| "learning_rate": 0.001, | |
| "loss": 2.7195, | |
| "num_input_tokens_seen": 23330812160, | |
| "step": 22250 | |
| }, | |
| { | |
| "epoch": 0.4898427643167119, | |
| "grad_norm": 0.1351587176322937, | |
| "learning_rate": 0.001, | |
| "loss": 2.7155, | |
| "num_input_tokens_seen": 23383240960, | |
| "step": 22300 | |
| }, | |
| { | |
| "epoch": 0.490941066478857, | |
| "grad_norm": 0.1245250552892685, | |
| "learning_rate": 0.001, | |
| "loss": 2.7074, | |
| "num_input_tokens_seen": 23435669760, | |
| "step": 22350 | |
| }, | |
| { | |
| "epoch": 0.4920393686410021, | |
| "grad_norm": 0.13818837702274323, | |
| "learning_rate": 0.001, | |
| "loss": 2.7064, | |
| "num_input_tokens_seen": 23488098560, | |
| "step": 22400 | |
| }, | |
| { | |
| "epoch": 0.4931376708031472, | |
| "grad_norm": 0.15505041182041168, | |
| "learning_rate": 0.001, | |
| "loss": 2.7044, | |
| "num_input_tokens_seen": 23540527360, | |
| "step": 22450 | |
| }, | |
| { | |
| "epoch": 0.49423597296529226, | |
| "grad_norm": 0.14414137601852417, | |
| "learning_rate": 0.001, | |
| "loss": 2.7046, | |
| "num_input_tokens_seen": 23592956160, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 0.49423597296529226, | |
| "eval_loss": 2.60188627243042, | |
| "eval_runtime": 67.3268, | |
| "eval_samples_per_second": 74.265, | |
| "eval_steps_per_second": 18.566, | |
| "num_input_tokens_seen": 23592956160, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 0.4953342751274374, | |
| "grad_norm": 0.14763414859771729, | |
| "learning_rate": 0.001, | |
| "loss": 2.695, | |
| "num_input_tokens_seen": 23645384960, | |
| "step": 22550 | |
| }, | |
| { | |
| "epoch": 0.49643257728958246, | |
| "grad_norm": 0.14800110459327698, | |
| "learning_rate": 0.001, | |
| "loss": 2.6939, | |
| "num_input_tokens_seen": 23697813760, | |
| "step": 22600 | |
| }, | |
| { | |
| "epoch": 0.4975308794517276, | |
| "grad_norm": 0.13590902090072632, | |
| "learning_rate": 0.001, | |
| "loss": 2.6967, | |
| "num_input_tokens_seen": 23750242560, | |
| "step": 22650 | |
| }, | |
| { | |
| "epoch": 0.49862918161387265, | |
| "grad_norm": 0.1315733939409256, | |
| "learning_rate": 0.001, | |
| "loss": 2.6909, | |
| "num_input_tokens_seen": 23802671360, | |
| "step": 22700 | |
| }, | |
| { | |
| "epoch": 0.4997274837760177, | |
| "grad_norm": 0.13714700937271118, | |
| "learning_rate": 0.001, | |
| "loss": 2.6957, | |
| "num_input_tokens_seen": 23855100160, | |
| "step": 22750 | |
| }, | |
| { | |
| "epoch": 0.5008257859381628, | |
| "grad_norm": 0.1412438154220581, | |
| "learning_rate": 0.001, | |
| "loss": 2.6977, | |
| "num_input_tokens_seen": 23907528960, | |
| "step": 22800 | |
| }, | |
| { | |
| "epoch": 0.501924088100308, | |
| "grad_norm": 0.15368172526359558, | |
| "learning_rate": 0.001, | |
| "loss": 2.6977, | |
| "num_input_tokens_seen": 23959957760, | |
| "step": 22850 | |
| }, | |
| { | |
| "epoch": 0.503022390262453, | |
| "grad_norm": 0.14018824696540833, | |
| "learning_rate": 0.001, | |
| "loss": 2.6992, | |
| "num_input_tokens_seen": 24012386560, | |
| "step": 22900 | |
| }, | |
| { | |
| "epoch": 0.5041206924245981, | |
| "grad_norm": 0.1284814178943634, | |
| "learning_rate": 0.001, | |
| "loss": 2.6962, | |
| "num_input_tokens_seen": 24064815360, | |
| "step": 22950 | |
| }, | |
| { | |
| "epoch": 0.5052189945867432, | |
| "grad_norm": 0.15145835280418396, | |
| "learning_rate": 0.001, | |
| "loss": 2.692, | |
| "num_input_tokens_seen": 24117244160, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 0.5052189945867432, | |
| "eval_loss": 2.5970778465270996, | |
| "eval_runtime": 66.1666, | |
| "eval_samples_per_second": 75.567, | |
| "eval_steps_per_second": 18.892, | |
| "num_input_tokens_seen": 24117244160, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 0.5063172967488883, | |
| "grad_norm": 0.15117652714252472, | |
| "learning_rate": 0.001, | |
| "loss": 2.696, | |
| "num_input_tokens_seen": 24169672960, | |
| "step": 23050 | |
| }, | |
| { | |
| "epoch": 0.5074155989110334, | |
| "grad_norm": 0.15605470538139343, | |
| "learning_rate": 0.001, | |
| "loss": 2.6918, | |
| "num_input_tokens_seen": 24222101760, | |
| "step": 23100 | |
| }, | |
| { | |
| "epoch": 0.5085139010731785, | |
| "grad_norm": 0.17503651976585388, | |
| "learning_rate": 0.001, | |
| "loss": 2.688, | |
| "num_input_tokens_seen": 24274530560, | |
| "step": 23150 | |
| }, | |
| { | |
| "epoch": 0.5096122032353236, | |
| "grad_norm": 0.1622135490179062, | |
| "learning_rate": 0.001, | |
| "loss": 2.6949, | |
| "num_input_tokens_seen": 24326959360, | |
| "step": 23200 | |
| }, | |
| { | |
| "epoch": 0.5107105053974687, | |
| "grad_norm": 0.1331271231174469, | |
| "learning_rate": 0.001, | |
| "loss": 2.6876, | |
| "num_input_tokens_seen": 24379388160, | |
| "step": 23250 | |
| }, | |
| { | |
| "epoch": 0.5118088075596138, | |
| "grad_norm": 0.14365510642528534, | |
| "learning_rate": 0.001, | |
| "loss": 2.7027, | |
| "num_input_tokens_seen": 24431816960, | |
| "step": 23300 | |
| }, | |
| { | |
| "epoch": 0.5129071097217589, | |
| "grad_norm": 0.13621902465820312, | |
| "learning_rate": 0.001, | |
| "loss": 2.6946, | |
| "num_input_tokens_seen": 24484245760, | |
| "step": 23350 | |
| }, | |
| { | |
| "epoch": 0.5140054118839039, | |
| "grad_norm": 0.12506547570228577, | |
| "learning_rate": 0.001, | |
| "loss": 2.6864, | |
| "num_input_tokens_seen": 24536674560, | |
| "step": 23400 | |
| }, | |
| { | |
| "epoch": 0.515103714046049, | |
| "grad_norm": 0.12824128568172455, | |
| "learning_rate": 0.001, | |
| "loss": 2.6871, | |
| "num_input_tokens_seen": 24589103360, | |
| "step": 23450 | |
| }, | |
| { | |
| "epoch": 0.5162020162081942, | |
| "grad_norm": 0.14310036599636078, | |
| "learning_rate": 0.001, | |
| "loss": 2.6936, | |
| "num_input_tokens_seen": 24641532160, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 0.5162020162081942, | |
| "eval_loss": 2.592362880706787, | |
| "eval_runtime": 66.663, | |
| "eval_samples_per_second": 75.004, | |
| "eval_steps_per_second": 18.751, | |
| "num_input_tokens_seen": 24641532160, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 0.5173003183703393, | |
| "grad_norm": 0.1362077295780182, | |
| "learning_rate": 0.001, | |
| "loss": 2.6924, | |
| "num_input_tokens_seen": 24693960960, | |
| "step": 23550 | |
| }, | |
| { | |
| "epoch": 0.5183986205324843, | |
| "grad_norm": 0.13662473857402802, | |
| "learning_rate": 0.001, | |
| "loss": 2.6972, | |
| "num_input_tokens_seen": 24746389760, | |
| "step": 23600 | |
| }, | |
| { | |
| "epoch": 0.5194969226946294, | |
| "grad_norm": 0.12603560090065002, | |
| "learning_rate": 0.001, | |
| "loss": 2.6908, | |
| "num_input_tokens_seen": 24798818560, | |
| "step": 23650 | |
| }, | |
| { | |
| "epoch": 0.5205952248567746, | |
| "grad_norm": 0.16597150266170502, | |
| "learning_rate": 0.001, | |
| "loss": 2.6882, | |
| "num_input_tokens_seen": 24851247360, | |
| "step": 23700 | |
| }, | |
| { | |
| "epoch": 0.5216935270189196, | |
| "grad_norm": 0.13665246963500977, | |
| "learning_rate": 0.001, | |
| "loss": 2.6958, | |
| "num_input_tokens_seen": 24903676160, | |
| "step": 23750 | |
| }, | |
| { | |
| "epoch": 0.5227918291810647, | |
| "grad_norm": 0.14349523186683655, | |
| "learning_rate": 0.001, | |
| "loss": 2.6874, | |
| "num_input_tokens_seen": 24956104960, | |
| "step": 23800 | |
| }, | |
| { | |
| "epoch": 0.5238901313432098, | |
| "grad_norm": 0.15857954323291779, | |
| "learning_rate": 0.001, | |
| "loss": 2.6882, | |
| "num_input_tokens_seen": 25008533760, | |
| "step": 23850 | |
| }, | |
| { | |
| "epoch": 0.524988433505355, | |
| "grad_norm": 0.15056300163269043, | |
| "learning_rate": 0.001, | |
| "loss": 2.694, | |
| "num_input_tokens_seen": 25060962560, | |
| "step": 23900 | |
| }, | |
| { | |
| "epoch": 0.5260867356675, | |
| "grad_norm": 0.12861080467700958, | |
| "learning_rate": 0.001, | |
| "loss": 2.6899, | |
| "num_input_tokens_seen": 25113391360, | |
| "step": 23950 | |
| }, | |
| { | |
| "epoch": 0.5271850378296451, | |
| "grad_norm": 0.14443258941173553, | |
| "learning_rate": 0.001, | |
| "loss": 2.6929, | |
| "num_input_tokens_seen": 25165820160, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 0.5271850378296451, | |
| "eval_loss": 2.5910630226135254, | |
| "eval_runtime": 66.9014, | |
| "eval_samples_per_second": 74.737, | |
| "eval_steps_per_second": 18.684, | |
| "num_input_tokens_seen": 25165820160, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 0.5282833399917902, | |
| "grad_norm": 0.14083649218082428, | |
| "learning_rate": 0.001, | |
| "loss": 2.6851, | |
| "num_input_tokens_seen": 25218248960, | |
| "step": 24050 | |
| }, | |
| { | |
| "epoch": 0.5293816421539352, | |
| "grad_norm": 0.13934968411922455, | |
| "learning_rate": 0.001, | |
| "loss": 2.6863, | |
| "num_input_tokens_seen": 25270677760, | |
| "step": 24100 | |
| }, | |
| { | |
| "epoch": 0.5304799443160804, | |
| "grad_norm": 0.15416787564754486, | |
| "learning_rate": 0.001, | |
| "loss": 2.6894, | |
| "num_input_tokens_seen": 25323106560, | |
| "step": 24150 | |
| }, | |
| { | |
| "epoch": 0.5315782464782255, | |
| "grad_norm": 0.17290246486663818, | |
| "learning_rate": 0.001, | |
| "loss": 2.6907, | |
| "num_input_tokens_seen": 25375535360, | |
| "step": 24200 | |
| }, | |
| { | |
| "epoch": 0.5326765486403706, | |
| "grad_norm": 0.14260552823543549, | |
| "learning_rate": 0.001, | |
| "loss": 2.6832, | |
| "num_input_tokens_seen": 25427964160, | |
| "step": 24250 | |
| }, | |
| { | |
| "epoch": 0.5337748508025156, | |
| "grad_norm": 0.14795690774917603, | |
| "learning_rate": 0.001, | |
| "loss": 2.6895, | |
| "num_input_tokens_seen": 25480392960, | |
| "step": 24300 | |
| }, | |
| { | |
| "epoch": 0.5348731529646608, | |
| "grad_norm": 0.15009699761867523, | |
| "learning_rate": 0.001, | |
| "loss": 2.6819, | |
| "num_input_tokens_seen": 25532821760, | |
| "step": 24350 | |
| }, | |
| { | |
| "epoch": 0.5359714551268059, | |
| "grad_norm": 0.15425953269004822, | |
| "learning_rate": 0.001, | |
| "loss": 2.6874, | |
| "num_input_tokens_seen": 25585250560, | |
| "step": 24400 | |
| }, | |
| { | |
| "epoch": 0.5370697572889509, | |
| "grad_norm": 0.14639410376548767, | |
| "learning_rate": 0.001, | |
| "loss": 2.6878, | |
| "num_input_tokens_seen": 25637679360, | |
| "step": 24450 | |
| }, | |
| { | |
| "epoch": 0.538168059451096, | |
| "grad_norm": 0.14785613119602203, | |
| "learning_rate": 0.001, | |
| "loss": 2.6841, | |
| "num_input_tokens_seen": 25690108160, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 0.538168059451096, | |
| "eval_loss": 2.5875706672668457, | |
| "eval_runtime": 66.9296, | |
| "eval_samples_per_second": 74.705, | |
| "eval_steps_per_second": 18.676, | |
| "num_input_tokens_seen": 25690108160, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 0.5392663616132412, | |
| "grad_norm": 0.14224180579185486, | |
| "learning_rate": 0.001, | |
| "loss": 2.6876, | |
| "num_input_tokens_seen": 25742536960, | |
| "step": 24550 | |
| }, | |
| { | |
| "epoch": 0.5403646637753863, | |
| "grad_norm": 0.14881493151187897, | |
| "learning_rate": 0.001, | |
| "loss": 2.6827, | |
| "num_input_tokens_seen": 25794965760, | |
| "step": 24600 | |
| }, | |
| { | |
| "epoch": 0.5414629659375313, | |
| "grad_norm": 0.17951786518096924, | |
| "learning_rate": 0.001, | |
| "loss": 2.688, | |
| "num_input_tokens_seen": 25847394560, | |
| "step": 24650 | |
| }, | |
| { | |
| "epoch": 0.5425612680996764, | |
| "grad_norm": 0.1400926560163498, | |
| "learning_rate": 0.001, | |
| "loss": 2.6945, | |
| "num_input_tokens_seen": 25899823360, | |
| "step": 24700 | |
| }, | |
| { | |
| "epoch": 0.5436595702618215, | |
| "grad_norm": 0.1421627402305603, | |
| "learning_rate": 0.001, | |
| "loss": 2.6852, | |
| "num_input_tokens_seen": 25952252160, | |
| "step": 24750 | |
| }, | |
| { | |
| "epoch": 0.5447578724239666, | |
| "grad_norm": 0.1617737114429474, | |
| "learning_rate": 0.001, | |
| "loss": 2.686, | |
| "num_input_tokens_seen": 26004680960, | |
| "step": 24800 | |
| }, | |
| { | |
| "epoch": 0.5458561745861117, | |
| "grad_norm": 0.1523471176624298, | |
| "learning_rate": 0.001, | |
| "loss": 2.6945, | |
| "num_input_tokens_seen": 26057109760, | |
| "step": 24850 | |
| }, | |
| { | |
| "epoch": 0.5469544767482568, | |
| "grad_norm": 0.13078247010707855, | |
| "learning_rate": 0.001, | |
| "loss": 2.6829, | |
| "num_input_tokens_seen": 26109538560, | |
| "step": 24900 | |
| }, | |
| { | |
| "epoch": 0.5480527789104018, | |
| "grad_norm": 0.14831651747226715, | |
| "learning_rate": 0.001, | |
| "loss": 2.6898, | |
| "num_input_tokens_seen": 26161967360, | |
| "step": 24950 | |
| }, | |
| { | |
| "epoch": 0.549151081072547, | |
| "grad_norm": 0.1782410740852356, | |
| "learning_rate": 0.001, | |
| "loss": 2.6871, | |
| "num_input_tokens_seen": 26214396160, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 0.549151081072547, | |
| "eval_loss": 2.5877788066864014, | |
| "eval_runtime": 67.2223, | |
| "eval_samples_per_second": 74.38, | |
| "eval_steps_per_second": 18.595, | |
| "num_input_tokens_seen": 26214396160, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 0.5502493832346921, | |
| "grad_norm": 0.16484692692756653, | |
| "learning_rate": 0.001, | |
| "loss": 2.6843, | |
| "num_input_tokens_seen": 26266824960, | |
| "step": 25050 | |
| }, | |
| { | |
| "epoch": 0.5513476853968372, | |
| "grad_norm": 0.1583317369222641, | |
| "learning_rate": 0.001, | |
| "loss": 2.6825, | |
| "num_input_tokens_seen": 26319253760, | |
| "step": 25100 | |
| }, | |
| { | |
| "epoch": 0.5524459875589822, | |
| "grad_norm": 0.1569424867630005, | |
| "learning_rate": 0.001, | |
| "loss": 2.6787, | |
| "num_input_tokens_seen": 26371682560, | |
| "step": 25150 | |
| }, | |
| { | |
| "epoch": 0.5535442897211273, | |
| "grad_norm": 0.13633306324481964, | |
| "learning_rate": 0.001, | |
| "loss": 2.6872, | |
| "num_input_tokens_seen": 26424111360, | |
| "step": 25200 | |
| }, | |
| { | |
| "epoch": 0.5546425918832725, | |
| "grad_norm": 0.1480533927679062, | |
| "learning_rate": 0.001, | |
| "loss": 2.6842, | |
| "num_input_tokens_seen": 26476540160, | |
| "step": 25250 | |
| }, | |
| { | |
| "epoch": 0.5557408940454175, | |
| "grad_norm": 0.1267666518688202, | |
| "learning_rate": 0.001, | |
| "loss": 2.6839, | |
| "num_input_tokens_seen": 26528968960, | |
| "step": 25300 | |
| }, | |
| { | |
| "epoch": 0.5568391962075626, | |
| "grad_norm": 0.13951599597930908, | |
| "learning_rate": 0.001, | |
| "loss": 2.6799, | |
| "num_input_tokens_seen": 26581397760, | |
| "step": 25350 | |
| }, | |
| { | |
| "epoch": 0.5579374983697077, | |
| "grad_norm": 0.15044580399990082, | |
| "learning_rate": 0.001, | |
| "loss": 2.6846, | |
| "num_input_tokens_seen": 26633826560, | |
| "step": 25400 | |
| }, | |
| { | |
| "epoch": 0.5590358005318529, | |
| "grad_norm": 0.12891829013824463, | |
| "learning_rate": 0.001, | |
| "loss": 2.682, | |
| "num_input_tokens_seen": 26686255360, | |
| "step": 25450 | |
| }, | |
| { | |
| "epoch": 0.5601341026939979, | |
| "grad_norm": 0.12812241911888123, | |
| "learning_rate": 0.001, | |
| "loss": 2.684, | |
| "num_input_tokens_seen": 26738684160, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 0.5601341026939979, | |
| "eval_loss": 2.5832085609436035, | |
| "eval_runtime": 66.9038, | |
| "eval_samples_per_second": 74.734, | |
| "eval_steps_per_second": 18.684, | |
| "num_input_tokens_seen": 26738684160, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 0.561232404856143, | |
| "grad_norm": 0.14243654906749725, | |
| "learning_rate": 0.001, | |
| "loss": 2.6883, | |
| "num_input_tokens_seen": 26791112960, | |
| "step": 25550 | |
| }, | |
| { | |
| "epoch": 0.5623307070182881, | |
| "grad_norm": 0.14436320960521698, | |
| "learning_rate": 0.001, | |
| "loss": 2.6835, | |
| "num_input_tokens_seen": 26843541760, | |
| "step": 25600 | |
| }, | |
| { | |
| "epoch": 0.5634290091804331, | |
| "grad_norm": 0.1516960710287094, | |
| "learning_rate": 0.001, | |
| "loss": 2.6752, | |
| "num_input_tokens_seen": 26895970560, | |
| "step": 25650 | |
| }, | |
| { | |
| "epoch": 0.5645273113425783, | |
| "grad_norm": 0.14002515375614166, | |
| "learning_rate": 0.001, | |
| "loss": 2.6817, | |
| "num_input_tokens_seen": 26948399360, | |
| "step": 25700 | |
| }, | |
| { | |
| "epoch": 0.5656256135047234, | |
| "grad_norm": 0.1379036009311676, | |
| "learning_rate": 0.001, | |
| "loss": 2.6904, | |
| "num_input_tokens_seen": 27000828160, | |
| "step": 25750 | |
| }, | |
| { | |
| "epoch": 0.5667239156668685, | |
| "grad_norm": 0.16127964854240417, | |
| "learning_rate": 0.001, | |
| "loss": 2.6813, | |
| "num_input_tokens_seen": 27053256960, | |
| "step": 25800 | |
| }, | |
| { | |
| "epoch": 0.5678222178290135, | |
| "grad_norm": 0.15714125335216522, | |
| "learning_rate": 0.001, | |
| "loss": 2.6851, | |
| "num_input_tokens_seen": 27105685760, | |
| "step": 25850 | |
| }, | |
| { | |
| "epoch": 0.5689205199911587, | |
| "grad_norm": 0.15288160741329193, | |
| "learning_rate": 0.001, | |
| "loss": 2.6832, | |
| "num_input_tokens_seen": 27158114560, | |
| "step": 25900 | |
| }, | |
| { | |
| "epoch": 0.5700188221533038, | |
| "grad_norm": 0.1398363709449768, | |
| "learning_rate": 0.001, | |
| "loss": 2.6814, | |
| "num_input_tokens_seen": 27210543360, | |
| "step": 25950 | |
| }, | |
| { | |
| "epoch": 0.5711171243154488, | |
| "grad_norm": 0.15253235399723053, | |
| "learning_rate": 0.001, | |
| "loss": 2.6755, | |
| "num_input_tokens_seen": 27262972160, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 0.5711171243154488, | |
| "eval_loss": 2.5809168815612793, | |
| "eval_runtime": 66.151, | |
| "eval_samples_per_second": 75.585, | |
| "eval_steps_per_second": 18.896, | |
| "num_input_tokens_seen": 27262972160, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 0.5722154264775939, | |
| "grad_norm": 0.1538383513689041, | |
| "learning_rate": 0.001, | |
| "loss": 2.6783, | |
| "num_input_tokens_seen": 27315400960, | |
| "step": 26050 | |
| }, | |
| { | |
| "epoch": 0.5733137286397391, | |
| "grad_norm": 0.15545998513698578, | |
| "learning_rate": 0.001, | |
| "loss": 2.6798, | |
| "num_input_tokens_seen": 27367829760, | |
| "step": 26100 | |
| }, | |
| { | |
| "epoch": 0.5744120308018842, | |
| "grad_norm": 0.15456970036029816, | |
| "learning_rate": 0.001, | |
| "loss": 2.6836, | |
| "num_input_tokens_seen": 27420258560, | |
| "step": 26150 | |
| }, | |
| { | |
| "epoch": 0.5755103329640292, | |
| "grad_norm": 0.1353277862071991, | |
| "learning_rate": 0.001, | |
| "loss": 2.6777, | |
| "num_input_tokens_seen": 27472687360, | |
| "step": 26200 | |
| }, | |
| { | |
| "epoch": 0.5766086351261743, | |
| "grad_norm": 0.15124258399009705, | |
| "learning_rate": 0.001, | |
| "loss": 2.681, | |
| "num_input_tokens_seen": 27525116160, | |
| "step": 26250 | |
| }, | |
| { | |
| "epoch": 0.5777069372883195, | |
| "grad_norm": 0.14200901985168457, | |
| "learning_rate": 0.001, | |
| "loss": 2.6827, | |
| "num_input_tokens_seen": 27577544960, | |
| "step": 26300 | |
| }, | |
| { | |
| "epoch": 0.5788052394504645, | |
| "grad_norm": 0.15356388688087463, | |
| "learning_rate": 0.001, | |
| "loss": 2.6802, | |
| "num_input_tokens_seen": 27629973760, | |
| "step": 26350 | |
| }, | |
| { | |
| "epoch": 0.5799035416126096, | |
| "grad_norm": 0.17395390570163727, | |
| "learning_rate": 0.001, | |
| "loss": 2.6921, | |
| "num_input_tokens_seen": 27682402560, | |
| "step": 26400 | |
| }, | |
| { | |
| "epoch": 0.5810018437747547, | |
| "grad_norm": 0.1507692188024521, | |
| "learning_rate": 0.001, | |
| "loss": 2.6811, | |
| "num_input_tokens_seen": 27734831360, | |
| "step": 26450 | |
| }, | |
| { | |
| "epoch": 0.5821001459368998, | |
| "grad_norm": 0.14512786269187927, | |
| "learning_rate": 0.001, | |
| "loss": 2.6798, | |
| "num_input_tokens_seen": 27787260160, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 0.5821001459368998, | |
| "eval_loss": 2.5802626609802246, | |
| "eval_runtime": 67.1032, | |
| "eval_samples_per_second": 74.512, | |
| "eval_steps_per_second": 18.628, | |
| "num_input_tokens_seen": 27787260160, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 0.5831984480990449, | |
| "grad_norm": 0.15365912020206451, | |
| "learning_rate": 0.001, | |
| "loss": 2.6813, | |
| "num_input_tokens_seen": 27839688960, | |
| "step": 26550 | |
| }, | |
| { | |
| "epoch": 0.58429675026119, | |
| "grad_norm": 0.14015646278858185, | |
| "learning_rate": 0.001, | |
| "loss": 2.6774, | |
| "num_input_tokens_seen": 27892117760, | |
| "step": 26600 | |
| }, | |
| { | |
| "epoch": 0.5853950524233351, | |
| "grad_norm": 0.1529797911643982, | |
| "learning_rate": 0.001, | |
| "loss": 2.6751, | |
| "num_input_tokens_seen": 27944546560, | |
| "step": 26650 | |
| }, | |
| { | |
| "epoch": 0.5864933545854801, | |
| "grad_norm": 0.16909636557102203, | |
| "learning_rate": 0.001, | |
| "loss": 2.6795, | |
| "num_input_tokens_seen": 27996975360, | |
| "step": 26700 | |
| }, | |
| { | |
| "epoch": 0.5875916567476253, | |
| "grad_norm": 0.14130276441574097, | |
| "learning_rate": 0.001, | |
| "loss": 2.6809, | |
| "num_input_tokens_seen": 28049404160, | |
| "step": 26750 | |
| }, | |
| { | |
| "epoch": 0.5886899589097704, | |
| "grad_norm": 0.15182790160179138, | |
| "learning_rate": 0.001, | |
| "loss": 2.685, | |
| "num_input_tokens_seen": 28101832960, | |
| "step": 26800 | |
| }, | |
| { | |
| "epoch": 0.5897882610719154, | |
| "grad_norm": 0.12757331132888794, | |
| "learning_rate": 0.001, | |
| "loss": 2.6766, | |
| "num_input_tokens_seen": 28154261760, | |
| "step": 26850 | |
| }, | |
| { | |
| "epoch": 0.5908865632340605, | |
| "grad_norm": 0.1527504026889801, | |
| "learning_rate": 0.001, | |
| "loss": 2.6767, | |
| "num_input_tokens_seen": 28206690560, | |
| "step": 26900 | |
| }, | |
| { | |
| "epoch": 0.5919848653962057, | |
| "grad_norm": 0.18337304890155792, | |
| "learning_rate": 0.001, | |
| "loss": 2.6752, | |
| "num_input_tokens_seen": 28259119360, | |
| "step": 26950 | |
| }, | |
| { | |
| "epoch": 0.5930831675583508, | |
| "grad_norm": 0.1472473442554474, | |
| "learning_rate": 0.001, | |
| "loss": 2.6717, | |
| "num_input_tokens_seen": 28311548160, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 0.5930831675583508, | |
| "eval_loss": 2.5781941413879395, | |
| "eval_runtime": 66.2194, | |
| "eval_samples_per_second": 75.507, | |
| "eval_steps_per_second": 18.877, | |
| "num_input_tokens_seen": 28311548160, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 0.5941814697204958, | |
| "grad_norm": 0.15350718796253204, | |
| "learning_rate": 0.001, | |
| "loss": 2.6787, | |
| "num_input_tokens_seen": 28363976960, | |
| "step": 27050 | |
| }, | |
| { | |
| "epoch": 0.5952797718826409, | |
| "grad_norm": 0.1393333077430725, | |
| "learning_rate": 0.001, | |
| "loss": 2.6759, | |
| "num_input_tokens_seen": 28416405760, | |
| "step": 27100 | |
| }, | |
| { | |
| "epoch": 0.596378074044786, | |
| "grad_norm": 0.1485709846019745, | |
| "learning_rate": 0.001, | |
| "loss": 2.6772, | |
| "num_input_tokens_seen": 28468834560, | |
| "step": 27150 | |
| }, | |
| { | |
| "epoch": 0.5974763762069311, | |
| "grad_norm": 0.13909003138542175, | |
| "learning_rate": 0.001, | |
| "loss": 2.6729, | |
| "num_input_tokens_seen": 28521263360, | |
| "step": 27200 | |
| }, | |
| { | |
| "epoch": 0.5985746783690762, | |
| "grad_norm": 0.15117496252059937, | |
| "learning_rate": 0.001, | |
| "loss": 2.6704, | |
| "num_input_tokens_seen": 28573692160, | |
| "step": 27250 | |
| }, | |
| { | |
| "epoch": 0.5996729805312213, | |
| "grad_norm": 0.14054876565933228, | |
| "learning_rate": 0.001, | |
| "loss": 2.6748, | |
| "num_input_tokens_seen": 28626120960, | |
| "step": 27300 | |
| }, | |
| { | |
| "epoch": 0.6007712826933664, | |
| "grad_norm": 0.15437620878219604, | |
| "learning_rate": 0.001, | |
| "loss": 2.6778, | |
| "num_input_tokens_seen": 28678549760, | |
| "step": 27350 | |
| }, | |
| { | |
| "epoch": 0.6018695848555115, | |
| "grad_norm": 0.15858007967472076, | |
| "learning_rate": 0.001, | |
| "loss": 2.6763, | |
| "num_input_tokens_seen": 28730978560, | |
| "step": 27400 | |
| }, | |
| { | |
| "epoch": 0.6029678870176566, | |
| "grad_norm": 0.14459487795829773, | |
| "learning_rate": 0.001, | |
| "loss": 2.6726, | |
| "num_input_tokens_seen": 28783407360, | |
| "step": 27450 | |
| }, | |
| { | |
| "epoch": 0.6040661891798017, | |
| "grad_norm": 0.17691345512866974, | |
| "learning_rate": 0.001, | |
| "loss": 2.678, | |
| "num_input_tokens_seen": 28835836160, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 0.6040661891798017, | |
| "eval_loss": 2.576051950454712, | |
| "eval_runtime": 66.9387, | |
| "eval_samples_per_second": 74.695, | |
| "eval_steps_per_second": 18.674, | |
| "num_input_tokens_seen": 28835836160, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 0.6051644913419467, | |
| "grad_norm": 0.16200922429561615, | |
| "learning_rate": 0.001, | |
| "loss": 2.6763, | |
| "num_input_tokens_seen": 28888264960, | |
| "step": 27550 | |
| }, | |
| { | |
| "epoch": 0.6062627935040918, | |
| "grad_norm": 0.14567038416862488, | |
| "learning_rate": 0.001, | |
| "loss": 2.6795, | |
| "num_input_tokens_seen": 28940693760, | |
| "step": 27600 | |
| }, | |
| { | |
| "epoch": 0.607361095666237, | |
| "grad_norm": 0.16075611114501953, | |
| "learning_rate": 0.001, | |
| "loss": 2.6746, | |
| "num_input_tokens_seen": 28993122560, | |
| "step": 27650 | |
| }, | |
| { | |
| "epoch": 0.6084593978283821, | |
| "grad_norm": 0.1386987417936325, | |
| "learning_rate": 0.001, | |
| "loss": 2.6771, | |
| "num_input_tokens_seen": 29045551360, | |
| "step": 27700 | |
| }, | |
| { | |
| "epoch": 0.6095576999905271, | |
| "grad_norm": 0.14672614634037018, | |
| "learning_rate": 0.001, | |
| "loss": 2.6792, | |
| "num_input_tokens_seen": 29097980160, | |
| "step": 27750 | |
| }, | |
| { | |
| "epoch": 0.6106560021526722, | |
| "grad_norm": 0.22614523768424988, | |
| "learning_rate": 0.001, | |
| "loss": 2.6728, | |
| "num_input_tokens_seen": 29150408960, | |
| "step": 27800 | |
| }, | |
| { | |
| "epoch": 0.6117543043148174, | |
| "grad_norm": 0.15554341673851013, | |
| "learning_rate": 0.001, | |
| "loss": 2.676, | |
| "num_input_tokens_seen": 29202837760, | |
| "step": 27850 | |
| }, | |
| { | |
| "epoch": 0.6128526064769624, | |
| "grad_norm": 0.17181837558746338, | |
| "learning_rate": 0.001, | |
| "loss": 2.6811, | |
| "num_input_tokens_seen": 29255266560, | |
| "step": 27900 | |
| }, | |
| { | |
| "epoch": 0.6139509086391075, | |
| "grad_norm": 0.15763437747955322, | |
| "learning_rate": 0.001, | |
| "loss": 2.6797, | |
| "num_input_tokens_seen": 29307695360, | |
| "step": 27950 | |
| }, | |
| { | |
| "epoch": 0.6150492108012526, | |
| "grad_norm": 0.14721135795116425, | |
| "learning_rate": 0.001, | |
| "loss": 2.6762, | |
| "num_input_tokens_seen": 29360124160, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 0.6150492108012526, | |
| "eval_loss": 2.5763511657714844, | |
| "eval_runtime": 66.3236, | |
| "eval_samples_per_second": 75.388, | |
| "eval_steps_per_second": 18.847, | |
| "num_input_tokens_seen": 29360124160, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 0.6161475129633978, | |
| "grad_norm": 0.13857993483543396, | |
| "learning_rate": 0.001, | |
| "loss": 2.677, | |
| "num_input_tokens_seen": 29412552960, | |
| "step": 28050 | |
| }, | |
| { | |
| "epoch": 0.6172458151255428, | |
| "grad_norm": 0.14276473224163055, | |
| "learning_rate": 0.001, | |
| "loss": 2.6669, | |
| "num_input_tokens_seen": 29464981760, | |
| "step": 28100 | |
| }, | |
| { | |
| "epoch": 0.6183441172876879, | |
| "grad_norm": 0.1536131203174591, | |
| "learning_rate": 0.001, | |
| "loss": 2.6757, | |
| "num_input_tokens_seen": 29517410560, | |
| "step": 28150 | |
| }, | |
| { | |
| "epoch": 0.619442419449833, | |
| "grad_norm": 0.15733414888381958, | |
| "learning_rate": 0.001, | |
| "loss": 2.6735, | |
| "num_input_tokens_seen": 29569839360, | |
| "step": 28200 | |
| }, | |
| { | |
| "epoch": 0.620540721611978, | |
| "grad_norm": 0.14553523063659668, | |
| "learning_rate": 0.001, | |
| "loss": 2.6683, | |
| "num_input_tokens_seen": 29622268160, | |
| "step": 28250 | |
| }, | |
| { | |
| "epoch": 0.6216390237741232, | |
| "grad_norm": 0.15685459971427917, | |
| "learning_rate": 0.001, | |
| "loss": 2.6692, | |
| "num_input_tokens_seen": 29674696960, | |
| "step": 28300 | |
| }, | |
| { | |
| "epoch": 0.6227373259362683, | |
| "grad_norm": 0.16553767025470734, | |
| "learning_rate": 0.001, | |
| "loss": 2.6778, | |
| "num_input_tokens_seen": 29727125760, | |
| "step": 28350 | |
| }, | |
| { | |
| "epoch": 0.6238356280984134, | |
| "grad_norm": 0.1619853973388672, | |
| "learning_rate": 0.001, | |
| "loss": 2.6807, | |
| "num_input_tokens_seen": 29779554560, | |
| "step": 28400 | |
| }, | |
| { | |
| "epoch": 0.6249339302605584, | |
| "grad_norm": 0.12794817984104156, | |
| "learning_rate": 0.001, | |
| "loss": 2.6776, | |
| "num_input_tokens_seen": 29831983360, | |
| "step": 28450 | |
| }, | |
| { | |
| "epoch": 0.6260322324227036, | |
| "grad_norm": 0.17001128196716309, | |
| "learning_rate": 0.001, | |
| "loss": 2.6797, | |
| "num_input_tokens_seen": 29884412160, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 0.6260322324227036, | |
| "eval_loss": 2.5728061199188232, | |
| "eval_runtime": 66.7752, | |
| "eval_samples_per_second": 74.878, | |
| "eval_steps_per_second": 18.72, | |
| "num_input_tokens_seen": 29884412160, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 0.6271305345848487, | |
| "grad_norm": 0.12936875224113464, | |
| "learning_rate": 0.001, | |
| "loss": 2.6677, | |
| "num_input_tokens_seen": 29936840960, | |
| "step": 28550 | |
| }, | |
| { | |
| "epoch": 0.6282288367469937, | |
| "grad_norm": 0.14839358627796173, | |
| "learning_rate": 0.001, | |
| "loss": 2.6681, | |
| "num_input_tokens_seen": 29989269760, | |
| "step": 28600 | |
| }, | |
| { | |
| "epoch": 0.6293271389091388, | |
| "grad_norm": 0.1526126265525818, | |
| "learning_rate": 0.001, | |
| "loss": 2.6711, | |
| "num_input_tokens_seen": 30041698560, | |
| "step": 28650 | |
| }, | |
| { | |
| "epoch": 0.630425441071284, | |
| "grad_norm": 11.806962013244629, | |
| "learning_rate": 0.001, | |
| "loss": 2.7543, | |
| "num_input_tokens_seen": 30094127360, | |
| "step": 28700 | |
| }, | |
| { | |
| "epoch": 0.631523743233429, | |
| "grad_norm": 0.13446328043937683, | |
| "learning_rate": 0.001, | |
| "loss": 2.9466, | |
| "num_input_tokens_seen": 30146556160, | |
| "step": 28750 | |
| }, | |
| { | |
| "epoch": 0.6326220453955741, | |
| "grad_norm": 0.1319582760334015, | |
| "learning_rate": 0.001, | |
| "loss": 2.7002, | |
| "num_input_tokens_seen": 30198984960, | |
| "step": 28800 | |
| }, | |
| { | |
| "epoch": 0.6337203475577192, | |
| "grad_norm": 0.13955356180667877, | |
| "learning_rate": 0.001, | |
| "loss": 2.6814, | |
| "num_input_tokens_seen": 30251413760, | |
| "step": 28850 | |
| }, | |
| { | |
| "epoch": 0.6348186497198643, | |
| "grad_norm": 0.1295064240694046, | |
| "learning_rate": 0.001, | |
| "loss": 2.676, | |
| "num_input_tokens_seen": 30303842560, | |
| "step": 28900 | |
| }, | |
| { | |
| "epoch": 0.6359169518820094, | |
| "grad_norm": 0.1440495401620865, | |
| "learning_rate": 0.001, | |
| "loss": 2.6778, | |
| "num_input_tokens_seen": 30356271360, | |
| "step": 28950 | |
| }, | |
| { | |
| "epoch": 0.6370152540441545, | |
| "grad_norm": 0.13806115090847015, | |
| "learning_rate": 0.001, | |
| "loss": 2.6712, | |
| "num_input_tokens_seen": 30408700160, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 0.6370152540441545, | |
| "eval_loss": 2.576237440109253, | |
| "eval_runtime": 66.9761, | |
| "eval_samples_per_second": 74.653, | |
| "eval_steps_per_second": 18.663, | |
| "num_input_tokens_seen": 30408700160, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 0.6381135562062996, | |
| "grad_norm": 0.13853897154331207, | |
| "learning_rate": 0.001, | |
| "loss": 2.6719, | |
| "num_input_tokens_seen": 30461128960, | |
| "step": 29050 | |
| }, | |
| { | |
| "epoch": 0.6392118583684446, | |
| "grad_norm": 0.14228977262973785, | |
| "learning_rate": 0.001, | |
| "loss": 2.6788, | |
| "num_input_tokens_seen": 30513557760, | |
| "step": 29100 | |
| }, | |
| { | |
| "epoch": 0.6403101605305898, | |
| "grad_norm": 0.13464143872261047, | |
| "learning_rate": 0.001, | |
| "loss": 2.6743, | |
| "num_input_tokens_seen": 30565986560, | |
| "step": 29150 | |
| }, | |
| { | |
| "epoch": 0.6414084626927349, | |
| "grad_norm": 0.15960821509361267, | |
| "learning_rate": 0.001, | |
| "loss": 2.6729, | |
| "num_input_tokens_seen": 30618415360, | |
| "step": 29200 | |
| }, | |
| { | |
| "epoch": 0.64250676485488, | |
| "grad_norm": 0.13830585777759552, | |
| "learning_rate": 0.001, | |
| "loss": 2.6723, | |
| "num_input_tokens_seen": 30670844160, | |
| "step": 29250 | |
| }, | |
| { | |
| "epoch": 0.643605067017025, | |
| "grad_norm": 0.14440728724002838, | |
| "learning_rate": 0.001, | |
| "loss": 2.664, | |
| "num_input_tokens_seen": 30723272960, | |
| "step": 29300 | |
| }, | |
| { | |
| "epoch": 0.6447033691791701, | |
| "grad_norm": 0.14259463548660278, | |
| "learning_rate": 0.001, | |
| "loss": 2.6675, | |
| "num_input_tokens_seen": 30775701760, | |
| "step": 29350 | |
| }, | |
| { | |
| "epoch": 0.6458016713413153, | |
| "grad_norm": 0.1462564468383789, | |
| "learning_rate": 0.001, | |
| "loss": 2.6671, | |
| "num_input_tokens_seen": 30828130560, | |
| "step": 29400 | |
| }, | |
| { | |
| "epoch": 0.6468999735034603, | |
| "grad_norm": 0.1443469077348709, | |
| "learning_rate": 0.001, | |
| "loss": 2.6667, | |
| "num_input_tokens_seen": 30880559360, | |
| "step": 29450 | |
| }, | |
| { | |
| "epoch": 0.6479982756656054, | |
| "grad_norm": 0.143255814909935, | |
| "learning_rate": 0.001, | |
| "loss": 2.6652, | |
| "num_input_tokens_seen": 30932988160, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 0.6479982756656054, | |
| "eval_loss": 2.569544792175293, | |
| "eval_runtime": 66.8674, | |
| "eval_samples_per_second": 74.775, | |
| "eval_steps_per_second": 18.694, | |
| "num_input_tokens_seen": 30932988160, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 0.6490965778277505, | |
| "grad_norm": 0.15149758756160736, | |
| "learning_rate": 0.001, | |
| "loss": 2.6681, | |
| "num_input_tokens_seen": 30985416960, | |
| "step": 29550 | |
| }, | |
| { | |
| "epoch": 0.6501948799898957, | |
| "grad_norm": 0.15703468024730682, | |
| "learning_rate": 0.001, | |
| "loss": 2.6681, | |
| "num_input_tokens_seen": 31037845760, | |
| "step": 29600 | |
| }, | |
| { | |
| "epoch": 0.6512931821520407, | |
| "grad_norm": 0.14332515001296997, | |
| "learning_rate": 0.001, | |
| "loss": 2.6622, | |
| "num_input_tokens_seen": 31090274560, | |
| "step": 29650 | |
| }, | |
| { | |
| "epoch": 0.6523914843141858, | |
| "grad_norm": 0.13763870298862457, | |
| "learning_rate": 0.001, | |
| "loss": 2.6724, | |
| "num_input_tokens_seen": 31142703360, | |
| "step": 29700 | |
| }, | |
| { | |
| "epoch": 0.6534897864763309, | |
| "grad_norm": 0.11858976632356644, | |
| "learning_rate": 0.001, | |
| "loss": 2.6743, | |
| "num_input_tokens_seen": 31195132160, | |
| "step": 29750 | |
| }, | |
| { | |
| "epoch": 0.654588088638476, | |
| "grad_norm": 0.15627937018871307, | |
| "learning_rate": 0.001, | |
| "loss": 2.6653, | |
| "num_input_tokens_seen": 31247560960, | |
| "step": 29800 | |
| }, | |
| { | |
| "epoch": 0.6556863908006211, | |
| "grad_norm": 0.15052759647369385, | |
| "learning_rate": 0.001, | |
| "loss": 2.6684, | |
| "num_input_tokens_seen": 31299989760, | |
| "step": 29850 | |
| }, | |
| { | |
| "epoch": 0.6567846929627662, | |
| "grad_norm": 0.1648450791835785, | |
| "learning_rate": 0.001, | |
| "loss": 2.6783, | |
| "num_input_tokens_seen": 31352418560, | |
| "step": 29900 | |
| }, | |
| { | |
| "epoch": 0.6578829951249113, | |
| "grad_norm": 0.13318586349487305, | |
| "learning_rate": 0.001, | |
| "loss": 2.6712, | |
| "num_input_tokens_seen": 31404847360, | |
| "step": 29950 | |
| }, | |
| { | |
| "epoch": 0.6589812972870563, | |
| "grad_norm": 0.1517287641763687, | |
| "learning_rate": 0.001, | |
| "loss": 2.6688, | |
| "num_input_tokens_seen": 31457276160, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 0.6589812972870563, | |
| "eval_loss": 2.5676708221435547, | |
| "eval_runtime": 66.0876, | |
| "eval_samples_per_second": 75.657, | |
| "eval_steps_per_second": 18.914, | |
| "num_input_tokens_seen": 31457276160, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 0.6600795994492015, | |
| "grad_norm": 0.14465224742889404, | |
| "learning_rate": 0.001, | |
| "loss": 2.6657, | |
| "num_input_tokens_seen": 31509704960, | |
| "step": 30050 | |
| }, | |
| { | |
| "epoch": 0.6611779016113466, | |
| "grad_norm": 0.16096332669258118, | |
| "learning_rate": 0.001, | |
| "loss": 2.6612, | |
| "num_input_tokens_seen": 31562133760, | |
| "step": 30100 | |
| }, | |
| { | |
| "epoch": 0.6622762037734916, | |
| "grad_norm": 0.1434296816587448, | |
| "learning_rate": 0.001, | |
| "loss": 2.6695, | |
| "num_input_tokens_seen": 31614562560, | |
| "step": 30150 | |
| }, | |
| { | |
| "epoch": 0.6633745059356367, | |
| "grad_norm": 0.13844367861747742, | |
| "learning_rate": 0.001, | |
| "loss": 2.6649, | |
| "num_input_tokens_seen": 31666991360, | |
| "step": 30200 | |
| }, | |
| { | |
| "epoch": 0.6644728080977819, | |
| "grad_norm": 0.1579446643590927, | |
| "learning_rate": 0.001, | |
| "loss": 2.6701, | |
| "num_input_tokens_seen": 31719420160, | |
| "step": 30250 | |
| }, | |
| { | |
| "epoch": 0.665571110259927, | |
| "grad_norm": 0.1585385501384735, | |
| "learning_rate": 0.001, | |
| "loss": 2.665, | |
| "num_input_tokens_seen": 31771848960, | |
| "step": 30300 | |
| }, | |
| { | |
| "epoch": 0.666669412422072, | |
| "grad_norm": 0.18768636882305145, | |
| "learning_rate": 0.001, | |
| "loss": 2.6708, | |
| "num_input_tokens_seen": 31824277760, | |
| "step": 30350 | |
| }, | |
| { | |
| "epoch": 0.6677677145842171, | |
| "grad_norm": 0.13027966022491455, | |
| "learning_rate": 0.001, | |
| "loss": 2.6657, | |
| "num_input_tokens_seen": 31876706560, | |
| "step": 30400 | |
| }, | |
| { | |
| "epoch": 0.6688660167463623, | |
| "grad_norm": 0.13473722338676453, | |
| "learning_rate": 0.001, | |
| "loss": 2.6658, | |
| "num_input_tokens_seen": 31929135360, | |
| "step": 30450 | |
| }, | |
| { | |
| "epoch": 0.6699643189085073, | |
| "grad_norm": 0.14617317914962769, | |
| "learning_rate": 0.001, | |
| "loss": 2.664, | |
| "num_input_tokens_seen": 31981564160, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 0.6699643189085073, | |
| "eval_loss": 2.5658769607543945, | |
| "eval_runtime": 67.5011, | |
| "eval_samples_per_second": 74.073, | |
| "eval_steps_per_second": 18.518, | |
| "num_input_tokens_seen": 31981564160, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 0.6710626210706524, | |
| "grad_norm": 0.14581717550754547, | |
| "learning_rate": 0.001, | |
| "loss": 2.6654, | |
| "num_input_tokens_seen": 32033992960, | |
| "step": 30550 | |
| }, | |
| { | |
| "epoch": 0.6721609232327975, | |
| "grad_norm": 0.12281567603349686, | |
| "learning_rate": 0.001, | |
| "loss": 2.6649, | |
| "num_input_tokens_seen": 32086421760, | |
| "step": 30600 | |
| }, | |
| { | |
| "epoch": 0.6732592253949425, | |
| "grad_norm": 0.14368072152137756, | |
| "learning_rate": 0.001, | |
| "loss": 2.6605, | |
| "num_input_tokens_seen": 32138850560, | |
| "step": 30650 | |
| }, | |
| { | |
| "epoch": 0.6743575275570877, | |
| "grad_norm": 0.14596907794475555, | |
| "learning_rate": 0.001, | |
| "loss": 2.6651, | |
| "num_input_tokens_seen": 32191279360, | |
| "step": 30700 | |
| }, | |
| { | |
| "epoch": 0.6754558297192328, | |
| "grad_norm": 0.15414392948150635, | |
| "learning_rate": 0.001, | |
| "loss": 2.6696, | |
| "num_input_tokens_seen": 32243708160, | |
| "step": 30750 | |
| }, | |
| { | |
| "epoch": 0.6765541318813779, | |
| "grad_norm": 0.14875884354114532, | |
| "learning_rate": 0.001, | |
| "loss": 2.6662, | |
| "num_input_tokens_seen": 32296136960, | |
| "step": 30800 | |
| }, | |
| { | |
| "epoch": 0.6776524340435229, | |
| "grad_norm": 0.13774773478507996, | |
| "learning_rate": 0.001, | |
| "loss": 2.6649, | |
| "num_input_tokens_seen": 32348565760, | |
| "step": 30850 | |
| }, | |
| { | |
| "epoch": 0.6787507362056681, | |
| "grad_norm": 0.1647578626871109, | |
| "learning_rate": 0.001, | |
| "loss": 2.6693, | |
| "num_input_tokens_seen": 32400994560, | |
| "step": 30900 | |
| }, | |
| { | |
| "epoch": 0.6798490383678132, | |
| "grad_norm": 0.1620490700006485, | |
| "learning_rate": 0.001, | |
| "loss": 2.6726, | |
| "num_input_tokens_seen": 32453423360, | |
| "step": 30950 | |
| }, | |
| { | |
| "epoch": 0.6809473405299582, | |
| "grad_norm": 0.14238062500953674, | |
| "learning_rate": 0.001, | |
| "loss": 2.6681, | |
| "num_input_tokens_seen": 32505852160, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 0.6809473405299582, | |
| "eval_loss": 2.5645763874053955, | |
| "eval_runtime": 65.7725, | |
| "eval_samples_per_second": 76.02, | |
| "eval_steps_per_second": 19.005, | |
| "num_input_tokens_seen": 32505852160, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 0.6820456426921033, | |
| "grad_norm": 0.143716499209404, | |
| "learning_rate": 0.001, | |
| "loss": 2.6591, | |
| "num_input_tokens_seen": 32558280960, | |
| "step": 31050 | |
| }, | |
| { | |
| "epoch": 0.6831439448542485, | |
| "grad_norm": 0.16048283874988556, | |
| "learning_rate": 0.001, | |
| "loss": 2.659, | |
| "num_input_tokens_seen": 32610709760, | |
| "step": 31100 | |
| }, | |
| { | |
| "epoch": 0.6842422470163936, | |
| "grad_norm": 0.15203309059143066, | |
| "learning_rate": 0.001, | |
| "loss": 2.6703, | |
| "num_input_tokens_seen": 32663138560, | |
| "step": 31150 | |
| }, | |
| { | |
| "epoch": 0.6853405491785386, | |
| "grad_norm": 0.14977113902568817, | |
| "learning_rate": 0.001, | |
| "loss": 2.6657, | |
| "num_input_tokens_seen": 32715567360, | |
| "step": 31200 | |
| }, | |
| { | |
| "epoch": 0.6864388513406837, | |
| "grad_norm": 0.15292279422283173, | |
| "learning_rate": 0.001, | |
| "loss": 2.6629, | |
| "num_input_tokens_seen": 32767996160, | |
| "step": 31250 | |
| }, | |
| { | |
| "epoch": 0.6875371535028288, | |
| "grad_norm": 0.13721971213817596, | |
| "learning_rate": 0.001, | |
| "loss": 2.6641, | |
| "num_input_tokens_seen": 32820424960, | |
| "step": 31300 | |
| }, | |
| { | |
| "epoch": 0.6886354556649739, | |
| "grad_norm": 0.15564891695976257, | |
| "learning_rate": 0.001, | |
| "loss": 2.6673, | |
| "num_input_tokens_seen": 32872853760, | |
| "step": 31350 | |
| }, | |
| { | |
| "epoch": 0.689733757827119, | |
| "grad_norm": 0.15267717838287354, | |
| "learning_rate": 0.001, | |
| "loss": 2.6624, | |
| "num_input_tokens_seen": 32925282560, | |
| "step": 31400 | |
| }, | |
| { | |
| "epoch": 0.6908320599892641, | |
| "grad_norm": 0.15039384365081787, | |
| "learning_rate": 0.001, | |
| "loss": 2.6615, | |
| "num_input_tokens_seen": 32977711360, | |
| "step": 31450 | |
| }, | |
| { | |
| "epoch": 0.6919303621514092, | |
| "grad_norm": 0.14114901423454285, | |
| "learning_rate": 0.001, | |
| "loss": 2.6663, | |
| "num_input_tokens_seen": 33030140160, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 0.6919303621514092, | |
| "eval_loss": 2.5618767738342285, | |
| "eval_runtime": 66.9611, | |
| "eval_samples_per_second": 74.67, | |
| "eval_steps_per_second": 18.668, | |
| "num_input_tokens_seen": 33030140160, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 0.6930286643135543, | |
| "grad_norm": 0.1415725201368332, | |
| "learning_rate": 0.001, | |
| "loss": 2.6606, | |
| "num_input_tokens_seen": 33082568960, | |
| "step": 31550 | |
| }, | |
| { | |
| "epoch": 0.6941269664756994, | |
| "grad_norm": 0.14324156939983368, | |
| "learning_rate": 0.001, | |
| "loss": 2.6616, | |
| "num_input_tokens_seen": 33134997760, | |
| "step": 31600 | |
| }, | |
| { | |
| "epoch": 0.6952252686378445, | |
| "grad_norm": 0.1544431746006012, | |
| "learning_rate": 0.001, | |
| "loss": 2.6567, | |
| "num_input_tokens_seen": 33187426560, | |
| "step": 31650 | |
| }, | |
| { | |
| "epoch": 0.6963235707999895, | |
| "grad_norm": 0.14641186594963074, | |
| "learning_rate": 0.001, | |
| "loss": 2.6605, | |
| "num_input_tokens_seen": 33239855360, | |
| "step": 31700 | |
| }, | |
| { | |
| "epoch": 0.6974218729621346, | |
| "grad_norm": 0.13757406175136566, | |
| "learning_rate": 0.001, | |
| "loss": 2.673, | |
| "num_input_tokens_seen": 33292284160, | |
| "step": 31750 | |
| }, | |
| { | |
| "epoch": 0.6985201751242798, | |
| "grad_norm": 0.14516425132751465, | |
| "learning_rate": 0.001, | |
| "loss": 2.6781, | |
| "num_input_tokens_seen": 33344712960, | |
| "step": 31800 | |
| }, | |
| { | |
| "epoch": 0.6996184772864249, | |
| "grad_norm": 0.15246887505054474, | |
| "learning_rate": 0.001, | |
| "loss": 2.6683, | |
| "num_input_tokens_seen": 33397141760, | |
| "step": 31850 | |
| }, | |
| { | |
| "epoch": 0.7007167794485699, | |
| "grad_norm": 0.1413787305355072, | |
| "learning_rate": 0.001, | |
| "loss": 2.6591, | |
| "num_input_tokens_seen": 33449570560, | |
| "step": 31900 | |
| }, | |
| { | |
| "epoch": 0.701815081610715, | |
| "grad_norm": 0.16077399253845215, | |
| "learning_rate": 0.001, | |
| "loss": 2.6628, | |
| "num_input_tokens_seen": 33501999360, | |
| "step": 31950 | |
| }, | |
| { | |
| "epoch": 0.7029133837728602, | |
| "grad_norm": 0.1555839478969574, | |
| "learning_rate": 0.001, | |
| "loss": 2.6631, | |
| "num_input_tokens_seen": 33554428160, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 0.7029133837728602, | |
| "eval_loss": 2.561042547225952, | |
| "eval_runtime": 66.7879, | |
| "eval_samples_per_second": 74.864, | |
| "eval_steps_per_second": 18.716, | |
| "num_input_tokens_seen": 33554428160, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 0.7040116859350052, | |
| "grad_norm": 0.15333816409111023, | |
| "learning_rate": 0.001, | |
| "loss": 2.6605, | |
| "num_input_tokens_seen": 33606856960, | |
| "step": 32050 | |
| }, | |
| { | |
| "epoch": 0.7051099880971503, | |
| "grad_norm": 0.14965052902698517, | |
| "learning_rate": 0.001, | |
| "loss": 2.6551, | |
| "num_input_tokens_seen": 33659285760, | |
| "step": 32100 | |
| }, | |
| { | |
| "epoch": 0.7062082902592954, | |
| "grad_norm": 0.1994074285030365, | |
| "learning_rate": 0.001, | |
| "loss": 2.6652, | |
| "num_input_tokens_seen": 33711714560, | |
| "step": 32150 | |
| }, | |
| { | |
| "epoch": 0.7073065924214406, | |
| "grad_norm": 0.3089894652366638, | |
| "learning_rate": 0.001, | |
| "loss": 2.6814, | |
| "num_input_tokens_seen": 33764143360, | |
| "step": 32200 | |
| }, | |
| { | |
| "epoch": 0.7084048945835856, | |
| "grad_norm": 0.14903652667999268, | |
| "learning_rate": 0.001, | |
| "loss": 2.6834, | |
| "num_input_tokens_seen": 33816572160, | |
| "step": 32250 | |
| }, | |
| { | |
| "epoch": 0.7095031967457307, | |
| "grad_norm": 0.17594854533672333, | |
| "learning_rate": 0.001, | |
| "loss": 2.6618, | |
| "num_input_tokens_seen": 33869000960, | |
| "step": 32300 | |
| }, | |
| { | |
| "epoch": 0.7106014989078758, | |
| "grad_norm": 0.15634667873382568, | |
| "learning_rate": 0.001, | |
| "loss": 2.6663, | |
| "num_input_tokens_seen": 33921429760, | |
| "step": 32350 | |
| }, | |
| { | |
| "epoch": 0.7116998010700208, | |
| "grad_norm": 0.13893702626228333, | |
| "learning_rate": 0.001, | |
| "loss": 2.67, | |
| "num_input_tokens_seen": 33973858560, | |
| "step": 32400 | |
| }, | |
| { | |
| "epoch": 0.712798103232166, | |
| "grad_norm": 0.16974663734436035, | |
| "learning_rate": 0.001, | |
| "loss": 2.6686, | |
| "num_input_tokens_seen": 34026287360, | |
| "step": 32450 | |
| }, | |
| { | |
| "epoch": 0.7138964053943111, | |
| "grad_norm": 0.15336968004703522, | |
| "learning_rate": 0.001, | |
| "loss": 2.6703, | |
| "num_input_tokens_seen": 34078716160, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 0.7138964053943111, | |
| "eval_loss": 2.5648574829101562, | |
| "eval_runtime": 66.0796, | |
| "eval_samples_per_second": 75.666, | |
| "eval_steps_per_second": 18.917, | |
| "num_input_tokens_seen": 34078716160, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 0.7149947075564561, | |
| "grad_norm": 1.428727626800537, | |
| "learning_rate": 0.001, | |
| "loss": 2.8433, | |
| "num_input_tokens_seen": 34131144960, | |
| "step": 32550 | |
| }, | |
| { | |
| "epoch": 0.7160930097186012, | |
| "grad_norm": 0.1666879504919052, | |
| "learning_rate": 0.001, | |
| "loss": 2.7236, | |
| "num_input_tokens_seen": 34183573760, | |
| "step": 32600 | |
| }, | |
| { | |
| "epoch": 0.7171913118807464, | |
| "grad_norm": 0.16038021445274353, | |
| "learning_rate": 0.001, | |
| "loss": 2.6876, | |
| "num_input_tokens_seen": 34236002560, | |
| "step": 32650 | |
| }, | |
| { | |
| "epoch": 0.7182896140428915, | |
| "grad_norm": 0.1514110267162323, | |
| "learning_rate": 0.001, | |
| "loss": 2.6717, | |
| "num_input_tokens_seen": 34288431360, | |
| "step": 32700 | |
| }, | |
| { | |
| "epoch": 0.7193879162050365, | |
| "grad_norm": 0.13304661214351654, | |
| "learning_rate": 0.001, | |
| "loss": 2.6664, | |
| "num_input_tokens_seen": 34340860160, | |
| "step": 32750 | |
| }, | |
| { | |
| "epoch": 0.7204862183671816, | |
| "grad_norm": 0.15957415103912354, | |
| "learning_rate": 0.001, | |
| "loss": 2.6683, | |
| "num_input_tokens_seen": 34393288960, | |
| "step": 32800 | |
| }, | |
| { | |
| "epoch": 0.7215845205293268, | |
| "grad_norm": 0.14532499015331268, | |
| "learning_rate": 0.001, | |
| "loss": 2.6632, | |
| "num_input_tokens_seen": 34445717760, | |
| "step": 32850 | |
| }, | |
| { | |
| "epoch": 0.7226828226914718, | |
| "grad_norm": 0.1402454972267151, | |
| "learning_rate": 0.001, | |
| "loss": 2.6631, | |
| "num_input_tokens_seen": 34498146560, | |
| "step": 32900 | |
| }, | |
| { | |
| "epoch": 0.7237811248536169, | |
| "grad_norm": 0.17248420417308807, | |
| "learning_rate": 0.001, | |
| "loss": 2.6743, | |
| "num_input_tokens_seen": 34550575360, | |
| "step": 32950 | |
| }, | |
| { | |
| "epoch": 0.724879427015762, | |
| "grad_norm": 0.1455400288105011, | |
| "learning_rate": 0.001, | |
| "loss": 2.6598, | |
| "num_input_tokens_seen": 34603004160, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 0.724879427015762, | |
| "eval_loss": 2.5639312267303467, | |
| "eval_runtime": 66.9575, | |
| "eval_samples_per_second": 74.674, | |
| "eval_steps_per_second": 18.669, | |
| "num_input_tokens_seen": 34603004160, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 0.7259777291779071, | |
| "grad_norm": 0.14448963105678558, | |
| "learning_rate": 0.001, | |
| "loss": 2.6579, | |
| "num_input_tokens_seen": 34655432960, | |
| "step": 33050 | |
| }, | |
| { | |
| "epoch": 0.7270760313400522, | |
| "grad_norm": 0.15785731375217438, | |
| "learning_rate": 0.001, | |
| "loss": 2.6641, | |
| "num_input_tokens_seen": 34707861760, | |
| "step": 33100 | |
| }, | |
| { | |
| "epoch": 0.7281743335021973, | |
| "grad_norm": 0.14524365961551666, | |
| "learning_rate": 0.001, | |
| "loss": 2.6639, | |
| "num_input_tokens_seen": 34760290560, | |
| "step": 33150 | |
| }, | |
| { | |
| "epoch": 0.7292726356643424, | |
| "grad_norm": 0.17661139369010925, | |
| "learning_rate": 0.001, | |
| "loss": 2.666, | |
| "num_input_tokens_seen": 34812719360, | |
| "step": 33200 | |
| }, | |
| { | |
| "epoch": 0.7303709378264874, | |
| "grad_norm": 0.14052839577198029, | |
| "learning_rate": 0.001, | |
| "loss": 2.6638, | |
| "num_input_tokens_seen": 34865148160, | |
| "step": 33250 | |
| }, | |
| { | |
| "epoch": 0.7314692399886326, | |
| "grad_norm": 0.14182330667972565, | |
| "learning_rate": 0.001, | |
| "loss": 2.6618, | |
| "num_input_tokens_seen": 34917576960, | |
| "step": 33300 | |
| }, | |
| { | |
| "epoch": 0.7325675421507777, | |
| "grad_norm": 0.168069988489151, | |
| "learning_rate": 0.001, | |
| "loss": 2.6655, | |
| "num_input_tokens_seen": 34970005760, | |
| "step": 33350 | |
| }, | |
| { | |
| "epoch": 0.7336658443129228, | |
| "grad_norm": 0.1627034991979599, | |
| "learning_rate": 0.001, | |
| "loss": 2.6646, | |
| "num_input_tokens_seen": 35022434560, | |
| "step": 33400 | |
| }, | |
| { | |
| "epoch": 0.7347641464750678, | |
| "grad_norm": 0.1257403939962387, | |
| "learning_rate": 0.001, | |
| "loss": 2.6682, | |
| "num_input_tokens_seen": 35074863360, | |
| "step": 33450 | |
| }, | |
| { | |
| "epoch": 0.735862448637213, | |
| "grad_norm": 0.15367744863033295, | |
| "learning_rate": 0.001, | |
| "loss": 2.6693, | |
| "num_input_tokens_seen": 35127292160, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 0.735862448637213, | |
| "eval_loss": 2.5610554218292236, | |
| "eval_runtime": 67.0185, | |
| "eval_samples_per_second": 74.606, | |
| "eval_steps_per_second": 18.652, | |
| "num_input_tokens_seen": 35127292160, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 0.7369607507993581, | |
| "grad_norm": 0.16001376509666443, | |
| "learning_rate": 0.001, | |
| "loss": 2.6594, | |
| "num_input_tokens_seen": 35179720960, | |
| "step": 33550 | |
| }, | |
| { | |
| "epoch": 0.7380590529615031, | |
| "grad_norm": 0.14694422483444214, | |
| "learning_rate": 0.001, | |
| "loss": 2.6635, | |
| "num_input_tokens_seen": 35232149760, | |
| "step": 33600 | |
| }, | |
| { | |
| "epoch": 0.7391573551236482, | |
| "grad_norm": 0.15586304664611816, | |
| "learning_rate": 0.001, | |
| "loss": 2.6565, | |
| "num_input_tokens_seen": 35284578560, | |
| "step": 33650 | |
| }, | |
| { | |
| "epoch": 0.7402556572857933, | |
| "grad_norm": 0.16455145180225372, | |
| "learning_rate": 0.001, | |
| "loss": 2.6621, | |
| "num_input_tokens_seen": 35337007360, | |
| "step": 33700 | |
| }, | |
| { | |
| "epoch": 0.7413539594479385, | |
| "grad_norm": 0.13630282878875732, | |
| "learning_rate": 0.001, | |
| "loss": 2.6658, | |
| "num_input_tokens_seen": 35389436160, | |
| "step": 33750 | |
| }, | |
| { | |
| "epoch": 0.7424522616100835, | |
| "grad_norm": 0.15180189907550812, | |
| "learning_rate": 0.001, | |
| "loss": 2.6593, | |
| "num_input_tokens_seen": 35441864960, | |
| "step": 33800 | |
| }, | |
| { | |
| "epoch": 0.7435505637722286, | |
| "grad_norm": 0.16608890891075134, | |
| "learning_rate": 0.001, | |
| "loss": 2.6777, | |
| "num_input_tokens_seen": 35494293760, | |
| "step": 33850 | |
| }, | |
| { | |
| "epoch": 0.7446488659343737, | |
| "grad_norm": 0.31720519065856934, | |
| "learning_rate": 0.001, | |
| "loss": 2.6685, | |
| "num_input_tokens_seen": 35546722560, | |
| "step": 33900 | |
| }, | |
| { | |
| "epoch": 0.7457471680965188, | |
| "grad_norm": 0.24131393432617188, | |
| "learning_rate": 0.001, | |
| "loss": 2.6682, | |
| "num_input_tokens_seen": 35599151360, | |
| "step": 33950 | |
| }, | |
| { | |
| "epoch": 0.7468454702586639, | |
| "grad_norm": 0.1594172567129135, | |
| "learning_rate": 0.001, | |
| "loss": 2.6575, | |
| "num_input_tokens_seen": 35651580160, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 0.7468454702586639, | |
| "eval_loss": 2.5587804317474365, | |
| "eval_runtime": 66.6197, | |
| "eval_samples_per_second": 75.053, | |
| "eval_steps_per_second": 18.763, | |
| "num_input_tokens_seen": 35651580160, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 0.747943772420809, | |
| "grad_norm": 0.1586858183145523, | |
| "learning_rate": 0.001, | |
| "loss": 2.6654, | |
| "num_input_tokens_seen": 35704008960, | |
| "step": 34050 | |
| }, | |
| { | |
| "epoch": 0.749042074582954, | |
| "grad_norm": 0.1376073956489563, | |
| "learning_rate": 0.001, | |
| "loss": 2.6627, | |
| "num_input_tokens_seen": 35756437760, | |
| "step": 34100 | |
| }, | |
| { | |
| "epoch": 0.7501403767450991, | |
| "grad_norm": 0.13904818892478943, | |
| "learning_rate": 0.001, | |
| "loss": 2.6605, | |
| "num_input_tokens_seen": 35808866560, | |
| "step": 34150 | |
| }, | |
| { | |
| "epoch": 0.7512386789072443, | |
| "grad_norm": 0.14543947577476501, | |
| "learning_rate": 0.001, | |
| "loss": 2.6589, | |
| "num_input_tokens_seen": 35861295360, | |
| "step": 34200 | |
| }, | |
| { | |
| "epoch": 0.7523369810693894, | |
| "grad_norm": 0.14855198562145233, | |
| "learning_rate": 0.001, | |
| "loss": 2.6612, | |
| "num_input_tokens_seen": 35913724160, | |
| "step": 34250 | |
| }, | |
| { | |
| "epoch": 0.7534352832315344, | |
| "grad_norm": 0.14492908120155334, | |
| "learning_rate": 0.001, | |
| "loss": 2.6561, | |
| "num_input_tokens_seen": 35966152960, | |
| "step": 34300 | |
| }, | |
| { | |
| "epoch": 0.7545335853936795, | |
| "grad_norm": 0.1388978660106659, | |
| "learning_rate": 0.001, | |
| "loss": 2.6551, | |
| "num_input_tokens_seen": 36018581760, | |
| "step": 34350 | |
| }, | |
| { | |
| "epoch": 0.7556318875558247, | |
| "grad_norm": 0.14582422375679016, | |
| "learning_rate": 0.001, | |
| "loss": 2.6521, | |
| "num_input_tokens_seen": 36071010560, | |
| "step": 34400 | |
| }, | |
| { | |
| "epoch": 0.7567301897179697, | |
| "grad_norm": 0.17488695681095123, | |
| "learning_rate": 0.001, | |
| "loss": 2.6516, | |
| "num_input_tokens_seen": 36123439360, | |
| "step": 34450 | |
| }, | |
| { | |
| "epoch": 0.7578284918801148, | |
| "grad_norm": 0.12302416563034058, | |
| "learning_rate": 0.001, | |
| "loss": 2.6617, | |
| "num_input_tokens_seen": 36175868160, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 0.7578284918801148, | |
| "eval_loss": 2.5549991130828857, | |
| "eval_runtime": 67.5095, | |
| "eval_samples_per_second": 74.064, | |
| "eval_steps_per_second": 18.516, | |
| "num_input_tokens_seen": 36175868160, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 0.7589267940422599, | |
| "grad_norm": 0.14238396286964417, | |
| "learning_rate": 0.001, | |
| "loss": 2.6609, | |
| "num_input_tokens_seen": 36228296960, | |
| "step": 34550 | |
| }, | |
| { | |
| "epoch": 0.7600250962044051, | |
| "grad_norm": 0.17919403314590454, | |
| "learning_rate": 0.001, | |
| "loss": 2.6621, | |
| "num_input_tokens_seen": 36280725760, | |
| "step": 34600 | |
| }, | |
| { | |
| "epoch": 0.7611233983665501, | |
| "grad_norm": 0.13188666105270386, | |
| "learning_rate": 0.001, | |
| "loss": 2.6529, | |
| "num_input_tokens_seen": 36333154560, | |
| "step": 34650 | |
| }, | |
| { | |
| "epoch": 0.7622217005286952, | |
| "grad_norm": 0.16191646456718445, | |
| "learning_rate": 0.001, | |
| "loss": 2.6584, | |
| "num_input_tokens_seen": 36385583360, | |
| "step": 34700 | |
| }, | |
| { | |
| "epoch": 0.7633200026908403, | |
| "grad_norm": 0.14606165885925293, | |
| "learning_rate": 0.001, | |
| "loss": 2.6567, | |
| "num_input_tokens_seen": 36438012160, | |
| "step": 34750 | |
| }, | |
| { | |
| "epoch": 0.7644183048529853, | |
| "grad_norm": 0.1648443192243576, | |
| "learning_rate": 0.001, | |
| "loss": 2.6587, | |
| "num_input_tokens_seen": 36490440960, | |
| "step": 34800 | |
| }, | |
| { | |
| "epoch": 0.7655166070151305, | |
| "grad_norm": 0.19523674249649048, | |
| "learning_rate": 0.001, | |
| "loss": 2.6662, | |
| "num_input_tokens_seen": 36542869760, | |
| "step": 34850 | |
| }, | |
| { | |
| "epoch": 0.7666149091772756, | |
| "grad_norm": 0.1713179498910904, | |
| "learning_rate": 0.001, | |
| "loss": 2.6683, | |
| "num_input_tokens_seen": 36595298560, | |
| "step": 34900 | |
| }, | |
| { | |
| "epoch": 0.7677132113394207, | |
| "grad_norm": 0.14923711121082306, | |
| "learning_rate": 0.001, | |
| "loss": 2.6629, | |
| "num_input_tokens_seen": 36647727360, | |
| "step": 34950 | |
| }, | |
| { | |
| "epoch": 0.7688115135015657, | |
| "grad_norm": 0.13948023319244385, | |
| "learning_rate": 0.001, | |
| "loss": 2.6619, | |
| "num_input_tokens_seen": 36700156160, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 0.7688115135015657, | |
| "eval_loss": 2.5569379329681396, | |
| "eval_runtime": 67.9393, | |
| "eval_samples_per_second": 73.595, | |
| "eval_steps_per_second": 18.399, | |
| "num_input_tokens_seen": 36700156160, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 0.7699098156637109, | |
| "grad_norm": 0.14624406397342682, | |
| "learning_rate": 0.001, | |
| "loss": 2.657, | |
| "num_input_tokens_seen": 36752584960, | |
| "step": 35050 | |
| }, | |
| { | |
| "epoch": 0.771008117825856, | |
| "grad_norm": 0.16855786740779877, | |
| "learning_rate": 0.001, | |
| "loss": 2.6585, | |
| "num_input_tokens_seen": 36805013760, | |
| "step": 35100 | |
| }, | |
| { | |
| "epoch": 0.772106419988001, | |
| "grad_norm": 0.1439932882785797, | |
| "learning_rate": 0.001, | |
| "loss": 2.6653, | |
| "num_input_tokens_seen": 36857442560, | |
| "step": 35150 | |
| }, | |
| { | |
| "epoch": 0.7732047221501461, | |
| "grad_norm": 0.16299331188201904, | |
| "learning_rate": 0.001, | |
| "loss": 2.6621, | |
| "num_input_tokens_seen": 36909871360, | |
| "step": 35200 | |
| }, | |
| { | |
| "epoch": 0.7743030243122913, | |
| "grad_norm": 0.16961826384067535, | |
| "learning_rate": 0.001, | |
| "loss": 2.6545, | |
| "num_input_tokens_seen": 36962300160, | |
| "step": 35250 | |
| }, | |
| { | |
| "epoch": 0.7754013264744364, | |
| "grad_norm": 0.13337954878807068, | |
| "learning_rate": 0.001, | |
| "loss": 2.652, | |
| "num_input_tokens_seen": 37014728960, | |
| "step": 35300 | |
| }, | |
| { | |
| "epoch": 0.7764996286365814, | |
| "grad_norm": 0.1728074699640274, | |
| "learning_rate": 0.001, | |
| "loss": 2.6631, | |
| "num_input_tokens_seen": 37067157760, | |
| "step": 35350 | |
| }, | |
| { | |
| "epoch": 0.7775979307987265, | |
| "grad_norm": 0.16615192592144012, | |
| "learning_rate": 0.001, | |
| "loss": 2.6551, | |
| "num_input_tokens_seen": 37119586560, | |
| "step": 35400 | |
| }, | |
| { | |
| "epoch": 0.7786962329608716, | |
| "grad_norm": 0.1515650749206543, | |
| "learning_rate": 0.001, | |
| "loss": 2.6529, | |
| "num_input_tokens_seen": 37172015360, | |
| "step": 35450 | |
| }, | |
| { | |
| "epoch": 0.7797945351230167, | |
| "grad_norm": 0.1534053236246109, | |
| "learning_rate": 0.001, | |
| "loss": 2.6567, | |
| "num_input_tokens_seen": 37224444160, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 0.7797945351230167, | |
| "eval_loss": 2.55454683303833, | |
| "eval_runtime": 67.0727, | |
| "eval_samples_per_second": 74.546, | |
| "eval_steps_per_second": 18.637, | |
| "num_input_tokens_seen": 37224444160, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 0.7808928372851618, | |
| "grad_norm": 0.16377541422843933, | |
| "learning_rate": 0.001, | |
| "loss": 2.6552, | |
| "num_input_tokens_seen": 37276872960, | |
| "step": 35550 | |
| }, | |
| { | |
| "epoch": 0.7819911394473069, | |
| "grad_norm": 0.14807477593421936, | |
| "learning_rate": 0.001, | |
| "loss": 2.6563, | |
| "num_input_tokens_seen": 37329301760, | |
| "step": 35600 | |
| }, | |
| { | |
| "epoch": 0.783089441609452, | |
| "grad_norm": 0.13599660992622375, | |
| "learning_rate": 0.001, | |
| "loss": 2.6575, | |
| "num_input_tokens_seen": 37381730560, | |
| "step": 35650 | |
| }, | |
| { | |
| "epoch": 0.7841877437715971, | |
| "grad_norm": 0.16653482615947723, | |
| "learning_rate": 0.001, | |
| "loss": 2.6515, | |
| "num_input_tokens_seen": 37434159360, | |
| "step": 35700 | |
| }, | |
| { | |
| "epoch": 0.7852860459337422, | |
| "grad_norm": 0.15467293560504913, | |
| "learning_rate": 0.001, | |
| "loss": 2.6548, | |
| "num_input_tokens_seen": 37486588160, | |
| "step": 35750 | |
| }, | |
| { | |
| "epoch": 0.7863843480958873, | |
| "grad_norm": 0.4751467704772949, | |
| "learning_rate": 0.001, | |
| "loss": 2.6592, | |
| "num_input_tokens_seen": 37539016960, | |
| "step": 35800 | |
| }, | |
| { | |
| "epoch": 0.7874826502580323, | |
| "grad_norm": 0.15940867364406586, | |
| "learning_rate": 0.001, | |
| "loss": 2.6624, | |
| "num_input_tokens_seen": 37591445760, | |
| "step": 35850 | |
| }, | |
| { | |
| "epoch": 0.7885809524201775, | |
| "grad_norm": 0.137634739279747, | |
| "learning_rate": 0.001, | |
| "loss": 2.6559, | |
| "num_input_tokens_seen": 37643874560, | |
| "step": 35900 | |
| }, | |
| { | |
| "epoch": 0.7896792545823226, | |
| "grad_norm": 0.16022460162639618, | |
| "learning_rate": 0.001, | |
| "loss": 2.6555, | |
| "num_input_tokens_seen": 37696303360, | |
| "step": 35950 | |
| }, | |
| { | |
| "epoch": 0.7907775567444676, | |
| "grad_norm": 0.147109717130661, | |
| "learning_rate": 0.001, | |
| "loss": 2.663, | |
| "num_input_tokens_seen": 37748732160, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 0.7907775567444676, | |
| "eval_loss": 2.556107521057129, | |
| "eval_runtime": 67.1814, | |
| "eval_samples_per_second": 74.425, | |
| "eval_steps_per_second": 18.606, | |
| "num_input_tokens_seen": 37748732160, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 0.7918758589066127, | |
| "grad_norm": 0.16054154932498932, | |
| "learning_rate": 0.001, | |
| "loss": 2.6516, | |
| "num_input_tokens_seen": 37801160960, | |
| "step": 36050 | |
| }, | |
| { | |
| "epoch": 0.7929741610687578, | |
| "grad_norm": 0.15180550515651703, | |
| "learning_rate": 0.001, | |
| "loss": 2.6508, | |
| "num_input_tokens_seen": 37853589760, | |
| "step": 36100 | |
| }, | |
| { | |
| "epoch": 0.794072463230903, | |
| "grad_norm": 0.19564937055110931, | |
| "learning_rate": 0.001, | |
| "loss": 2.6532, | |
| "num_input_tokens_seen": 37906018560, | |
| "step": 36150 | |
| }, | |
| { | |
| "epoch": 0.795170765393048, | |
| "grad_norm": 0.15047501027584076, | |
| "learning_rate": 0.001, | |
| "loss": 2.6567, | |
| "num_input_tokens_seen": 37958447360, | |
| "step": 36200 | |
| }, | |
| { | |
| "epoch": 0.7962690675551931, | |
| "grad_norm": 0.1420314759016037, | |
| "learning_rate": 0.001, | |
| "loss": 2.6511, | |
| "num_input_tokens_seen": 38010876160, | |
| "step": 36250 | |
| }, | |
| { | |
| "epoch": 0.7973673697173382, | |
| "grad_norm": 0.14328153431415558, | |
| "learning_rate": 0.001, | |
| "loss": 2.6601, | |
| "num_input_tokens_seen": 38063304960, | |
| "step": 36300 | |
| }, | |
| { | |
| "epoch": 0.7984656718794833, | |
| "grad_norm": 0.15527622401714325, | |
| "learning_rate": 0.001, | |
| "loss": 2.6598, | |
| "num_input_tokens_seen": 38115733760, | |
| "step": 36350 | |
| }, | |
| { | |
| "epoch": 0.7995639740416284, | |
| "grad_norm": 0.15956974029541016, | |
| "learning_rate": 0.001, | |
| "loss": 2.6522, | |
| "num_input_tokens_seen": 38168162560, | |
| "step": 36400 | |
| }, | |
| { | |
| "epoch": 0.8006622762037735, | |
| "grad_norm": 0.15193034708499908, | |
| "learning_rate": 0.001, | |
| "loss": 2.6561, | |
| "num_input_tokens_seen": 38220591360, | |
| "step": 36450 | |
| }, | |
| { | |
| "epoch": 0.8017605783659186, | |
| "grad_norm": 0.1692439615726471, | |
| "learning_rate": 0.001, | |
| "loss": 2.653, | |
| "num_input_tokens_seen": 38273020160, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 0.8017605783659186, | |
| "eval_loss": 2.553743362426758, | |
| "eval_runtime": 66.3488, | |
| "eval_samples_per_second": 75.359, | |
| "eval_steps_per_second": 18.84, | |
| "num_input_tokens_seen": 38273020160, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 0.8028588805280636, | |
| "grad_norm": 0.473707377910614, | |
| "learning_rate": 0.001, | |
| "loss": 2.6604, | |
| "num_input_tokens_seen": 38325448960, | |
| "step": 36550 | |
| }, | |
| { | |
| "epoch": 0.8039571826902088, | |
| "grad_norm": 0.16226574778556824, | |
| "learning_rate": 0.001, | |
| "loss": 2.6615, | |
| "num_input_tokens_seen": 38377877760, | |
| "step": 36600 | |
| }, | |
| { | |
| "epoch": 0.8050554848523539, | |
| "grad_norm": 0.17274035513401031, | |
| "learning_rate": 0.001, | |
| "loss": 2.6616, | |
| "num_input_tokens_seen": 38430306560, | |
| "step": 36650 | |
| }, | |
| { | |
| "epoch": 0.8061537870144989, | |
| "grad_norm": 0.14171990752220154, | |
| "learning_rate": 0.001, | |
| "loss": 2.6628, | |
| "num_input_tokens_seen": 38482735360, | |
| "step": 36700 | |
| }, | |
| { | |
| "epoch": 0.807252089176644, | |
| "grad_norm": 0.3828020989894867, | |
| "learning_rate": 0.001, | |
| "loss": 2.6717, | |
| "num_input_tokens_seen": 38535164160, | |
| "step": 36750 | |
| }, | |
| { | |
| "epoch": 0.8083503913387892, | |
| "grad_norm": 0.20836575329303741, | |
| "learning_rate": 0.001, | |
| "loss": 2.685, | |
| "num_input_tokens_seen": 38587592960, | |
| "step": 36800 | |
| }, | |
| { | |
| "epoch": 0.8094486935009343, | |
| "grad_norm": 0.14613227546215057, | |
| "learning_rate": 0.001, | |
| "loss": 2.6687, | |
| "num_input_tokens_seen": 38640021760, | |
| "step": 36850 | |
| }, | |
| { | |
| "epoch": 0.8105469956630793, | |
| "grad_norm": 0.16505028307437897, | |
| "learning_rate": 0.001, | |
| "loss": 2.6654, | |
| "num_input_tokens_seen": 38692450560, | |
| "step": 36900 | |
| }, | |
| { | |
| "epoch": 0.8116452978252244, | |
| "grad_norm": 0.15305323898792267, | |
| "learning_rate": 0.001, | |
| "loss": 2.6612, | |
| "num_input_tokens_seen": 38744879360, | |
| "step": 36950 | |
| }, | |
| { | |
| "epoch": 0.8127435999873696, | |
| "grad_norm": 0.2416296899318695, | |
| "learning_rate": 0.001, | |
| "loss": 2.6614, | |
| "num_input_tokens_seen": 38797308160, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 0.8127435999873696, | |
| "eval_loss": 2.5642571449279785, | |
| "eval_runtime": 66.5631, | |
| "eval_samples_per_second": 75.117, | |
| "eval_steps_per_second": 18.779, | |
| "num_input_tokens_seen": 38797308160, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 0.8138419021495146, | |
| "grad_norm": 0.1504666954278946, | |
| "learning_rate": 0.001, | |
| "loss": 2.6625, | |
| "num_input_tokens_seen": 38849736960, | |
| "step": 37050 | |
| }, | |
| { | |
| "epoch": 0.8149402043116597, | |
| "grad_norm": 0.15831789374351501, | |
| "learning_rate": 0.001, | |
| "loss": 2.6566, | |
| "num_input_tokens_seen": 38902165760, | |
| "step": 37100 | |
| }, | |
| { | |
| "epoch": 0.8160385064738048, | |
| "grad_norm": 0.1391575187444687, | |
| "learning_rate": 0.001, | |
| "loss": 2.6609, | |
| "num_input_tokens_seen": 38954594560, | |
| "step": 37150 | |
| }, | |
| { | |
| "epoch": 0.81713680863595, | |
| "grad_norm": 0.22168035805225372, | |
| "learning_rate": 0.001, | |
| "loss": 2.6768, | |
| "num_input_tokens_seen": 39007023360, | |
| "step": 37200 | |
| }, | |
| { | |
| "epoch": 0.818235110798095, | |
| "grad_norm": 0.1874976009130478, | |
| "learning_rate": 0.001, | |
| "loss": 2.679, | |
| "num_input_tokens_seen": 39059452160, | |
| "step": 37250 | |
| }, | |
| { | |
| "epoch": 0.8193334129602401, | |
| "grad_norm": 0.1796240657567978, | |
| "learning_rate": 0.001, | |
| "loss": 2.6644, | |
| "num_input_tokens_seen": 39111880960, | |
| "step": 37300 | |
| }, | |
| { | |
| "epoch": 0.8204317151223852, | |
| "grad_norm": 0.3271934986114502, | |
| "learning_rate": 0.001, | |
| "loss": 2.6695, | |
| "num_input_tokens_seen": 39164309760, | |
| "step": 37350 | |
| }, | |
| { | |
| "epoch": 0.8215300172845302, | |
| "grad_norm": 0.13447704911231995, | |
| "learning_rate": 0.001, | |
| "loss": 2.6656, | |
| "num_input_tokens_seen": 39216738560, | |
| "step": 37400 | |
| }, | |
| { | |
| "epoch": 0.8226283194466754, | |
| "grad_norm": 0.1367628127336502, | |
| "learning_rate": 0.001, | |
| "loss": 2.6505, | |
| "num_input_tokens_seen": 39269167360, | |
| "step": 37450 | |
| }, | |
| { | |
| "epoch": 0.8237266216088205, | |
| "grad_norm": 0.1498686671257019, | |
| "learning_rate": 0.001, | |
| "loss": 2.6594, | |
| "num_input_tokens_seen": 39321596160, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 0.8237266216088205, | |
| "eval_loss": 2.5516529083251953, | |
| "eval_runtime": 66.8213, | |
| "eval_samples_per_second": 74.826, | |
| "eval_steps_per_second": 18.707, | |
| "num_input_tokens_seen": 39321596160, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 0.8248249237709656, | |
| "grad_norm": 0.14790424704551697, | |
| "learning_rate": 0.001, | |
| "loss": 2.6519, | |
| "num_input_tokens_seen": 39374024960, | |
| "step": 37550 | |
| }, | |
| { | |
| "epoch": 0.8259232259331106, | |
| "grad_norm": 0.15297918021678925, | |
| "learning_rate": 0.001, | |
| "loss": 2.6533, | |
| "num_input_tokens_seen": 39426453760, | |
| "step": 37600 | |
| }, | |
| { | |
| "epoch": 0.8270215280952558, | |
| "grad_norm": 0.15760953724384308, | |
| "learning_rate": 0.001, | |
| "loss": 2.6584, | |
| "num_input_tokens_seen": 39478882560, | |
| "step": 37650 | |
| }, | |
| { | |
| "epoch": 0.8281198302574009, | |
| "grad_norm": 0.1545770913362503, | |
| "learning_rate": 0.001, | |
| "loss": 2.6453, | |
| "num_input_tokens_seen": 39531311360, | |
| "step": 37700 | |
| }, | |
| { | |
| "epoch": 0.8292181324195459, | |
| "grad_norm": 0.17809870839118958, | |
| "learning_rate": 0.001, | |
| "loss": 2.6547, | |
| "num_input_tokens_seen": 39583740160, | |
| "step": 37750 | |
| }, | |
| { | |
| "epoch": 0.830316434581691, | |
| "grad_norm": 0.2712576687335968, | |
| "learning_rate": 0.001, | |
| "loss": 2.6489, | |
| "num_input_tokens_seen": 39636168960, | |
| "step": 37800 | |
| }, | |
| { | |
| "epoch": 0.8314147367438361, | |
| "grad_norm": 0.1525331437587738, | |
| "learning_rate": 0.001, | |
| "loss": 2.6558, | |
| "num_input_tokens_seen": 39688597760, | |
| "step": 37850 | |
| }, | |
| { | |
| "epoch": 0.8325130389059812, | |
| "grad_norm": 0.1624525785446167, | |
| "learning_rate": 0.001, | |
| "loss": 2.6465, | |
| "num_input_tokens_seen": 39741026560, | |
| "step": 37900 | |
| }, | |
| { | |
| "epoch": 0.8336113410681263, | |
| "grad_norm": 0.14974552392959595, | |
| "learning_rate": 0.001, | |
| "loss": 2.6595, | |
| "num_input_tokens_seen": 39793455360, | |
| "step": 37950 | |
| }, | |
| { | |
| "epoch": 0.8347096432302714, | |
| "grad_norm": 0.15206202864646912, | |
| "learning_rate": 0.001, | |
| "loss": 2.6525, | |
| "num_input_tokens_seen": 39845884160, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 0.8347096432302714, | |
| "eval_loss": 2.549203395843506, | |
| "eval_runtime": 66.3732, | |
| "eval_samples_per_second": 75.332, | |
| "eval_steps_per_second": 18.833, | |
| "num_input_tokens_seen": 39845884160, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 0.8358079453924165, | |
| "grad_norm": 0.15346269309520721, | |
| "learning_rate": 0.001, | |
| "loss": 2.645, | |
| "num_input_tokens_seen": 39898312960, | |
| "step": 38050 | |
| }, | |
| { | |
| "epoch": 0.8369062475545616, | |
| "grad_norm": 0.1504630148410797, | |
| "learning_rate": 0.001, | |
| "loss": 2.666, | |
| "num_input_tokens_seen": 39950741760, | |
| "step": 38100 | |
| }, | |
| { | |
| "epoch": 0.8380045497167067, | |
| "grad_norm": 0.19098903238773346, | |
| "learning_rate": 0.001, | |
| "loss": 2.6649, | |
| "num_input_tokens_seen": 40003170560, | |
| "step": 38150 | |
| }, | |
| { | |
| "epoch": 0.8391028518788518, | |
| "grad_norm": 0.15553973615169525, | |
| "learning_rate": 0.001, | |
| "loss": 2.6565, | |
| "num_input_tokens_seen": 40055599360, | |
| "step": 38200 | |
| }, | |
| { | |
| "epoch": 0.8402011540409968, | |
| "grad_norm": 0.15650159120559692, | |
| "learning_rate": 0.001, | |
| "loss": 2.6568, | |
| "num_input_tokens_seen": 40108028160, | |
| "step": 38250 | |
| }, | |
| { | |
| "epoch": 0.841299456203142, | |
| "grad_norm": 0.17787836492061615, | |
| "learning_rate": 0.001, | |
| "loss": 2.6497, | |
| "num_input_tokens_seen": 40160456960, | |
| "step": 38300 | |
| }, | |
| { | |
| "epoch": 0.8423977583652871, | |
| "grad_norm": 0.1535162478685379, | |
| "learning_rate": 0.001, | |
| "loss": 2.6492, | |
| "num_input_tokens_seen": 40212885760, | |
| "step": 38350 | |
| }, | |
| { | |
| "epoch": 0.8434960605274322, | |
| "grad_norm": 0.16713359951972961, | |
| "learning_rate": 0.001, | |
| "loss": 2.6534, | |
| "num_input_tokens_seen": 40265314560, | |
| "step": 38400 | |
| }, | |
| { | |
| "epoch": 0.8445943626895772, | |
| "grad_norm": 0.17087998986244202, | |
| "learning_rate": 0.001, | |
| "loss": 2.6602, | |
| "num_input_tokens_seen": 40317743360, | |
| "step": 38450 | |
| }, | |
| { | |
| "epoch": 0.8456926648517223, | |
| "grad_norm": 0.15651412308216095, | |
| "learning_rate": 0.001, | |
| "loss": 2.6547, | |
| "num_input_tokens_seen": 40370172160, | |
| "step": 38500 | |
| }, | |
| { | |
| "epoch": 0.8456926648517223, | |
| "eval_loss": 2.5524706840515137, | |
| "eval_runtime": 66.5023, | |
| "eval_samples_per_second": 75.185, | |
| "eval_steps_per_second": 18.796, | |
| "num_input_tokens_seen": 40370172160, | |
| "step": 38500 | |
| }, | |
| { | |
| "epoch": 0.8467909670138675, | |
| "grad_norm": 0.15205898880958557, | |
| "learning_rate": 0.001, | |
| "loss": 2.6541, | |
| "num_input_tokens_seen": 40422600960, | |
| "step": 38550 | |
| }, | |
| { | |
| "epoch": 0.8478892691760125, | |
| "grad_norm": 0.15865832567214966, | |
| "learning_rate": 0.001, | |
| "loss": 2.6536, | |
| "num_input_tokens_seen": 40475029760, | |
| "step": 38600 | |
| }, | |
| { | |
| "epoch": 0.8489875713381576, | |
| "grad_norm": 0.133284330368042, | |
| "learning_rate": 0.001, | |
| "loss": 2.6531, | |
| "num_input_tokens_seen": 40527458560, | |
| "step": 38650 | |
| }, | |
| { | |
| "epoch": 0.8500858735003027, | |
| "grad_norm": 0.1421806663274765, | |
| "learning_rate": 0.001, | |
| "loss": 2.6558, | |
| "num_input_tokens_seen": 40579887360, | |
| "step": 38700 | |
| }, | |
| { | |
| "epoch": 0.8511841756624479, | |
| "grad_norm": 0.19429996609687805, | |
| "learning_rate": 0.001, | |
| "loss": 2.6628, | |
| "num_input_tokens_seen": 40632316160, | |
| "step": 38750 | |
| }, | |
| { | |
| "epoch": 0.8522824778245929, | |
| "grad_norm": 0.14661937952041626, | |
| "learning_rate": 0.001, | |
| "loss": 2.6594, | |
| "num_input_tokens_seen": 40684744960, | |
| "step": 38800 | |
| }, | |
| { | |
| "epoch": 0.853380779986738, | |
| "grad_norm": 0.1694687008857727, | |
| "learning_rate": 0.001, | |
| "loss": 2.6571, | |
| "num_input_tokens_seen": 40737173760, | |
| "step": 38850 | |
| }, | |
| { | |
| "epoch": 0.8544790821488831, | |
| "grad_norm": 0.152188241481781, | |
| "learning_rate": 0.001, | |
| "loss": 2.6534, | |
| "num_input_tokens_seen": 40789602560, | |
| "step": 38900 | |
| }, | |
| { | |
| "epoch": 0.8555773843110281, | |
| "grad_norm": 0.1554640680551529, | |
| "learning_rate": 0.001, | |
| "loss": 2.649, | |
| "num_input_tokens_seen": 40842031360, | |
| "step": 38950 | |
| }, | |
| { | |
| "epoch": 0.8566756864731733, | |
| "grad_norm": 0.1481955647468567, | |
| "learning_rate": 0.001, | |
| "loss": 2.6527, | |
| "num_input_tokens_seen": 40894460160, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 0.8566756864731733, | |
| "eval_loss": 2.547664165496826, | |
| "eval_runtime": 66.2874, | |
| "eval_samples_per_second": 75.429, | |
| "eval_steps_per_second": 18.857, | |
| "num_input_tokens_seen": 40894460160, | |
| "step": 39000 | |
| } | |
| ], | |
| "logging_steps": 50, | |
| "max_steps": 200000, | |
| "num_input_tokens_seen": 40894460160, | |
| "num_train_epochs": 5, | |
| "save_steps": 1000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.3289694735724052e+19, | |
| "train_batch_size": 64, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |