| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 4.98793242156074, | |
| "eval_steps": 500, | |
| "global_step": 775, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.006436041834271922, | |
| "grad_norm": 181.50096130371094, | |
| "learning_rate": 8.333333333333335e-09, | |
| "loss": 8.4196, | |
| "num_input_tokens_seen": 6848, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.012872083668543845, | |
| "grad_norm": 187.05642700195312, | |
| "learning_rate": 1.666666666666667e-08, | |
| "loss": 8.44, | |
| "num_input_tokens_seen": 14000, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.019308125502815767, | |
| "grad_norm": 182.92320251464844, | |
| "learning_rate": 2.5000000000000002e-08, | |
| "loss": 8.3839, | |
| "num_input_tokens_seen": 21152, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.02574416733708769, | |
| "grad_norm": 186.71311950683594, | |
| "learning_rate": 3.333333333333334e-08, | |
| "loss": 8.4024, | |
| "num_input_tokens_seen": 28224, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.032180209171359615, | |
| "grad_norm": 180.32656860351562, | |
| "learning_rate": 4.166666666666667e-08, | |
| "loss": 8.4594, | |
| "num_input_tokens_seen": 35360, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.038616251005631534, | |
| "grad_norm": 189.87557983398438, | |
| "learning_rate": 5.0000000000000004e-08, | |
| "loss": 8.4107, | |
| "num_input_tokens_seen": 42192, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.04505229283990346, | |
| "grad_norm": 185.89984130859375, | |
| "learning_rate": 5.833333333333334e-08, | |
| "loss": 8.4551, | |
| "num_input_tokens_seen": 49088, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.05148833467417538, | |
| "grad_norm": 188.8160400390625, | |
| "learning_rate": 6.666666666666668e-08, | |
| "loss": 8.4415, | |
| "num_input_tokens_seen": 55856, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.057924376508447305, | |
| "grad_norm": 190.1417236328125, | |
| "learning_rate": 7.500000000000001e-08, | |
| "loss": 8.4965, | |
| "num_input_tokens_seen": 63120, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.06436041834271923, | |
| "grad_norm": 185.3598175048828, | |
| "learning_rate": 8.333333333333334e-08, | |
| "loss": 8.4251, | |
| "num_input_tokens_seen": 69968, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.07079646017699115, | |
| "grad_norm": 183.81944274902344, | |
| "learning_rate": 9.166666666666668e-08, | |
| "loss": 8.4291, | |
| "num_input_tokens_seen": 77168, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.07723250201126307, | |
| "grad_norm": 196.39779663085938, | |
| "learning_rate": 1.0000000000000001e-07, | |
| "loss": 8.4463, | |
| "num_input_tokens_seen": 84272, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.083668543845535, | |
| "grad_norm": 181.4925994873047, | |
| "learning_rate": 1.0833333333333335e-07, | |
| "loss": 8.5116, | |
| "num_input_tokens_seen": 91232, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.09010458567980692, | |
| "grad_norm": 190.0314178466797, | |
| "learning_rate": 1.1666666666666668e-07, | |
| "loss": 8.4749, | |
| "num_input_tokens_seen": 97968, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.09654062751407884, | |
| "grad_norm": 188.7615203857422, | |
| "learning_rate": 1.2500000000000002e-07, | |
| "loss": 8.3311, | |
| "num_input_tokens_seen": 104864, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.10297666934835076, | |
| "grad_norm": 184.1820526123047, | |
| "learning_rate": 1.3333333333333336e-07, | |
| "loss": 8.3729, | |
| "num_input_tokens_seen": 111488, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.10941271118262269, | |
| "grad_norm": 181.39308166503906, | |
| "learning_rate": 1.4166666666666668e-07, | |
| "loss": 8.4261, | |
| "num_input_tokens_seen": 118384, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.11584875301689461, | |
| "grad_norm": 181.79583740234375, | |
| "learning_rate": 1.5000000000000002e-07, | |
| "loss": 8.3051, | |
| "num_input_tokens_seen": 125360, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.12228479485116653, | |
| "grad_norm": 181.36965942382812, | |
| "learning_rate": 1.5833333333333336e-07, | |
| "loss": 8.2461, | |
| "num_input_tokens_seen": 132320, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.12872083668543846, | |
| "grad_norm": 182.36839294433594, | |
| "learning_rate": 1.6666666666666668e-07, | |
| "loss": 8.2894, | |
| "num_input_tokens_seen": 139376, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.13515687851971037, | |
| "grad_norm": 189.7889404296875, | |
| "learning_rate": 1.7500000000000002e-07, | |
| "loss": 8.2484, | |
| "num_input_tokens_seen": 146544, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.1415929203539823, | |
| "grad_norm": 190.1185302734375, | |
| "learning_rate": 1.8333333333333336e-07, | |
| "loss": 8.3034, | |
| "num_input_tokens_seen": 153472, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.14802896218825423, | |
| "grad_norm": 183.1331024169922, | |
| "learning_rate": 1.9166666666666668e-07, | |
| "loss": 8.054, | |
| "num_input_tokens_seen": 159856, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.15446500402252614, | |
| "grad_norm": 168.13046264648438, | |
| "learning_rate": 2.0000000000000002e-07, | |
| "loss": 7.9583, | |
| "num_input_tokens_seen": 166528, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.16090104585679807, | |
| "grad_norm": 167.57830810546875, | |
| "learning_rate": 2.0833333333333333e-07, | |
| "loss": 7.9626, | |
| "num_input_tokens_seen": 173056, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.16733708769107, | |
| "grad_norm": 170.6557159423828, | |
| "learning_rate": 2.166666666666667e-07, | |
| "loss": 7.8761, | |
| "num_input_tokens_seen": 179616, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.1737731295253419, | |
| "grad_norm": 179.7693328857422, | |
| "learning_rate": 2.2500000000000002e-07, | |
| "loss": 7.8896, | |
| "num_input_tokens_seen": 186912, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.18020917135961384, | |
| "grad_norm": 180.4197998046875, | |
| "learning_rate": 2.3333333333333336e-07, | |
| "loss": 7.8352, | |
| "num_input_tokens_seen": 193936, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.18664521319388577, | |
| "grad_norm": 164.2944793701172, | |
| "learning_rate": 2.416666666666667e-07, | |
| "loss": 7.691, | |
| "num_input_tokens_seen": 200672, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.19308125502815768, | |
| "grad_norm": 167.71722412109375, | |
| "learning_rate": 2.5000000000000004e-07, | |
| "loss": 7.7851, | |
| "num_input_tokens_seen": 207536, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.1995172968624296, | |
| "grad_norm": 169.2217254638672, | |
| "learning_rate": 2.5833333333333333e-07, | |
| "loss": 7.7249, | |
| "num_input_tokens_seen": 214640, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.20595333869670152, | |
| "grad_norm": 155.74537658691406, | |
| "learning_rate": 2.666666666666667e-07, | |
| "loss": 6.8838, | |
| "num_input_tokens_seen": 221744, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.21238938053097345, | |
| "grad_norm": 148.12120056152344, | |
| "learning_rate": 2.75e-07, | |
| "loss": 6.7173, | |
| "num_input_tokens_seen": 228624, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.21882542236524538, | |
| "grad_norm": 150.97012329101562, | |
| "learning_rate": 2.8333333333333336e-07, | |
| "loss": 6.6793, | |
| "num_input_tokens_seen": 235456, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.2252614641995173, | |
| "grad_norm": 149.623291015625, | |
| "learning_rate": 2.916666666666667e-07, | |
| "loss": 6.725, | |
| "num_input_tokens_seen": 242768, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.23169750603378922, | |
| "grad_norm": 147.1656036376953, | |
| "learning_rate": 3.0000000000000004e-07, | |
| "loss": 6.6905, | |
| "num_input_tokens_seen": 249552, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.23813354786806115, | |
| "grad_norm": 151.0162811279297, | |
| "learning_rate": 3.083333333333334e-07, | |
| "loss": 6.6179, | |
| "num_input_tokens_seen": 256160, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.24456958970233306, | |
| "grad_norm": 150.03030395507812, | |
| "learning_rate": 3.166666666666667e-07, | |
| "loss": 6.501, | |
| "num_input_tokens_seen": 262912, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.251005631536605, | |
| "grad_norm": 145.5784149169922, | |
| "learning_rate": 3.25e-07, | |
| "loss": 6.4588, | |
| "num_input_tokens_seen": 269600, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.2574416733708769, | |
| "grad_norm": 143.5873565673828, | |
| "learning_rate": 3.3333333333333335e-07, | |
| "loss": 6.3614, | |
| "num_input_tokens_seen": 276560, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.26387771520514886, | |
| "grad_norm": 144.9624481201172, | |
| "learning_rate": 3.416666666666667e-07, | |
| "loss": 6.2775, | |
| "num_input_tokens_seen": 283696, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.27031375703942073, | |
| "grad_norm": 146.71554565429688, | |
| "learning_rate": 3.5000000000000004e-07, | |
| "loss": 5.9868, | |
| "num_input_tokens_seen": 290832, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.27674979887369267, | |
| "grad_norm": 138.25450134277344, | |
| "learning_rate": 3.583333333333334e-07, | |
| "loss": 5.2286, | |
| "num_input_tokens_seen": 298096, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.2831858407079646, | |
| "grad_norm": 156.28713989257812, | |
| "learning_rate": 3.666666666666667e-07, | |
| "loss": 4.5076, | |
| "num_input_tokens_seen": 305120, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.28962188254223653, | |
| "grad_norm": 178.4820556640625, | |
| "learning_rate": 3.75e-07, | |
| "loss": 4.1167, | |
| "num_input_tokens_seen": 312000, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.29605792437650846, | |
| "grad_norm": 317.7680358886719, | |
| "learning_rate": 3.8333333333333335e-07, | |
| "loss": 3.6585, | |
| "num_input_tokens_seen": 319008, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.3024939662107804, | |
| "grad_norm": 282.17803955078125, | |
| "learning_rate": 3.9166666666666675e-07, | |
| "loss": 3.3613, | |
| "num_input_tokens_seen": 326192, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.3089300080450523, | |
| "grad_norm": 257.7794494628906, | |
| "learning_rate": 4.0000000000000003e-07, | |
| "loss": 3.1068, | |
| "num_input_tokens_seen": 333664, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.3153660498793242, | |
| "grad_norm": 255.1024169921875, | |
| "learning_rate": 4.083333333333334e-07, | |
| "loss": 2.9368, | |
| "num_input_tokens_seen": 340912, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.32180209171359614, | |
| "grad_norm": 259.47015380859375, | |
| "learning_rate": 4.1666666666666667e-07, | |
| "loss": 2.3466, | |
| "num_input_tokens_seen": 347712, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.32823813354786807, | |
| "grad_norm": 263.3533935546875, | |
| "learning_rate": 4.2500000000000006e-07, | |
| "loss": 2.0645, | |
| "num_input_tokens_seen": 355232, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.33467417538214, | |
| "grad_norm": 239.1399688720703, | |
| "learning_rate": 4.333333333333334e-07, | |
| "loss": 1.7729, | |
| "num_input_tokens_seen": 361968, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.3411102172164119, | |
| "grad_norm": 257.4410095214844, | |
| "learning_rate": 4.416666666666667e-07, | |
| "loss": 1.6199, | |
| "num_input_tokens_seen": 369136, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.3475462590506838, | |
| "grad_norm": 169.56935119628906, | |
| "learning_rate": 4.5000000000000003e-07, | |
| "loss": 1.1593, | |
| "num_input_tokens_seen": 375904, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.35398230088495575, | |
| "grad_norm": 95.25677490234375, | |
| "learning_rate": 4.583333333333333e-07, | |
| "loss": 0.7199, | |
| "num_input_tokens_seen": 382848, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.3604183427192277, | |
| "grad_norm": 48.7137451171875, | |
| "learning_rate": 4.666666666666667e-07, | |
| "loss": 0.4394, | |
| "num_input_tokens_seen": 389680, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.3668543845534996, | |
| "grad_norm": 62.34474563598633, | |
| "learning_rate": 4.7500000000000006e-07, | |
| "loss": 0.3806, | |
| "num_input_tokens_seen": 396192, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.37329042638777155, | |
| "grad_norm": 30.711780548095703, | |
| "learning_rate": 4.833333333333334e-07, | |
| "loss": 0.3185, | |
| "num_input_tokens_seen": 403104, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.3797264682220434, | |
| "grad_norm": 34.46913528442383, | |
| "learning_rate": 4.916666666666667e-07, | |
| "loss": 0.3056, | |
| "num_input_tokens_seen": 410176, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.38616251005631536, | |
| "grad_norm": 25.92363166809082, | |
| "learning_rate": 5.000000000000001e-07, | |
| "loss": 0.2981, | |
| "num_input_tokens_seen": 416928, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.3925985518905873, | |
| "grad_norm": 11.064619064331055, | |
| "learning_rate": 5.083333333333334e-07, | |
| "loss": 0.2473, | |
| "num_input_tokens_seen": 424128, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.3990345937248592, | |
| "grad_norm": 55.367347717285156, | |
| "learning_rate": 5.166666666666667e-07, | |
| "loss": 0.2924, | |
| "num_input_tokens_seen": 430864, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.40547063555913115, | |
| "grad_norm": 42.00873947143555, | |
| "learning_rate": 5.250000000000001e-07, | |
| "loss": 0.2656, | |
| "num_input_tokens_seen": 437744, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.41190667739340303, | |
| "grad_norm": 13.313591003417969, | |
| "learning_rate": 5.333333333333335e-07, | |
| "loss": 0.2335, | |
| "num_input_tokens_seen": 444624, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.41834271922767496, | |
| "grad_norm": 60.489715576171875, | |
| "learning_rate": 5.416666666666667e-07, | |
| "loss": 0.2647, | |
| "num_input_tokens_seen": 451696, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.4247787610619469, | |
| "grad_norm": 77.01821899414062, | |
| "learning_rate": 5.5e-07, | |
| "loss": 0.3003, | |
| "num_input_tokens_seen": 458784, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.43121480289621883, | |
| "grad_norm": 58.067596435546875, | |
| "learning_rate": 5.583333333333333e-07, | |
| "loss": 0.2656, | |
| "num_input_tokens_seen": 465920, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.43765084473049076, | |
| "grad_norm": 12.40570068359375, | |
| "learning_rate": 5.666666666666667e-07, | |
| "loss": 0.2212, | |
| "num_input_tokens_seen": 473152, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.4440868865647627, | |
| "grad_norm": 35.392276763916016, | |
| "learning_rate": 5.750000000000001e-07, | |
| "loss": 0.2532, | |
| "num_input_tokens_seen": 480544, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.4505229283990346, | |
| "grad_norm": 51.42181396484375, | |
| "learning_rate": 5.833333333333334e-07, | |
| "loss": 0.2799, | |
| "num_input_tokens_seen": 487552, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.4569589702333065, | |
| "grad_norm": 45.73934555053711, | |
| "learning_rate": 5.916666666666667e-07, | |
| "loss": 0.2876, | |
| "num_input_tokens_seen": 494256, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.46339501206757844, | |
| "grad_norm": 20.654096603393555, | |
| "learning_rate": 6.000000000000001e-07, | |
| "loss": 0.2191, | |
| "num_input_tokens_seen": 500768, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.46983105390185037, | |
| "grad_norm": 21.078027725219727, | |
| "learning_rate": 6.083333333333334e-07, | |
| "loss": 0.2344, | |
| "num_input_tokens_seen": 507136, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.4762670957361223, | |
| "grad_norm": 36.7335205078125, | |
| "learning_rate": 6.166666666666668e-07, | |
| "loss": 0.2547, | |
| "num_input_tokens_seen": 514208, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.4827031375703942, | |
| "grad_norm": 34.47271728515625, | |
| "learning_rate": 6.25e-07, | |
| "loss": 0.2349, | |
| "num_input_tokens_seen": 521120, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.4891391794046661, | |
| "grad_norm": 5.103244781494141, | |
| "learning_rate": 6.333333333333334e-07, | |
| "loss": 0.2045, | |
| "num_input_tokens_seen": 527824, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.49557522123893805, | |
| "grad_norm": 22.47526741027832, | |
| "learning_rate": 6.416666666666667e-07, | |
| "loss": 0.2262, | |
| "num_input_tokens_seen": 534832, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.50201126307321, | |
| "grad_norm": 30.610803604125977, | |
| "learning_rate": 6.5e-07, | |
| "loss": 0.2393, | |
| "num_input_tokens_seen": 541696, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.5084473049074819, | |
| "grad_norm": 10.922965049743652, | |
| "learning_rate": 6.583333333333333e-07, | |
| "loss": 0.2206, | |
| "num_input_tokens_seen": 548608, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.5148833467417538, | |
| "grad_norm": 17.484182357788086, | |
| "learning_rate": 6.666666666666667e-07, | |
| "loss": 0.2029, | |
| "num_input_tokens_seen": 555456, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.5213193885760258, | |
| "grad_norm": 16.49226188659668, | |
| "learning_rate": 6.750000000000001e-07, | |
| "loss": 0.2125, | |
| "num_input_tokens_seen": 562768, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.5277554304102977, | |
| "grad_norm": 9.977084159851074, | |
| "learning_rate": 6.833333333333334e-07, | |
| "loss": 0.2023, | |
| "num_input_tokens_seen": 569536, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.5341914722445696, | |
| "grad_norm": 17.79197120666504, | |
| "learning_rate": 6.916666666666668e-07, | |
| "loss": 0.2262, | |
| "num_input_tokens_seen": 576096, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.5406275140788415, | |
| "grad_norm": 16.699260711669922, | |
| "learning_rate": 7.000000000000001e-07, | |
| "loss": 0.2003, | |
| "num_input_tokens_seen": 583472, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.5470635559131134, | |
| "grad_norm": 25.02164077758789, | |
| "learning_rate": 7.083333333333334e-07, | |
| "loss": 0.2351, | |
| "num_input_tokens_seen": 590304, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.5534995977473853, | |
| "grad_norm": 3.8612709045410156, | |
| "learning_rate": 7.166666666666668e-07, | |
| "loss": 0.1839, | |
| "num_input_tokens_seen": 597152, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.5599356395816573, | |
| "grad_norm": 31.555482864379883, | |
| "learning_rate": 7.25e-07, | |
| "loss": 0.2315, | |
| "num_input_tokens_seen": 604208, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.5663716814159292, | |
| "grad_norm": 54.94756317138672, | |
| "learning_rate": 7.333333333333334e-07, | |
| "loss": 0.2732, | |
| "num_input_tokens_seen": 610896, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.5728077232502011, | |
| "grad_norm": 30.55241584777832, | |
| "learning_rate": 7.416666666666668e-07, | |
| "loss": 0.2405, | |
| "num_input_tokens_seen": 618112, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.5792437650844731, | |
| "grad_norm": 16.687997817993164, | |
| "learning_rate": 7.5e-07, | |
| "loss": 0.2005, | |
| "num_input_tokens_seen": 625040, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.585679806918745, | |
| "grad_norm": 10.350790977478027, | |
| "learning_rate": 7.583333333333334e-07, | |
| "loss": 0.2005, | |
| "num_input_tokens_seen": 631840, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.5921158487530169, | |
| "grad_norm": 25.88368797302246, | |
| "learning_rate": 7.666666666666667e-07, | |
| "loss": 0.2115, | |
| "num_input_tokens_seen": 638752, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.5985518905872889, | |
| "grad_norm": 17.11625099182129, | |
| "learning_rate": 7.750000000000001e-07, | |
| "loss": 0.2141, | |
| "num_input_tokens_seen": 645968, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.6049879324215608, | |
| "grad_norm": 12.70864200592041, | |
| "learning_rate": 7.833333333333335e-07, | |
| "loss": 0.1898, | |
| "num_input_tokens_seen": 652752, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.6114239742558326, | |
| "grad_norm": 3.674001455307007, | |
| "learning_rate": 7.916666666666667e-07, | |
| "loss": 0.2099, | |
| "num_input_tokens_seen": 660048, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.6178600160901045, | |
| "grad_norm": 20.51032066345215, | |
| "learning_rate": 8.000000000000001e-07, | |
| "loss": 0.2014, | |
| "num_input_tokens_seen": 666752, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.6242960579243765, | |
| "grad_norm": 47.562381744384766, | |
| "learning_rate": 8.083333333333334e-07, | |
| "loss": 0.2349, | |
| "num_input_tokens_seen": 673856, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.6307320997586484, | |
| "grad_norm": 35.69169998168945, | |
| "learning_rate": 8.166666666666668e-07, | |
| "loss": 0.2205, | |
| "num_input_tokens_seen": 681104, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.6371681415929203, | |
| "grad_norm": 10.080629348754883, | |
| "learning_rate": 8.250000000000001e-07, | |
| "loss": 0.199, | |
| "num_input_tokens_seen": 688128, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.6436041834271923, | |
| "grad_norm": 26.242666244506836, | |
| "learning_rate": 8.333333333333333e-07, | |
| "loss": 0.236, | |
| "num_input_tokens_seen": 695216, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.6500402252614642, | |
| "grad_norm": 22.0434627532959, | |
| "learning_rate": 8.416666666666667e-07, | |
| "loss": 0.2265, | |
| "num_input_tokens_seen": 701968, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 0.6564762670957361, | |
| "grad_norm": 27.378408432006836, | |
| "learning_rate": 8.500000000000001e-07, | |
| "loss": 0.2443, | |
| "num_input_tokens_seen": 708928, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.6629123089300081, | |
| "grad_norm": 11.929069519042969, | |
| "learning_rate": 8.583333333333334e-07, | |
| "loss": 0.2086, | |
| "num_input_tokens_seen": 715952, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 0.66934835076428, | |
| "grad_norm": 6.677243232727051, | |
| "learning_rate": 8.666666666666668e-07, | |
| "loss": 0.1915, | |
| "num_input_tokens_seen": 722928, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 0.6757843925985519, | |
| "grad_norm": 17.033658981323242, | |
| "learning_rate": 8.75e-07, | |
| "loss": 0.1967, | |
| "num_input_tokens_seen": 730160, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.6822204344328238, | |
| "grad_norm": 6.806990146636963, | |
| "learning_rate": 8.833333333333334e-07, | |
| "loss": 0.188, | |
| "num_input_tokens_seen": 737088, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 0.6886564762670957, | |
| "grad_norm": 4.871335506439209, | |
| "learning_rate": 8.916666666666668e-07, | |
| "loss": 0.1895, | |
| "num_input_tokens_seen": 743744, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 0.6950925181013676, | |
| "grad_norm": 9.054122924804688, | |
| "learning_rate": 9.000000000000001e-07, | |
| "loss": 0.1667, | |
| "num_input_tokens_seen": 750496, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.7015285599356396, | |
| "grad_norm": 15.78903579711914, | |
| "learning_rate": 9.083333333333335e-07, | |
| "loss": 0.1976, | |
| "num_input_tokens_seen": 757792, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 0.7079646017699115, | |
| "grad_norm": 10.51429271697998, | |
| "learning_rate": 9.166666666666666e-07, | |
| "loss": 0.2057, | |
| "num_input_tokens_seen": 764992, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.7144006436041834, | |
| "grad_norm": 24.346830368041992, | |
| "learning_rate": 9.25e-07, | |
| "loss": 0.2002, | |
| "num_input_tokens_seen": 771648, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 0.7208366854384554, | |
| "grad_norm": 46.50392532348633, | |
| "learning_rate": 9.333333333333334e-07, | |
| "loss": 0.2173, | |
| "num_input_tokens_seen": 778480, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.7272727272727273, | |
| "grad_norm": 22.505762100219727, | |
| "learning_rate": 9.416666666666667e-07, | |
| "loss": 0.1756, | |
| "num_input_tokens_seen": 785328, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 0.7337087691069992, | |
| "grad_norm": 5.675211429595947, | |
| "learning_rate": 9.500000000000001e-07, | |
| "loss": 0.1786, | |
| "num_input_tokens_seen": 792592, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 0.7401448109412712, | |
| "grad_norm": 14.814651489257812, | |
| "learning_rate": 9.583333333333334e-07, | |
| "loss": 0.1879, | |
| "num_input_tokens_seen": 799808, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.7465808527755431, | |
| "grad_norm": 13.106173515319824, | |
| "learning_rate": 9.666666666666668e-07, | |
| "loss": 0.173, | |
| "num_input_tokens_seen": 806896, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 0.7530168946098149, | |
| "grad_norm": 24.56918716430664, | |
| "learning_rate": 9.750000000000002e-07, | |
| "loss": 0.1714, | |
| "num_input_tokens_seen": 813536, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 0.7594529364440868, | |
| "grad_norm": 27.256954193115234, | |
| "learning_rate": 9.833333333333334e-07, | |
| "loss": 0.2015, | |
| "num_input_tokens_seen": 820608, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 0.7658889782783588, | |
| "grad_norm": 4.209413051605225, | |
| "learning_rate": 9.916666666666668e-07, | |
| "loss": 0.1847, | |
| "num_input_tokens_seen": 827776, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 0.7723250201126307, | |
| "grad_norm": 18.684349060058594, | |
| "learning_rate": 1.0000000000000002e-06, | |
| "loss": 0.1876, | |
| "num_input_tokens_seen": 834704, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.7787610619469026, | |
| "grad_norm": 19.470041275024414, | |
| "learning_rate": 1.0083333333333333e-06, | |
| "loss": 0.1937, | |
| "num_input_tokens_seen": 841568, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 0.7851971037811746, | |
| "grad_norm": 11.242873191833496, | |
| "learning_rate": 1.0166666666666667e-06, | |
| "loss": 0.1974, | |
| "num_input_tokens_seen": 848704, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 0.7916331456154465, | |
| "grad_norm": 26.72730255126953, | |
| "learning_rate": 1.025e-06, | |
| "loss": 0.2099, | |
| "num_input_tokens_seen": 855664, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 0.7980691874497184, | |
| "grad_norm": 41.4288215637207, | |
| "learning_rate": 1.0333333333333333e-06, | |
| "loss": 0.2239, | |
| "num_input_tokens_seen": 862464, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 0.8045052292839904, | |
| "grad_norm": 27.283327102661133, | |
| "learning_rate": 1.0416666666666667e-06, | |
| "loss": 0.1953, | |
| "num_input_tokens_seen": 869376, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.8109412711182623, | |
| "grad_norm": 4.882501602172852, | |
| "learning_rate": 1.0500000000000001e-06, | |
| "loss": 0.1906, | |
| "num_input_tokens_seen": 876848, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 0.8173773129525342, | |
| "grad_norm": 8.478296279907227, | |
| "learning_rate": 1.0583333333333335e-06, | |
| "loss": 0.1852, | |
| "num_input_tokens_seen": 883664, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 0.8238133547868061, | |
| "grad_norm": 6.773479461669922, | |
| "learning_rate": 1.066666666666667e-06, | |
| "loss": 0.198, | |
| "num_input_tokens_seen": 890592, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 0.830249396621078, | |
| "grad_norm": 21.877212524414062, | |
| "learning_rate": 1.075e-06, | |
| "loss": 0.2105, | |
| "num_input_tokens_seen": 898048, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 0.8366854384553499, | |
| "grad_norm": 12.123941421508789, | |
| "learning_rate": 1.0833333333333335e-06, | |
| "loss": 0.1899, | |
| "num_input_tokens_seen": 905040, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.8431214802896219, | |
| "grad_norm": 15.84151554107666, | |
| "learning_rate": 1.0916666666666667e-06, | |
| "loss": 0.1742, | |
| "num_input_tokens_seen": 912080, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 0.8495575221238938, | |
| "grad_norm": 8.174356460571289, | |
| "learning_rate": 1.1e-06, | |
| "loss": 0.1585, | |
| "num_input_tokens_seen": 919424, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 0.8559935639581657, | |
| "grad_norm": 14.87348461151123, | |
| "learning_rate": 1.1083333333333335e-06, | |
| "loss": 0.1878, | |
| "num_input_tokens_seen": 926608, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 0.8624296057924377, | |
| "grad_norm": 11.989315032958984, | |
| "learning_rate": 1.1166666666666666e-06, | |
| "loss": 0.1748, | |
| "num_input_tokens_seen": 933712, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 0.8688656476267096, | |
| "grad_norm": 9.659666061401367, | |
| "learning_rate": 1.125e-06, | |
| "loss": 0.1944, | |
| "num_input_tokens_seen": 940304, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.8753016894609815, | |
| "grad_norm": 20.558237075805664, | |
| "learning_rate": 1.1333333333333334e-06, | |
| "loss": 0.1727, | |
| "num_input_tokens_seen": 947008, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 0.8817377312952535, | |
| "grad_norm": 8.66232967376709, | |
| "learning_rate": 1.1416666666666668e-06, | |
| "loss": 0.1748, | |
| "num_input_tokens_seen": 954112, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 0.8881737731295254, | |
| "grad_norm": 16.516559600830078, | |
| "learning_rate": 1.1500000000000002e-06, | |
| "loss": 0.1625, | |
| "num_input_tokens_seen": 961120, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 0.8946098149637972, | |
| "grad_norm": 6.140871047973633, | |
| "learning_rate": 1.1583333333333334e-06, | |
| "loss": 0.1649, | |
| "num_input_tokens_seen": 967792, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 0.9010458567980691, | |
| "grad_norm": 11.593804359436035, | |
| "learning_rate": 1.1666666666666668e-06, | |
| "loss": 0.1738, | |
| "num_input_tokens_seen": 974496, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.9074818986323411, | |
| "grad_norm": 26.92620849609375, | |
| "learning_rate": 1.175e-06, | |
| "loss": 0.2221, | |
| "num_input_tokens_seen": 981344, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 0.913917940466613, | |
| "grad_norm": 26.845230102539062, | |
| "learning_rate": 1.1833333333333334e-06, | |
| "loss": 0.1989, | |
| "num_input_tokens_seen": 988224, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 0.9203539823008849, | |
| "grad_norm": 12.823030471801758, | |
| "learning_rate": 1.1916666666666668e-06, | |
| "loss": 0.1569, | |
| "num_input_tokens_seen": 995552, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 0.9267900241351569, | |
| "grad_norm": 14.508877754211426, | |
| "learning_rate": 1.2000000000000002e-06, | |
| "loss": 0.1594, | |
| "num_input_tokens_seen": 1002224, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 0.9332260659694288, | |
| "grad_norm": 13.097854614257812, | |
| "learning_rate": 1.2083333333333333e-06, | |
| "loss": 0.1609, | |
| "num_input_tokens_seen": 1009312, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.9396621078037007, | |
| "grad_norm": 12.183431625366211, | |
| "learning_rate": 1.2166666666666667e-06, | |
| "loss": 0.1649, | |
| "num_input_tokens_seen": 1016256, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 0.9460981496379727, | |
| "grad_norm": 10.628469467163086, | |
| "learning_rate": 1.2250000000000001e-06, | |
| "loss": 0.1412, | |
| "num_input_tokens_seen": 1022880, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 0.9525341914722446, | |
| "grad_norm": 11.713327407836914, | |
| "learning_rate": 1.2333333333333335e-06, | |
| "loss": 0.165, | |
| "num_input_tokens_seen": 1029856, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 0.9589702333065165, | |
| "grad_norm": 10.031126976013184, | |
| "learning_rate": 1.2416666666666667e-06, | |
| "loss": 0.1971, | |
| "num_input_tokens_seen": 1036928, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 0.9654062751407884, | |
| "grad_norm": 34.122074127197266, | |
| "learning_rate": 1.25e-06, | |
| "loss": 0.1843, | |
| "num_input_tokens_seen": 1044000, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.9718423169750603, | |
| "grad_norm": 13.707520484924316, | |
| "learning_rate": 1.2583333333333333e-06, | |
| "loss": 0.1628, | |
| "num_input_tokens_seen": 1050928, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 0.9782783588093322, | |
| "grad_norm": 8.588343620300293, | |
| "learning_rate": 1.2666666666666669e-06, | |
| "loss": 0.1878, | |
| "num_input_tokens_seen": 1057920, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 0.9847144006436042, | |
| "grad_norm": 4.411599159240723, | |
| "learning_rate": 1.275e-06, | |
| "loss": 0.1153, | |
| "num_input_tokens_seen": 1064704, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 0.9911504424778761, | |
| "grad_norm": 13.095698356628418, | |
| "learning_rate": 1.2833333333333335e-06, | |
| "loss": 0.1622, | |
| "num_input_tokens_seen": 1071760, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 0.997586484312148, | |
| "grad_norm": 14.093315124511719, | |
| "learning_rate": 1.2916666666666669e-06, | |
| "loss": 0.1549, | |
| "num_input_tokens_seen": 1078912, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 1.00402252614642, | |
| "grad_norm": 17.082075119018555, | |
| "learning_rate": 1.3e-06, | |
| "loss": 0.1729, | |
| "num_input_tokens_seen": 1086288, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 1.010458567980692, | |
| "grad_norm": 4.992012977600098, | |
| "learning_rate": 1.3083333333333334e-06, | |
| "loss": 0.1198, | |
| "num_input_tokens_seen": 1093584, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 1.0168946098149638, | |
| "grad_norm": 5.45336389541626, | |
| "learning_rate": 1.3166666666666666e-06, | |
| "loss": 0.1723, | |
| "num_input_tokens_seen": 1100432, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 1.0233306516492358, | |
| "grad_norm": 7.4880757331848145, | |
| "learning_rate": 1.3250000000000002e-06, | |
| "loss": 0.1485, | |
| "num_input_tokens_seen": 1107280, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 1.0297666934835077, | |
| "grad_norm": 40.28890609741211, | |
| "learning_rate": 1.3333333333333334e-06, | |
| "loss": 0.1757, | |
| "num_input_tokens_seen": 1113968, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 1.0362027353177796, | |
| "grad_norm": 39.24993896484375, | |
| "learning_rate": 1.3416666666666666e-06, | |
| "loss": 0.1907, | |
| "num_input_tokens_seen": 1120752, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 1.0426387771520516, | |
| "grad_norm": 5.63855504989624, | |
| "learning_rate": 1.3500000000000002e-06, | |
| "loss": 0.1842, | |
| "num_input_tokens_seen": 1127712, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 1.0490748189863235, | |
| "grad_norm": 5.1802754402160645, | |
| "learning_rate": 1.3583333333333334e-06, | |
| "loss": 0.1549, | |
| "num_input_tokens_seen": 1134592, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 1.0555108608205954, | |
| "grad_norm": 4.200067043304443, | |
| "learning_rate": 1.3666666666666668e-06, | |
| "loss": 0.153, | |
| "num_input_tokens_seen": 1141888, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 1.0619469026548674, | |
| "grad_norm": 6.892277240753174, | |
| "learning_rate": 1.3750000000000002e-06, | |
| "loss": 0.1532, | |
| "num_input_tokens_seen": 1148688, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 1.068382944489139, | |
| "grad_norm": 11.852892875671387, | |
| "learning_rate": 1.3833333333333336e-06, | |
| "loss": 0.1629, | |
| "num_input_tokens_seen": 1155552, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 1.074818986323411, | |
| "grad_norm": 8.346076011657715, | |
| "learning_rate": 1.3916666666666668e-06, | |
| "loss": 0.1708, | |
| "num_input_tokens_seen": 1162624, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 1.081255028157683, | |
| "grad_norm": 7.836976528167725, | |
| "learning_rate": 1.4000000000000001e-06, | |
| "loss": 0.1461, | |
| "num_input_tokens_seen": 1169904, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 1.0876910699919549, | |
| "grad_norm": 15.59913158416748, | |
| "learning_rate": 1.4083333333333335e-06, | |
| "loss": 0.1402, | |
| "num_input_tokens_seen": 1176928, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 1.0941271118262268, | |
| "grad_norm": 8.46536636352539, | |
| "learning_rate": 1.4166666666666667e-06, | |
| "loss": 0.143, | |
| "num_input_tokens_seen": 1184160, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 1.1005631536604987, | |
| "grad_norm": 7.491546154022217, | |
| "learning_rate": 1.425e-06, | |
| "loss": 0.1454, | |
| "num_input_tokens_seen": 1191120, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 1.1069991954947707, | |
| "grad_norm": 16.70829200744629, | |
| "learning_rate": 1.4333333333333335e-06, | |
| "loss": 0.1286, | |
| "num_input_tokens_seen": 1197920, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 1.1134352373290426, | |
| "grad_norm": 16.273927688598633, | |
| "learning_rate": 1.4416666666666667e-06, | |
| "loss": 0.1523, | |
| "num_input_tokens_seen": 1204576, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 1.1198712791633145, | |
| "grad_norm": 8.122928619384766, | |
| "learning_rate": 1.45e-06, | |
| "loss": 0.1345, | |
| "num_input_tokens_seen": 1211344, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 1.1263073209975865, | |
| "grad_norm": 27.850522994995117, | |
| "learning_rate": 1.4583333333333335e-06, | |
| "loss": 0.1749, | |
| "num_input_tokens_seen": 1218432, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 1.1327433628318584, | |
| "grad_norm": 30.498666763305664, | |
| "learning_rate": 1.4666666666666669e-06, | |
| "loss": 0.166, | |
| "num_input_tokens_seen": 1225728, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 1.1391794046661303, | |
| "grad_norm": 26.916791915893555, | |
| "learning_rate": 1.475e-06, | |
| "loss": 0.1708, | |
| "num_input_tokens_seen": 1232784, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 1.1456154465004023, | |
| "grad_norm": 13.593954086303711, | |
| "learning_rate": 1.4833333333333337e-06, | |
| "loss": 0.1363, | |
| "num_input_tokens_seen": 1239472, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 1.1520514883346742, | |
| "grad_norm": 17.63590431213379, | |
| "learning_rate": 1.4916666666666669e-06, | |
| "loss": 0.1369, | |
| "num_input_tokens_seen": 1246864, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 1.1584875301689461, | |
| "grad_norm": 12.465302467346191, | |
| "learning_rate": 1.5e-06, | |
| "loss": 0.1632, | |
| "num_input_tokens_seen": 1253936, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 1.164923572003218, | |
| "grad_norm": 18.099266052246094, | |
| "learning_rate": 1.5083333333333336e-06, | |
| "loss": 0.1734, | |
| "num_input_tokens_seen": 1261120, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 1.17135961383749, | |
| "grad_norm": 12.134090423583984, | |
| "learning_rate": 1.5166666666666668e-06, | |
| "loss": 0.135, | |
| "num_input_tokens_seen": 1268208, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 1.177795655671762, | |
| "grad_norm": 5.747508525848389, | |
| "learning_rate": 1.525e-06, | |
| "loss": 0.1355, | |
| "num_input_tokens_seen": 1275296, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 1.1842316975060339, | |
| "grad_norm": 16.193449020385742, | |
| "learning_rate": 1.5333333333333334e-06, | |
| "loss": 0.1324, | |
| "num_input_tokens_seen": 1282320, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 1.1906677393403058, | |
| "grad_norm": 23.576427459716797, | |
| "learning_rate": 1.5416666666666668e-06, | |
| "loss": 0.1754, | |
| "num_input_tokens_seen": 1289008, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 1.1971037811745777, | |
| "grad_norm": 4.542221546173096, | |
| "learning_rate": 1.5500000000000002e-06, | |
| "loss": 0.1484, | |
| "num_input_tokens_seen": 1296208, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 1.2035398230088497, | |
| "grad_norm": 6.084584712982178, | |
| "learning_rate": 1.5583333333333334e-06, | |
| "loss": 0.1315, | |
| "num_input_tokens_seen": 1303072, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 1.2099758648431216, | |
| "grad_norm": 18.8467960357666, | |
| "learning_rate": 1.566666666666667e-06, | |
| "loss": 0.1665, | |
| "num_input_tokens_seen": 1310320, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 1.2164119066773935, | |
| "grad_norm": 6.79512882232666, | |
| "learning_rate": 1.5750000000000002e-06, | |
| "loss": 0.1406, | |
| "num_input_tokens_seen": 1317728, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 1.2228479485116655, | |
| "grad_norm": 11.130036354064941, | |
| "learning_rate": 1.5833333333333333e-06, | |
| "loss": 0.1391, | |
| "num_input_tokens_seen": 1325216, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 1.2292839903459372, | |
| "grad_norm": 17.00998306274414, | |
| "learning_rate": 1.591666666666667e-06, | |
| "loss": 0.1339, | |
| "num_input_tokens_seen": 1332272, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 1.235720032180209, | |
| "grad_norm": 16.623762130737305, | |
| "learning_rate": 1.6000000000000001e-06, | |
| "loss": 0.1613, | |
| "num_input_tokens_seen": 1339008, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 1.242156074014481, | |
| "grad_norm": 15.660219192504883, | |
| "learning_rate": 1.6083333333333333e-06, | |
| "loss": 0.1274, | |
| "num_input_tokens_seen": 1345664, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 1.248592115848753, | |
| "grad_norm": 21.379770278930664, | |
| "learning_rate": 1.6166666666666667e-06, | |
| "loss": 0.1882, | |
| "num_input_tokens_seen": 1352720, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 1.255028157683025, | |
| "grad_norm": 8.196439743041992, | |
| "learning_rate": 1.6250000000000001e-06, | |
| "loss": 0.1106, | |
| "num_input_tokens_seen": 1359616, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 1.2614641995172968, | |
| "grad_norm": 4.444194793701172, | |
| "learning_rate": 1.6333333333333335e-06, | |
| "loss": 0.1249, | |
| "num_input_tokens_seen": 1366656, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 1.2679002413515688, | |
| "grad_norm": 10.585016250610352, | |
| "learning_rate": 1.6416666666666667e-06, | |
| "loss": 0.1499, | |
| "num_input_tokens_seen": 1373904, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 1.2743362831858407, | |
| "grad_norm": 18.406293869018555, | |
| "learning_rate": 1.6500000000000003e-06, | |
| "loss": 0.1512, | |
| "num_input_tokens_seen": 1380528, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 1.2807723250201126, | |
| "grad_norm": 5.323694229125977, | |
| "learning_rate": 1.6583333333333335e-06, | |
| "loss": 0.1166, | |
| "num_input_tokens_seen": 1386912, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 1.2872083668543846, | |
| "grad_norm": 20.726289749145508, | |
| "learning_rate": 1.6666666666666667e-06, | |
| "loss": 0.21, | |
| "num_input_tokens_seen": 1393648, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 1.2936444086886565, | |
| "grad_norm": 24.05786895751953, | |
| "learning_rate": 1.6750000000000003e-06, | |
| "loss": 0.1915, | |
| "num_input_tokens_seen": 1400640, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 1.3000804505229284, | |
| "grad_norm": 19.30237579345703, | |
| "learning_rate": 1.6833333333333335e-06, | |
| "loss": 0.1911, | |
| "num_input_tokens_seen": 1407984, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 1.3065164923572004, | |
| "grad_norm": 6.517977714538574, | |
| "learning_rate": 1.6916666666666666e-06, | |
| "loss": 0.1487, | |
| "num_input_tokens_seen": 1414672, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 1.3129525341914723, | |
| "grad_norm": 30.81540870666504, | |
| "learning_rate": 1.7000000000000002e-06, | |
| "loss": 0.2154, | |
| "num_input_tokens_seen": 1421872, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 1.3193885760257442, | |
| "grad_norm": 44.00107955932617, | |
| "learning_rate": 1.7083333333333334e-06, | |
| "loss": 0.2909, | |
| "num_input_tokens_seen": 1428640, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 1.3258246178600162, | |
| "grad_norm": 41.464210510253906, | |
| "learning_rate": 1.7166666666666668e-06, | |
| "loss": 0.271, | |
| "num_input_tokens_seen": 1435456, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 1.332260659694288, | |
| "grad_norm": 12.14904499053955, | |
| "learning_rate": 1.725e-06, | |
| "loss": 0.1616, | |
| "num_input_tokens_seen": 1442592, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 1.33869670152856, | |
| "grad_norm": 8.393083572387695, | |
| "learning_rate": 1.7333333333333336e-06, | |
| "loss": 0.1427, | |
| "num_input_tokens_seen": 1449200, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 1.3451327433628317, | |
| "grad_norm": 11.04562759399414, | |
| "learning_rate": 1.7416666666666668e-06, | |
| "loss": 0.1602, | |
| "num_input_tokens_seen": 1455920, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 1.3515687851971037, | |
| "grad_norm": 12.494465827941895, | |
| "learning_rate": 1.75e-06, | |
| "loss": 0.169, | |
| "num_input_tokens_seen": 1462624, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 1.3580048270313756, | |
| "grad_norm": 5.395782470703125, | |
| "learning_rate": 1.7583333333333336e-06, | |
| "loss": 0.1285, | |
| "num_input_tokens_seen": 1469520, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 1.3644408688656475, | |
| "grad_norm": 19.773469924926758, | |
| "learning_rate": 1.7666666666666668e-06, | |
| "loss": 0.1636, | |
| "num_input_tokens_seen": 1476592, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 1.3708769106999195, | |
| "grad_norm": 28.318584442138672, | |
| "learning_rate": 1.7750000000000002e-06, | |
| "loss": 0.1702, | |
| "num_input_tokens_seen": 1483632, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 1.3773129525341914, | |
| "grad_norm": 20.225502014160156, | |
| "learning_rate": 1.7833333333333336e-06, | |
| "loss": 0.1562, | |
| "num_input_tokens_seen": 1490528, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 1.3837489943684633, | |
| "grad_norm": 5.386298179626465, | |
| "learning_rate": 1.7916666666666667e-06, | |
| "loss": 0.1537, | |
| "num_input_tokens_seen": 1497648, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 1.3901850362027353, | |
| "grad_norm": 6.181918144226074, | |
| "learning_rate": 1.8000000000000001e-06, | |
| "loss": 0.1114, | |
| "num_input_tokens_seen": 1504800, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 1.3966210780370072, | |
| "grad_norm": 5.554294109344482, | |
| "learning_rate": 1.8083333333333335e-06, | |
| "loss": 0.1017, | |
| "num_input_tokens_seen": 1512240, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 1.4030571198712791, | |
| "grad_norm": 5.2657880783081055, | |
| "learning_rate": 1.816666666666667e-06, | |
| "loss": 0.1184, | |
| "num_input_tokens_seen": 1519200, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 1.409493161705551, | |
| "grad_norm": 8.627300262451172, | |
| "learning_rate": 1.825e-06, | |
| "loss": 0.1343, | |
| "num_input_tokens_seen": 1526272, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 1.415929203539823, | |
| "grad_norm": 7.965896129608154, | |
| "learning_rate": 1.8333333333333333e-06, | |
| "loss": 0.1271, | |
| "num_input_tokens_seen": 1533440, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 1.422365245374095, | |
| "grad_norm": 7.089397430419922, | |
| "learning_rate": 1.8416666666666669e-06, | |
| "loss": 0.1383, | |
| "num_input_tokens_seen": 1540272, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 1.4288012872083669, | |
| "grad_norm": 4.354486465454102, | |
| "learning_rate": 1.85e-06, | |
| "loss": 0.1558, | |
| "num_input_tokens_seen": 1547632, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 1.4352373290426388, | |
| "grad_norm": 7.841838836669922, | |
| "learning_rate": 1.8583333333333335e-06, | |
| "loss": 0.1312, | |
| "num_input_tokens_seen": 1554608, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 1.4416733708769107, | |
| "grad_norm": 6.812905311584473, | |
| "learning_rate": 1.8666666666666669e-06, | |
| "loss": 0.1212, | |
| "num_input_tokens_seen": 1561472, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 1.4481094127111827, | |
| "grad_norm": 5.038280963897705, | |
| "learning_rate": 1.8750000000000003e-06, | |
| "loss": 0.1342, | |
| "num_input_tokens_seen": 1568496, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 1.4545454545454546, | |
| "grad_norm": 4.255394458770752, | |
| "learning_rate": 1.8833333333333334e-06, | |
| "loss": 0.096, | |
| "num_input_tokens_seen": 1575184, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 1.4609814963797265, | |
| "grad_norm": 3.311915397644043, | |
| "learning_rate": 1.8916666666666668e-06, | |
| "loss": 0.0982, | |
| "num_input_tokens_seen": 1582080, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 1.4674175382139985, | |
| "grad_norm": 4.303693771362305, | |
| "learning_rate": 1.9000000000000002e-06, | |
| "loss": 0.1099, | |
| "num_input_tokens_seen": 1588688, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 1.4738535800482704, | |
| "grad_norm": 14.854019165039062, | |
| "learning_rate": 1.9083333333333334e-06, | |
| "loss": 0.1265, | |
| "num_input_tokens_seen": 1595216, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 1.4802896218825423, | |
| "grad_norm": 10.509958267211914, | |
| "learning_rate": 1.916666666666667e-06, | |
| "loss": 0.1066, | |
| "num_input_tokens_seen": 1602336, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 1.4867256637168142, | |
| "grad_norm": 9.096975326538086, | |
| "learning_rate": 1.925e-06, | |
| "loss": 0.1593, | |
| "num_input_tokens_seen": 1609024, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 1.4931617055510862, | |
| "grad_norm": 18.944650650024414, | |
| "learning_rate": 1.9333333333333336e-06, | |
| "loss": 0.1891, | |
| "num_input_tokens_seen": 1615712, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 1.4995977473853581, | |
| "grad_norm": 6.735738754272461, | |
| "learning_rate": 1.9416666666666666e-06, | |
| "loss": 0.0867, | |
| "num_input_tokens_seen": 1622608, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 1.50603378921963, | |
| "grad_norm": 12.395522117614746, | |
| "learning_rate": 1.9500000000000004e-06, | |
| "loss": 0.1286, | |
| "num_input_tokens_seen": 1629520, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 1.512469831053902, | |
| "grad_norm": 13.864114761352539, | |
| "learning_rate": 1.9583333333333334e-06, | |
| "loss": 0.1262, | |
| "num_input_tokens_seen": 1636320, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 1.518905872888174, | |
| "grad_norm": 4.206810474395752, | |
| "learning_rate": 1.9666666666666668e-06, | |
| "loss": 0.0878, | |
| "num_input_tokens_seen": 1643216, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 1.5253419147224458, | |
| "grad_norm": 9.294787406921387, | |
| "learning_rate": 1.975e-06, | |
| "loss": 0.1532, | |
| "num_input_tokens_seen": 1650256, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 1.5317779565567178, | |
| "grad_norm": 5.397519111633301, | |
| "learning_rate": 1.9833333333333335e-06, | |
| "loss": 0.1232, | |
| "num_input_tokens_seen": 1657328, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 1.5382139983909895, | |
| "grad_norm": 4.74614953994751, | |
| "learning_rate": 1.991666666666667e-06, | |
| "loss": 0.1119, | |
| "num_input_tokens_seen": 1664192, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 1.5446500402252614, | |
| "grad_norm": 8.80385971069336, | |
| "learning_rate": 2.0000000000000003e-06, | |
| "loss": 0.1334, | |
| "num_input_tokens_seen": 1670944, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 1.5510860820595334, | |
| "grad_norm": 12.17174243927002, | |
| "learning_rate": 2.0083333333333337e-06, | |
| "loss": 0.1224, | |
| "num_input_tokens_seen": 1677792, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 1.5575221238938053, | |
| "grad_norm": 6.9399800300598145, | |
| "learning_rate": 2.0166666666666667e-06, | |
| "loss": 0.106, | |
| "num_input_tokens_seen": 1684640, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 1.5639581657280772, | |
| "grad_norm": 5.804976463317871, | |
| "learning_rate": 2.025e-06, | |
| "loss": 0.1237, | |
| "num_input_tokens_seen": 1691664, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 1.5703942075623492, | |
| "grad_norm": 5.245293617248535, | |
| "learning_rate": 2.0333333333333335e-06, | |
| "loss": 0.095, | |
| "num_input_tokens_seen": 1698528, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 1.576830249396621, | |
| "grad_norm": 2.9305763244628906, | |
| "learning_rate": 2.041666666666667e-06, | |
| "loss": 0.0741, | |
| "num_input_tokens_seen": 1705600, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 1.583266291230893, | |
| "grad_norm": 10.269381523132324, | |
| "learning_rate": 2.05e-06, | |
| "loss": 0.1239, | |
| "num_input_tokens_seen": 1712704, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 1.589702333065165, | |
| "grad_norm": 4.453558921813965, | |
| "learning_rate": 2.0583333333333337e-06, | |
| "loss": 0.091, | |
| "num_input_tokens_seen": 1719568, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 1.5961383748994369, | |
| "grad_norm": 16.549911499023438, | |
| "learning_rate": 2.0666666666666666e-06, | |
| "loss": 0.1403, | |
| "num_input_tokens_seen": 1726480, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 1.6025744167337088, | |
| "grad_norm": 17.650426864624023, | |
| "learning_rate": 2.075e-06, | |
| "loss": 0.1638, | |
| "num_input_tokens_seen": 1733936, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 1.6090104585679805, | |
| "grad_norm": 5.322378158569336, | |
| "learning_rate": 2.0833333333333334e-06, | |
| "loss": 0.1343, | |
| "num_input_tokens_seen": 1741008, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 1.6154465004022525, | |
| "grad_norm": 11.570721626281738, | |
| "learning_rate": 2.091666666666667e-06, | |
| "loss": 0.1558, | |
| "num_input_tokens_seen": 1748240, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 1.6218825422365244, | |
| "grad_norm": 2.901578426361084, | |
| "learning_rate": 2.1000000000000002e-06, | |
| "loss": 0.0809, | |
| "num_input_tokens_seen": 1755072, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 1.6283185840707963, | |
| "grad_norm": 8.972208023071289, | |
| "learning_rate": 2.1083333333333336e-06, | |
| "loss": 0.1435, | |
| "num_input_tokens_seen": 1762048, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 1.6347546259050683, | |
| "grad_norm": 2.364783525466919, | |
| "learning_rate": 2.116666666666667e-06, | |
| "loss": 0.0887, | |
| "num_input_tokens_seen": 1769200, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 1.6411906677393402, | |
| "grad_norm": 3.7692675590515137, | |
| "learning_rate": 2.125e-06, | |
| "loss": 0.1038, | |
| "num_input_tokens_seen": 1776112, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 1.6476267095736121, | |
| "grad_norm": 3.0572264194488525, | |
| "learning_rate": 2.133333333333334e-06, | |
| "loss": 0.0889, | |
| "num_input_tokens_seen": 1783664, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 1.654062751407884, | |
| "grad_norm": 3.8316140174865723, | |
| "learning_rate": 2.1416666666666668e-06, | |
| "loss": 0.0751, | |
| "num_input_tokens_seen": 1790096, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 1.660498793242156, | |
| "grad_norm": 5.133974552154541, | |
| "learning_rate": 2.15e-06, | |
| "loss": 0.0921, | |
| "num_input_tokens_seen": 1796912, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 1.666934835076428, | |
| "grad_norm": 5.002286911010742, | |
| "learning_rate": 2.1583333333333336e-06, | |
| "loss": 0.1102, | |
| "num_input_tokens_seen": 1804144, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 1.6733708769106999, | |
| "grad_norm": 8.221644401550293, | |
| "learning_rate": 2.166666666666667e-06, | |
| "loss": 0.1036, | |
| "num_input_tokens_seen": 1811040, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 1.6798069187449718, | |
| "grad_norm": 6.029963493347168, | |
| "learning_rate": 2.1750000000000004e-06, | |
| "loss": 0.1093, | |
| "num_input_tokens_seen": 1818064, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 1.6862429605792437, | |
| "grad_norm": 6.715224742889404, | |
| "learning_rate": 2.1833333333333333e-06, | |
| "loss": 0.1714, | |
| "num_input_tokens_seen": 1825056, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 1.6926790024135157, | |
| "grad_norm": 6.136181354522705, | |
| "learning_rate": 2.191666666666667e-06, | |
| "loss": 0.1007, | |
| "num_input_tokens_seen": 1831968, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 1.6991150442477876, | |
| "grad_norm": 5.392821788787842, | |
| "learning_rate": 2.2e-06, | |
| "loss": 0.109, | |
| "num_input_tokens_seen": 1838656, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 1.7055510860820595, | |
| "grad_norm": 3.0743072032928467, | |
| "learning_rate": 2.2083333333333335e-06, | |
| "loss": 0.0574, | |
| "num_input_tokens_seen": 1845760, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 1.7119871279163315, | |
| "grad_norm": 4.986932277679443, | |
| "learning_rate": 2.216666666666667e-06, | |
| "loss": 0.0697, | |
| "num_input_tokens_seen": 1852480, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 1.7184231697506034, | |
| "grad_norm": 3.588496685028076, | |
| "learning_rate": 2.2250000000000003e-06, | |
| "loss": 0.1188, | |
| "num_input_tokens_seen": 1859312, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 1.7248592115848753, | |
| "grad_norm": 3.850637912750244, | |
| "learning_rate": 2.2333333333333333e-06, | |
| "loss": 0.0998, | |
| "num_input_tokens_seen": 1866256, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 1.7312952534191473, | |
| "grad_norm": 10.427441596984863, | |
| "learning_rate": 2.2416666666666667e-06, | |
| "loss": 0.1083, | |
| "num_input_tokens_seen": 1873104, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 1.7377312952534192, | |
| "grad_norm": 6.516834259033203, | |
| "learning_rate": 2.25e-06, | |
| "loss": 0.0749, | |
| "num_input_tokens_seen": 1880192, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 1.7441673370876911, | |
| "grad_norm": 5.243050575256348, | |
| "learning_rate": 2.2583333333333335e-06, | |
| "loss": 0.0771, | |
| "num_input_tokens_seen": 1887008, | |
| "step": 271 | |
| }, | |
| { | |
| "epoch": 1.750603378921963, | |
| "grad_norm": 3.874545097351074, | |
| "learning_rate": 2.266666666666667e-06, | |
| "loss": 0.0646, | |
| "num_input_tokens_seen": 1894096, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 1.757039420756235, | |
| "grad_norm": 4.2995476722717285, | |
| "learning_rate": 2.2750000000000002e-06, | |
| "loss": 0.1147, | |
| "num_input_tokens_seen": 1901216, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 1.763475462590507, | |
| "grad_norm": 9.720036506652832, | |
| "learning_rate": 2.2833333333333336e-06, | |
| "loss": 0.0917, | |
| "num_input_tokens_seen": 1908160, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 1.7699115044247788, | |
| "grad_norm": 7.985558986663818, | |
| "learning_rate": 2.2916666666666666e-06, | |
| "loss": 0.106, | |
| "num_input_tokens_seen": 1915104, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 1.7763475462590508, | |
| "grad_norm": 4.0768327713012695, | |
| "learning_rate": 2.3000000000000004e-06, | |
| "loss": 0.0849, | |
| "num_input_tokens_seen": 1922128, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 1.7827835880933227, | |
| "grad_norm": 5.870975017547607, | |
| "learning_rate": 2.3083333333333334e-06, | |
| "loss": 0.1074, | |
| "num_input_tokens_seen": 1929200, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 1.7892196299275946, | |
| "grad_norm": 3.490455389022827, | |
| "learning_rate": 2.316666666666667e-06, | |
| "loss": 0.0981, | |
| "num_input_tokens_seen": 1936144, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 1.7956556717618666, | |
| "grad_norm": 4.1171183586120605, | |
| "learning_rate": 2.325e-06, | |
| "loss": 0.1008, | |
| "num_input_tokens_seen": 1943136, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 1.8020917135961385, | |
| "grad_norm": 7.664264678955078, | |
| "learning_rate": 2.3333333333333336e-06, | |
| "loss": 0.1032, | |
| "num_input_tokens_seen": 1950208, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 1.8085277554304104, | |
| "grad_norm": 4.865798473358154, | |
| "learning_rate": 2.341666666666667e-06, | |
| "loss": 0.0711, | |
| "num_input_tokens_seen": 1957056, | |
| "step": 281 | |
| }, | |
| { | |
| "epoch": 1.8149637972646824, | |
| "grad_norm": 2.5436036586761475, | |
| "learning_rate": 2.35e-06, | |
| "loss": 0.0901, | |
| "num_input_tokens_seen": 1964176, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 1.8213998390989543, | |
| "grad_norm": 6.305140972137451, | |
| "learning_rate": 2.3583333333333338e-06, | |
| "loss": 0.0847, | |
| "num_input_tokens_seen": 1970736, | |
| "step": 283 | |
| }, | |
| { | |
| "epoch": 1.827835880933226, | |
| "grad_norm": 2.6688449382781982, | |
| "learning_rate": 2.3666666666666667e-06, | |
| "loss": 0.0752, | |
| "num_input_tokens_seen": 1977440, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 1.834271922767498, | |
| "grad_norm": 2.5124077796936035, | |
| "learning_rate": 2.375e-06, | |
| "loss": 0.068, | |
| "num_input_tokens_seen": 1984464, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 1.8407079646017699, | |
| "grad_norm": 6.168980121612549, | |
| "learning_rate": 2.3833333333333335e-06, | |
| "loss": 0.1088, | |
| "num_input_tokens_seen": 1991248, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 1.8471440064360418, | |
| "grad_norm": 5.883851051330566, | |
| "learning_rate": 2.391666666666667e-06, | |
| "loss": 0.1017, | |
| "num_input_tokens_seen": 1998496, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 1.8535800482703138, | |
| "grad_norm": 9.373373985290527, | |
| "learning_rate": 2.4000000000000003e-06, | |
| "loss": 0.13, | |
| "num_input_tokens_seen": 2005552, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 1.8600160901045857, | |
| "grad_norm": 9.111586570739746, | |
| "learning_rate": 2.4083333333333337e-06, | |
| "loss": 0.0998, | |
| "num_input_tokens_seen": 2012272, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 1.8664521319388576, | |
| "grad_norm": 5.353252410888672, | |
| "learning_rate": 2.4166666666666667e-06, | |
| "loss": 0.0779, | |
| "num_input_tokens_seen": 2019056, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 1.8728881737731295, | |
| "grad_norm": 6.586206436157227, | |
| "learning_rate": 2.425e-06, | |
| "loss": 0.0907, | |
| "num_input_tokens_seen": 2025760, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 1.8793242156074015, | |
| "grad_norm": 5.485732555389404, | |
| "learning_rate": 2.4333333333333335e-06, | |
| "loss": 0.0911, | |
| "num_input_tokens_seen": 2032928, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 1.8857602574416734, | |
| "grad_norm": 3.5151724815368652, | |
| "learning_rate": 2.441666666666667e-06, | |
| "loss": 0.0987, | |
| "num_input_tokens_seen": 2039856, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 1.8921962992759453, | |
| "grad_norm": 3.680494546890259, | |
| "learning_rate": 2.4500000000000003e-06, | |
| "loss": 0.1254, | |
| "num_input_tokens_seen": 2046896, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 1.898632341110217, | |
| "grad_norm": 3.302248001098633, | |
| "learning_rate": 2.4583333333333332e-06, | |
| "loss": 0.0494, | |
| "num_input_tokens_seen": 2053600, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 1.905068382944489, | |
| "grad_norm": 3.605039119720459, | |
| "learning_rate": 2.466666666666667e-06, | |
| "loss": 0.1082, | |
| "num_input_tokens_seen": 2060240, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 1.911504424778761, | |
| "grad_norm": 2.6599857807159424, | |
| "learning_rate": 2.475e-06, | |
| "loss": 0.0785, | |
| "num_input_tokens_seen": 2067936, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 1.9179404666130329, | |
| "grad_norm": 7.149720191955566, | |
| "learning_rate": 2.4833333333333334e-06, | |
| "loss": 0.1026, | |
| "num_input_tokens_seen": 2074656, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 1.9243765084473048, | |
| "grad_norm": 4.549108982086182, | |
| "learning_rate": 2.491666666666667e-06, | |
| "loss": 0.0617, | |
| "num_input_tokens_seen": 2081568, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 1.9308125502815767, | |
| "grad_norm": 2.900601625442505, | |
| "learning_rate": 2.5e-06, | |
| "loss": 0.0659, | |
| "num_input_tokens_seen": 2088368, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 1.9372485921158487, | |
| "grad_norm": 6.378200531005859, | |
| "learning_rate": 2.5083333333333336e-06, | |
| "loss": 0.088, | |
| "num_input_tokens_seen": 2095728, | |
| "step": 301 | |
| }, | |
| { | |
| "epoch": 1.9436846339501206, | |
| "grad_norm": 6.718885898590088, | |
| "learning_rate": 2.5166666666666666e-06, | |
| "loss": 0.0771, | |
| "num_input_tokens_seen": 2103104, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 1.9501206757843925, | |
| "grad_norm": 3.587820291519165, | |
| "learning_rate": 2.5250000000000004e-06, | |
| "loss": 0.0642, | |
| "num_input_tokens_seen": 2110032, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 1.9565567176186645, | |
| "grad_norm": 7.106460094451904, | |
| "learning_rate": 2.5333333333333338e-06, | |
| "loss": 0.0947, | |
| "num_input_tokens_seen": 2117056, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 1.9629927594529364, | |
| "grad_norm": 3.480973243713379, | |
| "learning_rate": 2.5416666666666668e-06, | |
| "loss": 0.0975, | |
| "num_input_tokens_seen": 2123552, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 1.9694288012872083, | |
| "grad_norm": 2.709892511367798, | |
| "learning_rate": 2.55e-06, | |
| "loss": 0.0527, | |
| "num_input_tokens_seen": 2130128, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 1.9758648431214803, | |
| "grad_norm": 3.3756306171417236, | |
| "learning_rate": 2.558333333333334e-06, | |
| "loss": 0.0869, | |
| "num_input_tokens_seen": 2137232, | |
| "step": 307 | |
| }, | |
| { | |
| "epoch": 1.9823008849557522, | |
| "grad_norm": 6.785555839538574, | |
| "learning_rate": 2.566666666666667e-06, | |
| "loss": 0.0605, | |
| "num_input_tokens_seen": 2143776, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 1.9887369267900241, | |
| "grad_norm": 3.4628372192382812, | |
| "learning_rate": 2.5750000000000003e-06, | |
| "loss": 0.0684, | |
| "num_input_tokens_seen": 2150976, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 1.995172968624296, | |
| "grad_norm": 3.56925892829895, | |
| "learning_rate": 2.5833333333333337e-06, | |
| "loss": 0.0701, | |
| "num_input_tokens_seen": 2158080, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 2.001609010458568, | |
| "grad_norm": 4.06324577331543, | |
| "learning_rate": 2.5916666666666667e-06, | |
| "loss": 0.0699, | |
| "num_input_tokens_seen": 2164992, | |
| "step": 311 | |
| }, | |
| { | |
| "epoch": 2.00804505229284, | |
| "grad_norm": 7.733395576477051, | |
| "learning_rate": 2.6e-06, | |
| "loss": 0.0949, | |
| "num_input_tokens_seen": 2171952, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 2.014481094127112, | |
| "grad_norm": 7.6149139404296875, | |
| "learning_rate": 2.608333333333333e-06, | |
| "loss": 0.0911, | |
| "num_input_tokens_seen": 2179072, | |
| "step": 313 | |
| }, | |
| { | |
| "epoch": 2.020917135961384, | |
| "grad_norm": 2.538379192352295, | |
| "learning_rate": 2.616666666666667e-06, | |
| "loss": 0.0615, | |
| "num_input_tokens_seen": 2185872, | |
| "step": 314 | |
| }, | |
| { | |
| "epoch": 2.0273531777956557, | |
| "grad_norm": 2.5334603786468506, | |
| "learning_rate": 2.6250000000000003e-06, | |
| "loss": 0.0448, | |
| "num_input_tokens_seen": 2192656, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 2.0337892196299276, | |
| "grad_norm": 4.8344340324401855, | |
| "learning_rate": 2.6333333333333332e-06, | |
| "loss": 0.0619, | |
| "num_input_tokens_seen": 2199728, | |
| "step": 316 | |
| }, | |
| { | |
| "epoch": 2.0402252614641996, | |
| "grad_norm": 4.393861770629883, | |
| "learning_rate": 2.6416666666666666e-06, | |
| "loss": 0.0475, | |
| "num_input_tokens_seen": 2206608, | |
| "step": 317 | |
| }, | |
| { | |
| "epoch": 2.0466613032984715, | |
| "grad_norm": 2.7922892570495605, | |
| "learning_rate": 2.6500000000000005e-06, | |
| "loss": 0.0438, | |
| "num_input_tokens_seen": 2213856, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 2.0530973451327434, | |
| "grad_norm": 1.5408401489257812, | |
| "learning_rate": 2.6583333333333334e-06, | |
| "loss": 0.0245, | |
| "num_input_tokens_seen": 2220528, | |
| "step": 319 | |
| }, | |
| { | |
| "epoch": 2.0595333869670154, | |
| "grad_norm": 5.6088433265686035, | |
| "learning_rate": 2.666666666666667e-06, | |
| "loss": 0.0716, | |
| "num_input_tokens_seen": 2227616, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 2.0659694288012873, | |
| "grad_norm": 9.311470985412598, | |
| "learning_rate": 2.6750000000000002e-06, | |
| "loss": 0.1015, | |
| "num_input_tokens_seen": 2234304, | |
| "step": 321 | |
| }, | |
| { | |
| "epoch": 2.0724054706355592, | |
| "grad_norm": 5.244096279144287, | |
| "learning_rate": 2.683333333333333e-06, | |
| "loss": 0.0753, | |
| "num_input_tokens_seen": 2241088, | |
| "step": 322 | |
| }, | |
| { | |
| "epoch": 2.078841512469831, | |
| "grad_norm": 3.443998098373413, | |
| "learning_rate": 2.691666666666667e-06, | |
| "loss": 0.0521, | |
| "num_input_tokens_seen": 2247632, | |
| "step": 323 | |
| }, | |
| { | |
| "epoch": 2.085277554304103, | |
| "grad_norm": 2.4997072219848633, | |
| "learning_rate": 2.7000000000000004e-06, | |
| "loss": 0.0287, | |
| "num_input_tokens_seen": 2254448, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 2.091713596138375, | |
| "grad_norm": 4.817678928375244, | |
| "learning_rate": 2.7083333333333334e-06, | |
| "loss": 0.0471, | |
| "num_input_tokens_seen": 2261424, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 2.098149637972647, | |
| "grad_norm": 6.326369285583496, | |
| "learning_rate": 2.7166666666666668e-06, | |
| "loss": 0.0697, | |
| "num_input_tokens_seen": 2268528, | |
| "step": 326 | |
| }, | |
| { | |
| "epoch": 2.104585679806919, | |
| "grad_norm": 3.599905490875244, | |
| "learning_rate": 2.7250000000000006e-06, | |
| "loss": 0.0438, | |
| "num_input_tokens_seen": 2275328, | |
| "step": 327 | |
| }, | |
| { | |
| "epoch": 2.111021721641191, | |
| "grad_norm": 2.8037264347076416, | |
| "learning_rate": 2.7333333333333336e-06, | |
| "loss": 0.0475, | |
| "num_input_tokens_seen": 2282400, | |
| "step": 328 | |
| }, | |
| { | |
| "epoch": 2.1174577634754628, | |
| "grad_norm": 2.7425622940063477, | |
| "learning_rate": 2.741666666666667e-06, | |
| "loss": 0.0601, | |
| "num_input_tokens_seen": 2289312, | |
| "step": 329 | |
| }, | |
| { | |
| "epoch": 2.1238938053097347, | |
| "grad_norm": 2.064824342727661, | |
| "learning_rate": 2.7500000000000004e-06, | |
| "loss": 0.0355, | |
| "num_input_tokens_seen": 2295824, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 2.1303298471440066, | |
| "grad_norm": 3.695521593093872, | |
| "learning_rate": 2.7583333333333333e-06, | |
| "loss": 0.0515, | |
| "num_input_tokens_seen": 2303024, | |
| "step": 331 | |
| }, | |
| { | |
| "epoch": 2.136765888978278, | |
| "grad_norm": 3.3290112018585205, | |
| "learning_rate": 2.766666666666667e-06, | |
| "loss": 0.0601, | |
| "num_input_tokens_seen": 2309904, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 2.14320193081255, | |
| "grad_norm": 2.751953363418579, | |
| "learning_rate": 2.7750000000000005e-06, | |
| "loss": 0.0288, | |
| "num_input_tokens_seen": 2316416, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 2.149637972646822, | |
| "grad_norm": 4.679827690124512, | |
| "learning_rate": 2.7833333333333335e-06, | |
| "loss": 0.0563, | |
| "num_input_tokens_seen": 2323088, | |
| "step": 334 | |
| }, | |
| { | |
| "epoch": 2.156074014481094, | |
| "grad_norm": 9.301896095275879, | |
| "learning_rate": 2.791666666666667e-06, | |
| "loss": 0.1176, | |
| "num_input_tokens_seen": 2329968, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 2.162510056315366, | |
| "grad_norm": 6.16165828704834, | |
| "learning_rate": 2.8000000000000003e-06, | |
| "loss": 0.0965, | |
| "num_input_tokens_seen": 2336656, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 2.168946098149638, | |
| "grad_norm": 2.442518711090088, | |
| "learning_rate": 2.8083333333333333e-06, | |
| "loss": 0.0359, | |
| "num_input_tokens_seen": 2343984, | |
| "step": 337 | |
| }, | |
| { | |
| "epoch": 2.1753821399839097, | |
| "grad_norm": 3.537282943725586, | |
| "learning_rate": 2.816666666666667e-06, | |
| "loss": 0.0609, | |
| "num_input_tokens_seen": 2350912, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 2.1818181818181817, | |
| "grad_norm": 5.1499223709106445, | |
| "learning_rate": 2.825e-06, | |
| "loss": 0.0768, | |
| "num_input_tokens_seen": 2357680, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 2.1882542236524536, | |
| "grad_norm": 8.193970680236816, | |
| "learning_rate": 2.8333333333333335e-06, | |
| "loss": 0.0849, | |
| "num_input_tokens_seen": 2364736, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 2.1946902654867255, | |
| "grad_norm": 2.2035670280456543, | |
| "learning_rate": 2.841666666666667e-06, | |
| "loss": 0.0581, | |
| "num_input_tokens_seen": 2371568, | |
| "step": 341 | |
| }, | |
| { | |
| "epoch": 2.2011263073209975, | |
| "grad_norm": 2.7924435138702393, | |
| "learning_rate": 2.85e-06, | |
| "loss": 0.046, | |
| "num_input_tokens_seen": 2378384, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 2.2075623491552694, | |
| "grad_norm": 4.6174445152282715, | |
| "learning_rate": 2.8583333333333336e-06, | |
| "loss": 0.0674, | |
| "num_input_tokens_seen": 2385584, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 2.2139983909895413, | |
| "grad_norm": 2.4459989070892334, | |
| "learning_rate": 2.866666666666667e-06, | |
| "loss": 0.0563, | |
| "num_input_tokens_seen": 2392640, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 2.2204344328238133, | |
| "grad_norm": 2.3443846702575684, | |
| "learning_rate": 2.875e-06, | |
| "loss": 0.0621, | |
| "num_input_tokens_seen": 2399936, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 2.226870474658085, | |
| "grad_norm": 2.865879774093628, | |
| "learning_rate": 2.8833333333333334e-06, | |
| "loss": 0.0659, | |
| "num_input_tokens_seen": 2406928, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 2.233306516492357, | |
| "grad_norm": 4.03169059753418, | |
| "learning_rate": 2.8916666666666672e-06, | |
| "loss": 0.039, | |
| "num_input_tokens_seen": 2413888, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 2.239742558326629, | |
| "grad_norm": 1.693605899810791, | |
| "learning_rate": 2.9e-06, | |
| "loss": 0.0239, | |
| "num_input_tokens_seen": 2421104, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 2.246178600160901, | |
| "grad_norm": 2.7058444023132324, | |
| "learning_rate": 2.9083333333333336e-06, | |
| "loss": 0.0521, | |
| "num_input_tokens_seen": 2428128, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 2.252614641995173, | |
| "grad_norm": 3.9503567218780518, | |
| "learning_rate": 2.916666666666667e-06, | |
| "loss": 0.0561, | |
| "num_input_tokens_seen": 2434880, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 2.259050683829445, | |
| "grad_norm": 4.444098472595215, | |
| "learning_rate": 2.925e-06, | |
| "loss": 0.0622, | |
| "num_input_tokens_seen": 2441824, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 2.265486725663717, | |
| "grad_norm": 3.7014055252075195, | |
| "learning_rate": 2.9333333333333338e-06, | |
| "loss": 0.0875, | |
| "num_input_tokens_seen": 2448688, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 2.2719227674979887, | |
| "grad_norm": 4.078037261962891, | |
| "learning_rate": 2.941666666666667e-06, | |
| "loss": 0.0307, | |
| "num_input_tokens_seen": 2455488, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 2.2783588093322606, | |
| "grad_norm": 3.753711700439453, | |
| "learning_rate": 2.95e-06, | |
| "loss": 0.063, | |
| "num_input_tokens_seen": 2462240, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 2.2847948511665326, | |
| "grad_norm": 2.9653706550598145, | |
| "learning_rate": 2.9583333333333335e-06, | |
| "loss": 0.0404, | |
| "num_input_tokens_seen": 2469408, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 2.2912308930008045, | |
| "grad_norm": 3.8090925216674805, | |
| "learning_rate": 2.9666666666666673e-06, | |
| "loss": 0.0759, | |
| "num_input_tokens_seen": 2476240, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 2.2976669348350764, | |
| "grad_norm": 2.4684033393859863, | |
| "learning_rate": 2.9750000000000003e-06, | |
| "loss": 0.0488, | |
| "num_input_tokens_seen": 2482864, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 2.3041029766693484, | |
| "grad_norm": 2.0687243938446045, | |
| "learning_rate": 2.9833333333333337e-06, | |
| "loss": 0.0499, | |
| "num_input_tokens_seen": 2489664, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 2.3105390185036203, | |
| "grad_norm": 3.223965883255005, | |
| "learning_rate": 2.991666666666667e-06, | |
| "loss": 0.0441, | |
| "num_input_tokens_seen": 2496704, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 2.3169750603378922, | |
| "grad_norm": 2.1407270431518555, | |
| "learning_rate": 3e-06, | |
| "loss": 0.0485, | |
| "num_input_tokens_seen": 2503920, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 2.323411102172164, | |
| "grad_norm": 2.632885217666626, | |
| "learning_rate": 3.0083333333333335e-06, | |
| "loss": 0.0674, | |
| "num_input_tokens_seen": 2510544, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 2.329847144006436, | |
| "grad_norm": 3.258030652999878, | |
| "learning_rate": 3.0166666666666673e-06, | |
| "loss": 0.0689, | |
| "num_input_tokens_seen": 2517408, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 2.336283185840708, | |
| "grad_norm": 6.024159908294678, | |
| "learning_rate": 3.0250000000000003e-06, | |
| "loss": 0.0618, | |
| "num_input_tokens_seen": 2524160, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 2.34271922767498, | |
| "grad_norm": 4.7281999588012695, | |
| "learning_rate": 3.0333333333333337e-06, | |
| "loss": 0.0629, | |
| "num_input_tokens_seen": 2531072, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 2.349155269509252, | |
| "grad_norm": 4.178661823272705, | |
| "learning_rate": 3.0416666666666666e-06, | |
| "loss": 0.0499, | |
| "num_input_tokens_seen": 2537920, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 2.355591311343524, | |
| "grad_norm": 1.5715197324752808, | |
| "learning_rate": 3.05e-06, | |
| "loss": 0.0361, | |
| "num_input_tokens_seen": 2544736, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 2.3620273531777958, | |
| "grad_norm": 2.835855722427368, | |
| "learning_rate": 3.058333333333334e-06, | |
| "loss": 0.0471, | |
| "num_input_tokens_seen": 2552016, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 2.3684633950120677, | |
| "grad_norm": 2.870889902114868, | |
| "learning_rate": 3.066666666666667e-06, | |
| "loss": 0.0622, | |
| "num_input_tokens_seen": 2559616, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 2.3748994368463396, | |
| "grad_norm": 1.7411049604415894, | |
| "learning_rate": 3.075e-06, | |
| "loss": 0.0328, | |
| "num_input_tokens_seen": 2566240, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 2.3813354786806116, | |
| "grad_norm": 3.0499918460845947, | |
| "learning_rate": 3.0833333333333336e-06, | |
| "loss": 0.0437, | |
| "num_input_tokens_seen": 2573392, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 2.3877715205148835, | |
| "grad_norm": 4.242414474487305, | |
| "learning_rate": 3.0916666666666666e-06, | |
| "loss": 0.0644, | |
| "num_input_tokens_seen": 2580544, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 2.3942075623491554, | |
| "grad_norm": 2.962906837463379, | |
| "learning_rate": 3.1000000000000004e-06, | |
| "loss": 0.0553, | |
| "num_input_tokens_seen": 2587344, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 2.4006436041834274, | |
| "grad_norm": 4.431301116943359, | |
| "learning_rate": 3.1083333333333338e-06, | |
| "loss": 0.061, | |
| "num_input_tokens_seen": 2594560, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 2.4070796460176993, | |
| "grad_norm": 5.075587272644043, | |
| "learning_rate": 3.1166666666666668e-06, | |
| "loss": 0.0866, | |
| "num_input_tokens_seen": 2601408, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 2.4135156878519712, | |
| "grad_norm": 3.877520799636841, | |
| "learning_rate": 3.125e-06, | |
| "loss": 0.0632, | |
| "num_input_tokens_seen": 2608624, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 2.419951729686243, | |
| "grad_norm": 2.9902503490448, | |
| "learning_rate": 3.133333333333334e-06, | |
| "loss": 0.0395, | |
| "num_input_tokens_seen": 2615456, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 2.426387771520515, | |
| "grad_norm": 3.7800397872924805, | |
| "learning_rate": 3.141666666666667e-06, | |
| "loss": 0.0819, | |
| "num_input_tokens_seen": 2622672, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 2.432823813354787, | |
| "grad_norm": 2.4674911499023438, | |
| "learning_rate": 3.1500000000000003e-06, | |
| "loss": 0.064, | |
| "num_input_tokens_seen": 2629952, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 2.439259855189059, | |
| "grad_norm": 5.3331146240234375, | |
| "learning_rate": 3.1583333333333337e-06, | |
| "loss": 0.0803, | |
| "num_input_tokens_seen": 2637168, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 2.445695897023331, | |
| "grad_norm": 9.950706481933594, | |
| "learning_rate": 3.1666666666666667e-06, | |
| "loss": 0.0798, | |
| "num_input_tokens_seen": 2644144, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 2.4521319388576024, | |
| "grad_norm": 5.1734442710876465, | |
| "learning_rate": 3.175e-06, | |
| "loss": 0.0544, | |
| "num_input_tokens_seen": 2651376, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 2.4585679806918743, | |
| "grad_norm": 2.5671188831329346, | |
| "learning_rate": 3.183333333333334e-06, | |
| "loss": 0.0629, | |
| "num_input_tokens_seen": 2658336, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 2.4650040225261463, | |
| "grad_norm": 4.357182025909424, | |
| "learning_rate": 3.191666666666667e-06, | |
| "loss": 0.0471, | |
| "num_input_tokens_seen": 2665360, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 2.471440064360418, | |
| "grad_norm": 4.694338321685791, | |
| "learning_rate": 3.2000000000000003e-06, | |
| "loss": 0.0533, | |
| "num_input_tokens_seen": 2672704, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 2.47787610619469, | |
| "grad_norm": 2.391195774078369, | |
| "learning_rate": 3.2083333333333337e-06, | |
| "loss": 0.0542, | |
| "num_input_tokens_seen": 2679872, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 2.484312148028962, | |
| "grad_norm": 3.859102249145508, | |
| "learning_rate": 3.2166666666666666e-06, | |
| "loss": 0.034, | |
| "num_input_tokens_seen": 2686672, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 2.490748189863234, | |
| "grad_norm": 2.4710166454315186, | |
| "learning_rate": 3.2250000000000005e-06, | |
| "loss": 0.0517, | |
| "num_input_tokens_seen": 2693520, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 2.497184231697506, | |
| "grad_norm": 3.309068202972412, | |
| "learning_rate": 3.2333333333333334e-06, | |
| "loss": 0.0698, | |
| "num_input_tokens_seen": 2700432, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 2.503620273531778, | |
| "grad_norm": 4.21011209487915, | |
| "learning_rate": 3.241666666666667e-06, | |
| "loss": 0.0573, | |
| "num_input_tokens_seen": 2707184, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 2.51005631536605, | |
| "grad_norm": 4.34623908996582, | |
| "learning_rate": 3.2500000000000002e-06, | |
| "loss": 0.0568, | |
| "num_input_tokens_seen": 2713936, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 2.5164923572003217, | |
| "grad_norm": 3.361445188522339, | |
| "learning_rate": 3.258333333333333e-06, | |
| "loss": 0.0669, | |
| "num_input_tokens_seen": 2721216, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 2.5229283990345936, | |
| "grad_norm": 2.091728925704956, | |
| "learning_rate": 3.266666666666667e-06, | |
| "loss": 0.027, | |
| "num_input_tokens_seen": 2727968, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 2.5293644408688656, | |
| "grad_norm": 2.1977951526641846, | |
| "learning_rate": 3.2750000000000004e-06, | |
| "loss": 0.0303, | |
| "num_input_tokens_seen": 2734816, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 2.5358004827031375, | |
| "grad_norm": 2.7409942150115967, | |
| "learning_rate": 3.2833333333333334e-06, | |
| "loss": 0.0392, | |
| "num_input_tokens_seen": 2741744, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 2.5422365245374094, | |
| "grad_norm": 3.695770740509033, | |
| "learning_rate": 3.2916666666666668e-06, | |
| "loss": 0.0813, | |
| "num_input_tokens_seen": 2748640, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 2.5486725663716814, | |
| "grad_norm": 3.674891471862793, | |
| "learning_rate": 3.3000000000000006e-06, | |
| "loss": 0.0403, | |
| "num_input_tokens_seen": 2755888, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 2.5551086082059533, | |
| "grad_norm": 1.716131567955017, | |
| "learning_rate": 3.3083333333333336e-06, | |
| "loss": 0.0222, | |
| "num_input_tokens_seen": 2762464, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 2.5615446500402252, | |
| "grad_norm": 2.5081095695495605, | |
| "learning_rate": 3.316666666666667e-06, | |
| "loss": 0.0611, | |
| "num_input_tokens_seen": 2769712, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 2.567980691874497, | |
| "grad_norm": 1.9974850416183472, | |
| "learning_rate": 3.3250000000000004e-06, | |
| "loss": 0.035, | |
| "num_input_tokens_seen": 2776736, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 2.574416733708769, | |
| "grad_norm": 4.233558177947998, | |
| "learning_rate": 3.3333333333333333e-06, | |
| "loss": 0.068, | |
| "num_input_tokens_seen": 2783376, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 2.580852775543041, | |
| "grad_norm": 3.359081983566284, | |
| "learning_rate": 3.341666666666667e-06, | |
| "loss": 0.0543, | |
| "num_input_tokens_seen": 2790528, | |
| "step": 401 | |
| }, | |
| { | |
| "epoch": 2.587288817377313, | |
| "grad_norm": 2.669712543487549, | |
| "learning_rate": 3.3500000000000005e-06, | |
| "loss": 0.0466, | |
| "num_input_tokens_seen": 2797312, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 2.593724859211585, | |
| "grad_norm": 3.1529603004455566, | |
| "learning_rate": 3.3583333333333335e-06, | |
| "loss": 0.0626, | |
| "num_input_tokens_seen": 2804288, | |
| "step": 403 | |
| }, | |
| { | |
| "epoch": 2.600160901045857, | |
| "grad_norm": 3.069842576980591, | |
| "learning_rate": 3.366666666666667e-06, | |
| "loss": 0.0589, | |
| "num_input_tokens_seen": 2811456, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 2.6065969428801288, | |
| "grad_norm": 1.881988525390625, | |
| "learning_rate": 3.3750000000000003e-06, | |
| "loss": 0.0415, | |
| "num_input_tokens_seen": 2818080, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 2.6130329847144007, | |
| "grad_norm": 1.862747073173523, | |
| "learning_rate": 3.3833333333333333e-06, | |
| "loss": 0.0344, | |
| "num_input_tokens_seen": 2825136, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 2.6194690265486726, | |
| "grad_norm": 2.6847071647644043, | |
| "learning_rate": 3.391666666666667e-06, | |
| "loss": 0.0423, | |
| "num_input_tokens_seen": 2832400, | |
| "step": 407 | |
| }, | |
| { | |
| "epoch": 2.6259050683829446, | |
| "grad_norm": 3.631681203842163, | |
| "learning_rate": 3.4000000000000005e-06, | |
| "loss": 0.0838, | |
| "num_input_tokens_seen": 2839712, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 2.6323411102172165, | |
| "grad_norm": 3.7878201007843018, | |
| "learning_rate": 3.4083333333333335e-06, | |
| "loss": 0.0732, | |
| "num_input_tokens_seen": 2846160, | |
| "step": 409 | |
| }, | |
| { | |
| "epoch": 2.6387771520514884, | |
| "grad_norm": 2.826582431793213, | |
| "learning_rate": 3.416666666666667e-06, | |
| "loss": 0.0464, | |
| "num_input_tokens_seen": 2853520, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 2.6452131938857604, | |
| "grad_norm": 2.330638885498047, | |
| "learning_rate": 3.4250000000000007e-06, | |
| "loss": 0.0387, | |
| "num_input_tokens_seen": 2860384, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 2.6516492357200323, | |
| "grad_norm": 2.330439567565918, | |
| "learning_rate": 3.4333333333333336e-06, | |
| "loss": 0.0507, | |
| "num_input_tokens_seen": 2867360, | |
| "step": 412 | |
| }, | |
| { | |
| "epoch": 2.6580852775543042, | |
| "grad_norm": 3.929145336151123, | |
| "learning_rate": 3.441666666666667e-06, | |
| "loss": 0.0549, | |
| "num_input_tokens_seen": 2873648, | |
| "step": 413 | |
| }, | |
| { | |
| "epoch": 2.664521319388576, | |
| "grad_norm": 3.001359224319458, | |
| "learning_rate": 3.45e-06, | |
| "loss": 0.0285, | |
| "num_input_tokens_seen": 2880848, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 2.670957361222848, | |
| "grad_norm": 2.7936651706695557, | |
| "learning_rate": 3.4583333333333334e-06, | |
| "loss": 0.0668, | |
| "num_input_tokens_seen": 2888256, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 2.67739340305712, | |
| "grad_norm": 4.050117015838623, | |
| "learning_rate": 3.4666666666666672e-06, | |
| "loss": 0.0691, | |
| "num_input_tokens_seen": 2895040, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 2.6838294448913915, | |
| "grad_norm": 5.509685516357422, | |
| "learning_rate": 3.475e-06, | |
| "loss": 0.066, | |
| "num_input_tokens_seen": 2902320, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 2.6902654867256635, | |
| "grad_norm": 3.968433380126953, | |
| "learning_rate": 3.4833333333333336e-06, | |
| "loss": 0.0495, | |
| "num_input_tokens_seen": 2908960, | |
| "step": 418 | |
| }, | |
| { | |
| "epoch": 2.6967015285599354, | |
| "grad_norm": 2.082157611846924, | |
| "learning_rate": 3.491666666666667e-06, | |
| "loss": 0.034, | |
| "num_input_tokens_seen": 2915808, | |
| "step": 419 | |
| }, | |
| { | |
| "epoch": 2.7031375703942073, | |
| "grad_norm": 2.403968334197998, | |
| "learning_rate": 3.5e-06, | |
| "loss": 0.0604, | |
| "num_input_tokens_seen": 2922608, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 2.7095736122284793, | |
| "grad_norm": 4.667454719543457, | |
| "learning_rate": 3.5083333333333338e-06, | |
| "loss": 0.0535, | |
| "num_input_tokens_seen": 2929728, | |
| "step": 421 | |
| }, | |
| { | |
| "epoch": 2.716009654062751, | |
| "grad_norm": 2.5968987941741943, | |
| "learning_rate": 3.516666666666667e-06, | |
| "loss": 0.0369, | |
| "num_input_tokens_seen": 2937024, | |
| "step": 422 | |
| }, | |
| { | |
| "epoch": 2.722445695897023, | |
| "grad_norm": 3.4746780395507812, | |
| "learning_rate": 3.525e-06, | |
| "loss": 0.045, | |
| "num_input_tokens_seen": 2943760, | |
| "step": 423 | |
| }, | |
| { | |
| "epoch": 2.728881737731295, | |
| "grad_norm": 1.9599398374557495, | |
| "learning_rate": 3.5333333333333335e-06, | |
| "loss": 0.0314, | |
| "num_input_tokens_seen": 2950848, | |
| "step": 424 | |
| }, | |
| { | |
| "epoch": 2.735317779565567, | |
| "grad_norm": 2.971634864807129, | |
| "learning_rate": 3.5416666666666673e-06, | |
| "loss": 0.0611, | |
| "num_input_tokens_seen": 2957408, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 2.741753821399839, | |
| "grad_norm": 3.1944162845611572, | |
| "learning_rate": 3.5500000000000003e-06, | |
| "loss": 0.0478, | |
| "num_input_tokens_seen": 2964288, | |
| "step": 426 | |
| }, | |
| { | |
| "epoch": 2.748189863234111, | |
| "grad_norm": 3.3659610748291016, | |
| "learning_rate": 3.5583333333333337e-06, | |
| "loss": 0.038, | |
| "num_input_tokens_seen": 2970912, | |
| "step": 427 | |
| }, | |
| { | |
| "epoch": 2.754625905068383, | |
| "grad_norm": 2.965097188949585, | |
| "learning_rate": 3.566666666666667e-06, | |
| "loss": 0.043, | |
| "num_input_tokens_seen": 2978032, | |
| "step": 428 | |
| }, | |
| { | |
| "epoch": 2.7610619469026547, | |
| "grad_norm": 2.4006049633026123, | |
| "learning_rate": 3.575e-06, | |
| "loss": 0.0478, | |
| "num_input_tokens_seen": 2985232, | |
| "step": 429 | |
| }, | |
| { | |
| "epoch": 2.7674979887369267, | |
| "grad_norm": 3.7348554134368896, | |
| "learning_rate": 3.5833333333333335e-06, | |
| "loss": 0.0977, | |
| "num_input_tokens_seen": 2992240, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 2.7739340305711986, | |
| "grad_norm": 3.1373274326324463, | |
| "learning_rate": 3.5916666666666673e-06, | |
| "loss": 0.0835, | |
| "num_input_tokens_seen": 2999008, | |
| "step": 431 | |
| }, | |
| { | |
| "epoch": 2.7803700724054705, | |
| "grad_norm": 1.9444302320480347, | |
| "learning_rate": 3.6000000000000003e-06, | |
| "loss": 0.0406, | |
| "num_input_tokens_seen": 3005648, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 2.7868061142397424, | |
| "grad_norm": 1.8665870428085327, | |
| "learning_rate": 3.6083333333333337e-06, | |
| "loss": 0.0661, | |
| "num_input_tokens_seen": 3012224, | |
| "step": 433 | |
| }, | |
| { | |
| "epoch": 2.7932421560740144, | |
| "grad_norm": 1.9893403053283691, | |
| "learning_rate": 3.616666666666667e-06, | |
| "loss": 0.0647, | |
| "num_input_tokens_seen": 3019104, | |
| "step": 434 | |
| }, | |
| { | |
| "epoch": 2.7996781979082863, | |
| "grad_norm": 2.656529426574707, | |
| "learning_rate": 3.625e-06, | |
| "loss": 0.0499, | |
| "num_input_tokens_seen": 3026096, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 2.8061142397425582, | |
| "grad_norm": 1.7047683000564575, | |
| "learning_rate": 3.633333333333334e-06, | |
| "loss": 0.0422, | |
| "num_input_tokens_seen": 3032784, | |
| "step": 436 | |
| }, | |
| { | |
| "epoch": 2.81255028157683, | |
| "grad_norm": 1.6727882623672485, | |
| "learning_rate": 3.6416666666666672e-06, | |
| "loss": 0.048, | |
| "num_input_tokens_seen": 3040096, | |
| "step": 437 | |
| }, | |
| { | |
| "epoch": 2.818986323411102, | |
| "grad_norm": 4.0175251960754395, | |
| "learning_rate": 3.65e-06, | |
| "loss": 0.0474, | |
| "num_input_tokens_seen": 3046720, | |
| "step": 438 | |
| }, | |
| { | |
| "epoch": 2.825422365245374, | |
| "grad_norm": 8.139860153198242, | |
| "learning_rate": 3.6583333333333336e-06, | |
| "loss": 0.0801, | |
| "num_input_tokens_seen": 3053712, | |
| "step": 439 | |
| }, | |
| { | |
| "epoch": 2.831858407079646, | |
| "grad_norm": 3.832087278366089, | |
| "learning_rate": 3.6666666666666666e-06, | |
| "loss": 0.0528, | |
| "num_input_tokens_seen": 3060528, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 2.838294448913918, | |
| "grad_norm": 2.881619930267334, | |
| "learning_rate": 3.6750000000000004e-06, | |
| "loss": 0.0461, | |
| "num_input_tokens_seen": 3067440, | |
| "step": 441 | |
| }, | |
| { | |
| "epoch": 2.84473049074819, | |
| "grad_norm": 4.456245422363281, | |
| "learning_rate": 3.6833333333333338e-06, | |
| "loss": 0.0646, | |
| "num_input_tokens_seen": 3074208, | |
| "step": 442 | |
| }, | |
| { | |
| "epoch": 2.8511665325824618, | |
| "grad_norm": 5.1570820808410645, | |
| "learning_rate": 3.6916666666666668e-06, | |
| "loss": 0.049, | |
| "num_input_tokens_seen": 3081072, | |
| "step": 443 | |
| }, | |
| { | |
| "epoch": 2.8576025744167337, | |
| "grad_norm": 2.944526433944702, | |
| "learning_rate": 3.7e-06, | |
| "loss": 0.0531, | |
| "num_input_tokens_seen": 3088240, | |
| "step": 444 | |
| }, | |
| { | |
| "epoch": 2.8640386162510056, | |
| "grad_norm": 2.021688222885132, | |
| "learning_rate": 3.708333333333334e-06, | |
| "loss": 0.0521, | |
| "num_input_tokens_seen": 3095504, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 2.8704746580852776, | |
| "grad_norm": 6.054248809814453, | |
| "learning_rate": 3.716666666666667e-06, | |
| "loss": 0.0927, | |
| "num_input_tokens_seen": 3102688, | |
| "step": 446 | |
| }, | |
| { | |
| "epoch": 2.8769106999195495, | |
| "grad_norm": 3.5824503898620605, | |
| "learning_rate": 3.7250000000000003e-06, | |
| "loss": 0.0491, | |
| "num_input_tokens_seen": 3109440, | |
| "step": 447 | |
| }, | |
| { | |
| "epoch": 2.8833467417538214, | |
| "grad_norm": 2.0240774154663086, | |
| "learning_rate": 3.7333333333333337e-06, | |
| "loss": 0.0399, | |
| "num_input_tokens_seen": 3116720, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 2.8897827835880934, | |
| "grad_norm": 4.0125579833984375, | |
| "learning_rate": 3.7416666666666667e-06, | |
| "loss": 0.0499, | |
| "num_input_tokens_seen": 3123568, | |
| "step": 449 | |
| }, | |
| { | |
| "epoch": 2.8962188254223653, | |
| "grad_norm": 3.733275890350342, | |
| "learning_rate": 3.7500000000000005e-06, | |
| "loss": 0.0569, | |
| "num_input_tokens_seen": 3130768, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 2.9026548672566372, | |
| "grad_norm": 4.261077880859375, | |
| "learning_rate": 3.758333333333334e-06, | |
| "loss": 0.0608, | |
| "num_input_tokens_seen": 3138128, | |
| "step": 451 | |
| }, | |
| { | |
| "epoch": 2.909090909090909, | |
| "grad_norm": 1.4142907857894897, | |
| "learning_rate": 3.766666666666667e-06, | |
| "loss": 0.0325, | |
| "num_input_tokens_seen": 3145008, | |
| "step": 452 | |
| }, | |
| { | |
| "epoch": 2.915526950925181, | |
| "grad_norm": 2.610344171524048, | |
| "learning_rate": 3.7750000000000003e-06, | |
| "loss": 0.0643, | |
| "num_input_tokens_seen": 3151792, | |
| "step": 453 | |
| }, | |
| { | |
| "epoch": 2.921962992759453, | |
| "grad_norm": 2.9687604904174805, | |
| "learning_rate": 3.7833333333333337e-06, | |
| "loss": 0.0479, | |
| "num_input_tokens_seen": 3158800, | |
| "step": 454 | |
| }, | |
| { | |
| "epoch": 2.928399034593725, | |
| "grad_norm": 2.2706518173217773, | |
| "learning_rate": 3.7916666666666666e-06, | |
| "loss": 0.0549, | |
| "num_input_tokens_seen": 3165744, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 2.934835076427997, | |
| "grad_norm": 3.606792449951172, | |
| "learning_rate": 3.8000000000000005e-06, | |
| "loss": 0.0789, | |
| "num_input_tokens_seen": 3172896, | |
| "step": 456 | |
| }, | |
| { | |
| "epoch": 2.941271118262269, | |
| "grad_norm": 1.8851637840270996, | |
| "learning_rate": 3.808333333333334e-06, | |
| "loss": 0.0319, | |
| "num_input_tokens_seen": 3179888, | |
| "step": 457 | |
| }, | |
| { | |
| "epoch": 2.9477071600965408, | |
| "grad_norm": 2.6292834281921387, | |
| "learning_rate": 3.816666666666667e-06, | |
| "loss": 0.05, | |
| "num_input_tokens_seen": 3186960, | |
| "step": 458 | |
| }, | |
| { | |
| "epoch": 2.9541432019308127, | |
| "grad_norm": 2.099109172821045, | |
| "learning_rate": 3.825000000000001e-06, | |
| "loss": 0.0677, | |
| "num_input_tokens_seen": 3194208, | |
| "step": 459 | |
| }, | |
| { | |
| "epoch": 2.9605792437650846, | |
| "grad_norm": 2.5214834213256836, | |
| "learning_rate": 3.833333333333334e-06, | |
| "loss": 0.0512, | |
| "num_input_tokens_seen": 3201120, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 2.9670152855993566, | |
| "grad_norm": 6.318456649780273, | |
| "learning_rate": 3.841666666666667e-06, | |
| "loss": 0.0681, | |
| "num_input_tokens_seen": 3208160, | |
| "step": 461 | |
| }, | |
| { | |
| "epoch": 2.9734513274336285, | |
| "grad_norm": 4.119838714599609, | |
| "learning_rate": 3.85e-06, | |
| "loss": 0.0651, | |
| "num_input_tokens_seen": 3214992, | |
| "step": 462 | |
| }, | |
| { | |
| "epoch": 2.9798873692679004, | |
| "grad_norm": 3.248420238494873, | |
| "learning_rate": 3.858333333333333e-06, | |
| "loss": 0.0498, | |
| "num_input_tokens_seen": 3222192, | |
| "step": 463 | |
| }, | |
| { | |
| "epoch": 2.9863234111021724, | |
| "grad_norm": 1.6198488473892212, | |
| "learning_rate": 3.866666666666667e-06, | |
| "loss": 0.0496, | |
| "num_input_tokens_seen": 3229504, | |
| "step": 464 | |
| }, | |
| { | |
| "epoch": 2.9927594529364443, | |
| "grad_norm": 2.6008763313293457, | |
| "learning_rate": 3.875e-06, | |
| "loss": 0.0446, | |
| "num_input_tokens_seen": 3236400, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 2.9991954947707162, | |
| "grad_norm": 2.349928379058838, | |
| "learning_rate": 3.883333333333333e-06, | |
| "loss": 0.0543, | |
| "num_input_tokens_seen": 3243600, | |
| "step": 466 | |
| }, | |
| { | |
| "epoch": 3.0056315366049877, | |
| "grad_norm": 0.8590204119682312, | |
| "learning_rate": 3.891666666666667e-06, | |
| "loss": 0.0137, | |
| "num_input_tokens_seen": 3249808, | |
| "step": 467 | |
| }, | |
| { | |
| "epoch": 3.0120675784392597, | |
| "grad_norm": 1.2689623832702637, | |
| "learning_rate": 3.900000000000001e-06, | |
| "loss": 0.0201, | |
| "num_input_tokens_seen": 3257168, | |
| "step": 468 | |
| }, | |
| { | |
| "epoch": 3.0185036202735316, | |
| "grad_norm": 1.329512596130371, | |
| "learning_rate": 3.908333333333334e-06, | |
| "loss": 0.0119, | |
| "num_input_tokens_seen": 3264064, | |
| "step": 469 | |
| }, | |
| { | |
| "epoch": 3.0249396621078035, | |
| "grad_norm": 2.423644781112671, | |
| "learning_rate": 3.916666666666667e-06, | |
| "loss": 0.0305, | |
| "num_input_tokens_seen": 3270688, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 3.0313757039420755, | |
| "grad_norm": 3.6647322177886963, | |
| "learning_rate": 3.9250000000000005e-06, | |
| "loss": 0.0213, | |
| "num_input_tokens_seen": 3277664, | |
| "step": 471 | |
| }, | |
| { | |
| "epoch": 3.0378117457763474, | |
| "grad_norm": 3.736281156539917, | |
| "learning_rate": 3.9333333333333335e-06, | |
| "loss": 0.035, | |
| "num_input_tokens_seen": 3284352, | |
| "step": 472 | |
| }, | |
| { | |
| "epoch": 3.0442477876106193, | |
| "grad_norm": 2.274883270263672, | |
| "learning_rate": 3.941666666666667e-06, | |
| "loss": 0.0438, | |
| "num_input_tokens_seen": 3290864, | |
| "step": 473 | |
| }, | |
| { | |
| "epoch": 3.0506838294448912, | |
| "grad_norm": 3.032172203063965, | |
| "learning_rate": 3.95e-06, | |
| "loss": 0.0464, | |
| "num_input_tokens_seen": 3297856, | |
| "step": 474 | |
| }, | |
| { | |
| "epoch": 3.057119871279163, | |
| "grad_norm": 2.258751392364502, | |
| "learning_rate": 3.958333333333333e-06, | |
| "loss": 0.0172, | |
| "num_input_tokens_seen": 3305120, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 3.063555913113435, | |
| "grad_norm": 2.925736427307129, | |
| "learning_rate": 3.966666666666667e-06, | |
| "loss": 0.0287, | |
| "num_input_tokens_seen": 3312032, | |
| "step": 476 | |
| }, | |
| { | |
| "epoch": 3.069991954947707, | |
| "grad_norm": 3.100857734680176, | |
| "learning_rate": 3.975000000000001e-06, | |
| "loss": 0.0579, | |
| "num_input_tokens_seen": 3319424, | |
| "step": 477 | |
| }, | |
| { | |
| "epoch": 3.076427996781979, | |
| "grad_norm": 1.753515601158142, | |
| "learning_rate": 3.983333333333334e-06, | |
| "loss": 0.0095, | |
| "num_input_tokens_seen": 3326304, | |
| "step": 478 | |
| }, | |
| { | |
| "epoch": 3.082864038616251, | |
| "grad_norm": 2.3217740058898926, | |
| "learning_rate": 3.991666666666667e-06, | |
| "loss": 0.0238, | |
| "num_input_tokens_seen": 3333184, | |
| "step": 479 | |
| }, | |
| { | |
| "epoch": 3.089300080450523, | |
| "grad_norm": 2.512751579284668, | |
| "learning_rate": 4.000000000000001e-06, | |
| "loss": 0.0313, | |
| "num_input_tokens_seen": 3340384, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 3.0957361222847948, | |
| "grad_norm": 1.2185322046279907, | |
| "learning_rate": 4.008333333333334e-06, | |
| "loss": 0.0146, | |
| "num_input_tokens_seen": 3347344, | |
| "step": 481 | |
| }, | |
| { | |
| "epoch": 3.1021721641190667, | |
| "grad_norm": 1.1303057670593262, | |
| "learning_rate": 4.0166666666666675e-06, | |
| "loss": 0.0347, | |
| "num_input_tokens_seen": 3354080, | |
| "step": 482 | |
| }, | |
| { | |
| "epoch": 3.1086082059533386, | |
| "grad_norm": 2.4247186183929443, | |
| "learning_rate": 4.0250000000000004e-06, | |
| "loss": 0.024, | |
| "num_input_tokens_seen": 3360848, | |
| "step": 483 | |
| }, | |
| { | |
| "epoch": 3.1150442477876106, | |
| "grad_norm": 1.4767001867294312, | |
| "learning_rate": 4.033333333333333e-06, | |
| "loss": 0.0128, | |
| "num_input_tokens_seen": 3367616, | |
| "step": 484 | |
| }, | |
| { | |
| "epoch": 3.1214802896218825, | |
| "grad_norm": 2.458953857421875, | |
| "learning_rate": 4.041666666666667e-06, | |
| "loss": 0.0311, | |
| "num_input_tokens_seen": 3374880, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 3.1279163314561544, | |
| "grad_norm": 0.5494964718818665, | |
| "learning_rate": 4.05e-06, | |
| "loss": 0.0178, | |
| "num_input_tokens_seen": 3381696, | |
| "step": 486 | |
| }, | |
| { | |
| "epoch": 3.1343523732904264, | |
| "grad_norm": 1.5969914197921753, | |
| "learning_rate": 4.058333333333333e-06, | |
| "loss": 0.0379, | |
| "num_input_tokens_seen": 3388880, | |
| "step": 487 | |
| }, | |
| { | |
| "epoch": 3.1407884151246983, | |
| "grad_norm": 1.7003910541534424, | |
| "learning_rate": 4.066666666666667e-06, | |
| "loss": 0.0299, | |
| "num_input_tokens_seen": 3395984, | |
| "step": 488 | |
| }, | |
| { | |
| "epoch": 3.1472244569589702, | |
| "grad_norm": 2.297182083129883, | |
| "learning_rate": 4.075e-06, | |
| "loss": 0.0261, | |
| "num_input_tokens_seen": 3402896, | |
| "step": 489 | |
| }, | |
| { | |
| "epoch": 3.153660498793242, | |
| "grad_norm": 2.3937814235687256, | |
| "learning_rate": 4.083333333333334e-06, | |
| "loss": 0.0347, | |
| "num_input_tokens_seen": 3409888, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 3.160096540627514, | |
| "grad_norm": 1.349425196647644, | |
| "learning_rate": 4.091666666666667e-06, | |
| "loss": 0.011, | |
| "num_input_tokens_seen": 3416928, | |
| "step": 491 | |
| }, | |
| { | |
| "epoch": 3.166532582461786, | |
| "grad_norm": 3.0355069637298584, | |
| "learning_rate": 4.1e-06, | |
| "loss": 0.0541, | |
| "num_input_tokens_seen": 3423968, | |
| "step": 492 | |
| }, | |
| { | |
| "epoch": 3.172968624296058, | |
| "grad_norm": 2.680206537246704, | |
| "learning_rate": 4.1083333333333335e-06, | |
| "loss": 0.0465, | |
| "num_input_tokens_seen": 3431120, | |
| "step": 493 | |
| }, | |
| { | |
| "epoch": 3.17940466613033, | |
| "grad_norm": 1.5906095504760742, | |
| "learning_rate": 4.116666666666667e-06, | |
| "loss": 0.0187, | |
| "num_input_tokens_seen": 3437776, | |
| "step": 494 | |
| }, | |
| { | |
| "epoch": 3.185840707964602, | |
| "grad_norm": 0.8296425938606262, | |
| "learning_rate": 4.125e-06, | |
| "loss": 0.0089, | |
| "num_input_tokens_seen": 3444480, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 3.1922767497988738, | |
| "grad_norm": 2.857689142227173, | |
| "learning_rate": 4.133333333333333e-06, | |
| "loss": 0.0289, | |
| "num_input_tokens_seen": 3451232, | |
| "step": 496 | |
| }, | |
| { | |
| "epoch": 3.1987127916331457, | |
| "grad_norm": 1.0910203456878662, | |
| "learning_rate": 4.141666666666667e-06, | |
| "loss": 0.0103, | |
| "num_input_tokens_seen": 3457776, | |
| "step": 497 | |
| }, | |
| { | |
| "epoch": 3.2051488334674176, | |
| "grad_norm": 1.3560919761657715, | |
| "learning_rate": 4.15e-06, | |
| "loss": 0.0132, | |
| "num_input_tokens_seen": 3465056, | |
| "step": 498 | |
| }, | |
| { | |
| "epoch": 3.2115848753016896, | |
| "grad_norm": 4.861215591430664, | |
| "learning_rate": 4.158333333333334e-06, | |
| "loss": 0.0375, | |
| "num_input_tokens_seen": 3471968, | |
| "step": 499 | |
| }, | |
| { | |
| "epoch": 3.2180209171359615, | |
| "grad_norm": 1.8714208602905273, | |
| "learning_rate": 4.166666666666667e-06, | |
| "loss": 0.0143, | |
| "num_input_tokens_seen": 3479648, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 3.2244569589702334, | |
| "grad_norm": 1.6230028867721558, | |
| "learning_rate": 4.175e-06, | |
| "loss": 0.0159, | |
| "num_input_tokens_seen": 3486272, | |
| "step": 501 | |
| }, | |
| { | |
| "epoch": 3.2308930008045054, | |
| "grad_norm": 0.7852226495742798, | |
| "learning_rate": 4.183333333333334e-06, | |
| "loss": 0.0073, | |
| "num_input_tokens_seen": 3493360, | |
| "step": 502 | |
| }, | |
| { | |
| "epoch": 3.2373290426387773, | |
| "grad_norm": 2.3990976810455322, | |
| "learning_rate": 4.1916666666666675e-06, | |
| "loss": 0.0186, | |
| "num_input_tokens_seen": 3500336, | |
| "step": 503 | |
| }, | |
| { | |
| "epoch": 3.2437650844730492, | |
| "grad_norm": 0.796851634979248, | |
| "learning_rate": 4.2000000000000004e-06, | |
| "loss": 0.0035, | |
| "num_input_tokens_seen": 3507232, | |
| "step": 504 | |
| }, | |
| { | |
| "epoch": 3.250201126307321, | |
| "grad_norm": 2.7951748371124268, | |
| "learning_rate": 4.208333333333333e-06, | |
| "loss": 0.0416, | |
| "num_input_tokens_seen": 3514144, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 3.256637168141593, | |
| "grad_norm": 2.40897274017334, | |
| "learning_rate": 4.216666666666667e-06, | |
| "loss": 0.0266, | |
| "num_input_tokens_seen": 3520976, | |
| "step": 506 | |
| }, | |
| { | |
| "epoch": 3.263073209975865, | |
| "grad_norm": 2.3974061012268066, | |
| "learning_rate": 4.225e-06, | |
| "loss": 0.0351, | |
| "num_input_tokens_seen": 3527920, | |
| "step": 507 | |
| }, | |
| { | |
| "epoch": 3.2695092518101365, | |
| "grad_norm": 2.30100154876709, | |
| "learning_rate": 4.233333333333334e-06, | |
| "loss": 0.0209, | |
| "num_input_tokens_seen": 3534864, | |
| "step": 508 | |
| }, | |
| { | |
| "epoch": 3.2759452936444085, | |
| "grad_norm": 2.1172518730163574, | |
| "learning_rate": 4.241666666666667e-06, | |
| "loss": 0.0434, | |
| "num_input_tokens_seen": 3541872, | |
| "step": 509 | |
| }, | |
| { | |
| "epoch": 3.2823813354786804, | |
| "grad_norm": 3.7030341625213623, | |
| "learning_rate": 4.25e-06, | |
| "loss": 0.0174, | |
| "num_input_tokens_seen": 3548384, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 3.2888173773129523, | |
| "grad_norm": 2.152125597000122, | |
| "learning_rate": 4.258333333333334e-06, | |
| "loss": 0.0529, | |
| "num_input_tokens_seen": 3555792, | |
| "step": 511 | |
| }, | |
| { | |
| "epoch": 3.2952534191472242, | |
| "grad_norm": 0.6081152558326721, | |
| "learning_rate": 4.266666666666668e-06, | |
| "loss": 0.0033, | |
| "num_input_tokens_seen": 3562608, | |
| "step": 512 | |
| }, | |
| { | |
| "epoch": 3.301689460981496, | |
| "grad_norm": 1.7042624950408936, | |
| "learning_rate": 4.2750000000000006e-06, | |
| "loss": 0.0196, | |
| "num_input_tokens_seen": 3569184, | |
| "step": 513 | |
| }, | |
| { | |
| "epoch": 3.308125502815768, | |
| "grad_norm": 1.3502767086029053, | |
| "learning_rate": 4.2833333333333335e-06, | |
| "loss": 0.0242, | |
| "num_input_tokens_seen": 3576224, | |
| "step": 514 | |
| }, | |
| { | |
| "epoch": 3.31456154465004, | |
| "grad_norm": 4.480360984802246, | |
| "learning_rate": 4.2916666666666665e-06, | |
| "loss": 0.0316, | |
| "num_input_tokens_seen": 3583328, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 3.320997586484312, | |
| "grad_norm": 2.2217299938201904, | |
| "learning_rate": 4.3e-06, | |
| "loss": 0.0268, | |
| "num_input_tokens_seen": 3590256, | |
| "step": 516 | |
| }, | |
| { | |
| "epoch": 3.327433628318584, | |
| "grad_norm": 1.5919010639190674, | |
| "learning_rate": 4.308333333333334e-06, | |
| "loss": 0.0248, | |
| "num_input_tokens_seen": 3597328, | |
| "step": 517 | |
| }, | |
| { | |
| "epoch": 3.333869670152856, | |
| "grad_norm": 2.425961971282959, | |
| "learning_rate": 4.316666666666667e-06, | |
| "loss": 0.032, | |
| "num_input_tokens_seen": 3604576, | |
| "step": 518 | |
| }, | |
| { | |
| "epoch": 3.340305711987128, | |
| "grad_norm": 2.987424612045288, | |
| "learning_rate": 4.325e-06, | |
| "loss": 0.0202, | |
| "num_input_tokens_seen": 3611520, | |
| "step": 519 | |
| }, | |
| { | |
| "epoch": 3.3467417538213997, | |
| "grad_norm": 2.633897304534912, | |
| "learning_rate": 4.333333333333334e-06, | |
| "loss": 0.0329, | |
| "num_input_tokens_seen": 3618288, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 3.3531777956556716, | |
| "grad_norm": 1.0696384906768799, | |
| "learning_rate": 4.341666666666667e-06, | |
| "loss": 0.019, | |
| "num_input_tokens_seen": 3625216, | |
| "step": 521 | |
| }, | |
| { | |
| "epoch": 3.3596138374899436, | |
| "grad_norm": 2.400972604751587, | |
| "learning_rate": 4.350000000000001e-06, | |
| "loss": 0.0182, | |
| "num_input_tokens_seen": 3631888, | |
| "step": 522 | |
| }, | |
| { | |
| "epoch": 3.3660498793242155, | |
| "grad_norm": 1.3744821548461914, | |
| "learning_rate": 4.358333333333334e-06, | |
| "loss": 0.0124, | |
| "num_input_tokens_seen": 3638848, | |
| "step": 523 | |
| }, | |
| { | |
| "epoch": 3.3724859211584874, | |
| "grad_norm": 1.613145112991333, | |
| "learning_rate": 4.366666666666667e-06, | |
| "loss": 0.0122, | |
| "num_input_tokens_seen": 3646112, | |
| "step": 524 | |
| }, | |
| { | |
| "epoch": 3.3789219629927594, | |
| "grad_norm": 2.450824499130249, | |
| "learning_rate": 4.3750000000000005e-06, | |
| "loss": 0.0388, | |
| "num_input_tokens_seen": 3652928, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 3.3853580048270313, | |
| "grad_norm": 1.6122058629989624, | |
| "learning_rate": 4.383333333333334e-06, | |
| "loss": 0.0106, | |
| "num_input_tokens_seen": 3659632, | |
| "step": 526 | |
| }, | |
| { | |
| "epoch": 3.3917940466613032, | |
| "grad_norm": 1.53513765335083, | |
| "learning_rate": 4.391666666666667e-06, | |
| "loss": 0.0305, | |
| "num_input_tokens_seen": 3666480, | |
| "step": 527 | |
| }, | |
| { | |
| "epoch": 3.398230088495575, | |
| "grad_norm": 2.103663444519043, | |
| "learning_rate": 4.4e-06, | |
| "loss": 0.0512, | |
| "num_input_tokens_seen": 3673136, | |
| "step": 528 | |
| }, | |
| { | |
| "epoch": 3.404666130329847, | |
| "grad_norm": 0.41373467445373535, | |
| "learning_rate": 4.408333333333334e-06, | |
| "loss": 0.0031, | |
| "num_input_tokens_seen": 3679760, | |
| "step": 529 | |
| }, | |
| { | |
| "epoch": 3.411102172164119, | |
| "grad_norm": 2.9610488414764404, | |
| "learning_rate": 4.416666666666667e-06, | |
| "loss": 0.0309, | |
| "num_input_tokens_seen": 3686576, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 3.417538213998391, | |
| "grad_norm": 2.415531873703003, | |
| "learning_rate": 4.425e-06, | |
| "loss": 0.0472, | |
| "num_input_tokens_seen": 3693312, | |
| "step": 531 | |
| }, | |
| { | |
| "epoch": 3.423974255832663, | |
| "grad_norm": 2.175546407699585, | |
| "learning_rate": 4.433333333333334e-06, | |
| "loss": 0.0222, | |
| "num_input_tokens_seen": 3700000, | |
| "step": 532 | |
| }, | |
| { | |
| "epoch": 3.430410297666935, | |
| "grad_norm": 1.0903018712997437, | |
| "learning_rate": 4.441666666666667e-06, | |
| "loss": 0.0077, | |
| "num_input_tokens_seen": 3706736, | |
| "step": 533 | |
| }, | |
| { | |
| "epoch": 3.4368463395012068, | |
| "grad_norm": 0.8305991888046265, | |
| "learning_rate": 4.450000000000001e-06, | |
| "loss": 0.0064, | |
| "num_input_tokens_seen": 3714192, | |
| "step": 534 | |
| }, | |
| { | |
| "epoch": 3.4432823813354787, | |
| "grad_norm": 0.9347790479660034, | |
| "learning_rate": 4.4583333333333336e-06, | |
| "loss": 0.0104, | |
| "num_input_tokens_seen": 3721408, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 3.4497184231697506, | |
| "grad_norm": 1.7669559717178345, | |
| "learning_rate": 4.4666666666666665e-06, | |
| "loss": 0.0121, | |
| "num_input_tokens_seen": 3728144, | |
| "step": 536 | |
| }, | |
| { | |
| "epoch": 3.4561544650040226, | |
| "grad_norm": 3.121467351913452, | |
| "learning_rate": 4.475e-06, | |
| "loss": 0.0386, | |
| "num_input_tokens_seen": 3734960, | |
| "step": 537 | |
| }, | |
| { | |
| "epoch": 3.4625905068382945, | |
| "grad_norm": 2.683410882949829, | |
| "learning_rate": 4.483333333333333e-06, | |
| "loss": 0.0319, | |
| "num_input_tokens_seen": 3741728, | |
| "step": 538 | |
| }, | |
| { | |
| "epoch": 3.4690265486725664, | |
| "grad_norm": 9.728205680847168, | |
| "learning_rate": 4.491666666666667e-06, | |
| "loss": 0.0579, | |
| "num_input_tokens_seen": 3749200, | |
| "step": 539 | |
| }, | |
| { | |
| "epoch": 3.4754625905068384, | |
| "grad_norm": 4.415483474731445, | |
| "learning_rate": 4.5e-06, | |
| "loss": 0.0255, | |
| "num_input_tokens_seen": 3755856, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 3.4818986323411103, | |
| "grad_norm": 3.651423692703247, | |
| "learning_rate": 4.508333333333333e-06, | |
| "loss": 0.0301, | |
| "num_input_tokens_seen": 3762528, | |
| "step": 541 | |
| }, | |
| { | |
| "epoch": 3.4883346741753822, | |
| "grad_norm": 2.318000078201294, | |
| "learning_rate": 4.516666666666667e-06, | |
| "loss": 0.0589, | |
| "num_input_tokens_seen": 3769632, | |
| "step": 542 | |
| }, | |
| { | |
| "epoch": 3.494770716009654, | |
| "grad_norm": 4.982158660888672, | |
| "learning_rate": 4.525000000000001e-06, | |
| "loss": 0.0442, | |
| "num_input_tokens_seen": 3776592, | |
| "step": 543 | |
| }, | |
| { | |
| "epoch": 3.501206757843926, | |
| "grad_norm": 3.0872108936309814, | |
| "learning_rate": 4.533333333333334e-06, | |
| "loss": 0.0366, | |
| "num_input_tokens_seen": 3783824, | |
| "step": 544 | |
| }, | |
| { | |
| "epoch": 3.507642799678198, | |
| "grad_norm": 5.150477886199951, | |
| "learning_rate": 4.541666666666667e-06, | |
| "loss": 0.0643, | |
| "num_input_tokens_seen": 3790864, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 3.51407884151247, | |
| "grad_norm": 3.0513834953308105, | |
| "learning_rate": 4.5500000000000005e-06, | |
| "loss": 0.0213, | |
| "num_input_tokens_seen": 3797664, | |
| "step": 546 | |
| }, | |
| { | |
| "epoch": 3.520514883346742, | |
| "grad_norm": 1.5530712604522705, | |
| "learning_rate": 4.5583333333333335e-06, | |
| "loss": 0.0154, | |
| "num_input_tokens_seen": 3804576, | |
| "step": 547 | |
| }, | |
| { | |
| "epoch": 3.526950925181014, | |
| "grad_norm": 2.6350319385528564, | |
| "learning_rate": 4.566666666666667e-06, | |
| "loss": 0.0252, | |
| "num_input_tokens_seen": 3811440, | |
| "step": 548 | |
| }, | |
| { | |
| "epoch": 3.5333869670152858, | |
| "grad_norm": 2.8993167877197266, | |
| "learning_rate": 4.575e-06, | |
| "loss": 0.038, | |
| "num_input_tokens_seen": 3818352, | |
| "step": 549 | |
| }, | |
| { | |
| "epoch": 3.5398230088495577, | |
| "grad_norm": 2.0168752670288086, | |
| "learning_rate": 4.583333333333333e-06, | |
| "loss": 0.0169, | |
| "num_input_tokens_seen": 3825360, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 3.5462590506838296, | |
| "grad_norm": 2.4160525798797607, | |
| "learning_rate": 4.591666666666667e-06, | |
| "loss": 0.0253, | |
| "num_input_tokens_seen": 3832416, | |
| "step": 551 | |
| }, | |
| { | |
| "epoch": 3.5526950925181016, | |
| "grad_norm": 1.543545126914978, | |
| "learning_rate": 4.600000000000001e-06, | |
| "loss": 0.0164, | |
| "num_input_tokens_seen": 3839344, | |
| "step": 552 | |
| }, | |
| { | |
| "epoch": 3.5591311343523735, | |
| "grad_norm": 2.355316400527954, | |
| "learning_rate": 4.608333333333334e-06, | |
| "loss": 0.0269, | |
| "num_input_tokens_seen": 3846688, | |
| "step": 553 | |
| }, | |
| { | |
| "epoch": 3.5655671761866454, | |
| "grad_norm": 1.4751020669937134, | |
| "learning_rate": 4.616666666666667e-06, | |
| "loss": 0.0192, | |
| "num_input_tokens_seen": 3853696, | |
| "step": 554 | |
| }, | |
| { | |
| "epoch": 3.5720032180209174, | |
| "grad_norm": 0.9673195481300354, | |
| "learning_rate": 4.625000000000001e-06, | |
| "loss": 0.0132, | |
| "num_input_tokens_seen": 3860832, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 3.5784392598551893, | |
| "grad_norm": 1.1592040061950684, | |
| "learning_rate": 4.633333333333334e-06, | |
| "loss": 0.0156, | |
| "num_input_tokens_seen": 3868000, | |
| "step": 556 | |
| }, | |
| { | |
| "epoch": 3.5848753016894612, | |
| "grad_norm": 1.01143217086792, | |
| "learning_rate": 4.641666666666667e-06, | |
| "loss": 0.0081, | |
| "num_input_tokens_seen": 3874672, | |
| "step": 557 | |
| }, | |
| { | |
| "epoch": 3.591311343523733, | |
| "grad_norm": 2.855041980743408, | |
| "learning_rate": 4.65e-06, | |
| "loss": 0.0351, | |
| "num_input_tokens_seen": 3881744, | |
| "step": 558 | |
| }, | |
| { | |
| "epoch": 3.597747385358005, | |
| "grad_norm": 2.0597968101501465, | |
| "learning_rate": 4.658333333333333e-06, | |
| "loss": 0.0288, | |
| "num_input_tokens_seen": 3888256, | |
| "step": 559 | |
| }, | |
| { | |
| "epoch": 3.604183427192277, | |
| "grad_norm": 2.9965226650238037, | |
| "learning_rate": 4.666666666666667e-06, | |
| "loss": 0.0335, | |
| "num_input_tokens_seen": 3895104, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 3.6106194690265485, | |
| "grad_norm": 3.625206708908081, | |
| "learning_rate": 4.675000000000001e-06, | |
| "loss": 0.0492, | |
| "num_input_tokens_seen": 3902208, | |
| "step": 561 | |
| }, | |
| { | |
| "epoch": 3.6170555108608204, | |
| "grad_norm": 2.021160840988159, | |
| "learning_rate": 4.683333333333334e-06, | |
| "loss": 0.0082, | |
| "num_input_tokens_seen": 3909040, | |
| "step": 562 | |
| }, | |
| { | |
| "epoch": 3.6234915526950924, | |
| "grad_norm": 3.4565329551696777, | |
| "learning_rate": 4.691666666666667e-06, | |
| "loss": 0.0491, | |
| "num_input_tokens_seen": 3916304, | |
| "step": 563 | |
| }, | |
| { | |
| "epoch": 3.6299275945293643, | |
| "grad_norm": 3.2362654209136963, | |
| "learning_rate": 4.7e-06, | |
| "loss": 0.0568, | |
| "num_input_tokens_seen": 3923216, | |
| "step": 564 | |
| }, | |
| { | |
| "epoch": 3.6363636363636362, | |
| "grad_norm": 3.234666347503662, | |
| "learning_rate": 4.708333333333334e-06, | |
| "loss": 0.0414, | |
| "num_input_tokens_seen": 3930448, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 3.642799678197908, | |
| "grad_norm": 2.1742103099823, | |
| "learning_rate": 4.7166666666666675e-06, | |
| "loss": 0.034, | |
| "num_input_tokens_seen": 3937424, | |
| "step": 566 | |
| }, | |
| { | |
| "epoch": 3.64923572003218, | |
| "grad_norm": 2.9156923294067383, | |
| "learning_rate": 4.7250000000000005e-06, | |
| "loss": 0.0392, | |
| "num_input_tokens_seen": 3944112, | |
| "step": 567 | |
| }, | |
| { | |
| "epoch": 3.655671761866452, | |
| "grad_norm": 4.092429161071777, | |
| "learning_rate": 4.7333333333333335e-06, | |
| "loss": 0.051, | |
| "num_input_tokens_seen": 3951504, | |
| "step": 568 | |
| }, | |
| { | |
| "epoch": 3.662107803700724, | |
| "grad_norm": 3.9395768642425537, | |
| "learning_rate": 4.741666666666667e-06, | |
| "loss": 0.034, | |
| "num_input_tokens_seen": 3958352, | |
| "step": 569 | |
| }, | |
| { | |
| "epoch": 3.668543845534996, | |
| "grad_norm": 1.9961844682693481, | |
| "learning_rate": 4.75e-06, | |
| "loss": 0.014, | |
| "num_input_tokens_seen": 3965552, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 3.674979887369268, | |
| "grad_norm": 1.8078194856643677, | |
| "learning_rate": 4.758333333333334e-06, | |
| "loss": 0.0406, | |
| "num_input_tokens_seen": 3972544, | |
| "step": 571 | |
| }, | |
| { | |
| "epoch": 3.6814159292035398, | |
| "grad_norm": 2.048532485961914, | |
| "learning_rate": 4.766666666666667e-06, | |
| "loss": 0.0407, | |
| "num_input_tokens_seen": 3979264, | |
| "step": 572 | |
| }, | |
| { | |
| "epoch": 3.6878519710378117, | |
| "grad_norm": 1.9979974031448364, | |
| "learning_rate": 4.775e-06, | |
| "loss": 0.0282, | |
| "num_input_tokens_seen": 3986240, | |
| "step": 573 | |
| }, | |
| { | |
| "epoch": 3.6942880128720836, | |
| "grad_norm": 3.6126463413238525, | |
| "learning_rate": 4.783333333333334e-06, | |
| "loss": 0.0326, | |
| "num_input_tokens_seen": 3993232, | |
| "step": 574 | |
| }, | |
| { | |
| "epoch": 3.7007240547063556, | |
| "grad_norm": 3.131657838821411, | |
| "learning_rate": 4.791666666666668e-06, | |
| "loss": 0.0348, | |
| "num_input_tokens_seen": 3999952, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 3.7071600965406275, | |
| "grad_norm": 2.2662060260772705, | |
| "learning_rate": 4.800000000000001e-06, | |
| "loss": 0.0256, | |
| "num_input_tokens_seen": 4007456, | |
| "step": 576 | |
| }, | |
| { | |
| "epoch": 3.7135961383748994, | |
| "grad_norm": 4.874523639678955, | |
| "learning_rate": 4.808333333333334e-06, | |
| "loss": 0.0765, | |
| "num_input_tokens_seen": 4015024, | |
| "step": 577 | |
| }, | |
| { | |
| "epoch": 3.7200321802091714, | |
| "grad_norm": 0.882166862487793, | |
| "learning_rate": 4.816666666666667e-06, | |
| "loss": 0.0099, | |
| "num_input_tokens_seen": 4021920, | |
| "step": 578 | |
| }, | |
| { | |
| "epoch": 3.7264682220434433, | |
| "grad_norm": 3.1239066123962402, | |
| "learning_rate": 4.825e-06, | |
| "loss": 0.0173, | |
| "num_input_tokens_seen": 4028720, | |
| "step": 579 | |
| }, | |
| { | |
| "epoch": 3.7329042638777152, | |
| "grad_norm": 1.5819370746612549, | |
| "learning_rate": 4.833333333333333e-06, | |
| "loss": 0.0084, | |
| "num_input_tokens_seen": 4035584, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 3.739340305711987, | |
| "grad_norm": 2.6252429485321045, | |
| "learning_rate": 4.841666666666667e-06, | |
| "loss": 0.0251, | |
| "num_input_tokens_seen": 4042464, | |
| "step": 581 | |
| }, | |
| { | |
| "epoch": 3.745776347546259, | |
| "grad_norm": 2.0619590282440186, | |
| "learning_rate": 4.85e-06, | |
| "loss": 0.0909, | |
| "num_input_tokens_seen": 4049600, | |
| "step": 582 | |
| }, | |
| { | |
| "epoch": 3.752212389380531, | |
| "grad_norm": 2.547422409057617, | |
| "learning_rate": 4.858333333333334e-06, | |
| "loss": 0.039, | |
| "num_input_tokens_seen": 4056320, | |
| "step": 583 | |
| }, | |
| { | |
| "epoch": 3.758648431214803, | |
| "grad_norm": 1.3179091215133667, | |
| "learning_rate": 4.866666666666667e-06, | |
| "loss": 0.0079, | |
| "num_input_tokens_seen": 4063200, | |
| "step": 584 | |
| }, | |
| { | |
| "epoch": 3.765084473049075, | |
| "grad_norm": 3.090376377105713, | |
| "learning_rate": 4.875e-06, | |
| "loss": 0.0242, | |
| "num_input_tokens_seen": 4070112, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 3.771520514883347, | |
| "grad_norm": 2.50468111038208, | |
| "learning_rate": 4.883333333333334e-06, | |
| "loss": 0.0138, | |
| "num_input_tokens_seen": 4076928, | |
| "step": 586 | |
| }, | |
| { | |
| "epoch": 3.7779565567176188, | |
| "grad_norm": 3.921415090560913, | |
| "learning_rate": 4.8916666666666675e-06, | |
| "loss": 0.0467, | |
| "num_input_tokens_seen": 4083792, | |
| "step": 587 | |
| }, | |
| { | |
| "epoch": 3.7843925985518907, | |
| "grad_norm": 1.2243348360061646, | |
| "learning_rate": 4.9000000000000005e-06, | |
| "loss": 0.0241, | |
| "num_input_tokens_seen": 4090672, | |
| "step": 588 | |
| }, | |
| { | |
| "epoch": 3.7908286403861626, | |
| "grad_norm": 1.4968576431274414, | |
| "learning_rate": 4.9083333333333335e-06, | |
| "loss": 0.0404, | |
| "num_input_tokens_seen": 4097472, | |
| "step": 589 | |
| }, | |
| { | |
| "epoch": 3.7972646822204346, | |
| "grad_norm": 1.235217809677124, | |
| "learning_rate": 4.9166666666666665e-06, | |
| "loss": 0.0094, | |
| "num_input_tokens_seen": 4104016, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 3.8037007240547065, | |
| "grad_norm": 1.3862783908843994, | |
| "learning_rate": 4.925e-06, | |
| "loss": 0.0196, | |
| "num_input_tokens_seen": 4110784, | |
| "step": 591 | |
| }, | |
| { | |
| "epoch": 3.8101367658889784, | |
| "grad_norm": 3.560793399810791, | |
| "learning_rate": 4.933333333333334e-06, | |
| "loss": 0.0514, | |
| "num_input_tokens_seen": 4117984, | |
| "step": 592 | |
| }, | |
| { | |
| "epoch": 3.8165728077232504, | |
| "grad_norm": 2.008575677871704, | |
| "learning_rate": 4.941666666666667e-06, | |
| "loss": 0.0286, | |
| "num_input_tokens_seen": 4125072, | |
| "step": 593 | |
| }, | |
| { | |
| "epoch": 3.823008849557522, | |
| "grad_norm": 2.3213093280792236, | |
| "learning_rate": 4.95e-06, | |
| "loss": 0.0417, | |
| "num_input_tokens_seen": 4132160, | |
| "step": 594 | |
| }, | |
| { | |
| "epoch": 3.829444891391794, | |
| "grad_norm": 1.3540257215499878, | |
| "learning_rate": 4.958333333333334e-06, | |
| "loss": 0.0347, | |
| "num_input_tokens_seen": 4139136, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 3.8358809332260657, | |
| "grad_norm": 1.289825677871704, | |
| "learning_rate": 4.966666666666667e-06, | |
| "loss": 0.0229, | |
| "num_input_tokens_seen": 4146240, | |
| "step": 596 | |
| }, | |
| { | |
| "epoch": 3.8423169750603376, | |
| "grad_norm": 2.4050135612487793, | |
| "learning_rate": 4.975000000000001e-06, | |
| "loss": 0.0176, | |
| "num_input_tokens_seen": 4153152, | |
| "step": 597 | |
| }, | |
| { | |
| "epoch": 3.8487530168946096, | |
| "grad_norm": 1.523977518081665, | |
| "learning_rate": 4.983333333333334e-06, | |
| "loss": 0.0274, | |
| "num_input_tokens_seen": 4160080, | |
| "step": 598 | |
| }, | |
| { | |
| "epoch": 3.8551890587288815, | |
| "grad_norm": 1.1898863315582275, | |
| "learning_rate": 4.991666666666667e-06, | |
| "loss": 0.0253, | |
| "num_input_tokens_seen": 4167008, | |
| "step": 599 | |
| }, | |
| { | |
| "epoch": 3.8616251005631534, | |
| "grad_norm": 1.992311954498291, | |
| "learning_rate": 5e-06, | |
| "loss": 0.0429, | |
| "num_input_tokens_seen": 4174080, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 3.8680611423974254, | |
| "grad_norm": 0.9558950066566467, | |
| "learning_rate": 4.999597169822646e-06, | |
| "loss": 0.0142, | |
| "num_input_tokens_seen": 4181104, | |
| "step": 601 | |
| }, | |
| { | |
| "epoch": 3.8744971842316973, | |
| "grad_norm": 0.9275301694869995, | |
| "learning_rate": 4.998388809108304e-06, | |
| "loss": 0.0148, | |
| "num_input_tokens_seen": 4188096, | |
| "step": 602 | |
| }, | |
| { | |
| "epoch": 3.8809332260659692, | |
| "grad_norm": 1.6707432270050049, | |
| "learning_rate": 4.996375307268303e-06, | |
| "loss": 0.0166, | |
| "num_input_tokens_seen": 4195152, | |
| "step": 603 | |
| }, | |
| { | |
| "epoch": 3.887369267900241, | |
| "grad_norm": 5.857227325439453, | |
| "learning_rate": 4.993557313182086e-06, | |
| "loss": 0.0224, | |
| "num_input_tokens_seen": 4201952, | |
| "step": 604 | |
| }, | |
| { | |
| "epoch": 3.893805309734513, | |
| "grad_norm": 5.273613452911377, | |
| "learning_rate": 4.989935734988098e-06, | |
| "loss": 0.0227, | |
| "num_input_tokens_seen": 4209104, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 3.900241351568785, | |
| "grad_norm": 6.268670082092285, | |
| "learning_rate": 4.985511739791129e-06, | |
| "loss": 0.0597, | |
| "num_input_tokens_seen": 4216496, | |
| "step": 606 | |
| }, | |
| { | |
| "epoch": 3.906677393403057, | |
| "grad_norm": 3.373368501663208, | |
| "learning_rate": 4.980286753286196e-06, | |
| "loss": 0.0339, | |
| "num_input_tokens_seen": 4223840, | |
| "step": 607 | |
| }, | |
| { | |
| "epoch": 3.913113435237329, | |
| "grad_norm": 1.3991198539733887, | |
| "learning_rate": 4.974262459299088e-06, | |
| "loss": 0.0192, | |
| "num_input_tokens_seen": 4230752, | |
| "step": 608 | |
| }, | |
| { | |
| "epoch": 3.919549477071601, | |
| "grad_norm": 0.7424534559249878, | |
| "learning_rate": 4.967440799243739e-06, | |
| "loss": 0.007, | |
| "num_input_tokens_seen": 4237360, | |
| "step": 609 | |
| }, | |
| { | |
| "epoch": 3.9259855189058728, | |
| "grad_norm": 3.0347440242767334, | |
| "learning_rate": 4.959823971496575e-06, | |
| "loss": 0.017, | |
| "num_input_tokens_seen": 4244128, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 3.9324215607401447, | |
| "grad_norm": 2.929175853729248, | |
| "learning_rate": 4.9514144306880506e-06, | |
| "loss": 0.0296, | |
| "num_input_tokens_seen": 4251264, | |
| "step": 611 | |
| }, | |
| { | |
| "epoch": 3.9388576025744166, | |
| "grad_norm": 4.076401710510254, | |
| "learning_rate": 4.942214886911619e-06, | |
| "loss": 0.0429, | |
| "num_input_tokens_seen": 4258256, | |
| "step": 612 | |
| }, | |
| { | |
| "epoch": 3.9452936444086886, | |
| "grad_norm": 0.7720851302146912, | |
| "learning_rate": 4.932228304850363e-06, | |
| "loss": 0.0027, | |
| "num_input_tokens_seen": 4265280, | |
| "step": 613 | |
| }, | |
| { | |
| "epoch": 3.9517296862429605, | |
| "grad_norm": 1.500545859336853, | |
| "learning_rate": 4.921457902821578e-06, | |
| "loss": 0.0395, | |
| "num_input_tokens_seen": 4271968, | |
| "step": 614 | |
| }, | |
| { | |
| "epoch": 3.9581657280772324, | |
| "grad_norm": 3.0767860412597656, | |
| "learning_rate": 4.909907151739634e-06, | |
| "loss": 0.03, | |
| "num_input_tokens_seen": 4278848, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 3.9646017699115044, | |
| "grad_norm": 1.5455620288848877, | |
| "learning_rate": 4.897579773997415e-06, | |
| "loss": 0.0178, | |
| "num_input_tokens_seen": 4285808, | |
| "step": 616 | |
| }, | |
| { | |
| "epoch": 3.9710378117457763, | |
| "grad_norm": 1.1472654342651367, | |
| "learning_rate": 4.884479742266731e-06, | |
| "loss": 0.0139, | |
| "num_input_tokens_seen": 4292912, | |
| "step": 617 | |
| }, | |
| { | |
| "epoch": 3.9774738535800482, | |
| "grad_norm": 1.3290921449661255, | |
| "learning_rate": 4.870611278218066e-06, | |
| "loss": 0.0076, | |
| "num_input_tokens_seen": 4300176, | |
| "step": 618 | |
| }, | |
| { | |
| "epoch": 3.98390989541432, | |
| "grad_norm": 4.543910026550293, | |
| "learning_rate": 4.855978851160088e-06, | |
| "loss": 0.0683, | |
| "num_input_tokens_seen": 4307776, | |
| "step": 619 | |
| }, | |
| { | |
| "epoch": 3.990345937248592, | |
| "grad_norm": 3.424959421157837, | |
| "learning_rate": 4.8405871765993435e-06, | |
| "loss": 0.0367, | |
| "num_input_tokens_seen": 4314688, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 3.996781979082864, | |
| "grad_norm": 1.5345810651779175, | |
| "learning_rate": 4.824441214720629e-06, | |
| "loss": 0.0497, | |
| "num_input_tokens_seen": 4321840, | |
| "step": 621 | |
| }, | |
| { | |
| "epoch": 4.003218020917136, | |
| "grad_norm": 0.5405219793319702, | |
| "learning_rate": 4.8075461687884935e-06, | |
| "loss": 0.0054, | |
| "num_input_tokens_seen": 4328736, | |
| "step": 622 | |
| }, | |
| { | |
| "epoch": 4.009654062751408, | |
| "grad_norm": 2.3540198802948, | |
| "learning_rate": 4.7899074834704165e-06, | |
| "loss": 0.0259, | |
| "num_input_tokens_seen": 4335952, | |
| "step": 623 | |
| }, | |
| { | |
| "epoch": 4.01609010458568, | |
| "grad_norm": 0.7733599543571472, | |
| "learning_rate": 4.771530843082187e-06, | |
| "loss": 0.0082, | |
| "num_input_tokens_seen": 4342816, | |
| "step": 624 | |
| }, | |
| { | |
| "epoch": 4.022526146419952, | |
| "grad_norm": 3.051017999649048, | |
| "learning_rate": 4.752422169756048e-06, | |
| "loss": 0.0359, | |
| "num_input_tokens_seen": 4349456, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 4.028962188254224, | |
| "grad_norm": 0.4645274579524994, | |
| "learning_rate": 4.732587621532214e-06, | |
| "loss": 0.0081, | |
| "num_input_tokens_seen": 4356032, | |
| "step": 626 | |
| }, | |
| { | |
| "epoch": 4.035398230088496, | |
| "grad_norm": 1.9294419288635254, | |
| "learning_rate": 4.712033590374346e-06, | |
| "loss": 0.0118, | |
| "num_input_tokens_seen": 4362928, | |
| "step": 627 | |
| }, | |
| { | |
| "epoch": 4.041834271922768, | |
| "grad_norm": 2.5432851314544678, | |
| "learning_rate": 4.690766700109659e-06, | |
| "loss": 0.0235, | |
| "num_input_tokens_seen": 4369616, | |
| "step": 628 | |
| }, | |
| { | |
| "epoch": 4.0482703137570395, | |
| "grad_norm": 1.8334590196609497, | |
| "learning_rate": 4.668793804294294e-06, | |
| "loss": 0.0145, | |
| "num_input_tokens_seen": 4376656, | |
| "step": 629 | |
| }, | |
| { | |
| "epoch": 4.054706355591311, | |
| "grad_norm": 0.6473208069801331, | |
| "learning_rate": 4.646121984004666e-06, | |
| "loss": 0.006, | |
| "num_input_tokens_seen": 4383696, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 4.061142397425583, | |
| "grad_norm": 2.0988128185272217, | |
| "learning_rate": 4.622758545555485e-06, | |
| "loss": 0.0191, | |
| "num_input_tokens_seen": 4390880, | |
| "step": 631 | |
| }, | |
| { | |
| "epoch": 4.067578439259855, | |
| "grad_norm": 1.8957973718643188, | |
| "learning_rate": 4.598711018145193e-06, | |
| "loss": 0.0075, | |
| "num_input_tokens_seen": 4398000, | |
| "step": 632 | |
| }, | |
| { | |
| "epoch": 4.074014481094127, | |
| "grad_norm": 1.117255449295044, | |
| "learning_rate": 4.573987151429579e-06, | |
| "loss": 0.0253, | |
| "num_input_tokens_seen": 4404640, | |
| "step": 633 | |
| }, | |
| { | |
| "epoch": 4.080450522928399, | |
| "grad_norm": 2.326129198074341, | |
| "learning_rate": 4.54859491302433e-06, | |
| "loss": 0.0317, | |
| "num_input_tokens_seen": 4411760, | |
| "step": 634 | |
| }, | |
| { | |
| "epoch": 4.086886564762671, | |
| "grad_norm": 1.6843276023864746, | |
| "learning_rate": 4.522542485937369e-06, | |
| "loss": 0.0082, | |
| "num_input_tokens_seen": 4418896, | |
| "step": 635 | |
| }, | |
| { | |
| "epoch": 4.093322606596943, | |
| "grad_norm": 2.301496744155884, | |
| "learning_rate": 4.495838265931754e-06, | |
| "loss": 0.0101, | |
| "num_input_tokens_seen": 4425776, | |
| "step": 636 | |
| }, | |
| { | |
| "epoch": 4.099758648431215, | |
| "grad_norm": 1.434444546699524, | |
| "learning_rate": 4.4684908588200305e-06, | |
| "loss": 0.0112, | |
| "num_input_tokens_seen": 4432656, | |
| "step": 637 | |
| }, | |
| { | |
| "epoch": 4.106194690265487, | |
| "grad_norm": 1.3446779251098633, | |
| "learning_rate": 4.440509077690883e-06, | |
| "loss": 0.0034, | |
| "num_input_tokens_seen": 4439424, | |
| "step": 638 | |
| }, | |
| { | |
| "epoch": 4.112630732099759, | |
| "grad_norm": 0.6733867526054382, | |
| "learning_rate": 4.411901940068997e-06, | |
| "loss": 0.0037, | |
| "num_input_tokens_seen": 4446160, | |
| "step": 639 | |
| }, | |
| { | |
| "epoch": 4.119066773934031, | |
| "grad_norm": 1.339034080505371, | |
| "learning_rate": 4.382678665009028e-06, | |
| "loss": 0.0085, | |
| "num_input_tokens_seen": 4453376, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 4.125502815768303, | |
| "grad_norm": 3.2036638259887695, | |
| "learning_rate": 4.352848670124637e-06, | |
| "loss": 0.0328, | |
| "num_input_tokens_seen": 4459952, | |
| "step": 641 | |
| }, | |
| { | |
| "epoch": 4.131938857602575, | |
| "grad_norm": 1.1791878938674927, | |
| "learning_rate": 4.322421568553529e-06, | |
| "loss": 0.0098, | |
| "num_input_tokens_seen": 4466880, | |
| "step": 642 | |
| }, | |
| { | |
| "epoch": 4.1383748994368466, | |
| "grad_norm": 1.8526674509048462, | |
| "learning_rate": 4.291407165859481e-06, | |
| "loss": 0.0051, | |
| "num_input_tokens_seen": 4474064, | |
| "step": 643 | |
| }, | |
| { | |
| "epoch": 4.1448109412711185, | |
| "grad_norm": 0.4795032739639282, | |
| "learning_rate": 4.259815456872363e-06, | |
| "loss": 0.0047, | |
| "num_input_tokens_seen": 4480864, | |
| "step": 644 | |
| }, | |
| { | |
| "epoch": 4.15124698310539, | |
| "grad_norm": 1.4392155408859253, | |
| "learning_rate": 4.227656622467162e-06, | |
| "loss": 0.0111, | |
| "num_input_tokens_seen": 4487504, | |
| "step": 645 | |
| }, | |
| { | |
| "epoch": 4.157683024939662, | |
| "grad_norm": 3.185128688812256, | |
| "learning_rate": 4.194941026283053e-06, | |
| "loss": 0.0334, | |
| "num_input_tokens_seen": 4494512, | |
| "step": 646 | |
| }, | |
| { | |
| "epoch": 4.164119066773934, | |
| "grad_norm": 1.7285927534103394, | |
| "learning_rate": 4.161679211383565e-06, | |
| "loss": 0.013, | |
| "num_input_tokens_seen": 4501296, | |
| "step": 647 | |
| }, | |
| { | |
| "epoch": 4.170555108608206, | |
| "grad_norm": 4.266958713531494, | |
| "learning_rate": 4.127881896858934e-06, | |
| "loss": 0.0305, | |
| "num_input_tokens_seen": 4508128, | |
| "step": 648 | |
| }, | |
| { | |
| "epoch": 4.176991150442478, | |
| "grad_norm": 1.000532627105713, | |
| "learning_rate": 4.093559974371725e-06, | |
| "loss": 0.0092, | |
| "num_input_tokens_seen": 4515008, | |
| "step": 649 | |
| }, | |
| { | |
| "epoch": 4.18342719227675, | |
| "grad_norm": 1.1824270486831665, | |
| "learning_rate": 4.058724504646834e-06, | |
| "loss": 0.0223, | |
| "num_input_tokens_seen": 4521920, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 4.189863234111022, | |
| "grad_norm": 2.444427728652954, | |
| "learning_rate": 4.023386713907021e-06, | |
| "loss": 0.0234, | |
| "num_input_tokens_seen": 4528912, | |
| "step": 651 | |
| }, | |
| { | |
| "epoch": 4.196299275945294, | |
| "grad_norm": 1.421184778213501, | |
| "learning_rate": 3.987557990255093e-06, | |
| "loss": 0.0185, | |
| "num_input_tokens_seen": 4535664, | |
| "step": 652 | |
| }, | |
| { | |
| "epoch": 4.202735317779566, | |
| "grad_norm": 0.9019869565963745, | |
| "learning_rate": 3.951249880003934e-06, | |
| "loss": 0.0075, | |
| "num_input_tokens_seen": 4542832, | |
| "step": 653 | |
| }, | |
| { | |
| "epoch": 4.209171359613838, | |
| "grad_norm": 1.7373372316360474, | |
| "learning_rate": 3.914474083955537e-06, | |
| "loss": 0.0217, | |
| "num_input_tokens_seen": 4549552, | |
| "step": 654 | |
| }, | |
| { | |
| "epoch": 4.21560740144811, | |
| "grad_norm": 0.31386592984199524, | |
| "learning_rate": 3.8772424536302565e-06, | |
| "loss": 0.0027, | |
| "num_input_tokens_seen": 4556192, | |
| "step": 655 | |
| }, | |
| { | |
| "epoch": 4.222043443282382, | |
| "grad_norm": 1.8379613161087036, | |
| "learning_rate": 3.839566987447492e-06, | |
| "loss": 0.0153, | |
| "num_input_tokens_seen": 4563168, | |
| "step": 656 | |
| }, | |
| { | |
| "epoch": 4.228479485116654, | |
| "grad_norm": 1.221056342124939, | |
| "learning_rate": 3.801459826859022e-06, | |
| "loss": 0.0092, | |
| "num_input_tokens_seen": 4570704, | |
| "step": 657 | |
| }, | |
| { | |
| "epoch": 4.2349155269509255, | |
| "grad_norm": 0.7823006510734558, | |
| "learning_rate": 3.7629332524362532e-06, | |
| "loss": 0.0082, | |
| "num_input_tokens_seen": 4578016, | |
| "step": 658 | |
| }, | |
| { | |
| "epoch": 4.2413515687851975, | |
| "grad_norm": 1.149715781211853, | |
| "learning_rate": 3.7239996799126315e-06, | |
| "loss": 0.0163, | |
| "num_input_tokens_seen": 4584896, | |
| "step": 659 | |
| }, | |
| { | |
| "epoch": 4.247787610619469, | |
| "grad_norm": 0.6069539189338684, | |
| "learning_rate": 3.684671656182497e-06, | |
| "loss": 0.0099, | |
| "num_input_tokens_seen": 4591984, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 4.254223652453741, | |
| "grad_norm": 2.427281141281128, | |
| "learning_rate": 3.644961855257669e-06, | |
| "loss": 0.0269, | |
| "num_input_tokens_seen": 4598656, | |
| "step": 661 | |
| }, | |
| { | |
| "epoch": 4.260659694288013, | |
| "grad_norm": 1.0770633220672607, | |
| "learning_rate": 3.6048830741830678e-06, | |
| "loss": 0.007, | |
| "num_input_tokens_seen": 4606032, | |
| "step": 662 | |
| }, | |
| { | |
| "epoch": 4.267095736122285, | |
| "grad_norm": 2.4310688972473145, | |
| "learning_rate": 3.564448228912682e-06, | |
| "loss": 0.0427, | |
| "num_input_tokens_seen": 4613056, | |
| "step": 663 | |
| }, | |
| { | |
| "epoch": 4.273531777956556, | |
| "grad_norm": 1.2328161001205444, | |
| "learning_rate": 3.523670350147227e-06, | |
| "loss": 0.0122, | |
| "num_input_tokens_seen": 4619776, | |
| "step": 664 | |
| }, | |
| { | |
| "epoch": 4.279967819790828, | |
| "grad_norm": 1.519998550415039, | |
| "learning_rate": 3.4825625791348093e-06, | |
| "loss": 0.0137, | |
| "num_input_tokens_seen": 4626240, | |
| "step": 665 | |
| }, | |
| { | |
| "epoch": 4.2864038616251, | |
| "grad_norm": 1.4114880561828613, | |
| "learning_rate": 3.44113816343598e-06, | |
| "loss": 0.02, | |
| "num_input_tokens_seen": 4633216, | |
| "step": 666 | |
| }, | |
| { | |
| "epoch": 4.292839903459372, | |
| "grad_norm": 1.4585809707641602, | |
| "learning_rate": 3.399410452654518e-06, | |
| "loss": 0.006, | |
| "num_input_tokens_seen": 4639856, | |
| "step": 667 | |
| }, | |
| { | |
| "epoch": 4.299275945293644, | |
| "grad_norm": 1.594936490058899, | |
| "learning_rate": 3.357392894135329e-06, | |
| "loss": 0.0085, | |
| "num_input_tokens_seen": 4646832, | |
| "step": 668 | |
| }, | |
| { | |
| "epoch": 4.305711987127916, | |
| "grad_norm": 2.5802690982818604, | |
| "learning_rate": 3.315099028630855e-06, | |
| "loss": 0.0112, | |
| "num_input_tokens_seen": 4653648, | |
| "step": 669 | |
| }, | |
| { | |
| "epoch": 4.312148028962188, | |
| "grad_norm": 1.3826483488082886, | |
| "learning_rate": 3.272542485937369e-06, | |
| "loss": 0.0131, | |
| "num_input_tokens_seen": 4660672, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 4.31858407079646, | |
| "grad_norm": 2.1874148845672607, | |
| "learning_rate": 3.229736980502584e-06, | |
| "loss": 0.0124, | |
| "num_input_tokens_seen": 4667888, | |
| "step": 671 | |
| }, | |
| { | |
| "epoch": 4.325020112630732, | |
| "grad_norm": 1.61604642868042, | |
| "learning_rate": 3.186696307005976e-06, | |
| "loss": 0.0042, | |
| "num_input_tokens_seen": 4675072, | |
| "step": 672 | |
| }, | |
| { | |
| "epoch": 4.331456154465004, | |
| "grad_norm": 0.40999871492385864, | |
| "learning_rate": 3.1434343359132565e-06, | |
| "loss": 0.0011, | |
| "num_input_tokens_seen": 4682016, | |
| "step": 673 | |
| }, | |
| { | |
| "epoch": 4.337892196299276, | |
| "grad_norm": 0.1305094212293625, | |
| "learning_rate": 3.099965009006415e-06, | |
| "loss": 0.0008, | |
| "num_input_tokens_seen": 4688912, | |
| "step": 674 | |
| }, | |
| { | |
| "epoch": 4.3443282381335475, | |
| "grad_norm": 1.6623185873031616, | |
| "learning_rate": 3.056302334890786e-06, | |
| "loss": 0.0056, | |
| "num_input_tokens_seen": 4695936, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 4.3507642799678194, | |
| "grad_norm": 1.034837007522583, | |
| "learning_rate": 3.0124603844805767e-06, | |
| "loss": 0.0079, | |
| "num_input_tokens_seen": 4703184, | |
| "step": 676 | |
| }, | |
| { | |
| "epoch": 4.357200321802091, | |
| "grad_norm": 2.2049107551574707, | |
| "learning_rate": 2.9684532864643123e-06, | |
| "loss": 0.0216, | |
| "num_input_tokens_seen": 4710064, | |
| "step": 677 | |
| }, | |
| { | |
| "epoch": 4.363636363636363, | |
| "grad_norm": 4.32258939743042, | |
| "learning_rate": 2.9242952227516726e-06, | |
| "loss": 0.0258, | |
| "num_input_tokens_seen": 4716336, | |
| "step": 678 | |
| }, | |
| { | |
| "epoch": 4.370072405470635, | |
| "grad_norm": 1.0949031114578247, | |
| "learning_rate": 2.8800004239031687e-06, | |
| "loss": 0.0049, | |
| "num_input_tokens_seen": 4723360, | |
| "step": 679 | |
| }, | |
| { | |
| "epoch": 4.376508447304907, | |
| "grad_norm": 1.563004493713379, | |
| "learning_rate": 2.835583164544139e-06, | |
| "loss": 0.0034, | |
| "num_input_tokens_seen": 4730464, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 4.382944489139179, | |
| "grad_norm": 2.775270938873291, | |
| "learning_rate": 2.791057758764557e-06, | |
| "loss": 0.0341, | |
| "num_input_tokens_seen": 4737056, | |
| "step": 681 | |
| }, | |
| { | |
| "epoch": 4.389380530973451, | |
| "grad_norm": 3.1517560482025146, | |
| "learning_rate": 2.7464385555061092e-06, | |
| "loss": 0.0074, | |
| "num_input_tokens_seen": 4743936, | |
| "step": 682 | |
| }, | |
| { | |
| "epoch": 4.395816572807723, | |
| "grad_norm": 1.2521913051605225, | |
| "learning_rate": 2.7017399339380435e-06, | |
| "loss": 0.0272, | |
| "num_input_tokens_seen": 4751024, | |
| "step": 683 | |
| }, | |
| { | |
| "epoch": 4.402252614641995, | |
| "grad_norm": 3.4706435203552246, | |
| "learning_rate": 2.6569762988232838e-06, | |
| "loss": 0.0168, | |
| "num_input_tokens_seen": 4758000, | |
| "step": 684 | |
| }, | |
| { | |
| "epoch": 4.408688656476267, | |
| "grad_norm": 0.8021034598350525, | |
| "learning_rate": 2.6121620758762877e-06, | |
| "loss": 0.0047, | |
| "num_input_tokens_seen": 4764816, | |
| "step": 685 | |
| }, | |
| { | |
| "epoch": 4.415124698310539, | |
| "grad_norm": 4.709753036499023, | |
| "learning_rate": 2.5673117071141574e-06, | |
| "loss": 0.0198, | |
| "num_input_tokens_seen": 4772144, | |
| "step": 686 | |
| }, | |
| { | |
| "epoch": 4.421560740144811, | |
| "grad_norm": 0.40973323583602905, | |
| "learning_rate": 2.522439646202495e-06, | |
| "loss": 0.0012, | |
| "num_input_tokens_seen": 4778960, | |
| "step": 687 | |
| }, | |
| { | |
| "epoch": 4.427996781979083, | |
| "grad_norm": 3.179236888885498, | |
| "learning_rate": 2.4775603537975055e-06, | |
| "loss": 0.0256, | |
| "num_input_tokens_seen": 4785952, | |
| "step": 688 | |
| }, | |
| { | |
| "epoch": 4.434432823813355, | |
| "grad_norm": 2.5204341411590576, | |
| "learning_rate": 2.4326882928858435e-06, | |
| "loss": 0.0187, | |
| "num_input_tokens_seen": 4792608, | |
| "step": 689 | |
| }, | |
| { | |
| "epoch": 4.4408688656476265, | |
| "grad_norm": 3.6536998748779297, | |
| "learning_rate": 2.3878379241237136e-06, | |
| "loss": 0.0135, | |
| "num_input_tokens_seen": 4799232, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 4.447304907481898, | |
| "grad_norm": 1.0689839124679565, | |
| "learning_rate": 2.3430237011767166e-06, | |
| "loss": 0.0036, | |
| "num_input_tokens_seen": 4806080, | |
| "step": 691 | |
| }, | |
| { | |
| "epoch": 4.45374094931617, | |
| "grad_norm": 2.071629762649536, | |
| "learning_rate": 2.2982600660619574e-06, | |
| "loss": 0.0135, | |
| "num_input_tokens_seen": 4813728, | |
| "step": 692 | |
| }, | |
| { | |
| "epoch": 4.460176991150442, | |
| "grad_norm": 3.4168224334716797, | |
| "learning_rate": 2.253561444493891e-06, | |
| "loss": 0.0046, | |
| "num_input_tokens_seen": 4820608, | |
| "step": 693 | |
| }, | |
| { | |
| "epoch": 4.466613032984714, | |
| "grad_norm": 0.3058677017688751, | |
| "learning_rate": 2.2089422412354434e-06, | |
| "loss": 0.0019, | |
| "num_input_tokens_seen": 4827056, | |
| "step": 694 | |
| }, | |
| { | |
| "epoch": 4.473049074818986, | |
| "grad_norm": 0.4175882935523987, | |
| "learning_rate": 2.1644168354558623e-06, | |
| "loss": 0.0022, | |
| "num_input_tokens_seen": 4834080, | |
| "step": 695 | |
| }, | |
| { | |
| "epoch": 4.479485116653258, | |
| "grad_norm": 0.7226863503456116, | |
| "learning_rate": 2.119999576096832e-06, | |
| "loss": 0.0093, | |
| "num_input_tokens_seen": 4840912, | |
| "step": 696 | |
| }, | |
| { | |
| "epoch": 4.48592115848753, | |
| "grad_norm": 0.1190720871090889, | |
| "learning_rate": 2.0757047772483278e-06, | |
| "loss": 0.0012, | |
| "num_input_tokens_seen": 4848112, | |
| "step": 697 | |
| }, | |
| { | |
| "epoch": 4.492357200321802, | |
| "grad_norm": 1.0061287879943848, | |
| "learning_rate": 2.031546713535688e-06, | |
| "loss": 0.0036, | |
| "num_input_tokens_seen": 4855072, | |
| "step": 698 | |
| }, | |
| { | |
| "epoch": 4.498793242156074, | |
| "grad_norm": 0.9472126364707947, | |
| "learning_rate": 1.987539615519424e-06, | |
| "loss": 0.0071, | |
| "num_input_tokens_seen": 4862064, | |
| "step": 699 | |
| }, | |
| { | |
| "epoch": 4.505229283990346, | |
| "grad_norm": 0.8338857889175415, | |
| "learning_rate": 1.9436976651092143e-06, | |
| "loss": 0.0055, | |
| "num_input_tokens_seen": 4869104, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 4.511665325824618, | |
| "grad_norm": 3.2061474323272705, | |
| "learning_rate": 1.9000349909935852e-06, | |
| "loss": 0.0291, | |
| "num_input_tokens_seen": 4876112, | |
| "step": 701 | |
| }, | |
| { | |
| "epoch": 4.51810136765889, | |
| "grad_norm": 3.644125461578369, | |
| "learning_rate": 1.8565656640867448e-06, | |
| "loss": 0.0407, | |
| "num_input_tokens_seen": 4883264, | |
| "step": 702 | |
| }, | |
| { | |
| "epoch": 4.524537409493162, | |
| "grad_norm": 2.2370316982269287, | |
| "learning_rate": 1.813303692994025e-06, | |
| "loss": 0.0245, | |
| "num_input_tokens_seen": 4890192, | |
| "step": 703 | |
| }, | |
| { | |
| "epoch": 4.530973451327434, | |
| "grad_norm": 3.3120510578155518, | |
| "learning_rate": 1.770263019497417e-06, | |
| "loss": 0.0207, | |
| "num_input_tokens_seen": 4897200, | |
| "step": 704 | |
| }, | |
| { | |
| "epoch": 4.5374094931617055, | |
| "grad_norm": 1.256335973739624, | |
| "learning_rate": 1.7274575140626318e-06, | |
| "loss": 0.0269, | |
| "num_input_tokens_seen": 4904016, | |
| "step": 705 | |
| }, | |
| { | |
| "epoch": 4.543845534995977, | |
| "grad_norm": 0.10977872461080551, | |
| "learning_rate": 1.6849009713691456e-06, | |
| "loss": 0.001, | |
| "num_input_tokens_seen": 4910944, | |
| "step": 706 | |
| }, | |
| { | |
| "epoch": 4.550281576830249, | |
| "grad_norm": 1.9825077056884766, | |
| "learning_rate": 1.6426071058646718e-06, | |
| "loss": 0.0205, | |
| "num_input_tokens_seen": 4917424, | |
| "step": 707 | |
| }, | |
| { | |
| "epoch": 4.556717618664521, | |
| "grad_norm": 0.7529383897781372, | |
| "learning_rate": 1.6005895473454836e-06, | |
| "loss": 0.0148, | |
| "num_input_tokens_seen": 4924288, | |
| "step": 708 | |
| }, | |
| { | |
| "epoch": 4.563153660498793, | |
| "grad_norm": 2.29215145111084, | |
| "learning_rate": 1.55886183656402e-06, | |
| "loss": 0.0239, | |
| "num_input_tokens_seen": 4931040, | |
| "step": 709 | |
| }, | |
| { | |
| "epoch": 4.569589702333065, | |
| "grad_norm": 1.639636754989624, | |
| "learning_rate": 1.5174374208651913e-06, | |
| "loss": 0.0165, | |
| "num_input_tokens_seen": 4937968, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 4.576025744167337, | |
| "grad_norm": 1.8043317794799805, | |
| "learning_rate": 1.4763296498527744e-06, | |
| "loss": 0.0079, | |
| "num_input_tokens_seen": 4945456, | |
| "step": 711 | |
| }, | |
| { | |
| "epoch": 4.582461786001609, | |
| "grad_norm": 1.8007737398147583, | |
| "learning_rate": 1.4355517710873184e-06, | |
| "loss": 0.0338, | |
| "num_input_tokens_seen": 4952080, | |
| "step": 712 | |
| }, | |
| { | |
| "epoch": 4.588897827835881, | |
| "grad_norm": 0.6810876131057739, | |
| "learning_rate": 1.395116925816934e-06, | |
| "loss": 0.0136, | |
| "num_input_tokens_seen": 4958944, | |
| "step": 713 | |
| }, | |
| { | |
| "epoch": 4.595333869670153, | |
| "grad_norm": 1.0080180168151855, | |
| "learning_rate": 1.3550381447423317e-06, | |
| "loss": 0.0126, | |
| "num_input_tokens_seen": 4966320, | |
| "step": 714 | |
| }, | |
| { | |
| "epoch": 4.601769911504425, | |
| "grad_norm": 1.1210750341415405, | |
| "learning_rate": 1.3153283438175036e-06, | |
| "loss": 0.0174, | |
| "num_input_tokens_seen": 4973344, | |
| "step": 715 | |
| }, | |
| { | |
| "epoch": 4.608205953338697, | |
| "grad_norm": 2.2793147563934326, | |
| "learning_rate": 1.27600032008737e-06, | |
| "loss": 0.0155, | |
| "num_input_tokens_seen": 4980304, | |
| "step": 716 | |
| }, | |
| { | |
| "epoch": 4.614641995172969, | |
| "grad_norm": 2.0746471881866455, | |
| "learning_rate": 1.2370667475637474e-06, | |
| "loss": 0.0349, | |
| "num_input_tokens_seen": 4987616, | |
| "step": 717 | |
| }, | |
| { | |
| "epoch": 4.621078037007241, | |
| "grad_norm": 1.9974377155303955, | |
| "learning_rate": 1.1985401731409793e-06, | |
| "loss": 0.0082, | |
| "num_input_tokens_seen": 4994656, | |
| "step": 718 | |
| }, | |
| { | |
| "epoch": 4.627514078841513, | |
| "grad_norm": 0.9225305914878845, | |
| "learning_rate": 1.160433012552508e-06, | |
| "loss": 0.0204, | |
| "num_input_tokens_seen": 5001776, | |
| "step": 719 | |
| }, | |
| { | |
| "epoch": 4.6339501206757845, | |
| "grad_norm": 0.6030845642089844, | |
| "learning_rate": 1.122757546369744e-06, | |
| "loss": 0.0074, | |
| "num_input_tokens_seen": 5008688, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 4.640386162510056, | |
| "grad_norm": 1.1969950199127197, | |
| "learning_rate": 1.085525916044464e-06, | |
| "loss": 0.0154, | |
| "num_input_tokens_seen": 5015680, | |
| "step": 721 | |
| }, | |
| { | |
| "epoch": 4.646822204344328, | |
| "grad_norm": 1.7312675714492798, | |
| "learning_rate": 1.048750119996066e-06, | |
| "loss": 0.0101, | |
| "num_input_tokens_seen": 5022336, | |
| "step": 722 | |
| }, | |
| { | |
| "epoch": 4.6532582461786, | |
| "grad_norm": 0.9403418898582458, | |
| "learning_rate": 1.0124420097449077e-06, | |
| "loss": 0.0107, | |
| "num_input_tokens_seen": 5029184, | |
| "step": 723 | |
| }, | |
| { | |
| "epoch": 4.659694288012872, | |
| "grad_norm": 2.2545931339263916, | |
| "learning_rate": 9.7661328609298e-07, | |
| "loss": 0.0279, | |
| "num_input_tokens_seen": 5036000, | |
| "step": 724 | |
| }, | |
| { | |
| "epoch": 4.666130329847144, | |
| "grad_norm": 0.5637010931968689, | |
| "learning_rate": 9.412754953531664e-07, | |
| "loss": 0.0044, | |
| "num_input_tokens_seen": 5042944, | |
| "step": 725 | |
| }, | |
| { | |
| "epoch": 4.672566371681416, | |
| "grad_norm": 0.24136967957019806, | |
| "learning_rate": 9.064400256282757e-07, | |
| "loss": 0.0021, | |
| "num_input_tokens_seen": 5049840, | |
| "step": 726 | |
| }, | |
| { | |
| "epoch": 4.679002413515688, | |
| "grad_norm": 1.0340116024017334, | |
| "learning_rate": 8.721181031410661e-07, | |
| "loss": 0.0086, | |
| "num_input_tokens_seen": 5057296, | |
| "step": 727 | |
| }, | |
| { | |
| "epoch": 4.68543845534996, | |
| "grad_norm": 0.548861026763916, | |
| "learning_rate": 8.383207886164366e-07, | |
| "loss": 0.005, | |
| "num_input_tokens_seen": 5064560, | |
| "step": 728 | |
| }, | |
| { | |
| "epoch": 4.691874497184232, | |
| "grad_norm": 1.089135766029358, | |
| "learning_rate": 8.050589737169485e-07, | |
| "loss": 0.0096, | |
| "num_input_tokens_seen": 5071472, | |
| "step": 729 | |
| }, | |
| { | |
| "epoch": 4.698310539018504, | |
| "grad_norm": 0.3106631636619568, | |
| "learning_rate": 7.723433775328385e-07, | |
| "loss": 0.0029, | |
| "num_input_tokens_seen": 5078512, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 4.704746580852776, | |
| "grad_norm": 1.3499066829681396, | |
| "learning_rate": 7.401845431276378e-07, | |
| "loss": 0.0082, | |
| "num_input_tokens_seen": 5085248, | |
| "step": 731 | |
| }, | |
| { | |
| "epoch": 4.711182622687048, | |
| "grad_norm": 0.30332618951797485, | |
| "learning_rate": 7.085928341405193e-07, | |
| "loss": 0.0033, | |
| "num_input_tokens_seen": 5092160, | |
| "step": 732 | |
| }, | |
| { | |
| "epoch": 4.71761866452132, | |
| "grad_norm": 0.7549375295639038, | |
| "learning_rate": 6.775784314464717e-07, | |
| "loss": 0.0253, | |
| "num_input_tokens_seen": 5099360, | |
| "step": 733 | |
| }, | |
| { | |
| "epoch": 4.7240547063555915, | |
| "grad_norm": 1.567395567893982, | |
| "learning_rate": 6.471513298753634e-07, | |
| "loss": 0.0117, | |
| "num_input_tokens_seen": 5106160, | |
| "step": 734 | |
| }, | |
| { | |
| "epoch": 4.7304907481898635, | |
| "grad_norm": 1.192610502243042, | |
| "learning_rate": 6.17321334990973e-07, | |
| "loss": 0.0052, | |
| "num_input_tokens_seen": 5113264, | |
| "step": 735 | |
| }, | |
| { | |
| "epoch": 4.736926790024135, | |
| "grad_norm": 3.9402077198028564, | |
| "learning_rate": 5.880980599310041e-07, | |
| "loss": 0.0305, | |
| "num_input_tokens_seen": 5120032, | |
| "step": 736 | |
| }, | |
| { | |
| "epoch": 4.743362831858407, | |
| "grad_norm": 0.3623356223106384, | |
| "learning_rate": 5.59490922309118e-07, | |
| "loss": 0.0018, | |
| "num_input_tokens_seen": 5127280, | |
| "step": 737 | |
| }, | |
| { | |
| "epoch": 4.749798873692679, | |
| "grad_norm": 0.815592885017395, | |
| "learning_rate": 5.3150914117997e-07, | |
| "loss": 0.0066, | |
| "num_input_tokens_seen": 5134400, | |
| "step": 738 | |
| }, | |
| { | |
| "epoch": 4.756234915526951, | |
| "grad_norm": 0.4423564076423645, | |
| "learning_rate": 5.041617340682467e-07, | |
| "loss": 0.0032, | |
| "num_input_tokens_seen": 5141488, | |
| "step": 739 | |
| }, | |
| { | |
| "epoch": 4.762670957361223, | |
| "grad_norm": 0.5768114924430847, | |
| "learning_rate": 4.774575140626317e-07, | |
| "loss": 0.0089, | |
| "num_input_tokens_seen": 5148432, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 4.769106999195495, | |
| "grad_norm": 1.2286343574523926, | |
| "learning_rate": 4.514050869756703e-07, | |
| "loss": 0.0124, | |
| "num_input_tokens_seen": 5155328, | |
| "step": 741 | |
| }, | |
| { | |
| "epoch": 4.775543041029767, | |
| "grad_norm": 0.552872359752655, | |
| "learning_rate": 4.2601284857042263e-07, | |
| "loss": 0.0022, | |
| "num_input_tokens_seen": 5163008, | |
| "step": 742 | |
| }, | |
| { | |
| "epoch": 4.781979082864039, | |
| "grad_norm": 0.6165493726730347, | |
| "learning_rate": 4.012889818548069e-07, | |
| "loss": 0.0063, | |
| "num_input_tokens_seen": 5170096, | |
| "step": 743 | |
| }, | |
| { | |
| "epoch": 4.788415124698311, | |
| "grad_norm": 1.1403653621673584, | |
| "learning_rate": 3.772414544445163e-07, | |
| "loss": 0.0149, | |
| "num_input_tokens_seen": 5177536, | |
| "step": 744 | |
| }, | |
| { | |
| "epoch": 4.794851166532583, | |
| "grad_norm": 0.1795167326927185, | |
| "learning_rate": 3.538780159953348e-07, | |
| "loss": 0.0012, | |
| "num_input_tokens_seen": 5184608, | |
| "step": 745 | |
| }, | |
| { | |
| "epoch": 4.801287208366855, | |
| "grad_norm": 0.9326004981994629, | |
| "learning_rate": 3.312061957057061e-07, | |
| "loss": 0.0127, | |
| "num_input_tokens_seen": 5191344, | |
| "step": 746 | |
| }, | |
| { | |
| "epoch": 4.807723250201127, | |
| "grad_norm": 0.41363996267318726, | |
| "learning_rate": 3.092332998903416e-07, | |
| "loss": 0.0018, | |
| "num_input_tokens_seen": 5198416, | |
| "step": 747 | |
| }, | |
| { | |
| "epoch": 4.814159292035399, | |
| "grad_norm": 0.538027286529541, | |
| "learning_rate": 2.8796640962565374e-07, | |
| "loss": 0.0034, | |
| "num_input_tokens_seen": 5205392, | |
| "step": 748 | |
| }, | |
| { | |
| "epoch": 4.8205953338696705, | |
| "grad_norm": 1.531555414199829, | |
| "learning_rate": 2.674123784677868e-07, | |
| "loss": 0.0137, | |
| "num_input_tokens_seen": 5213216, | |
| "step": 749 | |
| }, | |
| { | |
| "epoch": 4.8270313757039425, | |
| "grad_norm": 1.671035647392273, | |
| "learning_rate": 2.4757783024395244e-07, | |
| "loss": 0.0219, | |
| "num_input_tokens_seen": 5220032, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 4.833467417538214, | |
| "grad_norm": 0.30722492933273315, | |
| "learning_rate": 2.284691569178138e-07, | |
| "loss": 0.0014, | |
| "num_input_tokens_seen": 5226816, | |
| "step": 751 | |
| }, | |
| { | |
| "epoch": 4.839903459372486, | |
| "grad_norm": 1.3107943534851074, | |
| "learning_rate": 2.100925165295839e-07, | |
| "loss": 0.019, | |
| "num_input_tokens_seen": 5233920, | |
| "step": 752 | |
| }, | |
| { | |
| "epoch": 4.846339501206758, | |
| "grad_norm": 2.1163885593414307, | |
| "learning_rate": 1.9245383121150678e-07, | |
| "loss": 0.0075, | |
| "num_input_tokens_seen": 5241344, | |
| "step": 753 | |
| }, | |
| { | |
| "epoch": 4.85277554304103, | |
| "grad_norm": 1.2636387348175049, | |
| "learning_rate": 1.7555878527937164e-07, | |
| "loss": 0.0078, | |
| "num_input_tokens_seen": 5248256, | |
| "step": 754 | |
| }, | |
| { | |
| "epoch": 4.859211584875302, | |
| "grad_norm": 4.166254997253418, | |
| "learning_rate": 1.59412823400657e-07, | |
| "loss": 0.0244, | |
| "num_input_tokens_seen": 5255248, | |
| "step": 755 | |
| }, | |
| { | |
| "epoch": 4.865647626709574, | |
| "grad_norm": 1.078273892402649, | |
| "learning_rate": 1.4402114883991318e-07, | |
| "loss": 0.0218, | |
| "num_input_tokens_seen": 5262048, | |
| "step": 756 | |
| }, | |
| { | |
| "epoch": 4.872083668543846, | |
| "grad_norm": 2.091312885284424, | |
| "learning_rate": 1.2938872178193395e-07, | |
| "loss": 0.0044, | |
| "num_input_tokens_seen": 5268848, | |
| "step": 757 | |
| }, | |
| { | |
| "epoch": 4.878519710378118, | |
| "grad_norm": 1.7236751317977905, | |
| "learning_rate": 1.1552025773327008e-07, | |
| "loss": 0.0122, | |
| "num_input_tokens_seen": 5275664, | |
| "step": 758 | |
| }, | |
| { | |
| "epoch": 4.88495575221239, | |
| "grad_norm": 0.9874201416969299, | |
| "learning_rate": 1.0242022600258611e-07, | |
| "loss": 0.007, | |
| "num_input_tokens_seen": 5282112, | |
| "step": 759 | |
| }, | |
| { | |
| "epoch": 4.891391794046662, | |
| "grad_norm": 0.6303602457046509, | |
| "learning_rate": 9.00928482603669e-08, | |
| "loss": 0.0019, | |
| "num_input_tokens_seen": 5288912, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 4.897827835880933, | |
| "grad_norm": 0.7971038818359375, | |
| "learning_rate": 7.854209717842231e-08, | |
| "loss": 0.0147, | |
| "num_input_tokens_seen": 5295920, | |
| "step": 761 | |
| }, | |
| { | |
| "epoch": 4.904263877715205, | |
| "grad_norm": 1.0757670402526855, | |
| "learning_rate": 6.777169514963766e-08, | |
| "loss": 0.0087, | |
| "num_input_tokens_seen": 5302816, | |
| "step": 762 | |
| }, | |
| { | |
| "epoch": 4.910699919549477, | |
| "grad_norm": 1.8044992685317993, | |
| "learning_rate": 5.778511308838108e-08, | |
| "loss": 0.0085, | |
| "num_input_tokens_seen": 5309680, | |
| "step": 763 | |
| }, | |
| { | |
| "epoch": 4.917135961383749, | |
| "grad_norm": 0.3801545202732086, | |
| "learning_rate": 4.8585569311949966e-08, | |
| "loss": 0.0026, | |
| "num_input_tokens_seen": 5316848, | |
| "step": 764 | |
| }, | |
| { | |
| "epoch": 4.923572003218021, | |
| "grad_norm": 0.20918627083301544, | |
| "learning_rate": 4.017602850342584e-08, | |
| "loss": 0.0018, | |
| "num_input_tokens_seen": 5323760, | |
| "step": 765 | |
| }, | |
| { | |
| "epoch": 4.9300080450522925, | |
| "grad_norm": 2.037950277328491, | |
| "learning_rate": 3.2559200756260845e-08, | |
| "loss": 0.0072, | |
| "num_input_tokens_seen": 5330336, | |
| "step": 766 | |
| }, | |
| { | |
| "epoch": 4.936444086886564, | |
| "grad_norm": 0.8903030753135681, | |
| "learning_rate": 2.5737540700912777e-08, | |
| "loss": 0.0079, | |
| "num_input_tokens_seen": 5336816, | |
| "step": 767 | |
| }, | |
| { | |
| "epoch": 4.942880128720836, | |
| "grad_norm": 1.0508862733840942, | |
| "learning_rate": 1.9713246713805588e-08, | |
| "loss": 0.0275, | |
| "num_input_tokens_seen": 5344064, | |
| "step": 768 | |
| }, | |
| { | |
| "epoch": 4.949316170555108, | |
| "grad_norm": 1.0068142414093018, | |
| "learning_rate": 1.4488260208871397e-08, | |
| "loss": 0.0036, | |
| "num_input_tokens_seen": 5351328, | |
| "step": 769 | |
| }, | |
| { | |
| "epoch": 4.95575221238938, | |
| "grad_norm": 1.5033273696899414, | |
| "learning_rate": 1.006426501190233e-08, | |
| "loss": 0.0501, | |
| "num_input_tokens_seen": 5358672, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 4.962188254223652, | |
| "grad_norm": 0.667352557182312, | |
| "learning_rate": 6.442686817914878e-09, | |
| "loss": 0.0082, | |
| "num_input_tokens_seen": 5365648, | |
| "step": 771 | |
| }, | |
| { | |
| "epoch": 4.968624296057924, | |
| "grad_norm": 0.9037322998046875, | |
| "learning_rate": 3.6246927316976875e-09, | |
| "loss": 0.0032, | |
| "num_input_tokens_seen": 5372432, | |
| "step": 772 | |
| }, | |
| { | |
| "epoch": 4.975060337892196, | |
| "grad_norm": 0.3071233630180359, | |
| "learning_rate": 1.6111908916965902e-09, | |
| "loss": 0.0017, | |
| "num_input_tokens_seen": 5379648, | |
| "step": 773 | |
| }, | |
| { | |
| "epoch": 4.981496379726468, | |
| "grad_norm": 0.7171315550804138, | |
| "learning_rate": 4.0283017735454066e-10, | |
| "loss": 0.0042, | |
| "num_input_tokens_seen": 5386864, | |
| "step": 774 | |
| }, | |
| { | |
| "epoch": 4.98793242156074, | |
| "grad_norm": 2.855295181274414, | |
| "learning_rate": 0.0, | |
| "loss": 0.0176, | |
| "num_input_tokens_seen": 5393616, | |
| "step": 775 | |
| }, | |
| { | |
| "epoch": 4.98793242156074, | |
| "num_input_tokens_seen": 5393616, | |
| "step": 775, | |
| "total_flos": 2.1382484588285133e+17, | |
| "train_loss": 0.5414434323177463, | |
| "train_runtime": 8640.8816, | |
| "train_samples_per_second": 11.503, | |
| "train_steps_per_second": 0.09 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 775, | |
| "num_input_tokens_seen": 5393616, | |
| "num_train_epochs": 5, | |
| "save_steps": 1000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.1382484588285133e+17, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |