| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 4.972704714640199, | |
| "eval_steps": 500, | |
| "global_step": 755, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.006617038875103391, | |
| "grad_norm": 34.37208740591342, | |
| "learning_rate": 4.999978357111178e-05, | |
| "loss": 2.4934, | |
| "num_input_tokens_seen": 262144, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.013234077750206782, | |
| "grad_norm": 45.9821682566152, | |
| "learning_rate": 4.9999134288194436e-05, | |
| "loss": 3.8516, | |
| "num_input_tokens_seen": 524288, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.019851116625310174, | |
| "grad_norm": 39.177129424978055, | |
| "learning_rate": 4.9998052162489854e-05, | |
| "loss": 3.5826, | |
| "num_input_tokens_seen": 786432, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.026468155500413565, | |
| "grad_norm": 240.13346858978267, | |
| "learning_rate": 4.999653721273429e-05, | |
| "loss": 9.4935, | |
| "num_input_tokens_seen": 1048576, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.033085194375516956, | |
| "grad_norm": 32.940086480387635, | |
| "learning_rate": 4.999458946515808e-05, | |
| "loss": 3.9766, | |
| "num_input_tokens_seen": 1310720, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.03970223325062035, | |
| "grad_norm": 133.3545454549638, | |
| "learning_rate": 4.99922089534851e-05, | |
| "loss": 4.7751, | |
| "num_input_tokens_seen": 1572864, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.04631927212572374, | |
| "grad_norm": 27.309262699099058, | |
| "learning_rate": 4.998939571893228e-05, | |
| "loss": 3.1784, | |
| "num_input_tokens_seen": 1835008, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.05293631100082713, | |
| "grad_norm": 7.830626860343412, | |
| "learning_rate": 4.998614981020884e-05, | |
| "loss": 2.5119, | |
| "num_input_tokens_seen": 2097152, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.05955334987593052, | |
| "grad_norm": 8.048304569875182, | |
| "learning_rate": 4.998247128351545e-05, | |
| "loss": 2.2489, | |
| "num_input_tokens_seen": 2359296, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.06617038875103391, | |
| "grad_norm": 8.196172984809154, | |
| "learning_rate": 4.997836020254328e-05, | |
| "loss": 2.226, | |
| "num_input_tokens_seen": 2621440, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.07278742762613731, | |
| "grad_norm": 5.02594769824259, | |
| "learning_rate": 4.9973816638472846e-05, | |
| "loss": 2.0551, | |
| "num_input_tokens_seen": 2883584, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.0794044665012407, | |
| "grad_norm": 5.423582402121407, | |
| "learning_rate": 4.996884066997284e-05, | |
| "loss": 1.9894, | |
| "num_input_tokens_seen": 3145728, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.08602150537634409, | |
| "grad_norm": 3.9697486334940897, | |
| "learning_rate": 4.9963432383198726e-05, | |
| "loss": 1.9212, | |
| "num_input_tokens_seen": 3407872, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.09263854425144748, | |
| "grad_norm": 3.184049663288423, | |
| "learning_rate": 4.995759187179126e-05, | |
| "loss": 1.8776, | |
| "num_input_tokens_seen": 3670016, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.09925558312655088, | |
| "grad_norm": 2.1877083301154907, | |
| "learning_rate": 4.995131923687488e-05, | |
| "loss": 1.8142, | |
| "num_input_tokens_seen": 3932160, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.10587262200165426, | |
| "grad_norm": 3.193160483646996, | |
| "learning_rate": 4.9944614587055925e-05, | |
| "loss": 1.7548, | |
| "num_input_tokens_seen": 4194304, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.11248966087675766, | |
| "grad_norm": 4.417433823363654, | |
| "learning_rate": 4.993747803842081e-05, | |
| "loss": 1.8321, | |
| "num_input_tokens_seen": 4456448, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.11910669975186104, | |
| "grad_norm": 2.6131027548528114, | |
| "learning_rate": 4.992990971453394e-05, | |
| "loss": 1.7912, | |
| "num_input_tokens_seen": 4718592, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.12572373862696443, | |
| "grad_norm": 2.5605507836540236, | |
| "learning_rate": 4.9921909746435633e-05, | |
| "loss": 1.7313, | |
| "num_input_tokens_seen": 4980736, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.13234077750206782, | |
| "grad_norm": 1.8403308592664107, | |
| "learning_rate": 4.991347827263982e-05, | |
| "loss": 1.7471, | |
| "num_input_tokens_seen": 5242880, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.13895781637717122, | |
| "grad_norm": 1.9498430221428151, | |
| "learning_rate": 4.990461543913168e-05, | |
| "loss": 1.6706, | |
| "num_input_tokens_seen": 5505024, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.14557485525227462, | |
| "grad_norm": 1.8088575603375883, | |
| "learning_rate": 4.9895321399365044e-05, | |
| "loss": 1.6593, | |
| "num_input_tokens_seen": 5767168, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.152191894127378, | |
| "grad_norm": 1.7965239443436583, | |
| "learning_rate": 4.988559631425983e-05, | |
| "loss": 1.6705, | |
| "num_input_tokens_seen": 6029312, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.1588089330024814, | |
| "grad_norm": 3.13574184340746, | |
| "learning_rate": 4.987544035219917e-05, | |
| "loss": 1.6509, | |
| "num_input_tokens_seen": 6291456, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.1654259718775848, | |
| "grad_norm": 2.048821572892012, | |
| "learning_rate": 4.9864853689026556e-05, | |
| "loss": 1.6473, | |
| "num_input_tokens_seen": 6553600, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.17204301075268819, | |
| "grad_norm": 1.6289600767650514, | |
| "learning_rate": 4.985383650804277e-05, | |
| "loss": 1.6402, | |
| "num_input_tokens_seen": 6815744, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.17866004962779156, | |
| "grad_norm": 1.846599816589036, | |
| "learning_rate": 4.984238900000271e-05, | |
| "loss": 1.5957, | |
| "num_input_tokens_seen": 7077888, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.18527708850289495, | |
| "grad_norm": 2.49311076745654, | |
| "learning_rate": 4.983051136311209e-05, | |
| "loss": 1.6361, | |
| "num_input_tokens_seen": 7340032, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.19189412737799835, | |
| "grad_norm": 1.7775011755473744, | |
| "learning_rate": 4.9818203803024e-05, | |
| "loss": 1.6234, | |
| "num_input_tokens_seen": 7602176, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.19851116625310175, | |
| "grad_norm": 2.3264416263522576, | |
| "learning_rate": 4.9805466532835376e-05, | |
| "loss": 1.6288, | |
| "num_input_tokens_seen": 7864320, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.20512820512820512, | |
| "grad_norm": 1.8036645148064498, | |
| "learning_rate": 4.9792299773083276e-05, | |
| "loss": 1.6263, | |
| "num_input_tokens_seen": 8126464, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.21174524400330852, | |
| "grad_norm": 1.951872109802615, | |
| "learning_rate": 4.9778703751741076e-05, | |
| "loss": 1.5944, | |
| "num_input_tokens_seen": 8388608, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.21836228287841192, | |
| "grad_norm": 1.9171459318694717, | |
| "learning_rate": 4.9764678704214506e-05, | |
| "loss": 1.5671, | |
| "num_input_tokens_seen": 8650752, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.22497932175351532, | |
| "grad_norm": 1.8596046998788465, | |
| "learning_rate": 4.9750224873337605e-05, | |
| "loss": 1.5907, | |
| "num_input_tokens_seen": 8912896, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.23159636062861869, | |
| "grad_norm": 1.9357133650068838, | |
| "learning_rate": 4.973534250936851e-05, | |
| "loss": 1.5631, | |
| "num_input_tokens_seen": 9175040, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.23821339950372208, | |
| "grad_norm": 1.9002383695379652, | |
| "learning_rate": 4.9720031869985084e-05, | |
| "loss": 1.5915, | |
| "num_input_tokens_seen": 9437184, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.24483043837882548, | |
| "grad_norm": 1.6528688694936653, | |
| "learning_rate": 4.970429322028051e-05, | |
| "loss": 1.544, | |
| "num_input_tokens_seen": 9699328, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.25144747725392885, | |
| "grad_norm": 1.7280201764860956, | |
| "learning_rate": 4.968812683275866e-05, | |
| "loss": 1.5263, | |
| "num_input_tokens_seen": 9961472, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.25806451612903225, | |
| "grad_norm": 1.3933242558940806, | |
| "learning_rate": 4.96715329873294e-05, | |
| "loss": 1.5549, | |
| "num_input_tokens_seen": 10223616, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.26468155500413565, | |
| "grad_norm": 1.8069938406301924, | |
| "learning_rate": 4.965451197130373e-05, | |
| "loss": 1.5227, | |
| "num_input_tokens_seen": 10485760, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.27129859387923905, | |
| "grad_norm": 1.520117712776801, | |
| "learning_rate": 4.963706407938881e-05, | |
| "loss": 1.5338, | |
| "num_input_tokens_seen": 10747904, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.27791563275434245, | |
| "grad_norm": 1.9100553957972657, | |
| "learning_rate": 4.961918961368287e-05, | |
| "loss": 1.5387, | |
| "num_input_tokens_seen": 11010048, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.28453267162944584, | |
| "grad_norm": 1.7226630358900272, | |
| "learning_rate": 4.960088888366998e-05, | |
| "loss": 1.5439, | |
| "num_input_tokens_seen": 11272192, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.29114971050454924, | |
| "grad_norm": 1.3593181279067512, | |
| "learning_rate": 4.9582162206214654e-05, | |
| "loss": 1.5352, | |
| "num_input_tokens_seen": 11534336, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.2977667493796526, | |
| "grad_norm": 1.3032108941431264, | |
| "learning_rate": 4.956300990555643e-05, | |
| "loss": 1.5087, | |
| "num_input_tokens_seen": 11796480, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.304383788254756, | |
| "grad_norm": 1.6080688788534845, | |
| "learning_rate": 4.9543432313304184e-05, | |
| "loss": 1.4949, | |
| "num_input_tokens_seen": 12058624, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.3110008271298594, | |
| "grad_norm": 1.8537446327987028, | |
| "learning_rate": 4.9523429768430445e-05, | |
| "loss": 1.5097, | |
| "num_input_tokens_seen": 12320768, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.3176178660049628, | |
| "grad_norm": 1.5807168324755116, | |
| "learning_rate": 4.950300261726549e-05, | |
| "loss": 1.5391, | |
| "num_input_tokens_seen": 12582912, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.3242349048800662, | |
| "grad_norm": 1.4720318553915235, | |
| "learning_rate": 4.9482151213491386e-05, | |
| "loss": 1.5192, | |
| "num_input_tokens_seen": 12845056, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.3308519437551696, | |
| "grad_norm": 1.4230626865941303, | |
| "learning_rate": 4.9460875918135804e-05, | |
| "loss": 1.5129, | |
| "num_input_tokens_seen": 13107200, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.337468982630273, | |
| "grad_norm": 1.995697274136008, | |
| "learning_rate": 4.943917709956584e-05, | |
| "loss": 1.5243, | |
| "num_input_tokens_seen": 13369344, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.34408602150537637, | |
| "grad_norm": 1.2626400291619349, | |
| "learning_rate": 4.941705513348157e-05, | |
| "loss": 1.5031, | |
| "num_input_tokens_seen": 13631488, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.3507030603804797, | |
| "grad_norm": 1.3497492422606214, | |
| "learning_rate": 4.939451040290961e-05, | |
| "loss": 1.4925, | |
| "num_input_tokens_seen": 13893632, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.3573200992555831, | |
| "grad_norm": 1.593030123983225, | |
| "learning_rate": 4.937154329819644e-05, | |
| "loss": 1.474, | |
| "num_input_tokens_seen": 14155776, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.3639371381306865, | |
| "grad_norm": 1.45845743040763, | |
| "learning_rate": 4.934815421700165e-05, | |
| "loss": 1.4776, | |
| "num_input_tokens_seen": 14417920, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.3705541770057899, | |
| "grad_norm": 1.3404864487553547, | |
| "learning_rate": 4.932434356429106e-05, | |
| "loss": 1.4824, | |
| "num_input_tokens_seen": 14680064, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.3771712158808933, | |
| "grad_norm": 1.2551629875891903, | |
| "learning_rate": 4.930011175232973e-05, | |
| "loss": 1.5041, | |
| "num_input_tokens_seen": 14942208, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.3837882547559967, | |
| "grad_norm": 1.609822533873946, | |
| "learning_rate": 4.927545920067479e-05, | |
| "loss": 1.4685, | |
| "num_input_tokens_seen": 15204352, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.3904052936311001, | |
| "grad_norm": 1.4469457354277884, | |
| "learning_rate": 4.925038633616818e-05, | |
| "loss": 1.4517, | |
| "num_input_tokens_seen": 15466496, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.3970223325062035, | |
| "grad_norm": 1.6525439629778043, | |
| "learning_rate": 4.9224893592929275e-05, | |
| "loss": 1.4878, | |
| "num_input_tokens_seen": 15728640, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.40363937138130684, | |
| "grad_norm": 1.254692246634876, | |
| "learning_rate": 4.9198981412347364e-05, | |
| "loss": 1.4882, | |
| "num_input_tokens_seen": 15990784, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.41025641025641024, | |
| "grad_norm": 1.4543664783049992, | |
| "learning_rate": 4.917265024307401e-05, | |
| "loss": 1.452, | |
| "num_input_tokens_seen": 16252928, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.41687344913151364, | |
| "grad_norm": 1.660807543545703, | |
| "learning_rate": 4.914590054101526e-05, | |
| "loss": 1.4566, | |
| "num_input_tokens_seen": 16515072, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.42349048800661704, | |
| "grad_norm": 1.417113851349832, | |
| "learning_rate": 4.911873276932376e-05, | |
| "loss": 1.4624, | |
| "num_input_tokens_seen": 16777216, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.43010752688172044, | |
| "grad_norm": 1.2145243782914297, | |
| "learning_rate": 4.909114739839079e-05, | |
| "loss": 1.4775, | |
| "num_input_tokens_seen": 17039360, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.43672456575682383, | |
| "grad_norm": 1.3977545670249865, | |
| "learning_rate": 4.906314490583802e-05, | |
| "loss": 1.437, | |
| "num_input_tokens_seen": 17301504, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.44334160463192723, | |
| "grad_norm": 1.2151379240046773, | |
| "learning_rate": 4.903472577650934e-05, | |
| "loss": 1.4492, | |
| "num_input_tokens_seen": 17563648, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.44995864350703063, | |
| "grad_norm": 1.4902123461239007, | |
| "learning_rate": 4.900589050246237e-05, | |
| "loss": 1.4503, | |
| "num_input_tokens_seen": 17825792, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.456575682382134, | |
| "grad_norm": 1.2602633654476592, | |
| "learning_rate": 4.897663958296002e-05, | |
| "loss": 1.4489, | |
| "num_input_tokens_seen": 18087936, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.46319272125723737, | |
| "grad_norm": 1.3183809307307202, | |
| "learning_rate": 4.894697352446182e-05, | |
| "loss": 1.4577, | |
| "num_input_tokens_seen": 18350080, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.46980976013234077, | |
| "grad_norm": 1.527801009411038, | |
| "learning_rate": 4.891689284061513e-05, | |
| "loss": 1.4678, | |
| "num_input_tokens_seen": 18612224, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.47642679900744417, | |
| "grad_norm": 1.6249022564808302, | |
| "learning_rate": 4.888639805224626e-05, | |
| "loss": 1.4618, | |
| "num_input_tokens_seen": 18874368, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.48304383788254757, | |
| "grad_norm": 1.267461796577937, | |
| "learning_rate": 4.885548968735147e-05, | |
| "loss": 1.4576, | |
| "num_input_tokens_seen": 19136512, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.48966087675765096, | |
| "grad_norm": 1.7705838242670227, | |
| "learning_rate": 4.882416828108781e-05, | |
| "loss": 1.4191, | |
| "num_input_tokens_seen": 19398656, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.49627791563275436, | |
| "grad_norm": 1.5430488224612218, | |
| "learning_rate": 4.879243437576383e-05, | |
| "loss": 1.4178, | |
| "num_input_tokens_seen": 19660800, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.5028949545078577, | |
| "grad_norm": 1.1859905942576958, | |
| "learning_rate": 4.8760288520830254e-05, | |
| "loss": 1.4184, | |
| "num_input_tokens_seen": 19922944, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.5095119933829612, | |
| "grad_norm": 1.6712647868129495, | |
| "learning_rate": 4.8727731272870406e-05, | |
| "loss": 1.4604, | |
| "num_input_tokens_seen": 20185088, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.5161290322580645, | |
| "grad_norm": 1.2289801478873525, | |
| "learning_rate": 4.8694763195590606e-05, | |
| "loss": 1.4326, | |
| "num_input_tokens_seen": 20447232, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.522746071133168, | |
| "grad_norm": 1.4590483227044124, | |
| "learning_rate": 4.866138485981041e-05, | |
| "loss": 1.4106, | |
| "num_input_tokens_seen": 20709376, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.5293631100082713, | |
| "grad_norm": 1.2028857677920306, | |
| "learning_rate": 4.862759684345269e-05, | |
| "loss": 1.4125, | |
| "num_input_tokens_seen": 20971520, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.5359801488833746, | |
| "grad_norm": 1.0904585549604011, | |
| "learning_rate": 4.859339973153368e-05, | |
| "loss": 1.4316, | |
| "num_input_tokens_seen": 21233664, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.5425971877584781, | |
| "grad_norm": 1.3835096213926883, | |
| "learning_rate": 4.855879411615282e-05, | |
| "loss": 1.3964, | |
| "num_input_tokens_seen": 21495808, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.5492142266335814, | |
| "grad_norm": 1.3130396739637342, | |
| "learning_rate": 4.8523780596482475e-05, | |
| "loss": 1.3856, | |
| "num_input_tokens_seen": 21757952, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.5558312655086849, | |
| "grad_norm": 1.6210339400595701, | |
| "learning_rate": 4.848835977875764e-05, | |
| "loss": 1.443, | |
| "num_input_tokens_seen": 22020096, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.5624483043837882, | |
| "grad_norm": 1.3048900511501402, | |
| "learning_rate": 4.8452532276265364e-05, | |
| "loss": 1.3976, | |
| "num_input_tokens_seen": 22282240, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.5690653432588917, | |
| "grad_norm": 1.5253749617527992, | |
| "learning_rate": 4.8416298709334156e-05, | |
| "loss": 1.3844, | |
| "num_input_tokens_seen": 22544384, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.575682382133995, | |
| "grad_norm": 1.3133909248429987, | |
| "learning_rate": 4.837965970532328e-05, | |
| "loss": 1.4196, | |
| "num_input_tokens_seen": 22806528, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.5822994210090985, | |
| "grad_norm": 1.5372189546773116, | |
| "learning_rate": 4.8342615898611854e-05, | |
| "loss": 1.3837, | |
| "num_input_tokens_seen": 23068672, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.5889164598842018, | |
| "grad_norm": 1.6875441852021882, | |
| "learning_rate": 4.8305167930587844e-05, | |
| "loss": 1.3941, | |
| "num_input_tokens_seen": 23330816, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.5955334987593052, | |
| "grad_norm": 1.49140049772374, | |
| "learning_rate": 4.8267316449637054e-05, | |
| "loss": 1.4027, | |
| "num_input_tokens_seen": 23592960, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.6021505376344086, | |
| "grad_norm": 1.437082448499028, | |
| "learning_rate": 4.8229062111131764e-05, | |
| "loss": 1.4069, | |
| "num_input_tokens_seen": 23855104, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.608767576509512, | |
| "grad_norm": 1.4617250864457985, | |
| "learning_rate": 4.8190405577419506e-05, | |
| "loss": 1.3939, | |
| "num_input_tokens_seen": 24117248, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.6153846153846154, | |
| "grad_norm": 1.6286516280162249, | |
| "learning_rate": 4.815134751781153e-05, | |
| "loss": 1.3755, | |
| "num_input_tokens_seen": 24379392, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.6220016542597188, | |
| "grad_norm": 1.4334609780513905, | |
| "learning_rate": 4.8111888608571234e-05, | |
| "loss": 1.4233, | |
| "num_input_tokens_seen": 24641536, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.6286186931348222, | |
| "grad_norm": 1.2470313746006922, | |
| "learning_rate": 4.8072029532902426e-05, | |
| "loss": 1.3776, | |
| "num_input_tokens_seen": 24903680, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.6352357320099256, | |
| "grad_norm": 1.6301987833809242, | |
| "learning_rate": 4.803177098093757e-05, | |
| "loss": 1.3677, | |
| "num_input_tokens_seen": 25165824, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.6418527708850289, | |
| "grad_norm": 1.283546564124238, | |
| "learning_rate": 4.7991113649725734e-05, | |
| "loss": 1.3608, | |
| "num_input_tokens_seen": 25427968, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.6484698097601324, | |
| "grad_norm": 1.7514307612371371, | |
| "learning_rate": 4.795005824322061e-05, | |
| "loss": 1.3782, | |
| "num_input_tokens_seen": 25690112, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.6550868486352357, | |
| "grad_norm": 1.3349872986236062, | |
| "learning_rate": 4.7908605472268266e-05, | |
| "loss": 1.3891, | |
| "num_input_tokens_seen": 25952256, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.6617038875103392, | |
| "grad_norm": 1.6887963390015819, | |
| "learning_rate": 4.786675605459487e-05, | |
| "loss": 1.3962, | |
| "num_input_tokens_seen": 26214400, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.6683209263854425, | |
| "grad_norm": 1.3690085291694667, | |
| "learning_rate": 4.782451071479428e-05, | |
| "loss": 1.3793, | |
| "num_input_tokens_seen": 26476544, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 0.674937965260546, | |
| "grad_norm": 1.3438890649631892, | |
| "learning_rate": 4.7781870184315435e-05, | |
| "loss": 1.3795, | |
| "num_input_tokens_seen": 26738688, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.6815550041356493, | |
| "grad_norm": 1.4251453112744144, | |
| "learning_rate": 4.773883520144974e-05, | |
| "loss": 1.3813, | |
| "num_input_tokens_seen": 27000832, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 0.6881720430107527, | |
| "grad_norm": 1.3732536405102582, | |
| "learning_rate": 4.769540651131828e-05, | |
| "loss": 1.3582, | |
| "num_input_tokens_seen": 27262976, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 0.6947890818858561, | |
| "grad_norm": 1.1035631693243748, | |
| "learning_rate": 4.76515848658589e-05, | |
| "loss": 1.3708, | |
| "num_input_tokens_seen": 27525120, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.7014061207609594, | |
| "grad_norm": 1.2754803303745712, | |
| "learning_rate": 4.760737102381321e-05, | |
| "loss": 1.3467, | |
| "num_input_tokens_seen": 27787264, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 0.7080231596360629, | |
| "grad_norm": 1.1917616813939031, | |
| "learning_rate": 4.756276575071342e-05, | |
| "loss": 1.3691, | |
| "num_input_tokens_seen": 28049408, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 0.7146401985111662, | |
| "grad_norm": 1.2971389049692688, | |
| "learning_rate": 4.75177698188691e-05, | |
| "loss": 1.3573, | |
| "num_input_tokens_seen": 28311552, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.7212572373862697, | |
| "grad_norm": 1.1796083016863366, | |
| "learning_rate": 4.7472384007353804e-05, | |
| "loss": 1.3456, | |
| "num_input_tokens_seen": 28573696, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 0.727874276261373, | |
| "grad_norm": 1.393759938768685, | |
| "learning_rate": 4.7426609101991605e-05, | |
| "loss": 1.3741, | |
| "num_input_tokens_seen": 28835840, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.7344913151364765, | |
| "grad_norm": 1.239317627457199, | |
| "learning_rate": 4.7380445895343445e-05, | |
| "loss": 1.3871, | |
| "num_input_tokens_seen": 29097984, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 0.7411083540115798, | |
| "grad_norm": 1.2292939808383032, | |
| "learning_rate": 4.7333895186693445e-05, | |
| "loss": 1.401, | |
| "num_input_tokens_seen": 29360128, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.7477253928866832, | |
| "grad_norm": 1.0821916096299973, | |
| "learning_rate": 4.728695778203505e-05, | |
| "loss": 1.3394, | |
| "num_input_tokens_seen": 29622272, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 0.7543424317617866, | |
| "grad_norm": 1.2136622411407614, | |
| "learning_rate": 4.723963449405709e-05, | |
| "loss": 1.3368, | |
| "num_input_tokens_seen": 29884416, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 0.76095947063689, | |
| "grad_norm": 1.0757650949421957, | |
| "learning_rate": 4.719192614212969e-05, | |
| "loss": 1.3414, | |
| "num_input_tokens_seen": 30146560, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.7675765095119934, | |
| "grad_norm": 1.2630564279613965, | |
| "learning_rate": 4.7143833552290104e-05, | |
| "loss": 1.352, | |
| "num_input_tokens_seen": 30408704, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 0.7741935483870968, | |
| "grad_norm": 1.1325930683405476, | |
| "learning_rate": 4.709535755722839e-05, | |
| "loss": 1.3444, | |
| "num_input_tokens_seen": 30670848, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 0.7808105872622002, | |
| "grad_norm": 1.2210863008246031, | |
| "learning_rate": 4.704649899627299e-05, | |
| "loss": 1.3464, | |
| "num_input_tokens_seen": 30932992, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 0.7874276261373035, | |
| "grad_norm": 1.1397504462999077, | |
| "learning_rate": 4.6997258715376234e-05, | |
| "loss": 1.3521, | |
| "num_input_tokens_seen": 31195136, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 0.794044665012407, | |
| "grad_norm": 1.2115843784948899, | |
| "learning_rate": 4.694763756709967e-05, | |
| "loss": 1.3534, | |
| "num_input_tokens_seen": 31457280, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.8006617038875103, | |
| "grad_norm": 1.399225473111164, | |
| "learning_rate": 4.689763641059929e-05, | |
| "loss": 1.3368, | |
| "num_input_tokens_seen": 31719424, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 0.8072787427626137, | |
| "grad_norm": 1.0957293580137375, | |
| "learning_rate": 4.684725611161067e-05, | |
| "loss": 1.3561, | |
| "num_input_tokens_seen": 31981568, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 0.8138957816377171, | |
| "grad_norm": 1.1475976465568731, | |
| "learning_rate": 4.679649754243398e-05, | |
| "loss": 1.335, | |
| "num_input_tokens_seen": 32243712, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 0.8205128205128205, | |
| "grad_norm": 1.0767288647528503, | |
| "learning_rate": 4.6745361581918866e-05, | |
| "loss": 1.3462, | |
| "num_input_tokens_seen": 32505856, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 0.8271298593879239, | |
| "grad_norm": 1.1053947900183125, | |
| "learning_rate": 4.669384911544927e-05, | |
| "loss": 1.3474, | |
| "num_input_tokens_seen": 32768000, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.8337468982630273, | |
| "grad_norm": 0.9746263899049297, | |
| "learning_rate": 4.664196103492805e-05, | |
| "loss": 1.3464, | |
| "num_input_tokens_seen": 33030144, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 0.8403639371381307, | |
| "grad_norm": 1.1843673694125876, | |
| "learning_rate": 4.658969823876157e-05, | |
| "loss": 1.3218, | |
| "num_input_tokens_seen": 33292288, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 0.8469809760132341, | |
| "grad_norm": 1.0675619106578007, | |
| "learning_rate": 4.6537061631844144e-05, | |
| "loss": 1.3615, | |
| "num_input_tokens_seen": 33554432, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 0.8535980148883374, | |
| "grad_norm": 1.1082771918099024, | |
| "learning_rate": 4.648405212554236e-05, | |
| "loss": 1.3234, | |
| "num_input_tokens_seen": 33816576, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 0.8602150537634409, | |
| "grad_norm": 1.0533705567853635, | |
| "learning_rate": 4.6430670637679295e-05, | |
| "loss": 1.3131, | |
| "num_input_tokens_seen": 34078720, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.8668320926385442, | |
| "grad_norm": 1.3250710828705743, | |
| "learning_rate": 4.637691809251863e-05, | |
| "loss": 1.3449, | |
| "num_input_tokens_seen": 34340864, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 0.8734491315136477, | |
| "grad_norm": 1.0631490766176317, | |
| "learning_rate": 4.6322795420748664e-05, | |
| "loss": 1.3372, | |
| "num_input_tokens_seen": 34603008, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 0.880066170388751, | |
| "grad_norm": 1.1978298622584882, | |
| "learning_rate": 4.626830355946616e-05, | |
| "loss": 1.3545, | |
| "num_input_tokens_seen": 34865152, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 0.8866832092638545, | |
| "grad_norm": 1.2388970968450599, | |
| "learning_rate": 4.621344345216017e-05, | |
| "loss": 1.3345, | |
| "num_input_tokens_seen": 35127296, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 0.8933002481389578, | |
| "grad_norm": 1.1244455375131996, | |
| "learning_rate": 4.615821604869564e-05, | |
| "loss": 1.3314, | |
| "num_input_tokens_seen": 35389440, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.8999172870140613, | |
| "grad_norm": 1.0226285228765448, | |
| "learning_rate": 4.6102622305297015e-05, | |
| "loss": 1.3437, | |
| "num_input_tokens_seen": 35651584, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 0.9065343258891646, | |
| "grad_norm": 1.179215839596238, | |
| "learning_rate": 4.604666318453167e-05, | |
| "loss": 1.3109, | |
| "num_input_tokens_seen": 35913728, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 0.913151364764268, | |
| "grad_norm": 1.0205108272754437, | |
| "learning_rate": 4.5990339655293215e-05, | |
| "loss": 1.3061, | |
| "num_input_tokens_seen": 36175872, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 0.9197684036393714, | |
| "grad_norm": 1.1432719000110645, | |
| "learning_rate": 4.593365269278477e-05, | |
| "loss": 1.3036, | |
| "num_input_tokens_seen": 36438016, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 0.9263854425144747, | |
| "grad_norm": 1.0822518315588558, | |
| "learning_rate": 4.587660327850203e-05, | |
| "loss": 1.3078, | |
| "num_input_tokens_seen": 36700160, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.9330024813895782, | |
| "grad_norm": 1.2554416669389759, | |
| "learning_rate": 4.581919240021629e-05, | |
| "loss": 1.3116, | |
| "num_input_tokens_seen": 36962304, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 0.9396195202646815, | |
| "grad_norm": 1.2559908169183205, | |
| "learning_rate": 4.576142105195737e-05, | |
| "loss": 1.3215, | |
| "num_input_tokens_seen": 37224448, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 0.946236559139785, | |
| "grad_norm": 1.1886853683206, | |
| "learning_rate": 4.570329023399636e-05, | |
| "loss": 1.325, | |
| "num_input_tokens_seen": 37486592, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 0.9528535980148883, | |
| "grad_norm": 1.1590288482384423, | |
| "learning_rate": 4.564480095282832e-05, | |
| "loss": 1.3397, | |
| "num_input_tokens_seen": 37748736, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 0.9594706368899917, | |
| "grad_norm": 1.2963117712374421, | |
| "learning_rate": 4.5585954221154856e-05, | |
| "loss": 1.3371, | |
| "num_input_tokens_seen": 38010880, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.9660876757650951, | |
| "grad_norm": 1.1183668865567917, | |
| "learning_rate": 4.552675105786659e-05, | |
| "loss": 1.2819, | |
| "num_input_tokens_seen": 38273024, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 0.9727047146401985, | |
| "grad_norm": 1.4680624474377164, | |
| "learning_rate": 4.546719248802551e-05, | |
| "loss": 1.2962, | |
| "num_input_tokens_seen": 38535168, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 0.9793217535153019, | |
| "grad_norm": 1.0568661810526727, | |
| "learning_rate": 4.540727954284721e-05, | |
| "loss": 1.3416, | |
| "num_input_tokens_seen": 38797312, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 0.9859387923904053, | |
| "grad_norm": 1.3165517826941278, | |
| "learning_rate": 4.534701325968308e-05, | |
| "loss": 1.3301, | |
| "num_input_tokens_seen": 39059456, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 0.9925558312655087, | |
| "grad_norm": 1.0620702266856872, | |
| "learning_rate": 4.528639468200226e-05, | |
| "loss": 1.3065, | |
| "num_input_tokens_seen": 39321600, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.9991728701406121, | |
| "grad_norm": 1.0129869486885617, | |
| "learning_rate": 4.522542485937369e-05, | |
| "loss": 1.2729, | |
| "num_input_tokens_seen": 39583744, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 1.0129869486885617, | |
| "learning_rate": 4.5164104847447825e-05, | |
| "loss": 1.2598, | |
| "num_input_tokens_seen": 39616512, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 1.0066170388751035, | |
| "grad_norm": 3.2796835203295367, | |
| "learning_rate": 4.5102435707938434e-05, | |
| "loss": 0.929, | |
| "num_input_tokens_seen": 39878656, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 1.0132340777502067, | |
| "grad_norm": 1.6915018216454805, | |
| "learning_rate": 4.5040418508604185e-05, | |
| "loss": 0.9083, | |
| "num_input_tokens_seen": 40140800, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 1.0198511166253101, | |
| "grad_norm": 1.6354997706408312, | |
| "learning_rate": 4.497805432323015e-05, | |
| "loss": 0.9349, | |
| "num_input_tokens_seen": 40402944, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 1.0264681555004136, | |
| "grad_norm": 1.7763647403929517, | |
| "learning_rate": 4.491534423160923e-05, | |
| "loss": 0.9363, | |
| "num_input_tokens_seen": 40665088, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 1.033085194375517, | |
| "grad_norm": 1.574116875627686, | |
| "learning_rate": 4.485228931952347e-05, | |
| "loss": 0.9249, | |
| "num_input_tokens_seen": 40927232, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 1.0397022332506203, | |
| "grad_norm": 1.4461856174287244, | |
| "learning_rate": 4.4788890678725224e-05, | |
| "loss": 0.8876, | |
| "num_input_tokens_seen": 41189376, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 1.0463192721257237, | |
| "grad_norm": 1.43842705918855, | |
| "learning_rate": 4.472514940691828e-05, | |
| "loss": 0.8882, | |
| "num_input_tokens_seen": 41451520, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 1.0529363110008272, | |
| "grad_norm": 1.2624518837851444, | |
| "learning_rate": 4.466106660773885e-05, | |
| "loss": 0.8947, | |
| "num_input_tokens_seen": 41713664, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 1.0595533498759304, | |
| "grad_norm": 1.3665729188837292, | |
| "learning_rate": 4.4596643390736444e-05, | |
| "loss": 0.8734, | |
| "num_input_tokens_seen": 41975808, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 1.0661703887510339, | |
| "grad_norm": 1.3002771069796804, | |
| "learning_rate": 4.4531880871354683e-05, | |
| "loss": 0.8626, | |
| "num_input_tokens_seen": 42237952, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 1.0727874276261373, | |
| "grad_norm": 1.2690537305241547, | |
| "learning_rate": 4.446678017091198e-05, | |
| "loss": 0.8404, | |
| "num_input_tokens_seen": 42500096, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 1.0794044665012408, | |
| "grad_norm": 1.330267070475486, | |
| "learning_rate": 4.4401342416582106e-05, | |
| "loss": 0.8663, | |
| "num_input_tokens_seen": 42762240, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 1.086021505376344, | |
| "grad_norm": 1.2652249101149649, | |
| "learning_rate": 4.43355687413747e-05, | |
| "loss": 0.8591, | |
| "num_input_tokens_seen": 43024384, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 1.0926385442514475, | |
| "grad_norm": 1.303585645944488, | |
| "learning_rate": 4.4269460284115624e-05, | |
| "loss": 0.8794, | |
| "num_input_tokens_seen": 43286528, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 1.099255583126551, | |
| "grad_norm": 1.1211778159345078, | |
| "learning_rate": 4.420301818942728e-05, | |
| "loss": 0.8565, | |
| "num_input_tokens_seen": 43548672, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 1.1058726220016544, | |
| "grad_norm": 1.258025084841763, | |
| "learning_rate": 4.413624360770876e-05, | |
| "loss": 0.8905, | |
| "num_input_tokens_seen": 43810816, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 1.1124896608767576, | |
| "grad_norm": 1.125309033988079, | |
| "learning_rate": 4.406913769511594e-05, | |
| "loss": 0.8448, | |
| "num_input_tokens_seen": 44072960, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 1.119106699751861, | |
| "grad_norm": 1.1784006185919231, | |
| "learning_rate": 4.4001701613541456e-05, | |
| "loss": 0.8488, | |
| "num_input_tokens_seen": 44335104, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 1.1257237386269645, | |
| "grad_norm": 1.2604324137156369, | |
| "learning_rate": 4.393393653059462e-05, | |
| "loss": 0.8803, | |
| "num_input_tokens_seen": 44597248, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 1.1323407775020677, | |
| "grad_norm": 1.4523529165761135, | |
| "learning_rate": 4.386584361958115e-05, | |
| "loss": 0.856, | |
| "num_input_tokens_seen": 44859392, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 1.1389578163771712, | |
| "grad_norm": 1.2839516251773848, | |
| "learning_rate": 4.379742405948288e-05, | |
| "loss": 0.8392, | |
| "num_input_tokens_seen": 45121536, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 1.1455748552522746, | |
| "grad_norm": 1.1643605452939796, | |
| "learning_rate": 4.372867903493737e-05, | |
| "loss": 0.8649, | |
| "num_input_tokens_seen": 45383680, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 1.152191894127378, | |
| "grad_norm": 1.2437437966014462, | |
| "learning_rate": 4.3659609736217344e-05, | |
| "loss": 0.8392, | |
| "num_input_tokens_seen": 45645824, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 1.1588089330024813, | |
| "grad_norm": 1.258435176144592, | |
| "learning_rate": 4.359021735921013e-05, | |
| "loss": 0.8563, | |
| "num_input_tokens_seen": 45907968, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 1.1654259718775848, | |
| "grad_norm": 1.9852111216718347, | |
| "learning_rate": 4.352050310539694e-05, | |
| "loss": 0.859, | |
| "num_input_tokens_seen": 46170112, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 1.1720430107526882, | |
| "grad_norm": 1.3877250659239455, | |
| "learning_rate": 4.345046818183203e-05, | |
| "loss": 0.8804, | |
| "num_input_tokens_seen": 46432256, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 1.1786600496277915, | |
| "grad_norm": 1.1698140955270617, | |
| "learning_rate": 4.3380113801121854e-05, | |
| "loss": 0.8729, | |
| "num_input_tokens_seen": 46694400, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 1.185277088502895, | |
| "grad_norm": 1.3931561318052186, | |
| "learning_rate": 4.330944118140407e-05, | |
| "loss": 0.8741, | |
| "num_input_tokens_seen": 46956544, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 1.1918941273779984, | |
| "grad_norm": 1.4096016502064548, | |
| "learning_rate": 4.3238451546326367e-05, | |
| "loss": 0.8826, | |
| "num_input_tokens_seen": 47218688, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 1.1985111662531018, | |
| "grad_norm": 1.071563835965369, | |
| "learning_rate": 4.3167146125025385e-05, | |
| "loss": 0.8786, | |
| "num_input_tokens_seen": 47480832, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 1.205128205128205, | |
| "grad_norm": 1.4535954622349747, | |
| "learning_rate": 4.309552615210536e-05, | |
| "loss": 0.844, | |
| "num_input_tokens_seen": 47742976, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 1.2117452440033085, | |
| "grad_norm": 1.1956562696985422, | |
| "learning_rate": 4.302359286761679e-05, | |
| "loss": 0.8603, | |
| "num_input_tokens_seen": 48005120, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 1.218362282878412, | |
| "grad_norm": 1.4990950568465007, | |
| "learning_rate": 4.295134751703493e-05, | |
| "loss": 0.8361, | |
| "num_input_tokens_seen": 48267264, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 1.2249793217535152, | |
| "grad_norm": 1.3042279096165876, | |
| "learning_rate": 4.287879135123826e-05, | |
| "loss": 0.8707, | |
| "num_input_tokens_seen": 48529408, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 1.2315963606286187, | |
| "grad_norm": 1.4236645671572632, | |
| "learning_rate": 4.2805925626486796e-05, | |
| "loss": 0.8599, | |
| "num_input_tokens_seen": 48791552, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 1.2382133995037221, | |
| "grad_norm": 1.1684645721401956, | |
| "learning_rate": 4.2732751604400364e-05, | |
| "loss": 0.8643, | |
| "num_input_tokens_seen": 49053696, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 1.2448304383788256, | |
| "grad_norm": 1.3489623217080666, | |
| "learning_rate": 4.2659270551936756e-05, | |
| "loss": 0.8214, | |
| "num_input_tokens_seen": 49315840, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 1.2514474772539288, | |
| "grad_norm": 1.1325213775166751, | |
| "learning_rate": 4.2585483741369755e-05, | |
| "loss": 0.8434, | |
| "num_input_tokens_seen": 49577984, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 1.2580645161290323, | |
| "grad_norm": 1.6126949606880288, | |
| "learning_rate": 4.251139245026716e-05, | |
| "loss": 0.8657, | |
| "num_input_tokens_seen": 49840128, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 1.2646815550041357, | |
| "grad_norm": 1.3428951116543568, | |
| "learning_rate": 4.243699796146863e-05, | |
| "loss": 0.8482, | |
| "num_input_tokens_seen": 50102272, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 1.271298593879239, | |
| "grad_norm": 1.4845069023405626, | |
| "learning_rate": 4.236230156306348e-05, | |
| "loss": 0.8668, | |
| "num_input_tokens_seen": 50364416, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 1.2779156327543424, | |
| "grad_norm": 1.3390183725385267, | |
| "learning_rate": 4.2287304548368386e-05, | |
| "loss": 0.8463, | |
| "num_input_tokens_seen": 50626560, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 1.2845326716294458, | |
| "grad_norm": 1.3393923999288566, | |
| "learning_rate": 4.2212008215905e-05, | |
| "loss": 0.8861, | |
| "num_input_tokens_seen": 50888704, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 1.2911497105045493, | |
| "grad_norm": 1.3594637430884577, | |
| "learning_rate": 4.213641386937743e-05, | |
| "loss": 0.8411, | |
| "num_input_tokens_seen": 51150848, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 1.2977667493796525, | |
| "grad_norm": 1.354872266010071, | |
| "learning_rate": 4.206052281764973e-05, | |
| "loss": 0.8913, | |
| "num_input_tokens_seen": 51412992, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 1.304383788254756, | |
| "grad_norm": 1.480084514475744, | |
| "learning_rate": 4.198433637472314e-05, | |
| "loss": 0.8932, | |
| "num_input_tokens_seen": 51675136, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 1.3110008271298594, | |
| "grad_norm": 1.2179508694377172, | |
| "learning_rate": 4.190785585971346e-05, | |
| "loss": 0.8565, | |
| "num_input_tokens_seen": 51937280, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 1.3176178660049627, | |
| "grad_norm": 1.2588919604551994, | |
| "learning_rate": 4.1831082596828106e-05, | |
| "loss": 0.8268, | |
| "num_input_tokens_seen": 52199424, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 1.3242349048800661, | |
| "grad_norm": 1.2893099525717986, | |
| "learning_rate": 4.1754017915343234e-05, | |
| "loss": 0.8533, | |
| "num_input_tokens_seen": 52461568, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 1.3308519437551696, | |
| "grad_norm": 1.2570685698759807, | |
| "learning_rate": 4.167666314958071e-05, | |
| "loss": 0.8529, | |
| "num_input_tokens_seen": 52723712, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 1.337468982630273, | |
| "grad_norm": 1.3910303184986648, | |
| "learning_rate": 4.159901963888502e-05, | |
| "loss": 0.8777, | |
| "num_input_tokens_seen": 52985856, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 1.3440860215053765, | |
| "grad_norm": 1.222611343425846, | |
| "learning_rate": 4.152108872760004e-05, | |
| "loss": 0.8855, | |
| "num_input_tokens_seen": 53248000, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 1.3507030603804797, | |
| "grad_norm": 1.1899088358775345, | |
| "learning_rate": 4.144287176504582e-05, | |
| "loss": 0.8781, | |
| "num_input_tokens_seen": 53510144, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 1.3573200992555832, | |
| "grad_norm": 1.117237186516348, | |
| "learning_rate": 4.136437010549518e-05, | |
| "loss": 0.8227, | |
| "num_input_tokens_seen": 53772288, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 1.3639371381306864, | |
| "grad_norm": 1.212096841752928, | |
| "learning_rate": 4.1285585108150273e-05, | |
| "loss": 0.8606, | |
| "num_input_tokens_seen": 54034432, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 1.3705541770057899, | |
| "grad_norm": 1.2208761945851812, | |
| "learning_rate": 4.120651813711905e-05, | |
| "loss": 0.8681, | |
| "num_input_tokens_seen": 54296576, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 1.3771712158808933, | |
| "grad_norm": 1.2204263612725177, | |
| "learning_rate": 4.112717056139164e-05, | |
| "loss": 0.8655, | |
| "num_input_tokens_seen": 54558720, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 1.3837882547559968, | |
| "grad_norm": 1.3841625088026035, | |
| "learning_rate": 4.104754375481664e-05, | |
| "loss": 0.8752, | |
| "num_input_tokens_seen": 54820864, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 1.3904052936311002, | |
| "grad_norm": 1.291256777563656, | |
| "learning_rate": 4.096763909607737e-05, | |
| "loss": 0.8605, | |
| "num_input_tokens_seen": 55083008, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 1.3970223325062034, | |
| "grad_norm": 1.1816906512953695, | |
| "learning_rate": 4.088745796866793e-05, | |
| "loss": 0.9004, | |
| "num_input_tokens_seen": 55345152, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 1.403639371381307, | |
| "grad_norm": 1.0937124460607777, | |
| "learning_rate": 4.08070017608693e-05, | |
| "loss": 0.8701, | |
| "num_input_tokens_seen": 55607296, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 1.4102564102564101, | |
| "grad_norm": 1.1192173919656867, | |
| "learning_rate": 4.072627186572531e-05, | |
| "loss": 0.8665, | |
| "num_input_tokens_seen": 55869440, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 1.4168734491315136, | |
| "grad_norm": 1.2027417701165573, | |
| "learning_rate": 4.064526968101844e-05, | |
| "loss": 0.8508, | |
| "num_input_tokens_seen": 56131584, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 1.423490488006617, | |
| "grad_norm": 1.167715206363498, | |
| "learning_rate": 4.056399660924575e-05, | |
| "loss": 0.8425, | |
| "num_input_tokens_seen": 56393728, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 1.4301075268817205, | |
| "grad_norm": 1.1346595160468362, | |
| "learning_rate": 4.048245405759446e-05, | |
| "loss": 0.8638, | |
| "num_input_tokens_seen": 56655872, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 1.436724565756824, | |
| "grad_norm": 1.197370470624576, | |
| "learning_rate": 4.040064343791767e-05, | |
| "loss": 0.8424, | |
| "num_input_tokens_seen": 56918016, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 1.4433416046319272, | |
| "grad_norm": 1.111136180713314, | |
| "learning_rate": 4.0318566166709925e-05, | |
| "loss": 0.8734, | |
| "num_input_tokens_seen": 57180160, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 1.4499586435070306, | |
| "grad_norm": 1.125826256539522, | |
| "learning_rate": 4.023622366508261e-05, | |
| "loss": 0.8303, | |
| "num_input_tokens_seen": 57442304, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 1.4565756823821339, | |
| "grad_norm": 1.1310355381266508, | |
| "learning_rate": 4.0153617358739406e-05, | |
| "loss": 0.8845, | |
| "num_input_tokens_seen": 57704448, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 1.4631927212572373, | |
| "grad_norm": 1.1677707737228877, | |
| "learning_rate": 4.0070748677951605e-05, | |
| "loss": 0.8556, | |
| "num_input_tokens_seen": 57966592, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 1.4698097601323408, | |
| "grad_norm": 1.1528540778585532, | |
| "learning_rate": 3.998761905753333e-05, | |
| "loss": 0.8533, | |
| "num_input_tokens_seen": 58228736, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 1.4764267990074442, | |
| "grad_norm": 1.253460257564942, | |
| "learning_rate": 3.9904229936816674e-05, | |
| "loss": 0.8773, | |
| "num_input_tokens_seen": 58490880, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 1.4830438378825477, | |
| "grad_norm": 1.1496232622442173, | |
| "learning_rate": 3.9820582759626825e-05, | |
| "loss": 0.8899, | |
| "num_input_tokens_seen": 58753024, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 1.489660876757651, | |
| "grad_norm": 1.2995455269062046, | |
| "learning_rate": 3.973667897425701e-05, | |
| "loss": 0.8717, | |
| "num_input_tokens_seen": 59015168, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 1.4962779156327544, | |
| "grad_norm": 1.144101297289898, | |
| "learning_rate": 3.9652520033443485e-05, | |
| "loss": 0.849, | |
| "num_input_tokens_seen": 59277312, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 1.5028949545078576, | |
| "grad_norm": 1.5210835767609536, | |
| "learning_rate": 3.956810739434032e-05, | |
| "loss": 0.8796, | |
| "num_input_tokens_seen": 59539456, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 1.509511993382961, | |
| "grad_norm": 1.313822814756041, | |
| "learning_rate": 3.948344251849421e-05, | |
| "loss": 0.8912, | |
| "num_input_tokens_seen": 59801600, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 1.5161290322580645, | |
| "grad_norm": 1.1203755838084284, | |
| "learning_rate": 3.9398526871819154e-05, | |
| "loss": 0.86, | |
| "num_input_tokens_seen": 60063744, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 1.522746071133168, | |
| "grad_norm": 1.2939167541224808, | |
| "learning_rate": 3.931336192457106e-05, | |
| "loss": 0.8617, | |
| "num_input_tokens_seen": 60325888, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 1.5293631100082714, | |
| "grad_norm": 1.0601911330047828, | |
| "learning_rate": 3.9227949151322326e-05, | |
| "loss": 0.8474, | |
| "num_input_tokens_seen": 60588032, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 1.5359801488833746, | |
| "grad_norm": 1.3013451522634765, | |
| "learning_rate": 3.914229003093627e-05, | |
| "loss": 0.8394, | |
| "num_input_tokens_seen": 60850176, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 1.542597187758478, | |
| "grad_norm": 1.0264196280817957, | |
| "learning_rate": 3.905638604654156e-05, | |
| "loss": 0.8639, | |
| "num_input_tokens_seen": 61112320, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 1.5492142266335813, | |
| "grad_norm": 1.170686578102918, | |
| "learning_rate": 3.897023868550649e-05, | |
| "loss": 0.8656, | |
| "num_input_tokens_seen": 61374464, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 1.5558312655086848, | |
| "grad_norm": 1.1035633097086974, | |
| "learning_rate": 3.8883849439413265e-05, | |
| "loss": 0.8839, | |
| "num_input_tokens_seen": 61636608, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 1.5624483043837882, | |
| "grad_norm": 1.0804813512718974, | |
| "learning_rate": 3.879721980403217e-05, | |
| "loss": 0.8833, | |
| "num_input_tokens_seen": 61898752, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 1.5690653432588917, | |
| "grad_norm": 1.1077709384095282, | |
| "learning_rate": 3.871035127929566e-05, | |
| "loss": 0.8818, | |
| "num_input_tokens_seen": 62160896, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 1.5756823821339951, | |
| "grad_norm": 1.182231681118449, | |
| "learning_rate": 3.8623245369272385e-05, | |
| "loss": 0.8514, | |
| "num_input_tokens_seen": 62423040, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 1.5822994210090986, | |
| "grad_norm": 1.2617220805970901, | |
| "learning_rate": 3.853590358214119e-05, | |
| "loss": 0.8688, | |
| "num_input_tokens_seen": 62685184, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 1.5889164598842018, | |
| "grad_norm": 1.1349928239204545, | |
| "learning_rate": 3.844832743016491e-05, | |
| "loss": 0.8636, | |
| "num_input_tokens_seen": 62947328, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 1.595533498759305, | |
| "grad_norm": 1.2926042995530984, | |
| "learning_rate": 3.83605184296643e-05, | |
| "loss": 0.8556, | |
| "num_input_tokens_seen": 63209472, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 1.6021505376344085, | |
| "grad_norm": 1.111484999792133, | |
| "learning_rate": 3.8272478100991714e-05, | |
| "loss": 0.8908, | |
| "num_input_tokens_seen": 63471616, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 1.608767576509512, | |
| "grad_norm": 1.2640736812779294, | |
| "learning_rate": 3.818420796850478e-05, | |
| "loss": 0.8654, | |
| "num_input_tokens_seen": 63733760, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 1.6153846153846154, | |
| "grad_norm": 1.005896566039546, | |
| "learning_rate": 3.809570956054004e-05, | |
| "loss": 0.8693, | |
| "num_input_tokens_seen": 63995904, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 1.6220016542597189, | |
| "grad_norm": 1.3829300626223842, | |
| "learning_rate": 3.800698440938644e-05, | |
| "loss": 0.8818, | |
| "num_input_tokens_seen": 64258048, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 1.6286186931348223, | |
| "grad_norm": 1.0255361852704923, | |
| "learning_rate": 3.791803405125885e-05, | |
| "loss": 0.8915, | |
| "num_input_tokens_seen": 64520192, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 1.6352357320099256, | |
| "grad_norm": 1.1092615011275497, | |
| "learning_rate": 3.782886002627145e-05, | |
| "loss": 0.8551, | |
| "num_input_tokens_seen": 64782336, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 1.6418527708850288, | |
| "grad_norm": 1.1366146935597203, | |
| "learning_rate": 3.773946387841103e-05, | |
| "loss": 0.8436, | |
| "num_input_tokens_seen": 65044480, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 1.6484698097601322, | |
| "grad_norm": 1.1579215508983878, | |
| "learning_rate": 3.764984715551032e-05, | |
| "loss": 0.8664, | |
| "num_input_tokens_seen": 65306624, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 1.6550868486352357, | |
| "grad_norm": 1.1104612323014593, | |
| "learning_rate": 3.756001140922112e-05, | |
| "loss": 0.8453, | |
| "num_input_tokens_seen": 65568768, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 1.6617038875103392, | |
| "grad_norm": 1.1393363090562745, | |
| "learning_rate": 3.74699581949875e-05, | |
| "loss": 0.8932, | |
| "num_input_tokens_seen": 65830912, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 1.6683209263854426, | |
| "grad_norm": 1.1318034968552904, | |
| "learning_rate": 3.737968907201882e-05, | |
| "loss": 0.8922, | |
| "num_input_tokens_seen": 66093056, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 1.674937965260546, | |
| "grad_norm": 1.072676962267901, | |
| "learning_rate": 3.728920560326275e-05, | |
| "loss": 0.8708, | |
| "num_input_tokens_seen": 66355200, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 1.6815550041356493, | |
| "grad_norm": 1.0759090742363742, | |
| "learning_rate": 3.719850935537821e-05, | |
| "loss": 0.8904, | |
| "num_input_tokens_seen": 66617344, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 1.6881720430107527, | |
| "grad_norm": 1.1591455124390417, | |
| "learning_rate": 3.710760189870825e-05, | |
| "loss": 0.8846, | |
| "num_input_tokens_seen": 66879488, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 1.694789081885856, | |
| "grad_norm": 1.048864507790183, | |
| "learning_rate": 3.701648480725286e-05, | |
| "loss": 0.8906, | |
| "num_input_tokens_seen": 67141632, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 1.7014061207609594, | |
| "grad_norm": 1.0992102136364237, | |
| "learning_rate": 3.692515965864169e-05, | |
| "loss": 0.8653, | |
| "num_input_tokens_seen": 67403776, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 1.7080231596360629, | |
| "grad_norm": 1.1084880653140272, | |
| "learning_rate": 3.683362803410678e-05, | |
| "loss": 0.9049, | |
| "num_input_tokens_seen": 67665920, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 1.7146401985111663, | |
| "grad_norm": 1.0193571027301067, | |
| "learning_rate": 3.674189151845515e-05, | |
| "loss": 0.8668, | |
| "num_input_tokens_seen": 67928064, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 1.7212572373862698, | |
| "grad_norm": 1.1435915369787901, | |
| "learning_rate": 3.6649951700041366e-05, | |
| "loss": 0.85, | |
| "num_input_tokens_seen": 68190208, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 1.727874276261373, | |
| "grad_norm": 1.0624020069945779, | |
| "learning_rate": 3.6557810170740034e-05, | |
| "loss": 0.8833, | |
| "num_input_tokens_seen": 68452352, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 1.7344913151364765, | |
| "grad_norm": 1.1397221255920662, | |
| "learning_rate": 3.646546852591827e-05, | |
| "loss": 0.8979, | |
| "num_input_tokens_seen": 68714496, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 1.7411083540115797, | |
| "grad_norm": 1.0934989352817812, | |
| "learning_rate": 3.637292836440802e-05, | |
| "loss": 0.8789, | |
| "num_input_tokens_seen": 68976640, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 1.7477253928866832, | |
| "grad_norm": 1.1270519325330435, | |
| "learning_rate": 3.6280191288478436e-05, | |
| "loss": 0.8738, | |
| "num_input_tokens_seen": 69238784, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 1.7543424317617866, | |
| "grad_norm": 1.1064043030347466, | |
| "learning_rate": 3.61872589038081e-05, | |
| "loss": 0.9082, | |
| "num_input_tokens_seen": 69500928, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 1.76095947063689, | |
| "grad_norm": 1.0016528471988886, | |
| "learning_rate": 3.6094132819457205e-05, | |
| "loss": 0.8879, | |
| "num_input_tokens_seen": 69763072, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 1.7675765095119935, | |
| "grad_norm": 1.1490777889864239, | |
| "learning_rate": 3.600081464783977e-05, | |
| "loss": 0.9091, | |
| "num_input_tokens_seen": 70025216, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 1.7741935483870968, | |
| "grad_norm": 0.9851189833580496, | |
| "learning_rate": 3.5907306004695636e-05, | |
| "loss": 0.8406, | |
| "num_input_tokens_seen": 70287360, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 1.7808105872622002, | |
| "grad_norm": 1.0441019161167733, | |
| "learning_rate": 3.5813608509062526e-05, | |
| "loss": 0.8573, | |
| "num_input_tokens_seen": 70549504, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 1.7874276261373034, | |
| "grad_norm": 1.0021575087386156, | |
| "learning_rate": 3.5719723783248045e-05, | |
| "loss": 0.8901, | |
| "num_input_tokens_seen": 70811648, | |
| "step": 271 | |
| }, | |
| { | |
| "epoch": 1.794044665012407, | |
| "grad_norm": 1.0004983205114817, | |
| "learning_rate": 3.5625653452801525e-05, | |
| "loss": 0.8643, | |
| "num_input_tokens_seen": 71073792, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 1.8006617038875103, | |
| "grad_norm": 1.0020219633694651, | |
| "learning_rate": 3.553139914648593e-05, | |
| "loss": 0.8372, | |
| "num_input_tokens_seen": 71335936, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 1.8072787427626138, | |
| "grad_norm": 1.089710937917061, | |
| "learning_rate": 3.543696249624965e-05, | |
| "loss": 0.8566, | |
| "num_input_tokens_seen": 71598080, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 1.8138957816377173, | |
| "grad_norm": 0.9963288164384833, | |
| "learning_rate": 3.534234513719821e-05, | |
| "loss": 0.867, | |
| "num_input_tokens_seen": 71860224, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 1.8205128205128205, | |
| "grad_norm": 1.1734651632685695, | |
| "learning_rate": 3.5247548707565986e-05, | |
| "loss": 0.8618, | |
| "num_input_tokens_seen": 72122368, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 1.827129859387924, | |
| "grad_norm": 1.0888558238501158, | |
| "learning_rate": 3.5152574848687875e-05, | |
| "loss": 0.8733, | |
| "num_input_tokens_seen": 72384512, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 1.8337468982630272, | |
| "grad_norm": 1.0461245836544661, | |
| "learning_rate": 3.505742520497079e-05, | |
| "loss": 0.8677, | |
| "num_input_tokens_seen": 72646656, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 1.8403639371381306, | |
| "grad_norm": 1.086689279678754, | |
| "learning_rate": 3.496210142386527e-05, | |
| "loss": 0.8389, | |
| "num_input_tokens_seen": 72908800, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 1.846980976013234, | |
| "grad_norm": 1.1718920032890414, | |
| "learning_rate": 3.4866605155836915e-05, | |
| "loss": 0.8909, | |
| "num_input_tokens_seen": 73170944, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 1.8535980148883375, | |
| "grad_norm": 1.1057752723633063, | |
| "learning_rate": 3.47709380543378e-05, | |
| "loss": 0.8844, | |
| "num_input_tokens_seen": 73433088, | |
| "step": 281 | |
| }, | |
| { | |
| "epoch": 1.860215053763441, | |
| "grad_norm": 1.098019821064288, | |
| "learning_rate": 3.46751017757779e-05, | |
| "loss": 0.8717, | |
| "num_input_tokens_seen": 73695232, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 1.8668320926385442, | |
| "grad_norm": 1.0393178044085931, | |
| "learning_rate": 3.457909797949633e-05, | |
| "loss": 0.8801, | |
| "num_input_tokens_seen": 73957376, | |
| "step": 283 | |
| }, | |
| { | |
| "epoch": 1.8734491315136477, | |
| "grad_norm": 1.2431012129822678, | |
| "learning_rate": 3.448292832773269e-05, | |
| "loss": 0.8604, | |
| "num_input_tokens_seen": 74219520, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 1.880066170388751, | |
| "grad_norm": 1.019422889343279, | |
| "learning_rate": 3.438659448559825e-05, | |
| "loss": 0.8681, | |
| "num_input_tokens_seen": 74481664, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 1.8866832092638544, | |
| "grad_norm": 1.2811321592728038, | |
| "learning_rate": 3.4290098121047114e-05, | |
| "loss": 0.8734, | |
| "num_input_tokens_seen": 74743808, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 1.8933002481389578, | |
| "grad_norm": 1.070768836122241, | |
| "learning_rate": 3.419344090484736e-05, | |
| "loss": 0.8605, | |
| "num_input_tokens_seen": 75005952, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 1.8999172870140613, | |
| "grad_norm": 1.1946653702822796, | |
| "learning_rate": 3.409662451055208e-05, | |
| "loss": 0.8714, | |
| "num_input_tokens_seen": 75268096, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 1.9065343258891647, | |
| "grad_norm": 1.140200266566108, | |
| "learning_rate": 3.3999650614470445e-05, | |
| "loss": 0.8573, | |
| "num_input_tokens_seen": 75530240, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 1.913151364764268, | |
| "grad_norm": 1.121817062197686, | |
| "learning_rate": 3.390252089563867e-05, | |
| "loss": 0.8889, | |
| "num_input_tokens_seen": 75792384, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 1.9197684036393714, | |
| "grad_norm": 1.1795614581286045, | |
| "learning_rate": 3.3805237035790924e-05, | |
| "loss": 0.8636, | |
| "num_input_tokens_seen": 76054528, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 1.9263854425144746, | |
| "grad_norm": 1.1839809443148863, | |
| "learning_rate": 3.3707800719330184e-05, | |
| "loss": 0.8729, | |
| "num_input_tokens_seen": 76316672, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 1.933002481389578, | |
| "grad_norm": 1.1302681193686077, | |
| "learning_rate": 3.361021363329917e-05, | |
| "loss": 0.8474, | |
| "num_input_tokens_seen": 76578816, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 1.9396195202646815, | |
| "grad_norm": 0.9877234338487746, | |
| "learning_rate": 3.351247746735103e-05, | |
| "loss": 0.8728, | |
| "num_input_tokens_seen": 76840960, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 1.946236559139785, | |
| "grad_norm": 1.112763917244249, | |
| "learning_rate": 3.341459391372016e-05, | |
| "loss": 0.8552, | |
| "num_input_tokens_seen": 77103104, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 1.9528535980148884, | |
| "grad_norm": 1.0135909963895178, | |
| "learning_rate": 3.331656466719284e-05, | |
| "loss": 0.8591, | |
| "num_input_tokens_seen": 77365248, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 1.9594706368899917, | |
| "grad_norm": 1.057962809664509, | |
| "learning_rate": 3.321839142507794e-05, | |
| "loss": 0.8703, | |
| "num_input_tokens_seen": 77627392, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 1.9660876757650951, | |
| "grad_norm": 1.024489249713761, | |
| "learning_rate": 3.312007588717751e-05, | |
| "loss": 0.8433, | |
| "num_input_tokens_seen": 77889536, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 1.9727047146401984, | |
| "grad_norm": 1.1626693600678064, | |
| "learning_rate": 3.302161975575736e-05, | |
| "loss": 0.8727, | |
| "num_input_tokens_seen": 78151680, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 1.9793217535153018, | |
| "grad_norm": 0.9962455067400268, | |
| "learning_rate": 3.292302473551757e-05, | |
| "loss": 0.8487, | |
| "num_input_tokens_seen": 78413824, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 1.9859387923904053, | |
| "grad_norm": 1.1129572616806531, | |
| "learning_rate": 3.2824292533562996e-05, | |
| "loss": 0.8782, | |
| "num_input_tokens_seen": 78675968, | |
| "step": 301 | |
| }, | |
| { | |
| "epoch": 1.9925558312655087, | |
| "grad_norm": 1.1359145638143653, | |
| "learning_rate": 3.272542485937369e-05, | |
| "loss": 0.8671, | |
| "num_input_tokens_seen": 78938112, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 1.9991728701406122, | |
| "grad_norm": 1.0937608862202273, | |
| "learning_rate": 3.262642342477531e-05, | |
| "loss": 0.8609, | |
| "num_input_tokens_seen": 79200256, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 1.0937608862202273, | |
| "learning_rate": 3.252728994390951e-05, | |
| "loss": 0.8026, | |
| "num_input_tokens_seen": 79233024, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 2.0066170388751035, | |
| "grad_norm": 3.398274310113878, | |
| "learning_rate": 3.2428026133204184e-05, | |
| "loss": 0.4556, | |
| "num_input_tokens_seen": 79495168, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 2.013234077750207, | |
| "grad_norm": 2.5271586821507293, | |
| "learning_rate": 3.232863371134385e-05, | |
| "loss": 0.4076, | |
| "num_input_tokens_seen": 79757312, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 2.0198511166253104, | |
| "grad_norm": 1.783345206375513, | |
| "learning_rate": 3.2229114399239816e-05, | |
| "loss": 0.4099, | |
| "num_input_tokens_seen": 80019456, | |
| "step": 307 | |
| }, | |
| { | |
| "epoch": 2.0264681555004134, | |
| "grad_norm": 2.5054864860618133, | |
| "learning_rate": 3.21294699200004e-05, | |
| "loss": 0.3909, | |
| "num_input_tokens_seen": 80281600, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 2.033085194375517, | |
| "grad_norm": 2.376970275167304, | |
| "learning_rate": 3.202970199890111e-05, | |
| "loss": 0.3925, | |
| "num_input_tokens_seen": 80543744, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 2.0397022332506203, | |
| "grad_norm": 2.1738898999313485, | |
| "learning_rate": 3.1929812363354766e-05, | |
| "loss": 0.3738, | |
| "num_input_tokens_seen": 80805888, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 2.0463192721257237, | |
| "grad_norm": 1.688931111731337, | |
| "learning_rate": 3.18298027428816e-05, | |
| "loss": 0.3376, | |
| "num_input_tokens_seen": 81068032, | |
| "step": 311 | |
| }, | |
| { | |
| "epoch": 2.052936311000827, | |
| "grad_norm": 1.4853385671984687, | |
| "learning_rate": 3.172967486907928e-05, | |
| "loss": 0.3595, | |
| "num_input_tokens_seen": 81330176, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 2.0595533498759306, | |
| "grad_norm": 1.3248855510062758, | |
| "learning_rate": 3.1629430475592955e-05, | |
| "loss": 0.3749, | |
| "num_input_tokens_seen": 81592320, | |
| "step": 313 | |
| }, | |
| { | |
| "epoch": 2.066170388751034, | |
| "grad_norm": 1.4372594674243309, | |
| "learning_rate": 3.1529071298085236e-05, | |
| "loss": 0.361, | |
| "num_input_tokens_seen": 81854464, | |
| "step": 314 | |
| }, | |
| { | |
| "epoch": 2.072787427626137, | |
| "grad_norm": 1.1608848796138636, | |
| "learning_rate": 3.142859907420615e-05, | |
| "loss": 0.3521, | |
| "num_input_tokens_seen": 82116608, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 2.0794044665012406, | |
| "grad_norm": 1.2207815470871421, | |
| "learning_rate": 3.132801554356303e-05, | |
| "loss": 0.329, | |
| "num_input_tokens_seen": 82378752, | |
| "step": 316 | |
| }, | |
| { | |
| "epoch": 2.086021505376344, | |
| "grad_norm": 1.3338094436906303, | |
| "learning_rate": 3.122732244769041e-05, | |
| "loss": 0.3486, | |
| "num_input_tokens_seen": 82640896, | |
| "step": 317 | |
| }, | |
| { | |
| "epoch": 2.0926385442514475, | |
| "grad_norm": 1.2172300343622242, | |
| "learning_rate": 3.1126521530019874e-05, | |
| "loss": 0.3487, | |
| "num_input_tokens_seen": 82903040, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 2.099255583126551, | |
| "grad_norm": 1.4126997538766228, | |
| "learning_rate": 3.102561453584987e-05, | |
| "loss": 0.3437, | |
| "num_input_tokens_seen": 83165184, | |
| "step": 319 | |
| }, | |
| { | |
| "epoch": 2.1058726220016544, | |
| "grad_norm": 1.1853200893362168, | |
| "learning_rate": 3.092460321231547e-05, | |
| "loss": 0.3278, | |
| "num_input_tokens_seen": 83427328, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 2.112489660876758, | |
| "grad_norm": 1.1715327320798274, | |
| "learning_rate": 3.0823489308358174e-05, | |
| "loss": 0.3086, | |
| "num_input_tokens_seen": 83689472, | |
| "step": 321 | |
| }, | |
| { | |
| "epoch": 2.119106699751861, | |
| "grad_norm": 1.0852203225918198, | |
| "learning_rate": 3.072227457469554e-05, | |
| "loss": 0.3321, | |
| "num_input_tokens_seen": 83951616, | |
| "step": 322 | |
| }, | |
| { | |
| "epoch": 2.1257237386269643, | |
| "grad_norm": 1.1506537447974439, | |
| "learning_rate": 3.062096076379097e-05, | |
| "loss": 0.3396, | |
| "num_input_tokens_seen": 84213760, | |
| "step": 323 | |
| }, | |
| { | |
| "epoch": 2.1323407775020677, | |
| "grad_norm": 1.2893107469955971, | |
| "learning_rate": 3.0519549629823286e-05, | |
| "loss": 0.3243, | |
| "num_input_tokens_seen": 84475904, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 2.138957816377171, | |
| "grad_norm": 1.112482245858434, | |
| "learning_rate": 3.0418042928656414e-05, | |
| "loss": 0.3304, | |
| "num_input_tokens_seen": 84738048, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 2.1455748552522746, | |
| "grad_norm": 1.0505114063288397, | |
| "learning_rate": 3.0316442417808954e-05, | |
| "loss": 0.3289, | |
| "num_input_tokens_seen": 85000192, | |
| "step": 326 | |
| }, | |
| { | |
| "epoch": 2.152191894127378, | |
| "grad_norm": 1.0702654196144072, | |
| "learning_rate": 3.0214749856423745e-05, | |
| "loss": 0.3114, | |
| "num_input_tokens_seen": 85262336, | |
| "step": 327 | |
| }, | |
| { | |
| "epoch": 2.1588089330024816, | |
| "grad_norm": 1.050296411553259, | |
| "learning_rate": 3.0112967005237443e-05, | |
| "loss": 0.3066, | |
| "num_input_tokens_seen": 85524480, | |
| "step": 328 | |
| }, | |
| { | |
| "epoch": 2.1654259718775846, | |
| "grad_norm": 1.0882638825076911, | |
| "learning_rate": 3.0011095626549977e-05, | |
| "loss": 0.3193, | |
| "num_input_tokens_seen": 85786624, | |
| "step": 329 | |
| }, | |
| { | |
| "epoch": 2.172043010752688, | |
| "grad_norm": 1.1623059515415315, | |
| "learning_rate": 2.990913748419411e-05, | |
| "loss": 0.3526, | |
| "num_input_tokens_seen": 86048768, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 2.1786600496277915, | |
| "grad_norm": 1.1908398814663688, | |
| "learning_rate": 2.9807094343504804e-05, | |
| "loss": 0.3518, | |
| "num_input_tokens_seen": 86310912, | |
| "step": 331 | |
| }, | |
| { | |
| "epoch": 2.185277088502895, | |
| "grad_norm": 1.1808598497115437, | |
| "learning_rate": 2.9704967971288755e-05, | |
| "loss": 0.3385, | |
| "num_input_tokens_seen": 86573056, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 2.1918941273779984, | |
| "grad_norm": 1.1384860728455102, | |
| "learning_rate": 2.9602760135793735e-05, | |
| "loss": 0.3228, | |
| "num_input_tokens_seen": 86835200, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 2.198511166253102, | |
| "grad_norm": 1.0916481479985516, | |
| "learning_rate": 2.9500472606677994e-05, | |
| "loss": 0.3118, | |
| "num_input_tokens_seen": 87097344, | |
| "step": 334 | |
| }, | |
| { | |
| "epoch": 2.2051282051282053, | |
| "grad_norm": 0.9878895674767743, | |
| "learning_rate": 2.9398107154979638e-05, | |
| "loss": 0.3293, | |
| "num_input_tokens_seen": 87359488, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 2.2117452440033087, | |
| "grad_norm": 1.1884030862907187, | |
| "learning_rate": 2.9295665553085937e-05, | |
| "loss": 0.3331, | |
| "num_input_tokens_seen": 87621632, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 2.2183622828784118, | |
| "grad_norm": 1.1094941724975913, | |
| "learning_rate": 2.919314957470265e-05, | |
| "loss": 0.322, | |
| "num_input_tokens_seen": 87883776, | |
| "step": 337 | |
| }, | |
| { | |
| "epoch": 2.224979321753515, | |
| "grad_norm": 0.9881499762640955, | |
| "learning_rate": 2.909056099482332e-05, | |
| "loss": 0.3239, | |
| "num_input_tokens_seen": 88145920, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 2.2315963606286187, | |
| "grad_norm": 1.0840977567322172, | |
| "learning_rate": 2.8987901589698517e-05, | |
| "loss": 0.3163, | |
| "num_input_tokens_seen": 88408064, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 2.238213399503722, | |
| "grad_norm": 1.0548483105248472, | |
| "learning_rate": 2.8885173136805127e-05, | |
| "loss": 0.3303, | |
| "num_input_tokens_seen": 88670208, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 2.2448304383788256, | |
| "grad_norm": 1.1561588854960063, | |
| "learning_rate": 2.8782377414815532e-05, | |
| "loss": 0.3327, | |
| "num_input_tokens_seen": 88932352, | |
| "step": 341 | |
| }, | |
| { | |
| "epoch": 2.251447477253929, | |
| "grad_norm": 1.1826177082260094, | |
| "learning_rate": 2.867951620356684e-05, | |
| "loss": 0.3161, | |
| "num_input_tokens_seen": 89194496, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 2.258064516129032, | |
| "grad_norm": 1.0982559371878697, | |
| "learning_rate": 2.8576591284030058e-05, | |
| "loss": 0.3238, | |
| "num_input_tokens_seen": 89456640, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 2.2646815550041355, | |
| "grad_norm": 1.1788800916865765, | |
| "learning_rate": 2.847360443827926e-05, | |
| "loss": 0.3423, | |
| "num_input_tokens_seen": 89718784, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 2.271298593879239, | |
| "grad_norm": 1.1681881241245002, | |
| "learning_rate": 2.837055744946072e-05, | |
| "loss": 0.3479, | |
| "num_input_tokens_seen": 89980928, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 2.2779156327543424, | |
| "grad_norm": 1.150709547987284, | |
| "learning_rate": 2.826745210176207e-05, | |
| "loss": 0.3252, | |
| "num_input_tokens_seen": 90243072, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 2.284532671629446, | |
| "grad_norm": 1.0763555223019872, | |
| "learning_rate": 2.8164290180381365e-05, | |
| "loss": 0.3398, | |
| "num_input_tokens_seen": 90505216, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 2.2911497105045493, | |
| "grad_norm": 1.1937992265302106, | |
| "learning_rate": 2.8061073471496195e-05, | |
| "loss": 0.3397, | |
| "num_input_tokens_seen": 90767360, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 2.2977667493796528, | |
| "grad_norm": 0.9959579454351639, | |
| "learning_rate": 2.795780376223277e-05, | |
| "loss": 0.3506, | |
| "num_input_tokens_seen": 91029504, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 2.304383788254756, | |
| "grad_norm": 1.0702429896944252, | |
| "learning_rate": 2.7854482840634965e-05, | |
| "loss": 0.3208, | |
| "num_input_tokens_seen": 91291648, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 2.311000827129859, | |
| "grad_norm": 1.1450863009989223, | |
| "learning_rate": 2.7751112495633345e-05, | |
| "loss": 0.3313, | |
| "num_input_tokens_seen": 91553792, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 2.3176178660049627, | |
| "grad_norm": 1.0477187731328461, | |
| "learning_rate": 2.764769451701421e-05, | |
| "loss": 0.3298, | |
| "num_input_tokens_seen": 91815936, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 2.324234904880066, | |
| "grad_norm": 1.1549329031215105, | |
| "learning_rate": 2.7544230695388634e-05, | |
| "loss": 0.341, | |
| "num_input_tokens_seen": 92078080, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 2.3308519437551696, | |
| "grad_norm": 1.0366387991827954, | |
| "learning_rate": 2.744072282216139e-05, | |
| "loss": 0.3425, | |
| "num_input_tokens_seen": 92340224, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 2.337468982630273, | |
| "grad_norm": 1.1817424289487461, | |
| "learning_rate": 2.73371726895e-05, | |
| "loss": 0.3409, | |
| "num_input_tokens_seen": 92602368, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 2.3440860215053765, | |
| "grad_norm": 1.0301874196493457, | |
| "learning_rate": 2.7233582090303674e-05, | |
| "loss": 0.3229, | |
| "num_input_tokens_seen": 92864512, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 2.3507030603804795, | |
| "grad_norm": 1.0646554418765033, | |
| "learning_rate": 2.7129952818172256e-05, | |
| "loss": 0.3344, | |
| "num_input_tokens_seen": 93126656, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 2.357320099255583, | |
| "grad_norm": 1.084404941345572, | |
| "learning_rate": 2.702628666737521e-05, | |
| "loss": 0.3254, | |
| "num_input_tokens_seen": 93388800, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 2.3639371381306864, | |
| "grad_norm": 1.065831371281375, | |
| "learning_rate": 2.692258543282052e-05, | |
| "loss": 0.3211, | |
| "num_input_tokens_seen": 93650944, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 2.37055417700579, | |
| "grad_norm": 1.153320273204117, | |
| "learning_rate": 2.68188509100236e-05, | |
| "loss": 0.3374, | |
| "num_input_tokens_seen": 93913088, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 2.3771712158808933, | |
| "grad_norm": 1.170047331616093, | |
| "learning_rate": 2.671508489507626e-05, | |
| "loss": 0.3478, | |
| "num_input_tokens_seen": 94175232, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 2.3837882547559968, | |
| "grad_norm": 1.1329869776410735, | |
| "learning_rate": 2.6611289184615558e-05, | |
| "loss": 0.3347, | |
| "num_input_tokens_seen": 94437376, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 2.3904052936311, | |
| "grad_norm": 1.0908955007085368, | |
| "learning_rate": 2.6507465575792707e-05, | |
| "loss": 0.3341, | |
| "num_input_tokens_seen": 94699520, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 2.3970223325062037, | |
| "grad_norm": 1.0929346197114131, | |
| "learning_rate": 2.6403615866241964e-05, | |
| "loss": 0.3492, | |
| "num_input_tokens_seen": 94961664, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 2.4036393713813067, | |
| "grad_norm": 1.142990047990031, | |
| "learning_rate": 2.6299741854049508e-05, | |
| "loss": 0.334, | |
| "num_input_tokens_seen": 95223808, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 2.41025641025641, | |
| "grad_norm": 1.1552852832233795, | |
| "learning_rate": 2.6195845337722303e-05, | |
| "loss": 0.3299, | |
| "num_input_tokens_seen": 95485952, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 2.4168734491315136, | |
| "grad_norm": 1.111643861242986, | |
| "learning_rate": 2.6091928116156938e-05, | |
| "loss": 0.3278, | |
| "num_input_tokens_seen": 95748096, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 2.423490488006617, | |
| "grad_norm": 1.1676117715035266, | |
| "learning_rate": 2.598799198860853e-05, | |
| "loss": 0.3355, | |
| "num_input_tokens_seen": 96010240, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 2.4301075268817205, | |
| "grad_norm": 1.1044739988434005, | |
| "learning_rate": 2.5884038754659497e-05, | |
| "loss": 0.3475, | |
| "num_input_tokens_seen": 96272384, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 2.436724565756824, | |
| "grad_norm": 1.0768350723016058, | |
| "learning_rate": 2.5780070214188478e-05, | |
| "loss": 0.3257, | |
| "num_input_tokens_seen": 96534528, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 2.4433416046319274, | |
| "grad_norm": 1.0381793723404524, | |
| "learning_rate": 2.5676088167339128e-05, | |
| "loss": 0.3318, | |
| "num_input_tokens_seen": 96796672, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 2.4499586435070304, | |
| "grad_norm": 1.2240143879709446, | |
| "learning_rate": 2.5572094414488944e-05, | |
| "loss": 0.3424, | |
| "num_input_tokens_seen": 97058816, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 2.456575682382134, | |
| "grad_norm": 1.0385118889958669, | |
| "learning_rate": 2.5468090756218117e-05, | |
| "loss": 0.3353, | |
| "num_input_tokens_seen": 97320960, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 2.4631927212572373, | |
| "grad_norm": 1.1995150658571436, | |
| "learning_rate": 2.5364078993278335e-05, | |
| "loss": 0.3345, | |
| "num_input_tokens_seen": 97583104, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 2.4698097601323408, | |
| "grad_norm": 1.1417927976011224, | |
| "learning_rate": 2.526006092656161e-05, | |
| "loss": 0.3467, | |
| "num_input_tokens_seen": 97845248, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 2.4764267990074442, | |
| "grad_norm": 1.11091502135598, | |
| "learning_rate": 2.5156038357069106e-05, | |
| "loss": 0.3378, | |
| "num_input_tokens_seen": 98107392, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 2.4830438378825477, | |
| "grad_norm": 1.1198492983330697, | |
| "learning_rate": 2.5052013085879955e-05, | |
| "loss": 0.3411, | |
| "num_input_tokens_seen": 98369536, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 2.489660876757651, | |
| "grad_norm": 1.0507799282706884, | |
| "learning_rate": 2.494798691412005e-05, | |
| "loss": 0.3356, | |
| "num_input_tokens_seen": 98631680, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 2.4962779156327546, | |
| "grad_norm": 1.104596832922824, | |
| "learning_rate": 2.4843961642930904e-05, | |
| "loss": 0.3303, | |
| "num_input_tokens_seen": 98893824, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 2.5028949545078576, | |
| "grad_norm": 1.0982025410563003, | |
| "learning_rate": 2.4739939073438397e-05, | |
| "loss": 0.3212, | |
| "num_input_tokens_seen": 99155968, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 2.509511993382961, | |
| "grad_norm": 1.0765071778760118, | |
| "learning_rate": 2.463592100672168e-05, | |
| "loss": 0.3335, | |
| "num_input_tokens_seen": 99418112, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 2.5161290322580645, | |
| "grad_norm": 1.0693943579256004, | |
| "learning_rate": 2.4531909243781885e-05, | |
| "loss": 0.3263, | |
| "num_input_tokens_seen": 99680256, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 2.522746071133168, | |
| "grad_norm": 1.078185931899051, | |
| "learning_rate": 2.442790558551106e-05, | |
| "loss": 0.3363, | |
| "num_input_tokens_seen": 99942400, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 2.5293631100082714, | |
| "grad_norm": 1.0519087719201188, | |
| "learning_rate": 2.4323911832660874e-05, | |
| "loss": 0.339, | |
| "num_input_tokens_seen": 100204544, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 2.5359801488833744, | |
| "grad_norm": 1.0506382318529084, | |
| "learning_rate": 2.4219929785811518e-05, | |
| "loss": 0.3359, | |
| "num_input_tokens_seen": 100466688, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 2.542597187758478, | |
| "grad_norm": 1.0113516063929346, | |
| "learning_rate": 2.4115961245340505e-05, | |
| "loss": 0.3379, | |
| "num_input_tokens_seen": 100728832, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 2.5492142266335813, | |
| "grad_norm": 1.0637623036996438, | |
| "learning_rate": 2.401200801139148e-05, | |
| "loss": 0.3507, | |
| "num_input_tokens_seen": 100990976, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 2.555831265508685, | |
| "grad_norm": 1.1501956408026655, | |
| "learning_rate": 2.3908071883843068e-05, | |
| "loss": 0.3411, | |
| "num_input_tokens_seen": 101253120, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 2.5624483043837882, | |
| "grad_norm": 1.1150126958290971, | |
| "learning_rate": 2.38041546622777e-05, | |
| "loss": 0.3366, | |
| "num_input_tokens_seen": 101515264, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 2.5690653432588917, | |
| "grad_norm": 1.073213710305425, | |
| "learning_rate": 2.3700258145950495e-05, | |
| "loss": 0.3656, | |
| "num_input_tokens_seen": 101777408, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 2.575682382133995, | |
| "grad_norm": 1.078732276630299, | |
| "learning_rate": 2.359638413375804e-05, | |
| "loss": 0.3423, | |
| "num_input_tokens_seen": 102039552, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 2.5822994210090986, | |
| "grad_norm": 1.0226452813564892, | |
| "learning_rate": 2.349253442420731e-05, | |
| "loss": 0.3217, | |
| "num_input_tokens_seen": 102301696, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 2.588916459884202, | |
| "grad_norm": 1.0479991409872327, | |
| "learning_rate": 2.338871081538445e-05, | |
| "loss": 0.3174, | |
| "num_input_tokens_seen": 102563840, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 2.595533498759305, | |
| "grad_norm": 1.0103696849600996, | |
| "learning_rate": 2.3284915104923752e-05, | |
| "loss": 0.3346, | |
| "num_input_tokens_seen": 102825984, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 2.6021505376344085, | |
| "grad_norm": 1.161868760445816, | |
| "learning_rate": 2.3181149089976405e-05, | |
| "loss": 0.3444, | |
| "num_input_tokens_seen": 103088128, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 2.608767576509512, | |
| "grad_norm": 1.0926396656803976, | |
| "learning_rate": 2.3077414567179494e-05, | |
| "loss": 0.3348, | |
| "num_input_tokens_seen": 103350272, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 2.6153846153846154, | |
| "grad_norm": 1.1576399191571771, | |
| "learning_rate": 2.2973713332624796e-05, | |
| "loss": 0.3288, | |
| "num_input_tokens_seen": 103612416, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 2.622001654259719, | |
| "grad_norm": 0.9917185104236572, | |
| "learning_rate": 2.2870047181827743e-05, | |
| "loss": 0.3328, | |
| "num_input_tokens_seen": 103874560, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 2.6286186931348223, | |
| "grad_norm": 1.037992774395136, | |
| "learning_rate": 2.2766417909696332e-05, | |
| "loss": 0.3554, | |
| "num_input_tokens_seen": 104136704, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 2.6352357320099253, | |
| "grad_norm": 1.0486140714606227, | |
| "learning_rate": 2.2662827310499995e-05, | |
| "loss": 0.3296, | |
| "num_input_tokens_seen": 104398848, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 2.641852770885029, | |
| "grad_norm": 1.0447686801537046, | |
| "learning_rate": 2.2559277177838612e-05, | |
| "loss": 0.3262, | |
| "num_input_tokens_seen": 104660992, | |
| "step": 401 | |
| }, | |
| { | |
| "epoch": 2.6484698097601322, | |
| "grad_norm": 1.0165801431956982, | |
| "learning_rate": 2.2455769304611365e-05, | |
| "loss": 0.3453, | |
| "num_input_tokens_seen": 104923136, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 2.6550868486352357, | |
| "grad_norm": 0.9788796226467195, | |
| "learning_rate": 2.2352305482985793e-05, | |
| "loss": 0.3343, | |
| "num_input_tokens_seen": 105185280, | |
| "step": 403 | |
| }, | |
| { | |
| "epoch": 2.661703887510339, | |
| "grad_norm": 1.1239336553275057, | |
| "learning_rate": 2.224888750436666e-05, | |
| "loss": 0.3231, | |
| "num_input_tokens_seen": 105447424, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 2.6683209263854426, | |
| "grad_norm": 1.0832025698438201, | |
| "learning_rate": 2.2145517159365044e-05, | |
| "loss": 0.3412, | |
| "num_input_tokens_seen": 105709568, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 2.674937965260546, | |
| "grad_norm": 1.1361074407484597, | |
| "learning_rate": 2.2042196237767233e-05, | |
| "loss": 0.3411, | |
| "num_input_tokens_seen": 105971712, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 2.6815550041356495, | |
| "grad_norm": 1.1514517127780615, | |
| "learning_rate": 2.1938926528503807e-05, | |
| "loss": 0.3538, | |
| "num_input_tokens_seen": 106233856, | |
| "step": 407 | |
| }, | |
| { | |
| "epoch": 2.688172043010753, | |
| "grad_norm": 1.1204288030113607, | |
| "learning_rate": 2.183570981961864e-05, | |
| "loss": 0.3286, | |
| "num_input_tokens_seen": 106496000, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 2.694789081885856, | |
| "grad_norm": 1.073945290517408, | |
| "learning_rate": 2.173254789823794e-05, | |
| "loss": 0.3388, | |
| "num_input_tokens_seen": 106758144, | |
| "step": 409 | |
| }, | |
| { | |
| "epoch": 2.7014061207609594, | |
| "grad_norm": 1.1488472446305815, | |
| "learning_rate": 2.162944255053928e-05, | |
| "loss": 0.3408, | |
| "num_input_tokens_seen": 107020288, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 2.708023159636063, | |
| "grad_norm": 1.0097780396990714, | |
| "learning_rate": 2.1526395561720742e-05, | |
| "loss": 0.3385, | |
| "num_input_tokens_seen": 107282432, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 2.7146401985111663, | |
| "grad_norm": 1.159742866337566, | |
| "learning_rate": 2.1423408715969948e-05, | |
| "loss": 0.353, | |
| "num_input_tokens_seen": 107544576, | |
| "step": 412 | |
| }, | |
| { | |
| "epoch": 2.72125723738627, | |
| "grad_norm": 1.0399730947948516, | |
| "learning_rate": 2.132048379643316e-05, | |
| "loss": 0.3249, | |
| "num_input_tokens_seen": 107806720, | |
| "step": 413 | |
| }, | |
| { | |
| "epoch": 2.727874276261373, | |
| "grad_norm": 1.0290808670104754, | |
| "learning_rate": 2.1217622585184474e-05, | |
| "loss": 0.3175, | |
| "num_input_tokens_seen": 108068864, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 2.7344913151364763, | |
| "grad_norm": 1.0570208529977188, | |
| "learning_rate": 2.1114826863194882e-05, | |
| "loss": 0.3425, | |
| "num_input_tokens_seen": 108331008, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 2.7411083540115797, | |
| "grad_norm": 1.071729013559576, | |
| "learning_rate": 2.101209841030149e-05, | |
| "loss": 0.3336, | |
| "num_input_tokens_seen": 108593152, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 2.747725392886683, | |
| "grad_norm": 1.114457915850528, | |
| "learning_rate": 2.090943900517669e-05, | |
| "loss": 0.3207, | |
| "num_input_tokens_seen": 108855296, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 2.7543424317617866, | |
| "grad_norm": 1.077218628216003, | |
| "learning_rate": 2.0806850425297363e-05, | |
| "loss": 0.3393, | |
| "num_input_tokens_seen": 109117440, | |
| "step": 418 | |
| }, | |
| { | |
| "epoch": 2.76095947063689, | |
| "grad_norm": 1.1199612602141782, | |
| "learning_rate": 2.070433444691407e-05, | |
| "loss": 0.3207, | |
| "num_input_tokens_seen": 109379584, | |
| "step": 419 | |
| }, | |
| { | |
| "epoch": 2.7675765095119935, | |
| "grad_norm": 1.1174362860816687, | |
| "learning_rate": 2.060189284502037e-05, | |
| "loss": 0.3374, | |
| "num_input_tokens_seen": 109641728, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 2.774193548387097, | |
| "grad_norm": 1.1287911544921245, | |
| "learning_rate": 2.049952739332201e-05, | |
| "loss": 0.3191, | |
| "num_input_tokens_seen": 109903872, | |
| "step": 421 | |
| }, | |
| { | |
| "epoch": 2.7808105872622004, | |
| "grad_norm": 1.0450302695675435, | |
| "learning_rate": 2.039723986420628e-05, | |
| "loss": 0.3559, | |
| "num_input_tokens_seen": 110166016, | |
| "step": 422 | |
| }, | |
| { | |
| "epoch": 2.7874276261373034, | |
| "grad_norm": 1.09368946884029, | |
| "learning_rate": 2.029503202871125e-05, | |
| "loss": 0.3413, | |
| "num_input_tokens_seen": 110428160, | |
| "step": 423 | |
| }, | |
| { | |
| "epoch": 2.794044665012407, | |
| "grad_norm": 1.022545929344771, | |
| "learning_rate": 2.01929056564952e-05, | |
| "loss": 0.3448, | |
| "num_input_tokens_seen": 110690304, | |
| "step": 424 | |
| }, | |
| { | |
| "epoch": 2.8006617038875103, | |
| "grad_norm": 1.030074482926079, | |
| "learning_rate": 2.0090862515805898e-05, | |
| "loss": 0.3623, | |
| "num_input_tokens_seen": 110952448, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 2.807278742762614, | |
| "grad_norm": 1.109666478372538, | |
| "learning_rate": 1.9988904373450018e-05, | |
| "loss": 0.3223, | |
| "num_input_tokens_seen": 111214592, | |
| "step": 426 | |
| }, | |
| { | |
| "epoch": 2.8138957816377173, | |
| "grad_norm": 1.0575561897801167, | |
| "learning_rate": 1.9887032994762563e-05, | |
| "loss": 0.347, | |
| "num_input_tokens_seen": 111476736, | |
| "step": 427 | |
| }, | |
| { | |
| "epoch": 2.8205128205128203, | |
| "grad_norm": 1.0648123386835917, | |
| "learning_rate": 1.9785250143576254e-05, | |
| "loss": 0.3375, | |
| "num_input_tokens_seen": 111738880, | |
| "step": 428 | |
| }, | |
| { | |
| "epoch": 2.8271298593879237, | |
| "grad_norm": 1.1050336868555022, | |
| "learning_rate": 1.9683557582191055e-05, | |
| "loss": 0.3239, | |
| "num_input_tokens_seen": 112001024, | |
| "step": 429 | |
| }, | |
| { | |
| "epoch": 2.833746898263027, | |
| "grad_norm": 1.0359983663485797, | |
| "learning_rate": 1.9581957071343592e-05, | |
| "loss": 0.3335, | |
| "num_input_tokens_seen": 112263168, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 2.8403639371381306, | |
| "grad_norm": 1.0414734239060488, | |
| "learning_rate": 1.9480450370176726e-05, | |
| "loss": 0.333, | |
| "num_input_tokens_seen": 112525312, | |
| "step": 431 | |
| }, | |
| { | |
| "epoch": 2.846980976013234, | |
| "grad_norm": 1.0382890141371446, | |
| "learning_rate": 1.9379039236209037e-05, | |
| "loss": 0.3319, | |
| "num_input_tokens_seen": 112787456, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 2.8535980148883375, | |
| "grad_norm": 1.0013368189447842, | |
| "learning_rate": 1.9277725425304467e-05, | |
| "loss": 0.331, | |
| "num_input_tokens_seen": 113049600, | |
| "step": 433 | |
| }, | |
| { | |
| "epoch": 2.860215053763441, | |
| "grad_norm": 1.0179396934960452, | |
| "learning_rate": 1.917651069164183e-05, | |
| "loss": 0.3245, | |
| "num_input_tokens_seen": 113311744, | |
| "step": 434 | |
| }, | |
| { | |
| "epoch": 2.8668320926385444, | |
| "grad_norm": 0.9928696786268584, | |
| "learning_rate": 1.9075396787684533e-05, | |
| "loss": 0.3347, | |
| "num_input_tokens_seen": 113573888, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 2.873449131513648, | |
| "grad_norm": 1.046777848590959, | |
| "learning_rate": 1.8974385464150136e-05, | |
| "loss": 0.3238, | |
| "num_input_tokens_seen": 113836032, | |
| "step": 436 | |
| }, | |
| { | |
| "epoch": 2.880066170388751, | |
| "grad_norm": 1.0405751720587308, | |
| "learning_rate": 1.8873478469980125e-05, | |
| "loss": 0.3457, | |
| "num_input_tokens_seen": 114098176, | |
| "step": 437 | |
| }, | |
| { | |
| "epoch": 2.8866832092638544, | |
| "grad_norm": 1.122608330877757, | |
| "learning_rate": 1.8772677552309596e-05, | |
| "loss": 0.3412, | |
| "num_input_tokens_seen": 114360320, | |
| "step": 438 | |
| }, | |
| { | |
| "epoch": 2.893300248138958, | |
| "grad_norm": 1.0442286190838361, | |
| "learning_rate": 1.8671984456436968e-05, | |
| "loss": 0.3496, | |
| "num_input_tokens_seen": 114622464, | |
| "step": 439 | |
| }, | |
| { | |
| "epoch": 2.8999172870140613, | |
| "grad_norm": 1.0227002785782815, | |
| "learning_rate": 1.8571400925793855e-05, | |
| "loss": 0.3397, | |
| "num_input_tokens_seen": 114884608, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 2.9065343258891647, | |
| "grad_norm": 1.0156032484754163, | |
| "learning_rate": 1.8470928701914763e-05, | |
| "loss": 0.3191, | |
| "num_input_tokens_seen": 115146752, | |
| "step": 441 | |
| }, | |
| { | |
| "epoch": 2.9131513647642677, | |
| "grad_norm": 1.1068895653021453, | |
| "learning_rate": 1.8370569524407054e-05, | |
| "loss": 0.323, | |
| "num_input_tokens_seen": 115408896, | |
| "step": 442 | |
| }, | |
| { | |
| "epoch": 2.919768403639371, | |
| "grad_norm": 0.9691703910653604, | |
| "learning_rate": 1.8270325130920728e-05, | |
| "loss": 0.3116, | |
| "num_input_tokens_seen": 115671040, | |
| "step": 443 | |
| }, | |
| { | |
| "epoch": 2.9263854425144746, | |
| "grad_norm": 0.9826661649635426, | |
| "learning_rate": 1.817019725711841e-05, | |
| "loss": 0.3324, | |
| "num_input_tokens_seen": 115933184, | |
| "step": 444 | |
| }, | |
| { | |
| "epoch": 2.933002481389578, | |
| "grad_norm": 1.0159017429748878, | |
| "learning_rate": 1.807018763664524e-05, | |
| "loss": 0.328, | |
| "num_input_tokens_seen": 116195328, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 2.9396195202646815, | |
| "grad_norm": 1.0737382354639178, | |
| "learning_rate": 1.7970298001098897e-05, | |
| "loss": 0.3114, | |
| "num_input_tokens_seen": 116457472, | |
| "step": 446 | |
| }, | |
| { | |
| "epoch": 2.946236559139785, | |
| "grad_norm": 1.0066108381286578, | |
| "learning_rate": 1.787053007999961e-05, | |
| "loss": 0.3332, | |
| "num_input_tokens_seen": 116719616, | |
| "step": 447 | |
| }, | |
| { | |
| "epoch": 2.9528535980148884, | |
| "grad_norm": 1.0680776988030307, | |
| "learning_rate": 1.7770885600760183e-05, | |
| "loss": 0.3467, | |
| "num_input_tokens_seen": 116981760, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 2.959470636889992, | |
| "grad_norm": 1.0349807857034838, | |
| "learning_rate": 1.7671366288656152e-05, | |
| "loss": 0.3241, | |
| "num_input_tokens_seen": 117243904, | |
| "step": 449 | |
| }, | |
| { | |
| "epoch": 2.9660876757650954, | |
| "grad_norm": 1.0388721618627996, | |
| "learning_rate": 1.7571973866795815e-05, | |
| "loss": 0.3292, | |
| "num_input_tokens_seen": 117506048, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 2.9727047146401984, | |
| "grad_norm": 1.0181955179089524, | |
| "learning_rate": 1.7472710056090502e-05, | |
| "loss": 0.3499, | |
| "num_input_tokens_seen": 117768192, | |
| "step": 451 | |
| }, | |
| { | |
| "epoch": 2.979321753515302, | |
| "grad_norm": 1.0554960164473584, | |
| "learning_rate": 1.7373576575224686e-05, | |
| "loss": 0.3331, | |
| "num_input_tokens_seen": 118030336, | |
| "step": 452 | |
| }, | |
| { | |
| "epoch": 2.9859387923904053, | |
| "grad_norm": 1.0744246028347093, | |
| "learning_rate": 1.7274575140626318e-05, | |
| "loss": 0.3239, | |
| "num_input_tokens_seen": 118292480, | |
| "step": 453 | |
| }, | |
| { | |
| "epoch": 2.9925558312655087, | |
| "grad_norm": 1.0034573899878152, | |
| "learning_rate": 1.7175707466437007e-05, | |
| "loss": 0.3298, | |
| "num_input_tokens_seen": 118554624, | |
| "step": 454 | |
| }, | |
| { | |
| "epoch": 2.999172870140612, | |
| "grad_norm": 1.0586016785419656, | |
| "learning_rate": 1.7076975264482434e-05, | |
| "loss": 0.3279, | |
| "num_input_tokens_seen": 118816768, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 1.0586016785419656, | |
| "learning_rate": 1.6978380244242647e-05, | |
| "loss": 0.3046, | |
| "num_input_tokens_seen": 118849536, | |
| "step": 456 | |
| }, | |
| { | |
| "epoch": 3.0066170388751035, | |
| "grad_norm": 2.6288675198935905, | |
| "learning_rate": 1.6879924112822505e-05, | |
| "loss": 0.1025, | |
| "num_input_tokens_seen": 119111680, | |
| "step": 457 | |
| }, | |
| { | |
| "epoch": 3.013234077750207, | |
| "grad_norm": 1.4229544470512288, | |
| "learning_rate": 1.678160857492207e-05, | |
| "loss": 0.1027, | |
| "num_input_tokens_seen": 119373824, | |
| "step": 458 | |
| }, | |
| { | |
| "epoch": 3.0198511166253104, | |
| "grad_norm": 1.2481296003332036, | |
| "learning_rate": 1.6683435332807172e-05, | |
| "loss": 0.098, | |
| "num_input_tokens_seen": 119635968, | |
| "step": 459 | |
| }, | |
| { | |
| "epoch": 3.0264681555004134, | |
| "grad_norm": 0.8805105822487416, | |
| "learning_rate": 1.658540608627985e-05, | |
| "loss": 0.0876, | |
| "num_input_tokens_seen": 119898112, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 3.033085194375517, | |
| "grad_norm": 0.828690671159354, | |
| "learning_rate": 1.6487522532648962e-05, | |
| "loss": 0.0931, | |
| "num_input_tokens_seen": 120160256, | |
| "step": 461 | |
| }, | |
| { | |
| "epoch": 3.0397022332506203, | |
| "grad_norm": 1.0491253577091149, | |
| "learning_rate": 1.6389786366700836e-05, | |
| "loss": 0.0841, | |
| "num_input_tokens_seen": 120422400, | |
| "step": 462 | |
| }, | |
| { | |
| "epoch": 3.0463192721257237, | |
| "grad_norm": 1.006633637656072, | |
| "learning_rate": 1.629219928066982e-05, | |
| "loss": 0.0856, | |
| "num_input_tokens_seen": 120684544, | |
| "step": 463 | |
| }, | |
| { | |
| "epoch": 3.052936311000827, | |
| "grad_norm": 1.3229784618102733, | |
| "learning_rate": 1.619476296420909e-05, | |
| "loss": 0.0999, | |
| "num_input_tokens_seen": 120946688, | |
| "step": 464 | |
| }, | |
| { | |
| "epoch": 3.0595533498759306, | |
| "grad_norm": 1.3217791698144548, | |
| "learning_rate": 1.6097479104361326e-05, | |
| "loss": 0.0913, | |
| "num_input_tokens_seen": 121208832, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 3.066170388751034, | |
| "grad_norm": 1.0018438706002273, | |
| "learning_rate": 1.6000349385529557e-05, | |
| "loss": 0.0839, | |
| "num_input_tokens_seen": 121470976, | |
| "step": 466 | |
| }, | |
| { | |
| "epoch": 3.072787427626137, | |
| "grad_norm": 0.8813830494513232, | |
| "learning_rate": 1.5903375489447925e-05, | |
| "loss": 0.0928, | |
| "num_input_tokens_seen": 121733120, | |
| "step": 467 | |
| }, | |
| { | |
| "epoch": 3.0794044665012406, | |
| "grad_norm": 0.7791960697591414, | |
| "learning_rate": 1.5806559095152652e-05, | |
| "loss": 0.0805, | |
| "num_input_tokens_seen": 121995264, | |
| "step": 468 | |
| }, | |
| { | |
| "epoch": 3.086021505376344, | |
| "grad_norm": 0.7326718702522317, | |
| "learning_rate": 1.570990187895289e-05, | |
| "loss": 0.0823, | |
| "num_input_tokens_seen": 122257408, | |
| "step": 469 | |
| }, | |
| { | |
| "epoch": 3.0926385442514475, | |
| "grad_norm": 0.8396403661003932, | |
| "learning_rate": 1.561340551440176e-05, | |
| "loss": 0.0862, | |
| "num_input_tokens_seen": 122519552, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 3.099255583126551, | |
| "grad_norm": 0.813187861447653, | |
| "learning_rate": 1.5517071672267314e-05, | |
| "loss": 0.0881, | |
| "num_input_tokens_seen": 122781696, | |
| "step": 471 | |
| }, | |
| { | |
| "epoch": 3.1058726220016544, | |
| "grad_norm": 0.7573337922842879, | |
| "learning_rate": 1.542090202050368e-05, | |
| "loss": 0.0835, | |
| "num_input_tokens_seen": 123043840, | |
| "step": 472 | |
| }, | |
| { | |
| "epoch": 3.112489660876758, | |
| "grad_norm": 0.6833827588765713, | |
| "learning_rate": 1.532489822422211e-05, | |
| "loss": 0.084, | |
| "num_input_tokens_seen": 123305984, | |
| "step": 473 | |
| }, | |
| { | |
| "epoch": 3.119106699751861, | |
| "grad_norm": 0.7934273146364325, | |
| "learning_rate": 1.5229061945662198e-05, | |
| "loss": 0.079, | |
| "num_input_tokens_seen": 123568128, | |
| "step": 474 | |
| }, | |
| { | |
| "epoch": 3.1257237386269643, | |
| "grad_norm": 0.7555918786299218, | |
| "learning_rate": 1.5133394844163093e-05, | |
| "loss": 0.0767, | |
| "num_input_tokens_seen": 123830272, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 3.1323407775020677, | |
| "grad_norm": 0.7883685737277187, | |
| "learning_rate": 1.503789857613473e-05, | |
| "loss": 0.0854, | |
| "num_input_tokens_seen": 124092416, | |
| "step": 476 | |
| }, | |
| { | |
| "epoch": 3.138957816377171, | |
| "grad_norm": 0.7705225890710093, | |
| "learning_rate": 1.4942574795029213e-05, | |
| "loss": 0.0748, | |
| "num_input_tokens_seen": 124354560, | |
| "step": 477 | |
| }, | |
| { | |
| "epoch": 3.1455748552522746, | |
| "grad_norm": 0.7409548352905246, | |
| "learning_rate": 1.4847425151312127e-05, | |
| "loss": 0.0728, | |
| "num_input_tokens_seen": 124616704, | |
| "step": 478 | |
| }, | |
| { | |
| "epoch": 3.152191894127378, | |
| "grad_norm": 0.732934471932393, | |
| "learning_rate": 1.4752451292434016e-05, | |
| "loss": 0.0782, | |
| "num_input_tokens_seen": 124878848, | |
| "step": 479 | |
| }, | |
| { | |
| "epoch": 3.1588089330024816, | |
| "grad_norm": 0.7587629849721698, | |
| "learning_rate": 1.4657654862801798e-05, | |
| "loss": 0.0758, | |
| "num_input_tokens_seen": 125140992, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 3.1654259718775846, | |
| "grad_norm": 0.7303711757854935, | |
| "learning_rate": 1.4563037503750366e-05, | |
| "loss": 0.0757, | |
| "num_input_tokens_seen": 125403136, | |
| "step": 481 | |
| }, | |
| { | |
| "epoch": 3.172043010752688, | |
| "grad_norm": 0.7477005165930065, | |
| "learning_rate": 1.446860085351407e-05, | |
| "loss": 0.0798, | |
| "num_input_tokens_seen": 125665280, | |
| "step": 482 | |
| }, | |
| { | |
| "epoch": 3.1786600496277915, | |
| "grad_norm": 0.6628074364177822, | |
| "learning_rate": 1.4374346547198486e-05, | |
| "loss": 0.0741, | |
| "num_input_tokens_seen": 125927424, | |
| "step": 483 | |
| }, | |
| { | |
| "epoch": 3.185277088502895, | |
| "grad_norm": 0.6904627228112755, | |
| "learning_rate": 1.4280276216751956e-05, | |
| "loss": 0.0767, | |
| "num_input_tokens_seen": 126189568, | |
| "step": 484 | |
| }, | |
| { | |
| "epoch": 3.1918941273779984, | |
| "grad_norm": 0.7078389751667725, | |
| "learning_rate": 1.4186391490937481e-05, | |
| "loss": 0.0738, | |
| "num_input_tokens_seen": 126451712, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 3.198511166253102, | |
| "grad_norm": 0.688986245366018, | |
| "learning_rate": 1.4092693995304368e-05, | |
| "loss": 0.0765, | |
| "num_input_tokens_seen": 126713856, | |
| "step": 486 | |
| }, | |
| { | |
| "epoch": 3.2051282051282053, | |
| "grad_norm": 0.7409087917501026, | |
| "learning_rate": 1.3999185352160231e-05, | |
| "loss": 0.0768, | |
| "num_input_tokens_seen": 126976000, | |
| "step": 487 | |
| }, | |
| { | |
| "epoch": 3.2117452440033087, | |
| "grad_norm": 0.712307910459613, | |
| "learning_rate": 1.3905867180542803e-05, | |
| "loss": 0.0795, | |
| "num_input_tokens_seen": 127238144, | |
| "step": 488 | |
| }, | |
| { | |
| "epoch": 3.2183622828784118, | |
| "grad_norm": 0.700878195423877, | |
| "learning_rate": 1.3812741096191905e-05, | |
| "loss": 0.0822, | |
| "num_input_tokens_seen": 127500288, | |
| "step": 489 | |
| }, | |
| { | |
| "epoch": 3.224979321753515, | |
| "grad_norm": 0.7268693202788183, | |
| "learning_rate": 1.3719808711521573e-05, | |
| "loss": 0.0732, | |
| "num_input_tokens_seen": 127762432, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 3.2315963606286187, | |
| "grad_norm": 0.7051720495475905, | |
| "learning_rate": 1.3627071635591976e-05, | |
| "loss": 0.0693, | |
| "num_input_tokens_seen": 128024576, | |
| "step": 491 | |
| }, | |
| { | |
| "epoch": 3.238213399503722, | |
| "grad_norm": 0.7075601778394868, | |
| "learning_rate": 1.353453147408174e-05, | |
| "loss": 0.0752, | |
| "num_input_tokens_seen": 128286720, | |
| "step": 492 | |
| }, | |
| { | |
| "epoch": 3.2448304383788256, | |
| "grad_norm": 0.6983282572573932, | |
| "learning_rate": 1.344218982925996e-05, | |
| "loss": 0.0743, | |
| "num_input_tokens_seen": 128548864, | |
| "step": 493 | |
| }, | |
| { | |
| "epoch": 3.251447477253929, | |
| "grad_norm": 0.6787490400858531, | |
| "learning_rate": 1.3350048299958645e-05, | |
| "loss": 0.0723, | |
| "num_input_tokens_seen": 128811008, | |
| "step": 494 | |
| }, | |
| { | |
| "epoch": 3.258064516129032, | |
| "grad_norm": 0.6780173788911718, | |
| "learning_rate": 1.3258108481544849e-05, | |
| "loss": 0.07, | |
| "num_input_tokens_seen": 129073152, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 3.2646815550041355, | |
| "grad_norm": 0.6978698177594762, | |
| "learning_rate": 1.3166371965893226e-05, | |
| "loss": 0.0746, | |
| "num_input_tokens_seen": 129335296, | |
| "step": 496 | |
| }, | |
| { | |
| "epoch": 3.271298593879239, | |
| "grad_norm": 0.693822546631228, | |
| "learning_rate": 1.3074840341358318e-05, | |
| "loss": 0.0754, | |
| "num_input_tokens_seen": 129597440, | |
| "step": 497 | |
| }, | |
| { | |
| "epoch": 3.2779156327543424, | |
| "grad_norm": 0.6691956665865821, | |
| "learning_rate": 1.2983515192747153e-05, | |
| "loss": 0.0743, | |
| "num_input_tokens_seen": 129859584, | |
| "step": 498 | |
| }, | |
| { | |
| "epoch": 3.284532671629446, | |
| "grad_norm": 0.6867164097827277, | |
| "learning_rate": 1.2892398101291759e-05, | |
| "loss": 0.0679, | |
| "num_input_tokens_seen": 130121728, | |
| "step": 499 | |
| }, | |
| { | |
| "epoch": 3.2911497105045493, | |
| "grad_norm": 0.6431526093398369, | |
| "learning_rate": 1.2801490644621789e-05, | |
| "loss": 0.0668, | |
| "num_input_tokens_seen": 130383872, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 3.2977667493796528, | |
| "grad_norm": 0.6685555901467412, | |
| "learning_rate": 1.271079439673726e-05, | |
| "loss": 0.0718, | |
| "num_input_tokens_seen": 130646016, | |
| "step": 501 | |
| }, | |
| { | |
| "epoch": 3.304383788254756, | |
| "grad_norm": 0.682140794722891, | |
| "learning_rate": 1.2620310927981176e-05, | |
| "loss": 0.0721, | |
| "num_input_tokens_seen": 130908160, | |
| "step": 502 | |
| }, | |
| { | |
| "epoch": 3.311000827129859, | |
| "grad_norm": 0.6387878739981501, | |
| "learning_rate": 1.2530041805012504e-05, | |
| "loss": 0.068, | |
| "num_input_tokens_seen": 131170304, | |
| "step": 503 | |
| }, | |
| { | |
| "epoch": 3.3176178660049627, | |
| "grad_norm": 0.6757177577212953, | |
| "learning_rate": 1.2439988590778872e-05, | |
| "loss": 0.0763, | |
| "num_input_tokens_seen": 131432448, | |
| "step": 504 | |
| }, | |
| { | |
| "epoch": 3.324234904880066, | |
| "grad_norm": 0.6907225144982705, | |
| "learning_rate": 1.235015284448969e-05, | |
| "loss": 0.0743, | |
| "num_input_tokens_seen": 131694592, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 3.3308519437551696, | |
| "grad_norm": 0.6852742550824547, | |
| "learning_rate": 1.2260536121588978e-05, | |
| "loss": 0.071, | |
| "num_input_tokens_seen": 131956736, | |
| "step": 506 | |
| }, | |
| { | |
| "epoch": 3.337468982630273, | |
| "grad_norm": 0.6514568739578507, | |
| "learning_rate": 1.2171139973728566e-05, | |
| "loss": 0.0703, | |
| "num_input_tokens_seen": 132218880, | |
| "step": 507 | |
| }, | |
| { | |
| "epoch": 3.3440860215053765, | |
| "grad_norm": 0.676377694947883, | |
| "learning_rate": 1.2081965948741162e-05, | |
| "loss": 0.0701, | |
| "num_input_tokens_seen": 132481024, | |
| "step": 508 | |
| }, | |
| { | |
| "epoch": 3.3507030603804795, | |
| "grad_norm": 0.6239946346262029, | |
| "learning_rate": 1.1993015590613573e-05, | |
| "loss": 0.0745, | |
| "num_input_tokens_seen": 132743168, | |
| "step": 509 | |
| }, | |
| { | |
| "epoch": 3.357320099255583, | |
| "grad_norm": 0.6668058973586733, | |
| "learning_rate": 1.1904290439459973e-05, | |
| "loss": 0.0684, | |
| "num_input_tokens_seen": 133005312, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 3.3639371381306864, | |
| "grad_norm": 0.6386467362434296, | |
| "learning_rate": 1.1815792031495224e-05, | |
| "loss": 0.0689, | |
| "num_input_tokens_seen": 133267456, | |
| "step": 511 | |
| }, | |
| { | |
| "epoch": 3.37055417700579, | |
| "grad_norm": 0.6136816329646979, | |
| "learning_rate": 1.172752189900829e-05, | |
| "loss": 0.0696, | |
| "num_input_tokens_seen": 133529600, | |
| "step": 512 | |
| }, | |
| { | |
| "epoch": 3.3771712158808933, | |
| "grad_norm": 0.6315422013733348, | |
| "learning_rate": 1.1639481570335692e-05, | |
| "loss": 0.0689, | |
| "num_input_tokens_seen": 133791744, | |
| "step": 513 | |
| }, | |
| { | |
| "epoch": 3.3837882547559968, | |
| "grad_norm": 0.6957057963725773, | |
| "learning_rate": 1.1551672569835095e-05, | |
| "loss": 0.0715, | |
| "num_input_tokens_seen": 134053888, | |
| "step": 514 | |
| }, | |
| { | |
| "epoch": 3.3904052936311, | |
| "grad_norm": 0.7124213473576223, | |
| "learning_rate": 1.1464096417858822e-05, | |
| "loss": 0.0782, | |
| "num_input_tokens_seen": 134316032, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 3.3970223325062037, | |
| "grad_norm": 0.7081989598962511, | |
| "learning_rate": 1.1376754630727616e-05, | |
| "loss": 0.0718, | |
| "num_input_tokens_seen": 134578176, | |
| "step": 516 | |
| }, | |
| { | |
| "epoch": 3.4036393713813067, | |
| "grad_norm": 0.6916080492642819, | |
| "learning_rate": 1.1289648720704347e-05, | |
| "loss": 0.073, | |
| "num_input_tokens_seen": 134840320, | |
| "step": 517 | |
| }, | |
| { | |
| "epoch": 3.41025641025641, | |
| "grad_norm": 0.7300988532144203, | |
| "learning_rate": 1.1202780195967836e-05, | |
| "loss": 0.0698, | |
| "num_input_tokens_seen": 135102464, | |
| "step": 518 | |
| }, | |
| { | |
| "epoch": 3.4168734491315136, | |
| "grad_norm": 0.6836195672441547, | |
| "learning_rate": 1.111615056058674e-05, | |
| "loss": 0.0734, | |
| "num_input_tokens_seen": 135364608, | |
| "step": 519 | |
| }, | |
| { | |
| "epoch": 3.423490488006617, | |
| "grad_norm": 0.7042584712844966, | |
| "learning_rate": 1.102976131449352e-05, | |
| "loss": 0.0732, | |
| "num_input_tokens_seen": 135626752, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 3.4301075268817205, | |
| "grad_norm": 0.6592443491715577, | |
| "learning_rate": 1.0943613953458448e-05, | |
| "loss": 0.0702, | |
| "num_input_tokens_seen": 135888896, | |
| "step": 521 | |
| }, | |
| { | |
| "epoch": 3.436724565756824, | |
| "grad_norm": 0.6737665976311692, | |
| "learning_rate": 1.0857709969063734e-05, | |
| "loss": 0.0711, | |
| "num_input_tokens_seen": 136151040, | |
| "step": 522 | |
| }, | |
| { | |
| "epoch": 3.4433416046319274, | |
| "grad_norm": 0.6674839459806772, | |
| "learning_rate": 1.0772050848677682e-05, | |
| "loss": 0.0718, | |
| "num_input_tokens_seen": 136413184, | |
| "step": 523 | |
| }, | |
| { | |
| "epoch": 3.4499586435070304, | |
| "grad_norm": 0.6719895509682765, | |
| "learning_rate": 1.0686638075428947e-05, | |
| "loss": 0.0672, | |
| "num_input_tokens_seen": 136675328, | |
| "step": 524 | |
| }, | |
| { | |
| "epoch": 3.456575682382134, | |
| "grad_norm": 0.6756296196553735, | |
| "learning_rate": 1.0601473128180855e-05, | |
| "loss": 0.071, | |
| "num_input_tokens_seen": 136937472, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 3.4631927212572373, | |
| "grad_norm": 0.6590609477211611, | |
| "learning_rate": 1.0516557481505795e-05, | |
| "loss": 0.0702, | |
| "num_input_tokens_seen": 137199616, | |
| "step": 526 | |
| }, | |
| { | |
| "epoch": 3.4698097601323408, | |
| "grad_norm": 0.6494091226770315, | |
| "learning_rate": 1.0431892605659685e-05, | |
| "loss": 0.0816, | |
| "num_input_tokens_seen": 137461760, | |
| "step": 527 | |
| }, | |
| { | |
| "epoch": 3.4764267990074442, | |
| "grad_norm": 0.7117194125057636, | |
| "learning_rate": 1.034747996655652e-05, | |
| "loss": 0.0753, | |
| "num_input_tokens_seen": 137723904, | |
| "step": 528 | |
| }, | |
| { | |
| "epoch": 3.4830438378825477, | |
| "grad_norm": 0.6790594971542001, | |
| "learning_rate": 1.0263321025742991e-05, | |
| "loss": 0.0703, | |
| "num_input_tokens_seen": 137986048, | |
| "step": 529 | |
| }, | |
| { | |
| "epoch": 3.489660876757651, | |
| "grad_norm": 0.6495417825384646, | |
| "learning_rate": 1.0179417240373183e-05, | |
| "loss": 0.0742, | |
| "num_input_tokens_seen": 138248192, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 3.4962779156327546, | |
| "grad_norm": 0.6860713447018258, | |
| "learning_rate": 1.009577006318333e-05, | |
| "loss": 0.0706, | |
| "num_input_tokens_seen": 138510336, | |
| "step": 531 | |
| }, | |
| { | |
| "epoch": 3.5028949545078576, | |
| "grad_norm": 0.6084319486167035, | |
| "learning_rate": 1.0012380942466673e-05, | |
| "loss": 0.0682, | |
| "num_input_tokens_seen": 138772480, | |
| "step": 532 | |
| }, | |
| { | |
| "epoch": 3.509511993382961, | |
| "grad_norm": 0.6710562393229845, | |
| "learning_rate": 9.929251322048397e-06, | |
| "loss": 0.0697, | |
| "num_input_tokens_seen": 139034624, | |
| "step": 533 | |
| }, | |
| { | |
| "epoch": 3.5161290322580645, | |
| "grad_norm": 0.6662367171728351, | |
| "learning_rate": 9.8463826412606e-06, | |
| "loss": 0.0697, | |
| "num_input_tokens_seen": 139296768, | |
| "step": 534 | |
| }, | |
| { | |
| "epoch": 3.522746071133168, | |
| "grad_norm": 0.6609857197579974, | |
| "learning_rate": 9.763776334917399e-06, | |
| "loss": 0.0693, | |
| "num_input_tokens_seen": 139558912, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 3.5293631100082714, | |
| "grad_norm": 0.6480323453400129, | |
| "learning_rate": 9.681433833290079e-06, | |
| "loss": 0.0695, | |
| "num_input_tokens_seen": 139821056, | |
| "step": 536 | |
| }, | |
| { | |
| "epoch": 3.5359801488833744, | |
| "grad_norm": 0.6496784196977948, | |
| "learning_rate": 9.599356562082329e-06, | |
| "loss": 0.064, | |
| "num_input_tokens_seen": 140083200, | |
| "step": 537 | |
| }, | |
| { | |
| "epoch": 3.542597187758478, | |
| "grad_norm": 0.6610199288253872, | |
| "learning_rate": 9.517545942405548e-06, | |
| "loss": 0.0699, | |
| "num_input_tokens_seen": 140345344, | |
| "step": 538 | |
| }, | |
| { | |
| "epoch": 3.5492142266335813, | |
| "grad_norm": 0.6626720993294117, | |
| "learning_rate": 9.436003390754258e-06, | |
| "loss": 0.0671, | |
| "num_input_tokens_seen": 140607488, | |
| "step": 539 | |
| }, | |
| { | |
| "epoch": 3.555831265508685, | |
| "grad_norm": 0.6594895617735759, | |
| "learning_rate": 9.354730318981562e-06, | |
| "loss": 0.0732, | |
| "num_input_tokens_seen": 140869632, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 3.5624483043837882, | |
| "grad_norm": 0.65561962599018, | |
| "learning_rate": 9.273728134274701e-06, | |
| "loss": 0.067, | |
| "num_input_tokens_seen": 141131776, | |
| "step": 541 | |
| }, | |
| { | |
| "epoch": 3.5690653432588917, | |
| "grad_norm": 0.6434061253218011, | |
| "learning_rate": 9.192998239130699e-06, | |
| "loss": 0.0696, | |
| "num_input_tokens_seen": 141393920, | |
| "step": 542 | |
| }, | |
| { | |
| "epoch": 3.575682382133995, | |
| "grad_norm": 0.6235071777837268, | |
| "learning_rate": 9.112542031332075e-06, | |
| "loss": 0.067, | |
| "num_input_tokens_seen": 141656064, | |
| "step": 543 | |
| }, | |
| { | |
| "epoch": 3.5822994210090986, | |
| "grad_norm": 0.6916008151430184, | |
| "learning_rate": 9.032360903922635e-06, | |
| "loss": 0.064, | |
| "num_input_tokens_seen": 141918208, | |
| "step": 544 | |
| }, | |
| { | |
| "epoch": 3.588916459884202, | |
| "grad_norm": 0.6316220441846853, | |
| "learning_rate": 8.95245624518336e-06, | |
| "loss": 0.0685, | |
| "num_input_tokens_seen": 142180352, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 3.595533498759305, | |
| "grad_norm": 0.6520805302384783, | |
| "learning_rate": 8.872829438608368e-06, | |
| "loss": 0.0673, | |
| "num_input_tokens_seen": 142442496, | |
| "step": 546 | |
| }, | |
| { | |
| "epoch": 3.6021505376344085, | |
| "grad_norm": 0.6414106319420477, | |
| "learning_rate": 8.793481862880953e-06, | |
| "loss": 0.0645, | |
| "num_input_tokens_seen": 142704640, | |
| "step": 547 | |
| }, | |
| { | |
| "epoch": 3.608767576509512, | |
| "grad_norm": 0.6353643058896132, | |
| "learning_rate": 8.714414891849737e-06, | |
| "loss": 0.0681, | |
| "num_input_tokens_seen": 142966784, | |
| "step": 548 | |
| }, | |
| { | |
| "epoch": 3.6153846153846154, | |
| "grad_norm": 0.6513946954232326, | |
| "learning_rate": 8.63562989450482e-06, | |
| "loss": 0.0736, | |
| "num_input_tokens_seen": 143228928, | |
| "step": 549 | |
| }, | |
| { | |
| "epoch": 3.622001654259719, | |
| "grad_norm": 0.6801727390096496, | |
| "learning_rate": 8.55712823495419e-06, | |
| "loss": 0.068, | |
| "num_input_tokens_seen": 143491072, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 3.6286186931348223, | |
| "grad_norm": 0.6778450196456566, | |
| "learning_rate": 8.478911272399964e-06, | |
| "loss": 0.067, | |
| "num_input_tokens_seen": 143753216, | |
| "step": 551 | |
| }, | |
| { | |
| "epoch": 3.6352357320099253, | |
| "grad_norm": 0.6519049335263876, | |
| "learning_rate": 8.400980361114985e-06, | |
| "loss": 0.0689, | |
| "num_input_tokens_seen": 144015360, | |
| "step": 552 | |
| }, | |
| { | |
| "epoch": 3.641852770885029, | |
| "grad_norm": 0.6425369950760212, | |
| "learning_rate": 8.323336850419289e-06, | |
| "loss": 0.0645, | |
| "num_input_tokens_seen": 144277504, | |
| "step": 553 | |
| }, | |
| { | |
| "epoch": 3.6484698097601322, | |
| "grad_norm": 0.6472046415210123, | |
| "learning_rate": 8.245982084656765e-06, | |
| "loss": 0.0625, | |
| "num_input_tokens_seen": 144539648, | |
| "step": 554 | |
| }, | |
| { | |
| "epoch": 3.6550868486352357, | |
| "grad_norm": 0.6638130847765896, | |
| "learning_rate": 8.168917403171891e-06, | |
| "loss": 0.0647, | |
| "num_input_tokens_seen": 144801792, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 3.661703887510339, | |
| "grad_norm": 0.7189252678002049, | |
| "learning_rate": 8.092144140286539e-06, | |
| "loss": 0.0684, | |
| "num_input_tokens_seen": 145063936, | |
| "step": 556 | |
| }, | |
| { | |
| "epoch": 3.6683209263854426, | |
| "grad_norm": 0.6024962011757397, | |
| "learning_rate": 8.015663625276864e-06, | |
| "loss": 0.0638, | |
| "num_input_tokens_seen": 145326080, | |
| "step": 557 | |
| }, | |
| { | |
| "epoch": 3.674937965260546, | |
| "grad_norm": 0.6380399282992304, | |
| "learning_rate": 7.939477182350277e-06, | |
| "loss": 0.0695, | |
| "num_input_tokens_seen": 145588224, | |
| "step": 558 | |
| }, | |
| { | |
| "epoch": 3.6815550041356495, | |
| "grad_norm": 0.6789043850107335, | |
| "learning_rate": 7.863586130622574e-06, | |
| "loss": 0.0668, | |
| "num_input_tokens_seen": 145850368, | |
| "step": 559 | |
| }, | |
| { | |
| "epoch": 3.688172043010753, | |
| "grad_norm": 0.6555026556001212, | |
| "learning_rate": 7.787991784095e-06, | |
| "loss": 0.0641, | |
| "num_input_tokens_seen": 146112512, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 3.694789081885856, | |
| "grad_norm": 0.6418080447317104, | |
| "learning_rate": 7.712695451631621e-06, | |
| "loss": 0.0659, | |
| "num_input_tokens_seen": 146374656, | |
| "step": 561 | |
| }, | |
| { | |
| "epoch": 3.7014061207609594, | |
| "grad_norm": 0.6637076291897438, | |
| "learning_rate": 7.637698436936524e-06, | |
| "loss": 0.0659, | |
| "num_input_tokens_seen": 146636800, | |
| "step": 562 | |
| }, | |
| { | |
| "epoch": 3.708023159636063, | |
| "grad_norm": 0.6535273139092588, | |
| "learning_rate": 7.563002038531383e-06, | |
| "loss": 0.0678, | |
| "num_input_tokens_seen": 146898944, | |
| "step": 563 | |
| }, | |
| { | |
| "epoch": 3.7146401985111663, | |
| "grad_norm": 0.6436806576666099, | |
| "learning_rate": 7.488607549732843e-06, | |
| "loss": 0.0681, | |
| "num_input_tokens_seen": 147161088, | |
| "step": 564 | |
| }, | |
| { | |
| "epoch": 3.72125723738627, | |
| "grad_norm": 0.6432691211156978, | |
| "learning_rate": 7.414516258630244e-06, | |
| "loss": 0.0664, | |
| "num_input_tokens_seen": 147423232, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 3.727874276261373, | |
| "grad_norm": 0.6425863914086812, | |
| "learning_rate": 7.340729448063252e-06, | |
| "loss": 0.069, | |
| "num_input_tokens_seen": 147685376, | |
| "step": 566 | |
| }, | |
| { | |
| "epoch": 3.7344913151364763, | |
| "grad_norm": 0.7237782721921531, | |
| "learning_rate": 7.267248395599632e-06, | |
| "loss": 0.0705, | |
| "num_input_tokens_seen": 147947520, | |
| "step": 567 | |
| }, | |
| { | |
| "epoch": 3.7411083540115797, | |
| "grad_norm": 0.5899400593495776, | |
| "learning_rate": 7.1940743735132126e-06, | |
| "loss": 0.0585, | |
| "num_input_tokens_seen": 148209664, | |
| "step": 568 | |
| }, | |
| { | |
| "epoch": 3.747725392886683, | |
| "grad_norm": 0.6298028583777657, | |
| "learning_rate": 7.121208648761743e-06, | |
| "loss": 0.0693, | |
| "num_input_tokens_seen": 148471808, | |
| "step": 569 | |
| }, | |
| { | |
| "epoch": 3.7543424317617866, | |
| "grad_norm": 0.6182729924844672, | |
| "learning_rate": 7.048652482965079e-06, | |
| "loss": 0.0644, | |
| "num_input_tokens_seen": 148733952, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 3.76095947063689, | |
| "grad_norm": 0.6368990475721128, | |
| "learning_rate": 6.9764071323832145e-06, | |
| "loss": 0.0646, | |
| "num_input_tokens_seen": 148996096, | |
| "step": 571 | |
| }, | |
| { | |
| "epoch": 3.7675765095119935, | |
| "grad_norm": 0.6253822482714361, | |
| "learning_rate": 6.904473847894652e-06, | |
| "loss": 0.0625, | |
| "num_input_tokens_seen": 149258240, | |
| "step": 572 | |
| }, | |
| { | |
| "epoch": 3.774193548387097, | |
| "grad_norm": 0.6509773243060786, | |
| "learning_rate": 6.832853874974629e-06, | |
| "loss": 0.0665, | |
| "num_input_tokens_seen": 149520384, | |
| "step": 573 | |
| }, | |
| { | |
| "epoch": 3.7808105872622004, | |
| "grad_norm": 0.6348506211821107, | |
| "learning_rate": 6.761548453673647e-06, | |
| "loss": 0.0673, | |
| "num_input_tokens_seen": 149782528, | |
| "step": 574 | |
| }, | |
| { | |
| "epoch": 3.7874276261373034, | |
| "grad_norm": 0.6635181414850976, | |
| "learning_rate": 6.690558818595943e-06, | |
| "loss": 0.0627, | |
| "num_input_tokens_seen": 150044672, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 3.794044665012407, | |
| "grad_norm": 0.6494205642323634, | |
| "learning_rate": 6.619886198878142e-06, | |
| "loss": 0.0701, | |
| "num_input_tokens_seen": 150306816, | |
| "step": 576 | |
| }, | |
| { | |
| "epoch": 3.8006617038875103, | |
| "grad_norm": 0.6664804797773478, | |
| "learning_rate": 6.549531818167981e-06, | |
| "loss": 0.0664, | |
| "num_input_tokens_seen": 150568960, | |
| "step": 577 | |
| }, | |
| { | |
| "epoch": 3.807278742762614, | |
| "grad_norm": 0.6170617316337039, | |
| "learning_rate": 6.479496894603066e-06, | |
| "loss": 0.057, | |
| "num_input_tokens_seen": 150831104, | |
| "step": 578 | |
| }, | |
| { | |
| "epoch": 3.8138957816377173, | |
| "grad_norm": 0.6049691188285755, | |
| "learning_rate": 6.409782640789874e-06, | |
| "loss": 0.0624, | |
| "num_input_tokens_seen": 151093248, | |
| "step": 579 | |
| }, | |
| { | |
| "epoch": 3.8205128205128203, | |
| "grad_norm": 0.6624674362233367, | |
| "learning_rate": 6.340390263782655e-06, | |
| "loss": 0.0712, | |
| "num_input_tokens_seen": 151355392, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 3.8271298593879237, | |
| "grad_norm": 0.6641352801341951, | |
| "learning_rate": 6.271320965062638e-06, | |
| "loss": 0.0663, | |
| "num_input_tokens_seen": 151617536, | |
| "step": 581 | |
| }, | |
| { | |
| "epoch": 3.833746898263027, | |
| "grad_norm": 0.6545794002701258, | |
| "learning_rate": 6.202575940517122e-06, | |
| "loss": 0.0642, | |
| "num_input_tokens_seen": 151879680, | |
| "step": 582 | |
| }, | |
| { | |
| "epoch": 3.8403639371381306, | |
| "grad_norm": 0.6130739092946907, | |
| "learning_rate": 6.134156380418857e-06, | |
| "loss": 0.0656, | |
| "num_input_tokens_seen": 152141824, | |
| "step": 583 | |
| }, | |
| { | |
| "epoch": 3.846980976013234, | |
| "grad_norm": 0.6280514072168198, | |
| "learning_rate": 6.066063469405384e-06, | |
| "loss": 0.0605, | |
| "num_input_tokens_seen": 152403968, | |
| "step": 584 | |
| }, | |
| { | |
| "epoch": 3.8535980148883375, | |
| "grad_norm": 0.6275884943541263, | |
| "learning_rate": 5.998298386458545e-06, | |
| "loss": 0.0614, | |
| "num_input_tokens_seen": 152666112, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 3.860215053763441, | |
| "grad_norm": 0.6842304679401343, | |
| "learning_rate": 5.9308623048840685e-06, | |
| "loss": 0.0667, | |
| "num_input_tokens_seen": 152928256, | |
| "step": 586 | |
| }, | |
| { | |
| "epoch": 3.8668320926385444, | |
| "grad_norm": 0.6503988865659992, | |
| "learning_rate": 5.863756392291248e-06, | |
| "loss": 0.0668, | |
| "num_input_tokens_seen": 153190400, | |
| "step": 587 | |
| }, | |
| { | |
| "epoch": 3.873449131513648, | |
| "grad_norm": 0.6594850152732623, | |
| "learning_rate": 5.796981810572724e-06, | |
| "loss": 0.0732, | |
| "num_input_tokens_seen": 153452544, | |
| "step": 588 | |
| }, | |
| { | |
| "epoch": 3.880066170388751, | |
| "grad_norm": 0.6343679688901765, | |
| "learning_rate": 5.7305397158843725e-06, | |
| "loss": 0.0585, | |
| "num_input_tokens_seen": 153714688, | |
| "step": 589 | |
| }, | |
| { | |
| "epoch": 3.8866832092638544, | |
| "grad_norm": 0.6431836549542845, | |
| "learning_rate": 5.664431258625305e-06, | |
| "loss": 0.0669, | |
| "num_input_tokens_seen": 153976832, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 3.893300248138958, | |
| "grad_norm": 0.5999309364227289, | |
| "learning_rate": 5.598657583417896e-06, | |
| "loss": 0.0602, | |
| "num_input_tokens_seen": 154238976, | |
| "step": 591 | |
| }, | |
| { | |
| "epoch": 3.8999172870140613, | |
| "grad_norm": 0.6399699282669513, | |
| "learning_rate": 5.533219829088021e-06, | |
| "loss": 0.0643, | |
| "num_input_tokens_seen": 154501120, | |
| "step": 592 | |
| }, | |
| { | |
| "epoch": 3.9065343258891647, | |
| "grad_norm": 0.6312470036786655, | |
| "learning_rate": 5.468119128645319e-06, | |
| "loss": 0.0643, | |
| "num_input_tokens_seen": 154763264, | |
| "step": 593 | |
| }, | |
| { | |
| "epoch": 3.9131513647642677, | |
| "grad_norm": 0.5905035092164123, | |
| "learning_rate": 5.403356609263563e-06, | |
| "loss": 0.0625, | |
| "num_input_tokens_seen": 155025408, | |
| "step": 594 | |
| }, | |
| { | |
| "epoch": 3.919768403639371, | |
| "grad_norm": 0.6161426769945006, | |
| "learning_rate": 5.338933392261159e-06, | |
| "loss": 0.0665, | |
| "num_input_tokens_seen": 155287552, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 3.9263854425144746, | |
| "grad_norm": 0.594382143213924, | |
| "learning_rate": 5.274850593081726e-06, | |
| "loss": 0.0613, | |
| "num_input_tokens_seen": 155549696, | |
| "step": 596 | |
| }, | |
| { | |
| "epoch": 3.933002481389578, | |
| "grad_norm": 0.6203429561074691, | |
| "learning_rate": 5.2111093212747845e-06, | |
| "loss": 0.0567, | |
| "num_input_tokens_seen": 155811840, | |
| "step": 597 | |
| }, | |
| { | |
| "epoch": 3.9396195202646815, | |
| "grad_norm": 0.6400006500847638, | |
| "learning_rate": 5.147710680476536e-06, | |
| "loss": 0.0675, | |
| "num_input_tokens_seen": 156073984, | |
| "step": 598 | |
| }, | |
| { | |
| "epoch": 3.946236559139785, | |
| "grad_norm": 0.6142056516422985, | |
| "learning_rate": 5.0846557683907755e-06, | |
| "loss": 0.0635, | |
| "num_input_tokens_seen": 156336128, | |
| "step": 599 | |
| }, | |
| { | |
| "epoch": 3.9528535980148884, | |
| "grad_norm": 0.6160208864975072, | |
| "learning_rate": 5.02194567676986e-06, | |
| "loss": 0.0643, | |
| "num_input_tokens_seen": 156598272, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 3.959470636889992, | |
| "grad_norm": 0.6226902309804632, | |
| "learning_rate": 4.959581491395823e-06, | |
| "loss": 0.0636, | |
| "num_input_tokens_seen": 156860416, | |
| "step": 601 | |
| }, | |
| { | |
| "epoch": 3.9660876757650954, | |
| "grad_norm": 0.6279199663238604, | |
| "learning_rate": 4.897564292061568e-06, | |
| "loss": 0.0601, | |
| "num_input_tokens_seen": 157122560, | |
| "step": 602 | |
| }, | |
| { | |
| "epoch": 3.9727047146401984, | |
| "grad_norm": 0.612461694031015, | |
| "learning_rate": 4.835895152552178e-06, | |
| "loss": 0.0691, | |
| "num_input_tokens_seen": 157384704, | |
| "step": 603 | |
| }, | |
| { | |
| "epoch": 3.979321753515302, | |
| "grad_norm": 0.6279699097628595, | |
| "learning_rate": 4.7745751406263165e-06, | |
| "loss": 0.058, | |
| "num_input_tokens_seen": 157646848, | |
| "step": 604 | |
| }, | |
| { | |
| "epoch": 3.9859387923904053, | |
| "grad_norm": 0.6202131769337398, | |
| "learning_rate": 4.713605317997741e-06, | |
| "loss": 0.0622, | |
| "num_input_tokens_seen": 157908992, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 3.9925558312655087, | |
| "grad_norm": 0.5955424851115548, | |
| "learning_rate": 4.652986740316928e-06, | |
| "loss": 0.0637, | |
| "num_input_tokens_seen": 158171136, | |
| "step": 606 | |
| }, | |
| { | |
| "epoch": 3.999172870140612, | |
| "grad_norm": 0.615956451068701, | |
| "learning_rate": 4.592720457152788e-06, | |
| "loss": 0.0662, | |
| "num_input_tokens_seen": 158433280, | |
| "step": 607 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "grad_norm": 0.615956451068701, | |
| "learning_rate": 4.532807511974491e-06, | |
| "loss": 0.059, | |
| "num_input_tokens_seen": 158466048, | |
| "step": 608 | |
| }, | |
| { | |
| "epoch": 4.0066170388751035, | |
| "grad_norm": 1.9322051031913439, | |
| "learning_rate": 4.47324894213341e-06, | |
| "loss": 0.0164, | |
| "num_input_tokens_seen": 158728192, | |
| "step": 609 | |
| }, | |
| { | |
| "epoch": 4.013234077750207, | |
| "grad_norm": 0.4558460196163836, | |
| "learning_rate": 4.414045778845144e-06, | |
| "loss": 0.0196, | |
| "num_input_tokens_seen": 158990336, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 4.01985111662531, | |
| "grad_norm": 0.42853650955768524, | |
| "learning_rate": 4.355199047171685e-06, | |
| "loss": 0.0155, | |
| "num_input_tokens_seen": 159252480, | |
| "step": 611 | |
| }, | |
| { | |
| "epoch": 4.026468155500414, | |
| "grad_norm": 0.39608647720989215, | |
| "learning_rate": 4.2967097660036456e-06, | |
| "loss": 0.0169, | |
| "num_input_tokens_seen": 159514624, | |
| "step": 612 | |
| }, | |
| { | |
| "epoch": 4.033085194375517, | |
| "grad_norm": 0.3847681953616679, | |
| "learning_rate": 4.238578948042632e-06, | |
| "loss": 0.0158, | |
| "num_input_tokens_seen": 159776768, | |
| "step": 613 | |
| }, | |
| { | |
| "epoch": 4.039702233250621, | |
| "grad_norm": 0.3730962283899974, | |
| "learning_rate": 4.180807599783712e-06, | |
| "loss": 0.0156, | |
| "num_input_tokens_seen": 160038912, | |
| "step": 614 | |
| }, | |
| { | |
| "epoch": 4.046319272125724, | |
| "grad_norm": 0.31708613246483647, | |
| "learning_rate": 4.123396721497977e-06, | |
| "loss": 0.0137, | |
| "num_input_tokens_seen": 160301056, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 4.052936311000827, | |
| "grad_norm": 0.2926072814394024, | |
| "learning_rate": 4.066347307215235e-06, | |
| "loss": 0.0151, | |
| "num_input_tokens_seen": 160563200, | |
| "step": 616 | |
| }, | |
| { | |
| "epoch": 4.05955334987593, | |
| "grad_norm": 0.3073138304585139, | |
| "learning_rate": 4.009660344706786e-06, | |
| "loss": 0.0134, | |
| "num_input_tokens_seen": 160825344, | |
| "step": 617 | |
| }, | |
| { | |
| "epoch": 4.066170388751034, | |
| "grad_norm": 0.29258815799018134, | |
| "learning_rate": 3.9533368154683365e-06, | |
| "loss": 0.013, | |
| "num_input_tokens_seen": 161087488, | |
| "step": 618 | |
| }, | |
| { | |
| "epoch": 4.072787427626137, | |
| "grad_norm": 0.2950143373147824, | |
| "learning_rate": 3.8973776947029864e-06, | |
| "loss": 0.0132, | |
| "num_input_tokens_seen": 161349632, | |
| "step": 619 | |
| }, | |
| { | |
| "epoch": 4.079404466501241, | |
| "grad_norm": 0.31508955936467287, | |
| "learning_rate": 3.8417839513043645e-06, | |
| "loss": 0.0127, | |
| "num_input_tokens_seen": 161611776, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 4.086021505376344, | |
| "grad_norm": 0.33771585486450106, | |
| "learning_rate": 3.7865565478398314e-06, | |
| "loss": 0.0131, | |
| "num_input_tokens_seen": 161873920, | |
| "step": 621 | |
| }, | |
| { | |
| "epoch": 4.0926385442514475, | |
| "grad_norm": 0.29025238047918867, | |
| "learning_rate": 3.7316964405338357e-06, | |
| "loss": 0.0129, | |
| "num_input_tokens_seen": 162136064, | |
| "step": 622 | |
| }, | |
| { | |
| "epoch": 4.099255583126551, | |
| "grad_norm": 0.3337321590239914, | |
| "learning_rate": 3.6772045792513336e-06, | |
| "loss": 0.0123, | |
| "num_input_tokens_seen": 162398208, | |
| "step": 623 | |
| }, | |
| { | |
| "epoch": 4.105872622001654, | |
| "grad_norm": 0.35876768327894176, | |
| "learning_rate": 3.6230819074813737e-06, | |
| "loss": 0.0113, | |
| "num_input_tokens_seen": 162660352, | |
| "step": 624 | |
| }, | |
| { | |
| "epoch": 4.112489660876758, | |
| "grad_norm": 0.3292554272437977, | |
| "learning_rate": 3.5693293623207086e-06, | |
| "loss": 0.0129, | |
| "num_input_tokens_seen": 162922496, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 4.119106699751861, | |
| "grad_norm": 0.35617949310152636, | |
| "learning_rate": 3.515947874457648e-06, | |
| "loss": 0.0135, | |
| "num_input_tokens_seen": 163184640, | |
| "step": 626 | |
| }, | |
| { | |
| "epoch": 4.125723738626965, | |
| "grad_norm": 0.3625371803838258, | |
| "learning_rate": 3.4629383681558576e-06, | |
| "loss": 0.0132, | |
| "num_input_tokens_seen": 163446784, | |
| "step": 627 | |
| }, | |
| { | |
| "epoch": 4.132340777502068, | |
| "grad_norm": 0.337416041836457, | |
| "learning_rate": 3.4103017612384323e-06, | |
| "loss": 0.0129, | |
| "num_input_tokens_seen": 163708928, | |
| "step": 628 | |
| }, | |
| { | |
| "epoch": 4.138957816377172, | |
| "grad_norm": 0.36408714339861237, | |
| "learning_rate": 3.358038965071955e-06, | |
| "loss": 0.0128, | |
| "num_input_tokens_seen": 163971072, | |
| "step": 629 | |
| }, | |
| { | |
| "epoch": 4.145574855252274, | |
| "grad_norm": 0.3530279053340078, | |
| "learning_rate": 3.3061508845507323e-06, | |
| "loss": 0.0132, | |
| "num_input_tokens_seen": 164233216, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 4.152191894127378, | |
| "grad_norm": 0.3646811917338047, | |
| "learning_rate": 3.254638418081132e-06, | |
| "loss": 0.011, | |
| "num_input_tokens_seen": 164495360, | |
| "step": 631 | |
| }, | |
| { | |
| "epoch": 4.158808933002481, | |
| "grad_norm": 0.32293791810237155, | |
| "learning_rate": 3.2035024575660237e-06, | |
| "loss": 0.0133, | |
| "num_input_tokens_seen": 164757504, | |
| "step": 632 | |
| }, | |
| { | |
| "epoch": 4.165425971877585, | |
| "grad_norm": 0.37559363453473765, | |
| "learning_rate": 3.152743888389334e-06, | |
| "loss": 0.0118, | |
| "num_input_tokens_seen": 165019648, | |
| "step": 633 | |
| }, | |
| { | |
| "epoch": 4.172043010752688, | |
| "grad_norm": 0.3279963359002966, | |
| "learning_rate": 3.1023635894007085e-06, | |
| "loss": 0.0128, | |
| "num_input_tokens_seen": 165281792, | |
| "step": 634 | |
| }, | |
| { | |
| "epoch": 4.1786600496277915, | |
| "grad_norm": 0.3672827713383433, | |
| "learning_rate": 3.0523624329003324e-06, | |
| "loss": 0.0135, | |
| "num_input_tokens_seen": 165543936, | |
| "step": 635 | |
| }, | |
| { | |
| "epoch": 4.185277088502895, | |
| "grad_norm": 0.3151395869301595, | |
| "learning_rate": 3.002741284623764e-06, | |
| "loss": 0.0117, | |
| "num_input_tokens_seen": 165806080, | |
| "step": 636 | |
| }, | |
| { | |
| "epoch": 4.191894127377998, | |
| "grad_norm": 0.2861017283610466, | |
| "learning_rate": 2.953501003727019e-06, | |
| "loss": 0.0124, | |
| "num_input_tokens_seen": 166068224, | |
| "step": 637 | |
| }, | |
| { | |
| "epoch": 4.198511166253102, | |
| "grad_norm": 0.32170006278076657, | |
| "learning_rate": 2.904642442771616e-06, | |
| "loss": 0.0112, | |
| "num_input_tokens_seen": 166330368, | |
| "step": 638 | |
| }, | |
| { | |
| "epoch": 4.205128205128205, | |
| "grad_norm": 0.2827836397703704, | |
| "learning_rate": 2.8561664477098985e-06, | |
| "loss": 0.0118, | |
| "num_input_tokens_seen": 166592512, | |
| "step": 639 | |
| }, | |
| { | |
| "epoch": 4.211745244003309, | |
| "grad_norm": 0.3128634727340433, | |
| "learning_rate": 2.8080738578703054e-06, | |
| "loss": 0.0123, | |
| "num_input_tokens_seen": 166854656, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 4.218362282878412, | |
| "grad_norm": 0.3016473217975274, | |
| "learning_rate": 2.7603655059429077e-06, | |
| "loss": 0.0119, | |
| "num_input_tokens_seen": 167116800, | |
| "step": 641 | |
| }, | |
| { | |
| "epoch": 4.224979321753516, | |
| "grad_norm": 0.2861165129091277, | |
| "learning_rate": 2.7130422179649563e-06, | |
| "loss": 0.0114, | |
| "num_input_tokens_seen": 167378944, | |
| "step": 642 | |
| }, | |
| { | |
| "epoch": 4.231596360628619, | |
| "grad_norm": 0.2599772487106339, | |
| "learning_rate": 2.666104813306558e-06, | |
| "loss": 0.0103, | |
| "num_input_tokens_seen": 167641088, | |
| "step": 643 | |
| }, | |
| { | |
| "epoch": 4.238213399503722, | |
| "grad_norm": 0.29133859690520947, | |
| "learning_rate": 2.619554104656563e-06, | |
| "loss": 0.013, | |
| "num_input_tokens_seen": 167903232, | |
| "step": 644 | |
| }, | |
| { | |
| "epoch": 4.244830438378825, | |
| "grad_norm": 0.26908297741174175, | |
| "learning_rate": 2.5733908980083988e-06, | |
| "loss": 0.0104, | |
| "num_input_tokens_seen": 168165376, | |
| "step": 645 | |
| }, | |
| { | |
| "epoch": 4.251447477253929, | |
| "grad_norm": 0.2659036178447148, | |
| "learning_rate": 2.527615992646201e-06, | |
| "loss": 0.0118, | |
| "num_input_tokens_seen": 168427520, | |
| "step": 646 | |
| }, | |
| { | |
| "epoch": 4.258064516129032, | |
| "grad_norm": 0.2901657036601284, | |
| "learning_rate": 2.4822301811309066e-06, | |
| "loss": 0.0134, | |
| "num_input_tokens_seen": 168689664, | |
| "step": 647 | |
| }, | |
| { | |
| "epoch": 4.2646815550041355, | |
| "grad_norm": 0.2887641626437273, | |
| "learning_rate": 2.437234249286588e-06, | |
| "loss": 0.0116, | |
| "num_input_tokens_seen": 168951808, | |
| "step": 648 | |
| }, | |
| { | |
| "epoch": 4.271298593879239, | |
| "grad_norm": 0.2972053563257295, | |
| "learning_rate": 2.3926289761867892e-06, | |
| "loss": 0.0123, | |
| "num_input_tokens_seen": 169213952, | |
| "step": 649 | |
| }, | |
| { | |
| "epoch": 4.277915632754342, | |
| "grad_norm": 0.28699830840097074, | |
| "learning_rate": 2.348415134141102e-06, | |
| "loss": 0.0111, | |
| "num_input_tokens_seen": 169476096, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 4.284532671629446, | |
| "grad_norm": 0.2884445423690383, | |
| "learning_rate": 2.304593488681725e-06, | |
| "loss": 0.0118, | |
| "num_input_tokens_seen": 169738240, | |
| "step": 651 | |
| }, | |
| { | |
| "epoch": 4.291149710504549, | |
| "grad_norm": 0.2725595053360008, | |
| "learning_rate": 2.2611647985502636e-06, | |
| "loss": 0.0112, | |
| "num_input_tokens_seen": 170000384, | |
| "step": 652 | |
| }, | |
| { | |
| "epoch": 4.297766749379653, | |
| "grad_norm": 0.27157106563305955, | |
| "learning_rate": 2.218129815684572e-06, | |
| "loss": 0.0117, | |
| "num_input_tokens_seen": 170262528, | |
| "step": 653 | |
| }, | |
| { | |
| "epoch": 4.304383788254756, | |
| "grad_norm": 0.26577567593613255, | |
| "learning_rate": 2.1754892852057174e-06, | |
| "loss": 0.0121, | |
| "num_input_tokens_seen": 170524672, | |
| "step": 654 | |
| }, | |
| { | |
| "epoch": 4.31100082712986, | |
| "grad_norm": 0.31771200106763137, | |
| "learning_rate": 2.133243945405128e-06, | |
| "loss": 0.0129, | |
| "num_input_tokens_seen": 170786816, | |
| "step": 655 | |
| }, | |
| { | |
| "epoch": 4.317617866004963, | |
| "grad_norm": 0.29474813542572154, | |
| "learning_rate": 2.0913945277317384e-06, | |
| "loss": 0.0111, | |
| "num_input_tokens_seen": 171048960, | |
| "step": 656 | |
| }, | |
| { | |
| "epoch": 4.324234904880067, | |
| "grad_norm": 0.31855204764887335, | |
| "learning_rate": 2.0499417567794e-06, | |
| "loss": 0.013, | |
| "num_input_tokens_seen": 171311104, | |
| "step": 657 | |
| }, | |
| { | |
| "epoch": 4.330851943755169, | |
| "grad_norm": 0.2848669170044536, | |
| "learning_rate": 2.0088863502742665e-06, | |
| "loss": 0.011, | |
| "num_input_tokens_seen": 171573248, | |
| "step": 658 | |
| }, | |
| { | |
| "epoch": 4.337468982630273, | |
| "grad_norm": 0.2579942897761665, | |
| "learning_rate": 1.968229019062437e-06, | |
| "loss": 0.012, | |
| "num_input_tokens_seen": 171835392, | |
| "step": 659 | |
| }, | |
| { | |
| "epoch": 4.344086021505376, | |
| "grad_norm": 0.32880599270669536, | |
| "learning_rate": 1.927970467097573e-06, | |
| "loss": 0.013, | |
| "num_input_tokens_seen": 172097536, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 4.3507030603804795, | |
| "grad_norm": 0.27491400258924964, | |
| "learning_rate": 1.8881113914287735e-06, | |
| "loss": 0.011, | |
| "num_input_tokens_seen": 172359680, | |
| "step": 661 | |
| }, | |
| { | |
| "epoch": 4.357320099255583, | |
| "grad_norm": 0.2782433259001828, | |
| "learning_rate": 1.8486524821884705e-06, | |
| "loss": 0.01, | |
| "num_input_tokens_seen": 172621824, | |
| "step": 662 | |
| }, | |
| { | |
| "epoch": 4.363937138130686, | |
| "grad_norm": 0.2813636030144918, | |
| "learning_rate": 1.8095944225804961e-06, | |
| "loss": 0.012, | |
| "num_input_tokens_seen": 172883968, | |
| "step": 663 | |
| }, | |
| { | |
| "epoch": 4.37055417700579, | |
| "grad_norm": 0.31012917378381333, | |
| "learning_rate": 1.7709378888682404e-06, | |
| "loss": 0.0112, | |
| "num_input_tokens_seen": 173146112, | |
| "step": 664 | |
| }, | |
| { | |
| "epoch": 4.377171215880893, | |
| "grad_norm": 0.2815088227497734, | |
| "learning_rate": 1.732683550362954e-06, | |
| "loss": 0.0102, | |
| "num_input_tokens_seen": 173408256, | |
| "step": 665 | |
| }, | |
| { | |
| "epoch": 4.383788254755997, | |
| "grad_norm": 0.28745562870408126, | |
| "learning_rate": 1.6948320694121527e-06, | |
| "loss": 0.0116, | |
| "num_input_tokens_seen": 173670400, | |
| "step": 666 | |
| }, | |
| { | |
| "epoch": 4.3904052936311, | |
| "grad_norm": 0.26246236882260804, | |
| "learning_rate": 1.6573841013881486e-06, | |
| "loss": 0.0103, | |
| "num_input_tokens_seen": 173932544, | |
| "step": 667 | |
| }, | |
| { | |
| "epoch": 4.397022332506204, | |
| "grad_norm": 0.25186617226497, | |
| "learning_rate": 1.6203402946767198e-06, | |
| "loss": 0.0114, | |
| "num_input_tokens_seen": 174194688, | |
| "step": 668 | |
| }, | |
| { | |
| "epoch": 4.403639371381307, | |
| "grad_norm": 0.32281818675069474, | |
| "learning_rate": 1.5837012906658484e-06, | |
| "loss": 0.0126, | |
| "num_input_tokens_seen": 174456832, | |
| "step": 669 | |
| }, | |
| { | |
| "epoch": 4.410256410256411, | |
| "grad_norm": 0.3347793755605357, | |
| "learning_rate": 1.5474677237346468e-06, | |
| "loss": 0.0112, | |
| "num_input_tokens_seen": 174718976, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 4.416873449131514, | |
| "grad_norm": 0.2831919531375974, | |
| "learning_rate": 1.5116402212423664e-06, | |
| "loss": 0.0126, | |
| "num_input_tokens_seen": 174981120, | |
| "step": 671 | |
| }, | |
| { | |
| "epoch": 4.4234904880066175, | |
| "grad_norm": 0.33618014730043816, | |
| "learning_rate": 1.4762194035175286e-06, | |
| "loss": 0.0104, | |
| "num_input_tokens_seen": 175243264, | |
| "step": 672 | |
| }, | |
| { | |
| "epoch": 4.43010752688172, | |
| "grad_norm": 0.29188046505767673, | |
| "learning_rate": 1.4412058838471908e-06, | |
| "loss": 0.0125, | |
| "num_input_tokens_seen": 175505408, | |
| "step": 673 | |
| }, | |
| { | |
| "epoch": 4.4367245657568235, | |
| "grad_norm": 0.34185968510159637, | |
| "learning_rate": 1.406600268466321e-06, | |
| "loss": 0.0123, | |
| "num_input_tokens_seen": 175767552, | |
| "step": 674 | |
| }, | |
| { | |
| "epoch": 4.443341604631927, | |
| "grad_norm": 0.2751875387353734, | |
| "learning_rate": 1.3724031565473112e-06, | |
| "loss": 0.012, | |
| "num_input_tokens_seen": 176029696, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 4.44995864350703, | |
| "grad_norm": 0.307753681169333, | |
| "learning_rate": 1.3386151401895919e-06, | |
| "loss": 0.0132, | |
| "num_input_tokens_seen": 176291840, | |
| "step": 676 | |
| }, | |
| { | |
| "epoch": 4.456575682382134, | |
| "grad_norm": 0.2941773412259061, | |
| "learning_rate": 1.3052368044093904e-06, | |
| "loss": 0.0122, | |
| "num_input_tokens_seen": 176553984, | |
| "step": 677 | |
| }, | |
| { | |
| "epoch": 4.463192721257237, | |
| "grad_norm": 0.29718218740929087, | |
| "learning_rate": 1.272268727129597e-06, | |
| "loss": 0.0105, | |
| "num_input_tokens_seen": 176816128, | |
| "step": 678 | |
| }, | |
| { | |
| "epoch": 4.469809760132341, | |
| "grad_norm": 0.28425687102272895, | |
| "learning_rate": 1.23971147916975e-06, | |
| "loss": 0.0114, | |
| "num_input_tokens_seen": 177078272, | |
| "step": 679 | |
| }, | |
| { | |
| "epoch": 4.476426799007444, | |
| "grad_norm": 0.28375740052067294, | |
| "learning_rate": 1.2075656242361732e-06, | |
| "loss": 0.0125, | |
| "num_input_tokens_seen": 177340416, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 4.483043837882548, | |
| "grad_norm": 0.31804138215647476, | |
| "learning_rate": 1.1758317189121987e-06, | |
| "loss": 0.0103, | |
| "num_input_tokens_seen": 177602560, | |
| "step": 681 | |
| }, | |
| { | |
| "epoch": 4.489660876757651, | |
| "grad_norm": 0.27189609988366087, | |
| "learning_rate": 1.144510312648528e-06, | |
| "loss": 0.0112, | |
| "num_input_tokens_seen": 177864704, | |
| "step": 682 | |
| }, | |
| { | |
| "epoch": 4.496277915632755, | |
| "grad_norm": 0.27386745327873196, | |
| "learning_rate": 1.1136019477537397e-06, | |
| "loss": 0.0117, | |
| "num_input_tokens_seen": 178126848, | |
| "step": 683 | |
| }, | |
| { | |
| "epoch": 4.502894954507858, | |
| "grad_norm": 0.2807660974451893, | |
| "learning_rate": 1.0831071593848747e-06, | |
| "loss": 0.0102, | |
| "num_input_tokens_seen": 178388992, | |
| "step": 684 | |
| }, | |
| { | |
| "epoch": 4.5095119933829615, | |
| "grad_norm": 0.2807185122191924, | |
| "learning_rate": 1.0530264755381824e-06, | |
| "loss": 0.0134, | |
| "num_input_tokens_seen": 178651136, | |
| "step": 685 | |
| }, | |
| { | |
| "epoch": 4.516129032258064, | |
| "grad_norm": 0.30476848574607546, | |
| "learning_rate": 1.0233604170399813e-06, | |
| "loss": 0.0107, | |
| "num_input_tokens_seen": 178913280, | |
| "step": 686 | |
| }, | |
| { | |
| "epoch": 4.522746071133168, | |
| "grad_norm": 0.274310182626815, | |
| "learning_rate": 9.941094975376374e-07, | |
| "loss": 0.0129, | |
| "num_input_tokens_seen": 179175424, | |
| "step": 687 | |
| }, | |
| { | |
| "epoch": 4.529363110008271, | |
| "grad_norm": 0.2851652151422951, | |
| "learning_rate": 9.652742234906698e-07, | |
| "loss": 0.0108, | |
| "num_input_tokens_seen": 179437568, | |
| "step": 688 | |
| }, | |
| { | |
| "epoch": 4.535980148883374, | |
| "grad_norm": 0.2927927384652578, | |
| "learning_rate": 9.368550941619786e-07, | |
| "loss": 0.0124, | |
| "num_input_tokens_seen": 179699712, | |
| "step": 689 | |
| }, | |
| { | |
| "epoch": 4.542597187758478, | |
| "grad_norm": 0.2827600595189759, | |
| "learning_rate": 9.088526016092142e-07, | |
| "loss": 0.0111, | |
| "num_input_tokens_seen": 179961856, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 4.549214226633581, | |
| "grad_norm": 0.2828143968736632, | |
| "learning_rate": 8.812672306762415e-07, | |
| "loss": 0.0115, | |
| "num_input_tokens_seen": 180224000, | |
| "step": 691 | |
| }, | |
| { | |
| "epoch": 4.555831265508685, | |
| "grad_norm": 0.28022586590651605, | |
| "learning_rate": 8.540994589847501e-07, | |
| "loss": 0.0104, | |
| "num_input_tokens_seen": 180486144, | |
| "step": 692 | |
| }, | |
| { | |
| "epoch": 4.562448304383788, | |
| "grad_norm": 0.26928602347071406, | |
| "learning_rate": 8.273497569259935e-07, | |
| "loss": 0.0115, | |
| "num_input_tokens_seen": 180748288, | |
| "step": 693 | |
| }, | |
| { | |
| "epoch": 4.569065343258892, | |
| "grad_norm": 0.2941361327088142, | |
| "learning_rate": 8.010185876526328e-07, | |
| "loss": 0.012, | |
| "num_input_tokens_seen": 181010432, | |
| "step": 694 | |
| }, | |
| { | |
| "epoch": 4.575682382133995, | |
| "grad_norm": 0.3099598307049961, | |
| "learning_rate": 7.751064070707248e-07, | |
| "loss": 0.0123, | |
| "num_input_tokens_seen": 181272576, | |
| "step": 695 | |
| }, | |
| { | |
| "epoch": 4.582299421009099, | |
| "grad_norm": 0.32349950320623483, | |
| "learning_rate": 7.496136638318218e-07, | |
| "loss": 0.0114, | |
| "num_input_tokens_seen": 181534720, | |
| "step": 696 | |
| }, | |
| { | |
| "epoch": 4.588916459884202, | |
| "grad_norm": 0.29772991851883923, | |
| "learning_rate": 7.245407993252101e-07, | |
| "loss": 0.0105, | |
| "num_input_tokens_seen": 181796864, | |
| "step": 697 | |
| }, | |
| { | |
| "epoch": 4.5955334987593055, | |
| "grad_norm": 0.25034733406750814, | |
| "learning_rate": 6.998882476702662e-07, | |
| "loss": 0.0115, | |
| "num_input_tokens_seen": 182059008, | |
| "step": 698 | |
| }, | |
| { | |
| "epoch": 4.602150537634409, | |
| "grad_norm": 0.27397283744512196, | |
| "learning_rate": 6.756564357089379e-07, | |
| "loss": 0.0115, | |
| "num_input_tokens_seen": 182321152, | |
| "step": 699 | |
| }, | |
| { | |
| "epoch": 4.608767576509512, | |
| "grad_norm": 0.27869832899439306, | |
| "learning_rate": 6.51845782998356e-07, | |
| "loss": 0.0106, | |
| "num_input_tokens_seen": 182583296, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 4.615384615384615, | |
| "grad_norm": 0.30104324881793254, | |
| "learning_rate": 6.28456701803562e-07, | |
| "loss": 0.0131, | |
| "num_input_tokens_seen": 182845440, | |
| "step": 701 | |
| }, | |
| { | |
| "epoch": 4.622001654259718, | |
| "grad_norm": 0.2901161193436187, | |
| "learning_rate": 6.054895970903945e-07, | |
| "loss": 0.0124, | |
| "num_input_tokens_seen": 183107584, | |
| "step": 702 | |
| }, | |
| { | |
| "epoch": 4.628618693134822, | |
| "grad_norm": 0.30087510691273794, | |
| "learning_rate": 5.829448665184339e-07, | |
| "loss": 0.0104, | |
| "num_input_tokens_seen": 183369728, | |
| "step": 703 | |
| }, | |
| { | |
| "epoch": 4.635235732009925, | |
| "grad_norm": 0.28826623789313743, | |
| "learning_rate": 5.608229004341686e-07, | |
| "loss": 0.0107, | |
| "num_input_tokens_seen": 183631872, | |
| "step": 704 | |
| }, | |
| { | |
| "epoch": 4.641852770885029, | |
| "grad_norm": 0.2760868624828502, | |
| "learning_rate": 5.391240818642007e-07, | |
| "loss": 0.0123, | |
| "num_input_tokens_seen": 183894016, | |
| "step": 705 | |
| }, | |
| { | |
| "epoch": 4.648469809760132, | |
| "grad_norm": 0.30478625382300967, | |
| "learning_rate": 5.178487865086179e-07, | |
| "loss": 0.0111, | |
| "num_input_tokens_seen": 184156160, | |
| "step": 706 | |
| }, | |
| { | |
| "epoch": 4.655086848635236, | |
| "grad_norm": 0.2764507588923767, | |
| "learning_rate": 4.96997382734507e-07, | |
| "loss": 0.0112, | |
| "num_input_tokens_seen": 184418304, | |
| "step": 707 | |
| }, | |
| { | |
| "epoch": 4.661703887510339, | |
| "grad_norm": 0.2910917254737324, | |
| "learning_rate": 4.7657023156955636e-07, | |
| "loss": 0.0106, | |
| "num_input_tokens_seen": 184680448, | |
| "step": 708 | |
| }, | |
| { | |
| "epoch": 4.668320926385443, | |
| "grad_norm": 0.26960031865863143, | |
| "learning_rate": 4.5656768669582183e-07, | |
| "loss": 0.0105, | |
| "num_input_tokens_seen": 184942592, | |
| "step": 709 | |
| }, | |
| { | |
| "epoch": 4.674937965260546, | |
| "grad_norm": 0.2648467075940531, | |
| "learning_rate": 4.3699009444357344e-07, | |
| "loss": 0.0112, | |
| "num_input_tokens_seen": 185204736, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 4.6815550041356495, | |
| "grad_norm": 0.3059089915950766, | |
| "learning_rate": 4.1783779378534727e-07, | |
| "loss": 0.0127, | |
| "num_input_tokens_seen": 185466880, | |
| "step": 711 | |
| }, | |
| { | |
| "epoch": 4.688172043010753, | |
| "grad_norm": 0.31042353213566753, | |
| "learning_rate": 3.9911111633002543e-07, | |
| "loss": 0.0122, | |
| "num_input_tokens_seen": 185729024, | |
| "step": 712 | |
| }, | |
| { | |
| "epoch": 4.694789081885856, | |
| "grad_norm": 0.2767065512527441, | |
| "learning_rate": 3.8081038631713195e-07, | |
| "loss": 0.0105, | |
| "num_input_tokens_seen": 185991168, | |
| "step": 713 | |
| }, | |
| { | |
| "epoch": 4.701406120760959, | |
| "grad_norm": 0.3001480980848603, | |
| "learning_rate": 3.6293592061119596e-07, | |
| "loss": 0.0109, | |
| "num_input_tokens_seen": 186253312, | |
| "step": 714 | |
| }, | |
| { | |
| "epoch": 4.708023159636063, | |
| "grad_norm": 0.2659712148033865, | |
| "learning_rate": 3.454880286962781e-07, | |
| "loss": 0.0116, | |
| "num_input_tokens_seen": 186515456, | |
| "step": 715 | |
| }, | |
| { | |
| "epoch": 4.714640198511166, | |
| "grad_norm": 0.2885929418188894, | |
| "learning_rate": 3.2846701267060243e-07, | |
| "loss": 0.0135, | |
| "num_input_tokens_seen": 186777600, | |
| "step": 716 | |
| }, | |
| { | |
| "epoch": 4.721257237386269, | |
| "grad_norm": 0.3206907985460206, | |
| "learning_rate": 3.1187316724133885e-07, | |
| "loss": 0.0108, | |
| "num_input_tokens_seen": 187039744, | |
| "step": 717 | |
| }, | |
| { | |
| "epoch": 4.727874276261373, | |
| "grad_norm": 0.25766634533303956, | |
| "learning_rate": 2.957067797194929e-07, | |
| "loss": 0.0124, | |
| "num_input_tokens_seen": 187301888, | |
| "step": 718 | |
| }, | |
| { | |
| "epoch": 4.734491315136476, | |
| "grad_norm": 0.2797485860867943, | |
| "learning_rate": 2.799681300149154e-07, | |
| "loss": 0.0122, | |
| "num_input_tokens_seen": 187564032, | |
| "step": 719 | |
| }, | |
| { | |
| "epoch": 4.74110835401158, | |
| "grad_norm": 0.3004663285838236, | |
| "learning_rate": 2.646574906314925e-07, | |
| "loss": 0.0107, | |
| "num_input_tokens_seen": 187826176, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 4.747725392886683, | |
| "grad_norm": 0.29686494498566196, | |
| "learning_rate": 2.497751266623938e-07, | |
| "loss": 0.0102, | |
| "num_input_tokens_seen": 188088320, | |
| "step": 721 | |
| }, | |
| { | |
| "epoch": 4.754342431761787, | |
| "grad_norm": 0.28503781761594604, | |
| "learning_rate": 2.3532129578549834e-07, | |
| "loss": 0.0126, | |
| "num_input_tokens_seen": 188350464, | |
| "step": 722 | |
| }, | |
| { | |
| "epoch": 4.76095947063689, | |
| "grad_norm": 0.2828533817351923, | |
| "learning_rate": 2.212962482589287e-07, | |
| "loss": 0.0104, | |
| "num_input_tokens_seen": 188612608, | |
| "step": 723 | |
| }, | |
| { | |
| "epoch": 4.7675765095119935, | |
| "grad_norm": 0.28500905365606427, | |
| "learning_rate": 2.0770022691672387e-07, | |
| "loss": 0.0109, | |
| "num_input_tokens_seen": 188874752, | |
| "step": 724 | |
| }, | |
| { | |
| "epoch": 4.774193548387097, | |
| "grad_norm": 0.2722334517755525, | |
| "learning_rate": 1.9453346716462317e-07, | |
| "loss": 0.0114, | |
| "num_input_tokens_seen": 189136896, | |
| "step": 725 | |
| }, | |
| { | |
| "epoch": 4.7808105872622, | |
| "grad_norm": 0.32973797361712476, | |
| "learning_rate": 1.8179619697600292e-07, | |
| "loss": 0.0123, | |
| "num_input_tokens_seen": 189399040, | |
| "step": 726 | |
| }, | |
| { | |
| "epoch": 4.787427626137304, | |
| "grad_norm": 0.28386615493093464, | |
| "learning_rate": 1.6948863688791837e-07, | |
| "loss": 0.0106, | |
| "num_input_tokens_seen": 189661184, | |
| "step": 727 | |
| }, | |
| { | |
| "epoch": 4.794044665012407, | |
| "grad_norm": 0.26188027672390873, | |
| "learning_rate": 1.576109999972958e-07, | |
| "loss": 0.0102, | |
| "num_input_tokens_seen": 189923328, | |
| "step": 728 | |
| }, | |
| { | |
| "epoch": 4.80066170388751, | |
| "grad_norm": 0.27818657182928697, | |
| "learning_rate": 1.4616349195723245e-07, | |
| "loss": 0.0117, | |
| "num_input_tokens_seen": 190185472, | |
| "step": 729 | |
| }, | |
| { | |
| "epoch": 4.807278742762613, | |
| "grad_norm": 0.2713376652927809, | |
| "learning_rate": 1.351463109734441e-07, | |
| "loss": 0.0115, | |
| "num_input_tokens_seen": 190447616, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 4.813895781637717, | |
| "grad_norm": 0.292337124074996, | |
| "learning_rate": 1.2455964780083152e-07, | |
| "loss": 0.0104, | |
| "num_input_tokens_seen": 190709760, | |
| "step": 731 | |
| }, | |
| { | |
| "epoch": 4.82051282051282, | |
| "grad_norm": 0.2624592360389428, | |
| "learning_rate": 1.1440368574017202e-07, | |
| "loss": 0.01, | |
| "num_input_tokens_seen": 190971904, | |
| "step": 732 | |
| }, | |
| { | |
| "epoch": 4.827129859387924, | |
| "grad_norm": 0.27915994440217407, | |
| "learning_rate": 1.0467860063495538e-07, | |
| "loss": 0.012, | |
| "num_input_tokens_seen": 191234048, | |
| "step": 733 | |
| }, | |
| { | |
| "epoch": 4.833746898263027, | |
| "grad_norm": 0.2988899889264722, | |
| "learning_rate": 9.538456086832237e-08, | |
| "loss": 0.0104, | |
| "num_input_tokens_seen": 191496192, | |
| "step": 734 | |
| }, | |
| { | |
| "epoch": 4.840363937138131, | |
| "grad_norm": 0.27437989333246415, | |
| "learning_rate": 8.652172736017816e-08, | |
| "loss": 0.0121, | |
| "num_input_tokens_seen": 191758336, | |
| "step": 735 | |
| }, | |
| { | |
| "epoch": 4.846980976013234, | |
| "grad_norm": 0.3475747345133441, | |
| "learning_rate": 7.809025356436961e-08, | |
| "loss": 0.0125, | |
| "num_input_tokens_seen": 192020480, | |
| "step": 736 | |
| }, | |
| { | |
| "epoch": 4.8535980148883375, | |
| "grad_norm": 0.32205466401651683, | |
| "learning_rate": 7.009028546606233e-08, | |
| "loss": 0.0111, | |
| "num_input_tokens_seen": 192282624, | |
| "step": 737 | |
| }, | |
| { | |
| "epoch": 4.860215053763441, | |
| "grad_norm": 0.4216447213602033, | |
| "learning_rate": 6.252196157919276e-08, | |
| "loss": 0.0106, | |
| "num_input_tokens_seen": 192544768, | |
| "step": 738 | |
| }, | |
| { | |
| "epoch": 4.866832092638544, | |
| "grad_norm": 0.26368939069501807, | |
| "learning_rate": 5.538541294407285e-08, | |
| "loss": 0.0128, | |
| "num_input_tokens_seen": 192806912, | |
| "step": 739 | |
| }, | |
| { | |
| "epoch": 4.873449131513648, | |
| "grad_norm": 0.27372393362295216, | |
| "learning_rate": 4.868076312512515e-08, | |
| "loss": 0.0123, | |
| "num_input_tokens_seen": 193069056, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 4.880066170388751, | |
| "grad_norm": 0.3065832019500159, | |
| "learning_rate": 4.240812820874296e-08, | |
| "loss": 0.0104, | |
| "num_input_tokens_seen": 193331200, | |
| "step": 741 | |
| }, | |
| { | |
| "epoch": 4.886683209263855, | |
| "grad_norm": 0.2602099220704508, | |
| "learning_rate": 3.656761680127796e-08, | |
| "loss": 0.0114, | |
| "num_input_tokens_seen": 193593344, | |
| "step": 742 | |
| }, | |
| { | |
| "epoch": 4.893300248138958, | |
| "grad_norm": 0.28036077725532615, | |
| "learning_rate": 3.1159330027161204e-08, | |
| "loss": 0.0106, | |
| "num_input_tokens_seen": 193855488, | |
| "step": 743 | |
| }, | |
| { | |
| "epoch": 4.899917287014061, | |
| "grad_norm": 0.27157575496127295, | |
| "learning_rate": 2.618336152715728e-08, | |
| "loss": 0.0105, | |
| "num_input_tokens_seen": 194117632, | |
| "step": 744 | |
| }, | |
| { | |
| "epoch": 4.906534325889164, | |
| "grad_norm": 0.29757307933818905, | |
| "learning_rate": 2.1639797456723952e-08, | |
| "loss": 0.0111, | |
| "num_input_tokens_seen": 194379776, | |
| "step": 745 | |
| }, | |
| { | |
| "epoch": 4.913151364764268, | |
| "grad_norm": 0.28454058209618394, | |
| "learning_rate": 1.752871648454668e-08, | |
| "loss": 0.0108, | |
| "num_input_tokens_seen": 194641920, | |
| "step": 746 | |
| }, | |
| { | |
| "epoch": 4.919768403639371, | |
| "grad_norm": 0.22677688916259928, | |
| "learning_rate": 1.3850189791161927e-08, | |
| "loss": 0.0118, | |
| "num_input_tokens_seen": 194904064, | |
| "step": 747 | |
| }, | |
| { | |
| "epoch": 4.926385442514475, | |
| "grad_norm": 0.2865694178009504, | |
| "learning_rate": 1.0604281067724819e-08, | |
| "loss": 0.0114, | |
| "num_input_tokens_seen": 195166208, | |
| "step": 748 | |
| }, | |
| { | |
| "epoch": 4.933002481389578, | |
| "grad_norm": 0.29714518565256665, | |
| "learning_rate": 7.791046514907252e-09, | |
| "loss": 0.0104, | |
| "num_input_tokens_seen": 195428352, | |
| "step": 749 | |
| }, | |
| { | |
| "epoch": 4.9396195202646815, | |
| "grad_norm": 0.29893263843518475, | |
| "learning_rate": 5.41053484192644e-09, | |
| "loss": 0.0131, | |
| "num_input_tokens_seen": 195690496, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 4.946236559139785, | |
| "grad_norm": 0.32165979568623193, | |
| "learning_rate": 3.462787265703926e-09, | |
| "loss": 0.0109, | |
| "num_input_tokens_seen": 195952640, | |
| "step": 751 | |
| }, | |
| { | |
| "epoch": 4.9528535980148884, | |
| "grad_norm": 0.2467567792729774, | |
| "learning_rate": 1.9478375101467104e-09, | |
| "loss": 0.0101, | |
| "num_input_tokens_seen": 196214784, | |
| "step": 752 | |
| }, | |
| { | |
| "epoch": 4.959470636889992, | |
| "grad_norm": 0.30153729872087115, | |
| "learning_rate": 8.657118055643843e-10, | |
| "loss": 0.0105, | |
| "num_input_tokens_seen": 196476928, | |
| "step": 753 | |
| }, | |
| { | |
| "epoch": 4.966087675765095, | |
| "grad_norm": 0.28356258452414257, | |
| "learning_rate": 2.164288882194887e-10, | |
| "loss": 0.0129, | |
| "num_input_tokens_seen": 196739072, | |
| "step": 754 | |
| }, | |
| { | |
| "epoch": 4.972704714640199, | |
| "grad_norm": 0.30560004687167447, | |
| "learning_rate": 0.0, | |
| "loss": 0.0115, | |
| "num_input_tokens_seen": 197001216, | |
| "step": 755 | |
| }, | |
| { | |
| "epoch": 4.972704714640199, | |
| "num_input_tokens_seen": 197001216, | |
| "step": 755, | |
| "total_flos": 314645277573120.0, | |
| "train_loss": 0.5845526486795568, | |
| "train_runtime": 18622.3068, | |
| "train_samples_per_second": 10.386, | |
| "train_steps_per_second": 0.041 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 755, | |
| "num_input_tokens_seen": 197001216, | |
| "num_train_epochs": 5, | |
| "save_steps": 5000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 314645277573120.0, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |