{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 49.82051282051282, "eval_steps": 500, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.41025641025641024, "grad_norm": 8.151017189025879, "learning_rate": 0.0, "loss": 1.3291, "step": 1 }, { "epoch": 0.8205128205128205, "grad_norm": 8.3428955078125, "learning_rate": 1.1111111111111112e-05, "loss": 1.3498, "step": 2 }, { "epoch": 1.4102564102564101, "grad_norm": 3.494961738586426, "learning_rate": 2.2222222222222223e-05, "loss": 1.3324, "step": 3 }, { "epoch": 1.8205128205128205, "grad_norm": 3.1425204277038574, "learning_rate": 3.3333333333333335e-05, "loss": 1.127, "step": 4 }, { "epoch": 2.41025641025641, "grad_norm": 2.282806396484375, "learning_rate": 4.4444444444444447e-05, "loss": 1.029, "step": 5 }, { "epoch": 2.8205128205128203, "grad_norm": 4.562756061553955, "learning_rate": 5.555555555555556e-05, "loss": 0.8966, "step": 6 }, { "epoch": 3.41025641025641, "grad_norm": 5.485057353973389, "learning_rate": 6.666666666666667e-05, "loss": 0.8497, "step": 7 }, { "epoch": 3.8205128205128203, "grad_norm": 1.8801462650299072, "learning_rate": 7.777777777777778e-05, "loss": 0.71, "step": 8 }, { "epoch": 4.410256410256411, "grad_norm": 1.7765756845474243, "learning_rate": 8.888888888888889e-05, "loss": 0.6853, "step": 9 }, { "epoch": 4.82051282051282, "grad_norm": 2.199324131011963, "learning_rate": 0.0001, "loss": 0.535, "step": 10 }, { "epoch": 5.410256410256411, "grad_norm": 2.8563392162323, "learning_rate": 0.00011111111111111112, "loss": 0.5501, "step": 11 }, { "epoch": 5.82051282051282, "grad_norm": 2.2397141456604004, "learning_rate": 0.00012222222222222224, "loss": 0.3953, "step": 12 }, { "epoch": 6.410256410256411, "grad_norm": 1.4788132905960083, "learning_rate": 0.00013333333333333334, "loss": 0.3417, "step": 13 }, { "epoch": 6.82051282051282, "grad_norm": 1.7762037515640259, "learning_rate": 0.00014444444444444444, "loss": 0.2219, "step": 14 }, { "epoch": 7.410256410256411, "grad_norm": 0.7586674690246582, "learning_rate": 0.00015555555555555556, "loss": 0.2104, "step": 15 }, { "epoch": 7.82051282051282, "grad_norm": 0.9284120202064514, "learning_rate": 0.0001666666666666667, "loss": 0.1662, "step": 16 }, { "epoch": 8.41025641025641, "grad_norm": 0.6511984467506409, "learning_rate": 0.00017777777777777779, "loss": 0.1547, "step": 17 }, { "epoch": 8.820512820512821, "grad_norm": 1.1308850049972534, "learning_rate": 0.00018888888888888888, "loss": 0.1444, "step": 18 }, { "epoch": 9.41025641025641, "grad_norm": 0.662821352481842, "learning_rate": 0.0002, "loss": 0.123, "step": 19 }, { "epoch": 9.820512820512821, "grad_norm": 0.5322834253311157, "learning_rate": 0.00019999854312354064, "loss": 0.1036, "step": 20 }, { "epoch": 10.41025641025641, "grad_norm": 0.5911012887954712, "learning_rate": 0.00019999417253661235, "loss": 0.0969, "step": 21 }, { "epoch": 10.820512820512821, "grad_norm": 0.628558874130249, "learning_rate": 0.00019998688836656323, "loss": 0.0857, "step": 22 }, { "epoch": 11.41025641025641, "grad_norm": 0.7193434834480286, "learning_rate": 0.00019997669082563597, "loss": 0.0748, "step": 23 }, { "epoch": 11.820512820512821, "grad_norm": 0.3524335026741028, "learning_rate": 0.00019996358021096176, "loss": 0.066, "step": 24 }, { "epoch": 12.41025641025641, "grad_norm": 0.47225716710090637, "learning_rate": 0.00019994755690455152, "loss": 0.0613, "step": 25 }, { "epoch": 12.820512820512821, "grad_norm": 0.4532913267612457, "learning_rate": 0.00019992862137328474, "loss": 0.0408, "step": 26 }, { "epoch": 13.41025641025641, "grad_norm": 0.2988104224205017, "learning_rate": 0.00019990677416889608, "loss": 0.0353, "step": 27 }, { "epoch": 13.820512820512821, "grad_norm": 0.31864798069000244, "learning_rate": 0.0001998820159279591, "loss": 0.033, "step": 28 }, { "epoch": 14.41025641025641, "grad_norm": 0.29291290044784546, "learning_rate": 0.0001998543473718677, "loss": 0.0235, "step": 29 }, { "epoch": 14.820512820512821, "grad_norm": 0.24096333980560303, "learning_rate": 0.00019982376930681531, "loss": 0.0194, "step": 30 }, { "epoch": 15.41025641025641, "grad_norm": 0.2400427609682083, "learning_rate": 0.00019979028262377118, "loss": 0.0177, "step": 31 }, { "epoch": 15.820512820512821, "grad_norm": 0.23485173285007477, "learning_rate": 0.00019975388829845448, "loss": 0.0132, "step": 32 }, { "epoch": 16.41025641025641, "grad_norm": 0.4795994758605957, "learning_rate": 0.00019971458739130598, "loss": 0.0123, "step": 33 }, { "epoch": 16.82051282051282, "grad_norm": 0.3436650335788727, "learning_rate": 0.00019967238104745696, "loss": 0.0077, "step": 34 }, { "epoch": 17.41025641025641, "grad_norm": 0.24164724349975586, "learning_rate": 0.000199627270496696, "loss": 0.0083, "step": 35 }, { "epoch": 17.82051282051282, "grad_norm": 0.11744043976068497, "learning_rate": 0.0001995792570534331, "loss": 0.0053, "step": 36 }, { "epoch": 18.41025641025641, "grad_norm": 0.2771929204463959, "learning_rate": 0.0001995283421166614, "loss": 0.0076, "step": 37 }, { "epoch": 18.82051282051282, "grad_norm": 0.14852669835090637, "learning_rate": 0.00019947452716991633, "loss": 0.0042, "step": 38 }, { "epoch": 19.41025641025641, "grad_norm": 1.1028482913970947, "learning_rate": 0.00019941781378123244, "loss": 0.0114, "step": 39 }, { "epoch": 19.82051282051282, "grad_norm": 0.23756887018680573, "learning_rate": 0.00019935820360309777, "loss": 0.0043, "step": 40 }, { "epoch": 20.41025641025641, "grad_norm": 0.8769266605377197, "learning_rate": 0.00019929569837240564, "loss": 0.0047, "step": 41 }, { "epoch": 20.82051282051282, "grad_norm": 0.531132698059082, "learning_rate": 0.00019923029991040402, "loss": 0.0063, "step": 42 }, { "epoch": 21.41025641025641, "grad_norm": 1.1996066570281982, "learning_rate": 0.00019916201012264254, "loss": 0.0143, "step": 43 }, { "epoch": 21.82051282051282, "grad_norm": 0.6255332827568054, "learning_rate": 0.0001990908309989168, "loss": 0.0168, "step": 44 }, { "epoch": 22.41025641025641, "grad_norm": 49.508148193359375, "learning_rate": 0.00019901676461321068, "loss": 0.0621, "step": 45 }, { "epoch": 22.82051282051282, "grad_norm": 8.113191604614258, "learning_rate": 0.00019893981312363562, "loss": 0.1062, "step": 46 }, { "epoch": 23.41025641025641, "grad_norm": 2.3446950912475586, "learning_rate": 0.00019885997877236788, "loss": 0.066, "step": 47 }, { "epoch": 23.82051282051282, "grad_norm": 153.14146423339844, "learning_rate": 0.00019877726388558325, "loss": 0.0612, "step": 48 }, { "epoch": 24.41025641025641, "grad_norm": 43.04759216308594, "learning_rate": 0.00019869167087338907, "loss": 0.135, "step": 49 }, { "epoch": 24.82051282051282, "grad_norm": 51.32644271850586, "learning_rate": 0.00019860320222975431, "loss": 0.1375, "step": 50 }, { "epoch": 25.41025641025641, "grad_norm": 1.9464935064315796, "learning_rate": 0.00019851186053243666, "loss": 0.3427, "step": 51 }, { "epoch": 25.82051282051282, "grad_norm": 381.23974609375, "learning_rate": 0.00019841764844290744, "loss": 2.1722, "step": 52 }, { "epoch": 26.41025641025641, "grad_norm": 119.89301300048828, "learning_rate": 0.00019832056870627417, "loss": 2.4659, "step": 53 }, { "epoch": 26.82051282051282, "grad_norm": 48.73936080932617, "learning_rate": 0.00019822062415120054, "loss": 0.8509, "step": 54 }, { "epoch": 27.41025641025641, "grad_norm": 19.564029693603516, "learning_rate": 0.0001981178176898239, "loss": 0.4258, "step": 55 }, { "epoch": 27.82051282051282, "grad_norm": 33.161495208740234, "learning_rate": 0.00019801215231767056, "loss": 0.2414, "step": 56 }, { "epoch": 28.41025641025641, "grad_norm": 5.548079013824463, "learning_rate": 0.00019790363111356837, "loss": 0.1882, "step": 57 }, { "epoch": 28.82051282051282, "grad_norm": 4.21547794342041, "learning_rate": 0.00019779225723955707, "loss": 0.1264, "step": 58 }, { "epoch": 29.41025641025641, "grad_norm": 2.7072598934173584, "learning_rate": 0.00019767803394079615, "loss": 0.2181, "step": 59 }, { "epoch": 29.82051282051282, "grad_norm": 7.378205299377441, "learning_rate": 0.0001975609645454704, "loss": 0.1593, "step": 60 }, { "epoch": 30.41025641025641, "grad_norm": 9.17626667022705, "learning_rate": 0.00019744105246469263, "loss": 0.3464, "step": 61 }, { "epoch": 30.82051282051282, "grad_norm": 27.878585815429688, "learning_rate": 0.00019731830119240463, "loss": 0.4882, "step": 62 }, { "epoch": 31.41025641025641, "grad_norm": 15.55352783203125, "learning_rate": 0.0001971927143052752, "loss": 0.7857, "step": 63 }, { "epoch": 31.82051282051282, "grad_norm": 16.477920532226562, "learning_rate": 0.00019706429546259593, "loss": 0.663, "step": 64 }, { "epoch": 32.41025641025641, "grad_norm": 13.829732894897461, "learning_rate": 0.00019693304840617457, "loss": 0.5652, "step": 65 }, { "epoch": 32.82051282051282, "grad_norm": 1.885118842124939, "learning_rate": 0.00019679897696022608, "loss": 0.2583, "step": 66 }, { "epoch": 33.41025641025641, "grad_norm": 1.9031124114990234, "learning_rate": 0.00019666208503126112, "loss": 0.2566, "step": 67 }, { "epoch": 33.82051282051282, "grad_norm": 1.2540283203125, "learning_rate": 0.0001965223766079723, "loss": 0.1855, "step": 68 }, { "epoch": 34.41025641025641, "grad_norm": 0.9428790807723999, "learning_rate": 0.00019637985576111778, "loss": 0.1633, "step": 69 }, { "epoch": 34.82051282051282, "grad_norm": 0.8358070254325867, "learning_rate": 0.00019623452664340306, "loss": 0.1277, "step": 70 }, { "epoch": 35.41025641025641, "grad_norm": 0.9116950631141663, "learning_rate": 0.0001960863934893594, "loss": 0.1124, "step": 71 }, { "epoch": 35.82051282051282, "grad_norm": 1.235021948814392, "learning_rate": 0.00019593546061522093, "loss": 0.0928, "step": 72 }, { "epoch": 36.41025641025641, "grad_norm": 0.7440080046653748, "learning_rate": 0.00019578173241879872, "loss": 0.0839, "step": 73 }, { "epoch": 36.82051282051282, "grad_norm": 0.5238239765167236, "learning_rate": 0.00019562521337935257, "loss": 0.0589, "step": 74 }, { "epoch": 37.41025641025641, "grad_norm": 0.637015700340271, "learning_rate": 0.00019546590805746052, "loss": 0.0538, "step": 75 }, { "epoch": 37.82051282051282, "grad_norm": 0.3730023205280304, "learning_rate": 0.0001953038210948861, "loss": 0.0379, "step": 76 }, { "epoch": 38.41025641025641, "grad_norm": 0.39598342776298523, "learning_rate": 0.00019513895721444286, "loss": 0.0314, "step": 77 }, { "epoch": 38.82051282051282, "grad_norm": 0.26019713282585144, "learning_rate": 0.00019497132121985695, "loss": 0.0247, "step": 78 }, { "epoch": 39.41025641025641, "grad_norm": 0.27270156145095825, "learning_rate": 0.00019480091799562704, "loss": 0.0219, "step": 79 }, { "epoch": 39.82051282051282, "grad_norm": 0.31213200092315674, "learning_rate": 0.0001946277525068821, "loss": 0.0177, "step": 80 }, { "epoch": 40.41025641025641, "grad_norm": 0.3065904676914215, "learning_rate": 0.00019445182979923654, "loss": 0.0167, "step": 81 }, { "epoch": 40.82051282051282, "grad_norm": 0.25565171241760254, "learning_rate": 0.00019427315499864344, "loss": 0.0132, "step": 82 }, { "epoch": 41.41025641025641, "grad_norm": 0.16997747123241425, "learning_rate": 0.000194091733311245, "loss": 0.0106, "step": 83 }, { "epoch": 41.82051282051282, "grad_norm": 0.13165056705474854, "learning_rate": 0.0001939075700232209, "loss": 0.0108, "step": 84 }, { "epoch": 42.41025641025641, "grad_norm": 0.10982735455036163, "learning_rate": 0.00019372067050063438, "loss": 0.0096, "step": 85 }, { "epoch": 42.82051282051282, "grad_norm": 0.10672740638256073, "learning_rate": 0.00019353104018927567, "loss": 0.0083, "step": 86 }, { "epoch": 43.41025641025641, "grad_norm": 0.1570005714893341, "learning_rate": 0.0001933386846145036, "loss": 0.0066, "step": 87 }, { "epoch": 43.82051282051282, "grad_norm": 0.1381327509880066, "learning_rate": 0.00019314360938108425, "loss": 0.008, "step": 88 }, { "epoch": 44.41025641025641, "grad_norm": 0.13799023628234863, "learning_rate": 0.00019294582017302797, "loss": 0.0075, "step": 89 }, { "epoch": 44.82051282051282, "grad_norm": 0.07857757061719894, "learning_rate": 0.00019274532275342354, "loss": 0.0058, "step": 90 }, { "epoch": 45.41025641025641, "grad_norm": 0.40940356254577637, "learning_rate": 0.00019254212296427044, "loss": 0.0078, "step": 91 }, { "epoch": 45.82051282051282, "grad_norm": 0.13838538527488708, "learning_rate": 0.0001923362267263084, "loss": 0.0063, "step": 92 }, { "epoch": 46.41025641025641, "grad_norm": 0.1280914694070816, "learning_rate": 0.0001921276400388451, "loss": 0.0051, "step": 93 }, { "epoch": 46.82051282051282, "grad_norm": 0.1300235092639923, "learning_rate": 0.00019191636897958122, "loss": 0.0045, "step": 94 }, { "epoch": 47.41025641025641, "grad_norm": 0.05682254955172539, "learning_rate": 0.00019170241970443343, "loss": 0.0045, "step": 95 }, { "epoch": 47.82051282051282, "grad_norm": 0.06927549839019775, "learning_rate": 0.00019148579844735497, "loss": 0.0032, "step": 96 }, { "epoch": 48.41025641025641, "grad_norm": 0.09624794870615005, "learning_rate": 0.00019126651152015403, "loss": 0.0041, "step": 97 }, { "epoch": 48.82051282051282, "grad_norm": 0.0919504463672638, "learning_rate": 0.00019104456531230984, "loss": 0.0032, "step": 98 }, { "epoch": 49.41025641025641, "grad_norm": 0.20492327213287354, "learning_rate": 0.00019081996629078657, "loss": 0.0039, "step": 99 }, { "epoch": 49.82051282051282, "grad_norm": 0.042908795177936554, "learning_rate": 0.0001905927209998447, "loss": 0.002, "step": 100 } ], "logging_steps": 1.0, "max_steps": 600, "num_input_tokens_seen": 0, "num_train_epochs": 300, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.187460722471731e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }