{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 49.82051282051282,
  "eval_steps": 500,
  "global_step": 100,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.41025641025641024,
      "grad_norm": 8.151017189025879,
      "learning_rate": 0.0,
      "loss": 1.3291,
      "step": 1
    },
    {
      "epoch": 0.8205128205128205,
      "grad_norm": 8.3428955078125,
      "learning_rate": 1.1111111111111112e-05,
      "loss": 1.3498,
      "step": 2
    },
    {
      "epoch": 1.4102564102564101,
      "grad_norm": 3.494961738586426,
      "learning_rate": 2.2222222222222223e-05,
      "loss": 1.3324,
      "step": 3
    },
    {
      "epoch": 1.8205128205128205,
      "grad_norm": 3.1425204277038574,
      "learning_rate": 3.3333333333333335e-05,
      "loss": 1.127,
      "step": 4
    },
    {
      "epoch": 2.41025641025641,
      "grad_norm": 2.282806396484375,
      "learning_rate": 4.4444444444444447e-05,
      "loss": 1.029,
      "step": 5
    },
    {
      "epoch": 2.8205128205128203,
      "grad_norm": 4.562756061553955,
      "learning_rate": 5.555555555555556e-05,
      "loss": 0.8966,
      "step": 6
    },
    {
      "epoch": 3.41025641025641,
      "grad_norm": 5.485057353973389,
      "learning_rate": 6.666666666666667e-05,
      "loss": 0.8497,
      "step": 7
    },
    {
      "epoch": 3.8205128205128203,
      "grad_norm": 1.8801462650299072,
      "learning_rate": 7.777777777777778e-05,
      "loss": 0.71,
      "step": 8
    },
    {
      "epoch": 4.410256410256411,
      "grad_norm": 1.7765756845474243,
      "learning_rate": 8.888888888888889e-05,
      "loss": 0.6853,
      "step": 9
    },
    {
      "epoch": 4.82051282051282,
      "grad_norm": 2.199324131011963,
      "learning_rate": 0.0001,
      "loss": 0.535,
      "step": 10
    },
    {
      "epoch": 5.410256410256411,
      "grad_norm": 2.8563392162323,
      "learning_rate": 0.00011111111111111112,
      "loss": 0.5501,
      "step": 11
    },
    {
      "epoch": 5.82051282051282,
      "grad_norm": 2.2397141456604004,
      "learning_rate": 0.00012222222222222224,
      "loss": 0.3953,
      "step": 12
    },
    {
      "epoch": 6.410256410256411,
      "grad_norm": 1.4788132905960083,
      "learning_rate": 0.00013333333333333334,
      "loss": 0.3417,
      "step": 13
    },
    {
      "epoch": 6.82051282051282,
      "grad_norm": 1.7762037515640259,
      "learning_rate": 0.00014444444444444444,
      "loss": 0.2219,
      "step": 14
    },
    {
      "epoch": 7.410256410256411,
      "grad_norm": 0.7586674690246582,
      "learning_rate": 0.00015555555555555556,
      "loss": 0.2104,
      "step": 15
    },
    {
      "epoch": 7.82051282051282,
      "grad_norm": 0.9284120202064514,
      "learning_rate": 0.0001666666666666667,
      "loss": 0.1662,
      "step": 16
    },
    {
      "epoch": 8.41025641025641,
      "grad_norm": 0.6511984467506409,
      "learning_rate": 0.00017777777777777779,
      "loss": 0.1547,
      "step": 17
    },
    {
      "epoch": 8.820512820512821,
      "grad_norm": 1.1308850049972534,
      "learning_rate": 0.00018888888888888888,
      "loss": 0.1444,
      "step": 18
    },
    {
      "epoch": 9.41025641025641,
      "grad_norm": 0.662821352481842,
      "learning_rate": 0.0002,
      "loss": 0.123,
      "step": 19
    },
    {
      "epoch": 9.820512820512821,
      "grad_norm": 0.5322834253311157,
      "learning_rate": 0.00019999854312354064,
      "loss": 0.1036,
      "step": 20
    },
    {
      "epoch": 10.41025641025641,
      "grad_norm": 0.5911012887954712,
      "learning_rate": 0.00019999417253661235,
      "loss": 0.0969,
      "step": 21
    },
    {
      "epoch": 10.820512820512821,
      "grad_norm": 0.628558874130249,
      "learning_rate": 0.00019998688836656323,
      "loss": 0.0857,
      "step": 22
    },
    {
      "epoch": 11.41025641025641,
      "grad_norm": 0.7193434834480286,
      "learning_rate": 0.00019997669082563597,
      "loss": 0.0748,
      "step": 23
    },
    {
      "epoch": 11.820512820512821,
      "grad_norm": 0.3524335026741028,
      "learning_rate": 0.00019996358021096176,
      "loss": 0.066,
      "step": 24
    },
    {
      "epoch": 12.41025641025641,
      "grad_norm": 0.47225716710090637,
      "learning_rate": 0.00019994755690455152,
      "loss": 0.0613,
      "step": 25
    },
    {
      "epoch": 12.820512820512821,
      "grad_norm": 0.4532913267612457,
      "learning_rate": 0.00019992862137328474,
      "loss": 0.0408,
      "step": 26
    },
    {
      "epoch": 13.41025641025641,
      "grad_norm": 0.2988104224205017,
      "learning_rate": 0.00019990677416889608,
      "loss": 0.0353,
      "step": 27
    },
    {
      "epoch": 13.820512820512821,
      "grad_norm": 0.31864798069000244,
      "learning_rate": 0.0001998820159279591,
      "loss": 0.033,
      "step": 28
    },
    {
      "epoch": 14.41025641025641,
      "grad_norm": 0.29291290044784546,
      "learning_rate": 0.0001998543473718677,
      "loss": 0.0235,
      "step": 29
    },
    {
      "epoch": 14.820512820512821,
      "grad_norm": 0.24096333980560303,
      "learning_rate": 0.00019982376930681531,
      "loss": 0.0194,
      "step": 30
    },
    {
      "epoch": 15.41025641025641,
      "grad_norm": 0.2400427609682083,
      "learning_rate": 0.00019979028262377118,
      "loss": 0.0177,
      "step": 31
    },
    {
      "epoch": 15.820512820512821,
      "grad_norm": 0.23485173285007477,
      "learning_rate": 0.00019975388829845448,
      "loss": 0.0132,
      "step": 32
    },
    {
      "epoch": 16.41025641025641,
      "grad_norm": 0.4795994758605957,
      "learning_rate": 0.00019971458739130598,
      "loss": 0.0123,
      "step": 33
    },
    {
      "epoch": 16.82051282051282,
      "grad_norm": 0.3436650335788727,
      "learning_rate": 0.00019967238104745696,
      "loss": 0.0077,
      "step": 34
    },
    {
      "epoch": 17.41025641025641,
      "grad_norm": 0.24164724349975586,
      "learning_rate": 0.000199627270496696,
      "loss": 0.0083,
      "step": 35
    },
    {
      "epoch": 17.82051282051282,
      "grad_norm": 0.11744043976068497,
      "learning_rate": 0.0001995792570534331,
      "loss": 0.0053,
      "step": 36
    },
    {
      "epoch": 18.41025641025641,
      "grad_norm": 0.2771929204463959,
      "learning_rate": 0.0001995283421166614,
      "loss": 0.0076,
      "step": 37
    },
    {
      "epoch": 18.82051282051282,
      "grad_norm": 0.14852669835090637,
      "learning_rate": 0.00019947452716991633,
      "loss": 0.0042,
      "step": 38
    },
    {
      "epoch": 19.41025641025641,
      "grad_norm": 1.1028482913970947,
      "learning_rate": 0.00019941781378123244,
      "loss": 0.0114,
      "step": 39
    },
    {
      "epoch": 19.82051282051282,
      "grad_norm": 0.23756887018680573,
      "learning_rate": 0.00019935820360309777,
      "loss": 0.0043,
      "step": 40
    },
    {
      "epoch": 20.41025641025641,
      "grad_norm": 0.8769266605377197,
      "learning_rate": 0.00019929569837240564,
      "loss": 0.0047,
      "step": 41
    },
    {
      "epoch": 20.82051282051282,
      "grad_norm": 0.531132698059082,
      "learning_rate": 0.00019923029991040402,
      "loss": 0.0063,
      "step": 42
    },
    {
      "epoch": 21.41025641025641,
      "grad_norm": 1.1996066570281982,
      "learning_rate": 0.00019916201012264254,
      "loss": 0.0143,
      "step": 43
    },
    {
      "epoch": 21.82051282051282,
      "grad_norm": 0.6255332827568054,
      "learning_rate": 0.0001990908309989168,
      "loss": 0.0168,
      "step": 44
    },
    {
      "epoch": 22.41025641025641,
      "grad_norm": 49.508148193359375,
      "learning_rate": 0.00019901676461321068,
      "loss": 0.0621,
      "step": 45
    },
    {
      "epoch": 22.82051282051282,
      "grad_norm": 8.113191604614258,
      "learning_rate": 0.00019893981312363562,
      "loss": 0.1062,
      "step": 46
    },
    {
      "epoch": 23.41025641025641,
      "grad_norm": 2.3446950912475586,
      "learning_rate": 0.00019885997877236788,
      "loss": 0.066,
      "step": 47
    },
    {
      "epoch": 23.82051282051282,
      "grad_norm": 153.14146423339844,
      "learning_rate": 0.00019877726388558325,
      "loss": 0.0612,
      "step": 48
    },
    {
      "epoch": 24.41025641025641,
      "grad_norm": 43.04759216308594,
      "learning_rate": 0.00019869167087338907,
      "loss": 0.135,
      "step": 49
    },
    {
      "epoch": 24.82051282051282,
      "grad_norm": 51.32644271850586,
      "learning_rate": 0.00019860320222975431,
      "loss": 0.1375,
      "step": 50
    },
    {
      "epoch": 25.41025641025641,
      "grad_norm": 1.9464935064315796,
      "learning_rate": 0.00019851186053243666,
      "loss": 0.3427,
      "step": 51
    },
    {
      "epoch": 25.82051282051282,
      "grad_norm": 381.23974609375,
      "learning_rate": 0.00019841764844290744,
      "loss": 2.1722,
      "step": 52
    },
    {
      "epoch": 26.41025641025641,
      "grad_norm": 119.89301300048828,
      "learning_rate": 0.00019832056870627417,
      "loss": 2.4659,
      "step": 53
    },
    {
      "epoch": 26.82051282051282,
      "grad_norm": 48.73936080932617,
      "learning_rate": 0.00019822062415120054,
      "loss": 0.8509,
      "step": 54
    },
    {
      "epoch": 27.41025641025641,
      "grad_norm": 19.564029693603516,
      "learning_rate": 0.0001981178176898239,
      "loss": 0.4258,
      "step": 55
    },
    {
      "epoch": 27.82051282051282,
      "grad_norm": 33.161495208740234,
      "learning_rate": 0.00019801215231767056,
      "loss": 0.2414,
      "step": 56
    },
    {
      "epoch": 28.41025641025641,
      "grad_norm": 5.548079013824463,
      "learning_rate": 0.00019790363111356837,
      "loss": 0.1882,
      "step": 57
    },
    {
      "epoch": 28.82051282051282,
      "grad_norm": 4.21547794342041,
      "learning_rate": 0.00019779225723955707,
      "loss": 0.1264,
      "step": 58
    },
    {
      "epoch": 29.41025641025641,
      "grad_norm": 2.7072598934173584,
      "learning_rate": 0.00019767803394079615,
      "loss": 0.2181,
      "step": 59
    },
    {
      "epoch": 29.82051282051282,
      "grad_norm": 7.378205299377441,
      "learning_rate": 0.0001975609645454704,
      "loss": 0.1593,
      "step": 60
    },
    {
      "epoch": 30.41025641025641,
      "grad_norm": 9.17626667022705,
      "learning_rate": 0.00019744105246469263,
      "loss": 0.3464,
      "step": 61
    },
    {
      "epoch": 30.82051282051282,
      "grad_norm": 27.878585815429688,
      "learning_rate": 0.00019731830119240463,
      "loss": 0.4882,
      "step": 62
    },
    {
      "epoch": 31.41025641025641,
      "grad_norm": 15.55352783203125,
      "learning_rate": 0.0001971927143052752,
      "loss": 0.7857,
      "step": 63
    },
    {
      "epoch": 31.82051282051282,
      "grad_norm": 16.477920532226562,
      "learning_rate": 0.00019706429546259593,
      "loss": 0.663,
      "step": 64
    },
    {
      "epoch": 32.41025641025641,
      "grad_norm": 13.829732894897461,
      "learning_rate": 0.00019693304840617457,
      "loss": 0.5652,
      "step": 65
    },
    {
      "epoch": 32.82051282051282,
      "grad_norm": 1.885118842124939,
      "learning_rate": 0.00019679897696022608,
      "loss": 0.2583,
      "step": 66
    },
    {
      "epoch": 33.41025641025641,
      "grad_norm": 1.9031124114990234,
      "learning_rate": 0.00019666208503126112,
      "loss": 0.2566,
      "step": 67
    },
    {
      "epoch": 33.82051282051282,
      "grad_norm": 1.2540283203125,
      "learning_rate": 0.0001965223766079723,
      "loss": 0.1855,
      "step": 68
    },
    {
      "epoch": 34.41025641025641,
      "grad_norm": 0.9428790807723999,
      "learning_rate": 0.00019637985576111778,
      "loss": 0.1633,
      "step": 69
    },
    {
      "epoch": 34.82051282051282,
      "grad_norm": 0.8358070254325867,
      "learning_rate": 0.00019623452664340306,
      "loss": 0.1277,
      "step": 70
    },
    {
      "epoch": 35.41025641025641,
      "grad_norm": 0.9116950631141663,
      "learning_rate": 0.0001960863934893594,
      "loss": 0.1124,
      "step": 71
    },
    {
      "epoch": 35.82051282051282,
      "grad_norm": 1.235021948814392,
      "learning_rate": 0.00019593546061522093,
      "loss": 0.0928,
      "step": 72
    },
    {
      "epoch": 36.41025641025641,
      "grad_norm": 0.7440080046653748,
      "learning_rate": 0.00019578173241879872,
      "loss": 0.0839,
      "step": 73
    },
    {
      "epoch": 36.82051282051282,
      "grad_norm": 0.5238239765167236,
      "learning_rate": 0.00019562521337935257,
      "loss": 0.0589,
      "step": 74
    },
    {
      "epoch": 37.41025641025641,
      "grad_norm": 0.637015700340271,
      "learning_rate": 0.00019546590805746052,
      "loss": 0.0538,
      "step": 75
    },
    {
      "epoch": 37.82051282051282,
      "grad_norm": 0.3730023205280304,
      "learning_rate": 0.0001953038210948861,
      "loss": 0.0379,
      "step": 76
    },
    {
      "epoch": 38.41025641025641,
      "grad_norm": 0.39598342776298523,
      "learning_rate": 0.00019513895721444286,
      "loss": 0.0314,
      "step": 77
    },
    {
      "epoch": 38.82051282051282,
      "grad_norm": 0.26019713282585144,
      "learning_rate": 0.00019497132121985695,
      "loss": 0.0247,
      "step": 78
    },
    {
      "epoch": 39.41025641025641,
      "grad_norm": 0.27270156145095825,
      "learning_rate": 0.00019480091799562704,
      "loss": 0.0219,
      "step": 79
    },
    {
      "epoch": 39.82051282051282,
      "grad_norm": 0.31213200092315674,
      "learning_rate": 0.0001946277525068821,
      "loss": 0.0177,
      "step": 80
    },
    {
      "epoch": 40.41025641025641,
      "grad_norm": 0.3065904676914215,
      "learning_rate": 0.00019445182979923654,
      "loss": 0.0167,
      "step": 81
    },
    {
      "epoch": 40.82051282051282,
      "grad_norm": 0.25565171241760254,
      "learning_rate": 0.00019427315499864344,
      "loss": 0.0132,
      "step": 82
    },
    {
      "epoch": 41.41025641025641,
      "grad_norm": 0.16997747123241425,
      "learning_rate": 0.000194091733311245,
      "loss": 0.0106,
      "step": 83
    },
    {
      "epoch": 41.82051282051282,
      "grad_norm": 0.13165056705474854,
      "learning_rate": 0.0001939075700232209,
      "loss": 0.0108,
      "step": 84
    },
    {
      "epoch": 42.41025641025641,
      "grad_norm": 0.10982735455036163,
      "learning_rate": 0.00019372067050063438,
      "loss": 0.0096,
      "step": 85
    },
    {
      "epoch": 42.82051282051282,
      "grad_norm": 0.10672740638256073,
      "learning_rate": 0.00019353104018927567,
      "loss": 0.0083,
      "step": 86
    },
    {
      "epoch": 43.41025641025641,
      "grad_norm": 0.1570005714893341,
      "learning_rate": 0.0001933386846145036,
      "loss": 0.0066,
      "step": 87
    },
    {
      "epoch": 43.82051282051282,
      "grad_norm": 0.1381327509880066,
      "learning_rate": 0.00019314360938108425,
      "loss": 0.008,
      "step": 88
    },
    {
      "epoch": 44.41025641025641,
      "grad_norm": 0.13799023628234863,
      "learning_rate": 0.00019294582017302797,
      "loss": 0.0075,
      "step": 89
    },
    {
      "epoch": 44.82051282051282,
      "grad_norm": 0.07857757061719894,
      "learning_rate": 0.00019274532275342354,
      "loss": 0.0058,
      "step": 90
    },
    {
      "epoch": 45.41025641025641,
      "grad_norm": 0.40940356254577637,
      "learning_rate": 0.00019254212296427044,
      "loss": 0.0078,
      "step": 91
    },
    {
      "epoch": 45.82051282051282,
      "grad_norm": 0.13838538527488708,
      "learning_rate": 0.0001923362267263084,
      "loss": 0.0063,
      "step": 92
    },
    {
      "epoch": 46.41025641025641,
      "grad_norm": 0.1280914694070816,
      "learning_rate": 0.0001921276400388451,
      "loss": 0.0051,
      "step": 93
    },
    {
      "epoch": 46.82051282051282,
      "grad_norm": 0.1300235092639923,
      "learning_rate": 0.00019191636897958122,
      "loss": 0.0045,
      "step": 94
    },
    {
      "epoch": 47.41025641025641,
      "grad_norm": 0.05682254955172539,
      "learning_rate": 0.00019170241970443343,
      "loss": 0.0045,
      "step": 95
    },
    {
      "epoch": 47.82051282051282,
      "grad_norm": 0.06927549839019775,
      "learning_rate": 0.00019148579844735497,
      "loss": 0.0032,
      "step": 96
    },
    {
      "epoch": 48.41025641025641,
      "grad_norm": 0.09624794870615005,
      "learning_rate": 0.00019126651152015403,
      "loss": 0.0041,
      "step": 97
    },
    {
      "epoch": 48.82051282051282,
      "grad_norm": 0.0919504463672638,
      "learning_rate": 0.00019104456531230984,
      "loss": 0.0032,
      "step": 98
    },
    {
      "epoch": 49.41025641025641,
      "grad_norm": 0.20492327213287354,
      "learning_rate": 0.00019081996629078657,
      "loss": 0.0039,
      "step": 99
    },
    {
      "epoch": 49.82051282051282,
      "grad_norm": 0.042908795177936554,
      "learning_rate": 0.0001905927209998447,
      "loss": 0.002,
      "step": 100
    }
  ],
  "logging_steps": 1.0,
  "max_steps": 600,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 300,
  "save_steps": 50,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 7.187460722471731e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}