| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.005333646240579447, |
| "eval_steps": 500, |
| "global_step": 100, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 5.3336462405794474e-05, |
| "grad_norm": 0.398686021566391, |
| "learning_rate": 5e-06, |
| "loss": 0.1308, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.00010667292481158895, |
| "grad_norm": 0.46305230259895325, |
| "learning_rate": 1e-05, |
| "loss": 0.1388, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.00016000938721738342, |
| "grad_norm": 0.3536907136440277, |
| "learning_rate": 1.5e-05, |
| "loss": 0.1216, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.0002133458496231779, |
| "grad_norm": 0.32838913798332214, |
| "learning_rate": 2e-05, |
| "loss": 0.124, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.00026668231202897234, |
| "grad_norm": 0.3058461546897888, |
| "learning_rate": 2.5e-05, |
| "loss": 0.1144, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.00032001877443476684, |
| "grad_norm": 0.3052152097225189, |
| "learning_rate": 3e-05, |
| "loss": 0.109, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.0003733552368405613, |
| "grad_norm": 0.3326779901981354, |
| "learning_rate": 3.5e-05, |
| "loss": 0.1233, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.0004266916992463558, |
| "grad_norm": 0.3231862187385559, |
| "learning_rate": 4e-05, |
| "loss": 0.1099, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.00048002816165215024, |
| "grad_norm": 0.32896554470062256, |
| "learning_rate": 4.5e-05, |
| "loss": 0.1199, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.0005333646240579447, |
| "grad_norm": 0.3078831434249878, |
| "learning_rate": 5e-05, |
| "loss": 0.1042, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.0005867010864637392, |
| "grad_norm": 0.2900382876396179, |
| "learning_rate": 5.500000000000001e-05, |
| "loss": 0.1108, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.0006400375488695337, |
| "grad_norm": 0.28514525294303894, |
| "learning_rate": 6e-05, |
| "loss": 0.1092, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.0006933740112753282, |
| "grad_norm": 0.30242955684661865, |
| "learning_rate": 6.500000000000001e-05, |
| "loss": 0.1104, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.0007467104736811226, |
| "grad_norm": 0.3053489029407501, |
| "learning_rate": 7e-05, |
| "loss": 0.1209, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.0008000469360869171, |
| "grad_norm": 0.32302212715148926, |
| "learning_rate": 7.500000000000001e-05, |
| "loss": 0.1273, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.0008533833984927116, |
| "grad_norm": 0.29552462697029114, |
| "learning_rate": 8e-05, |
| "loss": 0.12, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.0009067198608985061, |
| "grad_norm": 0.33688342571258545, |
| "learning_rate": 8.5e-05, |
| "loss": 0.135, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.0009600563233043005, |
| "grad_norm": 0.29682204127311707, |
| "learning_rate": 9e-05, |
| "loss": 0.1224, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.001013392785710095, |
| "grad_norm": 0.33479437232017517, |
| "learning_rate": 9.5e-05, |
| "loss": 0.143, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.0010667292481158894, |
| "grad_norm": 0.3345559537410736, |
| "learning_rate": 0.0001, |
| "loss": 0.1197, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.001120065710521684, |
| "grad_norm": 0.34470704197883606, |
| "learning_rate": 9.999999982431556e-05, |
| "loss": 0.1341, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.0011734021729274784, |
| "grad_norm": 0.3448582887649536, |
| "learning_rate": 9.999999929726225e-05, |
| "loss": 0.1404, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.001226738635333273, |
| "grad_norm": 0.33284544944763184, |
| "learning_rate": 9.999999841884006e-05, |
| "loss": 0.1399, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.0012800750977390674, |
| "grad_norm": 0.3286615014076233, |
| "learning_rate": 9.999999718904902e-05, |
| "loss": 0.1353, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.0013334115601448618, |
| "grad_norm": 0.3405831456184387, |
| "learning_rate": 9.999999560788912e-05, |
| "loss": 0.1527, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.0013867480225506564, |
| "grad_norm": 0.3199956715106964, |
| "learning_rate": 9.999999367536037e-05, |
| "loss": 0.1469, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.0014400844849564508, |
| "grad_norm": 0.3393600881099701, |
| "learning_rate": 9.999999139146278e-05, |
| "loss": 0.1536, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.0014934209473622452, |
| "grad_norm": 0.3263189196586609, |
| "learning_rate": 9.99999887561964e-05, |
| "loss": 0.1498, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.0015467574097680398, |
| "grad_norm": 0.31700044870376587, |
| "learning_rate": 9.999998576956121e-05, |
| "loss": 0.1391, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.0016000938721738342, |
| "grad_norm": 0.33871933817863464, |
| "learning_rate": 9.999998243155724e-05, |
| "loss": 0.159, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.0016534303345796288, |
| "grad_norm": 0.34099170565605164, |
| "learning_rate": 9.999997874218452e-05, |
| "loss": 0.1543, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.0017067667969854232, |
| "grad_norm": 0.33778145909309387, |
| "learning_rate": 9.999997470144308e-05, |
| "loss": 0.1566, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.0017601032593912175, |
| "grad_norm": 0.32202789187431335, |
| "learning_rate": 9.999997030933294e-05, |
| "loss": 0.1688, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.0018134397217970122, |
| "grad_norm": 0.3160647749900818, |
| "learning_rate": 9.999996556585412e-05, |
| "loss": 0.1645, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.0018667761842028065, |
| "grad_norm": 0.3506312072277069, |
| "learning_rate": 9.999996047100669e-05, |
| "loss": 0.1764, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.001920112646608601, |
| "grad_norm": 0.3370567262172699, |
| "learning_rate": 9.999995502479064e-05, |
| "loss": 0.1817, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.0019734491090143956, |
| "grad_norm": 0.3320629596710205, |
| "learning_rate": 9.999994922720604e-05, |
| "loss": 0.1558, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.00202678557142019, |
| "grad_norm": 0.3061317503452301, |
| "learning_rate": 9.999994307825292e-05, |
| "loss": 0.1645, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.0020801220338259843, |
| "grad_norm": 0.3522307276725769, |
| "learning_rate": 9.999993657793131e-05, |
| "loss": 0.1795, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.0021334584962317787, |
| "grad_norm": 0.33675846457481384, |
| "learning_rate": 9.999992972624131e-05, |
| "loss": 0.1695, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.0021867949586375736, |
| "grad_norm": 0.34461385011672974, |
| "learning_rate": 9.999992252318289e-05, |
| "loss": 0.1698, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.002240131421043368, |
| "grad_norm": 0.3509563207626343, |
| "learning_rate": 9.999991496875616e-05, |
| "loss": 0.1857, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.0022934678834491623, |
| "grad_norm": 0.34643200039863586, |
| "learning_rate": 9.999990706296113e-05, |
| "loss": 0.1855, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.0023468043458549567, |
| "grad_norm": 0.33536601066589355, |
| "learning_rate": 9.99998988057979e-05, |
| "loss": 0.1767, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.002400140808260751, |
| "grad_norm": 0.3312433362007141, |
| "learning_rate": 9.99998901972665e-05, |
| "loss": 0.1922, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.002453477270666546, |
| "grad_norm": 0.34569883346557617, |
| "learning_rate": 9.999988123736699e-05, |
| "loss": 0.1777, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.0025068137330723403, |
| "grad_norm": 0.3153592646121979, |
| "learning_rate": 9.999987192609944e-05, |
| "loss": 0.1669, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.0025601501954781347, |
| "grad_norm": 0.3480944037437439, |
| "learning_rate": 9.999986226346392e-05, |
| "loss": 0.1779, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.002613486657883929, |
| "grad_norm": 0.35337212681770325, |
| "learning_rate": 9.999985224946049e-05, |
| "loss": 0.1863, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.0026668231202897235, |
| "grad_norm": 0.3431400656700134, |
| "learning_rate": 9.999984188408922e-05, |
| "loss": 0.1983, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.0027201595826955183, |
| "grad_norm": 0.3314003348350525, |
| "learning_rate": 9.999983116735019e-05, |
| "loss": 0.1838, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.0027734960451013127, |
| "grad_norm": 0.3390146493911743, |
| "learning_rate": 9.999982009924345e-05, |
| "loss": 0.1826, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.002826832507507107, |
| "grad_norm": 0.35979676246643066, |
| "learning_rate": 9.999980867976912e-05, |
| "loss": 0.1967, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.0028801689699129015, |
| "grad_norm": 0.39572709798812866, |
| "learning_rate": 9.999979690892725e-05, |
| "loss": 0.1893, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.002933505432318696, |
| "grad_norm": 0.3344902992248535, |
| "learning_rate": 9.999978478671794e-05, |
| "loss": 0.1905, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.0029868418947244903, |
| "grad_norm": 0.32258379459381104, |
| "learning_rate": 9.999977231314127e-05, |
| "loss": 0.1765, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.003040178357130285, |
| "grad_norm": 0.32133936882019043, |
| "learning_rate": 9.999975948819731e-05, |
| "loss": 0.1855, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.0030935148195360795, |
| "grad_norm": 0.33931997418403625, |
| "learning_rate": 9.999974631188618e-05, |
| "loss": 0.1881, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.003146851281941874, |
| "grad_norm": 0.3421045243740082, |
| "learning_rate": 9.999973278420795e-05, |
| "loss": 0.1937, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.0032001877443476683, |
| "grad_norm": 0.337239146232605, |
| "learning_rate": 9.999971890516272e-05, |
| "loss": 0.1947, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.0032535242067534627, |
| "grad_norm": 0.3348175585269928, |
| "learning_rate": 9.999970467475059e-05, |
| "loss": 0.2015, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.0033068606691592575, |
| "grad_norm": 0.33285388350486755, |
| "learning_rate": 9.999969009297165e-05, |
| "loss": 0.1954, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.003360197131565052, |
| "grad_norm": 0.33577030897140503, |
| "learning_rate": 9.999967515982604e-05, |
| "loss": 0.2033, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.0034135335939708463, |
| "grad_norm": 0.3373951315879822, |
| "learning_rate": 9.999965987531382e-05, |
| "loss": 0.2059, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.0034668700563766407, |
| "grad_norm": 0.3341216742992401, |
| "learning_rate": 9.99996442394351e-05, |
| "loss": 0.191, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.003520206518782435, |
| "grad_norm": 0.3183084726333618, |
| "learning_rate": 9.999962825219002e-05, |
| "loss": 0.1917, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.00357354298118823, |
| "grad_norm": 0.33498549461364746, |
| "learning_rate": 9.999961191357869e-05, |
| "loss": 0.1895, |
| "step": 67 |
| }, |
| { |
| "epoch": 0.0036268794435940243, |
| "grad_norm": 0.3271574079990387, |
| "learning_rate": 9.999959522360118e-05, |
| "loss": 0.2054, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.0036802159059998187, |
| "grad_norm": 0.3310222923755646, |
| "learning_rate": 9.999957818225768e-05, |
| "loss": 0.2013, |
| "step": 69 |
| }, |
| { |
| "epoch": 0.003733552368405613, |
| "grad_norm": 0.34026429057121277, |
| "learning_rate": 9.999956078954822e-05, |
| "loss": 0.2186, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.0037868888308114075, |
| "grad_norm": 0.3307226300239563, |
| "learning_rate": 9.999954304547301e-05, |
| "loss": 0.2007, |
| "step": 71 |
| }, |
| { |
| "epoch": 0.003840225293217202, |
| "grad_norm": 0.3343660533428192, |
| "learning_rate": 9.999952495003212e-05, |
| "loss": 0.2075, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.0038935617556229967, |
| "grad_norm": 0.3603450357913971, |
| "learning_rate": 9.999950650322569e-05, |
| "loss": 0.2156, |
| "step": 73 |
| }, |
| { |
| "epoch": 0.003946898218028791, |
| "grad_norm": 0.335757315158844, |
| "learning_rate": 9.999948770505386e-05, |
| "loss": 0.203, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.0040002346804345855, |
| "grad_norm": 0.3302127718925476, |
| "learning_rate": 9.999946855551675e-05, |
| "loss": 0.1992, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.00405357114284038, |
| "grad_norm": 0.3287745714187622, |
| "learning_rate": 9.99994490546145e-05, |
| "loss": 0.2044, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.004106907605246174, |
| "grad_norm": 0.31489625573158264, |
| "learning_rate": 9.999942920234725e-05, |
| "loss": 0.2024, |
| "step": 77 |
| }, |
| { |
| "epoch": 0.004160244067651969, |
| "grad_norm": 0.3128495216369629, |
| "learning_rate": 9.999940899871513e-05, |
| "loss": 0.2082, |
| "step": 78 |
| }, |
| { |
| "epoch": 0.004213580530057763, |
| "grad_norm": 0.31686297059059143, |
| "learning_rate": 9.999938844371829e-05, |
| "loss": 0.2145, |
| "step": 79 |
| }, |
| { |
| "epoch": 0.0042669169924635575, |
| "grad_norm": 0.3330387473106384, |
| "learning_rate": 9.999936753735687e-05, |
| "loss": 0.2022, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.004320253454869353, |
| "grad_norm": 0.34814751148223877, |
| "learning_rate": 9.999934627963103e-05, |
| "loss": 0.2192, |
| "step": 81 |
| }, |
| { |
| "epoch": 0.004373589917275147, |
| "grad_norm": 0.3250124454498291, |
| "learning_rate": 9.999932467054089e-05, |
| "loss": 0.2029, |
| "step": 82 |
| }, |
| { |
| "epoch": 0.0044269263796809415, |
| "grad_norm": 0.3646756410598755, |
| "learning_rate": 9.999930271008663e-05, |
| "loss": 0.2203, |
| "step": 83 |
| }, |
| { |
| "epoch": 0.004480262842086736, |
| "grad_norm": 0.3267667889595032, |
| "learning_rate": 9.99992803982684e-05, |
| "loss": 0.2078, |
| "step": 84 |
| }, |
| { |
| "epoch": 0.00453359930449253, |
| "grad_norm": 0.32010674476623535, |
| "learning_rate": 9.999925773508634e-05, |
| "loss": 0.2041, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.004586935766898325, |
| "grad_norm": 0.3199160695075989, |
| "learning_rate": 9.999923472054063e-05, |
| "loss": 0.2119, |
| "step": 86 |
| }, |
| { |
| "epoch": 0.004640272229304119, |
| "grad_norm": 0.3363480269908905, |
| "learning_rate": 9.99992113546314e-05, |
| "loss": 0.2102, |
| "step": 87 |
| }, |
| { |
| "epoch": 0.0046936086917099135, |
| "grad_norm": 0.3485029935836792, |
| "learning_rate": 9.999918763735886e-05, |
| "loss": 0.2146, |
| "step": 88 |
| }, |
| { |
| "epoch": 0.004746945154115708, |
| "grad_norm": 0.3307000994682312, |
| "learning_rate": 9.999916356872314e-05, |
| "loss": 0.2006, |
| "step": 89 |
| }, |
| { |
| "epoch": 0.004800281616521502, |
| "grad_norm": 0.32731881737709045, |
| "learning_rate": 9.999913914872443e-05, |
| "loss": 0.1934, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.004853618078927297, |
| "grad_norm": 0.3216370642185211, |
| "learning_rate": 9.99991143773629e-05, |
| "loss": 0.2149, |
| "step": 91 |
| }, |
| { |
| "epoch": 0.004906954541333092, |
| "grad_norm": 0.3118319809436798, |
| "learning_rate": 9.999908925463872e-05, |
| "loss": 0.2066, |
| "step": 92 |
| }, |
| { |
| "epoch": 0.004960291003738886, |
| "grad_norm": 0.3115937411785126, |
| "learning_rate": 9.999906378055205e-05, |
| "loss": 0.2016, |
| "step": 93 |
| }, |
| { |
| "epoch": 0.005013627466144681, |
| "grad_norm": 0.31390756368637085, |
| "learning_rate": 9.999903795510308e-05, |
| "loss": 0.227, |
| "step": 94 |
| }, |
| { |
| "epoch": 0.005066963928550475, |
| "grad_norm": 0.3215806484222412, |
| "learning_rate": 9.999901177829201e-05, |
| "loss": 0.2139, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.0051203003909562695, |
| "grad_norm": 0.32381314039230347, |
| "learning_rate": 9.9998985250119e-05, |
| "loss": 0.2119, |
| "step": 96 |
| }, |
| { |
| "epoch": 0.005173636853362064, |
| "grad_norm": 0.32022613286972046, |
| "learning_rate": 9.999895837058425e-05, |
| "loss": 0.2036, |
| "step": 97 |
| }, |
| { |
| "epoch": 0.005226973315767858, |
| "grad_norm": 0.3156028985977173, |
| "learning_rate": 9.999893113968795e-05, |
| "loss": 0.2058, |
| "step": 98 |
| }, |
| { |
| "epoch": 0.005280309778173653, |
| "grad_norm": 0.3254660665988922, |
| "learning_rate": 9.999890355743027e-05, |
| "loss": 0.2064, |
| "step": 99 |
| }, |
| { |
| "epoch": 0.005333646240579447, |
| "grad_norm": 0.3044165074825287, |
| "learning_rate": 9.999887562381143e-05, |
| "loss": 0.2081, |
| "step": 100 |
| } |
| ], |
| "logging_steps": 1.0, |
| "max_steps": 37496, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 2, |
| "save_steps": 100, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.3964461642486907e+18, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|