| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.9985974754558204, |
| "eval_steps": 45, |
| "global_step": 178, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.005610098176718092, |
| "grad_norm": 12.789434532326155, |
| "learning_rate": 3.0000000000000004e-07, |
| "loss": 1.6735, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.005610098176718092, |
| "eval_loss": 1.6279159784317017, |
| "eval_runtime": 411.6995, |
| "eval_samples_per_second": 3.998, |
| "eval_steps_per_second": 0.5, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.011220196353436185, |
| "grad_norm": 12.449110229764761, |
| "learning_rate": 6.000000000000001e-07, |
| "loss": 1.6787, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.016830294530154277, |
| "grad_norm": 12.965521915327457, |
| "learning_rate": 9e-07, |
| "loss": 1.7002, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.02244039270687237, |
| "grad_norm": 12.312461520787544, |
| "learning_rate": 1.2000000000000002e-06, |
| "loss": 1.6735, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.028050490883590462, |
| "grad_norm": 16.45705379890311, |
| "learning_rate": 1.5e-06, |
| "loss": 1.6845, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.033660589060308554, |
| "grad_norm": 18.324079461896748, |
| "learning_rate": 1.8e-06, |
| "loss": 1.7064, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.03927068723702665, |
| "grad_norm": 9.790851828818997, |
| "learning_rate": 2.1e-06, |
| "loss": 1.6267, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.04488078541374474, |
| "grad_norm": 7.050739527602004, |
| "learning_rate": 2.4000000000000003e-06, |
| "loss": 1.6552, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.05049088359046283, |
| "grad_norm": 4.33578468991953, |
| "learning_rate": 2.7e-06, |
| "loss": 1.6194, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.056100981767180924, |
| "grad_norm": 5.030821312124895, |
| "learning_rate": 3e-06, |
| "loss": 1.5938, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.061711079943899017, |
| "grad_norm": 5.650505437860036, |
| "learning_rate": 2.99973774136912e-06, |
| "loss": 1.5772, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.06732117812061711, |
| "grad_norm": 4.829608912708524, |
| "learning_rate": 2.998951057182598e-06, |
| "loss": 1.5624, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.0729312762973352, |
| "grad_norm": 4.067559287459243, |
| "learning_rate": 2.997640222526725e-06, |
| "loss": 1.5824, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.0785413744740533, |
| "grad_norm": 2.856039166477444, |
| "learning_rate": 2.99580569577177e-06, |
| "loss": 1.5468, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.08415147265077139, |
| "grad_norm": 1.5746141536824405, |
| "learning_rate": 2.9934481184117006e-06, |
| "loss": 1.5761, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.08976157082748948, |
| "grad_norm": 1.594708904780719, |
| "learning_rate": 2.9905683148398643e-06, |
| "loss": 1.5417, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.09537166900420757, |
| "grad_norm": 2.285228899703148, |
| "learning_rate": 2.9871672920607156e-06, |
| "loss": 1.5515, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.10098176718092566, |
| "grad_norm": 2.767131287855067, |
| "learning_rate": 2.9832462393376928e-06, |
| "loss": 1.5464, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.10659186535764376, |
| "grad_norm": 2.8597567602325418, |
| "learning_rate": 2.9788065277773537e-06, |
| "loss": 1.5283, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.11220196353436185, |
| "grad_norm": 1.8265836755261031, |
| "learning_rate": 2.9738497098499328e-06, |
| "loss": 1.5256, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.11781206171107994, |
| "grad_norm": 1.4609854239411417, |
| "learning_rate": 2.968377518846473e-06, |
| "loss": 1.5102, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.12342215988779803, |
| "grad_norm": 1.4893607868634404, |
| "learning_rate": 2.9623918682727352e-06, |
| "loss": 1.5025, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.12903225806451613, |
| "grad_norm": 1.4138175059246425, |
| "learning_rate": 2.9558948511800864e-06, |
| "loss": 1.5052, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.13464235624123422, |
| "grad_norm": 1.0946413298956141, |
| "learning_rate": 2.9488887394336023e-06, |
| "loss": 1.48, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.1402524544179523, |
| "grad_norm": 1.4436793505790257, |
| "learning_rate": 2.9413759829176495e-06, |
| "loss": 1.4985, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.1458625525946704, |
| "grad_norm": 1.12305310745146, |
| "learning_rate": 2.933359208679211e-06, |
| "loss": 1.5052, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.1514726507713885, |
| "grad_norm": 1.2555515662683936, |
| "learning_rate": 2.924841220009269e-06, |
| "loss": 1.4739, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.1570827489481066, |
| "grad_norm": 1.2671751142906982, |
| "learning_rate": 2.9158249954625514e-06, |
| "loss": 1.4685, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.16269284712482468, |
| "grad_norm": 1.0644419598203905, |
| "learning_rate": 2.906313687815999e-06, |
| "loss": 1.4829, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.16830294530154277, |
| "grad_norm": 0.9862228471334081, |
| "learning_rate": 2.8963106229663065e-06, |
| "loss": 1.4973, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.17391304347826086, |
| "grad_norm": 0.9461750827879526, |
| "learning_rate": 2.88581929876693e-06, |
| "loss": 1.5025, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.17952314165497896, |
| "grad_norm": 1.0011739456144289, |
| "learning_rate": 2.8748433838049645e-06, |
| "loss": 1.4622, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.18513323983169705, |
| "grad_norm": 0.7946864477883866, |
| "learning_rate": 2.8633867161183166e-06, |
| "loss": 1.4547, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.19074333800841514, |
| "grad_norm": 0.9031467465213865, |
| "learning_rate": 2.851453301853629e-06, |
| "loss": 1.4632, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.19635343618513323, |
| "grad_norm": 0.8215549924647519, |
| "learning_rate": 2.839047313865417e-06, |
| "loss": 1.4513, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.20196353436185133, |
| "grad_norm": 0.6987065221709342, |
| "learning_rate": 2.8261730902569146e-06, |
| "loss": 1.4159, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.20757363253856942, |
| "grad_norm": 0.7968200327984795, |
| "learning_rate": 2.8128351328631308e-06, |
| "loss": 1.4227, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.2131837307152875, |
| "grad_norm": 0.7608131707190037, |
| "learning_rate": 2.7990381056766585e-06, |
| "loss": 1.4261, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.2187938288920056, |
| "grad_norm": 0.8694558959167964, |
| "learning_rate": 2.7847868332167773e-06, |
| "loss": 1.4577, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.2244039270687237, |
| "grad_norm": 0.8245063288142666, |
| "learning_rate": 2.7700862988424264e-06, |
| "loss": 1.4384, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.2300140252454418, |
| "grad_norm": 0.7268091063750695, |
| "learning_rate": 2.7549416430096295e-06, |
| "loss": 1.4452, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.23562412342215988, |
| "grad_norm": 0.8453387994411659, |
| "learning_rate": 2.7393581614739926e-06, |
| "loss": 1.4571, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.24123422159887797, |
| "grad_norm": 0.7332728453209687, |
| "learning_rate": 2.7233413034388904e-06, |
| "loss": 1.4106, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.24684431977559607, |
| "grad_norm": 0.7653926321695257, |
| "learning_rate": 2.7068966696500025e-06, |
| "loss": 1.4177, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.25245441795231416, |
| "grad_norm": 0.7101618711873879, |
| "learning_rate": 2.690030010436853e-06, |
| "loss": 1.4032, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.25245441795231416, |
| "eval_loss": 1.3188974857330322, |
| "eval_runtime": 411.4591, |
| "eval_samples_per_second": 4.0, |
| "eval_steps_per_second": 0.501, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.25806451612903225, |
| "grad_norm": 0.6872418512994495, |
| "learning_rate": 2.6727472237020448e-06, |
| "loss": 1.4613, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.26367461430575034, |
| "grad_norm": 0.8079045785150919, |
| "learning_rate": 2.6550543528588946e-06, |
| "loss": 1.4195, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.26928471248246844, |
| "grad_norm": 0.7835286370246854, |
| "learning_rate": 2.6369575847181795e-06, |
| "loss": 1.438, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.27489481065918653, |
| "grad_norm": 0.7388557016632363, |
| "learning_rate": 2.6184632473247484e-06, |
| "loss": 1.4499, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.2805049088359046, |
| "grad_norm": 0.7379003000881167, |
| "learning_rate": 2.5995778077447395e-06, |
| "loss": 1.4335, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.2861150070126227, |
| "grad_norm": 0.7208955742904049, |
| "learning_rate": 2.58030786980419e-06, |
| "loss": 1.4219, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.2917251051893408, |
| "grad_norm": 0.7240715420334977, |
| "learning_rate": 2.5606601717798212e-06, |
| "loss": 1.3834, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.2973352033660589, |
| "grad_norm": 0.6878143254748253, |
| "learning_rate": 2.5406415840428124e-06, |
| "loss": 1.41, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.302945301542777, |
| "grad_norm": 0.702576965354967, |
| "learning_rate": 2.520259106656379e-06, |
| "loss": 1.423, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.3085553997194951, |
| "grad_norm": 0.6940628737560203, |
| "learning_rate": 2.499519866928006e-06, |
| "loss": 1.4233, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.3141654978962132, |
| "grad_norm": 0.7706923504861696, |
| "learning_rate": 2.4784311169171817e-06, |
| "loss": 1.4052, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.31977559607293127, |
| "grad_norm": 0.7234464811104628, |
| "learning_rate": 2.457000230899513e-06, |
| "loss": 1.3801, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.32538569424964936, |
| "grad_norm": 0.7217889248580507, |
| "learning_rate": 2.4352347027881005e-06, |
| "loss": 1.4094, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.33099579242636745, |
| "grad_norm": 0.7024631703571308, |
| "learning_rate": 2.4131421435130812e-06, |
| "loss": 1.4145, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.33660589060308554, |
| "grad_norm": 0.6852493385331954, |
| "learning_rate": 2.390730278360252e-06, |
| "loss": 1.3909, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.34221598877980364, |
| "grad_norm": 0.6951742071723167, |
| "learning_rate": 2.368006944269709e-06, |
| "loss": 1.3818, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.34782608695652173, |
| "grad_norm": 0.6431308485618643, |
| "learning_rate": 2.344980087095433e-06, |
| "loss": 1.4271, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.3534361851332398, |
| "grad_norm": 0.7100647383746512, |
| "learning_rate": 2.321657758826807e-06, |
| "loss": 1.4262, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.3590462833099579, |
| "grad_norm": 0.7213767487790479, |
| "learning_rate": 2.298048114773005e-06, |
| "loss": 1.3896, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.364656381486676, |
| "grad_norm": 0.6448491692515501, |
| "learning_rate": 2.27415941071126e-06, |
| "loss": 1.3985, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.3702664796633941, |
| "grad_norm": 0.6409510040219449, |
| "learning_rate": 2.25e-06, |
| "loss": 1.3947, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.3758765778401122, |
| "grad_norm": 0.6347781401680477, |
| "learning_rate": 2.22557833065786e-06, |
| "loss": 1.3912, |
| "step": 67 |
| }, |
| { |
| "epoch": 0.3814866760168303, |
| "grad_norm": 0.6448895288436695, |
| "learning_rate": 2.200902942409593e-06, |
| "loss": 1.3961, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.3870967741935484, |
| "grad_norm": 0.6681259500416868, |
| "learning_rate": 2.175982463699918e-06, |
| "loss": 1.3908, |
| "step": 69 |
| }, |
| { |
| "epoch": 0.39270687237026647, |
| "grad_norm": 0.6645485721507981, |
| "learning_rate": 2.150825608676337e-06, |
| "loss": 1.3841, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.39831697054698456, |
| "grad_norm": 0.6566579644017881, |
| "learning_rate": 2.1254411741419925e-06, |
| "loss": 1.4203, |
| "step": 71 |
| }, |
| { |
| "epoch": 0.40392706872370265, |
| "grad_norm": 0.6725026800665824, |
| "learning_rate": 2.0998380364796113e-06, |
| "loss": 1.3682, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.40953716690042075, |
| "grad_norm": 0.6488827604064351, |
| "learning_rate": 2.074025148547635e-06, |
| "loss": 1.3813, |
| "step": 73 |
| }, |
| { |
| "epoch": 0.41514726507713884, |
| "grad_norm": 0.719525409048368, |
| "learning_rate": 2.048011536549593e-06, |
| "loss": 1.3905, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.42075736325385693, |
| "grad_norm": 0.6834354719235972, |
| "learning_rate": 2.0218062968778406e-06, |
| "loss": 1.3495, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.426367461430575, |
| "grad_norm": 0.6906956307555222, |
| "learning_rate": 1.9954185929327507e-06, |
| "loss": 1.4199, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.4319775596072931, |
| "grad_norm": 0.6568235975258354, |
| "learning_rate": 1.9688576519184667e-06, |
| "loss": 1.3895, |
| "step": 77 |
| }, |
| { |
| "epoch": 0.4375876577840112, |
| "grad_norm": 0.6386011841528437, |
| "learning_rate": 1.9421327616163564e-06, |
| "loss": 1.3956, |
| "step": 78 |
| }, |
| { |
| "epoch": 0.4431977559607293, |
| "grad_norm": 0.6891639766601969, |
| "learning_rate": 1.915253267137274e-06, |
| "loss": 1.3674, |
| "step": 79 |
| }, |
| { |
| "epoch": 0.4488078541374474, |
| "grad_norm": 0.6656583761906066, |
| "learning_rate": 1.888228567653781e-06, |
| "loss": 1.3872, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.4544179523141655, |
| "grad_norm": 0.6515567085010856, |
| "learning_rate": 1.8610681131134598e-06, |
| "loss": 1.4106, |
| "step": 81 |
| }, |
| { |
| "epoch": 0.4600280504908836, |
| "grad_norm": 0.6363151573327763, |
| "learning_rate": 1.8337814009344715e-06, |
| "loss": 1.3741, |
| "step": 82 |
| }, |
| { |
| "epoch": 0.46563814866760167, |
| "grad_norm": 0.6412726533083525, |
| "learning_rate": 1.8063779726845207e-06, |
| "loss": 1.3622, |
| "step": 83 |
| }, |
| { |
| "epoch": 0.47124824684431976, |
| "grad_norm": 0.6306344422672963, |
| "learning_rate": 1.778867410744372e-06, |
| "loss": 1.3962, |
| "step": 84 |
| }, |
| { |
| "epoch": 0.47685834502103785, |
| "grad_norm": 0.6430451206374499, |
| "learning_rate": 1.7512593349571046e-06, |
| "loss": 1.3781, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.48246844319775595, |
| "grad_norm": 0.6972355527865212, |
| "learning_rate": 1.7235633992642616e-06, |
| "loss": 1.3768, |
| "step": 86 |
| }, |
| { |
| "epoch": 0.48807854137447404, |
| "grad_norm": 0.6512242836910552, |
| "learning_rate": 1.6957892883300778e-06, |
| "loss": 1.374, |
| "step": 87 |
| }, |
| { |
| "epoch": 0.49368863955119213, |
| "grad_norm": 0.6424119203317555, |
| "learning_rate": 1.6679467141549617e-06, |
| "loss": 1.3632, |
| "step": 88 |
| }, |
| { |
| "epoch": 0.4992987377279102, |
| "grad_norm": 0.6244684823196682, |
| "learning_rate": 1.6400454126794258e-06, |
| "loss": 1.3795, |
| "step": 89 |
| }, |
| { |
| "epoch": 0.5049088359046283, |
| "grad_norm": 0.6597628935764579, |
| "learning_rate": 1.6120951403796365e-06, |
| "loss": 1.3647, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.5049088359046283, |
| "eval_loss": 1.2600828409194946, |
| "eval_runtime": 411.695, |
| "eval_samples_per_second": 3.998, |
| "eval_steps_per_second": 0.5, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.5105189340813464, |
| "grad_norm": 0.6314706351932109, |
| "learning_rate": 1.5841056708557877e-06, |
| "loss": 1.3898, |
| "step": 91 |
| }, |
| { |
| "epoch": 0.5161290322580645, |
| "grad_norm": 0.6204474126915034, |
| "learning_rate": 1.5560867914144889e-06, |
| "loss": 1.3316, |
| "step": 92 |
| }, |
| { |
| "epoch": 0.5217391304347826, |
| "grad_norm": 0.6452070749940022, |
| "learning_rate": 1.5280482996463535e-06, |
| "loss": 1.365, |
| "step": 93 |
| }, |
| { |
| "epoch": 0.5273492286115007, |
| "grad_norm": 0.623770196055313, |
| "learning_rate": 1.5e-06, |
| "loss": 1.367, |
| "step": 94 |
| }, |
| { |
| "epoch": 0.5329593267882188, |
| "grad_norm": 0.6294036193527197, |
| "learning_rate": 1.471951700353647e-06, |
| "loss": 1.3445, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.5385694249649369, |
| "grad_norm": 0.6442245103204152, |
| "learning_rate": 1.4439132085855116e-06, |
| "loss": 1.3814, |
| "step": 96 |
| }, |
| { |
| "epoch": 0.544179523141655, |
| "grad_norm": 0.6225427313645252, |
| "learning_rate": 1.4158943291442122e-06, |
| "loss": 1.3695, |
| "step": 97 |
| }, |
| { |
| "epoch": 0.5497896213183731, |
| "grad_norm": 0.6552086842949354, |
| "learning_rate": 1.3879048596203636e-06, |
| "loss": 1.3786, |
| "step": 98 |
| }, |
| { |
| "epoch": 0.5553997194950911, |
| "grad_norm": 0.62874676463444, |
| "learning_rate": 1.3599545873205742e-06, |
| "loss": 1.3822, |
| "step": 99 |
| }, |
| { |
| "epoch": 0.5610098176718092, |
| "grad_norm": 0.6574014422172325, |
| "learning_rate": 1.3320532858450384e-06, |
| "loss": 1.3618, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.5666199158485273, |
| "grad_norm": 0.6383976290918701, |
| "learning_rate": 1.304210711669923e-06, |
| "loss": 1.3588, |
| "step": 101 |
| }, |
| { |
| "epoch": 0.5722300140252454, |
| "grad_norm": 0.6270735205120959, |
| "learning_rate": 1.2764366007357383e-06, |
| "loss": 1.3777, |
| "step": 102 |
| }, |
| { |
| "epoch": 0.5778401122019635, |
| "grad_norm": 0.6351754233235616, |
| "learning_rate": 1.2487406650428957e-06, |
| "loss": 1.3828, |
| "step": 103 |
| }, |
| { |
| "epoch": 0.5834502103786816, |
| "grad_norm": 0.6527833020722423, |
| "learning_rate": 1.2211325892556282e-06, |
| "loss": 1.3954, |
| "step": 104 |
| }, |
| { |
| "epoch": 0.5890603085553997, |
| "grad_norm": 0.6427076558908459, |
| "learning_rate": 1.1936220273154798e-06, |
| "loss": 1.4066, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.5946704067321178, |
| "grad_norm": 0.6691035857323574, |
| "learning_rate": 1.1662185990655286e-06, |
| "loss": 1.3558, |
| "step": 106 |
| }, |
| { |
| "epoch": 0.6002805049088359, |
| "grad_norm": 0.6342212523114369, |
| "learning_rate": 1.138931886886541e-06, |
| "loss": 1.3616, |
| "step": 107 |
| }, |
| { |
| "epoch": 0.605890603085554, |
| "grad_norm": 0.6425719617099173, |
| "learning_rate": 1.1117714323462188e-06, |
| "loss": 1.3708, |
| "step": 108 |
| }, |
| { |
| "epoch": 0.6115007012622721, |
| "grad_norm": 0.6372645993247724, |
| "learning_rate": 1.084746732862726e-06, |
| "loss": 1.3392, |
| "step": 109 |
| }, |
| { |
| "epoch": 0.6171107994389902, |
| "grad_norm": 0.6808155418004899, |
| "learning_rate": 1.0578672383836437e-06, |
| "loss": 1.3685, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.6227208976157083, |
| "grad_norm": 0.625992555276094, |
| "learning_rate": 1.0311423480815335e-06, |
| "loss": 1.3819, |
| "step": 111 |
| }, |
| { |
| "epoch": 0.6283309957924264, |
| "grad_norm": 0.6502367036299962, |
| "learning_rate": 1.0045814070672498e-06, |
| "loss": 1.3529, |
| "step": 112 |
| }, |
| { |
| "epoch": 0.6339410939691444, |
| "grad_norm": 0.6806106708733632, |
| "learning_rate": 9.78193703122159e-07, |
| "loss": 1.3678, |
| "step": 113 |
| }, |
| { |
| "epoch": 0.6395511921458625, |
| "grad_norm": 0.6207625932802451, |
| "learning_rate": 9.519884634504075e-07, |
| "loss": 1.361, |
| "step": 114 |
| }, |
| { |
| "epoch": 0.6451612903225806, |
| "grad_norm": 0.6316217262222912, |
| "learning_rate": 9.259748514523654e-07, |
| "loss": 1.3581, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.6507713884992987, |
| "grad_norm": 0.6903717607402587, |
| "learning_rate": 9.001619635203888e-07, |
| "loss": 1.3457, |
| "step": 116 |
| }, |
| { |
| "epoch": 0.6563814866760168, |
| "grad_norm": 0.6430969718754993, |
| "learning_rate": 8.745588258580084e-07, |
| "loss": 1.3689, |
| "step": 117 |
| }, |
| { |
| "epoch": 0.6619915848527349, |
| "grad_norm": 0.6191251274798687, |
| "learning_rate": 8.49174391323663e-07, |
| "loss": 1.3652, |
| "step": 118 |
| }, |
| { |
| "epoch": 0.667601683029453, |
| "grad_norm": 0.6607471005683063, |
| "learning_rate": 8.240175363000819e-07, |
| "loss": 1.3715, |
| "step": 119 |
| }, |
| { |
| "epoch": 0.6732117812061711, |
| "grad_norm": 0.642069328346416, |
| "learning_rate": 7.99097057590407e-07, |
| "loss": 1.3533, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.6788218793828892, |
| "grad_norm": 0.6140370733400766, |
| "learning_rate": 7.744216693421403e-07, |
| "loss": 1.3517, |
| "step": 121 |
| }, |
| { |
| "epoch": 0.6844319775596073, |
| "grad_norm": 0.6340075524580069, |
| "learning_rate": 7.500000000000003e-07, |
| "loss": 1.3818, |
| "step": 122 |
| }, |
| { |
| "epoch": 0.6900420757363254, |
| "grad_norm": 0.645243182347647, |
| "learning_rate": 7.258405892887399e-07, |
| "loss": 1.3672, |
| "step": 123 |
| }, |
| { |
| "epoch": 0.6956521739130435, |
| "grad_norm": 0.6702639753033074, |
| "learning_rate": 7.019518852269954e-07, |
| "loss": 1.3539, |
| "step": 124 |
| }, |
| { |
| "epoch": 0.7012622720897616, |
| "grad_norm": 0.6387297942086914, |
| "learning_rate": 6.783422411731932e-07, |
| "loss": 1.3604, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.7068723702664796, |
| "grad_norm": 0.620394148061845, |
| "learning_rate": 6.550199129045669e-07, |
| "loss": 1.383, |
| "step": 126 |
| }, |
| { |
| "epoch": 0.7124824684431977, |
| "grad_norm": 0.6327577478686799, |
| "learning_rate": 6.319930557302914e-07, |
| "loss": 1.3602, |
| "step": 127 |
| }, |
| { |
| "epoch": 0.7180925666199158, |
| "grad_norm": 0.6230687543936638, |
| "learning_rate": 6.092697216397478e-07, |
| "loss": 1.3563, |
| "step": 128 |
| }, |
| { |
| "epoch": 0.7237026647966339, |
| "grad_norm": 0.6138116069294336, |
| "learning_rate": 5.868578564869191e-07, |
| "loss": 1.3562, |
| "step": 129 |
| }, |
| { |
| "epoch": 0.729312762973352, |
| "grad_norm": 0.6378183688587699, |
| "learning_rate": 5.647652972118998e-07, |
| "loss": 1.3466, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.7349228611500701, |
| "grad_norm": 0.611795840417991, |
| "learning_rate": 5.429997691004874e-07, |
| "loss": 1.3373, |
| "step": 131 |
| }, |
| { |
| "epoch": 0.7405329593267882, |
| "grad_norm": 0.6281810459467961, |
| "learning_rate": 5.215688830828188e-07, |
| "loss": 1.3747, |
| "step": 132 |
| }, |
| { |
| "epoch": 0.7461430575035063, |
| "grad_norm": 0.6349646039018948, |
| "learning_rate": 5.004801330719941e-07, |
| "loss": 1.3683, |
| "step": 133 |
| }, |
| { |
| "epoch": 0.7517531556802244, |
| "grad_norm": 0.629745084121343, |
| "learning_rate": 4.797408933436207e-07, |
| "loss": 1.3565, |
| "step": 134 |
| }, |
| { |
| "epoch": 0.7573632538569425, |
| "grad_norm": 0.6269791171808803, |
| "learning_rate": 4.5935841595718754e-07, |
| "loss": 1.3609, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.7573632538569425, |
| "eval_loss": 1.2410979270935059, |
| "eval_runtime": 413.3103, |
| "eval_samples_per_second": 3.982, |
| "eval_steps_per_second": 0.498, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.7629733520336606, |
| "grad_norm": 0.6267450715270643, |
| "learning_rate": 4.3933982822017883e-07, |
| "loss": 1.3764, |
| "step": 136 |
| }, |
| { |
| "epoch": 0.7685834502103787, |
| "grad_norm": 0.6372731343606247, |
| "learning_rate": 4.196921301958104e-07, |
| "loss": 1.3639, |
| "step": 137 |
| }, |
| { |
| "epoch": 0.7741935483870968, |
| "grad_norm": 0.6290588594803654, |
| "learning_rate": 4.0042219225526084e-07, |
| "loss": 1.3475, |
| "step": 138 |
| }, |
| { |
| "epoch": 0.7798036465638148, |
| "grad_norm": 0.6282644727559273, |
| "learning_rate": 3.8153675267525163e-07, |
| "loss": 1.3802, |
| "step": 139 |
| }, |
| { |
| "epoch": 0.7854137447405329, |
| "grad_norm": 0.6333972099731577, |
| "learning_rate": 3.6304241528182033e-07, |
| "loss": 1.3695, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.791023842917251, |
| "grad_norm": 0.615703161986596, |
| "learning_rate": 3.449456471411058e-07, |
| "loss": 1.3712, |
| "step": 141 |
| }, |
| { |
| "epoch": 0.7966339410939691, |
| "grad_norm": 0.6341051573545761, |
| "learning_rate": 3.272527762979553e-07, |
| "loss": 1.3432, |
| "step": 142 |
| }, |
| { |
| "epoch": 0.8022440392706872, |
| "grad_norm": 0.6146652686424743, |
| "learning_rate": 3.0996998956314745e-07, |
| "loss": 1.351, |
| "step": 143 |
| }, |
| { |
| "epoch": 0.8078541374474053, |
| "grad_norm": 0.6013902465160864, |
| "learning_rate": 2.9310333034999746e-07, |
| "loss": 1.337, |
| "step": 144 |
| }, |
| { |
| "epoch": 0.8134642356241234, |
| "grad_norm": 0.6173433844669265, |
| "learning_rate": 2.7665869656110975e-07, |
| "loss": 1.3672, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.8190743338008415, |
| "grad_norm": 0.6391927308858952, |
| "learning_rate": 2.6064183852600797e-07, |
| "loss": 1.3472, |
| "step": 146 |
| }, |
| { |
| "epoch": 0.8246844319775596, |
| "grad_norm": 0.6341150140648005, |
| "learning_rate": 2.4505835699037037e-07, |
| "loss": 1.3729, |
| "step": 147 |
| }, |
| { |
| "epoch": 0.8302945301542777, |
| "grad_norm": 0.642511533531499, |
| "learning_rate": 2.299137011575738e-07, |
| "loss": 1.3734, |
| "step": 148 |
| }, |
| { |
| "epoch": 0.8359046283309958, |
| "grad_norm": 0.6268984544418437, |
| "learning_rate": 2.15213166783223e-07, |
| "loss": 1.3552, |
| "step": 149 |
| }, |
| { |
| "epoch": 0.8415147265077139, |
| "grad_norm": 0.599379702979861, |
| "learning_rate": 2.0096189432334195e-07, |
| "loss": 1.3036, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.847124824684432, |
| "grad_norm": 0.6046862952380841, |
| "learning_rate": 1.8716486713686948e-07, |
| "loss": 1.3505, |
| "step": 151 |
| }, |
| { |
| "epoch": 0.85273492286115, |
| "grad_norm": 0.6115539109578072, |
| "learning_rate": 1.7382690974308551e-07, |
| "loss": 1.3102, |
| "step": 152 |
| }, |
| { |
| "epoch": 0.8583450210378681, |
| "grad_norm": 0.6012756116742142, |
| "learning_rate": 1.6095268613458302e-07, |
| "loss": 1.3474, |
| "step": 153 |
| }, |
| { |
| "epoch": 0.8639551192145862, |
| "grad_norm": 0.5895604903788673, |
| "learning_rate": 1.4854669814637145e-07, |
| "loss": 1.3893, |
| "step": 154 |
| }, |
| { |
| "epoch": 0.8695652173913043, |
| "grad_norm": 0.608459654122053, |
| "learning_rate": 1.3661328388168358e-07, |
| "loss": 1.3721, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.8751753155680224, |
| "grad_norm": 0.6357118654920566, |
| "learning_rate": 1.251566161950357e-07, |
| "loss": 1.3614, |
| "step": 156 |
| }, |
| { |
| "epoch": 0.8807854137447405, |
| "grad_norm": 0.6224055669530941, |
| "learning_rate": 1.141807012330699e-07, |
| "loss": 1.3448, |
| "step": 157 |
| }, |
| { |
| "epoch": 0.8863955119214586, |
| "grad_norm": 0.6303897910712531, |
| "learning_rate": 1.036893770336938e-07, |
| "loss": 1.3708, |
| "step": 158 |
| }, |
| { |
| "epoch": 0.8920056100981767, |
| "grad_norm": 0.6361734072038168, |
| "learning_rate": 9.368631218400137e-08, |
| "loss": 1.339, |
| "step": 159 |
| }, |
| { |
| "epoch": 0.8976157082748948, |
| "grad_norm": 0.6146505272733668, |
| "learning_rate": 8.417500453744864e-08, |
| "loss": 1.374, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.9032258064516129, |
| "grad_norm": 0.6153978550286938, |
| "learning_rate": 7.515877999073101e-08, |
| "loss": 1.365, |
| "step": 161 |
| }, |
| { |
| "epoch": 0.908835904628331, |
| "grad_norm": 0.6345016503291776, |
| "learning_rate": 6.664079132078882e-08, |
| "loss": 1.3761, |
| "step": 162 |
| }, |
| { |
| "epoch": 0.9144460028050491, |
| "grad_norm": 0.6151090819609036, |
| "learning_rate": 5.8624017082350765e-08, |
| "loss": 1.3381, |
| "step": 163 |
| }, |
| { |
| "epoch": 0.9200561009817672, |
| "grad_norm": 0.6155169224032129, |
| "learning_rate": 5.11112605663977e-08, |
| "loss": 1.3534, |
| "step": 164 |
| }, |
| { |
| "epoch": 0.9256661991584852, |
| "grad_norm": 0.6227407416490074, |
| "learning_rate": 4.4105148819913564e-08, |
| "loss": 1.393, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.9312762973352033, |
| "grad_norm": 0.6234769604342408, |
| "learning_rate": 3.7608131727264573e-08, |
| "loss": 1.3618, |
| "step": 166 |
| }, |
| { |
| "epoch": 0.9368863955119214, |
| "grad_norm": 0.6200580565294606, |
| "learning_rate": 3.162248115352745e-08, |
| "loss": 1.3641, |
| "step": 167 |
| }, |
| { |
| "epoch": 0.9424964936886395, |
| "grad_norm": 0.6007414904648882, |
| "learning_rate": 2.6150290150067592e-08, |
| "loss": 1.3281, |
| "step": 168 |
| }, |
| { |
| "epoch": 0.9481065918653576, |
| "grad_norm": 0.6452144194523336, |
| "learning_rate": 2.1193472222646172e-08, |
| "loss": 1.3445, |
| "step": 169 |
| }, |
| { |
| "epoch": 0.9537166900420757, |
| "grad_norm": 0.5953544138269807, |
| "learning_rate": 1.6753760662307216e-08, |
| "loss": 1.3349, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.9593267882187938, |
| "grad_norm": 0.591032939002349, |
| "learning_rate": 1.2832707939284426e-08, |
| "loss": 1.3163, |
| "step": 171 |
| }, |
| { |
| "epoch": 0.9649368863955119, |
| "grad_norm": 0.6080867514880087, |
| "learning_rate": 9.431685160136094e-09, |
| "loss": 1.3439, |
| "step": 172 |
| }, |
| { |
| "epoch": 0.97054698457223, |
| "grad_norm": 0.5948670418509701, |
| "learning_rate": 6.55188158829928e-09, |
| "loss": 1.3127, |
| "step": 173 |
| }, |
| { |
| "epoch": 0.9761570827489481, |
| "grad_norm": 0.5908769690929122, |
| "learning_rate": 4.194304228229806e-09, |
| "loss": 1.3366, |
| "step": 174 |
| }, |
| { |
| "epoch": 0.9817671809256662, |
| "grad_norm": 0.6166895839754339, |
| "learning_rate": 2.359777473275093e-09, |
| "loss": 1.3558, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.9873772791023843, |
| "grad_norm": 0.604759073223834, |
| "learning_rate": 1.0489428174020875e-09, |
| "loss": 1.3575, |
| "step": 176 |
| }, |
| { |
| "epoch": 0.9929873772791024, |
| "grad_norm": 0.6214697548686083, |
| "learning_rate": 2.622586308803632e-10, |
| "loss": 1.3599, |
| "step": 177 |
| }, |
| { |
| "epoch": 0.9985974754558204, |
| "grad_norm": 0.6024720195804787, |
| "learning_rate": 0.0, |
| "loss": 1.3521, |
| "step": 178 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 178, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 223460437524480.0, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |