{ "best_global_step": 203, "best_metric": 0.12307652831077576, "best_model_checkpoint": "/cache/outputs/checkpoint-203", "epoch": 7.0, "eval_steps": 500, "global_step": 203, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.034482758620689655, "grad_norm": 17.74788475036621, "learning_rate": 0.0, "loss": 5.2296, "step": 1 }, { "epoch": 0.06896551724137931, "grad_norm": 18.899648666381836, "learning_rate": 2.6666666666666667e-05, "loss": 4.5763, "step": 2 }, { "epoch": 0.10344827586206896, "grad_norm": 17.478742599487305, "learning_rate": 5.333333333333333e-05, "loss": 3.8061, "step": 3 }, { "epoch": 0.13793103448275862, "grad_norm": 6.993143081665039, "learning_rate": 8e-05, "loss": 1.9885, "step": 4 }, { "epoch": 0.1724137931034483, "grad_norm": 8.001960754394531, "learning_rate": 0.00010666666666666667, "loss": 1.7205, "step": 5 }, { "epoch": 0.20689655172413793, "grad_norm": 3.1834046840667725, "learning_rate": 0.00013333333333333334, "loss": 1.1047, "step": 6 }, { "epoch": 0.2413793103448276, "grad_norm": 1.3439714908599854, "learning_rate": 0.00016, "loss": 0.7238, "step": 7 }, { "epoch": 0.27586206896551724, "grad_norm": 1.5758135318756104, "learning_rate": 0.0001866666666666667, "loss": 0.8051, "step": 8 }, { "epoch": 0.3103448275862069, "grad_norm": 1.299546241760254, "learning_rate": 0.00021333333333333333, "loss": 0.7102, "step": 9 }, { "epoch": 0.3448275862068966, "grad_norm": 0.8412534594535828, "learning_rate": 0.00024, "loss": 0.4769, "step": 10 }, { "epoch": 0.3793103448275862, "grad_norm": 0.8651800751686096, "learning_rate": 0.0002666666666666667, "loss": 0.4883, "step": 11 }, { "epoch": 0.41379310344827586, "grad_norm": 0.7943485975265503, "learning_rate": 0.0002933333333333333, "loss": 0.5219, "step": 12 }, { "epoch": 0.4482758620689655, "grad_norm": 0.6727921366691589, "learning_rate": 0.00032, "loss": 0.3604, "step": 13 }, { "epoch": 0.4827586206896552, "grad_norm": 0.578971266746521, "learning_rate": 0.00034666666666666667, "loss": 0.4373, "step": 14 }, { "epoch": 0.5172413793103449, "grad_norm": 0.5071410536766052, "learning_rate": 0.0003733333333333334, "loss": 0.4108, "step": 15 }, { "epoch": 0.5517241379310345, "grad_norm": 0.6982496976852417, "learning_rate": 0.0004, "loss": 0.4349, "step": 16 }, { "epoch": 0.5862068965517241, "grad_norm": 0.44865670800209045, "learning_rate": 0.00039997207623717143, "loss": 0.3644, "step": 17 }, { "epoch": 0.6206896551724138, "grad_norm": 0.5113717317581177, "learning_rate": 0.00039988831274605094, "loss": 0.3863, "step": 18 }, { "epoch": 0.6551724137931034, "grad_norm": 0.5012100338935852, "learning_rate": 0.0003997487329165572, "loss": 0.2768, "step": 19 }, { "epoch": 0.6896551724137931, "grad_norm": 0.4838305413722992, "learning_rate": 0.0003995533757246307, "loss": 0.3422, "step": 20 }, { "epoch": 0.7241379310344828, "grad_norm": 0.6303550004959106, "learning_rate": 0.00039930229572135033, "loss": 0.3288, "step": 21 }, { "epoch": 0.7586206896551724, "grad_norm": 0.5895312428474426, "learning_rate": 0.00039899556301770084, "loss": 0.2696, "step": 22 }, { "epoch": 0.7931034482758621, "grad_norm": 0.5373993515968323, "learning_rate": 0.00039863326326499484, "loss": 0.3868, "step": 23 }, { "epoch": 0.8275862068965517, "grad_norm": 0.48342981934547424, "learning_rate": 0.000398215497630956, "loss": 0.3162, "step": 24 }, { "epoch": 0.8620689655172413, "grad_norm": 0.28195714950561523, "learning_rate": 0.0003977423827714692, "loss": 0.2556, "step": 25 }, { "epoch": 0.896551724137931, "grad_norm": 0.637691855430603, "learning_rate": 0.00039721405079800573, "loss": 0.3809, "step": 26 }, { "epoch": 0.9310344827586207, "grad_norm": 0.3745051920413971, "learning_rate": 0.0003966306492407327, "loss": 0.2701, "step": 27 }, { "epoch": 0.9655172413793104, "grad_norm": 0.636221170425415, "learning_rate": 0.0003959923410073174, "loss": 0.308, "step": 28 }, { "epoch": 1.0, "grad_norm": 0.6435479521751404, "learning_rate": 0.0003952993043374369, "loss": 0.2667, "step": 29 }, { "epoch": 1.0, "eval_loss": 0.5053671598434448, "eval_runtime": 18.682, "eval_samples_per_second": 4.389, "eval_steps_per_second": 1.124, "step": 29 }, { "epoch": 1.0344827586206897, "grad_norm": 0.30985596776008606, "learning_rate": 0.00039455173275300745, "loss": 0.2241, "step": 30 }, { "epoch": 1.0689655172413792, "grad_norm": 0.6296059489250183, "learning_rate": 0.0003937498350041451, "loss": 0.3091, "step": 31 }, { "epoch": 1.103448275862069, "grad_norm": 0.5439299941062927, "learning_rate": 0.00039289383501087534, "loss": 0.2937, "step": 32 }, { "epoch": 1.1379310344827587, "grad_norm": 0.3232507109642029, "learning_rate": 0.0003919839718006062, "loss": 0.2528, "step": 33 }, { "epoch": 1.1724137931034484, "grad_norm": 0.511285662651062, "learning_rate": 0.0003910204994413825, "loss": 0.2235, "step": 34 }, { "epoch": 1.206896551724138, "grad_norm": 1.3702025413513184, "learning_rate": 0.00039000368697094084, "loss": 0.2724, "step": 35 }, { "epoch": 1.2413793103448276, "grad_norm": 0.5106721520423889, "learning_rate": 0.0003889338183215838, "loss": 0.289, "step": 36 }, { "epoch": 1.2758620689655173, "grad_norm": 0.4077189266681671, "learning_rate": 0.0003878111922408956, "loss": 0.2459, "step": 37 }, { "epoch": 1.3103448275862069, "grad_norm": 0.7492893934249878, "learning_rate": 0.00038663612220832055, "loss": 0.2533, "step": 38 }, { "epoch": 1.3448275862068966, "grad_norm": 0.5893272757530212, "learning_rate": 0.00038540893634762753, "loss": 0.2458, "step": 39 }, { "epoch": 1.3793103448275863, "grad_norm": 0.8055713772773743, "learning_rate": 0.00038412997733528576, "loss": 0.2893, "step": 40 }, { "epoch": 1.4137931034482758, "grad_norm": 0.37418168783187866, "learning_rate": 0.00038279960230477655, "loss": 0.2172, "step": 41 }, { "epoch": 1.4482758620689655, "grad_norm": 0.44418400526046753, "learning_rate": 0.00038141818274686816, "loss": 0.237, "step": 42 }, { "epoch": 1.4827586206896552, "grad_norm": 0.3771205246448517, "learning_rate": 0.0003799861044058816, "loss": 0.2039, "step": 43 }, { "epoch": 1.5172413793103448, "grad_norm": 0.43807318806648254, "learning_rate": 0.00037850376717197626, "loss": 0.2328, "step": 44 }, { "epoch": 1.5517241379310345, "grad_norm": 0.3352651000022888, "learning_rate": 0.00037697158496948575, "loss": 0.2319, "step": 45 }, { "epoch": 1.5862068965517242, "grad_norm": 0.595750629901886, "learning_rate": 0.00037538998564133434, "loss": 0.3058, "step": 46 }, { "epoch": 1.6206896551724137, "grad_norm": 0.4095384478569031, "learning_rate": 0.0003737594108295673, "loss": 0.2451, "step": 47 }, { "epoch": 1.6551724137931034, "grad_norm": 0.3446806073188782, "learning_rate": 0.0003720803158520279, "loss": 0.2353, "step": 48 }, { "epoch": 1.6896551724137931, "grad_norm": 0.4992334842681885, "learning_rate": 0.0003703531695752152, "loss": 0.2178, "step": 49 }, { "epoch": 1.7241379310344827, "grad_norm": 0.3720123767852783, "learning_rate": 0.0003685784542833594, "loss": 0.2211, "step": 50 }, { "epoch": 1.7586206896551724, "grad_norm": 0.3732210397720337, "learning_rate": 0.00036675666554374944, "loss": 0.2299, "step": 51 }, { "epoch": 1.793103448275862, "grad_norm": 0.3584296703338623, "learning_rate": 0.00036488831206835207, "loss": 0.2098, "step": 52 }, { "epoch": 1.8275862068965516, "grad_norm": 0.3997485637664795, "learning_rate": 0.00036297391557176066, "loss": 0.2729, "step": 53 }, { "epoch": 1.8620689655172413, "grad_norm": 0.4166489243507385, "learning_rate": 0.0003610140106255126, "loss": 0.2177, "step": 54 }, { "epoch": 1.896551724137931, "grad_norm": 0.45685556530952454, "learning_rate": 0.0003590091445088166, "loss": 0.3138, "step": 55 }, { "epoch": 1.9310344827586206, "grad_norm": 0.49145734310150146, "learning_rate": 0.0003569598770557322, "loss": 0.2876, "step": 56 }, { "epoch": 1.9655172413793105, "grad_norm": 0.5870755910873413, "learning_rate": 0.0003548667804988427, "loss": 0.2103, "step": 57 }, { "epoch": 2.0, "grad_norm": 0.601203441619873, "learning_rate": 0.00035273043930946646, "loss": 0.2372, "step": 58 }, { "epoch": 2.0, "eval_loss": 0.3301706910133362, "eval_runtime": 11.7909, "eval_samples_per_second": 6.955, "eval_steps_per_second": 1.781, "step": 58 }, { "epoch": 2.0344827586206895, "grad_norm": 0.6023654341697693, "learning_rate": 0.00035055145003445024, "loss": 0.2392, "step": 59 }, { "epoch": 2.0689655172413794, "grad_norm": 0.42505472898483276, "learning_rate": 0.00034833042112959153, "loss": 0.2214, "step": 60 }, { "epoch": 2.103448275862069, "grad_norm": 0.39177656173706055, "learning_rate": 0.0003460679727897339, "loss": 0.2266, "step": 61 }, { "epoch": 2.1379310344827585, "grad_norm": 0.4654797911643982, "learning_rate": 0.0003437647367755859, "loss": 0.2738, "step": 62 }, { "epoch": 2.1724137931034484, "grad_norm": 0.4213922917842865, "learning_rate": 0.0003414213562373095, "loss": 0.2064, "step": 63 }, { "epoch": 2.206896551724138, "grad_norm": 0.35119640827178955, "learning_rate": 0.0003390384855349285, "loss": 0.1575, "step": 64 }, { "epoch": 2.2413793103448274, "grad_norm": 0.3433196544647217, "learning_rate": 0.0003366167900556062, "loss": 0.1858, "step": 65 }, { "epoch": 2.2758620689655173, "grad_norm": 0.6012677550315857, "learning_rate": 0.0003341569460278447, "loss": 0.2321, "step": 66 }, { "epoch": 2.310344827586207, "grad_norm": 0.36528632044792175, "learning_rate": 0.00033165964033265636, "loss": 0.1871, "step": 67 }, { "epoch": 2.344827586206897, "grad_norm": 0.33606716990470886, "learning_rate": 0.0003291255703117605, "loss": 0.1316, "step": 68 }, { "epoch": 2.3793103448275863, "grad_norm": 0.4607170820236206, "learning_rate": 0.0003265554435728597, "loss": 0.23, "step": 69 }, { "epoch": 2.413793103448276, "grad_norm": 0.35001179575920105, "learning_rate": 0.00032394997779204896, "loss": 0.1665, "step": 70 }, { "epoch": 2.4482758620689653, "grad_norm": 0.3442467451095581, "learning_rate": 0.0003213099005134135, "loss": 0.1568, "step": 71 }, { "epoch": 2.4827586206896552, "grad_norm": 0.35078802704811096, "learning_rate": 0.00031863594894587105, "loss": 0.168, "step": 72 }, { "epoch": 2.5172413793103448, "grad_norm": 0.3411742150783539, "learning_rate": 0.00031592886975731553, "loss": 0.2015, "step": 73 }, { "epoch": 2.5517241379310347, "grad_norm": 0.608189046382904, "learning_rate": 0.0003131894188661191, "loss": 0.24, "step": 74 }, { "epoch": 2.586206896551724, "grad_norm": 0.9807543754577637, "learning_rate": 0.00031041836123005137, "loss": 0.2216, "step": 75 }, { "epoch": 2.6206896551724137, "grad_norm": 0.3027774691581726, "learning_rate": 0.00030761647063267457, "loss": 0.195, "step": 76 }, { "epoch": 2.655172413793103, "grad_norm": 0.3149068057537079, "learning_rate": 0.00030478452946727374, "loss": 0.1468, "step": 77 }, { "epoch": 2.689655172413793, "grad_norm": 0.29977571964263916, "learning_rate": 0.0003019233285183835, "loss": 0.162, "step": 78 }, { "epoch": 2.7241379310344827, "grad_norm": 0.33947068452835083, "learning_rate": 0.00029903366674097074, "loss": 0.1389, "step": 79 }, { "epoch": 2.7586206896551726, "grad_norm": 0.38519802689552307, "learning_rate": 0.00029611635103733675, "loss": 0.1543, "step": 80 }, { "epoch": 2.793103448275862, "grad_norm": 0.3749842941761017, "learning_rate": 0.00029317219603179964, "loss": 0.1761, "step": 81 }, { "epoch": 2.8275862068965516, "grad_norm": 0.4311760663986206, "learning_rate": 0.00029020202384322035, "loss": 0.2, "step": 82 }, { "epoch": 2.862068965517241, "grad_norm": 0.6397997736930847, "learning_rate": 0.0002872066638554366, "loss": 0.1925, "step": 83 }, { "epoch": 2.896551724137931, "grad_norm": 0.4956374764442444, "learning_rate": 0.000284186952485667, "loss": 0.16, "step": 84 }, { "epoch": 2.9310344827586206, "grad_norm": 0.3576422929763794, "learning_rate": 0.0002811437329509528, "loss": 0.1277, "step": 85 }, { "epoch": 2.9655172413793105, "grad_norm": 0.5354539155960083, "learning_rate": 0.00027807785503269894, "loss": 0.1905, "step": 86 }, { "epoch": 3.0, "grad_norm": 0.7227762341499329, "learning_rate": 0.00027499017483938426, "loss": 0.1457, "step": 87 }, { "epoch": 3.0, "eval_loss": 0.2014383226633072, "eval_runtime": 11.1734, "eval_samples_per_second": 7.339, "eval_steps_per_second": 1.879, "step": 87 }, { "epoch": 3.0344827586206895, "grad_norm": 0.4662579298019409, "learning_rate": 0.00027188155456750256, "loss": 0.1631, "step": 88 }, { "epoch": 3.0689655172413794, "grad_norm": 0.4082823097705841, "learning_rate": 0.00026875286226080603, "loss": 0.1211, "step": 89 }, { "epoch": 3.103448275862069, "grad_norm": 0.33196941018104553, "learning_rate": 0.0002656049715679138, "loss": 0.1293, "step": 90 }, { "epoch": 3.1379310344827585, "grad_norm": 0.38381531834602356, "learning_rate": 0.0002624387614983573, "loss": 0.157, "step": 91 }, { "epoch": 3.1724137931034484, "grad_norm": 0.46282660961151123, "learning_rate": 0.00025925511617712685, "loss": 0.1516, "step": 92 }, { "epoch": 3.206896551724138, "grad_norm": 0.5028776526451111, "learning_rate": 0.00025605492459779126, "loss": 0.1341, "step": 93 }, { "epoch": 3.2413793103448274, "grad_norm": 0.40640944242477417, "learning_rate": 0.00025283908037425725, "loss": 0.1473, "step": 94 }, { "epoch": 3.2758620689655173, "grad_norm": 0.36019614338874817, "learning_rate": 0.00024960848149123866, "loss": 0.1496, "step": 95 }, { "epoch": 3.310344827586207, "grad_norm": 0.40027180314064026, "learning_rate": 0.0002463640300535057, "loss": 0.1315, "step": 96 }, { "epoch": 3.344827586206897, "grad_norm": 0.3519420027732849, "learning_rate": 0.00024310663203398273, "loss": 0.1558, "step": 97 }, { "epoch": 3.3793103448275863, "grad_norm": 0.31895291805267334, "learning_rate": 0.0002398371970207672, "loss": 0.1316, "step": 98 }, { "epoch": 3.413793103448276, "grad_norm": 0.5188913345336914, "learning_rate": 0.0002365566379631381, "loss": 0.1717, "step": 99 }, { "epoch": 3.4482758620689653, "grad_norm": 0.3724120557308197, "learning_rate": 0.00023326587091662603, "loss": 0.1305, "step": 100 }, { "epoch": 3.4827586206896552, "grad_norm": 0.37616410851478577, "learning_rate": 0.0002299658147872163, "loss": 0.1123, "step": 101 }, { "epoch": 3.5172413793103448, "grad_norm": 0.3182910680770874, "learning_rate": 0.0002266573910747558, "loss": 0.1483, "step": 102 }, { "epoch": 3.5517241379310347, "grad_norm": 0.3919450044631958, "learning_rate": 0.00022334152361563528, "loss": 0.1844, "step": 103 }, { "epoch": 3.586206896551724, "grad_norm": 0.37084081768989563, "learning_rate": 0.0002200191383248197, "loss": 0.147, "step": 104 }, { "epoch": 3.6206896551724137, "grad_norm": 0.21913090348243713, "learning_rate": 0.000216691162937298, "loss": 0.0857, "step": 105 }, { "epoch": 3.655172413793103, "grad_norm": 0.5852271914482117, "learning_rate": 0.00021335852674902434, "loss": 0.153, "step": 106 }, { "epoch": 3.689655172413793, "grad_norm": 0.3712017238140106, "learning_rate": 0.00021002216035742385, "loss": 0.1354, "step": 107 }, { "epoch": 3.7241379310344827, "grad_norm": 0.3357457220554352, "learning_rate": 0.00020668299540153493, "loss": 0.1256, "step": 108 }, { "epoch": 3.7586206896551726, "grad_norm": 0.5734804272651672, "learning_rate": 0.00020334196430186018, "loss": 0.1405, "step": 109 }, { "epoch": 3.793103448275862, "grad_norm": 0.5242103934288025, "learning_rate": 0.0002, "loss": 0.1469, "step": 110 }, { "epoch": 3.8275862068965516, "grad_norm": 0.5228754878044128, "learning_rate": 0.0001966580356981398, "loss": 0.2241, "step": 111 }, { "epoch": 3.862068965517241, "grad_norm": 0.4682646691799164, "learning_rate": 0.00019331700459846517, "loss": 0.1912, "step": 112 }, { "epoch": 3.896551724137931, "grad_norm": 0.3244553208351135, "learning_rate": 0.00018997783964257617, "loss": 0.1303, "step": 113 }, { "epoch": 3.9310344827586206, "grad_norm": 0.32039758563041687, "learning_rate": 0.00018664147325097568, "loss": 0.1048, "step": 114 }, { "epoch": 3.9655172413793105, "grad_norm": 0.28546416759490967, "learning_rate": 0.00018330883706270209, "loss": 0.1166, "step": 115 }, { "epoch": 4.0, "grad_norm": 0.49577587842941284, "learning_rate": 0.00017998086167518034, "loss": 0.0808, "step": 116 }, { "epoch": 4.0, "eval_loss": 0.1669382005929947, "eval_runtime": 11.4691, "eval_samples_per_second": 7.15, "eval_steps_per_second": 1.831, "step": 116 }, { "epoch": 4.0344827586206895, "grad_norm": 0.29314538836479187, "learning_rate": 0.00017665847638436476, "loss": 0.123, "step": 117 }, { "epoch": 4.068965517241379, "grad_norm": 0.3075414299964905, "learning_rate": 0.0001733426089252443, "loss": 0.1309, "step": 118 }, { "epoch": 4.103448275862069, "grad_norm": 0.35980483889579773, "learning_rate": 0.00017003418521278373, "loss": 0.124, "step": 119 }, { "epoch": 4.137931034482759, "grad_norm": 0.3515003025531769, "learning_rate": 0.00016673412908337401, "loss": 0.0992, "step": 120 }, { "epoch": 4.172413793103448, "grad_norm": 0.3396667242050171, "learning_rate": 0.00016344336203686198, "loss": 0.1256, "step": 121 }, { "epoch": 4.206896551724138, "grad_norm": 0.3930714428424835, "learning_rate": 0.00016016280297923282, "loss": 0.1003, "step": 122 }, { "epoch": 4.241379310344827, "grad_norm": 0.3275085985660553, "learning_rate": 0.0001568933679660173, "loss": 0.1025, "step": 123 }, { "epoch": 4.275862068965517, "grad_norm": 0.29073071479797363, "learning_rate": 0.00015363596994649433, "loss": 0.117, "step": 124 }, { "epoch": 4.310344827586207, "grad_norm": 0.3251801133155823, "learning_rate": 0.00015039151850876134, "loss": 0.0929, "step": 125 }, { "epoch": 4.344827586206897, "grad_norm": 0.36564385890960693, "learning_rate": 0.00014716091962574282, "loss": 0.1114, "step": 126 }, { "epoch": 4.379310344827586, "grad_norm": 0.3105633556842804, "learning_rate": 0.00014394507540220876, "loss": 0.1069, "step": 127 }, { "epoch": 4.413793103448276, "grad_norm": 0.24759739637374878, "learning_rate": 0.00014074488382287322, "loss": 0.0831, "step": 128 }, { "epoch": 4.448275862068965, "grad_norm": 0.3449052572250366, "learning_rate": 0.00013756123850164274, "loss": 0.1032, "step": 129 }, { "epoch": 4.482758620689655, "grad_norm": 0.3175407946109772, "learning_rate": 0.00013439502843208618, "loss": 0.1003, "step": 130 }, { "epoch": 4.517241379310345, "grad_norm": 0.2750047445297241, "learning_rate": 0.00013124713773919407, "loss": 0.0731, "step": 131 }, { "epoch": 4.551724137931035, "grad_norm": 0.28757160902023315, "learning_rate": 0.00012811844543249748, "loss": 0.069, "step": 132 }, { "epoch": 4.586206896551724, "grad_norm": 0.392083078622818, "learning_rate": 0.00012500982516061582, "loss": 0.1213, "step": 133 }, { "epoch": 4.620689655172414, "grad_norm": 0.3845164179801941, "learning_rate": 0.00012192214496730105, "loss": 0.1173, "step": 134 }, { "epoch": 4.655172413793103, "grad_norm": 0.35339465737342834, "learning_rate": 0.00011885626704904729, "loss": 0.0917, "step": 135 }, { "epoch": 4.689655172413794, "grad_norm": 0.6020617485046387, "learning_rate": 0.00011581304751433304, "loss": 0.1217, "step": 136 }, { "epoch": 4.724137931034483, "grad_norm": 0.27444228529930115, "learning_rate": 0.0001127933361445635, "loss": 0.0851, "step": 137 }, { "epoch": 4.758620689655173, "grad_norm": 0.2869485020637512, "learning_rate": 0.0001097979761567796, "loss": 0.0821, "step": 138 }, { "epoch": 4.793103448275862, "grad_norm": 0.4190747141838074, "learning_rate": 0.00010682780396820038, "loss": 0.1453, "step": 139 }, { "epoch": 4.827586206896552, "grad_norm": 0.2771516442298889, "learning_rate": 0.00010388364896266325, "loss": 0.0943, "step": 140 }, { "epoch": 4.862068965517241, "grad_norm": 0.3202371299266815, "learning_rate": 0.00010096633325902931, "loss": 0.0832, "step": 141 }, { "epoch": 4.896551724137931, "grad_norm": 0.3687487840652466, "learning_rate": 9.80766714816165e-05, "loss": 0.1047, "step": 142 }, { "epoch": 4.931034482758621, "grad_norm": 0.3594297170639038, "learning_rate": 9.52154705327263e-05, "loss": 0.1058, "step": 143 }, { "epoch": 4.9655172413793105, "grad_norm": 0.3087465465068817, "learning_rate": 9.238352936732549e-05, "loss": 0.0829, "step": 144 }, { "epoch": 5.0, "grad_norm": 0.4151803255081177, "learning_rate": 8.95816387699487e-05, "loss": 0.0728, "step": 145 }, { "epoch": 5.0, "eval_loss": 0.142612025141716, "eval_runtime": 11.2217, "eval_samples_per_second": 7.307, "eval_steps_per_second": 1.871, "step": 145 }, { "epoch": 5.0344827586206895, "grad_norm": 0.32192564010620117, "learning_rate": 8.681058113388094e-05, "loss": 0.0795, "step": 146 }, { "epoch": 5.068965517241379, "grad_norm": 0.2751513421535492, "learning_rate": 8.407113024268449e-05, "loss": 0.096, "step": 147 }, { "epoch": 5.103448275862069, "grad_norm": 0.2346341460943222, "learning_rate": 8.136405105412897e-05, "loss": 0.0797, "step": 148 }, { "epoch": 5.137931034482759, "grad_norm": 0.2630951702594757, "learning_rate": 7.869009948658652e-05, "loss": 0.0687, "step": 149 }, { "epoch": 5.172413793103448, "grad_norm": 0.29806041717529297, "learning_rate": 7.605002220795106e-05, "loss": 0.0835, "step": 150 }, { "epoch": 5.206896551724138, "grad_norm": 0.3110710680484772, "learning_rate": 7.344455642714028e-05, "loss": 0.0694, "step": 151 }, { "epoch": 5.241379310344827, "grad_norm": 0.7218759655952454, "learning_rate": 7.087442968823952e-05, "loss": 0.0673, "step": 152 }, { "epoch": 5.275862068965517, "grad_norm": 0.33561110496520996, "learning_rate": 6.834035966734369e-05, "loss": 0.0874, "step": 153 }, { "epoch": 5.310344827586207, "grad_norm": 0.33989977836608887, "learning_rate": 6.584305397215536e-05, "loss": 0.0732, "step": 154 }, { "epoch": 5.344827586206897, "grad_norm": 0.4611569046974182, "learning_rate": 6.338320994439385e-05, "loss": 0.1063, "step": 155 }, { "epoch": 5.379310344827586, "grad_norm": 0.2923598289489746, "learning_rate": 6.0961514465071545e-05, "loss": 0.067, "step": 156 }, { "epoch": 5.413793103448276, "grad_norm": 0.3514823913574219, "learning_rate": 5.857864376269051e-05, "loss": 0.0924, "step": 157 }, { "epoch": 5.448275862068965, "grad_norm": 0.26756709814071655, "learning_rate": 5.623526322441417e-05, "loss": 0.0469, "step": 158 }, { "epoch": 5.482758620689655, "grad_norm": 0.3835451602935791, "learning_rate": 5.3932027210266177e-05, "loss": 0.089, "step": 159 }, { "epoch": 5.517241379310345, "grad_norm": 0.3267417848110199, "learning_rate": 5.1669578870408486e-05, "loss": 0.0818, "step": 160 }, { "epoch": 5.551724137931035, "grad_norm": 0.33168888092041016, "learning_rate": 4.944854996554973e-05, "loss": 0.0897, "step": 161 }, { "epoch": 5.586206896551724, "grad_norm": 0.3530052900314331, "learning_rate": 4.726956069053361e-05, "loss": 0.102, "step": 162 }, { "epoch": 5.620689655172414, "grad_norm": 0.2784576416015625, "learning_rate": 4.5133219501157345e-05, "loss": 0.0546, "step": 163 }, { "epoch": 5.655172413793103, "grad_norm": 0.35242149233818054, "learning_rate": 4.3040122944267805e-05, "loss": 0.0694, "step": 164 }, { "epoch": 5.689655172413794, "grad_norm": 0.31660401821136475, "learning_rate": 4.09908554911834e-05, "loss": 0.0929, "step": 165 }, { "epoch": 5.724137931034483, "grad_norm": 0.40514034032821655, "learning_rate": 3.898598937448743e-05, "loss": 0.1084, "step": 166 }, { "epoch": 5.758620689655173, "grad_norm": 0.293292760848999, "learning_rate": 3.702608442823934e-05, "loss": 0.0599, "step": 167 }, { "epoch": 5.793103448275862, "grad_norm": 0.2863250970840454, "learning_rate": 3.5111687931647984e-05, "loss": 0.097, "step": 168 }, { "epoch": 5.827586206896552, "grad_norm": 0.4727107882499695, "learning_rate": 3.3243334456250604e-05, "loss": 0.0747, "step": 169 }, { "epoch": 5.862068965517241, "grad_norm": 0.3087122142314911, "learning_rate": 3.14215457166406e-05, "loss": 0.0821, "step": 170 }, { "epoch": 5.896551724137931, "grad_norm": 0.4154495596885681, "learning_rate": 2.9646830424784754e-05, "loss": 0.061, "step": 171 }, { "epoch": 5.931034482758621, "grad_norm": 0.4096265435218811, "learning_rate": 2.791968414797217e-05, "loss": 0.0873, "step": 172 }, { "epoch": 5.9655172413793105, "grad_norm": 0.31916359066963196, "learning_rate": 2.6240589170432706e-05, "loss": 0.0711, "step": 173 }, { "epoch": 6.0, "grad_norm": 0.34077519178390503, "learning_rate": 2.46100143586657e-05, "loss": 0.0205, "step": 174 }, { "epoch": 6.0, "eval_loss": 0.13364312052726746, "eval_runtime": 11.1536, "eval_samples_per_second": 7.352, "eval_steps_per_second": 1.883, "step": 174 }, { "epoch": 6.0344827586206895, "grad_norm": 0.32007357478141785, "learning_rate": 2.3028415030514293e-05, "loss": 0.0582, "step": 175 }, { "epoch": 6.068965517241379, "grad_norm": 0.44259366393089294, "learning_rate": 2.1496232828023776e-05, "loss": 0.088, "step": 176 }, { "epoch": 6.103448275862069, "grad_norm": 0.376699835062027, "learning_rate": 2.0013895594118438e-05, "loss": 0.0621, "step": 177 }, { "epoch": 6.137931034482759, "grad_norm": 0.298380047082901, "learning_rate": 1.858181725313186e-05, "loss": 0.0851, "step": 178 }, { "epoch": 6.172413793103448, "grad_norm": 0.32520031929016113, "learning_rate": 1.7200397695223458e-05, "loss": 0.0597, "step": 179 }, { "epoch": 6.206896551724138, "grad_norm": 0.2930215895175934, "learning_rate": 1.5870022664714224e-05, "loss": 0.0568, "step": 180 }, { "epoch": 6.241379310344827, "grad_norm": 0.3769894540309906, "learning_rate": 1.4591063652372528e-05, "loss": 0.0823, "step": 181 }, { "epoch": 6.275862068965517, "grad_norm": 0.35215336084365845, "learning_rate": 1.3363877791679491e-05, "loss": 0.0865, "step": 182 }, { "epoch": 6.310344827586207, "grad_norm": 0.36714300513267517, "learning_rate": 1.2188807759104426e-05, "loss": 0.0803, "step": 183 }, { "epoch": 6.344827586206897, "grad_norm": 0.2876943051815033, "learning_rate": 1.1066181678416266e-05, "loss": 0.065, "step": 184 }, { "epoch": 6.379310344827586, "grad_norm": 0.31088706851005554, "learning_rate": 9.99631302905919e-06, "loss": 0.0489, "step": 185 }, { "epoch": 6.413793103448276, "grad_norm": 0.2680237293243408, "learning_rate": 8.979500558617515e-06, "loss": 0.0499, "step": 186 }, { "epoch": 6.448275862068965, "grad_norm": 0.2737422287464142, "learning_rate": 8.016028199393844e-06, "loss": 0.0624, "step": 187 }, { "epoch": 6.482758620689655, "grad_norm": 0.3311713635921478, "learning_rate": 7.1061649891247084e-06, "loss": 0.0806, "step": 188 }, { "epoch": 6.517241379310345, "grad_norm": 0.3095020055770874, "learning_rate": 6.250164995854935e-06, "loss": 0.0529, "step": 189 }, { "epoch": 6.551724137931035, "grad_norm": 0.24900928139686584, "learning_rate": 5.448267246992589e-06, "loss": 0.0655, "step": 190 }, { "epoch": 6.586206896551724, "grad_norm": 0.33236587047576904, "learning_rate": 4.7006956625630595e-06, "loss": 0.0801, "step": 191 }, { "epoch": 6.620689655172414, "grad_norm": 0.2984936535358429, "learning_rate": 4.00765899268265e-06, "loss": 0.0728, "step": 192 }, { "epoch": 6.655172413793103, "grad_norm": 0.25962910056114197, "learning_rate": 3.369350759267298e-06, "loss": 0.0591, "step": 193 }, { "epoch": 6.689655172413794, "grad_norm": 0.42481696605682373, "learning_rate": 2.7859492019942866e-06, "loss": 0.1136, "step": 194 }, { "epoch": 6.724137931034483, "grad_norm": 0.2698614299297333, "learning_rate": 2.257617228530773e-06, "loss": 0.0668, "step": 195 }, { "epoch": 6.758620689655173, "grad_norm": 0.31150490045547485, "learning_rate": 1.7845023690439943e-06, "loss": 0.0742, "step": 196 }, { "epoch": 6.793103448275862, "grad_norm": 0.36004438996315, "learning_rate": 1.3667367350051808e-06, "loss": 0.0591, "step": 197 }, { "epoch": 6.827586206896552, "grad_norm": 0.31732040643692017, "learning_rate": 1.0044369822991729e-06, "loss": 0.0737, "step": 198 }, { "epoch": 6.862068965517241, "grad_norm": 0.24975836277008057, "learning_rate": 6.977042786496802e-07, "loss": 0.0469, "step": 199 }, { "epoch": 6.896551724137931, "grad_norm": 0.2643037736415863, "learning_rate": 4.4662427536936725e-07, "loss": 0.0547, "step": 200 }, { "epoch": 6.931034482758621, "grad_norm": 0.24362404644489288, "learning_rate": 2.512670834428521e-07, "loss": 0.0368, "step": 201 }, { "epoch": 6.9655172413793105, "grad_norm": 0.2634768784046173, "learning_rate": 1.1168725394907764e-07, "loss": 0.0537, "step": 202 }, { "epoch": 7.0, "grad_norm": 0.3374311923980713, "learning_rate": 2.7923762828585555e-08, "loss": 0.0299, "step": 203 }, { "epoch": 7.0, "eval_loss": 0.12307652831077576, "eval_runtime": 11.2127, "eval_samples_per_second": 7.313, "eval_steps_per_second": 1.873, "step": 203 } ], "logging_steps": 1, "max_steps": 203, "num_input_tokens_seen": 0, "num_train_epochs": 7, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.2373731961512346e+17, "train_batch_size": 64, "trial_name": null, "trial_params": null }