{
  "best_global_step": 203,
  "best_metric": 0.12307652831077576,
  "best_model_checkpoint": "/cache/outputs/checkpoint-203",
  "epoch": 7.0,
  "eval_steps": 500,
  "global_step": 203,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.034482758620689655,
      "grad_norm": 17.74788475036621,
      "learning_rate": 0.0,
      "loss": 5.2296,
      "step": 1
    },
    {
      "epoch": 0.06896551724137931,
      "grad_norm": 18.899648666381836,
      "learning_rate": 2.6666666666666667e-05,
      "loss": 4.5763,
      "step": 2
    },
    {
      "epoch": 0.10344827586206896,
      "grad_norm": 17.478742599487305,
      "learning_rate": 5.333333333333333e-05,
      "loss": 3.8061,
      "step": 3
    },
    {
      "epoch": 0.13793103448275862,
      "grad_norm": 6.993143081665039,
      "learning_rate": 8e-05,
      "loss": 1.9885,
      "step": 4
    },
    {
      "epoch": 0.1724137931034483,
      "grad_norm": 8.001960754394531,
      "learning_rate": 0.00010666666666666667,
      "loss": 1.7205,
      "step": 5
    },
    {
      "epoch": 0.20689655172413793,
      "grad_norm": 3.1834046840667725,
      "learning_rate": 0.00013333333333333334,
      "loss": 1.1047,
      "step": 6
    },
    {
      "epoch": 0.2413793103448276,
      "grad_norm": 1.3439714908599854,
      "learning_rate": 0.00016,
      "loss": 0.7238,
      "step": 7
    },
    {
      "epoch": 0.27586206896551724,
      "grad_norm": 1.5758135318756104,
      "learning_rate": 0.0001866666666666667,
      "loss": 0.8051,
      "step": 8
    },
    {
      "epoch": 0.3103448275862069,
      "grad_norm": 1.299546241760254,
      "learning_rate": 0.00021333333333333333,
      "loss": 0.7102,
      "step": 9
    },
    {
      "epoch": 0.3448275862068966,
      "grad_norm": 0.8412534594535828,
      "learning_rate": 0.00024,
      "loss": 0.4769,
      "step": 10
    },
    {
      "epoch": 0.3793103448275862,
      "grad_norm": 0.8651800751686096,
      "learning_rate": 0.0002666666666666667,
      "loss": 0.4883,
      "step": 11
    },
    {
      "epoch": 0.41379310344827586,
      "grad_norm": 0.7943485975265503,
      "learning_rate": 0.0002933333333333333,
      "loss": 0.5219,
      "step": 12
    },
    {
      "epoch": 0.4482758620689655,
      "grad_norm": 0.6727921366691589,
      "learning_rate": 0.00032,
      "loss": 0.3604,
      "step": 13
    },
    {
      "epoch": 0.4827586206896552,
      "grad_norm": 0.578971266746521,
      "learning_rate": 0.00034666666666666667,
      "loss": 0.4373,
      "step": 14
    },
    {
      "epoch": 0.5172413793103449,
      "grad_norm": 0.5071410536766052,
      "learning_rate": 0.0003733333333333334,
      "loss": 0.4108,
      "step": 15
    },
    {
      "epoch": 0.5517241379310345,
      "grad_norm": 0.6982496976852417,
      "learning_rate": 0.0004,
      "loss": 0.4349,
      "step": 16
    },
    {
      "epoch": 0.5862068965517241,
      "grad_norm": 0.44865670800209045,
      "learning_rate": 0.00039997207623717143,
      "loss": 0.3644,
      "step": 17
    },
    {
      "epoch": 0.6206896551724138,
      "grad_norm": 0.5113717317581177,
      "learning_rate": 0.00039988831274605094,
      "loss": 0.3863,
      "step": 18
    },
    {
      "epoch": 0.6551724137931034,
      "grad_norm": 0.5012100338935852,
      "learning_rate": 0.0003997487329165572,
      "loss": 0.2768,
      "step": 19
    },
    {
      "epoch": 0.6896551724137931,
      "grad_norm": 0.4838305413722992,
      "learning_rate": 0.0003995533757246307,
      "loss": 0.3422,
      "step": 20
    },
    {
      "epoch": 0.7241379310344828,
      "grad_norm": 0.6303550004959106,
      "learning_rate": 0.00039930229572135033,
      "loss": 0.3288,
      "step": 21
    },
    {
      "epoch": 0.7586206896551724,
      "grad_norm": 0.5895312428474426,
      "learning_rate": 0.00039899556301770084,
      "loss": 0.2696,
      "step": 22
    },
    {
      "epoch": 0.7931034482758621,
      "grad_norm": 0.5373993515968323,
      "learning_rate": 0.00039863326326499484,
      "loss": 0.3868,
      "step": 23
    },
    {
      "epoch": 0.8275862068965517,
      "grad_norm": 0.48342981934547424,
      "learning_rate": 0.000398215497630956,
      "loss": 0.3162,
      "step": 24
    },
    {
      "epoch": 0.8620689655172413,
      "grad_norm": 0.28195714950561523,
      "learning_rate": 0.0003977423827714692,
      "loss": 0.2556,
      "step": 25
    },
    {
      "epoch": 0.896551724137931,
      "grad_norm": 0.637691855430603,
      "learning_rate": 0.00039721405079800573,
      "loss": 0.3809,
      "step": 26
    },
    {
      "epoch": 0.9310344827586207,
      "grad_norm": 0.3745051920413971,
      "learning_rate": 0.0003966306492407327,
      "loss": 0.2701,
      "step": 27
    },
    {
      "epoch": 0.9655172413793104,
      "grad_norm": 0.636221170425415,
      "learning_rate": 0.0003959923410073174,
      "loss": 0.308,
      "step": 28
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.6435479521751404,
      "learning_rate": 0.0003952993043374369,
      "loss": 0.2667,
      "step": 29
    },
    {
      "epoch": 1.0,
      "eval_loss": 0.5053671598434448,
      "eval_runtime": 18.682,
      "eval_samples_per_second": 4.389,
      "eval_steps_per_second": 1.124,
      "step": 29
    },
    {
      "epoch": 1.0344827586206897,
      "grad_norm": 0.30985596776008606,
      "learning_rate": 0.00039455173275300745,
      "loss": 0.2241,
      "step": 30
    },
    {
      "epoch": 1.0689655172413792,
      "grad_norm": 0.6296059489250183,
      "learning_rate": 0.0003937498350041451,
      "loss": 0.3091,
      "step": 31
    },
    {
      "epoch": 1.103448275862069,
      "grad_norm": 0.5439299941062927,
      "learning_rate": 0.00039289383501087534,
      "loss": 0.2937,
      "step": 32
    },
    {
      "epoch": 1.1379310344827587,
      "grad_norm": 0.3232507109642029,
      "learning_rate": 0.0003919839718006062,
      "loss": 0.2528,
      "step": 33
    },
    {
      "epoch": 1.1724137931034484,
      "grad_norm": 0.511285662651062,
      "learning_rate": 0.0003910204994413825,
      "loss": 0.2235,
      "step": 34
    },
    {
      "epoch": 1.206896551724138,
      "grad_norm": 1.3702025413513184,
      "learning_rate": 0.00039000368697094084,
      "loss": 0.2724,
      "step": 35
    },
    {
      "epoch": 1.2413793103448276,
      "grad_norm": 0.5106721520423889,
      "learning_rate": 0.0003889338183215838,
      "loss": 0.289,
      "step": 36
    },
    {
      "epoch": 1.2758620689655173,
      "grad_norm": 0.4077189266681671,
      "learning_rate": 0.0003878111922408956,
      "loss": 0.2459,
      "step": 37
    },
    {
      "epoch": 1.3103448275862069,
      "grad_norm": 0.7492893934249878,
      "learning_rate": 0.00038663612220832055,
      "loss": 0.2533,
      "step": 38
    },
    {
      "epoch": 1.3448275862068966,
      "grad_norm": 0.5893272757530212,
      "learning_rate": 0.00038540893634762753,
      "loss": 0.2458,
      "step": 39
    },
    {
      "epoch": 1.3793103448275863,
      "grad_norm": 0.8055713772773743,
      "learning_rate": 0.00038412997733528576,
      "loss": 0.2893,
      "step": 40
    },
    {
      "epoch": 1.4137931034482758,
      "grad_norm": 0.37418168783187866,
      "learning_rate": 0.00038279960230477655,
      "loss": 0.2172,
      "step": 41
    },
    {
      "epoch": 1.4482758620689655,
      "grad_norm": 0.44418400526046753,
      "learning_rate": 0.00038141818274686816,
      "loss": 0.237,
      "step": 42
    },
    {
      "epoch": 1.4827586206896552,
      "grad_norm": 0.3771205246448517,
      "learning_rate": 0.0003799861044058816,
      "loss": 0.2039,
      "step": 43
    },
    {
      "epoch": 1.5172413793103448,
      "grad_norm": 0.43807318806648254,
      "learning_rate": 0.00037850376717197626,
      "loss": 0.2328,
      "step": 44
    },
    {
      "epoch": 1.5517241379310345,
      "grad_norm": 0.3352651000022888,
      "learning_rate": 0.00037697158496948575,
      "loss": 0.2319,
      "step": 45
    },
    {
      "epoch": 1.5862068965517242,
      "grad_norm": 0.595750629901886,
      "learning_rate": 0.00037538998564133434,
      "loss": 0.3058,
      "step": 46
    },
    {
      "epoch": 1.6206896551724137,
      "grad_norm": 0.4095384478569031,
      "learning_rate": 0.0003737594108295673,
      "loss": 0.2451,
      "step": 47
    },
    {
      "epoch": 1.6551724137931034,
      "grad_norm": 0.3446806073188782,
      "learning_rate": 0.0003720803158520279,
      "loss": 0.2353,
      "step": 48
    },
    {
      "epoch": 1.6896551724137931,
      "grad_norm": 0.4992334842681885,
      "learning_rate": 0.0003703531695752152,
      "loss": 0.2178,
      "step": 49
    },
    {
      "epoch": 1.7241379310344827,
      "grad_norm": 0.3720123767852783,
      "learning_rate": 0.0003685784542833594,
      "loss": 0.2211,
      "step": 50
    },
    {
      "epoch": 1.7586206896551724,
      "grad_norm": 0.3732210397720337,
      "learning_rate": 0.00036675666554374944,
      "loss": 0.2299,
      "step": 51
    },
    {
      "epoch": 1.793103448275862,
      "grad_norm": 0.3584296703338623,
      "learning_rate": 0.00036488831206835207,
      "loss": 0.2098,
      "step": 52
    },
    {
      "epoch": 1.8275862068965516,
      "grad_norm": 0.3997485637664795,
      "learning_rate": 0.00036297391557176066,
      "loss": 0.2729,
      "step": 53
    },
    {
      "epoch": 1.8620689655172413,
      "grad_norm": 0.4166489243507385,
      "learning_rate": 0.0003610140106255126,
      "loss": 0.2177,
      "step": 54
    },
    {
      "epoch": 1.896551724137931,
      "grad_norm": 0.45685556530952454,
      "learning_rate": 0.0003590091445088166,
      "loss": 0.3138,
      "step": 55
    },
    {
      "epoch": 1.9310344827586206,
      "grad_norm": 0.49145734310150146,
      "learning_rate": 0.0003569598770557322,
      "loss": 0.2876,
      "step": 56
    },
    {
      "epoch": 1.9655172413793105,
      "grad_norm": 0.5870755910873413,
      "learning_rate": 0.0003548667804988427,
      "loss": 0.2103,
      "step": 57
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.601203441619873,
      "learning_rate": 0.00035273043930946646,
      "loss": 0.2372,
      "step": 58
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.3301706910133362,
      "eval_runtime": 11.7909,
      "eval_samples_per_second": 6.955,
      "eval_steps_per_second": 1.781,
      "step": 58
    },
    {
      "epoch": 2.0344827586206895,
      "grad_norm": 0.6023654341697693,
      "learning_rate": 0.00035055145003445024,
      "loss": 0.2392,
      "step": 59
    },
    {
      "epoch": 2.0689655172413794,
      "grad_norm": 0.42505472898483276,
      "learning_rate": 0.00034833042112959153,
      "loss": 0.2214,
      "step": 60
    },
    {
      "epoch": 2.103448275862069,
      "grad_norm": 0.39177656173706055,
      "learning_rate": 0.0003460679727897339,
      "loss": 0.2266,
      "step": 61
    },
    {
      "epoch": 2.1379310344827585,
      "grad_norm": 0.4654797911643982,
      "learning_rate": 0.0003437647367755859,
      "loss": 0.2738,
      "step": 62
    },
    {
      "epoch": 2.1724137931034484,
      "grad_norm": 0.4213922917842865,
      "learning_rate": 0.0003414213562373095,
      "loss": 0.2064,
      "step": 63
    },
    {
      "epoch": 2.206896551724138,
      "grad_norm": 0.35119640827178955,
      "learning_rate": 0.0003390384855349285,
      "loss": 0.1575,
      "step": 64
    },
    {
      "epoch": 2.2413793103448274,
      "grad_norm": 0.3433196544647217,
      "learning_rate": 0.0003366167900556062,
      "loss": 0.1858,
      "step": 65
    },
    {
      "epoch": 2.2758620689655173,
      "grad_norm": 0.6012677550315857,
      "learning_rate": 0.0003341569460278447,
      "loss": 0.2321,
      "step": 66
    },
    {
      "epoch": 2.310344827586207,
      "grad_norm": 0.36528632044792175,
      "learning_rate": 0.00033165964033265636,
      "loss": 0.1871,
      "step": 67
    },
    {
      "epoch": 2.344827586206897,
      "grad_norm": 0.33606716990470886,
      "learning_rate": 0.0003291255703117605,
      "loss": 0.1316,
      "step": 68
    },
    {
      "epoch": 2.3793103448275863,
      "grad_norm": 0.4607170820236206,
      "learning_rate": 0.0003265554435728597,
      "loss": 0.23,
      "step": 69
    },
    {
      "epoch": 2.413793103448276,
      "grad_norm": 0.35001179575920105,
      "learning_rate": 0.00032394997779204896,
      "loss": 0.1665,
      "step": 70
    },
    {
      "epoch": 2.4482758620689653,
      "grad_norm": 0.3442467451095581,
      "learning_rate": 0.0003213099005134135,
      "loss": 0.1568,
      "step": 71
    },
    {
      "epoch": 2.4827586206896552,
      "grad_norm": 0.35078802704811096,
      "learning_rate": 0.00031863594894587105,
      "loss": 0.168,
      "step": 72
    },
    {
      "epoch": 2.5172413793103448,
      "grad_norm": 0.3411742150783539,
      "learning_rate": 0.00031592886975731553,
      "loss": 0.2015,
      "step": 73
    },
    {
      "epoch": 2.5517241379310347,
      "grad_norm": 0.608189046382904,
      "learning_rate": 0.0003131894188661191,
      "loss": 0.24,
      "step": 74
    },
    {
      "epoch": 2.586206896551724,
      "grad_norm": 0.9807543754577637,
      "learning_rate": 0.00031041836123005137,
      "loss": 0.2216,
      "step": 75
    },
    {
      "epoch": 2.6206896551724137,
      "grad_norm": 0.3027774691581726,
      "learning_rate": 0.00030761647063267457,
      "loss": 0.195,
      "step": 76
    },
    {
      "epoch": 2.655172413793103,
      "grad_norm": 0.3149068057537079,
      "learning_rate": 0.00030478452946727374,
      "loss": 0.1468,
      "step": 77
    },
    {
      "epoch": 2.689655172413793,
      "grad_norm": 0.29977571964263916,
      "learning_rate": 0.0003019233285183835,
      "loss": 0.162,
      "step": 78
    },
    {
      "epoch": 2.7241379310344827,
      "grad_norm": 0.33947068452835083,
      "learning_rate": 0.00029903366674097074,
      "loss": 0.1389,
      "step": 79
    },
    {
      "epoch": 2.7586206896551726,
      "grad_norm": 0.38519802689552307,
      "learning_rate": 0.00029611635103733675,
      "loss": 0.1543,
      "step": 80
    },
    {
      "epoch": 2.793103448275862,
      "grad_norm": 0.3749842941761017,
      "learning_rate": 0.00029317219603179964,
      "loss": 0.1761,
      "step": 81
    },
    {
      "epoch": 2.8275862068965516,
      "grad_norm": 0.4311760663986206,
      "learning_rate": 0.00029020202384322035,
      "loss": 0.2,
      "step": 82
    },
    {
      "epoch": 2.862068965517241,
      "grad_norm": 0.6397997736930847,
      "learning_rate": 0.0002872066638554366,
      "loss": 0.1925,
      "step": 83
    },
    {
      "epoch": 2.896551724137931,
      "grad_norm": 0.4956374764442444,
      "learning_rate": 0.000284186952485667,
      "loss": 0.16,
      "step": 84
    },
    {
      "epoch": 2.9310344827586206,
      "grad_norm": 0.3576422929763794,
      "learning_rate": 0.0002811437329509528,
      "loss": 0.1277,
      "step": 85
    },
    {
      "epoch": 2.9655172413793105,
      "grad_norm": 0.5354539155960083,
      "learning_rate": 0.00027807785503269894,
      "loss": 0.1905,
      "step": 86
    },
    {
      "epoch": 3.0,
      "grad_norm": 0.7227762341499329,
      "learning_rate": 0.00027499017483938426,
      "loss": 0.1457,
      "step": 87
    },
    {
      "epoch": 3.0,
      "eval_loss": 0.2014383226633072,
      "eval_runtime": 11.1734,
      "eval_samples_per_second": 7.339,
      "eval_steps_per_second": 1.879,
      "step": 87
    },
    {
      "epoch": 3.0344827586206895,
      "grad_norm": 0.4662579298019409,
      "learning_rate": 0.00027188155456750256,
      "loss": 0.1631,
      "step": 88
    },
    {
      "epoch": 3.0689655172413794,
      "grad_norm": 0.4082823097705841,
      "learning_rate": 0.00026875286226080603,
      "loss": 0.1211,
      "step": 89
    },
    {
      "epoch": 3.103448275862069,
      "grad_norm": 0.33196941018104553,
      "learning_rate": 0.0002656049715679138,
      "loss": 0.1293,
      "step": 90
    },
    {
      "epoch": 3.1379310344827585,
      "grad_norm": 0.38381531834602356,
      "learning_rate": 0.0002624387614983573,
      "loss": 0.157,
      "step": 91
    },
    {
      "epoch": 3.1724137931034484,
      "grad_norm": 0.46282660961151123,
      "learning_rate": 0.00025925511617712685,
      "loss": 0.1516,
      "step": 92
    },
    {
      "epoch": 3.206896551724138,
      "grad_norm": 0.5028776526451111,
      "learning_rate": 0.00025605492459779126,
      "loss": 0.1341,
      "step": 93
    },
    {
      "epoch": 3.2413793103448274,
      "grad_norm": 0.40640944242477417,
      "learning_rate": 0.00025283908037425725,
      "loss": 0.1473,
      "step": 94
    },
    {
      "epoch": 3.2758620689655173,
      "grad_norm": 0.36019614338874817,
      "learning_rate": 0.00024960848149123866,
      "loss": 0.1496,
      "step": 95
    },
    {
      "epoch": 3.310344827586207,
      "grad_norm": 0.40027180314064026,
      "learning_rate": 0.0002463640300535057,
      "loss": 0.1315,
      "step": 96
    },
    {
      "epoch": 3.344827586206897,
      "grad_norm": 0.3519420027732849,
      "learning_rate": 0.00024310663203398273,
      "loss": 0.1558,
      "step": 97
    },
    {
      "epoch": 3.3793103448275863,
      "grad_norm": 0.31895291805267334,
      "learning_rate": 0.0002398371970207672,
      "loss": 0.1316,
      "step": 98
    },
    {
      "epoch": 3.413793103448276,
      "grad_norm": 0.5188913345336914,
      "learning_rate": 0.0002365566379631381,
      "loss": 0.1717,
      "step": 99
    },
    {
      "epoch": 3.4482758620689653,
      "grad_norm": 0.3724120557308197,
      "learning_rate": 0.00023326587091662603,
      "loss": 0.1305,
      "step": 100
    },
    {
      "epoch": 3.4827586206896552,
      "grad_norm": 0.37616410851478577,
      "learning_rate": 0.0002299658147872163,
      "loss": 0.1123,
      "step": 101
    },
    {
      "epoch": 3.5172413793103448,
      "grad_norm": 0.3182910680770874,
      "learning_rate": 0.0002266573910747558,
      "loss": 0.1483,
      "step": 102
    },
    {
      "epoch": 3.5517241379310347,
      "grad_norm": 0.3919450044631958,
      "learning_rate": 0.00022334152361563528,
      "loss": 0.1844,
      "step": 103
    },
    {
      "epoch": 3.586206896551724,
      "grad_norm": 0.37084081768989563,
      "learning_rate": 0.0002200191383248197,
      "loss": 0.147,
      "step": 104
    },
    {
      "epoch": 3.6206896551724137,
      "grad_norm": 0.21913090348243713,
      "learning_rate": 0.000216691162937298,
      "loss": 0.0857,
      "step": 105
    },
    {
      "epoch": 3.655172413793103,
      "grad_norm": 0.5852271914482117,
      "learning_rate": 0.00021335852674902434,
      "loss": 0.153,
      "step": 106
    },
    {
      "epoch": 3.689655172413793,
      "grad_norm": 0.3712017238140106,
      "learning_rate": 0.00021002216035742385,
      "loss": 0.1354,
      "step": 107
    },
    {
      "epoch": 3.7241379310344827,
      "grad_norm": 0.3357457220554352,
      "learning_rate": 0.00020668299540153493,
      "loss": 0.1256,
      "step": 108
    },
    {
      "epoch": 3.7586206896551726,
      "grad_norm": 0.5734804272651672,
      "learning_rate": 0.00020334196430186018,
      "loss": 0.1405,
      "step": 109
    },
    {
      "epoch": 3.793103448275862,
      "grad_norm": 0.5242103934288025,
      "learning_rate": 0.0002,
      "loss": 0.1469,
      "step": 110
    },
    {
      "epoch": 3.8275862068965516,
      "grad_norm": 0.5228754878044128,
      "learning_rate": 0.0001966580356981398,
      "loss": 0.2241,
      "step": 111
    },
    {
      "epoch": 3.862068965517241,
      "grad_norm": 0.4682646691799164,
      "learning_rate": 0.00019331700459846517,
      "loss": 0.1912,
      "step": 112
    },
    {
      "epoch": 3.896551724137931,
      "grad_norm": 0.3244553208351135,
      "learning_rate": 0.00018997783964257617,
      "loss": 0.1303,
      "step": 113
    },
    {
      "epoch": 3.9310344827586206,
      "grad_norm": 0.32039758563041687,
      "learning_rate": 0.00018664147325097568,
      "loss": 0.1048,
      "step": 114
    },
    {
      "epoch": 3.9655172413793105,
      "grad_norm": 0.28546416759490967,
      "learning_rate": 0.00018330883706270209,
      "loss": 0.1166,
      "step": 115
    },
    {
      "epoch": 4.0,
      "grad_norm": 0.49577587842941284,
      "learning_rate": 0.00017998086167518034,
      "loss": 0.0808,
      "step": 116
    },
    {
      "epoch": 4.0,
      "eval_loss": 0.1669382005929947,
      "eval_runtime": 11.4691,
      "eval_samples_per_second": 7.15,
      "eval_steps_per_second": 1.831,
      "step": 116
    },
    {
      "epoch": 4.0344827586206895,
      "grad_norm": 0.29314538836479187,
      "learning_rate": 0.00017665847638436476,
      "loss": 0.123,
      "step": 117
    },
    {
      "epoch": 4.068965517241379,
      "grad_norm": 0.3075414299964905,
      "learning_rate": 0.0001733426089252443,
      "loss": 0.1309,
      "step": 118
    },
    {
      "epoch": 4.103448275862069,
      "grad_norm": 0.35980483889579773,
      "learning_rate": 0.00017003418521278373,
      "loss": 0.124,
      "step": 119
    },
    {
      "epoch": 4.137931034482759,
      "grad_norm": 0.3515003025531769,
      "learning_rate": 0.00016673412908337401,
      "loss": 0.0992,
      "step": 120
    },
    {
      "epoch": 4.172413793103448,
      "grad_norm": 0.3396667242050171,
      "learning_rate": 0.00016344336203686198,
      "loss": 0.1256,
      "step": 121
    },
    {
      "epoch": 4.206896551724138,
      "grad_norm": 0.3930714428424835,
      "learning_rate": 0.00016016280297923282,
      "loss": 0.1003,
      "step": 122
    },
    {
      "epoch": 4.241379310344827,
      "grad_norm": 0.3275085985660553,
      "learning_rate": 0.0001568933679660173,
      "loss": 0.1025,
      "step": 123
    },
    {
      "epoch": 4.275862068965517,
      "grad_norm": 0.29073071479797363,
      "learning_rate": 0.00015363596994649433,
      "loss": 0.117,
      "step": 124
    },
    {
      "epoch": 4.310344827586207,
      "grad_norm": 0.3251801133155823,
      "learning_rate": 0.00015039151850876134,
      "loss": 0.0929,
      "step": 125
    },
    {
      "epoch": 4.344827586206897,
      "grad_norm": 0.36564385890960693,
      "learning_rate": 0.00014716091962574282,
      "loss": 0.1114,
      "step": 126
    },
    {
      "epoch": 4.379310344827586,
      "grad_norm": 0.3105633556842804,
      "learning_rate": 0.00014394507540220876,
      "loss": 0.1069,
      "step": 127
    },
    {
      "epoch": 4.413793103448276,
      "grad_norm": 0.24759739637374878,
      "learning_rate": 0.00014074488382287322,
      "loss": 0.0831,
      "step": 128
    },
    {
      "epoch": 4.448275862068965,
      "grad_norm": 0.3449052572250366,
      "learning_rate": 0.00013756123850164274,
      "loss": 0.1032,
      "step": 129
    },
    {
      "epoch": 4.482758620689655,
      "grad_norm": 0.3175407946109772,
      "learning_rate": 0.00013439502843208618,
      "loss": 0.1003,
      "step": 130
    },
    {
      "epoch": 4.517241379310345,
      "grad_norm": 0.2750047445297241,
      "learning_rate": 0.00013124713773919407,
      "loss": 0.0731,
      "step": 131
    },
    {
      "epoch": 4.551724137931035,
      "grad_norm": 0.28757160902023315,
      "learning_rate": 0.00012811844543249748,
      "loss": 0.069,
      "step": 132
    },
    {
      "epoch": 4.586206896551724,
      "grad_norm": 0.392083078622818,
      "learning_rate": 0.00012500982516061582,
      "loss": 0.1213,
      "step": 133
    },
    {
      "epoch": 4.620689655172414,
      "grad_norm": 0.3845164179801941,
      "learning_rate": 0.00012192214496730105,
      "loss": 0.1173,
      "step": 134
    },
    {
      "epoch": 4.655172413793103,
      "grad_norm": 0.35339465737342834,
      "learning_rate": 0.00011885626704904729,
      "loss": 0.0917,
      "step": 135
    },
    {
      "epoch": 4.689655172413794,
      "grad_norm": 0.6020617485046387,
      "learning_rate": 0.00011581304751433304,
      "loss": 0.1217,
      "step": 136
    },
    {
      "epoch": 4.724137931034483,
      "grad_norm": 0.27444228529930115,
      "learning_rate": 0.0001127933361445635,
      "loss": 0.0851,
      "step": 137
    },
    {
      "epoch": 4.758620689655173,
      "grad_norm": 0.2869485020637512,
      "learning_rate": 0.0001097979761567796,
      "loss": 0.0821,
      "step": 138
    },
    {
      "epoch": 4.793103448275862,
      "grad_norm": 0.4190747141838074,
      "learning_rate": 0.00010682780396820038,
      "loss": 0.1453,
      "step": 139
    },
    {
      "epoch": 4.827586206896552,
      "grad_norm": 0.2771516442298889,
      "learning_rate": 0.00010388364896266325,
      "loss": 0.0943,
      "step": 140
    },
    {
      "epoch": 4.862068965517241,
      "grad_norm": 0.3202371299266815,
      "learning_rate": 0.00010096633325902931,
      "loss": 0.0832,
      "step": 141
    },
    {
      "epoch": 4.896551724137931,
      "grad_norm": 0.3687487840652466,
      "learning_rate": 9.80766714816165e-05,
      "loss": 0.1047,
      "step": 142
    },
    {
      "epoch": 4.931034482758621,
      "grad_norm": 0.3594297170639038,
      "learning_rate": 9.52154705327263e-05,
      "loss": 0.1058,
      "step": 143
    },
    {
      "epoch": 4.9655172413793105,
      "grad_norm": 0.3087465465068817,
      "learning_rate": 9.238352936732549e-05,
      "loss": 0.0829,
      "step": 144
    },
    {
      "epoch": 5.0,
      "grad_norm": 0.4151803255081177,
      "learning_rate": 8.95816387699487e-05,
      "loss": 0.0728,
      "step": 145
    },
    {
      "epoch": 5.0,
      "eval_loss": 0.142612025141716,
      "eval_runtime": 11.2217,
      "eval_samples_per_second": 7.307,
      "eval_steps_per_second": 1.871,
      "step": 145
    },
    {
      "epoch": 5.0344827586206895,
      "grad_norm": 0.32192564010620117,
      "learning_rate": 8.681058113388094e-05,
      "loss": 0.0795,
      "step": 146
    },
    {
      "epoch": 5.068965517241379,
      "grad_norm": 0.2751513421535492,
      "learning_rate": 8.407113024268449e-05,
      "loss": 0.096,
      "step": 147
    },
    {
      "epoch": 5.103448275862069,
      "grad_norm": 0.2346341460943222,
      "learning_rate": 8.136405105412897e-05,
      "loss": 0.0797,
      "step": 148
    },
    {
      "epoch": 5.137931034482759,
      "grad_norm": 0.2630951702594757,
      "learning_rate": 7.869009948658652e-05,
      "loss": 0.0687,
      "step": 149
    },
    {
      "epoch": 5.172413793103448,
      "grad_norm": 0.29806041717529297,
      "learning_rate": 7.605002220795106e-05,
      "loss": 0.0835,
      "step": 150
    },
    {
      "epoch": 5.206896551724138,
      "grad_norm": 0.3110710680484772,
      "learning_rate": 7.344455642714028e-05,
      "loss": 0.0694,
      "step": 151
    },
    {
      "epoch": 5.241379310344827,
      "grad_norm": 0.7218759655952454,
      "learning_rate": 7.087442968823952e-05,
      "loss": 0.0673,
      "step": 152
    },
    {
      "epoch": 5.275862068965517,
      "grad_norm": 0.33561110496520996,
      "learning_rate": 6.834035966734369e-05,
      "loss": 0.0874,
      "step": 153
    },
    {
      "epoch": 5.310344827586207,
      "grad_norm": 0.33989977836608887,
      "learning_rate": 6.584305397215536e-05,
      "loss": 0.0732,
      "step": 154
    },
    {
      "epoch": 5.344827586206897,
      "grad_norm": 0.4611569046974182,
      "learning_rate": 6.338320994439385e-05,
      "loss": 0.1063,
      "step": 155
    },
    {
      "epoch": 5.379310344827586,
      "grad_norm": 0.2923598289489746,
      "learning_rate": 6.0961514465071545e-05,
      "loss": 0.067,
      "step": 156
    },
    {
      "epoch": 5.413793103448276,
      "grad_norm": 0.3514823913574219,
      "learning_rate": 5.857864376269051e-05,
      "loss": 0.0924,
      "step": 157
    },
    {
      "epoch": 5.448275862068965,
      "grad_norm": 0.26756709814071655,
      "learning_rate": 5.623526322441417e-05,
      "loss": 0.0469,
      "step": 158
    },
    {
      "epoch": 5.482758620689655,
      "grad_norm": 0.3835451602935791,
      "learning_rate": 5.3932027210266177e-05,
      "loss": 0.089,
      "step": 159
    },
    {
      "epoch": 5.517241379310345,
      "grad_norm": 0.3267417848110199,
      "learning_rate": 5.1669578870408486e-05,
      "loss": 0.0818,
      "step": 160
    },
    {
      "epoch": 5.551724137931035,
      "grad_norm": 0.33168888092041016,
      "learning_rate": 4.944854996554973e-05,
      "loss": 0.0897,
      "step": 161
    },
    {
      "epoch": 5.586206896551724,
      "grad_norm": 0.3530052900314331,
      "learning_rate": 4.726956069053361e-05,
      "loss": 0.102,
      "step": 162
    },
    {
      "epoch": 5.620689655172414,
      "grad_norm": 0.2784576416015625,
      "learning_rate": 4.5133219501157345e-05,
      "loss": 0.0546,
      "step": 163
    },
    {
      "epoch": 5.655172413793103,
      "grad_norm": 0.35242149233818054,
      "learning_rate": 4.3040122944267805e-05,
      "loss": 0.0694,
      "step": 164
    },
    {
      "epoch": 5.689655172413794,
      "grad_norm": 0.31660401821136475,
      "learning_rate": 4.09908554911834e-05,
      "loss": 0.0929,
      "step": 165
    },
    {
      "epoch": 5.724137931034483,
      "grad_norm": 0.40514034032821655,
      "learning_rate": 3.898598937448743e-05,
      "loss": 0.1084,
      "step": 166
    },
    {
      "epoch": 5.758620689655173,
      "grad_norm": 0.293292760848999,
      "learning_rate": 3.702608442823934e-05,
      "loss": 0.0599,
      "step": 167
    },
    {
      "epoch": 5.793103448275862,
      "grad_norm": 0.2863250970840454,
      "learning_rate": 3.5111687931647984e-05,
      "loss": 0.097,
      "step": 168
    },
    {
      "epoch": 5.827586206896552,
      "grad_norm": 0.4727107882499695,
      "learning_rate": 3.3243334456250604e-05,
      "loss": 0.0747,
      "step": 169
    },
    {
      "epoch": 5.862068965517241,
      "grad_norm": 0.3087122142314911,
      "learning_rate": 3.14215457166406e-05,
      "loss": 0.0821,
      "step": 170
    },
    {
      "epoch": 5.896551724137931,
      "grad_norm": 0.4154495596885681,
      "learning_rate": 2.9646830424784754e-05,
      "loss": 0.061,
      "step": 171
    },
    {
      "epoch": 5.931034482758621,
      "grad_norm": 0.4096265435218811,
      "learning_rate": 2.791968414797217e-05,
      "loss": 0.0873,
      "step": 172
    },
    {
      "epoch": 5.9655172413793105,
      "grad_norm": 0.31916359066963196,
      "learning_rate": 2.6240589170432706e-05,
      "loss": 0.0711,
      "step": 173
    },
    {
      "epoch": 6.0,
      "grad_norm": 0.34077519178390503,
      "learning_rate": 2.46100143586657e-05,
      "loss": 0.0205,
      "step": 174
    },
    {
      "epoch": 6.0,
      "eval_loss": 0.13364312052726746,
      "eval_runtime": 11.1536,
      "eval_samples_per_second": 7.352,
      "eval_steps_per_second": 1.883,
      "step": 174
    },
    {
      "epoch": 6.0344827586206895,
      "grad_norm": 0.32007357478141785,
      "learning_rate": 2.3028415030514293e-05,
      "loss": 0.0582,
      "step": 175
    },
    {
      "epoch": 6.068965517241379,
      "grad_norm": 0.44259366393089294,
      "learning_rate": 2.1496232828023776e-05,
      "loss": 0.088,
      "step": 176
    },
    {
      "epoch": 6.103448275862069,
      "grad_norm": 0.376699835062027,
      "learning_rate": 2.0013895594118438e-05,
      "loss": 0.0621,
      "step": 177
    },
    {
      "epoch": 6.137931034482759,
      "grad_norm": 0.298380047082901,
      "learning_rate": 1.858181725313186e-05,
      "loss": 0.0851,
      "step": 178
    },
    {
      "epoch": 6.172413793103448,
      "grad_norm": 0.32520031929016113,
      "learning_rate": 1.7200397695223458e-05,
      "loss": 0.0597,
      "step": 179
    },
    {
      "epoch": 6.206896551724138,
      "grad_norm": 0.2930215895175934,
      "learning_rate": 1.5870022664714224e-05,
      "loss": 0.0568,
      "step": 180
    },
    {
      "epoch": 6.241379310344827,
      "grad_norm": 0.3769894540309906,
      "learning_rate": 1.4591063652372528e-05,
      "loss": 0.0823,
      "step": 181
    },
    {
      "epoch": 6.275862068965517,
      "grad_norm": 0.35215336084365845,
      "learning_rate": 1.3363877791679491e-05,
      "loss": 0.0865,
      "step": 182
    },
    {
      "epoch": 6.310344827586207,
      "grad_norm": 0.36714300513267517,
      "learning_rate": 1.2188807759104426e-05,
      "loss": 0.0803,
      "step": 183
    },
    {
      "epoch": 6.344827586206897,
      "grad_norm": 0.2876943051815033,
      "learning_rate": 1.1066181678416266e-05,
      "loss": 0.065,
      "step": 184
    },
    {
      "epoch": 6.379310344827586,
      "grad_norm": 0.31088706851005554,
      "learning_rate": 9.99631302905919e-06,
      "loss": 0.0489,
      "step": 185
    },
    {
      "epoch": 6.413793103448276,
      "grad_norm": 0.2680237293243408,
      "learning_rate": 8.979500558617515e-06,
      "loss": 0.0499,
      "step": 186
    },
    {
      "epoch": 6.448275862068965,
      "grad_norm": 0.2737422287464142,
      "learning_rate": 8.016028199393844e-06,
      "loss": 0.0624,
      "step": 187
    },
    {
      "epoch": 6.482758620689655,
      "grad_norm": 0.3311713635921478,
      "learning_rate": 7.1061649891247084e-06,
      "loss": 0.0806,
      "step": 188
    },
    {
      "epoch": 6.517241379310345,
      "grad_norm": 0.3095020055770874,
      "learning_rate": 6.250164995854935e-06,
      "loss": 0.0529,
      "step": 189
    },
    {
      "epoch": 6.551724137931035,
      "grad_norm": 0.24900928139686584,
      "learning_rate": 5.448267246992589e-06,
      "loss": 0.0655,
      "step": 190
    },
    {
      "epoch": 6.586206896551724,
      "grad_norm": 0.33236587047576904,
      "learning_rate": 4.7006956625630595e-06,
      "loss": 0.0801,
      "step": 191
    },
    {
      "epoch": 6.620689655172414,
      "grad_norm": 0.2984936535358429,
      "learning_rate": 4.00765899268265e-06,
      "loss": 0.0728,
      "step": 192
    },
    {
      "epoch": 6.655172413793103,
      "grad_norm": 0.25962910056114197,
      "learning_rate": 3.369350759267298e-06,
      "loss": 0.0591,
      "step": 193
    },
    {
      "epoch": 6.689655172413794,
      "grad_norm": 0.42481696605682373,
      "learning_rate": 2.7859492019942866e-06,
      "loss": 0.1136,
      "step": 194
    },
    {
      "epoch": 6.724137931034483,
      "grad_norm": 0.2698614299297333,
      "learning_rate": 2.257617228530773e-06,
      "loss": 0.0668,
      "step": 195
    },
    {
      "epoch": 6.758620689655173,
      "grad_norm": 0.31150490045547485,
      "learning_rate": 1.7845023690439943e-06,
      "loss": 0.0742,
      "step": 196
    },
    {
      "epoch": 6.793103448275862,
      "grad_norm": 0.36004438996315,
      "learning_rate": 1.3667367350051808e-06,
      "loss": 0.0591,
      "step": 197
    },
    {
      "epoch": 6.827586206896552,
      "grad_norm": 0.31732040643692017,
      "learning_rate": 1.0044369822991729e-06,
      "loss": 0.0737,
      "step": 198
    },
    {
      "epoch": 6.862068965517241,
      "grad_norm": 0.24975836277008057,
      "learning_rate": 6.977042786496802e-07,
      "loss": 0.0469,
      "step": 199
    },
    {
      "epoch": 6.896551724137931,
      "grad_norm": 0.2643037736415863,
      "learning_rate": 4.4662427536936725e-07,
      "loss": 0.0547,
      "step": 200
    },
    {
      "epoch": 6.931034482758621,
      "grad_norm": 0.24362404644489288,
      "learning_rate": 2.512670834428521e-07,
      "loss": 0.0368,
      "step": 201
    },
    {
      "epoch": 6.9655172413793105,
      "grad_norm": 0.2634768784046173,
      "learning_rate": 1.1168725394907764e-07,
      "loss": 0.0537,
      "step": 202
    },
    {
      "epoch": 7.0,
      "grad_norm": 0.3374311923980713,
      "learning_rate": 2.7923762828585555e-08,
      "loss": 0.0299,
      "step": 203
    },
    {
      "epoch": 7.0,
      "eval_loss": 0.12307652831077576,
      "eval_runtime": 11.2127,
      "eval_samples_per_second": 7.313,
      "eval_steps_per_second": 1.873,
      "step": 203
    }
  ],
  "logging_steps": 1,
  "max_steps": 203,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 7,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.2373731961512346e+17,
  "train_batch_size": 64,
  "trial_name": null,
  "trial_params": null
}