| { | |
| "best_global_step": 499760, | |
| "best_metric": 0.9877662062644958, | |
| "best_model_checkpoint": "/media/user/Expansion1/opus-mt-zhtw-en-google-translate3/checkpoint-499760", | |
| "epoch": 5.0, | |
| "eval_steps": 500, | |
| "global_step": 499760, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.005002401152553225, | |
| "grad_norm": 11.430471420288086, | |
| "learning_rate": 4.995007603649752e-05, | |
| "loss": 2.0932, | |
| "num_input_tokens_seen": 276616, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.01000480230510645, | |
| "grad_norm": 11.411553382873535, | |
| "learning_rate": 4.990005202497199e-05, | |
| "loss": 2.0509, | |
| "num_input_tokens_seen": 545376, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.015007203457659676, | |
| "grad_norm": 7.671140193939209, | |
| "learning_rate": 4.985002801344646e-05, | |
| "loss": 2.0225, | |
| "num_input_tokens_seen": 812256, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.0200096046102129, | |
| "grad_norm": 10.581878662109375, | |
| "learning_rate": 4.9800004001920924e-05, | |
| "loss": 1.9906, | |
| "num_input_tokens_seen": 1089008, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.02501200576276613, | |
| "grad_norm": 9.597127914428711, | |
| "learning_rate": 4.974997999039539e-05, | |
| "loss": 1.9507, | |
| "num_input_tokens_seen": 1366424, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.030014406915319352, | |
| "grad_norm": 8.959872245788574, | |
| "learning_rate": 4.9699955978869863e-05, | |
| "loss": 1.9346, | |
| "num_input_tokens_seen": 1638376, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.03501680806787258, | |
| "grad_norm": 8.698769569396973, | |
| "learning_rate": 4.9649931967344327e-05, | |
| "loss": 1.9318, | |
| "num_input_tokens_seen": 1906088, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.0400192092204258, | |
| "grad_norm": 9.283397674560547, | |
| "learning_rate": 4.9599907955818796e-05, | |
| "loss": 1.9044, | |
| "num_input_tokens_seen": 2180336, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.04502161037297903, | |
| "grad_norm": 8.18691635131836, | |
| "learning_rate": 4.954988394429326e-05, | |
| "loss": 1.8793, | |
| "num_input_tokens_seen": 2446104, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.05002401152553226, | |
| "grad_norm": 9.12602424621582, | |
| "learning_rate": 4.949985993276773e-05, | |
| "loss": 1.8827, | |
| "num_input_tokens_seen": 2707352, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.05502641267808548, | |
| "grad_norm": 7.385958194732666, | |
| "learning_rate": 4.94498359212422e-05, | |
| "loss": 1.8553, | |
| "num_input_tokens_seen": 2980496, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.060028813830638704, | |
| "grad_norm": 12.771719932556152, | |
| "learning_rate": 4.939981190971666e-05, | |
| "loss": 1.8257, | |
| "num_input_tokens_seen": 3261104, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.06503121498319193, | |
| "grad_norm": 7.724299430847168, | |
| "learning_rate": 4.934978789819113e-05, | |
| "loss": 1.82, | |
| "num_input_tokens_seen": 3530600, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.07003361613574516, | |
| "grad_norm": 10.434374809265137, | |
| "learning_rate": 4.92997638866656e-05, | |
| "loss": 1.8089, | |
| "num_input_tokens_seen": 3798856, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.07503601728829838, | |
| "grad_norm": 6.624173164367676, | |
| "learning_rate": 4.924973987514007e-05, | |
| "loss": 1.8068, | |
| "num_input_tokens_seen": 4068760, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.0800384184408516, | |
| "grad_norm": 7.316147327423096, | |
| "learning_rate": 4.9199715863614536e-05, | |
| "loss": 1.7867, | |
| "num_input_tokens_seen": 4337376, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.08504081959340483, | |
| "grad_norm": 7.759427547454834, | |
| "learning_rate": 4.9149691852089006e-05, | |
| "loss": 1.7814, | |
| "num_input_tokens_seen": 4610160, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 0.09004322074595807, | |
| "grad_norm": 7.609263896942139, | |
| "learning_rate": 4.9099667840563476e-05, | |
| "loss": 1.7981, | |
| "num_input_tokens_seen": 4885232, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.09504562189851129, | |
| "grad_norm": 6.2129669189453125, | |
| "learning_rate": 4.904964382903794e-05, | |
| "loss": 1.7689, | |
| "num_input_tokens_seen": 5155216, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 0.10004802305106451, | |
| "grad_norm": 6.697121620178223, | |
| "learning_rate": 4.899961981751241e-05, | |
| "loss": 1.7388, | |
| "num_input_tokens_seen": 5428160, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.10505042420361774, | |
| "grad_norm": 10.624402046203613, | |
| "learning_rate": 4.894959580598688e-05, | |
| "loss": 1.7443, | |
| "num_input_tokens_seen": 5692136, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 0.11005282535617096, | |
| "grad_norm": 8.177070617675781, | |
| "learning_rate": 4.889957179446135e-05, | |
| "loss": 1.7421, | |
| "num_input_tokens_seen": 5965544, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 0.11505522650872418, | |
| "grad_norm": 9.939962387084961, | |
| "learning_rate": 4.884954778293581e-05, | |
| "loss": 1.7128, | |
| "num_input_tokens_seen": 6230272, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 0.12005762766127741, | |
| "grad_norm": 8.003973007202148, | |
| "learning_rate": 4.8799523771410275e-05, | |
| "loss": 1.7197, | |
| "num_input_tokens_seen": 6505160, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.12506002881383063, | |
| "grad_norm": 6.709536075592041, | |
| "learning_rate": 4.8749499759884745e-05, | |
| "loss": 1.7021, | |
| "num_input_tokens_seen": 6782752, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 0.13006242996638387, | |
| "grad_norm": 7.609007835388184, | |
| "learning_rate": 4.8699475748359215e-05, | |
| "loss": 1.7083, | |
| "num_input_tokens_seen": 7059144, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 0.13506483111893708, | |
| "grad_norm": 7.746858596801758, | |
| "learning_rate": 4.8649451736833685e-05, | |
| "loss": 1.709, | |
| "num_input_tokens_seen": 7327608, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 0.14006723227149032, | |
| "grad_norm": 9.576034545898438, | |
| "learning_rate": 4.859942772530815e-05, | |
| "loss": 1.7204, | |
| "num_input_tokens_seen": 7598784, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 0.14506963342404355, | |
| "grad_norm": 7.995354175567627, | |
| "learning_rate": 4.854940371378262e-05, | |
| "loss": 1.6987, | |
| "num_input_tokens_seen": 7866144, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 0.15007203457659676, | |
| "grad_norm": 7.990232944488525, | |
| "learning_rate": 4.849937970225709e-05, | |
| "loss": 1.695, | |
| "num_input_tokens_seen": 8132240, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 0.15507443572915, | |
| "grad_norm": 6.7087507247924805, | |
| "learning_rate": 4.844935569073155e-05, | |
| "loss": 1.6766, | |
| "num_input_tokens_seen": 8400496, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 0.1600768368817032, | |
| "grad_norm": 6.8279900550842285, | |
| "learning_rate": 4.839933167920602e-05, | |
| "loss": 1.6662, | |
| "num_input_tokens_seen": 8666120, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 0.16507923803425645, | |
| "grad_norm": 8.427155494689941, | |
| "learning_rate": 4.834930766768049e-05, | |
| "loss": 1.6781, | |
| "num_input_tokens_seen": 8935352, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 0.17008163918680966, | |
| "grad_norm": 8.360432624816895, | |
| "learning_rate": 4.829928365615496e-05, | |
| "loss": 1.6682, | |
| "num_input_tokens_seen": 9209648, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 0.1750840403393629, | |
| "grad_norm": 7.38226842880249, | |
| "learning_rate": 4.8249259644629424e-05, | |
| "loss": 1.6475, | |
| "num_input_tokens_seen": 9475336, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 0.18008644149191613, | |
| "grad_norm": 8.226577758789062, | |
| "learning_rate": 4.819923563310389e-05, | |
| "loss": 1.6668, | |
| "num_input_tokens_seen": 9752088, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 0.18508884264446934, | |
| "grad_norm": 9.053323745727539, | |
| "learning_rate": 4.8149211621578364e-05, | |
| "loss": 1.6615, | |
| "num_input_tokens_seen": 10021568, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 0.19009124379702258, | |
| "grad_norm": 10.050972938537598, | |
| "learning_rate": 4.809918761005283e-05, | |
| "loss": 1.6453, | |
| "num_input_tokens_seen": 10294200, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 0.1950936449495758, | |
| "grad_norm": 12.779481887817383, | |
| "learning_rate": 4.80491635985273e-05, | |
| "loss": 1.6305, | |
| "num_input_tokens_seen": 10563664, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 0.20009604610212903, | |
| "grad_norm": 9.113444328308105, | |
| "learning_rate": 4.799913958700176e-05, | |
| "loss": 1.6374, | |
| "num_input_tokens_seen": 10833928, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 0.20509844725468224, | |
| "grad_norm": 9.230701446533203, | |
| "learning_rate": 4.794911557547624e-05, | |
| "loss": 1.633, | |
| "num_input_tokens_seen": 11098416, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 0.21010084840723547, | |
| "grad_norm": 7.994055271148682, | |
| "learning_rate": 4.78990915639507e-05, | |
| "loss": 1.6267, | |
| "num_input_tokens_seen": 11368280, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 0.2151032495597887, | |
| "grad_norm": 8.182880401611328, | |
| "learning_rate": 4.784906755242516e-05, | |
| "loss": 1.6199, | |
| "num_input_tokens_seen": 11634848, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 0.22010565071234192, | |
| "grad_norm": 8.053507804870605, | |
| "learning_rate": 4.779904354089963e-05, | |
| "loss": 1.6318, | |
| "num_input_tokens_seen": 11912456, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 0.22510805186489516, | |
| "grad_norm": 8.162029266357422, | |
| "learning_rate": 4.77490195293741e-05, | |
| "loss": 1.6156, | |
| "num_input_tokens_seen": 12177848, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 0.23011045301744837, | |
| "grad_norm": 7.447554588317871, | |
| "learning_rate": 4.769899551784857e-05, | |
| "loss": 1.6181, | |
| "num_input_tokens_seen": 12448808, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 0.2351128541700016, | |
| "grad_norm": 8.473464012145996, | |
| "learning_rate": 4.7648971506323036e-05, | |
| "loss": 1.6544, | |
| "num_input_tokens_seen": 12724056, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 0.24011525532255482, | |
| "grad_norm": 8.162822723388672, | |
| "learning_rate": 4.7598947494797506e-05, | |
| "loss": 1.6097, | |
| "num_input_tokens_seen": 12992248, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 0.24511765647510805, | |
| "grad_norm": 8.350125312805176, | |
| "learning_rate": 4.7548923483271976e-05, | |
| "loss": 1.5977, | |
| "num_input_tokens_seen": 13267848, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 0.25012005762766126, | |
| "grad_norm": 6.6851091384887695, | |
| "learning_rate": 4.749889947174644e-05, | |
| "loss": 1.6008, | |
| "num_input_tokens_seen": 13544424, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 0.2551224587802145, | |
| "grad_norm": 8.89284610748291, | |
| "learning_rate": 4.744887546022091e-05, | |
| "loss": 1.6099, | |
| "num_input_tokens_seen": 13818272, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 0.26012485993276774, | |
| "grad_norm": 8.554915428161621, | |
| "learning_rate": 4.739885144869537e-05, | |
| "loss": 1.6121, | |
| "num_input_tokens_seen": 14088432, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 0.265127261085321, | |
| "grad_norm": 7.6910529136657715, | |
| "learning_rate": 4.734882743716985e-05, | |
| "loss": 1.6073, | |
| "num_input_tokens_seen": 14362896, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 0.27012966223787416, | |
| "grad_norm": 6.856411457061768, | |
| "learning_rate": 4.729880342564431e-05, | |
| "loss": 1.604, | |
| "num_input_tokens_seen": 14629232, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 0.2751320633904274, | |
| "grad_norm": 8.767107009887695, | |
| "learning_rate": 4.7248779414118775e-05, | |
| "loss": 1.587, | |
| "num_input_tokens_seen": 14904336, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 0.28013446454298063, | |
| "grad_norm": 6.474203586578369, | |
| "learning_rate": 4.7198755402593245e-05, | |
| "loss": 1.5936, | |
| "num_input_tokens_seen": 15171200, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 0.28513686569553387, | |
| "grad_norm": 10.540902137756348, | |
| "learning_rate": 4.7148731391067715e-05, | |
| "loss": 1.5861, | |
| "num_input_tokens_seen": 15438392, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 0.2901392668480871, | |
| "grad_norm": 9.134960174560547, | |
| "learning_rate": 4.7098707379542185e-05, | |
| "loss": 1.5717, | |
| "num_input_tokens_seen": 15707768, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 0.2951416680006403, | |
| "grad_norm": 6.669952869415283, | |
| "learning_rate": 4.704868336801665e-05, | |
| "loss": 1.5989, | |
| "num_input_tokens_seen": 15980896, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 0.3001440691531935, | |
| "grad_norm": 6.744385242462158, | |
| "learning_rate": 4.699865935649112e-05, | |
| "loss": 1.5858, | |
| "num_input_tokens_seen": 16249024, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 0.30514647030574676, | |
| "grad_norm": 8.088431358337402, | |
| "learning_rate": 4.694863534496559e-05, | |
| "loss": 1.5612, | |
| "num_input_tokens_seen": 16516448, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 0.3101488714583, | |
| "grad_norm": 8.115087509155273, | |
| "learning_rate": 4.689861133344005e-05, | |
| "loss": 1.5995, | |
| "num_input_tokens_seen": 16788416, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 0.3151512726108532, | |
| "grad_norm": 8.037360191345215, | |
| "learning_rate": 4.684858732191452e-05, | |
| "loss": 1.5787, | |
| "num_input_tokens_seen": 17057432, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 0.3201536737634064, | |
| "grad_norm": 7.91753625869751, | |
| "learning_rate": 4.6798563310388984e-05, | |
| "loss": 1.5642, | |
| "num_input_tokens_seen": 17324440, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 0.32515607491595966, | |
| "grad_norm": 8.645842552185059, | |
| "learning_rate": 4.674853929886346e-05, | |
| "loss": 1.5685, | |
| "num_input_tokens_seen": 17597472, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 0.3301584760685129, | |
| "grad_norm": 7.731500148773193, | |
| "learning_rate": 4.6698515287337924e-05, | |
| "loss": 1.568, | |
| "num_input_tokens_seen": 17862696, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 0.33516087722106613, | |
| "grad_norm": 8.492512702941895, | |
| "learning_rate": 4.6648491275812394e-05, | |
| "loss": 1.5639, | |
| "num_input_tokens_seen": 18132104, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 0.3401632783736193, | |
| "grad_norm": 6.294936656951904, | |
| "learning_rate": 4.659846726428686e-05, | |
| "loss": 1.5651, | |
| "num_input_tokens_seen": 18398680, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 0.34516567952617255, | |
| "grad_norm": 6.749844551086426, | |
| "learning_rate": 4.654844325276133e-05, | |
| "loss": 1.5501, | |
| "num_input_tokens_seen": 18664584, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 0.3501680806787258, | |
| "grad_norm": 8.394216537475586, | |
| "learning_rate": 4.64984192412358e-05, | |
| "loss": 1.5655, | |
| "num_input_tokens_seen": 18935000, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 0.35517048183127903, | |
| "grad_norm": 7.8091607093811035, | |
| "learning_rate": 4.644839522971026e-05, | |
| "loss": 1.5545, | |
| "num_input_tokens_seen": 19215376, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 0.36017288298383227, | |
| "grad_norm": 8.398904800415039, | |
| "learning_rate": 4.639837121818473e-05, | |
| "loss": 1.5482, | |
| "num_input_tokens_seen": 19493104, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 0.36517528413638545, | |
| "grad_norm": 20.516576766967773, | |
| "learning_rate": 4.63483472066592e-05, | |
| "loss": 1.546, | |
| "num_input_tokens_seen": 19761160, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 0.3701776852889387, | |
| "grad_norm": 6.954863548278809, | |
| "learning_rate": 4.629832319513366e-05, | |
| "loss": 1.5581, | |
| "num_input_tokens_seen": 20034376, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 0.3751800864414919, | |
| "grad_norm": 7.736358642578125, | |
| "learning_rate": 4.624829918360813e-05, | |
| "loss": 1.5477, | |
| "num_input_tokens_seen": 20303800, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 0.38018248759404516, | |
| "grad_norm": 9.256717681884766, | |
| "learning_rate": 4.61982751720826e-05, | |
| "loss": 1.5379, | |
| "num_input_tokens_seen": 20577472, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 0.38518488874659834, | |
| "grad_norm": 7.987185478210449, | |
| "learning_rate": 4.614825116055707e-05, | |
| "loss": 1.5411, | |
| "num_input_tokens_seen": 20844712, | |
| "step": 38500 | |
| }, | |
| { | |
| "epoch": 0.3901872898991516, | |
| "grad_norm": 6.768152713775635, | |
| "learning_rate": 4.6098227149031536e-05, | |
| "loss": 1.5317, | |
| "num_input_tokens_seen": 21112208, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 0.3951896910517048, | |
| "grad_norm": 8.385108947753906, | |
| "learning_rate": 4.6048203137506006e-05, | |
| "loss": 1.522, | |
| "num_input_tokens_seen": 21378704, | |
| "step": 39500 | |
| }, | |
| { | |
| "epoch": 0.40019209220425805, | |
| "grad_norm": 6.166186809539795, | |
| "learning_rate": 4.5998179125980476e-05, | |
| "loss": 1.5167, | |
| "num_input_tokens_seen": 21648840, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 0.4051944933568113, | |
| "grad_norm": 6.178369045257568, | |
| "learning_rate": 4.594815511445494e-05, | |
| "loss": 1.5161, | |
| "num_input_tokens_seen": 21917272, | |
| "step": 40500 | |
| }, | |
| { | |
| "epoch": 0.4101968945093645, | |
| "grad_norm": 8.063447952270508, | |
| "learning_rate": 4.589813110292941e-05, | |
| "loss": 1.5335, | |
| "num_input_tokens_seen": 22185960, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 0.4151992956619177, | |
| "grad_norm": 9.296152114868164, | |
| "learning_rate": 4.584810709140387e-05, | |
| "loss": 1.5212, | |
| "num_input_tokens_seen": 22451608, | |
| "step": 41500 | |
| }, | |
| { | |
| "epoch": 0.42020169681447095, | |
| "grad_norm": 7.5216965675354, | |
| "learning_rate": 4.579808307987834e-05, | |
| "loss": 1.5275, | |
| "num_input_tokens_seen": 22720912, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 0.4252040979670242, | |
| "grad_norm": 7.390750408172607, | |
| "learning_rate": 4.574805906835281e-05, | |
| "loss": 1.5063, | |
| "num_input_tokens_seen": 22985424, | |
| "step": 42500 | |
| }, | |
| { | |
| "epoch": 0.4302064991195774, | |
| "grad_norm": 7.005786895751953, | |
| "learning_rate": 4.569803505682728e-05, | |
| "loss": 1.5166, | |
| "num_input_tokens_seen": 23258384, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 0.4352089002721306, | |
| "grad_norm": 8.231675148010254, | |
| "learning_rate": 4.5648011045301745e-05, | |
| "loss": 1.5198, | |
| "num_input_tokens_seen": 23531648, | |
| "step": 43500 | |
| }, | |
| { | |
| "epoch": 0.44021130142468384, | |
| "grad_norm": 8.228809356689453, | |
| "learning_rate": 4.5597987033776215e-05, | |
| "loss": 1.4984, | |
| "num_input_tokens_seen": 23798648, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 0.4452137025772371, | |
| "grad_norm": 7.101296901702881, | |
| "learning_rate": 4.5547963022250685e-05, | |
| "loss": 1.5261, | |
| "num_input_tokens_seen": 24061032, | |
| "step": 44500 | |
| }, | |
| { | |
| "epoch": 0.4502161037297903, | |
| "grad_norm": 7.954161167144775, | |
| "learning_rate": 4.549793901072515e-05, | |
| "loss": 1.5011, | |
| "num_input_tokens_seen": 24329816, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 0.4552185048823435, | |
| "grad_norm": 6.646183490753174, | |
| "learning_rate": 4.544791499919962e-05, | |
| "loss": 1.5281, | |
| "num_input_tokens_seen": 24600296, | |
| "step": 45500 | |
| }, | |
| { | |
| "epoch": 0.46022090603489674, | |
| "grad_norm": 6.38918924331665, | |
| "learning_rate": 4.539789098767409e-05, | |
| "loss": 1.5084, | |
| "num_input_tokens_seen": 24864680, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 0.46522330718745, | |
| "grad_norm": 9.28714656829834, | |
| "learning_rate": 4.534786697614855e-05, | |
| "loss": 1.5132, | |
| "num_input_tokens_seen": 25138944, | |
| "step": 46500 | |
| }, | |
| { | |
| "epoch": 0.4702257083400032, | |
| "grad_norm": 7.2790937423706055, | |
| "learning_rate": 4.529784296462302e-05, | |
| "loss": 1.4851, | |
| "num_input_tokens_seen": 25405872, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 0.47522810949255645, | |
| "grad_norm": 6.309217929840088, | |
| "learning_rate": 4.5247818953097484e-05, | |
| "loss": 1.4966, | |
| "num_input_tokens_seen": 25672064, | |
| "step": 47500 | |
| }, | |
| { | |
| "epoch": 0.48023051064510963, | |
| "grad_norm": 5.927547931671143, | |
| "learning_rate": 4.519779494157196e-05, | |
| "loss": 1.4871, | |
| "num_input_tokens_seen": 25944256, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 0.48523291179766287, | |
| "grad_norm": 7.525789260864258, | |
| "learning_rate": 4.5147770930046424e-05, | |
| "loss": 1.5077, | |
| "num_input_tokens_seen": 26218880, | |
| "step": 48500 | |
| }, | |
| { | |
| "epoch": 0.4902353129502161, | |
| "grad_norm": 7.0813517570495605, | |
| "learning_rate": 4.5097746918520894e-05, | |
| "loss": 1.5036, | |
| "num_input_tokens_seen": 26492264, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 0.49523771410276934, | |
| "grad_norm": 10.278409004211426, | |
| "learning_rate": 4.504772290699536e-05, | |
| "loss": 1.4888, | |
| "num_input_tokens_seen": 26761928, | |
| "step": 49500 | |
| }, | |
| { | |
| "epoch": 0.5002401152553225, | |
| "grad_norm": 10.73009967803955, | |
| "learning_rate": 4.499769889546983e-05, | |
| "loss": 1.4941, | |
| "num_input_tokens_seen": 27033072, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 0.5052425164078758, | |
| "grad_norm": 8.047172546386719, | |
| "learning_rate": 4.49476748839443e-05, | |
| "loss": 1.4882, | |
| "num_input_tokens_seen": 27297448, | |
| "step": 50500 | |
| }, | |
| { | |
| "epoch": 0.510244917560429, | |
| "grad_norm": 7.059236526489258, | |
| "learning_rate": 4.489765087241876e-05, | |
| "loss": 1.5093, | |
| "num_input_tokens_seen": 27572696, | |
| "step": 51000 | |
| }, | |
| { | |
| "epoch": 0.5152473187129822, | |
| "grad_norm": 6.407097816467285, | |
| "learning_rate": 4.484762686089323e-05, | |
| "loss": 1.4841, | |
| "num_input_tokens_seen": 27850272, | |
| "step": 51500 | |
| }, | |
| { | |
| "epoch": 0.5202497198655355, | |
| "grad_norm": 7.751036167144775, | |
| "learning_rate": 4.47976028493677e-05, | |
| "loss": 1.4797, | |
| "num_input_tokens_seen": 28125984, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 0.5252521210180887, | |
| "grad_norm": 7.869476795196533, | |
| "learning_rate": 4.474757883784217e-05, | |
| "loss": 1.4826, | |
| "num_input_tokens_seen": 28400064, | |
| "step": 52500 | |
| }, | |
| { | |
| "epoch": 0.530254522170642, | |
| "grad_norm": 6.702480316162109, | |
| "learning_rate": 4.4697554826316633e-05, | |
| "loss": 1.4888, | |
| "num_input_tokens_seen": 28668520, | |
| "step": 53000 | |
| }, | |
| { | |
| "epoch": 0.5352569233231952, | |
| "grad_norm": 5.916720867156982, | |
| "learning_rate": 4.46475308147911e-05, | |
| "loss": 1.4984, | |
| "num_input_tokens_seen": 28938448, | |
| "step": 53500 | |
| }, | |
| { | |
| "epoch": 0.5402593244757483, | |
| "grad_norm": 8.540026664733887, | |
| "learning_rate": 4.459750680326557e-05, | |
| "loss": 1.5038, | |
| "num_input_tokens_seen": 29207464, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 0.5452617256283016, | |
| "grad_norm": 6.425217151641846, | |
| "learning_rate": 4.4547482791740036e-05, | |
| "loss": 1.4771, | |
| "num_input_tokens_seen": 29476896, | |
| "step": 54500 | |
| }, | |
| { | |
| "epoch": 0.5502641267808548, | |
| "grad_norm": 5.669241428375244, | |
| "learning_rate": 4.4497458780214506e-05, | |
| "loss": 1.4618, | |
| "num_input_tokens_seen": 29744952, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 0.555266527933408, | |
| "grad_norm": 6.841111660003662, | |
| "learning_rate": 4.444743476868897e-05, | |
| "loss": 1.4899, | |
| "num_input_tokens_seen": 30012448, | |
| "step": 55500 | |
| }, | |
| { | |
| "epoch": 0.5602689290859613, | |
| "grad_norm": 6.233266353607178, | |
| "learning_rate": 4.439741075716344e-05, | |
| "loss": 1.5034, | |
| "num_input_tokens_seen": 30282976, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 0.5652713302385145, | |
| "grad_norm": 6.895115375518799, | |
| "learning_rate": 4.434738674563791e-05, | |
| "loss": 1.493, | |
| "num_input_tokens_seen": 30555656, | |
| "step": 56500 | |
| }, | |
| { | |
| "epoch": 0.5702737313910677, | |
| "grad_norm": 6.7635674476623535, | |
| "learning_rate": 4.429736273411237e-05, | |
| "loss": 1.4729, | |
| "num_input_tokens_seen": 30825952, | |
| "step": 57000 | |
| }, | |
| { | |
| "epoch": 0.575276132543621, | |
| "grad_norm": 8.052691459655762, | |
| "learning_rate": 4.424733872258684e-05, | |
| "loss": 1.4775, | |
| "num_input_tokens_seen": 31095152, | |
| "step": 57500 | |
| }, | |
| { | |
| "epoch": 0.5802785336961742, | |
| "grad_norm": 7.0079545974731445, | |
| "learning_rate": 4.419731471106131e-05, | |
| "loss": 1.4726, | |
| "num_input_tokens_seen": 31368672, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 0.5852809348487273, | |
| "grad_norm": 5.913256645202637, | |
| "learning_rate": 4.414729069953578e-05, | |
| "loss": 1.4758, | |
| "num_input_tokens_seen": 31642464, | |
| "step": 58500 | |
| }, | |
| { | |
| "epoch": 0.5902833360012806, | |
| "grad_norm": 9.248602867126465, | |
| "learning_rate": 4.4097266688010246e-05, | |
| "loss": 1.4534, | |
| "num_input_tokens_seen": 31913336, | |
| "step": 59000 | |
| }, | |
| { | |
| "epoch": 0.5952857371538338, | |
| "grad_norm": 6.6079936027526855, | |
| "learning_rate": 4.4047242676484716e-05, | |
| "loss": 1.4581, | |
| "num_input_tokens_seen": 32186344, | |
| "step": 59500 | |
| }, | |
| { | |
| "epoch": 0.600288138306387, | |
| "grad_norm": 6.47177791595459, | |
| "learning_rate": 4.3997218664959185e-05, | |
| "loss": 1.4505, | |
| "num_input_tokens_seen": 32460808, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 0.6052905394589403, | |
| "grad_norm": 6.106129169464111, | |
| "learning_rate": 4.394719465343365e-05, | |
| "loss": 1.4701, | |
| "num_input_tokens_seen": 32728200, | |
| "step": 60500 | |
| }, | |
| { | |
| "epoch": 0.6102929406114935, | |
| "grad_norm": 8.406516075134277, | |
| "learning_rate": 4.389717064190812e-05, | |
| "loss": 1.4591, | |
| "num_input_tokens_seen": 33005896, | |
| "step": 61000 | |
| }, | |
| { | |
| "epoch": 0.6152953417640468, | |
| "grad_norm": 9.166064262390137, | |
| "learning_rate": 4.384714663038258e-05, | |
| "loss": 1.4875, | |
| "num_input_tokens_seen": 33269392, | |
| "step": 61500 | |
| }, | |
| { | |
| "epoch": 0.6202977429166, | |
| "grad_norm": 7.39436149597168, | |
| "learning_rate": 4.379712261885706e-05, | |
| "loss": 1.4563, | |
| "num_input_tokens_seen": 33541424, | |
| "step": 62000 | |
| }, | |
| { | |
| "epoch": 0.6253001440691532, | |
| "grad_norm": 5.612057685852051, | |
| "learning_rate": 4.374709860733152e-05, | |
| "loss": 1.4761, | |
| "num_input_tokens_seen": 33806192, | |
| "step": 62500 | |
| }, | |
| { | |
| "epoch": 0.6303025452217064, | |
| "grad_norm": 7.853974342346191, | |
| "learning_rate": 4.3697074595805985e-05, | |
| "loss": 1.4617, | |
| "num_input_tokens_seen": 34076920, | |
| "step": 63000 | |
| }, | |
| { | |
| "epoch": 0.6353049463742596, | |
| "grad_norm": 8.041677474975586, | |
| "learning_rate": 4.3647050584280455e-05, | |
| "loss": 1.442, | |
| "num_input_tokens_seen": 34340464, | |
| "step": 63500 | |
| }, | |
| { | |
| "epoch": 0.6403073475268128, | |
| "grad_norm": 7.188261985778809, | |
| "learning_rate": 4.3597026572754925e-05, | |
| "loss": 1.4746, | |
| "num_input_tokens_seen": 34608464, | |
| "step": 64000 | |
| }, | |
| { | |
| "epoch": 0.6453097486793661, | |
| "grad_norm": 7.935949802398682, | |
| "learning_rate": 4.3547002561229395e-05, | |
| "loss": 1.4655, | |
| "num_input_tokens_seen": 34878936, | |
| "step": 64500 | |
| }, | |
| { | |
| "epoch": 0.6503121498319193, | |
| "grad_norm": 6.211294651031494, | |
| "learning_rate": 4.349697854970386e-05, | |
| "loss": 1.4481, | |
| "num_input_tokens_seen": 35148000, | |
| "step": 65000 | |
| }, | |
| { | |
| "epoch": 0.6553145509844726, | |
| "grad_norm": 8.786713600158691, | |
| "learning_rate": 4.344695453817833e-05, | |
| "loss": 1.4541, | |
| "num_input_tokens_seen": 35425104, | |
| "step": 65500 | |
| }, | |
| { | |
| "epoch": 0.6603169521370258, | |
| "grad_norm": 7.866344928741455, | |
| "learning_rate": 4.33969305266528e-05, | |
| "loss": 1.4537, | |
| "num_input_tokens_seen": 35693952, | |
| "step": 66000 | |
| }, | |
| { | |
| "epoch": 0.665319353289579, | |
| "grad_norm": 7.549289703369141, | |
| "learning_rate": 4.334690651512726e-05, | |
| "loss": 1.4569, | |
| "num_input_tokens_seen": 35966608, | |
| "step": 66500 | |
| }, | |
| { | |
| "epoch": 0.6703217544421323, | |
| "grad_norm": 6.588021278381348, | |
| "learning_rate": 4.329688250360173e-05, | |
| "loss": 1.4477, | |
| "num_input_tokens_seen": 36238976, | |
| "step": 67000 | |
| }, | |
| { | |
| "epoch": 0.6753241555946854, | |
| "grad_norm": 7.3665852546691895, | |
| "learning_rate": 4.32468584920762e-05, | |
| "loss": 1.4524, | |
| "num_input_tokens_seen": 36504184, | |
| "step": 67500 | |
| }, | |
| { | |
| "epoch": 0.6803265567472386, | |
| "grad_norm": 8.99618911743164, | |
| "learning_rate": 4.319683448055067e-05, | |
| "loss": 1.4559, | |
| "num_input_tokens_seen": 36767640, | |
| "step": 68000 | |
| }, | |
| { | |
| "epoch": 0.6853289578997919, | |
| "grad_norm": 8.354264259338379, | |
| "learning_rate": 4.3146810469025134e-05, | |
| "loss": 1.4552, | |
| "num_input_tokens_seen": 37033640, | |
| "step": 68500 | |
| }, | |
| { | |
| "epoch": 0.6903313590523451, | |
| "grad_norm": 6.712357521057129, | |
| "learning_rate": 4.30967864574996e-05, | |
| "loss": 1.4428, | |
| "num_input_tokens_seen": 37301576, | |
| "step": 69000 | |
| }, | |
| { | |
| "epoch": 0.6953337602048983, | |
| "grad_norm": 6.98289680480957, | |
| "learning_rate": 4.3046762445974074e-05, | |
| "loss": 1.4389, | |
| "num_input_tokens_seen": 37575040, | |
| "step": 69500 | |
| }, | |
| { | |
| "epoch": 0.7003361613574516, | |
| "grad_norm": 7.0615410804748535, | |
| "learning_rate": 4.299673843444854e-05, | |
| "loss": 1.4323, | |
| "num_input_tokens_seen": 37844304, | |
| "step": 70000 | |
| }, | |
| { | |
| "epoch": 0.7053385625100048, | |
| "grad_norm": 7.445618152618408, | |
| "learning_rate": 4.294671442292301e-05, | |
| "loss": 1.437, | |
| "num_input_tokens_seen": 38118688, | |
| "step": 70500 | |
| }, | |
| { | |
| "epoch": 0.7103409636625581, | |
| "grad_norm": 5.989320278167725, | |
| "learning_rate": 4.289669041139747e-05, | |
| "loss": 1.4525, | |
| "num_input_tokens_seen": 38387808, | |
| "step": 71000 | |
| }, | |
| { | |
| "epoch": 0.7153433648151113, | |
| "grad_norm": 6.6483283042907715, | |
| "learning_rate": 4.284666639987194e-05, | |
| "loss": 1.4389, | |
| "num_input_tokens_seen": 38648568, | |
| "step": 71500 | |
| }, | |
| { | |
| "epoch": 0.7203457659676645, | |
| "grad_norm": 7.410543918609619, | |
| "learning_rate": 4.279664238834641e-05, | |
| "loss": 1.4205, | |
| "num_input_tokens_seen": 38924584, | |
| "step": 72000 | |
| }, | |
| { | |
| "epoch": 0.7253481671202177, | |
| "grad_norm": 6.462342739105225, | |
| "learning_rate": 4.274661837682087e-05, | |
| "loss": 1.4286, | |
| "num_input_tokens_seen": 39189080, | |
| "step": 72500 | |
| }, | |
| { | |
| "epoch": 0.7303505682727709, | |
| "grad_norm": 6.404929161071777, | |
| "learning_rate": 4.269659436529534e-05, | |
| "loss": 1.4475, | |
| "num_input_tokens_seen": 39456616, | |
| "step": 73000 | |
| }, | |
| { | |
| "epoch": 0.7353529694253241, | |
| "grad_norm": 7.46643590927124, | |
| "learning_rate": 4.264657035376981e-05, | |
| "loss": 1.4374, | |
| "num_input_tokens_seen": 39729608, | |
| "step": 73500 | |
| }, | |
| { | |
| "epoch": 0.7403553705778774, | |
| "grad_norm": 6.626592636108398, | |
| "learning_rate": 4.259654634224428e-05, | |
| "loss": 1.4194, | |
| "num_input_tokens_seen": 39998368, | |
| "step": 74000 | |
| }, | |
| { | |
| "epoch": 0.7453577717304306, | |
| "grad_norm": 6.95764684677124, | |
| "learning_rate": 4.2546522330718746e-05, | |
| "loss": 1.4276, | |
| "num_input_tokens_seen": 40271760, | |
| "step": 74500 | |
| }, | |
| { | |
| "epoch": 0.7503601728829838, | |
| "grad_norm": 5.589132785797119, | |
| "learning_rate": 4.249649831919321e-05, | |
| "loss": 1.4304, | |
| "num_input_tokens_seen": 40539392, | |
| "step": 75000 | |
| }, | |
| { | |
| "epoch": 0.7553625740355371, | |
| "grad_norm": 6.854423999786377, | |
| "learning_rate": 4.2446474307667686e-05, | |
| "loss": 1.4259, | |
| "num_input_tokens_seen": 40805536, | |
| "step": 75500 | |
| }, | |
| { | |
| "epoch": 0.7603649751880903, | |
| "grad_norm": 7.378007888793945, | |
| "learning_rate": 4.239645029614215e-05, | |
| "loss": 1.4145, | |
| "num_input_tokens_seen": 41074104, | |
| "step": 76000 | |
| }, | |
| { | |
| "epoch": 0.7653673763406436, | |
| "grad_norm": 6.971609592437744, | |
| "learning_rate": 4.234642628461662e-05, | |
| "loss": 1.421, | |
| "num_input_tokens_seen": 41344576, | |
| "step": 76500 | |
| }, | |
| { | |
| "epoch": 0.7703697774931967, | |
| "grad_norm": 7.222036838531494, | |
| "learning_rate": 4.229640227309108e-05, | |
| "loss": 1.4314, | |
| "num_input_tokens_seen": 41615376, | |
| "step": 77000 | |
| }, | |
| { | |
| "epoch": 0.7753721786457499, | |
| "grad_norm": 6.561624526977539, | |
| "learning_rate": 4.224637826156556e-05, | |
| "loss": 1.409, | |
| "num_input_tokens_seen": 41886272, | |
| "step": 77500 | |
| }, | |
| { | |
| "epoch": 0.7803745797983032, | |
| "grad_norm": 6.4644646644592285, | |
| "learning_rate": 4.219635425004002e-05, | |
| "loss": 1.4101, | |
| "num_input_tokens_seen": 42156112, | |
| "step": 78000 | |
| }, | |
| { | |
| "epoch": 0.7853769809508564, | |
| "grad_norm": 6.069692611694336, | |
| "learning_rate": 4.2146330238514485e-05, | |
| "loss": 1.4213, | |
| "num_input_tokens_seen": 42424448, | |
| "step": 78500 | |
| }, | |
| { | |
| "epoch": 0.7903793821034096, | |
| "grad_norm": 6.701622486114502, | |
| "learning_rate": 4.2096306226988955e-05, | |
| "loss": 1.4204, | |
| "num_input_tokens_seen": 42685864, | |
| "step": 79000 | |
| }, | |
| { | |
| "epoch": 0.7953817832559629, | |
| "grad_norm": 8.732488632202148, | |
| "learning_rate": 4.2046282215463425e-05, | |
| "loss": 1.4225, | |
| "num_input_tokens_seen": 42959264, | |
| "step": 79500 | |
| }, | |
| { | |
| "epoch": 0.8003841844085161, | |
| "grad_norm": 7.264562129974365, | |
| "learning_rate": 4.1996258203937895e-05, | |
| "loss": 1.4044, | |
| "num_input_tokens_seen": 43231192, | |
| "step": 80000 | |
| }, | |
| { | |
| "epoch": 0.8053865855610693, | |
| "grad_norm": 7.394875526428223, | |
| "learning_rate": 4.194623419241236e-05, | |
| "loss": 1.4211, | |
| "num_input_tokens_seen": 43500168, | |
| "step": 80500 | |
| }, | |
| { | |
| "epoch": 0.8103889867136226, | |
| "grad_norm": 6.593264102935791, | |
| "learning_rate": 4.189621018088683e-05, | |
| "loss": 1.4179, | |
| "num_input_tokens_seen": 43774064, | |
| "step": 81000 | |
| }, | |
| { | |
| "epoch": 0.8153913878661757, | |
| "grad_norm": 7.966070175170898, | |
| "learning_rate": 4.18461861693613e-05, | |
| "loss": 1.4439, | |
| "num_input_tokens_seen": 44046680, | |
| "step": 81500 | |
| }, | |
| { | |
| "epoch": 0.820393789018729, | |
| "grad_norm": 10.988821029663086, | |
| "learning_rate": 4.179616215783576e-05, | |
| "loss": 1.43, | |
| "num_input_tokens_seen": 44320912, | |
| "step": 82000 | |
| }, | |
| { | |
| "epoch": 0.8253961901712822, | |
| "grad_norm": 6.874449729919434, | |
| "learning_rate": 4.174613814631023e-05, | |
| "loss": 1.4199, | |
| "num_input_tokens_seen": 44593488, | |
| "step": 82500 | |
| }, | |
| { | |
| "epoch": 0.8303985913238354, | |
| "grad_norm": 7.1776838302612305, | |
| "learning_rate": 4.1696114134784694e-05, | |
| "loss": 1.4131, | |
| "num_input_tokens_seen": 44862096, | |
| "step": 83000 | |
| }, | |
| { | |
| "epoch": 0.8354009924763887, | |
| "grad_norm": 7.381138801574707, | |
| "learning_rate": 4.164609012325917e-05, | |
| "loss": 1.4059, | |
| "num_input_tokens_seen": 45130632, | |
| "step": 83500 | |
| }, | |
| { | |
| "epoch": 0.8404033936289419, | |
| "grad_norm": 8.17155933380127, | |
| "learning_rate": 4.1596066111733634e-05, | |
| "loss": 1.4425, | |
| "num_input_tokens_seen": 45400856, | |
| "step": 84000 | |
| }, | |
| { | |
| "epoch": 0.8454057947814951, | |
| "grad_norm": 6.636998176574707, | |
| "learning_rate": 4.1546042100208104e-05, | |
| "loss": 1.3979, | |
| "num_input_tokens_seen": 45671560, | |
| "step": 84500 | |
| }, | |
| { | |
| "epoch": 0.8504081959340484, | |
| "grad_norm": 5.552203178405762, | |
| "learning_rate": 4.149601808868257e-05, | |
| "loss": 1.4255, | |
| "num_input_tokens_seen": 45945496, | |
| "step": 85000 | |
| }, | |
| { | |
| "epoch": 0.8554105970866016, | |
| "grad_norm": 7.160405158996582, | |
| "learning_rate": 4.144599407715704e-05, | |
| "loss": 1.4122, | |
| "num_input_tokens_seen": 46213464, | |
| "step": 85500 | |
| }, | |
| { | |
| "epoch": 0.8604129982391548, | |
| "grad_norm": 7.1668381690979, | |
| "learning_rate": 4.139597006563151e-05, | |
| "loss": 1.407, | |
| "num_input_tokens_seen": 46482056, | |
| "step": 86000 | |
| }, | |
| { | |
| "epoch": 0.865415399391708, | |
| "grad_norm": 6.595818996429443, | |
| "learning_rate": 4.134594605410597e-05, | |
| "loss": 1.4052, | |
| "num_input_tokens_seen": 46754792, | |
| "step": 86500 | |
| }, | |
| { | |
| "epoch": 0.8704178005442612, | |
| "grad_norm": 7.962093830108643, | |
| "learning_rate": 4.129592204258044e-05, | |
| "loss": 1.3779, | |
| "num_input_tokens_seen": 47023888, | |
| "step": 87000 | |
| }, | |
| { | |
| "epoch": 0.8754202016968144, | |
| "grad_norm": 5.4436421394348145, | |
| "learning_rate": 4.124589803105491e-05, | |
| "loss": 1.395, | |
| "num_input_tokens_seen": 47294240, | |
| "step": 87500 | |
| }, | |
| { | |
| "epoch": 0.8804226028493677, | |
| "grad_norm": 9.327848434448242, | |
| "learning_rate": 4.119587401952937e-05, | |
| "loss": 1.4015, | |
| "num_input_tokens_seen": 47564360, | |
| "step": 88000 | |
| }, | |
| { | |
| "epoch": 0.8854250040019209, | |
| "grad_norm": 5.366121768951416, | |
| "learning_rate": 4.114585000800384e-05, | |
| "loss": 1.4136, | |
| "num_input_tokens_seen": 47846352, | |
| "step": 88500 | |
| }, | |
| { | |
| "epoch": 0.8904274051544742, | |
| "grad_norm": 5.672398090362549, | |
| "learning_rate": 4.109582599647831e-05, | |
| "loss": 1.395, | |
| "num_input_tokens_seen": 48117352, | |
| "step": 89000 | |
| }, | |
| { | |
| "epoch": 0.8954298063070274, | |
| "grad_norm": 7.147487163543701, | |
| "learning_rate": 4.104580198495278e-05, | |
| "loss": 1.3946, | |
| "num_input_tokens_seen": 48390832, | |
| "step": 89500 | |
| }, | |
| { | |
| "epoch": 0.9004322074595806, | |
| "grad_norm": 9.567891120910645, | |
| "learning_rate": 4.0995777973427246e-05, | |
| "loss": 1.4111, | |
| "num_input_tokens_seen": 48667984, | |
| "step": 90000 | |
| }, | |
| { | |
| "epoch": 0.9054346086121339, | |
| "grad_norm": 7.761517524719238, | |
| "learning_rate": 4.0945753961901716e-05, | |
| "loss": 1.3972, | |
| "num_input_tokens_seen": 48941240, | |
| "step": 90500 | |
| }, | |
| { | |
| "epoch": 0.910437009764687, | |
| "grad_norm": 8.4068603515625, | |
| "learning_rate": 4.089572995037618e-05, | |
| "loss": 1.3894, | |
| "num_input_tokens_seen": 49212696, | |
| "step": 91000 | |
| }, | |
| { | |
| "epoch": 0.9154394109172402, | |
| "grad_norm": 5.621284008026123, | |
| "learning_rate": 4.084570593885065e-05, | |
| "loss": 1.3823, | |
| "num_input_tokens_seen": 49481760, | |
| "step": 91500 | |
| }, | |
| { | |
| "epoch": 0.9204418120697935, | |
| "grad_norm": 8.205471992492676, | |
| "learning_rate": 4.079568192732512e-05, | |
| "loss": 1.3886, | |
| "num_input_tokens_seen": 49753376, | |
| "step": 92000 | |
| }, | |
| { | |
| "epoch": 0.9254442132223467, | |
| "grad_norm": 8.143417358398438, | |
| "learning_rate": 4.074565791579958e-05, | |
| "loss": 1.384, | |
| "num_input_tokens_seen": 50025128, | |
| "step": 92500 | |
| }, | |
| { | |
| "epoch": 0.9304466143749, | |
| "grad_norm": 7.172451496124268, | |
| "learning_rate": 4.069563390427405e-05, | |
| "loss": 1.4011, | |
| "num_input_tokens_seen": 50301448, | |
| "step": 93000 | |
| }, | |
| { | |
| "epoch": 0.9354490155274532, | |
| "grad_norm": 7.71168851852417, | |
| "learning_rate": 4.064560989274852e-05, | |
| "loss": 1.3702, | |
| "num_input_tokens_seen": 50569616, | |
| "step": 93500 | |
| }, | |
| { | |
| "epoch": 0.9404514166800064, | |
| "grad_norm": 7.981653213500977, | |
| "learning_rate": 4.059558588122299e-05, | |
| "loss": 1.3897, | |
| "num_input_tokens_seen": 50842808, | |
| "step": 94000 | |
| }, | |
| { | |
| "epoch": 0.9454538178325597, | |
| "grad_norm": 6.760748386383057, | |
| "learning_rate": 4.0545561869697455e-05, | |
| "loss": 1.3878, | |
| "num_input_tokens_seen": 51121760, | |
| "step": 94500 | |
| }, | |
| { | |
| "epoch": 0.9504562189851129, | |
| "grad_norm": 7.034352779388428, | |
| "learning_rate": 4.0495537858171925e-05, | |
| "loss": 1.4073, | |
| "num_input_tokens_seen": 51392648, | |
| "step": 95000 | |
| }, | |
| { | |
| "epoch": 0.955458620137666, | |
| "grad_norm": 6.021711349487305, | |
| "learning_rate": 4.0445513846646395e-05, | |
| "loss": 1.4106, | |
| "num_input_tokens_seen": 51657888, | |
| "step": 95500 | |
| }, | |
| { | |
| "epoch": 0.9604610212902193, | |
| "grad_norm": 7.470587253570557, | |
| "learning_rate": 4.039548983512086e-05, | |
| "loss": 1.3982, | |
| "num_input_tokens_seen": 51934352, | |
| "step": 96000 | |
| }, | |
| { | |
| "epoch": 0.9654634224427725, | |
| "grad_norm": 6.424021244049072, | |
| "learning_rate": 4.034546582359533e-05, | |
| "loss": 1.3788, | |
| "num_input_tokens_seen": 52209064, | |
| "step": 96500 | |
| }, | |
| { | |
| "epoch": 0.9704658235953257, | |
| "grad_norm": 7.6357197761535645, | |
| "learning_rate": 4.02954418120698e-05, | |
| "loss": 1.3714, | |
| "num_input_tokens_seen": 52474304, | |
| "step": 97000 | |
| }, | |
| { | |
| "epoch": 0.975468224747879, | |
| "grad_norm": 8.156658172607422, | |
| "learning_rate": 4.024541780054426e-05, | |
| "loss": 1.3992, | |
| "num_input_tokens_seen": 52747832, | |
| "step": 97500 | |
| }, | |
| { | |
| "epoch": 0.9804706259004322, | |
| "grad_norm": 6.052001953125, | |
| "learning_rate": 4.019539378901873e-05, | |
| "loss": 1.3758, | |
| "num_input_tokens_seen": 53024352, | |
| "step": 98000 | |
| }, | |
| { | |
| "epoch": 0.9854730270529855, | |
| "grad_norm": 6.635683059692383, | |
| "learning_rate": 4.0145369777493194e-05, | |
| "loss": 1.3868, | |
| "num_input_tokens_seen": 53296880, | |
| "step": 98500 | |
| }, | |
| { | |
| "epoch": 0.9904754282055387, | |
| "grad_norm": 6.532413482666016, | |
| "learning_rate": 4.009534576596767e-05, | |
| "loss": 1.3582, | |
| "num_input_tokens_seen": 53571800, | |
| "step": 99000 | |
| }, | |
| { | |
| "epoch": 0.9954778293580919, | |
| "grad_norm": 6.029451370239258, | |
| "learning_rate": 4.0045321754442134e-05, | |
| "loss": 1.3772, | |
| "num_input_tokens_seen": 53843520, | |
| "step": 99500 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_loss": 1.2156304121017456, | |
| "eval_runtime": 188.3426, | |
| "eval_samples_per_second": 1061.39, | |
| "eval_steps_per_second": 132.678, | |
| "num_input_tokens_seen": 54090088, | |
| "step": 99952 | |
| }, | |
| { | |
| "epoch": 1.000480230510645, | |
| "grad_norm": 5.276439189910889, | |
| "learning_rate": 3.9995297742916604e-05, | |
| "loss": 1.2803, | |
| "num_input_tokens_seen": 54117480, | |
| "step": 100000 | |
| }, | |
| { | |
| "epoch": 1.0054826316631984, | |
| "grad_norm": 6.911218643188477, | |
| "learning_rate": 3.994527373139107e-05, | |
| "loss": 1.2695, | |
| "num_input_tokens_seen": 54389512, | |
| "step": 100500 | |
| }, | |
| { | |
| "epoch": 1.0104850328157515, | |
| "grad_norm": 9.460619926452637, | |
| "learning_rate": 3.989524971986554e-05, | |
| "loss": 1.2787, | |
| "num_input_tokens_seen": 54665304, | |
| "step": 101000 | |
| }, | |
| { | |
| "epoch": 1.0154874339683049, | |
| "grad_norm": 7.135129928588867, | |
| "learning_rate": 3.984522570834001e-05, | |
| "loss": 1.2616, | |
| "num_input_tokens_seen": 54935144, | |
| "step": 101500 | |
| }, | |
| { | |
| "epoch": 1.020489835120858, | |
| "grad_norm": 7.705801010131836, | |
| "learning_rate": 3.979520169681447e-05, | |
| "loss": 1.2673, | |
| "num_input_tokens_seen": 55209312, | |
| "step": 102000 | |
| }, | |
| { | |
| "epoch": 1.0254922362734111, | |
| "grad_norm": 7.493370532989502, | |
| "learning_rate": 3.974517768528894e-05, | |
| "loss": 1.2623, | |
| "num_input_tokens_seen": 55478440, | |
| "step": 102500 | |
| }, | |
| { | |
| "epoch": 1.0304946374259645, | |
| "grad_norm": 6.460716724395752, | |
| "learning_rate": 3.969515367376341e-05, | |
| "loss": 1.2881, | |
| "num_input_tokens_seen": 55752896, | |
| "step": 103000 | |
| }, | |
| { | |
| "epoch": 1.0354970385785176, | |
| "grad_norm": 7.391408443450928, | |
| "learning_rate": 3.964512966223788e-05, | |
| "loss": 1.2692, | |
| "num_input_tokens_seen": 56028536, | |
| "step": 103500 | |
| }, | |
| { | |
| "epoch": 1.040499439731071, | |
| "grad_norm": 8.04489803314209, | |
| "learning_rate": 3.9595105650712343e-05, | |
| "loss": 1.2582, | |
| "num_input_tokens_seen": 56297360, | |
| "step": 104000 | |
| }, | |
| { | |
| "epoch": 1.045501840883624, | |
| "grad_norm": 6.487476348876953, | |
| "learning_rate": 3.9545081639186807e-05, | |
| "loss": 1.2791, | |
| "num_input_tokens_seen": 56565576, | |
| "step": 104500 | |
| }, | |
| { | |
| "epoch": 1.0505042420361774, | |
| "grad_norm": 7.118215084075928, | |
| "learning_rate": 3.949505762766128e-05, | |
| "loss": 1.2822, | |
| "num_input_tokens_seen": 56841624, | |
| "step": 105000 | |
| }, | |
| { | |
| "epoch": 1.0555066431887306, | |
| "grad_norm": 6.419320583343506, | |
| "learning_rate": 3.9445033616135746e-05, | |
| "loss": 1.2533, | |
| "num_input_tokens_seen": 57114504, | |
| "step": 105500 | |
| }, | |
| { | |
| "epoch": 1.060509044341284, | |
| "grad_norm": 6.287978649139404, | |
| "learning_rate": 3.9395009604610216e-05, | |
| "loss": 1.2735, | |
| "num_input_tokens_seen": 57384112, | |
| "step": 106000 | |
| }, | |
| { | |
| "epoch": 1.065511445493837, | |
| "grad_norm": 6.397841930389404, | |
| "learning_rate": 3.934498559308468e-05, | |
| "loss": 1.2715, | |
| "num_input_tokens_seen": 57658904, | |
| "step": 106500 | |
| }, | |
| { | |
| "epoch": 1.0705138466463904, | |
| "grad_norm": 6.377140998840332, | |
| "learning_rate": 3.929496158155915e-05, | |
| "loss": 1.2764, | |
| "num_input_tokens_seen": 57930592, | |
| "step": 107000 | |
| }, | |
| { | |
| "epoch": 1.0755162477989435, | |
| "grad_norm": 7.9464850425720215, | |
| "learning_rate": 3.924493757003362e-05, | |
| "loss": 1.2818, | |
| "num_input_tokens_seen": 58206432, | |
| "step": 107500 | |
| }, | |
| { | |
| "epoch": 1.0805186489514966, | |
| "grad_norm": 5.806307792663574, | |
| "learning_rate": 3.919491355850808e-05, | |
| "loss": 1.2724, | |
| "num_input_tokens_seen": 58476704, | |
| "step": 108000 | |
| }, | |
| { | |
| "epoch": 1.08552105010405, | |
| "grad_norm": 7.807882308959961, | |
| "learning_rate": 3.914488954698255e-05, | |
| "loss": 1.2901, | |
| "num_input_tokens_seen": 58749936, | |
| "step": 108500 | |
| }, | |
| { | |
| "epoch": 1.090523451256603, | |
| "grad_norm": 6.01190185546875, | |
| "learning_rate": 3.909486553545702e-05, | |
| "loss": 1.2691, | |
| "num_input_tokens_seen": 59014736, | |
| "step": 109000 | |
| }, | |
| { | |
| "epoch": 1.0955258524091565, | |
| "grad_norm": 5.44499397277832, | |
| "learning_rate": 3.904484152393149e-05, | |
| "loss": 1.2589, | |
| "num_input_tokens_seen": 59280912, | |
| "step": 109500 | |
| }, | |
| { | |
| "epoch": 1.1005282535617096, | |
| "grad_norm": 7.433501243591309, | |
| "learning_rate": 3.8994817512405956e-05, | |
| "loss": 1.2686, | |
| "num_input_tokens_seen": 59549544, | |
| "step": 110000 | |
| }, | |
| { | |
| "epoch": 1.105530654714263, | |
| "grad_norm": 6.828175067901611, | |
| "learning_rate": 3.8944793500880425e-05, | |
| "loss": 1.2747, | |
| "num_input_tokens_seen": 59820728, | |
| "step": 110500 | |
| }, | |
| { | |
| "epoch": 1.110533055866816, | |
| "grad_norm": 7.450278282165527, | |
| "learning_rate": 3.8894769489354895e-05, | |
| "loss": 1.281, | |
| "num_input_tokens_seen": 60091056, | |
| "step": 111000 | |
| }, | |
| { | |
| "epoch": 1.1155354570193694, | |
| "grad_norm": 8.60688591003418, | |
| "learning_rate": 3.884474547782936e-05, | |
| "loss": 1.2745, | |
| "num_input_tokens_seen": 60361616, | |
| "step": 111500 | |
| }, | |
| { | |
| "epoch": 1.1205378581719225, | |
| "grad_norm": 6.874872207641602, | |
| "learning_rate": 3.879472146630383e-05, | |
| "loss": 1.2663, | |
| "num_input_tokens_seen": 60628632, | |
| "step": 112000 | |
| }, | |
| { | |
| "epoch": 1.1255402593244757, | |
| "grad_norm": 6.488132953643799, | |
| "learning_rate": 3.874469745477829e-05, | |
| "loss": 1.2733, | |
| "num_input_tokens_seen": 60896840, | |
| "step": 112500 | |
| }, | |
| { | |
| "epoch": 1.130542660477029, | |
| "grad_norm": 10.988000869750977, | |
| "learning_rate": 3.869467344325277e-05, | |
| "loss": 1.2686, | |
| "num_input_tokens_seen": 61170424, | |
| "step": 113000 | |
| }, | |
| { | |
| "epoch": 1.1355450616295821, | |
| "grad_norm": 6.758646011352539, | |
| "learning_rate": 3.864464943172723e-05, | |
| "loss": 1.2807, | |
| "num_input_tokens_seen": 61437984, | |
| "step": 113500 | |
| }, | |
| { | |
| "epoch": 1.1405474627821355, | |
| "grad_norm": 6.983493804931641, | |
| "learning_rate": 3.8594625420201695e-05, | |
| "loss": 1.2807, | |
| "num_input_tokens_seen": 61710280, | |
| "step": 114000 | |
| }, | |
| { | |
| "epoch": 1.1455498639346886, | |
| "grad_norm": 6.096587181091309, | |
| "learning_rate": 3.8544601408676165e-05, | |
| "loss": 1.2711, | |
| "num_input_tokens_seen": 61980960, | |
| "step": 114500 | |
| }, | |
| { | |
| "epoch": 1.150552265087242, | |
| "grad_norm": 6.4102373123168945, | |
| "learning_rate": 3.8494577397150635e-05, | |
| "loss": 1.2986, | |
| "num_input_tokens_seen": 62254680, | |
| "step": 115000 | |
| }, | |
| { | |
| "epoch": 1.155554666239795, | |
| "grad_norm": 7.1004638671875, | |
| "learning_rate": 3.8444553385625105e-05, | |
| "loss": 1.2837, | |
| "num_input_tokens_seen": 62526128, | |
| "step": 115500 | |
| }, | |
| { | |
| "epoch": 1.1605570673923484, | |
| "grad_norm": 6.682748794555664, | |
| "learning_rate": 3.839452937409957e-05, | |
| "loss": 1.2557, | |
| "num_input_tokens_seen": 62793600, | |
| "step": 116000 | |
| }, | |
| { | |
| "epoch": 1.1655594685449016, | |
| "grad_norm": 5.091439247131348, | |
| "learning_rate": 3.834450536257404e-05, | |
| "loss": 1.2786, | |
| "num_input_tokens_seen": 63060392, | |
| "step": 116500 | |
| }, | |
| { | |
| "epoch": 1.1705618696974547, | |
| "grad_norm": 6.379209041595459, | |
| "learning_rate": 3.829448135104851e-05, | |
| "loss": 1.2803, | |
| "num_input_tokens_seen": 63324312, | |
| "step": 117000 | |
| }, | |
| { | |
| "epoch": 1.175564270850008, | |
| "grad_norm": 6.799802780151367, | |
| "learning_rate": 3.824445733952297e-05, | |
| "loss": 1.2776, | |
| "num_input_tokens_seen": 63596232, | |
| "step": 117500 | |
| }, | |
| { | |
| "epoch": 1.1805666720025612, | |
| "grad_norm": 5.58148193359375, | |
| "learning_rate": 3.819443332799744e-05, | |
| "loss": 1.2875, | |
| "num_input_tokens_seen": 63863096, | |
| "step": 118000 | |
| }, | |
| { | |
| "epoch": 1.1855690731551145, | |
| "grad_norm": 6.822576999664307, | |
| "learning_rate": 3.814440931647191e-05, | |
| "loss": 1.2895, | |
| "num_input_tokens_seen": 64138648, | |
| "step": 118500 | |
| }, | |
| { | |
| "epoch": 1.1905714743076676, | |
| "grad_norm": 8.899248123168945, | |
| "learning_rate": 3.809438530494638e-05, | |
| "loss": 1.2788, | |
| "num_input_tokens_seen": 64408856, | |
| "step": 119000 | |
| }, | |
| { | |
| "epoch": 1.195573875460221, | |
| "grad_norm": 7.763192653656006, | |
| "learning_rate": 3.8044361293420844e-05, | |
| "loss": 1.2702, | |
| "num_input_tokens_seen": 64679712, | |
| "step": 119500 | |
| }, | |
| { | |
| "epoch": 1.200576276612774, | |
| "grad_norm": 8.2811861038208, | |
| "learning_rate": 3.799433728189531e-05, | |
| "loss": 1.2566, | |
| "num_input_tokens_seen": 64945912, | |
| "step": 120000 | |
| }, | |
| { | |
| "epoch": 1.2055786777653275, | |
| "grad_norm": 5.707862854003906, | |
| "learning_rate": 3.794431327036978e-05, | |
| "loss": 1.2667, | |
| "num_input_tokens_seen": 65217232, | |
| "step": 120500 | |
| }, | |
| { | |
| "epoch": 1.2105810789178806, | |
| "grad_norm": 6.428073406219482, | |
| "learning_rate": 3.789428925884425e-05, | |
| "loss": 1.2817, | |
| "num_input_tokens_seen": 65479136, | |
| "step": 121000 | |
| }, | |
| { | |
| "epoch": 1.2155834800704337, | |
| "grad_norm": 6.519428730010986, | |
| "learning_rate": 3.784426524731872e-05, | |
| "loss": 1.2719, | |
| "num_input_tokens_seen": 65749968, | |
| "step": 121500 | |
| }, | |
| { | |
| "epoch": 1.220585881222987, | |
| "grad_norm": 5.953312873840332, | |
| "learning_rate": 3.779424123579318e-05, | |
| "loss": 1.2701, | |
| "num_input_tokens_seen": 66017680, | |
| "step": 122000 | |
| }, | |
| { | |
| "epoch": 1.2255882823755402, | |
| "grad_norm": 6.453891277313232, | |
| "learning_rate": 3.774421722426765e-05, | |
| "loss": 1.2668, | |
| "num_input_tokens_seen": 66288296, | |
| "step": 122500 | |
| }, | |
| { | |
| "epoch": 1.2305906835280935, | |
| "grad_norm": 6.9297709465026855, | |
| "learning_rate": 3.769419321274212e-05, | |
| "loss": 1.2754, | |
| "num_input_tokens_seen": 66555160, | |
| "step": 123000 | |
| }, | |
| { | |
| "epoch": 1.2355930846806467, | |
| "grad_norm": 5.545460224151611, | |
| "learning_rate": 3.764416920121658e-05, | |
| "loss": 1.2636, | |
| "num_input_tokens_seen": 66823296, | |
| "step": 123500 | |
| }, | |
| { | |
| "epoch": 1.2405954858332, | |
| "grad_norm": 7.921981334686279, | |
| "learning_rate": 3.759414518969105e-05, | |
| "loss": 1.2671, | |
| "num_input_tokens_seen": 67090360, | |
| "step": 124000 | |
| }, | |
| { | |
| "epoch": 1.2455978869857531, | |
| "grad_norm": 7.033051490783691, | |
| "learning_rate": 3.754412117816552e-05, | |
| "loss": 1.2826, | |
| "num_input_tokens_seen": 67363312, | |
| "step": 124500 | |
| }, | |
| { | |
| "epoch": 1.2506002881383065, | |
| "grad_norm": 5.355251789093018, | |
| "learning_rate": 3.749409716663999e-05, | |
| "loss": 1.2712, | |
| "num_input_tokens_seen": 67642752, | |
| "step": 125000 | |
| }, | |
| { | |
| "epoch": 1.2556026892908596, | |
| "grad_norm": 7.9365081787109375, | |
| "learning_rate": 3.7444073155114456e-05, | |
| "loss": 1.2763, | |
| "num_input_tokens_seen": 67905152, | |
| "step": 125500 | |
| }, | |
| { | |
| "epoch": 1.2606050904434127, | |
| "grad_norm": 6.157983779907227, | |
| "learning_rate": 3.739404914358892e-05, | |
| "loss": 1.2615, | |
| "num_input_tokens_seen": 68168720, | |
| "step": 126000 | |
| }, | |
| { | |
| "epoch": 1.265607491595966, | |
| "grad_norm": 5.456648349761963, | |
| "learning_rate": 3.7344025132063396e-05, | |
| "loss": 1.2642, | |
| "num_input_tokens_seen": 68433784, | |
| "step": 126500 | |
| }, | |
| { | |
| "epoch": 1.2706098927485192, | |
| "grad_norm": 7.156668663024902, | |
| "learning_rate": 3.729400112053786e-05, | |
| "loss": 1.2866, | |
| "num_input_tokens_seen": 68705336, | |
| "step": 127000 | |
| }, | |
| { | |
| "epoch": 1.2756122939010726, | |
| "grad_norm": 6.959549903869629, | |
| "learning_rate": 3.724397710901233e-05, | |
| "loss": 1.2733, | |
| "num_input_tokens_seen": 68968320, | |
| "step": 127500 | |
| }, | |
| { | |
| "epoch": 1.2806146950536257, | |
| "grad_norm": 6.225592613220215, | |
| "learning_rate": 3.719395309748679e-05, | |
| "loss": 1.2684, | |
| "num_input_tokens_seen": 69244592, | |
| "step": 128000 | |
| }, | |
| { | |
| "epoch": 1.285617096206179, | |
| "grad_norm": 7.163039684295654, | |
| "learning_rate": 3.714392908596127e-05, | |
| "loss": 1.2823, | |
| "num_input_tokens_seen": 69518384, | |
| "step": 128500 | |
| }, | |
| { | |
| "epoch": 1.2906194973587322, | |
| "grad_norm": 5.474428176879883, | |
| "learning_rate": 3.709390507443573e-05, | |
| "loss": 1.2684, | |
| "num_input_tokens_seen": 69789560, | |
| "step": 129000 | |
| }, | |
| { | |
| "epoch": 1.2956218985112855, | |
| "grad_norm": 6.292562961578369, | |
| "learning_rate": 3.7043881062910195e-05, | |
| "loss": 1.2697, | |
| "num_input_tokens_seen": 70058688, | |
| "step": 129500 | |
| }, | |
| { | |
| "epoch": 1.3006242996638386, | |
| "grad_norm": 5.789345741271973, | |
| "learning_rate": 3.6993857051384665e-05, | |
| "loss": 1.2652, | |
| "num_input_tokens_seen": 70330632, | |
| "step": 130000 | |
| }, | |
| { | |
| "epoch": 1.3056267008163918, | |
| "grad_norm": 7.9446821212768555, | |
| "learning_rate": 3.6943833039859135e-05, | |
| "loss": 1.2928, | |
| "num_input_tokens_seen": 70594088, | |
| "step": 130500 | |
| }, | |
| { | |
| "epoch": 1.310629101968945, | |
| "grad_norm": 9.38175106048584, | |
| "learning_rate": 3.6893809028333605e-05, | |
| "loss": 1.2636, | |
| "num_input_tokens_seen": 70858664, | |
| "step": 131000 | |
| }, | |
| { | |
| "epoch": 1.3156315031214985, | |
| "grad_norm": 7.6178812980651855, | |
| "learning_rate": 3.684378501680807e-05, | |
| "loss": 1.2767, | |
| "num_input_tokens_seen": 71132936, | |
| "step": 131500 | |
| }, | |
| { | |
| "epoch": 1.3206339042740516, | |
| "grad_norm": 7.7378435134887695, | |
| "learning_rate": 3.679376100528254e-05, | |
| "loss": 1.2752, | |
| "num_input_tokens_seen": 71398296, | |
| "step": 132000 | |
| }, | |
| { | |
| "epoch": 1.3256363054266047, | |
| "grad_norm": 7.162954807281494, | |
| "learning_rate": 3.674373699375701e-05, | |
| "loss": 1.2534, | |
| "num_input_tokens_seen": 71673192, | |
| "step": 132500 | |
| }, | |
| { | |
| "epoch": 1.330638706579158, | |
| "grad_norm": 6.323235511779785, | |
| "learning_rate": 3.669371298223147e-05, | |
| "loss": 1.265, | |
| "num_input_tokens_seen": 71949960, | |
| "step": 133000 | |
| }, | |
| { | |
| "epoch": 1.3356411077317112, | |
| "grad_norm": 6.324786186218262, | |
| "learning_rate": 3.664368897070594e-05, | |
| "loss": 1.2661, | |
| "num_input_tokens_seen": 72217256, | |
| "step": 133500 | |
| }, | |
| { | |
| "epoch": 1.3406435088842645, | |
| "grad_norm": 7.326359748840332, | |
| "learning_rate": 3.6593664959180404e-05, | |
| "loss": 1.244, | |
| "num_input_tokens_seen": 72480696, | |
| "step": 134000 | |
| }, | |
| { | |
| "epoch": 1.3456459100368177, | |
| "grad_norm": 7.224339962005615, | |
| "learning_rate": 3.654364094765488e-05, | |
| "loss": 1.2677, | |
| "num_input_tokens_seen": 72747888, | |
| "step": 134500 | |
| }, | |
| { | |
| "epoch": 1.3506483111893708, | |
| "grad_norm": 6.521255970001221, | |
| "learning_rate": 3.6493616936129344e-05, | |
| "loss": 1.26, | |
| "num_input_tokens_seen": 73016624, | |
| "step": 135000 | |
| }, | |
| { | |
| "epoch": 1.3556507123419241, | |
| "grad_norm": 7.485130786895752, | |
| "learning_rate": 3.644359292460381e-05, | |
| "loss": 1.2703, | |
| "num_input_tokens_seen": 73291936, | |
| "step": 135500 | |
| }, | |
| { | |
| "epoch": 1.3606531134944775, | |
| "grad_norm": 7.1711626052856445, | |
| "learning_rate": 3.639356891307828e-05, | |
| "loss": 1.2798, | |
| "num_input_tokens_seen": 73557640, | |
| "step": 136000 | |
| }, | |
| { | |
| "epoch": 1.3656555146470306, | |
| "grad_norm": 6.981902599334717, | |
| "learning_rate": 3.634354490155275e-05, | |
| "loss": 1.2485, | |
| "num_input_tokens_seen": 73827312, | |
| "step": 136500 | |
| }, | |
| { | |
| "epoch": 1.3706579157995837, | |
| "grad_norm": 6.2199320793151855, | |
| "learning_rate": 3.629352089002722e-05, | |
| "loss": 1.2587, | |
| "num_input_tokens_seen": 74098048, | |
| "step": 137000 | |
| }, | |
| { | |
| "epoch": 1.375660316952137, | |
| "grad_norm": 6.726940155029297, | |
| "learning_rate": 3.624349687850168e-05, | |
| "loss": 1.2753, | |
| "num_input_tokens_seen": 74365640, | |
| "step": 137500 | |
| }, | |
| { | |
| "epoch": 1.3806627181046902, | |
| "grad_norm": 5.759517669677734, | |
| "learning_rate": 3.619347286697615e-05, | |
| "loss": 1.2549, | |
| "num_input_tokens_seen": 74632424, | |
| "step": 138000 | |
| }, | |
| { | |
| "epoch": 1.3856651192572436, | |
| "grad_norm": 6.594145774841309, | |
| "learning_rate": 3.614344885545062e-05, | |
| "loss": 1.2601, | |
| "num_input_tokens_seen": 74906848, | |
| "step": 138500 | |
| }, | |
| { | |
| "epoch": 1.3906675204097967, | |
| "grad_norm": 6.375233173370361, | |
| "learning_rate": 3.609342484392508e-05, | |
| "loss": 1.2682, | |
| "num_input_tokens_seen": 75176912, | |
| "step": 139000 | |
| }, | |
| { | |
| "epoch": 1.3956699215623498, | |
| "grad_norm": 6.785195827484131, | |
| "learning_rate": 3.604340083239955e-05, | |
| "loss": 1.2602, | |
| "num_input_tokens_seen": 75447008, | |
| "step": 139500 | |
| }, | |
| { | |
| "epoch": 1.4006723227149032, | |
| "grad_norm": 6.8394694328308105, | |
| "learning_rate": 3.599337682087402e-05, | |
| "loss": 1.2432, | |
| "num_input_tokens_seen": 75720552, | |
| "step": 140000 | |
| }, | |
| { | |
| "epoch": 1.4056747238674565, | |
| "grad_norm": 6.43784236907959, | |
| "learning_rate": 3.594335280934849e-05, | |
| "loss": 1.2576, | |
| "num_input_tokens_seen": 75991000, | |
| "step": 140500 | |
| }, | |
| { | |
| "epoch": 1.4106771250200096, | |
| "grad_norm": 6.5971479415893555, | |
| "learning_rate": 3.5893328797822956e-05, | |
| "loss": 1.2625, | |
| "num_input_tokens_seen": 76262432, | |
| "step": 141000 | |
| }, | |
| { | |
| "epoch": 1.4156795261725628, | |
| "grad_norm": 5.662843227386475, | |
| "learning_rate": 3.5843304786297426e-05, | |
| "loss": 1.2822, | |
| "num_input_tokens_seen": 76537768, | |
| "step": 141500 | |
| }, | |
| { | |
| "epoch": 1.4206819273251161, | |
| "grad_norm": 5.13416862487793, | |
| "learning_rate": 3.579328077477189e-05, | |
| "loss": 1.2639, | |
| "num_input_tokens_seen": 76807000, | |
| "step": 142000 | |
| }, | |
| { | |
| "epoch": 1.4256843284776692, | |
| "grad_norm": 6.224869728088379, | |
| "learning_rate": 3.574325676324636e-05, | |
| "loss": 1.2633, | |
| "num_input_tokens_seen": 77077208, | |
| "step": 142500 | |
| }, | |
| { | |
| "epoch": 1.4306867296302226, | |
| "grad_norm": 5.980476379394531, | |
| "learning_rate": 3.569323275172083e-05, | |
| "loss": 1.254, | |
| "num_input_tokens_seen": 77348800, | |
| "step": 143000 | |
| }, | |
| { | |
| "epoch": 1.4356891307827757, | |
| "grad_norm": 5.705311298370361, | |
| "learning_rate": 3.564320874019529e-05, | |
| "loss": 1.2641, | |
| "num_input_tokens_seen": 77623192, | |
| "step": 143500 | |
| }, | |
| { | |
| "epoch": 1.4406915319353288, | |
| "grad_norm": 5.703660488128662, | |
| "learning_rate": 3.559318472866976e-05, | |
| "loss": 1.2509, | |
| "num_input_tokens_seen": 77892080, | |
| "step": 144000 | |
| }, | |
| { | |
| "epoch": 1.4456939330878822, | |
| "grad_norm": 6.834238052368164, | |
| "learning_rate": 3.554316071714423e-05, | |
| "loss": 1.2455, | |
| "num_input_tokens_seen": 78161320, | |
| "step": 144500 | |
| }, | |
| { | |
| "epoch": 1.4506963342404355, | |
| "grad_norm": 5.70477294921875, | |
| "learning_rate": 3.54931367056187e-05, | |
| "loss": 1.2534, | |
| "num_input_tokens_seen": 78426328, | |
| "step": 145000 | |
| }, | |
| { | |
| "epoch": 1.4556987353929887, | |
| "grad_norm": 7.84694766998291, | |
| "learning_rate": 3.5443112694093165e-05, | |
| "loss": 1.2696, | |
| "num_input_tokens_seen": 78700184, | |
| "step": 145500 | |
| }, | |
| { | |
| "epoch": 1.4607011365455418, | |
| "grad_norm": 6.4869914054870605, | |
| "learning_rate": 3.5393088682567635e-05, | |
| "loss": 1.2548, | |
| "num_input_tokens_seen": 78968056, | |
| "step": 146000 | |
| }, | |
| { | |
| "epoch": 1.4657035376980951, | |
| "grad_norm": 7.2102251052856445, | |
| "learning_rate": 3.5343064671042105e-05, | |
| "loss": 1.2643, | |
| "num_input_tokens_seen": 79236952, | |
| "step": 146500 | |
| }, | |
| { | |
| "epoch": 1.4707059388506483, | |
| "grad_norm": 8.560874938964844, | |
| "learning_rate": 3.529304065951657e-05, | |
| "loss": 1.2674, | |
| "num_input_tokens_seen": 79508800, | |
| "step": 147000 | |
| }, | |
| { | |
| "epoch": 1.4757083400032016, | |
| "grad_norm": 6.250794410705566, | |
| "learning_rate": 3.524301664799104e-05, | |
| "loss": 1.2529, | |
| "num_input_tokens_seen": 79781928, | |
| "step": 147500 | |
| }, | |
| { | |
| "epoch": 1.4807107411557547, | |
| "grad_norm": 5.825743198394775, | |
| "learning_rate": 3.519299263646551e-05, | |
| "loss": 1.25, | |
| "num_input_tokens_seen": 80044592, | |
| "step": 148000 | |
| }, | |
| { | |
| "epoch": 1.4857131423083079, | |
| "grad_norm": 8.07386589050293, | |
| "learning_rate": 3.514296862493997e-05, | |
| "loss": 1.2546, | |
| "num_input_tokens_seen": 80312648, | |
| "step": 148500 | |
| }, | |
| { | |
| "epoch": 1.4907155434608612, | |
| "grad_norm": 6.903604984283447, | |
| "learning_rate": 3.509294461341444e-05, | |
| "loss": 1.2465, | |
| "num_input_tokens_seen": 80580360, | |
| "step": 149000 | |
| }, | |
| { | |
| "epoch": 1.4957179446134146, | |
| "grad_norm": 7.45670223236084, | |
| "learning_rate": 3.5042920601888904e-05, | |
| "loss": 1.2612, | |
| "num_input_tokens_seen": 80854928, | |
| "step": 149500 | |
| }, | |
| { | |
| "epoch": 1.5007203457659677, | |
| "grad_norm": 7.703638553619385, | |
| "learning_rate": 3.4992896590363374e-05, | |
| "loss": 1.2494, | |
| "num_input_tokens_seen": 81122392, | |
| "step": 150000 | |
| }, | |
| { | |
| "epoch": 1.5057227469185208, | |
| "grad_norm": 7.255218982696533, | |
| "learning_rate": 3.4942872578837844e-05, | |
| "loss": 1.2549, | |
| "num_input_tokens_seen": 81393152, | |
| "step": 150500 | |
| }, | |
| { | |
| "epoch": 1.5107251480710742, | |
| "grad_norm": 6.001245498657227, | |
| "learning_rate": 3.4892848567312314e-05, | |
| "loss": 1.2565, | |
| "num_input_tokens_seen": 81663000, | |
| "step": 151000 | |
| }, | |
| { | |
| "epoch": 1.5157275492236273, | |
| "grad_norm": 8.100776672363281, | |
| "learning_rate": 3.484282455578678e-05, | |
| "loss": 1.2662, | |
| "num_input_tokens_seen": 81935248, | |
| "step": 151500 | |
| }, | |
| { | |
| "epoch": 1.5207299503761806, | |
| "grad_norm": 7.566408157348633, | |
| "learning_rate": 3.479280054426125e-05, | |
| "loss": 1.2521, | |
| "num_input_tokens_seen": 82198824, | |
| "step": 152000 | |
| }, | |
| { | |
| "epoch": 1.5257323515287338, | |
| "grad_norm": 6.650607109069824, | |
| "learning_rate": 3.474277653273572e-05, | |
| "loss": 1.2516, | |
| "num_input_tokens_seen": 82465776, | |
| "step": 152500 | |
| }, | |
| { | |
| "epoch": 1.530734752681287, | |
| "grad_norm": 6.24419641494751, | |
| "learning_rate": 3.469275252121018e-05, | |
| "loss": 1.2504, | |
| "num_input_tokens_seen": 82741480, | |
| "step": 153000 | |
| }, | |
| { | |
| "epoch": 1.5357371538338402, | |
| "grad_norm": 5.1919403076171875, | |
| "learning_rate": 3.464272850968465e-05, | |
| "loss": 1.2403, | |
| "num_input_tokens_seen": 83010176, | |
| "step": 153500 | |
| }, | |
| { | |
| "epoch": 1.5407395549863936, | |
| "grad_norm": 7.3934407234191895, | |
| "learning_rate": 3.459270449815912e-05, | |
| "loss": 1.2455, | |
| "num_input_tokens_seen": 83279712, | |
| "step": 154000 | |
| }, | |
| { | |
| "epoch": 1.5457419561389467, | |
| "grad_norm": 5.885237693786621, | |
| "learning_rate": 3.454268048663359e-05, | |
| "loss": 1.2667, | |
| "num_input_tokens_seen": 83546624, | |
| "step": 154500 | |
| }, | |
| { | |
| "epoch": 1.5507443572914998, | |
| "grad_norm": 6.22340726852417, | |
| "learning_rate": 3.449265647510805e-05, | |
| "loss": 1.2341, | |
| "num_input_tokens_seen": 83814832, | |
| "step": 155000 | |
| }, | |
| { | |
| "epoch": 1.5557467584440532, | |
| "grad_norm": 7.060276508331299, | |
| "learning_rate": 3.4442632463582516e-05, | |
| "loss": 1.2419, | |
| "num_input_tokens_seen": 84086544, | |
| "step": 155500 | |
| }, | |
| { | |
| "epoch": 1.5607491595966063, | |
| "grad_norm": 6.495555400848389, | |
| "learning_rate": 3.439260845205699e-05, | |
| "loss": 1.2552, | |
| "num_input_tokens_seen": 84352576, | |
| "step": 156000 | |
| }, | |
| { | |
| "epoch": 1.5657515607491597, | |
| "grad_norm": 7.454058647155762, | |
| "learning_rate": 3.4342584440531456e-05, | |
| "loss": 1.2469, | |
| "num_input_tokens_seen": 84619200, | |
| "step": 156500 | |
| }, | |
| { | |
| "epoch": 1.5707539619017128, | |
| "grad_norm": 6.108017444610596, | |
| "learning_rate": 3.4292560429005926e-05, | |
| "loss": 1.2358, | |
| "num_input_tokens_seen": 84891184, | |
| "step": 157000 | |
| }, | |
| { | |
| "epoch": 1.575756363054266, | |
| "grad_norm": 9.97182559967041, | |
| "learning_rate": 3.424253641748039e-05, | |
| "loss": 1.2482, | |
| "num_input_tokens_seen": 85164008, | |
| "step": 157500 | |
| }, | |
| { | |
| "epoch": 1.5807587642068193, | |
| "grad_norm": 7.4442877769470215, | |
| "learning_rate": 3.419251240595486e-05, | |
| "loss": 1.2444, | |
| "num_input_tokens_seen": 85431664, | |
| "step": 158000 | |
| }, | |
| { | |
| "epoch": 1.5857611653593726, | |
| "grad_norm": 5.728388786315918, | |
| "learning_rate": 3.414248839442933e-05, | |
| "loss": 1.2496, | |
| "num_input_tokens_seen": 85706112, | |
| "step": 158500 | |
| }, | |
| { | |
| "epoch": 1.5907635665119257, | |
| "grad_norm": 5.5090861320495605, | |
| "learning_rate": 3.409246438290379e-05, | |
| "loss": 1.2468, | |
| "num_input_tokens_seen": 85979600, | |
| "step": 159000 | |
| }, | |
| { | |
| "epoch": 1.5957659676644789, | |
| "grad_norm": 7.548877716064453, | |
| "learning_rate": 3.404244037137826e-05, | |
| "loss": 1.2483, | |
| "num_input_tokens_seen": 86249384, | |
| "step": 159500 | |
| }, | |
| { | |
| "epoch": 1.6007683688170322, | |
| "grad_norm": 6.700185775756836, | |
| "learning_rate": 3.399241635985273e-05, | |
| "loss": 1.2493, | |
| "num_input_tokens_seen": 86511120, | |
| "step": 160000 | |
| }, | |
| { | |
| "epoch": 1.6057707699695856, | |
| "grad_norm": 6.892191410064697, | |
| "learning_rate": 3.39423923483272e-05, | |
| "loss": 1.2528, | |
| "num_input_tokens_seen": 86784872, | |
| "step": 160500 | |
| }, | |
| { | |
| "epoch": 1.6107731711221387, | |
| "grad_norm": 5.970468521118164, | |
| "learning_rate": 3.3892368336801665e-05, | |
| "loss": 1.2309, | |
| "num_input_tokens_seen": 87058000, | |
| "step": 161000 | |
| }, | |
| { | |
| "epoch": 1.6157755722746918, | |
| "grad_norm": 6.773517608642578, | |
| "learning_rate": 3.384234432527613e-05, | |
| "loss": 1.2449, | |
| "num_input_tokens_seen": 87331744, | |
| "step": 161500 | |
| }, | |
| { | |
| "epoch": 1.620777973427245, | |
| "grad_norm": 6.08986234664917, | |
| "learning_rate": 3.3792320313750605e-05, | |
| "loss": 1.2518, | |
| "num_input_tokens_seen": 87604792, | |
| "step": 162000 | |
| }, | |
| { | |
| "epoch": 1.6257803745797983, | |
| "grad_norm": 6.549533843994141, | |
| "learning_rate": 3.374229630222507e-05, | |
| "loss": 1.242, | |
| "num_input_tokens_seen": 87871576, | |
| "step": 162500 | |
| }, | |
| { | |
| "epoch": 1.6307827757323516, | |
| "grad_norm": 5.974827289581299, | |
| "learning_rate": 3.369227229069954e-05, | |
| "loss": 1.2347, | |
| "num_input_tokens_seen": 88137032, | |
| "step": 163000 | |
| }, | |
| { | |
| "epoch": 1.6357851768849048, | |
| "grad_norm": 6.639895915985107, | |
| "learning_rate": 3.3642248279174e-05, | |
| "loss": 1.2241, | |
| "num_input_tokens_seen": 88403456, | |
| "step": 163500 | |
| }, | |
| { | |
| "epoch": 1.640787578037458, | |
| "grad_norm": 8.600828170776367, | |
| "learning_rate": 3.359222426764848e-05, | |
| "loss": 1.2496, | |
| "num_input_tokens_seen": 88676440, | |
| "step": 164000 | |
| }, | |
| { | |
| "epoch": 1.6457899791900112, | |
| "grad_norm": 6.850368976593018, | |
| "learning_rate": 3.354220025612294e-05, | |
| "loss": 1.232, | |
| "num_input_tokens_seen": 88947256, | |
| "step": 164500 | |
| }, | |
| { | |
| "epoch": 1.6507923803425646, | |
| "grad_norm": 7.311619281768799, | |
| "learning_rate": 3.3492176244597405e-05, | |
| "loss": 1.2497, | |
| "num_input_tokens_seen": 89215568, | |
| "step": 165000 | |
| }, | |
| { | |
| "epoch": 1.6557947814951177, | |
| "grad_norm": 6.298097610473633, | |
| "learning_rate": 3.3442152233071875e-05, | |
| "loss": 1.2482, | |
| "num_input_tokens_seen": 89487600, | |
| "step": 165500 | |
| }, | |
| { | |
| "epoch": 1.6607971826476708, | |
| "grad_norm": 6.948307514190674, | |
| "learning_rate": 3.3392128221546345e-05, | |
| "loss": 1.2274, | |
| "num_input_tokens_seen": 89760496, | |
| "step": 166000 | |
| }, | |
| { | |
| "epoch": 1.665799583800224, | |
| "grad_norm": 6.305945873260498, | |
| "learning_rate": 3.3342104210020814e-05, | |
| "loss": 1.2462, | |
| "num_input_tokens_seen": 90033976, | |
| "step": 166500 | |
| }, | |
| { | |
| "epoch": 1.6708019849527773, | |
| "grad_norm": 6.428753852844238, | |
| "learning_rate": 3.329208019849528e-05, | |
| "loss": 1.2159, | |
| "num_input_tokens_seen": 90309992, | |
| "step": 167000 | |
| }, | |
| { | |
| "epoch": 1.6758043861053307, | |
| "grad_norm": 6.597380638122559, | |
| "learning_rate": 3.324205618696975e-05, | |
| "loss": 1.224, | |
| "num_input_tokens_seen": 90577224, | |
| "step": 167500 | |
| }, | |
| { | |
| "epoch": 1.6808067872578838, | |
| "grad_norm": 6.567870140075684, | |
| "learning_rate": 3.319203217544422e-05, | |
| "loss": 1.2214, | |
| "num_input_tokens_seen": 90853608, | |
| "step": 168000 | |
| }, | |
| { | |
| "epoch": 1.685809188410437, | |
| "grad_norm": 6.079522609710693, | |
| "learning_rate": 3.314200816391868e-05, | |
| "loss": 1.2465, | |
| "num_input_tokens_seen": 91122360, | |
| "step": 168500 | |
| }, | |
| { | |
| "epoch": 1.6908115895629903, | |
| "grad_norm": 5.641016006469727, | |
| "learning_rate": 3.309198415239315e-05, | |
| "loss": 1.2456, | |
| "num_input_tokens_seen": 91395744, | |
| "step": 169000 | |
| }, | |
| { | |
| "epoch": 1.6958139907155436, | |
| "grad_norm": 6.620981216430664, | |
| "learning_rate": 3.304196014086762e-05, | |
| "loss": 1.2232, | |
| "num_input_tokens_seen": 91664688, | |
| "step": 169500 | |
| }, | |
| { | |
| "epoch": 1.7008163918680967, | |
| "grad_norm": 5.642283916473389, | |
| "learning_rate": 3.299193612934209e-05, | |
| "loss": 1.2548, | |
| "num_input_tokens_seen": 91937056, | |
| "step": 170000 | |
| }, | |
| { | |
| "epoch": 1.7058187930206499, | |
| "grad_norm": 8.295174598693848, | |
| "learning_rate": 3.2941912117816554e-05, | |
| "loss": 1.2394, | |
| "num_input_tokens_seen": 92211072, | |
| "step": 170500 | |
| }, | |
| { | |
| "epoch": 1.710821194173203, | |
| "grad_norm": 6.525012493133545, | |
| "learning_rate": 3.289188810629102e-05, | |
| "loss": 1.2371, | |
| "num_input_tokens_seen": 92479560, | |
| "step": 171000 | |
| }, | |
| { | |
| "epoch": 1.7158235953257563, | |
| "grad_norm": 5.822702884674072, | |
| "learning_rate": 3.284186409476549e-05, | |
| "loss": 1.2512, | |
| "num_input_tokens_seen": 92751592, | |
| "step": 171500 | |
| }, | |
| { | |
| "epoch": 1.7208259964783097, | |
| "grad_norm": 7.12557315826416, | |
| "learning_rate": 3.279184008323996e-05, | |
| "loss": 1.2252, | |
| "num_input_tokens_seen": 93027888, | |
| "step": 172000 | |
| }, | |
| { | |
| "epoch": 1.7258283976308628, | |
| "grad_norm": 6.948513984680176, | |
| "learning_rate": 3.2741816071714427e-05, | |
| "loss": 1.2273, | |
| "num_input_tokens_seen": 93305496, | |
| "step": 172500 | |
| }, | |
| { | |
| "epoch": 1.730830798783416, | |
| "grad_norm": 6.272098064422607, | |
| "learning_rate": 3.269179206018889e-05, | |
| "loss": 1.2412, | |
| "num_input_tokens_seen": 93571272, | |
| "step": 173000 | |
| }, | |
| { | |
| "epoch": 1.7358331999359693, | |
| "grad_norm": 6.048664569854736, | |
| "learning_rate": 3.264176804866336e-05, | |
| "loss": 1.2192, | |
| "num_input_tokens_seen": 93844768, | |
| "step": 173500 | |
| }, | |
| { | |
| "epoch": 1.7408356010885226, | |
| "grad_norm": 7.0680999755859375, | |
| "learning_rate": 3.259174403713783e-05, | |
| "loss": 1.2389, | |
| "num_input_tokens_seen": 94117208, | |
| "step": 174000 | |
| }, | |
| { | |
| "epoch": 1.7458380022410758, | |
| "grad_norm": 4.265655517578125, | |
| "learning_rate": 3.254172002561229e-05, | |
| "loss": 1.2285, | |
| "num_input_tokens_seen": 94388984, | |
| "step": 174500 | |
| }, | |
| { | |
| "epoch": 1.750840403393629, | |
| "grad_norm": 5.9715118408203125, | |
| "learning_rate": 3.249169601408676e-05, | |
| "loss": 1.2427, | |
| "num_input_tokens_seen": 94655112, | |
| "step": 175000 | |
| }, | |
| { | |
| "epoch": 1.755842804546182, | |
| "grad_norm": 6.257503509521484, | |
| "learning_rate": 3.244167200256123e-05, | |
| "loss": 1.2361, | |
| "num_input_tokens_seen": 94924496, | |
| "step": 175500 | |
| }, | |
| { | |
| "epoch": 1.7608452056987354, | |
| "grad_norm": 8.316187858581543, | |
| "learning_rate": 3.23916479910357e-05, | |
| "loss": 1.2283, | |
| "num_input_tokens_seen": 95192608, | |
| "step": 176000 | |
| }, | |
| { | |
| "epoch": 1.7658476068512887, | |
| "grad_norm": 6.69648551940918, | |
| "learning_rate": 3.2341623979510166e-05, | |
| "loss": 1.2364, | |
| "num_input_tokens_seen": 95459872, | |
| "step": 176500 | |
| }, | |
| { | |
| "epoch": 1.7708500080038418, | |
| "grad_norm": 7.617880821228027, | |
| "learning_rate": 3.229159996798463e-05, | |
| "loss": 1.2265, | |
| "num_input_tokens_seen": 95729504, | |
| "step": 177000 | |
| }, | |
| { | |
| "epoch": 1.775852409156395, | |
| "grad_norm": 7.258569240570068, | |
| "learning_rate": 3.2241575956459106e-05, | |
| "loss": 1.235, | |
| "num_input_tokens_seen": 95996688, | |
| "step": 177500 | |
| }, | |
| { | |
| "epoch": 1.7808548103089483, | |
| "grad_norm": 5.590980052947998, | |
| "learning_rate": 3.219155194493357e-05, | |
| "loss": 1.2335, | |
| "num_input_tokens_seen": 96263440, | |
| "step": 178000 | |
| }, | |
| { | |
| "epoch": 1.7858572114615017, | |
| "grad_norm": 5.80760383605957, | |
| "learning_rate": 3.214152793340804e-05, | |
| "loss": 1.2179, | |
| "num_input_tokens_seen": 96539352, | |
| "step": 178500 | |
| }, | |
| { | |
| "epoch": 1.7908596126140548, | |
| "grad_norm": 5.532135486602783, | |
| "learning_rate": 3.20915039218825e-05, | |
| "loss": 1.1994, | |
| "num_input_tokens_seen": 96805416, | |
| "step": 179000 | |
| }, | |
| { | |
| "epoch": 1.795862013766608, | |
| "grad_norm": 5.589640140533447, | |
| "learning_rate": 3.204147991035697e-05, | |
| "loss": 1.2264, | |
| "num_input_tokens_seen": 97073464, | |
| "step": 179500 | |
| }, | |
| { | |
| "epoch": 1.800864414919161, | |
| "grad_norm": 6.577908039093018, | |
| "learning_rate": 3.199145589883144e-05, | |
| "loss": 1.23, | |
| "num_input_tokens_seen": 97347808, | |
| "step": 180000 | |
| }, | |
| { | |
| "epoch": 1.8058668160717144, | |
| "grad_norm": 6.8848724365234375, | |
| "learning_rate": 3.1941431887305905e-05, | |
| "loss": 1.2167, | |
| "num_input_tokens_seen": 97615696, | |
| "step": 180500 | |
| }, | |
| { | |
| "epoch": 1.8108692172242677, | |
| "grad_norm": 6.463140964508057, | |
| "learning_rate": 3.1891407875780375e-05, | |
| "loss": 1.2189, | |
| "num_input_tokens_seen": 97886312, | |
| "step": 181000 | |
| }, | |
| { | |
| "epoch": 1.8158716183768209, | |
| "grad_norm": 8.028167724609375, | |
| "learning_rate": 3.1841383864254845e-05, | |
| "loss": 1.2384, | |
| "num_input_tokens_seen": 98151768, | |
| "step": 181500 | |
| }, | |
| { | |
| "epoch": 1.820874019529374, | |
| "grad_norm": 7.106721878051758, | |
| "learning_rate": 3.1791359852729315e-05, | |
| "loss": 1.2264, | |
| "num_input_tokens_seen": 98419192, | |
| "step": 182000 | |
| }, | |
| { | |
| "epoch": 1.8258764206819273, | |
| "grad_norm": 5.771492004394531, | |
| "learning_rate": 3.174133584120378e-05, | |
| "loss": 1.2421, | |
| "num_input_tokens_seen": 98695368, | |
| "step": 182500 | |
| }, | |
| { | |
| "epoch": 1.8308788218344807, | |
| "grad_norm": 5.563631534576416, | |
| "learning_rate": 3.169131182967825e-05, | |
| "loss": 1.2325, | |
| "num_input_tokens_seen": 98974312, | |
| "step": 183000 | |
| }, | |
| { | |
| "epoch": 1.8358812229870338, | |
| "grad_norm": 7.004051208496094, | |
| "learning_rate": 3.164128781815272e-05, | |
| "loss": 1.2107, | |
| "num_input_tokens_seen": 99244240, | |
| "step": 183500 | |
| }, | |
| { | |
| "epoch": 1.840883624139587, | |
| "grad_norm": 6.410153865814209, | |
| "learning_rate": 3.159126380662718e-05, | |
| "loss": 1.2251, | |
| "num_input_tokens_seen": 99515024, | |
| "step": 184000 | |
| }, | |
| { | |
| "epoch": 1.84588602529214, | |
| "grad_norm": 4.9155354499816895, | |
| "learning_rate": 3.154123979510165e-05, | |
| "loss": 1.2192, | |
| "num_input_tokens_seen": 99785496, | |
| "step": 184500 | |
| }, | |
| { | |
| "epoch": 1.8508884264446934, | |
| "grad_norm": 7.882739067077637, | |
| "learning_rate": 3.1491215783576114e-05, | |
| "loss": 1.2405, | |
| "num_input_tokens_seen": 100061088, | |
| "step": 185000 | |
| }, | |
| { | |
| "epoch": 1.8558908275972468, | |
| "grad_norm": 5.929235935211182, | |
| "learning_rate": 3.144119177205059e-05, | |
| "loss": 1.2171, | |
| "num_input_tokens_seen": 100330216, | |
| "step": 185500 | |
| }, | |
| { | |
| "epoch": 1.8608932287498, | |
| "grad_norm": 5.840740203857422, | |
| "learning_rate": 3.1391167760525054e-05, | |
| "loss": 1.2093, | |
| "num_input_tokens_seen": 100606080, | |
| "step": 186000 | |
| }, | |
| { | |
| "epoch": 1.865895629902353, | |
| "grad_norm": 6.414222717285156, | |
| "learning_rate": 3.134114374899952e-05, | |
| "loss": 1.2154, | |
| "num_input_tokens_seen": 100872752, | |
| "step": 186500 | |
| }, | |
| { | |
| "epoch": 1.8708980310549064, | |
| "grad_norm": 7.030595779418945, | |
| "learning_rate": 3.129111973747399e-05, | |
| "loss": 1.2209, | |
| "num_input_tokens_seen": 101144088, | |
| "step": 187000 | |
| }, | |
| { | |
| "epoch": 1.8759004322074597, | |
| "grad_norm": 5.81058406829834, | |
| "learning_rate": 3.124109572594846e-05, | |
| "loss": 1.2068, | |
| "num_input_tokens_seen": 101414120, | |
| "step": 187500 | |
| }, | |
| { | |
| "epoch": 1.8809028333600128, | |
| "grad_norm": 5.672033309936523, | |
| "learning_rate": 3.119107171442293e-05, | |
| "loss": 1.2305, | |
| "num_input_tokens_seen": 101679104, | |
| "step": 188000 | |
| }, | |
| { | |
| "epoch": 1.885905234512566, | |
| "grad_norm": 6.247150421142578, | |
| "learning_rate": 3.114104770289739e-05, | |
| "loss": 1.2281, | |
| "num_input_tokens_seen": 101950272, | |
| "step": 188500 | |
| }, | |
| { | |
| "epoch": 1.890907635665119, | |
| "grad_norm": 6.070692539215088, | |
| "learning_rate": 3.109102369137186e-05, | |
| "loss": 1.2277, | |
| "num_input_tokens_seen": 102222744, | |
| "step": 189000 | |
| }, | |
| { | |
| "epoch": 1.8959100368176725, | |
| "grad_norm": 7.217655181884766, | |
| "learning_rate": 3.104099967984633e-05, | |
| "loss": 1.2227, | |
| "num_input_tokens_seen": 102498072, | |
| "step": 189500 | |
| }, | |
| { | |
| "epoch": 1.9009124379702258, | |
| "grad_norm": 6.292141914367676, | |
| "learning_rate": 3.099097566832079e-05, | |
| "loss": 1.226, | |
| "num_input_tokens_seen": 102771184, | |
| "step": 190000 | |
| }, | |
| { | |
| "epoch": 1.905914839122779, | |
| "grad_norm": 6.4393534660339355, | |
| "learning_rate": 3.094095165679526e-05, | |
| "loss": 1.2238, | |
| "num_input_tokens_seen": 103041800, | |
| "step": 190500 | |
| }, | |
| { | |
| "epoch": 1.910917240275332, | |
| "grad_norm": 6.367134094238281, | |
| "learning_rate": 3.0890927645269726e-05, | |
| "loss": 1.2277, | |
| "num_input_tokens_seen": 103315416, | |
| "step": 191000 | |
| }, | |
| { | |
| "epoch": 1.9159196414278854, | |
| "grad_norm": 5.803537368774414, | |
| "learning_rate": 3.08409036337442e-05, | |
| "loss": 1.2101, | |
| "num_input_tokens_seen": 103584680, | |
| "step": 191500 | |
| }, | |
| { | |
| "epoch": 1.9209220425804387, | |
| "grad_norm": 5.529000282287598, | |
| "learning_rate": 3.0790879622218666e-05, | |
| "loss": 1.2252, | |
| "num_input_tokens_seen": 103854296, | |
| "step": 192000 | |
| }, | |
| { | |
| "epoch": 1.9259244437329919, | |
| "grad_norm": 6.204425811767578, | |
| "learning_rate": 3.0740855610693136e-05, | |
| "loss": 1.2234, | |
| "num_input_tokens_seen": 104127296, | |
| "step": 192500 | |
| }, | |
| { | |
| "epoch": 1.930926844885545, | |
| "grad_norm": 6.076712131500244, | |
| "learning_rate": 3.06908315991676e-05, | |
| "loss": 1.2245, | |
| "num_input_tokens_seen": 104402160, | |
| "step": 193000 | |
| }, | |
| { | |
| "epoch": 1.9359292460380981, | |
| "grad_norm": 5.718363285064697, | |
| "learning_rate": 3.064080758764207e-05, | |
| "loss": 1.2288, | |
| "num_input_tokens_seen": 104669408, | |
| "step": 193500 | |
| }, | |
| { | |
| "epoch": 1.9409316471906515, | |
| "grad_norm": 5.174673080444336, | |
| "learning_rate": 3.059078357611654e-05, | |
| "loss": 1.2276, | |
| "num_input_tokens_seen": 104944256, | |
| "step": 194000 | |
| }, | |
| { | |
| "epoch": 1.9459340483432048, | |
| "grad_norm": 6.684966564178467, | |
| "learning_rate": 3.0540759564591e-05, | |
| "loss": 1.2341, | |
| "num_input_tokens_seen": 105217600, | |
| "step": 194500 | |
| }, | |
| { | |
| "epoch": 1.950936449495758, | |
| "grad_norm": 6.3069562911987305, | |
| "learning_rate": 3.0490735553065475e-05, | |
| "loss": 1.211, | |
| "num_input_tokens_seen": 105491832, | |
| "step": 195000 | |
| }, | |
| { | |
| "epoch": 1.955938850648311, | |
| "grad_norm": 8.71688461303711, | |
| "learning_rate": 3.044071154153994e-05, | |
| "loss": 1.231, | |
| "num_input_tokens_seen": 105764832, | |
| "step": 195500 | |
| }, | |
| { | |
| "epoch": 1.9609412518008644, | |
| "grad_norm": 8.65140438079834, | |
| "learning_rate": 3.0390687530014405e-05, | |
| "loss": 1.2142, | |
| "num_input_tokens_seen": 106034576, | |
| "step": 196000 | |
| }, | |
| { | |
| "epoch": 1.9659436529534178, | |
| "grad_norm": 5.5850725173950195, | |
| "learning_rate": 3.0340663518488875e-05, | |
| "loss": 1.2254, | |
| "num_input_tokens_seen": 106305056, | |
| "step": 196500 | |
| }, | |
| { | |
| "epoch": 1.970946054105971, | |
| "grad_norm": 6.236534118652344, | |
| "learning_rate": 3.029063950696334e-05, | |
| "loss": 1.2121, | |
| "num_input_tokens_seen": 106574696, | |
| "step": 197000 | |
| }, | |
| { | |
| "epoch": 1.975948455258524, | |
| "grad_norm": 6.221134185791016, | |
| "learning_rate": 3.024061549543781e-05, | |
| "loss": 1.2031, | |
| "num_input_tokens_seen": 106850384, | |
| "step": 197500 | |
| }, | |
| { | |
| "epoch": 1.9809508564110772, | |
| "grad_norm": 7.421369552612305, | |
| "learning_rate": 3.0190591483912278e-05, | |
| "loss": 1.2214, | |
| "num_input_tokens_seen": 107126816, | |
| "step": 198000 | |
| }, | |
| { | |
| "epoch": 1.9859532575636305, | |
| "grad_norm": 6.951572418212891, | |
| "learning_rate": 3.0140567472386748e-05, | |
| "loss": 1.2052, | |
| "num_input_tokens_seen": 107392688, | |
| "step": 198500 | |
| }, | |
| { | |
| "epoch": 1.9909556587161839, | |
| "grad_norm": 7.400991439819336, | |
| "learning_rate": 3.0090543460861215e-05, | |
| "loss": 1.2091, | |
| "num_input_tokens_seen": 107668928, | |
| "step": 199000 | |
| }, | |
| { | |
| "epoch": 1.995958059868737, | |
| "grad_norm": 6.747934818267822, | |
| "learning_rate": 3.004051944933568e-05, | |
| "loss": 1.2001, | |
| "num_input_tokens_seen": 107939312, | |
| "step": 199500 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_loss": 1.1146966218948364, | |
| "eval_runtime": 186.1963, | |
| "eval_samples_per_second": 1073.625, | |
| "eval_steps_per_second": 134.208, | |
| "num_input_tokens_seen": 108157960, | |
| "step": 199904 | |
| }, | |
| { | |
| "epoch": 2.00096046102129, | |
| "grad_norm": 5.809133052825928, | |
| "learning_rate": 2.999049543781015e-05, | |
| "loss": 1.21, | |
| "num_input_tokens_seen": 108209480, | |
| "step": 200000 | |
| }, | |
| { | |
| "epoch": 2.0059628621738432, | |
| "grad_norm": 7.035338401794434, | |
| "learning_rate": 2.9940471426284618e-05, | |
| "loss": 1.0955, | |
| "num_input_tokens_seen": 108480360, | |
| "step": 200500 | |
| }, | |
| { | |
| "epoch": 2.010965263326397, | |
| "grad_norm": 8.504626274108887, | |
| "learning_rate": 2.9890447414759088e-05, | |
| "loss": 1.1085, | |
| "num_input_tokens_seen": 108745072, | |
| "step": 201000 | |
| }, | |
| { | |
| "epoch": 2.01596766447895, | |
| "grad_norm": 7.347136497497559, | |
| "learning_rate": 2.9840423403233554e-05, | |
| "loss": 1.0983, | |
| "num_input_tokens_seen": 109010096, | |
| "step": 201500 | |
| }, | |
| { | |
| "epoch": 2.020970065631503, | |
| "grad_norm": 5.3891215324401855, | |
| "learning_rate": 2.9790399391708024e-05, | |
| "loss": 1.0941, | |
| "num_input_tokens_seen": 109280096, | |
| "step": 202000 | |
| }, | |
| { | |
| "epoch": 2.025972466784056, | |
| "grad_norm": 5.807075023651123, | |
| "learning_rate": 2.974037538018249e-05, | |
| "loss": 1.1066, | |
| "num_input_tokens_seen": 109550512, | |
| "step": 202500 | |
| }, | |
| { | |
| "epoch": 2.0309748679366098, | |
| "grad_norm": 7.344318866729736, | |
| "learning_rate": 2.9690351368656954e-05, | |
| "loss": 1.1149, | |
| "num_input_tokens_seen": 109827584, | |
| "step": 203000 | |
| }, | |
| { | |
| "epoch": 2.035977269089163, | |
| "grad_norm": 4.556800842285156, | |
| "learning_rate": 2.9640327357131427e-05, | |
| "loss": 1.1137, | |
| "num_input_tokens_seen": 110094320, | |
| "step": 203500 | |
| }, | |
| { | |
| "epoch": 2.040979670241716, | |
| "grad_norm": 6.238656044006348, | |
| "learning_rate": 2.959030334560589e-05, | |
| "loss": 1.1174, | |
| "num_input_tokens_seen": 110369888, | |
| "step": 204000 | |
| }, | |
| { | |
| "epoch": 2.045982071394269, | |
| "grad_norm": 6.009298801422119, | |
| "learning_rate": 2.954027933408036e-05, | |
| "loss": 1.0978, | |
| "num_input_tokens_seen": 110638112, | |
| "step": 204500 | |
| }, | |
| { | |
| "epoch": 2.0509844725468223, | |
| "grad_norm": 5.883254051208496, | |
| "learning_rate": 2.9490255322554827e-05, | |
| "loss": 1.1127, | |
| "num_input_tokens_seen": 110907728, | |
| "step": 205000 | |
| }, | |
| { | |
| "epoch": 2.055986873699376, | |
| "grad_norm": 5.4123125076293945, | |
| "learning_rate": 2.9440231311029297e-05, | |
| "loss": 1.116, | |
| "num_input_tokens_seen": 111174608, | |
| "step": 205500 | |
| }, | |
| { | |
| "epoch": 2.060989274851929, | |
| "grad_norm": 6.456712245941162, | |
| "learning_rate": 2.9390207299503763e-05, | |
| "loss": 1.1306, | |
| "num_input_tokens_seen": 111443896, | |
| "step": 206000 | |
| }, | |
| { | |
| "epoch": 2.065991676004482, | |
| "grad_norm": 7.134698390960693, | |
| "learning_rate": 2.934018328797823e-05, | |
| "loss": 1.1149, | |
| "num_input_tokens_seen": 111722672, | |
| "step": 206500 | |
| }, | |
| { | |
| "epoch": 2.070994077157035, | |
| "grad_norm": 5.317368984222412, | |
| "learning_rate": 2.92901592764527e-05, | |
| "loss": 1.093, | |
| "num_input_tokens_seen": 111987688, | |
| "step": 207000 | |
| }, | |
| { | |
| "epoch": 2.0759964783095888, | |
| "grad_norm": 5.929445743560791, | |
| "learning_rate": 2.9240135264927166e-05, | |
| "loss": 1.124, | |
| "num_input_tokens_seen": 112256088, | |
| "step": 207500 | |
| }, | |
| { | |
| "epoch": 2.080998879462142, | |
| "grad_norm": 6.658150672912598, | |
| "learning_rate": 2.9190111253401636e-05, | |
| "loss": 1.1068, | |
| "num_input_tokens_seen": 112530064, | |
| "step": 208000 | |
| }, | |
| { | |
| "epoch": 2.086001280614695, | |
| "grad_norm": 7.434782028198242, | |
| "learning_rate": 2.9140087241876103e-05, | |
| "loss": 1.11, | |
| "num_input_tokens_seen": 112807296, | |
| "step": 208500 | |
| }, | |
| { | |
| "epoch": 2.091003681767248, | |
| "grad_norm": 6.564949035644531, | |
| "learning_rate": 2.9090063230350566e-05, | |
| "loss": 1.1181, | |
| "num_input_tokens_seen": 113075032, | |
| "step": 209000 | |
| }, | |
| { | |
| "epoch": 2.0960060829198017, | |
| "grad_norm": 11.387114524841309, | |
| "learning_rate": 2.904003921882504e-05, | |
| "loss": 1.1097, | |
| "num_input_tokens_seen": 113344824, | |
| "step": 209500 | |
| }, | |
| { | |
| "epoch": 2.101008484072355, | |
| "grad_norm": 5.74482536315918, | |
| "learning_rate": 2.8990015207299502e-05, | |
| "loss": 1.1018, | |
| "num_input_tokens_seen": 113618576, | |
| "step": 210000 | |
| }, | |
| { | |
| "epoch": 2.106010885224908, | |
| "grad_norm": 5.009258270263672, | |
| "learning_rate": 2.8939991195773976e-05, | |
| "loss": 1.1039, | |
| "num_input_tokens_seen": 113889800, | |
| "step": 210500 | |
| }, | |
| { | |
| "epoch": 2.111013286377461, | |
| "grad_norm": 7.421350955963135, | |
| "learning_rate": 2.888996718424844e-05, | |
| "loss": 1.1173, | |
| "num_input_tokens_seen": 114168104, | |
| "step": 211000 | |
| }, | |
| { | |
| "epoch": 2.1160156875300142, | |
| "grad_norm": 6.955892086029053, | |
| "learning_rate": 2.8839943172722912e-05, | |
| "loss": 1.1217, | |
| "num_input_tokens_seen": 114430440, | |
| "step": 211500 | |
| }, | |
| { | |
| "epoch": 2.121018088682568, | |
| "grad_norm": 7.287781715393066, | |
| "learning_rate": 2.8789919161197375e-05, | |
| "loss": 1.1063, | |
| "num_input_tokens_seen": 114706600, | |
| "step": 212000 | |
| }, | |
| { | |
| "epoch": 2.126020489835121, | |
| "grad_norm": 7.426519870758057, | |
| "learning_rate": 2.8739895149671842e-05, | |
| "loss": 1.1181, | |
| "num_input_tokens_seen": 114975360, | |
| "step": 212500 | |
| }, | |
| { | |
| "epoch": 2.131022890987674, | |
| "grad_norm": 6.112298965454102, | |
| "learning_rate": 2.8689871138146312e-05, | |
| "loss": 1.1, | |
| "num_input_tokens_seen": 115252480, | |
| "step": 213000 | |
| }, | |
| { | |
| "epoch": 2.136025292140227, | |
| "grad_norm": 8.356368064880371, | |
| "learning_rate": 2.863984712662078e-05, | |
| "loss": 1.1382, | |
| "num_input_tokens_seen": 115527840, | |
| "step": 213500 | |
| }, | |
| { | |
| "epoch": 2.1410276932927808, | |
| "grad_norm": 5.211204528808594, | |
| "learning_rate": 2.858982311509525e-05, | |
| "loss": 1.1241, | |
| "num_input_tokens_seen": 115795240, | |
| "step": 214000 | |
| }, | |
| { | |
| "epoch": 2.146030094445334, | |
| "grad_norm": 7.513902187347412, | |
| "learning_rate": 2.8539799103569715e-05, | |
| "loss": 1.0984, | |
| "num_input_tokens_seen": 116071656, | |
| "step": 214500 | |
| }, | |
| { | |
| "epoch": 2.151032495597887, | |
| "grad_norm": 5.553924560546875, | |
| "learning_rate": 2.8489775092044185e-05, | |
| "loss": 1.1138, | |
| "num_input_tokens_seen": 116337944, | |
| "step": 215000 | |
| }, | |
| { | |
| "epoch": 2.15603489675044, | |
| "grad_norm": 5.7051920890808105, | |
| "learning_rate": 2.843975108051865e-05, | |
| "loss": 1.1193, | |
| "num_input_tokens_seen": 116608560, | |
| "step": 215500 | |
| }, | |
| { | |
| "epoch": 2.1610372979029933, | |
| "grad_norm": 6.199916362762451, | |
| "learning_rate": 2.8389727068993115e-05, | |
| "loss": 1.114, | |
| "num_input_tokens_seen": 116876312, | |
| "step": 216000 | |
| }, | |
| { | |
| "epoch": 2.166039699055547, | |
| "grad_norm": 6.054383754730225, | |
| "learning_rate": 2.8339703057467588e-05, | |
| "loss": 1.0965, | |
| "num_input_tokens_seen": 117146704, | |
| "step": 216500 | |
| }, | |
| { | |
| "epoch": 2.1710421002081, | |
| "grad_norm": 6.129938125610352, | |
| "learning_rate": 2.828967904594205e-05, | |
| "loss": 1.1191, | |
| "num_input_tokens_seen": 117407640, | |
| "step": 217000 | |
| }, | |
| { | |
| "epoch": 2.176044501360653, | |
| "grad_norm": 8.660636901855469, | |
| "learning_rate": 2.8239655034416524e-05, | |
| "loss": 1.1217, | |
| "num_input_tokens_seen": 117675248, | |
| "step": 217500 | |
| }, | |
| { | |
| "epoch": 2.181046902513206, | |
| "grad_norm": 5.140537261962891, | |
| "learning_rate": 2.8189631022890988e-05, | |
| "loss": 1.106, | |
| "num_input_tokens_seen": 117946856, | |
| "step": 218000 | |
| }, | |
| { | |
| "epoch": 2.18604930366576, | |
| "grad_norm": 8.983773231506348, | |
| "learning_rate": 2.8139607011365454e-05, | |
| "loss": 1.1055, | |
| "num_input_tokens_seen": 118218576, | |
| "step": 218500 | |
| }, | |
| { | |
| "epoch": 2.191051704818313, | |
| "grad_norm": 5.122745513916016, | |
| "learning_rate": 2.8089582999839924e-05, | |
| "loss": 1.1022, | |
| "num_input_tokens_seen": 118493280, | |
| "step": 219000 | |
| }, | |
| { | |
| "epoch": 2.196054105970866, | |
| "grad_norm": 5.861432075500488, | |
| "learning_rate": 2.803955898831439e-05, | |
| "loss": 1.0946, | |
| "num_input_tokens_seen": 118764672, | |
| "step": 219500 | |
| }, | |
| { | |
| "epoch": 2.201056507123419, | |
| "grad_norm": 5.456287384033203, | |
| "learning_rate": 2.798953497678886e-05, | |
| "loss": 1.1136, | |
| "num_input_tokens_seen": 119033632, | |
| "step": 220000 | |
| }, | |
| { | |
| "epoch": 2.2060589082759723, | |
| "grad_norm": 6.379229545593262, | |
| "learning_rate": 2.7939510965263327e-05, | |
| "loss": 1.1123, | |
| "num_input_tokens_seen": 119302136, | |
| "step": 220500 | |
| }, | |
| { | |
| "epoch": 2.211061309428526, | |
| "grad_norm": 7.430028438568115, | |
| "learning_rate": 2.7889486953737797e-05, | |
| "loss": 1.1079, | |
| "num_input_tokens_seen": 119571112, | |
| "step": 221000 | |
| }, | |
| { | |
| "epoch": 2.216063710581079, | |
| "grad_norm": 6.985309600830078, | |
| "learning_rate": 2.7839462942212264e-05, | |
| "loss": 1.142, | |
| "num_input_tokens_seen": 119840376, | |
| "step": 221500 | |
| }, | |
| { | |
| "epoch": 2.221066111733632, | |
| "grad_norm": 5.228456974029541, | |
| "learning_rate": 2.778943893068673e-05, | |
| "loss": 1.111, | |
| "num_input_tokens_seen": 120108496, | |
| "step": 222000 | |
| }, | |
| { | |
| "epoch": 2.2260685128861852, | |
| "grad_norm": 7.293130874633789, | |
| "learning_rate": 2.77394149191612e-05, | |
| "loss": 1.1067, | |
| "num_input_tokens_seen": 120381296, | |
| "step": 222500 | |
| }, | |
| { | |
| "epoch": 2.231070914038739, | |
| "grad_norm": 7.219442367553711, | |
| "learning_rate": 2.7689390907635667e-05, | |
| "loss": 1.1331, | |
| "num_input_tokens_seen": 120646192, | |
| "step": 223000 | |
| }, | |
| { | |
| "epoch": 2.236073315191292, | |
| "grad_norm": 6.636627197265625, | |
| "learning_rate": 2.7639366896110137e-05, | |
| "loss": 1.1266, | |
| "num_input_tokens_seen": 120912232, | |
| "step": 223500 | |
| }, | |
| { | |
| "epoch": 2.241075716343845, | |
| "grad_norm": 6.974771976470947, | |
| "learning_rate": 2.75893428845846e-05, | |
| "loss": 1.1432, | |
| "num_input_tokens_seen": 121178320, | |
| "step": 224000 | |
| }, | |
| { | |
| "epoch": 2.246078117496398, | |
| "grad_norm": 6.00003719329834, | |
| "learning_rate": 2.7539318873059073e-05, | |
| "loss": 1.1228, | |
| "num_input_tokens_seen": 121450400, | |
| "step": 224500 | |
| }, | |
| { | |
| "epoch": 2.2510805186489513, | |
| "grad_norm": 6.582889556884766, | |
| "learning_rate": 2.7489294861533536e-05, | |
| "loss": 1.1236, | |
| "num_input_tokens_seen": 121722344, | |
| "step": 225000 | |
| }, | |
| { | |
| "epoch": 2.256082919801505, | |
| "grad_norm": 4.923620700836182, | |
| "learning_rate": 2.7439270850008003e-05, | |
| "loss": 1.1133, | |
| "num_input_tokens_seen": 121986208, | |
| "step": 225500 | |
| }, | |
| { | |
| "epoch": 2.261085320954058, | |
| "grad_norm": 6.316877365112305, | |
| "learning_rate": 2.7389246838482473e-05, | |
| "loss": 1.1163, | |
| "num_input_tokens_seen": 122249640, | |
| "step": 226000 | |
| }, | |
| { | |
| "epoch": 2.266087722106611, | |
| "grad_norm": 6.2502241134643555, | |
| "learning_rate": 2.733922282695694e-05, | |
| "loss": 1.103, | |
| "num_input_tokens_seen": 122518696, | |
| "step": 226500 | |
| }, | |
| { | |
| "epoch": 2.2710901232591643, | |
| "grad_norm": 8.201516151428223, | |
| "learning_rate": 2.728919881543141e-05, | |
| "loss": 1.1212, | |
| "num_input_tokens_seen": 122783616, | |
| "step": 227000 | |
| }, | |
| { | |
| "epoch": 2.276092524411718, | |
| "grad_norm": 5.959327220916748, | |
| "learning_rate": 2.7239174803905876e-05, | |
| "loss": 1.1096, | |
| "num_input_tokens_seen": 123053552, | |
| "step": 227500 | |
| }, | |
| { | |
| "epoch": 2.281094925564271, | |
| "grad_norm": 9.138140678405762, | |
| "learning_rate": 2.7189150792380342e-05, | |
| "loss": 1.1051, | |
| "num_input_tokens_seen": 123325184, | |
| "step": 228000 | |
| }, | |
| { | |
| "epoch": 2.286097326716824, | |
| "grad_norm": 6.214888572692871, | |
| "learning_rate": 2.7139126780854812e-05, | |
| "loss": 1.1141, | |
| "num_input_tokens_seen": 123600592, | |
| "step": 228500 | |
| }, | |
| { | |
| "epoch": 2.291099727869377, | |
| "grad_norm": 6.67230224609375, | |
| "learning_rate": 2.708910276932928e-05, | |
| "loss": 1.1149, | |
| "num_input_tokens_seen": 123876144, | |
| "step": 229000 | |
| }, | |
| { | |
| "epoch": 2.2961021290219303, | |
| "grad_norm": 7.004880905151367, | |
| "learning_rate": 2.703907875780375e-05, | |
| "loss": 1.1136, | |
| "num_input_tokens_seen": 124149576, | |
| "step": 229500 | |
| }, | |
| { | |
| "epoch": 2.301104530174484, | |
| "grad_norm": 5.232549667358398, | |
| "learning_rate": 2.6989054746278215e-05, | |
| "loss": 1.136, | |
| "num_input_tokens_seen": 124420360, | |
| "step": 230000 | |
| }, | |
| { | |
| "epoch": 2.306106931327037, | |
| "grad_norm": 6.569345951080322, | |
| "learning_rate": 2.6939030734752685e-05, | |
| "loss": 1.1136, | |
| "num_input_tokens_seen": 124694320, | |
| "step": 230500 | |
| }, | |
| { | |
| "epoch": 2.31110933247959, | |
| "grad_norm": 4.602709770202637, | |
| "learning_rate": 2.688900672322715e-05, | |
| "loss": 1.131, | |
| "num_input_tokens_seen": 124973608, | |
| "step": 231000 | |
| }, | |
| { | |
| "epoch": 2.3161117336321433, | |
| "grad_norm": 7.659350872039795, | |
| "learning_rate": 2.6838982711701615e-05, | |
| "loss": 1.1237, | |
| "num_input_tokens_seen": 125247480, | |
| "step": 231500 | |
| }, | |
| { | |
| "epoch": 2.321114134784697, | |
| "grad_norm": 5.581116676330566, | |
| "learning_rate": 2.6788958700176088e-05, | |
| "loss": 1.1373, | |
| "num_input_tokens_seen": 125514944, | |
| "step": 232000 | |
| }, | |
| { | |
| "epoch": 2.32611653593725, | |
| "grad_norm": 6.8799238204956055, | |
| "learning_rate": 2.673893468865055e-05, | |
| "loss": 1.1185, | |
| "num_input_tokens_seen": 125786544, | |
| "step": 232500 | |
| }, | |
| { | |
| "epoch": 2.331118937089803, | |
| "grad_norm": 6.535116195678711, | |
| "learning_rate": 2.6688910677125025e-05, | |
| "loss": 1.1154, | |
| "num_input_tokens_seen": 126061016, | |
| "step": 233000 | |
| }, | |
| { | |
| "epoch": 2.3361213382423562, | |
| "grad_norm": 9.319666862487793, | |
| "learning_rate": 2.6638886665599488e-05, | |
| "loss": 1.1293, | |
| "num_input_tokens_seen": 126326104, | |
| "step": 233500 | |
| }, | |
| { | |
| "epoch": 2.3411237393949094, | |
| "grad_norm": 6.085050582885742, | |
| "learning_rate": 2.6588862654073958e-05, | |
| "loss": 1.1266, | |
| "num_input_tokens_seen": 126600472, | |
| "step": 234000 | |
| }, | |
| { | |
| "epoch": 2.346126140547463, | |
| "grad_norm": 7.938391208648682, | |
| "learning_rate": 2.6538838642548424e-05, | |
| "loss": 1.1222, | |
| "num_input_tokens_seen": 126867552, | |
| "step": 234500 | |
| }, | |
| { | |
| "epoch": 2.351128541700016, | |
| "grad_norm": 6.2780537605285645, | |
| "learning_rate": 2.648881463102289e-05, | |
| "loss": 1.0976, | |
| "num_input_tokens_seen": 127135696, | |
| "step": 235000 | |
| }, | |
| { | |
| "epoch": 2.356130942852569, | |
| "grad_norm": 6.0472731590271, | |
| "learning_rate": 2.643879061949736e-05, | |
| "loss": 1.1141, | |
| "num_input_tokens_seen": 127406128, | |
| "step": 235500 | |
| }, | |
| { | |
| "epoch": 2.3611333440051223, | |
| "grad_norm": 6.907486438751221, | |
| "learning_rate": 2.6388766607971827e-05, | |
| "loss": 1.1149, | |
| "num_input_tokens_seen": 127678464, | |
| "step": 236000 | |
| }, | |
| { | |
| "epoch": 2.366135745157676, | |
| "grad_norm": 6.429139137268066, | |
| "learning_rate": 2.6338742596446297e-05, | |
| "loss": 1.1154, | |
| "num_input_tokens_seen": 127947800, | |
| "step": 236500 | |
| }, | |
| { | |
| "epoch": 2.371138146310229, | |
| "grad_norm": 5.432641506195068, | |
| "learning_rate": 2.6288718584920764e-05, | |
| "loss": 1.1246, | |
| "num_input_tokens_seen": 128221960, | |
| "step": 237000 | |
| }, | |
| { | |
| "epoch": 2.376140547462782, | |
| "grad_norm": 6.486244201660156, | |
| "learning_rate": 2.6238694573395227e-05, | |
| "loss": 1.1295, | |
| "num_input_tokens_seen": 128492192, | |
| "step": 237500 | |
| }, | |
| { | |
| "epoch": 2.3811429486153353, | |
| "grad_norm": 6.889167308807373, | |
| "learning_rate": 2.61886705618697e-05, | |
| "loss": 1.1073, | |
| "num_input_tokens_seen": 128761776, | |
| "step": 238000 | |
| }, | |
| { | |
| "epoch": 2.3861453497678884, | |
| "grad_norm": 5.81854248046875, | |
| "learning_rate": 2.6138646550344163e-05, | |
| "loss": 1.0988, | |
| "num_input_tokens_seen": 129033976, | |
| "step": 238500 | |
| }, | |
| { | |
| "epoch": 2.391147750920442, | |
| "grad_norm": 6.5693864822387695, | |
| "learning_rate": 2.6088622538818637e-05, | |
| "loss": 1.1225, | |
| "num_input_tokens_seen": 129304680, | |
| "step": 239000 | |
| }, | |
| { | |
| "epoch": 2.396150152072995, | |
| "grad_norm": 7.3249969482421875, | |
| "learning_rate": 2.60385985272931e-05, | |
| "loss": 1.1134, | |
| "num_input_tokens_seen": 129577608, | |
| "step": 239500 | |
| }, | |
| { | |
| "epoch": 2.401152553225548, | |
| "grad_norm": 6.37844181060791, | |
| "learning_rate": 2.5988574515767573e-05, | |
| "loss": 1.116, | |
| "num_input_tokens_seen": 129853352, | |
| "step": 240000 | |
| }, | |
| { | |
| "epoch": 2.4061549543781013, | |
| "grad_norm": 6.640512943267822, | |
| "learning_rate": 2.5938550504242036e-05, | |
| "loss": 1.1158, | |
| "num_input_tokens_seen": 130123560, | |
| "step": 240500 | |
| }, | |
| { | |
| "epoch": 2.411157355530655, | |
| "grad_norm": 4.907979965209961, | |
| "learning_rate": 2.5888526492716503e-05, | |
| "loss": 1.1116, | |
| "num_input_tokens_seen": 130395312, | |
| "step": 241000 | |
| }, | |
| { | |
| "epoch": 2.416159756683208, | |
| "grad_norm": 5.592065811157227, | |
| "learning_rate": 2.5838502481190973e-05, | |
| "loss": 1.125, | |
| "num_input_tokens_seen": 130667624, | |
| "step": 241500 | |
| }, | |
| { | |
| "epoch": 2.421162157835761, | |
| "grad_norm": 6.227156639099121, | |
| "learning_rate": 2.578847846966544e-05, | |
| "loss": 1.1101, | |
| "num_input_tokens_seen": 130936288, | |
| "step": 242000 | |
| }, | |
| { | |
| "epoch": 2.4261645589883143, | |
| "grad_norm": 6.889796733856201, | |
| "learning_rate": 2.573845445813991e-05, | |
| "loss": 1.1098, | |
| "num_input_tokens_seen": 131202792, | |
| "step": 242500 | |
| }, | |
| { | |
| "epoch": 2.4311669601408674, | |
| "grad_norm": 6.005047798156738, | |
| "learning_rate": 2.5688430446614376e-05, | |
| "loss": 1.1227, | |
| "num_input_tokens_seen": 131474608, | |
| "step": 243000 | |
| }, | |
| { | |
| "epoch": 2.436169361293421, | |
| "grad_norm": 6.773987293243408, | |
| "learning_rate": 2.5638406435088846e-05, | |
| "loss": 1.1167, | |
| "num_input_tokens_seen": 131739400, | |
| "step": 243500 | |
| }, | |
| { | |
| "epoch": 2.441171762445974, | |
| "grad_norm": 8.459389686584473, | |
| "learning_rate": 2.5588382423563312e-05, | |
| "loss": 1.1275, | |
| "num_input_tokens_seen": 132006752, | |
| "step": 244000 | |
| }, | |
| { | |
| "epoch": 2.4461741635985272, | |
| "grad_norm": 6.094442367553711, | |
| "learning_rate": 2.5538358412037776e-05, | |
| "loss": 1.1035, | |
| "num_input_tokens_seen": 132280008, | |
| "step": 244500 | |
| }, | |
| { | |
| "epoch": 2.4511765647510804, | |
| "grad_norm": 9.516000747680664, | |
| "learning_rate": 2.548833440051225e-05, | |
| "loss": 1.1194, | |
| "num_input_tokens_seen": 132548616, | |
| "step": 245000 | |
| }, | |
| { | |
| "epoch": 2.456178965903634, | |
| "grad_norm": 9.94356918334961, | |
| "learning_rate": 2.5438310388986712e-05, | |
| "loss": 1.1136, | |
| "num_input_tokens_seen": 132827096, | |
| "step": 245500 | |
| }, | |
| { | |
| "epoch": 2.461181367056187, | |
| "grad_norm": 7.003009796142578, | |
| "learning_rate": 2.5388286377461185e-05, | |
| "loss": 1.1257, | |
| "num_input_tokens_seen": 133094168, | |
| "step": 246000 | |
| }, | |
| { | |
| "epoch": 2.46618376820874, | |
| "grad_norm": 6.280598163604736, | |
| "learning_rate": 2.533826236593565e-05, | |
| "loss": 1.1241, | |
| "num_input_tokens_seen": 133362264, | |
| "step": 246500 | |
| }, | |
| { | |
| "epoch": 2.4711861693612933, | |
| "grad_norm": 7.221234321594238, | |
| "learning_rate": 2.5288238354410115e-05, | |
| "loss": 1.1095, | |
| "num_input_tokens_seen": 133628496, | |
| "step": 247000 | |
| }, | |
| { | |
| "epoch": 2.4761885705138464, | |
| "grad_norm": 6.677853584289551, | |
| "learning_rate": 2.5238214342884585e-05, | |
| "loss": 1.1148, | |
| "num_input_tokens_seen": 133902784, | |
| "step": 247500 | |
| }, | |
| { | |
| "epoch": 2.4811909716664, | |
| "grad_norm": 6.834347248077393, | |
| "learning_rate": 2.518819033135905e-05, | |
| "loss": 1.1128, | |
| "num_input_tokens_seen": 134174984, | |
| "step": 248000 | |
| }, | |
| { | |
| "epoch": 2.486193372818953, | |
| "grad_norm": 5.890481948852539, | |
| "learning_rate": 2.513816631983352e-05, | |
| "loss": 1.1181, | |
| "num_input_tokens_seen": 134442008, | |
| "step": 248500 | |
| }, | |
| { | |
| "epoch": 2.4911957739715063, | |
| "grad_norm": 5.24491548538208, | |
| "learning_rate": 2.5088142308307988e-05, | |
| "loss": 1.112, | |
| "num_input_tokens_seen": 134721144, | |
| "step": 249000 | |
| }, | |
| { | |
| "epoch": 2.4961981751240594, | |
| "grad_norm": 6.424367904663086, | |
| "learning_rate": 2.5038118296782458e-05, | |
| "loss": 1.117, | |
| "num_input_tokens_seen": 134996320, | |
| "step": 249500 | |
| }, | |
| { | |
| "epoch": 2.501200576276613, | |
| "grad_norm": 5.759153366088867, | |
| "learning_rate": 2.4988094285256925e-05, | |
| "loss": 1.1146, | |
| "num_input_tokens_seen": 135261680, | |
| "step": 250000 | |
| }, | |
| { | |
| "epoch": 2.506202977429166, | |
| "grad_norm": 6.164818286895752, | |
| "learning_rate": 2.493807027373139e-05, | |
| "loss": 1.1171, | |
| "num_input_tokens_seen": 135530080, | |
| "step": 250500 | |
| }, | |
| { | |
| "epoch": 2.511205378581719, | |
| "grad_norm": 5.746749401092529, | |
| "learning_rate": 2.488804626220586e-05, | |
| "loss": 1.1156, | |
| "num_input_tokens_seen": 135793120, | |
| "step": 251000 | |
| }, | |
| { | |
| "epoch": 2.5162077797342723, | |
| "grad_norm": 8.123281478881836, | |
| "learning_rate": 2.4838022250680328e-05, | |
| "loss": 1.1139, | |
| "num_input_tokens_seen": 136063864, | |
| "step": 251500 | |
| }, | |
| { | |
| "epoch": 2.5212101808868255, | |
| "grad_norm": 5.1486287117004395, | |
| "learning_rate": 2.4787998239154794e-05, | |
| "loss": 1.1146, | |
| "num_input_tokens_seen": 136332816, | |
| "step": 252000 | |
| }, | |
| { | |
| "epoch": 2.526212582039379, | |
| "grad_norm": 5.926784038543701, | |
| "learning_rate": 2.4737974227629264e-05, | |
| "loss": 1.1072, | |
| "num_input_tokens_seen": 136604928, | |
| "step": 252500 | |
| }, | |
| { | |
| "epoch": 2.531214983191932, | |
| "grad_norm": 5.782299041748047, | |
| "learning_rate": 2.468795021610373e-05, | |
| "loss": 1.1114, | |
| "num_input_tokens_seen": 136875976, | |
| "step": 253000 | |
| }, | |
| { | |
| "epoch": 2.5362173843444853, | |
| "grad_norm": 6.699214935302734, | |
| "learning_rate": 2.4637926204578197e-05, | |
| "loss": 1.1129, | |
| "num_input_tokens_seen": 137150704, | |
| "step": 253500 | |
| }, | |
| { | |
| "epoch": 2.5412197854970384, | |
| "grad_norm": 6.502534866333008, | |
| "learning_rate": 2.4587902193052667e-05, | |
| "loss": 1.0949, | |
| "num_input_tokens_seen": 137430640, | |
| "step": 254000 | |
| }, | |
| { | |
| "epoch": 2.546222186649592, | |
| "grad_norm": 6.316598892211914, | |
| "learning_rate": 2.4537878181527134e-05, | |
| "loss": 1.1061, | |
| "num_input_tokens_seen": 137704496, | |
| "step": 254500 | |
| }, | |
| { | |
| "epoch": 2.551224587802145, | |
| "grad_norm": 6.855249881744385, | |
| "learning_rate": 2.44878541700016e-05, | |
| "loss": 1.1147, | |
| "num_input_tokens_seen": 137973064, | |
| "step": 255000 | |
| }, | |
| { | |
| "epoch": 2.5562269889546982, | |
| "grad_norm": 6.485804080963135, | |
| "learning_rate": 2.443783015847607e-05, | |
| "loss": 1.1081, | |
| "num_input_tokens_seen": 138245472, | |
| "step": 255500 | |
| }, | |
| { | |
| "epoch": 2.5612293901072514, | |
| "grad_norm": 5.901826858520508, | |
| "learning_rate": 2.4387806146950537e-05, | |
| "loss": 1.1316, | |
| "num_input_tokens_seen": 138521368, | |
| "step": 256000 | |
| }, | |
| { | |
| "epoch": 2.5662317912598045, | |
| "grad_norm": 10.232002258300781, | |
| "learning_rate": 2.4337782135425007e-05, | |
| "loss": 1.1237, | |
| "num_input_tokens_seen": 138788592, | |
| "step": 256500 | |
| }, | |
| { | |
| "epoch": 2.571234192412358, | |
| "grad_norm": 6.8045148849487305, | |
| "learning_rate": 2.4287758123899473e-05, | |
| "loss": 1.1205, | |
| "num_input_tokens_seen": 139062416, | |
| "step": 257000 | |
| }, | |
| { | |
| "epoch": 2.576236593564911, | |
| "grad_norm": 6.035918712615967, | |
| "learning_rate": 2.4237734112373943e-05, | |
| "loss": 1.1063, | |
| "num_input_tokens_seen": 139341400, | |
| "step": 257500 | |
| }, | |
| { | |
| "epoch": 2.5812389947174643, | |
| "grad_norm": 6.652617454528809, | |
| "learning_rate": 2.418771010084841e-05, | |
| "loss": 1.1018, | |
| "num_input_tokens_seen": 139618008, | |
| "step": 258000 | |
| }, | |
| { | |
| "epoch": 2.5862413958700174, | |
| "grad_norm": 5.396528720855713, | |
| "learning_rate": 2.4137686089322876e-05, | |
| "loss": 1.1036, | |
| "num_input_tokens_seen": 139893680, | |
| "step": 258500 | |
| }, | |
| { | |
| "epoch": 2.591243797022571, | |
| "grad_norm": 7.620987415313721, | |
| "learning_rate": 2.4087662077797343e-05, | |
| "loss": 1.1127, | |
| "num_input_tokens_seen": 140161304, | |
| "step": 259000 | |
| }, | |
| { | |
| "epoch": 2.596246198175124, | |
| "grad_norm": 6.9869279861450195, | |
| "learning_rate": 2.4037638066271813e-05, | |
| "loss": 1.115, | |
| "num_input_tokens_seen": 140437008, | |
| "step": 259500 | |
| }, | |
| { | |
| "epoch": 2.6012485993276773, | |
| "grad_norm": 6.20002555847168, | |
| "learning_rate": 2.398761405474628e-05, | |
| "loss": 1.1184, | |
| "num_input_tokens_seen": 140703800, | |
| "step": 260000 | |
| }, | |
| { | |
| "epoch": 2.6062510004802304, | |
| "grad_norm": 5.8140974044799805, | |
| "learning_rate": 2.393759004322075e-05, | |
| "loss": 1.1048, | |
| "num_input_tokens_seen": 140976992, | |
| "step": 260500 | |
| }, | |
| { | |
| "epoch": 2.6112534016327835, | |
| "grad_norm": 6.32145357131958, | |
| "learning_rate": 2.3887566031695216e-05, | |
| "loss": 1.1023, | |
| "num_input_tokens_seen": 141245264, | |
| "step": 261000 | |
| }, | |
| { | |
| "epoch": 2.616255802785337, | |
| "grad_norm": 6.455646991729736, | |
| "learning_rate": 2.3837542020169682e-05, | |
| "loss": 1.107, | |
| "num_input_tokens_seen": 141517384, | |
| "step": 261500 | |
| }, | |
| { | |
| "epoch": 2.62125820393789, | |
| "grad_norm": 6.573545455932617, | |
| "learning_rate": 2.378751800864415e-05, | |
| "loss": 1.1047, | |
| "num_input_tokens_seen": 141791208, | |
| "step": 262000 | |
| }, | |
| { | |
| "epoch": 2.6262606050904433, | |
| "grad_norm": 9.841447830200195, | |
| "learning_rate": 2.373749399711862e-05, | |
| "loss": 1.0933, | |
| "num_input_tokens_seen": 142065992, | |
| "step": 262500 | |
| }, | |
| { | |
| "epoch": 2.631263006242997, | |
| "grad_norm": 6.491105556488037, | |
| "learning_rate": 2.3687469985593085e-05, | |
| "loss": 1.1132, | |
| "num_input_tokens_seen": 142333808, | |
| "step": 263000 | |
| }, | |
| { | |
| "epoch": 2.63626540739555, | |
| "grad_norm": 5.914114952087402, | |
| "learning_rate": 2.3637445974067555e-05, | |
| "loss": 1.1106, | |
| "num_input_tokens_seen": 142602960, | |
| "step": 263500 | |
| }, | |
| { | |
| "epoch": 2.641267808548103, | |
| "grad_norm": 5.9673261642456055, | |
| "learning_rate": 2.3587421962542022e-05, | |
| "loss": 1.1387, | |
| "num_input_tokens_seen": 142870032, | |
| "step": 264000 | |
| }, | |
| { | |
| "epoch": 2.6462702097006563, | |
| "grad_norm": 6.37895393371582, | |
| "learning_rate": 2.353739795101649e-05, | |
| "loss": 1.1133, | |
| "num_input_tokens_seen": 143142496, | |
| "step": 264500 | |
| }, | |
| { | |
| "epoch": 2.6512726108532094, | |
| "grad_norm": 6.1890692710876465, | |
| "learning_rate": 2.3487373939490955e-05, | |
| "loss": 1.1088, | |
| "num_input_tokens_seen": 143420168, | |
| "step": 265000 | |
| }, | |
| { | |
| "epoch": 2.6562750120057625, | |
| "grad_norm": 5.707185745239258, | |
| "learning_rate": 2.3437349927965425e-05, | |
| "loss": 1.1127, | |
| "num_input_tokens_seen": 143692648, | |
| "step": 265500 | |
| }, | |
| { | |
| "epoch": 2.661277413158316, | |
| "grad_norm": 6.048717975616455, | |
| "learning_rate": 2.338732591643989e-05, | |
| "loss": 1.1068, | |
| "num_input_tokens_seen": 143964856, | |
| "step": 266000 | |
| }, | |
| { | |
| "epoch": 2.6662798143108692, | |
| "grad_norm": 5.904679775238037, | |
| "learning_rate": 2.333730190491436e-05, | |
| "loss": 1.1039, | |
| "num_input_tokens_seen": 144228744, | |
| "step": 266500 | |
| }, | |
| { | |
| "epoch": 2.6712822154634224, | |
| "grad_norm": 6.36087703704834, | |
| "learning_rate": 2.3287277893388828e-05, | |
| "loss": 1.1197, | |
| "num_input_tokens_seen": 144501872, | |
| "step": 267000 | |
| }, | |
| { | |
| "epoch": 2.676284616615976, | |
| "grad_norm": 5.9171576499938965, | |
| "learning_rate": 2.3237253881863298e-05, | |
| "loss": 1.1192, | |
| "num_input_tokens_seen": 144768432, | |
| "step": 267500 | |
| }, | |
| { | |
| "epoch": 2.681287017768529, | |
| "grad_norm": 6.9919514656066895, | |
| "learning_rate": 2.318722987033776e-05, | |
| "loss": 1.1093, | |
| "num_input_tokens_seen": 145035880, | |
| "step": 268000 | |
| }, | |
| { | |
| "epoch": 2.686289418921082, | |
| "grad_norm": 5.2417826652526855, | |
| "learning_rate": 2.313720585881223e-05, | |
| "loss": 1.1293, | |
| "num_input_tokens_seen": 145311552, | |
| "step": 268500 | |
| }, | |
| { | |
| "epoch": 2.6912918200736353, | |
| "grad_norm": 5.52398681640625, | |
| "learning_rate": 2.3087181847286697e-05, | |
| "loss": 1.107, | |
| "num_input_tokens_seen": 145584192, | |
| "step": 269000 | |
| }, | |
| { | |
| "epoch": 2.6962942212261884, | |
| "grad_norm": 6.279477119445801, | |
| "learning_rate": 2.3037157835761167e-05, | |
| "loss": 1.0918, | |
| "num_input_tokens_seen": 145854640, | |
| "step": 269500 | |
| }, | |
| { | |
| "epoch": 2.7012966223787416, | |
| "grad_norm": 8.50329303741455, | |
| "learning_rate": 2.2987133824235634e-05, | |
| "loss": 1.1232, | |
| "num_input_tokens_seen": 146125568, | |
| "step": 270000 | |
| }, | |
| { | |
| "epoch": 2.706299023531295, | |
| "grad_norm": 7.494457721710205, | |
| "learning_rate": 2.2937109812710104e-05, | |
| "loss": 1.1178, | |
| "num_input_tokens_seen": 146388376, | |
| "step": 270500 | |
| }, | |
| { | |
| "epoch": 2.7113014246838483, | |
| "grad_norm": 5.595491886138916, | |
| "learning_rate": 2.2887085801184567e-05, | |
| "loss": 1.1144, | |
| "num_input_tokens_seen": 146655392, | |
| "step": 271000 | |
| }, | |
| { | |
| "epoch": 2.7163038258364014, | |
| "grad_norm": 7.584702968597412, | |
| "learning_rate": 2.2837061789659037e-05, | |
| "loss": 1.107, | |
| "num_input_tokens_seen": 146920408, | |
| "step": 271500 | |
| }, | |
| { | |
| "epoch": 2.721306226988955, | |
| "grad_norm": 5.952847957611084, | |
| "learning_rate": 2.2787037778133504e-05, | |
| "loss": 1.0989, | |
| "num_input_tokens_seen": 147193680, | |
| "step": 272000 | |
| }, | |
| { | |
| "epoch": 2.726308628141508, | |
| "grad_norm": 5.385768413543701, | |
| "learning_rate": 2.2737013766607973e-05, | |
| "loss": 1.1169, | |
| "num_input_tokens_seen": 147465336, | |
| "step": 272500 | |
| }, | |
| { | |
| "epoch": 2.731311029294061, | |
| "grad_norm": 7.199370861053467, | |
| "learning_rate": 2.268698975508244e-05, | |
| "loss": 1.119, | |
| "num_input_tokens_seen": 147733784, | |
| "step": 273000 | |
| }, | |
| { | |
| "epoch": 2.7363134304466143, | |
| "grad_norm": 6.557952880859375, | |
| "learning_rate": 2.263696574355691e-05, | |
| "loss": 1.0966, | |
| "num_input_tokens_seen": 147998984, | |
| "step": 273500 | |
| }, | |
| { | |
| "epoch": 2.7413158315991675, | |
| "grad_norm": 6.291484355926514, | |
| "learning_rate": 2.2586941732031373e-05, | |
| "loss": 1.1039, | |
| "num_input_tokens_seen": 148268168, | |
| "step": 274000 | |
| }, | |
| { | |
| "epoch": 2.7463182327517206, | |
| "grad_norm": 5.747891426086426, | |
| "learning_rate": 2.2536917720505843e-05, | |
| "loss": 1.1006, | |
| "num_input_tokens_seen": 148546456, | |
| "step": 274500 | |
| }, | |
| { | |
| "epoch": 2.751320633904274, | |
| "grad_norm": 5.766910552978516, | |
| "learning_rate": 2.248689370898031e-05, | |
| "loss": 1.1216, | |
| "num_input_tokens_seen": 148815712, | |
| "step": 275000 | |
| }, | |
| { | |
| "epoch": 2.7563230350568273, | |
| "grad_norm": 6.185927391052246, | |
| "learning_rate": 2.243686969745478e-05, | |
| "loss": 1.1007, | |
| "num_input_tokens_seen": 149086416, | |
| "step": 275500 | |
| }, | |
| { | |
| "epoch": 2.7613254362093804, | |
| "grad_norm": 7.301943778991699, | |
| "learning_rate": 2.2386845685929246e-05, | |
| "loss": 1.1182, | |
| "num_input_tokens_seen": 149351152, | |
| "step": 276000 | |
| }, | |
| { | |
| "epoch": 2.766327837361934, | |
| "grad_norm": 4.440983295440674, | |
| "learning_rate": 2.2336821674403716e-05, | |
| "loss": 1.1125, | |
| "num_input_tokens_seen": 149617360, | |
| "step": 276500 | |
| }, | |
| { | |
| "epoch": 2.771330238514487, | |
| "grad_norm": 6.778481483459473, | |
| "learning_rate": 2.2286797662878183e-05, | |
| "loss": 1.1108, | |
| "num_input_tokens_seen": 149884296, | |
| "step": 277000 | |
| }, | |
| { | |
| "epoch": 2.7763326396670402, | |
| "grad_norm": 6.671989440917969, | |
| "learning_rate": 2.223677365135265e-05, | |
| "loss": 1.0942, | |
| "num_input_tokens_seen": 150155088, | |
| "step": 277500 | |
| }, | |
| { | |
| "epoch": 2.7813350408195934, | |
| "grad_norm": 6.532144069671631, | |
| "learning_rate": 2.218674963982712e-05, | |
| "loss": 1.1102, | |
| "num_input_tokens_seen": 150424504, | |
| "step": 278000 | |
| }, | |
| { | |
| "epoch": 2.7863374419721465, | |
| "grad_norm": 7.665340423583984, | |
| "learning_rate": 2.2136725628301586e-05, | |
| "loss": 1.0995, | |
| "num_input_tokens_seen": 150697200, | |
| "step": 278500 | |
| }, | |
| { | |
| "epoch": 2.7913398431246996, | |
| "grad_norm": 8.809953689575195, | |
| "learning_rate": 2.2086701616776052e-05, | |
| "loss": 1.0934, | |
| "num_input_tokens_seen": 150963832, | |
| "step": 279000 | |
| }, | |
| { | |
| "epoch": 2.796342244277253, | |
| "grad_norm": 6.865957260131836, | |
| "learning_rate": 2.2036677605250522e-05, | |
| "loss": 1.0941, | |
| "num_input_tokens_seen": 151236880, | |
| "step": 279500 | |
| }, | |
| { | |
| "epoch": 2.8013446454298063, | |
| "grad_norm": 8.230210304260254, | |
| "learning_rate": 2.198665359372499e-05, | |
| "loss": 1.105, | |
| "num_input_tokens_seen": 151510320, | |
| "step": 280000 | |
| }, | |
| { | |
| "epoch": 2.8063470465823595, | |
| "grad_norm": 5.514502048492432, | |
| "learning_rate": 2.1936629582199455e-05, | |
| "loss": 1.1129, | |
| "num_input_tokens_seen": 151773160, | |
| "step": 280500 | |
| }, | |
| { | |
| "epoch": 2.811349447734913, | |
| "grad_norm": 6.41658353805542, | |
| "learning_rate": 2.1886605570673925e-05, | |
| "loss": 1.1234, | |
| "num_input_tokens_seen": 152043968, | |
| "step": 281000 | |
| }, | |
| { | |
| "epoch": 2.816351848887466, | |
| "grad_norm": 10.474380493164062, | |
| "learning_rate": 2.183658155914839e-05, | |
| "loss": 1.1109, | |
| "num_input_tokens_seen": 152312528, | |
| "step": 281500 | |
| }, | |
| { | |
| "epoch": 2.8213542500400193, | |
| "grad_norm": 6.710339069366455, | |
| "learning_rate": 2.178655754762286e-05, | |
| "loss": 1.1007, | |
| "num_input_tokens_seen": 152583128, | |
| "step": 282000 | |
| }, | |
| { | |
| "epoch": 2.8263566511925724, | |
| "grad_norm": 6.992675304412842, | |
| "learning_rate": 2.1736533536097328e-05, | |
| "loss": 1.0936, | |
| "num_input_tokens_seen": 152859696, | |
| "step": 282500 | |
| }, | |
| { | |
| "epoch": 2.8313590523451255, | |
| "grad_norm": 5.590021133422852, | |
| "learning_rate": 2.1686509524571795e-05, | |
| "loss": 1.1047, | |
| "num_input_tokens_seen": 153129472, | |
| "step": 283000 | |
| }, | |
| { | |
| "epoch": 2.8363614534976787, | |
| "grad_norm": 5.853962421417236, | |
| "learning_rate": 2.1636485513046265e-05, | |
| "loss": 1.116, | |
| "num_input_tokens_seen": 153397560, | |
| "step": 283500 | |
| }, | |
| { | |
| "epoch": 2.8413638546502322, | |
| "grad_norm": 5.7029242515563965, | |
| "learning_rate": 2.158646150152073e-05, | |
| "loss": 1.1153, | |
| "num_input_tokens_seen": 153671176, | |
| "step": 284000 | |
| }, | |
| { | |
| "epoch": 2.8463662558027853, | |
| "grad_norm": 6.952505111694336, | |
| "learning_rate": 2.1536437489995198e-05, | |
| "loss": 1.1153, | |
| "num_input_tokens_seen": 153937560, | |
| "step": 284500 | |
| }, | |
| { | |
| "epoch": 2.8513686569553385, | |
| "grad_norm": 5.1992902755737305, | |
| "learning_rate": 2.1486413478469668e-05, | |
| "loss": 1.0892, | |
| "num_input_tokens_seen": 154210728, | |
| "step": 285000 | |
| }, | |
| { | |
| "epoch": 2.856371058107892, | |
| "grad_norm": 5.967268943786621, | |
| "learning_rate": 2.1436389466944134e-05, | |
| "loss": 1.0971, | |
| "num_input_tokens_seen": 154484848, | |
| "step": 285500 | |
| }, | |
| { | |
| "epoch": 2.861373459260445, | |
| "grad_norm": 7.573243618011475, | |
| "learning_rate": 2.1386365455418604e-05, | |
| "loss": 1.1015, | |
| "num_input_tokens_seen": 154756800, | |
| "step": 286000 | |
| }, | |
| { | |
| "epoch": 2.8663758604129983, | |
| "grad_norm": 6.0880584716796875, | |
| "learning_rate": 2.133634144389307e-05, | |
| "loss": 1.0941, | |
| "num_input_tokens_seen": 155027568, | |
| "step": 286500 | |
| }, | |
| { | |
| "epoch": 2.8713782615655514, | |
| "grad_norm": 8.533178329467773, | |
| "learning_rate": 2.1286317432367537e-05, | |
| "loss": 1.0951, | |
| "num_input_tokens_seen": 155296992, | |
| "step": 287000 | |
| }, | |
| { | |
| "epoch": 2.8763806627181046, | |
| "grad_norm": 6.032339096069336, | |
| "learning_rate": 2.1236293420842004e-05, | |
| "loss": 1.1088, | |
| "num_input_tokens_seen": 155565816, | |
| "step": 287500 | |
| }, | |
| { | |
| "epoch": 2.8813830638706577, | |
| "grad_norm": 7.005359649658203, | |
| "learning_rate": 2.1186269409316474e-05, | |
| "loss": 1.0966, | |
| "num_input_tokens_seen": 155835736, | |
| "step": 288000 | |
| }, | |
| { | |
| "epoch": 2.8863854650232112, | |
| "grad_norm": 6.709108829498291, | |
| "learning_rate": 2.113624539779094e-05, | |
| "loss": 1.0999, | |
| "num_input_tokens_seen": 156105320, | |
| "step": 288500 | |
| }, | |
| { | |
| "epoch": 2.8913878661757644, | |
| "grad_norm": 6.140367031097412, | |
| "learning_rate": 2.108622138626541e-05, | |
| "loss": 1.0919, | |
| "num_input_tokens_seen": 156373336, | |
| "step": 289000 | |
| }, | |
| { | |
| "epoch": 2.8963902673283175, | |
| "grad_norm": 6.799286365509033, | |
| "learning_rate": 2.1036197374739877e-05, | |
| "loss": 1.0977, | |
| "num_input_tokens_seen": 156645480, | |
| "step": 289500 | |
| }, | |
| { | |
| "epoch": 2.901392668480871, | |
| "grad_norm": 7.2591023445129395, | |
| "learning_rate": 2.0986173363214343e-05, | |
| "loss": 1.1045, | |
| "num_input_tokens_seen": 156919152, | |
| "step": 290000 | |
| }, | |
| { | |
| "epoch": 2.906395069633424, | |
| "grad_norm": 6.572688102722168, | |
| "learning_rate": 2.093614935168881e-05, | |
| "loss": 1.0954, | |
| "num_input_tokens_seen": 157189944, | |
| "step": 290500 | |
| }, | |
| { | |
| "epoch": 2.9113974707859773, | |
| "grad_norm": 8.598788261413574, | |
| "learning_rate": 2.088612534016328e-05, | |
| "loss": 1.1201, | |
| "num_input_tokens_seen": 157462520, | |
| "step": 291000 | |
| }, | |
| { | |
| "epoch": 2.9163998719385305, | |
| "grad_norm": 7.680613994598389, | |
| "learning_rate": 2.0836101328637746e-05, | |
| "loss": 1.0966, | |
| "num_input_tokens_seen": 157734256, | |
| "step": 291500 | |
| }, | |
| { | |
| "epoch": 2.9214022730910836, | |
| "grad_norm": 7.925107479095459, | |
| "learning_rate": 2.0786077317112216e-05, | |
| "loss": 1.0918, | |
| "num_input_tokens_seen": 158006528, | |
| "step": 292000 | |
| }, | |
| { | |
| "epoch": 2.9264046742436367, | |
| "grad_norm": 5.118693828582764, | |
| "learning_rate": 2.0736053305586683e-05, | |
| "loss": 1.0932, | |
| "num_input_tokens_seen": 158275552, | |
| "step": 292500 | |
| }, | |
| { | |
| "epoch": 2.9314070753961903, | |
| "grad_norm": 4.836045265197754, | |
| "learning_rate": 2.0686029294061153e-05, | |
| "loss": 1.0973, | |
| "num_input_tokens_seen": 158547424, | |
| "step": 293000 | |
| }, | |
| { | |
| "epoch": 2.9364094765487434, | |
| "grad_norm": 5.422683238983154, | |
| "learning_rate": 2.0636005282535616e-05, | |
| "loss": 1.1073, | |
| "num_input_tokens_seen": 158818568, | |
| "step": 293500 | |
| }, | |
| { | |
| "epoch": 2.9414118777012965, | |
| "grad_norm": 6.608382225036621, | |
| "learning_rate": 2.0585981271010086e-05, | |
| "loss": 1.0836, | |
| "num_input_tokens_seen": 159088240, | |
| "step": 294000 | |
| }, | |
| { | |
| "epoch": 2.94641427885385, | |
| "grad_norm": 5.50745153427124, | |
| "learning_rate": 2.0535957259484552e-05, | |
| "loss": 1.1019, | |
| "num_input_tokens_seen": 159356336, | |
| "step": 294500 | |
| }, | |
| { | |
| "epoch": 2.9514166800064032, | |
| "grad_norm": 5.7239251136779785, | |
| "learning_rate": 2.0485933247959022e-05, | |
| "loss": 1.0824, | |
| "num_input_tokens_seen": 159630264, | |
| "step": 295000 | |
| }, | |
| { | |
| "epoch": 2.9564190811589564, | |
| "grad_norm": 6.992796421051025, | |
| "learning_rate": 2.043590923643349e-05, | |
| "loss": 1.1014, | |
| "num_input_tokens_seen": 159892336, | |
| "step": 295500 | |
| }, | |
| { | |
| "epoch": 2.9614214823115095, | |
| "grad_norm": 6.0249433517456055, | |
| "learning_rate": 2.038588522490796e-05, | |
| "loss": 1.0831, | |
| "num_input_tokens_seen": 160159512, | |
| "step": 296000 | |
| }, | |
| { | |
| "epoch": 2.9664238834640626, | |
| "grad_norm": 5.429805755615234, | |
| "learning_rate": 2.0335861213382422e-05, | |
| "loss": 1.0991, | |
| "num_input_tokens_seen": 160423896, | |
| "step": 296500 | |
| }, | |
| { | |
| "epoch": 2.9714262846166157, | |
| "grad_norm": 7.0117034912109375, | |
| "learning_rate": 2.0285837201856892e-05, | |
| "loss": 1.0682, | |
| "num_input_tokens_seen": 160699992, | |
| "step": 297000 | |
| }, | |
| { | |
| "epoch": 2.9764286857691693, | |
| "grad_norm": 4.545111179351807, | |
| "learning_rate": 2.023581319033136e-05, | |
| "loss": 1.0976, | |
| "num_input_tokens_seen": 160970504, | |
| "step": 297500 | |
| }, | |
| { | |
| "epoch": 2.9814310869217224, | |
| "grad_norm": 7.641571998596191, | |
| "learning_rate": 2.018578917880583e-05, | |
| "loss": 1.1149, | |
| "num_input_tokens_seen": 161244848, | |
| "step": 298000 | |
| }, | |
| { | |
| "epoch": 2.9864334880742756, | |
| "grad_norm": 5.6191205978393555, | |
| "learning_rate": 2.0135765167280295e-05, | |
| "loss": 1.0841, | |
| "num_input_tokens_seen": 161521312, | |
| "step": 298500 | |
| }, | |
| { | |
| "epoch": 2.991435889226829, | |
| "grad_norm": 7.104705810546875, | |
| "learning_rate": 2.0085741155754765e-05, | |
| "loss": 1.1083, | |
| "num_input_tokens_seen": 161787136, | |
| "step": 299000 | |
| }, | |
| { | |
| "epoch": 2.9964382903793823, | |
| "grad_norm": 7.319199085235596, | |
| "learning_rate": 2.0035717144229228e-05, | |
| "loss": 1.0933, | |
| "num_input_tokens_seen": 162058632, | |
| "step": 299500 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_loss": 1.0591504573822021, | |
| "eval_runtime": 193.2048, | |
| "eval_samples_per_second": 1034.679, | |
| "eval_steps_per_second": 129.339, | |
| "num_input_tokens_seen": 162248288, | |
| "step": 299856 | |
| }, | |
| { | |
| "epoch": 3.0014406915319354, | |
| "grad_norm": 6.5569539070129395, | |
| "learning_rate": 1.9985693132703698e-05, | |
| "loss": 1.0801, | |
| "num_input_tokens_seen": 162329952, | |
| "step": 300000 | |
| }, | |
| { | |
| "epoch": 3.0064430926844885, | |
| "grad_norm": 4.675987720489502, | |
| "learning_rate": 1.9935669121178165e-05, | |
| "loss": 0.9946, | |
| "num_input_tokens_seen": 162606936, | |
| "step": 300500 | |
| }, | |
| { | |
| "epoch": 3.0114454938370416, | |
| "grad_norm": 9.786909103393555, | |
| "learning_rate": 1.9885645109652635e-05, | |
| "loss": 1.0173, | |
| "num_input_tokens_seen": 162883112, | |
| "step": 301000 | |
| }, | |
| { | |
| "epoch": 3.016447894989595, | |
| "grad_norm": 7.118892669677734, | |
| "learning_rate": 1.98356210981271e-05, | |
| "loss": 0.9956, | |
| "num_input_tokens_seen": 163158976, | |
| "step": 301500 | |
| }, | |
| { | |
| "epoch": 3.0214502961421483, | |
| "grad_norm": 4.8414411544799805, | |
| "learning_rate": 1.978559708660157e-05, | |
| "loss": 0.9909, | |
| "num_input_tokens_seen": 163431480, | |
| "step": 302000 | |
| }, | |
| { | |
| "epoch": 3.0264526972947015, | |
| "grad_norm": 6.550401210784912, | |
| "learning_rate": 1.9735573075076038e-05, | |
| "loss": 1.0088, | |
| "num_input_tokens_seen": 163705656, | |
| "step": 302500 | |
| }, | |
| { | |
| "epoch": 3.0314550984472546, | |
| "grad_norm": 6.2179694175720215, | |
| "learning_rate": 1.9685549063550504e-05, | |
| "loss": 1.0046, | |
| "num_input_tokens_seen": 163978520, | |
| "step": 303000 | |
| }, | |
| { | |
| "epoch": 3.0364574995998077, | |
| "grad_norm": 5.6524224281311035, | |
| "learning_rate": 1.963552505202497e-05, | |
| "loss": 0.9962, | |
| "num_input_tokens_seen": 164246424, | |
| "step": 303500 | |
| }, | |
| { | |
| "epoch": 3.0414599007523613, | |
| "grad_norm": 6.216259479522705, | |
| "learning_rate": 1.958550104049944e-05, | |
| "loss": 1.0032, | |
| "num_input_tokens_seen": 164520760, | |
| "step": 304000 | |
| }, | |
| { | |
| "epoch": 3.0464623019049144, | |
| "grad_norm": 5.4327311515808105, | |
| "learning_rate": 1.9535477028973907e-05, | |
| "loss": 0.9958, | |
| "num_input_tokens_seen": 164791976, | |
| "step": 304500 | |
| }, | |
| { | |
| "epoch": 3.0514647030574675, | |
| "grad_norm": 6.64623498916626, | |
| "learning_rate": 1.9485453017448377e-05, | |
| "loss": 1.0027, | |
| "num_input_tokens_seen": 165064512, | |
| "step": 305000 | |
| }, | |
| { | |
| "epoch": 3.0564671042100207, | |
| "grad_norm": 5.067431449890137, | |
| "learning_rate": 1.9435429005922844e-05, | |
| "loss": 1.0095, | |
| "num_input_tokens_seen": 165335384, | |
| "step": 305500 | |
| }, | |
| { | |
| "epoch": 3.0614695053625742, | |
| "grad_norm": 5.332586765289307, | |
| "learning_rate": 1.938540499439731e-05, | |
| "loss": 1.0001, | |
| "num_input_tokens_seen": 165600752, | |
| "step": 306000 | |
| }, | |
| { | |
| "epoch": 3.0664719065151274, | |
| "grad_norm": 6.432159900665283, | |
| "learning_rate": 1.933538098287178e-05, | |
| "loss": 0.9977, | |
| "num_input_tokens_seen": 165870160, | |
| "step": 306500 | |
| }, | |
| { | |
| "epoch": 3.0714743076676805, | |
| "grad_norm": 6.297356605529785, | |
| "learning_rate": 1.9285356971346247e-05, | |
| "loss": 0.9981, | |
| "num_input_tokens_seen": 166137752, | |
| "step": 307000 | |
| }, | |
| { | |
| "epoch": 3.0764767088202336, | |
| "grad_norm": 6.82805871963501, | |
| "learning_rate": 1.9235332959820717e-05, | |
| "loss": 1.008, | |
| "num_input_tokens_seen": 166409344, | |
| "step": 307500 | |
| }, | |
| { | |
| "epoch": 3.0814791099727867, | |
| "grad_norm": 5.371485710144043, | |
| "learning_rate": 1.9185308948295183e-05, | |
| "loss": 1.0077, | |
| "num_input_tokens_seen": 166679600, | |
| "step": 308000 | |
| }, | |
| { | |
| "epoch": 3.0864815111253403, | |
| "grad_norm": 5.552392482757568, | |
| "learning_rate": 1.913528493676965e-05, | |
| "loss": 1.0154, | |
| "num_input_tokens_seen": 166951960, | |
| "step": 308500 | |
| }, | |
| { | |
| "epoch": 3.0914839122778934, | |
| "grad_norm": 5.485569953918457, | |
| "learning_rate": 1.908526092524412e-05, | |
| "loss": 1.0183, | |
| "num_input_tokens_seen": 167229768, | |
| "step": 309000 | |
| }, | |
| { | |
| "epoch": 3.0964863134304466, | |
| "grad_norm": 7.161227226257324, | |
| "learning_rate": 1.9035236913718586e-05, | |
| "loss": 0.9953, | |
| "num_input_tokens_seen": 167500816, | |
| "step": 309500 | |
| }, | |
| { | |
| "epoch": 3.1014887145829997, | |
| "grad_norm": 6.685337543487549, | |
| "learning_rate": 1.8985212902193053e-05, | |
| "loss": 1.0126, | |
| "num_input_tokens_seen": 167761944, | |
| "step": 310000 | |
| }, | |
| { | |
| "epoch": 3.1064911157355533, | |
| "grad_norm": 7.007294178009033, | |
| "learning_rate": 1.8935188890667523e-05, | |
| "loss": 1.0168, | |
| "num_input_tokens_seen": 168031200, | |
| "step": 310500 | |
| }, | |
| { | |
| "epoch": 3.1114935168881064, | |
| "grad_norm": 4.6598615646362305, | |
| "learning_rate": 1.888516487914199e-05, | |
| "loss": 0.9972, | |
| "num_input_tokens_seen": 168306704, | |
| "step": 311000 | |
| }, | |
| { | |
| "epoch": 3.1164959180406595, | |
| "grad_norm": 6.378694534301758, | |
| "learning_rate": 1.883514086761646e-05, | |
| "loss": 1.0057, | |
| "num_input_tokens_seen": 168578560, | |
| "step": 311500 | |
| }, | |
| { | |
| "epoch": 3.1214983191932126, | |
| "grad_norm": 4.70497465133667, | |
| "learning_rate": 1.8785116856090926e-05, | |
| "loss": 0.9805, | |
| "num_input_tokens_seen": 168857808, | |
| "step": 312000 | |
| }, | |
| { | |
| "epoch": 3.1265007203457658, | |
| "grad_norm": 6.10917329788208, | |
| "learning_rate": 1.8735092844565392e-05, | |
| "loss": 1.0013, | |
| "num_input_tokens_seen": 169130360, | |
| "step": 312500 | |
| }, | |
| { | |
| "epoch": 3.1315031214983193, | |
| "grad_norm": 5.804021835327148, | |
| "learning_rate": 1.868506883303986e-05, | |
| "loss": 1.0111, | |
| "num_input_tokens_seen": 169403088, | |
| "step": 313000 | |
| }, | |
| { | |
| "epoch": 3.1365055226508725, | |
| "grad_norm": 6.582799911499023, | |
| "learning_rate": 1.863504482151433e-05, | |
| "loss": 1.0181, | |
| "num_input_tokens_seen": 169678360, | |
| "step": 313500 | |
| }, | |
| { | |
| "epoch": 3.1415079238034256, | |
| "grad_norm": 6.149540424346924, | |
| "learning_rate": 1.8585020809988795e-05, | |
| "loss": 1.0122, | |
| "num_input_tokens_seen": 169944568, | |
| "step": 314000 | |
| }, | |
| { | |
| "epoch": 3.1465103249559787, | |
| "grad_norm": 8.258193969726562, | |
| "learning_rate": 1.8534996798463265e-05, | |
| "loss": 0.9965, | |
| "num_input_tokens_seen": 170213776, | |
| "step": 314500 | |
| }, | |
| { | |
| "epoch": 3.1515127261085323, | |
| "grad_norm": 5.790067195892334, | |
| "learning_rate": 1.8484972786937732e-05, | |
| "loss": 1.0083, | |
| "num_input_tokens_seen": 170487696, | |
| "step": 315000 | |
| }, | |
| { | |
| "epoch": 3.1565151272610854, | |
| "grad_norm": 6.756849765777588, | |
| "learning_rate": 1.84349487754122e-05, | |
| "loss": 1.0112, | |
| "num_input_tokens_seen": 170763864, | |
| "step": 315500 | |
| }, | |
| { | |
| "epoch": 3.1615175284136385, | |
| "grad_norm": 8.113907814025879, | |
| "learning_rate": 1.8384924763886665e-05, | |
| "loss": 1.0113, | |
| "num_input_tokens_seen": 171034920, | |
| "step": 316000 | |
| }, | |
| { | |
| "epoch": 3.1665199295661917, | |
| "grad_norm": 6.584122180938721, | |
| "learning_rate": 1.8334900752361135e-05, | |
| "loss": 1.0071, | |
| "num_input_tokens_seen": 171319328, | |
| "step": 316500 | |
| }, | |
| { | |
| "epoch": 3.1715223307187452, | |
| "grad_norm": 7.648674964904785, | |
| "learning_rate": 1.82848767408356e-05, | |
| "loss": 1.0092, | |
| "num_input_tokens_seen": 171587048, | |
| "step": 317000 | |
| }, | |
| { | |
| "epoch": 3.1765247318712984, | |
| "grad_norm": 6.150393486022949, | |
| "learning_rate": 1.823485272931007e-05, | |
| "loss": 1.0213, | |
| "num_input_tokens_seen": 171854768, | |
| "step": 317500 | |
| }, | |
| { | |
| "epoch": 3.1815271330238515, | |
| "grad_norm": 6.078028202056885, | |
| "learning_rate": 1.8184828717784538e-05, | |
| "loss": 1.0129, | |
| "num_input_tokens_seen": 172122080, | |
| "step": 318000 | |
| }, | |
| { | |
| "epoch": 3.1865295341764046, | |
| "grad_norm": 6.360128402709961, | |
| "learning_rate": 1.8134804706259008e-05, | |
| "loss": 0.9966, | |
| "num_input_tokens_seen": 172386096, | |
| "step": 318500 | |
| }, | |
| { | |
| "epoch": 3.1915319353289577, | |
| "grad_norm": 7.205709934234619, | |
| "learning_rate": 1.808478069473347e-05, | |
| "loss": 1.019, | |
| "num_input_tokens_seen": 172656264, | |
| "step": 319000 | |
| }, | |
| { | |
| "epoch": 3.1965343364815113, | |
| "grad_norm": 6.01072883605957, | |
| "learning_rate": 1.803475668320794e-05, | |
| "loss": 1.0104, | |
| "num_input_tokens_seen": 172928696, | |
| "step": 319500 | |
| }, | |
| { | |
| "epoch": 3.2015367376340644, | |
| "grad_norm": 5.552466869354248, | |
| "learning_rate": 1.7984732671682407e-05, | |
| "loss": 1.0045, | |
| "num_input_tokens_seen": 173193664, | |
| "step": 320000 | |
| }, | |
| { | |
| "epoch": 3.2065391387866176, | |
| "grad_norm": 6.396902561187744, | |
| "learning_rate": 1.7934708660156877e-05, | |
| "loss": 1.007, | |
| "num_input_tokens_seen": 173460776, | |
| "step": 320500 | |
| }, | |
| { | |
| "epoch": 3.2115415399391707, | |
| "grad_norm": 5.7529778480529785, | |
| "learning_rate": 1.7884684648631344e-05, | |
| "loss": 1.0185, | |
| "num_input_tokens_seen": 173728072, | |
| "step": 321000 | |
| }, | |
| { | |
| "epoch": 3.2165439410917243, | |
| "grad_norm": 8.143234252929688, | |
| "learning_rate": 1.7834660637105814e-05, | |
| "loss": 1.0306, | |
| "num_input_tokens_seen": 173997832, | |
| "step": 321500 | |
| }, | |
| { | |
| "epoch": 3.2215463422442774, | |
| "grad_norm": 5.61393928527832, | |
| "learning_rate": 1.7784636625580277e-05, | |
| "loss": 1.0206, | |
| "num_input_tokens_seen": 174268112, | |
| "step": 322000 | |
| }, | |
| { | |
| "epoch": 3.2265487433968305, | |
| "grad_norm": 5.928481578826904, | |
| "learning_rate": 1.7734612614054747e-05, | |
| "loss": 0.9988, | |
| "num_input_tokens_seen": 174533880, | |
| "step": 322500 | |
| }, | |
| { | |
| "epoch": 3.2315511445493836, | |
| "grad_norm": 5.389233589172363, | |
| "learning_rate": 1.7684588602529213e-05, | |
| "loss": 1.02, | |
| "num_input_tokens_seen": 174799568, | |
| "step": 323000 | |
| }, | |
| { | |
| "epoch": 3.2365535457019368, | |
| "grad_norm": 7.281908988952637, | |
| "learning_rate": 1.7634564591003683e-05, | |
| "loss": 0.9986, | |
| "num_input_tokens_seen": 175066168, | |
| "step": 323500 | |
| }, | |
| { | |
| "epoch": 3.2415559468544903, | |
| "grad_norm": 6.380090713500977, | |
| "learning_rate": 1.758454057947815e-05, | |
| "loss": 1.0057, | |
| "num_input_tokens_seen": 175336944, | |
| "step": 324000 | |
| }, | |
| { | |
| "epoch": 3.2465583480070435, | |
| "grad_norm": 6.550302982330322, | |
| "learning_rate": 1.753451656795262e-05, | |
| "loss": 1.0171, | |
| "num_input_tokens_seen": 175611424, | |
| "step": 324500 | |
| }, | |
| { | |
| "epoch": 3.2515607491595966, | |
| "grad_norm": 5.882409572601318, | |
| "learning_rate": 1.7484492556427083e-05, | |
| "loss": 1.0112, | |
| "num_input_tokens_seen": 175879768, | |
| "step": 325000 | |
| }, | |
| { | |
| "epoch": 3.2565631503121497, | |
| "grad_norm": 6.047407627105713, | |
| "learning_rate": 1.7434468544901553e-05, | |
| "loss": 1.0332, | |
| "num_input_tokens_seen": 176157824, | |
| "step": 325500 | |
| }, | |
| { | |
| "epoch": 3.2615655514647033, | |
| "grad_norm": 6.9863691329956055, | |
| "learning_rate": 1.738444453337602e-05, | |
| "loss": 1.0008, | |
| "num_input_tokens_seen": 176428696, | |
| "step": 326000 | |
| }, | |
| { | |
| "epoch": 3.2665679526172564, | |
| "grad_norm": 6.332062721252441, | |
| "learning_rate": 1.733442052185049e-05, | |
| "loss": 1.0005, | |
| "num_input_tokens_seen": 176707880, | |
| "step": 326500 | |
| }, | |
| { | |
| "epoch": 3.2715703537698095, | |
| "grad_norm": 5.405006408691406, | |
| "learning_rate": 1.7284396510324956e-05, | |
| "loss": 1.0249, | |
| "num_input_tokens_seen": 176978528, | |
| "step": 327000 | |
| }, | |
| { | |
| "epoch": 3.2765727549223627, | |
| "grad_norm": 6.076756477355957, | |
| "learning_rate": 1.7234372498799426e-05, | |
| "loss": 1.0156, | |
| "num_input_tokens_seen": 177248232, | |
| "step": 327500 | |
| }, | |
| { | |
| "epoch": 3.281575156074916, | |
| "grad_norm": 7.379303932189941, | |
| "learning_rate": 1.7184348487273893e-05, | |
| "loss": 1.0039, | |
| "num_input_tokens_seen": 177519848, | |
| "step": 328000 | |
| }, | |
| { | |
| "epoch": 3.2865775572274694, | |
| "grad_norm": 6.788669109344482, | |
| "learning_rate": 1.713432447574836e-05, | |
| "loss": 1.0092, | |
| "num_input_tokens_seen": 177789288, | |
| "step": 328500 | |
| }, | |
| { | |
| "epoch": 3.2915799583800225, | |
| "grad_norm": 6.320953369140625, | |
| "learning_rate": 1.7084300464222826e-05, | |
| "loss": 1.0143, | |
| "num_input_tokens_seen": 178056680, | |
| "step": 329000 | |
| }, | |
| { | |
| "epoch": 3.2965823595325756, | |
| "grad_norm": 6.267603397369385, | |
| "learning_rate": 1.7034276452697296e-05, | |
| "loss": 1.0023, | |
| "num_input_tokens_seen": 178329192, | |
| "step": 329500 | |
| }, | |
| { | |
| "epoch": 3.3015847606851287, | |
| "grad_norm": 5.770685195922852, | |
| "learning_rate": 1.6984252441171762e-05, | |
| "loss": 1.0279, | |
| "num_input_tokens_seen": 178597800, | |
| "step": 330000 | |
| }, | |
| { | |
| "epoch": 3.3065871618376823, | |
| "grad_norm": 6.4189863204956055, | |
| "learning_rate": 1.6934228429646232e-05, | |
| "loss": 1.0052, | |
| "num_input_tokens_seen": 178869992, | |
| "step": 330500 | |
| }, | |
| { | |
| "epoch": 3.3115895629902354, | |
| "grad_norm": 5.872836589813232, | |
| "learning_rate": 1.68842044181207e-05, | |
| "loss": 0.9991, | |
| "num_input_tokens_seen": 179139968, | |
| "step": 331000 | |
| }, | |
| { | |
| "epoch": 3.3165919641427886, | |
| "grad_norm": 4.180021286010742, | |
| "learning_rate": 1.6834180406595165e-05, | |
| "loss": 1.0087, | |
| "num_input_tokens_seen": 179411592, | |
| "step": 331500 | |
| }, | |
| { | |
| "epoch": 3.3215943652953417, | |
| "grad_norm": 5.875650405883789, | |
| "learning_rate": 1.6784156395069635e-05, | |
| "loss": 1.0076, | |
| "num_input_tokens_seen": 179672616, | |
| "step": 332000 | |
| }, | |
| { | |
| "epoch": 3.326596766447895, | |
| "grad_norm": 5.798732280731201, | |
| "learning_rate": 1.67341323835441e-05, | |
| "loss": 1.0121, | |
| "num_input_tokens_seen": 179943344, | |
| "step": 332500 | |
| }, | |
| { | |
| "epoch": 3.3315991676004484, | |
| "grad_norm": 5.229135513305664, | |
| "learning_rate": 1.6684108372018568e-05, | |
| "loss": 1.018, | |
| "num_input_tokens_seen": 180212504, | |
| "step": 333000 | |
| }, | |
| { | |
| "epoch": 3.3366015687530015, | |
| "grad_norm": 6.477422714233398, | |
| "learning_rate": 1.6634084360493038e-05, | |
| "loss": 0.9992, | |
| "num_input_tokens_seen": 180482184, | |
| "step": 333500 | |
| }, | |
| { | |
| "epoch": 3.3416039699055546, | |
| "grad_norm": 6.4892497062683105, | |
| "learning_rate": 1.6584060348967505e-05, | |
| "loss": 1.0223, | |
| "num_input_tokens_seen": 180754256, | |
| "step": 334000 | |
| }, | |
| { | |
| "epoch": 3.3466063710581078, | |
| "grad_norm": 5.80246639251709, | |
| "learning_rate": 1.653403633744197e-05, | |
| "loss": 1.0102, | |
| "num_input_tokens_seen": 181030712, | |
| "step": 334500 | |
| }, | |
| { | |
| "epoch": 3.3516087722106613, | |
| "grad_norm": 5.75023078918457, | |
| "learning_rate": 1.648401232591644e-05, | |
| "loss": 1.0111, | |
| "num_input_tokens_seen": 181295800, | |
| "step": 335000 | |
| }, | |
| { | |
| "epoch": 3.3566111733632145, | |
| "grad_norm": 5.059189796447754, | |
| "learning_rate": 1.6433988314390908e-05, | |
| "loss": 1.01, | |
| "num_input_tokens_seen": 181561000, | |
| "step": 335500 | |
| }, | |
| { | |
| "epoch": 3.3616135745157676, | |
| "grad_norm": 5.798236846923828, | |
| "learning_rate": 1.6383964302865378e-05, | |
| "loss": 1.0201, | |
| "num_input_tokens_seen": 181836528, | |
| "step": 336000 | |
| }, | |
| { | |
| "epoch": 3.3666159756683207, | |
| "grad_norm": 6.402642250061035, | |
| "learning_rate": 1.6333940291339844e-05, | |
| "loss": 1.0191, | |
| "num_input_tokens_seen": 182106640, | |
| "step": 336500 | |
| }, | |
| { | |
| "epoch": 3.371618376820874, | |
| "grad_norm": 6.876767635345459, | |
| "learning_rate": 1.6283916279814314e-05, | |
| "loss": 1.0171, | |
| "num_input_tokens_seen": 182385832, | |
| "step": 337000 | |
| }, | |
| { | |
| "epoch": 3.3766207779734274, | |
| "grad_norm": 8.243651390075684, | |
| "learning_rate": 1.623389226828878e-05, | |
| "loss": 1.0136, | |
| "num_input_tokens_seen": 182652112, | |
| "step": 337500 | |
| }, | |
| { | |
| "epoch": 3.3816231791259805, | |
| "grad_norm": 6.981409072875977, | |
| "learning_rate": 1.6183868256763247e-05, | |
| "loss": 1.0027, | |
| "num_input_tokens_seen": 182920216, | |
| "step": 338000 | |
| }, | |
| { | |
| "epoch": 3.3866255802785337, | |
| "grad_norm": 5.377172470092773, | |
| "learning_rate": 1.6133844245237714e-05, | |
| "loss": 1.0129, | |
| "num_input_tokens_seen": 183185600, | |
| "step": 338500 | |
| }, | |
| { | |
| "epoch": 3.391627981431087, | |
| "grad_norm": 7.256112575531006, | |
| "learning_rate": 1.6083820233712184e-05, | |
| "loss": 1.0061, | |
| "num_input_tokens_seen": 183453440, | |
| "step": 339000 | |
| }, | |
| { | |
| "epoch": 3.3966303825836404, | |
| "grad_norm": 5.170373916625977, | |
| "learning_rate": 1.603379622218665e-05, | |
| "loss": 1.0127, | |
| "num_input_tokens_seen": 183722560, | |
| "step": 339500 | |
| }, | |
| { | |
| "epoch": 3.4016327837361935, | |
| "grad_norm": 5.4537248611450195, | |
| "learning_rate": 1.598377221066112e-05, | |
| "loss": 1.0084, | |
| "num_input_tokens_seen": 183991520, | |
| "step": 340000 | |
| }, | |
| { | |
| "epoch": 3.4066351848887466, | |
| "grad_norm": 6.006635665893555, | |
| "learning_rate": 1.5933748199135587e-05, | |
| "loss": 1.0017, | |
| "num_input_tokens_seen": 184263976, | |
| "step": 340500 | |
| }, | |
| { | |
| "epoch": 3.4116375860412997, | |
| "grad_norm": 5.732070446014404, | |
| "learning_rate": 1.5883724187610053e-05, | |
| "loss": 1.0124, | |
| "num_input_tokens_seen": 184528864, | |
| "step": 341000 | |
| }, | |
| { | |
| "epoch": 3.416639987193853, | |
| "grad_norm": 7.936917304992676, | |
| "learning_rate": 1.583370017608452e-05, | |
| "loss": 1.0183, | |
| "num_input_tokens_seen": 184798528, | |
| "step": 341500 | |
| }, | |
| { | |
| "epoch": 3.4216423883464064, | |
| "grad_norm": 7.584635257720947, | |
| "learning_rate": 1.578367616455899e-05, | |
| "loss": 1.0236, | |
| "num_input_tokens_seen": 185063968, | |
| "step": 342000 | |
| }, | |
| { | |
| "epoch": 3.4266447894989596, | |
| "grad_norm": 6.110123634338379, | |
| "learning_rate": 1.5733652153033456e-05, | |
| "loss": 1.0113, | |
| "num_input_tokens_seen": 185328760, | |
| "step": 342500 | |
| }, | |
| { | |
| "epoch": 3.4316471906515127, | |
| "grad_norm": 6.163844585418701, | |
| "learning_rate": 1.5683628141507926e-05, | |
| "loss": 1.0143, | |
| "num_input_tokens_seen": 185601312, | |
| "step": 343000 | |
| }, | |
| { | |
| "epoch": 3.436649591804066, | |
| "grad_norm": 6.428155899047852, | |
| "learning_rate": 1.5633604129982393e-05, | |
| "loss": 1.0231, | |
| "num_input_tokens_seen": 185868856, | |
| "step": 343500 | |
| }, | |
| { | |
| "epoch": 3.4416519929566194, | |
| "grad_norm": 4.938517093658447, | |
| "learning_rate": 1.5583580118456863e-05, | |
| "loss": 1.029, | |
| "num_input_tokens_seen": 186141200, | |
| "step": 344000 | |
| }, | |
| { | |
| "epoch": 3.4466543941091725, | |
| "grad_norm": 6.4214019775390625, | |
| "learning_rate": 1.5533556106931326e-05, | |
| "loss": 1.0117, | |
| "num_input_tokens_seen": 186415152, | |
| "step": 344500 | |
| }, | |
| { | |
| "epoch": 3.4516567952617256, | |
| "grad_norm": 5.303710460662842, | |
| "learning_rate": 1.5483532095405796e-05, | |
| "loss": 1.0094, | |
| "num_input_tokens_seen": 186690176, | |
| "step": 345000 | |
| }, | |
| { | |
| "epoch": 3.4566591964142788, | |
| "grad_norm": 4.86320161819458, | |
| "learning_rate": 1.5433508083880262e-05, | |
| "loss": 1.0027, | |
| "num_input_tokens_seen": 186957432, | |
| "step": 345500 | |
| }, | |
| { | |
| "epoch": 3.461661597566832, | |
| "grad_norm": 4.321279048919678, | |
| "learning_rate": 1.5383484072354732e-05, | |
| "loss": 1.0123, | |
| "num_input_tokens_seen": 187226552, | |
| "step": 346000 | |
| }, | |
| { | |
| "epoch": 3.4666639987193855, | |
| "grad_norm": 5.8327860832214355, | |
| "learning_rate": 1.53334600608292e-05, | |
| "loss": 1.0057, | |
| "num_input_tokens_seen": 187497776, | |
| "step": 346500 | |
| }, | |
| { | |
| "epoch": 3.4716663998719386, | |
| "grad_norm": 9.191901206970215, | |
| "learning_rate": 1.528343604930367e-05, | |
| "loss": 1.0008, | |
| "num_input_tokens_seen": 187768112, | |
| "step": 347000 | |
| }, | |
| { | |
| "epoch": 3.4766688010244917, | |
| "grad_norm": 6.153154373168945, | |
| "learning_rate": 1.5233412037778134e-05, | |
| "loss": 1.0, | |
| "num_input_tokens_seen": 188036912, | |
| "step": 347500 | |
| }, | |
| { | |
| "epoch": 3.481671202177045, | |
| "grad_norm": 5.405030250549316, | |
| "learning_rate": 1.5183388026252602e-05, | |
| "loss": 1.0119, | |
| "num_input_tokens_seen": 188312952, | |
| "step": 348000 | |
| }, | |
| { | |
| "epoch": 3.4866736033295984, | |
| "grad_norm": 6.342990398406982, | |
| "learning_rate": 1.513336401472707e-05, | |
| "loss": 1.0101, | |
| "num_input_tokens_seen": 188585776, | |
| "step": 348500 | |
| }, | |
| { | |
| "epoch": 3.4916760044821515, | |
| "grad_norm": 6.2145867347717285, | |
| "learning_rate": 1.5083340003201538e-05, | |
| "loss": 1.0018, | |
| "num_input_tokens_seen": 188856160, | |
| "step": 349000 | |
| }, | |
| { | |
| "epoch": 3.4966784056347047, | |
| "grad_norm": 5.479875564575195, | |
| "learning_rate": 1.5033315991676007e-05, | |
| "loss": 1.0092, | |
| "num_input_tokens_seen": 189125536, | |
| "step": 349500 | |
| }, | |
| { | |
| "epoch": 3.501680806787258, | |
| "grad_norm": 6.38485050201416, | |
| "learning_rate": 1.4983291980150473e-05, | |
| "loss": 1.0074, | |
| "num_input_tokens_seen": 189397856, | |
| "step": 350000 | |
| }, | |
| { | |
| "epoch": 3.506683207939811, | |
| "grad_norm": 5.203739166259766, | |
| "learning_rate": 1.493326796862494e-05, | |
| "loss": 0.9897, | |
| "num_input_tokens_seen": 189664624, | |
| "step": 350500 | |
| }, | |
| { | |
| "epoch": 3.5116856090923645, | |
| "grad_norm": 6.554189682006836, | |
| "learning_rate": 1.4883243957099408e-05, | |
| "loss": 1.0222, | |
| "num_input_tokens_seen": 189931544, | |
| "step": 351000 | |
| }, | |
| { | |
| "epoch": 3.5166880102449176, | |
| "grad_norm": 6.045382022857666, | |
| "learning_rate": 1.4833219945573876e-05, | |
| "loss": 1.0108, | |
| "num_input_tokens_seen": 190201904, | |
| "step": 351500 | |
| }, | |
| { | |
| "epoch": 3.5216904113974707, | |
| "grad_norm": 5.883347988128662, | |
| "learning_rate": 1.4783195934048344e-05, | |
| "loss": 1.0204, | |
| "num_input_tokens_seen": 190469632, | |
| "step": 352000 | |
| }, | |
| { | |
| "epoch": 3.526692812550024, | |
| "grad_norm": 5.156943321228027, | |
| "learning_rate": 1.4733171922522813e-05, | |
| "loss": 0.9963, | |
| "num_input_tokens_seen": 190737512, | |
| "step": 352500 | |
| }, | |
| { | |
| "epoch": 3.5316952137025774, | |
| "grad_norm": 5.740571975708008, | |
| "learning_rate": 1.4683147910997281e-05, | |
| "loss": 1.0157, | |
| "num_input_tokens_seen": 191003792, | |
| "step": 353000 | |
| }, | |
| { | |
| "epoch": 3.5366976148551306, | |
| "grad_norm": 5.744316101074219, | |
| "learning_rate": 1.463312389947175e-05, | |
| "loss": 1.0182, | |
| "num_input_tokens_seen": 191276360, | |
| "step": 353500 | |
| }, | |
| { | |
| "epoch": 3.5417000160076837, | |
| "grad_norm": 6.743235111236572, | |
| "learning_rate": 1.4583099887946214e-05, | |
| "loss": 1.0212, | |
| "num_input_tokens_seen": 191547000, | |
| "step": 354000 | |
| }, | |
| { | |
| "epoch": 3.546702417160237, | |
| "grad_norm": 6.034450531005859, | |
| "learning_rate": 1.4533075876420682e-05, | |
| "loss": 1.0159, | |
| "num_input_tokens_seen": 191816024, | |
| "step": 354500 | |
| }, | |
| { | |
| "epoch": 3.55170481831279, | |
| "grad_norm": 6.9873833656311035, | |
| "learning_rate": 1.448305186489515e-05, | |
| "loss": 0.9996, | |
| "num_input_tokens_seen": 192078648, | |
| "step": 355000 | |
| }, | |
| { | |
| "epoch": 3.5567072194653435, | |
| "grad_norm": 4.8513078689575195, | |
| "learning_rate": 1.4433027853369619e-05, | |
| "loss": 1.0022, | |
| "num_input_tokens_seen": 192346960, | |
| "step": 355500 | |
| }, | |
| { | |
| "epoch": 3.5617096206178966, | |
| "grad_norm": 6.602761268615723, | |
| "learning_rate": 1.4383003841844087e-05, | |
| "loss": 1.0182, | |
| "num_input_tokens_seen": 192617800, | |
| "step": 356000 | |
| }, | |
| { | |
| "epoch": 3.5667120217704498, | |
| "grad_norm": 5.9454851150512695, | |
| "learning_rate": 1.4332979830318555e-05, | |
| "loss": 1.0032, | |
| "num_input_tokens_seen": 192887048, | |
| "step": 356500 | |
| }, | |
| { | |
| "epoch": 3.571714422923003, | |
| "grad_norm": 4.868193626403809, | |
| "learning_rate": 1.428295581879302e-05, | |
| "loss": 0.9844, | |
| "num_input_tokens_seen": 193153600, | |
| "step": 357000 | |
| }, | |
| { | |
| "epoch": 3.5767168240755565, | |
| "grad_norm": 5.1398749351501465, | |
| "learning_rate": 1.4232931807267488e-05, | |
| "loss": 1.0119, | |
| "num_input_tokens_seen": 193428088, | |
| "step": 357500 | |
| }, | |
| { | |
| "epoch": 3.5817192252281096, | |
| "grad_norm": 5.984772682189941, | |
| "learning_rate": 1.4182907795741957e-05, | |
| "loss": 1.0014, | |
| "num_input_tokens_seen": 193702344, | |
| "step": 358000 | |
| }, | |
| { | |
| "epoch": 3.5867216263806627, | |
| "grad_norm": 5.537957191467285, | |
| "learning_rate": 1.4132883784216425e-05, | |
| "loss": 1.0158, | |
| "num_input_tokens_seen": 193976088, | |
| "step": 358500 | |
| }, | |
| { | |
| "epoch": 3.591724027533216, | |
| "grad_norm": 5.605039119720459, | |
| "learning_rate": 1.4082859772690893e-05, | |
| "loss": 1.0175, | |
| "num_input_tokens_seen": 194241888, | |
| "step": 359000 | |
| }, | |
| { | |
| "epoch": 3.596726428685769, | |
| "grad_norm": 8.43393611907959, | |
| "learning_rate": 1.4032835761165361e-05, | |
| "loss": 1.0001, | |
| "num_input_tokens_seen": 194514168, | |
| "step": 359500 | |
| }, | |
| { | |
| "epoch": 3.6017288298383225, | |
| "grad_norm": 6.197558403015137, | |
| "learning_rate": 1.3982811749639826e-05, | |
| "loss": 1.0006, | |
| "num_input_tokens_seen": 194788128, | |
| "step": 360000 | |
| }, | |
| { | |
| "epoch": 3.6067312309908757, | |
| "grad_norm": 6.9046430587768555, | |
| "learning_rate": 1.3932787738114294e-05, | |
| "loss": 1.0163, | |
| "num_input_tokens_seen": 195057368, | |
| "step": 360500 | |
| }, | |
| { | |
| "epoch": 3.611733632143429, | |
| "grad_norm": 6.350090026855469, | |
| "learning_rate": 1.3882763726588763e-05, | |
| "loss": 0.9951, | |
| "num_input_tokens_seen": 195326896, | |
| "step": 361000 | |
| }, | |
| { | |
| "epoch": 3.616736033295982, | |
| "grad_norm": 7.581150531768799, | |
| "learning_rate": 1.3832739715063231e-05, | |
| "loss": 1.0004, | |
| "num_input_tokens_seen": 195599832, | |
| "step": 361500 | |
| }, | |
| { | |
| "epoch": 3.6217384344485355, | |
| "grad_norm": 5.0561017990112305, | |
| "learning_rate": 1.3782715703537699e-05, | |
| "loss": 1.0211, | |
| "num_input_tokens_seen": 195878880, | |
| "step": 362000 | |
| }, | |
| { | |
| "epoch": 3.6267408356010886, | |
| "grad_norm": 6.046396732330322, | |
| "learning_rate": 1.3732691692012167e-05, | |
| "loss": 1.0113, | |
| "num_input_tokens_seen": 196149072, | |
| "step": 362500 | |
| }, | |
| { | |
| "epoch": 3.6317432367536417, | |
| "grad_norm": 6.726164817810059, | |
| "learning_rate": 1.3682667680486636e-05, | |
| "loss": 1.0031, | |
| "num_input_tokens_seen": 196417928, | |
| "step": 363000 | |
| }, | |
| { | |
| "epoch": 3.636745637906195, | |
| "grad_norm": 10.790836334228516, | |
| "learning_rate": 1.36326436689611e-05, | |
| "loss": 1.0076, | |
| "num_input_tokens_seen": 196688920, | |
| "step": 363500 | |
| }, | |
| { | |
| "epoch": 3.641748039058748, | |
| "grad_norm": 6.029910087585449, | |
| "learning_rate": 1.3582619657435569e-05, | |
| "loss": 1.0027, | |
| "num_input_tokens_seen": 196965448, | |
| "step": 364000 | |
| }, | |
| { | |
| "epoch": 3.6467504402113016, | |
| "grad_norm": 4.5111308097839355, | |
| "learning_rate": 1.3532595645910037e-05, | |
| "loss": 0.9984, | |
| "num_input_tokens_seen": 197241664, | |
| "step": 364500 | |
| }, | |
| { | |
| "epoch": 3.6517528413638547, | |
| "grad_norm": 7.830237865447998, | |
| "learning_rate": 1.3482571634384505e-05, | |
| "loss": 1.0109, | |
| "num_input_tokens_seen": 197508048, | |
| "step": 365000 | |
| }, | |
| { | |
| "epoch": 3.656755242516408, | |
| "grad_norm": 6.430135726928711, | |
| "learning_rate": 1.3432547622858973e-05, | |
| "loss": 1.0041, | |
| "num_input_tokens_seen": 197782528, | |
| "step": 365500 | |
| }, | |
| { | |
| "epoch": 3.661757643668961, | |
| "grad_norm": 8.756442070007324, | |
| "learning_rate": 1.3382523611333442e-05, | |
| "loss": 0.997, | |
| "num_input_tokens_seen": 198054664, | |
| "step": 366000 | |
| }, | |
| { | |
| "epoch": 3.6667600448215145, | |
| "grad_norm": 6.538286209106445, | |
| "learning_rate": 1.3332499599807907e-05, | |
| "loss": 1.0085, | |
| "num_input_tokens_seen": 198320680, | |
| "step": 366500 | |
| }, | |
| { | |
| "epoch": 3.6717624459740676, | |
| "grad_norm": 7.97443151473999, | |
| "learning_rate": 1.3282475588282375e-05, | |
| "loss": 1.0125, | |
| "num_input_tokens_seen": 198590752, | |
| "step": 367000 | |
| }, | |
| { | |
| "epoch": 3.6767648471266208, | |
| "grad_norm": 5.407761573791504, | |
| "learning_rate": 1.3232451576756843e-05, | |
| "loss": 1.029, | |
| "num_input_tokens_seen": 198861544, | |
| "step": 367500 | |
| }, | |
| { | |
| "epoch": 3.681767248279174, | |
| "grad_norm": 6.2920355796813965, | |
| "learning_rate": 1.3182427565231311e-05, | |
| "loss": 1.0131, | |
| "num_input_tokens_seen": 199129704, | |
| "step": 368000 | |
| }, | |
| { | |
| "epoch": 3.686769649431727, | |
| "grad_norm": 8.73907470703125, | |
| "learning_rate": 1.313240355370578e-05, | |
| "loss": 1.0051, | |
| "num_input_tokens_seen": 199397264, | |
| "step": 368500 | |
| }, | |
| { | |
| "epoch": 3.6917720505842806, | |
| "grad_norm": 6.030662536621094, | |
| "learning_rate": 1.3082379542180248e-05, | |
| "loss": 1.008, | |
| "num_input_tokens_seen": 199664200, | |
| "step": 369000 | |
| }, | |
| { | |
| "epoch": 3.6967744517368337, | |
| "grad_norm": 6.953051567077637, | |
| "learning_rate": 1.3032355530654716e-05, | |
| "loss": 1.0075, | |
| "num_input_tokens_seen": 199933048, | |
| "step": 369500 | |
| }, | |
| { | |
| "epoch": 3.701776852889387, | |
| "grad_norm": 6.4026618003845215, | |
| "learning_rate": 1.2982331519129181e-05, | |
| "loss": 0.9982, | |
| "num_input_tokens_seen": 200203568, | |
| "step": 370000 | |
| }, | |
| { | |
| "epoch": 3.70677925404194, | |
| "grad_norm": 8.120702743530273, | |
| "learning_rate": 1.2932307507603649e-05, | |
| "loss": 1.0093, | |
| "num_input_tokens_seen": 200475136, | |
| "step": 370500 | |
| }, | |
| { | |
| "epoch": 3.7117816551944935, | |
| "grad_norm": 4.819450855255127, | |
| "learning_rate": 1.2882283496078117e-05, | |
| "loss": 0.9915, | |
| "num_input_tokens_seen": 200751456, | |
| "step": 371000 | |
| }, | |
| { | |
| "epoch": 3.7167840563470467, | |
| "grad_norm": 6.006054878234863, | |
| "learning_rate": 1.2832259484552586e-05, | |
| "loss": 0.9993, | |
| "num_input_tokens_seen": 201022872, | |
| "step": 371500 | |
| }, | |
| { | |
| "epoch": 3.7217864574996, | |
| "grad_norm": 5.173740386962891, | |
| "learning_rate": 1.2782235473027054e-05, | |
| "loss": 1.0043, | |
| "num_input_tokens_seen": 201293168, | |
| "step": 372000 | |
| }, | |
| { | |
| "epoch": 3.726788858652153, | |
| "grad_norm": 4.628252983093262, | |
| "learning_rate": 1.2732211461501522e-05, | |
| "loss": 1.0047, | |
| "num_input_tokens_seen": 201566544, | |
| "step": 372500 | |
| }, | |
| { | |
| "epoch": 3.731791259804706, | |
| "grad_norm": 5.305530548095703, | |
| "learning_rate": 1.2682187449975989e-05, | |
| "loss": 1.0084, | |
| "num_input_tokens_seen": 201835288, | |
| "step": 373000 | |
| }, | |
| { | |
| "epoch": 3.7367936609572596, | |
| "grad_norm": 4.96281623840332, | |
| "learning_rate": 1.2632163438450457e-05, | |
| "loss": 1.009, | |
| "num_input_tokens_seen": 202106240, | |
| "step": 373500 | |
| }, | |
| { | |
| "epoch": 3.7417960621098127, | |
| "grad_norm": 5.782063007354736, | |
| "learning_rate": 1.2582139426924925e-05, | |
| "loss": 0.9932, | |
| "num_input_tokens_seen": 202380080, | |
| "step": 374000 | |
| }, | |
| { | |
| "epoch": 3.746798463262366, | |
| "grad_norm": 5.399883270263672, | |
| "learning_rate": 1.2532115415399392e-05, | |
| "loss": 1.0138, | |
| "num_input_tokens_seen": 202647520, | |
| "step": 374500 | |
| }, | |
| { | |
| "epoch": 3.751800864414919, | |
| "grad_norm": 5.419944763183594, | |
| "learning_rate": 1.248209140387386e-05, | |
| "loss": 0.9949, | |
| "num_input_tokens_seen": 202913024, | |
| "step": 375000 | |
| }, | |
| { | |
| "epoch": 3.7568032655674726, | |
| "grad_norm": 5.332630157470703, | |
| "learning_rate": 1.2432067392348328e-05, | |
| "loss": 1.0018, | |
| "num_input_tokens_seen": 203175136, | |
| "step": 375500 | |
| }, | |
| { | |
| "epoch": 3.7618056667200257, | |
| "grad_norm": 5.563337802886963, | |
| "learning_rate": 1.2382043380822796e-05, | |
| "loss": 1.0257, | |
| "num_input_tokens_seen": 203444624, | |
| "step": 376000 | |
| }, | |
| { | |
| "epoch": 3.766808067872579, | |
| "grad_norm": 7.322454452514648, | |
| "learning_rate": 1.2332019369297265e-05, | |
| "loss": 1.0097, | |
| "num_input_tokens_seen": 203721152, | |
| "step": 376500 | |
| }, | |
| { | |
| "epoch": 3.771810469025132, | |
| "grad_norm": 5.674718379974365, | |
| "learning_rate": 1.2281995357771731e-05, | |
| "loss": 0.9849, | |
| "num_input_tokens_seen": 203993696, | |
| "step": 377000 | |
| }, | |
| { | |
| "epoch": 3.776812870177685, | |
| "grad_norm": 6.736847877502441, | |
| "learning_rate": 1.22319713462462e-05, | |
| "loss": 0.9913, | |
| "num_input_tokens_seen": 204263944, | |
| "step": 377500 | |
| }, | |
| { | |
| "epoch": 3.7818152713302386, | |
| "grad_norm": 6.920697212219238, | |
| "learning_rate": 1.2181947334720668e-05, | |
| "loss": 0.9984, | |
| "num_input_tokens_seen": 204534952, | |
| "step": 378000 | |
| }, | |
| { | |
| "epoch": 3.7868176724827918, | |
| "grad_norm": 6.516974449157715, | |
| "learning_rate": 1.2131923323195134e-05, | |
| "loss": 1.01, | |
| "num_input_tokens_seen": 204808160, | |
| "step": 378500 | |
| }, | |
| { | |
| "epoch": 3.791820073635345, | |
| "grad_norm": 5.656439781188965, | |
| "learning_rate": 1.2081899311669602e-05, | |
| "loss": 1.0013, | |
| "num_input_tokens_seen": 205082712, | |
| "step": 379000 | |
| }, | |
| { | |
| "epoch": 3.796822474787898, | |
| "grad_norm": 4.886724472045898, | |
| "learning_rate": 1.203187530014407e-05, | |
| "loss": 1.011, | |
| "num_input_tokens_seen": 205354224, | |
| "step": 379500 | |
| }, | |
| { | |
| "epoch": 3.8018248759404516, | |
| "grad_norm": 8.127188682556152, | |
| "learning_rate": 1.1981851288618537e-05, | |
| "loss": 1.0004, | |
| "num_input_tokens_seen": 205626120, | |
| "step": 380000 | |
| }, | |
| { | |
| "epoch": 3.8068272770930047, | |
| "grad_norm": 7.59630823135376, | |
| "learning_rate": 1.1931827277093005e-05, | |
| "loss": 1.0078, | |
| "num_input_tokens_seen": 205895872, | |
| "step": 380500 | |
| }, | |
| { | |
| "epoch": 3.811829678245558, | |
| "grad_norm": 5.615649700164795, | |
| "learning_rate": 1.1881803265567474e-05, | |
| "loss": 0.9887, | |
| "num_input_tokens_seen": 206161088, | |
| "step": 381000 | |
| }, | |
| { | |
| "epoch": 3.816832079398111, | |
| "grad_norm": 5.560026168823242, | |
| "learning_rate": 1.183177925404194e-05, | |
| "loss": 0.9986, | |
| "num_input_tokens_seen": 206434032, | |
| "step": 381500 | |
| }, | |
| { | |
| "epoch": 3.821834480550664, | |
| "grad_norm": 4.2973480224609375, | |
| "learning_rate": 1.1781755242516409e-05, | |
| "loss": 0.9843, | |
| "num_input_tokens_seen": 206711936, | |
| "step": 382000 | |
| }, | |
| { | |
| "epoch": 3.8268368817032177, | |
| "grad_norm": 4.312121391296387, | |
| "learning_rate": 1.1731731230990877e-05, | |
| "loss": 0.995, | |
| "num_input_tokens_seen": 206984224, | |
| "step": 382500 | |
| }, | |
| { | |
| "epoch": 3.831839282855771, | |
| "grad_norm": 5.747461795806885, | |
| "learning_rate": 1.1681707219465345e-05, | |
| "loss": 1.0087, | |
| "num_input_tokens_seen": 207254056, | |
| "step": 383000 | |
| }, | |
| { | |
| "epoch": 3.836841684008324, | |
| "grad_norm": 5.281491279602051, | |
| "learning_rate": 1.1631683207939812e-05, | |
| "loss": 1.0027, | |
| "num_input_tokens_seen": 207532144, | |
| "step": 383500 | |
| }, | |
| { | |
| "epoch": 3.8418440851608775, | |
| "grad_norm": 6.745446681976318, | |
| "learning_rate": 1.158165919641428e-05, | |
| "loss": 1.0079, | |
| "num_input_tokens_seen": 207801576, | |
| "step": 384000 | |
| }, | |
| { | |
| "epoch": 3.8468464863134306, | |
| "grad_norm": 6.459105968475342, | |
| "learning_rate": 1.1531635184888748e-05, | |
| "loss": 1.0094, | |
| "num_input_tokens_seen": 208065512, | |
| "step": 384500 | |
| }, | |
| { | |
| "epoch": 3.8518488874659838, | |
| "grad_norm": 5.599144458770752, | |
| "learning_rate": 1.1481611173363215e-05, | |
| "loss": 0.9959, | |
| "num_input_tokens_seen": 208339576, | |
| "step": 385000 | |
| }, | |
| { | |
| "epoch": 3.856851288618537, | |
| "grad_norm": 7.585973739624023, | |
| "learning_rate": 1.1431587161837683e-05, | |
| "loss": 1.0037, | |
| "num_input_tokens_seen": 208617584, | |
| "step": 385500 | |
| }, | |
| { | |
| "epoch": 3.86185368977109, | |
| "grad_norm": 6.892619609832764, | |
| "learning_rate": 1.1381563150312151e-05, | |
| "loss": 0.9997, | |
| "num_input_tokens_seen": 208880256, | |
| "step": 386000 | |
| }, | |
| { | |
| "epoch": 3.866856090923643, | |
| "grad_norm": 7.016179084777832, | |
| "learning_rate": 1.1331539138786618e-05, | |
| "loss": 1.0165, | |
| "num_input_tokens_seen": 209152592, | |
| "step": 386500 | |
| }, | |
| { | |
| "epoch": 3.8718584920761967, | |
| "grad_norm": 7.874701976776123, | |
| "learning_rate": 1.1281515127261086e-05, | |
| "loss": 0.9907, | |
| "num_input_tokens_seen": 209423248, | |
| "step": 387000 | |
| }, | |
| { | |
| "epoch": 3.87686089322875, | |
| "grad_norm": 5.5261030197143555, | |
| "learning_rate": 1.1231491115735554e-05, | |
| "loss": 1.0029, | |
| "num_input_tokens_seen": 209692384, | |
| "step": 387500 | |
| }, | |
| { | |
| "epoch": 3.881863294381303, | |
| "grad_norm": 8.382930755615234, | |
| "learning_rate": 1.118146710421002e-05, | |
| "loss": 0.9865, | |
| "num_input_tokens_seen": 209961656, | |
| "step": 388000 | |
| }, | |
| { | |
| "epoch": 3.8868656955338565, | |
| "grad_norm": 4.629281044006348, | |
| "learning_rate": 1.1131443092684489e-05, | |
| "loss": 0.995, | |
| "num_input_tokens_seen": 210227928, | |
| "step": 388500 | |
| }, | |
| { | |
| "epoch": 3.8918680966864097, | |
| "grad_norm": 4.65085506439209, | |
| "learning_rate": 1.1081419081158957e-05, | |
| "loss": 0.9815, | |
| "num_input_tokens_seen": 210494672, | |
| "step": 389000 | |
| }, | |
| { | |
| "epoch": 3.8968704978389628, | |
| "grad_norm": 5.350659370422363, | |
| "learning_rate": 1.1031395069633424e-05, | |
| "loss": 0.9988, | |
| "num_input_tokens_seen": 210757368, | |
| "step": 389500 | |
| }, | |
| { | |
| "epoch": 3.901872898991516, | |
| "grad_norm": 6.074803829193115, | |
| "learning_rate": 1.0981371058107892e-05, | |
| "loss": 0.9996, | |
| "num_input_tokens_seen": 211028544, | |
| "step": 390000 | |
| }, | |
| { | |
| "epoch": 3.906875300144069, | |
| "grad_norm": 5.179644584655762, | |
| "learning_rate": 1.093134704658236e-05, | |
| "loss": 0.99, | |
| "num_input_tokens_seen": 211297312, | |
| "step": 390500 | |
| }, | |
| { | |
| "epoch": 3.911877701296622, | |
| "grad_norm": 6.687560081481934, | |
| "learning_rate": 1.0881323035056828e-05, | |
| "loss": 1.0047, | |
| "num_input_tokens_seen": 211563536, | |
| "step": 391000 | |
| }, | |
| { | |
| "epoch": 3.9168801024491757, | |
| "grad_norm": 5.4230570793151855, | |
| "learning_rate": 1.0831299023531295e-05, | |
| "loss": 1.0096, | |
| "num_input_tokens_seen": 211837976, | |
| "step": 391500 | |
| }, | |
| { | |
| "epoch": 3.921882503601729, | |
| "grad_norm": 5.730657577514648, | |
| "learning_rate": 1.0781275012005763e-05, | |
| "loss": 1.0044, | |
| "num_input_tokens_seen": 212105824, | |
| "step": 392000 | |
| }, | |
| { | |
| "epoch": 3.926884904754282, | |
| "grad_norm": 6.501159191131592, | |
| "learning_rate": 1.0731251000480231e-05, | |
| "loss": 0.9933, | |
| "num_input_tokens_seen": 212377128, | |
| "step": 392500 | |
| }, | |
| { | |
| "epoch": 3.9318873059068356, | |
| "grad_norm": 6.957518100738525, | |
| "learning_rate": 1.0681226988954698e-05, | |
| "loss": 0.9987, | |
| "num_input_tokens_seen": 212653848, | |
| "step": 393000 | |
| }, | |
| { | |
| "epoch": 3.9368897070593887, | |
| "grad_norm": 6.272824287414551, | |
| "learning_rate": 1.0631202977429166e-05, | |
| "loss": 0.9901, | |
| "num_input_tokens_seen": 212924520, | |
| "step": 393500 | |
| }, | |
| { | |
| "epoch": 3.941892108211942, | |
| "grad_norm": 7.048558712005615, | |
| "learning_rate": 1.0581178965903634e-05, | |
| "loss": 0.9943, | |
| "num_input_tokens_seen": 213198256, | |
| "step": 394000 | |
| }, | |
| { | |
| "epoch": 3.946894509364495, | |
| "grad_norm": 6.269680976867676, | |
| "learning_rate": 1.0531154954378101e-05, | |
| "loss": 0.9846, | |
| "num_input_tokens_seen": 213471024, | |
| "step": 394500 | |
| }, | |
| { | |
| "epoch": 3.951896910517048, | |
| "grad_norm": 5.69096565246582, | |
| "learning_rate": 1.048113094285257e-05, | |
| "loss": 0.9993, | |
| "num_input_tokens_seen": 213740224, | |
| "step": 395000 | |
| }, | |
| { | |
| "epoch": 3.956899311669601, | |
| "grad_norm": 6.711835861206055, | |
| "learning_rate": 1.0431106931327038e-05, | |
| "loss": 0.9994, | |
| "num_input_tokens_seen": 214004440, | |
| "step": 395500 | |
| }, | |
| { | |
| "epoch": 3.9619017128221548, | |
| "grad_norm": 9.549476623535156, | |
| "learning_rate": 1.0381082919801504e-05, | |
| "loss": 0.9853, | |
| "num_input_tokens_seen": 214275152, | |
| "step": 396000 | |
| }, | |
| { | |
| "epoch": 3.966904113974708, | |
| "grad_norm": 6.297016620635986, | |
| "learning_rate": 1.0331058908275972e-05, | |
| "loss": 0.9931, | |
| "num_input_tokens_seen": 214549672, | |
| "step": 396500 | |
| }, | |
| { | |
| "epoch": 3.971906515127261, | |
| "grad_norm": 5.232683181762695, | |
| "learning_rate": 1.028103489675044e-05, | |
| "loss": 0.9943, | |
| "num_input_tokens_seen": 214823176, | |
| "step": 397000 | |
| }, | |
| { | |
| "epoch": 3.9769089162798146, | |
| "grad_norm": 6.6180219650268555, | |
| "learning_rate": 1.0231010885224907e-05, | |
| "loss": 0.9965, | |
| "num_input_tokens_seen": 215092304, | |
| "step": 397500 | |
| }, | |
| { | |
| "epoch": 3.9819113174323677, | |
| "grad_norm": 6.272809982299805, | |
| "learning_rate": 1.0180986873699375e-05, | |
| "loss": 0.9773, | |
| "num_input_tokens_seen": 215364504, | |
| "step": 398000 | |
| }, | |
| { | |
| "epoch": 3.986913718584921, | |
| "grad_norm": 6.778554916381836, | |
| "learning_rate": 1.0130962862173844e-05, | |
| "loss": 1.0018, | |
| "num_input_tokens_seen": 215636456, | |
| "step": 398500 | |
| }, | |
| { | |
| "epoch": 3.991916119737474, | |
| "grad_norm": 5.8704071044921875, | |
| "learning_rate": 1.0080938850648312e-05, | |
| "loss": 0.9982, | |
| "num_input_tokens_seen": 215902264, | |
| "step": 399000 | |
| }, | |
| { | |
| "epoch": 3.996918520890027, | |
| "grad_norm": 6.430477142333984, | |
| "learning_rate": 1.0030914839122778e-05, | |
| "loss": 0.9897, | |
| "num_input_tokens_seen": 216171688, | |
| "step": 399500 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "eval_loss": 1.0106587409973145, | |
| "eval_runtime": 189.1748, | |
| "eval_samples_per_second": 1056.721, | |
| "eval_steps_per_second": 132.095, | |
| "num_input_tokens_seen": 216341560, | |
| "step": 399808 | |
| }, | |
| { | |
| "epoch": 4.00192092204258, | |
| "grad_norm": 6.6500749588012695, | |
| "learning_rate": 9.980890827597247e-06, | |
| "loss": 0.9608, | |
| "num_input_tokens_seen": 216448456, | |
| "step": 400000 | |
| }, | |
| { | |
| "epoch": 4.006923323195133, | |
| "grad_norm": 5.360565662384033, | |
| "learning_rate": 9.930866816071715e-06, | |
| "loss": 0.9449, | |
| "num_input_tokens_seen": 216721080, | |
| "step": 400500 | |
| }, | |
| { | |
| "epoch": 4.0119257243476865, | |
| "grad_norm": 9.556127548217773, | |
| "learning_rate": 9.880842804546183e-06, | |
| "loss": 0.9209, | |
| "num_input_tokens_seen": 216986544, | |
| "step": 401000 | |
| }, | |
| { | |
| "epoch": 4.0169281255002405, | |
| "grad_norm": 5.1922502517700195, | |
| "learning_rate": 9.830818793020651e-06, | |
| "loss": 0.9175, | |
| "num_input_tokens_seen": 217262208, | |
| "step": 401500 | |
| }, | |
| { | |
| "epoch": 4.021930526652794, | |
| "grad_norm": 6.372469902038574, | |
| "learning_rate": 9.780794781495118e-06, | |
| "loss": 0.9079, | |
| "num_input_tokens_seen": 217532376, | |
| "step": 402000 | |
| }, | |
| { | |
| "epoch": 4.026932927805347, | |
| "grad_norm": 5.694700717926025, | |
| "learning_rate": 9.730770769969586e-06, | |
| "loss": 0.9205, | |
| "num_input_tokens_seen": 217800456, | |
| "step": 402500 | |
| }, | |
| { | |
| "epoch": 4.0319353289579, | |
| "grad_norm": 7.39500617980957, | |
| "learning_rate": 9.680746758444054e-06, | |
| "loss": 0.9262, | |
| "num_input_tokens_seen": 218065808, | |
| "step": 403000 | |
| }, | |
| { | |
| "epoch": 4.036937730110453, | |
| "grad_norm": 5.675652980804443, | |
| "learning_rate": 9.630722746918523e-06, | |
| "loss": 0.9128, | |
| "num_input_tokens_seen": 218330520, | |
| "step": 403500 | |
| }, | |
| { | |
| "epoch": 4.041940131263006, | |
| "grad_norm": 4.7625203132629395, | |
| "learning_rate": 9.58069873539299e-06, | |
| "loss": 0.9438, | |
| "num_input_tokens_seen": 218603792, | |
| "step": 404000 | |
| }, | |
| { | |
| "epoch": 4.046942532415559, | |
| "grad_norm": 5.834499835968018, | |
| "learning_rate": 9.530674723867457e-06, | |
| "loss": 0.9191, | |
| "num_input_tokens_seen": 218870800, | |
| "step": 404500 | |
| }, | |
| { | |
| "epoch": 4.051944933568112, | |
| "grad_norm": 7.351385116577148, | |
| "learning_rate": 9.480650712341926e-06, | |
| "loss": 0.9257, | |
| "num_input_tokens_seen": 219142384, | |
| "step": 405000 | |
| }, | |
| { | |
| "epoch": 4.0569473347206655, | |
| "grad_norm": 7.003774166107178, | |
| "learning_rate": 9.430626700816392e-06, | |
| "loss": 0.935, | |
| "num_input_tokens_seen": 219404752, | |
| "step": 405500 | |
| }, | |
| { | |
| "epoch": 4.0619497358732195, | |
| "grad_norm": 6.0088019371032715, | |
| "learning_rate": 9.38060268929086e-06, | |
| "loss": 0.9307, | |
| "num_input_tokens_seen": 219675200, | |
| "step": 406000 | |
| }, | |
| { | |
| "epoch": 4.066952137025773, | |
| "grad_norm": 6.115697383880615, | |
| "learning_rate": 9.330578677765329e-06, | |
| "loss": 0.9281, | |
| "num_input_tokens_seen": 219950728, | |
| "step": 406500 | |
| }, | |
| { | |
| "epoch": 4.071954538178326, | |
| "grad_norm": 5.1921868324279785, | |
| "learning_rate": 9.280554666239795e-06, | |
| "loss": 0.9391, | |
| "num_input_tokens_seen": 220221608, | |
| "step": 407000 | |
| }, | |
| { | |
| "epoch": 4.076956939330879, | |
| "grad_norm": 6.665195465087891, | |
| "learning_rate": 9.230530654714263e-06, | |
| "loss": 0.9296, | |
| "num_input_tokens_seen": 220497552, | |
| "step": 407500 | |
| }, | |
| { | |
| "epoch": 4.081959340483432, | |
| "grad_norm": 6.402169704437256, | |
| "learning_rate": 9.180506643188732e-06, | |
| "loss": 0.9199, | |
| "num_input_tokens_seen": 220764032, | |
| "step": 408000 | |
| }, | |
| { | |
| "epoch": 4.086961741635985, | |
| "grad_norm": 5.718327045440674, | |
| "learning_rate": 9.1304826316632e-06, | |
| "loss": 0.927, | |
| "num_input_tokens_seen": 221033992, | |
| "step": 408500 | |
| }, | |
| { | |
| "epoch": 4.091964142788538, | |
| "grad_norm": 6.588100433349609, | |
| "learning_rate": 9.080458620137667e-06, | |
| "loss": 0.9374, | |
| "num_input_tokens_seen": 221308640, | |
| "step": 409000 | |
| }, | |
| { | |
| "epoch": 4.096966543941091, | |
| "grad_norm": 5.30639123916626, | |
| "learning_rate": 9.030434608612135e-06, | |
| "loss": 0.9251, | |
| "num_input_tokens_seen": 221581024, | |
| "step": 409500 | |
| }, | |
| { | |
| "epoch": 4.1019689450936445, | |
| "grad_norm": 4.8298821449279785, | |
| "learning_rate": 8.980410597086603e-06, | |
| "loss": 0.9137, | |
| "num_input_tokens_seen": 221842152, | |
| "step": 410000 | |
| }, | |
| { | |
| "epoch": 4.1069713462461985, | |
| "grad_norm": 5.2428483963012695, | |
| "learning_rate": 8.93038658556107e-06, | |
| "loss": 0.9333, | |
| "num_input_tokens_seen": 222117312, | |
| "step": 410500 | |
| }, | |
| { | |
| "epoch": 4.111973747398752, | |
| "grad_norm": 7.0200114250183105, | |
| "learning_rate": 8.880362574035538e-06, | |
| "loss": 0.914, | |
| "num_input_tokens_seen": 222386728, | |
| "step": 411000 | |
| }, | |
| { | |
| "epoch": 4.116976148551305, | |
| "grad_norm": 5.761682510375977, | |
| "learning_rate": 8.830338562510006e-06, | |
| "loss": 0.9236, | |
| "num_input_tokens_seen": 222656968, | |
| "step": 411500 | |
| }, | |
| { | |
| "epoch": 4.121978549703858, | |
| "grad_norm": 5.959192752838135, | |
| "learning_rate": 8.780314550984473e-06, | |
| "loss": 0.9383, | |
| "num_input_tokens_seen": 222931472, | |
| "step": 412000 | |
| }, | |
| { | |
| "epoch": 4.126980950856411, | |
| "grad_norm": 7.374218940734863, | |
| "learning_rate": 8.73029053945894e-06, | |
| "loss": 0.918, | |
| "num_input_tokens_seen": 223197176, | |
| "step": 412500 | |
| }, | |
| { | |
| "epoch": 4.131983352008964, | |
| "grad_norm": 5.401734352111816, | |
| "learning_rate": 8.680266527933409e-06, | |
| "loss": 0.9134, | |
| "num_input_tokens_seen": 223473808, | |
| "step": 413000 | |
| }, | |
| { | |
| "epoch": 4.136985753161517, | |
| "grad_norm": 5.810543537139893, | |
| "learning_rate": 8.630242516407876e-06, | |
| "loss": 0.9219, | |
| "num_input_tokens_seen": 223749560, | |
| "step": 413500 | |
| }, | |
| { | |
| "epoch": 4.14198815431407, | |
| "grad_norm": 6.911441326141357, | |
| "learning_rate": 8.580218504882344e-06, | |
| "loss": 0.9371, | |
| "num_input_tokens_seen": 224026048, | |
| "step": 414000 | |
| }, | |
| { | |
| "epoch": 4.146990555466624, | |
| "grad_norm": 5.81462287902832, | |
| "learning_rate": 8.530194493356812e-06, | |
| "loss": 0.922, | |
| "num_input_tokens_seen": 224296456, | |
| "step": 414500 | |
| }, | |
| { | |
| "epoch": 4.1519929566191776, | |
| "grad_norm": 6.728647232055664, | |
| "learning_rate": 8.480170481831279e-06, | |
| "loss": 0.9261, | |
| "num_input_tokens_seen": 224566384, | |
| "step": 415000 | |
| }, | |
| { | |
| "epoch": 4.156995357771731, | |
| "grad_norm": 6.1371564865112305, | |
| "learning_rate": 8.430146470305747e-06, | |
| "loss": 0.9286, | |
| "num_input_tokens_seen": 224841952, | |
| "step": 415500 | |
| }, | |
| { | |
| "epoch": 4.161997758924284, | |
| "grad_norm": 7.179012775421143, | |
| "learning_rate": 8.380122458780215e-06, | |
| "loss": 0.9196, | |
| "num_input_tokens_seen": 225112416, | |
| "step": 416000 | |
| }, | |
| { | |
| "epoch": 4.167000160076837, | |
| "grad_norm": 6.620611667633057, | |
| "learning_rate": 8.330098447254682e-06, | |
| "loss": 0.9358, | |
| "num_input_tokens_seen": 225389344, | |
| "step": 416500 | |
| }, | |
| { | |
| "epoch": 4.17200256122939, | |
| "grad_norm": 6.2398905754089355, | |
| "learning_rate": 8.28007443572915e-06, | |
| "loss": 0.9178, | |
| "num_input_tokens_seen": 225663376, | |
| "step": 417000 | |
| }, | |
| { | |
| "epoch": 4.177004962381943, | |
| "grad_norm": 6.071476936340332, | |
| "learning_rate": 8.230050424203618e-06, | |
| "loss": 0.9319, | |
| "num_input_tokens_seen": 225931480, | |
| "step": 417500 | |
| }, | |
| { | |
| "epoch": 4.182007363534496, | |
| "grad_norm": 5.602684497833252, | |
| "learning_rate": 8.180026412678086e-06, | |
| "loss": 0.9326, | |
| "num_input_tokens_seen": 226198008, | |
| "step": 418000 | |
| }, | |
| { | |
| "epoch": 4.187009764687049, | |
| "grad_norm": 6.832788944244385, | |
| "learning_rate": 8.130002401152553e-06, | |
| "loss": 0.929, | |
| "num_input_tokens_seen": 226466280, | |
| "step": 418500 | |
| }, | |
| { | |
| "epoch": 4.1920121658396035, | |
| "grad_norm": 6.3188862800598145, | |
| "learning_rate": 8.079978389627021e-06, | |
| "loss": 0.94, | |
| "num_input_tokens_seen": 226741632, | |
| "step": 419000 | |
| }, | |
| { | |
| "epoch": 4.197014566992157, | |
| "grad_norm": 5.0520548820495605, | |
| "learning_rate": 8.02995437810149e-06, | |
| "loss": 0.9327, | |
| "num_input_tokens_seen": 227006288, | |
| "step": 419500 | |
| }, | |
| { | |
| "epoch": 4.20201696814471, | |
| "grad_norm": 6.5376410484313965, | |
| "learning_rate": 7.979930366575956e-06, | |
| "loss": 0.9042, | |
| "num_input_tokens_seen": 227281352, | |
| "step": 420000 | |
| }, | |
| { | |
| "epoch": 4.207019369297263, | |
| "grad_norm": 5.164760112762451, | |
| "learning_rate": 7.929906355050424e-06, | |
| "loss": 0.9231, | |
| "num_input_tokens_seen": 227542968, | |
| "step": 420500 | |
| }, | |
| { | |
| "epoch": 4.212021770449816, | |
| "grad_norm": 9.208584785461426, | |
| "learning_rate": 7.879882343524892e-06, | |
| "loss": 0.9333, | |
| "num_input_tokens_seen": 227813704, | |
| "step": 421000 | |
| }, | |
| { | |
| "epoch": 4.217024171602369, | |
| "grad_norm": 5.241026878356934, | |
| "learning_rate": 7.829858331999359e-06, | |
| "loss": 0.9278, | |
| "num_input_tokens_seen": 228083120, | |
| "step": 421500 | |
| }, | |
| { | |
| "epoch": 4.222026572754922, | |
| "grad_norm": 6.501145839691162, | |
| "learning_rate": 7.779834320473827e-06, | |
| "loss": 0.9219, | |
| "num_input_tokens_seen": 228352616, | |
| "step": 422000 | |
| }, | |
| { | |
| "epoch": 4.227028973907475, | |
| "grad_norm": 4.962836742401123, | |
| "learning_rate": 7.729810308948296e-06, | |
| "loss": 0.9176, | |
| "num_input_tokens_seen": 228615376, | |
| "step": 422500 | |
| }, | |
| { | |
| "epoch": 4.2320313750600285, | |
| "grad_norm": 5.714748859405518, | |
| "learning_rate": 7.679786297422762e-06, | |
| "loss": 0.9459, | |
| "num_input_tokens_seen": 228890544, | |
| "step": 423000 | |
| }, | |
| { | |
| "epoch": 4.2370337762125825, | |
| "grad_norm": 5.394798755645752, | |
| "learning_rate": 7.62976228589723e-06, | |
| "loss": 0.9305, | |
| "num_input_tokens_seen": 229158352, | |
| "step": 423500 | |
| }, | |
| { | |
| "epoch": 4.242036177365136, | |
| "grad_norm": 7.450530529022217, | |
| "learning_rate": 7.579738274371699e-06, | |
| "loss": 0.9342, | |
| "num_input_tokens_seen": 229430896, | |
| "step": 424000 | |
| }, | |
| { | |
| "epoch": 4.247038578517689, | |
| "grad_norm": 6.761574745178223, | |
| "learning_rate": 7.529714262846166e-06, | |
| "loss": 0.9324, | |
| "num_input_tokens_seen": 229692632, | |
| "step": 424500 | |
| }, | |
| { | |
| "epoch": 4.252040979670242, | |
| "grad_norm": 6.781697750091553, | |
| "learning_rate": 7.479690251320634e-06, | |
| "loss": 0.9299, | |
| "num_input_tokens_seen": 229963336, | |
| "step": 425000 | |
| }, | |
| { | |
| "epoch": 4.257043380822795, | |
| "grad_norm": 6.029842376708984, | |
| "learning_rate": 7.4296662397951024e-06, | |
| "loss": 0.9411, | |
| "num_input_tokens_seen": 230235128, | |
| "step": 425500 | |
| }, | |
| { | |
| "epoch": 4.262045781975348, | |
| "grad_norm": 6.796103477478027, | |
| "learning_rate": 7.379642228269571e-06, | |
| "loss": 0.9119, | |
| "num_input_tokens_seen": 230508152, | |
| "step": 426000 | |
| }, | |
| { | |
| "epoch": 4.267048183127901, | |
| "grad_norm": 5.397275447845459, | |
| "learning_rate": 7.329618216744037e-06, | |
| "loss": 0.9287, | |
| "num_input_tokens_seen": 230778056, | |
| "step": 426500 | |
| }, | |
| { | |
| "epoch": 4.272050584280454, | |
| "grad_norm": 6.785423755645752, | |
| "learning_rate": 7.2795942052185055e-06, | |
| "loss": 0.9249, | |
| "num_input_tokens_seen": 231050672, | |
| "step": 427000 | |
| }, | |
| { | |
| "epoch": 4.2770529854330075, | |
| "grad_norm": 6.21229362487793, | |
| "learning_rate": 7.229570193692974e-06, | |
| "loss": 0.9293, | |
| "num_input_tokens_seen": 231322192, | |
| "step": 427500 | |
| }, | |
| { | |
| "epoch": 4.2820553865855615, | |
| "grad_norm": 5.904947757720947, | |
| "learning_rate": 7.17954618216744e-06, | |
| "loss": 0.9361, | |
| "num_input_tokens_seen": 231593960, | |
| "step": 428000 | |
| }, | |
| { | |
| "epoch": 4.287057787738115, | |
| "grad_norm": 4.485384464263916, | |
| "learning_rate": 7.1295221706419085e-06, | |
| "loss": 0.9359, | |
| "num_input_tokens_seen": 231857176, | |
| "step": 428500 | |
| }, | |
| { | |
| "epoch": 4.292060188890668, | |
| "grad_norm": 5.395241737365723, | |
| "learning_rate": 7.079498159116377e-06, | |
| "loss": 0.9382, | |
| "num_input_tokens_seen": 232135880, | |
| "step": 429000 | |
| }, | |
| { | |
| "epoch": 4.297062590043221, | |
| "grad_norm": 6.106403827667236, | |
| "learning_rate": 7.029474147590843e-06, | |
| "loss": 0.9266, | |
| "num_input_tokens_seen": 232409736, | |
| "step": 429500 | |
| }, | |
| { | |
| "epoch": 4.302064991195774, | |
| "grad_norm": 5.79823637008667, | |
| "learning_rate": 6.9794501360653115e-06, | |
| "loss": 0.9083, | |
| "num_input_tokens_seen": 232681632, | |
| "step": 430000 | |
| }, | |
| { | |
| "epoch": 4.307067392348327, | |
| "grad_norm": 5.688789367675781, | |
| "learning_rate": 6.92942612453978e-06, | |
| "loss": 0.9145, | |
| "num_input_tokens_seen": 232953312, | |
| "step": 430500 | |
| }, | |
| { | |
| "epoch": 4.31206979350088, | |
| "grad_norm": 5.097667694091797, | |
| "learning_rate": 6.879402113014246e-06, | |
| "loss": 0.9459, | |
| "num_input_tokens_seen": 233217560, | |
| "step": 431000 | |
| }, | |
| { | |
| "epoch": 4.317072194653433, | |
| "grad_norm": 5.180954456329346, | |
| "learning_rate": 6.829378101488715e-06, | |
| "loss": 0.9302, | |
| "num_input_tokens_seen": 233491872, | |
| "step": 431500 | |
| }, | |
| { | |
| "epoch": 4.3220745958059865, | |
| "grad_norm": 5.185079574584961, | |
| "learning_rate": 6.779354089963183e-06, | |
| "loss": 0.9304, | |
| "num_input_tokens_seen": 233757296, | |
| "step": 432000 | |
| }, | |
| { | |
| "epoch": 4.3270769969585405, | |
| "grad_norm": 5.8646464347839355, | |
| "learning_rate": 6.729330078437649e-06, | |
| "loss": 0.917, | |
| "num_input_tokens_seen": 234033848, | |
| "step": 432500 | |
| }, | |
| { | |
| "epoch": 4.332079398111094, | |
| "grad_norm": 4.718979358673096, | |
| "learning_rate": 6.679306066912118e-06, | |
| "loss": 0.9355, | |
| "num_input_tokens_seen": 234301952, | |
| "step": 433000 | |
| }, | |
| { | |
| "epoch": 4.337081799263647, | |
| "grad_norm": 5.194594383239746, | |
| "learning_rate": 6.629282055386586e-06, | |
| "loss": 0.9178, | |
| "num_input_tokens_seen": 234570432, | |
| "step": 433500 | |
| }, | |
| { | |
| "epoch": 4.3420842004162, | |
| "grad_norm": 6.157474994659424, | |
| "learning_rate": 6.579258043861053e-06, | |
| "loss": 0.9306, | |
| "num_input_tokens_seen": 234848568, | |
| "step": 434000 | |
| }, | |
| { | |
| "epoch": 4.347086601568753, | |
| "grad_norm": 5.017276763916016, | |
| "learning_rate": 6.5292340323355215e-06, | |
| "loss": 0.9266, | |
| "num_input_tokens_seen": 235120480, | |
| "step": 434500 | |
| }, | |
| { | |
| "epoch": 4.352089002721306, | |
| "grad_norm": 6.485071659088135, | |
| "learning_rate": 6.479210020809989e-06, | |
| "loss": 0.9245, | |
| "num_input_tokens_seen": 235385120, | |
| "step": 435000 | |
| }, | |
| { | |
| "epoch": 4.357091403873859, | |
| "grad_norm": 6.405189514160156, | |
| "learning_rate": 6.429186009284457e-06, | |
| "loss": 0.9148, | |
| "num_input_tokens_seen": 235653424, | |
| "step": 435500 | |
| }, | |
| { | |
| "epoch": 4.362093805026412, | |
| "grad_norm": 7.216737747192383, | |
| "learning_rate": 6.3791619977589245e-06, | |
| "loss": 0.9171, | |
| "num_input_tokens_seen": 235927792, | |
| "step": 436000 | |
| }, | |
| { | |
| "epoch": 4.3670962061789655, | |
| "grad_norm": 5.484450340270996, | |
| "learning_rate": 6.329137986233393e-06, | |
| "loss": 0.9253, | |
| "num_input_tokens_seen": 236198512, | |
| "step": 436500 | |
| }, | |
| { | |
| "epoch": 4.37209860733152, | |
| "grad_norm": 5.4462971687316895, | |
| "learning_rate": 6.279113974707861e-06, | |
| "loss": 0.9052, | |
| "num_input_tokens_seen": 236470920, | |
| "step": 437000 | |
| }, | |
| { | |
| "epoch": 4.377101008484073, | |
| "grad_norm": 6.061979293823242, | |
| "learning_rate": 6.229089963182328e-06, | |
| "loss": 0.9344, | |
| "num_input_tokens_seen": 236743128, | |
| "step": 437500 | |
| }, | |
| { | |
| "epoch": 4.382103409636626, | |
| "grad_norm": 7.572735786437988, | |
| "learning_rate": 6.179065951656796e-06, | |
| "loss": 0.9262, | |
| "num_input_tokens_seen": 237019048, | |
| "step": 438000 | |
| }, | |
| { | |
| "epoch": 4.387105810789179, | |
| "grad_norm": 5.743752956390381, | |
| "learning_rate": 6.129041940131263e-06, | |
| "loss": 0.9256, | |
| "num_input_tokens_seen": 237287696, | |
| "step": 438500 | |
| }, | |
| { | |
| "epoch": 4.392108211941732, | |
| "grad_norm": 7.014731407165527, | |
| "learning_rate": 6.0790179286057314e-06, | |
| "loss": 0.9242, | |
| "num_input_tokens_seen": 237552080, | |
| "step": 439000 | |
| }, | |
| { | |
| "epoch": 4.397110613094285, | |
| "grad_norm": 7.045165061950684, | |
| "learning_rate": 6.028993917080199e-06, | |
| "loss": 0.9256, | |
| "num_input_tokens_seen": 237822224, | |
| "step": 439500 | |
| }, | |
| { | |
| "epoch": 4.402113014246838, | |
| "grad_norm": 9.347033500671387, | |
| "learning_rate": 5.978969905554666e-06, | |
| "loss": 0.9339, | |
| "num_input_tokens_seen": 238090088, | |
| "step": 440000 | |
| }, | |
| { | |
| "epoch": 4.4071154153993914, | |
| "grad_norm": 5.694771766662598, | |
| "learning_rate": 5.9289458940291345e-06, | |
| "loss": 0.9289, | |
| "num_input_tokens_seen": 238362352, | |
| "step": 440500 | |
| }, | |
| { | |
| "epoch": 4.412117816551945, | |
| "grad_norm": 6.164538383483887, | |
| "learning_rate": 5.878921882503602e-06, | |
| "loss": 0.9315, | |
| "num_input_tokens_seen": 238629408, | |
| "step": 441000 | |
| }, | |
| { | |
| "epoch": 4.417120217704499, | |
| "grad_norm": 6.145501136779785, | |
| "learning_rate": 5.82889787097807e-06, | |
| "loss": 0.9265, | |
| "num_input_tokens_seen": 238897488, | |
| "step": 441500 | |
| }, | |
| { | |
| "epoch": 4.422122618857052, | |
| "grad_norm": 8.055036544799805, | |
| "learning_rate": 5.7788738594525375e-06, | |
| "loss": 0.9459, | |
| "num_input_tokens_seen": 239173024, | |
| "step": 442000 | |
| }, | |
| { | |
| "epoch": 4.427125020009605, | |
| "grad_norm": 6.102746963500977, | |
| "learning_rate": 5.728849847927005e-06, | |
| "loss": 0.9413, | |
| "num_input_tokens_seen": 239441080, | |
| "step": 442500 | |
| }, | |
| { | |
| "epoch": 4.432127421162158, | |
| "grad_norm": 5.538390159606934, | |
| "learning_rate": 5.678825836401473e-06, | |
| "loss": 0.9356, | |
| "num_input_tokens_seen": 239711624, | |
| "step": 443000 | |
| }, | |
| { | |
| "epoch": 4.437129822314711, | |
| "grad_norm": 6.128843307495117, | |
| "learning_rate": 5.6288018248759405e-06, | |
| "loss": 0.9201, | |
| "num_input_tokens_seen": 239985480, | |
| "step": 443500 | |
| }, | |
| { | |
| "epoch": 4.442132223467264, | |
| "grad_norm": 6.007194995880127, | |
| "learning_rate": 5.578777813350408e-06, | |
| "loss": 0.9176, | |
| "num_input_tokens_seen": 240250312, | |
| "step": 444000 | |
| }, | |
| { | |
| "epoch": 4.447134624619817, | |
| "grad_norm": 8.6255521774292, | |
| "learning_rate": 5.528753801824876e-06, | |
| "loss": 0.9216, | |
| "num_input_tokens_seen": 240515680, | |
| "step": 444500 | |
| }, | |
| { | |
| "epoch": 4.4521370257723705, | |
| "grad_norm": 6.955540180206299, | |
| "learning_rate": 5.478729790299344e-06, | |
| "loss": 0.9196, | |
| "num_input_tokens_seen": 240785288, | |
| "step": 445000 | |
| }, | |
| { | |
| "epoch": 4.457139426924924, | |
| "grad_norm": 7.786896228790283, | |
| "learning_rate": 5.428705778773812e-06, | |
| "loss": 0.9337, | |
| "num_input_tokens_seen": 241060592, | |
| "step": 445500 | |
| }, | |
| { | |
| "epoch": 4.462141828077478, | |
| "grad_norm": 5.2199482917785645, | |
| "learning_rate": 5.378681767248279e-06, | |
| "loss": 0.9356, | |
| "num_input_tokens_seen": 241326064, | |
| "step": 446000 | |
| }, | |
| { | |
| "epoch": 4.467144229230031, | |
| "grad_norm": 4.975681781768799, | |
| "learning_rate": 5.328657755722747e-06, | |
| "loss": 0.911, | |
| "num_input_tokens_seen": 241596704, | |
| "step": 446500 | |
| }, | |
| { | |
| "epoch": 4.472146630382584, | |
| "grad_norm": 4.91240119934082, | |
| "learning_rate": 5.278633744197215e-06, | |
| "loss": 0.9276, | |
| "num_input_tokens_seen": 241870056, | |
| "step": 447000 | |
| }, | |
| { | |
| "epoch": 4.477149031535137, | |
| "grad_norm": 7.170393466949463, | |
| "learning_rate": 5.228609732671682e-06, | |
| "loss": 0.9212, | |
| "num_input_tokens_seen": 242140224, | |
| "step": 447500 | |
| }, | |
| { | |
| "epoch": 4.48215143268769, | |
| "grad_norm": 5.982038497924805, | |
| "learning_rate": 5.1785857211461505e-06, | |
| "loss": 0.9112, | |
| "num_input_tokens_seen": 242405736, | |
| "step": 448000 | |
| }, | |
| { | |
| "epoch": 4.487153833840243, | |
| "grad_norm": 6.960501670837402, | |
| "learning_rate": 5.128561709620618e-06, | |
| "loss": 0.9284, | |
| "num_input_tokens_seen": 242672696, | |
| "step": 448500 | |
| }, | |
| { | |
| "epoch": 4.492156234992796, | |
| "grad_norm": 5.97189474105835, | |
| "learning_rate": 5.078537698095086e-06, | |
| "loss": 0.9304, | |
| "num_input_tokens_seen": 242944136, | |
| "step": 449000 | |
| }, | |
| { | |
| "epoch": 4.4971586361453495, | |
| "grad_norm": 5.7404704093933105, | |
| "learning_rate": 5.028513686569554e-06, | |
| "loss": 0.9166, | |
| "num_input_tokens_seen": 243217160, | |
| "step": 449500 | |
| }, | |
| { | |
| "epoch": 4.502161037297903, | |
| "grad_norm": 14.095989227294922, | |
| "learning_rate": 4.978489675044022e-06, | |
| "loss": 0.9364, | |
| "num_input_tokens_seen": 243492064, | |
| "step": 450000 | |
| }, | |
| { | |
| "epoch": 4.507163438450457, | |
| "grad_norm": 6.327740669250488, | |
| "learning_rate": 4.928465663518489e-06, | |
| "loss": 0.9334, | |
| "num_input_tokens_seen": 243765824, | |
| "step": 450500 | |
| }, | |
| { | |
| "epoch": 4.51216583960301, | |
| "grad_norm": 5.019825458526611, | |
| "learning_rate": 4.878441651992957e-06, | |
| "loss": 0.9183, | |
| "num_input_tokens_seen": 244045424, | |
| "step": 451000 | |
| }, | |
| { | |
| "epoch": 4.517168240755563, | |
| "grad_norm": 5.382749557495117, | |
| "learning_rate": 4.828417640467425e-06, | |
| "loss": 0.9183, | |
| "num_input_tokens_seen": 244317544, | |
| "step": 451500 | |
| }, | |
| { | |
| "epoch": 4.522170641908116, | |
| "grad_norm": 6.0456461906433105, | |
| "learning_rate": 4.778393628941892e-06, | |
| "loss": 0.9176, | |
| "num_input_tokens_seen": 244587752, | |
| "step": 452000 | |
| }, | |
| { | |
| "epoch": 4.527173043060669, | |
| "grad_norm": 6.2013983726501465, | |
| "learning_rate": 4.7283696174163604e-06, | |
| "loss": 0.9253, | |
| "num_input_tokens_seen": 244852400, | |
| "step": 452500 | |
| }, | |
| { | |
| "epoch": 4.532175444213222, | |
| "grad_norm": 5.575494766235352, | |
| "learning_rate": 4.678345605890828e-06, | |
| "loss": 0.9315, | |
| "num_input_tokens_seen": 245130392, | |
| "step": 453000 | |
| }, | |
| { | |
| "epoch": 4.537177845365775, | |
| "grad_norm": 6.855820178985596, | |
| "learning_rate": 4.628321594365295e-06, | |
| "loss": 0.9255, | |
| "num_input_tokens_seen": 245393352, | |
| "step": 453500 | |
| }, | |
| { | |
| "epoch": 4.5421802465183285, | |
| "grad_norm": 5.4364728927612305, | |
| "learning_rate": 4.5782975828397635e-06, | |
| "loss": 0.9327, | |
| "num_input_tokens_seen": 245667248, | |
| "step": 454000 | |
| }, | |
| { | |
| "epoch": 4.547182647670882, | |
| "grad_norm": 7.509527206420898, | |
| "learning_rate": 4.528273571314231e-06, | |
| "loss": 0.9207, | |
| "num_input_tokens_seen": 245940192, | |
| "step": 454500 | |
| }, | |
| { | |
| "epoch": 4.552185048823436, | |
| "grad_norm": 5.191705226898193, | |
| "learning_rate": 4.478249559788699e-06, | |
| "loss": 0.9231, | |
| "num_input_tokens_seen": 246211320, | |
| "step": 455000 | |
| }, | |
| { | |
| "epoch": 4.557187449975989, | |
| "grad_norm": 6.908538341522217, | |
| "learning_rate": 4.4282255482631665e-06, | |
| "loss": 0.9256, | |
| "num_input_tokens_seen": 246488984, | |
| "step": 455500 | |
| }, | |
| { | |
| "epoch": 4.562189851128542, | |
| "grad_norm": 6.262028694152832, | |
| "learning_rate": 4.378201536737634e-06, | |
| "loss": 0.9219, | |
| "num_input_tokens_seen": 246764632, | |
| "step": 456000 | |
| }, | |
| { | |
| "epoch": 4.567192252281095, | |
| "grad_norm": 6.5729475021362305, | |
| "learning_rate": 4.328177525212102e-06, | |
| "loss": 0.9244, | |
| "num_input_tokens_seen": 247035024, | |
| "step": 456500 | |
| }, | |
| { | |
| "epoch": 4.572194653433648, | |
| "grad_norm": 7.030519008636475, | |
| "learning_rate": 4.2781535136865695e-06, | |
| "loss": 0.9096, | |
| "num_input_tokens_seen": 247301664, | |
| "step": 457000 | |
| }, | |
| { | |
| "epoch": 4.577197054586201, | |
| "grad_norm": 5.72337532043457, | |
| "learning_rate": 4.228129502161037e-06, | |
| "loss": 0.9243, | |
| "num_input_tokens_seen": 247574992, | |
| "step": 457500 | |
| }, | |
| { | |
| "epoch": 4.582199455738754, | |
| "grad_norm": 5.769835948944092, | |
| "learning_rate": 4.178105490635505e-06, | |
| "loss": 0.9378, | |
| "num_input_tokens_seen": 247843024, | |
| "step": 458000 | |
| }, | |
| { | |
| "epoch": 4.5872018568913075, | |
| "grad_norm": 4.633671760559082, | |
| "learning_rate": 4.128081479109973e-06, | |
| "loss": 0.9249, | |
| "num_input_tokens_seen": 248112056, | |
| "step": 458500 | |
| }, | |
| { | |
| "epoch": 4.592204258043861, | |
| "grad_norm": 6.9910569190979, | |
| "learning_rate": 4.078057467584441e-06, | |
| "loss": 0.9164, | |
| "num_input_tokens_seen": 248380816, | |
| "step": 459000 | |
| }, | |
| { | |
| "epoch": 4.597206659196415, | |
| "grad_norm": 5.471499919891357, | |
| "learning_rate": 4.028033456058908e-06, | |
| "loss": 0.9241, | |
| "num_input_tokens_seen": 248657168, | |
| "step": 459500 | |
| }, | |
| { | |
| "epoch": 4.602209060348968, | |
| "grad_norm": 5.17936897277832, | |
| "learning_rate": 3.978009444533376e-06, | |
| "loss": 0.9308, | |
| "num_input_tokens_seen": 248924720, | |
| "step": 460000 | |
| }, | |
| { | |
| "epoch": 4.607211461501521, | |
| "grad_norm": 7.616632461547852, | |
| "learning_rate": 3.927985433007844e-06, | |
| "loss": 0.9316, | |
| "num_input_tokens_seen": 249194600, | |
| "step": 460500 | |
| }, | |
| { | |
| "epoch": 4.612213862654074, | |
| "grad_norm": 7.818989276885986, | |
| "learning_rate": 3.877961421482311e-06, | |
| "loss": 0.9142, | |
| "num_input_tokens_seen": 249462512, | |
| "step": 461000 | |
| }, | |
| { | |
| "epoch": 4.617216263806627, | |
| "grad_norm": 6.754061698913574, | |
| "learning_rate": 3.8279374099567795e-06, | |
| "loss": 0.9213, | |
| "num_input_tokens_seen": 249729824, | |
| "step": 461500 | |
| }, | |
| { | |
| "epoch": 4.62221866495918, | |
| "grad_norm": 5.925983905792236, | |
| "learning_rate": 3.7779133984312473e-06, | |
| "loss": 0.9171, | |
| "num_input_tokens_seen": 250002496, | |
| "step": 462000 | |
| }, | |
| { | |
| "epoch": 4.6272210661117334, | |
| "grad_norm": 5.226542949676514, | |
| "learning_rate": 3.7278893869057147e-06, | |
| "loss": 0.9118, | |
| "num_input_tokens_seen": 250268376, | |
| "step": 462500 | |
| }, | |
| { | |
| "epoch": 4.632223467264287, | |
| "grad_norm": 5.781167984008789, | |
| "learning_rate": 3.677865375380183e-06, | |
| "loss": 0.9271, | |
| "num_input_tokens_seen": 250530912, | |
| "step": 463000 | |
| }, | |
| { | |
| "epoch": 4.63722586841684, | |
| "grad_norm": 4.7409210205078125, | |
| "learning_rate": 3.6278413638546503e-06, | |
| "loss": 0.9203, | |
| "num_input_tokens_seen": 250803368, | |
| "step": 463500 | |
| }, | |
| { | |
| "epoch": 4.642228269569394, | |
| "grad_norm": 4.875260353088379, | |
| "learning_rate": 3.5778173523291177e-06, | |
| "loss": 0.9235, | |
| "num_input_tokens_seen": 251074960, | |
| "step": 464000 | |
| }, | |
| { | |
| "epoch": 4.647230670721947, | |
| "grad_norm": 6.038626194000244, | |
| "learning_rate": 3.527793340803586e-06, | |
| "loss": 0.9274, | |
| "num_input_tokens_seen": 251347688, | |
| "step": 464500 | |
| }, | |
| { | |
| "epoch": 4.6522330718745, | |
| "grad_norm": 5.727837562561035, | |
| "learning_rate": 3.477769329278054e-06, | |
| "loss": 0.9173, | |
| "num_input_tokens_seen": 251623264, | |
| "step": 465000 | |
| }, | |
| { | |
| "epoch": 4.657235473027053, | |
| "grad_norm": 8.719578742980957, | |
| "learning_rate": 3.427745317752521e-06, | |
| "loss": 0.9149, | |
| "num_input_tokens_seen": 251894136, | |
| "step": 465500 | |
| }, | |
| { | |
| "epoch": 4.662237874179606, | |
| "grad_norm": 5.212377071380615, | |
| "learning_rate": 3.3777213062269894e-06, | |
| "loss": 0.9151, | |
| "num_input_tokens_seen": 252168632, | |
| "step": 466000 | |
| }, | |
| { | |
| "epoch": 4.667240275332159, | |
| "grad_norm": 5.5170440673828125, | |
| "learning_rate": 3.327697294701457e-06, | |
| "loss": 0.9177, | |
| "num_input_tokens_seen": 252446776, | |
| "step": 466500 | |
| }, | |
| { | |
| "epoch": 4.6722426764847125, | |
| "grad_norm": 5.481988906860352, | |
| "learning_rate": 3.277673283175925e-06, | |
| "loss": 0.922, | |
| "num_input_tokens_seen": 252711248, | |
| "step": 467000 | |
| }, | |
| { | |
| "epoch": 4.677245077637266, | |
| "grad_norm": 5.051156520843506, | |
| "learning_rate": 3.2276492716503925e-06, | |
| "loss": 0.9133, | |
| "num_input_tokens_seen": 252977920, | |
| "step": 467500 | |
| }, | |
| { | |
| "epoch": 4.682247478789819, | |
| "grad_norm": 5.823482036590576, | |
| "learning_rate": 3.17762526012486e-06, | |
| "loss": 0.9274, | |
| "num_input_tokens_seen": 253243784, | |
| "step": 468000 | |
| }, | |
| { | |
| "epoch": 4.687249879942373, | |
| "grad_norm": 5.723121166229248, | |
| "learning_rate": 3.127601248599328e-06, | |
| "loss": 0.9144, | |
| "num_input_tokens_seen": 253513552, | |
| "step": 468500 | |
| }, | |
| { | |
| "epoch": 4.692252281094926, | |
| "grad_norm": 6.516372203826904, | |
| "learning_rate": 3.0775772370737955e-06, | |
| "loss": 0.9162, | |
| "num_input_tokens_seen": 253780008, | |
| "step": 469000 | |
| }, | |
| { | |
| "epoch": 4.697254682247479, | |
| "grad_norm": 5.488427639007568, | |
| "learning_rate": 3.0275532255482633e-06, | |
| "loss": 0.9181, | |
| "num_input_tokens_seen": 254050256, | |
| "step": 469500 | |
| }, | |
| { | |
| "epoch": 4.702257083400032, | |
| "grad_norm": 6.543509006500244, | |
| "learning_rate": 2.977529214022731e-06, | |
| "loss": 0.9251, | |
| "num_input_tokens_seen": 254325728, | |
| "step": 470000 | |
| }, | |
| { | |
| "epoch": 4.707259484552585, | |
| "grad_norm": 6.277120590209961, | |
| "learning_rate": 2.9275052024971985e-06, | |
| "loss": 0.9332, | |
| "num_input_tokens_seen": 254596720, | |
| "step": 470500 | |
| }, | |
| { | |
| "epoch": 4.712261885705138, | |
| "grad_norm": 5.882318496704102, | |
| "learning_rate": 2.8774811909716664e-06, | |
| "loss": 0.9091, | |
| "num_input_tokens_seen": 254877768, | |
| "step": 471000 | |
| }, | |
| { | |
| "epoch": 4.7172642868576915, | |
| "grad_norm": 6.018533706665039, | |
| "learning_rate": 2.827457179446134e-06, | |
| "loss": 0.9112, | |
| "num_input_tokens_seen": 255146616, | |
| "step": 471500 | |
| }, | |
| { | |
| "epoch": 4.722266688010245, | |
| "grad_norm": 7.787155628204346, | |
| "learning_rate": 2.777433167920602e-06, | |
| "loss": 0.9204, | |
| "num_input_tokens_seen": 255411528, | |
| "step": 472000 | |
| }, | |
| { | |
| "epoch": 4.727269089162798, | |
| "grad_norm": 4.750833034515381, | |
| "learning_rate": 2.72740915639507e-06, | |
| "loss": 0.9229, | |
| "num_input_tokens_seen": 255673056, | |
| "step": 472500 | |
| }, | |
| { | |
| "epoch": 4.732271490315352, | |
| "grad_norm": 5.9136457443237305, | |
| "learning_rate": 2.6773851448695376e-06, | |
| "loss": 0.9291, | |
| "num_input_tokens_seen": 255947496, | |
| "step": 473000 | |
| }, | |
| { | |
| "epoch": 4.737273891467905, | |
| "grad_norm": 5.880224704742432, | |
| "learning_rate": 2.6273611333440055e-06, | |
| "loss": 0.9202, | |
| "num_input_tokens_seen": 256214416, | |
| "step": 473500 | |
| }, | |
| { | |
| "epoch": 4.742276292620458, | |
| "grad_norm": 4.128984451293945, | |
| "learning_rate": 2.577337121818473e-06, | |
| "loss": 0.9225, | |
| "num_input_tokens_seen": 256493072, | |
| "step": 474000 | |
| }, | |
| { | |
| "epoch": 4.747278693773011, | |
| "grad_norm": 4.8430914878845215, | |
| "learning_rate": 2.5273131102929407e-06, | |
| "loss": 0.9142, | |
| "num_input_tokens_seen": 256764208, | |
| "step": 474500 | |
| }, | |
| { | |
| "epoch": 4.752281094925564, | |
| "grad_norm": 4.830491542816162, | |
| "learning_rate": 2.4772890987674085e-06, | |
| "loss": 0.9118, | |
| "num_input_tokens_seen": 257032808, | |
| "step": 475000 | |
| }, | |
| { | |
| "epoch": 4.757283496078117, | |
| "grad_norm": 4.94685697555542, | |
| "learning_rate": 2.4272650872418763e-06, | |
| "loss": 0.9182, | |
| "num_input_tokens_seen": 257296856, | |
| "step": 475500 | |
| }, | |
| { | |
| "epoch": 4.7622858972306705, | |
| "grad_norm": 5.098095417022705, | |
| "learning_rate": 2.3772410757163437e-06, | |
| "loss": 0.9135, | |
| "num_input_tokens_seen": 257564344, | |
| "step": 476000 | |
| }, | |
| { | |
| "epoch": 4.767288298383224, | |
| "grad_norm": 6.16255521774292, | |
| "learning_rate": 2.3272170641908115e-06, | |
| "loss": 0.9081, | |
| "num_input_tokens_seen": 257834360, | |
| "step": 476500 | |
| }, | |
| { | |
| "epoch": 4.772290699535777, | |
| "grad_norm": 5.006162643432617, | |
| "learning_rate": 2.2771930526652793e-06, | |
| "loss": 0.924, | |
| "num_input_tokens_seen": 258105976, | |
| "step": 477000 | |
| }, | |
| { | |
| "epoch": 4.777293100688331, | |
| "grad_norm": 5.462359428405762, | |
| "learning_rate": 2.227169041139747e-06, | |
| "loss": 0.8951, | |
| "num_input_tokens_seen": 258374328, | |
| "step": 477500 | |
| }, | |
| { | |
| "epoch": 4.782295501840884, | |
| "grad_norm": 6.263942241668701, | |
| "learning_rate": 2.177145029614215e-06, | |
| "loss": 0.9173, | |
| "num_input_tokens_seen": 258646752, | |
| "step": 478000 | |
| }, | |
| { | |
| "epoch": 4.787297902993437, | |
| "grad_norm": 6.507811546325684, | |
| "learning_rate": 2.127121018088683e-06, | |
| "loss": 0.9137, | |
| "num_input_tokens_seen": 258920128, | |
| "step": 478500 | |
| }, | |
| { | |
| "epoch": 4.79230030414599, | |
| "grad_norm": 5.116788864135742, | |
| "learning_rate": 2.0770970065631506e-06, | |
| "loss": 0.9134, | |
| "num_input_tokens_seen": 259189648, | |
| "step": 479000 | |
| }, | |
| { | |
| "epoch": 4.797302705298543, | |
| "grad_norm": 5.995227336883545, | |
| "learning_rate": 2.0270729950376184e-06, | |
| "loss": 0.9169, | |
| "num_input_tokens_seen": 259457416, | |
| "step": 479500 | |
| }, | |
| { | |
| "epoch": 4.802305106451096, | |
| "grad_norm": 4.341572284698486, | |
| "learning_rate": 1.977048983512086e-06, | |
| "loss": 0.9121, | |
| "num_input_tokens_seen": 259733960, | |
| "step": 480000 | |
| }, | |
| { | |
| "epoch": 4.8073075076036496, | |
| "grad_norm": 6.331231117248535, | |
| "learning_rate": 1.9270249719865537e-06, | |
| "loss": 0.9096, | |
| "num_input_tokens_seen": 260007160, | |
| "step": 480500 | |
| }, | |
| { | |
| "epoch": 4.812309908756203, | |
| "grad_norm": 7.295907497406006, | |
| "learning_rate": 1.8770009604610215e-06, | |
| "loss": 0.9227, | |
| "num_input_tokens_seen": 260281344, | |
| "step": 481000 | |
| }, | |
| { | |
| "epoch": 4.817312309908756, | |
| "grad_norm": 5.934523105621338, | |
| "learning_rate": 1.8269769489354893e-06, | |
| "loss": 0.9225, | |
| "num_input_tokens_seen": 260555592, | |
| "step": 481500 | |
| }, | |
| { | |
| "epoch": 4.82231471106131, | |
| "grad_norm": 6.325069904327393, | |
| "learning_rate": 1.7769529374099567e-06, | |
| "loss": 0.9061, | |
| "num_input_tokens_seen": 260826576, | |
| "step": 482000 | |
| }, | |
| { | |
| "epoch": 4.827317112213863, | |
| "grad_norm": 4.8332977294921875, | |
| "learning_rate": 1.7269289258844245e-06, | |
| "loss": 0.9298, | |
| "num_input_tokens_seen": 261100264, | |
| "step": 482500 | |
| }, | |
| { | |
| "epoch": 4.832319513366416, | |
| "grad_norm": 6.445847988128662, | |
| "learning_rate": 1.6769049143588923e-06, | |
| "loss": 0.9189, | |
| "num_input_tokens_seen": 261372184, | |
| "step": 483000 | |
| }, | |
| { | |
| "epoch": 4.837321914518969, | |
| "grad_norm": 6.000613212585449, | |
| "learning_rate": 1.6268809028333602e-06, | |
| "loss": 0.9195, | |
| "num_input_tokens_seen": 261635736, | |
| "step": 483500 | |
| }, | |
| { | |
| "epoch": 4.842324315671522, | |
| "grad_norm": 5.839612007141113, | |
| "learning_rate": 1.5768568913078278e-06, | |
| "loss": 0.9109, | |
| "num_input_tokens_seen": 261908384, | |
| "step": 484000 | |
| }, | |
| { | |
| "epoch": 4.8473267168240755, | |
| "grad_norm": 5.0340471267700195, | |
| "learning_rate": 1.5268328797822956e-06, | |
| "loss": 0.914, | |
| "num_input_tokens_seen": 262183640, | |
| "step": 484500 | |
| }, | |
| { | |
| "epoch": 4.852329117976629, | |
| "grad_norm": 9.325509071350098, | |
| "learning_rate": 1.4768088682567634e-06, | |
| "loss": 0.9097, | |
| "num_input_tokens_seen": 262457280, | |
| "step": 485000 | |
| }, | |
| { | |
| "epoch": 4.857331519129182, | |
| "grad_norm": 4.7551703453063965, | |
| "learning_rate": 1.426784856731231e-06, | |
| "loss": 0.916, | |
| "num_input_tokens_seen": 262736336, | |
| "step": 485500 | |
| }, | |
| { | |
| "epoch": 4.862333920281735, | |
| "grad_norm": 5.392652988433838, | |
| "learning_rate": 1.3767608452056988e-06, | |
| "loss": 0.9184, | |
| "num_input_tokens_seen": 263002616, | |
| "step": 486000 | |
| }, | |
| { | |
| "epoch": 4.867336321434289, | |
| "grad_norm": 6.265237808227539, | |
| "learning_rate": 1.3267368336801664e-06, | |
| "loss": 0.9233, | |
| "num_input_tokens_seen": 263268536, | |
| "step": 486500 | |
| }, | |
| { | |
| "epoch": 4.872338722586842, | |
| "grad_norm": 9.598567008972168, | |
| "learning_rate": 1.2767128221546343e-06, | |
| "loss": 0.9266, | |
| "num_input_tokens_seen": 263537000, | |
| "step": 487000 | |
| }, | |
| { | |
| "epoch": 4.877341123739395, | |
| "grad_norm": 4.646302700042725, | |
| "learning_rate": 1.226688810629102e-06, | |
| "loss": 0.9115, | |
| "num_input_tokens_seen": 263804248, | |
| "step": 487500 | |
| }, | |
| { | |
| "epoch": 4.882343524891948, | |
| "grad_norm": 5.325549125671387, | |
| "learning_rate": 1.1766647991035699e-06, | |
| "loss": 0.9065, | |
| "num_input_tokens_seen": 264071728, | |
| "step": 488000 | |
| }, | |
| { | |
| "epoch": 4.887345926044501, | |
| "grad_norm": 5.929357528686523, | |
| "learning_rate": 1.1266407875780375e-06, | |
| "loss": 0.9124, | |
| "num_input_tokens_seen": 264342480, | |
| "step": 488500 | |
| }, | |
| { | |
| "epoch": 4.8923483271970545, | |
| "grad_norm": 7.279246807098389, | |
| "learning_rate": 1.0766167760525053e-06, | |
| "loss": 0.9112, | |
| "num_input_tokens_seen": 264615696, | |
| "step": 489000 | |
| }, | |
| { | |
| "epoch": 4.897350728349608, | |
| "grad_norm": 7.106380939483643, | |
| "learning_rate": 1.026592764526973e-06, | |
| "loss": 0.9099, | |
| "num_input_tokens_seen": 264886296, | |
| "step": 489500 | |
| }, | |
| { | |
| "epoch": 4.902353129502161, | |
| "grad_norm": 6.210377216339111, | |
| "learning_rate": 9.765687530014407e-07, | |
| "loss": 0.9227, | |
| "num_input_tokens_seen": 265156384, | |
| "step": 490000 | |
| }, | |
| { | |
| "epoch": 4.907355530654714, | |
| "grad_norm": 6.38276481628418, | |
| "learning_rate": 9.265447414759085e-07, | |
| "loss": 0.9205, | |
| "num_input_tokens_seen": 265426248, | |
| "step": 490500 | |
| }, | |
| { | |
| "epoch": 4.912357931807268, | |
| "grad_norm": 5.883709907531738, | |
| "learning_rate": 8.765207299503763e-07, | |
| "loss": 0.9309, | |
| "num_input_tokens_seen": 265695040, | |
| "step": 491000 | |
| }, | |
| { | |
| "epoch": 4.917360332959821, | |
| "grad_norm": 5.776634693145752, | |
| "learning_rate": 8.26496718424844e-07, | |
| "loss": 0.9042, | |
| "num_input_tokens_seen": 265968648, | |
| "step": 491500 | |
| }, | |
| { | |
| "epoch": 4.922362734112374, | |
| "grad_norm": 6.002242088317871, | |
| "learning_rate": 7.764727068993117e-07, | |
| "loss": 0.9151, | |
| "num_input_tokens_seen": 266237776, | |
| "step": 492000 | |
| }, | |
| { | |
| "epoch": 4.927365135264927, | |
| "grad_norm": 6.250047206878662, | |
| "learning_rate": 7.264486953737794e-07, | |
| "loss": 0.9021, | |
| "num_input_tokens_seen": 266510616, | |
| "step": 492500 | |
| }, | |
| { | |
| "epoch": 4.93236753641748, | |
| "grad_norm": 7.225757598876953, | |
| "learning_rate": 6.764246838482472e-07, | |
| "loss": 0.9197, | |
| "num_input_tokens_seen": 266780560, | |
| "step": 493000 | |
| }, | |
| { | |
| "epoch": 4.9373699375700335, | |
| "grad_norm": 6.095335006713867, | |
| "learning_rate": 6.26400672322715e-07, | |
| "loss": 0.916, | |
| "num_input_tokens_seen": 267050712, | |
| "step": 493500 | |
| }, | |
| { | |
| "epoch": 4.942372338722587, | |
| "grad_norm": 7.186228275299072, | |
| "learning_rate": 5.763766607971827e-07, | |
| "loss": 0.9027, | |
| "num_input_tokens_seen": 267324136, | |
| "step": 494000 | |
| }, | |
| { | |
| "epoch": 4.94737473987514, | |
| "grad_norm": 6.71329402923584, | |
| "learning_rate": 5.263526492716505e-07, | |
| "loss": 0.9205, | |
| "num_input_tokens_seen": 267592264, | |
| "step": 494500 | |
| }, | |
| { | |
| "epoch": 4.952377141027693, | |
| "grad_norm": 6.1677045822143555, | |
| "learning_rate": 4.763286377461182e-07, | |
| "loss": 0.9235, | |
| "num_input_tokens_seen": 267861184, | |
| "step": 495000 | |
| }, | |
| { | |
| "epoch": 4.957379542180247, | |
| "grad_norm": 5.632607460021973, | |
| "learning_rate": 4.263046262205859e-07, | |
| "loss": 0.9131, | |
| "num_input_tokens_seen": 268133248, | |
| "step": 495500 | |
| }, | |
| { | |
| "epoch": 4.9623819433328, | |
| "grad_norm": 7.572421073913574, | |
| "learning_rate": 3.7628061469505367e-07, | |
| "loss": 0.9228, | |
| "num_input_tokens_seen": 268408144, | |
| "step": 496000 | |
| }, | |
| { | |
| "epoch": 4.967384344485353, | |
| "grad_norm": 5.349244594573975, | |
| "learning_rate": 3.262566031695214e-07, | |
| "loss": 0.9038, | |
| "num_input_tokens_seen": 268677424, | |
| "step": 496500 | |
| }, | |
| { | |
| "epoch": 4.972386745637906, | |
| "grad_norm": 5.9130754470825195, | |
| "learning_rate": 2.7623259164398915e-07, | |
| "loss": 0.9116, | |
| "num_input_tokens_seen": 268944976, | |
| "step": 497000 | |
| }, | |
| { | |
| "epoch": 4.977389146790459, | |
| "grad_norm": 6.351227283477783, | |
| "learning_rate": 2.2620858011845684e-07, | |
| "loss": 0.9281, | |
| "num_input_tokens_seen": 269220512, | |
| "step": 497500 | |
| }, | |
| { | |
| "epoch": 4.9823915479430125, | |
| "grad_norm": 4.36036491394043, | |
| "learning_rate": 1.761845685929246e-07, | |
| "loss": 0.9158, | |
| "num_input_tokens_seen": 269491032, | |
| "step": 498000 | |
| }, | |
| { | |
| "epoch": 4.987393949095566, | |
| "grad_norm": 6.718998432159424, | |
| "learning_rate": 1.2616055706739237e-07, | |
| "loss": 0.923, | |
| "num_input_tokens_seen": 269761928, | |
| "step": 498500 | |
| }, | |
| { | |
| "epoch": 4.992396350248119, | |
| "grad_norm": 6.188157558441162, | |
| "learning_rate": 7.61365455418601e-08, | |
| "loss": 0.9097, | |
| "num_input_tokens_seen": 270030752, | |
| "step": 499000 | |
| }, | |
| { | |
| "epoch": 4.997398751400672, | |
| "grad_norm": 7.353320598602295, | |
| "learning_rate": 2.6112534016327838e-08, | |
| "loss": 0.9016, | |
| "num_input_tokens_seen": 270298112, | |
| "step": 499500 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "eval_loss": 0.9877662062644958, | |
| "eval_runtime": 193.3937, | |
| "eval_samples_per_second": 1033.668, | |
| "eval_steps_per_second": 129.213, | |
| "num_input_tokens_seen": 270444104, | |
| "step": 499760 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "num_input_tokens_seen": 270444104, | |
| "step": 499760, | |
| "total_flos": 7.16219760157655e+16, | |
| "train_loss": 0.8578685343122414, | |
| "train_runtime": 15819.1918, | |
| "train_samples_per_second": 252.736, | |
| "train_steps_per_second": 31.592, | |
| "train_tokens_per_second": 17096.35 | |
| } | |
| ], | |
| "logging_steps": 500, | |
| "max_steps": 499760, | |
| "num_input_tokens_seen": 270444104, | |
| "num_train_epochs": 5, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 7.16219760157655e+16, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |