{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 200,
"global_step": 230,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.008695652173913044,
"grad_norm": 3.7926358600905217,
"learning_rate": 9.999533579822611e-06,
"loss": 0.5803,
"step": 1
},
{
"epoch": 0.017391304347826087,
"grad_norm": 2.2668745586493317,
"learning_rate": 9.998134406309555e-06,
"loss": 0.5189,
"step": 2
},
{
"epoch": 0.02608695652173913,
"grad_norm": 1.9862936871924635,
"learning_rate": 9.995802740501933e-06,
"loss": 0.4599,
"step": 3
},
{
"epoch": 0.034782608695652174,
"grad_norm": 1.2694729495905928,
"learning_rate": 9.99253901741414e-06,
"loss": 0.4366,
"step": 4
},
{
"epoch": 0.043478260869565216,
"grad_norm": 2.0758249796329893,
"learning_rate": 9.988343845952697e-06,
"loss": 0.4183,
"step": 5
},
{
"epoch": 0.05217391304347826,
"grad_norm": 1.3497919973348147,
"learning_rate": 9.983218008802648e-06,
"loss": 0.4296,
"step": 6
},
{
"epoch": 0.06086956521739131,
"grad_norm": 1.084195929841202,
"learning_rate": 9.977162462281544e-06,
"loss": 0.416,
"step": 7
},
{
"epoch": 0.06956521739130435,
"grad_norm": 1.4067964292092314,
"learning_rate": 9.970178336161018e-06,
"loss": 0.4288,
"step": 8
},
{
"epoch": 0.0782608695652174,
"grad_norm": 1.3249760789438039,
"learning_rate": 9.962266933456008e-06,
"loss": 0.3853,
"step": 9
},
{
"epoch": 0.08695652173913043,
"grad_norm": 1.0437807857737307,
"learning_rate": 9.953429730181653e-06,
"loss": 0.3933,
"step": 10
},
{
"epoch": 0.09565217391304348,
"grad_norm": 1.0459327079306973,
"learning_rate": 9.943668375077926e-06,
"loss": 0.3661,
"step": 11
},
{
"epoch": 0.10434782608695652,
"grad_norm": 0.9711779587776549,
"learning_rate": 9.932984689302012e-06,
"loss": 0.3516,
"step": 12
},
{
"epoch": 0.11304347826086956,
"grad_norm": 0.948229340133233,
"learning_rate": 9.921380666088558e-06,
"loss": 0.3444,
"step": 13
},
{
"epoch": 0.12173913043478261,
"grad_norm": 0.9360209169160184,
"learning_rate": 9.908858470377793e-06,
"loss": 0.3563,
"step": 14
},
{
"epoch": 0.13043478260869565,
"grad_norm": 0.9729720279408952,
"learning_rate": 9.895420438411616e-06,
"loss": 0.3614,
"step": 15
},
{
"epoch": 0.1391304347826087,
"grad_norm": 0.9194389440786153,
"learning_rate": 9.881069077297724e-06,
"loss": 0.3682,
"step": 16
},
{
"epoch": 0.14782608695652175,
"grad_norm": 0.9306147149273309,
"learning_rate": 9.865807064541878e-06,
"loss": 0.3772,
"step": 17
},
{
"epoch": 0.1565217391304348,
"grad_norm": 0.9687236184448399,
"learning_rate": 9.849637247548356e-06,
"loss": 0.3658,
"step": 18
},
{
"epoch": 0.16521739130434782,
"grad_norm": 0.8805129646890217,
"learning_rate": 9.832562643088724e-06,
"loss": 0.3536,
"step": 19
},
{
"epoch": 0.17391304347826086,
"grad_norm": 0.9082567693373124,
"learning_rate": 9.814586436738998e-06,
"loss": 0.3631,
"step": 20
},
{
"epoch": 0.1826086956521739,
"grad_norm": 0.8376153293313543,
"learning_rate": 9.795711982285317e-06,
"loss": 0.3346,
"step": 21
},
{
"epoch": 0.19130434782608696,
"grad_norm": 0.9023317044137213,
"learning_rate": 9.775942801098241e-06,
"loss": 0.3332,
"step": 22
},
{
"epoch": 0.2,
"grad_norm": 0.8710245222199772,
"learning_rate": 9.755282581475769e-06,
"loss": 0.3294,
"step": 23
},
{
"epoch": 0.20869565217391303,
"grad_norm": 0.8572636268979127,
"learning_rate": 9.733735177955219e-06,
"loss": 0.3258,
"step": 24
},
{
"epoch": 0.21739130434782608,
"grad_norm": 0.8794360826361988,
"learning_rate": 9.711304610594104e-06,
"loss": 0.3401,
"step": 25
},
{
"epoch": 0.22608695652173913,
"grad_norm": 0.8468918683843584,
"learning_rate": 9.687995064220102e-06,
"loss": 0.3207,
"step": 26
},
{
"epoch": 0.23478260869565218,
"grad_norm": 0.7951556882080077,
"learning_rate": 9.66381088765032e-06,
"loss": 0.3038,
"step": 27
},
{
"epoch": 0.24347826086956523,
"grad_norm": 0.8479752814618233,
"learning_rate": 9.638756592879923e-06,
"loss": 0.3025,
"step": 28
},
{
"epoch": 0.25217391304347825,
"grad_norm": 0.8497716303032561,
"learning_rate": 9.61283685424036e-06,
"loss": 0.316,
"step": 29
},
{
"epoch": 0.2608695652173913,
"grad_norm": 0.7882618321826594,
"learning_rate": 9.586056507527266e-06,
"loss": 0.302,
"step": 30
},
{
"epoch": 0.26956521739130435,
"grad_norm": 0.955601321887552,
"learning_rate": 9.558420549098269e-06,
"loss": 0.3375,
"step": 31
},
{
"epoch": 0.2782608695652174,
"grad_norm": 0.8703236742219601,
"learning_rate": 9.529934134940819e-06,
"loss": 0.3158,
"step": 32
},
{
"epoch": 0.28695652173913044,
"grad_norm": 0.7804427309071655,
"learning_rate": 9.500602579710256e-06,
"loss": 0.3094,
"step": 33
},
{
"epoch": 0.2956521739130435,
"grad_norm": 0.8380605647142672,
"learning_rate": 9.470431355738257e-06,
"loss": 0.3124,
"step": 34
},
{
"epoch": 0.30434782608695654,
"grad_norm": 0.8709665357527219,
"learning_rate": 9.439426092011877e-06,
"loss": 0.2978,
"step": 35
},
{
"epoch": 0.3130434782608696,
"grad_norm": 0.8331897634678387,
"learning_rate": 9.407592573123359e-06,
"loss": 0.2977,
"step": 36
},
{
"epoch": 0.3217391304347826,
"grad_norm": 0.8074752227765223,
"learning_rate": 9.374936738190913e-06,
"loss": 0.2998,
"step": 37
},
{
"epoch": 0.33043478260869563,
"grad_norm": 0.7908935634758442,
"learning_rate": 9.341464679750669e-06,
"loss": 0.2837,
"step": 38
},
{
"epoch": 0.3391304347826087,
"grad_norm": 0.8044442561944394,
"learning_rate": 9.307182642620001e-06,
"loss": 0.3048,
"step": 39
},
{
"epoch": 0.34782608695652173,
"grad_norm": 0.8115148300262227,
"learning_rate": 9.272097022732444e-06,
"loss": 0.3,
"step": 40
},
{
"epoch": 0.3565217391304348,
"grad_norm": 0.852441800223183,
"learning_rate": 9.236214365944418e-06,
"loss": 0.3159,
"step": 41
},
{
"epoch": 0.3652173913043478,
"grad_norm": 0.7948116584462347,
"learning_rate": 9.199541366813984e-06,
"loss": 0.2807,
"step": 42
},
{
"epoch": 0.3739130434782609,
"grad_norm": 0.8960986696566211,
"learning_rate": 9.16208486735184e-06,
"loss": 0.3083,
"step": 43
},
{
"epoch": 0.3826086956521739,
"grad_norm": 0.8013923826757388,
"learning_rate": 9.123851855744842e-06,
"loss": 0.2738,
"step": 44
},
{
"epoch": 0.391304347826087,
"grad_norm": 0.8179633207004794,
"learning_rate": 9.08484946505221e-06,
"loss": 0.2743,
"step": 45
},
{
"epoch": 0.4,
"grad_norm": 0.786040806251314,
"learning_rate": 9.045084971874738e-06,
"loss": 0.3028,
"step": 46
},
{
"epoch": 0.40869565217391307,
"grad_norm": 0.8287753823112483,
"learning_rate": 9.004565794997209e-06,
"loss": 0.2907,
"step": 47
},
{
"epoch": 0.41739130434782606,
"grad_norm": 0.8122421606296455,
"learning_rate": 8.963299494004292e-06,
"loss": 0.3084,
"step": 48
},
{
"epoch": 0.4260869565217391,
"grad_norm": 0.8176925065091848,
"learning_rate": 8.921293767870157e-06,
"loss": 0.3044,
"step": 49
},
{
"epoch": 0.43478260869565216,
"grad_norm": 0.8635746143455237,
"learning_rate": 8.8785564535221e-06,
"loss": 0.2806,
"step": 50
},
{
"epoch": 0.4434782608695652,
"grad_norm": 0.7855238455407065,
"learning_rate": 8.835095524378413e-06,
"loss": 0.2796,
"step": 51
},
{
"epoch": 0.45217391304347826,
"grad_norm": 0.8884485180748448,
"learning_rate": 8.790919088860815e-06,
"loss": 0.2609,
"step": 52
},
{
"epoch": 0.4608695652173913,
"grad_norm": 0.7650813052277443,
"learning_rate": 8.746035388881655e-06,
"loss": 0.2748,
"step": 53
},
{
"epoch": 0.46956521739130436,
"grad_norm": 0.8536449205315236,
"learning_rate": 8.70045279830626e-06,
"loss": 0.291,
"step": 54
},
{
"epoch": 0.4782608695652174,
"grad_norm": 0.8275442712109471,
"learning_rate": 8.65417982139062e-06,
"loss": 0.2929,
"step": 55
},
{
"epoch": 0.48695652173913045,
"grad_norm": 0.8301805259333005,
"learning_rate": 8.60722509119478e-06,
"loss": 0.276,
"step": 56
},
{
"epoch": 0.4956521739130435,
"grad_norm": 0.8389086753476661,
"learning_rate": 8.559597367972168e-06,
"loss": 0.2735,
"step": 57
},
{
"epoch": 0.5043478260869565,
"grad_norm": 0.8089334883453542,
"learning_rate": 8.511305537535238e-06,
"loss": 0.2949,
"step": 58
},
{
"epoch": 0.5130434782608696,
"grad_norm": 0.820700967105469,
"learning_rate": 8.462358609597629e-06,
"loss": 0.2871,
"step": 59
},
{
"epoch": 0.5217391304347826,
"grad_norm": 0.7774118964213651,
"learning_rate": 8.412765716093273e-06,
"loss": 0.2954,
"step": 60
},
{
"epoch": 0.5304347826086957,
"grad_norm": 0.7652717489483709,
"learning_rate": 8.362536109472637e-06,
"loss": 0.2682,
"step": 61
},
{
"epoch": 0.5391304347826087,
"grad_norm": 0.7721075425818443,
"learning_rate": 8.31167916097654e-06,
"loss": 0.2586,
"step": 62
},
{
"epoch": 0.5478260869565217,
"grad_norm": 0.7758858894092164,
"learning_rate": 8.260204358887753e-06,
"loss": 0.2794,
"step": 63
},
{
"epoch": 0.5565217391304348,
"grad_norm": 0.8298683457635183,
"learning_rate": 8.208121306760806e-06,
"loss": 0.2918,
"step": 64
},
{
"epoch": 0.5652173913043478,
"grad_norm": 0.8165609238598426,
"learning_rate": 8.155439721630265e-06,
"loss": 0.2684,
"step": 65
},
{
"epoch": 0.5739130434782609,
"grad_norm": 0.8060261148924719,
"learning_rate": 8.102169432197842e-06,
"loss": 0.2696,
"step": 66
},
{
"epoch": 0.5826086956521739,
"grad_norm": 0.7860069264875678,
"learning_rate": 8.048320376998675e-06,
"loss": 0.2687,
"step": 67
},
{
"epoch": 0.591304347826087,
"grad_norm": 0.8231441428650305,
"learning_rate": 7.993902602547113e-06,
"loss": 0.2924,
"step": 68
},
{
"epoch": 0.6,
"grad_norm": 0.7510155765430271,
"learning_rate": 7.938926261462366e-06,
"loss": 0.2787,
"step": 69
},
{
"epoch": 0.6086956521739131,
"grad_norm": 0.7978371064266664,
"learning_rate": 7.883401610574338e-06,
"loss": 0.2556,
"step": 70
},
{
"epoch": 0.6173913043478261,
"grad_norm": 0.7796458243960821,
"learning_rate": 7.82733900901003e-06,
"loss": 0.2484,
"step": 71
},
{
"epoch": 0.6260869565217392,
"grad_norm": 0.8061635938809482,
"learning_rate": 7.770748916260875e-06,
"loss": 0.2717,
"step": 72
},
{
"epoch": 0.6347826086956522,
"grad_norm": 0.7989011379216687,
"learning_rate": 7.71364189023131e-06,
"loss": 0.2769,
"step": 73
},
{
"epoch": 0.6434782608695652,
"grad_norm": 0.864101430205087,
"learning_rate": 7.656028585269017e-06,
"loss": 0.3021,
"step": 74
},
{
"epoch": 0.6521739130434783,
"grad_norm": 0.8278537319450044,
"learning_rate": 7.597919750177168e-06,
"loss": 0.2743,
"step": 75
},
{
"epoch": 0.6608695652173913,
"grad_norm": 0.8122110745662657,
"learning_rate": 7.539326226209032e-06,
"loss": 0.2575,
"step": 76
},
{
"epoch": 0.6695652173913044,
"grad_norm": 0.7711738609894855,
"learning_rate": 7.4802589450453415e-06,
"loss": 0.2419,
"step": 77
},
{
"epoch": 0.6782608695652174,
"grad_norm": 0.840255520753878,
"learning_rate": 7.420728926754803e-06,
"loss": 0.2603,
"step": 78
},
{
"epoch": 0.6869565217391305,
"grad_norm": 0.8099638111512892,
"learning_rate": 7.360747277738094e-06,
"loss": 0.2412,
"step": 79
},
{
"epoch": 0.6956521739130435,
"grad_norm": 0.8164686251132438,
"learning_rate": 7.300325188655762e-06,
"loss": 0.2931,
"step": 80
},
{
"epoch": 0.7043478260869566,
"grad_norm": 0.8447041702102357,
"learning_rate": 7.2394739323404105e-06,
"loss": 0.2571,
"step": 81
},
{
"epoch": 0.7130434782608696,
"grad_norm": 0.8780088662577503,
"learning_rate": 7.178204861693546e-06,
"loss": 0.2648,
"step": 82
},
{
"epoch": 0.7217391304347827,
"grad_norm": 0.7949925402138125,
"learning_rate": 7.116529407567489e-06,
"loss": 0.2531,
"step": 83
},
{
"epoch": 0.7304347826086957,
"grad_norm": 0.831142025769961,
"learning_rate": 7.054459076632742e-06,
"loss": 0.2627,
"step": 84
},
{
"epoch": 0.7391304347826086,
"grad_norm": 0.7928493372875978,
"learning_rate": 6.9920054492312086e-06,
"loss": 0.2542,
"step": 85
},
{
"epoch": 0.7478260869565218,
"grad_norm": 0.8517974005099634,
"learning_rate": 6.9291801772156775e-06,
"loss": 0.2845,
"step": 86
},
{
"epoch": 0.7565217391304347,
"grad_norm": 0.8582102116535217,
"learning_rate": 6.865994981775958e-06,
"loss": 0.2618,
"step": 87
},
{
"epoch": 0.7652173913043478,
"grad_norm": 0.767586638125915,
"learning_rate": 6.802461651252073e-06,
"loss": 0.2489,
"step": 88
},
{
"epoch": 0.7739130434782608,
"grad_norm": 0.7908516439864228,
"learning_rate": 6.738592038934946e-06,
"loss": 0.26,
"step": 89
},
{
"epoch": 0.782608695652174,
"grad_norm": 0.769843457519541,
"learning_rate": 6.674398060854931e-06,
"loss": 0.24,
"step": 90
},
{
"epoch": 0.7913043478260869,
"grad_norm": 0.7648359919673459,
"learning_rate": 6.609891693558692e-06,
"loss": 0.2356,
"step": 91
},
{
"epoch": 0.8,
"grad_norm": 0.8568063943639027,
"learning_rate": 6.545084971874738e-06,
"loss": 0.2619,
"step": 92
},
{
"epoch": 0.808695652173913,
"grad_norm": 0.8386630007537701,
"learning_rate": 6.479989986668118e-06,
"loss": 0.2665,
"step": 93
},
{
"epoch": 0.8173913043478261,
"grad_norm": 0.8346094680957461,
"learning_rate": 6.41461888258465e-06,
"loss": 0.2512,
"step": 94
},
{
"epoch": 0.8260869565217391,
"grad_norm": 0.7848872621892734,
"learning_rate": 6.348983855785122e-06,
"loss": 0.2319,
"step": 95
},
{
"epoch": 0.8347826086956521,
"grad_norm": 0.8300577038933864,
"learning_rate": 6.283097151669869e-06,
"loss": 0.2546,
"step": 96
},
{
"epoch": 0.8434782608695652,
"grad_norm": 0.8204772764658962,
"learning_rate": 6.216971062594179e-06,
"loss": 0.2506,
"step": 97
},
{
"epoch": 0.8521739130434782,
"grad_norm": 0.7507739899425591,
"learning_rate": 6.1506179255749335e-06,
"loss": 0.2371,
"step": 98
},
{
"epoch": 0.8608695652173913,
"grad_norm": 0.8368374987307072,
"learning_rate": 6.084050119988905e-06,
"loss": 0.2932,
"step": 99
},
{
"epoch": 0.8695652173913043,
"grad_norm": 0.8038317577458294,
"learning_rate": 6.0172800652631706e-06,
"loss": 0.2353,
"step": 100
},
{
"epoch": 0.8782608695652174,
"grad_norm": 0.7429277667599836,
"learning_rate": 5.950320218558037e-06,
"loss": 0.2473,
"step": 101
},
{
"epoch": 0.8869565217391304,
"grad_norm": 0.769341715157268,
"learning_rate": 5.883183072442938e-06,
"loss": 0.2454,
"step": 102
},
{
"epoch": 0.8956521739130435,
"grad_norm": 0.7475171914223133,
"learning_rate": 5.815881152565712e-06,
"loss": 0.2453,
"step": 103
},
{
"epoch": 0.9043478260869565,
"grad_norm": 0.7639664085864444,
"learning_rate": 5.7484270153157215e-06,
"loss": 0.2549,
"step": 104
},
{
"epoch": 0.9130434782608695,
"grad_norm": 0.7488925341912562,
"learning_rate": 5.680833245481234e-06,
"loss": 0.2099,
"step": 105
},
{
"epoch": 0.9217391304347826,
"grad_norm": 0.7604429182896406,
"learning_rate": 5.613112453901493e-06,
"loss": 0.235,
"step": 106
},
{
"epoch": 0.9304347826086956,
"grad_norm": 0.8074527670983,
"learning_rate": 5.5452772751139496e-06,
"loss": 0.2712,
"step": 107
},
{
"epoch": 0.9391304347826087,
"grad_norm": 0.8194915717155427,
"learning_rate": 5.477340364997051e-06,
"loss": 0.2394,
"step": 108
},
{
"epoch": 0.9478260869565217,
"grad_norm": 0.7736384257440181,
"learning_rate": 5.409314398409067e-06,
"loss": 0.2515,
"step": 109
},
{
"epoch": 0.9565217391304348,
"grad_norm": 0.8129900825691735,
"learning_rate": 5.341212066823356e-06,
"loss": 0.2252,
"step": 110
},
{
"epoch": 0.9652173913043478,
"grad_norm": 0.7997220935225438,
"learning_rate": 5.27304607596055e-06,
"loss": 0.2361,
"step": 111
},
{
"epoch": 0.9739130434782609,
"grad_norm": 0.7419969863966672,
"learning_rate": 5.204829143418072e-06,
"loss": 0.2483,
"step": 112
},
{
"epoch": 0.9826086956521739,
"grad_norm": 0.8631021669867655,
"learning_rate": 5.136573996297431e-06,
"loss": 0.2353,
"step": 113
},
{
"epoch": 0.991304347826087,
"grad_norm": 0.8601407448964584,
"learning_rate": 5.068293368829755e-06,
"loss": 0.2506,
"step": 114
},
{
"epoch": 1.0,
"grad_norm": 0.780980744965185,
"learning_rate": 5e-06,
"loss": 0.2027,
"step": 115
},
{
"epoch": 1.008695652173913,
"grad_norm": 0.8783073478309215,
"learning_rate": 4.931706631170246e-06,
"loss": 0.182,
"step": 116
},
{
"epoch": 1.017391304347826,
"grad_norm": 0.865055770897707,
"learning_rate": 4.863426003702572e-06,
"loss": 0.1865,
"step": 117
},
{
"epoch": 1.0260869565217392,
"grad_norm": 0.7349650330840299,
"learning_rate": 4.795170856581929e-06,
"loss": 0.1817,
"step": 118
},
{
"epoch": 1.0347826086956522,
"grad_norm": 0.9717080563732651,
"learning_rate": 4.7269539240394505e-06,
"loss": 0.1854,
"step": 119
},
{
"epoch": 1.0434782608695652,
"grad_norm": 1.0648618499611149,
"learning_rate": 4.6587879331766465e-06,
"loss": 0.1882,
"step": 120
},
{
"epoch": 1.0521739130434782,
"grad_norm": 0.8422751330473561,
"learning_rate": 4.5906856015909365e-06,
"loss": 0.193,
"step": 121
},
{
"epoch": 1.0608695652173914,
"grad_norm": 0.9085710994537606,
"learning_rate": 4.52265963500295e-06,
"loss": 0.1897,
"step": 122
},
{
"epoch": 1.0695652173913044,
"grad_norm": 0.8261531122651687,
"learning_rate": 4.454722724886051e-06,
"loss": 0.1801,
"step": 123
},
{
"epoch": 1.0782608695652174,
"grad_norm": 0.8029582775928125,
"learning_rate": 4.386887546098509e-06,
"loss": 0.1711,
"step": 124
},
{
"epoch": 1.0869565217391304,
"grad_norm": 0.7584485615555612,
"learning_rate": 4.319166754518768e-06,
"loss": 0.1864,
"step": 125
},
{
"epoch": 1.0956521739130434,
"grad_norm": 0.7830746819993128,
"learning_rate": 4.251572984684281e-06,
"loss": 0.1656,
"step": 126
},
{
"epoch": 1.1043478260869566,
"grad_norm": 0.8183661828315899,
"learning_rate": 4.18411884743429e-06,
"loss": 0.1963,
"step": 127
},
{
"epoch": 1.1130434782608696,
"grad_norm": 0.8124834712086673,
"learning_rate": 4.116816927557063e-06,
"loss": 0.1949,
"step": 128
},
{
"epoch": 1.1217391304347826,
"grad_norm": 0.8095938552168147,
"learning_rate": 4.0496797814419655e-06,
"loss": 0.1928,
"step": 129
},
{
"epoch": 1.1304347826086956,
"grad_norm": 0.8244459053183879,
"learning_rate": 3.982719934736832e-06,
"loss": 0.1637,
"step": 130
},
{
"epoch": 1.1391304347826088,
"grad_norm": 0.844230690198155,
"learning_rate": 3.915949880011096e-06,
"loss": 0.1761,
"step": 131
},
{
"epoch": 1.1478260869565218,
"grad_norm": 0.7629752124336897,
"learning_rate": 3.849382074425069e-06,
"loss": 0.1655,
"step": 132
},
{
"epoch": 1.1565217391304348,
"grad_norm": 0.8119000179231529,
"learning_rate": 3.7830289374058214e-06,
"loss": 0.1627,
"step": 133
},
{
"epoch": 1.1652173913043478,
"grad_norm": 0.846000022809166,
"learning_rate": 3.7169028483301333e-06,
"loss": 0.1825,
"step": 134
},
{
"epoch": 1.1739130434782608,
"grad_norm": 0.8594092036105717,
"learning_rate": 3.6510161442148783e-06,
"loss": 0.1623,
"step": 135
},
{
"epoch": 1.182608695652174,
"grad_norm": 0.8134429234442735,
"learning_rate": 3.58538111741535e-06,
"loss": 0.1677,
"step": 136
},
{
"epoch": 1.191304347826087,
"grad_norm": 0.8220007479652487,
"learning_rate": 3.5200100133318836e-06,
"loss": 0.1802,
"step": 137
},
{
"epoch": 1.2,
"grad_norm": 0.7370262307110818,
"learning_rate": 3.4549150281252635e-06,
"loss": 0.1577,
"step": 138
},
{
"epoch": 1.208695652173913,
"grad_norm": 0.7776508826903247,
"learning_rate": 3.39010830644131e-06,
"loss": 0.1692,
"step": 139
},
{
"epoch": 1.2173913043478262,
"grad_norm": 0.8176638873211497,
"learning_rate": 3.3256019391450696e-06,
"loss": 0.1839,
"step": 140
},
{
"epoch": 1.2260869565217392,
"grad_norm": 0.7824859620156875,
"learning_rate": 3.261407961065056e-06,
"loss": 0.1658,
"step": 141
},
{
"epoch": 1.2347826086956522,
"grad_norm": 0.7898196741529216,
"learning_rate": 3.197538348747927e-06,
"loss": 0.1595,
"step": 142
},
{
"epoch": 1.2434782608695651,
"grad_norm": 0.7779942748101323,
"learning_rate": 3.1340050182240438e-06,
"loss": 0.1736,
"step": 143
},
{
"epoch": 1.2521739130434781,
"grad_norm": 0.7835099146449637,
"learning_rate": 3.070819822784323e-06,
"loss": 0.1765,
"step": 144
},
{
"epoch": 1.2608695652173914,
"grad_norm": 0.7750925939060943,
"learning_rate": 3.007994550768793e-06,
"loss": 0.1668,
"step": 145
},
{
"epoch": 1.2695652173913043,
"grad_norm": 0.7898432110470173,
"learning_rate": 2.9455409233672594e-06,
"loss": 0.1818,
"step": 146
},
{
"epoch": 1.2782608695652173,
"grad_norm": 0.7928568552460014,
"learning_rate": 2.883470592432512e-06,
"loss": 0.1662,
"step": 147
},
{
"epoch": 1.2869565217391306,
"grad_norm": 0.8040324383192976,
"learning_rate": 2.8217951383064546e-06,
"loss": 0.1738,
"step": 148
},
{
"epoch": 1.2956521739130435,
"grad_norm": 0.7914202714352641,
"learning_rate": 2.760526067659591e-06,
"loss": 0.1749,
"step": 149
},
{
"epoch": 1.3043478260869565,
"grad_norm": 0.7986655344728175,
"learning_rate": 2.6996748113442397e-06,
"loss": 0.1674,
"step": 150
},
{
"epoch": 1.3130434782608695,
"grad_norm": 0.7695628779858434,
"learning_rate": 2.6392527222619078e-06,
"loss": 0.1649,
"step": 151
},
{
"epoch": 1.3217391304347825,
"grad_norm": 0.8297816048388459,
"learning_rate": 2.5792710732452e-06,
"loss": 0.1861,
"step": 152
},
{
"epoch": 1.3304347826086955,
"grad_norm": 0.8506069393475439,
"learning_rate": 2.5197410549546598e-06,
"loss": 0.1821,
"step": 153
},
{
"epoch": 1.3391304347826087,
"grad_norm": 0.8301454895248155,
"learning_rate": 2.4606737737909696e-06,
"loss": 0.1773,
"step": 154
},
{
"epoch": 1.3478260869565217,
"grad_norm": 0.790839747859368,
"learning_rate": 2.4020802498228333e-06,
"loss": 0.173,
"step": 155
},
{
"epoch": 1.3565217391304347,
"grad_norm": 0.7882458666352696,
"learning_rate": 2.3439714147309845e-06,
"loss": 0.1621,
"step": 156
},
{
"epoch": 1.365217391304348,
"grad_norm": 0.7824053265477245,
"learning_rate": 2.286358109768693e-06,
"loss": 0.1795,
"step": 157
},
{
"epoch": 1.373913043478261,
"grad_norm": 0.8373897265333546,
"learning_rate": 2.229251083739127e-06,
"loss": 0.1579,
"step": 158
},
{
"epoch": 1.382608695652174,
"grad_norm": 0.7920339711759135,
"learning_rate": 2.172660990989971e-06,
"loss": 0.1633,
"step": 159
},
{
"epoch": 1.391304347826087,
"grad_norm": 0.7979954487075409,
"learning_rate": 2.1165983894256647e-06,
"loss": 0.1903,
"step": 160
},
{
"epoch": 1.4,
"grad_norm": 0.8149840233997447,
"learning_rate": 2.061073738537635e-06,
"loss": 0.178,
"step": 161
},
{
"epoch": 1.4086956521739131,
"grad_norm": 0.7630389521914079,
"learning_rate": 2.0060973974528873e-06,
"loss": 0.1695,
"step": 162
},
{
"epoch": 1.4173913043478261,
"grad_norm": 0.8633540398907231,
"learning_rate": 1.9516796230013275e-06,
"loss": 0.1793,
"step": 163
},
{
"epoch": 1.4260869565217391,
"grad_norm": 0.8073054302745539,
"learning_rate": 1.8978305678021598e-06,
"loss": 0.1742,
"step": 164
},
{
"epoch": 1.434782608695652,
"grad_norm": 0.722866407642355,
"learning_rate": 1.8445602783697375e-06,
"loss": 0.1442,
"step": 165
},
{
"epoch": 1.4434782608695653,
"grad_norm": 0.7679620332308171,
"learning_rate": 1.7918786932391945e-06,
"loss": 0.1549,
"step": 166
},
{
"epoch": 1.4521739130434783,
"grad_norm": 0.8260530917417228,
"learning_rate": 1.739795641112248e-06,
"loss": 0.1835,
"step": 167
},
{
"epoch": 1.4608695652173913,
"grad_norm": 0.8308341283564098,
"learning_rate": 1.688320839023463e-06,
"loss": 0.1753,
"step": 168
},
{
"epoch": 1.4695652173913043,
"grad_norm": 0.8090290221136681,
"learning_rate": 1.6374638905273643e-06,
"loss": 0.1708,
"step": 169
},
{
"epoch": 1.4782608695652173,
"grad_norm": 0.7969580608731678,
"learning_rate": 1.5872342839067305e-06,
"loss": 0.1801,
"step": 170
},
{
"epoch": 1.4869565217391305,
"grad_norm": 0.7907013956069745,
"learning_rate": 1.5376413904023723e-06,
"loss": 0.1726,
"step": 171
},
{
"epoch": 1.4956521739130435,
"grad_norm": 0.7354823027756849,
"learning_rate": 1.4886944624647647e-06,
"loss": 0.1634,
"step": 172
},
{
"epoch": 1.5043478260869565,
"grad_norm": 0.8351529915110587,
"learning_rate": 1.4404026320278318e-06,
"loss": 0.179,
"step": 173
},
{
"epoch": 1.5130434782608697,
"grad_norm": 0.7652545719491372,
"learning_rate": 1.3927749088052218e-06,
"loss": 0.1633,
"step": 174
},
{
"epoch": 1.5217391304347827,
"grad_norm": 0.7715636328403607,
"learning_rate": 1.3458201786093795e-06,
"loss": 0.1648,
"step": 175
},
{
"epoch": 1.5304347826086957,
"grad_norm": 0.7859905267821279,
"learning_rate": 1.2995472016937405e-06,
"loss": 0.1665,
"step": 176
},
{
"epoch": 1.5391304347826087,
"grad_norm": 0.7541231060673731,
"learning_rate": 1.2539646111183452e-06,
"loss": 0.1581,
"step": 177
},
{
"epoch": 1.5478260869565217,
"grad_norm": 0.8146161995323012,
"learning_rate": 1.209080911139187e-06,
"loss": 0.1654,
"step": 178
},
{
"epoch": 1.5565217391304347,
"grad_norm": 0.7815523734014722,
"learning_rate": 1.1649044756215872e-06,
"loss": 0.172,
"step": 179
},
{
"epoch": 1.5652173913043477,
"grad_norm": 0.8360675369328597,
"learning_rate": 1.1214435464779006e-06,
"loss": 0.1807,
"step": 180
},
{
"epoch": 1.5739130434782609,
"grad_norm": 0.7737095960966441,
"learning_rate": 1.0787062321298441e-06,
"loss": 0.1664,
"step": 181
},
{
"epoch": 1.5826086956521739,
"grad_norm": 0.8330243600872347,
"learning_rate": 1.0367005059957097e-06,
"loss": 0.1866,
"step": 182
},
{
"epoch": 1.591304347826087,
"grad_norm": 0.7473163483980059,
"learning_rate": 9.954342050027922e-07,
"loss": 0.1423,
"step": 183
},
{
"epoch": 1.6,
"grad_norm": 0.8013933158687357,
"learning_rate": 9.549150281252633e-07,
"loss": 0.1651,
"step": 184
},
{
"epoch": 1.608695652173913,
"grad_norm": 0.7797439742691399,
"learning_rate": 9.151505349477901e-07,
"loss": 0.1637,
"step": 185
},
{
"epoch": 1.617391304347826,
"grad_norm": 0.817942121185659,
"learning_rate": 8.761481442551573e-07,
"loss": 0.1772,
"step": 186
},
{
"epoch": 1.626086956521739,
"grad_norm": 0.7623090718882819,
"learning_rate": 8.379151326481588e-07,
"loss": 0.174,
"step": 187
},
{
"epoch": 1.634782608695652,
"grad_norm": 0.7778537015682799,
"learning_rate": 8.004586331860176e-07,
"loss": 0.1692,
"step": 188
},
{
"epoch": 1.643478260869565,
"grad_norm": 0.7688198840780248,
"learning_rate": 7.637856340555822e-07,
"loss": 0.1761,
"step": 189
},
{
"epoch": 1.6521739130434783,
"grad_norm": 0.758217526690572,
"learning_rate": 7.279029772675572e-07,
"loss": 0.1593,
"step": 190
},
{
"epoch": 1.6608695652173913,
"grad_norm": 0.7687356483656284,
"learning_rate": 6.928173573800007e-07,
"loss": 0.173,
"step": 191
},
{
"epoch": 1.6695652173913045,
"grad_norm": 0.7476106387060736,
"learning_rate": 6.585353202493322e-07,
"loss": 0.1489,
"step": 192
},
{
"epoch": 1.6782608695652175,
"grad_norm": 0.7903786955623783,
"learning_rate": 6.250632618090868e-07,
"loss": 0.1474,
"step": 193
},
{
"epoch": 1.6869565217391305,
"grad_norm": 0.7763900052345729,
"learning_rate": 5.924074268766422e-07,
"loss": 0.1696,
"step": 194
},
{
"epoch": 1.6956521739130435,
"grad_norm": 0.77492512404569,
"learning_rate": 5.60573907988124e-07,
"loss": 0.1539,
"step": 195
},
{
"epoch": 1.7043478260869565,
"grad_norm": 0.7913359885775756,
"learning_rate": 5.295686442617442e-07,
"loss": 0.1635,
"step": 196
},
{
"epoch": 1.7130434782608694,
"grad_norm": 0.7559937122334403,
"learning_rate": 4.993974202897456e-07,
"loss": 0.1698,
"step": 197
},
{
"epoch": 1.7217391304347827,
"grad_norm": 0.759028302384717,
"learning_rate": 4.7006586505918273e-07,
"loss": 0.1798,
"step": 198
},
{
"epoch": 1.7304347826086957,
"grad_norm": 0.8392736281661952,
"learning_rate": 4.4157945090173294e-07,
"loss": 0.1736,
"step": 199
},
{
"epoch": 1.7391304347826086,
"grad_norm": 0.7447439836659154,
"learning_rate": 4.139434924727359e-07,
"loss": 0.1511,
"step": 200
},
{
"epoch": 1.7391304347826086,
"eval_loss": 0.24148985743522644,
"eval_runtime": 2.46,
"eval_samples_per_second": 15.041,
"eval_steps_per_second": 4.065,
"step": 200
},
{
"epoch": 1.7478260869565219,
"grad_norm": 0.7605852897700421,
"learning_rate": 3.8716314575964197e-07,
"loss": 0.168,
"step": 201
},
{
"epoch": 1.7565217391304349,
"grad_norm": 0.7737486779158945,
"learning_rate": 3.612434071200771e-07,
"loss": 0.1673,
"step": 202
},
{
"epoch": 1.7652173913043478,
"grad_norm": 0.7367635811704382,
"learning_rate": 3.361891123496824e-07,
"loss": 0.1689,
"step": 203
},
{
"epoch": 1.7739130434782608,
"grad_norm": 0.7393738966424859,
"learning_rate": 3.1200493577989875e-07,
"loss": 0.147,
"step": 204
},
{
"epoch": 1.7826086956521738,
"grad_norm": 0.7364978828192128,
"learning_rate": 2.88695389405898e-07,
"loss": 0.1638,
"step": 205
},
{
"epoch": 1.7913043478260868,
"grad_norm": 0.790856673599725,
"learning_rate": 2.662648220447811e-07,
"loss": 0.1807,
"step": 206
},
{
"epoch": 1.8,
"grad_norm": 0.7916192579497792,
"learning_rate": 2.447174185242324e-07,
"loss": 0.1788,
"step": 207
},
{
"epoch": 1.808695652173913,
"grad_norm": 0.7497255291970046,
"learning_rate": 2.240571989017598e-07,
"loss": 0.1745,
"step": 208
},
{
"epoch": 1.8173913043478263,
"grad_norm": 0.7858304474847134,
"learning_rate": 2.0428801771468388e-07,
"loss": 0.1697,
"step": 209
},
{
"epoch": 1.8260869565217392,
"grad_norm": 0.7886147342222705,
"learning_rate": 1.8541356326100436e-07,
"loss": 0.1625,
"step": 210
},
{
"epoch": 1.8347826086956522,
"grad_norm": 0.7782651194135675,
"learning_rate": 1.6743735691127639e-07,
"loss": 0.1604,
"step": 211
},
{
"epoch": 1.8434782608695652,
"grad_norm": 0.7518227466064474,
"learning_rate": 1.5036275245164377e-07,
"loss": 0.171,
"step": 212
},
{
"epoch": 1.8521739130434782,
"grad_norm": 0.787456924146497,
"learning_rate": 1.341929354581234e-07,
"loss": 0.1647,
"step": 213
},
{
"epoch": 1.8608695652173912,
"grad_norm": 0.7593791059719298,
"learning_rate": 1.1893092270227724e-07,
"loss": 0.1742,
"step": 214
},
{
"epoch": 1.8695652173913042,
"grad_norm": 0.7592456757071181,
"learning_rate": 1.0457956158838545e-07,
"loss": 0.1803,
"step": 215
},
{
"epoch": 1.8782608695652174,
"grad_norm": 0.7488455785176448,
"learning_rate": 9.114152962220734e-08,
"loss": 0.1752,
"step": 216
},
{
"epoch": 1.8869565217391304,
"grad_norm": 0.8225195996763184,
"learning_rate": 7.861933391144272e-08,
"loss": 0.1784,
"step": 217
},
{
"epoch": 1.8956521739130436,
"grad_norm": 0.7768223579129686,
"learning_rate": 6.701531069799039e-08,
"loss": 0.154,
"step": 218
},
{
"epoch": 1.9043478260869566,
"grad_norm": 0.7730963889963934,
"learning_rate": 5.633162492207633e-08,
"loss": 0.1797,
"step": 219
},
{
"epoch": 1.9130434782608696,
"grad_norm": 0.7531083589990637,
"learning_rate": 4.657026981834623e-08,
"loss": 0.1597,
"step": 220
},
{
"epoch": 1.9217391304347826,
"grad_norm": 0.7892220923489918,
"learning_rate": 3.773306654399234e-08,
"loss": 0.1736,
"step": 221
},
{
"epoch": 1.9304347826086956,
"grad_norm": 0.8043257204504718,
"learning_rate": 2.9821663838981994e-08,
"loss": 0.1809,
"step": 222
},
{
"epoch": 1.9391304347826086,
"grad_norm": 0.7485100421389712,
"learning_rate": 2.283753771845587e-08,
"loss": 0.172,
"step": 223
},
{
"epoch": 1.9478260869565216,
"grad_norm": 0.7134500086654286,
"learning_rate": 1.6781991197352133e-08,
"loss": 0.1451,
"step": 224
},
{
"epoch": 1.9565217391304348,
"grad_norm": 0.7688902110914354,
"learning_rate": 1.1656154047303691e-08,
"loss": 0.1738,
"step": 225
},
{
"epoch": 1.9652173913043478,
"grad_norm": 0.7989562203109796,
"learning_rate": 7.460982585860144e-09,
"loss": 0.1748,
"step": 226
},
{
"epoch": 1.973913043478261,
"grad_norm": 0.7873169478978119,
"learning_rate": 4.197259498067707e-09,
"loss": 0.1809,
"step": 227
},
{
"epoch": 1.982608695652174,
"grad_norm": 0.7489493824448523,
"learning_rate": 1.865593690446588e-09,
"loss": 0.1624,
"step": 228
},
{
"epoch": 1.991304347826087,
"grad_norm": 0.779974678935488,
"learning_rate": 4.664201773896259e-10,
"loss": 0.1781,
"step": 229
},
{
"epoch": 2.0,
"grad_norm": 0.7635574474266686,
"learning_rate": 0.0,
"loss": 0.1517,
"step": 230
},
{
"epoch": 2.0,
"step": 230,
"total_flos": 15836349726720.0,
"train_loss": 0.2343279652621435,
"train_runtime": 744.2303,
"train_samples_per_second": 9.833,
"train_steps_per_second": 0.309
}
],
"logging_steps": 1,
"max_steps": 230,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 15836349726720.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}