gemma-1B-SFT / trainer_state.json
ricemonster's picture
Upload folder using huggingface_hub
a6f1621 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 17121,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.008761170492377781,
"grad_norm": 1.859375,
"learning_rate": 4.985690088195783e-05,
"loss": 0.5628,
"step": 50
},
{
"epoch": 0.017522340984755563,
"grad_norm": 1.453125,
"learning_rate": 4.971088137375153e-05,
"loss": 0.3821,
"step": 100
},
{
"epoch": 0.026283511477133346,
"grad_norm": 1.2734375,
"learning_rate": 4.9564861865545236e-05,
"loss": 0.3687,
"step": 150
},
{
"epoch": 0.035044681969511125,
"grad_norm": 1.234375,
"learning_rate": 4.941884235733894e-05,
"loss": 0.3548,
"step": 200
},
{
"epoch": 0.04380585246188891,
"grad_norm": 1.0625,
"learning_rate": 4.927282284913264e-05,
"loss": 0.3448,
"step": 250
},
{
"epoch": 0.05256702295426669,
"grad_norm": 1.140625,
"learning_rate": 4.912680334092635e-05,
"loss": 0.3381,
"step": 300
},
{
"epoch": 0.061328193446644474,
"grad_norm": 1.1328125,
"learning_rate": 4.898078383272006e-05,
"loss": 0.3325,
"step": 350
},
{
"epoch": 0.07008936393902225,
"grad_norm": 1.0546875,
"learning_rate": 4.883476432451376e-05,
"loss": 0.324,
"step": 400
},
{
"epoch": 0.07885053443140004,
"grad_norm": 0.9453125,
"learning_rate": 4.8688744816307465e-05,
"loss": 0.3223,
"step": 450
},
{
"epoch": 0.08761170492377782,
"grad_norm": 0.984375,
"learning_rate": 4.854272530810117e-05,
"loss": 0.3197,
"step": 500
},
{
"epoch": 0.09637287541615559,
"grad_norm": 0.94140625,
"learning_rate": 4.839670579989487e-05,
"loss": 0.3086,
"step": 550
},
{
"epoch": 0.10513404590853338,
"grad_norm": 0.9453125,
"learning_rate": 4.825068629168857e-05,
"loss": 0.3122,
"step": 600
},
{
"epoch": 0.11389521640091116,
"grad_norm": 0.90234375,
"learning_rate": 4.8104666783482275e-05,
"loss": 0.3035,
"step": 650
},
{
"epoch": 0.12265638689328895,
"grad_norm": 0.91796875,
"learning_rate": 4.795864727527598e-05,
"loss": 0.3073,
"step": 700
},
{
"epoch": 0.13141755738566674,
"grad_norm": 0.97265625,
"learning_rate": 4.781262776706969e-05,
"loss": 0.3051,
"step": 750
},
{
"epoch": 0.1401787278780445,
"grad_norm": 1.0,
"learning_rate": 4.766660825886339e-05,
"loss": 0.2992,
"step": 800
},
{
"epoch": 0.1489398983704223,
"grad_norm": 0.87890625,
"learning_rate": 4.752058875065709e-05,
"loss": 0.3007,
"step": 850
},
{
"epoch": 0.15770106886280008,
"grad_norm": 0.8828125,
"learning_rate": 4.7374569242450795e-05,
"loss": 0.2952,
"step": 900
},
{
"epoch": 0.16646223935517784,
"grad_norm": 0.875,
"learning_rate": 4.72285497342445e-05,
"loss": 0.2981,
"step": 950
},
{
"epoch": 0.17522340984755563,
"grad_norm": 0.828125,
"learning_rate": 4.70825302260382e-05,
"loss": 0.2911,
"step": 1000
},
{
"epoch": 0.18398458033993342,
"grad_norm": 0.83203125,
"learning_rate": 4.69365107178319e-05,
"loss": 0.2848,
"step": 1050
},
{
"epoch": 0.19274575083231119,
"grad_norm": 0.80078125,
"learning_rate": 4.679049120962561e-05,
"loss": 0.2932,
"step": 1100
},
{
"epoch": 0.20150692132468898,
"grad_norm": 0.828125,
"learning_rate": 4.6644471701419314e-05,
"loss": 0.2908,
"step": 1150
},
{
"epoch": 0.21026809181706677,
"grad_norm": 0.9140625,
"learning_rate": 4.649845219321302e-05,
"loss": 0.2836,
"step": 1200
},
{
"epoch": 0.21902926230944456,
"grad_norm": 0.80859375,
"learning_rate": 4.635243268500672e-05,
"loss": 0.2907,
"step": 1250
},
{
"epoch": 0.22779043280182232,
"grad_norm": 0.87109375,
"learning_rate": 4.620641317680042e-05,
"loss": 0.2859,
"step": 1300
},
{
"epoch": 0.2365516032942001,
"grad_norm": 0.85546875,
"learning_rate": 4.6060393668594125e-05,
"loss": 0.2826,
"step": 1350
},
{
"epoch": 0.2453127737865779,
"grad_norm": 0.859375,
"learning_rate": 4.591437416038783e-05,
"loss": 0.2862,
"step": 1400
},
{
"epoch": 0.2540739442789557,
"grad_norm": 0.8203125,
"learning_rate": 4.576835465218153e-05,
"loss": 0.2865,
"step": 1450
},
{
"epoch": 0.2628351147713335,
"grad_norm": 0.8046875,
"learning_rate": 4.562233514397524e-05,
"loss": 0.2792,
"step": 1500
},
{
"epoch": 0.2715962852637112,
"grad_norm": 1.0703125,
"learning_rate": 4.547631563576894e-05,
"loss": 0.281,
"step": 1550
},
{
"epoch": 0.280357455756089,
"grad_norm": 0.91015625,
"learning_rate": 4.5330296127562645e-05,
"loss": 0.28,
"step": 1600
},
{
"epoch": 0.2891186262484668,
"grad_norm": 0.89453125,
"learning_rate": 4.518427661935635e-05,
"loss": 0.2789,
"step": 1650
},
{
"epoch": 0.2978797967408446,
"grad_norm": 0.8984375,
"learning_rate": 4.503825711115005e-05,
"loss": 0.2792,
"step": 1700
},
{
"epoch": 0.3066409672332224,
"grad_norm": 0.765625,
"learning_rate": 4.489223760294375e-05,
"loss": 0.2809,
"step": 1750
},
{
"epoch": 0.31540213772560016,
"grad_norm": 0.859375,
"learning_rate": 4.4746218094737455e-05,
"loss": 0.2729,
"step": 1800
},
{
"epoch": 0.3241633082179779,
"grad_norm": 0.82421875,
"learning_rate": 4.4600198586531164e-05,
"loss": 0.2763,
"step": 1850
},
{
"epoch": 0.3329244787103557,
"grad_norm": 0.83984375,
"learning_rate": 4.445417907832487e-05,
"loss": 0.2789,
"step": 1900
},
{
"epoch": 0.3416856492027335,
"grad_norm": 0.8671875,
"learning_rate": 4.430815957011857e-05,
"loss": 0.2775,
"step": 1950
},
{
"epoch": 0.35044681969511127,
"grad_norm": 0.87890625,
"learning_rate": 4.416214006191227e-05,
"loss": 0.2727,
"step": 2000
},
{
"epoch": 0.35920799018748906,
"grad_norm": 0.87109375,
"learning_rate": 4.4016120553705975e-05,
"loss": 0.2748,
"step": 2050
},
{
"epoch": 0.36796916067986685,
"grad_norm": 0.81640625,
"learning_rate": 4.387010104549968e-05,
"loss": 0.2747,
"step": 2100
},
{
"epoch": 0.37673033117224464,
"grad_norm": 0.875,
"learning_rate": 4.372408153729338e-05,
"loss": 0.2711,
"step": 2150
},
{
"epoch": 0.38549150166462237,
"grad_norm": 0.72265625,
"learning_rate": 4.357806202908708e-05,
"loss": 0.2694,
"step": 2200
},
{
"epoch": 0.39425267215700016,
"grad_norm": 0.80078125,
"learning_rate": 4.343204252088079e-05,
"loss": 0.27,
"step": 2250
},
{
"epoch": 0.40301384264937795,
"grad_norm": 0.76171875,
"learning_rate": 4.32860230126745e-05,
"loss": 0.2662,
"step": 2300
},
{
"epoch": 0.41177501314175574,
"grad_norm": 0.81640625,
"learning_rate": 4.3140003504468204e-05,
"loss": 0.2764,
"step": 2350
},
{
"epoch": 0.42053618363413353,
"grad_norm": 0.84765625,
"learning_rate": 4.2993983996261906e-05,
"loss": 0.2709,
"step": 2400
},
{
"epoch": 0.4292973541265113,
"grad_norm": 0.921875,
"learning_rate": 4.284796448805561e-05,
"loss": 0.2694,
"step": 2450
},
{
"epoch": 0.4380585246188891,
"grad_norm": 0.77734375,
"learning_rate": 4.270194497984931e-05,
"loss": 0.2673,
"step": 2500
},
{
"epoch": 0.44681969511126685,
"grad_norm": 0.78515625,
"learning_rate": 4.2555925471643014e-05,
"loss": 0.2669,
"step": 2550
},
{
"epoch": 0.45558086560364464,
"grad_norm": 0.6796875,
"learning_rate": 4.240990596343672e-05,
"loss": 0.2686,
"step": 2600
},
{
"epoch": 0.4643420360960224,
"grad_norm": 0.765625,
"learning_rate": 4.226388645523042e-05,
"loss": 0.2663,
"step": 2650
},
{
"epoch": 0.4731032065884002,
"grad_norm": 0.7734375,
"learning_rate": 4.211786694702413e-05,
"loss": 0.2638,
"step": 2700
},
{
"epoch": 0.481864377080778,
"grad_norm": 0.765625,
"learning_rate": 4.197184743881783e-05,
"loss": 0.2644,
"step": 2750
},
{
"epoch": 0.4906255475731558,
"grad_norm": 0.921875,
"learning_rate": 4.1825827930611534e-05,
"loss": 0.2643,
"step": 2800
},
{
"epoch": 0.49938671806553353,
"grad_norm": 0.7421875,
"learning_rate": 4.1679808422405236e-05,
"loss": 0.2675,
"step": 2850
},
{
"epoch": 0.5081478885579114,
"grad_norm": 0.9140625,
"learning_rate": 4.153378891419894e-05,
"loss": 0.269,
"step": 2900
},
{
"epoch": 0.5169090590502892,
"grad_norm": 0.84765625,
"learning_rate": 4.138776940599264e-05,
"loss": 0.2607,
"step": 2950
},
{
"epoch": 0.525670229542667,
"grad_norm": 0.83203125,
"learning_rate": 4.1241749897786344e-05,
"loss": 0.2598,
"step": 3000
},
{
"epoch": 0.5344314000350446,
"grad_norm": 0.75390625,
"learning_rate": 4.1095730389580054e-05,
"loss": 0.2649,
"step": 3050
},
{
"epoch": 0.5431925705274224,
"grad_norm": 0.71484375,
"learning_rate": 4.0949710881373756e-05,
"loss": 0.2636,
"step": 3100
},
{
"epoch": 0.5519537410198002,
"grad_norm": 0.73046875,
"learning_rate": 4.080369137316746e-05,
"loss": 0.2603,
"step": 3150
},
{
"epoch": 0.560714911512178,
"grad_norm": 0.8203125,
"learning_rate": 4.065767186496116e-05,
"loss": 0.2566,
"step": 3200
},
{
"epoch": 0.5694760820045558,
"grad_norm": 0.79296875,
"learning_rate": 4.0511652356754864e-05,
"loss": 0.2577,
"step": 3250
},
{
"epoch": 0.5782372524969336,
"grad_norm": 0.828125,
"learning_rate": 4.0365632848548566e-05,
"loss": 0.2596,
"step": 3300
},
{
"epoch": 0.5869984229893114,
"grad_norm": 0.8125,
"learning_rate": 4.021961334034227e-05,
"loss": 0.2622,
"step": 3350
},
{
"epoch": 0.5957595934816892,
"grad_norm": 0.7734375,
"learning_rate": 4.007359383213597e-05,
"loss": 0.2584,
"step": 3400
},
{
"epoch": 0.604520763974067,
"grad_norm": 0.79296875,
"learning_rate": 3.992757432392968e-05,
"loss": 0.2572,
"step": 3450
},
{
"epoch": 0.6132819344664447,
"grad_norm": 0.84765625,
"learning_rate": 3.9781554815723384e-05,
"loss": 0.26,
"step": 3500
},
{
"epoch": 0.6220431049588225,
"grad_norm": 0.80859375,
"learning_rate": 3.9635535307517086e-05,
"loss": 0.26,
"step": 3550
},
{
"epoch": 0.6308042754512003,
"grad_norm": 0.82421875,
"learning_rate": 3.948951579931079e-05,
"loss": 0.2586,
"step": 3600
},
{
"epoch": 0.6395654459435781,
"grad_norm": 0.8984375,
"learning_rate": 3.934349629110449e-05,
"loss": 0.2652,
"step": 3650
},
{
"epoch": 0.6483266164359558,
"grad_norm": 0.77734375,
"learning_rate": 3.9197476782898194e-05,
"loss": 0.2576,
"step": 3700
},
{
"epoch": 0.6570877869283336,
"grad_norm": 0.7890625,
"learning_rate": 3.9051457274691897e-05,
"loss": 0.2595,
"step": 3750
},
{
"epoch": 0.6658489574207114,
"grad_norm": 0.7890625,
"learning_rate": 3.8905437766485606e-05,
"loss": 0.256,
"step": 3800
},
{
"epoch": 0.6746101279130892,
"grad_norm": 0.84375,
"learning_rate": 3.875941825827931e-05,
"loss": 0.2571,
"step": 3850
},
{
"epoch": 0.683371298405467,
"grad_norm": 0.8671875,
"learning_rate": 3.861339875007301e-05,
"loss": 0.2546,
"step": 3900
},
{
"epoch": 0.6921324688978447,
"grad_norm": 0.875,
"learning_rate": 3.8467379241866714e-05,
"loss": 0.2578,
"step": 3950
},
{
"epoch": 0.7008936393902225,
"grad_norm": 0.81640625,
"learning_rate": 3.8321359733660416e-05,
"loss": 0.2551,
"step": 4000
},
{
"epoch": 0.7096548098826003,
"grad_norm": 0.80859375,
"learning_rate": 3.817534022545412e-05,
"loss": 0.2586,
"step": 4050
},
{
"epoch": 0.7184159803749781,
"grad_norm": 0.8046875,
"learning_rate": 3.802932071724782e-05,
"loss": 0.2568,
"step": 4100
},
{
"epoch": 0.7271771508673559,
"grad_norm": 0.8515625,
"learning_rate": 3.7883301209041524e-05,
"loss": 0.2632,
"step": 4150
},
{
"epoch": 0.7359383213597337,
"grad_norm": 0.75390625,
"learning_rate": 3.7737281700835233e-05,
"loss": 0.2518,
"step": 4200
},
{
"epoch": 0.7446994918521115,
"grad_norm": 0.8203125,
"learning_rate": 3.7591262192628936e-05,
"loss": 0.2539,
"step": 4250
},
{
"epoch": 0.7534606623444893,
"grad_norm": 0.765625,
"learning_rate": 3.7445242684422645e-05,
"loss": 0.2507,
"step": 4300
},
{
"epoch": 0.762221832836867,
"grad_norm": 0.83984375,
"learning_rate": 3.729922317621635e-05,
"loss": 0.2517,
"step": 4350
},
{
"epoch": 0.7709830033292447,
"grad_norm": 0.74609375,
"learning_rate": 3.715320366801005e-05,
"loss": 0.2523,
"step": 4400
},
{
"epoch": 0.7797441738216225,
"grad_norm": 0.70703125,
"learning_rate": 3.700718415980375e-05,
"loss": 0.2492,
"step": 4450
},
{
"epoch": 0.7885053443140003,
"grad_norm": 0.8359375,
"learning_rate": 3.6861164651597456e-05,
"loss": 0.2571,
"step": 4500
},
{
"epoch": 0.7972665148063781,
"grad_norm": 0.7890625,
"learning_rate": 3.671514514339116e-05,
"loss": 0.2534,
"step": 4550
},
{
"epoch": 0.8060276852987559,
"grad_norm": 0.79296875,
"learning_rate": 3.656912563518487e-05,
"loss": 0.2573,
"step": 4600
},
{
"epoch": 0.8147888557911337,
"grad_norm": 0.80078125,
"learning_rate": 3.642310612697857e-05,
"loss": 0.2552,
"step": 4650
},
{
"epoch": 0.8235500262835115,
"grad_norm": 0.796875,
"learning_rate": 3.627708661877227e-05,
"loss": 0.252,
"step": 4700
},
{
"epoch": 0.8323111967758893,
"grad_norm": 0.75,
"learning_rate": 3.6131067110565975e-05,
"loss": 0.2545,
"step": 4750
},
{
"epoch": 0.8410723672682671,
"grad_norm": 0.77734375,
"learning_rate": 3.598504760235968e-05,
"loss": 0.2536,
"step": 4800
},
{
"epoch": 0.8498335377606449,
"grad_norm": 0.83203125,
"learning_rate": 3.583902809415338e-05,
"loss": 0.2486,
"step": 4850
},
{
"epoch": 0.8585947082530226,
"grad_norm": 0.79296875,
"learning_rate": 3.569300858594708e-05,
"loss": 0.2491,
"step": 4900
},
{
"epoch": 0.8673558787454004,
"grad_norm": 0.796875,
"learning_rate": 3.5546989077740786e-05,
"loss": 0.2493,
"step": 4950
},
{
"epoch": 0.8761170492377782,
"grad_norm": 0.8359375,
"learning_rate": 3.5400969569534495e-05,
"loss": 0.2496,
"step": 5000
},
{
"epoch": 0.8848782197301559,
"grad_norm": 0.73828125,
"learning_rate": 3.52549500613282e-05,
"loss": 0.2488,
"step": 5050
},
{
"epoch": 0.8936393902225337,
"grad_norm": 0.8046875,
"learning_rate": 3.51089305531219e-05,
"loss": 0.2497,
"step": 5100
},
{
"epoch": 0.9024005607149115,
"grad_norm": 0.80078125,
"learning_rate": 3.49629110449156e-05,
"loss": 0.253,
"step": 5150
},
{
"epoch": 0.9111617312072893,
"grad_norm": 0.7421875,
"learning_rate": 3.4816891536709306e-05,
"loss": 0.2515,
"step": 5200
},
{
"epoch": 0.9199229016996671,
"grad_norm": 0.7890625,
"learning_rate": 3.467087202850301e-05,
"loss": 0.2479,
"step": 5250
},
{
"epoch": 0.9286840721920449,
"grad_norm": 0.796875,
"learning_rate": 3.452485252029671e-05,
"loss": 0.2486,
"step": 5300
},
{
"epoch": 0.9374452426844226,
"grad_norm": 0.82421875,
"learning_rate": 3.437883301209042e-05,
"loss": 0.2518,
"step": 5350
},
{
"epoch": 0.9462064131768004,
"grad_norm": 0.85546875,
"learning_rate": 3.423281350388412e-05,
"loss": 0.2499,
"step": 5400
},
{
"epoch": 0.9549675836691782,
"grad_norm": 0.76171875,
"learning_rate": 3.4086793995677825e-05,
"loss": 0.2508,
"step": 5450
},
{
"epoch": 0.963728754161556,
"grad_norm": 0.83203125,
"learning_rate": 3.394077448747153e-05,
"loss": 0.2482,
"step": 5500
},
{
"epoch": 0.9724899246539338,
"grad_norm": 0.78515625,
"learning_rate": 3.379475497926523e-05,
"loss": 0.2491,
"step": 5550
},
{
"epoch": 0.9812510951463116,
"grad_norm": 0.84765625,
"learning_rate": 3.364873547105893e-05,
"loss": 0.2471,
"step": 5600
},
{
"epoch": 0.9900122656386894,
"grad_norm": 0.703125,
"learning_rate": 3.3502715962852636e-05,
"loss": 0.2539,
"step": 5650
},
{
"epoch": 0.9987734361310671,
"grad_norm": 0.77734375,
"learning_rate": 3.335669645464634e-05,
"loss": 0.2488,
"step": 5700
},
{
"epoch": 1.007534606623445,
"grad_norm": 0.80078125,
"learning_rate": 3.321067694644005e-05,
"loss": 0.2335,
"step": 5750
},
{
"epoch": 1.0162957771158228,
"grad_norm": 0.8046875,
"learning_rate": 3.306465743823375e-05,
"loss": 0.2302,
"step": 5800
},
{
"epoch": 1.0250569476082005,
"grad_norm": 0.84765625,
"learning_rate": 3.291863793002745e-05,
"loss": 0.2287,
"step": 5850
},
{
"epoch": 1.0338181181005783,
"grad_norm": 0.84765625,
"learning_rate": 3.2772618421821155e-05,
"loss": 0.2278,
"step": 5900
},
{
"epoch": 1.0425792885929561,
"grad_norm": 0.87109375,
"learning_rate": 3.262659891361486e-05,
"loss": 0.2322,
"step": 5950
},
{
"epoch": 1.051340459085334,
"grad_norm": 0.87109375,
"learning_rate": 3.248057940540856e-05,
"loss": 0.2305,
"step": 6000
},
{
"epoch": 1.0601016295777115,
"grad_norm": 0.95703125,
"learning_rate": 3.233455989720226e-05,
"loss": 0.2316,
"step": 6050
},
{
"epoch": 1.0688628000700893,
"grad_norm": 0.75,
"learning_rate": 3.218854038899597e-05,
"loss": 0.2366,
"step": 6100
},
{
"epoch": 1.077623970562467,
"grad_norm": 1.015625,
"learning_rate": 3.2042520880789675e-05,
"loss": 0.2331,
"step": 6150
},
{
"epoch": 1.0863851410548448,
"grad_norm": 0.80078125,
"learning_rate": 3.189650137258338e-05,
"loss": 0.2283,
"step": 6200
},
{
"epoch": 1.0951463115472226,
"grad_norm": 0.8828125,
"learning_rate": 3.175048186437708e-05,
"loss": 0.2301,
"step": 6250
},
{
"epoch": 1.1039074820396004,
"grad_norm": 0.87109375,
"learning_rate": 3.160446235617079e-05,
"loss": 0.2311,
"step": 6300
},
{
"epoch": 1.1126686525319782,
"grad_norm": 0.83203125,
"learning_rate": 3.145844284796449e-05,
"loss": 0.2278,
"step": 6350
},
{
"epoch": 1.121429823024356,
"grad_norm": 0.8359375,
"learning_rate": 3.1312423339758195e-05,
"loss": 0.231,
"step": 6400
},
{
"epoch": 1.1301909935167338,
"grad_norm": 0.97265625,
"learning_rate": 3.11664038315519e-05,
"loss": 0.2302,
"step": 6450
},
{
"epoch": 1.1389521640091116,
"grad_norm": 0.94921875,
"learning_rate": 3.10203843233456e-05,
"loss": 0.2311,
"step": 6500
},
{
"epoch": 1.1477133345014894,
"grad_norm": 0.85546875,
"learning_rate": 3.087436481513931e-05,
"loss": 0.234,
"step": 6550
},
{
"epoch": 1.1564745049938672,
"grad_norm": 0.8984375,
"learning_rate": 3.072834530693301e-05,
"loss": 0.231,
"step": 6600
},
{
"epoch": 1.165235675486245,
"grad_norm": 0.86328125,
"learning_rate": 3.0582325798726714e-05,
"loss": 0.2294,
"step": 6650
},
{
"epoch": 1.1739968459786227,
"grad_norm": 0.796875,
"learning_rate": 3.0436306290520417e-05,
"loss": 0.2316,
"step": 6700
},
{
"epoch": 1.1827580164710005,
"grad_norm": 0.8984375,
"learning_rate": 3.029028678231412e-05,
"loss": 0.2335,
"step": 6750
},
{
"epoch": 1.1915191869633783,
"grad_norm": 1.0234375,
"learning_rate": 3.0144267274107822e-05,
"loss": 0.2282,
"step": 6800
},
{
"epoch": 1.2002803574557561,
"grad_norm": 0.85546875,
"learning_rate": 2.9998247765901528e-05,
"loss": 0.2327,
"step": 6850
},
{
"epoch": 1.209041527948134,
"grad_norm": 0.8671875,
"learning_rate": 2.985222825769523e-05,
"loss": 0.2295,
"step": 6900
},
{
"epoch": 1.2178026984405117,
"grad_norm": 0.85546875,
"learning_rate": 2.9706208749488933e-05,
"loss": 0.2306,
"step": 6950
},
{
"epoch": 1.2265638689328895,
"grad_norm": 0.875,
"learning_rate": 2.9560189241282636e-05,
"loss": 0.234,
"step": 7000
},
{
"epoch": 1.2353250394252673,
"grad_norm": 0.85546875,
"learning_rate": 2.9414169733076342e-05,
"loss": 0.2302,
"step": 7050
},
{
"epoch": 1.244086209917645,
"grad_norm": 0.9375,
"learning_rate": 2.9268150224870045e-05,
"loss": 0.2273,
"step": 7100
},
{
"epoch": 1.2528473804100229,
"grad_norm": 0.83203125,
"learning_rate": 2.9122130716663747e-05,
"loss": 0.2255,
"step": 7150
},
{
"epoch": 1.2616085509024004,
"grad_norm": 0.8828125,
"learning_rate": 2.8976111208457453e-05,
"loss": 0.2305,
"step": 7200
},
{
"epoch": 1.2703697213947782,
"grad_norm": 0.8125,
"learning_rate": 2.8830091700251156e-05,
"loss": 0.2293,
"step": 7250
},
{
"epoch": 1.279130891887156,
"grad_norm": 0.91796875,
"learning_rate": 2.868407219204486e-05,
"loss": 0.2258,
"step": 7300
},
{
"epoch": 1.2878920623795338,
"grad_norm": 0.859375,
"learning_rate": 2.853805268383856e-05,
"loss": 0.227,
"step": 7350
},
{
"epoch": 1.2966532328719116,
"grad_norm": 0.91015625,
"learning_rate": 2.8392033175632267e-05,
"loss": 0.2292,
"step": 7400
},
{
"epoch": 1.3054144033642894,
"grad_norm": 0.8359375,
"learning_rate": 2.824601366742597e-05,
"loss": 0.228,
"step": 7450
},
{
"epoch": 1.3141755738566672,
"grad_norm": 0.859375,
"learning_rate": 2.8099994159219672e-05,
"loss": 0.2299,
"step": 7500
},
{
"epoch": 1.322936744349045,
"grad_norm": 0.9453125,
"learning_rate": 2.7953974651013375e-05,
"loss": 0.225,
"step": 7550
},
{
"epoch": 1.3316979148414227,
"grad_norm": 0.82421875,
"learning_rate": 2.780795514280708e-05,
"loss": 0.2274,
"step": 7600
},
{
"epoch": 1.3404590853338005,
"grad_norm": 0.921875,
"learning_rate": 2.7661935634600783e-05,
"loss": 0.2295,
"step": 7650
},
{
"epoch": 1.3492202558261783,
"grad_norm": 0.8359375,
"learning_rate": 2.7515916126394486e-05,
"loss": 0.2259,
"step": 7700
},
{
"epoch": 1.3579814263185561,
"grad_norm": 0.90234375,
"learning_rate": 2.736989661818819e-05,
"loss": 0.2321,
"step": 7750
},
{
"epoch": 1.366742596810934,
"grad_norm": 0.94140625,
"learning_rate": 2.7223877109981894e-05,
"loss": 0.2279,
"step": 7800
},
{
"epoch": 1.3755037673033117,
"grad_norm": 0.8984375,
"learning_rate": 2.7077857601775597e-05,
"loss": 0.2291,
"step": 7850
},
{
"epoch": 1.3842649377956895,
"grad_norm": 0.96484375,
"learning_rate": 2.69318380935693e-05,
"loss": 0.2273,
"step": 7900
},
{
"epoch": 1.3930261082880673,
"grad_norm": 0.9375,
"learning_rate": 2.6785818585363006e-05,
"loss": 0.2296,
"step": 7950
},
{
"epoch": 1.401787278780445,
"grad_norm": 0.91796875,
"learning_rate": 2.6639799077156708e-05,
"loss": 0.2262,
"step": 8000
},
{
"epoch": 1.4105484492728229,
"grad_norm": 0.875,
"learning_rate": 2.649377956895041e-05,
"loss": 0.2258,
"step": 8050
},
{
"epoch": 1.4193096197652006,
"grad_norm": 1.03125,
"learning_rate": 2.6347760060744113e-05,
"loss": 0.2237,
"step": 8100
},
{
"epoch": 1.4280707902575784,
"grad_norm": 0.92578125,
"learning_rate": 2.620174055253782e-05,
"loss": 0.2306,
"step": 8150
},
{
"epoch": 1.4368319607499562,
"grad_norm": 0.95703125,
"learning_rate": 2.6055721044331522e-05,
"loss": 0.2282,
"step": 8200
},
{
"epoch": 1.445593131242334,
"grad_norm": 0.82421875,
"learning_rate": 2.5909701536125224e-05,
"loss": 0.2245,
"step": 8250
},
{
"epoch": 1.4543543017347118,
"grad_norm": 0.890625,
"learning_rate": 2.5763682027918934e-05,
"loss": 0.2267,
"step": 8300
},
{
"epoch": 1.4631154722270896,
"grad_norm": 0.88671875,
"learning_rate": 2.5617662519712636e-05,
"loss": 0.2248,
"step": 8350
},
{
"epoch": 1.4718766427194674,
"grad_norm": 1.109375,
"learning_rate": 2.5471643011506342e-05,
"loss": 0.2299,
"step": 8400
},
{
"epoch": 1.4806378132118452,
"grad_norm": 0.90234375,
"learning_rate": 2.5325623503300045e-05,
"loss": 0.2222,
"step": 8450
},
{
"epoch": 1.489398983704223,
"grad_norm": 0.94921875,
"learning_rate": 2.5179603995093748e-05,
"loss": 0.228,
"step": 8500
},
{
"epoch": 1.4981601541966008,
"grad_norm": 0.83984375,
"learning_rate": 2.503358448688745e-05,
"loss": 0.2302,
"step": 8550
},
{
"epoch": 1.5069213246889785,
"grad_norm": 0.94140625,
"learning_rate": 2.4887564978681153e-05,
"loss": 0.2281,
"step": 8600
},
{
"epoch": 1.5156824951813563,
"grad_norm": 0.88671875,
"learning_rate": 2.4741545470474855e-05,
"loss": 0.225,
"step": 8650
},
{
"epoch": 1.5244436656737341,
"grad_norm": 0.921875,
"learning_rate": 2.4595525962268558e-05,
"loss": 0.2277,
"step": 8700
},
{
"epoch": 1.533204836166112,
"grad_norm": 0.796875,
"learning_rate": 2.4449506454062264e-05,
"loss": 0.2216,
"step": 8750
},
{
"epoch": 1.5419660066584897,
"grad_norm": 0.86328125,
"learning_rate": 2.430348694585597e-05,
"loss": 0.2238,
"step": 8800
},
{
"epoch": 1.5507271771508675,
"grad_norm": 0.91015625,
"learning_rate": 2.4157467437649672e-05,
"loss": 0.229,
"step": 8850
},
{
"epoch": 1.5594883476432453,
"grad_norm": 0.99609375,
"learning_rate": 2.4011447929443375e-05,
"loss": 0.2273,
"step": 8900
},
{
"epoch": 1.568249518135623,
"grad_norm": 0.91796875,
"learning_rate": 2.386542842123708e-05,
"loss": 0.2261,
"step": 8950
},
{
"epoch": 1.5770106886280009,
"grad_norm": 0.921875,
"learning_rate": 2.3719408913030784e-05,
"loss": 0.2244,
"step": 9000
},
{
"epoch": 1.5857718591203784,
"grad_norm": 0.9140625,
"learning_rate": 2.3573389404824486e-05,
"loss": 0.2243,
"step": 9050
},
{
"epoch": 1.5945330296127562,
"grad_norm": 0.93359375,
"learning_rate": 2.342736989661819e-05,
"loss": 0.2277,
"step": 9100
},
{
"epoch": 1.603294200105134,
"grad_norm": 0.83984375,
"learning_rate": 2.3281350388411895e-05,
"loss": 0.2238,
"step": 9150
},
{
"epoch": 1.6120553705975118,
"grad_norm": 0.80078125,
"learning_rate": 2.3135330880205597e-05,
"loss": 0.2251,
"step": 9200
},
{
"epoch": 1.6208165410898896,
"grad_norm": 1.015625,
"learning_rate": 2.29893113719993e-05,
"loss": 0.2257,
"step": 9250
},
{
"epoch": 1.6295777115822674,
"grad_norm": 0.890625,
"learning_rate": 2.2843291863793003e-05,
"loss": 0.2256,
"step": 9300
},
{
"epoch": 1.6383388820746452,
"grad_norm": 1.0078125,
"learning_rate": 2.269727235558671e-05,
"loss": 0.2239,
"step": 9350
},
{
"epoch": 1.647100052567023,
"grad_norm": 0.9140625,
"learning_rate": 2.255125284738041e-05,
"loss": 0.228,
"step": 9400
},
{
"epoch": 1.6558612230594008,
"grad_norm": 1.0546875,
"learning_rate": 2.2405233339174114e-05,
"loss": 0.2274,
"step": 9450
},
{
"epoch": 1.6646223935517785,
"grad_norm": 0.91796875,
"learning_rate": 2.2259213830967816e-05,
"loss": 0.2214,
"step": 9500
},
{
"epoch": 1.6733835640441563,
"grad_norm": 0.9140625,
"learning_rate": 2.2113194322761522e-05,
"loss": 0.2236,
"step": 9550
},
{
"epoch": 1.6821447345365341,
"grad_norm": 0.90625,
"learning_rate": 2.1967174814555225e-05,
"loss": 0.2222,
"step": 9600
},
{
"epoch": 1.690905905028912,
"grad_norm": 0.85546875,
"learning_rate": 2.1821155306348927e-05,
"loss": 0.2227,
"step": 9650
},
{
"epoch": 1.6996670755212895,
"grad_norm": 0.98828125,
"learning_rate": 2.1675135798142633e-05,
"loss": 0.2277,
"step": 9700
},
{
"epoch": 1.7084282460136673,
"grad_norm": 0.9296875,
"learning_rate": 2.1529116289936336e-05,
"loss": 0.2257,
"step": 9750
},
{
"epoch": 1.717189416506045,
"grad_norm": 0.97265625,
"learning_rate": 2.1383096781730042e-05,
"loss": 0.221,
"step": 9800
},
{
"epoch": 1.7259505869984229,
"grad_norm": 0.83203125,
"learning_rate": 2.1237077273523745e-05,
"loss": 0.2231,
"step": 9850
},
{
"epoch": 1.7347117574908006,
"grad_norm": 0.9140625,
"learning_rate": 2.1091057765317447e-05,
"loss": 0.22,
"step": 9900
},
{
"epoch": 1.7434729279831784,
"grad_norm": 0.875,
"learning_rate": 2.0945038257111153e-05,
"loss": 0.2241,
"step": 9950
},
{
"epoch": 1.7522340984755562,
"grad_norm": 0.82421875,
"learning_rate": 2.0799018748904856e-05,
"loss": 0.2215,
"step": 10000
},
{
"epoch": 1.760995268967934,
"grad_norm": 0.828125,
"learning_rate": 2.065299924069856e-05,
"loss": 0.2267,
"step": 10050
},
{
"epoch": 1.7697564394603118,
"grad_norm": 0.9453125,
"learning_rate": 2.050697973249226e-05,
"loss": 0.222,
"step": 10100
},
{
"epoch": 1.7785176099526896,
"grad_norm": 0.92578125,
"learning_rate": 2.0360960224285967e-05,
"loss": 0.2239,
"step": 10150
},
{
"epoch": 1.7872787804450674,
"grad_norm": 0.9140625,
"learning_rate": 2.021494071607967e-05,
"loss": 0.22,
"step": 10200
},
{
"epoch": 1.7960399509374452,
"grad_norm": 0.94140625,
"learning_rate": 2.0068921207873372e-05,
"loss": 0.2293,
"step": 10250
},
{
"epoch": 1.804801121429823,
"grad_norm": 0.96484375,
"learning_rate": 1.9922901699667078e-05,
"loss": 0.2271,
"step": 10300
},
{
"epoch": 1.8135622919222008,
"grad_norm": 0.875,
"learning_rate": 1.977688219146078e-05,
"loss": 0.2252,
"step": 10350
},
{
"epoch": 1.8223234624145785,
"grad_norm": 0.91796875,
"learning_rate": 1.9630862683254483e-05,
"loss": 0.2265,
"step": 10400
},
{
"epoch": 1.8310846329069563,
"grad_norm": 0.9453125,
"learning_rate": 1.9484843175048186e-05,
"loss": 0.2239,
"step": 10450
},
{
"epoch": 1.8398458033993341,
"grad_norm": 0.94921875,
"learning_rate": 1.9338823666841892e-05,
"loss": 0.222,
"step": 10500
},
{
"epoch": 1.848606973891712,
"grad_norm": 1.078125,
"learning_rate": 1.9192804158635594e-05,
"loss": 0.2231,
"step": 10550
},
{
"epoch": 1.8573681443840897,
"grad_norm": 0.98046875,
"learning_rate": 1.9046784650429297e-05,
"loss": 0.2266,
"step": 10600
},
{
"epoch": 1.8661293148764675,
"grad_norm": 0.91015625,
"learning_rate": 1.8900765142223e-05,
"loss": 0.2245,
"step": 10650
},
{
"epoch": 1.8748904853688453,
"grad_norm": 0.98828125,
"learning_rate": 1.8754745634016706e-05,
"loss": 0.2243,
"step": 10700
},
{
"epoch": 1.883651655861223,
"grad_norm": 0.80859375,
"learning_rate": 1.8608726125810408e-05,
"loss": 0.2194,
"step": 10750
},
{
"epoch": 1.8924128263536009,
"grad_norm": 0.87109375,
"learning_rate": 1.8462706617604114e-05,
"loss": 0.2253,
"step": 10800
},
{
"epoch": 1.9011739968459787,
"grad_norm": 0.875,
"learning_rate": 1.8316687109397817e-05,
"loss": 0.2231,
"step": 10850
},
{
"epoch": 1.9099351673383564,
"grad_norm": 0.91015625,
"learning_rate": 1.8170667601191523e-05,
"loss": 0.2215,
"step": 10900
},
{
"epoch": 1.9186963378307342,
"grad_norm": 0.99609375,
"learning_rate": 1.8024648092985225e-05,
"loss": 0.2246,
"step": 10950
},
{
"epoch": 1.927457508323112,
"grad_norm": 0.890625,
"learning_rate": 1.7878628584778928e-05,
"loss": 0.2215,
"step": 11000
},
{
"epoch": 1.9362186788154898,
"grad_norm": 0.953125,
"learning_rate": 1.773260907657263e-05,
"loss": 0.2251,
"step": 11050
},
{
"epoch": 1.9449798493078676,
"grad_norm": 0.91015625,
"learning_rate": 1.7586589568366336e-05,
"loss": 0.219,
"step": 11100
},
{
"epoch": 1.9537410198002454,
"grad_norm": 1.0234375,
"learning_rate": 1.744057006016004e-05,
"loss": 0.2194,
"step": 11150
},
{
"epoch": 1.9625021902926232,
"grad_norm": 0.84765625,
"learning_rate": 1.729455055195374e-05,
"loss": 0.219,
"step": 11200
},
{
"epoch": 1.971263360785001,
"grad_norm": 0.94140625,
"learning_rate": 1.7148531043747444e-05,
"loss": 0.2183,
"step": 11250
},
{
"epoch": 1.9800245312773788,
"grad_norm": 0.8671875,
"learning_rate": 1.700251153554115e-05,
"loss": 0.221,
"step": 11300
},
{
"epoch": 1.9887857017697566,
"grad_norm": 0.921875,
"learning_rate": 1.6856492027334853e-05,
"loss": 0.2218,
"step": 11350
},
{
"epoch": 1.9975468722621343,
"grad_norm": 0.93359375,
"learning_rate": 1.6710472519128555e-05,
"loss": 0.221,
"step": 11400
},
{
"epoch": 2.006308042754512,
"grad_norm": 1.03125,
"learning_rate": 1.6564453010922258e-05,
"loss": 0.2109,
"step": 11450
},
{
"epoch": 2.01506921324689,
"grad_norm": 1.0234375,
"learning_rate": 1.6418433502715964e-05,
"loss": 0.2091,
"step": 11500
},
{
"epoch": 2.0238303837392677,
"grad_norm": 1.078125,
"learning_rate": 1.6272413994509666e-05,
"loss": 0.2039,
"step": 11550
},
{
"epoch": 2.0325915542316455,
"grad_norm": 1.0625,
"learning_rate": 1.612639448630337e-05,
"loss": 0.2078,
"step": 11600
},
{
"epoch": 2.0413527247240233,
"grad_norm": 1.046875,
"learning_rate": 1.5980374978097075e-05,
"loss": 0.2071,
"step": 11650
},
{
"epoch": 2.050113895216401,
"grad_norm": 1.0703125,
"learning_rate": 1.5834355469890778e-05,
"loss": 0.2056,
"step": 11700
},
{
"epoch": 2.058875065708779,
"grad_norm": 1.1015625,
"learning_rate": 1.568833596168448e-05,
"loss": 0.2097,
"step": 11750
},
{
"epoch": 2.0676362362011567,
"grad_norm": 1.109375,
"learning_rate": 1.5542316453478186e-05,
"loss": 0.2083,
"step": 11800
},
{
"epoch": 2.0763974066935345,
"grad_norm": 0.984375,
"learning_rate": 1.539629694527189e-05,
"loss": 0.2047,
"step": 11850
},
{
"epoch": 2.0851585771859122,
"grad_norm": 1.0390625,
"learning_rate": 1.5250277437065593e-05,
"loss": 0.2044,
"step": 11900
},
{
"epoch": 2.09391974767829,
"grad_norm": 1.03125,
"learning_rate": 1.5104257928859297e-05,
"loss": 0.207,
"step": 11950
},
{
"epoch": 2.102680918170668,
"grad_norm": 1.1015625,
"learning_rate": 1.4958238420653e-05,
"loss": 0.2058,
"step": 12000
},
{
"epoch": 2.111442088663045,
"grad_norm": 1.09375,
"learning_rate": 1.4812218912446704e-05,
"loss": 0.2035,
"step": 12050
},
{
"epoch": 2.120203259155423,
"grad_norm": 1.125,
"learning_rate": 1.4666199404240409e-05,
"loss": 0.21,
"step": 12100
},
{
"epoch": 2.1289644296478007,
"grad_norm": 1.015625,
"learning_rate": 1.4520179896034111e-05,
"loss": 0.2079,
"step": 12150
},
{
"epoch": 2.1377256001401785,
"grad_norm": 1.015625,
"learning_rate": 1.4374160387827815e-05,
"loss": 0.2057,
"step": 12200
},
{
"epoch": 2.1464867706325563,
"grad_norm": 1.046875,
"learning_rate": 1.4228140879621518e-05,
"loss": 0.2058,
"step": 12250
},
{
"epoch": 2.155247941124934,
"grad_norm": 0.9375,
"learning_rate": 1.4082121371415222e-05,
"loss": 0.2087,
"step": 12300
},
{
"epoch": 2.164009111617312,
"grad_norm": 1.125,
"learning_rate": 1.3936101863208925e-05,
"loss": 0.2049,
"step": 12350
},
{
"epoch": 2.1727702821096897,
"grad_norm": 1.15625,
"learning_rate": 1.3790082355002629e-05,
"loss": 0.2073,
"step": 12400
},
{
"epoch": 2.1815314526020675,
"grad_norm": 1.203125,
"learning_rate": 1.3644062846796332e-05,
"loss": 0.2048,
"step": 12450
},
{
"epoch": 2.1902926230944453,
"grad_norm": 1.1171875,
"learning_rate": 1.3498043338590036e-05,
"loss": 0.2062,
"step": 12500
},
{
"epoch": 2.199053793586823,
"grad_norm": 1.15625,
"learning_rate": 1.3352023830383739e-05,
"loss": 0.2104,
"step": 12550
},
{
"epoch": 2.207814964079201,
"grad_norm": 1.109375,
"learning_rate": 1.3206004322177443e-05,
"loss": 0.2075,
"step": 12600
},
{
"epoch": 2.2165761345715786,
"grad_norm": 0.9609375,
"learning_rate": 1.3059984813971145e-05,
"loss": 0.2071,
"step": 12650
},
{
"epoch": 2.2253373050639564,
"grad_norm": 1.03125,
"learning_rate": 1.291396530576485e-05,
"loss": 0.2063,
"step": 12700
},
{
"epoch": 2.2340984755563342,
"grad_norm": 1.0625,
"learning_rate": 1.2767945797558552e-05,
"loss": 0.2033,
"step": 12750
},
{
"epoch": 2.242859646048712,
"grad_norm": 1.0625,
"learning_rate": 1.262192628935226e-05,
"loss": 0.2057,
"step": 12800
},
{
"epoch": 2.25162081654109,
"grad_norm": 1.125,
"learning_rate": 1.2475906781145961e-05,
"loss": 0.2063,
"step": 12850
},
{
"epoch": 2.2603819870334676,
"grad_norm": 1.046875,
"learning_rate": 1.2329887272939665e-05,
"loss": 0.2011,
"step": 12900
},
{
"epoch": 2.2691431575258454,
"grad_norm": 1.1328125,
"learning_rate": 1.2183867764733368e-05,
"loss": 0.2086,
"step": 12950
},
{
"epoch": 2.277904328018223,
"grad_norm": 1.171875,
"learning_rate": 1.2037848256527072e-05,
"loss": 0.2063,
"step": 13000
},
{
"epoch": 2.286665498510601,
"grad_norm": 1.0625,
"learning_rate": 1.1891828748320776e-05,
"loss": 0.2082,
"step": 13050
},
{
"epoch": 2.2954266690029788,
"grad_norm": 0.984375,
"learning_rate": 1.174580924011448e-05,
"loss": 0.2124,
"step": 13100
},
{
"epoch": 2.3041878394953565,
"grad_norm": 1.0078125,
"learning_rate": 1.1599789731908183e-05,
"loss": 0.2078,
"step": 13150
},
{
"epoch": 2.3129490099877343,
"grad_norm": 1.046875,
"learning_rate": 1.1453770223701888e-05,
"loss": 0.2098,
"step": 13200
},
{
"epoch": 2.321710180480112,
"grad_norm": 1.046875,
"learning_rate": 1.130775071549559e-05,
"loss": 0.2057,
"step": 13250
},
{
"epoch": 2.33047135097249,
"grad_norm": 1.1875,
"learning_rate": 1.1161731207289294e-05,
"loss": 0.2048,
"step": 13300
},
{
"epoch": 2.3392325214648677,
"grad_norm": 1.3203125,
"learning_rate": 1.1015711699082999e-05,
"loss": 0.2049,
"step": 13350
},
{
"epoch": 2.3479936919572455,
"grad_norm": 1.0078125,
"learning_rate": 1.0869692190876701e-05,
"loss": 0.206,
"step": 13400
},
{
"epoch": 2.3567548624496233,
"grad_norm": 1.125,
"learning_rate": 1.0723672682670406e-05,
"loss": 0.2044,
"step": 13450
},
{
"epoch": 2.365516032942001,
"grad_norm": 1.2890625,
"learning_rate": 1.0577653174464108e-05,
"loss": 0.2062,
"step": 13500
},
{
"epoch": 2.374277203434379,
"grad_norm": 1.140625,
"learning_rate": 1.0431633666257812e-05,
"loss": 0.206,
"step": 13550
},
{
"epoch": 2.3830383739267567,
"grad_norm": 1.09375,
"learning_rate": 1.0285614158051517e-05,
"loss": 0.2044,
"step": 13600
},
{
"epoch": 2.3917995444191344,
"grad_norm": 1.0546875,
"learning_rate": 1.0139594649845221e-05,
"loss": 0.207,
"step": 13650
},
{
"epoch": 2.4005607149115122,
"grad_norm": 1.1953125,
"learning_rate": 9.993575141638924e-06,
"loss": 0.2032,
"step": 13700
},
{
"epoch": 2.40932188540389,
"grad_norm": 1.03125,
"learning_rate": 9.847555633432628e-06,
"loss": 0.2037,
"step": 13750
},
{
"epoch": 2.418083055896268,
"grad_norm": 1.2265625,
"learning_rate": 9.70153612522633e-06,
"loss": 0.2076,
"step": 13800
},
{
"epoch": 2.4268442263886456,
"grad_norm": 0.984375,
"learning_rate": 9.555516617020035e-06,
"loss": 0.2049,
"step": 13850
},
{
"epoch": 2.4356053968810234,
"grad_norm": 1.125,
"learning_rate": 9.409497108813737e-06,
"loss": 0.2046,
"step": 13900
},
{
"epoch": 2.444366567373401,
"grad_norm": 1.171875,
"learning_rate": 9.263477600607442e-06,
"loss": 0.2044,
"step": 13950
},
{
"epoch": 2.453127737865779,
"grad_norm": 1.15625,
"learning_rate": 9.117458092401144e-06,
"loss": 0.2061,
"step": 14000
},
{
"epoch": 2.4618889083581568,
"grad_norm": 1.1328125,
"learning_rate": 8.97143858419485e-06,
"loss": 0.206,
"step": 14050
},
{
"epoch": 2.4706500788505346,
"grad_norm": 1.0390625,
"learning_rate": 8.825419075988553e-06,
"loss": 0.2012,
"step": 14100
},
{
"epoch": 2.4794112493429123,
"grad_norm": 1.0,
"learning_rate": 8.679399567782257e-06,
"loss": 0.2008,
"step": 14150
},
{
"epoch": 2.48817241983529,
"grad_norm": 1.1328125,
"learning_rate": 8.53338005957596e-06,
"loss": 0.2046,
"step": 14200
},
{
"epoch": 2.496933590327668,
"grad_norm": 1.1328125,
"learning_rate": 8.387360551369664e-06,
"loss": 0.2067,
"step": 14250
},
{
"epoch": 2.5056947608200457,
"grad_norm": 1.2109375,
"learning_rate": 8.241341043163366e-06,
"loss": 0.2041,
"step": 14300
},
{
"epoch": 2.5144559313124235,
"grad_norm": 1.171875,
"learning_rate": 8.09532153495707e-06,
"loss": 0.2066,
"step": 14350
},
{
"epoch": 2.523217101804801,
"grad_norm": 0.984375,
"learning_rate": 7.949302026750773e-06,
"loss": 0.2036,
"step": 14400
},
{
"epoch": 2.531978272297179,
"grad_norm": 1.1640625,
"learning_rate": 7.803282518544478e-06,
"loss": 0.2002,
"step": 14450
},
{
"epoch": 2.5407394427895564,
"grad_norm": 1.203125,
"learning_rate": 7.65726301033818e-06,
"loss": 0.2057,
"step": 14500
},
{
"epoch": 2.5495006132819347,
"grad_norm": 1.1015625,
"learning_rate": 7.511243502131886e-06,
"loss": 0.2069,
"step": 14550
},
{
"epoch": 2.558261783774312,
"grad_norm": 1.28125,
"learning_rate": 7.36522399392559e-06,
"loss": 0.1996,
"step": 14600
},
{
"epoch": 2.5670229542666902,
"grad_norm": 1.09375,
"learning_rate": 7.219204485719293e-06,
"loss": 0.2085,
"step": 14650
},
{
"epoch": 2.5757841247590676,
"grad_norm": 1.0703125,
"learning_rate": 7.0731849775129965e-06,
"loss": 0.2068,
"step": 14700
},
{
"epoch": 2.584545295251446,
"grad_norm": 1.046875,
"learning_rate": 6.9271654693067e-06,
"loss": 0.2054,
"step": 14750
},
{
"epoch": 2.593306465743823,
"grad_norm": 1.1171875,
"learning_rate": 6.781145961100403e-06,
"loss": 0.2051,
"step": 14800
},
{
"epoch": 2.6020676362362014,
"grad_norm": 1.15625,
"learning_rate": 6.635126452894107e-06,
"loss": 0.2063,
"step": 14850
},
{
"epoch": 2.6108288067285788,
"grad_norm": 1.0859375,
"learning_rate": 6.48910694468781e-06,
"loss": 0.208,
"step": 14900
},
{
"epoch": 2.619589977220957,
"grad_norm": 1.21875,
"learning_rate": 6.343087436481514e-06,
"loss": 0.2069,
"step": 14950
},
{
"epoch": 2.6283511477133343,
"grad_norm": 1.09375,
"learning_rate": 6.197067928275218e-06,
"loss": 0.2033,
"step": 15000
},
{
"epoch": 2.637112318205712,
"grad_norm": 1.109375,
"learning_rate": 6.0510484200689214e-06,
"loss": 0.2058,
"step": 15050
},
{
"epoch": 2.64587348869809,
"grad_norm": 1.0859375,
"learning_rate": 5.905028911862625e-06,
"loss": 0.2044,
"step": 15100
},
{
"epoch": 2.6546346591904677,
"grad_norm": 1.09375,
"learning_rate": 5.759009403656328e-06,
"loss": 0.2018,
"step": 15150
},
{
"epoch": 2.6633958296828455,
"grad_norm": 1.078125,
"learning_rate": 5.612989895450033e-06,
"loss": 0.2032,
"step": 15200
},
{
"epoch": 2.6721570001752233,
"grad_norm": 1.015625,
"learning_rate": 5.466970387243736e-06,
"loss": 0.206,
"step": 15250
},
{
"epoch": 2.680918170667601,
"grad_norm": 1.09375,
"learning_rate": 5.3209508790374395e-06,
"loss": 0.2051,
"step": 15300
},
{
"epoch": 2.689679341159979,
"grad_norm": 1.0546875,
"learning_rate": 5.174931370831143e-06,
"loss": 0.2048,
"step": 15350
},
{
"epoch": 2.6984405116523567,
"grad_norm": 1.2421875,
"learning_rate": 5.028911862624846e-06,
"loss": 0.2074,
"step": 15400
},
{
"epoch": 2.7072016821447344,
"grad_norm": 1.0390625,
"learning_rate": 4.882892354418551e-06,
"loss": 0.2043,
"step": 15450
},
{
"epoch": 2.7159628526371122,
"grad_norm": 1.0703125,
"learning_rate": 4.736872846212254e-06,
"loss": 0.2064,
"step": 15500
},
{
"epoch": 2.72472402312949,
"grad_norm": 1.140625,
"learning_rate": 4.5908533380059575e-06,
"loss": 0.2044,
"step": 15550
},
{
"epoch": 2.733485193621868,
"grad_norm": 1.0390625,
"learning_rate": 4.444833829799661e-06,
"loss": 0.2078,
"step": 15600
},
{
"epoch": 2.7422463641142456,
"grad_norm": 1.2265625,
"learning_rate": 4.298814321593364e-06,
"loss": 0.2018,
"step": 15650
},
{
"epoch": 2.7510075346066234,
"grad_norm": 1.15625,
"learning_rate": 4.152794813387069e-06,
"loss": 0.2072,
"step": 15700
},
{
"epoch": 2.759768705099001,
"grad_norm": 1.1484375,
"learning_rate": 4.006775305180772e-06,
"loss": 0.2054,
"step": 15750
},
{
"epoch": 2.768529875591379,
"grad_norm": 1.1328125,
"learning_rate": 3.8607557969744755e-06,
"loss": 0.2067,
"step": 15800
},
{
"epoch": 2.7772910460837568,
"grad_norm": 1.234375,
"learning_rate": 3.7147362887681794e-06,
"loss": 0.2029,
"step": 15850
},
{
"epoch": 2.7860522165761346,
"grad_norm": 1.125,
"learning_rate": 3.568716780561883e-06,
"loss": 0.2006,
"step": 15900
},
{
"epoch": 2.7948133870685123,
"grad_norm": 1.171875,
"learning_rate": 3.422697272355587e-06,
"loss": 0.2032,
"step": 15950
},
{
"epoch": 2.80357455756089,
"grad_norm": 1.015625,
"learning_rate": 3.2766777641492905e-06,
"loss": 0.2056,
"step": 16000
},
{
"epoch": 2.812335728053268,
"grad_norm": 1.1484375,
"learning_rate": 3.130658255942994e-06,
"loss": 0.2069,
"step": 16050
},
{
"epoch": 2.8210968985456457,
"grad_norm": 1.1171875,
"learning_rate": 2.984638747736698e-06,
"loss": 0.2057,
"step": 16100
},
{
"epoch": 2.8298580690380235,
"grad_norm": 1.125,
"learning_rate": 2.8386192395304013e-06,
"loss": 0.2069,
"step": 16150
},
{
"epoch": 2.8386192395304013,
"grad_norm": 1.109375,
"learning_rate": 2.6925997313241047e-06,
"loss": 0.2047,
"step": 16200
},
{
"epoch": 2.847380410022779,
"grad_norm": 1.0625,
"learning_rate": 2.5465802231178086e-06,
"loss": 0.2093,
"step": 16250
},
{
"epoch": 2.856141580515157,
"grad_norm": 1.015625,
"learning_rate": 2.4005607149115124e-06,
"loss": 0.2047,
"step": 16300
},
{
"epoch": 2.8649027510075347,
"grad_norm": 1.1953125,
"learning_rate": 2.2545412067052163e-06,
"loss": 0.2042,
"step": 16350
},
{
"epoch": 2.8736639214999125,
"grad_norm": 1.1953125,
"learning_rate": 2.1085216984989197e-06,
"loss": 0.2051,
"step": 16400
},
{
"epoch": 2.8824250919922902,
"grad_norm": 1.2734375,
"learning_rate": 1.962502190292623e-06,
"loss": 0.2026,
"step": 16450
},
{
"epoch": 2.891186262484668,
"grad_norm": 1.203125,
"learning_rate": 1.8164826820863268e-06,
"loss": 0.2134,
"step": 16500
},
{
"epoch": 2.899947432977046,
"grad_norm": 1.265625,
"learning_rate": 1.6704631738800305e-06,
"loss": 0.2035,
"step": 16550
},
{
"epoch": 2.9087086034694236,
"grad_norm": 1.0078125,
"learning_rate": 1.524443665673734e-06,
"loss": 0.2024,
"step": 16600
},
{
"epoch": 2.9174697739618014,
"grad_norm": 1.1484375,
"learning_rate": 1.3784241574674377e-06,
"loss": 0.209,
"step": 16650
},
{
"epoch": 2.926230944454179,
"grad_norm": 1.0234375,
"learning_rate": 1.2324046492611414e-06,
"loss": 0.2035,
"step": 16700
},
{
"epoch": 2.934992114946557,
"grad_norm": 1.15625,
"learning_rate": 1.0863851410548448e-06,
"loss": 0.2061,
"step": 16750
},
{
"epoch": 2.9437532854389348,
"grad_norm": 1.140625,
"learning_rate": 9.403656328485486e-07,
"loss": 0.2049,
"step": 16800
},
{
"epoch": 2.9525144559313126,
"grad_norm": 0.98046875,
"learning_rate": 7.943461246422522e-07,
"loss": 0.2058,
"step": 16850
},
{
"epoch": 2.9612756264236904,
"grad_norm": 1.046875,
"learning_rate": 6.483266164359559e-07,
"loss": 0.2049,
"step": 16900
},
{
"epoch": 2.970036796916068,
"grad_norm": 1.109375,
"learning_rate": 5.023071082296594e-07,
"loss": 0.2014,
"step": 16950
},
{
"epoch": 2.978797967408446,
"grad_norm": 1.0546875,
"learning_rate": 3.562876000233631e-07,
"loss": 0.2054,
"step": 17000
},
{
"epoch": 2.9875591379008233,
"grad_norm": 1.015625,
"learning_rate": 2.1026809181706677e-07,
"loss": 0.2021,
"step": 17050
},
{
"epoch": 2.9963203083932015,
"grad_norm": 1.03125,
"learning_rate": 6.42485836107704e-08,
"loss": 0.2041,
"step": 17100
}
],
"logging_steps": 50,
"max_steps": 17121,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.174577165177979e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}