{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.10005503026664665,
"eval_steps": 500,
"global_step": 2000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0005002751513332333,
"grad_norm": 0.9514128565788269,
"learning_rate": 0.00019999999999999998,
"loss": 2.6758,
"step": 10
},
{
"epoch": 0.0010005503026664665,
"grad_norm": 0.24659623205661774,
"learning_rate": 0.0002,
"loss": 3.1472,
"step": 20
},
{
"epoch": 0.0015008254539997,
"grad_norm": 0.15325957536697388,
"learning_rate": 0.0002,
"loss": 3.3443,
"step": 30
},
{
"epoch": 0.002001100605332933,
"grad_norm": 0.13286100327968597,
"learning_rate": 0.0002,
"loss": 3.1872,
"step": 40
},
{
"epoch": 0.0025013757566661665,
"grad_norm": 0.13320517539978027,
"learning_rate": 0.0002,
"loss": 3.1423,
"step": 50
},
{
"epoch": 0.0030016509079994,
"grad_norm": 0.15094240009784698,
"learning_rate": 0.0002,
"loss": 3.2816,
"step": 60
},
{
"epoch": 0.003501926059332633,
"grad_norm": 0.11190259456634521,
"learning_rate": 0.0002,
"loss": 3.1771,
"step": 70
},
{
"epoch": 0.004002201210665866,
"grad_norm": 0.14013367891311646,
"learning_rate": 0.0002,
"loss": 3.292,
"step": 80
},
{
"epoch": 0.004502476361999099,
"grad_norm": 0.10220601409673691,
"learning_rate": 0.0002,
"loss": 3.2377,
"step": 90
},
{
"epoch": 0.005002751513332333,
"grad_norm": 0.11760038137435913,
"learning_rate": 0.0002,
"loss": 3.0844,
"step": 100
},
{
"epoch": 0.005503026664665566,
"grad_norm": 0.11736863106489182,
"learning_rate": 0.0002,
"loss": 3.2212,
"step": 110
},
{
"epoch": 0.0060033018159988,
"grad_norm": 0.13063907623291016,
"learning_rate": 0.0002,
"loss": 3.3429,
"step": 120
},
{
"epoch": 0.006503576967332033,
"grad_norm": 0.10621850937604904,
"learning_rate": 0.0002,
"loss": 3.2445,
"step": 130
},
{
"epoch": 0.007003852118665266,
"grad_norm": 0.11594289541244507,
"learning_rate": 0.0002,
"loss": 3.2772,
"step": 140
},
{
"epoch": 0.0075041272699984994,
"grad_norm": 0.12541130185127258,
"learning_rate": 0.0002,
"loss": 3.3111,
"step": 150
},
{
"epoch": 0.008004402421331732,
"grad_norm": 0.09993274509906769,
"learning_rate": 0.0002,
"loss": 3.1917,
"step": 160
},
{
"epoch": 0.008504677572664966,
"grad_norm": 0.15057022869586945,
"learning_rate": 0.0002,
"loss": 3.4024,
"step": 170
},
{
"epoch": 0.009004952723998198,
"grad_norm": 0.2080872654914856,
"learning_rate": 0.0002,
"loss": 3.3374,
"step": 180
},
{
"epoch": 0.009505227875331432,
"grad_norm": 0.09899163246154785,
"learning_rate": 0.0002,
"loss": 3.316,
"step": 190
},
{
"epoch": 0.010005503026664666,
"grad_norm": 0.10972588509321213,
"learning_rate": 0.0002,
"loss": 3.2684,
"step": 200
},
{
"epoch": 0.010505778177997898,
"grad_norm": 0.12223390489816666,
"learning_rate": 0.0002,
"loss": 3.3524,
"step": 210
},
{
"epoch": 0.011006053329331132,
"grad_norm": 0.11201239377260208,
"learning_rate": 0.0002,
"loss": 3.4964,
"step": 220
},
{
"epoch": 0.011506328480664366,
"grad_norm": 0.11129321157932281,
"learning_rate": 0.0002,
"loss": 3.3762,
"step": 230
},
{
"epoch": 0.0120066036319976,
"grad_norm": 0.11999399214982986,
"learning_rate": 0.0002,
"loss": 3.475,
"step": 240
},
{
"epoch": 0.012506878783330832,
"grad_norm": 0.12480571120977402,
"learning_rate": 0.0002,
"loss": 3.3755,
"step": 250
},
{
"epoch": 0.013007153934664065,
"grad_norm": 0.10506771504878998,
"learning_rate": 0.0002,
"loss": 3.3847,
"step": 260
},
{
"epoch": 0.0135074290859973,
"grad_norm": 0.08484470844268799,
"learning_rate": 0.0002,
"loss": 3.3913,
"step": 270
},
{
"epoch": 0.014007704237330531,
"grad_norm": 0.09486206620931625,
"learning_rate": 0.0002,
"loss": 3.3096,
"step": 280
},
{
"epoch": 0.014507979388663765,
"grad_norm": 0.1048346683382988,
"learning_rate": 0.0002,
"loss": 3.3356,
"step": 290
},
{
"epoch": 0.015008254539996999,
"grad_norm": 0.09623505920171738,
"learning_rate": 0.0002,
"loss": 3.357,
"step": 300
},
{
"epoch": 0.015508529691330231,
"grad_norm": 0.09872445464134216,
"learning_rate": 0.0002,
"loss": 3.5141,
"step": 310
},
{
"epoch": 0.016008804842663465,
"grad_norm": 0.10426618158817291,
"learning_rate": 0.0002,
"loss": 3.4722,
"step": 320
},
{
"epoch": 0.016509079993996697,
"grad_norm": 0.10077104717493057,
"learning_rate": 0.0002,
"loss": 3.3915,
"step": 330
},
{
"epoch": 0.017009355145329932,
"grad_norm": 0.11084343492984772,
"learning_rate": 0.0002,
"loss": 3.3849,
"step": 340
},
{
"epoch": 0.017509630296663165,
"grad_norm": 0.08321600407361984,
"learning_rate": 0.0002,
"loss": 3.3542,
"step": 350
},
{
"epoch": 0.018009905447996397,
"grad_norm": 0.08220788836479187,
"learning_rate": 0.0002,
"loss": 3.4381,
"step": 360
},
{
"epoch": 0.018510180599329632,
"grad_norm": 0.10560263693332672,
"learning_rate": 0.0002,
"loss": 3.2399,
"step": 370
},
{
"epoch": 0.019010455750662864,
"grad_norm": 0.07301683723926544,
"learning_rate": 0.0002,
"loss": 3.4645,
"step": 380
},
{
"epoch": 0.019510730901996096,
"grad_norm": 0.10291752964258194,
"learning_rate": 0.0002,
"loss": 3.491,
"step": 390
},
{
"epoch": 0.020011006053329332,
"grad_norm": 0.09107044339179993,
"learning_rate": 0.0002,
"loss": 3.2015,
"step": 400
},
{
"epoch": 0.020511281204662564,
"grad_norm": 0.08597932755947113,
"learning_rate": 0.0002,
"loss": 3.5352,
"step": 410
},
{
"epoch": 0.021011556355995796,
"grad_norm": 0.080569788813591,
"learning_rate": 0.0002,
"loss": 3.3541,
"step": 420
},
{
"epoch": 0.02151183150732903,
"grad_norm": 0.08738499134778976,
"learning_rate": 0.0002,
"loss": 3.2724,
"step": 430
},
{
"epoch": 0.022012106658662264,
"grad_norm": 0.07786229997873306,
"learning_rate": 0.0002,
"loss": 3.4347,
"step": 440
},
{
"epoch": 0.0225123818099955,
"grad_norm": 0.07400281727313995,
"learning_rate": 0.0002,
"loss": 3.2489,
"step": 450
},
{
"epoch": 0.02301265696132873,
"grad_norm": 0.08507199585437775,
"learning_rate": 0.0002,
"loss": 3.3711,
"step": 460
},
{
"epoch": 0.023512932112661963,
"grad_norm": 0.07275331765413284,
"learning_rate": 0.0002,
"loss": 3.3241,
"step": 470
},
{
"epoch": 0.0240132072639952,
"grad_norm": 0.0792601928114891,
"learning_rate": 0.0002,
"loss": 3.4097,
"step": 480
},
{
"epoch": 0.02451348241532843,
"grad_norm": 0.0831415206193924,
"learning_rate": 0.0002,
"loss": 3.1606,
"step": 490
},
{
"epoch": 0.025013757566661663,
"grad_norm": 0.1146463081240654,
"learning_rate": 0.0002,
"loss": 3.424,
"step": 500
},
{
"epoch": 0.0255140327179949,
"grad_norm": 0.11574945598840714,
"learning_rate": 0.0002,
"loss": 3.4287,
"step": 510
},
{
"epoch": 0.02601430786932813,
"grad_norm": 0.06718364357948303,
"learning_rate": 0.0002,
"loss": 3.2169,
"step": 520
},
{
"epoch": 0.026514583020661363,
"grad_norm": 0.0657232478260994,
"learning_rate": 0.0002,
"loss": 3.2824,
"step": 530
},
{
"epoch": 0.0270148581719946,
"grad_norm": 0.06566020846366882,
"learning_rate": 0.0002,
"loss": 3.29,
"step": 540
},
{
"epoch": 0.02751513332332783,
"grad_norm": 0.07870512455701828,
"learning_rate": 0.0002,
"loss": 3.3454,
"step": 550
},
{
"epoch": 0.028015408474661063,
"grad_norm": 0.06590110063552856,
"learning_rate": 0.0002,
"loss": 3.2222,
"step": 560
},
{
"epoch": 0.028515683625994298,
"grad_norm": 0.08713185787200928,
"learning_rate": 0.0002,
"loss": 3.3677,
"step": 570
},
{
"epoch": 0.02901595877732753,
"grad_norm": 0.06802317500114441,
"learning_rate": 0.0002,
"loss": 3.3194,
"step": 580
},
{
"epoch": 0.029516233928660762,
"grad_norm": 0.07123348116874695,
"learning_rate": 0.0002,
"loss": 3.1211,
"step": 590
},
{
"epoch": 0.030016509079993998,
"grad_norm": 0.09105300158262253,
"learning_rate": 0.0002,
"loss": 3.3034,
"step": 600
},
{
"epoch": 0.03051678423132723,
"grad_norm": 0.09690599888563156,
"learning_rate": 0.0002,
"loss": 3.2923,
"step": 610
},
{
"epoch": 0.031017059382660462,
"grad_norm": 0.06459871679544449,
"learning_rate": 0.0002,
"loss": 3.1854,
"step": 620
},
{
"epoch": 0.0315173345339937,
"grad_norm": 0.0907784029841423,
"learning_rate": 0.0002,
"loss": 3.2063,
"step": 630
},
{
"epoch": 0.03201760968532693,
"grad_norm": 0.07171762734651566,
"learning_rate": 0.0002,
"loss": 3.4051,
"step": 640
},
{
"epoch": 0.03251788483666016,
"grad_norm": 0.0999717265367508,
"learning_rate": 0.0002,
"loss": 3.3135,
"step": 650
},
{
"epoch": 0.033018159987993394,
"grad_norm": 0.0716700628399849,
"learning_rate": 0.0002,
"loss": 3.2707,
"step": 660
},
{
"epoch": 0.03351843513932663,
"grad_norm": 0.06543900072574615,
"learning_rate": 0.0002,
"loss": 3.2032,
"step": 670
},
{
"epoch": 0.034018710290659865,
"grad_norm": 0.05876084417104721,
"learning_rate": 0.0002,
"loss": 3.332,
"step": 680
},
{
"epoch": 0.0345189854419931,
"grad_norm": 0.5182152390480042,
"learning_rate": 0.0002,
"loss": 3.3611,
"step": 690
},
{
"epoch": 0.03501926059332633,
"grad_norm": 0.07951213419437408,
"learning_rate": 0.0002,
"loss": 3.2924,
"step": 700
},
{
"epoch": 0.03551953574465956,
"grad_norm": 0.08420588076114655,
"learning_rate": 0.0002,
"loss": 3.1806,
"step": 710
},
{
"epoch": 0.03601981089599279,
"grad_norm": 0.08514729142189026,
"learning_rate": 0.0002,
"loss": 3.2537,
"step": 720
},
{
"epoch": 0.03652008604732603,
"grad_norm": 0.06989168375730515,
"learning_rate": 0.0002,
"loss": 3.4618,
"step": 730
},
{
"epoch": 0.037020361198659264,
"grad_norm": 0.07098263502120972,
"learning_rate": 0.0002,
"loss": 3.3092,
"step": 740
},
{
"epoch": 0.037520636349992496,
"grad_norm": 0.06842508912086487,
"learning_rate": 0.0002,
"loss": 3.3168,
"step": 750
},
{
"epoch": 0.03802091150132573,
"grad_norm": 0.09867072105407715,
"learning_rate": 0.0002,
"loss": 3.194,
"step": 760
},
{
"epoch": 0.03852118665265896,
"grad_norm": 0.06236390769481659,
"learning_rate": 0.0002,
"loss": 3.3323,
"step": 770
},
{
"epoch": 0.03902146180399219,
"grad_norm": 0.07258310914039612,
"learning_rate": 0.0002,
"loss": 3.2518,
"step": 780
},
{
"epoch": 0.03952173695532543,
"grad_norm": 0.060556840151548386,
"learning_rate": 0.0002,
"loss": 3.3152,
"step": 790
},
{
"epoch": 0.040022012106658664,
"grad_norm": 0.07364658266305923,
"learning_rate": 0.0002,
"loss": 3.2153,
"step": 800
},
{
"epoch": 0.040522287257991896,
"grad_norm": 0.08476244658231735,
"learning_rate": 0.0002,
"loss": 3.2618,
"step": 810
},
{
"epoch": 0.04102256240932513,
"grad_norm": 0.06534284353256226,
"learning_rate": 0.0002,
"loss": 3.2049,
"step": 820
},
{
"epoch": 0.04152283756065836,
"grad_norm": 0.07897084951400757,
"learning_rate": 0.0002,
"loss": 3.4232,
"step": 830
},
{
"epoch": 0.04202311271199159,
"grad_norm": 0.09437014162540436,
"learning_rate": 0.0002,
"loss": 3.2636,
"step": 840
},
{
"epoch": 0.04252338786332483,
"grad_norm": 0.06484173983335495,
"learning_rate": 0.0002,
"loss": 3.1474,
"step": 850
},
{
"epoch": 0.04302366301465806,
"grad_norm": 0.05979447439312935,
"learning_rate": 0.0002,
"loss": 3.2571,
"step": 860
},
{
"epoch": 0.043523938165991295,
"grad_norm": 0.09203090518712997,
"learning_rate": 0.0002,
"loss": 3.1984,
"step": 870
},
{
"epoch": 0.04402421331732453,
"grad_norm": 0.1513832062482834,
"learning_rate": 0.0002,
"loss": 3.3668,
"step": 880
},
{
"epoch": 0.04452448846865776,
"grad_norm": 0.06712643057107925,
"learning_rate": 0.0002,
"loss": 3.2346,
"step": 890
},
{
"epoch": 0.045024763619991,
"grad_norm": 0.07149570435285568,
"learning_rate": 0.0002,
"loss": 3.2834,
"step": 900
},
{
"epoch": 0.04552503877132423,
"grad_norm": 0.0681491494178772,
"learning_rate": 0.0002,
"loss": 3.1853,
"step": 910
},
{
"epoch": 0.04602531392265746,
"grad_norm": 0.06924117356538773,
"learning_rate": 0.0002,
"loss": 3.2217,
"step": 920
},
{
"epoch": 0.046525589073990695,
"grad_norm": 0.07459249347448349,
"learning_rate": 0.0002,
"loss": 3.2059,
"step": 930
},
{
"epoch": 0.04702586422532393,
"grad_norm": 0.06532080471515656,
"learning_rate": 0.0002,
"loss": 3.1643,
"step": 940
},
{
"epoch": 0.04752613937665716,
"grad_norm": 0.07453737407922745,
"learning_rate": 0.0002,
"loss": 3.3668,
"step": 950
},
{
"epoch": 0.0480264145279904,
"grad_norm": 0.07038157433271408,
"learning_rate": 0.0002,
"loss": 3.3452,
"step": 960
},
{
"epoch": 0.04852668967932363,
"grad_norm": 0.05873151868581772,
"learning_rate": 0.0002,
"loss": 3.3277,
"step": 970
},
{
"epoch": 0.04902696483065686,
"grad_norm": 0.05833908170461655,
"learning_rate": 0.0002,
"loss": 3.2321,
"step": 980
},
{
"epoch": 0.049527239981990094,
"grad_norm": 0.07476246356964111,
"learning_rate": 0.0002,
"loss": 3.3013,
"step": 990
},
{
"epoch": 0.050027515133323326,
"grad_norm": 0.06895654648542404,
"learning_rate": 0.0002,
"loss": 3.2011,
"step": 1000
},
{
"epoch": 0.05052779028465656,
"grad_norm": 0.06574366986751556,
"learning_rate": 0.0002,
"loss": 3.2071,
"step": 1010
},
{
"epoch": 0.0510280654359898,
"grad_norm": 0.05364847928285599,
"learning_rate": 0.0002,
"loss": 3.1855,
"step": 1020
},
{
"epoch": 0.05152834058732303,
"grad_norm": 0.0581735335290432,
"learning_rate": 0.0002,
"loss": 3.2567,
"step": 1030
},
{
"epoch": 0.05202861573865626,
"grad_norm": 0.09195020794868469,
"learning_rate": 0.0002,
"loss": 3.29,
"step": 1040
},
{
"epoch": 0.052528890889989494,
"grad_norm": 0.059362176805734634,
"learning_rate": 0.0002,
"loss": 3.2145,
"step": 1050
},
{
"epoch": 0.053029166041322726,
"grad_norm": 0.06778449565172195,
"learning_rate": 0.0002,
"loss": 3.2591,
"step": 1060
},
{
"epoch": 0.05352944119265596,
"grad_norm": 0.054793521761894226,
"learning_rate": 0.0002,
"loss": 3.1621,
"step": 1070
},
{
"epoch": 0.0540297163439892,
"grad_norm": 0.05222785100340843,
"learning_rate": 0.0002,
"loss": 3.1684,
"step": 1080
},
{
"epoch": 0.05452999149532243,
"grad_norm": 0.05583691596984863,
"learning_rate": 0.0002,
"loss": 3.1624,
"step": 1090
},
{
"epoch": 0.05503026664665566,
"grad_norm": 0.0779917985200882,
"learning_rate": 0.0002,
"loss": 3.2844,
"step": 1100
},
{
"epoch": 0.05553054179798889,
"grad_norm": 0.058885641396045685,
"learning_rate": 0.0002,
"loss": 3.3956,
"step": 1110
},
{
"epoch": 0.056030816949322125,
"grad_norm": 0.053786501288414,
"learning_rate": 0.0002,
"loss": 3.2465,
"step": 1120
},
{
"epoch": 0.05653109210065536,
"grad_norm": 0.06709844619035721,
"learning_rate": 0.0002,
"loss": 3.1812,
"step": 1130
},
{
"epoch": 0.057031367251988596,
"grad_norm": 0.053172528743743896,
"learning_rate": 0.0002,
"loss": 3.1945,
"step": 1140
},
{
"epoch": 0.05753164240332183,
"grad_norm": 0.0449419841170311,
"learning_rate": 0.0002,
"loss": 3.2653,
"step": 1150
},
{
"epoch": 0.05803191755465506,
"grad_norm": 0.07608778029680252,
"learning_rate": 0.0002,
"loss": 3.2171,
"step": 1160
},
{
"epoch": 0.05853219270598829,
"grad_norm": 0.05426677316427231,
"learning_rate": 0.0002,
"loss": 3.1954,
"step": 1170
},
{
"epoch": 0.059032467857321524,
"grad_norm": 0.07974937558174133,
"learning_rate": 0.0002,
"loss": 3.2709,
"step": 1180
},
{
"epoch": 0.059532743008654763,
"grad_norm": 0.07222287356853485,
"learning_rate": 0.0002,
"loss": 3.1398,
"step": 1190
},
{
"epoch": 0.060033018159987996,
"grad_norm": 0.05869804322719574,
"learning_rate": 0.0002,
"loss": 3.2176,
"step": 1200
},
{
"epoch": 0.06053329331132123,
"grad_norm": 0.053768135607242584,
"learning_rate": 0.0002,
"loss": 3.1623,
"step": 1210
},
{
"epoch": 0.06103356846265446,
"grad_norm": 0.0641162171959877,
"learning_rate": 0.0002,
"loss": 3.15,
"step": 1220
},
{
"epoch": 0.06153384361398769,
"grad_norm": 0.0566398985683918,
"learning_rate": 0.0002,
"loss": 3.1254,
"step": 1230
},
{
"epoch": 0.062034118765320924,
"grad_norm": 0.055305738002061844,
"learning_rate": 0.0002,
"loss": 3.2141,
"step": 1240
},
{
"epoch": 0.06253439391665416,
"grad_norm": 0.059914641082286835,
"learning_rate": 0.0002,
"loss": 3.1913,
"step": 1250
},
{
"epoch": 0.0630346690679874,
"grad_norm": 0.05172060430049896,
"learning_rate": 0.0002,
"loss": 3.191,
"step": 1260
},
{
"epoch": 0.06353494421932063,
"grad_norm": 0.07252514362335205,
"learning_rate": 0.0002,
"loss": 3.0212,
"step": 1270
},
{
"epoch": 0.06403521937065386,
"grad_norm": 0.07096508145332336,
"learning_rate": 0.0002,
"loss": 3.2677,
"step": 1280
},
{
"epoch": 0.06453549452198709,
"grad_norm": 0.058548085391521454,
"learning_rate": 0.0002,
"loss": 3.1471,
"step": 1290
},
{
"epoch": 0.06503576967332032,
"grad_norm": 0.053336966782808304,
"learning_rate": 0.0002,
"loss": 3.234,
"step": 1300
},
{
"epoch": 0.06553604482465356,
"grad_norm": 0.051933031529188156,
"learning_rate": 0.0002,
"loss": 3.2554,
"step": 1310
},
{
"epoch": 0.06603631997598679,
"grad_norm": 0.05643808841705322,
"learning_rate": 0.0002,
"loss": 3.3304,
"step": 1320
},
{
"epoch": 0.06653659512732002,
"grad_norm": 0.057230204343795776,
"learning_rate": 0.0002,
"loss": 3.1517,
"step": 1330
},
{
"epoch": 0.06703687027865327,
"grad_norm": 0.07095087319612503,
"learning_rate": 0.0002,
"loss": 3.0802,
"step": 1340
},
{
"epoch": 0.0675371454299865,
"grad_norm": 0.06798629462718964,
"learning_rate": 0.0002,
"loss": 3.0827,
"step": 1350
},
{
"epoch": 0.06803742058131973,
"grad_norm": 0.0938129797577858,
"learning_rate": 0.0002,
"loss": 3.3157,
"step": 1360
},
{
"epoch": 0.06853769573265296,
"grad_norm": 0.06188824027776718,
"learning_rate": 0.0002,
"loss": 3.1655,
"step": 1370
},
{
"epoch": 0.0690379708839862,
"grad_norm": 0.05734292417764664,
"learning_rate": 0.0002,
"loss": 3.2153,
"step": 1380
},
{
"epoch": 0.06953824603531943,
"grad_norm": 0.05722649022936821,
"learning_rate": 0.0002,
"loss": 3.164,
"step": 1390
},
{
"epoch": 0.07003852118665266,
"grad_norm": 0.058131471276283264,
"learning_rate": 0.0002,
"loss": 3.2015,
"step": 1400
},
{
"epoch": 0.07053879633798589,
"grad_norm": 0.07024545222520828,
"learning_rate": 0.0002,
"loss": 3.1281,
"step": 1410
},
{
"epoch": 0.07103907148931912,
"grad_norm": 0.06485693156719208,
"learning_rate": 0.0002,
"loss": 3.1668,
"step": 1420
},
{
"epoch": 0.07153934664065235,
"grad_norm": 0.056724123656749725,
"learning_rate": 0.0002,
"loss": 3.2405,
"step": 1430
},
{
"epoch": 0.07203962179198559,
"grad_norm": 0.05593548342585564,
"learning_rate": 0.0002,
"loss": 3.3399,
"step": 1440
},
{
"epoch": 0.07253989694331882,
"grad_norm": 0.06867067515850067,
"learning_rate": 0.0002,
"loss": 3.3871,
"step": 1450
},
{
"epoch": 0.07304017209465206,
"grad_norm": 0.09758540242910385,
"learning_rate": 0.0002,
"loss": 3.2629,
"step": 1460
},
{
"epoch": 0.0735404472459853,
"grad_norm": 0.0622124969959259,
"learning_rate": 0.0002,
"loss": 3.1782,
"step": 1470
},
{
"epoch": 0.07404072239731853,
"grad_norm": 0.05847143009305,
"learning_rate": 0.0002,
"loss": 3.2197,
"step": 1480
},
{
"epoch": 0.07454099754865176,
"grad_norm": 0.0578547939658165,
"learning_rate": 0.0002,
"loss": 3.2242,
"step": 1490
},
{
"epoch": 0.07504127269998499,
"grad_norm": 0.057039618492126465,
"learning_rate": 0.0002,
"loss": 3.1362,
"step": 1500
},
{
"epoch": 0.07554154785131822,
"grad_norm": 0.05607955902814865,
"learning_rate": 0.0002,
"loss": 3.1812,
"step": 1510
},
{
"epoch": 0.07604182300265146,
"grad_norm": 0.06097773090004921,
"learning_rate": 0.0002,
"loss": 3.1074,
"step": 1520
},
{
"epoch": 0.07654209815398469,
"grad_norm": 0.08337036520242691,
"learning_rate": 0.0002,
"loss": 3.1852,
"step": 1530
},
{
"epoch": 0.07704237330531792,
"grad_norm": 0.05237165838479996,
"learning_rate": 0.0002,
"loss": 3.1786,
"step": 1540
},
{
"epoch": 0.07754264845665115,
"grad_norm": 0.07054440677165985,
"learning_rate": 0.0002,
"loss": 3.1673,
"step": 1550
},
{
"epoch": 0.07804292360798439,
"grad_norm": 0.04526256397366524,
"learning_rate": 0.0002,
"loss": 3.2436,
"step": 1560
},
{
"epoch": 0.07854319875931763,
"grad_norm": 0.042845677584409714,
"learning_rate": 0.0002,
"loss": 3.1739,
"step": 1570
},
{
"epoch": 0.07904347391065086,
"grad_norm": 0.0452456995844841,
"learning_rate": 0.0002,
"loss": 3.224,
"step": 1580
},
{
"epoch": 0.0795437490619841,
"grad_norm": 0.055119115859270096,
"learning_rate": 0.0002,
"loss": 3.2443,
"step": 1590
},
{
"epoch": 0.08004402421331733,
"grad_norm": 0.06564844399690628,
"learning_rate": 0.0002,
"loss": 3.3013,
"step": 1600
},
{
"epoch": 0.08054429936465056,
"grad_norm": 0.08553501963615417,
"learning_rate": 0.0002,
"loss": 3.1565,
"step": 1610
},
{
"epoch": 0.08104457451598379,
"grad_norm": 0.057848136872053146,
"learning_rate": 0.0002,
"loss": 3.192,
"step": 1620
},
{
"epoch": 0.08154484966731702,
"grad_norm": 0.05926649644970894,
"learning_rate": 0.0002,
"loss": 3.288,
"step": 1630
},
{
"epoch": 0.08204512481865026,
"grad_norm": 0.05204610154032707,
"learning_rate": 0.0002,
"loss": 3.357,
"step": 1640
},
{
"epoch": 0.08254539996998349,
"grad_norm": 0.06709768623113632,
"learning_rate": 0.0002,
"loss": 3.2767,
"step": 1650
},
{
"epoch": 0.08304567512131672,
"grad_norm": 0.05840866640210152,
"learning_rate": 0.0002,
"loss": 3.3075,
"step": 1660
},
{
"epoch": 0.08354595027264995,
"grad_norm": 0.06196371465921402,
"learning_rate": 0.0002,
"loss": 3.2186,
"step": 1670
},
{
"epoch": 0.08404622542398318,
"grad_norm": 0.06448955088853836,
"learning_rate": 0.0002,
"loss": 3.0591,
"step": 1680
},
{
"epoch": 0.08454650057531643,
"grad_norm": 0.06016537919640541,
"learning_rate": 0.0002,
"loss": 3.1313,
"step": 1690
},
{
"epoch": 0.08504677572664966,
"grad_norm": 0.04336397349834442,
"learning_rate": 0.0002,
"loss": 3.2306,
"step": 1700
},
{
"epoch": 0.0855470508779829,
"grad_norm": 0.05283171683549881,
"learning_rate": 0.0002,
"loss": 3.2208,
"step": 1710
},
{
"epoch": 0.08604732602931613,
"grad_norm": 0.05544983223080635,
"learning_rate": 0.0002,
"loss": 3.0558,
"step": 1720
},
{
"epoch": 0.08654760118064936,
"grad_norm": 0.09242791682481766,
"learning_rate": 0.0002,
"loss": 3.1988,
"step": 1730
},
{
"epoch": 0.08704787633198259,
"grad_norm": 0.07003988325595856,
"learning_rate": 0.0002,
"loss": 3.282,
"step": 1740
},
{
"epoch": 0.08754815148331582,
"grad_norm": 0.05473213270306587,
"learning_rate": 0.0002,
"loss": 3.0874,
"step": 1750
},
{
"epoch": 0.08804842663464905,
"grad_norm": 0.05522087588906288,
"learning_rate": 0.0002,
"loss": 3.2302,
"step": 1760
},
{
"epoch": 0.08854870178598229,
"grad_norm": 0.06576565653085709,
"learning_rate": 0.0002,
"loss": 3.1786,
"step": 1770
},
{
"epoch": 0.08904897693731552,
"grad_norm": 0.05434967949986458,
"learning_rate": 0.0002,
"loss": 3.1149,
"step": 1780
},
{
"epoch": 0.08954925208864875,
"grad_norm": 0.05906340479850769,
"learning_rate": 0.0002,
"loss": 3.0553,
"step": 1790
},
{
"epoch": 0.090049527239982,
"grad_norm": 0.05728009715676308,
"learning_rate": 0.0002,
"loss": 3.0531,
"step": 1800
},
{
"epoch": 0.09054980239131523,
"grad_norm": 0.03979711979627609,
"learning_rate": 0.0002,
"loss": 3.1394,
"step": 1810
},
{
"epoch": 0.09105007754264846,
"grad_norm": 0.07336313277482986,
"learning_rate": 0.0002,
"loss": 3.2452,
"step": 1820
},
{
"epoch": 0.0915503526939817,
"grad_norm": 0.055571481585502625,
"learning_rate": 0.0002,
"loss": 3.123,
"step": 1830
},
{
"epoch": 0.09205062784531493,
"grad_norm": 0.048019275069236755,
"learning_rate": 0.0002,
"loss": 3.1898,
"step": 1840
},
{
"epoch": 0.09255090299664816,
"grad_norm": 0.04113614931702614,
"learning_rate": 0.0002,
"loss": 3.187,
"step": 1850
},
{
"epoch": 0.09305117814798139,
"grad_norm": 0.058914463967084885,
"learning_rate": 0.0002,
"loss": 3.0886,
"step": 1860
},
{
"epoch": 0.09355145329931462,
"grad_norm": 0.0579225979745388,
"learning_rate": 0.0002,
"loss": 3.113,
"step": 1870
},
{
"epoch": 0.09405172845064785,
"grad_norm": 0.05847308784723282,
"learning_rate": 0.0002,
"loss": 3.2902,
"step": 1880
},
{
"epoch": 0.09455200360198109,
"grad_norm": 0.04670713469386101,
"learning_rate": 0.0002,
"loss": 3.0735,
"step": 1890
},
{
"epoch": 0.09505227875331432,
"grad_norm": 0.058696549385786057,
"learning_rate": 0.0002,
"loss": 3.2043,
"step": 1900
},
{
"epoch": 0.09555255390464755,
"grad_norm": 0.0533798448741436,
"learning_rate": 0.0002,
"loss": 3.0289,
"step": 1910
},
{
"epoch": 0.0960528290559808,
"grad_norm": 0.04985165223479271,
"learning_rate": 0.0002,
"loss": 3.2249,
"step": 1920
},
{
"epoch": 0.09655310420731403,
"grad_norm": 0.06083301082253456,
"learning_rate": 0.0002,
"loss": 3.1653,
"step": 1930
},
{
"epoch": 0.09705337935864726,
"grad_norm": 0.055274877697229385,
"learning_rate": 0.0002,
"loss": 3.1489,
"step": 1940
},
{
"epoch": 0.09755365450998049,
"grad_norm": 0.03868628293275833,
"learning_rate": 0.0002,
"loss": 3.0902,
"step": 1950
},
{
"epoch": 0.09805392966131372,
"grad_norm": 0.05481928586959839,
"learning_rate": 0.0002,
"loss": 3.1383,
"step": 1960
},
{
"epoch": 0.09855420481264696,
"grad_norm": 0.05562729388475418,
"learning_rate": 0.0002,
"loss": 3.2148,
"step": 1970
},
{
"epoch": 0.09905447996398019,
"grad_norm": 0.04779260233044624,
"learning_rate": 0.0002,
"loss": 3.2358,
"step": 1980
},
{
"epoch": 0.09955475511531342,
"grad_norm": 0.04606562480330467,
"learning_rate": 0.0002,
"loss": 3.172,
"step": 1990
},
{
"epoch": 0.10005503026664665,
"grad_norm": 0.045945361256599426,
"learning_rate": 0.0002,
"loss": 3.1456,
"step": 2000
}
],
"logging_steps": 10,
"max_steps": 2000,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 9.5664155394048e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}