{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9937769267592147,
"eval_steps": 500,
"global_step": 1044,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.028721876495931067,
"grad_norm": 3.700932765736394,
"learning_rate": 9.523809523809525e-07,
"loss": 0.93,
"step": 10
},
{
"epoch": 0.057443752991862135,
"grad_norm": 1.734082511082376,
"learning_rate": 1.904761904761905e-06,
"loss": 0.8877,
"step": 20
},
{
"epoch": 0.0861656294877932,
"grad_norm": 1.024208832618697,
"learning_rate": 2.8571428571428573e-06,
"loss": 0.8525,
"step": 30
},
{
"epoch": 0.11488750598372427,
"grad_norm": 0.7469304469128593,
"learning_rate": 3.80952380952381e-06,
"loss": 0.7877,
"step": 40
},
{
"epoch": 0.14360938247965535,
"grad_norm": 0.586040626713746,
"learning_rate": 4.761904761904762e-06,
"loss": 0.7286,
"step": 50
},
{
"epoch": 0.1723312589755864,
"grad_norm": 0.54841652853263,
"learning_rate": 5.7142857142857145e-06,
"loss": 0.7166,
"step": 60
},
{
"epoch": 0.20105313547151749,
"grad_norm": 0.5034517147897491,
"learning_rate": 6.666666666666667e-06,
"loss": 0.6823,
"step": 70
},
{
"epoch": 0.22977501196744854,
"grad_norm": 0.49774064773427634,
"learning_rate": 7.61904761904762e-06,
"loss": 0.6824,
"step": 80
},
{
"epoch": 0.2584968884633796,
"grad_norm": 0.552315492890169,
"learning_rate": 8.571428571428571e-06,
"loss": 0.6856,
"step": 90
},
{
"epoch": 0.2872187649593107,
"grad_norm": 0.5309414818094527,
"learning_rate": 9.523809523809525e-06,
"loss": 0.658,
"step": 100
},
{
"epoch": 0.31594064145524176,
"grad_norm": 0.5385308790861494,
"learning_rate": 9.999300418283908e-06,
"loss": 0.6423,
"step": 110
},
{
"epoch": 0.3446625179511728,
"grad_norm": 0.5526511778665282,
"learning_rate": 9.993704939095376e-06,
"loss": 0.6408,
"step": 120
},
{
"epoch": 0.37338439444710386,
"grad_norm": 0.6179734532307238,
"learning_rate": 9.982520243472044e-06,
"loss": 0.6505,
"step": 130
},
{
"epoch": 0.40210627094303497,
"grad_norm": 0.5239737338093862,
"learning_rate": 9.965758849911774e-06,
"loss": 0.6161,
"step": 140
},
{
"epoch": 0.430828147438966,
"grad_norm": 0.5619528495039146,
"learning_rate": 9.943439518645193e-06,
"loss": 0.6308,
"step": 150
},
{
"epoch": 0.4595500239348971,
"grad_norm": 0.4729313140856091,
"learning_rate": 9.915587230638269e-06,
"loss": 0.6316,
"step": 160
},
{
"epoch": 0.48827190043082813,
"grad_norm": 0.4851042681486466,
"learning_rate": 9.882233159632297e-06,
"loss": 0.6273,
"step": 170
},
{
"epoch": 0.5169937769267592,
"grad_norm": 0.4990097338675921,
"learning_rate": 9.843414637252615e-06,
"loss": 0.6351,
"step": 180
},
{
"epoch": 0.5457156534226902,
"grad_norm": 0.4721715881606927,
"learning_rate": 9.79917511122509e-06,
"loss": 0.6174,
"step": 190
},
{
"epoch": 0.5744375299186214,
"grad_norm": 0.49937447981211275,
"learning_rate": 9.749564096747148e-06,
"loss": 0.619,
"step": 200
},
{
"epoch": 0.6031594064145525,
"grad_norm": 0.5273757739930043,
"learning_rate": 9.694637121067764e-06,
"loss": 0.6212,
"step": 210
},
{
"epoch": 0.6318812829104835,
"grad_norm": 0.5688431120752676,
"learning_rate": 9.63445566133846e-06,
"loss": 0.6272,
"step": 220
},
{
"epoch": 0.6606031594064146,
"grad_norm": 0.5845365795446384,
"learning_rate": 9.569087075804842e-06,
"loss": 0.6313,
"step": 230
},
{
"epoch": 0.6893250359023456,
"grad_norm": 0.5495325867850296,
"learning_rate": 9.498604528415731e-06,
"loss": 0.625,
"step": 240
},
{
"epoch": 0.7180469123982767,
"grad_norm": 0.5031448369891197,
"learning_rate": 9.423086906934228e-06,
"loss": 0.6248,
"step": 250
},
{
"epoch": 0.7467687888942077,
"grad_norm": 0.4900365577299398,
"learning_rate": 9.342618734642395e-06,
"loss": 0.6142,
"step": 260
},
{
"epoch": 0.7754906653901388,
"grad_norm": 0.5315649843783232,
"learning_rate": 9.257290075738365e-06,
"loss": 0.6016,
"step": 270
},
{
"epoch": 0.8042125418860699,
"grad_norm": 0.5256710163299946,
"learning_rate": 9.16719643453177e-06,
"loss": 0.6224,
"step": 280
},
{
"epoch": 0.832934418382001,
"grad_norm": 0.5138537637831458,
"learning_rate": 9.072438648550304e-06,
"loss": 0.6118,
"step": 290
},
{
"epoch": 0.861656294877932,
"grad_norm": 0.5363724694729143,
"learning_rate": 8.973122775677078e-06,
"loss": 0.5975,
"step": 300
},
{
"epoch": 0.8903781713738631,
"grad_norm": 0.5338491338258355,
"learning_rate": 8.869359975445085e-06,
"loss": 0.5976,
"step": 310
},
{
"epoch": 0.9191000478697942,
"grad_norm": 0.5042648527170004,
"learning_rate": 8.761266384621599e-06,
"loss": 0.6042,
"step": 320
},
{
"epoch": 0.9478219243657252,
"grad_norm": 0.6176720610638503,
"learning_rate": 8.648962987221837e-06,
"loss": 0.6096,
"step": 330
},
{
"epoch": 0.9765438008616563,
"grad_norm": 0.48986374397556326,
"learning_rate": 8.532575479097294e-06,
"loss": 0.5895,
"step": 340
},
{
"epoch": 1.0028721876495932,
"grad_norm": 1.3018686522828506,
"learning_rate": 8.412234127250353e-06,
"loss": 0.6163,
"step": 350
},
{
"epoch": 1.0315940641455241,
"grad_norm": 0.5832838255643433,
"learning_rate": 8.288073624032634e-06,
"loss": 0.5642,
"step": 360
},
{
"epoch": 1.0603159406414553,
"grad_norm": 0.5254324766786693,
"learning_rate": 8.160232936390239e-06,
"loss": 0.5657,
"step": 370
},
{
"epoch": 1.0890378171373862,
"grad_norm": 0.49615638486827607,
"learning_rate": 8.02885515032467e-06,
"loss": 0.5665,
"step": 380
},
{
"epoch": 1.1177596936333174,
"grad_norm": 0.5919689080906818,
"learning_rate": 7.894087310743468e-06,
"loss": 0.5658,
"step": 390
},
{
"epoch": 1.1464815701292483,
"grad_norm": 0.5210878612928057,
"learning_rate": 7.756080256879837e-06,
"loss": 0.544,
"step": 400
},
{
"epoch": 1.1752034466251795,
"grad_norm": 0.5194215534501085,
"learning_rate": 7.614988453465469e-06,
"loss": 0.5715,
"step": 410
},
{
"epoch": 1.2039253231211107,
"grad_norm": 0.4945404206989309,
"learning_rate": 7.470969817845518e-06,
"loss": 0.5568,
"step": 420
},
{
"epoch": 1.2326471996170416,
"grad_norm": 0.5434010317710215,
"learning_rate": 7.324185543229226e-06,
"loss": 0.5604,
"step": 430
},
{
"epoch": 1.2613690761129728,
"grad_norm": 0.49174162188431164,
"learning_rate": 7.174799918274018e-06,
"loss": 0.5594,
"step": 440
},
{
"epoch": 1.2900909526089037,
"grad_norm": 0.442685394222738,
"learning_rate": 7.022980143205046e-06,
"loss": 0.5738,
"step": 450
},
{
"epoch": 1.3188128291048349,
"grad_norm": 0.5397145615923644,
"learning_rate": 6.868896142675903e-06,
"loss": 0.5744,
"step": 460
},
{
"epoch": 1.3475347056007658,
"grad_norm": 0.49715490338776275,
"learning_rate": 6.712720375580057e-06,
"loss": 0.5738,
"step": 470
},
{
"epoch": 1.376256582096697,
"grad_norm": 0.46659441221308456,
"learning_rate": 6.554627642025807e-06,
"loss": 0.5623,
"step": 480
},
{
"epoch": 1.4049784585926282,
"grad_norm": 0.5399824841470794,
"learning_rate": 6.394794887690838e-06,
"loss": 0.5652,
"step": 490
},
{
"epoch": 1.433700335088559,
"grad_norm": 0.46213954264438495,
"learning_rate": 6.233401005775339e-06,
"loss": 0.5628,
"step": 500
},
{
"epoch": 1.4624222115844903,
"grad_norm": 0.4959443093326465,
"learning_rate": 6.070626636775349e-06,
"loss": 0.5649,
"step": 510
},
{
"epoch": 1.4911440880804212,
"grad_norm": 0.5041986397517726,
"learning_rate": 5.906653966300444e-06,
"loss": 0.5722,
"step": 520
},
{
"epoch": 1.5198659645763524,
"grad_norm": 0.4083574928666145,
"learning_rate": 5.741666521162055e-06,
"loss": 0.5484,
"step": 530
},
{
"epoch": 1.5485878410722833,
"grad_norm": 0.44781946959219954,
"learning_rate": 5.575848963960621e-06,
"loss": 0.5593,
"step": 540
},
{
"epoch": 1.5773097175682145,
"grad_norm": 0.5286251349151584,
"learning_rate": 5.4093868864015405e-06,
"loss": 0.5844,
"step": 550
},
{
"epoch": 1.6060315940641456,
"grad_norm": 0.5023049460335103,
"learning_rate": 5.24246660157119e-06,
"loss": 0.5621,
"step": 560
},
{
"epoch": 1.6347534705600766,
"grad_norm": 0.4704671038611138,
"learning_rate": 5.075274935405554e-06,
"loss": 0.5718,
"step": 570
},
{
"epoch": 1.6634753470560075,
"grad_norm": 0.5090581682145832,
"learning_rate": 4.90799901758484e-06,
"loss": 0.5633,
"step": 580
},
{
"epoch": 1.6921972235519387,
"grad_norm": 0.45798134505233995,
"learning_rate": 4.74082607208812e-06,
"loss": 0.5656,
"step": 590
},
{
"epoch": 1.7209191000478699,
"grad_norm": 0.48981735934307813,
"learning_rate": 4.573943207642452e-06,
"loss": 0.5606,
"step": 600
},
{
"epoch": 1.7496409765438008,
"grad_norm": 0.4699165430870941,
"learning_rate": 4.407537208300957e-06,
"loss": 0.5614,
"step": 610
},
{
"epoch": 1.778362853039732,
"grad_norm": 0.47494342485042146,
"learning_rate": 4.241794324384334e-06,
"loss": 0.5522,
"step": 620
},
{
"epoch": 1.8070847295356631,
"grad_norm": 0.5178845713340307,
"learning_rate": 4.076900064019721e-06,
"loss": 0.5595,
"step": 630
},
{
"epoch": 1.835806606031594,
"grad_norm": 0.5219813023574859,
"learning_rate": 3.91303898551028e-06,
"loss": 0.564,
"step": 640
},
{
"epoch": 1.864528482527525,
"grad_norm": 0.4574407490031541,
"learning_rate": 3.7503944907678543e-06,
"loss": 0.5637,
"step": 650
},
{
"epoch": 1.8932503590234562,
"grad_norm": 0.47969146853460126,
"learning_rate": 3.5891486200399413e-06,
"loss": 0.5576,
"step": 660
},
{
"epoch": 1.9219722355193873,
"grad_norm": 0.4131894295273494,
"learning_rate": 3.429481848160702e-06,
"loss": 0.5502,
"step": 670
},
{
"epoch": 1.9506941120153183,
"grad_norm": 0.41395383387812457,
"learning_rate": 3.2715728825540525e-06,
"loss": 0.5461,
"step": 680
},
{
"epoch": 1.9794159885112494,
"grad_norm": 0.45923582804755875,
"learning_rate": 3.1155984632149565e-06,
"loss": 0.5651,
"step": 690
},
{
"epoch": 2.0057443752991864,
"grad_norm": 0.4977440233074294,
"learning_rate": 2.961733164892744e-06,
"loss": 0.5437,
"step": 700
},
{
"epoch": 2.0344662517951173,
"grad_norm": 0.48802663653181644,
"learning_rate": 2.8101492016979027e-06,
"loss": 0.5382,
"step": 710
},
{
"epoch": 2.0631881282910483,
"grad_norm": 0.4673676186684926,
"learning_rate": 2.6610162343510183e-06,
"loss": 0.5221,
"step": 720
},
{
"epoch": 2.091910004786979,
"grad_norm": 0.39392152443459205,
"learning_rate": 2.5145011802895835e-06,
"loss": 0.5362,
"step": 730
},
{
"epoch": 2.1206318812829106,
"grad_norm": 0.40247433394922916,
"learning_rate": 2.370768026845276e-06,
"loss": 0.5258,
"step": 740
},
{
"epoch": 2.1493537577788415,
"grad_norm": 0.38912377413533783,
"learning_rate": 2.2299776477007073e-06,
"loss": 0.5269,
"step": 750
},
{
"epoch": 2.1780756342747725,
"grad_norm": 0.44606591149003666,
"learning_rate": 2.0922876228311833e-06,
"loss": 0.5183,
"step": 760
},
{
"epoch": 2.206797510770704,
"grad_norm": 0.42032893014171974,
"learning_rate": 1.957852062132924e-06,
"loss": 0.5275,
"step": 770
},
{
"epoch": 2.235519387266635,
"grad_norm": 0.501692538983492,
"learning_rate": 1.8268214329351797e-06,
"loss": 0.5319,
"step": 780
},
{
"epoch": 2.2642412637625657,
"grad_norm": 0.4398732872682357,
"learning_rate": 1.6993423915893241e-06,
"loss": 0.5394,
"step": 790
},
{
"epoch": 2.2929631402584967,
"grad_norm": 0.4426749294047621,
"learning_rate": 1.575557619323353e-06,
"loss": 0.5323,
"step": 800
},
{
"epoch": 2.321685016754428,
"grad_norm": 0.44560301173068867,
"learning_rate": 1.4556056625455922e-06,
"loss": 0.5308,
"step": 810
},
{
"epoch": 2.350406893250359,
"grad_norm": 0.47993419658063147,
"learning_rate": 1.3396207777762732e-06,
"loss": 0.5143,
"step": 820
},
{
"epoch": 2.37912876974629,
"grad_norm": 0.4643782762709979,
"learning_rate": 1.2277327813806123e-06,
"loss": 0.5341,
"step": 830
},
{
"epoch": 2.4078506462422213,
"grad_norm": 0.4648802469670571,
"learning_rate": 1.1200669042715163e-06,
"loss": 0.5395,
"step": 840
},
{
"epoch": 2.4365725227381523,
"grad_norm": 0.4388112871344682,
"learning_rate": 1.0167436517445777e-06,
"loss": 0.5198,
"step": 850
},
{
"epoch": 2.4652943992340832,
"grad_norm": 0.4561716886355634,
"learning_rate": 9.178786686022417e-07,
"loss": 0.5347,
"step": 860
},
{
"epoch": 2.494016275730014,
"grad_norm": 0.4516982561274727,
"learning_rate": 8.235826097180566e-07,
"loss": 0.5358,
"step": 870
},
{
"epoch": 2.5227381522259456,
"grad_norm": 0.4293278109349354,
"learning_rate": 7.339610161859618e-07,
"loss": 0.5363,
"step": 880
},
{
"epoch": 2.5514600287218765,
"grad_norm": 0.4418535368694303,
"learning_rate": 6.49114197193137e-07,
"loss": 0.5229,
"step": 890
},
{
"epoch": 2.5801819052178074,
"grad_norm": 0.5092110974871947,
"learning_rate": 5.691371177487215e-07,
"loss": 0.5211,
"step": 900
},
{
"epoch": 2.608903781713739,
"grad_norm": 0.40169505212972373,
"learning_rate": 4.941192923939769e-07,
"loss": 0.522,
"step": 910
},
{
"epoch": 2.6376256582096698,
"grad_norm": 0.42628243978147184,
"learning_rate": 4.2414468501293217e-07,
"loss": 0.5351,
"step": 920
},
{
"epoch": 2.6663475347056007,
"grad_norm": 0.4205893212947504,
"learning_rate": 3.5929161485559694e-07,
"loss": 0.5359,
"step": 930
},
{
"epoch": 2.6950694112015317,
"grad_norm": 0.43814866165359284,
"learning_rate": 2.9963266887894526e-07,
"loss": 0.5324,
"step": 940
},
{
"epoch": 2.723791287697463,
"grad_norm": 0.4397081557598701,
"learning_rate": 2.4523462050379864e-07,
"loss": 0.5487,
"step": 950
},
{
"epoch": 2.752513164193394,
"grad_norm": 0.4086230507321953,
"learning_rate": 1.9615835487849677e-07,
"loss": 0.5116,
"step": 960
},
{
"epoch": 2.781235040689325,
"grad_norm": 0.38229316714447675,
"learning_rate": 1.5245880073305963e-07,
"loss": 0.5352,
"step": 970
},
{
"epoch": 2.8099569171852563,
"grad_norm": 0.43419896659737084,
"learning_rate": 1.1418486890006574e-07,
"loss": 0.5208,
"step": 980
},
{
"epoch": 2.8386787936811873,
"grad_norm": 0.4572008563089457,
"learning_rate": 8.137939757108526e-08,
"loss": 0.5408,
"step": 990
},
{
"epoch": 2.867400670177118,
"grad_norm": 0.40361529979394384,
"learning_rate": 5.4079104349929465e-08,
"loss": 0.5343,
"step": 1000
},
{
"epoch": 2.896122546673049,
"grad_norm": 0.42876437242652693,
"learning_rate": 3.231454515638221e-08,
"loss": 0.5194,
"step": 1010
},
{
"epoch": 2.9248444231689805,
"grad_norm": 0.4906146123977682,
"learning_rate": 1.6110080026414123e-08,
"loss": 0.533,
"step": 1020
},
{
"epoch": 2.9535662996649115,
"grad_norm": 0.44570381041730334,
"learning_rate": 5.483845847151226e-09,
"loss": 0.5284,
"step": 1030
},
{
"epoch": 2.9822881761608424,
"grad_norm": 0.41654981591864015,
"learning_rate": 4.4773605712089554e-10,
"loss": 0.513,
"step": 1040
},
{
"epoch": 2.9937769267592147,
"step": 1044,
"total_flos": 372760519114752.0,
"train_loss": 0.5842336475620782,
"train_runtime": 42367.2333,
"train_samples_per_second": 1.183,
"train_steps_per_second": 0.025
}
],
"logging_steps": 10,
"max_steps": 1044,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 372760519114752.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}