{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.999346832135859,
"eval_steps": 500,
"global_step": 1722,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.017417809710428913,
"grad_norm": 10.378825586435497,
"learning_rate": 5e-06,
"loss": 1.0838,
"step": 10
},
{
"epoch": 0.034835619420857826,
"grad_norm": 4.271546912407876,
"learning_rate": 5e-06,
"loss": 0.9373,
"step": 20
},
{
"epoch": 0.05225342913128674,
"grad_norm": 2.1390120781164823,
"learning_rate": 5e-06,
"loss": 0.8909,
"step": 30
},
{
"epoch": 0.06967123884171565,
"grad_norm": 1.4309168874054254,
"learning_rate": 5e-06,
"loss": 0.8639,
"step": 40
},
{
"epoch": 0.08708904855214457,
"grad_norm": 1.5406276876095664,
"learning_rate": 5e-06,
"loss": 0.8433,
"step": 50
},
{
"epoch": 0.10450685826257348,
"grad_norm": 1.2501465404617618,
"learning_rate": 5e-06,
"loss": 0.8247,
"step": 60
},
{
"epoch": 0.1219246679730024,
"grad_norm": 0.9239834135671998,
"learning_rate": 5e-06,
"loss": 0.8126,
"step": 70
},
{
"epoch": 0.1393424776834313,
"grad_norm": 0.8433941860985329,
"learning_rate": 5e-06,
"loss": 0.8046,
"step": 80
},
{
"epoch": 0.15676028739386022,
"grad_norm": 1.097647182335265,
"learning_rate": 5e-06,
"loss": 0.796,
"step": 90
},
{
"epoch": 0.17417809710428914,
"grad_norm": 1.0683416519694173,
"learning_rate": 5e-06,
"loss": 0.7903,
"step": 100
},
{
"epoch": 0.19159590681471805,
"grad_norm": 1.572717136046009,
"learning_rate": 5e-06,
"loss": 0.7846,
"step": 110
},
{
"epoch": 0.20901371652514697,
"grad_norm": 0.9249575894994505,
"learning_rate": 5e-06,
"loss": 0.7806,
"step": 120
},
{
"epoch": 0.2264315262355759,
"grad_norm": 0.8193084838390464,
"learning_rate": 5e-06,
"loss": 0.7783,
"step": 130
},
{
"epoch": 0.2438493359460048,
"grad_norm": 0.8732647330500742,
"learning_rate": 5e-06,
"loss": 0.7725,
"step": 140
},
{
"epoch": 0.2612671456564337,
"grad_norm": 0.7448079044745952,
"learning_rate": 5e-06,
"loss": 0.773,
"step": 150
},
{
"epoch": 0.2786849553668626,
"grad_norm": 0.9898862055097742,
"learning_rate": 5e-06,
"loss": 0.7695,
"step": 160
},
{
"epoch": 0.29610276507729155,
"grad_norm": 0.7272776397879028,
"learning_rate": 5e-06,
"loss": 0.7705,
"step": 170
},
{
"epoch": 0.31352057478772044,
"grad_norm": 1.0826222911505594,
"learning_rate": 5e-06,
"loss": 0.7737,
"step": 180
},
{
"epoch": 0.3309383844981494,
"grad_norm": 0.6748830770235253,
"learning_rate": 5e-06,
"loss": 0.7611,
"step": 190
},
{
"epoch": 0.3483561942085783,
"grad_norm": 0.7359625796839507,
"learning_rate": 5e-06,
"loss": 0.7609,
"step": 200
},
{
"epoch": 0.36577400391900716,
"grad_norm": 0.6962279114859433,
"learning_rate": 5e-06,
"loss": 0.756,
"step": 210
},
{
"epoch": 0.3831918136294361,
"grad_norm": 0.6255149151141138,
"learning_rate": 5e-06,
"loss": 0.7617,
"step": 220
},
{
"epoch": 0.400609623339865,
"grad_norm": 0.7052369366480614,
"learning_rate": 5e-06,
"loss": 0.7569,
"step": 230
},
{
"epoch": 0.41802743305029394,
"grad_norm": 0.6097155207889393,
"learning_rate": 5e-06,
"loss": 0.7546,
"step": 240
},
{
"epoch": 0.43544524276072283,
"grad_norm": 0.6836967619335058,
"learning_rate": 5e-06,
"loss": 0.7613,
"step": 250
},
{
"epoch": 0.4528630524711518,
"grad_norm": 0.5747669710959988,
"learning_rate": 5e-06,
"loss": 0.7568,
"step": 260
},
{
"epoch": 0.47028086218158066,
"grad_norm": 0.8293746415280457,
"learning_rate": 5e-06,
"loss": 0.755,
"step": 270
},
{
"epoch": 0.4876986718920096,
"grad_norm": 0.6240762658588678,
"learning_rate": 5e-06,
"loss": 0.7512,
"step": 280
},
{
"epoch": 0.5051164816024385,
"grad_norm": 0.604817000368227,
"learning_rate": 5e-06,
"loss": 0.7565,
"step": 290
},
{
"epoch": 0.5225342913128674,
"grad_norm": 0.7975782732234052,
"learning_rate": 5e-06,
"loss": 0.7536,
"step": 300
},
{
"epoch": 0.5399521010232963,
"grad_norm": 0.9097401207787359,
"learning_rate": 5e-06,
"loss": 0.7449,
"step": 310
},
{
"epoch": 0.5573699107337252,
"grad_norm": 0.5850865591811083,
"learning_rate": 5e-06,
"loss": 0.7476,
"step": 320
},
{
"epoch": 0.5747877204441542,
"grad_norm": 0.7135182489392279,
"learning_rate": 5e-06,
"loss": 0.7446,
"step": 330
},
{
"epoch": 0.5922055301545831,
"grad_norm": 0.7289623389467091,
"learning_rate": 5e-06,
"loss": 0.7461,
"step": 340
},
{
"epoch": 0.6096233398650119,
"grad_norm": 0.7209085909740518,
"learning_rate": 5e-06,
"loss": 0.7441,
"step": 350
},
{
"epoch": 0.6270411495754409,
"grad_norm": 0.7124009797840823,
"learning_rate": 5e-06,
"loss": 0.7426,
"step": 360
},
{
"epoch": 0.6444589592858698,
"grad_norm": 0.7812743047985142,
"learning_rate": 5e-06,
"loss": 0.7425,
"step": 370
},
{
"epoch": 0.6618767689962988,
"grad_norm": 0.9348863468860681,
"learning_rate": 5e-06,
"loss": 0.7426,
"step": 380
},
{
"epoch": 0.6792945787067276,
"grad_norm": 0.8887235387086234,
"learning_rate": 5e-06,
"loss": 0.7443,
"step": 390
},
{
"epoch": 0.6967123884171565,
"grad_norm": 0.5644550564287659,
"learning_rate": 5e-06,
"loss": 0.7462,
"step": 400
},
{
"epoch": 0.7141301981275855,
"grad_norm": 0.7133759479394518,
"learning_rate": 5e-06,
"loss": 0.7392,
"step": 410
},
{
"epoch": 0.7315480078380143,
"grad_norm": 0.6322425713640688,
"learning_rate": 5e-06,
"loss": 0.7438,
"step": 420
},
{
"epoch": 0.7489658175484433,
"grad_norm": 0.653356418355363,
"learning_rate": 5e-06,
"loss": 0.7353,
"step": 430
},
{
"epoch": 0.7663836272588722,
"grad_norm": 0.7172550980138845,
"learning_rate": 5e-06,
"loss": 0.7362,
"step": 440
},
{
"epoch": 0.7838014369693012,
"grad_norm": 0.5961758862117608,
"learning_rate": 5e-06,
"loss": 0.7374,
"step": 450
},
{
"epoch": 0.80121924667973,
"grad_norm": 0.682099366244341,
"learning_rate": 5e-06,
"loss": 0.7361,
"step": 460
},
{
"epoch": 0.8186370563901589,
"grad_norm": 0.6035807917106119,
"learning_rate": 5e-06,
"loss": 0.7394,
"step": 470
},
{
"epoch": 0.8360548661005879,
"grad_norm": 0.6420432430762913,
"learning_rate": 5e-06,
"loss": 0.7427,
"step": 480
},
{
"epoch": 0.8534726758110167,
"grad_norm": 0.6771119733640588,
"learning_rate": 5e-06,
"loss": 0.7423,
"step": 490
},
{
"epoch": 0.8708904855214457,
"grad_norm": 0.7126075005956054,
"learning_rate": 5e-06,
"loss": 0.7383,
"step": 500
},
{
"epoch": 0.8883082952318746,
"grad_norm": 0.7584360793951261,
"learning_rate": 5e-06,
"loss": 0.7321,
"step": 510
},
{
"epoch": 0.9057261049423035,
"grad_norm": 0.7439364499749611,
"learning_rate": 5e-06,
"loss": 0.7371,
"step": 520
},
{
"epoch": 0.9231439146527324,
"grad_norm": 0.6529063351558732,
"learning_rate": 5e-06,
"loss": 0.74,
"step": 530
},
{
"epoch": 0.9405617243631613,
"grad_norm": 0.6579238997317206,
"learning_rate": 5e-06,
"loss": 0.7339,
"step": 540
},
{
"epoch": 0.9579795340735903,
"grad_norm": 0.6529747723949795,
"learning_rate": 5e-06,
"loss": 0.7366,
"step": 550
},
{
"epoch": 0.9753973437840192,
"grad_norm": 0.7465822819939318,
"learning_rate": 5e-06,
"loss": 0.7329,
"step": 560
},
{
"epoch": 0.992815153494448,
"grad_norm": 0.6744574449718616,
"learning_rate": 5e-06,
"loss": 0.7336,
"step": 570
},
{
"epoch": 0.9997822773786197,
"eval_loss": 0.7317857146263123,
"eval_runtime": 402.0076,
"eval_samples_per_second": 38.482,
"eval_steps_per_second": 0.602,
"step": 574
},
{
"epoch": 1.010232963204877,
"grad_norm": 0.7094322700801018,
"learning_rate": 5e-06,
"loss": 0.763,
"step": 580
},
{
"epoch": 1.027650772915306,
"grad_norm": 0.619008567608486,
"learning_rate": 5e-06,
"loss": 0.6821,
"step": 590
},
{
"epoch": 1.0450685826257349,
"grad_norm": 0.7720080238431899,
"learning_rate": 5e-06,
"loss": 0.6814,
"step": 600
},
{
"epoch": 1.0624863923361638,
"grad_norm": 0.613074115463787,
"learning_rate": 5e-06,
"loss": 0.6871,
"step": 610
},
{
"epoch": 1.0799042020465925,
"grad_norm": 0.8529977128112323,
"learning_rate": 5e-06,
"loss": 0.6916,
"step": 620
},
{
"epoch": 1.0973220117570215,
"grad_norm": 0.6185916445154745,
"learning_rate": 5e-06,
"loss": 0.6852,
"step": 630
},
{
"epoch": 1.1147398214674504,
"grad_norm": 0.5960552407712587,
"learning_rate": 5e-06,
"loss": 0.6902,
"step": 640
},
{
"epoch": 1.1321576311778794,
"grad_norm": 0.6992739712457544,
"learning_rate": 5e-06,
"loss": 0.6838,
"step": 650
},
{
"epoch": 1.1495754408883083,
"grad_norm": 0.6896093508843089,
"learning_rate": 5e-06,
"loss": 0.6883,
"step": 660
},
{
"epoch": 1.1669932505987373,
"grad_norm": 0.7618446264021529,
"learning_rate": 5e-06,
"loss": 0.6866,
"step": 670
},
{
"epoch": 1.1844110603091662,
"grad_norm": 0.5993982811906263,
"learning_rate": 5e-06,
"loss": 0.6864,
"step": 680
},
{
"epoch": 1.201828870019595,
"grad_norm": 0.7774490221678223,
"learning_rate": 5e-06,
"loss": 0.6904,
"step": 690
},
{
"epoch": 1.2192466797300239,
"grad_norm": 0.7209322592758776,
"learning_rate": 5e-06,
"loss": 0.6843,
"step": 700
},
{
"epoch": 1.2366644894404528,
"grad_norm": 0.5933996592162579,
"learning_rate": 5e-06,
"loss": 0.6858,
"step": 710
},
{
"epoch": 1.2540822991508818,
"grad_norm": 0.603096793298619,
"learning_rate": 5e-06,
"loss": 0.6895,
"step": 720
},
{
"epoch": 1.2715001088613107,
"grad_norm": 0.6254264453133531,
"learning_rate": 5e-06,
"loss": 0.6848,
"step": 730
},
{
"epoch": 1.2889179185717397,
"grad_norm": 0.6255696330381725,
"learning_rate": 5e-06,
"loss": 0.6846,
"step": 740
},
{
"epoch": 1.3063357282821686,
"grad_norm": 0.5954700919121718,
"learning_rate": 5e-06,
"loss": 0.6857,
"step": 750
},
{
"epoch": 1.3237535379925975,
"grad_norm": 0.605017152001749,
"learning_rate": 5e-06,
"loss": 0.6833,
"step": 760
},
{
"epoch": 1.3411713477030263,
"grad_norm": 0.6374571812243335,
"learning_rate": 5e-06,
"loss": 0.6835,
"step": 770
},
{
"epoch": 1.3585891574134552,
"grad_norm": 0.7034685309687484,
"learning_rate": 5e-06,
"loss": 0.6862,
"step": 780
},
{
"epoch": 1.3760069671238842,
"grad_norm": 0.6478857838671078,
"learning_rate": 5e-06,
"loss": 0.6834,
"step": 790
},
{
"epoch": 1.393424776834313,
"grad_norm": 0.6774648068840645,
"learning_rate": 5e-06,
"loss": 0.6835,
"step": 800
},
{
"epoch": 1.410842586544742,
"grad_norm": 0.7999211706624636,
"learning_rate": 5e-06,
"loss": 0.6849,
"step": 810
},
{
"epoch": 1.428260396255171,
"grad_norm": 0.7093553520320318,
"learning_rate": 5e-06,
"loss": 0.6874,
"step": 820
},
{
"epoch": 1.4456782059655997,
"grad_norm": 0.6339150451411103,
"learning_rate": 5e-06,
"loss": 0.6817,
"step": 830
},
{
"epoch": 1.4630960156760286,
"grad_norm": 0.6378354921425449,
"learning_rate": 5e-06,
"loss": 0.687,
"step": 840
},
{
"epoch": 1.4805138253864576,
"grad_norm": 0.6060950779002315,
"learning_rate": 5e-06,
"loss": 0.6852,
"step": 850
},
{
"epoch": 1.4979316350968865,
"grad_norm": 0.7027258672686724,
"learning_rate": 5e-06,
"loss": 0.6866,
"step": 860
},
{
"epoch": 1.5153494448073155,
"grad_norm": 0.6241403475122037,
"learning_rate": 5e-06,
"loss": 0.6831,
"step": 870
},
{
"epoch": 1.5327672545177444,
"grad_norm": 0.7090161192507007,
"learning_rate": 5e-06,
"loss": 0.6827,
"step": 880
},
{
"epoch": 1.5501850642281734,
"grad_norm": 0.6858971984003965,
"learning_rate": 5e-06,
"loss": 0.6792,
"step": 890
},
{
"epoch": 1.5676028739386023,
"grad_norm": 0.6546584740097385,
"learning_rate": 5e-06,
"loss": 0.6806,
"step": 900
},
{
"epoch": 1.5850206836490313,
"grad_norm": 0.6020087571697199,
"learning_rate": 5e-06,
"loss": 0.6799,
"step": 910
},
{
"epoch": 1.6024384933594602,
"grad_norm": 0.6501182817858244,
"learning_rate": 5e-06,
"loss": 0.6857,
"step": 920
},
{
"epoch": 1.619856303069889,
"grad_norm": 0.584870256422628,
"learning_rate": 5e-06,
"loss": 0.6803,
"step": 930
},
{
"epoch": 1.6372741127803179,
"grad_norm": 0.6317587237334846,
"learning_rate": 5e-06,
"loss": 0.6855,
"step": 940
},
{
"epoch": 1.6546919224907468,
"grad_norm": 0.5912078658288651,
"learning_rate": 5e-06,
"loss": 0.6831,
"step": 950
},
{
"epoch": 1.6721097322011755,
"grad_norm": 0.6128368243065496,
"learning_rate": 5e-06,
"loss": 0.6808,
"step": 960
},
{
"epoch": 1.6895275419116045,
"grad_norm": 0.7946845578841991,
"learning_rate": 5e-06,
"loss": 0.6878,
"step": 970
},
{
"epoch": 1.7069453516220334,
"grad_norm": 0.6757515701163817,
"learning_rate": 5e-06,
"loss": 0.6854,
"step": 980
},
{
"epoch": 1.7243631613324624,
"grad_norm": 0.6571419210311429,
"learning_rate": 5e-06,
"loss": 0.6823,
"step": 990
},
{
"epoch": 1.7417809710428913,
"grad_norm": 0.7598031996788027,
"learning_rate": 5e-06,
"loss": 0.6797,
"step": 1000
},
{
"epoch": 1.7591987807533203,
"grad_norm": 0.6670752253306316,
"learning_rate": 5e-06,
"loss": 0.6842,
"step": 1010
},
{
"epoch": 1.7766165904637492,
"grad_norm": 0.6401962122300333,
"learning_rate": 5e-06,
"loss": 0.6812,
"step": 1020
},
{
"epoch": 1.7940344001741781,
"grad_norm": 0.7117175290423249,
"learning_rate": 5e-06,
"loss": 0.6824,
"step": 1030
},
{
"epoch": 1.811452209884607,
"grad_norm": 0.5800625845659623,
"learning_rate": 5e-06,
"loss": 0.6803,
"step": 1040
},
{
"epoch": 1.828870019595036,
"grad_norm": 0.7320040356270946,
"learning_rate": 5e-06,
"loss": 0.6803,
"step": 1050
},
{
"epoch": 1.846287829305465,
"grad_norm": 0.6283167554926344,
"learning_rate": 5e-06,
"loss": 0.6839,
"step": 1060
},
{
"epoch": 1.8637056390158937,
"grad_norm": 0.8371943589394946,
"learning_rate": 5e-06,
"loss": 0.6812,
"step": 1070
},
{
"epoch": 1.8811234487263226,
"grad_norm": 0.6257083963805145,
"learning_rate": 5e-06,
"loss": 0.6838,
"step": 1080
},
{
"epoch": 1.8985412584367516,
"grad_norm": 0.5986261297946587,
"learning_rate": 5e-06,
"loss": 0.6797,
"step": 1090
},
{
"epoch": 1.9159590681471805,
"grad_norm": 0.6170171924536151,
"learning_rate": 5e-06,
"loss": 0.6797,
"step": 1100
},
{
"epoch": 1.9333768778576093,
"grad_norm": 0.5700527826150651,
"learning_rate": 5e-06,
"loss": 0.681,
"step": 1110
},
{
"epoch": 1.9507946875680382,
"grad_norm": 0.7918614747451274,
"learning_rate": 5e-06,
"loss": 0.6836,
"step": 1120
},
{
"epoch": 1.9682124972784671,
"grad_norm": 0.6423488525843636,
"learning_rate": 5e-06,
"loss": 0.6841,
"step": 1130
},
{
"epoch": 1.985630306988896,
"grad_norm": 0.5727071414052627,
"learning_rate": 5e-06,
"loss": 0.6807,
"step": 1140
},
{
"epoch": 1.9995645547572392,
"eval_loss": 0.7203673124313354,
"eval_runtime": 394.6464,
"eval_samples_per_second": 39.2,
"eval_steps_per_second": 0.613,
"step": 1148
},
{
"epoch": 2.003048116699325,
"grad_norm": 0.9442473115477681,
"learning_rate": 5e-06,
"loss": 0.7309,
"step": 1150
},
{
"epoch": 2.020465926409754,
"grad_norm": 0.7483105880106439,
"learning_rate": 5e-06,
"loss": 0.6325,
"step": 1160
},
{
"epoch": 2.037883736120183,
"grad_norm": 0.7540713563935749,
"learning_rate": 5e-06,
"loss": 0.6307,
"step": 1170
},
{
"epoch": 2.055301545830612,
"grad_norm": 0.6724851440947269,
"learning_rate": 5e-06,
"loss": 0.6337,
"step": 1180
},
{
"epoch": 2.072719355541041,
"grad_norm": 0.8087552584080454,
"learning_rate": 5e-06,
"loss": 0.6291,
"step": 1190
},
{
"epoch": 2.0901371652514698,
"grad_norm": 0.5977695571055209,
"learning_rate": 5e-06,
"loss": 0.6303,
"step": 1200
},
{
"epoch": 2.1075549749618987,
"grad_norm": 0.7569591780057143,
"learning_rate": 5e-06,
"loss": 0.63,
"step": 1210
},
{
"epoch": 2.1249727846723276,
"grad_norm": 0.7224022820617728,
"learning_rate": 5e-06,
"loss": 0.6319,
"step": 1220
},
{
"epoch": 2.142390594382756,
"grad_norm": 0.9103041866730269,
"learning_rate": 5e-06,
"loss": 0.6338,
"step": 1230
},
{
"epoch": 2.159808404093185,
"grad_norm": 0.6573119068550747,
"learning_rate": 5e-06,
"loss": 0.6315,
"step": 1240
},
{
"epoch": 2.177226213803614,
"grad_norm": 0.7130793786888794,
"learning_rate": 5e-06,
"loss": 0.6307,
"step": 1250
},
{
"epoch": 2.194644023514043,
"grad_norm": 0.6787527348130123,
"learning_rate": 5e-06,
"loss": 0.635,
"step": 1260
},
{
"epoch": 2.212061833224472,
"grad_norm": 0.7583316967190475,
"learning_rate": 5e-06,
"loss": 0.6328,
"step": 1270
},
{
"epoch": 2.229479642934901,
"grad_norm": 0.9232698200191256,
"learning_rate": 5e-06,
"loss": 0.6392,
"step": 1280
},
{
"epoch": 2.24689745264533,
"grad_norm": 0.7252346361915288,
"learning_rate": 5e-06,
"loss": 0.6347,
"step": 1290
},
{
"epoch": 2.2643152623557588,
"grad_norm": 0.6362752959635848,
"learning_rate": 5e-06,
"loss": 0.6304,
"step": 1300
},
{
"epoch": 2.2817330720661877,
"grad_norm": 0.6085958821854244,
"learning_rate": 5e-06,
"loss": 0.6354,
"step": 1310
},
{
"epoch": 2.2991508817766166,
"grad_norm": 0.6106358089516802,
"learning_rate": 5e-06,
"loss": 0.6319,
"step": 1320
},
{
"epoch": 2.3165686914870456,
"grad_norm": 0.6704766246511351,
"learning_rate": 5e-06,
"loss": 0.6361,
"step": 1330
},
{
"epoch": 2.3339865011974745,
"grad_norm": 0.7134352783772538,
"learning_rate": 5e-06,
"loss": 0.6363,
"step": 1340
},
{
"epoch": 2.3514043109079035,
"grad_norm": 0.691476396805842,
"learning_rate": 5e-06,
"loss": 0.6408,
"step": 1350
},
{
"epoch": 2.3688221206183324,
"grad_norm": 0.6591953333036876,
"learning_rate": 5e-06,
"loss": 0.6333,
"step": 1360
},
{
"epoch": 2.3862399303287614,
"grad_norm": 0.6492399404200755,
"learning_rate": 5e-06,
"loss": 0.6328,
"step": 1370
},
{
"epoch": 2.40365774003919,
"grad_norm": 0.615428906926297,
"learning_rate": 5e-06,
"loss": 0.6413,
"step": 1380
},
{
"epoch": 2.421075549749619,
"grad_norm": 0.6178851722594154,
"learning_rate": 5e-06,
"loss": 0.6362,
"step": 1390
},
{
"epoch": 2.4384933594600477,
"grad_norm": 0.6630448901048148,
"learning_rate": 5e-06,
"loss": 0.6354,
"step": 1400
},
{
"epoch": 2.4559111691704767,
"grad_norm": 0.6761352769262333,
"learning_rate": 5e-06,
"loss": 0.6364,
"step": 1410
},
{
"epoch": 2.4733289788809056,
"grad_norm": 0.6840809680625406,
"learning_rate": 5e-06,
"loss": 0.6347,
"step": 1420
},
{
"epoch": 2.4907467885913346,
"grad_norm": 0.7752711880925182,
"learning_rate": 5e-06,
"loss": 0.6375,
"step": 1430
},
{
"epoch": 2.5081645983017635,
"grad_norm": 0.671961987869029,
"learning_rate": 5e-06,
"loss": 0.638,
"step": 1440
},
{
"epoch": 2.5255824080121925,
"grad_norm": 0.6066582748163826,
"learning_rate": 5e-06,
"loss": 0.6356,
"step": 1450
},
{
"epoch": 2.5430002177226214,
"grad_norm": 0.6494968644059873,
"learning_rate": 5e-06,
"loss": 0.64,
"step": 1460
},
{
"epoch": 2.5604180274330504,
"grad_norm": 0.6819270998889235,
"learning_rate": 5e-06,
"loss": 0.6392,
"step": 1470
},
{
"epoch": 2.5778358371434793,
"grad_norm": 0.6720899123226914,
"learning_rate": 5e-06,
"loss": 0.638,
"step": 1480
},
{
"epoch": 2.5952536468539082,
"grad_norm": 0.8865903590224419,
"learning_rate": 5e-06,
"loss": 0.6418,
"step": 1490
},
{
"epoch": 2.612671456564337,
"grad_norm": 0.8255525182739956,
"learning_rate": 5e-06,
"loss": 0.637,
"step": 1500
},
{
"epoch": 2.6300892662747657,
"grad_norm": 0.6926869297282812,
"learning_rate": 5e-06,
"loss": 0.6359,
"step": 1510
},
{
"epoch": 2.647507075985195,
"grad_norm": 0.7064032548106364,
"learning_rate": 5e-06,
"loss": 0.6372,
"step": 1520
},
{
"epoch": 2.6649248856956236,
"grad_norm": 0.6147984872224924,
"learning_rate": 5e-06,
"loss": 0.6393,
"step": 1530
},
{
"epoch": 2.6823426954060525,
"grad_norm": 0.6504099699536218,
"learning_rate": 5e-06,
"loss": 0.6399,
"step": 1540
},
{
"epoch": 2.6997605051164815,
"grad_norm": 0.563483068339733,
"learning_rate": 5e-06,
"loss": 0.6348,
"step": 1550
},
{
"epoch": 2.7171783148269104,
"grad_norm": 0.730022824759867,
"learning_rate": 5e-06,
"loss": 0.6358,
"step": 1560
},
{
"epoch": 2.7345961245373394,
"grad_norm": 0.7435338593643929,
"learning_rate": 5e-06,
"loss": 0.6416,
"step": 1570
},
{
"epoch": 2.7520139342477683,
"grad_norm": 0.7041374525178048,
"learning_rate": 5e-06,
"loss": 0.6312,
"step": 1580
},
{
"epoch": 2.7694317439581972,
"grad_norm": 0.6185370005773447,
"learning_rate": 5e-06,
"loss": 0.6369,
"step": 1590
},
{
"epoch": 2.786849553668626,
"grad_norm": 0.597751813516521,
"learning_rate": 5e-06,
"loss": 0.6376,
"step": 1600
},
{
"epoch": 2.804267363379055,
"grad_norm": 0.6525590591893353,
"learning_rate": 5e-06,
"loss": 0.638,
"step": 1610
},
{
"epoch": 2.821685173089484,
"grad_norm": 0.6520966089133831,
"learning_rate": 5e-06,
"loss": 0.6363,
"step": 1620
},
{
"epoch": 2.839102982799913,
"grad_norm": 0.6318597492523834,
"learning_rate": 5e-06,
"loss": 0.6392,
"step": 1630
},
{
"epoch": 2.856520792510342,
"grad_norm": 0.7325044927855683,
"learning_rate": 5e-06,
"loss": 0.6369,
"step": 1640
},
{
"epoch": 2.873938602220771,
"grad_norm": 0.6253335500365848,
"learning_rate": 5e-06,
"loss": 0.636,
"step": 1650
},
{
"epoch": 2.8913564119311994,
"grad_norm": 0.6704613482673505,
"learning_rate": 5e-06,
"loss": 0.6376,
"step": 1660
},
{
"epoch": 2.908774221641629,
"grad_norm": 0.7273411977567759,
"learning_rate": 5e-06,
"loss": 0.6386,
"step": 1670
},
{
"epoch": 2.9261920313520573,
"grad_norm": 0.6268248741184522,
"learning_rate": 5e-06,
"loss": 0.6378,
"step": 1680
},
{
"epoch": 2.9436098410624862,
"grad_norm": 0.5745167165482074,
"learning_rate": 5e-06,
"loss": 0.6392,
"step": 1690
},
{
"epoch": 2.961027650772915,
"grad_norm": 0.6046462612164668,
"learning_rate": 5e-06,
"loss": 0.642,
"step": 1700
},
{
"epoch": 2.978445460483344,
"grad_norm": 0.6397198084499147,
"learning_rate": 5e-06,
"loss": 0.6381,
"step": 1710
},
{
"epoch": 2.995863270193773,
"grad_norm": 0.7413825248918653,
"learning_rate": 5e-06,
"loss": 0.638,
"step": 1720
},
{
"epoch": 2.999346832135859,
"eval_loss": 0.7229765057563782,
"eval_runtime": 389.254,
"eval_samples_per_second": 39.743,
"eval_steps_per_second": 0.622,
"step": 1722
},
{
"epoch": 2.999346832135859,
"step": 1722,
"total_flos": 2884204756992000.0,
"train_loss": 0.6974312729536448,
"train_runtime": 56418.3982,
"train_samples_per_second": 15.629,
"train_steps_per_second": 0.031
}
],
"logging_steps": 10,
"max_steps": 1722,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2884204756992000.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}