{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.888888888888889,
"eval_steps": 30,
"global_step": 2400,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.012044564890093345,
"grad_norm": 11.71904468536377,
"learning_rate": 7.2e-06,
"loss": 2.9594,
"step": 10
},
{
"epoch": 0.02408912978018669,
"grad_norm": 2.4383621215820312,
"learning_rate": 1.52e-05,
"loss": 1.5501,
"step": 20
},
{
"epoch": 0.036133694670280034,
"grad_norm": 0.7187284231185913,
"learning_rate": 2.32e-05,
"loss": 0.7478,
"step": 30
},
{
"epoch": 0.036133694670280034,
"eval_loss": 0.6954202651977539,
"eval_runtime": 81.243,
"eval_samples_per_second": 4.308,
"eval_steps_per_second": 2.154,
"step": 30
},
{
"epoch": 0.04817825956037338,
"grad_norm": 0.9893588423728943,
"learning_rate": 3.12e-05,
"loss": 0.6087,
"step": 40
},
{
"epoch": 0.06022282445046673,
"grad_norm": 0.6181250214576721,
"learning_rate": 3.9200000000000004e-05,
"loss": 0.4514,
"step": 50
},
{
"epoch": 0.07226738934056007,
"grad_norm": 0.5894028544425964,
"learning_rate": 4.72e-05,
"loss": 0.3809,
"step": 60
},
{
"epoch": 0.07226738934056007,
"eval_loss": 0.35219648480415344,
"eval_runtime": 80.7477,
"eval_samples_per_second": 4.334,
"eval_steps_per_second": 2.167,
"step": 60
},
{
"epoch": 0.08431195423065342,
"grad_norm": 3.429518222808838,
"learning_rate": 5.520000000000001e-05,
"loss": 0.2992,
"step": 70
},
{
"epoch": 0.09635651912074676,
"grad_norm": 0.6878916025161743,
"learning_rate": 6.32e-05,
"loss": 0.2921,
"step": 80
},
{
"epoch": 0.10840108401084012,
"grad_norm": 0.5862188339233398,
"learning_rate": 7.12e-05,
"loss": 0.2483,
"step": 90
},
{
"epoch": 0.10840108401084012,
"eval_loss": 0.22866909205913544,
"eval_runtime": 81.2164,
"eval_samples_per_second": 4.309,
"eval_steps_per_second": 2.155,
"step": 90
},
{
"epoch": 0.12044564890093346,
"grad_norm": 0.7386724948883057,
"learning_rate": 7.920000000000001e-05,
"loss": 0.2107,
"step": 100
},
{
"epoch": 0.1324902137910268,
"grad_norm": 0.6922011375427246,
"learning_rate": 8.72e-05,
"loss": 0.1962,
"step": 110
},
{
"epoch": 0.14453477868112014,
"grad_norm": 0.4779876470565796,
"learning_rate": 9.52e-05,
"loss": 0.1741,
"step": 120
},
{
"epoch": 0.14453477868112014,
"eval_loss": 0.18707558512687683,
"eval_runtime": 81.1016,
"eval_samples_per_second": 4.316,
"eval_steps_per_second": 2.158,
"step": 120
},
{
"epoch": 0.1565793435712135,
"grad_norm": 0.3711669445037842,
"learning_rate": 0.0001032,
"loss": 0.1775,
"step": 130
},
{
"epoch": 0.16862390846130684,
"grad_norm": 0.5507099032402039,
"learning_rate": 0.00011120000000000002,
"loss": 0.1663,
"step": 140
},
{
"epoch": 0.18066847335140018,
"grad_norm": 0.6897018551826477,
"learning_rate": 0.0001192,
"loss": 0.162,
"step": 150
},
{
"epoch": 0.18066847335140018,
"eval_loss": 0.1689070761203766,
"eval_runtime": 81.1763,
"eval_samples_per_second": 4.312,
"eval_steps_per_second": 2.156,
"step": 150
},
{
"epoch": 0.19271303824149352,
"grad_norm": 0.30447226762771606,
"learning_rate": 0.0001272,
"loss": 0.1654,
"step": 160
},
{
"epoch": 0.20475760313158686,
"grad_norm": 0.36739301681518555,
"learning_rate": 0.0001352,
"loss": 0.1692,
"step": 170
},
{
"epoch": 0.21680216802168023,
"grad_norm": 0.42459923028945923,
"learning_rate": 0.0001432,
"loss": 0.1479,
"step": 180
},
{
"epoch": 0.21680216802168023,
"eval_loss": 0.16287875175476074,
"eval_runtime": 81.3447,
"eval_samples_per_second": 4.303,
"eval_steps_per_second": 2.151,
"step": 180
},
{
"epoch": 0.22884673291177357,
"grad_norm": 0.24459399282932281,
"learning_rate": 0.00015120000000000002,
"loss": 0.1567,
"step": 190
},
{
"epoch": 0.2408912978018669,
"grad_norm": 0.29077818989753723,
"learning_rate": 0.00015920000000000002,
"loss": 0.1491,
"step": 200
},
{
"epoch": 0.2529358626919603,
"grad_norm": 0.3934674561023712,
"learning_rate": 0.0001672,
"loss": 0.1587,
"step": 210
},
{
"epoch": 0.2529358626919603,
"eval_loss": 0.1568612903356552,
"eval_runtime": 80.9345,
"eval_samples_per_second": 4.324,
"eval_steps_per_second": 2.162,
"step": 210
},
{
"epoch": 0.2649804275820536,
"grad_norm": 0.31920939683914185,
"learning_rate": 0.0001752,
"loss": 0.1622,
"step": 220
},
{
"epoch": 0.27702499247214696,
"grad_norm": 0.5031167268753052,
"learning_rate": 0.0001832,
"loss": 0.147,
"step": 230
},
{
"epoch": 0.28906955736224027,
"grad_norm": 0.2890608310699463,
"learning_rate": 0.0001912,
"loss": 0.1433,
"step": 240
},
{
"epoch": 0.28906955736224027,
"eval_loss": 0.15362557768821716,
"eval_runtime": 80.9234,
"eval_samples_per_second": 4.325,
"eval_steps_per_second": 2.163,
"step": 240
},
{
"epoch": 0.30111412225233364,
"grad_norm": 0.2849373519420624,
"learning_rate": 0.00019920000000000002,
"loss": 0.1471,
"step": 250
},
{
"epoch": 0.313158687142427,
"grad_norm": 0.263950914144516,
"learning_rate": 0.0001999920550627653,
"loss": 0.155,
"step": 260
},
{
"epoch": 0.3252032520325203,
"grad_norm": 0.31182682514190674,
"learning_rate": 0.00019996459270297992,
"loss": 0.138,
"step": 270
},
{
"epoch": 0.3252032520325203,
"eval_loss": 0.15008553862571716,
"eval_runtime": 80.9483,
"eval_samples_per_second": 4.324,
"eval_steps_per_second": 2.162,
"step": 270
},
{
"epoch": 0.3372478169226137,
"grad_norm": 0.41790467500686646,
"learning_rate": 0.00019991752022102606,
"loss": 0.1472,
"step": 280
},
{
"epoch": 0.349292381812707,
"grad_norm": 0.29337936639785767,
"learning_rate": 0.0001998508468511445,
"loss": 0.152,
"step": 290
},
{
"epoch": 0.36133694670280037,
"grad_norm": 0.1533428430557251,
"learning_rate": 0.0001997645856726956,
"loss": 0.1427,
"step": 300
},
{
"epoch": 0.36133694670280037,
"eval_loss": 0.14941252768039703,
"eval_runtime": 81.0588,
"eval_samples_per_second": 4.318,
"eval_steps_per_second": 2.159,
"step": 300
},
{
"epoch": 0.37338151159289373,
"grad_norm": 0.3626411557197571,
"learning_rate": 0.0001996587536075934,
"loss": 0.1539,
"step": 310
},
{
"epoch": 0.38542607648298705,
"grad_norm": 0.3157321810722351,
"learning_rate": 0.00019953337141698617,
"loss": 0.1457,
"step": 320
},
{
"epoch": 0.3974706413730804,
"grad_norm": 0.18949578702449799,
"learning_rate": 0.00019938846369718348,
"loss": 0.1471,
"step": 330
},
{
"epoch": 0.3974706413730804,
"eval_loss": 0.14507745206356049,
"eval_runtime": 81.0028,
"eval_samples_per_second": 4.321,
"eval_steps_per_second": 2.16,
"step": 330
},
{
"epoch": 0.4095152062631737,
"grad_norm": 0.2189997136592865,
"learning_rate": 0.0001992240588748314,
"loss": 0.1423,
"step": 340
},
{
"epoch": 0.4215597711532671,
"grad_norm": 0.2674465775489807,
"learning_rate": 0.00019904018920133574,
"loss": 0.1397,
"step": 350
},
{
"epoch": 0.43360433604336046,
"grad_norm": 0.3467840552330017,
"learning_rate": 0.00019883689074653548,
"loss": 0.1486,
"step": 360
},
{
"epoch": 0.43360433604336046,
"eval_loss": 0.1380164623260498,
"eval_runtime": 81.2554,
"eval_samples_per_second": 4.307,
"eval_steps_per_second": 2.154,
"step": 360
},
{
"epoch": 0.4456489009334538,
"grad_norm": 0.22817489504814148,
"learning_rate": 0.00019861420339162682,
"loss": 0.1411,
"step": 370
},
{
"epoch": 0.45769346582354714,
"grad_norm": 0.1939237415790558,
"learning_rate": 0.0001983721708213397,
"loss": 0.1337,
"step": 380
},
{
"epoch": 0.46973803071364045,
"grad_norm": 0.20223841071128845,
"learning_rate": 0.00019811084051536812,
"loss": 0.147,
"step": 390
},
{
"epoch": 0.46973803071364045,
"eval_loss": 0.13835138082504272,
"eval_runtime": 81.2083,
"eval_samples_per_second": 4.31,
"eval_steps_per_second": 2.155,
"step": 390
},
{
"epoch": 0.4817825956037338,
"grad_norm": 0.16610513627529144,
"learning_rate": 0.00019783026373905603,
"loss": 0.1261,
"step": 400
},
{
"epoch": 0.49382716049382713,
"grad_norm": 0.13892178237438202,
"learning_rate": 0.0001975304955333405,
"loss": 0.1303,
"step": 410
},
{
"epoch": 0.5058717253839206,
"grad_norm": 0.17607590556144714,
"learning_rate": 0.00019721159470395446,
"loss": 0.1408,
"step": 420
},
{
"epoch": 0.5058717253839206,
"eval_loss": 0.13557562232017517,
"eval_runtime": 81.2501,
"eval_samples_per_second": 4.308,
"eval_steps_per_second": 2.154,
"step": 420
},
{
"epoch": 0.5179162902740139,
"grad_norm": 0.2578093409538269,
"learning_rate": 0.0001968736238098906,
"loss": 0.1328,
"step": 430
},
{
"epoch": 0.5299608551641072,
"grad_norm": 0.5826597213745117,
"learning_rate": 0.00019651664915112913,
"loss": 0.1379,
"step": 440
},
{
"epoch": 0.5420054200542005,
"grad_norm": 0.2640504240989685,
"learning_rate": 0.00019614074075563178,
"loss": 0.1328,
"step": 450
},
{
"epoch": 0.5420054200542005,
"eval_loss": 0.13704629242420197,
"eval_runtime": 81.2461,
"eval_samples_per_second": 4.308,
"eval_steps_per_second": 2.154,
"step": 450
},
{
"epoch": 0.5540499849442939,
"grad_norm": 0.25430986285209656,
"learning_rate": 0.00019574597236560428,
"loss": 0.1295,
"step": 460
},
{
"epoch": 0.5660945498343872,
"grad_norm": 0.2172861099243164,
"learning_rate": 0.00019533242142303028,
"loss": 0.1274,
"step": 470
},
{
"epoch": 0.5781391147244805,
"grad_norm": 0.18936924636363983,
"learning_rate": 0.00019490016905447958,
"loss": 0.131,
"step": 480
},
{
"epoch": 0.5781391147244805,
"eval_loss": 0.13753947615623474,
"eval_runtime": 81.0949,
"eval_samples_per_second": 4.316,
"eval_steps_per_second": 2.158,
"step": 480
},
{
"epoch": 0.590183679614574,
"grad_norm": 0.26435989141464233,
"learning_rate": 0.00019444930005519347,
"loss": 0.132,
"step": 490
},
{
"epoch": 0.6022282445046673,
"grad_norm": 0.23327338695526123,
"learning_rate": 0.00019397990287245027,
"loss": 0.1417,
"step": 500
},
{
"epoch": 0.6142728093947606,
"grad_norm": 0.14256203174591064,
"learning_rate": 0.00019349206958821474,
"loss": 0.1392,
"step": 510
},
{
"epoch": 0.6142728093947606,
"eval_loss": 0.13641956448554993,
"eval_runtime": 81.0841,
"eval_samples_per_second": 4.317,
"eval_steps_per_second": 2.158,
"step": 510
},
{
"epoch": 0.626317374284854,
"grad_norm": 0.16011199355125427,
"learning_rate": 0.00019298589590107415,
"loss": 0.1406,
"step": 520
},
{
"epoch": 0.6383619391749473,
"grad_norm": 0.1269129067659378,
"learning_rate": 0.00019246148110746515,
"loss": 0.1346,
"step": 530
},
{
"epoch": 0.6504065040650406,
"grad_norm": 0.13770046830177307,
"learning_rate": 0.0001919189280821946,
"loss": 0.1315,
"step": 540
},
{
"epoch": 0.6504065040650406,
"eval_loss": 0.1363556832075119,
"eval_runtime": 81.2186,
"eval_samples_per_second": 4.309,
"eval_steps_per_second": 2.155,
"step": 540
},
{
"epoch": 0.662451068955134,
"grad_norm": 0.18796966969966888,
"learning_rate": 0.00019135834325825868,
"loss": 0.13,
"step": 550
},
{
"epoch": 0.6744956338452274,
"grad_norm": 0.12326055020093918,
"learning_rate": 0.00019077983660596365,
"loss": 0.1339,
"step": 560
},
{
"epoch": 0.6865401987353207,
"grad_norm": 0.1480596363544464,
"learning_rate": 0.00019018352161135317,
"loss": 0.1296,
"step": 570
},
{
"epoch": 0.6865401987353207,
"eval_loss": 0.13280269503593445,
"eval_runtime": 81.3546,
"eval_samples_per_second": 4.302,
"eval_steps_per_second": 2.151,
"step": 570
},
{
"epoch": 0.698584763625414,
"grad_norm": 0.15606318414211273,
"learning_rate": 0.0001895695152539455,
"loss": 0.1296,
"step": 580
},
{
"epoch": 0.7106293285155074,
"grad_norm": 0.09551403671503067,
"learning_rate": 0.00018893793798378553,
"loss": 0.1323,
"step": 590
},
{
"epoch": 0.7226738934056007,
"grad_norm": 0.11749322712421417,
"learning_rate": 0.00018828891369781605,
"loss": 0.1179,
"step": 600
},
{
"epoch": 0.7226738934056007,
"eval_loss": 0.12917345762252808,
"eval_runtime": 81.4843,
"eval_samples_per_second": 4.295,
"eval_steps_per_second": 2.148,
"step": 600
},
{
"epoch": 0.734718458295694,
"grad_norm": 0.11540284752845764,
"learning_rate": 0.0001876225697155729,
"loss": 0.1227,
"step": 610
},
{
"epoch": 0.7467630231857875,
"grad_norm": 0.3190910518169403,
"learning_rate": 0.00018693903675420846,
"loss": 0.1281,
"step": 620
},
{
"epoch": 0.7588075880758808,
"grad_norm": 0.11964758485555649,
"learning_rate": 0.00018623844890284884,
"loss": 0.1356,
"step": 630
},
{
"epoch": 0.7588075880758808,
"eval_loss": 0.12936915457248688,
"eval_runtime": 81.4409,
"eval_samples_per_second": 4.298,
"eval_steps_per_second": 2.149,
"step": 630
},
{
"epoch": 0.7708521529659741,
"grad_norm": 0.10809649527072906,
"learning_rate": 0.00018552094359628956,
"loss": 0.1337,
"step": 640
},
{
"epoch": 0.7828967178560674,
"grad_norm": 0.13179966807365417,
"learning_rate": 0.00018478666158803475,
"loss": 0.1213,
"step": 650
},
{
"epoch": 0.7949412827461608,
"grad_norm": 0.13050246238708496,
"learning_rate": 0.00018403574692268566,
"loss": 0.1274,
"step": 660
},
{
"epoch": 0.7949412827461608,
"eval_loss": 0.129548579454422,
"eval_runtime": 81.3854,
"eval_samples_per_second": 4.301,
"eval_steps_per_second": 2.15,
"step": 660
},
{
"epoch": 0.8069858476362541,
"grad_norm": 0.10548313707113266,
"learning_rate": 0.00018326834690768308,
"loss": 0.1266,
"step": 670
},
{
"epoch": 0.8190304125263475,
"grad_norm": 0.11163028329610825,
"learning_rate": 0.00018248461208441016,
"loss": 0.1275,
"step": 680
},
{
"epoch": 0.8310749774164409,
"grad_norm": 0.10246960818767548,
"learning_rate": 0.00018168469619866037,
"loss": 0.1262,
"step": 690
},
{
"epoch": 0.8310749774164409,
"eval_loss": 0.12998254597187042,
"eval_runtime": 81.2607,
"eval_samples_per_second": 4.307,
"eval_steps_per_second": 2.154,
"step": 690
},
{
"epoch": 0.8431195423065342,
"grad_norm": 0.16775400936603546,
"learning_rate": 0.00018086875617047738,
"loss": 0.128,
"step": 700
},
{
"epoch": 0.8551641071966275,
"grad_norm": 0.09823903441429138,
"learning_rate": 0.00018003695206337164,
"loss": 0.1186,
"step": 710
},
{
"epoch": 0.8672086720867209,
"grad_norm": 0.08154378831386566,
"learning_rate": 0.0001791894470529209,
"loss": 0.1304,
"step": 720
},
{
"epoch": 0.8672086720867209,
"eval_loss": 0.12851889431476593,
"eval_runtime": 81.3179,
"eval_samples_per_second": 4.304,
"eval_steps_per_second": 2.152,
"step": 720
},
{
"epoch": 0.8792532369768142,
"grad_norm": 0.13566914200782776,
"learning_rate": 0.00017832640739475964,
"loss": 0.1256,
"step": 730
},
{
"epoch": 0.8912978018669075,
"grad_norm": 0.21967343986034393,
"learning_rate": 0.00017744800239196485,
"loss": 0.1049,
"step": 740
},
{
"epoch": 0.9033423667570009,
"grad_norm": 0.12291447818279266,
"learning_rate": 0.00017655440436184361,
"loss": 0.1352,
"step": 750
},
{
"epoch": 0.9033423667570009,
"eval_loss": 0.12859711050987244,
"eval_runtime": 81.5112,
"eval_samples_per_second": 4.294,
"eval_steps_per_second": 2.147,
"step": 750
},
{
"epoch": 0.9153869316470943,
"grad_norm": 0.23631049692630768,
"learning_rate": 0.00017564578860212952,
"loss": 0.1236,
"step": 760
},
{
"epoch": 0.9274314965371876,
"grad_norm": 0.13145385682582855,
"learning_rate": 0.00017472233335659443,
"loss": 0.1235,
"step": 770
},
{
"epoch": 0.9394760614272809,
"grad_norm": 0.0923519879579544,
"learning_rate": 0.00017378421978008212,
"loss": 0.1224,
"step": 780
},
{
"epoch": 0.9394760614272809,
"eval_loss": 0.12813878059387207,
"eval_runtime": 81.2514,
"eval_samples_per_second": 4.308,
"eval_steps_per_second": 2.154,
"step": 780
},
{
"epoch": 0.9515206263173743,
"grad_norm": 0.2052728235721588,
"learning_rate": 0.0001728316319029713,
"loss": 0.1189,
"step": 790
},
{
"epoch": 0.9635651912074676,
"grad_norm": 0.19939783215522766,
"learning_rate": 0.0001718647565950739,
"loss": 0.128,
"step": 800
},
{
"epoch": 0.975609756097561,
"grad_norm": 0.11448610574007034,
"learning_rate": 0.00017088378352897703,
"loss": 0.1103,
"step": 810
},
{
"epoch": 0.975609756097561,
"eval_loss": 0.12625984847545624,
"eval_runtime": 81.2673,
"eval_samples_per_second": 4.307,
"eval_steps_per_second": 2.153,
"step": 810
},
{
"epoch": 0.9876543209876543,
"grad_norm": 0.11135770380496979,
"learning_rate": 0.00016988890514283447,
"loss": 0.1281,
"step": 820
},
{
"epoch": 0.9996988858777477,
"grad_norm": 0.11028297245502472,
"learning_rate": 0.00016888031660261622,
"loss": 0.1225,
"step": 830
},
{
"epoch": 1.010840108401084,
"grad_norm": 0.11298307776451111,
"learning_rate": 0.00016785821576382245,
"loss": 0.1188,
"step": 840
},
{
"epoch": 1.010840108401084,
"eval_loss": 0.1276378333568573,
"eval_runtime": 81.1811,
"eval_samples_per_second": 4.311,
"eval_steps_per_second": 2.156,
"step": 840
},
{
"epoch": 1.0228846732911774,
"grad_norm": 0.09120020270347595,
"learning_rate": 0.0001668228031326702,
"loss": 0.1169,
"step": 850
},
{
"epoch": 1.0349292381812707,
"grad_norm": 0.11060494184494019,
"learning_rate": 0.00016577428182675973,
"loss": 0.1243,
"step": 860
},
{
"epoch": 1.046973803071364,
"grad_norm": 0.09636874496936798,
"learning_rate": 0.0001647128575352292,
"loss": 0.1187,
"step": 870
},
{
"epoch": 1.046973803071364,
"eval_loss": 0.12680906057357788,
"eval_runtime": 81.1164,
"eval_samples_per_second": 4.315,
"eval_steps_per_second": 2.157,
"step": 870
},
{
"epoch": 1.0590183679614573,
"grad_norm": 0.10281568765640259,
"learning_rate": 0.000163638738478404,
"loss": 0.133,
"step": 880
},
{
"epoch": 1.0710629328515506,
"grad_norm": 0.08682233840227127,
"learning_rate": 0.0001625521353669504,
"loss": 0.1171,
"step": 890
},
{
"epoch": 1.0831074977416442,
"grad_norm": 0.1339954435825348,
"learning_rate": 0.00016145326136054008,
"loss": 0.1257,
"step": 900
},
{
"epoch": 1.0831074977416442,
"eval_loss": 0.12813013792037964,
"eval_runtime": 81.4418,
"eval_samples_per_second": 4.298,
"eval_steps_per_second": 2.149,
"step": 900
},
{
"epoch": 1.0951520626317375,
"grad_norm": 0.09956265985965729,
"learning_rate": 0.00016034233202603463,
"loss": 0.1287,
"step": 910
},
{
"epoch": 1.1071966275218308,
"grad_norm": 0.07889163494110107,
"learning_rate": 0.00015921956529519747,
"loss": 0.1135,
"step": 920
},
{
"epoch": 1.119241192411924,
"grad_norm": 0.07798007130622864,
"learning_rate": 0.00015808518142194214,
"loss": 0.1128,
"step": 930
},
{
"epoch": 1.119241192411924,
"eval_loss": 0.12662966549396515,
"eval_runtime": 81.2769,
"eval_samples_per_second": 4.306,
"eval_steps_per_second": 2.153,
"step": 930
},
{
"epoch": 1.1312857573020174,
"grad_norm": 0.09110717475414276,
"learning_rate": 0.00015693940293912492,
"loss": 0.1161,
"step": 940
},
{
"epoch": 1.1433303221921107,
"grad_norm": 0.10704551637172699,
"learning_rate": 0.00015578245461489042,
"loss": 0.1237,
"step": 950
},
{
"epoch": 1.1553748870822043,
"grad_norm": 0.12007839977741241,
"learning_rate": 0.00015461456340857857,
"loss": 0.1246,
"step": 960
},
{
"epoch": 1.1553748870822043,
"eval_loss": 0.12597930431365967,
"eval_runtime": 81.2379,
"eval_samples_per_second": 4.308,
"eval_steps_per_second": 2.154,
"step": 960
},
{
"epoch": 1.1674194519722976,
"grad_norm": 0.09195715934038162,
"learning_rate": 0.00015343595842620198,
"loss": 0.1179,
"step": 970
},
{
"epoch": 1.1794640168623909,
"grad_norm": 0.11327285319566727,
"learning_rate": 0.000152246870875502,
"loss": 0.1219,
"step": 980
},
{
"epoch": 1.1915085817524842,
"grad_norm": 0.12147443741559982,
"learning_rate": 0.00015104753402059252,
"loss": 0.1198,
"step": 990
},
{
"epoch": 1.1915085817524842,
"eval_loss": 0.12678198516368866,
"eval_runtime": 81.2272,
"eval_samples_per_second": 4.309,
"eval_steps_per_second": 2.154,
"step": 990
},
{
"epoch": 1.2035531466425775,
"grad_norm": 0.12530316412448883,
"learning_rate": 0.00014983818313620047,
"loss": 0.1215,
"step": 1000
},
{
"epoch": 1.2155977115326708,
"grad_norm": 0.21503940224647522,
"learning_rate": 0.00014861905546151164,
"loss": 0.132,
"step": 1010
},
{
"epoch": 1.2276422764227641,
"grad_norm": 0.09851568937301636,
"learning_rate": 0.00014739039015363155,
"loss": 0.1255,
"step": 1020
},
{
"epoch": 1.2276422764227641,
"eval_loss": 0.1265084594488144,
"eval_runtime": 81.2978,
"eval_samples_per_second": 4.305,
"eval_steps_per_second": 2.153,
"step": 1020
},
{
"epoch": 1.2396868413128577,
"grad_norm": 0.09794170409440994,
"learning_rate": 0.0001461524282406696,
"loss": 0.1203,
"step": 1030
},
{
"epoch": 1.251731406202951,
"grad_norm": 0.09283249080181122,
"learning_rate": 0.00014490541257445664,
"loss": 0.1241,
"step": 1040
},
{
"epoch": 1.2637759710930443,
"grad_norm": 0.10757778584957123,
"learning_rate": 0.00014364958778290436,
"loss": 0.1263,
"step": 1050
},
{
"epoch": 1.2637759710930443,
"eval_loss": 0.1275068074464798,
"eval_runtime": 81.3037,
"eval_samples_per_second": 4.305,
"eval_steps_per_second": 2.152,
"step": 1050
},
{
"epoch": 1.2758205359831376,
"grad_norm": 0.08741755038499832,
"learning_rate": 0.00014238520022201665,
"loss": 0.1279,
"step": 1060
},
{
"epoch": 1.287865100873231,
"grad_norm": 0.09674480557441711,
"learning_rate": 0.00014111249792756164,
"loss": 0.1203,
"step": 1070
},
{
"epoch": 1.2999096657633242,
"grad_norm": 0.18980656564235687,
"learning_rate": 0.00013983173056641437,
"loss": 0.1273,
"step": 1080
},
{
"epoch": 1.2999096657633242,
"eval_loss": 0.12540876865386963,
"eval_runtime": 81.3684,
"eval_samples_per_second": 4.301,
"eval_steps_per_second": 2.151,
"step": 1080
},
{
"epoch": 1.3119542306534178,
"grad_norm": 0.11260770261287689,
"learning_rate": 0.00013854314938757954,
"loss": 0.1209,
"step": 1090
},
{
"epoch": 1.323998795543511,
"grad_norm": 0.08883814513683319,
"learning_rate": 0.00013724700717290385,
"loss": 0.1307,
"step": 1100
},
{
"epoch": 1.3360433604336044,
"grad_norm": 0.08555177599191666,
"learning_rate": 0.0001359435581874874,
"loss": 0.1187,
"step": 1110
},
{
"epoch": 1.3360433604336044,
"eval_loss": 0.1246650293469429,
"eval_runtime": 81.3179,
"eval_samples_per_second": 4.304,
"eval_steps_per_second": 2.152,
"step": 1110
},
{
"epoch": 1.3480879253236977,
"grad_norm": 0.09265279769897461,
"learning_rate": 0.0001346330581298046,
"loss": 0.1237,
"step": 1120
},
{
"epoch": 1.360132490213791,
"grad_norm": 0.07397205382585526,
"learning_rate": 0.0001333157640815434,
"loss": 0.1194,
"step": 1130
},
{
"epoch": 1.3721770551038843,
"grad_norm": 0.10523436218500137,
"learning_rate": 0.00013199193445717362,
"loss": 0.1215,
"step": 1140
},
{
"epoch": 1.3721770551038843,
"eval_loss": 0.12532441318035126,
"eval_runtime": 81.5003,
"eval_samples_per_second": 4.294,
"eval_steps_per_second": 2.147,
"step": 1140
},
{
"epoch": 1.3842216199939776,
"grad_norm": 0.08104018867015839,
"learning_rate": 0.00013066182895325339,
"loss": 0.1276,
"step": 1150
},
{
"epoch": 1.396266184884071,
"grad_norm": 0.1049809381365776,
"learning_rate": 0.00012932570849748446,
"loss": 0.128,
"step": 1160
},
{
"epoch": 1.4083107497741645,
"grad_norm": 0.10500895977020264,
"learning_rate": 0.00012798383519752577,
"loss": 0.1179,
"step": 1170
},
{
"epoch": 1.4083107497741645,
"eval_loss": 0.12398175895214081,
"eval_runtime": 81.4012,
"eval_samples_per_second": 4.3,
"eval_steps_per_second": 2.15,
"step": 1170
},
{
"epoch": 1.4203553146642578,
"grad_norm": 0.08427383005619049,
"learning_rate": 0.00012663647228957562,
"loss": 0.1145,
"step": 1180
},
{
"epoch": 1.432399879554351,
"grad_norm": 0.08236031979322433,
"learning_rate": 0.0001252838840867324,
"loss": 0.1216,
"step": 1190
},
{
"epoch": 1.4444444444444444,
"grad_norm": 0.08572593331336975,
"learning_rate": 0.00012392633592714423,
"loss": 0.128,
"step": 1200
},
{
"epoch": 1.4444444444444444,
"eval_loss": 0.1240333542227745,
"eval_runtime": 81.3808,
"eval_samples_per_second": 4.301,
"eval_steps_per_second": 2.15,
"step": 1200
},
{
"epoch": 1.4564890093345377,
"grad_norm": 0.08900213986635208,
"learning_rate": 0.00012256409412195727,
"loss": 0.1171,
"step": 1210
},
{
"epoch": 1.4685335742246313,
"grad_norm": 0.09740438312292099,
"learning_rate": 0.0001211974259030733,
"loss": 0.1316,
"step": 1220
},
{
"epoch": 1.4805781391147246,
"grad_norm": 0.07978689670562744,
"learning_rate": 0.00011982659937072677,
"loss": 0.1219,
"step": 1230
},
{
"epoch": 1.4805781391147246,
"eval_loss": 0.12412716448307037,
"eval_runtime": 81.2608,
"eval_samples_per_second": 4.307,
"eval_steps_per_second": 2.154,
"step": 1230
},
{
"epoch": 1.492622704004818,
"grad_norm": 0.08000902086496353,
"learning_rate": 0.00011845188344089126,
"loss": 0.1135,
"step": 1240
},
{
"epoch": 1.5046672688949112,
"grad_norm": 0.07367110252380371,
"learning_rate": 0.00011707354779252612,
"loss": 0.1206,
"step": 1250
},
{
"epoch": 1.5167118337850045,
"grad_norm": 0.07485098391771317,
"learning_rate": 0.00011569186281467335,
"loss": 0.1138,
"step": 1260
},
{
"epoch": 1.5167118337850045,
"eval_loss": 0.1249684989452362,
"eval_runtime": 81.6234,
"eval_samples_per_second": 4.288,
"eval_steps_per_second": 2.144,
"step": 1260
},
{
"epoch": 1.5287563986750978,
"grad_norm": 0.11146491765975952,
"learning_rate": 0.00011430709955341514,
"loss": 0.1272,
"step": 1270
},
{
"epoch": 1.5408009635651911,
"grad_norm": 0.08664289861917496,
"learning_rate": 0.00011291952965870269,
"loss": 0.1147,
"step": 1280
},
{
"epoch": 1.5528455284552845,
"grad_norm": 0.09051596373319626,
"learning_rate": 0.00011152942533106638,
"loss": 0.1161,
"step": 1290
},
{
"epoch": 1.5528455284552845,
"eval_loss": 0.12454230338335037,
"eval_runtime": 81.4318,
"eval_samples_per_second": 4.298,
"eval_steps_per_second": 2.149,
"step": 1290
},
{
"epoch": 1.5648900933453778,
"grad_norm": 0.06956392526626587,
"learning_rate": 0.000110137059268218,
"loss": 0.1077,
"step": 1300
},
{
"epoch": 1.5769346582354713,
"grad_norm": 0.07337108254432678,
"learning_rate": 0.00010874270461155554,
"loss": 0.124,
"step": 1310
},
{
"epoch": 1.5889792231255646,
"grad_norm": 0.08082354068756104,
"learning_rate": 0.0001073466348925807,
"loss": 0.1108,
"step": 1320
},
{
"epoch": 1.5889792231255646,
"eval_loss": 0.12420380860567093,
"eval_runtime": 81.6866,
"eval_samples_per_second": 4.285,
"eval_steps_per_second": 2.142,
"step": 1320
},
{
"epoch": 1.601023788015658,
"grad_norm": 0.07141824066638947,
"learning_rate": 0.00010594912397924018,
"loss": 0.1112,
"step": 1330
},
{
"epoch": 1.6130683529057512,
"grad_norm": 0.09476039558649063,
"learning_rate": 0.00010455044602220076,
"loss": 0.1257,
"step": 1340
},
{
"epoch": 1.6251129177958448,
"grad_norm": 0.06789754331111908,
"learning_rate": 0.00010315087540106894,
"loss": 0.1167,
"step": 1350
},
{
"epoch": 1.6251129177958448,
"eval_loss": 0.12411510944366455,
"eval_runtime": 81.5821,
"eval_samples_per_second": 4.29,
"eval_steps_per_second": 2.145,
"step": 1350
},
{
"epoch": 1.637157482685938,
"grad_norm": 0.08458850532770157,
"learning_rate": 0.00010175068667056578,
"loss": 0.1215,
"step": 1360
},
{
"epoch": 1.6492020475760314,
"grad_norm": 0.0818653479218483,
"learning_rate": 0.00010035015450666723,
"loss": 0.1193,
"step": 1370
},
{
"epoch": 1.6612466124661247,
"grad_norm": 0.060270510613918304,
"learning_rate": 9.894955365272087e-05,
"loss": 0.1094,
"step": 1380
},
{
"epoch": 1.6612466124661247,
"eval_loss": 0.12342710047960281,
"eval_runtime": 81.3723,
"eval_samples_per_second": 4.301,
"eval_steps_per_second": 2.151,
"step": 1380
},
{
"epoch": 1.673291177356218,
"grad_norm": 0.08454828709363937,
"learning_rate": 9.75491588655492e-05,
"loss": 0.1205,
"step": 1390
},
{
"epoch": 1.6853357422463113,
"grad_norm": 0.06156294047832489,
"learning_rate": 9.614924486155047e-05,
"loss": 0.1111,
"step": 1400
},
{
"epoch": 1.6973803071364046,
"grad_norm": 0.08005426079034805,
"learning_rate": 9.475008626280739e-05,
"loss": 0.1077,
"step": 1410
},
{
"epoch": 1.6973803071364046,
"eval_loss": 0.12282679229974747,
"eval_runtime": 81.2841,
"eval_samples_per_second": 4.306,
"eval_steps_per_second": 2.153,
"step": 1410
},
{
"epoch": 1.709424872026498,
"grad_norm": 0.08515966683626175,
"learning_rate": 9.335195754321427e-05,
"loss": 0.1226,
"step": 1420
},
{
"epoch": 1.7214694369165913,
"grad_norm": 0.08117437362670898,
"learning_rate": 9.195513297463339e-05,
"loss": 0.1158,
"step": 1430
},
{
"epoch": 1.7335140018066846,
"grad_norm": 0.0748409777879715,
"learning_rate": 9.055988657309075e-05,
"loss": 0.1152,
"step": 1440
},
{
"epoch": 1.7335140018066846,
"eval_loss": 0.12351047992706299,
"eval_runtime": 81.3284,
"eval_samples_per_second": 4.304,
"eval_steps_per_second": 2.152,
"step": 1440
},
{
"epoch": 1.7455585666967781,
"grad_norm": 0.08821168541908264,
"learning_rate": 8.916649204502231e-05,
"loss": 0.1231,
"step": 1450
},
{
"epoch": 1.7576031315868714,
"grad_norm": 0.06368843466043472,
"learning_rate": 8.777522273358076e-05,
"loss": 0.1144,
"step": 1460
},
{
"epoch": 1.7696476964769647,
"grad_norm": 0.08906359225511551,
"learning_rate": 8.638635156501353e-05,
"loss": 0.1278,
"step": 1470
},
{
"epoch": 1.7696476964769647,
"eval_loss": 0.12412309646606445,
"eval_runtime": 81.4647,
"eval_samples_per_second": 4.296,
"eval_steps_per_second": 2.148,
"step": 1470
},
{
"epoch": 1.7816922613670583,
"grad_norm": 0.13860240578651428,
"learning_rate": 8.500015099512282e-05,
"loss": 0.1135,
"step": 1480
},
{
"epoch": 1.7937368262571516,
"grad_norm": 0.08925935626029968,
"learning_rate": 8.361689295581759e-05,
"loss": 0.1274,
"step": 1490
},
{
"epoch": 1.805781391147245,
"grad_norm": 0.2061154991388321,
"learning_rate": 8.223684880176861e-05,
"loss": 0.1245,
"step": 1500
},
{
"epoch": 1.805781391147245,
"eval_loss": 0.12294992804527283,
"eval_runtime": 81.5386,
"eval_samples_per_second": 4.292,
"eval_steps_per_second": 2.146,
"step": 1500
},
{
"epoch": 1.8178259560373382,
"grad_norm": 0.07016266882419586,
"learning_rate": 8.086028925717661e-05,
"loss": 0.1215,
"step": 1510
},
{
"epoch": 1.8298705209274315,
"grad_norm": 0.19503405690193176,
"learning_rate": 7.948748436266409e-05,
"loss": 0.1169,
"step": 1520
},
{
"epoch": 1.8419150858175248,
"grad_norm": 0.07560829073190689,
"learning_rate": 7.811870342230127e-05,
"loss": 0.1219,
"step": 1530
},
{
"epoch": 1.8419150858175248,
"eval_loss": 0.12353645265102386,
"eval_runtime": 81.5233,
"eval_samples_per_second": 4.293,
"eval_steps_per_second": 2.147,
"step": 1530
},
{
"epoch": 1.8539596507076181,
"grad_norm": 0.06638047844171524,
"learning_rate": 7.675421495077657e-05,
"loss": 0.1174,
"step": 1540
},
{
"epoch": 1.8660042155977115,
"grad_norm": 0.07245375961065292,
"learning_rate": 7.539428662072188e-05,
"loss": 0.1263,
"step": 1550
},
{
"epoch": 1.8780487804878048,
"grad_norm": 0.08080323040485382,
"learning_rate": 7.403918521020305e-05,
"loss": 0.1261,
"step": 1560
},
{
"epoch": 1.8780487804878048,
"eval_loss": 0.12283791601657867,
"eval_runtime": 81.5734,
"eval_samples_per_second": 4.291,
"eval_steps_per_second": 2.145,
"step": 1560
},
{
"epoch": 1.890093345377898,
"grad_norm": 0.06938530504703522,
"learning_rate": 7.268917655038581e-05,
"loss": 0.1167,
"step": 1570
},
{
"epoch": 1.9021379102679916,
"grad_norm": 0.07764075696468353,
"learning_rate": 7.134452547338753e-05,
"loss": 0.1191,
"step": 1580
},
{
"epoch": 1.914182475158085,
"grad_norm": 0.10729371011257172,
"learning_rate": 7.000549576032489e-05,
"loss": 0.1175,
"step": 1590
},
{
"epoch": 1.914182475158085,
"eval_loss": 0.12257985025644302,
"eval_runtime": 81.3844,
"eval_samples_per_second": 4.301,
"eval_steps_per_second": 2.15,
"step": 1590
},
{
"epoch": 1.9262270400481782,
"grad_norm": 0.06268001347780228,
"learning_rate": 6.867235008956783e-05,
"loss": 0.1203,
"step": 1600
},
{
"epoch": 1.9382716049382716,
"grad_norm": 0.059695664793252945,
"learning_rate": 6.734534998520969e-05,
"loss": 0.1147,
"step": 1610
},
{
"epoch": 1.950316169828365,
"grad_norm": 0.07781612873077393,
"learning_rate": 6.602475576576383e-05,
"loss": 0.1191,
"step": 1620
},
{
"epoch": 1.950316169828365,
"eval_loss": 0.1225815936923027,
"eval_runtime": 81.7725,
"eval_samples_per_second": 4.28,
"eval_steps_per_second": 2.14,
"step": 1620
},
{
"epoch": 1.9623607347184584,
"grad_norm": 0.06548433750867844,
"learning_rate": 6.471082649309686e-05,
"loss": 0.1181,
"step": 1630
},
{
"epoch": 1.9744052996085517,
"grad_norm": 0.0688992366194725,
"learning_rate": 6.34038199216082e-05,
"loss": 0.1139,
"step": 1640
},
{
"epoch": 1.986449864498645,
"grad_norm": 0.07544533908367157,
"learning_rate": 6.210399244766632e-05,
"loss": 0.1194,
"step": 1650
},
{
"epoch": 1.986449864498645,
"eval_loss": 0.12259992212057114,
"eval_runtime": 81.5949,
"eval_samples_per_second": 4.289,
"eval_steps_per_second": 2.145,
"step": 1650
},
{
"epoch": 1.9984944293887383,
"grad_norm": 0.13208015263080597,
"learning_rate": 6.0811599059311195e-05,
"loss": 0.1185,
"step": 1660
},
{
"epoch": 2.009635651912075,
"grad_norm": 0.08133638650178909,
"learning_rate": 5.952689328623321e-05,
"loss": 0.1264,
"step": 1670
},
{
"epoch": 2.021680216802168,
"grad_norm": 0.08000742644071579,
"learning_rate": 5.8250127150038016e-05,
"loss": 0.1108,
"step": 1680
},
{
"epoch": 2.021680216802168,
"eval_loss": 0.12281496077775955,
"eval_runtime": 81.5447,
"eval_samples_per_second": 4.292,
"eval_steps_per_second": 2.146,
"step": 1680
},
{
"epoch": 2.0337247816922615,
"grad_norm": 0.06525713950395584,
"learning_rate": 5.698155111480722e-05,
"loss": 0.1141,
"step": 1690
},
{
"epoch": 2.0457693465823548,
"grad_norm": 0.06997241824865341,
"learning_rate": 5.57214140379649e-05,
"loss": 0.1146,
"step": 1700
},
{
"epoch": 2.057813911472448,
"grad_norm": 0.07603967189788818,
"learning_rate": 5.4469963121458776e-05,
"loss": 0.1169,
"step": 1710
},
{
"epoch": 2.057813911472448,
"eval_loss": 0.12317664176225662,
"eval_runtime": 81.5005,
"eval_samples_per_second": 4.294,
"eval_steps_per_second": 2.147,
"step": 1710
},
{
"epoch": 2.0698584763625414,
"grad_norm": 0.09060684591531754,
"learning_rate": 5.322744386326675e-05,
"loss": 0.1139,
"step": 1720
},
{
"epoch": 2.0819030412526347,
"grad_norm": 0.09253629297018051,
"learning_rate": 5.1994100009237e-05,
"loss": 0.1174,
"step": 1730
},
{
"epoch": 2.093947606142728,
"grad_norm": 0.0707191526889801,
"learning_rate": 5.077017350527269e-05,
"loss": 0.1138,
"step": 1740
},
{
"epoch": 2.093947606142728,
"eval_loss": 0.1225883737206459,
"eval_runtime": 81.4669,
"eval_samples_per_second": 4.296,
"eval_steps_per_second": 2.148,
"step": 1740
},
{
"epoch": 2.1059921710328213,
"grad_norm": 0.06803560256958008,
"learning_rate": 4.9555904449868795e-05,
"loss": 0.1196,
"step": 1750
},
{
"epoch": 2.1180367359229146,
"grad_norm": 0.08914126455783844,
"learning_rate": 4.835153104701221e-05,
"loss": 0.1129,
"step": 1760
},
{
"epoch": 2.130081300813008,
"grad_norm": 0.08918345719575882,
"learning_rate": 4.71572895594528e-05,
"loss": 0.1183,
"step": 1770
},
{
"epoch": 2.130081300813008,
"eval_loss": 0.12287717312574387,
"eval_runtime": 81.4917,
"eval_samples_per_second": 4.295,
"eval_steps_per_second": 2.147,
"step": 1770
},
{
"epoch": 2.1421258657031013,
"grad_norm": 0.07838484644889832,
"learning_rate": 4.5973414262355785e-05,
"loss": 0.1141,
"step": 1780
},
{
"epoch": 2.154170430593195,
"grad_norm": 0.11005687713623047,
"learning_rate": 4.480013739734368e-05,
"loss": 0.1223,
"step": 1790
},
{
"epoch": 2.1662149954832883,
"grad_norm": 0.08749410510063171,
"learning_rate": 4.363768912693749e-05,
"loss": 0.1117,
"step": 1800
},
{
"epoch": 2.1662149954832883,
"eval_loss": 0.12259072810411453,
"eval_runtime": 81.4558,
"eval_samples_per_second": 4.297,
"eval_steps_per_second": 2.148,
"step": 1800
},
{
"epoch": 2.1782595603733816,
"grad_norm": 0.08549398928880692,
"learning_rate": 4.24862974894053e-05,
"loss": 0.1088,
"step": 1810
},
{
"epoch": 2.190304125263475,
"grad_norm": 0.05755528435111046,
"learning_rate": 4.134618835402816e-05,
"loss": 0.1063,
"step": 1820
},
{
"epoch": 2.2023486901535683,
"grad_norm": 0.07486403733491898,
"learning_rate": 4.0217585376790834e-05,
"loss": 0.1183,
"step": 1830
},
{
"epoch": 2.2023486901535683,
"eval_loss": 0.12217788398265839,
"eval_runtime": 81.8011,
"eval_samples_per_second": 4.279,
"eval_steps_per_second": 2.139,
"step": 1830
},
{
"epoch": 2.2143932550436616,
"grad_norm": 0.08609265834093094,
"learning_rate": 3.9100709956507356e-05,
"loss": 0.1254,
"step": 1840
},
{
"epoch": 2.226437819933755,
"grad_norm": 0.0692862719297409,
"learning_rate": 3.79957811913888e-05,
"loss": 0.1121,
"step": 1850
},
{
"epoch": 2.238482384823848,
"grad_norm": 0.08547110855579376,
"learning_rate": 3.6903015836062905e-05,
"loss": 0.1097,
"step": 1860
},
{
"epoch": 2.238482384823848,
"eval_loss": 0.12183844298124313,
"eval_runtime": 81.5286,
"eval_samples_per_second": 4.293,
"eval_steps_per_second": 2.146,
"step": 1860
},
{
"epoch": 2.2505269497139415,
"grad_norm": 0.08573822677135468,
"learning_rate": 3.5822628259052906e-05,
"loss": 0.1174,
"step": 1870
},
{
"epoch": 2.262571514604035,
"grad_norm": 0.08069294691085815,
"learning_rate": 3.475483040072495e-05,
"loss": 0.1198,
"step": 1880
},
{
"epoch": 2.274616079494128,
"grad_norm": 0.08202630281448364,
"learning_rate": 3.369983173171141e-05,
"loss": 0.1132,
"step": 1890
},
{
"epoch": 2.274616079494128,
"eval_loss": 0.12189455330371857,
"eval_runtime": 81.6511,
"eval_samples_per_second": 4.287,
"eval_steps_per_second": 2.143,
"step": 1890
},
{
"epoch": 2.2866606443842215,
"grad_norm": 0.08111971616744995,
"learning_rate": 3.2657839211819085e-05,
"loss": 0.1086,
"step": 1900
},
{
"epoch": 2.298705209274315,
"grad_norm": 0.07577092200517654,
"learning_rate": 3.1629057249429527e-05,
"loss": 0.1205,
"step": 1910
},
{
"epoch": 2.3107497741644085,
"grad_norm": 0.0689835473895073,
"learning_rate": 3.0613687661400384e-05,
"loss": 0.1133,
"step": 1920
},
{
"epoch": 2.3107497741644085,
"eval_loss": 0.12182266265153885,
"eval_runtime": 81.5792,
"eval_samples_per_second": 4.29,
"eval_steps_per_second": 2.145,
"step": 1920
},
{
"epoch": 2.322794339054502,
"grad_norm": 0.09697998315095901,
"learning_rate": 2.9611929633474555e-05,
"loss": 0.1214,
"step": 1930
},
{
"epoch": 2.334838903944595,
"grad_norm": 0.07920137792825699,
"learning_rate": 2.8623979681206002e-05,
"loss": 0.1108,
"step": 1940
},
{
"epoch": 2.3468834688346885,
"grad_norm": 0.07563956826925278,
"learning_rate": 2.765003161140911e-05,
"loss": 0.1213,
"step": 1950
},
{
"epoch": 2.3468834688346885,
"eval_loss": 0.12195436656475067,
"eval_runtime": 81.5329,
"eval_samples_per_second": 4.293,
"eval_steps_per_second": 2.146,
"step": 1950
},
{
"epoch": 2.3589280337247818,
"grad_norm": 0.08294857293367386,
"learning_rate": 2.66902764841394e-05,
"loss": 0.1186,
"step": 1960
},
{
"epoch": 2.370972598614875,
"grad_norm": 0.0806855708360672,
"learning_rate": 2.5744902575213248e-05,
"loss": 0.1127,
"step": 1970
},
{
"epoch": 2.3830171635049684,
"grad_norm": 0.1039934828877449,
"learning_rate": 2.481409533927358e-05,
"loss": 0.1188,
"step": 1980
},
{
"epoch": 2.3830171635049684,
"eval_loss": 0.12177734076976776,
"eval_runtime": 81.5272,
"eval_samples_per_second": 4.293,
"eval_steps_per_second": 2.147,
"step": 1980
},
{
"epoch": 2.3950617283950617,
"grad_norm": 0.09341799467802048,
"learning_rate": 2.3898037373409276e-05,
"loss": 0.1173,
"step": 1990
},
{
"epoch": 2.407106293285155,
"grad_norm": 0.08933733403682709,
"learning_rate": 2.2996908381334736e-05,
"loss": 0.1148,
"step": 2000
},
{
"epoch": 2.4191508581752483,
"grad_norm": 0.08493391424417496,
"learning_rate": 2.211088513813754e-05,
"loss": 0.1121,
"step": 2010
},
{
"epoch": 2.4191508581752483,
"eval_loss": 0.12171091139316559,
"eval_runtime": 81.5846,
"eval_samples_per_second": 4.29,
"eval_steps_per_second": 2.145,
"step": 2010
},
{
"epoch": 2.4311954230653416,
"grad_norm": 0.1086319163441658,
"learning_rate": 2.1240141455600116e-05,
"loss": 0.1145,
"step": 2020
},
{
"epoch": 2.443239987955435,
"grad_norm": 0.09176024794578552,
"learning_rate": 2.0384848148103196e-05,
"loss": 0.1092,
"step": 2030
},
{
"epoch": 2.4552845528455283,
"grad_norm": 0.1033325344324112,
"learning_rate": 1.9545172999116812e-05,
"loss": 0.1055,
"step": 2040
},
{
"epoch": 2.4552845528455283,
"eval_loss": 0.12144716829061508,
"eval_runtime": 81.758,
"eval_samples_per_second": 4.281,
"eval_steps_per_second": 2.14,
"step": 2040
},
{
"epoch": 2.4673291177356216,
"grad_norm": 0.07238776981830597,
"learning_rate": 1.872128072828634e-05,
"loss": 0.1105,
"step": 2050
},
{
"epoch": 2.4793736826257153,
"grad_norm": 0.06941673159599304,
"learning_rate": 1.791333295911909e-05,
"loss": 0.1118,
"step": 2060
},
{
"epoch": 2.4914182475158086,
"grad_norm": 0.08462639153003693,
"learning_rate": 1.7121488187278713e-05,
"loss": 0.1082,
"step": 2070
},
{
"epoch": 2.4914182475158086,
"eval_loss": 0.12135373055934906,
"eval_runtime": 81.6101,
"eval_samples_per_second": 4.289,
"eval_steps_per_second": 2.144,
"step": 2070
},
{
"epoch": 2.503462812405902,
"grad_norm": 0.09350676834583282,
"learning_rate": 1.6345901749492887e-05,
"loss": 0.1121,
"step": 2080
},
{
"epoch": 2.5155073772959953,
"grad_norm": 0.08438068628311157,
"learning_rate": 1.5586725793080814e-05,
"loss": 0.1146,
"step": 2090
},
{
"epoch": 2.5275519421860886,
"grad_norm": 0.10219820588827133,
"learning_rate": 1.484410924610642e-05,
"loss": 0.1179,
"step": 2100
},
{
"epoch": 2.5275519421860886,
"eval_loss": 0.12126310169696808,
"eval_runtime": 81.4438,
"eval_samples_per_second": 4.297,
"eval_steps_per_second": 2.149,
"step": 2100
},
{
"epoch": 2.539596507076182,
"grad_norm": 0.0966809093952179,
"learning_rate": 1.4118197788163056e-05,
"loss": 0.1186,
"step": 2110
},
{
"epoch": 2.551641071966275,
"grad_norm": 0.1106065958738327,
"learning_rate": 1.3409133821795306e-05,
"loss": 0.1112,
"step": 2120
},
{
"epoch": 2.5636856368563685,
"grad_norm": 0.09320900589227676,
"learning_rate": 1.2717056444563957e-05,
"loss": 0.1148,
"step": 2130
},
{
"epoch": 2.5636856368563685,
"eval_loss": 0.12110390514135361,
"eval_runtime": 81.8244,
"eval_samples_per_second": 4.277,
"eval_steps_per_second": 2.139,
"step": 2130
},
{
"epoch": 2.575730201746462,
"grad_norm": 0.09060157835483551,
"learning_rate": 1.2042101421758955e-05,
"loss": 0.1219,
"step": 2140
},
{
"epoch": 2.587774766636555,
"grad_norm": 0.09531334787607193,
"learning_rate": 1.1384401159766433e-05,
"loss": 0.1136,
"step": 2150
},
{
"epoch": 2.5998193315266485,
"grad_norm": 0.07259315997362137,
"learning_rate": 1.0744084680094246e-05,
"loss": 0.1062,
"step": 2160
},
{
"epoch": 2.5998193315266485,
"eval_loss": 0.12105338275432587,
"eval_runtime": 81.5762,
"eval_samples_per_second": 4.29,
"eval_steps_per_second": 2.145,
"step": 2160
},
{
"epoch": 2.611863896416742,
"grad_norm": 0.07185359299182892,
"learning_rate": 1.0121277594061939e-05,
"loss": 0.1184,
"step": 2170
},
{
"epoch": 2.6239084613068355,
"grad_norm": 0.0885002538561821,
"learning_rate": 9.516102078159317e-06,
"loss": 0.1243,
"step": 2180
},
{
"epoch": 2.635953026196929,
"grad_norm": 0.10129860043525696,
"learning_rate": 8.928676850079133e-06,
"loss": 0.1249,
"step": 2190
},
{
"epoch": 2.635953026196929,
"eval_loss": 0.1209418997168541,
"eval_runtime": 81.4691,
"eval_samples_per_second": 4.296,
"eval_steps_per_second": 2.148,
"step": 2190
},
{
"epoch": 2.647997591087022,
"grad_norm": 0.08937793970108032,
"learning_rate": 8.359117145428053e-06,
"loss": 0.1188,
"step": 2200
},
{
"epoch": 2.6600421559771155,
"grad_norm": 0.08618754148483276,
"learning_rate": 7.807534695120911e-06,
"loss": 0.1193,
"step": 2210
},
{
"epoch": 2.6720867208672088,
"grad_norm": 0.0760350152850151,
"learning_rate": 7.274037703462244e-06,
"loss": 0.107,
"step": 2220
},
{
"epoch": 2.6720867208672088,
"eval_loss": 0.12113867700099945,
"eval_runtime": 81.609,
"eval_samples_per_second": 4.289,
"eval_steps_per_second": 2.144,
"step": 2220
},
{
"epoch": 2.684131285757302,
"grad_norm": 0.08701752126216888,
"learning_rate": 6.7587308269199786e-06,
"loss": 0.1042,
"step": 2230
},
{
"epoch": 2.6961758506473954,
"grad_norm": 0.08244354277849197,
"learning_rate": 6.261715153594627e-06,
"loss": 0.1156,
"step": 2240
},
{
"epoch": 2.7082204155374887,
"grad_norm": 0.14877063035964966,
"learning_rate": 5.783088183389062e-06,
"loss": 0.1076,
"step": 2250
},
{
"epoch": 2.7082204155374887,
"eval_loss": 0.12096786499023438,
"eval_runtime": 81.6046,
"eval_samples_per_second": 4.289,
"eval_steps_per_second": 2.144,
"step": 2250
},
{
"epoch": 2.720264980427582,
"grad_norm": 0.09160082787275314,
"learning_rate": 5.322943808881675e-06,
"loss": 0.1188,
"step": 2260
},
{
"epoch": 2.7323095453176753,
"grad_norm": 0.08697830885648727,
"learning_rate": 4.881372296907516e-06,
"loss": 0.1082,
"step": 2270
},
{
"epoch": 2.7443541102077686,
"grad_norm": 0.09075489640235901,
"learning_rate": 4.4584602708505285e-06,
"loss": 0.118,
"step": 2280
},
{
"epoch": 2.7443541102077686,
"eval_loss": 0.12092573195695877,
"eval_runtime": 81.6503,
"eval_samples_per_second": 4.287,
"eval_steps_per_second": 2.143,
"step": 2280
},
{
"epoch": 2.756398675097862,
"grad_norm": 0.0934319868683815,
"learning_rate": 4.054290693650642e-06,
"loss": 0.1091,
"step": 2290
},
{
"epoch": 2.7684432399879553,
"grad_norm": 0.08008915930986404,
"learning_rate": 3.6689428515288004e-06,
"loss": 0.1103,
"step": 2300
},
{
"epoch": 2.7804878048780486,
"grad_norm": 0.07896912097930908,
"learning_rate": 3.3024923384334163e-06,
"loss": 0.1087,
"step": 2310
},
{
"epoch": 2.7804878048780486,
"eval_loss": 0.12093985080718994,
"eval_runtime": 81.4605,
"eval_samples_per_second": 4.297,
"eval_steps_per_second": 2.148,
"step": 2310
},
{
"epoch": 2.792532369768142,
"grad_norm": 0.08136588335037231,
"learning_rate": 2.9550110412109534e-06,
"loss": 0.1106,
"step": 2320
},
{
"epoch": 2.804576934658235,
"grad_norm": 0.07197284698486328,
"learning_rate": 2.6265671255039537e-06,
"loss": 0.1265,
"step": 2330
},
{
"epoch": 2.816621499548329,
"grad_norm": 0.09639979153871536,
"learning_rate": 2.3172250223787994e-06,
"loss": 0.1168,
"step": 2340
},
{
"epoch": 2.816621499548329,
"eval_loss": 0.12088128179311752,
"eval_runtime": 81.8851,
"eval_samples_per_second": 4.274,
"eval_steps_per_second": 2.137,
"step": 2340
},
{
"epoch": 2.8286660644384223,
"grad_norm": 0.09752853214740753,
"learning_rate": 2.0270454156863905e-06,
"loss": 0.1209,
"step": 2350
},
{
"epoch": 2.8407106293285156,
"grad_norm": 0.08983393758535385,
"learning_rate": 1.7560852301575892e-06,
"loss": 0.1185,
"step": 2360
},
{
"epoch": 2.852755194218609,
"grad_norm": 0.09476975351572037,
"learning_rate": 1.5043976202363641e-06,
"loss": 0.1172,
"step": 2370
},
{
"epoch": 2.852755194218609,
"eval_loss": 0.1208338588476181,
"eval_runtime": 81.5048,
"eval_samples_per_second": 4.294,
"eval_steps_per_second": 2.147,
"step": 2370
},
{
"epoch": 2.864799759108702,
"grad_norm": 0.06642317026853561,
"learning_rate": 1.2720319596523977e-06,
"loss": 0.116,
"step": 2380
},
{
"epoch": 2.8768443239987955,
"grad_norm": 0.10626640915870667,
"learning_rate": 1.0590338317354454e-06,
"loss": 0.112,
"step": 2390
},
{
"epoch": 2.888888888888889,
"grad_norm": 0.07917091995477676,
"learning_rate": 8.654450204731768e-07,
"loss": 0.1092,
"step": 2400
},
{
"epoch": 2.888888888888889,
"eval_loss": 0.12085919827222824,
"eval_runtime": 81.5821,
"eval_samples_per_second": 4.29,
"eval_steps_per_second": 2.145,
"step": 2400
}
],
"logging_steps": 10,
"max_steps": 2493,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 4.360791333022802e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}