efficientrag-labeler / trainer_state.json
Necent's picture
EfficientRAG bilingual (en+ru) — trained on Necent/efficientrag-*-training-data
7a5f5f4 verified
Raw
History Blame Contribute Delete
23.2 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 4942,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.020234722784297856,
"grad_norm": 8.72037124633789,
"learning_rate": 1.25e-06,
"loss": 1.209,
"step": 50
},
{
"epoch": 0.04046944556859571,
"grad_norm": 5.450883865356445,
"learning_rate": 2.5e-06,
"loss": 0.9725,
"step": 100
},
{
"epoch": 0.060704168352893564,
"grad_norm": 9.1498441696167,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.6988,
"step": 150
},
{
"epoch": 0.08093889113719142,
"grad_norm": 7.334131717681885,
"learning_rate": 5e-06,
"loss": 0.5294,
"step": 200
},
{
"epoch": 0.10117361392148927,
"grad_norm": 9.698792457580566,
"learning_rate": 4.9472796288485875e-06,
"loss": 0.4225,
"step": 250
},
{
"epoch": 0.12140833670578713,
"grad_norm": 4.219217777252197,
"learning_rate": 4.894559257697175e-06,
"loss": 0.3813,
"step": 300
},
{
"epoch": 0.141643059490085,
"grad_norm": 12.876835823059082,
"learning_rate": 4.841838886545762e-06,
"loss": 0.333,
"step": 350
},
{
"epoch": 0.16187778227438285,
"grad_norm": 8.669962882995605,
"learning_rate": 4.789118515394349e-06,
"loss": 0.3039,
"step": 400
},
{
"epoch": 0.1821125050586807,
"grad_norm": 13.13197135925293,
"learning_rate": 4.736398144242936e-06,
"loss": 0.3216,
"step": 450
},
{
"epoch": 0.20234722784297854,
"grad_norm": 10.586642265319824,
"learning_rate": 4.683677773091523e-06,
"loss": 0.2824,
"step": 500
},
{
"epoch": 0.20234722784297854,
"eval_loss": 0.2994783818721771,
"eval_runtime": 99.3532,
"eval_samples_per_second": 41.851,
"eval_steps_per_second": 0.654,
"eval_tag_accuracy": 0.9016354016354017,
"eval_tag_f1": 0.9666317639529918,
"eval_tag_f1-CONTINUE": 0.9075288265882885,
"eval_tag_f1-FINISH": 0.9075288265882885,
"eval_tag_f1-TERMINATE": 0.8949396352427433,
"eval_token_accuracy": 0.9666317639529918,
"eval_token_f1": 0.9016354016354017,
"eval_token_f1_negative": 0.9075288265882885,
"eval_token_f1_positive": 0.8949396352427433,
"step": 500
},
{
"epoch": 0.2225819506272764,
"grad_norm": 5.3532562255859375,
"learning_rate": 4.63095740194011e-06,
"loss": 0.2726,
"step": 550
},
{
"epoch": 0.24281667341157426,
"grad_norm": 12.720158576965332,
"learning_rate": 4.578237030788697e-06,
"loss": 0.255,
"step": 600
},
{
"epoch": 0.2630513961958721,
"grad_norm": 6.4184088706970215,
"learning_rate": 4.525516659637284e-06,
"loss": 0.254,
"step": 650
},
{
"epoch": 0.28328611898017,
"grad_norm": 7.8461151123046875,
"learning_rate": 4.4727962884858715e-06,
"loss": 0.2647,
"step": 700
},
{
"epoch": 0.3035208417644678,
"grad_norm": 3.1113088130950928,
"learning_rate": 4.420075917334458e-06,
"loss": 0.2442,
"step": 750
},
{
"epoch": 0.3237555645487657,
"grad_norm": 8.432883262634277,
"learning_rate": 4.367355546183045e-06,
"loss": 0.247,
"step": 800
},
{
"epoch": 0.3439902873330635,
"grad_norm": 4.828508377075195,
"learning_rate": 4.314635175031632e-06,
"loss": 0.2359,
"step": 850
},
{
"epoch": 0.3642250101173614,
"grad_norm": 4.320587635040283,
"learning_rate": 4.26191480388022e-06,
"loss": 0.2549,
"step": 900
},
{
"epoch": 0.38445973290165925,
"grad_norm": 8.660967826843262,
"learning_rate": 4.209194432728807e-06,
"loss": 0.2453,
"step": 950
},
{
"epoch": 0.4046944556859571,
"grad_norm": 6.111636161804199,
"learning_rate": 4.156474061577394e-06,
"loss": 0.2447,
"step": 1000
},
{
"epoch": 0.4046944556859571,
"eval_loss": 0.26576703786849976,
"eval_runtime": 89.7698,
"eval_samples_per_second": 46.318,
"eval_steps_per_second": 0.724,
"eval_tag_accuracy": 0.9086099086099086,
"eval_tag_f1": 0.9704089737670849,
"eval_tag_f1-CONTINUE": 0.9146067415730337,
"eval_tag_f1-FINISH": 0.9146067415730337,
"eval_tag_f1-TERMINATE": 0.9017071908949819,
"eval_token_accuracy": 0.9704089737670849,
"eval_token_f1": 0.9086099086099086,
"eval_token_f1_negative": 0.9146067415730337,
"eval_token_f1_positive": 0.9017071908949819,
"step": 1000
},
{
"epoch": 0.42492917847025496,
"grad_norm": 6.170385837554932,
"learning_rate": 4.103753690425981e-06,
"loss": 0.2341,
"step": 1050
},
{
"epoch": 0.4451639012545528,
"grad_norm": 9.265089988708496,
"learning_rate": 4.051033319274568e-06,
"loss": 0.2335,
"step": 1100
},
{
"epoch": 0.4653986240388507,
"grad_norm": 8.301984786987305,
"learning_rate": 3.9983129481231555e-06,
"loss": 0.2176,
"step": 1150
},
{
"epoch": 0.4856333468231485,
"grad_norm": 2.7331886291503906,
"learning_rate": 3.945592576971743e-06,
"loss": 0.2537,
"step": 1200
},
{
"epoch": 0.5058680696074463,
"grad_norm": 3.475696325302124,
"learning_rate": 3.89287220582033e-06,
"loss": 0.2384,
"step": 1250
},
{
"epoch": 0.5261027923917442,
"grad_norm": 10.670371055603027,
"learning_rate": 3.840151834668917e-06,
"loss": 0.2076,
"step": 1300
},
{
"epoch": 0.5463375151760421,
"grad_norm": 7.286683559417725,
"learning_rate": 3.7874314635175035e-06,
"loss": 0.2209,
"step": 1350
},
{
"epoch": 0.56657223796034,
"grad_norm": 5.6653876304626465,
"learning_rate": 3.7347110923660906e-06,
"loss": 0.2277,
"step": 1400
},
{
"epoch": 0.5868069607446378,
"grad_norm": 8.548470497131348,
"learning_rate": 3.6819907212146777e-06,
"loss": 0.2178,
"step": 1450
},
{
"epoch": 0.6070416835289356,
"grad_norm": 1.2667831182479858,
"learning_rate": 3.629270350063265e-06,
"loss": 0.2116,
"step": 1500
},
{
"epoch": 0.6070416835289356,
"eval_loss": 0.21864494681358337,
"eval_runtime": 87.99,
"eval_samples_per_second": 47.255,
"eval_steps_per_second": 0.739,
"eval_tag_accuracy": 0.9336219336219336,
"eval_tag_f1": 0.9717647246654861,
"eval_tag_f1-CONTINUE": 0.9345351043643264,
"eval_tag_f1-FINISH": 0.9345351043643264,
"eval_tag_f1-TERMINATE": 0.9326829268292683,
"eval_token_accuracy": 0.9717647246654861,
"eval_token_f1": 0.9336219336219336,
"eval_token_f1_negative": 0.9345351043643264,
"eval_token_f1_positive": 0.9326829268292683,
"step": 1500
},
{
"epoch": 0.6272764063132336,
"grad_norm": 12.82320785522461,
"learning_rate": 3.576549978911852e-06,
"loss": 0.2244,
"step": 1550
},
{
"epoch": 0.6475111290975314,
"grad_norm": 5.0135955810546875,
"learning_rate": 3.523829607760439e-06,
"loss": 0.1991,
"step": 1600
},
{
"epoch": 0.6677458518818292,
"grad_norm": 5.661144733428955,
"learning_rate": 3.471109236609026e-06,
"loss": 0.2301,
"step": 1650
},
{
"epoch": 0.687980574666127,
"grad_norm": 4.150299549102783,
"learning_rate": 3.4183888654576133e-06,
"loss": 0.215,
"step": 1700
},
{
"epoch": 0.7082152974504249,
"grad_norm": 1.3773038387298584,
"learning_rate": 3.3656684943062e-06,
"loss": 0.2415,
"step": 1750
},
{
"epoch": 0.7284500202347228,
"grad_norm": 7.56788969039917,
"learning_rate": 3.312948123154787e-06,
"loss": 0.215,
"step": 1800
},
{
"epoch": 0.7486847430190207,
"grad_norm": 9.647685050964355,
"learning_rate": 3.260227752003374e-06,
"loss": 0.2187,
"step": 1850
},
{
"epoch": 0.7689194658033185,
"grad_norm": 3.352022409439087,
"learning_rate": 3.2075073808519613e-06,
"loss": 0.232,
"step": 1900
},
{
"epoch": 0.7891541885876163,
"grad_norm": 6.138416290283203,
"learning_rate": 3.1547870097005484e-06,
"loss": 0.2067,
"step": 1950
},
{
"epoch": 0.8093889113719142,
"grad_norm": 5.341176509857178,
"learning_rate": 3.1020666385491355e-06,
"loss": 0.2068,
"step": 2000
},
{
"epoch": 0.8093889113719142,
"eval_loss": 0.202407106757164,
"eval_runtime": 99.5295,
"eval_samples_per_second": 41.777,
"eval_steps_per_second": 0.653,
"eval_tag_accuracy": 0.9377104377104377,
"eval_tag_f1": 0.973316054937847,
"eval_tag_f1-CONTINUE": 0.93849441937782,
"eval_tag_f1-FINISH": 0.93849441937782,
"eval_tag_f1-TERMINATE": 0.9369062119366626,
"eval_token_accuracy": 0.973316054937847,
"eval_token_f1": 0.9377104377104377,
"eval_token_f1_negative": 0.93849441937782,
"eval_token_f1_positive": 0.9369062119366626,
"step": 2000
},
{
"epoch": 0.8296236341562121,
"grad_norm": 3.7065298557281494,
"learning_rate": 3.0493462673977226e-06,
"loss": 0.2186,
"step": 2050
},
{
"epoch": 0.8498583569405099,
"grad_norm": 11.173847198486328,
"learning_rate": 2.9966258962463097e-06,
"loss": 0.2298,
"step": 2100
},
{
"epoch": 0.8700930797248078,
"grad_norm": 7.461451053619385,
"learning_rate": 2.943905525094897e-06,
"loss": 0.2171,
"step": 2150
},
{
"epoch": 0.8903278025091056,
"grad_norm": 5.774527549743652,
"learning_rate": 2.891185153943484e-06,
"loss": 0.2008,
"step": 2200
},
{
"epoch": 0.9105625252934035,
"grad_norm": 13.355608940124512,
"learning_rate": 2.8384647827920706e-06,
"loss": 0.1928,
"step": 2250
},
{
"epoch": 0.9307972480777014,
"grad_norm": 5.512803554534912,
"learning_rate": 2.7857444116406586e-06,
"loss": 0.2151,
"step": 2300
},
{
"epoch": 0.9510319708619992,
"grad_norm": 7.355721473693848,
"learning_rate": 2.7330240404892457e-06,
"loss": 0.2313,
"step": 2350
},
{
"epoch": 0.971266693646297,
"grad_norm": 8.145475387573242,
"learning_rate": 2.680303669337833e-06,
"loss": 0.206,
"step": 2400
},
{
"epoch": 0.9915014164305949,
"grad_norm": 3.35742449760437,
"learning_rate": 2.6275832981864195e-06,
"loss": 0.2126,
"step": 2450
},
{
"epoch": 1.0117361392148927,
"grad_norm": 4.693106651306152,
"learning_rate": 2.5748629270350066e-06,
"loss": 0.1958,
"step": 2500
},
{
"epoch": 1.0117361392148927,
"eval_loss": 0.20388753712177277,
"eval_runtime": 90.2103,
"eval_samples_per_second": 46.092,
"eval_steps_per_second": 0.721,
"eval_tag_accuracy": 0.9321789321789322,
"eval_tag_f1": 0.9744642522149032,
"eval_tag_f1-CONTINUE": 0.9338338808071328,
"eval_tag_f1-FINISH": 0.9338338808071328,
"eval_tag_f1-TERMINATE": 0.9304390725209669,
"eval_token_accuracy": 0.9744642522149032,
"eval_token_f1": 0.9321789321789322,
"eval_token_f1_negative": 0.9338338808071328,
"eval_token_f1_positive": 0.9304390725209669,
"step": 2500
},
{
"epoch": 1.0319708619991905,
"grad_norm": 4.066263675689697,
"learning_rate": 2.5221425558835937e-06,
"loss": 0.1705,
"step": 2550
},
{
"epoch": 1.0522055847834886,
"grad_norm": 1.981619954109192,
"learning_rate": 2.4694221847321804e-06,
"loss": 0.2019,
"step": 2600
},
{
"epoch": 1.0724403075677864,
"grad_norm": 17.264829635620117,
"learning_rate": 2.416701813580768e-06,
"loss": 0.1502,
"step": 2650
},
{
"epoch": 1.0926750303520842,
"grad_norm": 8.771439552307129,
"learning_rate": 2.363981442429355e-06,
"loss": 0.1552,
"step": 2700
},
{
"epoch": 1.112909753136382,
"grad_norm": 11.144837379455566,
"learning_rate": 2.311261071277942e-06,
"loss": 0.1537,
"step": 2750
},
{
"epoch": 1.13314447592068,
"grad_norm": 3.879704475402832,
"learning_rate": 2.2585407001265293e-06,
"loss": 0.1816,
"step": 2800
},
{
"epoch": 1.1533791987049777,
"grad_norm": 4.202072620391846,
"learning_rate": 2.2058203289751164e-06,
"loss": 0.1522,
"step": 2850
},
{
"epoch": 1.1736139214892756,
"grad_norm": 19.130678176879883,
"learning_rate": 2.1530999578237035e-06,
"loss": 0.1646,
"step": 2900
},
{
"epoch": 1.1938486442735734,
"grad_norm": 1.3073982000350952,
"learning_rate": 2.10037958667229e-06,
"loss": 0.1594,
"step": 2950
},
{
"epoch": 1.2140833670578712,
"grad_norm": 6.3615851402282715,
"learning_rate": 2.0476592155208773e-06,
"loss": 0.1616,
"step": 3000
},
{
"epoch": 1.2140833670578712,
"eval_loss": 0.21922506392002106,
"eval_runtime": 90.428,
"eval_samples_per_second": 45.981,
"eval_steps_per_second": 0.719,
"eval_tag_accuracy": 0.9391534391534392,
"eval_tag_f1": 0.974827471052257,
"eval_tag_f1-CONTINUE": 0.9403161122906346,
"eval_tag_f1-FINISH": 0.9403161122906346,
"eval_tag_f1-TERMINATE": 0.9379445670836399,
"eval_token_accuracy": 0.974827471052257,
"eval_token_f1": 0.9391534391534392,
"eval_token_f1_negative": 0.9403161122906346,
"eval_token_f1_positive": 0.9379445670836399,
"step": 3000
},
{
"epoch": 1.2343180898421693,
"grad_norm": 5.983785152435303,
"learning_rate": 1.9949388443694644e-06,
"loss": 0.1884,
"step": 3050
},
{
"epoch": 1.254552812626467,
"grad_norm": 7.194103717803955,
"learning_rate": 1.9422184732180515e-06,
"loss": 0.1682,
"step": 3100
},
{
"epoch": 1.274787535410765,
"grad_norm": 8.178751945495605,
"learning_rate": 1.8894981020666386e-06,
"loss": 0.1621,
"step": 3150
},
{
"epoch": 1.2950222581950628,
"grad_norm": 9.569229125976562,
"learning_rate": 1.8367777309152257e-06,
"loss": 0.162,
"step": 3200
},
{
"epoch": 1.3152569809793606,
"grad_norm": 10.708954811096191,
"learning_rate": 1.7840573597638128e-06,
"loss": 0.1437,
"step": 3250
},
{
"epoch": 1.3354917037636584,
"grad_norm": 8.722265243530273,
"learning_rate": 1.7313369886124e-06,
"loss": 0.163,
"step": 3300
},
{
"epoch": 1.3557264265479563,
"grad_norm": 1.2432576417922974,
"learning_rate": 1.6786166174609872e-06,
"loss": 0.1529,
"step": 3350
},
{
"epoch": 1.375961149332254,
"grad_norm": 8.486040115356445,
"learning_rate": 1.6258962463095744e-06,
"loss": 0.1509,
"step": 3400
},
{
"epoch": 1.396195872116552,
"grad_norm": 7.521843910217285,
"learning_rate": 1.5731758751581612e-06,
"loss": 0.176,
"step": 3450
},
{
"epoch": 1.41643059490085,
"grad_norm": 5.913136959075928,
"learning_rate": 1.5204555040067484e-06,
"loss": 0.1635,
"step": 3500
},
{
"epoch": 1.41643059490085,
"eval_loss": 0.21361203491687775,
"eval_runtime": 99.2858,
"eval_samples_per_second": 41.879,
"eval_steps_per_second": 0.655,
"eval_tag_accuracy": 0.9389129389129389,
"eval_tag_f1": 0.9748354538838472,
"eval_tag_f1-CONTINUE": 0.9399243140964996,
"eval_tag_f1-FINISH": 0.9399243140964996,
"eval_tag_f1-TERMINATE": 0.937866927592955,
"eval_token_accuracy": 0.9748354538838472,
"eval_token_f1": 0.9389129389129389,
"eval_token_f1_negative": 0.9399243140964996,
"eval_token_f1_positive": 0.937866927592955,
"step": 3500
},
{
"epoch": 1.4366653176851476,
"grad_norm": 3.9816882610321045,
"learning_rate": 1.4677351328553355e-06,
"loss": 0.1598,
"step": 3550
},
{
"epoch": 1.4569000404694457,
"grad_norm": 11.924986839294434,
"learning_rate": 1.4150147617039226e-06,
"loss": 0.1722,
"step": 3600
},
{
"epoch": 1.4771347632537435,
"grad_norm": 4.420969009399414,
"learning_rate": 1.3622943905525097e-06,
"loss": 0.1408,
"step": 3650
},
{
"epoch": 1.4973694860380413,
"grad_norm": 11.450356483459473,
"learning_rate": 1.3095740194010966e-06,
"loss": 0.167,
"step": 3700
},
{
"epoch": 1.5176042088223392,
"grad_norm": 8.506190299987793,
"learning_rate": 1.2568536482496837e-06,
"loss": 0.1547,
"step": 3750
},
{
"epoch": 1.537838931606637,
"grad_norm": 2.8181309700012207,
"learning_rate": 1.2041332770982708e-06,
"loss": 0.1425,
"step": 3800
},
{
"epoch": 1.5580736543909348,
"grad_norm": 2.5935609340667725,
"learning_rate": 1.151412905946858e-06,
"loss": 0.1547,
"step": 3850
},
{
"epoch": 1.5783083771752326,
"grad_norm": 10.59027099609375,
"learning_rate": 1.098692534795445e-06,
"loss": 0.1495,
"step": 3900
},
{
"epoch": 1.5985430999595307,
"grad_norm": 2.5813493728637695,
"learning_rate": 1.0459721636440321e-06,
"loss": 0.1305,
"step": 3950
},
{
"epoch": 1.6187778227438283,
"grad_norm": 1.844016671180725,
"learning_rate": 9.932517924926192e-07,
"loss": 0.1566,
"step": 4000
},
{
"epoch": 1.6187778227438283,
"eval_loss": 0.22482524812221527,
"eval_runtime": 87.0812,
"eval_samples_per_second": 47.749,
"eval_steps_per_second": 0.746,
"eval_tag_accuracy": 0.9377104377104377,
"eval_tag_f1": 0.9749232650313393,
"eval_tag_f1-CONTINUE": 0.9394718392147698,
"eval_tag_f1-FINISH": 0.9394718392147698,
"eval_tag_f1-TERMINATE": 0.9358434481050285,
"eval_token_accuracy": 0.9749232650313393,
"eval_token_f1": 0.9377104377104377,
"eval_token_f1_negative": 0.9394718392147698,
"eval_token_f1_positive": 0.9358434481050285,
"step": 4000
},
{
"epoch": 1.6390125455281264,
"grad_norm": 9.678681373596191,
"learning_rate": 9.405314213412063e-07,
"loss": 0.1682,
"step": 4050
},
{
"epoch": 1.659247268312424,
"grad_norm": 11.279572486877441,
"learning_rate": 8.878110501897934e-07,
"loss": 0.1655,
"step": 4100
},
{
"epoch": 1.679481991096722,
"grad_norm": 6.408283710479736,
"learning_rate": 8.350906790383805e-07,
"loss": 0.1375,
"step": 4150
},
{
"epoch": 1.6997167138810199,
"grad_norm": 8.360751152038574,
"learning_rate": 7.823703078869676e-07,
"loss": 0.148,
"step": 4200
},
{
"epoch": 1.7199514366653177,
"grad_norm": 9.093498229980469,
"learning_rate": 7.296499367355547e-07,
"loss": 0.1511,
"step": 4250
},
{
"epoch": 1.7401861594496155,
"grad_norm": 8.47696304321289,
"learning_rate": 6.769295655841418e-07,
"loss": 0.183,
"step": 4300
},
{
"epoch": 1.7604208822339134,
"grad_norm": 5.258668422698975,
"learning_rate": 6.242091944327289e-07,
"loss": 0.1583,
"step": 4350
},
{
"epoch": 1.7806556050182114,
"grad_norm": 2.8200266361236572,
"learning_rate": 5.714888232813159e-07,
"loss": 0.1392,
"step": 4400
},
{
"epoch": 1.800890327802509,
"grad_norm": 1.5096231698989868,
"learning_rate": 5.18768452129903e-07,
"loss": 0.1252,
"step": 4450
},
{
"epoch": 1.821125050586807,
"grad_norm": 7.513113975524902,
"learning_rate": 4.660480809784902e-07,
"loss": 0.165,
"step": 4500
},
{
"epoch": 1.821125050586807,
"eval_loss": 0.21320512890815735,
"eval_runtime": 90.0868,
"eval_samples_per_second": 46.156,
"eval_steps_per_second": 0.722,
"eval_tag_accuracy": 0.9403559403559404,
"eval_tag_f1": 0.9753503465214146,
"eval_tag_f1-CONTINUE": 0.9416470588235294,
"eval_tag_f1-FINISH": 0.9416470588235294,
"eval_tag_f1-TERMINATE": 0.9390063944909002,
"eval_token_accuracy": 0.9753503465214146,
"eval_token_f1": 0.9403559403559404,
"eval_token_f1_negative": 0.9416470588235294,
"eval_token_f1_positive": 0.9390063944909002,
"step": 4500
},
{
"epoch": 1.8413597733711047,
"grad_norm": 3.344792604446411,
"learning_rate": 4.1332770982707723e-07,
"loss": 0.1645,
"step": 4550
},
{
"epoch": 1.8615944961554027,
"grad_norm": 1.2586418390274048,
"learning_rate": 3.606073386756643e-07,
"loss": 0.1497,
"step": 4600
},
{
"epoch": 1.8818292189397006,
"grad_norm": 5.412999629974365,
"learning_rate": 3.078869675242514e-07,
"loss": 0.1586,
"step": 4650
},
{
"epoch": 1.9020639417239984,
"grad_norm": 4.021740436553955,
"learning_rate": 2.551665963728385e-07,
"loss": 0.1804,
"step": 4700
},
{
"epoch": 1.9222986645082962,
"grad_norm": 5.421517372131348,
"learning_rate": 2.0244622522142556e-07,
"loss": 0.1426,
"step": 4750
},
{
"epoch": 1.942533387292594,
"grad_norm": 13.003674507141113,
"learning_rate": 1.4972585407001267e-07,
"loss": 0.1798,
"step": 4800
},
{
"epoch": 1.962768110076892,
"grad_norm": 2.8676435947418213,
"learning_rate": 9.700548291859976e-08,
"loss": 0.1675,
"step": 4850
},
{
"epoch": 1.9830028328611897,
"grad_norm": 5.923835277557373,
"learning_rate": 4.4285111767186845e-08,
"loss": 0.1406,
"step": 4900
}
],
"logging_steps": 50,
"max_steps": 4942,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"total_flos": 1.105108458176471e+17,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}