chat-moderation-router / trainer_state.json
matterai's picture
model
48a6843 verified
{
"best_global_step": 1024,
"best_metric": 0.9530514004567643,
"best_model_checkpoint": "checkpoints/checkpoint-1024",
"epoch": 0.37730287398673545,
"eval_steps": 128,
"global_step": 1024,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0036845983787767134,
"grad_norm": 8.914146423339844,
"learning_rate": 1.3499999999999998e-05,
"loss": 0.6841,
"step": 10
},
{
"epoch": 0.007369196757553427,
"grad_norm": 6.666998386383057,
"learning_rate": 2.8499999999999998e-05,
"loss": 0.517,
"step": 20
},
{
"epoch": 0.01105379513633014,
"grad_norm": 5.241199016571045,
"learning_rate": 4.3499999999999993e-05,
"loss": 0.4514,
"step": 30
},
{
"epoch": 0.014738393515106854,
"grad_norm": 5.938083171844482,
"learning_rate": 5.85e-05,
"loss": 0.4963,
"step": 40
},
{
"epoch": 0.018422991893883568,
"grad_norm": 3.599522590637207,
"learning_rate": 7.35e-05,
"loss": 0.4226,
"step": 50
},
{
"epoch": 0.02210759027266028,
"grad_norm": 4.397755146026611,
"learning_rate": 8.849999999999998e-05,
"loss": 0.4117,
"step": 60
},
{
"epoch": 0.025792188651436992,
"grad_norm": 11.364407539367676,
"learning_rate": 0.00010349999999999998,
"loss": 0.3636,
"step": 70
},
{
"epoch": 0.029476787030213707,
"grad_norm": 3.12703800201416,
"learning_rate": 0.0001185,
"loss": 0.3726,
"step": 80
},
{
"epoch": 0.03316138540899042,
"grad_norm": 2.9937469959259033,
"learning_rate": 0.0001335,
"loss": 0.3421,
"step": 90
},
{
"epoch": 0.036845983787767135,
"grad_norm": 2.8763785362243652,
"learning_rate": 0.00014849999999999998,
"loss": 0.4045,
"step": 100
},
{
"epoch": 0.040530582166543844,
"grad_norm": 4.576082706451416,
"learning_rate": 0.0001635,
"loss": 0.3629,
"step": 110
},
{
"epoch": 0.04421518054532056,
"grad_norm": 4.948099613189697,
"learning_rate": 0.00017849999999999997,
"loss": 0.394,
"step": 120
},
{
"epoch": 0.04716285924834193,
"eval_has_safety_issue_roc_auc": 0.918227021974626,
"eval_loss": 0.40556982159614563,
"eval_mean_roc_auc": 0.918227021974626,
"eval_runtime": 489.5112,
"eval_samples_per_second": 76.035,
"eval_steps_per_second": 2.378,
"step": 128
},
{
"epoch": 0.047899778924097275,
"grad_norm": 2.6524598598480225,
"learning_rate": 0.0001935,
"loss": 0.3734,
"step": 130
},
{
"epoch": 0.051584377302873984,
"grad_norm": 4.847095966339111,
"learning_rate": 0.00020849999999999997,
"loss": 0.3707,
"step": 140
},
{
"epoch": 0.0552689756816507,
"grad_norm": 2.1986277103424072,
"learning_rate": 0.00022349999999999998,
"loss": 0.3914,
"step": 150
},
{
"epoch": 0.058953574060427415,
"grad_norm": 1.9227941036224365,
"learning_rate": 0.0002385,
"loss": 0.3834,
"step": 160
},
{
"epoch": 0.06263817243920412,
"grad_norm": 1.3873552083969116,
"learning_rate": 0.0002535,
"loss": 0.2576,
"step": 170
},
{
"epoch": 0.06632277081798084,
"grad_norm": 4.532320022583008,
"learning_rate": 0.00026849999999999997,
"loss": 0.4172,
"step": 180
},
{
"epoch": 0.07000736919675755,
"grad_norm": 2.5069925785064697,
"learning_rate": 0.00028349999999999995,
"loss": 0.4712,
"step": 190
},
{
"epoch": 0.07369196757553427,
"grad_norm": 1.9502817392349243,
"learning_rate": 0.0002985,
"loss": 0.3573,
"step": 200
},
{
"epoch": 0.07737656595431099,
"grad_norm": 6.129453182220459,
"learning_rate": 0.00029994004614753843,
"loss": 0.2982,
"step": 210
},
{
"epoch": 0.08106116433308769,
"grad_norm": 4.473495006561279,
"learning_rate": 0.00029973285979173177,
"loss": 0.4217,
"step": 220
},
{
"epoch": 0.0847457627118644,
"grad_norm": 1.3703476190567017,
"learning_rate": 0.0002993779051807778,
"loss": 0.3758,
"step": 230
},
{
"epoch": 0.08843036109064112,
"grad_norm": 1.4541234970092773,
"learning_rate": 0.00029887553261202354,
"loss": 0.3863,
"step": 240
},
{
"epoch": 0.09211495946941783,
"grad_norm": 1.872299075126648,
"learning_rate": 0.00029822623786654207,
"loss": 0.3525,
"step": 250
},
{
"epoch": 0.09432571849668386,
"eval_has_safety_issue_roc_auc": 0.9362037324386603,
"eval_loss": 0.33471250534057617,
"eval_mean_roc_auc": 0.9362037324386603,
"eval_runtime": 487.3367,
"eval_samples_per_second": 76.374,
"eval_steps_per_second": 2.388,
"step": 256
},
{
"epoch": 0.09579955784819455,
"grad_norm": 2.644226312637329,
"learning_rate": 0.0002974306617198568,
"loss": 0.3905,
"step": 260
},
{
"epoch": 0.09948415622697127,
"grad_norm": 2.3409223556518555,
"learning_rate": 0.0002964895893095737,
"loss": 0.3515,
"step": 270
},
{
"epoch": 0.10316875460574797,
"grad_norm": 2.0618879795074463,
"learning_rate": 0.00029540394936054435,
"loss": 0.3592,
"step": 280
},
{
"epoch": 0.10685335298452468,
"grad_norm": 0.9815084338188171,
"learning_rate": 0.00029417481326832776,
"loss": 0.3944,
"step": 290
},
{
"epoch": 0.1105379513633014,
"grad_norm": 4.9908013343811035,
"learning_rate": 0.00029280339404185146,
"loss": 0.4092,
"step": 300
},
{
"epoch": 0.11422254974207811,
"grad_norm": 0.6844871640205383,
"learning_rate": 0.00029129104510631853,
"loss": 0.4466,
"step": 310
},
{
"epoch": 0.11790714812085483,
"grad_norm": 1.8045854568481445,
"learning_rate": 0.00028963925896754035,
"loss": 0.3698,
"step": 320
},
{
"epoch": 0.12159174649963155,
"grad_norm": 4.337238788604736,
"learning_rate": 0.00028784966573901314,
"loss": 0.3892,
"step": 330
},
{
"epoch": 0.12527634487840825,
"grad_norm": 0.8228144645690918,
"learning_rate": 0.0002859240315331935,
"loss": 0.417,
"step": 340
},
{
"epoch": 0.12896094325718496,
"grad_norm": 1.259918212890625,
"learning_rate": 0.00028386425671855764,
"loss": 0.3393,
"step": 350
},
{
"epoch": 0.13264554163596168,
"grad_norm": 4.035144805908203,
"learning_rate": 0.00028167237404416826,
"loss": 0.2893,
"step": 360
},
{
"epoch": 0.1363301400147384,
"grad_norm": 1.1749995946884155,
"learning_rate": 0.0002793505466335956,
"loss": 0.4133,
"step": 370
},
{
"epoch": 0.1400147383935151,
"grad_norm": 1.3367811441421509,
"learning_rate": 0.0002769010658501763,
"loss": 0.3775,
"step": 380
},
{
"epoch": 0.1414885777450258,
"eval_has_safety_issue_roc_auc": 0.9363893661137548,
"eval_loss": 0.3488316535949707,
"eval_mean_roc_auc": 0.9363893661137548,
"eval_runtime": 487.0696,
"eval_samples_per_second": 76.416,
"eval_steps_per_second": 2.39,
"step": 384
},
{
"epoch": 0.14369933677229182,
"grad_norm": 1.7076958417892456,
"learning_rate": 0.00027432634903571426,
"loss": 0.4061,
"step": 390
},
{
"epoch": 0.14738393515106854,
"grad_norm": 1.1982675790786743,
"learning_rate": 0.000271628937124856,
"loss": 0.3591,
"step": 400
},
{
"epoch": 0.15106853352984526,
"grad_norm": 0.9970278143882751,
"learning_rate": 0.000268811492137495,
"loss": 0.3556,
"step": 410
},
{
"epoch": 0.15475313190862197,
"grad_norm": 3.2783939838409424,
"learning_rate": 0.0002658767945516796,
"loss": 0.3119,
"step": 420
},
{
"epoch": 0.1584377302873987,
"grad_norm": 0.9100527763366699,
"learning_rate": 0.0002628277405596167,
"loss": 0.3235,
"step": 430
},
{
"epoch": 0.16212232866617537,
"grad_norm": 2.0034751892089844,
"learning_rate": 0.0002596673392094796,
"loss": 0.3409,
"step": 440
},
{
"epoch": 0.1658069270449521,
"grad_norm": 2.4519588947296143,
"learning_rate": 0.00025639870943584104,
"loss": 0.3431,
"step": 450
},
{
"epoch": 0.1694915254237288,
"grad_norm": 3.1098690032958984,
"learning_rate": 0.0002530250769816612,
"loss": 0.3966,
"step": 460
},
{
"epoch": 0.17317612380250552,
"grad_norm": 1.0801705121994019,
"learning_rate": 0.0002495497712148688,
"loss": 0.3026,
"step": 470
},
{
"epoch": 0.17686072218128224,
"grad_norm": 2.5059814453125,
"learning_rate": 0.00024597622184267673,
"loss": 0.3053,
"step": 480
},
{
"epoch": 0.18054532056005895,
"grad_norm": 4.749985694885254,
"learning_rate": 0.00024230795552687568,
"loss": 0.3585,
"step": 490
},
{
"epoch": 0.18422991893883567,
"grad_norm": 2.033231735229492,
"learning_rate": 0.00023854859240344416,
"loss": 0.3503,
"step": 500
},
{
"epoch": 0.18791451731761238,
"grad_norm": 1.2591739892959595,
"learning_rate": 0.00023470184250991156,
"loss": 0.3347,
"step": 510
},
{
"epoch": 0.18865143699336773,
"eval_has_safety_issue_roc_auc": 0.941105104803345,
"eval_loss": 0.3277105987071991,
"eval_mean_roc_auc": 0.941105104803345,
"eval_runtime": 487.352,
"eval_samples_per_second": 76.372,
"eval_steps_per_second": 2.388,
"step": 512
},
{
"epoch": 0.1915991156963891,
"grad_norm": 3.9085240364074707,
"learning_rate": 0.00023077150212399899,
"loss": 0.3273,
"step": 520
},
{
"epoch": 0.19528371407516582,
"grad_norm": 0.7869892120361328,
"learning_rate": 0.00022676145001715174,
"loss": 0.3515,
"step": 530
},
{
"epoch": 0.19896831245394253,
"grad_norm": 0.9839105606079102,
"learning_rate": 0.00022267564362665968,
"loss": 0.3073,
"step": 540
},
{
"epoch": 0.20265291083271925,
"grad_norm": 0.618803083896637,
"learning_rate": 0.0002185181151501449,
"loss": 0.3734,
"step": 550
},
{
"epoch": 0.20633750921149593,
"grad_norm": 1.2680476903915405,
"learning_rate": 0.00021429296756626925,
"loss": 0.3323,
"step": 560
},
{
"epoch": 0.21002210759027265,
"grad_norm": 1.8254988193511963,
"learning_rate": 0.00021000437058558968,
"loss": 0.35,
"step": 570
},
{
"epoch": 0.21370670596904937,
"grad_norm": 1.5576568841934204,
"learning_rate": 0.00020565655653555763,
"loss": 0.2629,
"step": 580
},
{
"epoch": 0.21739130434782608,
"grad_norm": 0.9699956178665161,
"learning_rate": 0.0002012538161837225,
"loss": 0.2764,
"step": 590
},
{
"epoch": 0.2210759027266028,
"grad_norm": 0.5201606154441833,
"learning_rate": 0.00019680049450326222,
"loss": 0.267,
"step": 600
},
{
"epoch": 0.2247605011053795,
"grad_norm": 0.876549243927002,
"learning_rate": 0.00019230098638501938,
"loss": 0.3421,
"step": 610
},
{
"epoch": 0.22844509948415623,
"grad_norm": 0.6652014255523682,
"learning_rate": 0.00018775973230027458,
"loss": 0.3823,
"step": 620
},
{
"epoch": 0.23212969786293294,
"grad_norm": 1.2416579723358154,
"learning_rate": 0.00018318121391853708,
"loss": 0.2984,
"step": 630
},
{
"epoch": 0.23581429624170966,
"grad_norm": 0.8337903618812561,
"learning_rate": 0.00017856994968467845,
"loss": 0.2969,
"step": 640
},
{
"epoch": 0.23581429624170966,
"eval_has_safety_issue_roc_auc": 0.9458881980708542,
"eval_loss": 0.31154128909111023,
"eval_mean_roc_auc": 0.9458881980708542,
"eval_runtime": 487.6344,
"eval_samples_per_second": 76.328,
"eval_steps_per_second": 2.387,
"step": 640
},
{
"epoch": 0.23949889462048637,
"grad_norm": 0.8940573930740356,
"learning_rate": 0.00017393049035977292,
"loss": 0.3488,
"step": 650
},
{
"epoch": 0.2431834929992631,
"grad_norm": 1.5384459495544434,
"learning_rate": 0.00016926741453004545,
"loss": 0.3101,
"step": 660
},
{
"epoch": 0.2468680913780398,
"grad_norm": 1.3179084062576294,
"learning_rate": 0.00016458532408835993,
"loss": 0.2165,
"step": 670
},
{
"epoch": 0.2505526897568165,
"grad_norm": 1.6649147272109985,
"learning_rate": 0.00015988883969270665,
"loss": 0.2676,
"step": 680
},
{
"epoch": 0.2542372881355932,
"grad_norm": 1.43660569190979,
"learning_rate": 0.00015518259620617085,
"loss": 0.3425,
"step": 690
},
{
"epoch": 0.2579218865143699,
"grad_norm": 1.0876473188400269,
"learning_rate": 0.00015047123812288193,
"loss": 0.3139,
"step": 700
},
{
"epoch": 0.26160648489314664,
"grad_norm": 1.0270347595214844,
"learning_rate": 0.000145759414984459,
"loss": 0.3059,
"step": 710
},
{
"epoch": 0.26529108327192336,
"grad_norm": 0.6378604173660278,
"learning_rate": 0.00014105177679147446,
"loss": 0.2809,
"step": 720
},
{
"epoch": 0.26897568165070007,
"grad_norm": 0.8198941349983215,
"learning_rate": 0.00013635296941446449,
"loss": 0.3358,
"step": 730
},
{
"epoch": 0.2726602800294768,
"grad_norm": 0.7575820684432983,
"learning_rate": 0.00013166763000901655,
"loss": 0.3009,
"step": 740
},
{
"epoch": 0.2763448784082535,
"grad_norm": 1.4956635236740112,
"learning_rate": 0.00012700038243945594,
"loss": 0.3287,
"step": 750
},
{
"epoch": 0.2800294767870302,
"grad_norm": 0.945459246635437,
"learning_rate": 0.00012235583271565003,
"loss": 0.3243,
"step": 760
},
{
"epoch": 0.2829771554900516,
"eval_has_safety_issue_roc_auc": 0.9507512976389587,
"eval_loss": 0.2935161590576172,
"eval_mean_roc_auc": 0.9507512976389587,
"eval_runtime": 487.4239,
"eval_samples_per_second": 76.361,
"eval_steps_per_second": 2.388,
"step": 768
},
{
"epoch": 0.28371407516580693,
"grad_norm": 1.6803070306777954,
"learning_rate": 0.00011773856444743296,
"loss": 0.2998,
"step": 770
},
{
"epoch": 0.28739867354458365,
"grad_norm": 0.8402862548828125,
"learning_rate": 0.00011315313432113607,
"loss": 0.2805,
"step": 780
},
{
"epoch": 0.29108327192336036,
"grad_norm": 0.950634241104126,
"learning_rate": 0.00010860406760268816,
"loss": 0.3111,
"step": 790
},
{
"epoch": 0.2947678703021371,
"grad_norm": 0.8810547590255737,
"learning_rate": 0.00010409585367172489,
"loss": 0.2858,
"step": 800
},
{
"epoch": 0.2984524686809138,
"grad_norm": 0.8042846322059631,
"learning_rate": 9.96329415911129e-05,
"loss": 0.3156,
"step": 810
},
{
"epoch": 0.3021370670596905,
"grad_norm": 0.6347734332084656,
"learning_rate": 9.521973571626184e-05,
"loss": 0.347,
"step": 820
},
{
"epoch": 0.3058216654384672,
"grad_norm": 1.017738938331604,
"learning_rate": 9.086059134855733e-05,
"loss": 0.2554,
"step": 830
},
{
"epoch": 0.30950626381724394,
"grad_norm": 0.5769331455230713,
"learning_rate": 8.655981043720452e-05,
"loss": 0.2573,
"step": 840
},
{
"epoch": 0.31319086219602066,
"grad_norm": 1.6331214904785156,
"learning_rate": 8.232163733372322e-05,
"loss": 0.3412,
"step": 850
},
{
"epoch": 0.3168754605747974,
"grad_norm": 1.2017263174057007,
"learning_rate": 7.815025460328584e-05,
"loss": 0.245,
"step": 860
},
{
"epoch": 0.32056005895357403,
"grad_norm": 0.7730036377906799,
"learning_rate": 7.404977889703008e-05,
"loss": 0.3052,
"step": 870
},
{
"epoch": 0.32424465733235075,
"grad_norm": 0.643957793712616,
"learning_rate": 7.00242568894217e-05,
"loss": 0.2292,
"step": 880
},
{
"epoch": 0.32792925571112747,
"grad_norm": 1.9052222967147827,
"learning_rate": 6.607766128467497e-05,
"loss": 0.3792,
"step": 890
},
{
"epoch": 0.3301400147383935,
"eval_has_safety_issue_roc_auc": 0.9519071421594945,
"eval_loss": 0.29572874307632446,
"eval_mean_roc_auc": 0.9519071421594945,
"eval_runtime": 487.4395,
"eval_samples_per_second": 76.358,
"eval_steps_per_second": 2.388,
"step": 896
},
{
"epoch": 0.3316138540899042,
"grad_norm": 0.7808663249015808,
"learning_rate": 6.221388689617348e-05,
"loss": 0.2955,
"step": 900
},
{
"epoch": 0.3352984524686809,
"grad_norm": 0.7185715436935425,
"learning_rate": 5.843674680275963e-05,
"loss": 0.2523,
"step": 910
},
{
"epoch": 0.3389830508474576,
"grad_norm": 1.606952428817749,
"learning_rate": 5.474996858568593e-05,
"loss": 0.3108,
"step": 920
},
{
"epoch": 0.3426676492262343,
"grad_norm": 0.7948682308197021,
"learning_rate": 5.115719064994245e-05,
"loss": 0.2796,
"step": 930
},
{
"epoch": 0.34635224760501104,
"grad_norm": 0.7030922770500183,
"learning_rate": 4.766195863359054e-05,
"loss": 0.2397,
"step": 940
},
{
"epoch": 0.35003684598378776,
"grad_norm": 1.6789747476577759,
"learning_rate": 4.426772190864578e-05,
"loss": 0.2918,
"step": 950
},
{
"epoch": 0.3537214443625645,
"grad_norm": 1.2274649143218994,
"learning_rate": 4.0977830176964584e-05,
"loss": 0.2523,
"step": 960
},
{
"epoch": 0.3574060427413412,
"grad_norm": 0.8976671695709229,
"learning_rate": 3.77955301644926e-05,
"loss": 0.2475,
"step": 970
},
{
"epoch": 0.3610906411201179,
"grad_norm": 1.8330504894256592,
"learning_rate": 3.472396241713854e-05,
"loss": 0.319,
"step": 980
},
{
"epoch": 0.3647752394988946,
"grad_norm": 0.5846236944198608,
"learning_rate": 3.1766158201434e-05,
"loss": 0.2242,
"step": 990
},
{
"epoch": 0.36845983787767134,
"grad_norm": 1.2076009511947632,
"learning_rate": 2.8925036513039986e-05,
"loss": 0.306,
"step": 1000
},
{
"epoch": 0.37214443625644805,
"grad_norm": 0.7948538064956665,
"learning_rate": 2.620340119605006e-05,
"loss": 0.2373,
"step": 1010
},
{
"epoch": 0.37582903463522477,
"grad_norm": 2.274285316467285,
"learning_rate": 2.360393817593514e-05,
"loss": 0.2993,
"step": 1020
},
{
"epoch": 0.37730287398673545,
"eval_has_safety_issue_roc_auc": 0.9530514004567643,
"eval_loss": 0.2790428400039673,
"eval_mean_roc_auc": 0.9530514004567643,
"eval_runtime": 487.3953,
"eval_samples_per_second": 76.365,
"eval_steps_per_second": 2.388,
"step": 1024
}
],
"logging_steps": 10,
"max_steps": 3200,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 128,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 4.466350275271066e+16,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}