{
  "best_metric": 0.45181071758270264,
  "best_model_checkpoint": "website-classifier/checkpoint-1800",
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 1800,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.05,
      "grad_norm": 4.491672039031982,
      "learning_rate": 8.333333333333334e-06,
      "loss": 2.7552,
      "step": 30
    },
    {
      "epoch": 0.1,
      "grad_norm": 6.483436107635498,
      "learning_rate": 1.6666666666666667e-05,
      "loss": 2.7259,
      "step": 60
    },
    {
      "epoch": 0.15,
      "grad_norm": 8.029026985168457,
      "learning_rate": 2.4444444444444445e-05,
      "loss": 2.6183,
      "step": 90
    },
    {
      "epoch": 0.2,
      "grad_norm": 10.646072387695312,
      "learning_rate": 3.277777777777778e-05,
      "loss": 2.598,
      "step": 120
    },
    {
      "epoch": 0.25,
      "grad_norm": 10.159834861755371,
      "learning_rate": 4.111111111111111e-05,
      "loss": 2.1777,
      "step": 150
    },
    {
      "epoch": 0.3,
      "grad_norm": 14.836869239807129,
      "learning_rate": 4.9166666666666665e-05,
      "loss": 1.7938,
      "step": 180
    },
    {
      "epoch": 0.35,
      "grad_norm": 10.7587890625,
      "learning_rate": 4.9166666666666665e-05,
      "loss": 1.95,
      "step": 210
    },
    {
      "epoch": 0.4,
      "grad_norm": 31.001468658447266,
      "learning_rate": 4.8240740740740744e-05,
      "loss": 1.7602,
      "step": 240
    },
    {
      "epoch": 0.45,
      "grad_norm": 9.250581741333008,
      "learning_rate": 4.731481481481482e-05,
      "loss": 1.6735,
      "step": 270
    },
    {
      "epoch": 0.5,
      "grad_norm": 14.803194046020508,
      "learning_rate": 4.638888888888889e-05,
      "loss": 1.2047,
      "step": 300
    },
    {
      "epoch": 0.55,
      "grad_norm": 41.04203796386719,
      "learning_rate": 4.546296296296296e-05,
      "loss": 1.4807,
      "step": 330
    },
    {
      "epoch": 0.6,
      "grad_norm": 15.050979614257812,
      "learning_rate": 4.4537037037037036e-05,
      "loss": 1.3084,
      "step": 360
    },
    {
      "epoch": 0.65,
      "grad_norm": 13.59727668762207,
      "learning_rate": 4.3611111111111116e-05,
      "loss": 1.3297,
      "step": 390
    },
    {
      "epoch": 0.7,
      "grad_norm": 6.688912391662598,
      "learning_rate": 4.268518518518519e-05,
      "loss": 1.3379,
      "step": 420
    },
    {
      "epoch": 0.75,
      "grad_norm": 7.42189359664917,
      "learning_rate": 4.1820987654320994e-05,
      "loss": 1.4812,
      "step": 450
    },
    {
      "epoch": 0.8,
      "grad_norm": 6.354011058807373,
      "learning_rate": 4.089506172839506e-05,
      "loss": 1.5017,
      "step": 480
    },
    {
      "epoch": 0.85,
      "grad_norm": 25.94729995727539,
      "learning_rate": 3.996913580246914e-05,
      "loss": 1.1041,
      "step": 510
    },
    {
      "epoch": 0.9,
      "grad_norm": 24.99347686767578,
      "learning_rate": 3.904320987654321e-05,
      "loss": 1.109,
      "step": 540
    },
    {
      "epoch": 0.95,
      "grad_norm": 41.66750717163086,
      "learning_rate": 3.8117283950617286e-05,
      "loss": 1.073,
      "step": 570
    },
    {
      "epoch": 1.0,
      "grad_norm": 6.238738059997559,
      "learning_rate": 3.719135802469136e-05,
      "loss": 1.2524,
      "step": 600
    },
    {
      "epoch": 1.0,
      "eval_accuracy": 0.775,
      "eval_f1_macro": 0.7753958544292094,
      "eval_f1_micro": 0.775,
      "eval_f1_weighted": 0.7753958544292096,
      "eval_loss": 0.9330267310142517,
      "eval_precision_macro": 0.796061342783982,
      "eval_precision_micro": 0.775,
      "eval_precision_weighted": 0.7960613427839818,
      "eval_recall_macro": 0.775,
      "eval_recall_micro": 0.775,
      "eval_recall_weighted": 0.775,
      "eval_runtime": 24.168,
      "eval_samples_per_second": 49.652,
      "eval_steps_per_second": 3.103,
      "step": 600
    },
    {
      "epoch": 1.05,
      "grad_norm": 8.009809494018555,
      "learning_rate": 3.626543209876543e-05,
      "loss": 0.9211,
      "step": 630
    },
    {
      "epoch": 1.1,
      "grad_norm": 17.96474838256836,
      "learning_rate": 3.533950617283951e-05,
      "loss": 1.0106,
      "step": 660
    },
    {
      "epoch": 1.15,
      "grad_norm": 8.952888488769531,
      "learning_rate": 3.441358024691358e-05,
      "loss": 0.8015,
      "step": 690
    },
    {
      "epoch": 1.2,
      "grad_norm": 45.105072021484375,
      "learning_rate": 3.348765432098766e-05,
      "loss": 0.7559,
      "step": 720
    },
    {
      "epoch": 1.25,
      "grad_norm": 7.327918529510498,
      "learning_rate": 3.256172839506173e-05,
      "loss": 1.0116,
      "step": 750
    },
    {
      "epoch": 1.3,
      "grad_norm": 48.14884948730469,
      "learning_rate": 3.16358024691358e-05,
      "loss": 1.5269,
      "step": 780
    },
    {
      "epoch": 1.35,
      "grad_norm": 10.317117691040039,
      "learning_rate": 3.0709876543209876e-05,
      "loss": 0.9947,
      "step": 810
    },
    {
      "epoch": 1.4,
      "grad_norm": 39.09126281738281,
      "learning_rate": 2.9783950617283952e-05,
      "loss": 0.8826,
      "step": 840
    },
    {
      "epoch": 1.45,
      "grad_norm": 7.045171737670898,
      "learning_rate": 2.8858024691358025e-05,
      "loss": 0.8446,
      "step": 870
    },
    {
      "epoch": 1.5,
      "grad_norm": 10.068076133728027,
      "learning_rate": 2.79320987654321e-05,
      "loss": 0.7001,
      "step": 900
    },
    {
      "epoch": 1.55,
      "grad_norm": 15.41380500793457,
      "learning_rate": 2.700617283950617e-05,
      "loss": 0.8341,
      "step": 930
    },
    {
      "epoch": 1.6,
      "grad_norm": 9.957711219787598,
      "learning_rate": 2.6080246913580247e-05,
      "loss": 0.6945,
      "step": 960
    },
    {
      "epoch": 1.65,
      "grad_norm": 4.9840545654296875,
      "learning_rate": 2.5154320987654324e-05,
      "loss": 0.7595,
      "step": 990
    },
    {
      "epoch": 1.7,
      "grad_norm": 14.547338485717773,
      "learning_rate": 2.4228395061728396e-05,
      "loss": 0.4924,
      "step": 1020
    },
    {
      "epoch": 1.75,
      "grad_norm": 0.45237699151039124,
      "learning_rate": 2.3302469135802473e-05,
      "loss": 0.9098,
      "step": 1050
    },
    {
      "epoch": 1.8,
      "grad_norm": 17.84574317932129,
      "learning_rate": 2.2376543209876542e-05,
      "loss": 0.758,
      "step": 1080
    },
    {
      "epoch": 1.85,
      "grad_norm": 3.32464599609375,
      "learning_rate": 2.145061728395062e-05,
      "loss": 0.6431,
      "step": 1110
    },
    {
      "epoch": 1.9,
      "grad_norm": 6.051296234130859,
      "learning_rate": 2.052469135802469e-05,
      "loss": 0.8403,
      "step": 1140
    },
    {
      "epoch": 1.95,
      "grad_norm": 36.67551040649414,
      "learning_rate": 1.9598765432098768e-05,
      "loss": 0.8311,
      "step": 1170
    },
    {
      "epoch": 2.0,
      "grad_norm": 14.538824081420898,
      "learning_rate": 1.867283950617284e-05,
      "loss": 0.6665,
      "step": 1200
    },
    {
      "epoch": 2.0,
      "eval_accuracy": 0.855,
      "eval_f1_macro": 0.8538278779437969,
      "eval_f1_micro": 0.855,
      "eval_f1_weighted": 0.853827877943797,
      "eval_loss": 0.631514310836792,
      "eval_precision_macro": 0.8595148734247968,
      "eval_precision_micro": 0.855,
      "eval_precision_weighted": 0.8595148734247969,
      "eval_recall_macro": 0.8549999999999999,
      "eval_recall_micro": 0.855,
      "eval_recall_weighted": 0.855,
      "eval_runtime": 24.1701,
      "eval_samples_per_second": 49.648,
      "eval_steps_per_second": 3.103,
      "step": 1200
    },
    {
      "epoch": 2.05,
      "grad_norm": 4.500637531280518,
      "learning_rate": 1.7746913580246917e-05,
      "loss": 0.4377,
      "step": 1230
    },
    {
      "epoch": 2.1,
      "grad_norm": 0.43099209666252136,
      "learning_rate": 1.682098765432099e-05,
      "loss": 0.4461,
      "step": 1260
    },
    {
      "epoch": 2.15,
      "grad_norm": 15.149191856384277,
      "learning_rate": 1.5895061728395063e-05,
      "loss": 0.5035,
      "step": 1290
    },
    {
      "epoch": 2.2,
      "grad_norm": 20.928913116455078,
      "learning_rate": 1.4969135802469136e-05,
      "loss": 0.5992,
      "step": 1320
    },
    {
      "epoch": 2.25,
      "grad_norm": 12.046993255615234,
      "learning_rate": 1.4043209876543212e-05,
      "loss": 0.4523,
      "step": 1350
    },
    {
      "epoch": 2.3,
      "grad_norm": 0.5978269577026367,
      "learning_rate": 1.3117283950617285e-05,
      "loss": 0.483,
      "step": 1380
    },
    {
      "epoch": 2.35,
      "grad_norm": 7.10316801071167,
      "learning_rate": 1.219135802469136e-05,
      "loss": 0.4932,
      "step": 1410
    },
    {
      "epoch": 2.4,
      "grad_norm": 10.279694557189941,
      "learning_rate": 1.1265432098765432e-05,
      "loss": 0.3396,
      "step": 1440
    },
    {
      "epoch": 2.45,
      "grad_norm": 0.13266117870807648,
      "learning_rate": 1.0339506172839507e-05,
      "loss": 0.4377,
      "step": 1470
    },
    {
      "epoch": 2.5,
      "grad_norm": 24.841278076171875,
      "learning_rate": 9.413580246913581e-06,
      "loss": 0.4452,
      "step": 1500
    },
    {
      "epoch": 2.55,
      "grad_norm": 1.116528034210205,
      "learning_rate": 8.487654320987654e-06,
      "loss": 0.4,
      "step": 1530
    },
    {
      "epoch": 2.6,
      "grad_norm": 38.56851577758789,
      "learning_rate": 7.561728395061729e-06,
      "loss": 0.3195,
      "step": 1560
    },
    {
      "epoch": 2.65,
      "grad_norm": 12.757609367370605,
      "learning_rate": 6.635802469135803e-06,
      "loss": 0.4735,
      "step": 1590
    },
    {
      "epoch": 2.7,
      "grad_norm": 0.18463416397571564,
      "learning_rate": 5.7098765432098764e-06,
      "loss": 0.3327,
      "step": 1620
    },
    {
      "epoch": 2.75,
      "grad_norm": 1.0354253053665161,
      "learning_rate": 4.78395061728395e-06,
      "loss": 0.2695,
      "step": 1650
    },
    {
      "epoch": 2.8,
      "grad_norm": 6.2124433517456055,
      "learning_rate": 3.858024691358025e-06,
      "loss": 0.4273,
      "step": 1680
    },
    {
      "epoch": 2.85,
      "grad_norm": 51.33966827392578,
      "learning_rate": 2.932098765432099e-06,
      "loss": 0.3228,
      "step": 1710
    },
    {
      "epoch": 2.9,
      "grad_norm": 0.17042000591754913,
      "learning_rate": 2.0061728395061727e-06,
      "loss": 0.45,
      "step": 1740
    },
    {
      "epoch": 2.95,
      "grad_norm": 12.142667770385742,
      "learning_rate": 1.0802469135802469e-06,
      "loss": 0.3076,
      "step": 1770
    },
    {
      "epoch": 3.0,
      "grad_norm": 0.21961359679698944,
      "learning_rate": 1.54320987654321e-07,
      "loss": 0.3804,
      "step": 1800
    },
    {
      "epoch": 3.0,
      "eval_accuracy": 0.9108333333333334,
      "eval_f1_macro": 0.9103583954158208,
      "eval_f1_micro": 0.9108333333333334,
      "eval_f1_weighted": 0.9103583954158205,
      "eval_loss": 0.45181071758270264,
      "eval_precision_macro": 0.9123282026272069,
      "eval_precision_micro": 0.9108333333333334,
      "eval_precision_weighted": 0.9123282026272069,
      "eval_recall_macro": 0.9108333333333334,
      "eval_recall_micro": 0.9108333333333334,
      "eval_recall_weighted": 0.9108333333333334,
      "eval_runtime": 25.1673,
      "eval_samples_per_second": 47.681,
      "eval_steps_per_second": 2.98,
      "step": 1800
    }
  ],
  "logging_steps": 30,
  "max_steps": 1800,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "total_flos": 1.34204916842496e+16,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}