| { |
| "best_global_step": 1000, |
| "best_metric": 0.8257431354055336, |
| "best_model_checkpoint": "/kaggle/working/checkpoints_me5_simple/checkpoint-1000", |
| "epoch": 1.9276759884281582, |
| "eval_steps": 200, |
| "global_step": 1000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.03857280617164899, |
| "grad_norm": 2.771850347518921, |
| "learning_rate": 2.435897435897436e-06, |
| "loss": 0.6886, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.07714561234329798, |
| "grad_norm": 4.430428504943848, |
| "learning_rate": 5e-06, |
| "loss": 0.6607, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.11571841851494696, |
| "grad_norm": 3.110168933868408, |
| "learning_rate": 7.564102564102564e-06, |
| "loss": 0.6083, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.15429122468659595, |
| "grad_norm": 4.398952960968018, |
| "learning_rate": 1.012820512820513e-05, |
| "loss": 0.5112, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.19286403085824494, |
| "grad_norm": 12.522466659545898, |
| "learning_rate": 1.2692307692307693e-05, |
| "loss": 0.512, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.23143683702989393, |
| "grad_norm": 3.5153162479400635, |
| "learning_rate": 1.5256410256410257e-05, |
| "loss": 0.4761, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.2700096432015429, |
| "grad_norm": 8.53997802734375, |
| "learning_rate": 1.7820512820512823e-05, |
| "loss": 0.5021, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.3085824493731919, |
| "grad_norm": 10.39119815826416, |
| "learning_rate": 1.9957173447537473e-05, |
| "loss": 0.479, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.3471552555448409, |
| "grad_norm": 8.681617736816406, |
| "learning_rate": 1.9671663097787296e-05, |
| "loss": 0.4733, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.3857280617164899, |
| "grad_norm": 5.133293151855469, |
| "learning_rate": 1.9386152748037116e-05, |
| "loss": 0.451, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.3857280617164899, |
| "eval_accuracy": 0.7950875288370199, |
| "eval_f1": 0.7941359630394611, |
| "eval_loss": 0.43800750374794006, |
| "eval_runtime": 106.7777, |
| "eval_samples_per_second": 69.013, |
| "eval_steps_per_second": 0.543, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.42430086788813887, |
| "grad_norm": 4.8331804275512695, |
| "learning_rate": 1.910064239828694e-05, |
| "loss": 0.4459, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.46287367405978785, |
| "grad_norm": 5.214622974395752, |
| "learning_rate": 1.8815132048536763e-05, |
| "loss": 0.458, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.5014464802314368, |
| "grad_norm": 6.8793416023254395, |
| "learning_rate": 1.8529621698786583e-05, |
| "loss": 0.4355, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.5400192864030858, |
| "grad_norm": 4.028793811798096, |
| "learning_rate": 1.8244111349036403e-05, |
| "loss": 0.4344, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.5785920925747348, |
| "grad_norm": 3.3970532417297363, |
| "learning_rate": 1.7958600999286226e-05, |
| "loss": 0.4462, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.6171648987463838, |
| "grad_norm": 4.14546012878418, |
| "learning_rate": 1.7673090649536046e-05, |
| "loss": 0.4217, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.6557377049180327, |
| "grad_norm": 5.674211502075195, |
| "learning_rate": 1.738758029978587e-05, |
| "loss": 0.4358, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.6943105110896818, |
| "grad_norm": 4.182572841644287, |
| "learning_rate": 1.7102069950035693e-05, |
| "loss": 0.4225, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.7328833172613307, |
| "grad_norm": 3.3439126014709473, |
| "learning_rate": 1.6816559600285513e-05, |
| "loss": 0.4414, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.7714561234329798, |
| "grad_norm": 7.2115912437438965, |
| "learning_rate": 1.6531049250535333e-05, |
| "loss": 0.4103, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.7714561234329798, |
| "eval_accuracy": 0.8002442665219162, |
| "eval_f1": 0.8001647440873483, |
| "eval_loss": 0.4134698808193207, |
| "eval_runtime": 106.1965, |
| "eval_samples_per_second": 69.39, |
| "eval_steps_per_second": 0.546, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.8100289296046287, |
| "grad_norm": 5.1122050285339355, |
| "learning_rate": 1.6245538900785153e-05, |
| "loss": 0.4194, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.8486017357762777, |
| "grad_norm": 5.695277214050293, |
| "learning_rate": 1.5960028551034976e-05, |
| "loss": 0.4271, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.8871745419479267, |
| "grad_norm": 4.1516523361206055, |
| "learning_rate": 1.56745182012848e-05, |
| "loss": 0.4282, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.9257473481195757, |
| "grad_norm": 4.449565410614014, |
| "learning_rate": 1.538900785153462e-05, |
| "loss": 0.4051, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.9643201542912246, |
| "grad_norm": 6.348763942718506, |
| "learning_rate": 1.5103497501784441e-05, |
| "loss": 0.4096, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.0019286403085825, |
| "grad_norm": 4.346441268920898, |
| "learning_rate": 1.4817987152034263e-05, |
| "loss": 0.3754, |
| "step": 520 |
| }, |
| { |
| "epoch": 1.0405014464802314, |
| "grad_norm": 3.8286328315734863, |
| "learning_rate": 1.4532476802284083e-05, |
| "loss": 0.3659, |
| "step": 540 |
| }, |
| { |
| "epoch": 1.0790742526518804, |
| "grad_norm": 3.8285741806030273, |
| "learning_rate": 1.4246966452533906e-05, |
| "loss": 0.3518, |
| "step": 560 |
| }, |
| { |
| "epoch": 1.1176470588235294, |
| "grad_norm": 8.565245628356934, |
| "learning_rate": 1.3961456102783728e-05, |
| "loss": 0.3832, |
| "step": 580 |
| }, |
| { |
| "epoch": 1.1562198649951785, |
| "grad_norm": 7.982793807983398, |
| "learning_rate": 1.3675945753033548e-05, |
| "loss": 0.3799, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.1562198649951785, |
| "eval_accuracy": 0.8135432216040168, |
| "eval_f1": 0.8131288556339042, |
| "eval_loss": 0.40015241503715515, |
| "eval_runtime": 106.0288, |
| "eval_samples_per_second": 69.5, |
| "eval_steps_per_second": 0.547, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.1947926711668273, |
| "grad_norm": 4.712324619293213, |
| "learning_rate": 1.3390435403283371e-05, |
| "loss": 0.3668, |
| "step": 620 |
| }, |
| { |
| "epoch": 1.2333654773384763, |
| "grad_norm": 5.1233906745910645, |
| "learning_rate": 1.3104925053533191e-05, |
| "loss": 0.3609, |
| "step": 640 |
| }, |
| { |
| "epoch": 1.2719382835101254, |
| "grad_norm": 4.738674640655518, |
| "learning_rate": 1.2819414703783013e-05, |
| "loss": 0.3538, |
| "step": 660 |
| }, |
| { |
| "epoch": 1.3105110896817744, |
| "grad_norm": 4.720370769500732, |
| "learning_rate": 1.2533904354032836e-05, |
| "loss": 0.3591, |
| "step": 680 |
| }, |
| { |
| "epoch": 1.3490838958534233, |
| "grad_norm": 5.274127006530762, |
| "learning_rate": 1.2248394004282656e-05, |
| "loss": 0.3469, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.3876567020250723, |
| "grad_norm": 4.651730537414551, |
| "learning_rate": 1.1962883654532478e-05, |
| "loss": 0.3547, |
| "step": 720 |
| }, |
| { |
| "epoch": 1.4262295081967213, |
| "grad_norm": 3.479980945587158, |
| "learning_rate": 1.1677373304782301e-05, |
| "loss": 0.3647, |
| "step": 740 |
| }, |
| { |
| "epoch": 1.4648023143683702, |
| "grad_norm": 4.897223949432373, |
| "learning_rate": 1.1391862955032121e-05, |
| "loss": 0.3468, |
| "step": 760 |
| }, |
| { |
| "epoch": 1.5033751205400194, |
| "grad_norm": 5.784550666809082, |
| "learning_rate": 1.1106352605281943e-05, |
| "loss": 0.3492, |
| "step": 780 |
| }, |
| { |
| "epoch": 1.5419479267116682, |
| "grad_norm": 3.776372194290161, |
| "learning_rate": 1.0820842255531764e-05, |
| "loss": 0.3695, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.5419479267116682, |
| "eval_accuracy": 0.8222282534943683, |
| "eval_f1": 0.8190341805933294, |
| "eval_loss": 0.40005970001220703, |
| "eval_runtime": 106.1744, |
| "eval_samples_per_second": 69.405, |
| "eval_steps_per_second": 0.546, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.5805207328833173, |
| "grad_norm": 4.813859939575195, |
| "learning_rate": 1.0535331905781586e-05, |
| "loss": 0.3568, |
| "step": 820 |
| }, |
| { |
| "epoch": 1.6190935390549663, |
| "grad_norm": 6.341957092285156, |
| "learning_rate": 1.0249821556031408e-05, |
| "loss": 0.363, |
| "step": 840 |
| }, |
| { |
| "epoch": 1.6576663452266152, |
| "grad_norm": 4.9156036376953125, |
| "learning_rate": 9.96431120628123e-06, |
| "loss": 0.3492, |
| "step": 860 |
| }, |
| { |
| "epoch": 1.6962391513982642, |
| "grad_norm": 3.488044023513794, |
| "learning_rate": 9.678800856531049e-06, |
| "loss": 0.3416, |
| "step": 880 |
| }, |
| { |
| "epoch": 1.7348119575699132, |
| "grad_norm": 4.383437156677246, |
| "learning_rate": 9.39329050678087e-06, |
| "loss": 0.3444, |
| "step": 900 |
| }, |
| { |
| "epoch": 1.773384763741562, |
| "grad_norm": 5.447354793548584, |
| "learning_rate": 9.107780157030694e-06, |
| "loss": 0.3486, |
| "step": 920 |
| }, |
| { |
| "epoch": 1.8119575699132113, |
| "grad_norm": 5.164237022399902, |
| "learning_rate": 8.822269807280514e-06, |
| "loss": 0.339, |
| "step": 940 |
| }, |
| { |
| "epoch": 1.8505303760848602, |
| "grad_norm": 4.903033256530762, |
| "learning_rate": 8.536759457530336e-06, |
| "loss": 0.3496, |
| "step": 960 |
| }, |
| { |
| "epoch": 1.8891031822565092, |
| "grad_norm": 4.962741374969482, |
| "learning_rate": 8.251249107780157e-06, |
| "loss": 0.3563, |
| "step": 980 |
| }, |
| { |
| "epoch": 1.9276759884281582, |
| "grad_norm": 4.129799842834473, |
| "learning_rate": 7.965738758029979e-06, |
| "loss": 0.3528, |
| "step": 1000 |
| }, |
| { |
| "epoch": 1.9276759884281582, |
| "eval_accuracy": 0.8261636585696838, |
| "eval_f1": 0.8257431354055336, |
| "eval_loss": 0.3805844485759735, |
| "eval_runtime": 106.4119, |
| "eval_samples_per_second": 69.25, |
| "eval_steps_per_second": 0.545, |
| "step": 1000 |
| } |
| ], |
| "logging_steps": 20, |
| "max_steps": 1557, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 200, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 64, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|