{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 625,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.02,
      "grad_norm": 0.5752072334289551,
      "learning_rate": 0.0001993517017828201,
      "loss": 1.6434,
      "step": 10
    },
    {
      "epoch": 0.03,
      "grad_norm": 0.7842408418655396,
      "learning_rate": 0.0001961102106969206,
      "loss": 1.2847,
      "step": 20
    },
    {
      "epoch": 0.05,
      "grad_norm": 0.4593060612678528,
      "learning_rate": 0.0001928687196110211,
      "loss": 1.1227,
      "step": 30
    },
    {
      "epoch": 0.06,
      "grad_norm": 0.45610150694847107,
      "learning_rate": 0.00018962722852512156,
      "loss": 1.0359,
      "step": 40
    },
    {
      "epoch": 0.08,
      "grad_norm": 0.4743576645851135,
      "learning_rate": 0.00018638573743922206,
      "loss": 0.992,
      "step": 50
    },
    {
      "epoch": 0.1,
      "grad_norm": 0.5059588551521301,
      "learning_rate": 0.00018314424635332255,
      "loss": 0.9782,
      "step": 60
    },
    {
      "epoch": 0.11,
      "grad_norm": 0.5296839475631714,
      "learning_rate": 0.00017990275526742302,
      "loss": 0.8997,
      "step": 70
    },
    {
      "epoch": 0.13,
      "grad_norm": 0.6169719099998474,
      "learning_rate": 0.00017666126418152352,
      "loss": 0.887,
      "step": 80
    },
    {
      "epoch": 0.14,
      "grad_norm": 0.5174543261528015,
      "learning_rate": 0.00017341977309562402,
      "loss": 0.8388,
      "step": 90
    },
    {
      "epoch": 0.16,
      "grad_norm": 0.5253883600234985,
      "learning_rate": 0.0001701782820097245,
      "loss": 0.875,
      "step": 100
    },
    {
      "epoch": 0.18,
      "grad_norm": 0.5177808403968811,
      "learning_rate": 0.00016693679092382496,
      "loss": 0.8193,
      "step": 110
    },
    {
      "epoch": 0.19,
      "grad_norm": 0.50547856092453,
      "learning_rate": 0.00016369529983792545,
      "loss": 0.8444,
      "step": 120
    },
    {
      "epoch": 0.21,
      "grad_norm": 0.5066828727722168,
      "learning_rate": 0.00016045380875202592,
      "loss": 0.857,
      "step": 130
    },
    {
      "epoch": 0.22,
      "grad_norm": 0.4495781362056732,
      "learning_rate": 0.00015721231766612642,
      "loss": 0.7897,
      "step": 140
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.5362279415130615,
      "learning_rate": 0.00015429497568881685,
      "loss": 0.7992,
      "step": 150
    },
    {
      "epoch": 0.26,
      "grad_norm": 0.5713534951210022,
      "learning_rate": 0.00015105348460291734,
      "loss": 0.8446,
      "step": 160
    },
    {
      "epoch": 0.27,
      "grad_norm": 0.4843427240848541,
      "learning_rate": 0.00014781199351701784,
      "loss": 0.7906,
      "step": 170
    },
    {
      "epoch": 0.29,
      "grad_norm": 0.4727051556110382,
      "learning_rate": 0.0001445705024311183,
      "loss": 0.8281,
      "step": 180
    },
    {
      "epoch": 0.3,
      "grad_norm": 0.4981895685195923,
      "learning_rate": 0.0001413290113452188,
      "loss": 0.7966,
      "step": 190
    },
    {
      "epoch": 0.32,
      "grad_norm": 0.4925459027290344,
      "learning_rate": 0.0001380875202593193,
      "loss": 0.7889,
      "step": 200
    },
    {
      "epoch": 0.34,
      "grad_norm": 0.4470859467983246,
      "learning_rate": 0.00013484602917341977,
      "loss": 0.7588,
      "step": 210
    },
    {
      "epoch": 0.35,
      "grad_norm": 0.46052438020706177,
      "learning_rate": 0.00013160453808752027,
      "loss": 0.7863,
      "step": 220
    },
    {
      "epoch": 0.37,
      "grad_norm": 0.4708721935749054,
      "learning_rate": 0.00012836304700162077,
      "loss": 0.8244,
      "step": 230
    },
    {
      "epoch": 0.38,
      "grad_norm": 0.4670282304286957,
      "learning_rate": 0.00012512155591572124,
      "loss": 0.7391,
      "step": 240
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.4596056342124939,
      "learning_rate": 0.00012188006482982173,
      "loss": 0.8057,
      "step": 250
    },
    {
      "epoch": 0.42,
      "grad_norm": 0.3906649053096771,
      "learning_rate": 0.0001186385737439222,
      "loss": 0.7594,
      "step": 260
    },
    {
      "epoch": 0.43,
      "grad_norm": 0.5942690372467041,
      "learning_rate": 0.0001153970826580227,
      "loss": 0.7615,
      "step": 270
    },
    {
      "epoch": 0.45,
      "grad_norm": 0.40805208683013916,
      "learning_rate": 0.00011215559157212318,
      "loss": 0.7607,
      "step": 280
    },
    {
      "epoch": 0.46,
      "grad_norm": 0.5320433974266052,
      "learning_rate": 0.00010891410048622365,
      "loss": 0.7666,
      "step": 290
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.4362870752811432,
      "learning_rate": 0.00010567260940032415,
      "loss": 0.7683,
      "step": 300
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.5106446146965027,
      "learning_rate": 0.00010243111831442465,
      "loss": 0.7777,
      "step": 310
    },
    {
      "epoch": 0.51,
      "grad_norm": 0.506986677646637,
      "learning_rate": 9.918962722852513e-05,
      "loss": 0.7637,
      "step": 320
    },
    {
      "epoch": 0.53,
      "grad_norm": 0.5665115118026733,
      "learning_rate": 9.594813614262561e-05,
      "loss": 0.7497,
      "step": 330
    },
    {
      "epoch": 0.54,
      "grad_norm": 0.4510403275489807,
      "learning_rate": 9.27066450567261e-05,
      "loss": 0.7644,
      "step": 340
    },
    {
      "epoch": 0.56,
      "grad_norm": 0.5009343028068542,
      "learning_rate": 8.946515397082659e-05,
      "loss": 0.7766,
      "step": 350
    },
    {
      "epoch": 0.58,
      "grad_norm": 0.4533093273639679,
      "learning_rate": 8.622366288492708e-05,
      "loss": 0.7579,
      "step": 360
    },
    {
      "epoch": 0.59,
      "grad_norm": 0.4576455056667328,
      "learning_rate": 8.298217179902756e-05,
      "loss": 0.7731,
      "step": 370
    },
    {
      "epoch": 0.61,
      "grad_norm": 0.47114574909210205,
      "learning_rate": 7.974068071312804e-05,
      "loss": 0.7612,
      "step": 380
    },
    {
      "epoch": 0.62,
      "grad_norm": 0.5052211284637451,
      "learning_rate": 7.649918962722853e-05,
      "loss": 0.7529,
      "step": 390
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.4952121675014496,
      "learning_rate": 7.325769854132901e-05,
      "loss": 0.7645,
      "step": 400
    },
    {
      "epoch": 0.66,
      "grad_norm": 0.5584261417388916,
      "learning_rate": 7.00162074554295e-05,
      "loss": 0.7859,
      "step": 410
    },
    {
      "epoch": 0.67,
      "grad_norm": 0.5271150469779968,
      "learning_rate": 6.677471636952999e-05,
      "loss": 0.7662,
      "step": 420
    },
    {
      "epoch": 0.69,
      "grad_norm": 0.6051105260848999,
      "learning_rate": 6.353322528363047e-05,
      "loss": 0.7217,
      "step": 430
    },
    {
      "epoch": 0.7,
      "grad_norm": 0.5145427584648132,
      "learning_rate": 6.029173419773096e-05,
      "loss": 0.763,
      "step": 440
    },
    {
      "epoch": 0.72,
      "grad_norm": 0.5100213885307312,
      "learning_rate": 5.7050243111831445e-05,
      "loss": 0.7493,
      "step": 450
    },
    {
      "epoch": 0.74,
      "grad_norm": 0.6406119465827942,
      "learning_rate": 5.380875202593193e-05,
      "loss": 0.7856,
      "step": 460
    },
    {
      "epoch": 0.75,
      "grad_norm": 0.5142606496810913,
      "learning_rate": 5.056726094003241e-05,
      "loss": 0.7439,
      "step": 470
    },
    {
      "epoch": 0.77,
      "grad_norm": 0.6167362332344055,
      "learning_rate": 4.73257698541329e-05,
      "loss": 0.7615,
      "step": 480
    },
    {
      "epoch": 0.78,
      "grad_norm": 0.5693132281303406,
      "learning_rate": 4.408427876823339e-05,
      "loss": 0.7535,
      "step": 490
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.5026053190231323,
      "learning_rate": 4.0842787682333875e-05,
      "loss": 0.7397,
      "step": 500
    },
    {
      "epoch": 0.82,
      "grad_norm": 0.5282934904098511,
      "learning_rate": 3.760129659643436e-05,
      "loss": 0.7519,
      "step": 510
    },
    {
      "epoch": 0.83,
      "grad_norm": 0.5583475828170776,
      "learning_rate": 3.435980551053485e-05,
      "loss": 0.736,
      "step": 520
    },
    {
      "epoch": 0.85,
      "grad_norm": 0.5823280215263367,
      "learning_rate": 3.111831442463534e-05,
      "loss": 0.7383,
      "step": 530
    },
    {
      "epoch": 0.86,
      "grad_norm": 0.4686075747013092,
      "learning_rate": 2.7876823338735818e-05,
      "loss": 0.7272,
      "step": 540
    },
    {
      "epoch": 0.88,
      "grad_norm": 0.5750771164894104,
      "learning_rate": 2.4635332252836304e-05,
      "loss": 0.7483,
      "step": 550
    },
    {
      "epoch": 0.9,
      "grad_norm": 0.4452055096626282,
      "learning_rate": 2.1393841166936794e-05,
      "loss": 0.7412,
      "step": 560
    },
    {
      "epoch": 0.91,
      "grad_norm": 0.48527467250823975,
      "learning_rate": 1.8152350081037278e-05,
      "loss": 0.7287,
      "step": 570
    },
    {
      "epoch": 0.93,
      "grad_norm": 0.6023633480072021,
      "learning_rate": 1.4910858995137764e-05,
      "loss": 0.7856,
      "step": 580
    },
    {
      "epoch": 0.94,
      "grad_norm": 0.40449661016464233,
      "learning_rate": 1.1669367909238251e-05,
      "loss": 0.7191,
      "step": 590
    },
    {
      "epoch": 0.96,
      "grad_norm": 0.5225895643234253,
      "learning_rate": 8.427876823338736e-06,
      "loss": 0.7132,
      "step": 600
    },
    {
      "epoch": 0.98,
      "grad_norm": 0.4504361152648926,
      "learning_rate": 5.186385737439222e-06,
      "loss": 0.7344,
      "step": 610
    },
    {
      "epoch": 0.99,
      "grad_norm": 0.48884958028793335,
      "learning_rate": 1.9448946515397086e-06,
      "loss": 0.7086,
      "step": 620
    },
    {
      "epoch": 1.0,
      "eval_loss": 0.7055141925811768,
      "eval_runtime": 58.7356,
      "eval_samples_per_second": 1.703,
      "eval_steps_per_second": 0.426,
      "step": 625
    }
  ],
  "logging_steps": 10,
  "max_steps": 625,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "total_flos": 4350923021058048.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}