{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 1172,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0,
      "eval_loss": 2.1641297340393066,
      "eval_runtime": 2.1264,
      "eval_samples_per_second": 9.405,
      "eval_steps_per_second": 0.941,
      "step": 0
    },
    {
      "epoch": 0.0008535154166222128,
      "grad_norm": 4.247310161590576,
      "learning_rate": 5.681818181818182e-08,
      "loss": 1.0032,
      "step": 1
    },
    {
      "epoch": 0.008535154166222128,
      "grad_norm": 4.285112380981445,
      "learning_rate": 5.681818181818182e-07,
      "loss": 0.9823,
      "step": 10
    },
    {
      "epoch": 0.017070308332444255,
      "grad_norm": 3.0184619426727295,
      "learning_rate": 1.1363636363636364e-06,
      "loss": 0.9382,
      "step": 20
    },
    {
      "epoch": 0.02560546249866638,
      "grad_norm": 2.0186476707458496,
      "learning_rate": 1.7045454545454546e-06,
      "loss": 0.8583,
      "step": 30
    },
    {
      "epoch": 0.03414061666488851,
      "grad_norm": 1.175138235092163,
      "learning_rate": 2.2727272727272728e-06,
      "loss": 0.764,
      "step": 40
    },
    {
      "epoch": 0.04267577083111064,
      "grad_norm": 0.9045423269271851,
      "learning_rate": 2.8409090909090916e-06,
      "loss": 0.714,
      "step": 50
    },
    {
      "epoch": 0.05121092499733276,
      "grad_norm": 0.8305149078369141,
      "learning_rate": 3.409090909090909e-06,
      "loss": 0.6835,
      "step": 60
    },
    {
      "epoch": 0.05974607916355489,
      "grad_norm": 0.7908885478973389,
      "learning_rate": 3.9772727272727275e-06,
      "loss": 0.6549,
      "step": 70
    },
    {
      "epoch": 0.06828123332977702,
      "grad_norm": 0.8054904341697693,
      "learning_rate": 4.5454545454545455e-06,
      "loss": 0.6368,
      "step": 80
    },
    {
      "epoch": 0.07681638749599914,
      "grad_norm": 0.7891983985900879,
      "learning_rate": 5.113636363636364e-06,
      "loss": 0.631,
      "step": 90
    },
    {
      "epoch": 0.08535154166222128,
      "grad_norm": 0.7775433659553528,
      "learning_rate": 5.681818181818183e-06,
      "loss": 0.6186,
      "step": 100
    },
    {
      "epoch": 0.0938866958284434,
      "grad_norm": 0.8340434432029724,
      "learning_rate": 6.25e-06,
      "loss": 0.6028,
      "step": 110
    },
    {
      "epoch": 0.10242184999466553,
      "grad_norm": 0.7749195098876953,
      "learning_rate": 6.818181818181818e-06,
      "loss": 0.5965,
      "step": 120
    },
    {
      "epoch": 0.11095700416088766,
      "grad_norm": 0.7949061393737793,
      "learning_rate": 7.386363636363637e-06,
      "loss": 0.5837,
      "step": 130
    },
    {
      "epoch": 0.11949215832710978,
      "grad_norm": 0.8714171051979065,
      "learning_rate": 7.954545454545455e-06,
      "loss": 0.5865,
      "step": 140
    },
    {
      "epoch": 0.1280273124933319,
      "grad_norm": 0.7890238761901855,
      "learning_rate": 8.522727272727273e-06,
      "loss": 0.5712,
      "step": 150
    },
    {
      "epoch": 0.13656246665955404,
      "grad_norm": 0.8195155262947083,
      "learning_rate": 9.090909090909091e-06,
      "loss": 0.5774,
      "step": 160
    },
    {
      "epoch": 0.14509762082577615,
      "grad_norm": 0.8219712972640991,
      "learning_rate": 9.65909090909091e-06,
      "loss": 0.5767,
      "step": 170
    },
    {
      "epoch": 0.1536327749919983,
      "grad_norm": 0.8124852776527405,
      "learning_rate": 1.0227272727272729e-05,
      "loss": 0.5643,
      "step": 180
    },
    {
      "epoch": 0.16216792915822043,
      "grad_norm": 0.8383850455284119,
      "learning_rate": 1.0795454545454547e-05,
      "loss": 0.5734,
      "step": 190
    },
    {
      "epoch": 0.17070308332444256,
      "grad_norm": 0.7605228424072266,
      "learning_rate": 1.1363636363636366e-05,
      "loss": 0.5661,
      "step": 200
    },
    {
      "epoch": 0.17923823749066467,
      "grad_norm": 0.8585752844810486,
      "learning_rate": 1.1931818181818183e-05,
      "loss": 0.5534,
      "step": 210
    },
    {
      "epoch": 0.1877733916568868,
      "grad_norm": 0.8643070459365845,
      "learning_rate": 1.25e-05,
      "loss": 0.5549,
      "step": 220
    },
    {
      "epoch": 0.19630854582310894,
      "grad_norm": 0.9826616644859314,
      "learning_rate": 1.306818181818182e-05,
      "loss": 0.5531,
      "step": 230
    },
    {
      "epoch": 0.20484369998933105,
      "grad_norm": 0.8799106478691101,
      "learning_rate": 1.3636363636363637e-05,
      "loss": 0.5371,
      "step": 240
    },
    {
      "epoch": 0.2133788541555532,
      "grad_norm": 0.757698118686676,
      "learning_rate": 1.4204545454545455e-05,
      "loss": 0.5459,
      "step": 250
    },
    {
      "epoch": 0.22191400832177532,
      "grad_norm": 0.8704412579536438,
      "learning_rate": 1.4772727272727274e-05,
      "loss": 0.5369,
      "step": 260
    },
    {
      "epoch": 0.23044916248799743,
      "grad_norm": 0.7941352725028992,
      "learning_rate": 1.5340909090909094e-05,
      "loss": 0.5359,
      "step": 270
    },
    {
      "epoch": 0.23898431665421957,
      "grad_norm": 0.7990615367889404,
      "learning_rate": 1.590909090909091e-05,
      "loss": 0.5285,
      "step": 280
    },
    {
      "epoch": 0.2475194708204417,
      "grad_norm": 0.7938647270202637,
      "learning_rate": 1.647727272727273e-05,
      "loss": 0.5391,
      "step": 290
    },
    {
      "epoch": 0.2560546249866638,
      "grad_norm": 0.7677845358848572,
      "learning_rate": 1.7045454545454546e-05,
      "loss": 0.5195,
      "step": 300
    },
    {
      "epoch": 0.26458977915288595,
      "grad_norm": 0.7977807521820068,
      "learning_rate": 1.7613636363636366e-05,
      "loss": 0.5246,
      "step": 310
    },
    {
      "epoch": 0.2731249333191081,
      "grad_norm": 0.819622814655304,
      "learning_rate": 1.8181818181818182e-05,
      "loss": 0.5176,
      "step": 320
    },
    {
      "epoch": 0.2816600874853302,
      "grad_norm": 0.8428648114204407,
      "learning_rate": 1.8750000000000002e-05,
      "loss": 0.513,
      "step": 330
    },
    {
      "epoch": 0.2901952416515523,
      "grad_norm": 0.7542017102241516,
      "learning_rate": 1.931818181818182e-05,
      "loss": 0.5196,
      "step": 340
    },
    {
      "epoch": 0.29873039581777444,
      "grad_norm": 0.8601102232933044,
      "learning_rate": 1.9886363636363638e-05,
      "loss": 0.5206,
      "step": 350
    },
    {
      "epoch": 0.3072655499839966,
      "grad_norm": 0.7532691955566406,
      "learning_rate": 1.9999683918961086e-05,
      "loss": 0.5162,
      "step": 360
    },
    {
      "epoch": 0.3158007041502187,
      "grad_norm": 0.7489638924598694,
      "learning_rate": 1.999839987398595e-05,
      "loss": 0.5122,
      "step": 370
    },
    {
      "epoch": 0.32433585831644085,
      "grad_norm": 0.7671491503715515,
      "learning_rate": 1.9996128236743682e-05,
      "loss": 0.5161,
      "step": 380
    },
    {
      "epoch": 0.332871012482663,
      "grad_norm": 0.7274801731109619,
      "learning_rate": 1.9992869231615323e-05,
      "loss": 0.5167,
      "step": 390
    },
    {
      "epoch": 0.3414061666488851,
      "grad_norm": 0.7289360165596008,
      "learning_rate": 1.9988623180509206e-05,
      "loss": 0.5127,
      "step": 400
    },
    {
      "epoch": 0.3499413208151072,
      "grad_norm": 0.7416843175888062,
      "learning_rate": 1.9983390502829168e-05,
      "loss": 0.5208,
      "step": 410
    },
    {
      "epoch": 0.35847647498132934,
      "grad_norm": 0.7530054450035095,
      "learning_rate": 1.997717171543311e-05,
      "loss": 0.5011,
      "step": 420
    },
    {
      "epoch": 0.3670116291475515,
      "grad_norm": 0.7372049689292908,
      "learning_rate": 1.9969967432581962e-05,
      "loss": 0.5091,
      "step": 430
    },
    {
      "epoch": 0.3755467833137736,
      "grad_norm": 0.8357003927230835,
      "learning_rate": 1.996177836587899e-05,
      "loss": 0.5114,
      "step": 440
    },
    {
      "epoch": 0.38408193747999575,
      "grad_norm": 0.8466181755065918,
      "learning_rate": 1.9952605324199516e-05,
      "loss": 0.5009,
      "step": 450
    },
    {
      "epoch": 0.3926170916462179,
      "grad_norm": 0.7260451316833496,
      "learning_rate": 1.9942449213611028e-05,
      "loss": 0.5087,
      "step": 460
    },
    {
      "epoch": 0.40115224581243997,
      "grad_norm": 0.6815033555030823,
      "learning_rate": 1.9931311037283673e-05,
      "loss": 0.5033,
      "step": 470
    },
    {
      "epoch": 0.4096873999786621,
      "grad_norm": 0.706380307674408,
      "learning_rate": 1.9919191895391176e-05,
      "loss": 0.4974,
      "step": 480
    },
    {
      "epoch": 0.41822255414488424,
      "grad_norm": 0.736223042011261,
      "learning_rate": 1.9906092985002163e-05,
      "loss": 0.4981,
      "step": 490
    },
    {
      "epoch": 0.4267577083111064,
      "grad_norm": 0.9467535614967346,
      "learning_rate": 1.9892015599961927e-05,
      "loss": 0.5017,
      "step": 500
    },
    {
      "epoch": 0.4352928624773285,
      "grad_norm": 0.7372705936431885,
      "learning_rate": 1.9876961130764624e-05,
      "loss": 0.5047,
      "step": 510
    },
    {
      "epoch": 0.44382801664355065,
      "grad_norm": 0.7471463680267334,
      "learning_rate": 1.9860931064415934e-05,
      "loss": 0.5009,
      "step": 520
    },
    {
      "epoch": 0.45236317080977273,
      "grad_norm": 0.6775240898132324,
      "learning_rate": 1.9843926984286165e-05,
      "loss": 0.5045,
      "step": 530
    },
    {
      "epoch": 0.46089832497599487,
      "grad_norm": 0.6793776750564575,
      "learning_rate": 1.9825950569953884e-05,
      "loss": 0.4978,
      "step": 540
    },
    {
      "epoch": 0.469433479142217,
      "grad_norm": 0.6851901412010193,
      "learning_rate": 1.980700359703999e-05,
      "loss": 0.494,
      "step": 550
    },
    {
      "epoch": 0.47796863330843914,
      "grad_norm": 0.6855452060699463,
      "learning_rate": 1.9787087937032333e-05,
      "loss": 0.4952,
      "step": 560
    },
    {
      "epoch": 0.4865037874746613,
      "grad_norm": 0.707453727722168,
      "learning_rate": 1.976620555710087e-05,
      "loss": 0.4949,
      "step": 570
    },
    {
      "epoch": 0.4950389416408834,
      "grad_norm": 0.6674401760101318,
      "learning_rate": 1.9744358519903343e-05,
      "loss": 0.4863,
      "step": 580
    },
    {
      "epoch": 0.5035740958071055,
      "grad_norm": 0.7680632472038269,
      "learning_rate": 1.9721548983381554e-05,
      "loss": 0.4882,
      "step": 590
    },
    {
      "epoch": 0.5121092499733276,
      "grad_norm": 0.6789171099662781,
      "learning_rate": 1.9697779200548202e-05,
      "loss": 0.4848,
      "step": 600
    },
    {
      "epoch": 0.5206444041395498,
      "grad_norm": 0.6551903486251831,
      "learning_rate": 1.9673051519264342e-05,
      "loss": 0.4951,
      "step": 610
    },
    {
      "epoch": 0.5291795583057719,
      "grad_norm": 0.7067411541938782,
      "learning_rate": 1.964736838200749e-05,
      "loss": 0.493,
      "step": 620
    },
    {
      "epoch": 0.537714712471994,
      "grad_norm": 0.6695910692214966,
      "learning_rate": 1.9620732325630342e-05,
      "loss": 0.4938,
      "step": 630
    },
    {
      "epoch": 0.5462498666382162,
      "grad_norm": 0.6625940203666687,
      "learning_rate": 1.9593145981110223e-05,
      "loss": 0.4873,
      "step": 640
    },
    {
      "epoch": 0.5547850208044383,
      "grad_norm": 0.7263137102127075,
      "learning_rate": 1.9564612073289192e-05,
      "loss": 0.4964,
      "step": 650
    },
    {
      "epoch": 0.5633201749706604,
      "grad_norm": 0.7034249901771545,
      "learning_rate": 1.9535133420604905e-05,
      "loss": 0.4952,
      "step": 660
    },
    {
      "epoch": 0.5718553291368825,
      "grad_norm": 0.633065402507782,
      "learning_rate": 1.9504712934812228e-05,
      "loss": 0.4982,
      "step": 670
    },
    {
      "epoch": 0.5803904833031046,
      "grad_norm": 0.7597299814224243,
      "learning_rate": 1.9473353620695614e-05,
      "loss": 0.4839,
      "step": 680
    },
    {
      "epoch": 0.5889256374693268,
      "grad_norm": 0.6461811065673828,
      "learning_rate": 1.9441058575772317e-05,
      "loss": 0.4853,
      "step": 690
    },
    {
      "epoch": 0.5974607916355489,
      "grad_norm": 0.6678339242935181,
      "learning_rate": 1.940783098998643e-05,
      "loss": 0.4814,
      "step": 700
    },
    {
      "epoch": 0.6059959458017711,
      "grad_norm": 0.6317865252494812,
      "learning_rate": 1.9373674145393804e-05,
      "loss": 0.4896,
      "step": 710
    },
    {
      "epoch": 0.6145310999679932,
      "grad_norm": 0.6536778211593628,
      "learning_rate": 1.9338591415837856e-05,
      "loss": 0.4795,
      "step": 720
    },
    {
      "epoch": 0.6230662541342153,
      "grad_norm": 0.653390645980835,
      "learning_rate": 1.9302586266616318e-05,
      "loss": 0.4862,
      "step": 730
    },
    {
      "epoch": 0.6316014083004374,
      "grad_norm": 0.6988620758056641,
      "learning_rate": 1.9265662254138958e-05,
      "loss": 0.4913,
      "step": 740
    },
    {
      "epoch": 0.6401365624666595,
      "grad_norm": 0.6603461503982544,
      "learning_rate": 1.922782302557628e-05,
      "loss": 0.4813,
      "step": 750
    },
    {
      "epoch": 0.6486717166328817,
      "grad_norm": 0.7291558980941772,
      "learning_rate": 1.918907231849931e-05,
      "loss": 0.4843,
      "step": 760
    },
    {
      "epoch": 0.6572068707991038,
      "grad_norm": 0.678535521030426,
      "learning_rate": 1.914941396051036e-05,
      "loss": 0.4819,
      "step": 770
    },
    {
      "epoch": 0.665742024965326,
      "grad_norm": 0.7419494390487671,
      "learning_rate": 1.910885186886502e-05,
      "loss": 0.4759,
      "step": 780
    },
    {
      "epoch": 0.674277179131548,
      "grad_norm": 0.7254301309585571,
      "learning_rate": 1.9067390050085183e-05,
      "loss": 0.4754,
      "step": 790
    },
    {
      "epoch": 0.6828123332977702,
      "grad_norm": 0.6892661452293396,
      "learning_rate": 1.902503259956333e-05,
      "loss": 0.4759,
      "step": 800
    },
    {
      "epoch": 0.6913474874639923,
      "grad_norm": 0.6353612542152405,
      "learning_rate": 1.8981783701157985e-05,
      "loss": 0.4787,
      "step": 810
    },
    {
      "epoch": 0.6998826416302144,
      "grad_norm": 0.578580915927887,
      "learning_rate": 1.8937647626780473e-05,
      "loss": 0.4748,
      "step": 820
    },
    {
      "epoch": 0.7084177957964366,
      "grad_norm": 0.6349884867668152,
      "learning_rate": 1.889262873597295e-05,
      "loss": 0.4817,
      "step": 830
    },
    {
      "epoch": 0.7169529499626587,
      "grad_norm": 0.6372865438461304,
      "learning_rate": 1.8846731475477796e-05,
      "loss": 0.4811,
      "step": 840
    },
    {
      "epoch": 0.7254881041288809,
      "grad_norm": 0.70332932472229,
      "learning_rate": 1.8799960378798382e-05,
      "loss": 0.4854,
      "step": 850
    },
    {
      "epoch": 0.734023258295103,
      "grad_norm": 0.6345937252044678,
      "learning_rate": 1.8752320065751276e-05,
      "loss": 0.4804,
      "step": 860
    },
    {
      "epoch": 0.742558412461325,
      "grad_norm": 0.7001857757568359,
      "learning_rate": 1.8703815242009927e-05,
      "loss": 0.4823,
      "step": 870
    },
    {
      "epoch": 0.7510935666275472,
      "grad_norm": 0.6997294425964355,
      "learning_rate": 1.8654450698639845e-05,
      "loss": 0.4772,
      "step": 880
    },
    {
      "epoch": 0.7596287207937693,
      "grad_norm": 0.6218723058700562,
      "learning_rate": 1.860423131162538e-05,
      "loss": 0.4741,
      "step": 890
    },
    {
      "epoch": 0.7681638749599915,
      "grad_norm": 0.6111359000205994,
      "learning_rate": 1.8553162041388096e-05,
      "loss": 0.4724,
      "step": 900
    },
    {
      "epoch": 0.7766990291262136,
      "grad_norm": 0.6032017469406128,
      "learning_rate": 1.8501247932296785e-05,
      "loss": 0.4769,
      "step": 910
    },
    {
      "epoch": 0.7852341832924358,
      "grad_norm": 0.6396908164024353,
      "learning_rate": 1.8448494112169234e-05,
      "loss": 0.4747,
      "step": 920
    },
    {
      "epoch": 0.7937693374586579,
      "grad_norm": 0.690779983997345,
      "learning_rate": 1.8394905791765714e-05,
      "loss": 0.4719,
      "step": 930
    },
    {
      "epoch": 0.8023044916248799,
      "grad_norm": 0.6640828251838684,
      "learning_rate": 1.8340488264274285e-05,
      "loss": 0.477,
      "step": 940
    },
    {
      "epoch": 0.8108396457911021,
      "grad_norm": 0.6492927074432373,
      "learning_rate": 1.8285246904787968e-05,
      "loss": 0.4626,
      "step": 950
    },
    {
      "epoch": 0.8193747999573242,
      "grad_norm": 0.6751061677932739,
      "learning_rate": 1.8229187169773805e-05,
      "loss": 0.4698,
      "step": 960
    },
    {
      "epoch": 0.8279099541235464,
      "grad_norm": 0.6475481390953064,
      "learning_rate": 1.8172314596533914e-05,
      "loss": 0.4698,
      "step": 970
    },
    {
      "epoch": 0.8364451082897685,
      "grad_norm": 0.6659871935844421,
      "learning_rate": 1.8114634802658542e-05,
      "loss": 0.4778,
      "step": 980
    },
    {
      "epoch": 0.8449802624559906,
      "grad_norm": 0.6479447484016418,
      "learning_rate": 1.8056153485471167e-05,
      "loss": 0.4703,
      "step": 990
    },
    {
      "epoch": 0.8535154166222128,
      "grad_norm": 0.7254726886749268,
      "learning_rate": 1.7996876421465764e-05,
      "loss": 0.4767,
      "step": 1000
    },
    {
      "epoch": 0.8620505707884348,
      "grad_norm": 0.6294846534729004,
      "learning_rate": 1.7936809465736223e-05,
      "loss": 0.4615,
      "step": 1010
    },
    {
      "epoch": 0.870585724954657,
      "grad_norm": 0.6530686020851135,
      "learning_rate": 1.7875958551398023e-05,
      "loss": 0.4642,
      "step": 1020
    },
    {
      "epoch": 0.8791208791208791,
      "grad_norm": 0.713152289390564,
      "learning_rate": 1.781432968900217e-05,
      "loss": 0.4692,
      "step": 1030
    },
    {
      "epoch": 0.8876560332871013,
      "grad_norm": 0.7651381492614746,
      "learning_rate": 1.775192896594151e-05,
      "loss": 0.4646,
      "step": 1040
    },
    {
      "epoch": 0.8961911874533234,
      "grad_norm": 0.5913043022155762,
      "learning_rate": 1.7688762545849466e-05,
      "loss": 0.4688,
      "step": 1050
    },
    {
      "epoch": 0.9047263416195455,
      "grad_norm": 0.6034150719642639,
      "learning_rate": 1.7624836667991195e-05,
      "loss": 0.4574,
      "step": 1060
    },
    {
      "epoch": 0.9132614957857677,
      "grad_norm": 0.6374826431274414,
      "learning_rate": 1.7560157646647335e-05,
      "loss": 0.4651,
      "step": 1070
    },
    {
      "epoch": 0.9217966499519897,
      "grad_norm": 0.6261687278747559,
      "learning_rate": 1.749473187049028e-05,
      "loss": 0.4608,
      "step": 1080
    },
    {
      "epoch": 0.9303318041182119,
      "grad_norm": 0.6067594885826111,
      "learning_rate": 1.742856580195316e-05,
      "loss": 0.4592,
      "step": 1090
    },
    {
      "epoch": 0.938866958284434,
      "grad_norm": 0.6240684986114502,
      "learning_rate": 1.7361665976591513e-05,
      "loss": 0.4663,
      "step": 1100
    },
    {
      "epoch": 0.9474021124506561,
      "grad_norm": 0.6192986965179443,
      "learning_rate": 1.7294039002437724e-05,
      "loss": 0.476,
      "step": 1110
    },
    {
      "epoch": 0.9559372666168783,
      "grad_norm": 0.6121706962585449,
      "learning_rate": 1.7225691559348333e-05,
      "loss": 0.4531,
      "step": 1120
    },
    {
      "epoch": 0.9644724207831004,
      "grad_norm": 0.5956526398658752,
      "learning_rate": 1.715663039834421e-05,
      "loss": 0.4643,
      "step": 1130
    },
    {
      "epoch": 0.9730075749493226,
      "grad_norm": 0.5699427127838135,
      "learning_rate": 1.7086862340943745e-05,
      "loss": 0.4601,
      "step": 1140
    },
    {
      "epoch": 0.9815427291155446,
      "grad_norm": 0.6840018630027771,
      "learning_rate": 1.701639427848903e-05,
      "loss": 0.4555,
      "step": 1150
    },
    {
      "epoch": 0.9900778832817668,
      "grad_norm": 0.6009624004364014,
      "learning_rate": 1.6945233171465193e-05,
      "loss": 0.4632,
      "step": 1160
    },
    {
      "epoch": 0.9986130374479889,
      "grad_norm": 0.5962963104248047,
      "learning_rate": 1.6873386048812854e-05,
      "loss": 0.4702,
      "step": 1170
    }
  ],
  "logging_steps": 10,
  "max_steps": 3513,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500.0,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 3.729895927368909e+17,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}