Invalid JSON: Unexpected token 'I', ..."ad_norm": Infinity, "... is not valid JSON
{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.6834102169827438,
  "eval_steps": 500,
  "global_step": 1000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.006834102169827439,
      "grad_norm": 1.7022907733917236,
      "learning_rate": 1.2121212121212122e-05,
      "loss": 3.5654,
      "step": 10
    },
    {
      "epoch": 0.013668204339654879,
      "grad_norm": 0.8007484078407288,
      "learning_rate": 2.7272727272727273e-05,
      "loss": 3.0866,
      "step": 20
    },
    {
      "epoch": 0.020502306509482315,
      "grad_norm": 2.242196559906006,
      "learning_rate": 4.242424242424243e-05,
      "loss": 2.8859,
      "step": 30
    },
    {
      "epoch": 0.027336408679309757,
      "grad_norm": 2.408808469772339,
      "learning_rate": 5.757575757575758e-05,
      "loss": 2.8438,
      "step": 40
    },
    {
      "epoch": 0.03417051084913719,
      "grad_norm": 1.8810335397720337,
      "learning_rate": 7.272727272727273e-05,
      "loss": 2.6887,
      "step": 50
    },
    {
      "epoch": 0.04100461301896463,
      "grad_norm": 1.7286051511764526,
      "learning_rate": 8.787878787878789e-05,
      "loss": 2.457,
      "step": 60
    },
    {
      "epoch": 0.04783871518879207,
      "grad_norm": 1.4562296867370605,
      "learning_rate": 0.00010303030303030303,
      "loss": 2.0197,
      "step": 70
    },
    {
      "epoch": 0.054672817358619515,
      "grad_norm": 1.280148983001709,
      "learning_rate": 0.0001181818181818182,
      "loss": 2.3125,
      "step": 80
    },
    {
      "epoch": 0.06150691952844695,
      "grad_norm": 4.578958988189697,
      "learning_rate": 0.00013333333333333334,
      "loss": 2.1067,
      "step": 90
    },
    {
      "epoch": 0.06834102169827438,
      "grad_norm": 0.989810585975647,
      "learning_rate": 0.00014848484848484849,
      "loss": 2.0309,
      "step": 100
    },
    {
      "epoch": 0.07517512386810182,
      "grad_norm": Infinity,
      "learning_rate": 0.00016363636363636366,
      "loss": 2.0687,
      "step": 110
    },
    {
      "epoch": 0.08200922603792926,
      "grad_norm": 5.370285511016846,
      "learning_rate": 0.00017727272727272728,
      "loss": 2.2035,
      "step": 120
    },
    {
      "epoch": 0.0888433282077567,
      "grad_norm": 1.0226854085922241,
      "learning_rate": 0.00019242424242424245,
      "loss": 1.7328,
      "step": 130
    },
    {
      "epoch": 0.09567743037758414,
      "grad_norm": 8.442233085632324,
      "learning_rate": 0.00019976525821596247,
      "loss": 1.7355,
      "step": 140
    },
    {
      "epoch": 0.10251153254741159,
      "grad_norm": 3.448438882827759,
      "learning_rate": 0.00019929577464788734,
      "loss": 2.0396,
      "step": 150
    },
    {
      "epoch": 0.10934563471723903,
      "grad_norm": 3.707597255706787,
      "learning_rate": 0.0001988262910798122,
      "loss": 1.8603,
      "step": 160
    },
    {
      "epoch": 0.11617973688706647,
      "grad_norm": 2.0195584297180176,
      "learning_rate": 0.00019835680751173712,
      "loss": 2.1021,
      "step": 170
    },
    {
      "epoch": 0.1230138390568939,
      "grad_norm": 2.4128806591033936,
      "learning_rate": 0.00019788732394366199,
      "loss": 2.0582,
      "step": 180
    },
    {
      "epoch": 0.12984794122672133,
      "grad_norm": 2.707796573638916,
      "learning_rate": 0.00019741784037558685,
      "loss": 2.1312,
      "step": 190
    },
    {
      "epoch": 0.13668204339654877,
      "grad_norm": 2.6425483226776123,
      "learning_rate": 0.00019694835680751174,
      "loss": 2.086,
      "step": 200
    },
    {
      "epoch": 0.1435161455663762,
      "grad_norm": 2.4248363971710205,
      "learning_rate": 0.00019647887323943664,
      "loss": 2.0576,
      "step": 210
    },
    {
      "epoch": 0.15035024773620365,
      "grad_norm": 1.1113543510437012,
      "learning_rate": 0.0001960093896713615,
      "loss": 1.7884,
      "step": 220
    },
    {
      "epoch": 0.15718434990603108,
      "grad_norm": 3.8414552211761475,
      "learning_rate": 0.0001955399061032864,
      "loss": 2.3521,
      "step": 230
    },
    {
      "epoch": 0.16401845207585852,
      "grad_norm": 3.393040180206299,
      "learning_rate": 0.0001950704225352113,
      "loss": 2.0928,
      "step": 240
    },
    {
      "epoch": 0.17085255424568596,
      "grad_norm": 3.5314691066741943,
      "learning_rate": 0.00019460093896713615,
      "loss": 1.9308,
      "step": 250
    },
    {
      "epoch": 0.1776866564155134,
      "grad_norm": 5.881885051727295,
      "learning_rate": 0.00019417840375586857,
      "loss": 1.9244,
      "step": 260
    },
    {
      "epoch": 0.18452075858534084,
      "grad_norm": 1.4201581478118896,
      "learning_rate": 0.00019370892018779343,
      "loss": 1.872,
      "step": 270
    },
    {
      "epoch": 0.19135486075516828,
      "grad_norm": 2.751593828201294,
      "learning_rate": 0.00019323943661971832,
      "loss": 1.9267,
      "step": 280
    },
    {
      "epoch": 0.19818896292499572,
      "grad_norm": 0.9784806966781616,
      "learning_rate": 0.00019276995305164322,
      "loss": 1.6952,
      "step": 290
    },
    {
      "epoch": 0.20502306509482318,
      "grad_norm": 2.9738731384277344,
      "learning_rate": 0.00019230046948356808,
      "loss": 1.8987,
      "step": 300
    },
    {
      "epoch": 0.21185716726465062,
      "grad_norm": 2.224365472793579,
      "learning_rate": 0.00019183098591549297,
      "loss": 2.0835,
      "step": 310
    },
    {
      "epoch": 0.21869126943447806,
      "grad_norm": 1.2849421501159668,
      "learning_rate": 0.00019136150234741784,
      "loss": 1.8553,
      "step": 320
    },
    {
      "epoch": 0.2255253716043055,
      "grad_norm": 3.2781999111175537,
      "learning_rate": 0.00019089201877934273,
      "loss": 1.8121,
      "step": 330
    },
    {
      "epoch": 0.23235947377413294,
      "grad_norm": 0.7185825109481812,
      "learning_rate": 0.0001904225352112676,
      "loss": 1.9595,
      "step": 340
    },
    {
      "epoch": 0.23919357594396037,
      "grad_norm": 4.366527080535889,
      "learning_rate": 0.0001899530516431925,
      "loss": 1.812,
      "step": 350
    },
    {
      "epoch": 0.2460276781137878,
      "grad_norm": 4.314450740814209,
      "learning_rate": 0.00018948356807511738,
      "loss": 1.8416,
      "step": 360
    },
    {
      "epoch": 0.2528617802836152,
      "grad_norm": 2.956653356552124,
      "learning_rate": 0.00018901408450704225,
      "loss": 1.9802,
      "step": 370
    },
    {
      "epoch": 0.25969588245344266,
      "grad_norm": 1.395255208015442,
      "learning_rate": 0.00018854460093896714,
      "loss": 2.1336,
      "step": 380
    },
    {
      "epoch": 0.2665299846232701,
      "grad_norm": 2.8896381855010986,
      "learning_rate": 0.00018807511737089204,
      "loss": 1.8297,
      "step": 390
    },
    {
      "epoch": 0.27336408679309754,
      "grad_norm": 2.0925827026367188,
      "learning_rate": 0.0001876056338028169,
      "loss": 1.8318,
      "step": 400
    },
    {
      "epoch": 0.280198188962925,
      "grad_norm": 1.436551809310913,
      "learning_rate": 0.0001871361502347418,
      "loss": 1.8223,
      "step": 410
    },
    {
      "epoch": 0.2870322911327524,
      "grad_norm": 0.9357802867889404,
      "learning_rate": 0.0001866666666666667,
      "loss": 1.8287,
      "step": 420
    },
    {
      "epoch": 0.29386639330257985,
      "grad_norm": 3.6884915828704834,
      "learning_rate": 0.00018619718309859155,
      "loss": 1.8025,
      "step": 430
    },
    {
      "epoch": 0.3007004954724073,
      "grad_norm": 2.153522491455078,
      "learning_rate": 0.00018572769953051642,
      "loss": 1.5475,
      "step": 440
    },
    {
      "epoch": 0.30753459764223473,
      "grad_norm": 3.496854305267334,
      "learning_rate": 0.00018525821596244134,
      "loss": 1.782,
      "step": 450
    },
    {
      "epoch": 0.31436869981206217,
      "grad_norm": 2.166901111602783,
      "learning_rate": 0.0001847887323943662,
      "loss": 1.5982,
      "step": 460
    },
    {
      "epoch": 0.3212028019818896,
      "grad_norm": 2.4054338932037354,
      "learning_rate": 0.00018431924882629107,
      "loss": 2.0201,
      "step": 470
    },
    {
      "epoch": 0.32803690415171705,
      "grad_norm": 1.3764829635620117,
      "learning_rate": 0.000183849765258216,
      "loss": 1.5294,
      "step": 480
    },
    {
      "epoch": 0.3348710063215445,
      "grad_norm": 5.117223262786865,
      "learning_rate": 0.00018338028169014085,
      "loss": 1.812,
      "step": 490
    },
    {
      "epoch": 0.3417051084913719,
      "grad_norm": 1.7153640985488892,
      "learning_rate": 0.00018291079812206572,
      "loss": 1.7249,
      "step": 500
    },
    {
      "epoch": 0.34853921066119936,
      "grad_norm": 1.0843334197998047,
      "learning_rate": 0.00018244131455399064,
      "loss": 1.6918,
      "step": 510
    },
    {
      "epoch": 0.3553733128310268,
      "grad_norm": 3.17716383934021,
      "learning_rate": 0.0001819718309859155,
      "loss": 1.6265,
      "step": 520
    },
    {
      "epoch": 0.36220741500085424,
      "grad_norm": 2.1360270977020264,
      "learning_rate": 0.00018150234741784037,
      "loss": 1.9059,
      "step": 530
    },
    {
      "epoch": 0.3690415171706817,
      "grad_norm": 2.659409284591675,
      "learning_rate": 0.0001810328638497653,
      "loss": 1.8237,
      "step": 540
    },
    {
      "epoch": 0.3758756193405091,
      "grad_norm": 1.9574990272521973,
      "learning_rate": 0.00018056338028169016,
      "loss": 2.0552,
      "step": 550
    },
    {
      "epoch": 0.38270972151033655,
      "grad_norm": 3.7215640544891357,
      "learning_rate": 0.00018009389671361502,
      "loss": 1.7518,
      "step": 560
    },
    {
      "epoch": 0.389543823680164,
      "grad_norm": 5.685362339019775,
      "learning_rate": 0.00017962441314553991,
      "loss": 1.9423,
      "step": 570
    },
    {
      "epoch": 0.39637792584999143,
      "grad_norm": 5.373042106628418,
      "learning_rate": 0.0001791549295774648,
      "loss": 1.9343,
      "step": 580
    },
    {
      "epoch": 0.40321202801981887,
      "grad_norm": 3.321650981903076,
      "learning_rate": 0.00017868544600938967,
      "loss": 1.718,
      "step": 590
    },
    {
      "epoch": 0.41004613018964636,
      "grad_norm": 1.3837800025939941,
      "learning_rate": 0.00017821596244131457,
      "loss": 1.7754,
      "step": 600
    },
    {
      "epoch": 0.4168802323594738,
      "grad_norm": 1.1874879598617554,
      "learning_rate": 0.00017774647887323946,
      "loss": 1.6449,
      "step": 610
    },
    {
      "epoch": 0.42371433452930124,
      "grad_norm": 2.3453457355499268,
      "learning_rate": 0.00017727699530516432,
      "loss": 1.9001,
      "step": 620
    },
    {
      "epoch": 0.4305484366991287,
      "grad_norm": 1.9375288486480713,
      "learning_rate": 0.00017680751173708922,
      "loss": 1.6097,
      "step": 630
    },
    {
      "epoch": 0.4373825388689561,
      "grad_norm": 3.0950772762298584,
      "learning_rate": 0.0001763380281690141,
      "loss": 1.9017,
      "step": 640
    },
    {
      "epoch": 0.44421664103878356,
      "grad_norm": 3.009223699569702,
      "learning_rate": 0.00017586854460093898,
      "loss": 2.1678,
      "step": 650
    },
    {
      "epoch": 0.451050743208611,
      "grad_norm": 3.1125118732452393,
      "learning_rate": 0.00017539906103286384,
      "loss": 1.5252,
      "step": 660
    },
    {
      "epoch": 0.45788484537843843,
      "grad_norm": 1.9371854066848755,
      "learning_rate": 0.00017492957746478873,
      "loss": 1.642,
      "step": 670
    },
    {
      "epoch": 0.46471894754826587,
      "grad_norm": 0.8981029987335205,
      "learning_rate": 0.00017446009389671363,
      "loss": 2.1059,
      "step": 680
    },
    {
      "epoch": 0.4715530497180933,
      "grad_norm": 0.9644233584403992,
      "learning_rate": 0.0001739906103286385,
      "loss": 1.482,
      "step": 690
    },
    {
      "epoch": 0.47838715188792075,
      "grad_norm": 1.7749234437942505,
      "learning_rate": 0.00017352112676056338,
      "loss": 1.6032,
      "step": 700
    },
    {
      "epoch": 0.4852212540577482,
      "grad_norm": 1.8107513189315796,
      "learning_rate": 0.00017305164319248828,
      "loss": 1.6941,
      "step": 710
    },
    {
      "epoch": 0.4920553562275756,
      "grad_norm": 1.427687644958496,
      "learning_rate": 0.00017258215962441314,
      "loss": 1.5501,
      "step": 720
    },
    {
      "epoch": 0.49888945839740306,
      "grad_norm": 2.521240472793579,
      "learning_rate": 0.00017211267605633804,
      "loss": 1.9205,
      "step": 730
    },
    {
      "epoch": 0.5057235605672304,
      "grad_norm": 5.503659725189209,
      "learning_rate": 0.00017164319248826293,
      "loss": 1.7239,
      "step": 740
    },
    {
      "epoch": 0.5125576627370579,
      "grad_norm": 4.041492462158203,
      "learning_rate": 0.0001711737089201878,
      "loss": 1.9131,
      "step": 750
    },
    {
      "epoch": 0.5193917649068853,
      "grad_norm": 3.602377414703369,
      "learning_rate": 0.0001707042253521127,
      "loss": 1.5152,
      "step": 760
    },
    {
      "epoch": 0.5262258670767128,
      "grad_norm": 3.496152639389038,
      "learning_rate": 0.00017023474178403758,
      "loss": 1.5333,
      "step": 770
    },
    {
      "epoch": 0.5330599692465402,
      "grad_norm": 4.799586772918701,
      "learning_rate": 0.00016976525821596245,
      "loss": 1.5604,
      "step": 780
    },
    {
      "epoch": 0.5398940714163677,
      "grad_norm": 1.314289927482605,
      "learning_rate": 0.00016929577464788734,
      "loss": 1.8581,
      "step": 790
    },
    {
      "epoch": 0.5467281735861951,
      "grad_norm": 1.540637731552124,
      "learning_rate": 0.0001688262910798122,
      "loss": 1.9712,
      "step": 800
    },
    {
      "epoch": 0.5535622757560226,
      "grad_norm": 1.2992823123931885,
      "learning_rate": 0.0001683568075117371,
      "loss": 1.6065,
      "step": 810
    },
    {
      "epoch": 0.56039637792585,
      "grad_norm": 0.7714009881019592,
      "learning_rate": 0.000167887323943662,
      "loss": 1.8902,
      "step": 820
    },
    {
      "epoch": 0.5672304800956774,
      "grad_norm": 1.2016668319702148,
      "learning_rate": 0.00016741784037558685,
      "loss": 1.854,
      "step": 830
    },
    {
      "epoch": 0.5740645822655048,
      "grad_norm": 1.9129397869110107,
      "learning_rate": 0.00016694835680751175,
      "loss": 1.9661,
      "step": 840
    },
    {
      "epoch": 0.5808986844353323,
      "grad_norm": 1.5583465099334717,
      "learning_rate": 0.00016647887323943664,
      "loss": 1.4921,
      "step": 850
    },
    {
      "epoch": 0.5877327866051597,
      "grad_norm": 1.217874526977539,
      "learning_rate": 0.0001660093896713615,
      "loss": 1.6486,
      "step": 860
    },
    {
      "epoch": 0.5945668887749872,
      "grad_norm": 2.096747398376465,
      "learning_rate": 0.0001655399061032864,
      "loss": 1.632,
      "step": 870
    },
    {
      "epoch": 0.6014009909448146,
      "grad_norm": 1.9565995931625366,
      "learning_rate": 0.0001650704225352113,
      "loss": 1.6879,
      "step": 880
    },
    {
      "epoch": 0.6082350931146421,
      "grad_norm": 2.8614614009857178,
      "learning_rate": 0.00016460093896713616,
      "loss": 1.7156,
      "step": 890
    },
    {
      "epoch": 0.6150691952844695,
      "grad_norm": 1.9752906560897827,
      "learning_rate": 0.00016413145539906105,
      "loss": 1.8497,
      "step": 900
    },
    {
      "epoch": 0.621903297454297,
      "grad_norm": 1.1667951345443726,
      "learning_rate": 0.00016366197183098591,
      "loss": 1.9354,
      "step": 910
    },
    {
      "epoch": 0.6287373996241243,
      "grad_norm": 3.9172396659851074,
      "learning_rate": 0.0001631924882629108,
      "loss": 1.6389,
      "step": 920
    },
    {
      "epoch": 0.6355715017939518,
      "grad_norm": 2.9594082832336426,
      "learning_rate": 0.00016272300469483567,
      "loss": 2.0553,
      "step": 930
    },
    {
      "epoch": 0.6424056039637792,
      "grad_norm": 2.566627264022827,
      "learning_rate": 0.00016225352112676057,
      "loss": 1.547,
      "step": 940
    },
    {
      "epoch": 0.6492397061336067,
      "grad_norm": 2.285865306854248,
      "learning_rate": 0.00016178403755868546,
      "loss": 1.6803,
      "step": 950
    },
    {
      "epoch": 0.6560738083034341,
      "grad_norm": 2.1016080379486084,
      "learning_rate": 0.00016131455399061032,
      "loss": 1.6278,
      "step": 960
    },
    {
      "epoch": 0.6629079104732616,
      "grad_norm": 2.409546375274658,
      "learning_rate": 0.00016084507042253522,
      "loss": 1.6959,
      "step": 970
    },
    {
      "epoch": 0.669742012643089,
      "grad_norm": 3.1356201171875,
      "learning_rate": 0.0001603755868544601,
      "loss": 1.6983,
      "step": 980
    },
    {
      "epoch": 0.6765761148129165,
      "grad_norm": 2.00640869140625,
      "learning_rate": 0.00015990610328638498,
      "loss": 1.5737,
      "step": 990
    },
    {
      "epoch": 0.6834102169827438,
      "grad_norm": 2.788975477218628,
      "learning_rate": 0.00015943661971830987,
      "loss": 1.8415,
      "step": 1000
    }
  ],
  "logging_steps": 10,
  "max_steps": 4392,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 200,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.7056343653711872e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}