{ "best_global_step": 4000, "best_metric": 0.3019483689209356, "best_model_checkpoint": "./SALAMA_NEW9/checkpoint-4000", "epoch": 2.040826741515693, "eval_steps": 2000, "global_step": 4000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0051033426894615975, "grad_norm": 0.5211380124092102, "learning_rate": 1.8e-07, "loss": 0.0083, "step": 10 }, { "epoch": 0.010206685378923195, "grad_norm": 1.0282593965530396, "learning_rate": 3.8e-07, "loss": 0.0078, "step": 20 }, { "epoch": 0.015310028068384792, "grad_norm": 0.4742085933685303, "learning_rate": 5.800000000000001e-07, "loss": 0.0082, "step": 30 }, { "epoch": 0.02041337075784639, "grad_norm": 0.7376335263252258, "learning_rate": 7.8e-07, "loss": 0.0056, "step": 40 }, { "epoch": 0.025516713447307986, "grad_norm": 1.5874927043914795, "learning_rate": 9.800000000000001e-07, "loss": 0.012, "step": 50 }, { "epoch": 0.030620056136769585, "grad_norm": 0.6289522051811218, "learning_rate": 1.1800000000000001e-06, "loss": 0.0081, "step": 60 }, { "epoch": 0.035723398826231184, "grad_norm": 1.9904325008392334, "learning_rate": 1.3800000000000001e-06, "loss": 0.018, "step": 70 }, { "epoch": 0.04082674151569278, "grad_norm": 2.7978858947753906, "learning_rate": 1.5800000000000001e-06, "loss": 0.0083, "step": 80 }, { "epoch": 0.045930084205154376, "grad_norm": 1.7935121059417725, "learning_rate": 1.7800000000000001e-06, "loss": 0.0107, "step": 90 }, { "epoch": 0.05103342689461597, "grad_norm": 0.3225713074207306, "learning_rate": 1.98e-06, "loss": 0.01, "step": 100 }, { "epoch": 0.056136769584077574, "grad_norm": 0.9831269979476929, "learning_rate": 2.1800000000000003e-06, "loss": 0.0101, "step": 110 }, { "epoch": 0.06124011227353917, "grad_norm": 1.6235119104385376, "learning_rate": 2.38e-06, "loss": 0.0067, "step": 120 }, { "epoch": 0.06634345496300076, "grad_norm": 2.157205820083618, "learning_rate": 2.5800000000000003e-06, "loss": 0.008, "step": 130 }, { "epoch": 0.07144679765246237, "grad_norm": 0.38939204812049866, "learning_rate": 2.7800000000000005e-06, "loss": 0.0054, "step": 140 }, { "epoch": 0.07655014034192396, "grad_norm": 0.7885264754295349, "learning_rate": 2.9800000000000003e-06, "loss": 0.0081, "step": 150 }, { "epoch": 0.08165348303138556, "grad_norm": 1.518344759941101, "learning_rate": 3.1800000000000005e-06, "loss": 0.0088, "step": 160 }, { "epoch": 0.08675682572084716, "grad_norm": 0.5009278059005737, "learning_rate": 3.3800000000000007e-06, "loss": 0.005, "step": 170 }, { "epoch": 0.09186016841030875, "grad_norm": 2.364478588104248, "learning_rate": 3.58e-06, "loss": 0.0085, "step": 180 }, { "epoch": 0.09696351109977035, "grad_norm": 0.31605347990989685, "learning_rate": 3.7800000000000002e-06, "loss": 0.0073, "step": 190 }, { "epoch": 0.10206685378923194, "grad_norm": 1.516256332397461, "learning_rate": 3.980000000000001e-06, "loss": 0.0081, "step": 200 }, { "epoch": 0.10717019647869354, "grad_norm": 1.2425719499588013, "learning_rate": 4.18e-06, "loss": 0.0072, "step": 210 }, { "epoch": 0.11227353916815515, "grad_norm": 2.350731611251831, "learning_rate": 4.38e-06, "loss": 0.0147, "step": 220 }, { "epoch": 0.11737688185761674, "grad_norm": 0.7598681449890137, "learning_rate": 4.58e-06, "loss": 0.0117, "step": 230 }, { "epoch": 0.12248022454707834, "grad_norm": 0.8952233791351318, "learning_rate": 4.78e-06, "loss": 0.0089, "step": 240 }, { "epoch": 0.12758356723653994, "grad_norm": 0.6309432983398438, "learning_rate": 4.980000000000001e-06, "loss": 0.0047, "step": 250 }, { "epoch": 0.13268690992600152, "grad_norm": 1.7359912395477295, "learning_rate": 5.18e-06, "loss": 0.0096, "step": 260 }, { "epoch": 0.13779025261546313, "grad_norm": 2.6553421020507812, "learning_rate": 5.380000000000001e-06, "loss": 0.0078, "step": 270 }, { "epoch": 0.14289359530492474, "grad_norm": 1.1739858388900757, "learning_rate": 5.580000000000001e-06, "loss": 0.0087, "step": 280 }, { "epoch": 0.14799693799438632, "grad_norm": 0.708070695400238, "learning_rate": 5.78e-06, "loss": 0.0075, "step": 290 }, { "epoch": 0.15310028068384793, "grad_norm": 0.8523297309875488, "learning_rate": 5.98e-06, "loss": 0.0085, "step": 300 }, { "epoch": 0.1582036233733095, "grad_norm": 2.7822892665863037, "learning_rate": 6.18e-06, "loss": 0.0092, "step": 310 }, { "epoch": 0.16330696606277112, "grad_norm": 3.9056875705718994, "learning_rate": 6.380000000000001e-06, "loss": 0.0172, "step": 320 }, { "epoch": 0.1684103087522327, "grad_norm": 1.9748897552490234, "learning_rate": 6.5800000000000005e-06, "loss": 0.0086, "step": 330 }, { "epoch": 0.1735136514416943, "grad_norm": 1.313830852508545, "learning_rate": 6.780000000000001e-06, "loss": 0.0124, "step": 340 }, { "epoch": 0.17861699413115592, "grad_norm": 1.782209038734436, "learning_rate": 6.98e-06, "loss": 0.0166, "step": 350 }, { "epoch": 0.1837203368206175, "grad_norm": 2.647857427597046, "learning_rate": 7.180000000000001e-06, "loss": 0.0151, "step": 360 }, { "epoch": 0.1888236795100791, "grad_norm": 2.1850852966308594, "learning_rate": 7.3800000000000005e-06, "loss": 0.0116, "step": 370 }, { "epoch": 0.1939270221995407, "grad_norm": 2.69811749458313, "learning_rate": 7.58e-06, "loss": 0.0118, "step": 380 }, { "epoch": 0.1990303648890023, "grad_norm": 3.1227176189422607, "learning_rate": 7.78e-06, "loss": 0.0137, "step": 390 }, { "epoch": 0.20413370757846389, "grad_norm": 3.615041494369507, "learning_rate": 7.980000000000002e-06, "loss": 0.0113, "step": 400 }, { "epoch": 0.2092370502679255, "grad_norm": 2.067406177520752, "learning_rate": 8.18e-06, "loss": 0.0154, "step": 410 }, { "epoch": 0.21434039295738708, "grad_norm": 1.5332070589065552, "learning_rate": 8.380000000000001e-06, "loss": 0.0159, "step": 420 }, { "epoch": 0.2194437356468487, "grad_norm": 1.3139411211013794, "learning_rate": 8.580000000000001e-06, "loss": 0.0096, "step": 430 }, { "epoch": 0.2245470783363103, "grad_norm": 2.8063700199127197, "learning_rate": 8.78e-06, "loss": 0.0096, "step": 440 }, { "epoch": 0.22965042102577188, "grad_norm": 1.2194337844848633, "learning_rate": 8.98e-06, "loss": 0.0113, "step": 450 }, { "epoch": 0.2347537637152335, "grad_norm": 1.4826334714889526, "learning_rate": 9.180000000000002e-06, "loss": 0.01, "step": 460 }, { "epoch": 0.23985710640469507, "grad_norm": 1.4572652578353882, "learning_rate": 9.38e-06, "loss": 0.012, "step": 470 }, { "epoch": 0.24496044909415668, "grad_norm": 2.323155641555786, "learning_rate": 9.58e-06, "loss": 0.016, "step": 480 }, { "epoch": 0.25006379178361826, "grad_norm": 2.1754894256591797, "learning_rate": 9.780000000000001e-06, "loss": 0.0134, "step": 490 }, { "epoch": 0.25516713447307987, "grad_norm": 2.55068039894104, "learning_rate": 9.980000000000001e-06, "loss": 0.0133, "step": 500 }, { "epoch": 0.2602704771625415, "grad_norm": 2.78916072845459, "learning_rate": 9.983271375464685e-06, "loss": 0.0102, "step": 510 }, { "epoch": 0.26537381985200303, "grad_norm": 3.6304802894592285, "learning_rate": 9.96468401486989e-06, "loss": 0.0142, "step": 520 }, { "epoch": 0.27047716254146464, "grad_norm": 1.1248530149459839, "learning_rate": 9.946096654275093e-06, "loss": 0.0096, "step": 530 }, { "epoch": 0.27558050523092625, "grad_norm": 2.088334798812866, "learning_rate": 9.927509293680298e-06, "loss": 0.0189, "step": 540 }, { "epoch": 0.28068384792038786, "grad_norm": 2.035660982131958, "learning_rate": 9.908921933085503e-06, "loss": 0.0125, "step": 550 }, { "epoch": 0.2857871906098495, "grad_norm": 1.8379945755004883, "learning_rate": 9.890334572490708e-06, "loss": 0.01, "step": 560 }, { "epoch": 0.290890533299311, "grad_norm": 2.2616829872131348, "learning_rate": 9.871747211895911e-06, "loss": 0.0119, "step": 570 }, { "epoch": 0.29599387598877264, "grad_norm": 2.0087382793426514, "learning_rate": 9.853159851301116e-06, "loss": 0.0104, "step": 580 }, { "epoch": 0.30109721867823425, "grad_norm": 1.7350345849990845, "learning_rate": 9.83457249070632e-06, "loss": 0.0144, "step": 590 }, { "epoch": 0.30620056136769586, "grad_norm": 1.9522229433059692, "learning_rate": 9.815985130111524e-06, "loss": 0.0152, "step": 600 }, { "epoch": 0.31130390405715747, "grad_norm": 2.231642961502075, "learning_rate": 9.79739776951673e-06, "loss": 0.0138, "step": 610 }, { "epoch": 0.316407246746619, "grad_norm": 1.9675791263580322, "learning_rate": 9.778810408921934e-06, "loss": 0.0148, "step": 620 }, { "epoch": 0.32151058943608063, "grad_norm": 1.9099314212799072, "learning_rate": 9.76022304832714e-06, "loss": 0.0145, "step": 630 }, { "epoch": 0.32661393212554224, "grad_norm": 1.777403473854065, "learning_rate": 9.741635687732343e-06, "loss": 0.0164, "step": 640 }, { "epoch": 0.33171727481500385, "grad_norm": 3.160843849182129, "learning_rate": 9.723048327137548e-06, "loss": 0.0178, "step": 650 }, { "epoch": 0.3368206175044654, "grad_norm": 0.950518786907196, "learning_rate": 9.70446096654275e-06, "loss": 0.0154, "step": 660 }, { "epoch": 0.341923960193927, "grad_norm": 3.335329294204712, "learning_rate": 9.685873605947956e-06, "loss": 0.0149, "step": 670 }, { "epoch": 0.3470273028833886, "grad_norm": 2.4955413341522217, "learning_rate": 9.66728624535316e-06, "loss": 0.0095, "step": 680 }, { "epoch": 0.35213064557285023, "grad_norm": 1.4317821264266968, "learning_rate": 9.648698884758366e-06, "loss": 0.0143, "step": 690 }, { "epoch": 0.35723398826231184, "grad_norm": 1.6408915519714355, "learning_rate": 9.63011152416357e-06, "loss": 0.0262, "step": 700 }, { "epoch": 0.3623373309517734, "grad_norm": 5.888397693634033, "learning_rate": 9.611524163568774e-06, "loss": 0.0297, "step": 710 }, { "epoch": 0.367440673641235, "grad_norm": 2.526792526245117, "learning_rate": 9.592936802973979e-06, "loss": 0.0149, "step": 720 }, { "epoch": 0.3725440163306966, "grad_norm": 1.5245797634124756, "learning_rate": 9.574349442379182e-06, "loss": 0.0122, "step": 730 }, { "epoch": 0.3776473590201582, "grad_norm": 2.3768417835235596, "learning_rate": 9.555762081784387e-06, "loss": 0.0146, "step": 740 }, { "epoch": 0.3827507017096198, "grad_norm": 1.9545379877090454, "learning_rate": 9.537174721189592e-06, "loss": 0.0116, "step": 750 }, { "epoch": 0.3878540443990814, "grad_norm": 2.5888895988464355, "learning_rate": 9.518587360594797e-06, "loss": 0.0125, "step": 760 }, { "epoch": 0.392957387088543, "grad_norm": 2.554670810699463, "learning_rate": 9.5e-06, "loss": 0.0121, "step": 770 }, { "epoch": 0.3980607297780046, "grad_norm": 2.3274645805358887, "learning_rate": 9.481412639405206e-06, "loss": 0.0152, "step": 780 }, { "epoch": 0.4031640724674662, "grad_norm": 1.916551113128662, "learning_rate": 9.46282527881041e-06, "loss": 0.0181, "step": 790 }, { "epoch": 0.40826741515692777, "grad_norm": 2.7110981941223145, "learning_rate": 9.444237918215614e-06, "loss": 0.0201, "step": 800 }, { "epoch": 0.4133707578463894, "grad_norm": 2.66487193107605, "learning_rate": 9.425650557620819e-06, "loss": 0.0163, "step": 810 }, { "epoch": 0.418474100535851, "grad_norm": 4.3903303146362305, "learning_rate": 9.407063197026024e-06, "loss": 0.0203, "step": 820 }, { "epoch": 0.4235774432253126, "grad_norm": 0.8613393902778625, "learning_rate": 9.388475836431227e-06, "loss": 0.0101, "step": 830 }, { "epoch": 0.42868078591477415, "grad_norm": 14.285655975341797, "learning_rate": 9.369888475836432e-06, "loss": 0.0201, "step": 840 }, { "epoch": 0.43378412860423576, "grad_norm": 2.281245708465576, "learning_rate": 9.351301115241637e-06, "loss": 0.017, "step": 850 }, { "epoch": 0.4388874712936974, "grad_norm": 2.5612051486968994, "learning_rate": 9.33271375464684e-06, "loss": 0.0164, "step": 860 }, { "epoch": 0.443990813983159, "grad_norm": 3.728468894958496, "learning_rate": 9.314126394052045e-06, "loss": 0.0184, "step": 870 }, { "epoch": 0.4490941566726206, "grad_norm": 2.954237699508667, "learning_rate": 9.295539033457249e-06, "loss": 0.0216, "step": 880 }, { "epoch": 0.45419749936208215, "grad_norm": 2.5756335258483887, "learning_rate": 9.276951672862453e-06, "loss": 0.021, "step": 890 }, { "epoch": 0.45930084205154376, "grad_norm": 4.490197658538818, "learning_rate": 9.258364312267658e-06, "loss": 0.0138, "step": 900 }, { "epoch": 0.46440418474100537, "grad_norm": 1.9928340911865234, "learning_rate": 9.239776951672863e-06, "loss": 0.0158, "step": 910 }, { "epoch": 0.469507527430467, "grad_norm": 3.2016446590423584, "learning_rate": 9.221189591078068e-06, "loss": 0.0188, "step": 920 }, { "epoch": 0.47461087011992853, "grad_norm": 2.1624643802642822, "learning_rate": 9.202602230483272e-06, "loss": 0.0139, "step": 930 }, { "epoch": 0.47971421280939014, "grad_norm": 2.0108089447021484, "learning_rate": 9.184014869888477e-06, "loss": 0.0173, "step": 940 }, { "epoch": 0.48481755549885175, "grad_norm": 2.6266250610351562, "learning_rate": 9.16542750929368e-06, "loss": 0.0181, "step": 950 }, { "epoch": 0.48992089818831336, "grad_norm": 1.7041484117507935, "learning_rate": 9.146840148698885e-06, "loss": 0.0167, "step": 960 }, { "epoch": 0.49502424087777497, "grad_norm": 2.4042234420776367, "learning_rate": 9.12825278810409e-06, "loss": 0.017, "step": 970 }, { "epoch": 0.5001275835672365, "grad_norm": 1.770944595336914, "learning_rate": 9.109665427509295e-06, "loss": 0.01, "step": 980 }, { "epoch": 0.5052309262566982, "grad_norm": 2.101804256439209, "learning_rate": 9.0910780669145e-06, "loss": 0.0152, "step": 990 }, { "epoch": 0.5103342689461597, "grad_norm": 3.545254945755005, "learning_rate": 9.072490706319703e-06, "loss": 0.014, "step": 1000 }, { "epoch": 0.5154376116356213, "grad_norm": 2.445159912109375, "learning_rate": 9.053903345724908e-06, "loss": 0.0207, "step": 1010 }, { "epoch": 0.520540954325083, "grad_norm": 3.302297830581665, "learning_rate": 9.035315985130111e-06, "loss": 0.0212, "step": 1020 }, { "epoch": 0.5256442970145445, "grad_norm": 4.689877510070801, "learning_rate": 9.016728624535316e-06, "loss": 0.025, "step": 1030 }, { "epoch": 0.5307476397040061, "grad_norm": 4.139590740203857, "learning_rate": 8.998141263940521e-06, "loss": 0.0158, "step": 1040 }, { "epoch": 0.5358509823934677, "grad_norm": 1.6236610412597656, "learning_rate": 8.979553903345726e-06, "loss": 0.0112, "step": 1050 }, { "epoch": 0.5409543250829293, "grad_norm": 2.6642770767211914, "learning_rate": 8.96096654275093e-06, "loss": 0.0226, "step": 1060 }, { "epoch": 0.546057667772391, "grad_norm": 2.012868642807007, "learning_rate": 8.942379182156135e-06, "loss": 0.0172, "step": 1070 }, { "epoch": 0.5511610104618525, "grad_norm": 1.9676612615585327, "learning_rate": 8.92379182156134e-06, "loss": 0.0131, "step": 1080 }, { "epoch": 0.5562643531513141, "grad_norm": 3.358045816421509, "learning_rate": 8.905204460966543e-06, "loss": 0.0168, "step": 1090 }, { "epoch": 0.5613676958407757, "grad_norm": 1.9890451431274414, "learning_rate": 8.886617100371748e-06, "loss": 0.0158, "step": 1100 }, { "epoch": 0.5664710385302373, "grad_norm": 2.1915857791900635, "learning_rate": 8.868029739776953e-06, "loss": 0.015, "step": 1110 }, { "epoch": 0.571574381219699, "grad_norm": 2.0204272270202637, "learning_rate": 8.849442379182158e-06, "loss": 0.0217, "step": 1120 }, { "epoch": 0.5766777239091605, "grad_norm": 1.8702834844589233, "learning_rate": 8.830855018587361e-06, "loss": 0.014, "step": 1130 }, { "epoch": 0.581781066598622, "grad_norm": 0.8649874925613403, "learning_rate": 8.812267657992566e-06, "loss": 0.0168, "step": 1140 }, { "epoch": 0.5868844092880837, "grad_norm": 2.020085334777832, "learning_rate": 8.79368029739777e-06, "loss": 0.0166, "step": 1150 }, { "epoch": 0.5919877519775453, "grad_norm": 0.6940491199493408, "learning_rate": 8.775092936802974e-06, "loss": 0.014, "step": 1160 }, { "epoch": 0.5970910946670069, "grad_norm": 1.6421513557434082, "learning_rate": 8.75650557620818e-06, "loss": 0.0149, "step": 1170 }, { "epoch": 0.6021944373564685, "grad_norm": 1.7957764863967896, "learning_rate": 8.737918215613384e-06, "loss": 0.0194, "step": 1180 }, { "epoch": 0.60729778004593, "grad_norm": 1.6488491296768188, "learning_rate": 8.719330855018588e-06, "loss": 0.0119, "step": 1190 }, { "epoch": 0.6124011227353917, "grad_norm": 1.9999263286590576, "learning_rate": 8.700743494423793e-06, "loss": 0.0165, "step": 1200 }, { "epoch": 0.6175044654248533, "grad_norm": 1.749192237854004, "learning_rate": 8.682156133828998e-06, "loss": 0.0193, "step": 1210 }, { "epoch": 0.6226078081143149, "grad_norm": 2.414264440536499, "learning_rate": 8.663568773234201e-06, "loss": 0.0199, "step": 1220 }, { "epoch": 0.6277111508037765, "grad_norm": 2.670834541320801, "learning_rate": 8.644981412639406e-06, "loss": 0.0178, "step": 1230 }, { "epoch": 0.632814493493238, "grad_norm": 3.2673842906951904, "learning_rate": 8.626394052044609e-06, "loss": 0.0161, "step": 1240 }, { "epoch": 0.6379178361826997, "grad_norm": 2.5664849281311035, "learning_rate": 8.607806691449814e-06, "loss": 0.0213, "step": 1250 }, { "epoch": 0.6430211788721613, "grad_norm": 2.350846290588379, "learning_rate": 8.589219330855019e-06, "loss": 0.0181, "step": 1260 }, { "epoch": 0.6481245215616228, "grad_norm": 2.494407892227173, "learning_rate": 8.570631970260224e-06, "loss": 0.0128, "step": 1270 }, { "epoch": 0.6532278642510845, "grad_norm": 2.3424453735351562, "learning_rate": 8.552044609665429e-06, "loss": 0.0127, "step": 1280 }, { "epoch": 0.658331206940546, "grad_norm": 2.1651947498321533, "learning_rate": 8.533457249070632e-06, "loss": 0.0229, "step": 1290 }, { "epoch": 0.6634345496300077, "grad_norm": 0.5863803029060364, "learning_rate": 8.514869888475837e-06, "loss": 0.0145, "step": 1300 }, { "epoch": 0.6685378923194693, "grad_norm": 1.3225018978118896, "learning_rate": 8.49628252788104e-06, "loss": 0.0149, "step": 1310 }, { "epoch": 0.6736412350089308, "grad_norm": 3.3000130653381348, "learning_rate": 8.477695167286246e-06, "loss": 0.0211, "step": 1320 }, { "epoch": 0.6787445776983925, "grad_norm": 2.677570104598999, "learning_rate": 8.45910780669145e-06, "loss": 0.0113, "step": 1330 }, { "epoch": 0.683847920387854, "grad_norm": 1.235533595085144, "learning_rate": 8.440520446096656e-06, "loss": 0.0132, "step": 1340 }, { "epoch": 0.6889512630773157, "grad_norm": 1.7336188554763794, "learning_rate": 8.42193308550186e-06, "loss": 0.0147, "step": 1350 }, { "epoch": 0.6940546057667772, "grad_norm": 3.8093788623809814, "learning_rate": 8.403345724907064e-06, "loss": 0.0168, "step": 1360 }, { "epoch": 0.6991579484562388, "grad_norm": 1.9721407890319824, "learning_rate": 8.384758364312269e-06, "loss": 0.0148, "step": 1370 }, { "epoch": 0.7042612911457005, "grad_norm": 4.275414943695068, "learning_rate": 8.366171003717472e-06, "loss": 0.0171, "step": 1380 }, { "epoch": 0.709364633835162, "grad_norm": 1.36530339717865, "learning_rate": 8.347583643122677e-06, "loss": 0.0157, "step": 1390 }, { "epoch": 0.7144679765246237, "grad_norm": 2.0768120288848877, "learning_rate": 8.328996282527882e-06, "loss": 0.0197, "step": 1400 }, { "epoch": 0.7195713192140852, "grad_norm": 3.6376969814300537, "learning_rate": 8.310408921933087e-06, "loss": 0.02, "step": 1410 }, { "epoch": 0.7246746619035468, "grad_norm": 4.029935836791992, "learning_rate": 8.29182156133829e-06, "loss": 0.0132, "step": 1420 }, { "epoch": 0.7297780045930085, "grad_norm": 3.0603153705596924, "learning_rate": 8.273234200743495e-06, "loss": 0.0124, "step": 1430 }, { "epoch": 0.73488134728247, "grad_norm": 0.8475554585456848, "learning_rate": 8.2546468401487e-06, "loss": 0.0124, "step": 1440 }, { "epoch": 0.7399846899719316, "grad_norm": 1.9978291988372803, "learning_rate": 8.236059479553904e-06, "loss": 0.0117, "step": 1450 }, { "epoch": 0.7450880326613932, "grad_norm": 1.5020562410354614, "learning_rate": 8.217472118959108e-06, "loss": 0.0167, "step": 1460 }, { "epoch": 0.7501913753508548, "grad_norm": 1.614305853843689, "learning_rate": 8.198884758364313e-06, "loss": 0.0149, "step": 1470 }, { "epoch": 0.7552947180403164, "grad_norm": 2.371570110321045, "learning_rate": 8.180297397769518e-06, "loss": 0.0142, "step": 1480 }, { "epoch": 0.760398060729778, "grad_norm": 1.5552469491958618, "learning_rate": 8.161710037174722e-06, "loss": 0.0134, "step": 1490 }, { "epoch": 0.7655014034192396, "grad_norm": 1.9674372673034668, "learning_rate": 8.143122676579927e-06, "loss": 0.0225, "step": 1500 }, { "epoch": 0.7706047461087012, "grad_norm": 1.94131600856781, "learning_rate": 8.12453531598513e-06, "loss": 0.0132, "step": 1510 }, { "epoch": 0.7757080887981628, "grad_norm": 2.533285140991211, "learning_rate": 8.105947955390335e-06, "loss": 0.0164, "step": 1520 }, { "epoch": 0.7808114314876244, "grad_norm": 1.7931355237960815, "learning_rate": 8.08736059479554e-06, "loss": 0.0145, "step": 1530 }, { "epoch": 0.785914774177086, "grad_norm": 1.5637154579162598, "learning_rate": 8.068773234200745e-06, "loss": 0.0131, "step": 1540 }, { "epoch": 0.7910181168665475, "grad_norm": 1.0649983882904053, "learning_rate": 8.050185873605948e-06, "loss": 0.0317, "step": 1550 }, { "epoch": 0.7961214595560092, "grad_norm": 1.9837394952774048, "learning_rate": 8.031598513011153e-06, "loss": 0.0168, "step": 1560 }, { "epoch": 0.8012248022454708, "grad_norm": 3.6585099697113037, "learning_rate": 8.013011152416358e-06, "loss": 0.0131, "step": 1570 }, { "epoch": 0.8063281449349324, "grad_norm": 2.7953765392303467, "learning_rate": 7.994423791821561e-06, "loss": 0.0162, "step": 1580 }, { "epoch": 0.811431487624394, "grad_norm": 2.3890202045440674, "learning_rate": 7.975836431226766e-06, "loss": 0.0158, "step": 1590 }, { "epoch": 0.8165348303138555, "grad_norm": 2.073019504547119, "learning_rate": 7.95724907063197e-06, "loss": 0.0159, "step": 1600 }, { "epoch": 0.8216381730033172, "grad_norm": 2.4629039764404297, "learning_rate": 7.938661710037175e-06, "loss": 0.0117, "step": 1610 }, { "epoch": 0.8267415156927788, "grad_norm": 1.4736220836639404, "learning_rate": 7.92007434944238e-06, "loss": 0.0145, "step": 1620 }, { "epoch": 0.8318448583822403, "grad_norm": 3.2814719676971436, "learning_rate": 7.901486988847585e-06, "loss": 0.0143, "step": 1630 }, { "epoch": 0.836948201071702, "grad_norm": 2.1625795364379883, "learning_rate": 7.88289962825279e-06, "loss": 0.0118, "step": 1640 }, { "epoch": 0.8420515437611635, "grad_norm": 1.660874605178833, "learning_rate": 7.864312267657993e-06, "loss": 0.0127, "step": 1650 }, { "epoch": 0.8471548864506252, "grad_norm": 1.7518630027770996, "learning_rate": 7.845724907063198e-06, "loss": 0.0085, "step": 1660 }, { "epoch": 0.8522582291400868, "grad_norm": 1.452298879623413, "learning_rate": 7.827137546468401e-06, "loss": 0.0151, "step": 1670 }, { "epoch": 0.8573615718295483, "grad_norm": 1.8911986351013184, "learning_rate": 7.808550185873606e-06, "loss": 0.0166, "step": 1680 }, { "epoch": 0.86246491451901, "grad_norm": 3.8515708446502686, "learning_rate": 7.789962825278811e-06, "loss": 0.0221, "step": 1690 }, { "epoch": 0.8675682572084715, "grad_norm": 2.210042953491211, "learning_rate": 7.771375464684016e-06, "loss": 0.0208, "step": 1700 }, { "epoch": 0.8726715998979332, "grad_norm": 2.0735044479370117, "learning_rate": 7.75278810408922e-06, "loss": 0.0142, "step": 1710 }, { "epoch": 0.8777749425873947, "grad_norm": 2.415004253387451, "learning_rate": 7.734200743494424e-06, "loss": 0.0179, "step": 1720 }, { "epoch": 0.8828782852768563, "grad_norm": 2.272406816482544, "learning_rate": 7.71561338289963e-06, "loss": 0.0142, "step": 1730 }, { "epoch": 0.887981627966318, "grad_norm": 1.2048219442367554, "learning_rate": 7.697026022304833e-06, "loss": 0.0102, "step": 1740 }, { "epoch": 0.8930849706557795, "grad_norm": 1.414962887763977, "learning_rate": 7.678438661710038e-06, "loss": 0.0112, "step": 1750 }, { "epoch": 0.8981883133452412, "grad_norm": 0.9970257878303528, "learning_rate": 7.659851301115243e-06, "loss": 0.0116, "step": 1760 }, { "epoch": 0.9032916560347027, "grad_norm": 1.7614041566848755, "learning_rate": 7.641263940520448e-06, "loss": 0.0198, "step": 1770 }, { "epoch": 0.9083949987241643, "grad_norm": 2.285222291946411, "learning_rate": 7.622676579925651e-06, "loss": 0.0146, "step": 1780 }, { "epoch": 0.913498341413626, "grad_norm": 2.238495111465454, "learning_rate": 7.604089219330856e-06, "loss": 0.0102, "step": 1790 }, { "epoch": 0.9186016841030875, "grad_norm": 0.7516927123069763, "learning_rate": 7.58550185873606e-06, "loss": 0.0134, "step": 1800 }, { "epoch": 0.9237050267925491, "grad_norm": 1.6228662729263306, "learning_rate": 7.566914498141265e-06, "loss": 0.0182, "step": 1810 }, { "epoch": 0.9288083694820107, "grad_norm": 1.0676440000534058, "learning_rate": 7.548327137546469e-06, "loss": 0.0117, "step": 1820 }, { "epoch": 0.9339117121714723, "grad_norm": 2.345280170440674, "learning_rate": 7.529739776951673e-06, "loss": 0.0116, "step": 1830 }, { "epoch": 0.939015054860934, "grad_norm": 2.056405782699585, "learning_rate": 7.511152416356878e-06, "loss": 0.0181, "step": 1840 }, { "epoch": 0.9441183975503955, "grad_norm": 1.5895274877548218, "learning_rate": 7.492565055762082e-06, "loss": 0.0143, "step": 1850 }, { "epoch": 0.9492217402398571, "grad_norm": 3.693983554840088, "learning_rate": 7.473977695167287e-06, "loss": 0.0139, "step": 1860 }, { "epoch": 0.9543250829293187, "grad_norm": 1.7493189573287964, "learning_rate": 7.455390334572491e-06, "loss": 0.012, "step": 1870 }, { "epoch": 0.9594284256187803, "grad_norm": 6.353549957275391, "learning_rate": 7.436802973977696e-06, "loss": 0.0182, "step": 1880 }, { "epoch": 0.9645317683082419, "grad_norm": 3.067734956741333, "learning_rate": 7.4182156133829e-06, "loss": 0.0116, "step": 1890 }, { "epoch": 0.9696351109977035, "grad_norm": 2.4685025215148926, "learning_rate": 7.399628252788105e-06, "loss": 0.0153, "step": 1900 }, { "epoch": 0.974738453687165, "grad_norm": 2.9748520851135254, "learning_rate": 7.38104089219331e-06, "loss": 0.0196, "step": 1910 }, { "epoch": 0.9798417963766267, "grad_norm": 1.787302017211914, "learning_rate": 7.362453531598514e-06, "loss": 0.0139, "step": 1920 }, { "epoch": 0.9849451390660883, "grad_norm": 2.998495101928711, "learning_rate": 7.343866171003719e-06, "loss": 0.0117, "step": 1930 }, { "epoch": 0.9900484817555499, "grad_norm": 2.461190938949585, "learning_rate": 7.325278810408922e-06, "loss": 0.0188, "step": 1940 }, { "epoch": 0.9951518244450115, "grad_norm": 2.0859811305999756, "learning_rate": 7.306691449814127e-06, "loss": 0.0144, "step": 1950 }, { "epoch": 1.0, "grad_norm": 9.421244621276855, "learning_rate": 7.288104089219331e-06, "loss": 0.0214, "step": 1960 }, { "epoch": 1.0051033426894616, "grad_norm": 1.6903966665267944, "learning_rate": 7.269516728624536e-06, "loss": 0.0052, "step": 1970 }, { "epoch": 1.010206685378923, "grad_norm": 1.4454400539398193, "learning_rate": 7.25092936802974e-06, "loss": 0.0054, "step": 1980 }, { "epoch": 1.0153100280683849, "grad_norm": 1.46286141872406, "learning_rate": 7.2323420074349444e-06, "loss": 0.0036, "step": 1990 }, { "epoch": 1.0204133707578464, "grad_norm": 1.4223207235336304, "learning_rate": 7.213754646840149e-06, "loss": 0.0041, "step": 2000 }, { "epoch": 1.0204133707578464, "eval_loss": 0.007851608097553253, "eval_runtime": 5932.2159, "eval_samples_per_second": 2.642, "eval_steps_per_second": 0.33, "eval_wer": 0.6722360915468404, "step": 2000 }, { "epoch": 1.025516713447308, "grad_norm": 1.5885974168777466, "learning_rate": 7.1951672862453535e-06, "loss": 0.0058, "step": 2010 }, { "epoch": 1.0306200561367695, "grad_norm": 0.7237359881401062, "learning_rate": 7.1765799256505585e-06, "loss": 0.0056, "step": 2020 }, { "epoch": 1.035723398826231, "grad_norm": 2.5091042518615723, "learning_rate": 7.157992565055763e-06, "loss": 0.0055, "step": 2030 }, { "epoch": 1.0408267415156929, "grad_norm": 1.831894874572754, "learning_rate": 7.139405204460968e-06, "loss": 0.0054, "step": 2040 }, { "epoch": 1.0459300842051544, "grad_norm": 0.5377639532089233, "learning_rate": 7.120817843866171e-06, "loss": 0.0055, "step": 2050 }, { "epoch": 1.051033426894616, "grad_norm": 0.48973751068115234, "learning_rate": 7.102230483271376e-06, "loss": 0.0055, "step": 2060 }, { "epoch": 1.0561367695840775, "grad_norm": 1.8925316333770752, "learning_rate": 7.08364312267658e-06, "loss": 0.0097, "step": 2070 }, { "epoch": 1.061240112273539, "grad_norm": 1.005006194114685, "learning_rate": 7.065055762081785e-06, "loss": 0.0055, "step": 2080 }, { "epoch": 1.0663434549630009, "grad_norm": 1.7371063232421875, "learning_rate": 7.04646840148699e-06, "loss": 0.0067, "step": 2090 }, { "epoch": 1.0714467976524624, "grad_norm": 1.484964370727539, "learning_rate": 7.027881040892194e-06, "loss": 0.0041, "step": 2100 }, { "epoch": 1.076550140341924, "grad_norm": 1.0253629684448242, "learning_rate": 7.009293680297399e-06, "loss": 0.0065, "step": 2110 }, { "epoch": 1.0816534830313855, "grad_norm": 0.5347009897232056, "learning_rate": 6.990706319702602e-06, "loss": 0.0069, "step": 2120 }, { "epoch": 1.086756825720847, "grad_norm": 1.1465612649917603, "learning_rate": 6.972118959107807e-06, "loss": 0.0057, "step": 2130 }, { "epoch": 1.0918601684103089, "grad_norm": 0.8260084986686707, "learning_rate": 6.9535315985130115e-06, "loss": 0.0036, "step": 2140 }, { "epoch": 1.0969635110997704, "grad_norm": 0.7711835503578186, "learning_rate": 6.9349442379182165e-06, "loss": 0.0035, "step": 2150 }, { "epoch": 1.102066853789232, "grad_norm": 1.6993855237960815, "learning_rate": 6.916356877323421e-06, "loss": 0.0045, "step": 2160 }, { "epoch": 1.1071701964786935, "grad_norm": 1.6055148839950562, "learning_rate": 6.897769516728625e-06, "loss": 0.0037, "step": 2170 }, { "epoch": 1.112273539168155, "grad_norm": 1.5848637819290161, "learning_rate": 6.87918215613383e-06, "loss": 0.0057, "step": 2180 }, { "epoch": 1.1173768818576169, "grad_norm": 0.6338240504264832, "learning_rate": 6.860594795539034e-06, "loss": 0.0041, "step": 2190 }, { "epoch": 1.1224802245470784, "grad_norm": 0.5418840646743774, "learning_rate": 6.842007434944239e-06, "loss": 0.0056, "step": 2200 }, { "epoch": 1.12758356723654, "grad_norm": 1.729345679283142, "learning_rate": 6.823420074349443e-06, "loss": 0.0045, "step": 2210 }, { "epoch": 1.1326869099260015, "grad_norm": 2.351128578186035, "learning_rate": 6.804832713754648e-06, "loss": 0.0047, "step": 2220 }, { "epoch": 1.137790252615463, "grad_norm": 0.46814098954200745, "learning_rate": 6.786245353159851e-06, "loss": 0.0053, "step": 2230 }, { "epoch": 1.1428935953049248, "grad_norm": 1.0121688842773438, "learning_rate": 6.767657992565056e-06, "loss": 0.0056, "step": 2240 }, { "epoch": 1.1479969379943864, "grad_norm": 0.32681307196617126, "learning_rate": 6.74907063197026e-06, "loss": 0.0033, "step": 2250 }, { "epoch": 1.153100280683848, "grad_norm": 0.3472459614276886, "learning_rate": 6.730483271375465e-06, "loss": 0.005, "step": 2260 }, { "epoch": 1.1582036233733095, "grad_norm": 0.3251103460788727, "learning_rate": 6.7118959107806694e-06, "loss": 0.0067, "step": 2270 }, { "epoch": 1.163306966062771, "grad_norm": 1.372989535331726, "learning_rate": 6.6933085501858744e-06, "loss": 0.0066, "step": 2280 }, { "epoch": 1.1684103087522326, "grad_norm": 3.0950405597686768, "learning_rate": 6.674721189591079e-06, "loss": 0.0059, "step": 2290 }, { "epoch": 1.1735136514416944, "grad_norm": 0.6446343064308167, "learning_rate": 6.656133828996283e-06, "loss": 0.0048, "step": 2300 }, { "epoch": 1.178616994131156, "grad_norm": 2.7999908924102783, "learning_rate": 6.637546468401488e-06, "loss": 0.0075, "step": 2310 }, { "epoch": 1.1837203368206175, "grad_norm": 1.556735634803772, "learning_rate": 6.618959107806692e-06, "loss": 0.0078, "step": 2320 }, { "epoch": 1.188823679510079, "grad_norm": 0.6871877908706665, "learning_rate": 6.600371747211897e-06, "loss": 0.0039, "step": 2330 }, { "epoch": 1.1939270221995406, "grad_norm": 0.6974169611930847, "learning_rate": 6.581784386617101e-06, "loss": 0.0044, "step": 2340 }, { "epoch": 1.1990303648890024, "grad_norm": 0.34097474813461304, "learning_rate": 6.563197026022305e-06, "loss": 0.0038, "step": 2350 }, { "epoch": 1.204133707578464, "grad_norm": 1.1647700071334839, "learning_rate": 6.544609665427509e-06, "loss": 0.0042, "step": 2360 }, { "epoch": 1.2092370502679255, "grad_norm": 0.5699931383132935, "learning_rate": 6.526022304832714e-06, "loss": 0.0044, "step": 2370 }, { "epoch": 1.214340392957387, "grad_norm": 0.9477786421775818, "learning_rate": 6.507434944237919e-06, "loss": 0.0062, "step": 2380 }, { "epoch": 1.2194437356468486, "grad_norm": 1.1258920431137085, "learning_rate": 6.488847583643123e-06, "loss": 0.005, "step": 2390 }, { "epoch": 1.2245470783363104, "grad_norm": 1.580121636390686, "learning_rate": 6.470260223048328e-06, "loss": 0.0037, "step": 2400 }, { "epoch": 1.229650421025772, "grad_norm": 1.993891716003418, "learning_rate": 6.4516728624535315e-06, "loss": 0.0062, "step": 2410 }, { "epoch": 1.2347537637152335, "grad_norm": 1.034355878829956, "learning_rate": 6.4330855018587365e-06, "loss": 0.0053, "step": 2420 }, { "epoch": 1.239857106404695, "grad_norm": 1.6849045753479004, "learning_rate": 6.414498141263941e-06, "loss": 0.0056, "step": 2430 }, { "epoch": 1.2449604490941566, "grad_norm": 1.9418292045593262, "learning_rate": 6.395910780669146e-06, "loss": 0.006, "step": 2440 }, { "epoch": 1.2500637917836182, "grad_norm": 1.7483155727386475, "learning_rate": 6.37732342007435e-06, "loss": 0.0055, "step": 2450 }, { "epoch": 1.25516713447308, "grad_norm": 0.9368677139282227, "learning_rate": 6.358736059479555e-06, "loss": 0.006, "step": 2460 }, { "epoch": 1.2602704771625415, "grad_norm": 1.3387763500213623, "learning_rate": 6.34014869888476e-06, "loss": 0.0066, "step": 2470 }, { "epoch": 1.265373819852003, "grad_norm": 0.7016597986221313, "learning_rate": 6.321561338289963e-06, "loss": 0.0035, "step": 2480 }, { "epoch": 1.2704771625414646, "grad_norm": 2.289067268371582, "learning_rate": 6.302973977695168e-06, "loss": 0.0041, "step": 2490 }, { "epoch": 1.2755805052309261, "grad_norm": 2.0604097843170166, "learning_rate": 6.284386617100372e-06, "loss": 0.0029, "step": 2500 }, { "epoch": 1.280683847920388, "grad_norm": 0.09299144893884659, "learning_rate": 6.265799256505577e-06, "loss": 0.0065, "step": 2510 }, { "epoch": 1.2857871906098495, "grad_norm": 2.164297342300415, "learning_rate": 6.247211895910781e-06, "loss": 0.0041, "step": 2520 }, { "epoch": 1.290890533299311, "grad_norm": 1.1168850660324097, "learning_rate": 6.228624535315985e-06, "loss": 0.0064, "step": 2530 }, { "epoch": 1.2959938759887726, "grad_norm": 1.1941462755203247, "learning_rate": 6.2100371747211895e-06, "loss": 0.0048, "step": 2540 }, { "epoch": 1.3010972186782341, "grad_norm": 0.29545173048973083, "learning_rate": 6.1914498141263945e-06, "loss": 0.005, "step": 2550 }, { "epoch": 1.306200561367696, "grad_norm": 1.6539217233657837, "learning_rate": 6.1728624535315994e-06, "loss": 0.0035, "step": 2560 }, { "epoch": 1.3113039040571575, "grad_norm": 0.5509535670280457, "learning_rate": 6.1542750929368036e-06, "loss": 0.0081, "step": 2570 }, { "epoch": 1.316407246746619, "grad_norm": 1.1881476640701294, "learning_rate": 6.1356877323420085e-06, "loss": 0.0074, "step": 2580 }, { "epoch": 1.3215105894360806, "grad_norm": 1.1224453449249268, "learning_rate": 6.117100371747212e-06, "loss": 0.0058, "step": 2590 }, { "epoch": 1.3266139321255421, "grad_norm": 0.6658273935317993, "learning_rate": 6.098513011152417e-06, "loss": 0.0046, "step": 2600 }, { "epoch": 1.331717274815004, "grad_norm": 2.696826696395874, "learning_rate": 6.079925650557621e-06, "loss": 0.0046, "step": 2610 }, { "epoch": 1.3368206175044655, "grad_norm": 0.6089099645614624, "learning_rate": 6.061338289962826e-06, "loss": 0.0054, "step": 2620 }, { "epoch": 1.341923960193927, "grad_norm": 0.5594236850738525, "learning_rate": 6.04275092936803e-06, "loss": 0.0037, "step": 2630 }, { "epoch": 1.3470273028833886, "grad_norm": 2.5467419624328613, "learning_rate": 6.024163568773235e-06, "loss": 0.0062, "step": 2640 }, { "epoch": 1.3521306455728501, "grad_norm": 1.825701117515564, "learning_rate": 6.00557620817844e-06, "loss": 0.0074, "step": 2650 }, { "epoch": 1.357233988262312, "grad_norm": 1.2724944353103638, "learning_rate": 5.986988847583643e-06, "loss": 0.0055, "step": 2660 }, { "epoch": 1.3623373309517735, "grad_norm": 0.20556636154651642, "learning_rate": 5.968401486988848e-06, "loss": 0.0049, "step": 2670 }, { "epoch": 1.367440673641235, "grad_norm": 0.982221782207489, "learning_rate": 5.949814126394052e-06, "loss": 0.005, "step": 2680 }, { "epoch": 1.3725440163306966, "grad_norm": 0.5019739866256714, "learning_rate": 5.931226765799257e-06, "loss": 0.005, "step": 2690 }, { "epoch": 1.3776473590201581, "grad_norm": 0.9710202217102051, "learning_rate": 5.9126394052044615e-06, "loss": 0.0046, "step": 2700 }, { "epoch": 1.38275070170962, "grad_norm": 1.481512427330017, "learning_rate": 5.894052044609666e-06, "loss": 0.0039, "step": 2710 }, { "epoch": 1.3878540443990814, "grad_norm": 0.9244014024734497, "learning_rate": 5.87546468401487e-06, "loss": 0.0035, "step": 2720 }, { "epoch": 1.392957387088543, "grad_norm": 0.28111106157302856, "learning_rate": 5.856877323420075e-06, "loss": 0.0054, "step": 2730 }, { "epoch": 1.3980607297780046, "grad_norm": 1.0643965005874634, "learning_rate": 5.83828996282528e-06, "loss": 0.0048, "step": 2740 }, { "epoch": 1.403164072467466, "grad_norm": 0.3674823045730591, "learning_rate": 5.819702602230484e-06, "loss": 0.0049, "step": 2750 }, { "epoch": 1.4082674151569279, "grad_norm": 1.2270021438598633, "learning_rate": 5.801115241635689e-06, "loss": 0.0037, "step": 2760 }, { "epoch": 1.4133707578463894, "grad_norm": 4.543473243713379, "learning_rate": 5.782527881040892e-06, "loss": 0.0067, "step": 2770 }, { "epoch": 1.418474100535851, "grad_norm": 1.119815468788147, "learning_rate": 5.763940520446097e-06, "loss": 0.0036, "step": 2780 }, { "epoch": 1.4235774432253125, "grad_norm": 1.7222695350646973, "learning_rate": 5.745353159851301e-06, "loss": 0.0065, "step": 2790 }, { "epoch": 1.428680785914774, "grad_norm": 0.778711199760437, "learning_rate": 5.726765799256506e-06, "loss": 0.0034, "step": 2800 }, { "epoch": 1.4337841286042359, "grad_norm": 0.5175672173500061, "learning_rate": 5.70817843866171e-06, "loss": 0.0055, "step": 2810 }, { "epoch": 1.4388874712936974, "grad_norm": 1.3372684717178345, "learning_rate": 5.689591078066915e-06, "loss": 0.005, "step": 2820 }, { "epoch": 1.443990813983159, "grad_norm": 0.7624754309654236, "learning_rate": 5.67100371747212e-06, "loss": 0.0032, "step": 2830 }, { "epoch": 1.4490941566726205, "grad_norm": 0.597372829914093, "learning_rate": 5.652416356877324e-06, "loss": 0.0034, "step": 2840 }, { "epoch": 1.454197499362082, "grad_norm": 0.6024683713912964, "learning_rate": 5.633828996282529e-06, "loss": 0.0047, "step": 2850 }, { "epoch": 1.4593008420515439, "grad_norm": 3.4740748405456543, "learning_rate": 5.615241635687733e-06, "loss": 0.007, "step": 2860 }, { "epoch": 1.4644041847410054, "grad_norm": 1.7954155206680298, "learning_rate": 5.596654275092938e-06, "loss": 0.006, "step": 2870 }, { "epoch": 1.469507527430467, "grad_norm": 0.7482948899269104, "learning_rate": 5.578066914498142e-06, "loss": 0.0053, "step": 2880 }, { "epoch": 1.4746108701199285, "grad_norm": 2.095458507537842, "learning_rate": 5.559479553903346e-06, "loss": 0.0062, "step": 2890 }, { "epoch": 1.47971421280939, "grad_norm": 1.7963470220565796, "learning_rate": 5.54089219330855e-06, "loss": 0.0068, "step": 2900 }, { "epoch": 1.4848175554988519, "grad_norm": 2.6437880992889404, "learning_rate": 5.522304832713755e-06, "loss": 0.0092, "step": 2910 }, { "epoch": 1.4899208981883134, "grad_norm": 1.520580768585205, "learning_rate": 5.503717472118959e-06, "loss": 0.0042, "step": 2920 }, { "epoch": 1.495024240877775, "grad_norm": 4.081545352935791, "learning_rate": 5.485130111524164e-06, "loss": 0.0044, "step": 2930 }, { "epoch": 1.5001275835672365, "grad_norm": 1.9808855056762695, "learning_rate": 5.466542750929369e-06, "loss": 0.0043, "step": 2940 }, { "epoch": 1.505230926256698, "grad_norm": 0.6452007293701172, "learning_rate": 5.4479553903345724e-06, "loss": 0.0043, "step": 2950 }, { "epoch": 1.5103342689461599, "grad_norm": 0.26754477620124817, "learning_rate": 5.429368029739777e-06, "loss": 0.0036, "step": 2960 }, { "epoch": 1.5154376116356212, "grad_norm": 1.183559536933899, "learning_rate": 5.4107806691449816e-06, "loss": 0.0069, "step": 2970 }, { "epoch": 1.520540954325083, "grad_norm": 0.8674173355102539, "learning_rate": 5.3921933085501865e-06, "loss": 0.0041, "step": 2980 }, { "epoch": 1.5256442970145445, "grad_norm": 0.28192785382270813, "learning_rate": 5.373605947955391e-06, "loss": 0.0039, "step": 2990 }, { "epoch": 1.530747639704006, "grad_norm": 1.6907070875167847, "learning_rate": 5.355018587360596e-06, "loss": 0.0065, "step": 3000 }, { "epoch": 1.5358509823934678, "grad_norm": 1.0499199628829956, "learning_rate": 5.336431226765799e-06, "loss": 0.0029, "step": 3010 }, { "epoch": 1.5409543250829292, "grad_norm": 0.5462940335273743, "learning_rate": 5.317843866171004e-06, "loss": 0.0047, "step": 3020 }, { "epoch": 1.546057667772391, "grad_norm": 0.8141253590583801, "learning_rate": 5.299256505576209e-06, "loss": 0.0048, "step": 3030 }, { "epoch": 1.5511610104618525, "grad_norm": 0.5449689030647278, "learning_rate": 5.280669144981413e-06, "loss": 0.0041, "step": 3040 }, { "epoch": 1.556264353151314, "grad_norm": 1.7593544721603394, "learning_rate": 5.262081784386618e-06, "loss": 0.0051, "step": 3050 }, { "epoch": 1.5613676958407758, "grad_norm": 0.6444630026817322, "learning_rate": 5.243494423791822e-06, "loss": 0.0038, "step": 3060 }, { "epoch": 1.5664710385302372, "grad_norm": 0.7694640755653381, "learning_rate": 5.224907063197026e-06, "loss": 0.0076, "step": 3070 }, { "epoch": 1.571574381219699, "grad_norm": 0.9293046593666077, "learning_rate": 5.20631970260223e-06, "loss": 0.0057, "step": 3080 }, { "epoch": 1.5766777239091605, "grad_norm": 1.5675665140151978, "learning_rate": 5.187732342007435e-06, "loss": 0.0034, "step": 3090 }, { "epoch": 1.581781066598622, "grad_norm": 0.7534652352333069, "learning_rate": 5.1691449814126395e-06, "loss": 0.0056, "step": 3100 }, { "epoch": 1.5868844092880838, "grad_norm": 0.5952958464622498, "learning_rate": 5.1505576208178445e-06, "loss": 0.007, "step": 3110 }, { "epoch": 1.5919877519775452, "grad_norm": 2.0124547481536865, "learning_rate": 5.1319702602230495e-06, "loss": 0.0085, "step": 3120 }, { "epoch": 1.597091094667007, "grad_norm": 2.005147695541382, "learning_rate": 5.113382899628253e-06, "loss": 0.0054, "step": 3130 }, { "epoch": 1.6021944373564685, "grad_norm": 2.4814913272857666, "learning_rate": 5.094795539033458e-06, "loss": 0.0078, "step": 3140 }, { "epoch": 1.60729778004593, "grad_norm": 1.2369087934494019, "learning_rate": 5.076208178438662e-06, "loss": 0.0045, "step": 3150 }, { "epoch": 1.6124011227353918, "grad_norm": 1.4700592756271362, "learning_rate": 5.057620817843867e-06, "loss": 0.005, "step": 3160 }, { "epoch": 1.6175044654248532, "grad_norm": 0.8340181112289429, "learning_rate": 5.039033457249071e-06, "loss": 0.0039, "step": 3170 }, { "epoch": 1.622607808114315, "grad_norm": 1.1849232912063599, "learning_rate": 5.020446096654276e-06, "loss": 0.0057, "step": 3180 }, { "epoch": 1.6277111508037765, "grad_norm": 0.2937636077404022, "learning_rate": 5.001858736059479e-06, "loss": 0.0035, "step": 3190 }, { "epoch": 1.632814493493238, "grad_norm": 2.127737045288086, "learning_rate": 4.983271375464684e-06, "loss": 0.006, "step": 3200 }, { "epoch": 1.6379178361826998, "grad_norm": 0.5009581446647644, "learning_rate": 4.964684014869889e-06, "loss": 0.0049, "step": 3210 }, { "epoch": 1.6430211788721611, "grad_norm": 0.9251111745834351, "learning_rate": 4.946096654275093e-06, "loss": 0.0033, "step": 3220 }, { "epoch": 1.648124521561623, "grad_norm": 0.9365226626396179, "learning_rate": 4.9275092936802975e-06, "loss": 0.0057, "step": 3230 }, { "epoch": 1.6532278642510845, "grad_norm": 1.4188483953475952, "learning_rate": 4.9089219330855024e-06, "loss": 0.006, "step": 3240 }, { "epoch": 1.658331206940546, "grad_norm": 2.330155372619629, "learning_rate": 4.8903345724907066e-06, "loss": 0.0087, "step": 3250 }, { "epoch": 1.6634345496300078, "grad_norm": 0.6663316488265991, "learning_rate": 4.8717472118959115e-06, "loss": 0.0077, "step": 3260 }, { "epoch": 1.6685378923194691, "grad_norm": 0.3848799169063568, "learning_rate": 4.853159851301116e-06, "loss": 0.0037, "step": 3270 }, { "epoch": 1.673641235008931, "grad_norm": 1.8248586654663086, "learning_rate": 4.83457249070632e-06, "loss": 0.0048, "step": 3280 }, { "epoch": 1.6787445776983925, "grad_norm": 0.4323923885822296, "learning_rate": 4.815985130111525e-06, "loss": 0.0039, "step": 3290 }, { "epoch": 1.683847920387854, "grad_norm": 0.8399850726127625, "learning_rate": 4.797397769516729e-06, "loss": 0.0048, "step": 3300 }, { "epoch": 1.6889512630773158, "grad_norm": 1.9225555658340454, "learning_rate": 4.778810408921933e-06, "loss": 0.0049, "step": 3310 }, { "epoch": 1.6940546057667771, "grad_norm": 3.109381675720215, "learning_rate": 4.760223048327138e-06, "loss": 0.0032, "step": 3320 }, { "epoch": 1.699157948456239, "grad_norm": 0.18898829817771912, "learning_rate": 4.741635687732342e-06, "loss": 0.0044, "step": 3330 }, { "epoch": 1.7042612911457005, "grad_norm": 1.3611193895339966, "learning_rate": 4.723048327137547e-06, "loss": 0.0043, "step": 3340 }, { "epoch": 1.709364633835162, "grad_norm": 2.0131754875183105, "learning_rate": 4.704460966542751e-06, "loss": 0.0055, "step": 3350 }, { "epoch": 1.7144679765246238, "grad_norm": 0.367348313331604, "learning_rate": 4.685873605947956e-06, "loss": 0.0035, "step": 3360 }, { "epoch": 1.7195713192140851, "grad_norm": 0.44550269842147827, "learning_rate": 4.66728624535316e-06, "loss": 0.0048, "step": 3370 }, { "epoch": 1.724674661903547, "grad_norm": 0.8190656900405884, "learning_rate": 4.6486988847583645e-06, "loss": 0.0034, "step": 3380 }, { "epoch": 1.7297780045930085, "grad_norm": 1.4577871561050415, "learning_rate": 4.6301115241635695e-06, "loss": 0.0036, "step": 3390 }, { "epoch": 1.73488134728247, "grad_norm": 1.2969541549682617, "learning_rate": 4.611524163568774e-06, "loss": 0.0056, "step": 3400 }, { "epoch": 1.7399846899719316, "grad_norm": 1.2122235298156738, "learning_rate": 4.592936802973978e-06, "loss": 0.005, "step": 3410 }, { "epoch": 1.7450880326613931, "grad_norm": 1.4638077020645142, "learning_rate": 4.574349442379183e-06, "loss": 0.0067, "step": 3420 }, { "epoch": 1.750191375350855, "grad_norm": 0.8848273158073425, "learning_rate": 4.555762081784387e-06, "loss": 0.0043, "step": 3430 }, { "epoch": 1.7552947180403164, "grad_norm": 1.0738859176635742, "learning_rate": 4.537174721189592e-06, "loss": 0.003, "step": 3440 }, { "epoch": 1.760398060729778, "grad_norm": 0.7346417307853699, "learning_rate": 4.518587360594796e-06, "loss": 0.0064, "step": 3450 }, { "epoch": 1.7655014034192396, "grad_norm": 0.6269612908363342, "learning_rate": 4.5e-06, "loss": 0.0029, "step": 3460 }, { "epoch": 1.770604746108701, "grad_norm": 0.3140880763530731, "learning_rate": 4.481412639405205e-06, "loss": 0.0031, "step": 3470 }, { "epoch": 1.7757080887981629, "grad_norm": 1.5578272342681885, "learning_rate": 4.462825278810409e-06, "loss": 0.0049, "step": 3480 }, { "epoch": 1.7808114314876244, "grad_norm": 1.5797828435897827, "learning_rate": 4.444237918215613e-06, "loss": 0.0038, "step": 3490 }, { "epoch": 1.785914774177086, "grad_norm": 0.6058505773544312, "learning_rate": 4.425650557620818e-06, "loss": 0.0041, "step": 3500 }, { "epoch": 1.7910181168665475, "grad_norm": 1.7516237497329712, "learning_rate": 4.4070631970260225e-06, "loss": 0.0034, "step": 3510 }, { "epoch": 1.796121459556009, "grad_norm": 1.806767463684082, "learning_rate": 4.388475836431227e-06, "loss": 0.0055, "step": 3520 }, { "epoch": 1.8012248022454709, "grad_norm": 1.1925740242004395, "learning_rate": 4.369888475836432e-06, "loss": 0.009, "step": 3530 }, { "epoch": 1.8063281449349324, "grad_norm": 0.4238371253013611, "learning_rate": 4.3513011152416366e-06, "loss": 0.0049, "step": 3540 }, { "epoch": 1.811431487624394, "grad_norm": 0.4037840664386749, "learning_rate": 4.332713754646841e-06, "loss": 0.0036, "step": 3550 }, { "epoch": 1.8165348303138555, "grad_norm": 0.35048696398735046, "learning_rate": 4.314126394052045e-06, "loss": 0.0041, "step": 3560 }, { "epoch": 1.821638173003317, "grad_norm": 0.916644811630249, "learning_rate": 4.29553903345725e-06, "loss": 0.0052, "step": 3570 }, { "epoch": 1.8267415156927789, "grad_norm": 1.2729437351226807, "learning_rate": 4.276951672862454e-06, "loss": 0.0039, "step": 3580 }, { "epoch": 1.8318448583822402, "grad_norm": 0.14079026877880096, "learning_rate": 4.258364312267658e-06, "loss": 0.0052, "step": 3590 }, { "epoch": 1.836948201071702, "grad_norm": 0.7596153616905212, "learning_rate": 4.239776951672863e-06, "loss": 0.0036, "step": 3600 }, { "epoch": 1.8420515437611635, "grad_norm": 0.5967218279838562, "learning_rate": 4.221189591078067e-06, "loss": 0.0034, "step": 3610 }, { "epoch": 1.847154886450625, "grad_norm": 3.4013657569885254, "learning_rate": 4.202602230483272e-06, "loss": 0.0037, "step": 3620 }, { "epoch": 1.8522582291400869, "grad_norm": 0.5800639986991882, "learning_rate": 4.184014869888476e-06, "loss": 0.0054, "step": 3630 }, { "epoch": 1.8573615718295482, "grad_norm": 1.089106798171997, "learning_rate": 4.16542750929368e-06, "loss": 0.0034, "step": 3640 }, { "epoch": 1.86246491451901, "grad_norm": 1.3403905630111694, "learning_rate": 4.146840148698885e-06, "loss": 0.0081, "step": 3650 }, { "epoch": 1.8675682572084715, "grad_norm": 1.1740740537643433, "learning_rate": 4.1282527881040895e-06, "loss": 0.006, "step": 3660 }, { "epoch": 1.872671599897933, "grad_norm": 1.466586709022522, "learning_rate": 4.109665427509294e-06, "loss": 0.0057, "step": 3670 }, { "epoch": 1.8777749425873949, "grad_norm": 0.5066618919372559, "learning_rate": 4.091078066914499e-06, "loss": 0.0052, "step": 3680 }, { "epoch": 1.8828782852768562, "grad_norm": 1.9082309007644653, "learning_rate": 4.072490706319703e-06, "loss": 0.0107, "step": 3690 }, { "epoch": 1.887981627966318, "grad_norm": 1.2575207948684692, "learning_rate": 4.053903345724907e-06, "loss": 0.0036, "step": 3700 }, { "epoch": 1.8930849706557795, "grad_norm": 2.694517135620117, "learning_rate": 4.035315985130112e-06, "loss": 0.0037, "step": 3710 }, { "epoch": 1.898188313345241, "grad_norm": 1.3405297994613647, "learning_rate": 4.016728624535317e-06, "loss": 0.0035, "step": 3720 }, { "epoch": 1.9032916560347029, "grad_norm": 0.7254632115364075, "learning_rate": 3.998141263940521e-06, "loss": 0.0058, "step": 3730 }, { "epoch": 1.9083949987241642, "grad_norm": 0.9581726789474487, "learning_rate": 3.979553903345725e-06, "loss": 0.0035, "step": 3740 }, { "epoch": 1.913498341413626, "grad_norm": 1.521457314491272, "learning_rate": 3.96096654275093e-06, "loss": 0.0028, "step": 3750 }, { "epoch": 1.9186016841030875, "grad_norm": 1.7886348962783813, "learning_rate": 3.942379182156134e-06, "loss": 0.0036, "step": 3760 }, { "epoch": 1.923705026792549, "grad_norm": 1.3068257570266724, "learning_rate": 3.923791821561338e-06, "loss": 0.0035, "step": 3770 }, { "epoch": 1.9288083694820108, "grad_norm": 0.31176239252090454, "learning_rate": 3.905204460966543e-06, "loss": 0.0044, "step": 3780 }, { "epoch": 1.9339117121714722, "grad_norm": 1.6306222677230835, "learning_rate": 3.8866171003717475e-06, "loss": 0.0056, "step": 3790 }, { "epoch": 1.939015054860934, "grad_norm": 0.576551616191864, "learning_rate": 3.868029739776952e-06, "loss": 0.0043, "step": 3800 }, { "epoch": 1.9441183975503955, "grad_norm": 0.834531843662262, "learning_rate": 3.849442379182157e-06, "loss": 0.0042, "step": 3810 }, { "epoch": 1.949221740239857, "grad_norm": 0.5537549257278442, "learning_rate": 3.830855018587361e-06, "loss": 0.0032, "step": 3820 }, { "epoch": 1.9543250829293188, "grad_norm": 1.457414150238037, "learning_rate": 3.8122676579925653e-06, "loss": 0.0046, "step": 3830 }, { "epoch": 1.9594284256187802, "grad_norm": 1.4577444791793823, "learning_rate": 3.79368029739777e-06, "loss": 0.0042, "step": 3840 }, { "epoch": 1.964531768308242, "grad_norm": 1.1170302629470825, "learning_rate": 3.7750929368029744e-06, "loss": 0.0024, "step": 3850 }, { "epoch": 1.9696351109977035, "grad_norm": 1.491133213043213, "learning_rate": 3.7565055762081785e-06, "loss": 0.0034, "step": 3860 }, { "epoch": 1.974738453687165, "grad_norm": 0.6399077773094177, "learning_rate": 3.737918215613383e-06, "loss": 0.0037, "step": 3870 }, { "epoch": 1.9798417963766268, "grad_norm": 0.1969028264284134, "learning_rate": 3.7193308550185876e-06, "loss": 0.0032, "step": 3880 }, { "epoch": 1.9849451390660882, "grad_norm": 0.8877514600753784, "learning_rate": 3.7007434944237918e-06, "loss": 0.0048, "step": 3890 }, { "epoch": 1.99004848175555, "grad_norm": 0.6252923607826233, "learning_rate": 3.6821561338289967e-06, "loss": 0.0023, "step": 3900 }, { "epoch": 1.9951518244450115, "grad_norm": 0.5143133997917175, "learning_rate": 3.6635687732342013e-06, "loss": 0.0032, "step": 3910 }, { "epoch": 2.0, "grad_norm": 0.16264012455940247, "learning_rate": 3.6449814126394054e-06, "loss": 0.0032, "step": 3920 }, { "epoch": 2.0051033426894618, "grad_norm": 1.059360146522522, "learning_rate": 3.62639405204461e-06, "loss": 0.0019, "step": 3930 }, { "epoch": 2.010206685378923, "grad_norm": 0.20279403030872345, "learning_rate": 3.6078066914498145e-06, "loss": 0.0026, "step": 3940 }, { "epoch": 2.015310028068385, "grad_norm": 3.2196450233459473, "learning_rate": 3.5892193308550187e-06, "loss": 0.0014, "step": 3950 }, { "epoch": 2.020413370757846, "grad_norm": 0.14018088579177856, "learning_rate": 3.5706319702602232e-06, "loss": 0.0025, "step": 3960 }, { "epoch": 2.025516713447308, "grad_norm": 0.1343514323234558, "learning_rate": 3.5520446096654278e-06, "loss": 0.0014, "step": 3970 }, { "epoch": 2.0306200561367698, "grad_norm": 0.08496394008398056, "learning_rate": 3.533457249070632e-06, "loss": 0.0008, "step": 3980 }, { "epoch": 2.035723398826231, "grad_norm": 0.08516672253608704, "learning_rate": 3.514869888475837e-06, "loss": 0.0009, "step": 3990 }, { "epoch": 2.040826741515693, "grad_norm": 0.12272001802921295, "learning_rate": 3.4962825278810415e-06, "loss": 0.0047, "step": 4000 }, { "epoch": 2.040826741515693, "eval_loss": 0.0037899946328252554, "eval_runtime": 5936.4458, "eval_samples_per_second": 2.64, "eval_steps_per_second": 0.33, "eval_wer": 0.3019483689209356, "step": 4000 } ], "logging_steps": 10, "max_steps": 5880, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.692623348334592e+19, "train_batch_size": 16, "trial_name": null, "trial_params": null }