{
  "best_global_step": 2000,
  "best_metric": 0.6722360915468404,
  "best_model_checkpoint": "./SALAMA_NEW9/checkpoint-2000",
  "epoch": 1.0204133707578464,
  "eval_steps": 2000,
  "global_step": 2000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0051033426894615975,
      "grad_norm": 0.5211380124092102,
      "learning_rate": 1.8e-07,
      "loss": 0.0083,
      "step": 10
    },
    {
      "epoch": 0.010206685378923195,
      "grad_norm": 1.0282593965530396,
      "learning_rate": 3.8e-07,
      "loss": 0.0078,
      "step": 20
    },
    {
      "epoch": 0.015310028068384792,
      "grad_norm": 0.4742085933685303,
      "learning_rate": 5.800000000000001e-07,
      "loss": 0.0082,
      "step": 30
    },
    {
      "epoch": 0.02041337075784639,
      "grad_norm": 0.7376335263252258,
      "learning_rate": 7.8e-07,
      "loss": 0.0056,
      "step": 40
    },
    {
      "epoch": 0.025516713447307986,
      "grad_norm": 1.5874927043914795,
      "learning_rate": 9.800000000000001e-07,
      "loss": 0.012,
      "step": 50
    },
    {
      "epoch": 0.030620056136769585,
      "grad_norm": 0.6289522051811218,
      "learning_rate": 1.1800000000000001e-06,
      "loss": 0.0081,
      "step": 60
    },
    {
      "epoch": 0.035723398826231184,
      "grad_norm": 1.9904325008392334,
      "learning_rate": 1.3800000000000001e-06,
      "loss": 0.018,
      "step": 70
    },
    {
      "epoch": 0.04082674151569278,
      "grad_norm": 2.7978858947753906,
      "learning_rate": 1.5800000000000001e-06,
      "loss": 0.0083,
      "step": 80
    },
    {
      "epoch": 0.045930084205154376,
      "grad_norm": 1.7935121059417725,
      "learning_rate": 1.7800000000000001e-06,
      "loss": 0.0107,
      "step": 90
    },
    {
      "epoch": 0.05103342689461597,
      "grad_norm": 0.3225713074207306,
      "learning_rate": 1.98e-06,
      "loss": 0.01,
      "step": 100
    },
    {
      "epoch": 0.056136769584077574,
      "grad_norm": 0.9831269979476929,
      "learning_rate": 2.1800000000000003e-06,
      "loss": 0.0101,
      "step": 110
    },
    {
      "epoch": 0.06124011227353917,
      "grad_norm": 1.6235119104385376,
      "learning_rate": 2.38e-06,
      "loss": 0.0067,
      "step": 120
    },
    {
      "epoch": 0.06634345496300076,
      "grad_norm": 2.157205820083618,
      "learning_rate": 2.5800000000000003e-06,
      "loss": 0.008,
      "step": 130
    },
    {
      "epoch": 0.07144679765246237,
      "grad_norm": 0.38939204812049866,
      "learning_rate": 2.7800000000000005e-06,
      "loss": 0.0054,
      "step": 140
    },
    {
      "epoch": 0.07655014034192396,
      "grad_norm": 0.7885264754295349,
      "learning_rate": 2.9800000000000003e-06,
      "loss": 0.0081,
      "step": 150
    },
    {
      "epoch": 0.08165348303138556,
      "grad_norm": 1.518344759941101,
      "learning_rate": 3.1800000000000005e-06,
      "loss": 0.0088,
      "step": 160
    },
    {
      "epoch": 0.08675682572084716,
      "grad_norm": 0.5009278059005737,
      "learning_rate": 3.3800000000000007e-06,
      "loss": 0.005,
      "step": 170
    },
    {
      "epoch": 0.09186016841030875,
      "grad_norm": 2.364478588104248,
      "learning_rate": 3.58e-06,
      "loss": 0.0085,
      "step": 180
    },
    {
      "epoch": 0.09696351109977035,
      "grad_norm": 0.31605347990989685,
      "learning_rate": 3.7800000000000002e-06,
      "loss": 0.0073,
      "step": 190
    },
    {
      "epoch": 0.10206685378923194,
      "grad_norm": 1.516256332397461,
      "learning_rate": 3.980000000000001e-06,
      "loss": 0.0081,
      "step": 200
    },
    {
      "epoch": 0.10717019647869354,
      "grad_norm": 1.2425719499588013,
      "learning_rate": 4.18e-06,
      "loss": 0.0072,
      "step": 210
    },
    {
      "epoch": 0.11227353916815515,
      "grad_norm": 2.350731611251831,
      "learning_rate": 4.38e-06,
      "loss": 0.0147,
      "step": 220
    },
    {
      "epoch": 0.11737688185761674,
      "grad_norm": 0.7598681449890137,
      "learning_rate": 4.58e-06,
      "loss": 0.0117,
      "step": 230
    },
    {
      "epoch": 0.12248022454707834,
      "grad_norm": 0.8952233791351318,
      "learning_rate": 4.78e-06,
      "loss": 0.0089,
      "step": 240
    },
    {
      "epoch": 0.12758356723653994,
      "grad_norm": 0.6309432983398438,
      "learning_rate": 4.980000000000001e-06,
      "loss": 0.0047,
      "step": 250
    },
    {
      "epoch": 0.13268690992600152,
      "grad_norm": 1.7359912395477295,
      "learning_rate": 5.18e-06,
      "loss": 0.0096,
      "step": 260
    },
    {
      "epoch": 0.13779025261546313,
      "grad_norm": 2.6553421020507812,
      "learning_rate": 5.380000000000001e-06,
      "loss": 0.0078,
      "step": 270
    },
    {
      "epoch": 0.14289359530492474,
      "grad_norm": 1.1739858388900757,
      "learning_rate": 5.580000000000001e-06,
      "loss": 0.0087,
      "step": 280
    },
    {
      "epoch": 0.14799693799438632,
      "grad_norm": 0.708070695400238,
      "learning_rate": 5.78e-06,
      "loss": 0.0075,
      "step": 290
    },
    {
      "epoch": 0.15310028068384793,
      "grad_norm": 0.8523297309875488,
      "learning_rate": 5.98e-06,
      "loss": 0.0085,
      "step": 300
    },
    {
      "epoch": 0.1582036233733095,
      "grad_norm": 2.7822892665863037,
      "learning_rate": 6.18e-06,
      "loss": 0.0092,
      "step": 310
    },
    {
      "epoch": 0.16330696606277112,
      "grad_norm": 3.9056875705718994,
      "learning_rate": 6.380000000000001e-06,
      "loss": 0.0172,
      "step": 320
    },
    {
      "epoch": 0.1684103087522327,
      "grad_norm": 1.9748897552490234,
      "learning_rate": 6.5800000000000005e-06,
      "loss": 0.0086,
      "step": 330
    },
    {
      "epoch": 0.1735136514416943,
      "grad_norm": 1.313830852508545,
      "learning_rate": 6.780000000000001e-06,
      "loss": 0.0124,
      "step": 340
    },
    {
      "epoch": 0.17861699413115592,
      "grad_norm": 1.782209038734436,
      "learning_rate": 6.98e-06,
      "loss": 0.0166,
      "step": 350
    },
    {
      "epoch": 0.1837203368206175,
      "grad_norm": 2.647857427597046,
      "learning_rate": 7.180000000000001e-06,
      "loss": 0.0151,
      "step": 360
    },
    {
      "epoch": 0.1888236795100791,
      "grad_norm": 2.1850852966308594,
      "learning_rate": 7.3800000000000005e-06,
      "loss": 0.0116,
      "step": 370
    },
    {
      "epoch": 0.1939270221995407,
      "grad_norm": 2.69811749458313,
      "learning_rate": 7.58e-06,
      "loss": 0.0118,
      "step": 380
    },
    {
      "epoch": 0.1990303648890023,
      "grad_norm": 3.1227176189422607,
      "learning_rate": 7.78e-06,
      "loss": 0.0137,
      "step": 390
    },
    {
      "epoch": 0.20413370757846389,
      "grad_norm": 3.615041494369507,
      "learning_rate": 7.980000000000002e-06,
      "loss": 0.0113,
      "step": 400
    },
    {
      "epoch": 0.2092370502679255,
      "grad_norm": 2.067406177520752,
      "learning_rate": 8.18e-06,
      "loss": 0.0154,
      "step": 410
    },
    {
      "epoch": 0.21434039295738708,
      "grad_norm": 1.5332070589065552,
      "learning_rate": 8.380000000000001e-06,
      "loss": 0.0159,
      "step": 420
    },
    {
      "epoch": 0.2194437356468487,
      "grad_norm": 1.3139411211013794,
      "learning_rate": 8.580000000000001e-06,
      "loss": 0.0096,
      "step": 430
    },
    {
      "epoch": 0.2245470783363103,
      "grad_norm": 2.8063700199127197,
      "learning_rate": 8.78e-06,
      "loss": 0.0096,
      "step": 440
    },
    {
      "epoch": 0.22965042102577188,
      "grad_norm": 1.2194337844848633,
      "learning_rate": 8.98e-06,
      "loss": 0.0113,
      "step": 450
    },
    {
      "epoch": 0.2347537637152335,
      "grad_norm": 1.4826334714889526,
      "learning_rate": 9.180000000000002e-06,
      "loss": 0.01,
      "step": 460
    },
    {
      "epoch": 0.23985710640469507,
      "grad_norm": 1.4572652578353882,
      "learning_rate": 9.38e-06,
      "loss": 0.012,
      "step": 470
    },
    {
      "epoch": 0.24496044909415668,
      "grad_norm": 2.323155641555786,
      "learning_rate": 9.58e-06,
      "loss": 0.016,
      "step": 480
    },
    {
      "epoch": 0.25006379178361826,
      "grad_norm": 2.1754894256591797,
      "learning_rate": 9.780000000000001e-06,
      "loss": 0.0134,
      "step": 490
    },
    {
      "epoch": 0.25516713447307987,
      "grad_norm": 2.55068039894104,
      "learning_rate": 9.980000000000001e-06,
      "loss": 0.0133,
      "step": 500
    },
    {
      "epoch": 0.2602704771625415,
      "grad_norm": 2.78916072845459,
      "learning_rate": 9.983271375464685e-06,
      "loss": 0.0102,
      "step": 510
    },
    {
      "epoch": 0.26537381985200303,
      "grad_norm": 3.6304802894592285,
      "learning_rate": 9.96468401486989e-06,
      "loss": 0.0142,
      "step": 520
    },
    {
      "epoch": 0.27047716254146464,
      "grad_norm": 1.1248530149459839,
      "learning_rate": 9.946096654275093e-06,
      "loss": 0.0096,
      "step": 530
    },
    {
      "epoch": 0.27558050523092625,
      "grad_norm": 2.088334798812866,
      "learning_rate": 9.927509293680298e-06,
      "loss": 0.0189,
      "step": 540
    },
    {
      "epoch": 0.28068384792038786,
      "grad_norm": 2.035660982131958,
      "learning_rate": 9.908921933085503e-06,
      "loss": 0.0125,
      "step": 550
    },
    {
      "epoch": 0.2857871906098495,
      "grad_norm": 1.8379945755004883,
      "learning_rate": 9.890334572490708e-06,
      "loss": 0.01,
      "step": 560
    },
    {
      "epoch": 0.290890533299311,
      "grad_norm": 2.2616829872131348,
      "learning_rate": 9.871747211895911e-06,
      "loss": 0.0119,
      "step": 570
    },
    {
      "epoch": 0.29599387598877264,
      "grad_norm": 2.0087382793426514,
      "learning_rate": 9.853159851301116e-06,
      "loss": 0.0104,
      "step": 580
    },
    {
      "epoch": 0.30109721867823425,
      "grad_norm": 1.7350345849990845,
      "learning_rate": 9.83457249070632e-06,
      "loss": 0.0144,
      "step": 590
    },
    {
      "epoch": 0.30620056136769586,
      "grad_norm": 1.9522229433059692,
      "learning_rate": 9.815985130111524e-06,
      "loss": 0.0152,
      "step": 600
    },
    {
      "epoch": 0.31130390405715747,
      "grad_norm": 2.231642961502075,
      "learning_rate": 9.79739776951673e-06,
      "loss": 0.0138,
      "step": 610
    },
    {
      "epoch": 0.316407246746619,
      "grad_norm": 1.9675791263580322,
      "learning_rate": 9.778810408921934e-06,
      "loss": 0.0148,
      "step": 620
    },
    {
      "epoch": 0.32151058943608063,
      "grad_norm": 1.9099314212799072,
      "learning_rate": 9.76022304832714e-06,
      "loss": 0.0145,
      "step": 630
    },
    {
      "epoch": 0.32661393212554224,
      "grad_norm": 1.777403473854065,
      "learning_rate": 9.741635687732343e-06,
      "loss": 0.0164,
      "step": 640
    },
    {
      "epoch": 0.33171727481500385,
      "grad_norm": 3.160843849182129,
      "learning_rate": 9.723048327137548e-06,
      "loss": 0.0178,
      "step": 650
    },
    {
      "epoch": 0.3368206175044654,
      "grad_norm": 0.950518786907196,
      "learning_rate": 9.70446096654275e-06,
      "loss": 0.0154,
      "step": 660
    },
    {
      "epoch": 0.341923960193927,
      "grad_norm": 3.335329294204712,
      "learning_rate": 9.685873605947956e-06,
      "loss": 0.0149,
      "step": 670
    },
    {
      "epoch": 0.3470273028833886,
      "grad_norm": 2.4955413341522217,
      "learning_rate": 9.66728624535316e-06,
      "loss": 0.0095,
      "step": 680
    },
    {
      "epoch": 0.35213064557285023,
      "grad_norm": 1.4317821264266968,
      "learning_rate": 9.648698884758366e-06,
      "loss": 0.0143,
      "step": 690
    },
    {
      "epoch": 0.35723398826231184,
      "grad_norm": 1.6408915519714355,
      "learning_rate": 9.63011152416357e-06,
      "loss": 0.0262,
      "step": 700
    },
    {
      "epoch": 0.3623373309517734,
      "grad_norm": 5.888397693634033,
      "learning_rate": 9.611524163568774e-06,
      "loss": 0.0297,
      "step": 710
    },
    {
      "epoch": 0.367440673641235,
      "grad_norm": 2.526792526245117,
      "learning_rate": 9.592936802973979e-06,
      "loss": 0.0149,
      "step": 720
    },
    {
      "epoch": 0.3725440163306966,
      "grad_norm": 1.5245797634124756,
      "learning_rate": 9.574349442379182e-06,
      "loss": 0.0122,
      "step": 730
    },
    {
      "epoch": 0.3776473590201582,
      "grad_norm": 2.3768417835235596,
      "learning_rate": 9.555762081784387e-06,
      "loss": 0.0146,
      "step": 740
    },
    {
      "epoch": 0.3827507017096198,
      "grad_norm": 1.9545379877090454,
      "learning_rate": 9.537174721189592e-06,
      "loss": 0.0116,
      "step": 750
    },
    {
      "epoch": 0.3878540443990814,
      "grad_norm": 2.5888895988464355,
      "learning_rate": 9.518587360594797e-06,
      "loss": 0.0125,
      "step": 760
    },
    {
      "epoch": 0.392957387088543,
      "grad_norm": 2.554670810699463,
      "learning_rate": 9.5e-06,
      "loss": 0.0121,
      "step": 770
    },
    {
      "epoch": 0.3980607297780046,
      "grad_norm": 2.3274645805358887,
      "learning_rate": 9.481412639405206e-06,
      "loss": 0.0152,
      "step": 780
    },
    {
      "epoch": 0.4031640724674662,
      "grad_norm": 1.916551113128662,
      "learning_rate": 9.46282527881041e-06,
      "loss": 0.0181,
      "step": 790
    },
    {
      "epoch": 0.40826741515692777,
      "grad_norm": 2.7110981941223145,
      "learning_rate": 9.444237918215614e-06,
      "loss": 0.0201,
      "step": 800
    },
    {
      "epoch": 0.4133707578463894,
      "grad_norm": 2.66487193107605,
      "learning_rate": 9.425650557620819e-06,
      "loss": 0.0163,
      "step": 810
    },
    {
      "epoch": 0.418474100535851,
      "grad_norm": 4.3903303146362305,
      "learning_rate": 9.407063197026024e-06,
      "loss": 0.0203,
      "step": 820
    },
    {
      "epoch": 0.4235774432253126,
      "grad_norm": 0.8613393902778625,
      "learning_rate": 9.388475836431227e-06,
      "loss": 0.0101,
      "step": 830
    },
    {
      "epoch": 0.42868078591477415,
      "grad_norm": 14.285655975341797,
      "learning_rate": 9.369888475836432e-06,
      "loss": 0.0201,
      "step": 840
    },
    {
      "epoch": 0.43378412860423576,
      "grad_norm": 2.281245708465576,
      "learning_rate": 9.351301115241637e-06,
      "loss": 0.017,
      "step": 850
    },
    {
      "epoch": 0.4388874712936974,
      "grad_norm": 2.5612051486968994,
      "learning_rate": 9.33271375464684e-06,
      "loss": 0.0164,
      "step": 860
    },
    {
      "epoch": 0.443990813983159,
      "grad_norm": 3.728468894958496,
      "learning_rate": 9.314126394052045e-06,
      "loss": 0.0184,
      "step": 870
    },
    {
      "epoch": 0.4490941566726206,
      "grad_norm": 2.954237699508667,
      "learning_rate": 9.295539033457249e-06,
      "loss": 0.0216,
      "step": 880
    },
    {
      "epoch": 0.45419749936208215,
      "grad_norm": 2.5756335258483887,
      "learning_rate": 9.276951672862453e-06,
      "loss": 0.021,
      "step": 890
    },
    {
      "epoch": 0.45930084205154376,
      "grad_norm": 4.490197658538818,
      "learning_rate": 9.258364312267658e-06,
      "loss": 0.0138,
      "step": 900
    },
    {
      "epoch": 0.46440418474100537,
      "grad_norm": 1.9928340911865234,
      "learning_rate": 9.239776951672863e-06,
      "loss": 0.0158,
      "step": 910
    },
    {
      "epoch": 0.469507527430467,
      "grad_norm": 3.2016446590423584,
      "learning_rate": 9.221189591078068e-06,
      "loss": 0.0188,
      "step": 920
    },
    {
      "epoch": 0.47461087011992853,
      "grad_norm": 2.1624643802642822,
      "learning_rate": 9.202602230483272e-06,
      "loss": 0.0139,
      "step": 930
    },
    {
      "epoch": 0.47971421280939014,
      "grad_norm": 2.0108089447021484,
      "learning_rate": 9.184014869888477e-06,
      "loss": 0.0173,
      "step": 940
    },
    {
      "epoch": 0.48481755549885175,
      "grad_norm": 2.6266250610351562,
      "learning_rate": 9.16542750929368e-06,
      "loss": 0.0181,
      "step": 950
    },
    {
      "epoch": 0.48992089818831336,
      "grad_norm": 1.7041484117507935,
      "learning_rate": 9.146840148698885e-06,
      "loss": 0.0167,
      "step": 960
    },
    {
      "epoch": 0.49502424087777497,
      "grad_norm": 2.4042234420776367,
      "learning_rate": 9.12825278810409e-06,
      "loss": 0.017,
      "step": 970
    },
    {
      "epoch": 0.5001275835672365,
      "grad_norm": 1.770944595336914,
      "learning_rate": 9.109665427509295e-06,
      "loss": 0.01,
      "step": 980
    },
    {
      "epoch": 0.5052309262566982,
      "grad_norm": 2.101804256439209,
      "learning_rate": 9.0910780669145e-06,
      "loss": 0.0152,
      "step": 990
    },
    {
      "epoch": 0.5103342689461597,
      "grad_norm": 3.545254945755005,
      "learning_rate": 9.072490706319703e-06,
      "loss": 0.014,
      "step": 1000
    },
    {
      "epoch": 0.5154376116356213,
      "grad_norm": 2.445159912109375,
      "learning_rate": 9.053903345724908e-06,
      "loss": 0.0207,
      "step": 1010
    },
    {
      "epoch": 0.520540954325083,
      "grad_norm": 3.302297830581665,
      "learning_rate": 9.035315985130111e-06,
      "loss": 0.0212,
      "step": 1020
    },
    {
      "epoch": 0.5256442970145445,
      "grad_norm": 4.689877510070801,
      "learning_rate": 9.016728624535316e-06,
      "loss": 0.025,
      "step": 1030
    },
    {
      "epoch": 0.5307476397040061,
      "grad_norm": 4.139590740203857,
      "learning_rate": 8.998141263940521e-06,
      "loss": 0.0158,
      "step": 1040
    },
    {
      "epoch": 0.5358509823934677,
      "grad_norm": 1.6236610412597656,
      "learning_rate": 8.979553903345726e-06,
      "loss": 0.0112,
      "step": 1050
    },
    {
      "epoch": 0.5409543250829293,
      "grad_norm": 2.6642770767211914,
      "learning_rate": 8.96096654275093e-06,
      "loss": 0.0226,
      "step": 1060
    },
    {
      "epoch": 0.546057667772391,
      "grad_norm": 2.012868642807007,
      "learning_rate": 8.942379182156135e-06,
      "loss": 0.0172,
      "step": 1070
    },
    {
      "epoch": 0.5511610104618525,
      "grad_norm": 1.9676612615585327,
      "learning_rate": 8.92379182156134e-06,
      "loss": 0.0131,
      "step": 1080
    },
    {
      "epoch": 0.5562643531513141,
      "grad_norm": 3.358045816421509,
      "learning_rate": 8.905204460966543e-06,
      "loss": 0.0168,
      "step": 1090
    },
    {
      "epoch": 0.5613676958407757,
      "grad_norm": 1.9890451431274414,
      "learning_rate": 8.886617100371748e-06,
      "loss": 0.0158,
      "step": 1100
    },
    {
      "epoch": 0.5664710385302373,
      "grad_norm": 2.1915857791900635,
      "learning_rate": 8.868029739776953e-06,
      "loss": 0.015,
      "step": 1110
    },
    {
      "epoch": 0.571574381219699,
      "grad_norm": 2.0204272270202637,
      "learning_rate": 8.849442379182158e-06,
      "loss": 0.0217,
      "step": 1120
    },
    {
      "epoch": 0.5766777239091605,
      "grad_norm": 1.8702834844589233,
      "learning_rate": 8.830855018587361e-06,
      "loss": 0.014,
      "step": 1130
    },
    {
      "epoch": 0.581781066598622,
      "grad_norm": 0.8649874925613403,
      "learning_rate": 8.812267657992566e-06,
      "loss": 0.0168,
      "step": 1140
    },
    {
      "epoch": 0.5868844092880837,
      "grad_norm": 2.020085334777832,
      "learning_rate": 8.79368029739777e-06,
      "loss": 0.0166,
      "step": 1150
    },
    {
      "epoch": 0.5919877519775453,
      "grad_norm": 0.6940491199493408,
      "learning_rate": 8.775092936802974e-06,
      "loss": 0.014,
      "step": 1160
    },
    {
      "epoch": 0.5970910946670069,
      "grad_norm": 1.6421513557434082,
      "learning_rate": 8.75650557620818e-06,
      "loss": 0.0149,
      "step": 1170
    },
    {
      "epoch": 0.6021944373564685,
      "grad_norm": 1.7957764863967896,
      "learning_rate": 8.737918215613384e-06,
      "loss": 0.0194,
      "step": 1180
    },
    {
      "epoch": 0.60729778004593,
      "grad_norm": 1.6488491296768188,
      "learning_rate": 8.719330855018588e-06,
      "loss": 0.0119,
      "step": 1190
    },
    {
      "epoch": 0.6124011227353917,
      "grad_norm": 1.9999263286590576,
      "learning_rate": 8.700743494423793e-06,
      "loss": 0.0165,
      "step": 1200
    },
    {
      "epoch": 0.6175044654248533,
      "grad_norm": 1.749192237854004,
      "learning_rate": 8.682156133828998e-06,
      "loss": 0.0193,
      "step": 1210
    },
    {
      "epoch": 0.6226078081143149,
      "grad_norm": 2.414264440536499,
      "learning_rate": 8.663568773234201e-06,
      "loss": 0.0199,
      "step": 1220
    },
    {
      "epoch": 0.6277111508037765,
      "grad_norm": 2.670834541320801,
      "learning_rate": 8.644981412639406e-06,
      "loss": 0.0178,
      "step": 1230
    },
    {
      "epoch": 0.632814493493238,
      "grad_norm": 3.2673842906951904,
      "learning_rate": 8.626394052044609e-06,
      "loss": 0.0161,
      "step": 1240
    },
    {
      "epoch": 0.6379178361826997,
      "grad_norm": 2.5664849281311035,
      "learning_rate": 8.607806691449814e-06,
      "loss": 0.0213,
      "step": 1250
    },
    {
      "epoch": 0.6430211788721613,
      "grad_norm": 2.350846290588379,
      "learning_rate": 8.589219330855019e-06,
      "loss": 0.0181,
      "step": 1260
    },
    {
      "epoch": 0.6481245215616228,
      "grad_norm": 2.494407892227173,
      "learning_rate": 8.570631970260224e-06,
      "loss": 0.0128,
      "step": 1270
    },
    {
      "epoch": 0.6532278642510845,
      "grad_norm": 2.3424453735351562,
      "learning_rate": 8.552044609665429e-06,
      "loss": 0.0127,
      "step": 1280
    },
    {
      "epoch": 0.658331206940546,
      "grad_norm": 2.1651947498321533,
      "learning_rate": 8.533457249070632e-06,
      "loss": 0.0229,
      "step": 1290
    },
    {
      "epoch": 0.6634345496300077,
      "grad_norm": 0.5863803029060364,
      "learning_rate": 8.514869888475837e-06,
      "loss": 0.0145,
      "step": 1300
    },
    {
      "epoch": 0.6685378923194693,
      "grad_norm": 1.3225018978118896,
      "learning_rate": 8.49628252788104e-06,
      "loss": 0.0149,
      "step": 1310
    },
    {
      "epoch": 0.6736412350089308,
      "grad_norm": 3.3000130653381348,
      "learning_rate": 8.477695167286246e-06,
      "loss": 0.0211,
      "step": 1320
    },
    {
      "epoch": 0.6787445776983925,
      "grad_norm": 2.677570104598999,
      "learning_rate": 8.45910780669145e-06,
      "loss": 0.0113,
      "step": 1330
    },
    {
      "epoch": 0.683847920387854,
      "grad_norm": 1.235533595085144,
      "learning_rate": 8.440520446096656e-06,
      "loss": 0.0132,
      "step": 1340
    },
    {
      "epoch": 0.6889512630773157,
      "grad_norm": 1.7336188554763794,
      "learning_rate": 8.42193308550186e-06,
      "loss": 0.0147,
      "step": 1350
    },
    {
      "epoch": 0.6940546057667772,
      "grad_norm": 3.8093788623809814,
      "learning_rate": 8.403345724907064e-06,
      "loss": 0.0168,
      "step": 1360
    },
    {
      "epoch": 0.6991579484562388,
      "grad_norm": 1.9721407890319824,
      "learning_rate": 8.384758364312269e-06,
      "loss": 0.0148,
      "step": 1370
    },
    {
      "epoch": 0.7042612911457005,
      "grad_norm": 4.275414943695068,
      "learning_rate": 8.366171003717472e-06,
      "loss": 0.0171,
      "step": 1380
    },
    {
      "epoch": 0.709364633835162,
      "grad_norm": 1.36530339717865,
      "learning_rate": 8.347583643122677e-06,
      "loss": 0.0157,
      "step": 1390
    },
    {
      "epoch": 0.7144679765246237,
      "grad_norm": 2.0768120288848877,
      "learning_rate": 8.328996282527882e-06,
      "loss": 0.0197,
      "step": 1400
    },
    {
      "epoch": 0.7195713192140852,
      "grad_norm": 3.6376969814300537,
      "learning_rate": 8.310408921933087e-06,
      "loss": 0.02,
      "step": 1410
    },
    {
      "epoch": 0.7246746619035468,
      "grad_norm": 4.029935836791992,
      "learning_rate": 8.29182156133829e-06,
      "loss": 0.0132,
      "step": 1420
    },
    {
      "epoch": 0.7297780045930085,
      "grad_norm": 3.0603153705596924,
      "learning_rate": 8.273234200743495e-06,
      "loss": 0.0124,
      "step": 1430
    },
    {
      "epoch": 0.73488134728247,
      "grad_norm": 0.8475554585456848,
      "learning_rate": 8.2546468401487e-06,
      "loss": 0.0124,
      "step": 1440
    },
    {
      "epoch": 0.7399846899719316,
      "grad_norm": 1.9978291988372803,
      "learning_rate": 8.236059479553904e-06,
      "loss": 0.0117,
      "step": 1450
    },
    {
      "epoch": 0.7450880326613932,
      "grad_norm": 1.5020562410354614,
      "learning_rate": 8.217472118959108e-06,
      "loss": 0.0167,
      "step": 1460
    },
    {
      "epoch": 0.7501913753508548,
      "grad_norm": 1.614305853843689,
      "learning_rate": 8.198884758364313e-06,
      "loss": 0.0149,
      "step": 1470
    },
    {
      "epoch": 0.7552947180403164,
      "grad_norm": 2.371570110321045,
      "learning_rate": 8.180297397769518e-06,
      "loss": 0.0142,
      "step": 1480
    },
    {
      "epoch": 0.760398060729778,
      "grad_norm": 1.5552469491958618,
      "learning_rate": 8.161710037174722e-06,
      "loss": 0.0134,
      "step": 1490
    },
    {
      "epoch": 0.7655014034192396,
      "grad_norm": 1.9674372673034668,
      "learning_rate": 8.143122676579927e-06,
      "loss": 0.0225,
      "step": 1500
    },
    {
      "epoch": 0.7706047461087012,
      "grad_norm": 1.94131600856781,
      "learning_rate": 8.12453531598513e-06,
      "loss": 0.0132,
      "step": 1510
    },
    {
      "epoch": 0.7757080887981628,
      "grad_norm": 2.533285140991211,
      "learning_rate": 8.105947955390335e-06,
      "loss": 0.0164,
      "step": 1520
    },
    {
      "epoch": 0.7808114314876244,
      "grad_norm": 1.7931355237960815,
      "learning_rate": 8.08736059479554e-06,
      "loss": 0.0145,
      "step": 1530
    },
    {
      "epoch": 0.785914774177086,
      "grad_norm": 1.5637154579162598,
      "learning_rate": 8.068773234200745e-06,
      "loss": 0.0131,
      "step": 1540
    },
    {
      "epoch": 0.7910181168665475,
      "grad_norm": 1.0649983882904053,
      "learning_rate": 8.050185873605948e-06,
      "loss": 0.0317,
      "step": 1550
    },
    {
      "epoch": 0.7961214595560092,
      "grad_norm": 1.9837394952774048,
      "learning_rate": 8.031598513011153e-06,
      "loss": 0.0168,
      "step": 1560
    },
    {
      "epoch": 0.8012248022454708,
      "grad_norm": 3.6585099697113037,
      "learning_rate": 8.013011152416358e-06,
      "loss": 0.0131,
      "step": 1570
    },
    {
      "epoch": 0.8063281449349324,
      "grad_norm": 2.7953765392303467,
      "learning_rate": 7.994423791821561e-06,
      "loss": 0.0162,
      "step": 1580
    },
    {
      "epoch": 0.811431487624394,
      "grad_norm": 2.3890202045440674,
      "learning_rate": 7.975836431226766e-06,
      "loss": 0.0158,
      "step": 1590
    },
    {
      "epoch": 0.8165348303138555,
      "grad_norm": 2.073019504547119,
      "learning_rate": 7.95724907063197e-06,
      "loss": 0.0159,
      "step": 1600
    },
    {
      "epoch": 0.8216381730033172,
      "grad_norm": 2.4629039764404297,
      "learning_rate": 7.938661710037175e-06,
      "loss": 0.0117,
      "step": 1610
    },
    {
      "epoch": 0.8267415156927788,
      "grad_norm": 1.4736220836639404,
      "learning_rate": 7.92007434944238e-06,
      "loss": 0.0145,
      "step": 1620
    },
    {
      "epoch": 0.8318448583822403,
      "grad_norm": 3.2814719676971436,
      "learning_rate": 7.901486988847585e-06,
      "loss": 0.0143,
      "step": 1630
    },
    {
      "epoch": 0.836948201071702,
      "grad_norm": 2.1625795364379883,
      "learning_rate": 7.88289962825279e-06,
      "loss": 0.0118,
      "step": 1640
    },
    {
      "epoch": 0.8420515437611635,
      "grad_norm": 1.660874605178833,
      "learning_rate": 7.864312267657993e-06,
      "loss": 0.0127,
      "step": 1650
    },
    {
      "epoch": 0.8471548864506252,
      "grad_norm": 1.7518630027770996,
      "learning_rate": 7.845724907063198e-06,
      "loss": 0.0085,
      "step": 1660
    },
    {
      "epoch": 0.8522582291400868,
      "grad_norm": 1.452298879623413,
      "learning_rate": 7.827137546468401e-06,
      "loss": 0.0151,
      "step": 1670
    },
    {
      "epoch": 0.8573615718295483,
      "grad_norm": 1.8911986351013184,
      "learning_rate": 7.808550185873606e-06,
      "loss": 0.0166,
      "step": 1680
    },
    {
      "epoch": 0.86246491451901,
      "grad_norm": 3.8515708446502686,
      "learning_rate": 7.789962825278811e-06,
      "loss": 0.0221,
      "step": 1690
    },
    {
      "epoch": 0.8675682572084715,
      "grad_norm": 2.210042953491211,
      "learning_rate": 7.771375464684016e-06,
      "loss": 0.0208,
      "step": 1700
    },
    {
      "epoch": 0.8726715998979332,
      "grad_norm": 2.0735044479370117,
      "learning_rate": 7.75278810408922e-06,
      "loss": 0.0142,
      "step": 1710
    },
    {
      "epoch": 0.8777749425873947,
      "grad_norm": 2.415004253387451,
      "learning_rate": 7.734200743494424e-06,
      "loss": 0.0179,
      "step": 1720
    },
    {
      "epoch": 0.8828782852768563,
      "grad_norm": 2.272406816482544,
      "learning_rate": 7.71561338289963e-06,
      "loss": 0.0142,
      "step": 1730
    },
    {
      "epoch": 0.887981627966318,
      "grad_norm": 1.2048219442367554,
      "learning_rate": 7.697026022304833e-06,
      "loss": 0.0102,
      "step": 1740
    },
    {
      "epoch": 0.8930849706557795,
      "grad_norm": 1.414962887763977,
      "learning_rate": 7.678438661710038e-06,
      "loss": 0.0112,
      "step": 1750
    },
    {
      "epoch": 0.8981883133452412,
      "grad_norm": 0.9970257878303528,
      "learning_rate": 7.659851301115243e-06,
      "loss": 0.0116,
      "step": 1760
    },
    {
      "epoch": 0.9032916560347027,
      "grad_norm": 1.7614041566848755,
      "learning_rate": 7.641263940520448e-06,
      "loss": 0.0198,
      "step": 1770
    },
    {
      "epoch": 0.9083949987241643,
      "grad_norm": 2.285222291946411,
      "learning_rate": 7.622676579925651e-06,
      "loss": 0.0146,
      "step": 1780
    },
    {
      "epoch": 0.913498341413626,
      "grad_norm": 2.238495111465454,
      "learning_rate": 7.604089219330856e-06,
      "loss": 0.0102,
      "step": 1790
    },
    {
      "epoch": 0.9186016841030875,
      "grad_norm": 0.7516927123069763,
      "learning_rate": 7.58550185873606e-06,
      "loss": 0.0134,
      "step": 1800
    },
    {
      "epoch": 0.9237050267925491,
      "grad_norm": 1.6228662729263306,
      "learning_rate": 7.566914498141265e-06,
      "loss": 0.0182,
      "step": 1810
    },
    {
      "epoch": 0.9288083694820107,
      "grad_norm": 1.0676440000534058,
      "learning_rate": 7.548327137546469e-06,
      "loss": 0.0117,
      "step": 1820
    },
    {
      "epoch": 0.9339117121714723,
      "grad_norm": 2.345280170440674,
      "learning_rate": 7.529739776951673e-06,
      "loss": 0.0116,
      "step": 1830
    },
    {
      "epoch": 0.939015054860934,
      "grad_norm": 2.056405782699585,
      "learning_rate": 7.511152416356878e-06,
      "loss": 0.0181,
      "step": 1840
    },
    {
      "epoch": 0.9441183975503955,
      "grad_norm": 1.5895274877548218,
      "learning_rate": 7.492565055762082e-06,
      "loss": 0.0143,
      "step": 1850
    },
    {
      "epoch": 0.9492217402398571,
      "grad_norm": 3.693983554840088,
      "learning_rate": 7.473977695167287e-06,
      "loss": 0.0139,
      "step": 1860
    },
    {
      "epoch": 0.9543250829293187,
      "grad_norm": 1.7493189573287964,
      "learning_rate": 7.455390334572491e-06,
      "loss": 0.012,
      "step": 1870
    },
    {
      "epoch": 0.9594284256187803,
      "grad_norm": 6.353549957275391,
      "learning_rate": 7.436802973977696e-06,
      "loss": 0.0182,
      "step": 1880
    },
    {
      "epoch": 0.9645317683082419,
      "grad_norm": 3.067734956741333,
      "learning_rate": 7.4182156133829e-06,
      "loss": 0.0116,
      "step": 1890
    },
    {
      "epoch": 0.9696351109977035,
      "grad_norm": 2.4685025215148926,
      "learning_rate": 7.399628252788105e-06,
      "loss": 0.0153,
      "step": 1900
    },
    {
      "epoch": 0.974738453687165,
      "grad_norm": 2.9748520851135254,
      "learning_rate": 7.38104089219331e-06,
      "loss": 0.0196,
      "step": 1910
    },
    {
      "epoch": 0.9798417963766267,
      "grad_norm": 1.787302017211914,
      "learning_rate": 7.362453531598514e-06,
      "loss": 0.0139,
      "step": 1920
    },
    {
      "epoch": 0.9849451390660883,
      "grad_norm": 2.998495101928711,
      "learning_rate": 7.343866171003719e-06,
      "loss": 0.0117,
      "step": 1930
    },
    {
      "epoch": 0.9900484817555499,
      "grad_norm": 2.461190938949585,
      "learning_rate": 7.325278810408922e-06,
      "loss": 0.0188,
      "step": 1940
    },
    {
      "epoch": 0.9951518244450115,
      "grad_norm": 2.0859811305999756,
      "learning_rate": 7.306691449814127e-06,
      "loss": 0.0144,
      "step": 1950
    },
    {
      "epoch": 1.0,
      "grad_norm": 9.421244621276855,
      "learning_rate": 7.288104089219331e-06,
      "loss": 0.0214,
      "step": 1960
    },
    {
      "epoch": 1.0051033426894616,
      "grad_norm": 1.6903966665267944,
      "learning_rate": 7.269516728624536e-06,
      "loss": 0.0052,
      "step": 1970
    },
    {
      "epoch": 1.010206685378923,
      "grad_norm": 1.4454400539398193,
      "learning_rate": 7.25092936802974e-06,
      "loss": 0.0054,
      "step": 1980
    },
    {
      "epoch": 1.0153100280683849,
      "grad_norm": 1.46286141872406,
      "learning_rate": 7.2323420074349444e-06,
      "loss": 0.0036,
      "step": 1990
    },
    {
      "epoch": 1.0204133707578464,
      "grad_norm": 1.4223207235336304,
      "learning_rate": 7.213754646840149e-06,
      "loss": 0.0041,
      "step": 2000
    },
    {
      "epoch": 1.0204133707578464,
      "eval_loss": 0.007851608097553253,
      "eval_runtime": 5932.2159,
      "eval_samples_per_second": 2.642,
      "eval_steps_per_second": 0.33,
      "eval_wer": 0.6722360915468404,
      "step": 2000
    }
  ],
  "logging_steps": 10,
  "max_steps": 5880,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 2000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.846311674167296e+19,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}