{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 2620, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003816793893129771, "grad_norm": 2.615494966506958, "learning_rate": 1.7175572519083972e-06, "loss": 0.162, "step": 10 }, { "epoch": 0.007633587786259542, "grad_norm": 3.7720587253570557, "learning_rate": 3.6259541984732824e-06, "loss": 0.1953, "step": 20 }, { "epoch": 0.011450381679389313, "grad_norm": 3.1540627479553223, "learning_rate": 5.534351145038168e-06, "loss": 0.1651, "step": 30 }, { "epoch": 0.015267175572519083, "grad_norm": 2.391727924346924, "learning_rate": 7.4427480916030536e-06, "loss": 0.0976, "step": 40 }, { "epoch": 0.019083969465648856, "grad_norm": 1.7957884073257446, "learning_rate": 9.351145038167939e-06, "loss": 0.0483, "step": 50 }, { "epoch": 0.022900763358778626, "grad_norm": 2.7055726051330566, "learning_rate": 1.1259541984732823e-05, "loss": 0.0618, "step": 60 }, { "epoch": 0.026717557251908396, "grad_norm": 0.7185584902763367, "learning_rate": 1.316793893129771e-05, "loss": 0.0452, "step": 70 }, { "epoch": 0.030534351145038167, "grad_norm": 0.3574228286743164, "learning_rate": 1.5076335877862596e-05, "loss": 0.0287, "step": 80 }, { "epoch": 0.03435114503816794, "grad_norm": 0.600852906703949, "learning_rate": 1.6984732824427482e-05, "loss": 0.0345, "step": 90 }, { "epoch": 0.03816793893129771, "grad_norm": 1.2814120054244995, "learning_rate": 1.8893129770992367e-05, "loss": 0.0375, "step": 100 }, { "epoch": 0.04198473282442748, "grad_norm": 0.7709506154060364, "learning_rate": 2.0801526717557255e-05, "loss": 0.0177, "step": 110 }, { "epoch": 0.04580152671755725, "grad_norm": 0.749973714351654, "learning_rate": 2.270992366412214e-05, "loss": 0.0219, "step": 120 }, { "epoch": 0.04961832061068702, "grad_norm": 0.14578384160995483, "learning_rate": 2.4618320610687024e-05, "loss": 0.0226, "step": 130 }, { "epoch": 0.05343511450381679, "grad_norm": 1.139318585395813, "learning_rate": 2.652671755725191e-05, "loss": 0.0247, "step": 140 }, { "epoch": 0.05725190839694656, "grad_norm": 1.522550344467163, "learning_rate": 2.8435114503816796e-05, "loss": 0.0219, "step": 150 }, { "epoch": 0.061068702290076333, "grad_norm": 0.010661217384040356, "learning_rate": 3.0343511450381677e-05, "loss": 0.0233, "step": 160 }, { "epoch": 0.0648854961832061, "grad_norm": 1.0627490282058716, "learning_rate": 3.2251908396946565e-05, "loss": 0.0143, "step": 170 }, { "epoch": 0.06870229007633588, "grad_norm": 0.4552942216396332, "learning_rate": 3.416030534351145e-05, "loss": 0.0271, "step": 180 }, { "epoch": 0.07251908396946564, "grad_norm": 0.6815011501312256, "learning_rate": 3.606870229007634e-05, "loss": 0.0185, "step": 190 }, { "epoch": 0.07633587786259542, "grad_norm": 0.698657751083374, "learning_rate": 3.797709923664122e-05, "loss": 0.0162, "step": 200 }, { "epoch": 0.08015267175572519, "grad_norm": 0.5514554977416992, "learning_rate": 3.988549618320611e-05, "loss": 0.0277, "step": 210 }, { "epoch": 0.08396946564885496, "grad_norm": 0.09276178479194641, "learning_rate": 4.1793893129771e-05, "loss": 0.0176, "step": 220 }, { "epoch": 0.08778625954198473, "grad_norm": 1.0245144367218018, "learning_rate": 4.370229007633588e-05, "loss": 0.0219, "step": 230 }, { "epoch": 0.0916030534351145, "grad_norm": 0.5731092095375061, "learning_rate": 4.561068702290077e-05, "loss": 0.0188, "step": 240 }, { "epoch": 0.09541984732824428, "grad_norm": 0.11703433096408844, "learning_rate": 4.751908396946565e-05, "loss": 0.0153, "step": 250 }, { "epoch": 0.09923664122137404, "grad_norm": 0.3222142159938812, "learning_rate": 4.9427480916030536e-05, "loss": 0.0176, "step": 260 }, { "epoch": 0.10305343511450382, "grad_norm": 0.6780900359153748, "learning_rate": 4.9998912785867505e-05, "loss": 0.0143, "step": 270 }, { "epoch": 0.10687022900763359, "grad_norm": 0.8537261486053467, "learning_rate": 4.999358788306519e-05, "loss": 0.0223, "step": 280 }, { "epoch": 0.11068702290076336, "grad_norm": 0.17257721722126007, "learning_rate": 4.998382654320609e-05, "loss": 0.0178, "step": 290 }, { "epoch": 0.11450381679389313, "grad_norm": 0.7881976962089539, "learning_rate": 4.996963049895741e-05, "loss": 0.0146, "step": 300 }, { "epoch": 0.1183206106870229, "grad_norm": 0.6446636915206909, "learning_rate": 4.99510022701597e-05, "loss": 0.0109, "step": 310 }, { "epoch": 0.12213740458015267, "grad_norm": 0.6660375595092773, "learning_rate": 4.992794516337964e-05, "loss": 0.012, "step": 320 }, { "epoch": 0.12595419847328243, "grad_norm": 0.47002533078193665, "learning_rate": 4.9900463271323064e-05, "loss": 0.0133, "step": 330 }, { "epoch": 0.1297709923664122, "grad_norm": 0.9555854201316833, "learning_rate": 4.9868561472108496e-05, "loss": 0.0173, "step": 340 }, { "epoch": 0.13358778625954199, "grad_norm": 0.14546959102153778, "learning_rate": 4.9832245428401316e-05, "loss": 0.0172, "step": 350 }, { "epoch": 0.13740458015267176, "grad_norm": 0.27474039793014526, "learning_rate": 4.979152158640853e-05, "loss": 0.0174, "step": 360 }, { "epoch": 0.14122137404580154, "grad_norm": 0.2749495208263397, "learning_rate": 4.974639717473465e-05, "loss": 0.0172, "step": 370 }, { "epoch": 0.1450381679389313, "grad_norm": 0.19880647957324982, "learning_rate": 4.9696880203098525e-05, "loss": 0.012, "step": 380 }, { "epoch": 0.14885496183206107, "grad_norm": 0.7163119912147522, "learning_rate": 4.964297946091163e-05, "loss": 0.0138, "step": 390 }, { "epoch": 0.15267175572519084, "grad_norm": 0.4733160436153412, "learning_rate": 4.9584704515717884e-05, "loss": 0.0074, "step": 400 }, { "epoch": 0.15648854961832062, "grad_norm": 0.5145673155784607, "learning_rate": 4.952206571149541e-05, "loss": 0.0178, "step": 410 }, { "epoch": 0.16030534351145037, "grad_norm": 0.07867272943258286, "learning_rate": 4.945507416682046e-05, "loss": 0.0132, "step": 420 }, { "epoch": 0.16412213740458015, "grad_norm": 0.023460840806365013, "learning_rate": 4.938374177289378e-05, "loss": 0.0085, "step": 430 }, { "epoch": 0.16793893129770993, "grad_norm": 0.6247947812080383, "learning_rate": 4.930808119142993e-05, "loss": 0.0176, "step": 440 }, { "epoch": 0.1717557251908397, "grad_norm": 0.49582046270370483, "learning_rate": 4.922810585240981e-05, "loss": 0.0188, "step": 450 }, { "epoch": 0.17557251908396945, "grad_norm": 1.1747221946716309, "learning_rate": 4.914382995169673e-05, "loss": 0.0187, "step": 460 }, { "epoch": 0.17938931297709923, "grad_norm": 0.9253813624382019, "learning_rate": 4.905526844851667e-05, "loss": 0.0125, "step": 470 }, { "epoch": 0.183206106870229, "grad_norm": 0.32356423139572144, "learning_rate": 4.8962437062802924e-05, "loss": 0.0145, "step": 480 }, { "epoch": 0.18702290076335878, "grad_norm": 0.16892339289188385, "learning_rate": 4.886535227240579e-05, "loss": 0.0132, "step": 490 }, { "epoch": 0.19083969465648856, "grad_norm": 0.08704151958227158, "learning_rate": 4.8764031310167725e-05, "loss": 0.0145, "step": 500 }, { "epoch": 0.19083969465648856, "eval_loss": 0.03389899432659149, "eval_runtime": 140.2568, "eval_samples_per_second": 56.91, "eval_steps_per_second": 0.299, "step": 500 }, { "epoch": 0.1946564885496183, "grad_norm": 0.021338067948818207, "learning_rate": 4.865849216086438e-05, "loss": 0.0112, "step": 510 }, { "epoch": 0.1984732824427481, "grad_norm": 0.18011535704135895, "learning_rate": 4.854875355801233e-05, "loss": 0.0068, "step": 520 }, { "epoch": 0.20229007633587787, "grad_norm": 0.6645931005477905, "learning_rate": 4.843483498054381e-05, "loss": 0.0182, "step": 530 }, { "epoch": 0.20610687022900764, "grad_norm": 0.1481589674949646, "learning_rate": 4.83167566493491e-05, "loss": 0.0113, "step": 540 }, { "epoch": 0.2099236641221374, "grad_norm": 0.05964776873588562, "learning_rate": 4.819453952368731e-05, "loss": 0.0101, "step": 550 }, { "epoch": 0.21374045801526717, "grad_norm": 0.5340839624404907, "learning_rate": 4.8068205297465986e-05, "loss": 0.0181, "step": 560 }, { "epoch": 0.21755725190839695, "grad_norm": 0.28655287623405457, "learning_rate": 4.7937776395390434e-05, "loss": 0.0073, "step": 570 }, { "epoch": 0.22137404580152673, "grad_norm": 0.08776471763849258, "learning_rate": 4.7803275968983244e-05, "loss": 0.0129, "step": 580 }, { "epoch": 0.22519083969465647, "grad_norm": 0.11037462204694748, "learning_rate": 4.766472789247483e-05, "loss": 0.0088, "step": 590 }, { "epoch": 0.22900763358778625, "grad_norm": 0.21740970015525818, "learning_rate": 4.752215675856566e-05, "loss": 0.0115, "step": 600 }, { "epoch": 0.23282442748091603, "grad_norm": 0.07760237902402878, "learning_rate": 4.737558787406103e-05, "loss": 0.0118, "step": 610 }, { "epoch": 0.2366412213740458, "grad_norm": 0.2091819792985916, "learning_rate": 4.722504725537896e-05, "loss": 0.0096, "step": 620 }, { "epoch": 0.24045801526717558, "grad_norm": 1.0305904150009155, "learning_rate": 4.7070561623932265e-05, "loss": 0.0108, "step": 630 }, { "epoch": 0.24427480916030533, "grad_norm": 0.03920348733663559, "learning_rate": 4.691215840138537e-05, "loss": 0.0116, "step": 640 }, { "epoch": 0.2480916030534351, "grad_norm": 0.02367626503109932, "learning_rate": 4.674986570478695e-05, "loss": 0.0133, "step": 650 }, { "epoch": 0.25190839694656486, "grad_norm": 0.05315301567316055, "learning_rate": 4.6583712341578984e-05, "loss": 0.0087, "step": 660 }, { "epoch": 0.25572519083969464, "grad_norm": 0.10831905901432037, "learning_rate": 4.641372780448341e-05, "loss": 0.0084, "step": 670 }, { "epoch": 0.2595419847328244, "grad_norm": 0.078117236495018, "learning_rate": 4.6239942266267056e-05, "loss": 0.0058, "step": 680 }, { "epoch": 0.2633587786259542, "grad_norm": 0.28782543540000916, "learning_rate": 4.6062386574385886e-05, "loss": 0.0104, "step": 690 }, { "epoch": 0.26717557251908397, "grad_norm": 0.04873083531856537, "learning_rate": 4.5881092245509506e-05, "loss": 0.0118, "step": 700 }, { "epoch": 0.27099236641221375, "grad_norm": 0.26269766688346863, "learning_rate": 4.56960914599268e-05, "loss": 0.0077, "step": 710 }, { "epoch": 0.2748091603053435, "grad_norm": 0.3128824532032013, "learning_rate": 4.55074170558339e-05, "loss": 0.0107, "step": 720 }, { "epoch": 0.2786259541984733, "grad_norm": 1.1486977338790894, "learning_rate": 4.531510252350526e-05, "loss": 0.012, "step": 730 }, { "epoch": 0.2824427480916031, "grad_norm": 0.35734987258911133, "learning_rate": 4.511918199934907e-05, "loss": 0.0121, "step": 740 }, { "epoch": 0.2862595419847328, "grad_norm": 0.2775309681892395, "learning_rate": 4.491969025984789e-05, "loss": 0.0139, "step": 750 }, { "epoch": 0.2900763358778626, "grad_norm": 0.22545866668224335, "learning_rate": 4.471666271538578e-05, "loss": 0.0047, "step": 760 }, { "epoch": 0.29389312977099236, "grad_norm": 0.3366382420063019, "learning_rate": 4.451013540396281e-05, "loss": 0.008, "step": 770 }, { "epoch": 0.29770992366412213, "grad_norm": 0.645935595035553, "learning_rate": 4.430014498479819e-05, "loss": 0.0197, "step": 780 }, { "epoch": 0.3015267175572519, "grad_norm": 0.022258713841438293, "learning_rate": 4.408672873182322e-05, "loss": 0.0138, "step": 790 }, { "epoch": 0.3053435114503817, "grad_norm": 0.08058780431747437, "learning_rate": 4.386992452706499e-05, "loss": 0.01, "step": 800 }, { "epoch": 0.30916030534351147, "grad_norm": 0.5103694796562195, "learning_rate": 4.36497708539222e-05, "loss": 0.0127, "step": 810 }, { "epoch": 0.31297709923664124, "grad_norm": 0.2476615309715271, "learning_rate": 4.342630679033432e-05, "loss": 0.0129, "step": 820 }, { "epoch": 0.31679389312977096, "grad_norm": 0.2634897530078888, "learning_rate": 4.319957200184504e-05, "loss": 0.0052, "step": 830 }, { "epoch": 0.32061068702290074, "grad_norm": 0.31470030546188354, "learning_rate": 4.296960673456159e-05, "loss": 0.0165, "step": 840 }, { "epoch": 0.3244274809160305, "grad_norm": 0.19848482310771942, "learning_rate": 4.273645180801088e-05, "loss": 0.005, "step": 850 }, { "epoch": 0.3282442748091603, "grad_norm": 0.08697102218866348, "learning_rate": 4.2500148607893965e-05, "loss": 0.0082, "step": 860 }, { "epoch": 0.3320610687022901, "grad_norm": 0.38102781772613525, "learning_rate": 4.226073907873992e-05, "loss": 0.0126, "step": 870 }, { "epoch": 0.33587786259541985, "grad_norm": 0.14514830708503723, "learning_rate": 4.201826571646056e-05, "loss": 0.0058, "step": 880 }, { "epoch": 0.33969465648854963, "grad_norm": 0.43940168619155884, "learning_rate": 4.177277156080731e-05, "loss": 0.0078, "step": 890 }, { "epoch": 0.3435114503816794, "grad_norm": 0.05170935019850731, "learning_rate": 4.152430018773153e-05, "loss": 0.0082, "step": 900 }, { "epoch": 0.3473282442748092, "grad_norm": 0.01292837131768465, "learning_rate": 4.127289570164958e-05, "loss": 0.0156, "step": 910 }, { "epoch": 0.3511450381679389, "grad_norm": 0.21468913555145264, "learning_rate": 4.1018602727614264e-05, "loss": 0.0075, "step": 920 }, { "epoch": 0.3549618320610687, "grad_norm": 0.7786637544631958, "learning_rate": 4.0761466403393633e-05, "loss": 0.0167, "step": 930 }, { "epoch": 0.35877862595419846, "grad_norm": 0.06286737322807312, "learning_rate": 4.0501532371458993e-05, "loss": 0.0107, "step": 940 }, { "epoch": 0.36259541984732824, "grad_norm": 0.19294968247413635, "learning_rate": 4.0238846770883165e-05, "loss": 0.0103, "step": 950 }, { "epoch": 0.366412213740458, "grad_norm": 0.02884821780025959, "learning_rate": 3.9973456229150695e-05, "loss": 0.0133, "step": 960 }, { "epoch": 0.3702290076335878, "grad_norm": 0.36911624670028687, "learning_rate": 3.9705407853881324e-05, "loss": 0.0072, "step": 970 }, { "epoch": 0.37404580152671757, "grad_norm": 0.21048544347286224, "learning_rate": 3.9434749224468235e-05, "loss": 0.0047, "step": 980 }, { "epoch": 0.37786259541984735, "grad_norm": 0.09334168583154678, "learning_rate": 3.916152838363258e-05, "loss": 0.0125, "step": 990 }, { "epoch": 0.3816793893129771, "grad_norm": 0.49426600337028503, "learning_rate": 3.8885793828895756e-05, "loss": 0.0109, "step": 1000 }, { "epoch": 0.3816793893129771, "eval_loss": 0.032893069088459015, "eval_runtime": 140.214, "eval_samples_per_second": 56.927, "eval_steps_per_second": 0.3, "step": 1000 }, { "epoch": 0.38549618320610685, "grad_norm": 0.19869565963745117, "learning_rate": 3.8607594503970925e-05, "loss": 0.0144, "step": 1010 }, { "epoch": 0.3893129770992366, "grad_norm": 0.19766545295715332, "learning_rate": 3.832697979007539e-05, "loss": 0.0067, "step": 1020 }, { "epoch": 0.3931297709923664, "grad_norm": 0.4004887044429779, "learning_rate": 3.804399949716526e-05, "loss": 0.0047, "step": 1030 }, { "epoch": 0.3969465648854962, "grad_norm": 0.31220734119415283, "learning_rate": 3.775870385509402e-05, "loss": 0.0163, "step": 1040 }, { "epoch": 0.40076335877862596, "grad_norm": 0.12848161160945892, "learning_rate": 3.7471143504696625e-05, "loss": 0.0038, "step": 1050 }, { "epoch": 0.40458015267175573, "grad_norm": 0.39810845255851746, "learning_rate": 3.718136948880056e-05, "loss": 0.0077, "step": 1060 }, { "epoch": 0.4083969465648855, "grad_norm": 0.47316208481788635, "learning_rate": 3.6889433243165605e-05, "loss": 0.0141, "step": 1070 }, { "epoch": 0.4122137404580153, "grad_norm": 0.028033487498760223, "learning_rate": 3.6595386587353795e-05, "loss": 0.0049, "step": 1080 }, { "epoch": 0.41603053435114506, "grad_norm": 0.036221910268068314, "learning_rate": 3.6299281715531375e-05, "loss": 0.0083, "step": 1090 }, { "epoch": 0.4198473282442748, "grad_norm": 0.013016347773373127, "learning_rate": 3.600117118720408e-05, "loss": 0.0085, "step": 1100 }, { "epoch": 0.42366412213740456, "grad_norm": 0.07040252536535263, "learning_rate": 3.57011079178877e-05, "loss": 0.0102, "step": 1110 }, { "epoch": 0.42748091603053434, "grad_norm": 0.25287893414497375, "learning_rate": 3.53991451697155e-05, "loss": 0.0092, "step": 1120 }, { "epoch": 0.4312977099236641, "grad_norm": 0.08920171856880188, "learning_rate": 3.509533654198388e-05, "loss": 0.0156, "step": 1130 }, { "epoch": 0.4351145038167939, "grad_norm": 0.36695143580436707, "learning_rate": 3.478973596163843e-05, "loss": 0.0109, "step": 1140 }, { "epoch": 0.4389312977099237, "grad_norm": 0.022651812061667442, "learning_rate": 3.448239767370177e-05, "loss": 0.0057, "step": 1150 }, { "epoch": 0.44274809160305345, "grad_norm": 0.3203389644622803, "learning_rate": 3.4173376231644797e-05, "loss": 0.0075, "step": 1160 }, { "epoch": 0.44656488549618323, "grad_norm": 0.6244869232177734, "learning_rate": 3.386272648770333e-05, "loss": 0.0119, "step": 1170 }, { "epoch": 0.45038167938931295, "grad_norm": 0.16917894780635834, "learning_rate": 3.355050358314172e-05, "loss": 0.0065, "step": 1180 }, { "epoch": 0.4541984732824427, "grad_norm": 0.10675910860300064, "learning_rate": 3.323676293846501e-05, "loss": 0.0073, "step": 1190 }, { "epoch": 0.4580152671755725, "grad_norm": 0.8430480360984802, "learning_rate": 3.2921560243581675e-05, "loss": 0.0121, "step": 1200 }, { "epoch": 0.4618320610687023, "grad_norm": 0.1763671189546585, "learning_rate": 3.260495144791856e-05, "loss": 0.0067, "step": 1210 }, { "epoch": 0.46564885496183206, "grad_norm": 0.4134056568145752, "learning_rate": 3.2286992750489585e-05, "loss": 0.0115, "step": 1220 }, { "epoch": 0.46946564885496184, "grad_norm": 0.02143966406583786, "learning_rate": 3.1967740589920296e-05, "loss": 0.0075, "step": 1230 }, { "epoch": 0.4732824427480916, "grad_norm": 0.6696872115135193, "learning_rate": 3.1647251634429856e-05, "loss": 0.0089, "step": 1240 }, { "epoch": 0.4770992366412214, "grad_norm": 0.06671352684497833, "learning_rate": 3.1325582771772235e-05, "loss": 0.0147, "step": 1250 }, { "epoch": 0.48091603053435117, "grad_norm": 0.7138830423355103, "learning_rate": 3.100279109913848e-05, "loss": 0.0121, "step": 1260 }, { "epoch": 0.4847328244274809, "grad_norm": 0.15019002556800842, "learning_rate": 3.067893391302179e-05, "loss": 0.006, "step": 1270 }, { "epoch": 0.48854961832061067, "grad_norm": 0.29725247621536255, "learning_rate": 3.035406869904721e-05, "loss": 0.0123, "step": 1280 }, { "epoch": 0.49236641221374045, "grad_norm": 0.2249823957681656, "learning_rate": 3.0028253121767762e-05, "loss": 0.0092, "step": 1290 }, { "epoch": 0.4961832061068702, "grad_norm": 0.1760585904121399, "learning_rate": 2.970154501442881e-05, "loss": 0.0189, "step": 1300 }, { "epoch": 0.5, "grad_norm": 0.32748275995254517, "learning_rate": 2.9374002368702514e-05, "loss": 0.0106, "step": 1310 }, { "epoch": 0.5038167938931297, "grad_norm": 0.5537404417991638, "learning_rate": 2.904568332439408e-05, "loss": 0.0073, "step": 1320 }, { "epoch": 0.5076335877862596, "grad_norm": 0.024763356894254684, "learning_rate": 2.8716646159121795e-05, "loss": 0.0039, "step": 1330 }, { "epoch": 0.5114503816793893, "grad_norm": 0.3292398452758789, "learning_rate": 2.8386949277972606e-05, "loss": 0.0128, "step": 1340 }, { "epoch": 0.5152671755725191, "grad_norm": 0.28704291582107544, "learning_rate": 2.805665120313501e-05, "loss": 0.0139, "step": 1350 }, { "epoch": 0.5190839694656488, "grad_norm": 0.3549763560295105, "learning_rate": 2.7725810563511157e-05, "loss": 0.007, "step": 1360 }, { "epoch": 0.5229007633587787, "grad_norm": 0.10086268186569214, "learning_rate": 2.7394486084310128e-05, "loss": 0.0063, "step": 1370 }, { "epoch": 0.5267175572519084, "grad_norm": 0.30678582191467285, "learning_rate": 2.7062736576623943e-05, "loss": 0.0092, "step": 1380 }, { "epoch": 0.5305343511450382, "grad_norm": 0.20195415616035461, "learning_rate": 2.6730620926988476e-05, "loss": 0.0094, "step": 1390 }, { "epoch": 0.5343511450381679, "grad_norm": 0.7400510907173157, "learning_rate": 2.6398198086930915e-05, "loss": 0.0183, "step": 1400 }, { "epoch": 0.5381679389312977, "grad_norm": 0.10865656286478043, "learning_rate": 2.6065527062505634e-05, "loss": 0.0109, "step": 1410 }, { "epoch": 0.5419847328244275, "grad_norm": 0.1100832000374794, "learning_rate": 2.573266690382051e-05, "loss": 0.0086, "step": 1420 }, { "epoch": 0.5458015267175572, "grad_norm": 0.07376679033041, "learning_rate": 2.5399676694555312e-05, "loss": 0.0213, "step": 1430 }, { "epoch": 0.549618320610687, "grad_norm": 0.30974867939949036, "learning_rate": 2.506661554147417e-05, "loss": 0.0091, "step": 1440 }, { "epoch": 0.5534351145038168, "grad_norm": 0.4819875955581665, "learning_rate": 2.473354256393397e-05, "loss": 0.0079, "step": 1450 }, { "epoch": 0.5572519083969466, "grad_norm": 0.08225909620523453, "learning_rate": 2.440051688339046e-05, "loss": 0.0147, "step": 1460 }, { "epoch": 0.5610687022900763, "grad_norm": 0.009925187565386295, "learning_rate": 2.4067597612904037e-05, "loss": 0.0135, "step": 1470 }, { "epoch": 0.5648854961832062, "grad_norm": 0.36840465664863586, "learning_rate": 2.3734843846646988e-05, "loss": 0.0128, "step": 1480 }, { "epoch": 0.5687022900763359, "grad_norm": 0.0662972629070282, "learning_rate": 2.3402314649414116e-05, "loss": 0.0114, "step": 1490 }, { "epoch": 0.5725190839694656, "grad_norm": 0.28558099269866943, "learning_rate": 2.307006904613851e-05, "loss": 0.0113, "step": 1500 }, { "epoch": 0.5725190839694656, "eval_loss": 0.030398810282349586, "eval_runtime": 140.1617, "eval_samples_per_second": 56.949, "eval_steps_per_second": 0.3, "step": 1500 }, { "epoch": 0.5763358778625954, "grad_norm": 0.39870354533195496, "learning_rate": 2.2738166011414527e-05, "loss": 0.0094, "step": 1510 }, { "epoch": 0.5801526717557252, "grad_norm": 0.3758524954319, "learning_rate": 2.2406664459029545e-05, "loss": 0.0089, "step": 1520 }, { "epoch": 0.583969465648855, "grad_norm": 0.028594927862286568, "learning_rate": 2.207562323150662e-05, "loss": 0.0074, "step": 1530 }, { "epoch": 0.5877862595419847, "grad_norm": 0.23170693218708038, "learning_rate": 2.174510108965977e-05, "loss": 0.0054, "step": 1540 }, { "epoch": 0.5916030534351145, "grad_norm": 0.5391947031021118, "learning_rate": 2.1415156702163734e-05, "loss": 0.0079, "step": 1550 }, { "epoch": 0.5954198473282443, "grad_norm": 0.12324138730764389, "learning_rate": 2.1085848635140122e-05, "loss": 0.0071, "step": 1560 }, { "epoch": 0.5992366412213741, "grad_norm": 0.03993413597345352, "learning_rate": 2.0757235341761732e-05, "loss": 0.008, "step": 1570 }, { "epoch": 0.6030534351145038, "grad_norm": 0.32654133439064026, "learning_rate": 2.0429375151876978e-05, "loss": 0.0094, "step": 1580 }, { "epoch": 0.6068702290076335, "grad_norm": 0.012670164927840233, "learning_rate": 2.0102326261656114e-05, "loss": 0.0086, "step": 1590 }, { "epoch": 0.6106870229007634, "grad_norm": 0.22735942900180817, "learning_rate": 1.9776146723261265e-05, "loss": 0.0146, "step": 1600 }, { "epoch": 0.6145038167938931, "grad_norm": 0.5669217109680176, "learning_rate": 1.9450894434541994e-05, "loss": 0.0181, "step": 1610 }, { "epoch": 0.6183206106870229, "grad_norm": 0.09971689432859421, "learning_rate": 1.9126627128758256e-05, "loss": 0.0034, "step": 1620 }, { "epoch": 0.6221374045801527, "grad_norm": 0.11938827484846115, "learning_rate": 1.8803402364332573e-05, "loss": 0.0128, "step": 1630 }, { "epoch": 0.6259541984732825, "grad_norm": 0.47636914253234863, "learning_rate": 1.8481277514633243e-05, "loss": 0.0115, "step": 1640 }, { "epoch": 0.6297709923664122, "grad_norm": 0.3907536566257477, "learning_rate": 1.8160309757790384e-05, "loss": 0.0076, "step": 1650 }, { "epoch": 0.6335877862595419, "grad_norm": 0.2356630116701126, "learning_rate": 1.784055606654664e-05, "loss": 0.0131, "step": 1660 }, { "epoch": 0.6374045801526718, "grad_norm": 0.09361109882593155, "learning_rate": 1.752207319814438e-05, "loss": 0.007, "step": 1670 }, { "epoch": 0.6412213740458015, "grad_norm": 0.3632112145423889, "learning_rate": 1.7204917684251086e-05, "loss": 0.0096, "step": 1680 }, { "epoch": 0.6450381679389313, "grad_norm": 0.44978809356689453, "learning_rate": 1.6889145820924797e-05, "loss": 0.0136, "step": 1690 }, { "epoch": 0.648854961832061, "grad_norm": 0.4278349280357361, "learning_rate": 1.6574813658621464e-05, "loss": 0.0061, "step": 1700 }, { "epoch": 0.6526717557251909, "grad_norm": 0.3191720247268677, "learning_rate": 1.6261976992245727e-05, "loss": 0.0255, "step": 1710 }, { "epoch": 0.6564885496183206, "grad_norm": 0.030014334246516228, "learning_rate": 1.5950691351247218e-05, "loss": 0.0103, "step": 1720 }, { "epoch": 0.6603053435114504, "grad_norm": 0.3003236651420593, "learning_rate": 1.5641011989763903e-05, "loss": 0.0086, "step": 1730 }, { "epoch": 0.6641221374045801, "grad_norm": 0.05489334836602211, "learning_rate": 1.5332993876814285e-05, "loss": 0.0055, "step": 1740 }, { "epoch": 0.6679389312977099, "grad_norm": 0.27562958002090454, "learning_rate": 1.5026691686540262e-05, "loss": 0.0066, "step": 1750 }, { "epoch": 0.6717557251908397, "grad_norm": 0.4876582622528076, "learning_rate": 1.472215978850229e-05, "loss": 0.0088, "step": 1760 }, { "epoch": 0.6755725190839694, "grad_norm": 0.08189146220684052, "learning_rate": 1.4419452238028647e-05, "loss": 0.0087, "step": 1770 }, { "epoch": 0.6793893129770993, "grad_norm": 0.3738853633403778, "learning_rate": 1.4118622766620387e-05, "loss": 0.0091, "step": 1780 }, { "epoch": 0.683206106870229, "grad_norm": 0.2992166578769684, "learning_rate": 1.3819724772413929e-05, "loss": 0.0206, "step": 1790 }, { "epoch": 0.6870229007633588, "grad_norm": 0.6133850812911987, "learning_rate": 1.3522811310702634e-05, "loss": 0.0112, "step": 1800 }, { "epoch": 0.6908396946564885, "grad_norm": 0.25099530816078186, "learning_rate": 1.3227935084519385e-05, "loss": 0.0092, "step": 1810 }, { "epoch": 0.6946564885496184, "grad_norm": 0.4365796744823456, "learning_rate": 1.2935148435281643e-05, "loss": 0.0194, "step": 1820 }, { "epoch": 0.6984732824427481, "grad_norm": 0.5801776051521301, "learning_rate": 1.2644503333500673e-05, "loss": 0.0078, "step": 1830 }, { "epoch": 0.7022900763358778, "grad_norm": 0.0873272716999054, "learning_rate": 1.2356051369556593e-05, "loss": 0.0059, "step": 1840 }, { "epoch": 0.7061068702290076, "grad_norm": 0.03350086137652397, "learning_rate": 1.2069843744540999e-05, "loss": 0.0063, "step": 1850 }, { "epoch": 0.7099236641221374, "grad_norm": 0.33509764075279236, "learning_rate": 1.1785931261168537e-05, "loss": 0.0056, "step": 1860 }, { "epoch": 0.7137404580152672, "grad_norm": 0.12034814059734344, "learning_rate": 1.1504364314759311e-05, "loss": 0.0073, "step": 1870 }, { "epoch": 0.7175572519083969, "grad_norm": 0.5151724815368652, "learning_rate": 1.1225192884293578e-05, "loss": 0.0099, "step": 1880 }, { "epoch": 0.7213740458015268, "grad_norm": 0.32692572474479675, "learning_rate": 1.0948466523540296e-05, "loss": 0.008, "step": 1890 }, { "epoch": 0.7251908396946565, "grad_norm": 0.005217437632381916, "learning_rate": 1.0674234352261175e-05, "loss": 0.0071, "step": 1900 }, { "epoch": 0.7290076335877863, "grad_norm": 0.2179807722568512, "learning_rate": 1.0402545047491818e-05, "loss": 0.0166, "step": 1910 }, { "epoch": 0.732824427480916, "grad_norm": 0.6623926162719727, "learning_rate": 1.0133446834901344e-05, "loss": 0.0081, "step": 1920 }, { "epoch": 0.7366412213740458, "grad_norm": 0.27897024154663086, "learning_rate": 9.866987480232218e-06, "loss": 0.0109, "step": 1930 }, { "epoch": 0.7404580152671756, "grad_norm": 0.026624036952853203, "learning_rate": 9.603214280821707e-06, "loss": 0.0168, "step": 1940 }, { "epoch": 0.7442748091603053, "grad_norm": 0.505171537399292, "learning_rate": 9.342174057206402e-06, "loss": 0.0055, "step": 1950 }, { "epoch": 0.7480916030534351, "grad_norm": 0.5251939296722412, "learning_rate": 9.083913144811499e-06, "loss": 0.0067, "step": 1960 }, { "epoch": 0.7519083969465649, "grad_norm": 0.2802061438560486, "learning_rate": 8.828477385726055e-06, "loss": 0.0073, "step": 1970 }, { "epoch": 0.7557251908396947, "grad_norm": 0.4821580648422241, "learning_rate": 8.575912120565956e-06, "loss": 0.0077, "step": 1980 }, { "epoch": 0.7595419847328244, "grad_norm": 0.16226717829704285, "learning_rate": 8.326262180425746e-06, "loss": 0.0063, "step": 1990 }, { "epoch": 0.7633587786259542, "grad_norm": 0.2521667182445526, "learning_rate": 8.079571878921002e-06, "loss": 0.0058, "step": 2000 }, { "epoch": 0.7633587786259542, "eval_loss": 0.028967689722776413, "eval_runtime": 140.2457, "eval_samples_per_second": 56.914, "eval_steps_per_second": 0.299, "step": 2000 }, { "epoch": 0.767175572519084, "grad_norm": 0.40514904260635376, "learning_rate": 7.83588500432256e-06, "loss": 0.0112, "step": 2010 }, { "epoch": 0.7709923664122137, "grad_norm": 1.045690894126892, "learning_rate": 7.595244811783944e-06, "loss": 0.0058, "step": 2020 }, { "epoch": 0.7748091603053435, "grad_norm": 0.5543403625488281, "learning_rate": 7.357694015663416e-06, "loss": 0.0102, "step": 2030 }, { "epoch": 0.7786259541984732, "grad_norm": 0.14902685582637787, "learning_rate": 7.123274781942107e-06, "loss": 0.0041, "step": 2040 }, { "epoch": 0.7824427480916031, "grad_norm": 0.0032010909635573626, "learning_rate": 6.892028720739388e-06, "loss": 0.0159, "step": 2050 }, { "epoch": 0.7862595419847328, "grad_norm": 0.05139963701367378, "learning_rate": 6.663996878926943e-06, "loss": 0.0064, "step": 2060 }, { "epoch": 0.7900763358778626, "grad_norm": 0.3358862102031708, "learning_rate": 6.439219732842869e-06, "loss": 0.0098, "step": 2070 }, { "epoch": 0.7938931297709924, "grad_norm": 0.15072062611579895, "learning_rate": 6.217737181106978e-06, "loss": 0.0065, "step": 2080 }, { "epoch": 0.7977099236641222, "grad_norm": 0.002467066515237093, "learning_rate": 5.999588537538639e-06, "loss": 0.0037, "step": 2090 }, { "epoch": 0.8015267175572519, "grad_norm": 0.034411050379276276, "learning_rate": 5.784812524178529e-06, "loss": 0.0074, "step": 2100 }, { "epoch": 0.8053435114503816, "grad_norm": 0.08725389838218689, "learning_rate": 5.573447264415288e-06, "loss": 0.0185, "step": 2110 }, { "epoch": 0.8091603053435115, "grad_norm": 0.2480231672525406, "learning_rate": 5.365530276218531e-06, "loss": 0.0071, "step": 2120 }, { "epoch": 0.8129770992366412, "grad_norm": 0.5087897181510925, "learning_rate": 5.161098465479308e-06, "loss": 0.0243, "step": 2130 }, { "epoch": 0.816793893129771, "grad_norm": 0.3457648456096649, "learning_rate": 4.96018811945918e-06, "loss": 0.0073, "step": 2140 }, { "epoch": 0.8206106870229007, "grad_norm": 0.39516162872314453, "learning_rate": 4.762834900349117e-06, "loss": 0.0122, "step": 2150 }, { "epoch": 0.8244274809160306, "grad_norm": 0.3586041331291199, "learning_rate": 4.56907383893935e-06, "loss": 0.0073, "step": 2160 }, { "epoch": 0.8282442748091603, "grad_norm": 0.06679055839776993, "learning_rate": 4.378939328401338e-06, "loss": 0.0052, "step": 2170 }, { "epoch": 0.8320610687022901, "grad_norm": 0.40074533224105835, "learning_rate": 4.192465118182814e-06, "loss": 0.0064, "step": 2180 }, { "epoch": 0.8358778625954199, "grad_norm": 0.13426686823368073, "learning_rate": 4.0096843080172195e-06, "loss": 0.0139, "step": 2190 }, { "epoch": 0.8396946564885496, "grad_norm": 0.08818082511425018, "learning_rate": 3.8306293420483505e-06, "loss": 0.0104, "step": 2200 }, { "epoch": 0.8435114503816794, "grad_norm": 0.676325261592865, "learning_rate": 3.6553320030714626e-06, "loss": 0.0065, "step": 2210 }, { "epoch": 0.8473282442748091, "grad_norm": 0.31104177236557007, "learning_rate": 3.483823406891701e-06, "loss": 0.0074, "step": 2220 }, { "epoch": 0.851145038167939, "grad_norm": 0.358847975730896, "learning_rate": 3.3161339968009768e-06, "loss": 0.0103, "step": 2230 }, { "epoch": 0.8549618320610687, "grad_norm": 0.4074176251888275, "learning_rate": 3.1522935381741643e-06, "loss": 0.0106, "step": 2240 }, { "epoch": 0.8587786259541985, "grad_norm": 0.22954821586608887, "learning_rate": 2.9923311131856783e-06, "loss": 0.0087, "step": 2250 }, { "epoch": 0.8625954198473282, "grad_norm": 0.33302852511405945, "learning_rate": 2.836275115647294e-06, "loss": 0.013, "step": 2260 }, { "epoch": 0.8664122137404581, "grad_norm": 0.09198283404111862, "learning_rate": 2.6841532459681627e-06, "loss": 0.0067, "step": 2270 }, { "epoch": 0.8702290076335878, "grad_norm": 0.03521020710468292, "learning_rate": 2.5359925062378884e-06, "loss": 0.0092, "step": 2280 }, { "epoch": 0.8740458015267175, "grad_norm": 0.40850088000297546, "learning_rate": 2.3918191954336144e-06, "loss": 0.0064, "step": 2290 }, { "epoch": 0.8778625954198473, "grad_norm": 0.002618383150547743, "learning_rate": 2.2516589047518273e-06, "loss": 0.0069, "step": 2300 }, { "epoch": 0.8816793893129771, "grad_norm": 0.28037503361701965, "learning_rate": 2.1155365130658717e-06, "loss": 0.0063, "step": 2310 }, { "epoch": 0.8854961832061069, "grad_norm": 0.06269484013319016, "learning_rate": 1.9834761825098773e-06, "loss": 0.0043, "step": 2320 }, { "epoch": 0.8893129770992366, "grad_norm": 0.6617615222930908, "learning_rate": 1.8555013541899036e-06, "loss": 0.0148, "step": 2330 }, { "epoch": 0.8931297709923665, "grad_norm": 0.06300811469554901, "learning_rate": 1.7316347440230557e-06, "loss": 0.0116, "step": 2340 }, { "epoch": 0.8969465648854962, "grad_norm": 0.15155254304409027, "learning_rate": 1.61189833870537e-06, "loss": 0.0035, "step": 2350 }, { "epoch": 0.9007633587786259, "grad_norm": 0.7464874386787415, "learning_rate": 1.496313391809076e-06, "loss": 0.0131, "step": 2360 }, { "epoch": 0.9045801526717557, "grad_norm": 0.7433227896690369, "learning_rate": 1.3849004200100546e-06, "loss": 0.0084, "step": 2370 }, { "epoch": 0.9083969465648855, "grad_norm": 0.4609885513782501, "learning_rate": 1.2776791994460553e-06, "loss": 0.0091, "step": 2380 }, { "epoch": 0.9122137404580153, "grad_norm": 0.03363984078168869, "learning_rate": 1.1746687622063573e-06, "loss": 0.0109, "step": 2390 }, { "epoch": 0.916030534351145, "grad_norm": 0.34646764397621155, "learning_rate": 1.0758873929535424e-06, "loss": 0.0058, "step": 2400 }, { "epoch": 0.9198473282442748, "grad_norm": 0.28582972288131714, "learning_rate": 9.813526256778894e-07, "loss": 0.0054, "step": 2410 }, { "epoch": 0.9236641221374046, "grad_norm": 0.5931139588356018, "learning_rate": 8.910812405850566e-07, "loss": 0.0118, "step": 2420 }, { "epoch": 0.9274809160305344, "grad_norm": 0.44896554946899414, "learning_rate": 8.050892611175253e-07, "loss": 0.0119, "step": 2430 }, { "epoch": 0.9312977099236641, "grad_norm": 0.19585078954696655, "learning_rate": 7.233919511104082e-07, "loss": 0.0132, "step": 2440 }, { "epoch": 0.9351145038167938, "grad_norm": 0.24196402728557587, "learning_rate": 6.460038120820688e-07, "loss": 0.0051, "step": 2450 }, { "epoch": 0.9389312977099237, "grad_norm": 0.2216787487268448, "learning_rate": 5.729385806600484e-07, "loss": 0.0039, "step": 2460 }, { "epoch": 0.9427480916030534, "grad_norm": 0.008937466889619827, "learning_rate": 5.042092261427822e-07, "loss": 0.0151, "step": 2470 }, { "epoch": 0.9465648854961832, "grad_norm": 0.2098822146654129, "learning_rate": 4.3982794819751316e-07, "loss": 0.0045, "step": 2480 }, { "epoch": 0.950381679389313, "grad_norm": 0.09708132594823837, "learning_rate": 3.7980617469479953e-07, "loss": 0.0059, "step": 2490 }, { "epoch": 0.9541984732824428, "grad_norm": 0.10786189138889313, "learning_rate": 3.2415455968004826e-07, "loss": 0.0032, "step": 2500 }, { "epoch": 0.9541984732824428, "eval_loss": 0.027030499652028084, "eval_runtime": 140.2791, "eval_samples_per_second": 56.901, "eval_steps_per_second": 0.299, "step": 2500 }, { "epoch": 0.9580152671755725, "grad_norm": 0.5997936725616455, "learning_rate": 2.7288298148238913e-07, "loss": 0.0164, "step": 2510 }, { "epoch": 0.9618320610687023, "grad_norm": 1.1892848014831543, "learning_rate": 2.2600054096122703e-07, "loss": 0.0095, "step": 2520 }, { "epoch": 0.9656488549618321, "grad_norm": 0.014309865422546864, "learning_rate": 1.8351555989082892e-07, "loss": 0.006, "step": 2530 }, { "epoch": 0.9694656488549618, "grad_norm": 0.23357251286506653, "learning_rate": 1.4543557948317744e-07, "loss": 0.0084, "step": 2540 }, { "epoch": 0.9732824427480916, "grad_norm": 0.4700785279273987, "learning_rate": 1.1176735904937508e-07, "loss": 0.0079, "step": 2550 }, { "epoch": 0.9770992366412213, "grad_norm": 0.005513277370482683, "learning_rate": 8.251687479986503e-08, "loss": 0.0176, "step": 2560 }, { "epoch": 0.9809160305343512, "grad_norm": 0.32526788115501404, "learning_rate": 5.768931878361583e-08, "loss": 0.0091, "step": 2570 }, { "epoch": 0.9847328244274809, "grad_norm": 0.29356321692466736, "learning_rate": 3.728909796652525e-08, "loss": 0.0022, "step": 2580 }, { "epoch": 0.9885496183206107, "grad_norm": 0.03470044955611229, "learning_rate": 2.1319833449179317e-08, "loss": 0.0055, "step": 2590 }, { "epoch": 0.9923664122137404, "grad_norm": 0.004818221088498831, "learning_rate": 9.78435982407766e-09, "loss": 0.0093, "step": 2600 }, { "epoch": 0.9961832061068703, "grad_norm": 0.03156821429729462, "learning_rate": 2.6847246725053565e-09, "loss": 0.0034, "step": 2610 }, { "epoch": 1.0, "grad_norm": 0.0063484120182693005, "learning_rate": 2.2188201059814007e-11, "loss": 0.0075, "step": 2620 } ], "logging_steps": 10, "max_steps": 2620, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.3731098682179103e+18, "train_batch_size": 192, "trial_name": null, "trial_params": null }