{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 2620,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.003816793893129771,
      "grad_norm": 2.615494966506958,
      "learning_rate": 1.7175572519083972e-06,
      "loss": 0.162,
      "step": 10
    },
    {
      "epoch": 0.007633587786259542,
      "grad_norm": 3.7720587253570557,
      "learning_rate": 3.6259541984732824e-06,
      "loss": 0.1953,
      "step": 20
    },
    {
      "epoch": 0.011450381679389313,
      "grad_norm": 3.1540627479553223,
      "learning_rate": 5.534351145038168e-06,
      "loss": 0.1651,
      "step": 30
    },
    {
      "epoch": 0.015267175572519083,
      "grad_norm": 2.391727924346924,
      "learning_rate": 7.4427480916030536e-06,
      "loss": 0.0976,
      "step": 40
    },
    {
      "epoch": 0.019083969465648856,
      "grad_norm": 1.7957884073257446,
      "learning_rate": 9.351145038167939e-06,
      "loss": 0.0483,
      "step": 50
    },
    {
      "epoch": 0.022900763358778626,
      "grad_norm": 2.7055726051330566,
      "learning_rate": 1.1259541984732823e-05,
      "loss": 0.0618,
      "step": 60
    },
    {
      "epoch": 0.026717557251908396,
      "grad_norm": 0.7185584902763367,
      "learning_rate": 1.316793893129771e-05,
      "loss": 0.0452,
      "step": 70
    },
    {
      "epoch": 0.030534351145038167,
      "grad_norm": 0.3574228286743164,
      "learning_rate": 1.5076335877862596e-05,
      "loss": 0.0287,
      "step": 80
    },
    {
      "epoch": 0.03435114503816794,
      "grad_norm": 0.600852906703949,
      "learning_rate": 1.6984732824427482e-05,
      "loss": 0.0345,
      "step": 90
    },
    {
      "epoch": 0.03816793893129771,
      "grad_norm": 1.2814120054244995,
      "learning_rate": 1.8893129770992367e-05,
      "loss": 0.0375,
      "step": 100
    },
    {
      "epoch": 0.04198473282442748,
      "grad_norm": 0.7709506154060364,
      "learning_rate": 2.0801526717557255e-05,
      "loss": 0.0177,
      "step": 110
    },
    {
      "epoch": 0.04580152671755725,
      "grad_norm": 0.749973714351654,
      "learning_rate": 2.270992366412214e-05,
      "loss": 0.0219,
      "step": 120
    },
    {
      "epoch": 0.04961832061068702,
      "grad_norm": 0.14578384160995483,
      "learning_rate": 2.4618320610687024e-05,
      "loss": 0.0226,
      "step": 130
    },
    {
      "epoch": 0.05343511450381679,
      "grad_norm": 1.139318585395813,
      "learning_rate": 2.652671755725191e-05,
      "loss": 0.0247,
      "step": 140
    },
    {
      "epoch": 0.05725190839694656,
      "grad_norm": 1.522550344467163,
      "learning_rate": 2.8435114503816796e-05,
      "loss": 0.0219,
      "step": 150
    },
    {
      "epoch": 0.061068702290076333,
      "grad_norm": 0.010661217384040356,
      "learning_rate": 3.0343511450381677e-05,
      "loss": 0.0233,
      "step": 160
    },
    {
      "epoch": 0.0648854961832061,
      "grad_norm": 1.0627490282058716,
      "learning_rate": 3.2251908396946565e-05,
      "loss": 0.0143,
      "step": 170
    },
    {
      "epoch": 0.06870229007633588,
      "grad_norm": 0.4552942216396332,
      "learning_rate": 3.416030534351145e-05,
      "loss": 0.0271,
      "step": 180
    },
    {
      "epoch": 0.07251908396946564,
      "grad_norm": 0.6815011501312256,
      "learning_rate": 3.606870229007634e-05,
      "loss": 0.0185,
      "step": 190
    },
    {
      "epoch": 0.07633587786259542,
      "grad_norm": 0.698657751083374,
      "learning_rate": 3.797709923664122e-05,
      "loss": 0.0162,
      "step": 200
    },
    {
      "epoch": 0.08015267175572519,
      "grad_norm": 0.5514554977416992,
      "learning_rate": 3.988549618320611e-05,
      "loss": 0.0277,
      "step": 210
    },
    {
      "epoch": 0.08396946564885496,
      "grad_norm": 0.09276178479194641,
      "learning_rate": 4.1793893129771e-05,
      "loss": 0.0176,
      "step": 220
    },
    {
      "epoch": 0.08778625954198473,
      "grad_norm": 1.0245144367218018,
      "learning_rate": 4.370229007633588e-05,
      "loss": 0.0219,
      "step": 230
    },
    {
      "epoch": 0.0916030534351145,
      "grad_norm": 0.5731092095375061,
      "learning_rate": 4.561068702290077e-05,
      "loss": 0.0188,
      "step": 240
    },
    {
      "epoch": 0.09541984732824428,
      "grad_norm": 0.11703433096408844,
      "learning_rate": 4.751908396946565e-05,
      "loss": 0.0153,
      "step": 250
    },
    {
      "epoch": 0.09923664122137404,
      "grad_norm": 0.3222142159938812,
      "learning_rate": 4.9427480916030536e-05,
      "loss": 0.0176,
      "step": 260
    },
    {
      "epoch": 0.10305343511450382,
      "grad_norm": 0.6780900359153748,
      "learning_rate": 4.9998912785867505e-05,
      "loss": 0.0143,
      "step": 270
    },
    {
      "epoch": 0.10687022900763359,
      "grad_norm": 0.8537261486053467,
      "learning_rate": 4.999358788306519e-05,
      "loss": 0.0223,
      "step": 280
    },
    {
      "epoch": 0.11068702290076336,
      "grad_norm": 0.17257721722126007,
      "learning_rate": 4.998382654320609e-05,
      "loss": 0.0178,
      "step": 290
    },
    {
      "epoch": 0.11450381679389313,
      "grad_norm": 0.7881976962089539,
      "learning_rate": 4.996963049895741e-05,
      "loss": 0.0146,
      "step": 300
    },
    {
      "epoch": 0.1183206106870229,
      "grad_norm": 0.6446636915206909,
      "learning_rate": 4.99510022701597e-05,
      "loss": 0.0109,
      "step": 310
    },
    {
      "epoch": 0.12213740458015267,
      "grad_norm": 0.6660375595092773,
      "learning_rate": 4.992794516337964e-05,
      "loss": 0.012,
      "step": 320
    },
    {
      "epoch": 0.12595419847328243,
      "grad_norm": 0.47002533078193665,
      "learning_rate": 4.9900463271323064e-05,
      "loss": 0.0133,
      "step": 330
    },
    {
      "epoch": 0.1297709923664122,
      "grad_norm": 0.9555854201316833,
      "learning_rate": 4.9868561472108496e-05,
      "loss": 0.0173,
      "step": 340
    },
    {
      "epoch": 0.13358778625954199,
      "grad_norm": 0.14546959102153778,
      "learning_rate": 4.9832245428401316e-05,
      "loss": 0.0172,
      "step": 350
    },
    {
      "epoch": 0.13740458015267176,
      "grad_norm": 0.27474039793014526,
      "learning_rate": 4.979152158640853e-05,
      "loss": 0.0174,
      "step": 360
    },
    {
      "epoch": 0.14122137404580154,
      "grad_norm": 0.2749495208263397,
      "learning_rate": 4.974639717473465e-05,
      "loss": 0.0172,
      "step": 370
    },
    {
      "epoch": 0.1450381679389313,
      "grad_norm": 0.19880647957324982,
      "learning_rate": 4.9696880203098525e-05,
      "loss": 0.012,
      "step": 380
    },
    {
      "epoch": 0.14885496183206107,
      "grad_norm": 0.7163119912147522,
      "learning_rate": 4.964297946091163e-05,
      "loss": 0.0138,
      "step": 390
    },
    {
      "epoch": 0.15267175572519084,
      "grad_norm": 0.4733160436153412,
      "learning_rate": 4.9584704515717884e-05,
      "loss": 0.0074,
      "step": 400
    },
    {
      "epoch": 0.15648854961832062,
      "grad_norm": 0.5145673155784607,
      "learning_rate": 4.952206571149541e-05,
      "loss": 0.0178,
      "step": 410
    },
    {
      "epoch": 0.16030534351145037,
      "grad_norm": 0.07867272943258286,
      "learning_rate": 4.945507416682046e-05,
      "loss": 0.0132,
      "step": 420
    },
    {
      "epoch": 0.16412213740458015,
      "grad_norm": 0.023460840806365013,
      "learning_rate": 4.938374177289378e-05,
      "loss": 0.0085,
      "step": 430
    },
    {
      "epoch": 0.16793893129770993,
      "grad_norm": 0.6247947812080383,
      "learning_rate": 4.930808119142993e-05,
      "loss": 0.0176,
      "step": 440
    },
    {
      "epoch": 0.1717557251908397,
      "grad_norm": 0.49582046270370483,
      "learning_rate": 4.922810585240981e-05,
      "loss": 0.0188,
      "step": 450
    },
    {
      "epoch": 0.17557251908396945,
      "grad_norm": 1.1747221946716309,
      "learning_rate": 4.914382995169673e-05,
      "loss": 0.0187,
      "step": 460
    },
    {
      "epoch": 0.17938931297709923,
      "grad_norm": 0.9253813624382019,
      "learning_rate": 4.905526844851667e-05,
      "loss": 0.0125,
      "step": 470
    },
    {
      "epoch": 0.183206106870229,
      "grad_norm": 0.32356423139572144,
      "learning_rate": 4.8962437062802924e-05,
      "loss": 0.0145,
      "step": 480
    },
    {
      "epoch": 0.18702290076335878,
      "grad_norm": 0.16892339289188385,
      "learning_rate": 4.886535227240579e-05,
      "loss": 0.0132,
      "step": 490
    },
    {
      "epoch": 0.19083969465648856,
      "grad_norm": 0.08704151958227158,
      "learning_rate": 4.8764031310167725e-05,
      "loss": 0.0145,
      "step": 500
    },
    {
      "epoch": 0.19083969465648856,
      "eval_loss": 0.03389899432659149,
      "eval_runtime": 140.2568,
      "eval_samples_per_second": 56.91,
      "eval_steps_per_second": 0.299,
      "step": 500
    },
    {
      "epoch": 0.1946564885496183,
      "grad_norm": 0.021338067948818207,
      "learning_rate": 4.865849216086438e-05,
      "loss": 0.0112,
      "step": 510
    },
    {
      "epoch": 0.1984732824427481,
      "grad_norm": 0.18011535704135895,
      "learning_rate": 4.854875355801233e-05,
      "loss": 0.0068,
      "step": 520
    },
    {
      "epoch": 0.20229007633587787,
      "grad_norm": 0.6645931005477905,
      "learning_rate": 4.843483498054381e-05,
      "loss": 0.0182,
      "step": 530
    },
    {
      "epoch": 0.20610687022900764,
      "grad_norm": 0.1481589674949646,
      "learning_rate": 4.83167566493491e-05,
      "loss": 0.0113,
      "step": 540
    },
    {
      "epoch": 0.2099236641221374,
      "grad_norm": 0.05964776873588562,
      "learning_rate": 4.819453952368731e-05,
      "loss": 0.0101,
      "step": 550
    },
    {
      "epoch": 0.21374045801526717,
      "grad_norm": 0.5340839624404907,
      "learning_rate": 4.8068205297465986e-05,
      "loss": 0.0181,
      "step": 560
    },
    {
      "epoch": 0.21755725190839695,
      "grad_norm": 0.28655287623405457,
      "learning_rate": 4.7937776395390434e-05,
      "loss": 0.0073,
      "step": 570
    },
    {
      "epoch": 0.22137404580152673,
      "grad_norm": 0.08776471763849258,
      "learning_rate": 4.7803275968983244e-05,
      "loss": 0.0129,
      "step": 580
    },
    {
      "epoch": 0.22519083969465647,
      "grad_norm": 0.11037462204694748,
      "learning_rate": 4.766472789247483e-05,
      "loss": 0.0088,
      "step": 590
    },
    {
      "epoch": 0.22900763358778625,
      "grad_norm": 0.21740970015525818,
      "learning_rate": 4.752215675856566e-05,
      "loss": 0.0115,
      "step": 600
    },
    {
      "epoch": 0.23282442748091603,
      "grad_norm": 0.07760237902402878,
      "learning_rate": 4.737558787406103e-05,
      "loss": 0.0118,
      "step": 610
    },
    {
      "epoch": 0.2366412213740458,
      "grad_norm": 0.2091819792985916,
      "learning_rate": 4.722504725537896e-05,
      "loss": 0.0096,
      "step": 620
    },
    {
      "epoch": 0.24045801526717558,
      "grad_norm": 1.0305904150009155,
      "learning_rate": 4.7070561623932265e-05,
      "loss": 0.0108,
      "step": 630
    },
    {
      "epoch": 0.24427480916030533,
      "grad_norm": 0.03920348733663559,
      "learning_rate": 4.691215840138537e-05,
      "loss": 0.0116,
      "step": 640
    },
    {
      "epoch": 0.2480916030534351,
      "grad_norm": 0.02367626503109932,
      "learning_rate": 4.674986570478695e-05,
      "loss": 0.0133,
      "step": 650
    },
    {
      "epoch": 0.25190839694656486,
      "grad_norm": 0.05315301567316055,
      "learning_rate": 4.6583712341578984e-05,
      "loss": 0.0087,
      "step": 660
    },
    {
      "epoch": 0.25572519083969464,
      "grad_norm": 0.10831905901432037,
      "learning_rate": 4.641372780448341e-05,
      "loss": 0.0084,
      "step": 670
    },
    {
      "epoch": 0.2595419847328244,
      "grad_norm": 0.078117236495018,
      "learning_rate": 4.6239942266267056e-05,
      "loss": 0.0058,
      "step": 680
    },
    {
      "epoch": 0.2633587786259542,
      "grad_norm": 0.28782543540000916,
      "learning_rate": 4.6062386574385886e-05,
      "loss": 0.0104,
      "step": 690
    },
    {
      "epoch": 0.26717557251908397,
      "grad_norm": 0.04873083531856537,
      "learning_rate": 4.5881092245509506e-05,
      "loss": 0.0118,
      "step": 700
    },
    {
      "epoch": 0.27099236641221375,
      "grad_norm": 0.26269766688346863,
      "learning_rate": 4.56960914599268e-05,
      "loss": 0.0077,
      "step": 710
    },
    {
      "epoch": 0.2748091603053435,
      "grad_norm": 0.3128824532032013,
      "learning_rate": 4.55074170558339e-05,
      "loss": 0.0107,
      "step": 720
    },
    {
      "epoch": 0.2786259541984733,
      "grad_norm": 1.1486977338790894,
      "learning_rate": 4.531510252350526e-05,
      "loss": 0.012,
      "step": 730
    },
    {
      "epoch": 0.2824427480916031,
      "grad_norm": 0.35734987258911133,
      "learning_rate": 4.511918199934907e-05,
      "loss": 0.0121,
      "step": 740
    },
    {
      "epoch": 0.2862595419847328,
      "grad_norm": 0.2775309681892395,
      "learning_rate": 4.491969025984789e-05,
      "loss": 0.0139,
      "step": 750
    },
    {
      "epoch": 0.2900763358778626,
      "grad_norm": 0.22545866668224335,
      "learning_rate": 4.471666271538578e-05,
      "loss": 0.0047,
      "step": 760
    },
    {
      "epoch": 0.29389312977099236,
      "grad_norm": 0.3366382420063019,
      "learning_rate": 4.451013540396281e-05,
      "loss": 0.008,
      "step": 770
    },
    {
      "epoch": 0.29770992366412213,
      "grad_norm": 0.645935595035553,
      "learning_rate": 4.430014498479819e-05,
      "loss": 0.0197,
      "step": 780
    },
    {
      "epoch": 0.3015267175572519,
      "grad_norm": 0.022258713841438293,
      "learning_rate": 4.408672873182322e-05,
      "loss": 0.0138,
      "step": 790
    },
    {
      "epoch": 0.3053435114503817,
      "grad_norm": 0.08058780431747437,
      "learning_rate": 4.386992452706499e-05,
      "loss": 0.01,
      "step": 800
    },
    {
      "epoch": 0.30916030534351147,
      "grad_norm": 0.5103694796562195,
      "learning_rate": 4.36497708539222e-05,
      "loss": 0.0127,
      "step": 810
    },
    {
      "epoch": 0.31297709923664124,
      "grad_norm": 0.2476615309715271,
      "learning_rate": 4.342630679033432e-05,
      "loss": 0.0129,
      "step": 820
    },
    {
      "epoch": 0.31679389312977096,
      "grad_norm": 0.2634897530078888,
      "learning_rate": 4.319957200184504e-05,
      "loss": 0.0052,
      "step": 830
    },
    {
      "epoch": 0.32061068702290074,
      "grad_norm": 0.31470030546188354,
      "learning_rate": 4.296960673456159e-05,
      "loss": 0.0165,
      "step": 840
    },
    {
      "epoch": 0.3244274809160305,
      "grad_norm": 0.19848482310771942,
      "learning_rate": 4.273645180801088e-05,
      "loss": 0.005,
      "step": 850
    },
    {
      "epoch": 0.3282442748091603,
      "grad_norm": 0.08697102218866348,
      "learning_rate": 4.2500148607893965e-05,
      "loss": 0.0082,
      "step": 860
    },
    {
      "epoch": 0.3320610687022901,
      "grad_norm": 0.38102781772613525,
      "learning_rate": 4.226073907873992e-05,
      "loss": 0.0126,
      "step": 870
    },
    {
      "epoch": 0.33587786259541985,
      "grad_norm": 0.14514830708503723,
      "learning_rate": 4.201826571646056e-05,
      "loss": 0.0058,
      "step": 880
    },
    {
      "epoch": 0.33969465648854963,
      "grad_norm": 0.43940168619155884,
      "learning_rate": 4.177277156080731e-05,
      "loss": 0.0078,
      "step": 890
    },
    {
      "epoch": 0.3435114503816794,
      "grad_norm": 0.05170935019850731,
      "learning_rate": 4.152430018773153e-05,
      "loss": 0.0082,
      "step": 900
    },
    {
      "epoch": 0.3473282442748092,
      "grad_norm": 0.01292837131768465,
      "learning_rate": 4.127289570164958e-05,
      "loss": 0.0156,
      "step": 910
    },
    {
      "epoch": 0.3511450381679389,
      "grad_norm": 0.21468913555145264,
      "learning_rate": 4.1018602727614264e-05,
      "loss": 0.0075,
      "step": 920
    },
    {
      "epoch": 0.3549618320610687,
      "grad_norm": 0.7786637544631958,
      "learning_rate": 4.0761466403393633e-05,
      "loss": 0.0167,
      "step": 930
    },
    {
      "epoch": 0.35877862595419846,
      "grad_norm": 0.06286737322807312,
      "learning_rate": 4.0501532371458993e-05,
      "loss": 0.0107,
      "step": 940
    },
    {
      "epoch": 0.36259541984732824,
      "grad_norm": 0.19294968247413635,
      "learning_rate": 4.0238846770883165e-05,
      "loss": 0.0103,
      "step": 950
    },
    {
      "epoch": 0.366412213740458,
      "grad_norm": 0.02884821780025959,
      "learning_rate": 3.9973456229150695e-05,
      "loss": 0.0133,
      "step": 960
    },
    {
      "epoch": 0.3702290076335878,
      "grad_norm": 0.36911624670028687,
      "learning_rate": 3.9705407853881324e-05,
      "loss": 0.0072,
      "step": 970
    },
    {
      "epoch": 0.37404580152671757,
      "grad_norm": 0.21048544347286224,
      "learning_rate": 3.9434749224468235e-05,
      "loss": 0.0047,
      "step": 980
    },
    {
      "epoch": 0.37786259541984735,
      "grad_norm": 0.09334168583154678,
      "learning_rate": 3.916152838363258e-05,
      "loss": 0.0125,
      "step": 990
    },
    {
      "epoch": 0.3816793893129771,
      "grad_norm": 0.49426600337028503,
      "learning_rate": 3.8885793828895756e-05,
      "loss": 0.0109,
      "step": 1000
    },
    {
      "epoch": 0.3816793893129771,
      "eval_loss": 0.032893069088459015,
      "eval_runtime": 140.214,
      "eval_samples_per_second": 56.927,
      "eval_steps_per_second": 0.3,
      "step": 1000
    },
    {
      "epoch": 0.38549618320610685,
      "grad_norm": 0.19869565963745117,
      "learning_rate": 3.8607594503970925e-05,
      "loss": 0.0144,
      "step": 1010
    },
    {
      "epoch": 0.3893129770992366,
      "grad_norm": 0.19766545295715332,
      "learning_rate": 3.832697979007539e-05,
      "loss": 0.0067,
      "step": 1020
    },
    {
      "epoch": 0.3931297709923664,
      "grad_norm": 0.4004887044429779,
      "learning_rate": 3.804399949716526e-05,
      "loss": 0.0047,
      "step": 1030
    },
    {
      "epoch": 0.3969465648854962,
      "grad_norm": 0.31220734119415283,
      "learning_rate": 3.775870385509402e-05,
      "loss": 0.0163,
      "step": 1040
    },
    {
      "epoch": 0.40076335877862596,
      "grad_norm": 0.12848161160945892,
      "learning_rate": 3.7471143504696625e-05,
      "loss": 0.0038,
      "step": 1050
    },
    {
      "epoch": 0.40458015267175573,
      "grad_norm": 0.39810845255851746,
      "learning_rate": 3.718136948880056e-05,
      "loss": 0.0077,
      "step": 1060
    },
    {
      "epoch": 0.4083969465648855,
      "grad_norm": 0.47316208481788635,
      "learning_rate": 3.6889433243165605e-05,
      "loss": 0.0141,
      "step": 1070
    },
    {
      "epoch": 0.4122137404580153,
      "grad_norm": 0.028033487498760223,
      "learning_rate": 3.6595386587353795e-05,
      "loss": 0.0049,
      "step": 1080
    },
    {
      "epoch": 0.41603053435114506,
      "grad_norm": 0.036221910268068314,
      "learning_rate": 3.6299281715531375e-05,
      "loss": 0.0083,
      "step": 1090
    },
    {
      "epoch": 0.4198473282442748,
      "grad_norm": 0.013016347773373127,
      "learning_rate": 3.600117118720408e-05,
      "loss": 0.0085,
      "step": 1100
    },
    {
      "epoch": 0.42366412213740456,
      "grad_norm": 0.07040252536535263,
      "learning_rate": 3.57011079178877e-05,
      "loss": 0.0102,
      "step": 1110
    },
    {
      "epoch": 0.42748091603053434,
      "grad_norm": 0.25287893414497375,
      "learning_rate": 3.53991451697155e-05,
      "loss": 0.0092,
      "step": 1120
    },
    {
      "epoch": 0.4312977099236641,
      "grad_norm": 0.08920171856880188,
      "learning_rate": 3.509533654198388e-05,
      "loss": 0.0156,
      "step": 1130
    },
    {
      "epoch": 0.4351145038167939,
      "grad_norm": 0.36695143580436707,
      "learning_rate": 3.478973596163843e-05,
      "loss": 0.0109,
      "step": 1140
    },
    {
      "epoch": 0.4389312977099237,
      "grad_norm": 0.022651812061667442,
      "learning_rate": 3.448239767370177e-05,
      "loss": 0.0057,
      "step": 1150
    },
    {
      "epoch": 0.44274809160305345,
      "grad_norm": 0.3203389644622803,
      "learning_rate": 3.4173376231644797e-05,
      "loss": 0.0075,
      "step": 1160
    },
    {
      "epoch": 0.44656488549618323,
      "grad_norm": 0.6244869232177734,
      "learning_rate": 3.386272648770333e-05,
      "loss": 0.0119,
      "step": 1170
    },
    {
      "epoch": 0.45038167938931295,
      "grad_norm": 0.16917894780635834,
      "learning_rate": 3.355050358314172e-05,
      "loss": 0.0065,
      "step": 1180
    },
    {
      "epoch": 0.4541984732824427,
      "grad_norm": 0.10675910860300064,
      "learning_rate": 3.323676293846501e-05,
      "loss": 0.0073,
      "step": 1190
    },
    {
      "epoch": 0.4580152671755725,
      "grad_norm": 0.8430480360984802,
      "learning_rate": 3.2921560243581675e-05,
      "loss": 0.0121,
      "step": 1200
    },
    {
      "epoch": 0.4618320610687023,
      "grad_norm": 0.1763671189546585,
      "learning_rate": 3.260495144791856e-05,
      "loss": 0.0067,
      "step": 1210
    },
    {
      "epoch": 0.46564885496183206,
      "grad_norm": 0.4134056568145752,
      "learning_rate": 3.2286992750489585e-05,
      "loss": 0.0115,
      "step": 1220
    },
    {
      "epoch": 0.46946564885496184,
      "grad_norm": 0.02143966406583786,
      "learning_rate": 3.1967740589920296e-05,
      "loss": 0.0075,
      "step": 1230
    },
    {
      "epoch": 0.4732824427480916,
      "grad_norm": 0.6696872115135193,
      "learning_rate": 3.1647251634429856e-05,
      "loss": 0.0089,
      "step": 1240
    },
    {
      "epoch": 0.4770992366412214,
      "grad_norm": 0.06671352684497833,
      "learning_rate": 3.1325582771772235e-05,
      "loss": 0.0147,
      "step": 1250
    },
    {
      "epoch": 0.48091603053435117,
      "grad_norm": 0.7138830423355103,
      "learning_rate": 3.100279109913848e-05,
      "loss": 0.0121,
      "step": 1260
    },
    {
      "epoch": 0.4847328244274809,
      "grad_norm": 0.15019002556800842,
      "learning_rate": 3.067893391302179e-05,
      "loss": 0.006,
      "step": 1270
    },
    {
      "epoch": 0.48854961832061067,
      "grad_norm": 0.29725247621536255,
      "learning_rate": 3.035406869904721e-05,
      "loss": 0.0123,
      "step": 1280
    },
    {
      "epoch": 0.49236641221374045,
      "grad_norm": 0.2249823957681656,
      "learning_rate": 3.0028253121767762e-05,
      "loss": 0.0092,
      "step": 1290
    },
    {
      "epoch": 0.4961832061068702,
      "grad_norm": 0.1760585904121399,
      "learning_rate": 2.970154501442881e-05,
      "loss": 0.0189,
      "step": 1300
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.32748275995254517,
      "learning_rate": 2.9374002368702514e-05,
      "loss": 0.0106,
      "step": 1310
    },
    {
      "epoch": 0.5038167938931297,
      "grad_norm": 0.5537404417991638,
      "learning_rate": 2.904568332439408e-05,
      "loss": 0.0073,
      "step": 1320
    },
    {
      "epoch": 0.5076335877862596,
      "grad_norm": 0.024763356894254684,
      "learning_rate": 2.8716646159121795e-05,
      "loss": 0.0039,
      "step": 1330
    },
    {
      "epoch": 0.5114503816793893,
      "grad_norm": 0.3292398452758789,
      "learning_rate": 2.8386949277972606e-05,
      "loss": 0.0128,
      "step": 1340
    },
    {
      "epoch": 0.5152671755725191,
      "grad_norm": 0.28704291582107544,
      "learning_rate": 2.805665120313501e-05,
      "loss": 0.0139,
      "step": 1350
    },
    {
      "epoch": 0.5190839694656488,
      "grad_norm": 0.3549763560295105,
      "learning_rate": 2.7725810563511157e-05,
      "loss": 0.007,
      "step": 1360
    },
    {
      "epoch": 0.5229007633587787,
      "grad_norm": 0.10086268186569214,
      "learning_rate": 2.7394486084310128e-05,
      "loss": 0.0063,
      "step": 1370
    },
    {
      "epoch": 0.5267175572519084,
      "grad_norm": 0.30678582191467285,
      "learning_rate": 2.7062736576623943e-05,
      "loss": 0.0092,
      "step": 1380
    },
    {
      "epoch": 0.5305343511450382,
      "grad_norm": 0.20195415616035461,
      "learning_rate": 2.6730620926988476e-05,
      "loss": 0.0094,
      "step": 1390
    },
    {
      "epoch": 0.5343511450381679,
      "grad_norm": 0.7400510907173157,
      "learning_rate": 2.6398198086930915e-05,
      "loss": 0.0183,
      "step": 1400
    },
    {
      "epoch": 0.5381679389312977,
      "grad_norm": 0.10865656286478043,
      "learning_rate": 2.6065527062505634e-05,
      "loss": 0.0109,
      "step": 1410
    },
    {
      "epoch": 0.5419847328244275,
      "grad_norm": 0.1100832000374794,
      "learning_rate": 2.573266690382051e-05,
      "loss": 0.0086,
      "step": 1420
    },
    {
      "epoch": 0.5458015267175572,
      "grad_norm": 0.07376679033041,
      "learning_rate": 2.5399676694555312e-05,
      "loss": 0.0213,
      "step": 1430
    },
    {
      "epoch": 0.549618320610687,
      "grad_norm": 0.30974867939949036,
      "learning_rate": 2.506661554147417e-05,
      "loss": 0.0091,
      "step": 1440
    },
    {
      "epoch": 0.5534351145038168,
      "grad_norm": 0.4819875955581665,
      "learning_rate": 2.473354256393397e-05,
      "loss": 0.0079,
      "step": 1450
    },
    {
      "epoch": 0.5572519083969466,
      "grad_norm": 0.08225909620523453,
      "learning_rate": 2.440051688339046e-05,
      "loss": 0.0147,
      "step": 1460
    },
    {
      "epoch": 0.5610687022900763,
      "grad_norm": 0.009925187565386295,
      "learning_rate": 2.4067597612904037e-05,
      "loss": 0.0135,
      "step": 1470
    },
    {
      "epoch": 0.5648854961832062,
      "grad_norm": 0.36840465664863586,
      "learning_rate": 2.3734843846646988e-05,
      "loss": 0.0128,
      "step": 1480
    },
    {
      "epoch": 0.5687022900763359,
      "grad_norm": 0.0662972629070282,
      "learning_rate": 2.3402314649414116e-05,
      "loss": 0.0114,
      "step": 1490
    },
    {
      "epoch": 0.5725190839694656,
      "grad_norm": 0.28558099269866943,
      "learning_rate": 2.307006904613851e-05,
      "loss": 0.0113,
      "step": 1500
    },
    {
      "epoch": 0.5725190839694656,
      "eval_loss": 0.030398810282349586,
      "eval_runtime": 140.1617,
      "eval_samples_per_second": 56.949,
      "eval_steps_per_second": 0.3,
      "step": 1500
    },
    {
      "epoch": 0.5763358778625954,
      "grad_norm": 0.39870354533195496,
      "learning_rate": 2.2738166011414527e-05,
      "loss": 0.0094,
      "step": 1510
    },
    {
      "epoch": 0.5801526717557252,
      "grad_norm": 0.3758524954319,
      "learning_rate": 2.2406664459029545e-05,
      "loss": 0.0089,
      "step": 1520
    },
    {
      "epoch": 0.583969465648855,
      "grad_norm": 0.028594927862286568,
      "learning_rate": 2.207562323150662e-05,
      "loss": 0.0074,
      "step": 1530
    },
    {
      "epoch": 0.5877862595419847,
      "grad_norm": 0.23170693218708038,
      "learning_rate": 2.174510108965977e-05,
      "loss": 0.0054,
      "step": 1540
    },
    {
      "epoch": 0.5916030534351145,
      "grad_norm": 0.5391947031021118,
      "learning_rate": 2.1415156702163734e-05,
      "loss": 0.0079,
      "step": 1550
    },
    {
      "epoch": 0.5954198473282443,
      "grad_norm": 0.12324138730764389,
      "learning_rate": 2.1085848635140122e-05,
      "loss": 0.0071,
      "step": 1560
    },
    {
      "epoch": 0.5992366412213741,
      "grad_norm": 0.03993413597345352,
      "learning_rate": 2.0757235341761732e-05,
      "loss": 0.008,
      "step": 1570
    },
    {
      "epoch": 0.6030534351145038,
      "grad_norm": 0.32654133439064026,
      "learning_rate": 2.0429375151876978e-05,
      "loss": 0.0094,
      "step": 1580
    },
    {
      "epoch": 0.6068702290076335,
      "grad_norm": 0.012670164927840233,
      "learning_rate": 2.0102326261656114e-05,
      "loss": 0.0086,
      "step": 1590
    },
    {
      "epoch": 0.6106870229007634,
      "grad_norm": 0.22735942900180817,
      "learning_rate": 1.9776146723261265e-05,
      "loss": 0.0146,
      "step": 1600
    },
    {
      "epoch": 0.6145038167938931,
      "grad_norm": 0.5669217109680176,
      "learning_rate": 1.9450894434541994e-05,
      "loss": 0.0181,
      "step": 1610
    },
    {
      "epoch": 0.6183206106870229,
      "grad_norm": 0.09971689432859421,
      "learning_rate": 1.9126627128758256e-05,
      "loss": 0.0034,
      "step": 1620
    },
    {
      "epoch": 0.6221374045801527,
      "grad_norm": 0.11938827484846115,
      "learning_rate": 1.8803402364332573e-05,
      "loss": 0.0128,
      "step": 1630
    },
    {
      "epoch": 0.6259541984732825,
      "grad_norm": 0.47636914253234863,
      "learning_rate": 1.8481277514633243e-05,
      "loss": 0.0115,
      "step": 1640
    },
    {
      "epoch": 0.6297709923664122,
      "grad_norm": 0.3907536566257477,
      "learning_rate": 1.8160309757790384e-05,
      "loss": 0.0076,
      "step": 1650
    },
    {
      "epoch": 0.6335877862595419,
      "grad_norm": 0.2356630116701126,
      "learning_rate": 1.784055606654664e-05,
      "loss": 0.0131,
      "step": 1660
    },
    {
      "epoch": 0.6374045801526718,
      "grad_norm": 0.09361109882593155,
      "learning_rate": 1.752207319814438e-05,
      "loss": 0.007,
      "step": 1670
    },
    {
      "epoch": 0.6412213740458015,
      "grad_norm": 0.3632112145423889,
      "learning_rate": 1.7204917684251086e-05,
      "loss": 0.0096,
      "step": 1680
    },
    {
      "epoch": 0.6450381679389313,
      "grad_norm": 0.44978809356689453,
      "learning_rate": 1.6889145820924797e-05,
      "loss": 0.0136,
      "step": 1690
    },
    {
      "epoch": 0.648854961832061,
      "grad_norm": 0.4278349280357361,
      "learning_rate": 1.6574813658621464e-05,
      "loss": 0.0061,
      "step": 1700
    },
    {
      "epoch": 0.6526717557251909,
      "grad_norm": 0.3191720247268677,
      "learning_rate": 1.6261976992245727e-05,
      "loss": 0.0255,
      "step": 1710
    },
    {
      "epoch": 0.6564885496183206,
      "grad_norm": 0.030014334246516228,
      "learning_rate": 1.5950691351247218e-05,
      "loss": 0.0103,
      "step": 1720
    },
    {
      "epoch": 0.6603053435114504,
      "grad_norm": 0.3003236651420593,
      "learning_rate": 1.5641011989763903e-05,
      "loss": 0.0086,
      "step": 1730
    },
    {
      "epoch": 0.6641221374045801,
      "grad_norm": 0.05489334836602211,
      "learning_rate": 1.5332993876814285e-05,
      "loss": 0.0055,
      "step": 1740
    },
    {
      "epoch": 0.6679389312977099,
      "grad_norm": 0.27562958002090454,
      "learning_rate": 1.5026691686540262e-05,
      "loss": 0.0066,
      "step": 1750
    },
    {
      "epoch": 0.6717557251908397,
      "grad_norm": 0.4876582622528076,
      "learning_rate": 1.472215978850229e-05,
      "loss": 0.0088,
      "step": 1760
    },
    {
      "epoch": 0.6755725190839694,
      "grad_norm": 0.08189146220684052,
      "learning_rate": 1.4419452238028647e-05,
      "loss": 0.0087,
      "step": 1770
    },
    {
      "epoch": 0.6793893129770993,
      "grad_norm": 0.3738853633403778,
      "learning_rate": 1.4118622766620387e-05,
      "loss": 0.0091,
      "step": 1780
    },
    {
      "epoch": 0.683206106870229,
      "grad_norm": 0.2992166578769684,
      "learning_rate": 1.3819724772413929e-05,
      "loss": 0.0206,
      "step": 1790
    },
    {
      "epoch": 0.6870229007633588,
      "grad_norm": 0.6133850812911987,
      "learning_rate": 1.3522811310702634e-05,
      "loss": 0.0112,
      "step": 1800
    },
    {
      "epoch": 0.6908396946564885,
      "grad_norm": 0.25099530816078186,
      "learning_rate": 1.3227935084519385e-05,
      "loss": 0.0092,
      "step": 1810
    },
    {
      "epoch": 0.6946564885496184,
      "grad_norm": 0.4365796744823456,
      "learning_rate": 1.2935148435281643e-05,
      "loss": 0.0194,
      "step": 1820
    },
    {
      "epoch": 0.6984732824427481,
      "grad_norm": 0.5801776051521301,
      "learning_rate": 1.2644503333500673e-05,
      "loss": 0.0078,
      "step": 1830
    },
    {
      "epoch": 0.7022900763358778,
      "grad_norm": 0.0873272716999054,
      "learning_rate": 1.2356051369556593e-05,
      "loss": 0.0059,
      "step": 1840
    },
    {
      "epoch": 0.7061068702290076,
      "grad_norm": 0.03350086137652397,
      "learning_rate": 1.2069843744540999e-05,
      "loss": 0.0063,
      "step": 1850
    },
    {
      "epoch": 0.7099236641221374,
      "grad_norm": 0.33509764075279236,
      "learning_rate": 1.1785931261168537e-05,
      "loss": 0.0056,
      "step": 1860
    },
    {
      "epoch": 0.7137404580152672,
      "grad_norm": 0.12034814059734344,
      "learning_rate": 1.1504364314759311e-05,
      "loss": 0.0073,
      "step": 1870
    },
    {
      "epoch": 0.7175572519083969,
      "grad_norm": 0.5151724815368652,
      "learning_rate": 1.1225192884293578e-05,
      "loss": 0.0099,
      "step": 1880
    },
    {
      "epoch": 0.7213740458015268,
      "grad_norm": 0.32692572474479675,
      "learning_rate": 1.0948466523540296e-05,
      "loss": 0.008,
      "step": 1890
    },
    {
      "epoch": 0.7251908396946565,
      "grad_norm": 0.005217437632381916,
      "learning_rate": 1.0674234352261175e-05,
      "loss": 0.0071,
      "step": 1900
    },
    {
      "epoch": 0.7290076335877863,
      "grad_norm": 0.2179807722568512,
      "learning_rate": 1.0402545047491818e-05,
      "loss": 0.0166,
      "step": 1910
    },
    {
      "epoch": 0.732824427480916,
      "grad_norm": 0.6623926162719727,
      "learning_rate": 1.0133446834901344e-05,
      "loss": 0.0081,
      "step": 1920
    },
    {
      "epoch": 0.7366412213740458,
      "grad_norm": 0.27897024154663086,
      "learning_rate": 9.866987480232218e-06,
      "loss": 0.0109,
      "step": 1930
    },
    {
      "epoch": 0.7404580152671756,
      "grad_norm": 0.026624036952853203,
      "learning_rate": 9.603214280821707e-06,
      "loss": 0.0168,
      "step": 1940
    },
    {
      "epoch": 0.7442748091603053,
      "grad_norm": 0.505171537399292,
      "learning_rate": 9.342174057206402e-06,
      "loss": 0.0055,
      "step": 1950
    },
    {
      "epoch": 0.7480916030534351,
      "grad_norm": 0.5251939296722412,
      "learning_rate": 9.083913144811499e-06,
      "loss": 0.0067,
      "step": 1960
    },
    {
      "epoch": 0.7519083969465649,
      "grad_norm": 0.2802061438560486,
      "learning_rate": 8.828477385726055e-06,
      "loss": 0.0073,
      "step": 1970
    },
    {
      "epoch": 0.7557251908396947,
      "grad_norm": 0.4821580648422241,
      "learning_rate": 8.575912120565956e-06,
      "loss": 0.0077,
      "step": 1980
    },
    {
      "epoch": 0.7595419847328244,
      "grad_norm": 0.16226717829704285,
      "learning_rate": 8.326262180425746e-06,
      "loss": 0.0063,
      "step": 1990
    },
    {
      "epoch": 0.7633587786259542,
      "grad_norm": 0.2521667182445526,
      "learning_rate": 8.079571878921002e-06,
      "loss": 0.0058,
      "step": 2000
    },
    {
      "epoch": 0.7633587786259542,
      "eval_loss": 0.028967689722776413,
      "eval_runtime": 140.2457,
      "eval_samples_per_second": 56.914,
      "eval_steps_per_second": 0.299,
      "step": 2000
    },
    {
      "epoch": 0.767175572519084,
      "grad_norm": 0.40514904260635376,
      "learning_rate": 7.83588500432256e-06,
      "loss": 0.0112,
      "step": 2010
    },
    {
      "epoch": 0.7709923664122137,
      "grad_norm": 1.045690894126892,
      "learning_rate": 7.595244811783944e-06,
      "loss": 0.0058,
      "step": 2020
    },
    {
      "epoch": 0.7748091603053435,
      "grad_norm": 0.5543403625488281,
      "learning_rate": 7.357694015663416e-06,
      "loss": 0.0102,
      "step": 2030
    },
    {
      "epoch": 0.7786259541984732,
      "grad_norm": 0.14902685582637787,
      "learning_rate": 7.123274781942107e-06,
      "loss": 0.0041,
      "step": 2040
    },
    {
      "epoch": 0.7824427480916031,
      "grad_norm": 0.0032010909635573626,
      "learning_rate": 6.892028720739388e-06,
      "loss": 0.0159,
      "step": 2050
    },
    {
      "epoch": 0.7862595419847328,
      "grad_norm": 0.05139963701367378,
      "learning_rate": 6.663996878926943e-06,
      "loss": 0.0064,
      "step": 2060
    },
    {
      "epoch": 0.7900763358778626,
      "grad_norm": 0.3358862102031708,
      "learning_rate": 6.439219732842869e-06,
      "loss": 0.0098,
      "step": 2070
    },
    {
      "epoch": 0.7938931297709924,
      "grad_norm": 0.15072062611579895,
      "learning_rate": 6.217737181106978e-06,
      "loss": 0.0065,
      "step": 2080
    },
    {
      "epoch": 0.7977099236641222,
      "grad_norm": 0.002467066515237093,
      "learning_rate": 5.999588537538639e-06,
      "loss": 0.0037,
      "step": 2090
    },
    {
      "epoch": 0.8015267175572519,
      "grad_norm": 0.034411050379276276,
      "learning_rate": 5.784812524178529e-06,
      "loss": 0.0074,
      "step": 2100
    },
    {
      "epoch": 0.8053435114503816,
      "grad_norm": 0.08725389838218689,
      "learning_rate": 5.573447264415288e-06,
      "loss": 0.0185,
      "step": 2110
    },
    {
      "epoch": 0.8091603053435115,
      "grad_norm": 0.2480231672525406,
      "learning_rate": 5.365530276218531e-06,
      "loss": 0.0071,
      "step": 2120
    },
    {
      "epoch": 0.8129770992366412,
      "grad_norm": 0.5087897181510925,
      "learning_rate": 5.161098465479308e-06,
      "loss": 0.0243,
      "step": 2130
    },
    {
      "epoch": 0.816793893129771,
      "grad_norm": 0.3457648456096649,
      "learning_rate": 4.96018811945918e-06,
      "loss": 0.0073,
      "step": 2140
    },
    {
      "epoch": 0.8206106870229007,
      "grad_norm": 0.39516162872314453,
      "learning_rate": 4.762834900349117e-06,
      "loss": 0.0122,
      "step": 2150
    },
    {
      "epoch": 0.8244274809160306,
      "grad_norm": 0.3586041331291199,
      "learning_rate": 4.56907383893935e-06,
      "loss": 0.0073,
      "step": 2160
    },
    {
      "epoch": 0.8282442748091603,
      "grad_norm": 0.06679055839776993,
      "learning_rate": 4.378939328401338e-06,
      "loss": 0.0052,
      "step": 2170
    },
    {
      "epoch": 0.8320610687022901,
      "grad_norm": 0.40074533224105835,
      "learning_rate": 4.192465118182814e-06,
      "loss": 0.0064,
      "step": 2180
    },
    {
      "epoch": 0.8358778625954199,
      "grad_norm": 0.13426686823368073,
      "learning_rate": 4.0096843080172195e-06,
      "loss": 0.0139,
      "step": 2190
    },
    {
      "epoch": 0.8396946564885496,
      "grad_norm": 0.08818082511425018,
      "learning_rate": 3.8306293420483505e-06,
      "loss": 0.0104,
      "step": 2200
    },
    {
      "epoch": 0.8435114503816794,
      "grad_norm": 0.676325261592865,
      "learning_rate": 3.6553320030714626e-06,
      "loss": 0.0065,
      "step": 2210
    },
    {
      "epoch": 0.8473282442748091,
      "grad_norm": 0.31104177236557007,
      "learning_rate": 3.483823406891701e-06,
      "loss": 0.0074,
      "step": 2220
    },
    {
      "epoch": 0.851145038167939,
      "grad_norm": 0.358847975730896,
      "learning_rate": 3.3161339968009768e-06,
      "loss": 0.0103,
      "step": 2230
    },
    {
      "epoch": 0.8549618320610687,
      "grad_norm": 0.4074176251888275,
      "learning_rate": 3.1522935381741643e-06,
      "loss": 0.0106,
      "step": 2240
    },
    {
      "epoch": 0.8587786259541985,
      "grad_norm": 0.22954821586608887,
      "learning_rate": 2.9923311131856783e-06,
      "loss": 0.0087,
      "step": 2250
    },
    {
      "epoch": 0.8625954198473282,
      "grad_norm": 0.33302852511405945,
      "learning_rate": 2.836275115647294e-06,
      "loss": 0.013,
      "step": 2260
    },
    {
      "epoch": 0.8664122137404581,
      "grad_norm": 0.09198283404111862,
      "learning_rate": 2.6841532459681627e-06,
      "loss": 0.0067,
      "step": 2270
    },
    {
      "epoch": 0.8702290076335878,
      "grad_norm": 0.03521020710468292,
      "learning_rate": 2.5359925062378884e-06,
      "loss": 0.0092,
      "step": 2280
    },
    {
      "epoch": 0.8740458015267175,
      "grad_norm": 0.40850088000297546,
      "learning_rate": 2.3918191954336144e-06,
      "loss": 0.0064,
      "step": 2290
    },
    {
      "epoch": 0.8778625954198473,
      "grad_norm": 0.002618383150547743,
      "learning_rate": 2.2516589047518273e-06,
      "loss": 0.0069,
      "step": 2300
    },
    {
      "epoch": 0.8816793893129771,
      "grad_norm": 0.28037503361701965,
      "learning_rate": 2.1155365130658717e-06,
      "loss": 0.0063,
      "step": 2310
    },
    {
      "epoch": 0.8854961832061069,
      "grad_norm": 0.06269484013319016,
      "learning_rate": 1.9834761825098773e-06,
      "loss": 0.0043,
      "step": 2320
    },
    {
      "epoch": 0.8893129770992366,
      "grad_norm": 0.6617615222930908,
      "learning_rate": 1.8555013541899036e-06,
      "loss": 0.0148,
      "step": 2330
    },
    {
      "epoch": 0.8931297709923665,
      "grad_norm": 0.06300811469554901,
      "learning_rate": 1.7316347440230557e-06,
      "loss": 0.0116,
      "step": 2340
    },
    {
      "epoch": 0.8969465648854962,
      "grad_norm": 0.15155254304409027,
      "learning_rate": 1.61189833870537e-06,
      "loss": 0.0035,
      "step": 2350
    },
    {
      "epoch": 0.9007633587786259,
      "grad_norm": 0.7464874386787415,
      "learning_rate": 1.496313391809076e-06,
      "loss": 0.0131,
      "step": 2360
    },
    {
      "epoch": 0.9045801526717557,
      "grad_norm": 0.7433227896690369,
      "learning_rate": 1.3849004200100546e-06,
      "loss": 0.0084,
      "step": 2370
    },
    {
      "epoch": 0.9083969465648855,
      "grad_norm": 0.4609885513782501,
      "learning_rate": 1.2776791994460553e-06,
      "loss": 0.0091,
      "step": 2380
    },
    {
      "epoch": 0.9122137404580153,
      "grad_norm": 0.03363984078168869,
      "learning_rate": 1.1746687622063573e-06,
      "loss": 0.0109,
      "step": 2390
    },
    {
      "epoch": 0.916030534351145,
      "grad_norm": 0.34646764397621155,
      "learning_rate": 1.0758873929535424e-06,
      "loss": 0.0058,
      "step": 2400
    },
    {
      "epoch": 0.9198473282442748,
      "grad_norm": 0.28582972288131714,
      "learning_rate": 9.813526256778894e-07,
      "loss": 0.0054,
      "step": 2410
    },
    {
      "epoch": 0.9236641221374046,
      "grad_norm": 0.5931139588356018,
      "learning_rate": 8.910812405850566e-07,
      "loss": 0.0118,
      "step": 2420
    },
    {
      "epoch": 0.9274809160305344,
      "grad_norm": 0.44896554946899414,
      "learning_rate": 8.050892611175253e-07,
      "loss": 0.0119,
      "step": 2430
    },
    {
      "epoch": 0.9312977099236641,
      "grad_norm": 0.19585078954696655,
      "learning_rate": 7.233919511104082e-07,
      "loss": 0.0132,
      "step": 2440
    },
    {
      "epoch": 0.9351145038167938,
      "grad_norm": 0.24196402728557587,
      "learning_rate": 6.460038120820688e-07,
      "loss": 0.0051,
      "step": 2450
    },
    {
      "epoch": 0.9389312977099237,
      "grad_norm": 0.2216787487268448,
      "learning_rate": 5.729385806600484e-07,
      "loss": 0.0039,
      "step": 2460
    },
    {
      "epoch": 0.9427480916030534,
      "grad_norm": 0.008937466889619827,
      "learning_rate": 5.042092261427822e-07,
      "loss": 0.0151,
      "step": 2470
    },
    {
      "epoch": 0.9465648854961832,
      "grad_norm": 0.2098822146654129,
      "learning_rate": 4.3982794819751316e-07,
      "loss": 0.0045,
      "step": 2480
    },
    {
      "epoch": 0.950381679389313,
      "grad_norm": 0.09708132594823837,
      "learning_rate": 3.7980617469479953e-07,
      "loss": 0.0059,
      "step": 2490
    },
    {
      "epoch": 0.9541984732824428,
      "grad_norm": 0.10786189138889313,
      "learning_rate": 3.2415455968004826e-07,
      "loss": 0.0032,
      "step": 2500
    },
    {
      "epoch": 0.9541984732824428,
      "eval_loss": 0.027030499652028084,
      "eval_runtime": 140.2791,
      "eval_samples_per_second": 56.901,
      "eval_steps_per_second": 0.299,
      "step": 2500
    },
    {
      "epoch": 0.9580152671755725,
      "grad_norm": 0.5997936725616455,
      "learning_rate": 2.7288298148238913e-07,
      "loss": 0.0164,
      "step": 2510
    },
    {
      "epoch": 0.9618320610687023,
      "grad_norm": 1.1892848014831543,
      "learning_rate": 2.2600054096122703e-07,
      "loss": 0.0095,
      "step": 2520
    },
    {
      "epoch": 0.9656488549618321,
      "grad_norm": 0.014309865422546864,
      "learning_rate": 1.8351555989082892e-07,
      "loss": 0.006,
      "step": 2530
    },
    {
      "epoch": 0.9694656488549618,
      "grad_norm": 0.23357251286506653,
      "learning_rate": 1.4543557948317744e-07,
      "loss": 0.0084,
      "step": 2540
    },
    {
      "epoch": 0.9732824427480916,
      "grad_norm": 0.4700785279273987,
      "learning_rate": 1.1176735904937508e-07,
      "loss": 0.0079,
      "step": 2550
    },
    {
      "epoch": 0.9770992366412213,
      "grad_norm": 0.005513277370482683,
      "learning_rate": 8.251687479986503e-08,
      "loss": 0.0176,
      "step": 2560
    },
    {
      "epoch": 0.9809160305343512,
      "grad_norm": 0.32526788115501404,
      "learning_rate": 5.768931878361583e-08,
      "loss": 0.0091,
      "step": 2570
    },
    {
      "epoch": 0.9847328244274809,
      "grad_norm": 0.29356321692466736,
      "learning_rate": 3.728909796652525e-08,
      "loss": 0.0022,
      "step": 2580
    },
    {
      "epoch": 0.9885496183206107,
      "grad_norm": 0.03470044955611229,
      "learning_rate": 2.1319833449179317e-08,
      "loss": 0.0055,
      "step": 2590
    },
    {
      "epoch": 0.9923664122137404,
      "grad_norm": 0.004818221088498831,
      "learning_rate": 9.78435982407766e-09,
      "loss": 0.0093,
      "step": 2600
    },
    {
      "epoch": 0.9961832061068703,
      "grad_norm": 0.03156821429729462,
      "learning_rate": 2.6847246725053565e-09,
      "loss": 0.0034,
      "step": 2610
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.0063484120182693005,
      "learning_rate": 2.2188201059814007e-11,
      "loss": 0.0075,
      "step": 2620
    }
  ],
  "logging_steps": 10,
  "max_steps": 2620,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2.3731098682179103e+18,
  "train_batch_size": 192,
  "trial_name": null,
  "trial_params": null
}