{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.9953379953379953,
  "eval_steps": 500,
  "global_step": 856,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.011655011655011656,
      "grad_norm": 5.380261825460372,
      "learning_rate": 9.237540571428572e-06,
      "loss": 4.2109,
      "step": 5
    },
    {
      "epoch": 0.023310023310023312,
      "grad_norm": 1.4502882732797857,
      "learning_rate": 2.0784466285714287e-05,
      "loss": 3.9981,
      "step": 10
    },
    {
      "epoch": 0.03496503496503497,
      "grad_norm": 0.9652384253156429,
      "learning_rate": 3.2331392000000005e-05,
      "loss": 3.2711,
      "step": 15
    },
    {
      "epoch": 0.046620046620046623,
      "grad_norm": 1.1395250787625393,
      "learning_rate": 4.3878317714285716e-05,
      "loss": 2.6312,
      "step": 20
    },
    {
      "epoch": 0.05827505827505827,
      "grad_norm": 3.4845160401616764,
      "learning_rate": 5.542524342857144e-05,
      "loss": 2.1284,
      "step": 25
    },
    {
      "epoch": 0.06993006993006994,
      "grad_norm": 1.9051485411146671,
      "learning_rate": 6.697216914285716e-05,
      "loss": 1.8294,
      "step": 30
    },
    {
      "epoch": 0.08158508158508158,
      "grad_norm": 0.33187533324621077,
      "learning_rate": 7.851909485714286e-05,
      "loss": 1.7266,
      "step": 35
    },
    {
      "epoch": 0.09324009324009325,
      "grad_norm": 0.44248932276377023,
      "learning_rate": 8.082695323179129e-05,
      "loss": 1.7283,
      "step": 40
    },
    {
      "epoch": 0.1048951048951049,
      "grad_norm": 1.2160485145535014,
      "learning_rate": 8.082075099954982e-05,
      "loss": 1.7117,
      "step": 45
    },
    {
      "epoch": 0.11655011655011654,
      "grad_norm": 65.29684577693374,
      "learning_rate": 8.080977885578941e-05,
      "loss": 1.8952,
      "step": 50
    },
    {
      "epoch": 0.1282051282051282,
      "grad_norm": 1.15546734315769,
      "learning_rate": 8.079403852760764e-05,
      "loss": 3.1909,
      "step": 55
    },
    {
      "epoch": 0.13986013986013987,
      "grad_norm": 2.634452844296507,
      "learning_rate": 8.077353249265015e-05,
      "loss": 1.6638,
      "step": 60
    },
    {
      "epoch": 0.15151515151515152,
      "grad_norm": 0.32202059804233973,
      "learning_rate": 8.07482639787204e-05,
      "loss": 1.6088,
      "step": 65
    },
    {
      "epoch": 0.16317016317016317,
      "grad_norm": 0.3642353360782208,
      "learning_rate": 8.071823696327185e-05,
      "loss": 1.5268,
      "step": 70
    },
    {
      "epoch": 0.17482517482517482,
      "grad_norm": 0.34692958418989034,
      "learning_rate": 8.068345617278169e-05,
      "loss": 1.5875,
      "step": 75
    },
    {
      "epoch": 0.1864801864801865,
      "grad_norm": 38.12513649812565,
      "learning_rate": 8.06439270820069e-05,
      "loss": 1.7234,
      "step": 80
    },
    {
      "epoch": 0.19813519813519814,
      "grad_norm": 0.3217004898392202,
      "learning_rate": 8.059965591312254e-05,
      "loss": 1.4259,
      "step": 85
    },
    {
      "epoch": 0.2097902097902098,
      "grad_norm": 0.24719946255426536,
      "learning_rate": 8.055064963474229e-05,
      "loss": 1.4095,
      "step": 90
    },
    {
      "epoch": 0.22144522144522144,
      "grad_norm": 0.5042383844039631,
      "learning_rate": 8.049691596082148e-05,
      "loss": 1.6829,
      "step": 95
    },
    {
      "epoch": 0.2331002331002331,
      "grad_norm": 0.2641927692422699,
      "learning_rate": 8.043846334944299e-05,
      "loss": 1.6542,
      "step": 100
    },
    {
      "epoch": 0.24475524475524477,
      "grad_norm": 0.2525842272998614,
      "learning_rate": 8.03753010014858e-05,
      "loss": 1.5055,
      "step": 105
    },
    {
      "epoch": 0.2564102564102564,
      "grad_norm": 0.2848401355775407,
      "learning_rate": 8.030743885917666e-05,
      "loss": 1.5441,
      "step": 110
    },
    {
      "epoch": 0.2680652680652681,
      "grad_norm": 0.2578389420241452,
      "learning_rate": 8.023488760452522e-05,
      "loss": 1.5164,
      "step": 115
    },
    {
      "epoch": 0.27972027972027974,
      "grad_norm": 0.384485780217046,
      "learning_rate": 8.01576586576425e-05,
      "loss": 1.4309,
      "step": 120
    },
    {
      "epoch": 0.2913752913752914,
      "grad_norm": 0.3005538892717842,
      "learning_rate": 8.007576417494336e-05,
      "loss": 1.4464,
      "step": 125
    },
    {
      "epoch": 0.30303030303030304,
      "grad_norm": 0.3038129052309793,
      "learning_rate": 7.998921704723294e-05,
      "loss": 1.308,
      "step": 130
    },
    {
      "epoch": 0.3146853146853147,
      "grad_norm": 0.27857195778115285,
      "learning_rate": 7.989803089767754e-05,
      "loss": 1.3887,
      "step": 135
    },
    {
      "epoch": 0.32634032634032634,
      "grad_norm": 0.2852295677864667,
      "learning_rate": 7.980222007966029e-05,
      "loss": 1.4992,
      "step": 140
    },
    {
      "epoch": 0.337995337995338,
      "grad_norm": 0.2744064066208899,
      "learning_rate": 7.970179967452175e-05,
      "loss": 1.4618,
      "step": 145
    },
    {
      "epoch": 0.34965034965034963,
      "grad_norm": 0.24802775085025972,
      "learning_rate": 7.959678548918605e-05,
      "loss": 1.3894,
      "step": 150
    },
    {
      "epoch": 0.3613053613053613,
      "grad_norm": 0.25341971271489,
      "learning_rate": 7.948719405367275e-05,
      "loss": 1.402,
      "step": 155
    },
    {
      "epoch": 0.372960372960373,
      "grad_norm": 0.24564392308719407,
      "learning_rate": 7.937304261849485e-05,
      "loss": 1.3474,
      "step": 160
    },
    {
      "epoch": 0.38461538461538464,
      "grad_norm": 0.28202938683817624,
      "learning_rate": 7.925434915194349e-05,
      "loss": 1.4338,
      "step": 165
    },
    {
      "epoch": 0.3962703962703963,
      "grad_norm": 0.310609971889877,
      "learning_rate": 7.913113233725954e-05,
      "loss": 1.4238,
      "step": 170
    },
    {
      "epoch": 0.40792540792540793,
      "grad_norm": 0.27009263717875037,
      "learning_rate": 7.90034115696928e-05,
      "loss": 1.3496,
      "step": 175
    },
    {
      "epoch": 0.4195804195804196,
      "grad_norm": 0.2710853328115736,
      "learning_rate": 7.887120695344898e-05,
      "loss": 1.4584,
      "step": 180
    },
    {
      "epoch": 0.43123543123543123,
      "grad_norm": 0.25152522039701575,
      "learning_rate": 7.873453929852514e-05,
      "loss": 1.3774,
      "step": 185
    },
    {
      "epoch": 0.4428904428904429,
      "grad_norm": 0.2835745821601218,
      "learning_rate": 7.85934301174341e-05,
      "loss": 1.4099,
      "step": 190
    },
    {
      "epoch": 0.45454545454545453,
      "grad_norm": 1.0519805093542818,
      "learning_rate": 7.844790162181818e-05,
      "loss": 1.437,
      "step": 195
    },
    {
      "epoch": 0.4662004662004662,
      "grad_norm": 0.8265825248892074,
      "learning_rate": 7.829797671895288e-05,
      "loss": 1.4522,
      "step": 200
    },
    {
      "epoch": 0.47785547785547783,
      "grad_norm": 0.7862211345050619,
      "learning_rate": 7.814367900814116e-05,
      "loss": 1.4808,
      "step": 205
    },
    {
      "epoch": 0.48951048951048953,
      "grad_norm": 2.729527753162334,
      "learning_rate": 7.79850327769987e-05,
      "loss": 1.3966,
      "step": 210
    },
    {
      "epoch": 0.5011655011655012,
      "grad_norm": 0.5270917240760644,
      "learning_rate": 7.78220629976309e-05,
      "loss": 1.4515,
      "step": 215
    },
    {
      "epoch": 0.5128205128205128,
      "grad_norm": 0.2370434474197243,
      "learning_rate": 7.765479532270198e-05,
      "loss": 1.354,
      "step": 220
    },
    {
      "epoch": 0.5244755244755245,
      "grad_norm": 0.302576280537011,
      "learning_rate": 7.748325608139717e-05,
      "loss": 1.4122,
      "step": 225
    },
    {
      "epoch": 0.5361305361305362,
      "grad_norm": 0.26006318976574305,
      "learning_rate": 7.730747227527824e-05,
      "loss": 1.4065,
      "step": 230
    },
    {
      "epoch": 0.5477855477855478,
      "grad_norm": 0.25033459622615245,
      "learning_rate": 7.712747157403322e-05,
      "loss": 1.3791,
      "step": 235
    },
    {
      "epoch": 0.5594405594405595,
      "grad_norm": 0.22820286614092533,
      "learning_rate": 7.694328231112112e-05,
      "loss": 1.339,
      "step": 240
    },
    {
      "epoch": 0.5710955710955711,
      "grad_norm": 0.2808234577155991,
      "learning_rate": 7.675493347931184e-05,
      "loss": 1.391,
      "step": 245
    },
    {
      "epoch": 0.5827505827505828,
      "grad_norm": 0.248396781368916,
      "learning_rate": 7.656245472612264e-05,
      "loss": 1.3796,
      "step": 250
    },
    {
      "epoch": 0.5944055944055944,
      "grad_norm": 0.24239687758201922,
      "learning_rate": 7.636587634915133e-05,
      "loss": 1.2704,
      "step": 255
    },
    {
      "epoch": 0.6060606060606061,
      "grad_norm": 0.2889579270489845,
      "learning_rate": 7.616522929130724e-05,
      "loss": 1.3738,
      "step": 260
    },
    {
      "epoch": 0.6177156177156177,
      "grad_norm": 0.37773683176723527,
      "learning_rate": 7.596054513594051e-05,
      "loss": 1.2168,
      "step": 265
    },
    {
      "epoch": 0.6293706293706294,
      "grad_norm": 0.2507614413849896,
      "learning_rate": 7.575185610187072e-05,
      "loss": 1.2799,
      "step": 270
    },
    {
      "epoch": 0.6410256410256411,
      "grad_norm": 0.2519925650582622,
      "learning_rate": 7.553919503831533e-05,
      "loss": 1.3737,
      "step": 275
    },
    {
      "epoch": 0.6526806526806527,
      "grad_norm": 0.25693488058335956,
      "learning_rate": 7.532259541971902e-05,
      "loss": 1.3132,
      "step": 280
    },
    {
      "epoch": 0.6643356643356644,
      "grad_norm": 0.2654099769032578,
      "learning_rate": 7.510209134048455e-05,
      "loss": 1.336,
      "step": 285
    },
    {
      "epoch": 0.675990675990676,
      "grad_norm": 0.2991450312535211,
      "learning_rate": 7.4877717509606e-05,
      "loss": 1.3136,
      "step": 290
    },
    {
      "epoch": 0.6876456876456877,
      "grad_norm": 0.7936383565363224,
      "learning_rate": 7.46495092452054e-05,
      "loss": 1.3218,
      "step": 295
    },
    {
      "epoch": 0.6993006993006993,
      "grad_norm": 0.2833344928086715,
      "learning_rate": 7.441750246897328e-05,
      "loss": 1.3775,
      "step": 300
    },
    {
      "epoch": 0.710955710955711,
      "grad_norm": 0.23315264052796472,
      "learning_rate": 7.418173370051446e-05,
      "loss": 1.3401,
      "step": 305
    },
    {
      "epoch": 0.7226107226107226,
      "grad_norm": 0.27055343448065056,
      "learning_rate": 7.394224005159947e-05,
      "loss": 1.3282,
      "step": 310
    },
    {
      "epoch": 0.7342657342657343,
      "grad_norm": 0.2480849022059861,
      "learning_rate": 7.369905922032295e-05,
      "loss": 1.2888,
      "step": 315
    },
    {
      "epoch": 0.745920745920746,
      "grad_norm": 0.31945412606809426,
      "learning_rate": 7.345222948516969e-05,
      "loss": 1.4228,
      "step": 320
    },
    {
      "epoch": 0.7575757575757576,
      "grad_norm": 0.2576143196799571,
      "learning_rate": 7.320178969898926e-05,
      "loss": 1.3058,
      "step": 325
    },
    {
      "epoch": 0.7692307692307693,
      "grad_norm": 0.43520117132833047,
      "learning_rate": 7.294777928288031e-05,
      "loss": 1.3131,
      "step": 330
    },
    {
      "epoch": 0.7808857808857809,
      "grad_norm": 0.2882495350884511,
      "learning_rate": 7.26902382199854e-05,
      "loss": 1.32,
      "step": 335
    },
    {
      "epoch": 0.7925407925407926,
      "grad_norm": 0.33796460872196393,
      "learning_rate": 7.242920704919733e-05,
      "loss": 1.3031,
      "step": 340
    },
    {
      "epoch": 0.8041958041958042,
      "grad_norm": 0.2549144290343736,
      "learning_rate": 7.216472685877808e-05,
      "loss": 1.2656,
      "step": 345
    },
    {
      "epoch": 0.8158508158508159,
      "grad_norm": 0.6420737440392777,
      "learning_rate": 7.189683927989109e-05,
      "loss": 1.3621,
      "step": 350
    },
    {
      "epoch": 0.8275058275058275,
      "grad_norm": 0.25149854473667893,
      "learning_rate": 7.162558648004833e-05,
      "loss": 1.3055,
      "step": 355
    },
    {
      "epoch": 0.8391608391608392,
      "grad_norm": 0.8756690609514122,
      "learning_rate": 7.13510111564727e-05,
      "loss": 1.306,
      "step": 360
    },
    {
      "epoch": 0.8508158508158508,
      "grad_norm": 0.27473178820064537,
      "learning_rate": 7.107315652937733e-05,
      "loss": 1.3125,
      "step": 365
    },
    {
      "epoch": 0.8624708624708625,
      "grad_norm": 5.526242806524436,
      "learning_rate": 7.079206633516216e-05,
      "loss": 1.2443,
      "step": 370
    },
    {
      "epoch": 0.8741258741258742,
      "grad_norm": 0.2859887823856341,
      "learning_rate": 7.050778481952977e-05,
      "loss": 1.3051,
      "step": 375
    },
    {
      "epoch": 0.8857808857808858,
      "grad_norm": 0.26065360686340344,
      "learning_rate": 7.022035673052052e-05,
      "loss": 1.2533,
      "step": 380
    },
    {
      "epoch": 0.8974358974358975,
      "grad_norm": 0.28515419529118324,
      "learning_rate": 6.992982731146909e-05,
      "loss": 1.292,
      "step": 385
    },
    {
      "epoch": 0.9090909090909091,
      "grad_norm": 0.2521847950964824,
      "learning_rate": 6.963624229388268e-05,
      "loss": 1.3149,
      "step": 390
    },
    {
      "epoch": 0.9207459207459208,
      "grad_norm": 0.24378682997123743,
      "learning_rate": 6.933964789024263e-05,
      "loss": 1.299,
      "step": 395
    },
    {
      "epoch": 0.9324009324009324,
      "grad_norm": 0.22312967942400147,
      "learning_rate": 6.904009078673016e-05,
      "loss": 1.2422,
      "step": 400
    },
    {
      "epoch": 0.9440559440559441,
      "grad_norm": 0.22503476475625694,
      "learning_rate": 6.873761813587769e-05,
      "loss": 1.2913,
      "step": 405
    },
    {
      "epoch": 0.9557109557109557,
      "grad_norm": 0.22697946460375637,
      "learning_rate": 6.843227754914657e-05,
      "loss": 1.29,
      "step": 410
    },
    {
      "epoch": 0.9673659673659674,
      "grad_norm": 0.3144384632248903,
      "learning_rate": 6.812411708943284e-05,
      "loss": 1.2395,
      "step": 415
    },
    {
      "epoch": 0.9790209790209791,
      "grad_norm": 0.24364089502734793,
      "learning_rate": 6.781318526350156e-05,
      "loss": 1.4002,
      "step": 420
    },
    {
      "epoch": 0.9906759906759907,
      "grad_norm": 0.23043000137266378,
      "learning_rate": 6.749953101435168e-05,
      "loss": 1.2886,
      "step": 425
    },
    {
      "epoch": 0.9976689976689976,
      "eval_loss": 1.346985936164856,
      "eval_runtime": 41.4179,
      "eval_samples_per_second": 1.69,
      "eval_steps_per_second": 0.217,
      "step": 428
    },
    {
      "epoch": 1.0023310023310024,
      "grad_norm": 0.6370362893316296,
      "learning_rate": 6.718320371351193e-05,
      "loss": 1.2846,
      "step": 430
    },
    {
      "epoch": 1.013986013986014,
      "grad_norm": 0.26814660795194867,
      "learning_rate": 6.686425315326941e-05,
      "loss": 1.0865,
      "step": 435
    },
    {
      "epoch": 1.0256410256410255,
      "grad_norm": 0.23807462749178174,
      "learning_rate": 6.654272953883189e-05,
      "loss": 1.1612,
      "step": 440
    },
    {
      "epoch": 1.0372960372960374,
      "grad_norm": 0.23960120741081306,
      "learning_rate": 6.621868348042517e-05,
      "loss": 1.1393,
      "step": 445
    },
    {
      "epoch": 1.048951048951049,
      "grad_norm": 0.2601814951485897,
      "learning_rate": 6.58921659853266e-05,
      "loss": 1.2539,
      "step": 450
    },
    {
      "epoch": 1.0606060606060606,
      "grad_norm": 0.2489367328385621,
      "learning_rate": 6.55632284498362e-05,
      "loss": 1.1934,
      "step": 455
    },
    {
      "epoch": 1.0722610722610724,
      "grad_norm": 0.26068816228316616,
      "learning_rate": 6.523192265118652e-05,
      "loss": 1.252,
      "step": 460
    },
    {
      "epoch": 1.083916083916084,
      "grad_norm": 0.2681555770705724,
      "learning_rate": 6.489830073939237e-05,
      "loss": 1.2231,
      "step": 465
    },
    {
      "epoch": 1.0955710955710956,
      "grad_norm": 0.24692924998707827,
      "learning_rate": 6.456241522904223e-05,
      "loss": 1.1952,
      "step": 470
    },
    {
      "epoch": 1.1072261072261071,
      "grad_norm": 0.24530619885017255,
      "learning_rate": 6.422431899103189e-05,
      "loss": 1.1665,
      "step": 475
    },
    {
      "epoch": 1.118881118881119,
      "grad_norm": 0.2677189175766508,
      "learning_rate": 6.388406524424222e-05,
      "loss": 1.176,
      "step": 480
    },
    {
      "epoch": 1.1305361305361306,
      "grad_norm": 0.2411887554782912,
      "learning_rate": 6.35417075471622e-05,
      "loss": 1.1704,
      "step": 485
    },
    {
      "epoch": 1.1421911421911422,
      "grad_norm": 0.2541256840409212,
      "learning_rate": 6.319729978945832e-05,
      "loss": 1.1782,
      "step": 490
    },
    {
      "epoch": 1.1538461538461537,
      "grad_norm": 0.24850101976269176,
      "learning_rate": 6.285089618349196e-05,
      "loss": 1.1847,
      "step": 495
    },
    {
      "epoch": 1.1655011655011656,
      "grad_norm": 0.2672134029476264,
      "learning_rate": 6.250255125578597e-05,
      "loss": 1.0407,
      "step": 500
    },
    {
      "epoch": 1.1655011655011656,
      "eval_loss": 1.335543155670166,
      "eval_runtime": 41.117,
      "eval_samples_per_second": 1.702,
      "eval_steps_per_second": 0.219,
      "step": 500
    },
    {
      "epoch": 1.1771561771561772,
      "grad_norm": 0.24056303633515866,
      "learning_rate": 6.21523198384418e-05,
      "loss": 1.223,
      "step": 505
    },
    {
      "epoch": 1.1888111888111887,
      "grad_norm": 0.25656987276961235,
      "learning_rate": 6.18002570605085e-05,
      "loss": 1.1589,
      "step": 510
    },
    {
      "epoch": 1.2004662004662006,
      "grad_norm": 0.25884269499583334,
      "learning_rate": 6.144641833930498e-05,
      "loss": 1.1185,
      "step": 515
    },
    {
      "epoch": 1.2121212121212122,
      "grad_norm": 0.24120488400409848,
      "learning_rate": 6.109085937169695e-05,
      "loss": 1.2347,
      "step": 520
    },
    {
      "epoch": 1.2237762237762237,
      "grad_norm": 0.24337953823169015,
      "learning_rate": 6.0733636125329776e-05,
      "loss": 1.0777,
      "step": 525
    },
    {
      "epoch": 1.2354312354312353,
      "grad_norm": 0.2553344373947113,
      "learning_rate": 6.0374804829818786e-05,
      "loss": 1.1297,
      "step": 530
    },
    {
      "epoch": 1.2470862470862472,
      "grad_norm": 0.26618655203985797,
      "learning_rate": 6.001442196789827e-05,
      "loss": 1.1129,
      "step": 535
    },
    {
      "epoch": 1.2587412587412588,
      "grad_norm": 0.2631162852747018,
      "learning_rate": 5.965254426653072e-05,
      "loss": 1.2239,
      "step": 540
    },
    {
      "epoch": 1.2703962703962703,
      "grad_norm": 0.23274413800511448,
      "learning_rate": 5.928922868797752e-05,
      "loss": 1.103,
      "step": 545
    },
    {
      "epoch": 1.282051282051282,
      "grad_norm": 0.23157161304838858,
      "learning_rate": 5.892453242083273e-05,
      "loss": 1.0679,
      "step": 550
    },
    {
      "epoch": 1.2937062937062938,
      "grad_norm": 0.28740045332560504,
      "learning_rate": 5.855851287102113e-05,
      "loss": 1.1891,
      "step": 555
    },
    {
      "epoch": 1.3053613053613053,
      "grad_norm": 0.2970846582151576,
      "learning_rate": 5.81912276527621e-05,
      "loss": 1.1354,
      "step": 560
    },
    {
      "epoch": 1.317016317016317,
      "grad_norm": 0.2630829061530002,
      "learning_rate": 5.7822734579500705e-05,
      "loss": 1.1793,
      "step": 565
    },
    {
      "epoch": 1.3286713286713288,
      "grad_norm": 0.24952577498784723,
      "learning_rate": 5.745309165480747e-05,
      "loss": 1.0412,
      "step": 570
    },
    {
      "epoch": 1.3403263403263403,
      "grad_norm": 0.29261484653650605,
      "learning_rate": 5.7082357063248116e-05,
      "loss": 1.1266,
      "step": 575
    },
    {
      "epoch": 1.351981351981352,
      "grad_norm": 0.27423549595827834,
      "learning_rate": 5.671058916122493e-05,
      "loss": 1.1526,
      "step": 580
    },
    {
      "epoch": 1.3636363636363638,
      "grad_norm": 0.26193784576015794,
      "learning_rate": 5.6337846467790995e-05,
      "loss": 1.1818,
      "step": 585
    },
    {
      "epoch": 1.3752913752913754,
      "grad_norm": 0.2536711363621732,
      "learning_rate": 5.596418765543887e-05,
      "loss": 1.1099,
      "step": 590
    },
    {
      "epoch": 1.386946386946387,
      "grad_norm": 0.25809295106016567,
      "learning_rate": 5.55896715408651e-05,
      "loss": 1.1281,
      "step": 595
    },
    {
      "epoch": 1.3986013986013985,
      "grad_norm": 0.2964280224538696,
      "learning_rate": 5.521435707571199e-05,
      "loss": 1.1209,
      "step": 600
    },
    {
      "epoch": 1.4102564102564101,
      "grad_norm": 0.2535619637821886,
      "learning_rate": 5.483830333728829e-05,
      "loss": 1.0736,
      "step": 605
    },
    {
      "epoch": 1.421911421911422,
      "grad_norm": 0.2834714269145168,
      "learning_rate": 5.4461569519269803e-05,
      "loss": 1.0904,
      "step": 610
    },
    {
      "epoch": 1.4335664335664335,
      "grad_norm": 0.26457948671956855,
      "learning_rate": 5.4084214922382015e-05,
      "loss": 1.1718,
      "step": 615
    },
    {
      "epoch": 1.4452214452214451,
      "grad_norm": 0.30016622504388313,
      "learning_rate": 5.370629894506561e-05,
      "loss": 1.1222,
      "step": 620
    },
    {
      "epoch": 1.456876456876457,
      "grad_norm": 0.24560811509210745,
      "learning_rate": 5.332788107412684e-05,
      "loss": 1.1179,
      "step": 625
    },
    {
      "epoch": 1.4685314685314685,
      "grad_norm": 0.27129656920581763,
      "learning_rate": 5.294902087537369e-05,
      "loss": 1.1379,
      "step": 630
    },
    {
      "epoch": 1.4801864801864801,
      "grad_norm": 0.26155557293622045,
      "learning_rate": 5.256977798423988e-05,
      "loss": 1.1354,
      "step": 635
    },
    {
      "epoch": 1.491841491841492,
      "grad_norm": 0.2542515790861205,
      "learning_rate": 5.2190212096397825e-05,
      "loss": 1.0926,
      "step": 640
    },
    {
      "epoch": 1.5034965034965035,
      "grad_norm": 0.2846310062143532,
      "learning_rate": 5.181038295836196e-05,
      "loss": 1.206,
      "step": 645
    },
    {
      "epoch": 1.5151515151515151,
      "grad_norm": 0.2578892076658671,
      "learning_rate": 5.143035035808435e-05,
      "loss": 1.1348,
      "step": 650
    },
    {
      "epoch": 1.526806526806527,
      "grad_norm": 0.2677065667604314,
      "learning_rate": 5.1050174115543476e-05,
      "loss": 1.1376,
      "step": 655
    },
    {
      "epoch": 1.5384615384615383,
      "grad_norm": 0.2623622010580359,
      "learning_rate": 5.066991407332825e-05,
      "loss": 1.147,
      "step": 660
    },
    {
      "epoch": 1.5501165501165501,
      "grad_norm": 0.2924192984321144,
      "learning_rate": 5.028963008721822e-05,
      "loss": 1.2343,
      "step": 665
    },
    {
      "epoch": 1.5617715617715617,
      "grad_norm": 0.25879469972966473,
      "learning_rate": 4.990938201676194e-05,
      "loss": 1.1271,
      "step": 670
    },
    {
      "epoch": 1.5734265734265733,
      "grad_norm": 0.2738138873868371,
      "learning_rate": 4.952922971585451e-05,
      "loss": 1.0558,
      "step": 675
    },
    {
      "epoch": 1.5850815850815851,
      "grad_norm": 0.24373153596626246,
      "learning_rate": 4.914923302331625e-05,
      "loss": 1.1583,
      "step": 680
    },
    {
      "epoch": 1.5967365967365967,
      "grad_norm": 0.26700970389535894,
      "learning_rate": 4.87694517534735e-05,
      "loss": 1.0228,
      "step": 685
    },
    {
      "epoch": 1.6083916083916083,
      "grad_norm": 0.29201876514809744,
      "learning_rate": 4.838994568674351e-05,
      "loss": 1.1897,
      "step": 690
    },
    {
      "epoch": 1.6200466200466201,
      "grad_norm": 0.2731368722942658,
      "learning_rate": 4.801077456022443e-05,
      "loss": 1.2347,
      "step": 695
    },
    {
      "epoch": 1.6317016317016317,
      "grad_norm": 0.24941385151182885,
      "learning_rate": 4.763199805829236e-05,
      "loss": 1.0522,
      "step": 700
    },
    {
      "epoch": 1.6433566433566433,
      "grad_norm": 0.2601913077602386,
      "learning_rate": 4.7253675803206544e-05,
      "loss": 1.1047,
      "step": 705
    },
    {
      "epoch": 1.6550116550116551,
      "grad_norm": 0.2627466378538171,
      "learning_rate": 4.687586734572431e-05,
      "loss": 1.2039,
      "step": 710
    },
    {
      "epoch": 1.6666666666666665,
      "grad_norm": 0.33292973388136243,
      "learning_rate": 4.649863215572747e-05,
      "loss": 1.0949,
      "step": 715
    },
    {
      "epoch": 1.6783216783216783,
      "grad_norm": 0.28930444872767674,
      "learning_rate": 4.612202961286117e-05,
      "loss": 1.1411,
      "step": 720
    },
    {
      "epoch": 1.68997668997669,
      "grad_norm": 0.2811673269560653,
      "learning_rate": 4.574611899718721e-05,
      "loss": 1.1188,
      "step": 725
    },
    {
      "epoch": 1.7016317016317015,
      "grad_norm": 0.2624778144533447,
      "learning_rate": 4.537095947985282e-05,
      "loss": 1.2099,
      "step": 730
    },
    {
      "epoch": 1.7132867132867133,
      "grad_norm": 0.31909240386960064,
      "learning_rate": 4.499661011377677e-05,
      "loss": 1.1953,
      "step": 735
    },
    {
      "epoch": 1.724941724941725,
      "grad_norm": 0.45177444374511333,
      "learning_rate": 4.46231298243539e-05,
      "loss": 1.1496,
      "step": 740
    },
    {
      "epoch": 1.7365967365967365,
      "grad_norm": 0.3304135322652554,
      "learning_rate": 4.425057740017993e-05,
      "loss": 1.1407,
      "step": 745
    },
    {
      "epoch": 1.7482517482517483,
      "grad_norm": 0.26814860174828314,
      "learning_rate": 4.38790114837976e-05,
      "loss": 1.0907,
      "step": 750
    },
    {
      "epoch": 1.75990675990676,
      "grad_norm": 0.29715139148195335,
      "learning_rate": 4.350849056246595e-05,
      "loss": 1.1766,
      "step": 755
    },
    {
      "epoch": 1.7715617715617715,
      "grad_norm": 0.274851580699654,
      "learning_rate": 4.313907295895397e-05,
      "loss": 1.1497,
      "step": 760
    },
    {
      "epoch": 1.7832167832167833,
      "grad_norm": 0.30500141557968397,
      "learning_rate": 4.277081682236013e-05,
      "loss": 1.0404,
      "step": 765
    },
    {
      "epoch": 1.7948717948717947,
      "grad_norm": 2.9176458727968746,
      "learning_rate": 4.240378011895935e-05,
      "loss": 1.0777,
      "step": 770
    },
    {
      "epoch": 1.8065268065268065,
      "grad_norm": 0.32004127534889776,
      "learning_rate": 4.2038020623078596e-05,
      "loss": 1.1521,
      "step": 775
    },
    {
      "epoch": 1.8181818181818183,
      "grad_norm": 0.38996610677191895,
      "learning_rate": 4.1673595908002826e-05,
      "loss": 1.0999,
      "step": 780
    },
    {
      "epoch": 1.8298368298368297,
      "grad_norm": 0.39155093719025963,
      "learning_rate": 4.131056333691247e-05,
      "loss": 1.1836,
      "step": 785
    },
    {
      "epoch": 1.8414918414918415,
      "grad_norm": 0.2765149318590016,
      "learning_rate": 4.094898005385408e-05,
      "loss": 1.1018,
      "step": 790
    },
    {
      "epoch": 1.8531468531468531,
      "grad_norm": 0.22839762225047938,
      "learning_rate": 4.058890297474543e-05,
      "loss": 1.1704,
      "step": 795
    },
    {
      "epoch": 1.8648018648018647,
      "grad_norm": 0.269742896374976,
      "learning_rate": 4.023038877841649e-05,
      "loss": 1.0703,
      "step": 800
    },
    {
      "epoch": 1.8764568764568765,
      "grad_norm": 0.29602473720606215,
      "learning_rate": 3.987349389768777e-05,
      "loss": 1.1029,
      "step": 805
    },
    {
      "epoch": 1.8881118881118881,
      "grad_norm": 0.2609207540513949,
      "learning_rate": 3.951827451048737e-05,
      "loss": 1.2367,
      "step": 810
    },
    {
      "epoch": 1.8997668997668997,
      "grad_norm": 0.2508364860523673,
      "learning_rate": 3.916478653100816e-05,
      "loss": 1.0849,
      "step": 815
    },
    {
      "epoch": 1.9114219114219115,
      "grad_norm": 1.2856984958855535,
      "learning_rate": 3.881308560090648e-05,
      "loss": 1.1273,
      "step": 820
    },
    {
      "epoch": 1.9230769230769231,
      "grad_norm": 0.24270883363297913,
      "learning_rate": 3.846322708054368e-05,
      "loss": 1.1128,
      "step": 825
    },
    {
      "epoch": 1.9347319347319347,
      "grad_norm": 0.26881059124314843,
      "learning_rate": 3.811526604027204e-05,
      "loss": 1.2133,
      "step": 830
    },
    {
      "epoch": 1.9463869463869465,
      "grad_norm": 0.2878909720257614,
      "learning_rate": 3.7769257251766225e-05,
      "loss": 1.1629,
      "step": 835
    },
    {
      "epoch": 1.958041958041958,
      "grad_norm": 0.31812523517669944,
      "learning_rate": 3.742525517940187e-05,
      "loss": 1.149,
      "step": 840
    },
    {
      "epoch": 1.9696969696969697,
      "grad_norm": 0.27576941685401857,
      "learning_rate": 3.708331397168247e-05,
      "loss": 1.1484,
      "step": 845
    },
    {
      "epoch": 1.9813519813519813,
      "grad_norm": 0.27263251333241223,
      "learning_rate": 3.674348745271595e-05,
      "loss": 1.1931,
      "step": 850
    },
    {
      "epoch": 1.993006993006993,
      "grad_norm": 0.263716889490106,
      "learning_rate": 3.6405829113742405e-05,
      "loss": 1.1308,
      "step": 855
    },
    {
      "epoch": 1.9953379953379953,
      "eval_loss": 1.2667760848999023,
      "eval_runtime": 41.1775,
      "eval_samples_per_second": 1.7,
      "eval_steps_per_second": 0.219,
      "step": 856
    }
  ],
  "logging_steps": 5,
  "max_steps": 1287,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 887872819298304.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}
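
This is the `trainer_state.json` that the Hugging Face Transformers `Trainer` writes alongside each checkpoint; `log_history` interleaves training records (`loss`, `grad_norm`, `learning_rate`) with evaluation records (`eval_loss` plus throughput). A minimal sketch of reading it back, assuming the file is stored locally as `trainer_state.json` (the path and the printed summary are illustrative, not part of the original output):

```python
import json

# Assumed local path; the original record does not say where the file lives.
with open("trainer_state.json") as f:
    state = json.load(f)

# log_history mixes training records (keyed by "loss") and eval records (keyed by "eval_loss").
train_log = [e for e in state["log_history"] if "loss" in e]
eval_log = [e for e in state["log_history"] if "eval_loss" in e]

print(f"{len(train_log)} training points logged up to global step {state['global_step']}")
print(f"final training loss at step {train_log[-1]['step']}: {train_log[-1]['loss']}")
for e in eval_log:
    print(f"step {e['step']:>4}: eval_loss {e['eval_loss']:.4f}")
```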