{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 4.999558732680258,
  "eval_steps": 1000,
  "global_step": 28325,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.008825346394845997,
      "grad_norm": 4.154237270355225,
      "learning_rate": 4.9915269196822594e-05,
      "loss": 1.761,
      "step": 50
    },
    {
      "epoch": 0.017650692789691995,
      "grad_norm": 5.022342681884766,
      "learning_rate": 4.98270079435128e-05,
      "loss": 1.1951,
      "step": 100
    },
    {
      "epoch": 0.026476039184537992,
      "grad_norm": 4.0111613273620605,
      "learning_rate": 4.9738746690203006e-05,
      "loss": 1.1727,
      "step": 150
    },
    {
      "epoch": 0.03530138557938399,
      "grad_norm": 4.008307933807373,
      "learning_rate": 4.9650485436893205e-05,
      "loss": 1.0397,
      "step": 200
    },
    {
      "epoch": 0.04412673197422999,
      "grad_norm": 3.561506748199463,
      "learning_rate": 4.956222418358341e-05,
      "loss": 1.1023,
      "step": 250
    },
    {
      "epoch": 0.052952078369075985,
      "grad_norm": 3.71460223197937,
      "learning_rate": 4.947396293027361e-05,
      "loss": 1.0662,
      "step": 300
    },
    {
      "epoch": 0.061777424763921986,
      "grad_norm": 3.7053279876708984,
      "learning_rate": 4.9385701676963815e-05,
      "loss": 1.0294,
      "step": 350
    },
    {
      "epoch": 0.07060277115876798,
      "grad_norm": 4.443565845489502,
      "learning_rate": 4.929744042365402e-05,
      "loss": 1.0758,
      "step": 400
    },
    {
      "epoch": 0.07942811755361398,
      "grad_norm": 3.3921597003936768,
      "learning_rate": 4.920917917034422e-05,
      "loss": 0.979,
      "step": 450
    },
    {
      "epoch": 0.08825346394845998,
      "grad_norm": 3.324091672897339,
      "learning_rate": 4.9120917917034426e-05,
      "loss": 1.0505,
      "step": 500
    },
    {
      "epoch": 0.09707881034330597,
      "grad_norm": 3.984813690185547,
      "learning_rate": 4.903265666372463e-05,
      "loss": 1.0452,
      "step": 550
    },
    {
      "epoch": 0.10590415673815197,
      "grad_norm": 4.010895729064941,
      "learning_rate": 4.894439541041483e-05,
      "loss": 1.0423,
      "step": 600
    },
    {
      "epoch": 0.11472950313299797,
      "grad_norm": 4.561792373657227,
      "learning_rate": 4.885613415710503e-05,
      "loss": 1.042,
      "step": 650
    },
    {
      "epoch": 0.12355484952784397,
      "grad_norm": 3.540585517883301,
      "learning_rate": 4.8767872903795235e-05,
      "loss": 1.0458,
      "step": 700
    },
    {
      "epoch": 0.13238019592268996,
      "grad_norm": 3.182912588119507,
      "learning_rate": 4.867961165048544e-05,
      "loss": 1.0393,
      "step": 750
    },
    {
      "epoch": 0.14120554231753596,
      "grad_norm": 3.2969844341278076,
      "learning_rate": 4.8591350397175647e-05,
      "loss": 1.0733,
      "step": 800
    },
    {
      "epoch": 0.15003088871238196,
      "grad_norm": 3.535918712615967,
      "learning_rate": 4.8503089143865845e-05,
      "loss": 0.9854,
      "step": 850
    },
    {
      "epoch": 0.15885623510722796,
      "grad_norm": 4.074019432067871,
      "learning_rate": 4.8414827890556044e-05,
      "loss": 1.0172,
      "step": 900
    },
    {
      "epoch": 0.16768158150207396,
      "grad_norm": 2.4251832962036133,
      "learning_rate": 4.832656663724625e-05,
      "loss": 0.9895,
      "step": 950
    },
    {
      "epoch": 0.17650692789691996,
      "grad_norm": 2.9533393383026123,
      "learning_rate": 4.8238305383936456e-05,
      "loss": 1.0214,
      "step": 1000
    },
    {
      "epoch": 0.17650692789691996,
      "eval_loss": 0.9503015279769897,
      "eval_runtime": 181.6386,
      "eval_samples_per_second": 155.953,
      "eval_steps_per_second": 15.597,
      "step": 1000
    },
    {
      "epoch": 0.18533227429176596,
      "grad_norm": 3.5368824005126953,
      "learning_rate": 4.815004413062666e-05,
      "loss": 0.9633,
      "step": 1050
    },
    {
      "epoch": 0.19415762068661194,
      "grad_norm": 3.8759989738464355,
      "learning_rate": 4.806178287731686e-05,
      "loss": 0.9748,
      "step": 1100
    },
    {
      "epoch": 0.20298296708145794,
      "grad_norm": 3.02571177482605,
      "learning_rate": 4.797352162400706e-05,
      "loss": 0.9824,
      "step": 1150
    },
    {
      "epoch": 0.21180831347630394,
      "grad_norm": 4.157599449157715,
      "learning_rate": 4.7885260370697265e-05,
      "loss": 0.9452,
      "step": 1200
    },
    {
      "epoch": 0.22063365987114994,
      "grad_norm": 3.4217748641967773,
      "learning_rate": 4.779699911738747e-05,
      "loss": 0.9668,
      "step": 1250
    },
    {
      "epoch": 0.22945900626599594,
      "grad_norm": 3.7693021297454834,
      "learning_rate": 4.770873786407768e-05,
      "loss": 0.9758,
      "step": 1300
    },
    {
      "epoch": 0.23828435266084194,
      "grad_norm": 4.903484344482422,
      "learning_rate": 4.7620476610767876e-05,
      "loss": 0.9817,
      "step": 1350
    },
    {
      "epoch": 0.24710969905568794,
      "grad_norm": 3.7475759983062744,
      "learning_rate": 4.7532215357458075e-05,
      "loss": 0.968,
      "step": 1400
    },
    {
      "epoch": 0.2559350454505339,
      "grad_norm": 2.6870646476745605,
      "learning_rate": 4.744395410414828e-05,
      "loss": 0.9143,
      "step": 1450
    },
    {
      "epoch": 0.2647603918453799,
      "grad_norm": 3.0536086559295654,
      "learning_rate": 4.7355692850838486e-05,
      "loss": 0.9469,
      "step": 1500
    },
    {
      "epoch": 0.2735857382402259,
      "grad_norm": 3.0142056941986084,
      "learning_rate": 4.726743159752869e-05,
      "loss": 0.9072,
      "step": 1550
    },
    {
      "epoch": 0.2824110846350719,
      "grad_norm": 2.2967958450317383,
      "learning_rate": 4.717917034421889e-05,
      "loss": 0.9582,
      "step": 1600
    },
    {
      "epoch": 0.2912364310299179,
      "grad_norm": 3.881225824356079,
      "learning_rate": 4.709090909090909e-05,
      "loss": 0.9517,
      "step": 1650
    },
    {
      "epoch": 0.3000617774247639,
      "grad_norm": 3.5175845623016357,
      "learning_rate": 4.7002647837599295e-05,
      "loss": 0.9889,
      "step": 1700
    },
    {
      "epoch": 0.3088871238196099,
      "grad_norm": 2.7161202430725098,
      "learning_rate": 4.69143865842895e-05,
      "loss": 0.8666,
      "step": 1750
    },
    {
      "epoch": 0.3177124702144559,
      "grad_norm": 3.3932855129241943,
      "learning_rate": 4.68261253309797e-05,
      "loss": 0.908,
      "step": 1800
    },
    {
      "epoch": 0.3265378166093019,
      "grad_norm": 2.1355509757995605,
      "learning_rate": 4.6737864077669906e-05,
      "loss": 0.9444,
      "step": 1850
    },
    {
      "epoch": 0.3353631630041479,
      "grad_norm": 3.1375510692596436,
      "learning_rate": 4.6649602824360105e-05,
      "loss": 0.8773,
      "step": 1900
    },
    {
      "epoch": 0.3441885093989939,
      "grad_norm": 3.356266975402832,
      "learning_rate": 4.656134157105031e-05,
      "loss": 0.9471,
      "step": 1950
    },
    {
      "epoch": 0.3530138557938399,
      "grad_norm": 3.24814510345459,
      "learning_rate": 4.6473080317740516e-05,
      "loss": 0.9159,
      "step": 2000
    },
    {
      "epoch": 0.3530138557938399,
      "eval_loss": 0.8810886144638062,
      "eval_runtime": 181.6697,
      "eval_samples_per_second": 155.926,
      "eval_steps_per_second": 15.594,
      "step": 2000
    },
    {
      "epoch": 0.3618392021886859,
      "grad_norm": 2.7671000957489014,
      "learning_rate": 4.6384819064430715e-05,
      "loss": 0.8957,
      "step": 2050
    },
    {
      "epoch": 0.3706645485835319,
      "grad_norm": 3.4630813598632812,
      "learning_rate": 4.629655781112092e-05,
      "loss": 0.9142,
      "step": 2100
    },
    {
      "epoch": 0.37948989497837793,
      "grad_norm": 3.2977519035339355,
      "learning_rate": 4.620829655781112e-05,
      "loss": 0.8814,
      "step": 2150
    },
    {
      "epoch": 0.3883152413732239,
      "grad_norm": 2.398122787475586,
      "learning_rate": 4.6120035304501326e-05,
      "loss": 0.8989,
      "step": 2200
    },
    {
      "epoch": 0.3971405877680699,
      "grad_norm": 2.642117500305176,
      "learning_rate": 4.603177405119153e-05,
      "loss": 0.9322,
      "step": 2250
    },
    {
      "epoch": 0.4059659341629159,
      "grad_norm": 3.492455005645752,
      "learning_rate": 4.594351279788173e-05,
      "loss": 0.9167,
      "step": 2300
    },
    {
      "epoch": 0.4147912805577619,
      "grad_norm": 3.0943338871002197,
      "learning_rate": 4.5855251544571936e-05,
      "loss": 0.9265,
      "step": 2350
    },
    {
      "epoch": 0.4236166269526079,
      "grad_norm": 3.072920322418213,
      "learning_rate": 4.576699029126214e-05,
      "loss": 0.9321,
      "step": 2400
    },
    {
      "epoch": 0.4324419733474539,
      "grad_norm": 2.8082592487335205,
      "learning_rate": 4.567872903795234e-05,
      "loss": 0.8699,
      "step": 2450
    },
    {
      "epoch": 0.4412673197422999,
      "grad_norm": 3.1018335819244385,
      "learning_rate": 4.5590467784642546e-05,
      "loss": 0.8869,
      "step": 2500
    },
    {
      "epoch": 0.4500926661371459,
      "grad_norm": 3.1571199893951416,
      "learning_rate": 4.5502206531332745e-05,
      "loss": 1.0116,
      "step": 2550
    },
    {
      "epoch": 0.4589180125319919,
      "grad_norm": 3.358121871948242,
      "learning_rate": 4.541394527802295e-05,
      "loss": 0.9374,
      "step": 2600
    },
    {
      "epoch": 0.4677433589268379,
      "grad_norm": 2.648608684539795,
      "learning_rate": 4.532568402471316e-05,
      "loss": 0.8901,
      "step": 2650
    },
    {
      "epoch": 0.4765687053216839,
      "grad_norm": 3.6951816082000732,
      "learning_rate": 4.5237422771403356e-05,
      "loss": 0.8871,
      "step": 2700
    },
    {
      "epoch": 0.4853940517165299,
      "grad_norm": 2.734888792037964,
      "learning_rate": 4.5149161518093555e-05,
      "loss": 0.8735,
      "step": 2750
    },
    {
      "epoch": 0.4942193981113759,
      "grad_norm": 2.99570369720459,
      "learning_rate": 4.506090026478376e-05,
      "loss": 0.8863,
      "step": 2800
    },
    {
      "epoch": 0.5030447445062218,
      "grad_norm": 3.6677873134613037,
      "learning_rate": 4.4972639011473966e-05,
      "loss": 0.9086,
      "step": 2850
    },
    {
      "epoch": 0.5118700909010678,
      "grad_norm": 2.688584804534912,
      "learning_rate": 4.488437775816417e-05,
      "loss": 0.9028,
      "step": 2900
    },
    {
      "epoch": 0.5206954372959138,
      "grad_norm": 3.2178659439086914,
      "learning_rate": 4.479611650485437e-05,
      "loss": 0.9089,
      "step": 2950
    },
    {
      "epoch": 0.5295207836907598,
      "grad_norm": 2.8180041313171387,
      "learning_rate": 4.470785525154457e-05,
      "loss": 0.8956,
      "step": 3000
    },
    {
      "epoch": 0.5295207836907598,
      "eval_loss": 0.8461005687713623,
      "eval_runtime": 181.6855,
      "eval_samples_per_second": 155.912,
      "eval_steps_per_second": 15.593,
      "step": 3000
    },
    {
      "epoch": 0.5383461300856058,
      "grad_norm": 3.1606943607330322,
      "learning_rate": 4.4619593998234776e-05,
      "loss": 0.8886,
      "step": 3050
    },
    {
      "epoch": 0.5471714764804518,
      "grad_norm": 2.718230724334717,
      "learning_rate": 4.453133274492498e-05,
      "loss": 0.8704,
      "step": 3100
    },
    {
      "epoch": 0.5559968228752978,
      "grad_norm": 3.1943600177764893,
      "learning_rate": 4.444307149161519e-05,
      "loss": 0.9335,
      "step": 3150
    },
    {
      "epoch": 0.5648221692701438,
      "grad_norm": 3.1927852630615234,
      "learning_rate": 4.4354810238305386e-05,
      "loss": 0.89,
      "step": 3200
    },
    {
      "epoch": 0.5736475156649898,
      "grad_norm": 3.269458293914795,
      "learning_rate": 4.4266548984995585e-05,
      "loss": 0.8806,
      "step": 3250
    },
    {
      "epoch": 0.5824728620598358,
      "grad_norm": 3.280489921569824,
      "learning_rate": 4.417828773168579e-05,
      "loss": 0.8952,
      "step": 3300
    },
    {
      "epoch": 0.5912982084546818,
      "grad_norm": 3.4834930896759033,
      "learning_rate": 4.4090026478375996e-05,
      "loss": 0.8698,
      "step": 3350
    },
    {
      "epoch": 0.6001235548495278,
      "grad_norm": 2.7703258991241455,
      "learning_rate": 4.40017652250662e-05,
      "loss": 0.8516,
      "step": 3400
    },
    {
      "epoch": 0.6089489012443738,
      "grad_norm": 3.2723336219787598,
      "learning_rate": 4.39135039717564e-05,
      "loss": 0.8564,
      "step": 3450
    },
    {
      "epoch": 0.6177742476392198,
      "grad_norm": 2.9629292488098145,
      "learning_rate": 4.38252427184466e-05,
      "loss": 0.8509,
      "step": 3500
    },
    {
      "epoch": 0.6265995940340658,
      "grad_norm": 2.5405166149139404,
      "learning_rate": 4.3736981465136806e-05,
      "loss": 0.875,
      "step": 3550
    },
    {
      "epoch": 0.6354249404289118,
      "grad_norm": 2.6754391193389893,
      "learning_rate": 4.364872021182701e-05,
      "loss": 0.8813,
      "step": 3600
    },
    {
      "epoch": 0.6442502868237578,
      "grad_norm": 3.0264227390289307,
      "learning_rate": 4.356045895851722e-05,
      "loss": 0.8518,
      "step": 3650
    },
    {
      "epoch": 0.6530756332186038,
      "grad_norm": 2.8763012886047363,
      "learning_rate": 4.3472197705207416e-05,
      "loss": 0.8272,
      "step": 3700
    },
    {
      "epoch": 0.6619009796134498,
      "grad_norm": 2.9555325508117676,
      "learning_rate": 4.3383936451897615e-05,
      "loss": 0.9122,
      "step": 3750
    },
    {
      "epoch": 0.6707263260082958,
      "grad_norm": 3.2128372192382812,
      "learning_rate": 4.329567519858782e-05,
      "loss": 0.825,
      "step": 3800
    },
    {
      "epoch": 0.6795516724031418,
      "grad_norm": 2.1403987407684326,
      "learning_rate": 4.320741394527803e-05,
      "loss": 0.8808,
      "step": 3850
    },
    {
      "epoch": 0.6883770187979878,
      "grad_norm": 4.011631965637207,
      "learning_rate": 4.3119152691968226e-05,
      "loss": 0.8071,
      "step": 3900
    },
    {
      "epoch": 0.6972023651928339,
      "grad_norm": 2.8009603023529053,
      "learning_rate": 4.303089143865843e-05,
      "loss": 0.8236,
      "step": 3950
    },
    {
      "epoch": 0.7060277115876799,
      "grad_norm": 2.3197402954101562,
      "learning_rate": 4.294263018534863e-05,
      "loss": 0.9126,
      "step": 4000
    },
    {
      "epoch": 0.7060277115876799,
      "eval_loss": 0.8137471675872803,
      "eval_runtime": 181.6839,
      "eval_samples_per_second": 155.914,
      "eval_steps_per_second": 15.593,
      "step": 4000
    },
    {
      "epoch": 0.7148530579825259,
      "grad_norm": 2.557133913040161,
      "learning_rate": 4.2854368932038836e-05,
      "loss": 0.8425,
      "step": 4050
    },
    {
      "epoch": 0.7236784043773719,
      "grad_norm": 3.1375222206115723,
      "learning_rate": 4.276610767872904e-05,
      "loss": 0.8095,
      "step": 4100
    },
    {
      "epoch": 0.7325037507722179,
      "grad_norm": 3.2000410556793213,
      "learning_rate": 4.267784642541924e-05,
      "loss": 0.8318,
      "step": 4150
    },
    {
      "epoch": 0.7413290971670639,
      "grad_norm": 2.4913811683654785,
      "learning_rate": 4.2589585172109446e-05,
      "loss": 0.8729,
      "step": 4200
    },
    {
      "epoch": 0.7501544435619099,
      "grad_norm": 2.605590581893921,
      "learning_rate": 4.250132391879965e-05,
      "loss": 0.7657,
      "step": 4250
    },
    {
      "epoch": 0.7589797899567559,
      "grad_norm": 2.879939317703247,
      "learning_rate": 4.241306266548985e-05,
      "loss": 0.8319,
      "step": 4300
    },
    {
      "epoch": 0.7678051363516017,
      "grad_norm": 3.4146721363067627,
      "learning_rate": 4.232480141218006e-05,
      "loss": 0.8513,
      "step": 4350
    },
    {
      "epoch": 0.7766304827464477,
      "grad_norm": 2.533024787902832,
      "learning_rate": 4.2236540158870256e-05,
      "loss": 0.8228,
      "step": 4400
    },
    {
      "epoch": 0.7854558291412937,
      "grad_norm": 3.1727654933929443,
      "learning_rate": 4.214827890556046e-05,
      "loss": 0.8579,
      "step": 4450
    },
    {
      "epoch": 0.7942811755361397,
      "grad_norm": 3.0712013244628906,
      "learning_rate": 4.206001765225067e-05,
      "loss": 0.8504,
      "step": 4500
    },
    {
      "epoch": 0.8031065219309858,
      "grad_norm": 4.400243282318115,
      "learning_rate": 4.1971756398940866e-05,
      "loss": 0.8606,
      "step": 4550
    },
    {
      "epoch": 0.8119318683258318,
      "grad_norm": 3.040689468383789,
      "learning_rate": 4.188349514563107e-05,
      "loss": 0.822,
      "step": 4600
    },
    {
      "epoch": 0.8207572147206778,
      "grad_norm": 2.518383026123047,
      "learning_rate": 4.179523389232127e-05,
      "loss": 0.8013,
      "step": 4650
    },
    {
      "epoch": 0.8295825611155238,
      "grad_norm": 3.1982264518737793,
      "learning_rate": 4.170697263901148e-05,
      "loss": 0.8374,
      "step": 4700
    },
    {
      "epoch": 0.8384079075103698,
      "grad_norm": 3.7081596851348877,
      "learning_rate": 4.161871138570168e-05,
      "loss": 0.8281,
      "step": 4750
    },
    {
      "epoch": 0.8472332539052158,
      "grad_norm": 2.829268217086792,
      "learning_rate": 4.153045013239188e-05,
      "loss": 0.8736,
      "step": 4800
    },
    {
      "epoch": 0.8560586003000618,
      "grad_norm": 2.8219809532165527,
      "learning_rate": 4.144218887908208e-05,
      "loss": 0.8324,
      "step": 4850
    },
    {
      "epoch": 0.8648839466949078,
      "grad_norm": 3.261457681655884,
      "learning_rate": 4.1353927625772286e-05,
      "loss": 0.7835,
      "step": 4900
    },
    {
      "epoch": 0.8737092930897538,
      "grad_norm": 2.952995777130127,
      "learning_rate": 4.126566637246249e-05,
      "loss": 0.8674,
      "step": 4950
    },
    {
      "epoch": 0.8825346394845998,
      "grad_norm": 2.9086506366729736,
      "learning_rate": 4.11774051191527e-05,
      "loss": 0.8263,
      "step": 5000
    },
    {
      "epoch": 0.8825346394845998,
      "eval_loss": 0.7903227806091309,
      "eval_runtime": 181.7285,
      "eval_samples_per_second": 155.875,
      "eval_steps_per_second": 15.589,
      "step": 5000
    },
    {
      "epoch": 0.8913599858794458,
      "grad_norm": 2.967153549194336,
      "learning_rate": 4.1089143865842896e-05,
      "loss": 0.8394,
      "step": 5050
    },
    {
      "epoch": 0.9001853322742918,
      "grad_norm": 3.2790939807891846,
      "learning_rate": 4.1000882612533095e-05,
      "loss": 0.8037,
      "step": 5100
    },
    {
      "epoch": 0.9090106786691378,
      "grad_norm": 2.5177745819091797,
      "learning_rate": 4.09126213592233e-05,
      "loss": 0.8217,
      "step": 5150
    },
    {
      "epoch": 0.9178360250639838,
      "grad_norm": 3.300760507583618,
      "learning_rate": 4.082436010591351e-05,
      "loss": 0.8177,
      "step": 5200
    },
    {
      "epoch": 0.9266613714588298,
      "grad_norm": 2.8145620822906494,
      "learning_rate": 4.073609885260371e-05,
      "loss": 0.8196,
      "step": 5250
    },
    {
      "epoch": 0.9354867178536758,
      "grad_norm": 2.681286096572876,
      "learning_rate": 4.064783759929391e-05,
      "loss": 0.8417,
      "step": 5300
    },
    {
      "epoch": 0.9443120642485218,
      "grad_norm": 2.404695510864258,
      "learning_rate": 4.055957634598411e-05,
      "loss": 0.841,
      "step": 5350
    },
    {
      "epoch": 0.9531374106433678,
      "grad_norm": 2.606860876083374,
      "learning_rate": 4.0471315092674316e-05,
      "loss": 0.8194,
      "step": 5400
    },
    {
      "epoch": 0.9619627570382138,
      "grad_norm": 2.41960072517395,
      "learning_rate": 4.038305383936452e-05,
      "loss": 0.8009,
      "step": 5450
    },
    {
      "epoch": 0.9707881034330598,
      "grad_norm": 3.663374185562134,
      "learning_rate": 4.029479258605473e-05,
      "loss": 0.8017,
      "step": 5500
    },
    {
      "epoch": 0.9796134498279058,
      "grad_norm": 2.751960277557373,
      "learning_rate": 4.0206531332744927e-05,
      "loss": 0.7816,
      "step": 5550
    },
    {
      "epoch": 0.9884387962227518,
      "grad_norm": 2.9253251552581787,
      "learning_rate": 4.0118270079435126e-05,
      "loss": 0.8022,
      "step": 5600
    },
    {
      "epoch": 0.9972641426175978,
      "grad_norm": 3.4623188972473145,
      "learning_rate": 4.003000882612533e-05,
      "loss": 0.7701,
      "step": 5650
    },
    {
      "epoch": 1.0060894890124437,
      "grad_norm": 2.0556461811065674,
      "learning_rate": 3.994174757281554e-05,
      "loss": 0.7143,
      "step": 5700
    },
    {
      "epoch": 1.0149148354072897,
      "grad_norm": 2.6380767822265625,
      "learning_rate": 3.985348631950574e-05,
      "loss": 0.6442,
      "step": 5750
    },
    {
      "epoch": 1.0237401818021357,
      "grad_norm": 3.469467878341675,
      "learning_rate": 3.976522506619594e-05,
      "loss": 0.6443,
      "step": 5800
    },
    {
      "epoch": 1.0325655281969817,
      "grad_norm": 3.0494396686553955,
      "learning_rate": 3.967696381288615e-05,
      "loss": 0.6686,
      "step": 5850
    },
    {
      "epoch": 1.0413908745918277,
      "grad_norm": 3.4059927463531494,
      "learning_rate": 3.9588702559576346e-05,
      "loss": 0.6985,
      "step": 5900
    },
    {
      "epoch": 1.0502162209866737,
      "grad_norm": 2.977858781814575,
      "learning_rate": 3.950044130626655e-05,
      "loss": 0.6614,
      "step": 5950
    },
    {
      "epoch": 1.0590415673815197,
      "grad_norm": 2.572244644165039,
      "learning_rate": 3.941218005295675e-05,
      "loss": 0.6959,
      "step": 6000
    },
    {
      "epoch": 1.0590415673815197,
      "eval_loss": 0.7684289813041687,
      "eval_runtime": 181.6989,
      "eval_samples_per_second": 155.901,
      "eval_steps_per_second": 15.592,
      "step": 6000
    },
    {
      "epoch": 1.0678669137763657,
      "grad_norm": 2.360053777694702,
      "learning_rate": 3.932568402471315e-05,
      "loss": 0.6979,
      "step": 6050
    },
    {
      "epoch": 1.0766922601712117,
      "grad_norm": 2.46508526802063,
      "learning_rate": 3.923742277140335e-05,
      "loss": 0.6179,
      "step": 6100
    },
    {
      "epoch": 1.0855176065660577,
      "grad_norm": 2.3281443119049072,
      "learning_rate": 3.914916151809356e-05,
      "loss": 0.6245,
      "step": 6150
    },
    {
      "epoch": 1.0943429529609037,
      "grad_norm": 2.59844708442688,
      "learning_rate": 3.9060900264783764e-05,
      "loss": 0.6938,
      "step": 6200
    },
    {
      "epoch": 1.1031682993557497,
      "grad_norm": 3.2816011905670166,
      "learning_rate": 3.897263901147396e-05,
      "loss": 0.6129,
      "step": 6250
    },
    {
      "epoch": 1.1119936457505957,
      "grad_norm": 2.310105800628662,
      "learning_rate": 3.888437775816417e-05,
      "loss": 0.6435,
      "step": 6300
    },
    {
      "epoch": 1.1208189921454417,
      "grad_norm": 2.9647128582000732,
      "learning_rate": 3.879611650485437e-05,
      "loss": 0.6694,
      "step": 6350
    },
    {
      "epoch": 1.1296443385402877,
      "grad_norm": 2.9377100467681885,
      "learning_rate": 3.870785525154457e-05,
      "loss": 0.7068,
      "step": 6400
    },
    {
      "epoch": 1.1384696849351337,
      "grad_norm": 2.559523105621338,
      "learning_rate": 3.861959399823478e-05,
      "loss": 0.6432,
      "step": 6450
    },
    {
      "epoch": 1.1472950313299797,
      "grad_norm": 2.5071003437042236,
      "learning_rate": 3.853133274492498e-05,
      "loss": 0.6542,
      "step": 6500
    },
    {
      "epoch": 1.1561203777248257,
      "grad_norm": 2.6376101970672607,
      "learning_rate": 3.8443071491615184e-05,
      "loss": 0.6715,
      "step": 6550
    },
    {
      "epoch": 1.1649457241196717,
      "grad_norm": 2.169046401977539,
      "learning_rate": 3.835481023830538e-05,
      "loss": 0.6367,
      "step": 6600
    },
    {
      "epoch": 1.1737710705145177,
      "grad_norm": 2.852400779724121,
      "learning_rate": 3.826654898499559e-05,
      "loss": 0.6711,
      "step": 6650
    },
    {
      "epoch": 1.1825964169093637,
      "grad_norm": 2.8903868198394775,
      "learning_rate": 3.8178287731685794e-05,
      "loss": 0.6603,
      "step": 6700
    },
    {
      "epoch": 1.1914217633042097,
      "grad_norm": 3.0550076961517334,
      "learning_rate": 3.809002647837599e-05,
      "loss": 0.7009,
      "step": 6750
    },
    {
      "epoch": 1.2002471096990557,
      "grad_norm": 3.017362356185913,
      "learning_rate": 3.80017652250662e-05,
      "loss": 0.651,
      "step": 6800
    },
    {
      "epoch": 1.2090724560939017,
      "grad_norm": 3.1694257259368896,
      "learning_rate": 3.7913503971756404e-05,
      "loss": 0.6674,
      "step": 6850
    },
    {
      "epoch": 1.2178978024887477,
      "grad_norm": 3.285419464111328,
      "learning_rate": 3.78270079435128e-05,
      "loss": 0.6425,
      "step": 6900
    },
    {
      "epoch": 1.2267231488835937,
      "grad_norm": 3.4212896823883057,
      "learning_rate": 3.7738746690203006e-05,
      "loss": 0.7068,
      "step": 6950
    },
    {
      "epoch": 1.2355484952784397,
      "grad_norm": 2.7931535243988037,
      "learning_rate": 3.7650485436893205e-05,
      "loss": 0.6685,
      "step": 7000
    },
    {
      "epoch": 1.2355484952784397,
      "eval_loss": 0.7610744833946228,
      "eval_runtime": 181.743,
      "eval_samples_per_second": 155.863,
      "eval_steps_per_second": 15.588,
      "step": 7000
    },
    {
      "epoch": 1.2443738416732857,
      "grad_norm": 3.0050013065338135,
      "learning_rate": 3.756222418358341e-05,
      "loss": 0.63,
      "step": 7050
    },
    {
      "epoch": 1.2531991880681317,
      "grad_norm": 3.8987817764282227,
      "learning_rate": 3.747396293027361e-05,
      "loss": 0.6285,
      "step": 7100
    },
    {
      "epoch": 1.2620245344629777,
      "grad_norm": 2.7524607181549072,
      "learning_rate": 3.7385701676963815e-05,
      "loss": 0.649,
      "step": 7150
    },
    {
      "epoch": 1.2708498808578237,
      "grad_norm": 3.0095231533050537,
      "learning_rate": 3.729744042365402e-05,
      "loss": 0.6298,
      "step": 7200
    },
    {
      "epoch": 1.2796752272526697,
      "grad_norm": 2.8414998054504395,
      "learning_rate": 3.720917917034422e-05,
      "loss": 0.5854,
      "step": 7250
    },
    {
      "epoch": 1.2885005736475157,
      "grad_norm": 2.4056193828582764,
      "learning_rate": 3.7120917917034425e-05,
      "loss": 0.6555,
      "step": 7300
    },
    {
      "epoch": 1.2973259200423617,
      "grad_norm": 2.7647316455841064,
      "learning_rate": 3.7032656663724624e-05,
      "loss": 0.6433,
      "step": 7350
    },
    {
      "epoch": 1.3061512664372077,
      "grad_norm": 3.025959014892578,
      "learning_rate": 3.694439541041483e-05,
      "loss": 0.6319,
      "step": 7400
    },
    {
      "epoch": 1.3149766128320537,
      "grad_norm": 2.893221378326416,
      "learning_rate": 3.685613415710503e-05,
      "loss": 0.6474,
      "step": 7450
    },
    {
      "epoch": 1.3238019592268997,
      "grad_norm": 2.621736764907837,
      "learning_rate": 3.6767872903795235e-05,
      "loss": 0.6302,
      "step": 7500
    },
    {
      "epoch": 1.3326273056217457,
      "grad_norm": 3.4629733562469482,
      "learning_rate": 3.667961165048544e-05,
      "loss": 0.6259,
      "step": 7550
    },
    {
      "epoch": 1.3414526520165917,
      "grad_norm": 2.6440625190734863,
      "learning_rate": 3.6591350397175646e-05,
      "loss": 0.7059,
      "step": 7600
    },
    {
      "epoch": 1.3502779984114377,
      "grad_norm": 3.098646879196167,
      "learning_rate": 3.6503089143865845e-05,
      "loss": 0.6563,
      "step": 7650
    },
    {
      "epoch": 1.3591033448062837,
      "grad_norm": 3.1059629917144775,
      "learning_rate": 3.6414827890556044e-05,
      "loss": 0.7017,
      "step": 7700
    },
    {
      "epoch": 1.3679286912011297,
      "grad_norm": 3.305082082748413,
      "learning_rate": 3.632656663724625e-05,
      "loss": 0.6417,
      "step": 7750
    },
    {
      "epoch": 1.3767540375959757,
      "grad_norm": 2.580493688583374,
      "learning_rate": 3.6238305383936456e-05,
      "loss": 0.6768,
      "step": 7800
    },
    {
      "epoch": 1.3855793839908217,
      "grad_norm": 2.5949904918670654,
      "learning_rate": 3.615004413062666e-05,
      "loss": 0.6778,
      "step": 7850
    },
    {
      "epoch": 1.3944047303856677,
      "grad_norm": 3.4138495922088623,
      "learning_rate": 3.606354810238306e-05,
      "loss": 0.644,
      "step": 7900
    },
    {
      "epoch": 1.4032300767805137,
      "grad_norm": 2.841855764389038,
      "learning_rate": 3.5975286849073256e-05,
      "loss": 0.6587,
      "step": 7950
    },
    {
      "epoch": 1.4120554231753597,
      "grad_norm": 3.4159514904022217,
      "learning_rate": 3.588702559576346e-05,
      "loss": 0.6483,
      "step": 8000
    },
    {
      "epoch": 1.4120554231753597,
      "eval_loss": 0.7551769018173218,
      "eval_runtime": 181.7484,
      "eval_samples_per_second": 155.858,
      "eval_steps_per_second": 15.587,
      "step": 8000
    },
    {
      "epoch": 1.4208807695702057,
      "grad_norm": 3.678314447402954,
      "learning_rate": 3.579876434245367e-05,
      "loss": 0.6326,
      "step": 8050
    },
    {
      "epoch": 1.4297061159650517,
      "grad_norm": 2.811630964279175,
      "learning_rate": 3.5710503089143866e-05,
      "loss": 0.6684,
      "step": 8100
    },
    {
      "epoch": 1.4385314623598977,
      "grad_norm": 2.802046775817871,
      "learning_rate": 3.562224183583407e-05,
      "loss": 0.6581,
      "step": 8150
    },
    {
      "epoch": 1.4473568087547437,
      "grad_norm": 3.1183340549468994,
      "learning_rate": 3.553398058252427e-05,
      "loss": 0.6444,
      "step": 8200
    },
    {
      "epoch": 1.4561821551495897,
      "grad_norm": 2.275667428970337,
      "learning_rate": 3.544571932921448e-05,
      "loss": 0.6042,
      "step": 8250
    },
    {
      "epoch": 1.4650075015444357,
      "grad_norm": 3.0248420238494873,
      "learning_rate": 3.535745807590468e-05,
      "loss": 0.6638,
      "step": 8300
    },
    {
      "epoch": 1.4738328479392817,
      "grad_norm": 3.109173059463501,
      "learning_rate": 3.526919682259488e-05,
      "loss": 0.6636,
      "step": 8350
    },
    {
      "epoch": 1.4826581943341277,
      "grad_norm": 3.1378836631774902,
      "learning_rate": 3.518093556928509e-05,
      "loss": 0.6723,
      "step": 8400
    },
    {
      "epoch": 1.4914835407289737,
      "grad_norm": 2.956021547317505,
      "learning_rate": 3.5092674315975286e-05,
      "loss": 0.6449,
      "step": 8450
    },
    {
      "epoch": 1.5003088871238197,
      "grad_norm": 2.559032440185547,
      "learning_rate": 3.500441306266549e-05,
      "loss": 0.6343,
      "step": 8500
    },
    {
      "epoch": 1.5091342335186657,
      "grad_norm": 2.106903553009033,
      "learning_rate": 3.49161518093557e-05,
      "loss": 0.6494,
      "step": 8550
    },
    {
      "epoch": 1.5179595799135117,
      "grad_norm": 4.132986068725586,
      "learning_rate": 3.48278905560459e-05,
      "loss": 0.6556,
      "step": 8600
    },
    {
      "epoch": 1.5267849263083577,
      "grad_norm": 3.011594533920288,
      "learning_rate": 3.4739629302736095e-05,
      "loss": 0.6318,
      "step": 8650
    },
    {
      "epoch": 1.5356102727032037,
      "grad_norm": 3.080071210861206,
      "learning_rate": 3.46513680494263e-05,
      "loss": 0.6635,
      "step": 8700
    },
    {
      "epoch": 1.5444356190980497,
      "grad_norm": 2.3603274822235107,
      "learning_rate": 3.456310679611651e-05,
      "loss": 0.6224,
      "step": 8750
    },
    {
      "epoch": 1.5532609654928957,
      "grad_norm": 3.5411500930786133,
      "learning_rate": 3.447484554280671e-05,
      "loss": 0.6349,
      "step": 8800
    },
    {
      "epoch": 1.5620863118877417,
      "grad_norm": 3.3874282836914062,
      "learning_rate": 3.438658428949691e-05,
      "loss": 0.6537,
      "step": 8850
    },
    {
      "epoch": 1.5709116582825877,
      "grad_norm": 3.0514631271362305,
      "learning_rate": 3.429832303618711e-05,
      "loss": 0.6496,
      "step": 8900
    },
    {
      "epoch": 1.5797370046774337,
      "grad_norm": 3.084991931915283,
      "learning_rate": 3.4210061782877316e-05,
      "loss": 0.6647,
      "step": 8950
    },
    {
      "epoch": 1.5885623510722797,
      "grad_norm": 3.14204478263855,
      "learning_rate": 3.412180052956752e-05,
      "loss": 0.6551,
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 1.5885623510722797, | |
| "eval_loss": 0.7328709363937378, | |
| "eval_runtime": 181.7304, | |
| "eval_samples_per_second": 155.874, | |
| "eval_steps_per_second": 15.589, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 1.5973876974671257, | |
| "grad_norm": 2.8025941848754883, | |
| "learning_rate": 3.403353927625773e-05, | |
| "loss": 0.6385, | |
| "step": 9050 | |
| }, | |
| { | |
| "epoch": 1.6062130438619717, | |
| "grad_norm": 2.485394239425659, | |
| "learning_rate": 3.3945278022947927e-05, | |
| "loss": 0.5855, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 1.6150383902568177, | |
| "grad_norm": 3.1365981101989746, | |
| "learning_rate": 3.3857016769638126e-05, | |
| "loss": 0.6335, | |
| "step": 9150 | |
| }, | |
| { | |
| "epoch": 1.6238637366516637, | |
| "grad_norm": 2.991084098815918, | |
| "learning_rate": 3.376875551632833e-05, | |
| "loss": 0.649, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 1.6326890830465097, | |
| "grad_norm": 3.4233882427215576, | |
| "learning_rate": 3.368049426301854e-05, | |
| "loss": 0.6562, | |
| "step": 9250 | |
| }, | |
| { | |
| "epoch": 1.6415144294413557, | |
| "grad_norm": 2.715102195739746, | |
| "learning_rate": 3.359223300970874e-05, | |
| "loss": 0.6356, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 1.6503397758362017, | |
| "grad_norm": 3.2025671005249023, | |
| "learning_rate": 3.350397175639894e-05, | |
| "loss": 0.6451, | |
| "step": 9350 | |
| }, | |
| { | |
| "epoch": 1.6591651222310477, | |
| "grad_norm": 2.856401205062866, | |
| "learning_rate": 3.341571050308915e-05, | |
| "loss": 0.6527, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 1.6679904686258937, | |
| "grad_norm": 2.430091142654419, | |
| "learning_rate": 3.3327449249779346e-05, | |
| "loss": 0.6901, | |
| "step": 9450 | |
| }, | |
| { | |
| "epoch": 1.6768158150207397, | |
| "grad_norm": 3.5278499126434326, | |
| "learning_rate": 3.323918799646955e-05, | |
| "loss": 0.6499, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 1.6856411614155855, | |
| "grad_norm": 2.8567793369293213, | |
| "learning_rate": 3.315092674315976e-05, | |
| "loss": 0.6201, | |
| "step": 9550 | |
| }, | |
| { | |
| "epoch": 1.6944665078104315, | |
| "grad_norm": 2.610034465789795, | |
| "learning_rate": 3.306266548984996e-05, | |
| "loss": 0.6407, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 1.7032918542052775, | |
| "grad_norm": 3.1110494136810303, | |
| "learning_rate": 3.297440423654016e-05, | |
| "loss": 0.6229, | |
| "step": 9650 | |
| }, | |
| { | |
| "epoch": 1.7121172006001235, | |
| "grad_norm": 3.0331647396087646, | |
| "learning_rate": 3.288614298323036e-05, | |
| "loss": 0.6331, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 1.7209425469949695, | |
| "grad_norm": 2.306661605834961, | |
| "learning_rate": 3.279788172992057e-05, | |
| "loss": 0.6221, | |
| "step": 9750 | |
| }, | |
| { | |
| "epoch": 1.7297678933898155, | |
| "grad_norm": 3.5243642330169678, | |
| "learning_rate": 3.2709620476610766e-05, | |
| "loss": 0.652, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 1.7385932397846615, | |
| "grad_norm": 3.3176615238189697, | |
| "learning_rate": 3.262135922330097e-05, | |
| "loss": 0.6614, | |
| "step": 9850 | |
| }, | |
| { | |
| "epoch": 1.7474185861795075, | |
| "grad_norm": 2.874948501586914, | |
| "learning_rate": 3.253309796999118e-05, | |
| "loss": 0.6597, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 1.7562439325743535, | |
| "grad_norm": 3.0557405948638916, | |
| "learning_rate": 3.2444836716681377e-05, | |
| "loss": 0.62, | |
| "step": 9950 | |
| }, | |
| { | |
| "epoch": 1.7650692789691995, | |
| "grad_norm": 3.2125186920166016, | |
| "learning_rate": 3.235657546337158e-05, | |
| "loss": 0.6577, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 1.7650692789691995, | |
| "eval_loss": 0.7176974415779114, | |
| "eval_runtime": 181.8356, | |
| "eval_samples_per_second": 155.784, | |
| "eval_steps_per_second": 15.58, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 1.7738946253640455, | |
| "grad_norm": 3.4087321758270264, | |
| "learning_rate": 3.226831421006178e-05, | |
| "loss": 0.6412, | |
| "step": 10050 | |
| }, | |
| { | |
| "epoch": 1.7827199717588915, | |
| "grad_norm": 3.135465383529663, | |
| "learning_rate": 3.218005295675199e-05, | |
| "loss": 0.6436, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 1.7915453181537375, | |
| "grad_norm": 3.334916830062866, | |
| "learning_rate": 3.209179170344219e-05, | |
| "loss": 0.6381, | |
| "step": 10150 | |
| }, | |
| { | |
| "epoch": 1.8003706645485835, | |
| "grad_norm": 2.579524278640747, | |
| "learning_rate": 3.200353045013239e-05, | |
| "loss": 0.6083, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 1.8091960109434295, | |
| "grad_norm": 3.2345893383026123, | |
| "learning_rate": 3.19152691968226e-05, | |
| "loss": 0.6353, | |
| "step": 10250 | |
| }, | |
| { | |
| "epoch": 1.8180213573382755, | |
| "grad_norm": 3.3575096130371094, | |
| "learning_rate": 3.1827007943512796e-05, | |
| "loss": 0.627, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 1.8268467037331215, | |
| "grad_norm": 2.8366053104400635, | |
| "learning_rate": 3.1738746690203e-05, | |
| "loss": 0.6247, | |
| "step": 10350 | |
| }, | |
| { | |
| "epoch": 1.8356720501279675, | |
| "grad_norm": 3.510075330734253, | |
| "learning_rate": 3.165048543689321e-05, | |
| "loss": 0.6429, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 1.8444973965228135, | |
| "grad_norm": 2.613879919052124, | |
| "learning_rate": 3.1562224183583414e-05, | |
| "loss": 0.6243, | |
| "step": 10450 | |
| }, | |
| { | |
| "epoch": 1.8533227429176595, | |
| "grad_norm": 3.2765021324157715, | |
| "learning_rate": 3.147396293027361e-05, | |
| "loss": 0.6741, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 1.8621480893125055, | |
| "grad_norm": 3.1736323833465576, | |
| "learning_rate": 3.138570167696381e-05, | |
| "loss": 0.6523, | |
| "step": 10550 | |
| }, | |
| { | |
| "epoch": 1.8709734357073515, | |
| "grad_norm": 2.467745065689087, | |
| "learning_rate": 3.129744042365402e-05, | |
| "loss": 0.6149, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 1.8797987821021975, | |
| "grad_norm": 3.506396532058716, | |
| "learning_rate": 3.120917917034422e-05, | |
| "loss": 0.625, | |
| "step": 10650 | |
| }, | |
| { | |
| "epoch": 1.8886241284970435, | |
| "grad_norm": 3.8226113319396973, | |
| "learning_rate": 3.112091791703443e-05, | |
| "loss": 0.6314, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 1.8974494748918895, | |
| "grad_norm": 3.097027540206909, | |
| "learning_rate": 3.103265666372462e-05, | |
| "loss": 0.6384, | |
| "step": 10750 | |
| }, | |
| { | |
| "epoch": 1.9062748212867355, | |
| "grad_norm": 2.8758792877197266, | |
| "learning_rate": 3.0944395410414827e-05, | |
| "loss": 0.6459, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 1.9151001676815815, | |
| "grad_norm": 3.3849267959594727, | |
| "learning_rate": 3.085613415710503e-05, | |
| "loss": 0.6229, | |
| "step": 10850 | |
| }, | |
| { | |
| "epoch": 1.9239255140764275, | |
| "grad_norm": 3.4073870182037354, | |
| "learning_rate": 3.076787290379524e-05, | |
| "loss": 0.6299, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 1.9327508604712735, | |
| "grad_norm": 2.961178779602051, | |
| "learning_rate": 3.067961165048544e-05, | |
| "loss": 0.6157, | |
| "step": 10950 | |
| }, | |
| { | |
| "epoch": 1.9415762068661195, | |
| "grad_norm": 3.150435447692871, | |
| "learning_rate": 3.0591350397175636e-05, | |
| "loss": 0.6094, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 1.9415762068661195, | |
| "eval_loss": 0.7086746692657471, | |
| "eval_runtime": 181.7779, | |
| "eval_samples_per_second": 155.833, | |
| "eval_steps_per_second": 15.585, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 1.9504015532609655, | |
| "grad_norm": 2.8941028118133545, | |
| "learning_rate": 3.050308914386584e-05, | |
| "loss": 0.6292, | |
| "step": 11050 | |
| }, | |
| { | |
| "epoch": 1.9592268996558115, | |
| "grad_norm": 3.9323644638061523, | |
| "learning_rate": 3.0414827890556047e-05, | |
| "loss": 0.6188, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 1.9680522460506575, | |
| "grad_norm": 3.0189285278320312, | |
| "learning_rate": 3.032656663724625e-05, | |
| "loss": 0.6135, | |
| "step": 11150 | |
| }, | |
| { | |
| "epoch": 1.9768775924455035, | |
| "grad_norm": 4.591734886169434, | |
| "learning_rate": 3.0238305383936455e-05, | |
| "loss": 0.6226, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 1.9857029388403493, | |
| "grad_norm": 2.5225682258605957, | |
| "learning_rate": 3.0150044130626658e-05, | |
| "loss": 0.6162, | |
| "step": 11250 | |
| }, | |
| { | |
| "epoch": 1.9945282852351953, | |
| "grad_norm": 2.399808406829834, | |
| "learning_rate": 3.0061782877316857e-05, | |
| "loss": 0.6034, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 2.0033536316300413, | |
| "grad_norm": 2.693629503250122, | |
| "learning_rate": 2.9973521624007062e-05, | |
| "loss": 0.5803, | |
| "step": 11350 | |
| }, | |
| { | |
| "epoch": 2.0121789780248873, | |
| "grad_norm": 2.8983335494995117, | |
| "learning_rate": 2.9885260370697265e-05, | |
| "loss": 0.496, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 2.0210043244197333, | |
| "grad_norm": 3.2255337238311768, | |
| "learning_rate": 2.979699911738747e-05, | |
| "loss": 0.4722, | |
| "step": 11450 | |
| }, | |
| { | |
| "epoch": 2.0298296708145793, | |
| "grad_norm": 2.8371284008026123, | |
| "learning_rate": 2.9708737864077673e-05, | |
| "loss": 0.4893, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 2.0386550172094253, | |
| "grad_norm": 2.7969467639923096, | |
| "learning_rate": 2.9620476610767872e-05, | |
| "loss": 0.4663, | |
| "step": 11550 | |
| }, | |
| { | |
| "epoch": 2.0474803636042713, | |
| "grad_norm": 2.523054361343384, | |
| "learning_rate": 2.9532215357458078e-05, | |
| "loss": 0.4998, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 2.0563057099991173, | |
| "grad_norm": 3.1918458938598633, | |
| "learning_rate": 2.944395410414828e-05, | |
| "loss": 0.4968, | |
| "step": 11650 | |
| }, | |
| { | |
| "epoch": 2.0651310563939633, | |
| "grad_norm": 3.167325735092163, | |
| "learning_rate": 2.9355692850838486e-05, | |
| "loss": 0.4688, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 2.0739564027888093, | |
| "grad_norm": 3.2759768962860107, | |
| "learning_rate": 2.9267431597528688e-05, | |
| "loss": 0.5058, | |
| "step": 11750 | |
| }, | |
| { | |
| "epoch": 2.0827817491836553, | |
| "grad_norm": 2.962247371673584, | |
| "learning_rate": 2.9179170344218887e-05, | |
| "loss": 0.4622, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 2.0916070955785013, | |
| "grad_norm": 3.7349026203155518, | |
| "learning_rate": 2.909090909090909e-05, | |
| "loss": 0.4978, | |
| "step": 11850 | |
| }, | |
| { | |
| "epoch": 2.1004324419733473, | |
| "grad_norm": 2.856795072555542, | |
| "learning_rate": 2.9002647837599295e-05, | |
| "loss": 0.4611, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 2.1092577883681933, | |
| "grad_norm": 2.8157849311828613, | |
| "learning_rate": 2.8914386584289497e-05, | |
| "loss": 0.4769, | |
| "step": 11950 | |
| }, | |
| { | |
| "epoch": 2.1180831347630393, | |
| "grad_norm": 3.712787389755249, | |
| "learning_rate": 2.8826125330979703e-05, | |
| "loss": 0.5045, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 2.1180831347630393, | |
| "eval_loss": 0.7179752588272095, | |
| "eval_runtime": 181.8101, | |
| "eval_samples_per_second": 155.805, | |
| "eval_steps_per_second": 15.582, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 2.1269084811578853, | |
| "grad_norm": 2.407057285308838, | |
| "learning_rate": 2.8737864077669902e-05, | |
| "loss": 0.4973, | |
| "step": 12050 | |
| }, | |
| { | |
| "epoch": 2.1357338275527313, | |
| "grad_norm": 2.9012491703033447, | |
| "learning_rate": 2.8649602824360104e-05, | |
| "loss": 0.4973, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 2.1445591739475773, | |
| "grad_norm": 2.8666646480560303, | |
| "learning_rate": 2.856134157105031e-05, | |
| "loss": 0.4711, | |
| "step": 12150 | |
| }, | |
| { | |
| "epoch": 2.1533845203424233, | |
| "grad_norm": 2.6431496143341064, | |
| "learning_rate": 2.8473080317740512e-05, | |
| "loss": 0.4555, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 2.1622098667372693, | |
| "grad_norm": 2.309817314147949, | |
| "learning_rate": 2.8384819064430718e-05, | |
| "loss": 0.4654, | |
| "step": 12250 | |
| }, | |
| { | |
| "epoch": 2.1710352131321153, | |
| "grad_norm": 2.9013850688934326, | |
| "learning_rate": 2.829655781112092e-05, | |
| "loss": 0.4767, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 2.1798605595269613, | |
| "grad_norm": 3.384150266647339, | |
| "learning_rate": 2.820829655781112e-05, | |
| "loss": 0.5275, | |
| "step": 12350 | |
| }, | |
| { | |
| "epoch": 2.1886859059218073, | |
| "grad_norm": 3.1525537967681885, | |
| "learning_rate": 2.8120035304501325e-05, | |
| "loss": 0.4777, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 2.1975112523166533, | |
| "grad_norm": 2.6473567485809326, | |
| "learning_rate": 2.8031774051191528e-05, | |
| "loss": 0.4789, | |
| "step": 12450 | |
| }, | |
| { | |
| "epoch": 2.2063365987114993, | |
| "grad_norm": 2.334127187728882, | |
| "learning_rate": 2.7943512797881733e-05, | |
| "loss": 0.4749, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 2.2151619451063453, | |
| "grad_norm": 2.6121950149536133, | |
| "learning_rate": 2.7855251544571936e-05, | |
| "loss": 0.4807, | |
| "step": 12550 | |
| }, | |
| { | |
| "epoch": 2.2239872915011913, | |
| "grad_norm": 2.5503621101379395, | |
| "learning_rate": 2.7766990291262135e-05, | |
| "loss": 0.4568, | |
| "step": 12600 | |
| }, | |
| { | |
| "epoch": 2.2328126378960373, | |
| "grad_norm": 3.1507601737976074, | |
| "learning_rate": 2.767872903795234e-05, | |
| "loss": 0.5129, | |
| "step": 12650 | |
| }, | |
| { | |
| "epoch": 2.2416379842908833, | |
| "grad_norm": 4.06856107711792, | |
| "learning_rate": 2.7590467784642543e-05, | |
| "loss": 0.4652, | |
| "step": 12700 | |
| }, | |
| { | |
| "epoch": 2.2504633306857293, | |
| "grad_norm": 3.199040412902832, | |
| "learning_rate": 2.750220653133275e-05, | |
| "loss": 0.4995, | |
| "step": 12750 | |
| }, | |
| { | |
| "epoch": 2.2592886770805753, | |
| "grad_norm": 3.08388352394104, | |
| "learning_rate": 2.741394527802295e-05, | |
| "loss": 0.5213, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 2.2681140234754213, | |
| "grad_norm": 2.8300905227661133, | |
| "learning_rate": 2.732568402471315e-05, | |
| "loss": 0.5109, | |
| "step": 12850 | |
| }, | |
| { | |
| "epoch": 2.2769393698702673, | |
| "grad_norm": 3.372019052505493, | |
| "learning_rate": 2.7237422771403352e-05, | |
| "loss": 0.482, | |
| "step": 12900 | |
| }, | |
| { | |
| "epoch": 2.2857647162651133, | |
| "grad_norm": 3.004849433898926, | |
| "learning_rate": 2.7149161518093558e-05, | |
| "loss": 0.4918, | |
| "step": 12950 | |
| }, | |
| { | |
| "epoch": 2.2945900626599594, | |
| "grad_norm": 3.0937423706054688, | |
| "learning_rate": 2.706090026478376e-05, | |
| "loss": 0.5041, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 2.2945900626599594, | |
| "eval_loss": 0.7087730169296265, | |
| "eval_runtime": 181.7582, | |
| "eval_samples_per_second": 155.85, | |
| "eval_steps_per_second": 15.587, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 2.3034154090548054, | |
| "grad_norm": 3.027329444885254, | |
| "learning_rate": 2.6972639011473966e-05, | |
| "loss": 0.4969, | |
| "step": 13050 | |
| }, | |
| { | |
| "epoch": 2.3122407554496514, | |
| "grad_norm": 2.688856840133667, | |
| "learning_rate": 2.6884377758164168e-05, | |
| "loss": 0.5064, | |
| "step": 13100 | |
| }, | |
| { | |
| "epoch": 2.3210661018444974, | |
| "grad_norm": 3.5008068084716797, | |
| "learning_rate": 2.6796116504854367e-05, | |
| "loss": 0.4931, | |
| "step": 13150 | |
| }, | |
| { | |
| "epoch": 2.3298914482393434, | |
| "grad_norm": 2.946478843688965, | |
| "learning_rate": 2.6707855251544573e-05, | |
| "loss": 0.5058, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 2.3387167946341894, | |
| "grad_norm": 3.2251195907592773, | |
| "learning_rate": 2.6619593998234775e-05, | |
| "loss": 0.4723, | |
| "step": 13250 | |
| }, | |
| { | |
| "epoch": 2.3475421410290354, | |
| "grad_norm": 2.8397462368011475, | |
| "learning_rate": 2.653133274492498e-05, | |
| "loss": 0.4753, | |
| "step": 13300 | |
| }, | |
| { | |
| "epoch": 2.3563674874238814, | |
| "grad_norm": 2.383877992630005, | |
| "learning_rate": 2.6443071491615183e-05, | |
| "loss": 0.4403, | |
| "step": 13350 | |
| }, | |
| { | |
| "epoch": 2.3651928338187274, | |
| "grad_norm": 2.9594268798828125, | |
| "learning_rate": 2.6354810238305382e-05, | |
| "loss": 0.4968, | |
| "step": 13400 | |
| }, | |
| { | |
| "epoch": 2.3740181802135734, | |
| "grad_norm": 3.557107925415039, | |
| "learning_rate": 2.6266548984995588e-05, | |
| "loss": 0.4739, | |
| "step": 13450 | |
| }, | |
| { | |
| "epoch": 2.3828435266084194, | |
| "grad_norm": 3.5984597206115723, | |
| "learning_rate": 2.617828773168579e-05, | |
| "loss": 0.5027, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 2.3916688730032654, | |
| "grad_norm": 2.6769216060638428, | |
| "learning_rate": 2.6090026478375996e-05, | |
| "loss": 0.4781, | |
| "step": 13550 | |
| }, | |
| { | |
| "epoch": 2.4004942193981114, | |
| "grad_norm": 4.396824836730957, | |
| "learning_rate": 2.60017652250662e-05, | |
| "loss": 0.5125, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 2.4093195657929574, | |
| "grad_norm": 3.351771831512451, | |
| "learning_rate": 2.5915269196822594e-05, | |
| "loss": 0.5022, | |
| "step": 13650 | |
| }, | |
| { | |
| "epoch": 2.4181449121878034, | |
| "grad_norm": 3.4032695293426514, | |
| "learning_rate": 2.58270079435128e-05, | |
| "loss": 0.4641, | |
| "step": 13700 | |
| }, | |
| { | |
| "epoch": 2.4269702585826494, | |
| "grad_norm": 2.782778263092041, | |
| "learning_rate": 2.5738746690203002e-05, | |
| "loss": 0.4862, | |
| "step": 13750 | |
| }, | |
| { | |
| "epoch": 2.4357956049774954, | |
| "grad_norm": 2.8130016326904297, | |
| "learning_rate": 2.5650485436893208e-05, | |
| "loss": 0.4982, | |
| "step": 13800 | |
| }, | |
| { | |
| "epoch": 2.4446209513723414, | |
| "grad_norm": 3.3339552879333496, | |
| "learning_rate": 2.556222418358341e-05, | |
| "loss": 0.4999, | |
| "step": 13850 | |
| }, | |
| { | |
| "epoch": 2.4534462977671874, | |
| "grad_norm": 2.952395439147949, | |
| "learning_rate": 2.547396293027361e-05, | |
| "loss": 0.5009, | |
| "step": 13900 | |
| }, | |
| { | |
| "epoch": 2.4622716441620334, | |
| "grad_norm": 2.979508399963379, | |
| "learning_rate": 2.5385701676963815e-05, | |
| "loss": 0.4805, | |
| "step": 13950 | |
| }, | |
| { | |
| "epoch": 2.4710969905568794, | |
| "grad_norm": 2.709047555923462, | |
| "learning_rate": 2.5297440423654017e-05, | |
| "loss": 0.5152, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 2.4710969905568794, | |
| "eval_loss": 0.7000770568847656, | |
| "eval_runtime": 181.7422, | |
| "eval_samples_per_second": 155.864, | |
| "eval_steps_per_second": 15.588, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 2.4799223369517254, | |
| "grad_norm": 2.9618706703186035, | |
| "learning_rate": 2.5209179170344223e-05, | |
| "loss": 0.4763, | |
| "step": 14050 | |
| }, | |
| { | |
| "epoch": 2.4887476833465714, | |
| "grad_norm": 2.6275711059570312, | |
| "learning_rate": 2.5120917917034425e-05, | |
| "loss": 0.5057, | |
| "step": 14100 | |
| }, | |
| { | |
| "epoch": 2.4975730297414174, | |
| "grad_norm": 2.0146267414093018, | |
| "learning_rate": 2.5032656663724624e-05, | |
| "loss": 0.4706, | |
| "step": 14150 | |
| }, | |
| { | |
| "epoch": 2.5063983761362634, | |
| "grad_norm": 3.7594754695892334, | |
| "learning_rate": 2.4944395410414826e-05, | |
| "loss": 0.5005, | |
| "step": 14200 | |
| }, | |
| { | |
| "epoch": 2.5152237225311094, | |
| "grad_norm": 2.0959393978118896, | |
| "learning_rate": 2.4856134157105032e-05, | |
| "loss": 0.513, | |
| "step": 14250 | |
| }, | |
| { | |
| "epoch": 2.5240490689259554, | |
| "grad_norm": 2.7866508960723877, | |
| "learning_rate": 2.4767872903795238e-05, | |
| "loss": 0.5094, | |
| "step": 14300 | |
| }, | |
| { | |
| "epoch": 2.5328744153208014, | |
| "grad_norm": 3.7597134113311768, | |
| "learning_rate": 2.4679611650485437e-05, | |
| "loss": 0.4843, | |
| "step": 14350 | |
| }, | |
| { | |
| "epoch": 2.5416997617156474, | |
| "grad_norm": 2.447920083999634, | |
| "learning_rate": 2.4591350397175643e-05, | |
| "loss": 0.4716, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 2.5505251081104934, | |
| "grad_norm": 3.173151731491089, | |
| "learning_rate": 2.450308914386584e-05, | |
| "loss": 0.4899, | |
| "step": 14450 | |
| }, | |
| { | |
| "epoch": 2.5593504545053394, | |
| "grad_norm": 2.6911866664886475, | |
| "learning_rate": 2.4414827890556047e-05, | |
| "loss": 0.4829, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 2.5681758009001854, | |
| "grad_norm": 2.1910979747772217, | |
| "learning_rate": 2.432656663724625e-05, | |
| "loss": 0.5169, | |
| "step": 14550 | |
| }, | |
| { | |
| "epoch": 2.5770011472950314, | |
| "grad_norm": 3.5892837047576904, | |
| "learning_rate": 2.4238305383936452e-05, | |
| "loss": 0.5169, | |
| "step": 14600 | |
| }, | |
| { | |
| "epoch": 2.5858264936898774, | |
| "grad_norm": 4.015511989593506, | |
| "learning_rate": 2.4150044130626658e-05, | |
| "loss": 0.4537, | |
| "step": 14650 | |
| }, | |
| { | |
| "epoch": 2.5946518400847234, | |
| "grad_norm": 3.547804832458496, | |
| "learning_rate": 2.406178287731686e-05, | |
| "loss": 0.5112, | |
| "step": 14700 | |
| }, | |
| { | |
| "epoch": 2.6034771864795694, | |
| "grad_norm": 3.3924198150634766, | |
| "learning_rate": 2.3973521624007062e-05, | |
| "loss": 0.4676, | |
| "step": 14750 | |
| }, | |
| { | |
| "epoch": 2.6123025328744154, | |
| "grad_norm": 2.915863037109375, | |
| "learning_rate": 2.3885260370697265e-05, | |
| "loss": 0.486, | |
| "step": 14800 | |
| }, | |
| { | |
| "epoch": 2.6211278792692614, | |
| "grad_norm": 2.734058380126953, | |
| "learning_rate": 2.3796999117387467e-05, | |
| "loss": 0.4795, | |
| "step": 14850 | |
| }, | |
| { | |
| "epoch": 2.6299532256641074, | |
| "grad_norm": 2.50942325592041, | |
| "learning_rate": 2.370873786407767e-05, | |
| "loss": 0.4739, | |
| "step": 14900 | |
| }, | |
| { | |
| "epoch": 2.6387785720589534, | |
| "grad_norm": 3.3483402729034424, | |
| "learning_rate": 2.3620476610767875e-05, | |
| "loss": 0.5242, | |
| "step": 14950 | |
| }, | |
| { | |
| "epoch": 2.6476039184537994, | |
| "grad_norm": 2.779780149459839, | |
| "learning_rate": 2.3532215357458077e-05, | |
| "loss": 0.465, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 2.6476039184537994, | |
| "eval_loss": 0.7009094953536987, | |
| "eval_runtime": 181.7609, | |
| "eval_samples_per_second": 155.848, | |
| "eval_steps_per_second": 15.586, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 2.6564292648486454, | |
| "grad_norm": 3.3387420177459717, | |
| "learning_rate": 2.344395410414828e-05, | |
| "loss": 0.5071, | |
| "step": 15050 | |
| }, | |
| { | |
| "epoch": 2.6652546112434914, | |
| "grad_norm": 3.358072519302368, | |
| "learning_rate": 2.3355692850838485e-05, | |
| "loss": 0.4635, | |
| "step": 15100 | |
| }, | |
| { | |
| "epoch": 2.6740799576383374, | |
| "grad_norm": 3.029219627380371, | |
| "learning_rate": 2.3267431597528684e-05, | |
| "loss": 0.4748, | |
| "step": 15150 | |
| }, | |
| { | |
| "epoch": 2.6829053040331834, | |
| "grad_norm": 2.3864011764526367, | |
| "learning_rate": 2.317917034421889e-05, | |
| "loss": 0.4895, | |
| "step": 15200 | |
| }, | |
| { | |
| "epoch": 2.6917306504280294, | |
| "grad_norm": 2.5701119899749756, | |
| "learning_rate": 2.309090909090909e-05, | |
| "loss": 0.4693, | |
| "step": 15250 | |
| }, | |
| { | |
| "epoch": 2.7005559968228754, | |
| "grad_norm": 2.640472173690796, | |
| "learning_rate": 2.3002647837599295e-05, | |
| "loss": 0.4556, | |
| "step": 15300 | |
| }, | |
| { | |
| "epoch": 2.7093813432177214, | |
| "grad_norm": 2.6650307178497314, | |
| "learning_rate": 2.29143865842895e-05, | |
| "loss": 0.4795, | |
| "step": 15350 | |
| }, | |
| { | |
| "epoch": 2.7182066896125674, | |
| "grad_norm": 2.955892562866211, | |
| "learning_rate": 2.28261253309797e-05, | |
| "loss": 0.458, | |
| "step": 15400 | |
| }, | |
| { | |
| "epoch": 2.7270320360074134, | |
| "grad_norm": 3.361410617828369, | |
| "learning_rate": 2.2737864077669905e-05, | |
| "loss": 0.5199, | |
| "step": 15450 | |
| }, | |
| { | |
| "epoch": 2.7358573824022594, | |
| "grad_norm": 3.844050645828247, | |
| "learning_rate": 2.2649602824360108e-05, | |
| "loss": 0.4946, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 2.7446827287971054, | |
| "grad_norm": 3.419942617416382, | |
| "learning_rate": 2.256134157105031e-05, | |
| "loss": 0.5105, | |
| "step": 15550 | |
| }, | |
| { | |
| "epoch": 2.7535080751919514, | |
| "grad_norm": 2.58404278755188, | |
| "learning_rate": 2.2473080317740512e-05, | |
| "loss": 0.5022, | |
| "step": 15600 | |
| }, | |
| { | |
| "epoch": 2.7623334215867974, | |
| "grad_norm": 2.5541560649871826, | |
| "learning_rate": 2.2384819064430715e-05, | |
| "loss": 0.5038, | |
| "step": 15650 | |
| }, | |
| { | |
| "epoch": 2.7711587679816434, | |
| "grad_norm": 2.7854466438293457, | |
| "learning_rate": 2.229655781112092e-05, | |
| "loss": 0.501, | |
| "step": 15700 | |
| }, | |
| { | |
| "epoch": 2.7799841143764894, | |
| "grad_norm": 2.999455213546753, | |
| "learning_rate": 2.2208296557811123e-05, | |
| "loss": 0.4941, | |
| "step": 15750 | |
| }, | |
| { | |
| "epoch": 2.7888094607713354, | |
| "grad_norm": 3.169633388519287, | |
| "learning_rate": 2.2120035304501325e-05, | |
| "loss": 0.5011, | |
| "step": 15800 | |
| }, | |
| { | |
| "epoch": 2.7976348071661814, | |
| "grad_norm": 2.6727561950683594, | |
| "learning_rate": 2.2031774051191527e-05, | |
| "loss": 0.4743, | |
| "step": 15850 | |
| }, | |
| { | |
| "epoch": 2.8064601535610274, | |
| "grad_norm": 2.7246663570404053, | |
| "learning_rate": 2.1943512797881733e-05, | |
| "loss": 0.4549, | |
| "step": 15900 | |
| }, | |
| { | |
| "epoch": 2.8152854999558734, | |
| "grad_norm": 3.7977099418640137, | |
| "learning_rate": 2.1855251544571932e-05, | |
| "loss": 0.4873, | |
| "step": 15950 | |
| }, | |
| { | |
| "epoch": 2.8241108463507194, | |
| "grad_norm": 3.3139097690582275, | |
| "learning_rate": 2.1766990291262138e-05, | |
| "loss": 0.4491, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 2.8241108463507194, | |
| "eval_loss": 0.6963682174682617, | |
| "eval_runtime": 181.7512, | |
| "eval_samples_per_second": 155.856, | |
| "eval_steps_per_second": 15.587, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 2.8329361927455654, | |
| "grad_norm": 3.014883279800415, | |
| "learning_rate": 2.167872903795234e-05, | |
| "loss": 0.503, | |
| "step": 16050 | |
| }, | |
| { | |
| "epoch": 2.8417615391404114, | |
| "grad_norm": 3.094257116317749, | |
| "learning_rate": 2.1590467784642542e-05, | |
| "loss": 0.466, | |
| "step": 16100 | |
| }, | |
| { | |
| "epoch": 2.8505868855352574, | |
| "grad_norm": 3.347677707672119, | |
| "learning_rate": 2.1502206531332748e-05, | |
| "loss": 0.4772, | |
| "step": 16150 | |
| }, | |
| { | |
| "epoch": 2.8594122319301034, | |
| "grad_norm": 3.1895172595977783, | |
| "learning_rate": 2.1413945278022947e-05, | |
| "loss": 0.4722, | |
| "step": 16200 | |
| }, | |
| { | |
| "epoch": 2.8682375783249494, | |
| "grad_norm": 3.3283205032348633, | |
| "learning_rate": 2.1325684024713153e-05, | |
| "loss": 0.4596, | |
| "step": 16250 | |
| }, | |
| { | |
| "epoch": 2.8770629247197954, | |
| "grad_norm": 2.760470390319824, | |
| "learning_rate": 2.1237422771403355e-05, | |
| "loss": 0.4908, | |
| "step": 16300 | |
| }, | |
| { | |
| "epoch": 2.8858882711146414, | |
| "grad_norm": 2.9019477367401123, | |
| "learning_rate": 2.1149161518093558e-05, | |
| "loss": 0.4889, | |
| "step": 16350 | |
| }, | |
| { | |
| "epoch": 2.8947136175094874, | |
| "grad_norm": 2.706892251968384, | |
| "learning_rate": 2.1060900264783763e-05, | |
| "loss": 0.4896, | |
| "step": 16400 | |
| }, | |
| { | |
| "epoch": 2.9035389639043334, | |
| "grad_norm": 2.5221893787384033, | |
| "learning_rate": 2.0972639011473962e-05, | |
| "loss": 0.4779, | |
| "step": 16450 | |
| }, | |
| { | |
| "epoch": 2.9123643102991794, | |
| "grad_norm": 2.687797784805298, | |
| "learning_rate": 2.0884377758164168e-05, | |
| "loss": 0.4829, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 2.9211896566940254, | |
| "grad_norm": 3.208223581314087, | |
| "learning_rate": 2.079611650485437e-05, | |
| "loss": 0.5011, | |
| "step": 16550 | |
| }, | |
| { | |
| "epoch": 2.9300150030888714, | |
| "grad_norm": 3.5684010982513428, | |
| "learning_rate": 2.0707855251544573e-05, | |
| "loss": 0.5349, | |
| "step": 16600 | |
| }, | |
| { | |
| "epoch": 2.9388403494837174, | |
| "grad_norm": 2.910231351852417, | |
| "learning_rate": 2.0619593998234775e-05, | |
| "loss": 0.4876, | |
| "step": 16650 | |
| }, | |
| { | |
| "epoch": 2.9476656958785634, | |
| "grad_norm": 2.2015023231506348, | |
| "learning_rate": 2.0531332744924977e-05, | |
| "loss": 0.4947, | |
| "step": 16700 | |
| }, | |
| { | |
| "epoch": 2.9564910422734094, | |
| "grad_norm": 3.053490400314331, | |
| "learning_rate": 2.0443071491615183e-05, | |
| "loss": 0.532, | |
| "step": 16750 | |
| }, | |
| { | |
| "epoch": 2.9653163886682554, | |
| "grad_norm": 2.6305556297302246, | |
| "learning_rate": 2.0354810238305385e-05, | |
| "loss": 0.4773, | |
| "step": 16800 | |
| }, | |
| { | |
| "epoch": 2.9741417350631014, | |
| "grad_norm": 1.92122220993042, | |
| "learning_rate": 2.0266548984995588e-05, | |
| "loss": 0.4753, | |
| "step": 16850 | |
| }, | |
| { | |
| "epoch": 2.9829670814579474, | |
| "grad_norm": 2.6048147678375244, | |
| "learning_rate": 2.017828773168579e-05, | |
| "loss": 0.5042, | |
| "step": 16900 | |
| }, | |
| { | |
| "epoch": 2.9917924278527934, | |
| "grad_norm": 2.4308252334594727, | |
| "learning_rate": 2.0090026478375996e-05, | |
| "loss": 0.4722, | |
| "step": 16950 | |
| }, | |
| { | |
| "epoch": 3.0006177742476394, | |
| "grad_norm": 2.56939435005188, | |
| "learning_rate": 2.0003530450132395e-05, | |
| "loss": 0.4925, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 3.0006177742476394, | |
| "eval_loss": 0.6830546855926514, | |
| "eval_runtime": 181.7333, | |
| "eval_samples_per_second": 155.871, | |
| "eval_steps_per_second": 15.589, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 3.0094431206424854, | |
| "grad_norm": 2.973703622817993, | |
| "learning_rate": 1.9915269196822594e-05, | |
| "loss": 0.3581, | |
| "step": 17050 | |
| }, | |
| { | |
| "epoch": 3.0182684670373314, | |
| "grad_norm": 5.493050575256348, | |
| "learning_rate": 1.98270079435128e-05, | |
| "loss": 0.3559, | |
| "step": 17100 | |
| }, | |
| { | |
| "epoch": 3.0270938134321774, | |
| "grad_norm": 2.2883782386779785, | |
| "learning_rate": 1.9738746690203002e-05, | |
| "loss": 0.3607, | |
| "step": 17150 | |
| }, | |
| { | |
| "epoch": 3.0359191598270234, | |
| "grad_norm": 2.020956039428711, | |
| "learning_rate": 1.9650485436893204e-05, | |
| "loss": 0.3722, | |
| "step": 17200 | |
| }, | |
| { | |
| "epoch": 3.0447445062218694, | |
| "grad_norm": 3.439941883087158, | |
| "learning_rate": 1.9562224183583406e-05, | |
| "loss": 0.3458, | |
| "step": 17250 | |
| }, | |
| { | |
| "epoch": 3.0535698526167154, | |
| "grad_norm": 2.0934534072875977, | |
| "learning_rate": 1.9473962930273612e-05, | |
| "loss": 0.3543, | |
| "step": 17300 | |
| }, | |
| { | |
| "epoch": 3.062395199011561, | |
| "grad_norm": 2.9070968627929688, | |
| "learning_rate": 1.9385701676963815e-05, | |
| "loss": 0.3893, | |
| "step": 17350 | |
| }, | |
| { | |
| "epoch": 3.071220545406407, | |
| "grad_norm": 2.8557307720184326, | |
| "learning_rate": 1.9297440423654017e-05, | |
| "loss": 0.3579, | |
| "step": 17400 | |
| }, | |
| { | |
| "epoch": 3.080045891801253, | |
| "grad_norm": 3.181415557861328, | |
| "learning_rate": 1.920917917034422e-05, | |
| "loss": 0.3933, | |
| "step": 17450 | |
| }, | |
| { | |
| "epoch": 3.088871238196099, | |
| "grad_norm": 3.302639961242676, | |
| "learning_rate": 1.912091791703442e-05, | |
| "loss": 0.4011, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 3.097696584590945, | |
| "grad_norm": 2.84566330909729, | |
| "learning_rate": 1.9032656663724627e-05, | |
| "loss": 0.3722, | |
| "step": 17550 | |
| }, | |
| { | |
| "epoch": 3.106521930985791, | |
| "grad_norm": 2.536458730697632, | |
| "learning_rate": 1.894439541041483e-05, | |
| "loss": 0.3794, | |
| "step": 17600 | |
| }, | |
| { | |
| "epoch": 3.115347277380637, | |
| "grad_norm": 2.6080989837646484, | |
| "learning_rate": 1.8856134157105032e-05, | |
| "loss": 0.3696, | |
| "step": 17650 | |
| }, | |
| { | |
| "epoch": 3.124172623775483, | |
| "grad_norm": 3.233208417892456, | |
| "learning_rate": 1.8767872903795238e-05, | |
| "loss": 0.3693, | |
| "step": 17700 | |
| }, | |
| { | |
| "epoch": 3.132997970170329, | |
| "grad_norm": 2.3864200115203857, | |
| "learning_rate": 1.8679611650485437e-05, | |
| "loss": 0.3647, | |
| "step": 17750 | |
| }, | |
| { | |
| "epoch": 3.141823316565175, | |
| "grad_norm": 3.92567777633667, | |
| "learning_rate": 1.8591350397175642e-05, | |
| "loss": 0.3672, | |
| "step": 17800 | |
| }, | |
| { | |
| "epoch": 3.150648662960021, | |
| "grad_norm": 2.5370068550109863, | |
| "learning_rate": 1.850308914386584e-05, | |
| "loss": 0.3576, | |
| "step": 17850 | |
| }, | |
| { | |
| "epoch": 3.159474009354867, | |
| "grad_norm": 2.7078917026519775, | |
| "learning_rate": 1.8414827890556047e-05, | |
| "loss": 0.3667, | |
| "step": 17900 | |
| }, | |
| { | |
| "epoch": 3.168299355749713, | |
| "grad_norm": 3.0950546264648438, | |
| "learning_rate": 1.832656663724625e-05, | |
| "loss": 0.3988, | |
| "step": 17950 | |
| }, | |
| { | |
| "epoch": 3.177124702144559, | |
| "grad_norm": 2.078380823135376, | |
| "learning_rate": 1.8238305383936452e-05, | |
| "loss": 0.3736, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 3.177124702144559, | |
| "eval_loss": 0.6996409296989441, | |
| "eval_runtime": 181.8922, | |
| "eval_samples_per_second": 155.735, | |
| "eval_steps_per_second": 15.575, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 3.185950048539405, | |
| "grad_norm": 2.491473913192749, | |
| "learning_rate": 1.8150044130626657e-05, | |
| "loss": 0.3493, | |
| "step": 18050 | |
| }, | |
| { | |
| "epoch": 3.194775394934251, | |
| "grad_norm": 3.2102303504943848, | |
| "learning_rate": 1.806178287731686e-05, | |
| "loss": 0.3671, | |
| "step": 18100 | |
| }, | |
| { | |
| "epoch": 3.203600741329097, | |
| "grad_norm": 2.8939054012298584, | |
| "learning_rate": 1.7973521624007062e-05, | |
| "loss": 0.3849, | |
| "step": 18150 | |
| }, | |
| { | |
| "epoch": 3.212426087723943, | |
| "grad_norm": 2.372587203979492, | |
| "learning_rate": 1.7885260370697265e-05, | |
| "loss": 0.3651, | |
| "step": 18200 | |
| }, | |
| { | |
| "epoch": 3.221251434118789, | |
| "grad_norm": 3.5877439975738525, | |
| "learning_rate": 1.7796999117387467e-05, | |
| "loss": 0.3736, | |
| "step": 18250 | |
| }, | |
| { | |
| "epoch": 3.230076780513635, | |
| "grad_norm": 3.3625779151916504, | |
| "learning_rate": 1.770873786407767e-05, | |
| "loss": 0.3653, | |
| "step": 18300 | |
| }, | |
| { | |
| "epoch": 3.238902126908481, | |
| "grad_norm": 3.4853782653808594, | |
| "learning_rate": 1.7620476610767875e-05, | |
| "loss": 0.3615, | |
| "step": 18350 | |
| }, | |
| { | |
| "epoch": 3.247727473303327, | |
| "grad_norm": 4.208776950836182, | |
| "learning_rate": 1.7532215357458077e-05, | |
| "loss": 0.3907, | |
| "step": 18400 | |
| }, | |
| { | |
| "epoch": 3.256552819698173, | |
| "grad_norm": 3.6717019081115723, | |
| "learning_rate": 1.744395410414828e-05, | |
| "loss": 0.3682, | |
| "step": 18450 | |
| }, | |
| { | |
| "epoch": 3.265378166093019, | |
| "grad_norm": 2.0552759170532227, | |
| "learning_rate": 1.7355692850838482e-05, | |
| "loss": 0.3786, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 3.274203512487865, | |
| "grad_norm": 2.2906410694122314, | |
| "learning_rate": 1.7267431597528684e-05, | |
| "loss": 0.3941, | |
| "step": 18550 | |
| }, | |
| { | |
| "epoch": 3.283028858882711, | |
| "grad_norm": 2.546051263809204, | |
| "learning_rate": 1.717917034421889e-05, | |
| "loss": 0.374, | |
| "step": 18600 | |
| }, | |
| { | |
| "epoch": 3.291854205277557, | |
| "grad_norm": 3.243657350540161, | |
| "learning_rate": 1.7090909090909092e-05, | |
| "loss": 0.4133, | |
| "step": 18650 | |
| }, | |
| { | |
| "epoch": 3.300679551672403, | |
| "grad_norm": 3.139357328414917, | |
| "learning_rate": 1.7002647837599295e-05, | |
| "loss": 0.3807, | |
| "step": 18700 | |
| }, | |
| { | |
| "epoch": 3.309504898067249, | |
| "grad_norm": 4.064445972442627, | |
| "learning_rate": 1.69143865842895e-05, | |
| "loss": 0.386, | |
| "step": 18750 | |
| }, | |
| { | |
| "epoch": 3.318330244462095, | |
| "grad_norm": 2.3872506618499756, | |
| "learning_rate": 1.68261253309797e-05, | |
| "loss": 0.3618, | |
| "step": 18800 | |
| }, | |
| { | |
| "epoch": 3.327155590856941, | |
| "grad_norm": 2.3966455459594727, | |
| "learning_rate": 1.6737864077669905e-05, | |
| "loss": 0.3862, | |
| "step": 18850 | |
| }, | |
| { | |
| "epoch": 3.335980937251787, | |
| "grad_norm": 3.9382779598236084, | |
| "learning_rate": 1.6649602824360104e-05, | |
| "loss": 0.344, | |
| "step": 18900 | |
| }, | |
| { | |
| "epoch": 3.344806283646633, | |
| "grad_norm": 3.825925588607788, | |
| "learning_rate": 1.656134157105031e-05, | |
| "loss": 0.3657, | |
| "step": 18950 | |
| }, | |
| { | |
| "epoch": 3.353631630041479, | |
| "grad_norm": 2.955763101577759, | |
| "learning_rate": 1.6473080317740512e-05, | |
| "loss": 0.4116, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 3.353631630041479, | |
| "eval_loss": 0.7010481953620911, | |
| "eval_runtime": 181.7645, | |
| "eval_samples_per_second": 155.845, | |
| "eval_steps_per_second": 15.586, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 3.362456976436325, | |
| "grad_norm": 3.2211873531341553, | |
| "learning_rate": 1.6384819064430714e-05, | |
| "loss": 0.3662, | |
| "step": 19050 | |
| }, | |
| { | |
| "epoch": 3.371282322831171, | |
| "grad_norm": 3.256361722946167, | |
| "learning_rate": 1.629655781112092e-05, | |
| "loss": 0.3738, | |
| "step": 19100 | |
| }, | |
| { | |
| "epoch": 3.380107669226017, | |
| "grad_norm": 2.363861560821533, | |
| "learning_rate": 1.6208296557811123e-05, | |
| "loss": 0.4157, | |
| "step": 19150 | |
| }, | |
| { | |
| "epoch": 3.388933015620863, | |
| "grad_norm": 3.10766339302063, | |
| "learning_rate": 1.6120035304501325e-05, | |
| "loss": 0.3781, | |
| "step": 19200 | |
| }, | |
| { | |
| "epoch": 3.397758362015709, | |
| "grad_norm": 2.534269332885742, | |
| "learning_rate": 1.6031774051191527e-05, | |
| "loss": 0.3727, | |
| "step": 19250 | |
| }, | |
| { | |
| "epoch": 3.406583708410555, | |
| "grad_norm": 2.8136239051818848, | |
| "learning_rate": 1.594351279788173e-05, | |
| "loss": 0.3843, | |
| "step": 19300 | |
| }, | |
| { | |
| "epoch": 3.415409054805401, | |
| "grad_norm": 2.956463575363159, | |
| "learning_rate": 1.5855251544571932e-05, | |
| "loss": 0.3829, | |
| "step": 19350 | |
| }, | |
| { | |
| "epoch": 3.424234401200247, | |
| "grad_norm": 3.122279644012451, | |
| "learning_rate": 1.5766990291262138e-05, | |
| "loss": 0.3771, | |
| "step": 19400 | |
| }, | |
| { | |
| "epoch": 3.433059747595093, | |
| "grad_norm": 3.2590718269348145, | |
| "learning_rate": 1.567872903795234e-05, | |
| "loss": 0.3508, | |
| "step": 19450 | |
| }, | |
| { | |
| "epoch": 3.441885093989939, | |
| "grad_norm": 2.1195740699768066, | |
| "learning_rate": 1.5590467784642542e-05, | |
| "loss": 0.3806, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 3.450710440384785, | |
| "grad_norm": 2.7716526985168457, | |
| "learning_rate": 1.5502206531332748e-05, | |
| "loss": 0.372, | |
| "step": 19550 | |
| }, | |
| { | |
| "epoch": 3.459535786779631, | |
| "grad_norm": 2.7619106769561768, | |
| "learning_rate": 1.5413945278022947e-05, | |
| "loss": 0.3641, | |
| "step": 19600 | |
| }, | |
| { | |
| "epoch": 3.468361133174477, | |
| "grad_norm": 3.310124635696411, | |
| "learning_rate": 1.5325684024713153e-05, | |
| "loss": 0.3525, | |
| "step": 19650 | |
| }, | |
| { | |
| "epoch": 3.477186479569323, | |
| "grad_norm": 2.739682674407959, | |
| "learning_rate": 1.523918799646955e-05, | |
| "loss": 0.3682, | |
| "step": 19700 | |
| }, | |
| { | |
| "epoch": 3.486011825964169, | |
| "grad_norm": 3.138272523880005, | |
| "learning_rate": 1.5150926743159754e-05, | |
| "loss": 0.3577, | |
| "step": 19750 | |
| }, | |
| { | |
| "epoch": 3.494837172359015, | |
| "grad_norm": 2.602055549621582, | |
| "learning_rate": 1.5062665489849956e-05, | |
| "loss": 0.3866, | |
| "step": 19800 | |
| }, | |
| { | |
| "epoch": 3.503662518753861, | |
| "grad_norm": 3.8529045581817627, | |
| "learning_rate": 1.497440423654016e-05, | |
| "loss": 0.369, | |
| "step": 19850 | |
| }, | |
| { | |
| "epoch": 3.512487865148707, | |
| "grad_norm": 2.2044355869293213, | |
| "learning_rate": 1.4886142983230364e-05, | |
| "loss": 0.3472, | |
| "step": 19900 | |
| }, | |
| { | |
| "epoch": 3.521313211543553, | |
| "grad_norm": 4.661579132080078, | |
| "learning_rate": 1.4797881729920565e-05, | |
| "loss": 0.3698, | |
| "step": 19950 | |
| }, | |
| { | |
| "epoch": 3.530138557938399, | |
| "grad_norm": 2.468601703643799, | |
| "learning_rate": 1.4709620476610769e-05, | |
| "loss": 0.3905, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 3.530138557938399, | |
| "eval_loss": 0.6975388526916504, | |
| "eval_runtime": 181.7923, | |
| "eval_samples_per_second": 155.821, | |
| "eval_steps_per_second": 15.584, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 3.538963904333245, | |
| "grad_norm": 3.216001272201538, | |
| "learning_rate": 1.462135922330097e-05, | |
| "loss": 0.3641, | |
| "step": 20050 | |
| }, | |
| { | |
| "epoch": 3.547789250728091, | |
| "grad_norm": 2.9274120330810547, | |
| "learning_rate": 1.4533097969991174e-05, | |
| "loss": 0.3772, | |
| "step": 20100 | |
| }, | |
| { | |
| "epoch": 3.556614597122937, | |
| "grad_norm": 3.954235076904297, | |
| "learning_rate": 1.4444836716681378e-05, | |
| "loss": 0.3594, | |
| "step": 20150 | |
| }, | |
| { | |
| "epoch": 3.565439943517783, | |
| "grad_norm": 2.441312551498413, | |
| "learning_rate": 1.435657546337158e-05, | |
| "loss": 0.3602, | |
| "step": 20200 | |
| }, | |
| { | |
| "epoch": 3.574265289912629, | |
| "grad_norm": 2.377568006515503, | |
| "learning_rate": 1.4268314210061784e-05, | |
| "loss": 0.37, | |
| "step": 20250 | |
| }, | |
| { | |
| "epoch": 3.583090636307475, | |
| "grad_norm": 2.7893033027648926, | |
| "learning_rate": 1.4180052956751985e-05, | |
| "loss": 0.3747, | |
| "step": 20300 | |
| }, | |
| { | |
| "epoch": 3.591915982702321, | |
| "grad_norm": 5.03574800491333, | |
| "learning_rate": 1.4091791703442189e-05, | |
| "loss": 0.3832, | |
| "step": 20350 | |
| }, | |
| { | |
| "epoch": 3.600741329097167, | |
| "grad_norm": 3.8684732913970947, | |
| "learning_rate": 1.4003530450132393e-05, | |
| "loss": 0.3635, | |
| "step": 20400 | |
| }, | |
| { | |
| "epoch": 3.609566675492013, | |
| "grad_norm": 2.8690998554229736, | |
| "learning_rate": 1.3915269196822595e-05, | |
| "loss": 0.3532, | |
| "step": 20450 | |
| }, | |
| { | |
| "epoch": 3.618392021886859, | |
| "grad_norm": 3.5299575328826904, | |
| "learning_rate": 1.38270079435128e-05, | |
| "loss": 0.3592, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 3.627217368281705, | |
| "grad_norm": 2.8447256088256836, | |
| "learning_rate": 1.3738746690203003e-05, | |
| "loss": 0.3809, | |
| "step": 20550 | |
| }, | |
| { | |
| "epoch": 3.636042714676551, | |
| "grad_norm": 2.9593961238861084, | |
| "learning_rate": 1.3650485436893204e-05, | |
| "loss": 0.3842, | |
| "step": 20600 | |
| }, | |
| { | |
| "epoch": 3.644868061071397, | |
| "grad_norm": 2.8917315006256104, | |
| "learning_rate": 1.3562224183583408e-05, | |
| "loss": 0.3778, | |
| "step": 20650 | |
| }, | |
| { | |
| "epoch": 3.653693407466243, | |
| "grad_norm": 2.705604314804077, | |
| "learning_rate": 1.3473962930273609e-05, | |
| "loss": 0.3856, | |
| "step": 20700 | |
| }, | |
| { | |
| "epoch": 3.662518753861089, | |
| "grad_norm": 3.337663173675537, | |
| "learning_rate": 1.3385701676963813e-05, | |
| "loss": 0.3554, | |
| "step": 20750 | |
| }, | |
| { | |
| "epoch": 3.671344100255935, | |
| "grad_norm": 2.7694709300994873, | |
| "learning_rate": 1.3297440423654017e-05, | |
| "loss": 0.3576, | |
| "step": 20800 | |
| }, | |
| { | |
| "epoch": 3.680169446650781, | |
| "grad_norm": 3.0217745304107666, | |
| "learning_rate": 1.3209179170344219e-05, | |
| "loss": 0.3568, | |
| "step": 20850 | |
| }, | |
| { | |
| "epoch": 3.688994793045627, | |
| "grad_norm": 3.2156639099121094, | |
| "learning_rate": 1.3120917917034423e-05, | |
| "loss": 0.3733, | |
| "step": 20900 | |
| }, | |
| { | |
| "epoch": 3.697820139440473, | |
| "grad_norm": 3.0890026092529297, | |
| "learning_rate": 1.3032656663724627e-05, | |
| "loss": 0.3755, | |
| "step": 20950 | |
| }, | |
| { | |
| "epoch": 3.706645485835319, | |
| "grad_norm": 2.598747730255127, | |
| "learning_rate": 1.2944395410414828e-05, | |
| "loss": 0.3959, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 3.706645485835319, | |
| "eval_loss": 0.6940839290618896, | |
| "eval_runtime": 181.7434, | |
| "eval_samples_per_second": 155.863, | |
| "eval_steps_per_second": 15.588, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 3.715470832230165, | |
| "grad_norm": 3.0001120567321777, | |
| "learning_rate": 1.2856134157105032e-05, | |
| "loss": 0.3494, | |
| "step": 21050 | |
| }, | |
| { | |
| "epoch": 3.724296178625011, | |
| "grad_norm": 3.8093442916870117, | |
| "learning_rate": 1.2767872903795232e-05, | |
| "loss": 0.4024, | |
| "step": 21100 | |
| }, | |
| { | |
| "epoch": 3.733121525019857, | |
| "grad_norm": 3.1789517402648926, | |
| "learning_rate": 1.2679611650485437e-05, | |
| "loss": 0.3872, | |
| "step": 21150 | |
| }, | |
| { | |
| "epoch": 3.741946871414703, | |
| "grad_norm": 3.2620937824249268, | |
| "learning_rate": 1.259135039717564e-05, | |
| "loss": 0.3594, | |
| "step": 21200 | |
| }, | |
| { | |
| "epoch": 3.750772217809549, | |
| "grad_norm": 4.006773471832275, | |
| "learning_rate": 1.2503089143865843e-05, | |
| "loss": 0.3807, | |
| "step": 21250 | |
| }, | |
| { | |
| "epoch": 3.759597564204395, | |
| "grad_norm": 3.351083517074585, | |
| "learning_rate": 1.2414827890556047e-05, | |
| "loss": 0.3591, | |
| "step": 21300 | |
| }, | |
| { | |
| "epoch": 3.768422910599241, | |
| "grad_norm": 3.535079002380371, | |
| "learning_rate": 1.232656663724625e-05, | |
| "loss": 0.3901, | |
| "step": 21350 | |
| }, | |
| { | |
| "epoch": 3.777248256994087, | |
| "grad_norm": 3.1112380027770996, | |
| "learning_rate": 1.2238305383936452e-05, | |
| "loss": 0.3548, | |
| "step": 21400 | |
| }, | |
| { | |
| "epoch": 3.786073603388933, | |
| "grad_norm": 3.1333882808685303, | |
| "learning_rate": 1.2150044130626656e-05, | |
| "loss": 0.3696, | |
| "step": 21450 | |
| }, | |
| { | |
| "epoch": 3.794898949783779, | |
| "grad_norm": 3.432013511657715, | |
| "learning_rate": 1.2061782877316858e-05, | |
| "loss": 0.3492, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 3.803724296178625, | |
| "grad_norm": 3.25502347946167, | |
| "learning_rate": 1.1973521624007062e-05, | |
| "loss": 0.381, | |
| "step": 21550 | |
| }, | |
| { | |
| "epoch": 3.812549642573471, | |
| "grad_norm": 3.45226788520813, | |
| "learning_rate": 1.1885260370697264e-05, | |
| "loss": 0.3853, | |
| "step": 21600 | |
| }, | |
| { | |
| "epoch": 3.821374988968317, | |
| "grad_norm": 3.1285228729248047, | |
| "learning_rate": 1.1796999117387468e-05, | |
| "loss": 0.3897, | |
| "step": 21650 | |
| }, | |
| { | |
| "epoch": 3.830200335363163, | |
| "grad_norm": 2.432709217071533, | |
| "learning_rate": 1.170873786407767e-05, | |
| "loss": 0.3672, | |
| "step": 21700 | |
| }, | |
| { | |
| "epoch": 3.839025681758009, | |
| "grad_norm": 2.7926766872406006, | |
| "learning_rate": 1.1620476610767873e-05, | |
| "loss": 0.365, | |
| "step": 21750 | |
| }, | |
| { | |
| "epoch": 3.847851028152855, | |
| "grad_norm": 3.418755531311035, | |
| "learning_rate": 1.1532215357458075e-05, | |
| "loss": 0.3828, | |
| "step": 21800 | |
| }, | |
| { | |
| "epoch": 3.856676374547701, | |
| "grad_norm": 3.320993185043335, | |
| "learning_rate": 1.144395410414828e-05, | |
| "loss": 0.3527, | |
| "step": 21850 | |
| }, | |
| { | |
| "epoch": 3.865501720942547, | |
| "grad_norm": 3.38420033454895, | |
| "learning_rate": 1.1355692850838483e-05, | |
| "loss": 0.3768, | |
| "step": 21900 | |
| }, | |
| { | |
| "epoch": 3.874327067337393, | |
| "grad_norm": 2.989563226699829, | |
| "learning_rate": 1.1267431597528686e-05, | |
| "loss": 0.3536, | |
| "step": 21950 | |
| }, | |
| { | |
| "epoch": 3.883152413732239, | |
| "grad_norm": 2.897488832473755, | |
| "learning_rate": 1.1179170344218888e-05, | |
| "loss": 0.3853, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 3.883152413732239, | |
| "eval_loss": 0.689664900302887, | |
| "eval_runtime": 181.7323, | |
| "eval_samples_per_second": 155.872, | |
| "eval_steps_per_second": 15.589, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 3.891977760127085, | |
| "grad_norm": 3.514223575592041, | |
| "learning_rate": 1.1090909090909092e-05, | |
| "loss": 0.3693, | |
| "step": 22050 | |
| }, | |
| { | |
| "epoch": 3.900803106521931, | |
| "grad_norm": 3.4816884994506836, | |
| "learning_rate": 1.1002647837599295e-05, | |
| "loss": 0.3776, | |
| "step": 22100 | |
| }, | |
| { | |
| "epoch": 3.909628452916777, | |
| "grad_norm": 3.4098060131073, | |
| "learning_rate": 1.0914386584289497e-05, | |
| "loss": 0.3825, | |
| "step": 22150 | |
| }, | |
| { | |
| "epoch": 3.918453799311623, | |
| "grad_norm": 2.6594343185424805, | |
| "learning_rate": 1.08261253309797e-05, | |
| "loss": 0.3863, | |
| "step": 22200 | |
| }, | |
| { | |
| "epoch": 3.927279145706469, | |
| "grad_norm": 2.3485662937164307, | |
| "learning_rate": 1.07396293027361e-05, | |
| "loss": 0.3735, | |
| "step": 22250 | |
| }, | |
| { | |
| "epoch": 3.936104492101315, | |
| "grad_norm": 2.5646393299102783, | |
| "learning_rate": 1.0651368049426302e-05, | |
| "loss": 0.3884, | |
| "step": 22300 | |
| }, | |
| { | |
| "epoch": 3.944929838496161, | |
| "grad_norm": 2.7431557178497314, | |
| "learning_rate": 1.0563106796116505e-05, | |
| "loss": 0.3416, | |
| "step": 22350 | |
| }, | |
| { | |
| "epoch": 3.953755184891007, | |
| "grad_norm": 2.6949362754821777, | |
| "learning_rate": 1.0474845542806707e-05, | |
| "loss": 0.3718, | |
| "step": 22400 | |
| }, | |
| { | |
| "epoch": 3.962580531285853, | |
| "grad_norm": 2.811133861541748, | |
| "learning_rate": 1.0386584289496913e-05, | |
| "loss": 0.3376, | |
| "step": 22450 | |
| }, | |
| { | |
| "epoch": 3.971405877680699, | |
| "grad_norm": 2.7501027584075928, | |
| "learning_rate": 1.0298323036187115e-05, | |
| "loss": 0.3899, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 3.980231224075545, | |
| "grad_norm": 3.2334673404693604, | |
| "learning_rate": 1.0210061782877317e-05, | |
| "loss": 0.3666, | |
| "step": 22550 | |
| }, | |
| { | |
| "epoch": 3.989056570470391, | |
| "grad_norm": 2.5137345790863037, | |
| "learning_rate": 1.012180052956752e-05, | |
| "loss": 0.4036, | |
| "step": 22600 | |
| }, | |
| { | |
| "epoch": 3.997881916865237, | |
| "grad_norm": 3.6843361854553223, | |
| "learning_rate": 1.0033539276257724e-05, | |
| "loss": 0.378, | |
| "step": 22650 | |
| }, | |
| { | |
| "epoch": 4.006707263260083, | |
| "grad_norm": 3.4136714935302734, | |
| "learning_rate": 9.945278022947926e-06, | |
| "loss": 0.3131, | |
| "step": 22700 | |
| }, | |
| { | |
| "epoch": 4.015532609654929, | |
| "grad_norm": 2.0005850791931152, | |
| "learning_rate": 9.857016769638128e-06, | |
| "loss": 0.2745, | |
| "step": 22750 | |
| }, | |
| { | |
| "epoch": 4.024357956049775, | |
| "grad_norm": 3.076453447341919, | |
| "learning_rate": 9.768755516328332e-06, | |
| "loss": 0.2694, | |
| "step": 22800 | |
| }, | |
| { | |
| "epoch": 4.033183302444621, | |
| "grad_norm": 2.3047375679016113, | |
| "learning_rate": 9.680494263018536e-06, | |
| "loss": 0.2986, | |
| "step": 22850 | |
| }, | |
| { | |
| "epoch": 4.042008648839467, | |
| "grad_norm": 3.2981176376342773, | |
| "learning_rate": 9.592233009708739e-06, | |
| "loss": 0.3034, | |
| "step": 22900 | |
| }, | |
| { | |
| "epoch": 4.050833995234313, | |
| "grad_norm": 1.8951621055603027, | |
| "learning_rate": 9.503971756398941e-06, | |
| "loss": 0.2673, | |
| "step": 22950 | |
| }, | |
| { | |
| "epoch": 4.059659341629159, | |
| "grad_norm": 2.2991392612457275, | |
| "learning_rate": 9.415710503089143e-06, | |
| "loss": 0.2809, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 4.059659341629159, | |
| "eval_loss": 0.7109093070030212, | |
| "eval_runtime": 181.7083, | |
| "eval_samples_per_second": 155.893, | |
| "eval_steps_per_second": 15.591, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 4.068484688024005, | |
| "grad_norm": 3.2459051609039307, | |
| "learning_rate": 9.327449249779347e-06, | |
| "loss": 0.2973, | |
| "step": 23050 | |
| }, | |
| { | |
| "epoch": 4.077310034418851, | |
| "grad_norm": 2.494835615158081, | |
| "learning_rate": 9.23918799646955e-06, | |
| "loss": 0.2891, | |
| "step": 23100 | |
| }, | |
| { | |
| "epoch": 4.086135380813697, | |
| "grad_norm": 3.9098784923553467, | |
| "learning_rate": 9.150926743159754e-06, | |
| "loss": 0.2718, | |
| "step": 23150 | |
| }, | |
| { | |
| "epoch": 4.094960727208543, | |
| "grad_norm": 3.0526154041290283, | |
| "learning_rate": 9.062665489849956e-06, | |
| "loss": 0.3181, | |
| "step": 23200 | |
| }, | |
| { | |
| "epoch": 4.103786073603389, | |
| "grad_norm": 2.8916709423065186, | |
| "learning_rate": 8.97440423654016e-06, | |
| "loss": 0.304, | |
| "step": 23250 | |
| }, | |
| { | |
| "epoch": 4.112611419998235, | |
| "grad_norm": 2.576984405517578, | |
| "learning_rate": 8.886142983230363e-06, | |
| "loss": 0.2993, | |
| "step": 23300 | |
| }, | |
| { | |
| "epoch": 4.121436766393081, | |
| "grad_norm": 2.776900053024292, | |
| "learning_rate": 8.797881729920565e-06, | |
| "loss": 0.2662, | |
| "step": 23350 | |
| }, | |
| { | |
| "epoch": 4.130262112787927, | |
| "grad_norm": 2.9947192668914795, | |
| "learning_rate": 8.709620476610767e-06, | |
| "loss": 0.2965, | |
| "step": 23400 | |
| }, | |
| { | |
| "epoch": 4.139087459182773, | |
| "grad_norm": 3.5091171264648438, | |
| "learning_rate": 8.621359223300971e-06, | |
| "loss": 0.311, | |
| "step": 23450 | |
| }, | |
| { | |
| "epoch": 4.147912805577619, | |
| "grad_norm": 3.0500705242156982, | |
| "learning_rate": 8.533097969991175e-06, | |
| "loss": 0.3022, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 4.156738151972465, | |
| "grad_norm": 3.1017823219299316, | |
| "learning_rate": 8.444836716681378e-06, | |
| "loss": 0.2615, | |
| "step": 23550 | |
| }, | |
| { | |
| "epoch": 4.165563498367311, | |
| "grad_norm": 3.331235408782959, | |
| "learning_rate": 8.35657546337158e-06, | |
| "loss": 0.3127, | |
| "step": 23600 | |
| }, | |
| { | |
| "epoch": 4.174388844762157, | |
| "grad_norm": 2.713762044906616, | |
| "learning_rate": 8.268314210061784e-06, | |
| "loss": 0.3146, | |
| "step": 23650 | |
| }, | |
| { | |
| "epoch": 4.183214191157003, | |
| "grad_norm": 2.274113178253174, | |
| "learning_rate": 8.180052956751986e-06, | |
| "loss": 0.2899, | |
| "step": 23700 | |
| }, | |
| { | |
| "epoch": 4.192039537551849, | |
| "grad_norm": 2.678398370742798, | |
| "learning_rate": 8.091791703442189e-06, | |
| "loss": 0.2949, | |
| "step": 23750 | |
| }, | |
| { | |
| "epoch": 4.200864883946695, | |
| "grad_norm": 2.3312747478485107, | |
| "learning_rate": 8.003530450132391e-06, | |
| "loss": 0.3019, | |
| "step": 23800 | |
| }, | |
| { | |
| "epoch": 4.209690230341541, | |
| "grad_norm": 2.080692768096924, | |
| "learning_rate": 7.915269196822595e-06, | |
| "loss": 0.281, | |
| "step": 23850 | |
| }, | |
| { | |
| "epoch": 4.218515576736387, | |
| "grad_norm": 2.982898235321045, | |
| "learning_rate": 7.827007943512799e-06, | |
| "loss": 0.3062, | |
| "step": 23900 | |
| }, | |
| { | |
| "epoch": 4.227340923131233, | |
| "grad_norm": 2.4919323921203613, | |
| "learning_rate": 7.738746690203001e-06, | |
| "loss": 0.2851, | |
| "step": 23950 | |
| }, | |
| { | |
| "epoch": 4.236166269526079, | |
| "grad_norm": 2.330500841140747, | |
| "learning_rate": 7.650485436893204e-06, | |
| "loss": 0.3078, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 4.236166269526079, | |
| "eval_loss": 0.7070273756980896, | |
| "eval_runtime": 181.7435, | |
| "eval_samples_per_second": 155.863, | |
| "eval_steps_per_second": 15.588, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 4.244991615920925, | |
| "grad_norm": 3.2452151775360107, | |
| "learning_rate": 7.562224183583408e-06, | |
| "loss": 0.3133, | |
| "step": 24050 | |
| }, | |
| { | |
| "epoch": 4.253816962315771, | |
| "grad_norm": 3.1312410831451416, | |
| "learning_rate": 7.47396293027361e-06, | |
| "loss": 0.2953, | |
| "step": 24100 | |
| }, | |
| { | |
| "epoch": 4.262642308710617, | |
| "grad_norm": 3.128032684326172, | |
| "learning_rate": 7.385701676963813e-06, | |
| "loss": 0.2757, | |
| "step": 24150 | |
| }, | |
| { | |
| "epoch": 4.271467655105463, | |
| "grad_norm": 3.9495296478271484, | |
| "learning_rate": 7.297440423654016e-06, | |
| "loss": 0.2845, | |
| "step": 24200 | |
| }, | |
| { | |
| "epoch": 4.280293001500309, | |
| "grad_norm": 3.0301661491394043, | |
| "learning_rate": 7.20917917034422e-06, | |
| "loss": 0.3086, | |
| "step": 24250 | |
| }, | |
| { | |
| "epoch": 4.289118347895155, | |
| "grad_norm": 3.9362733364105225, | |
| "learning_rate": 7.120917917034423e-06, | |
| "loss": 0.2845, | |
| "step": 24300 | |
| }, | |
| { | |
| "epoch": 4.297943694290001, | |
| "grad_norm": 2.506880760192871, | |
| "learning_rate": 7.032656663724625e-06, | |
| "loss": 0.3013, | |
| "step": 24350 | |
| }, | |
| { | |
| "epoch": 4.306769040684847, | |
| "grad_norm": 2.264787435531616, | |
| "learning_rate": 6.944395410414828e-06, | |
| "loss": 0.2966, | |
| "step": 24400 | |
| }, | |
| { | |
| "epoch": 4.315594387079693, | |
| "grad_norm": 2.2223780155181885, | |
| "learning_rate": 6.856134157105031e-06, | |
| "loss": 0.2934, | |
| "step": 24450 | |
| }, | |
| { | |
| "epoch": 4.324419733474539, | |
| "grad_norm": 3.4193592071533203, | |
| "learning_rate": 6.767872903795235e-06, | |
| "loss": 0.2961, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 4.333245079869385, | |
| "grad_norm": 3.005798101425171, | |
| "learning_rate": 6.679611650485437e-06, | |
| "loss": 0.2712, | |
| "step": 24550 | |
| }, | |
| { | |
| "epoch": 4.342070426264231, | |
| "grad_norm": 3.0497212409973145, | |
| "learning_rate": 6.5913503971756395e-06, | |
| "loss": 0.2939, | |
| "step": 24600 | |
| }, | |
| { | |
| "epoch": 4.350895772659077, | |
| "grad_norm": 2.77939772605896, | |
| "learning_rate": 6.503089143865843e-06, | |
| "loss": 0.2936, | |
| "step": 24650 | |
| }, | |
| { | |
| "epoch": 4.359721119053923, | |
| "grad_norm": 2.7358789443969727, | |
| "learning_rate": 6.414827890556047e-06, | |
| "loss": 0.3102, | |
| "step": 24700 | |
| }, | |
| { | |
| "epoch": 4.368546465448769, | |
| "grad_norm": 3.289128303527832, | |
| "learning_rate": 6.326566637246249e-06, | |
| "loss": 0.2994, | |
| "step": 24750 | |
| }, | |
| { | |
| "epoch": 4.377371811843615, | |
| "grad_norm": 2.8301990032196045, | |
| "learning_rate": 6.238305383936452e-06, | |
| "loss": 0.2993, | |
| "step": 24800 | |
| }, | |
| { | |
| "epoch": 4.386197158238461, | |
| "grad_norm": 3.143252372741699, | |
| "learning_rate": 6.1500441306266555e-06, | |
| "loss": 0.2728, | |
| "step": 24850 | |
| }, | |
| { | |
| "epoch": 4.395022504633307, | |
| "grad_norm": 2.6168856620788574, | |
| "learning_rate": 6.061782877316858e-06, | |
| "loss": 0.2935, | |
| "step": 24900 | |
| }, | |
| { | |
| "epoch": 4.403847851028153, | |
| "grad_norm": 5.8992228507995605, | |
| "learning_rate": 5.973521624007061e-06, | |
| "loss": 0.3128, | |
| "step": 24950 | |
| }, | |
| { | |
| "epoch": 4.412673197422999, | |
| "grad_norm": 3.9669909477233887, | |
| "learning_rate": 5.88702559576346e-06, | |
| "loss": 0.3087, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 4.412673197422999, | |
| "eval_loss": 0.7075334191322327, | |
| "eval_runtime": 181.7426, | |
| "eval_samples_per_second": 155.863, | |
| "eval_steps_per_second": 15.588, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 4.421498543817845, | |
| "grad_norm": 1.8581490516662598, | |
| "learning_rate": 5.798764342453663e-06, | |
| "loss": 0.2811, | |
| "step": 25050 | |
| }, | |
| { | |
| "epoch": 4.430323890212691, | |
| "grad_norm": 2.18139386177063, | |
| "learning_rate": 5.710503089143866e-06, | |
| "loss": 0.296, | |
| "step": 25100 | |
| }, | |
| { | |
| "epoch": 4.439149236607537, | |
| "grad_norm": 2.703723907470703, | |
| "learning_rate": 5.622241835834069e-06, | |
| "loss": 0.2826, | |
| "step": 25150 | |
| }, | |
| { | |
| "epoch": 4.447974583002383, | |
| "grad_norm": 3.0339534282684326, | |
| "learning_rate": 5.533980582524273e-06, | |
| "loss": 0.2828, | |
| "step": 25200 | |
| }, | |
| { | |
| "epoch": 4.456799929397229, | |
| "grad_norm": 3.030467987060547, | |
| "learning_rate": 5.445719329214475e-06, | |
| "loss": 0.2899, | |
| "step": 25250 | |
| }, | |
| { | |
| "epoch": 4.465625275792075, | |
| "grad_norm": 2.0632028579711914, | |
| "learning_rate": 5.357458075904678e-06, | |
| "loss": 0.3062, | |
| "step": 25300 | |
| }, | |
| { | |
| "epoch": 4.474450622186921, | |
| "grad_norm": 2.691420793533325, | |
| "learning_rate": 5.2691968225948806e-06, | |
| "loss": 0.2933, | |
| "step": 25350 | |
| }, | |
| { | |
| "epoch": 4.483275968581767, | |
| "grad_norm": 2.518090009689331, | |
| "learning_rate": 5.180935569285085e-06, | |
| "loss": 0.3056, | |
| "step": 25400 | |
| }, | |
| { | |
| "epoch": 4.492101314976613, | |
| "grad_norm": 3.4329051971435547, | |
| "learning_rate": 5.092674315975287e-06, | |
| "loss": 0.2922, | |
| "step": 25450 | |
| }, | |
| { | |
| "epoch": 4.500926661371459, | |
| "grad_norm": 1.8490076065063477, | |
| "learning_rate": 5.00441306266549e-06, | |
| "loss": 0.3004, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 4.509752007766305, | |
| "grad_norm": 2.656369209289551, | |
| "learning_rate": 4.916151809355693e-06, | |
| "loss": 0.2917, | |
| "step": 25550 | |
| }, | |
| { | |
| "epoch": 4.518577354161151, | |
| "grad_norm": 3.1583127975463867, | |
| "learning_rate": 4.8278905560458965e-06, | |
| "loss": 0.3037, | |
| "step": 25600 | |
| }, | |
| { | |
| "epoch": 4.527402700555997, | |
| "grad_norm": 2.609161138534546, | |
| "learning_rate": 4.739629302736099e-06, | |
| "loss": 0.3102, | |
| "step": 25650 | |
| }, | |
| { | |
| "epoch": 4.536228046950843, | |
| "grad_norm": 3.1805193424224854, | |
| "learning_rate": 4.651368049426302e-06, | |
| "loss": 0.2879, | |
| "step": 25700 | |
| }, | |
| { | |
| "epoch": 4.545053393345689, | |
| "grad_norm": 2.557835578918457, | |
| "learning_rate": 4.563106796116505e-06, | |
| "loss": 0.2958, | |
| "step": 25750 | |
| }, | |
| { | |
| "epoch": 4.553878739740535, | |
| "grad_norm": 2.510469675064087, | |
| "learning_rate": 4.474845542806708e-06, | |
| "loss": 0.2947, | |
| "step": 25800 | |
| }, | |
| { | |
| "epoch": 4.562704086135381, | |
| "grad_norm": 3.4820563793182373, | |
| "learning_rate": 4.386584289496911e-06, | |
| "loss": 0.2927, | |
| "step": 25850 | |
| }, | |
| { | |
| "epoch": 4.571529432530227, | |
| "grad_norm": 3.2949700355529785, | |
| "learning_rate": 4.298323036187114e-06, | |
| "loss": 0.2697, | |
| "step": 25900 | |
| }, | |
| { | |
| "epoch": 4.580354778925073, | |
| "grad_norm": 2.9834752082824707, | |
| "learning_rate": 4.210061782877317e-06, | |
| "loss": 0.3102, | |
| "step": 25950 | |
| }, | |
| { | |
| "epoch": 4.589180125319919, | |
| "grad_norm": 3.7537124156951904, | |
| "learning_rate": 4.12180052956752e-06, | |
| "loss": 0.2956, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 4.589180125319919, | |
| "eval_loss": 0.7058725953102112, | |
| "eval_runtime": 181.7357, | |
| "eval_samples_per_second": 155.869, | |
| "eval_steps_per_second": 15.589, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 4.598005471714765, | |
| "grad_norm": 2.9796440601348877, | |
| "learning_rate": 4.033539276257723e-06, | |
| "loss": 0.2979, | |
| "step": 26050 | |
| }, | |
| { | |
| "epoch": 4.606830818109611, | |
| "grad_norm": 3.2112295627593994, | |
| "learning_rate": 3.945278022947927e-06, | |
| "loss": 0.2921, | |
| "step": 26100 | |
| }, | |
| { | |
| "epoch": 4.615656164504457, | |
| "grad_norm": 2.114686965942383, | |
| "learning_rate": 3.857016769638129e-06, | |
| "loss": 0.2757, | |
| "step": 26150 | |
| }, | |
| { | |
| "epoch": 4.624481510899303, | |
| "grad_norm": 2.44917368888855, | |
| "learning_rate": 3.7687555163283322e-06, | |
| "loss": 0.2593, | |
| "step": 26200 | |
| }, | |
| { | |
| "epoch": 4.633306857294149, | |
| "grad_norm": 2.6650853157043457, | |
| "learning_rate": 3.680494263018535e-06, | |
| "loss": 0.3195, | |
| "step": 26250 | |
| }, | |
| { | |
| "epoch": 4.642132203688995, | |
| "grad_norm": 2.660585403442383, | |
| "learning_rate": 3.5922330097087378e-06, | |
| "loss": 0.3108, | |
| "step": 26300 | |
| }, | |
| { | |
| "epoch": 4.650957550083841, | |
| "grad_norm": 3.4605438709259033, | |
| "learning_rate": 3.503971756398941e-06, | |
| "loss": 0.2904, | |
| "step": 26350 | |
| }, | |
| { | |
| "epoch": 4.659782896478687, | |
| "grad_norm": 2.301084041595459, | |
| "learning_rate": 3.4157105030891437e-06, | |
| "loss": 0.3159, | |
| "step": 26400 | |
| }, | |
| { | |
| "epoch": 4.668608242873533, | |
| "grad_norm": 3.1047492027282715, | |
| "learning_rate": 3.327449249779347e-06, | |
| "loss": 0.3224, | |
| "step": 26450 | |
| }, | |
| { | |
| "epoch": 4.677433589268379, | |
| "grad_norm": 2.4581010341644287, | |
| "learning_rate": 3.2391879964695497e-06, | |
| "loss": 0.284, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 4.686258935663225, | |
| "grad_norm": 3.301814317703247, | |
| "learning_rate": 3.150926743159753e-06, | |
| "loss": 0.31, | |
| "step": 26550 | |
| }, | |
| { | |
| "epoch": 4.695084282058071, | |
| "grad_norm": 2.921429395675659, | |
| "learning_rate": 3.062665489849956e-06, | |
| "loss": 0.2928, | |
| "step": 26600 | |
| }, | |
| { | |
| "epoch": 4.703909628452917, | |
| "grad_norm": 2.5449726581573486, | |
| "learning_rate": 2.9744042365401592e-06, | |
| "loss": 0.298, | |
| "step": 26650 | |
| }, | |
| { | |
| "epoch": 4.712734974847763, | |
| "grad_norm": 3.2756364345550537, | |
| "learning_rate": 2.886142983230362e-06, | |
| "loss": 0.3066, | |
| "step": 26700 | |
| }, | |
| { | |
| "epoch": 4.721560321242609, | |
| "grad_norm": 2.0160672664642334, | |
| "learning_rate": 2.797881729920565e-06, | |
| "loss": 0.2864, | |
| "step": 26750 | |
| }, | |
| { | |
| "epoch": 4.730385667637455, | |
| "grad_norm": 3.0313892364501953, | |
| "learning_rate": 2.709620476610768e-06, | |
| "loss": 0.2866, | |
| "step": 26800 | |
| }, | |
| { | |
| "epoch": 4.739211014032301, | |
| "grad_norm": 3.7765920162200928, | |
| "learning_rate": 2.621359223300971e-06, | |
| "loss": 0.2816, | |
| "step": 26850 | |
| }, | |
| { | |
| "epoch": 4.748036360427147, | |
| "grad_norm": 2.8294875621795654, | |
| "learning_rate": 2.533097969991174e-06, | |
| "loss": 0.2752, | |
| "step": 26900 | |
| }, | |
| { | |
| "epoch": 4.756861706821993, | |
| "grad_norm": 2.9688727855682373, | |
| "learning_rate": 2.444836716681377e-06, | |
| "loss": 0.2961, | |
| "step": 26950 | |
| }, | |
| { | |
| "epoch": 4.765687053216839, | |
| "grad_norm": 4.593882083892822, | |
| "learning_rate": 2.3565754633715803e-06, | |
| "loss": 0.2898, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 4.765687053216839, | |
| "eval_loss": 0.705726683139801, | |
| "eval_runtime": 181.7545, | |
| "eval_samples_per_second": 155.853, | |
| "eval_steps_per_second": 15.587, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 4.774512399611685, | |
| "grad_norm": 3.7863380908966064, | |
| "learning_rate": 2.268314210061783e-06, | |
| "loss": 0.3242, | |
| "step": 27050 | |
| }, | |
| { | |
| "epoch": 4.783337746006531, | |
| "grad_norm": 2.438889980316162, | |
| "learning_rate": 2.1800529567519862e-06, | |
| "loss": 0.2942, | |
| "step": 27100 | |
| }, | |
| { | |
| "epoch": 4.792163092401377, | |
| "grad_norm": 2.7048282623291016, | |
| "learning_rate": 2.091791703442189e-06, | |
| "loss": 0.287, | |
| "step": 27150 | |
| }, | |
| { | |
| "epoch": 4.800988438796223, | |
| "grad_norm": 3.619891881942749, | |
| "learning_rate": 2.003530450132392e-06, | |
| "loss": 0.2992, | |
| "step": 27200 | |
| }, | |
| { | |
| "epoch": 4.809813785191069, | |
| "grad_norm": 1.8524075746536255, | |
| "learning_rate": 1.915269196822595e-06, | |
| "loss": 0.2748, | |
| "step": 27250 | |
| }, | |
| { | |
| "epoch": 4.818639131585915, | |
| "grad_norm": 3.449615955352783, | |
| "learning_rate": 1.8270079435127981e-06, | |
| "loss": 0.3181, | |
| "step": 27300 | |
| }, | |
| { | |
| "epoch": 4.827464477980761, | |
| "grad_norm": 3.258453130722046, | |
| "learning_rate": 1.7387466902030011e-06, | |
| "loss": 0.2837, | |
| "step": 27350 | |
| }, | |
| { | |
| "epoch": 4.836289824375607, | |
| "grad_norm": 3.91930890083313, | |
| "learning_rate": 1.650485436893204e-06, | |
| "loss": 0.2842, | |
| "step": 27400 | |
| }, | |
| { | |
| "epoch": 4.845115170770453, | |
| "grad_norm": 3.170217275619507, | |
| "learning_rate": 1.5622241835834069e-06, | |
| "loss": 0.2763, | |
| "step": 27450 | |
| }, | |
| { | |
| "epoch": 4.853940517165299, | |
| "grad_norm": 3.373209238052368, | |
| "learning_rate": 1.47396293027361e-06, | |
| "loss": 0.2703, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 4.862765863560145, | |
| "grad_norm": 3.7620341777801514, | |
| "learning_rate": 1.385701676963813e-06, | |
| "loss": 0.2945, | |
| "step": 27550 | |
| }, | |
| { | |
| "epoch": 4.871591209954991, | |
| "grad_norm": 3.2243876457214355, | |
| "learning_rate": 1.297440423654016e-06, | |
| "loss": 0.2975, | |
| "step": 27600 | |
| }, | |
| { | |
| "epoch": 4.880416556349837, | |
| "grad_norm": 3.1117992401123047, | |
| "learning_rate": 1.209179170344219e-06, | |
| "loss": 0.2922, | |
| "step": 27650 | |
| }, | |
| { | |
| "epoch": 4.889241902744683, | |
| "grad_norm": 3.212538957595825, | |
| "learning_rate": 1.120917917034422e-06, | |
| "loss": 0.2898, | |
| "step": 27700 | |
| }, | |
| { | |
| "epoch": 4.898067249139529, | |
| "grad_norm": 2.3424935340881348, | |
| "learning_rate": 1.032656663724625e-06, | |
| "loss": 0.2852, | |
| "step": 27750 | |
| }, | |
| { | |
| "epoch": 4.906892595534375, | |
| "grad_norm": 2.8400442600250244, | |
| "learning_rate": 9.44395410414828e-07, | |
| "loss": 0.2948, | |
| "step": 27800 | |
| }, | |
| { | |
| "epoch": 4.915717941929221, | |
| "grad_norm": 3.1694393157958984, | |
| "learning_rate": 8.56134157105031e-07, | |
| "loss": 0.2583, | |
| "step": 27850 | |
| }, | |
| { | |
| "epoch": 4.924543288324067, | |
| "grad_norm": 2.80452299118042, | |
| "learning_rate": 7.678729037952339e-07, | |
| "loss": 0.2816, | |
| "step": 27900 | |
| }, | |
| { | |
| "epoch": 4.933368634718913, | |
| "grad_norm": 2.7664706707000732, | |
| "learning_rate": 6.813768755516329e-07, | |
| "loss": 0.2806, | |
| "step": 27950 | |
| }, | |
| { | |
| "epoch": 4.942193981113759, | |
| "grad_norm": 2.9960453510284424, | |
| "learning_rate": 5.931156222418359e-07, | |
| "loss": 0.2858, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 4.942193981113759, | |
| "eval_loss": 0.7043320536613464, | |
| "eval_runtime": 181.7383, | |
| "eval_samples_per_second": 155.867, | |
| "eval_steps_per_second": 15.588, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 4.951019327508605, | |
| "grad_norm": 3.455805540084839, | |
| "learning_rate": 5.048543689320388e-07, | |
| "loss": 0.3116, | |
| "step": 28050 | |
| }, | |
| { | |
| "epoch": 4.959844673903451, | |
| "grad_norm": 3.7969307899475098, | |
| "learning_rate": 4.165931156222418e-07, | |
| "loss": 0.2845, | |
| "step": 28100 | |
| }, | |
| { | |
| "epoch": 4.968670020298297, | |
| "grad_norm": 3.1473402976989746, | |
| "learning_rate": 3.2833186231244484e-07, | |
| "loss": 0.2933, | |
| "step": 28150 | |
| }, | |
| { | |
| "epoch": 4.977495366693143, | |
| "grad_norm": 3.197662115097046, | |
| "learning_rate": 2.4007060900264787e-07, | |
| "loss": 0.2918, | |
| "step": 28200 | |
| }, | |
| { | |
| "epoch": 4.986320713087989, | |
| "grad_norm": 2.3752260208129883, | |
| "learning_rate": 1.5180935569285085e-07, | |
| "loss": 0.2969, | |
| "step": 28250 | |
| }, | |
| { | |
| "epoch": 4.995146059482835, | |
| "grad_norm": 3.3417346477508545, | |
| "learning_rate": 6.354810238305384e-08, | |
| "loss": 0.2904, | |
| "step": 28300 | |
| } | |
| ], | |
| "logging_steps": 50, | |
| "max_steps": 28325, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 5, | |
| "save_steps": 1000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.3071621043067904e+18, | |
| "train_batch_size": 10, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |