| { | |
| "best_global_step": 650, | |
| "best_metric": 0.3949255049228668, | |
| "best_model_checkpoint": "runs/cpt_run_v1/checkpoints/checkpoint-600", | |
| "epoch": 2.0, | |
| "eval_steps": 50, | |
| "global_step": 686, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0029170464904284413, | |
| "grad_norm": 1.1577509641647339, | |
| "learning_rate": 0.0, | |
| "loss": 0.9893555045127869, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.005834092980856883, | |
| "grad_norm": 0.9491796493530273, | |
| "learning_rate": 2.8985507246376816e-07, | |
| "loss": 0.8791205883026123, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.008751139471285323, | |
| "grad_norm": 1.1600768566131592, | |
| "learning_rate": 5.797101449275363e-07, | |
| "loss": 0.9858248233795166, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.011668185961713765, | |
| "grad_norm": 1.2298306226730347, | |
| "learning_rate": 8.695652173913044e-07, | |
| "loss": 1.0516364574432373, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.014585232452142206, | |
| "grad_norm": 0.9520533680915833, | |
| "learning_rate": 1.1594202898550726e-06, | |
| "loss": 0.8392249345779419, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.017502278942570646, | |
| "grad_norm": 1.2451188564300537, | |
| "learning_rate": 1.4492753623188408e-06, | |
| "loss": 1.0955077409744263, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.02041932543299909, | |
| "grad_norm": 1.1123991012573242, | |
| "learning_rate": 1.7391304347826088e-06, | |
| "loss": 0.9201866388320923, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.02333637192342753, | |
| "grad_norm": 0.9283139705657959, | |
| "learning_rate": 2.028985507246377e-06, | |
| "loss": 0.9770950078964233, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.02625341841385597, | |
| "grad_norm": 0.9589216113090515, | |
| "learning_rate": 2.3188405797101453e-06, | |
| "loss": 0.9442565441131592, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.02917046490428441, | |
| "grad_norm": 0.8866703510284424, | |
| "learning_rate": 2.6086956521739132e-06, | |
| "loss": 0.9354464411735535, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.03208751139471285, | |
| "grad_norm": 0.7191241383552551, | |
| "learning_rate": 2.8985507246376816e-06, | |
| "loss": 0.7659736275672913, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.03500455788514129, | |
| "grad_norm": 0.9110142588615417, | |
| "learning_rate": 3.188405797101449e-06, | |
| "loss": 0.9319326877593994, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.03792160437556973, | |
| "grad_norm": 0.8754057288169861, | |
| "learning_rate": 3.4782608695652175e-06, | |
| "loss": 0.9819356203079224, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.04083865086599818, | |
| "grad_norm": 0.896181046962738, | |
| "learning_rate": 3.768115942028986e-06, | |
| "loss": 1.026316523551941, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.04375569735642662, | |
| "grad_norm": 0.6104832887649536, | |
| "learning_rate": 4.057971014492754e-06, | |
| "loss": 0.8427562713623047, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.04667274384685506, | |
| "grad_norm": 0.6529208421707153, | |
| "learning_rate": 4.347826086956522e-06, | |
| "loss": 0.8496565222740173, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.0495897903372835, | |
| "grad_norm": 0.6319335699081421, | |
| "learning_rate": 4.637681159420291e-06, | |
| "loss": 0.9139047861099243, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.05250683682771194, | |
| "grad_norm": 0.7458649277687073, | |
| "learning_rate": 4.927536231884059e-06, | |
| "loss": 0.8867442011833191, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.05542388331814038, | |
| "grad_norm": 0.6179773211479187, | |
| "learning_rate": 5.2173913043478265e-06, | |
| "loss": 0.9579408168792725, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.05834092980856882, | |
| "grad_norm": 0.794481635093689, | |
| "learning_rate": 5.507246376811595e-06, | |
| "loss": 0.8736554980278015, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.06125797629899726, | |
| "grad_norm": 0.8356145620346069, | |
| "learning_rate": 5.797101449275363e-06, | |
| "loss": 0.9358762502670288, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.0641750227894257, | |
| "grad_norm": 0.5891932845115662, | |
| "learning_rate": 6.086956521739132e-06, | |
| "loss": 0.8972038626670837, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.06709206927985414, | |
| "grad_norm": 0.6931268572807312, | |
| "learning_rate": 6.376811594202898e-06, | |
| "loss": 0.9583507776260376, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.07000911577028258, | |
| "grad_norm": 0.7298229336738586, | |
| "learning_rate": 6.666666666666667e-06, | |
| "loss": 0.8119489550590515, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.07292616226071102, | |
| "grad_norm": 0.6419956684112549, | |
| "learning_rate": 6.956521739130435e-06, | |
| "loss": 0.9386100769042969, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.07584320875113947, | |
| "grad_norm": 0.7508338689804077, | |
| "learning_rate": 7.246376811594203e-06, | |
| "loss": 0.9272583723068237, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.0787602552415679, | |
| "grad_norm": 0.5848079919815063, | |
| "learning_rate": 7.536231884057972e-06, | |
| "loss": 0.8967856168746948, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.08167730173199636, | |
| "grad_norm": 0.7384837865829468, | |
| "learning_rate": 7.82608695652174e-06, | |
| "loss": 0.8696568012237549, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.0845943482224248, | |
| "grad_norm": 0.5069604516029358, | |
| "learning_rate": 8.115942028985508e-06, | |
| "loss": 0.9121193885803223, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.08751139471285324, | |
| "grad_norm": 0.833165168762207, | |
| "learning_rate": 8.405797101449275e-06, | |
| "loss": 0.8180589079856873, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.09042844120328168, | |
| "grad_norm": 0.6355920433998108, | |
| "learning_rate": 8.695652173913044e-06, | |
| "loss": 0.8640957474708557, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.09334548769371012, | |
| "grad_norm": 1.0429315567016602, | |
| "learning_rate": 8.985507246376812e-06, | |
| "loss": 0.9517915844917297, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.09626253418413856, | |
| "grad_norm": 0.5875154733657837, | |
| "learning_rate": 9.275362318840581e-06, | |
| "loss": 0.9443603754043579, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.099179580674567, | |
| "grad_norm": 1.9913769960403442, | |
| "learning_rate": 9.565217391304349e-06, | |
| "loss": 0.9510866403579712, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.10209662716499544, | |
| "grad_norm": 0.5310097932815552, | |
| "learning_rate": 9.855072463768118e-06, | |
| "loss": 0.8653419613838196, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.10501367365542388, | |
| "grad_norm": 0.624421238899231, | |
| "learning_rate": 1.0144927536231885e-05, | |
| "loss": 0.7941208481788635, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.10793072014585232, | |
| "grad_norm": 0.6314200758934021, | |
| "learning_rate": 1.0434782608695653e-05, | |
| "loss": 0.8931174278259277, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.11084776663628076, | |
| "grad_norm": 0.6272342205047607, | |
| "learning_rate": 1.0724637681159422e-05, | |
| "loss": 0.8978185057640076, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.1137648131267092, | |
| "grad_norm": 0.5711184740066528, | |
| "learning_rate": 1.101449275362319e-05, | |
| "loss": 0.808263897895813, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.11668185961713765, | |
| "grad_norm": 0.7581208944320679, | |
| "learning_rate": 1.1304347826086957e-05, | |
| "loss": 0.7456756830215454, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.11959890610756609, | |
| "grad_norm": 0.4989977180957794, | |
| "learning_rate": 1.1594202898550726e-05, | |
| "loss": 0.8273333311080933, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.12251595259799453, | |
| "grad_norm": 0.8602972626686096, | |
| "learning_rate": 1.1884057971014494e-05, | |
| "loss": 0.8514784574508667, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.12543299908842298, | |
| "grad_norm": 0.6918581128120422, | |
| "learning_rate": 1.2173913043478263e-05, | |
| "loss": 0.8182265162467957, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.1283500455788514, | |
| "grad_norm": 0.653099536895752, | |
| "learning_rate": 1.2463768115942029e-05, | |
| "loss": 0.8242791891098022, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.13126709206927986, | |
| "grad_norm": 0.7485584616661072, | |
| "learning_rate": 1.2753623188405797e-05, | |
| "loss": 0.8229591250419617, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.1341841385597083, | |
| "grad_norm": 0.6724833250045776, | |
| "learning_rate": 1.3043478260869566e-05, | |
| "loss": 0.8146833181381226, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.13710118505013674, | |
| "grad_norm": 0.857208251953125, | |
| "learning_rate": 1.3333333333333333e-05, | |
| "loss": 0.8154427409172058, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.14001823154056517, | |
| "grad_norm": 0.5559669137001038, | |
| "learning_rate": 1.3623188405797103e-05, | |
| "loss": 0.879005491733551, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.14293527803099362, | |
| "grad_norm": 0.5910897850990295, | |
| "learning_rate": 1.391304347826087e-05, | |
| "loss": 0.8148283362388611, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.14585232452142205, | |
| "grad_norm": 0.6478891372680664, | |
| "learning_rate": 1.420289855072464e-05, | |
| "loss": 0.8293006420135498, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.14585232452142205, | |
| "eval_loss": 0.7892261147499084, | |
| "eval_runtime": 973.2157, | |
| "eval_samples_per_second": 0.649, | |
| "eval_steps_per_second": 0.649, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.1487693710118505, | |
| "grad_norm": 0.757882833480835, | |
| "learning_rate": 1.4492753623188407e-05, | |
| "loss": 0.8114852905273438, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.15168641750227893, | |
| "grad_norm": 0.8496116995811462, | |
| "learning_rate": 1.4782608695652174e-05, | |
| "loss": 0.7886185050010681, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.15460346399270739, | |
| "grad_norm": 0.6078857183456421, | |
| "learning_rate": 1.5072463768115944e-05, | |
| "loss": 0.7298170924186707, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.1575205104831358, | |
| "grad_norm": 0.5856835246086121, | |
| "learning_rate": 1.536231884057971e-05, | |
| "loss": 0.7407160997390747, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.16043755697356427, | |
| "grad_norm": 1.0533701181411743, | |
| "learning_rate": 1.565217391304348e-05, | |
| "loss": 0.7057831287384033, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.16335460346399272, | |
| "grad_norm": 0.8087610006332397, | |
| "learning_rate": 1.5942028985507246e-05, | |
| "loss": 0.7409019470214844, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.16627164995442115, | |
| "grad_norm": 0.629945695400238, | |
| "learning_rate": 1.6231884057971015e-05, | |
| "loss": 0.7768293023109436, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.1691886964448496, | |
| "grad_norm": 0.5187911987304688, | |
| "learning_rate": 1.6521739130434785e-05, | |
| "loss": 0.825718104839325, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.17210574293527803, | |
| "grad_norm": 0.5866358280181885, | |
| "learning_rate": 1.681159420289855e-05, | |
| "loss": 0.8575979471206665, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.17502278942570648, | |
| "grad_norm": 1.5098934173583984, | |
| "learning_rate": 1.710144927536232e-05, | |
| "loss": 0.8058848977088928, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.1779398359161349, | |
| "grad_norm": 0.6981958150863647, | |
| "learning_rate": 1.739130434782609e-05, | |
| "loss": 0.7640778422355652, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.18085688240656336, | |
| "grad_norm": 0.631349503993988, | |
| "learning_rate": 1.7681159420289858e-05, | |
| "loss": 0.7896331548690796, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.1837739288969918, | |
| "grad_norm": 0.6930747032165527, | |
| "learning_rate": 1.7971014492753624e-05, | |
| "loss": 0.6762524247169495, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.18669097538742024, | |
| "grad_norm": 0.599399209022522, | |
| "learning_rate": 1.8260869565217393e-05, | |
| "loss": 0.7285035848617554, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.18960802187784867, | |
| "grad_norm": 0.6194344758987427, | |
| "learning_rate": 1.8550724637681162e-05, | |
| "loss": 0.7682523131370544, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.19252506836827712, | |
| "grad_norm": 0.5691342949867249, | |
| "learning_rate": 1.8840579710144928e-05, | |
| "loss": 0.6791993379592896, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.19544211485870555, | |
| "grad_norm": 0.6257390379905701, | |
| "learning_rate": 1.9130434782608697e-05, | |
| "loss": 0.6744828224182129, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.198359161349134, | |
| "grad_norm": 0.5871018767356873, | |
| "learning_rate": 1.9420289855072467e-05, | |
| "loss": 0.7317330837249756, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.20127620783956243, | |
| "grad_norm": 1.0744612216949463, | |
| "learning_rate": 1.9710144927536236e-05, | |
| "loss": 0.6617178916931152, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.2041932543299909, | |
| "grad_norm": 0.675946831703186, | |
| "learning_rate": 2e-05, | |
| "loss": 0.7615712881088257, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.2071103008204193, | |
| "grad_norm": 0.7663411498069763, | |
| "learning_rate": 1.9999870372100614e-05, | |
| "loss": 0.7131291627883911, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.21002734731084777, | |
| "grad_norm": 0.6725395321846008, | |
| "learning_rate": 1.9999481491763123e-05, | |
| "loss": 0.7452989816665649, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.21294439380127622, | |
| "grad_norm": 0.6505664587020874, | |
| "learning_rate": 1.9998833369069483e-05, | |
| "loss": 0.7477136850357056, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.21586144029170465, | |
| "grad_norm": 0.7032860517501831, | |
| "learning_rate": 1.9997926020822643e-05, | |
| "loss": 0.6854275465011597, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.2187784867821331, | |
| "grad_norm": 0.645345151424408, | |
| "learning_rate": 1.999675947054614e-05, | |
| "loss": 0.7552425265312195, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.22169553327256153, | |
| "grad_norm": 0.6620492935180664, | |
| "learning_rate": 1.9995333748483464e-05, | |
| "loss": 0.7262853384017944, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.22461257976298998, | |
| "grad_norm": 0.6511455774307251, | |
| "learning_rate": 1.9993648891597284e-05, | |
| "loss": 0.7591732144355774, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.2275296262534184, | |
| "grad_norm": 0.6775254011154175, | |
| "learning_rate": 1.9991704943568497e-05, | |
| "loss": 0.7498704195022583, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.23044667274384686, | |
| "grad_norm": 0.8199896216392517, | |
| "learning_rate": 1.9989501954795076e-05, | |
| "loss": 0.7238684296607971, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.2333637192342753, | |
| "grad_norm": 0.8197569847106934, | |
| "learning_rate": 1.998703998239079e-05, | |
| "loss": 0.7028778195381165, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.23628076572470375, | |
| "grad_norm": 0.6602625250816345, | |
| "learning_rate": 1.9984319090183692e-05, | |
| "loss": 0.8842703104019165, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.23919781221513217, | |
| "grad_norm": 0.9587129354476929, | |
| "learning_rate": 1.99813393487145e-05, | |
| "loss": 0.732614278793335, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.24211485870556063, | |
| "grad_norm": 0.6822189092636108, | |
| "learning_rate": 1.997810083523473e-05, | |
| "loss": 0.7544928193092346, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.24503190519598905, | |
| "grad_norm": 0.8980082869529724, | |
| "learning_rate": 1.9974603633704726e-05, | |
| "loss": 0.6704054474830627, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.2479489516864175, | |
| "grad_norm": 0.7413425445556641, | |
| "learning_rate": 1.9970847834791472e-05, | |
| "loss": 0.693661093711853, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.25086599817684596, | |
| "grad_norm": 0.8314999341964722, | |
| "learning_rate": 1.9966833535866223e-05, | |
| "loss": 0.667654275894165, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.25378304466727436, | |
| "grad_norm": 0.7972444891929626, | |
| "learning_rate": 1.9962560841002013e-05, | |
| "loss": 0.8403134942054749, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.2567000911577028, | |
| "grad_norm": 0.8519951701164246, | |
| "learning_rate": 1.995802986097093e-05, | |
| "loss": 0.6897370219230652, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.25961713764813127, | |
| "grad_norm": 0.8268933892250061, | |
| "learning_rate": 1.995324071324126e-05, | |
| "loss": 0.6690632700920105, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.2625341841385597, | |
| "grad_norm": 0.7133983969688416, | |
| "learning_rate": 1.9948193521974436e-05, | |
| "loss": 0.6314147114753723, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.2654512306289881, | |
| "grad_norm": 0.889302134513855, | |
| "learning_rate": 1.9942888418021814e-05, | |
| "loss": 0.7389825582504272, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.2683682771194166, | |
| "grad_norm": 0.7022432088851929, | |
| "learning_rate": 1.99373255389213e-05, | |
| "loss": 0.6916261911392212, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.27128532360984503, | |
| "grad_norm": 0.696432888507843, | |
| "learning_rate": 1.9931505028893748e-05, | |
| "loss": 0.6908476948738098, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.2742023701002735, | |
| "grad_norm": 0.7667419910430908, | |
| "learning_rate": 1.9925427038839267e-05, | |
| "loss": 0.6500837206840515, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.27711941659070194, | |
| "grad_norm": 0.6974894404411316, | |
| "learning_rate": 1.9919091726333265e-05, | |
| "loss": 0.7059191465377808, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.28003646308113034, | |
| "grad_norm": 0.7047077417373657, | |
| "learning_rate": 1.9912499255622397e-05, | |
| "loss": 0.6287837624549866, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.2829535095715588, | |
| "grad_norm": 0.7729557156562805, | |
| "learning_rate": 1.990564979762029e-05, | |
| "loss": 0.6738612055778503, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.28587055606198725, | |
| "grad_norm": 0.7020529508590698, | |
| "learning_rate": 1.989854352990311e-05, | |
| "loss": 0.662042498588562, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.2887876025524157, | |
| "grad_norm": 0.7369800209999084, | |
| "learning_rate": 1.9891180636704975e-05, | |
| "loss": 0.6246830821037292, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.2917046490428441, | |
| "grad_norm": 0.7412623167037964, | |
| "learning_rate": 1.9883561308913154e-05, | |
| "loss": 0.6623879075050354, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.2917046490428441, | |
| "eval_loss": 0.6552971005439758, | |
| "eval_runtime": 966.7072, | |
| "eval_samples_per_second": 0.654, | |
| "eval_steps_per_second": 0.654, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.29462169553327255, | |
| "grad_norm": 0.8428792953491211, | |
| "learning_rate": 1.987568574406314e-05, | |
| "loss": 0.6312171816825867, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 0.297538742023701, | |
| "grad_norm": 0.6948133707046509, | |
| "learning_rate": 1.9867554146333517e-05, | |
| "loss": 0.6266146898269653, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.30045578851412946, | |
| "grad_norm": 1.3897597789764404, | |
| "learning_rate": 1.985916672654068e-05, | |
| "loss": 0.6669265031814575, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 0.30337283500455786, | |
| "grad_norm": 0.8838400840759277, | |
| "learning_rate": 1.985052370213334e-05, | |
| "loss": 0.6601086854934692, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 0.3062898814949863, | |
| "grad_norm": 0.8471395373344421, | |
| "learning_rate": 1.9841625297186925e-05, | |
| "loss": 0.5984431505203247, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.30920692798541477, | |
| "grad_norm": 0.8940042853355408, | |
| "learning_rate": 1.983247174239774e-05, | |
| "loss": 0.7223822474479675, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 0.3121239744758432, | |
| "grad_norm": 0.7833696603775024, | |
| "learning_rate": 1.9823063275076998e-05, | |
| "loss": 0.6868705749511719, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 0.3150410209662716, | |
| "grad_norm": 0.8794649243354797, | |
| "learning_rate": 1.9813400139144673e-05, | |
| "loss": 0.6246675848960876, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.3179580674567001, | |
| "grad_norm": 0.8126057982444763, | |
| "learning_rate": 1.9803482585123165e-05, | |
| "loss": 0.5908697247505188, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 0.32087511394712853, | |
| "grad_norm": 0.7947676777839661, | |
| "learning_rate": 1.979331087013082e-05, | |
| "loss": 0.5751246809959412, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.323792160437557, | |
| "grad_norm": 0.713545560836792, | |
| "learning_rate": 1.978288525787524e-05, | |
| "loss": 0.6081106066703796, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 0.32670920692798544, | |
| "grad_norm": 1.011828064918518, | |
| "learning_rate": 1.977220601864647e-05, | |
| "loss": 0.7039169669151306, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.32962625341841384, | |
| "grad_norm": 0.730570912361145, | |
| "learning_rate": 1.9761273429309982e-05, | |
| "loss": 0.6140255928039551, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 0.3325432999088423, | |
| "grad_norm": 1.059688687324524, | |
| "learning_rate": 1.9750087773299492e-05, | |
| "loss": 0.648114025592804, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 0.33546034639927075, | |
| "grad_norm": 0.9336895942687988, | |
| "learning_rate": 1.973864934060962e-05, | |
| "loss": 0.622555673122406, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.3383773928896992, | |
| "grad_norm": 0.7195945978164673, | |
| "learning_rate": 1.9726958427788367e-05, | |
| "loss": 0.70485520362854, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 0.3412944393801276, | |
| "grad_norm": 0.8101872801780701, | |
| "learning_rate": 1.971501533792942e-05, | |
| "loss": 0.6958848834037781, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 0.34421148587055606, | |
| "grad_norm": 1.6075212955474854, | |
| "learning_rate": 1.970282038066432e-05, | |
| "loss": 0.6021550893783569, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 0.3471285323609845, | |
| "grad_norm": 0.7881433963775635, | |
| "learning_rate": 1.9690373872154396e-05, | |
| "loss": 0.6449777483940125, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 0.35004557885141296, | |
| "grad_norm": 1.014639973640442, | |
| "learning_rate": 1.9677676135082606e-05, | |
| "loss": 0.5939379930496216, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.35296262534184136, | |
| "grad_norm": 0.8198449611663818, | |
| "learning_rate": 1.9664727498645144e-05, | |
| "loss": 0.6210286617279053, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 0.3558796718322698, | |
| "grad_norm": 1.0194576978683472, | |
| "learning_rate": 1.9651528298542918e-05, | |
| "loss": 0.624247670173645, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 0.35879671832269827, | |
| "grad_norm": 0.7963470220565796, | |
| "learning_rate": 1.9638078876972842e-05, | |
| "loss": 0.6479315757751465, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 0.3617137648131267, | |
| "grad_norm": 0.9007541537284851, | |
| "learning_rate": 1.9624379582618976e-05, | |
| "loss": 0.6131505370140076, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 0.3646308113035551, | |
| "grad_norm": 0.8712120056152344, | |
| "learning_rate": 1.9610430770643464e-05, | |
| "loss": 0.6249448657035828, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.3675478577939836, | |
| "grad_norm": 1.1482540369033813, | |
| "learning_rate": 1.9596232802677347e-05, | |
| "loss": 0.5844688415527344, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 0.37046490428441203, | |
| "grad_norm": 0.8662379384040833, | |
| "learning_rate": 1.9581786046811175e-05, | |
| "loss": 0.6573485732078552, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 0.3733819507748405, | |
| "grad_norm": 0.8191388845443726, | |
| "learning_rate": 1.9567090877585477e-05, | |
| "loss": 0.5896862745285034, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 0.37629899726526894, | |
| "grad_norm": 1.0187078714370728, | |
| "learning_rate": 1.955214767598103e-05, | |
| "loss": 0.613490879535675, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 0.37921604375569734, | |
| "grad_norm": 0.8444119691848755, | |
| "learning_rate": 1.953695682940901e-05, | |
| "loss": 0.727687656879425, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.3821330902461258, | |
| "grad_norm": 0.74753737449646, | |
| "learning_rate": 1.9521518731700913e-05, | |
| "loss": 0.6102436780929565, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 0.38505013673655425, | |
| "grad_norm": 1.0166202783584595, | |
| "learning_rate": 1.9505833783098378e-05, | |
| "loss": 0.6244844198226929, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 0.3879671832269827, | |
| "grad_norm": 0.8175772428512573, | |
| "learning_rate": 1.9489902390242793e-05, | |
| "loss": 0.5939282178878784, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 0.3908842297174111, | |
| "grad_norm": 1.0177713632583618, | |
| "learning_rate": 1.947372496616476e-05, | |
| "loss": 0.6418229937553406, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 0.39380127620783956, | |
| "grad_norm": 0.8652453422546387, | |
| "learning_rate": 1.9457301930273376e-05, | |
| "loss": 0.5870395302772522, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.396718322698268, | |
| "grad_norm": 0.8378894925117493, | |
| "learning_rate": 1.9440633708345365e-05, | |
| "loss": 0.6480278372764587, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 0.39963536918869647, | |
| "grad_norm": 0.8303541541099548, | |
| "learning_rate": 1.9423720732514052e-05, | |
| "loss": 0.6191359758377075, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 0.40255241567912486, | |
| "grad_norm": 0.8576734662055969, | |
| "learning_rate": 1.9406563441258145e-05, | |
| "loss": 0.5696198344230652, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 0.4054694621695533, | |
| "grad_norm": 0.9558727145195007, | |
| "learning_rate": 1.9389162279390362e-05, | |
| "loss": 0.6177623271942139, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 0.4083865086599818, | |
| "grad_norm": 0.7046042084693909, | |
| "learning_rate": 1.9371517698045922e-05, | |
| "loss": 0.5836521983146667, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.4113035551504102, | |
| "grad_norm": 1.0522717237472534, | |
| "learning_rate": 1.935363015467082e-05, | |
| "loss": 0.5728275775909424, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 0.4142206016408386, | |
| "grad_norm": 0.9554787874221802, | |
| "learning_rate": 1.933550011301e-05, | |
| "loss": 0.632586658000946, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 0.4171376481312671, | |
| "grad_norm": 0.8874214291572571, | |
| "learning_rate": 1.9317128043095293e-05, | |
| "loss": 0.5850118398666382, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 0.42005469462169553, | |
| "grad_norm": 1.0708963871002197, | |
| "learning_rate": 1.9298514421233276e-05, | |
| "loss": 0.6260685324668884, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 0.422971741112124, | |
| "grad_norm": 0.8135736584663391, | |
| "learning_rate": 1.9279659729992888e-05, | |
| "loss": 0.6031094193458557, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.42588878760255244, | |
| "grad_norm": 0.7971774339675903, | |
| "learning_rate": 1.9260564458192926e-05, | |
| "loss": 0.6101322770118713, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 0.42880583409298084, | |
| "grad_norm": 0.9374974966049194, | |
| "learning_rate": 1.9241229100889397e-05, | |
| "loss": 0.5836313366889954, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 0.4317228805834093, | |
| "grad_norm": 0.8043425679206848, | |
| "learning_rate": 1.9221654159362636e-05, | |
| "loss": 0.6181215047836304, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 0.43463992707383775, | |
| "grad_norm": 0.8923380374908447, | |
| "learning_rate": 1.920184014110436e-05, | |
| "loss": 0.6149677634239197, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 0.4375569735642662, | |
| "grad_norm": 0.8908132314682007, | |
| "learning_rate": 1.918178755980449e-05, | |
| "loss": 0.5899742841720581, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.4375569735642662, | |
| "eval_loss": 0.5903874635696411, | |
| "eval_runtime": 1186.9542, | |
| "eval_samples_per_second": 0.532, | |
| "eval_steps_per_second": 0.532, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.4404740200546946, | |
| "grad_norm": 1.060531497001648, | |
| "learning_rate": 1.9161496935337808e-05, | |
| "loss": 0.5852696895599365, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 0.44339106654512306, | |
| "grad_norm": 0.9723032712936401, | |
| "learning_rate": 1.914096879375053e-05, | |
| "loss": 0.5822056531906128, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 0.4463081130355515, | |
| "grad_norm": 0.9519931674003601, | |
| "learning_rate": 1.912020366724663e-05, | |
| "loss": 0.6183493137359619, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 0.44922515952597997, | |
| "grad_norm": 0.8282918334007263, | |
| "learning_rate": 1.9099202094174055e-05, | |
| "loss": 0.6229860782623291, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 0.45214220601640837, | |
| "grad_norm": 0.9251292943954468, | |
| "learning_rate": 1.907796461901076e-05, | |
| "loss": 0.6552959680557251, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.4550592525068368, | |
| "grad_norm": 1.0349540710449219, | |
| "learning_rate": 1.9056491792350606e-05, | |
| "loss": 0.6170098781585693, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 0.4579762989972653, | |
| "grad_norm": 0.8720711469650269, | |
| "learning_rate": 1.9034784170889076e-05, | |
| "loss": 0.5870137810707092, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 0.46089334548769373, | |
| "grad_norm": 1.0785977840423584, | |
| "learning_rate": 1.9012842317408843e-05, | |
| "loss": 0.5515124201774597, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 0.4638103919781221, | |
| "grad_norm": 1.0634154081344604, | |
| "learning_rate": 1.8990666800765187e-05, | |
| "loss": 0.6073828339576721, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 0.4667274384685506, | |
| "grad_norm": 0.8770879507064819, | |
| "learning_rate": 1.896825819587123e-05, | |
| "loss": 0.5960907936096191, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.46964448495897904, | |
| "grad_norm": 1.1225898265838623, | |
| "learning_rate": 1.894561708368305e-05, | |
| "loss": 0.545990526676178, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 0.4725615314494075, | |
| "grad_norm": 0.9373893141746521, | |
| "learning_rate": 1.8922744051184613e-05, | |
| "loss": 0.5566108822822571, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 0.4754785779398359, | |
| "grad_norm": 1.5016087293624878, | |
| "learning_rate": 1.8899639691372545e-05, | |
| "loss": 0.558845043182373, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 0.47839562443026434, | |
| "grad_norm": 0.903020977973938, | |
| "learning_rate": 1.8876304603240773e-05, | |
| "loss": 0.6824233531951904, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 0.4813126709206928, | |
| "grad_norm": 0.8239623308181763, | |
| "learning_rate": 1.8852739391764993e-05, | |
| "loss": 0.5630610585212708, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.48422971741112125, | |
| "grad_norm": 0.926069438457489, | |
| "learning_rate": 1.882894466788697e-05, | |
| "loss": 0.6211802363395691, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 0.4871467639015497, | |
| "grad_norm": 1.0098828077316284, | |
| "learning_rate": 1.8804921048498722e-05, | |
| "loss": 0.5513257384300232, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 0.4900638103919781, | |
| "grad_norm": 0.9228141903877258, | |
| "learning_rate": 1.8780669156426517e-05, | |
| "loss": 0.6197121739387512, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 0.49298085688240656, | |
| "grad_norm": 1.0551754236221313, | |
| "learning_rate": 1.8756189620414712e-05, | |
| "loss": 0.5221806764602661, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 0.495897903372835, | |
| "grad_norm": 0.9017496109008789, | |
| "learning_rate": 1.873148307510948e-05, | |
| "loss": 0.5766995549201965, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.49881494986326347, | |
| "grad_norm": 0.9704970717430115, | |
| "learning_rate": 1.870655016104233e-05, | |
| "loss": 0.6514763832092285, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 0.5017319963536919, | |
| "grad_norm": 0.9972712397575378, | |
| "learning_rate": 1.8681391524613518e-05, | |
| "loss": 0.5273895263671875, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 0.5046490428441204, | |
| "grad_norm": 0.9473339319229126, | |
| "learning_rate": 1.8656007818075288e-05, | |
| "loss": 0.5548599362373352, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 0.5075660893345487, | |
| "grad_norm": 1.2493574619293213, | |
| "learning_rate": 1.8630399699514944e-05, | |
| "loss": 0.5593586564064026, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 0.5104831358249772, | |
| "grad_norm": 1.2766696214675903, | |
| "learning_rate": 1.860456783283781e-05, | |
| "loss": 0.6054630279541016, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.5134001823154056, | |
| "grad_norm": 0.9555240869522095, | |
| "learning_rate": 1.857851288775002e-05, | |
| "loss": 0.508592963218689, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 0.5163172288058341, | |
| "grad_norm": 1.260219931602478, | |
| "learning_rate": 1.8552235539741118e-05, | |
| "loss": 0.5532065629959106, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 0.5192342752962625, | |
| "grad_norm": 1.1859954595565796, | |
| "learning_rate": 1.8525736470066595e-05, | |
| "loss": 0.5683344006538391, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 0.522151321786691, | |
| "grad_norm": 1.3044344186782837, | |
| "learning_rate": 1.8499016365730203e-05, | |
| "loss": 0.5281959772109985, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 0.5250683682771194, | |
| "grad_norm": 1.3049921989440918, | |
| "learning_rate": 1.8472075919466137e-05, | |
| "loss": 0.49621230363845825, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.5279854147675479, | |
| "grad_norm": 1.0488537549972534, | |
| "learning_rate": 1.844491582972109e-05, | |
| "loss": 0.6194032430648804, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 0.5309024612579762, | |
| "grad_norm": 1.5553455352783203, | |
| "learning_rate": 1.8417536800636138e-05, | |
| "loss": 0.5645846724510193, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 0.5338195077484047, | |
| "grad_norm": 1.2673912048339844, | |
| "learning_rate": 1.8389939542028484e-05, | |
| "loss": 0.6267315745353699, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 0.5367365542388332, | |
| "grad_norm": 1.0273847579956055, | |
| "learning_rate": 1.8362124769373064e-05, | |
| "loss": 0.5256403684616089, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 0.5396536007292616, | |
| "grad_norm": 1.006093978881836, | |
| "learning_rate": 1.8334093203783986e-05, | |
| "loss": 0.5916382074356079, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.5425706472196901, | |
| "grad_norm": 1.2740857601165771, | |
| "learning_rate": 1.8305845571995843e-05, | |
| "loss": 0.581648588180542, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 0.5454876937101185, | |
| "grad_norm": 1.494248390197754, | |
| "learning_rate": 1.8277382606344872e-05, | |
| "loss": 0.4824523627758026, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 0.548404740200547, | |
| "grad_norm": 1.1862496137619019, | |
| "learning_rate": 1.824870504474996e-05, | |
| "loss": 0.5531858205795288, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 0.5513217866909754, | |
| "grad_norm": 3.503049373626709, | |
| "learning_rate": 1.8219813630693523e-05, | |
| "loss": 0.6308296918869019, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 0.5542388331814039, | |
| "grad_norm": 1.7544710636138916, | |
| "learning_rate": 1.819070911320222e-05, | |
| "loss": 0.6146273016929626, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.5571558796718322, | |
| "grad_norm": 1.3367774486541748, | |
| "learning_rate": 1.8161392246827546e-05, | |
| "loss": 0.5848966240882874, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 0.5600729261622607, | |
| "grad_norm": 1.696418046951294, | |
| "learning_rate": 1.8131863791626263e-05, | |
| "loss": 0.6621730327606201, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 0.5629899726526891, | |
| "grad_norm": 1.360052227973938, | |
| "learning_rate": 1.8102124513140694e-05, | |
| "loss": 0.5972204208374023, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 0.5659070191431176, | |
| "grad_norm": 1.5376263856887817, | |
| "learning_rate": 1.807217518237888e-05, | |
| "loss": 0.4938785433769226, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 0.568824065633546, | |
| "grad_norm": 1.2249681949615479, | |
| "learning_rate": 1.8042016575794585e-05, | |
| "loss": 0.5366095304489136, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.5717411121239745, | |
| "grad_norm": 1.7868080139160156, | |
| "learning_rate": 1.8011649475267178e-05, | |
| "loss": 0.5116773843765259, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 0.574658158614403, | |
| "grad_norm": 2.369993209838867, | |
| "learning_rate": 1.7981074668081345e-05, | |
| "loss": 0.49072742462158203, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 0.5775752051048314, | |
| "grad_norm": 1.0168434381484985, | |
| "learning_rate": 1.7950292946906695e-05, | |
| "loss": 0.5691611170768738, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 0.5804922515952597, | |
| "grad_norm": 1.2990851402282715, | |
| "learning_rate": 1.7919305109777195e-05, | |
| "loss": 0.5515039563179016, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 0.5834092980856882, | |
| "grad_norm": 1.4859853982925415, | |
| "learning_rate": 1.7888111960070493e-05, | |
| "loss": 0.5017011165618896, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.5834092980856882, | |
| "eval_loss": 0.5414339303970337, | |
| "eval_runtime": 1180.7894, | |
| "eval_samples_per_second": 0.535, | |
| "eval_steps_per_second": 0.535, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.5863263445761167, | |
| "grad_norm": 1.0065829753875732, | |
| "learning_rate": 1.7856714306487088e-05, | |
| "loss": 0.5677731037139893, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 0.5892433910665451, | |
| "grad_norm": 1.1727538108825684, | |
| "learning_rate": 1.7825112963029352e-05, | |
| "loss": 0.4525509476661682, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 0.5921604375569736, | |
| "grad_norm": 1.3376752138137817, | |
| "learning_rate": 1.7793308748980437e-05, | |
| "loss": 0.5208959579467773, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 0.595077484047402, | |
| "grad_norm": 0.9196159839630127, | |
| "learning_rate": 1.776130248888304e-05, | |
| "loss": 0.6033903360366821, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 0.5979945305378305, | |
| "grad_norm": 1.0750919580459595, | |
| "learning_rate": 1.772909501251801e-05, | |
| "loss": 0.5449609160423279, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.6009115770282589, | |
| "grad_norm": 1.2459467649459839, | |
| "learning_rate": 1.769668715488285e-05, | |
| "loss": 0.5685338377952576, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 0.6038286235186874, | |
| "grad_norm": 1.1690552234649658, | |
| "learning_rate": 1.766407975617006e-05, | |
| "loss": 0.5240382552146912, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 0.6067456700091157, | |
| "grad_norm": 1.0816599130630493, | |
| "learning_rate": 1.7631273661745362e-05, | |
| "loss": 0.6802893877029419, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 0.6096627164995442, | |
| "grad_norm": 1.3662947416305542, | |
| "learning_rate": 1.7598269722125775e-05, | |
| "loss": 0.48193931579589844, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 0.6125797629899726, | |
| "grad_norm": 0.9364766478538513, | |
| "learning_rate": 1.7565068792957576e-05, | |
| "loss": 0.5675849914550781, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.6154968094804011, | |
| "grad_norm": 1.123828411102295, | |
| "learning_rate": 1.75316717349941e-05, | |
| "loss": 0.5474762916564941, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 0.6184138559708295, | |
| "grad_norm": 1.1924363374710083, | |
| "learning_rate": 1.749807941407345e-05, | |
| "loss": 0.4918654263019562, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 0.621330902461258, | |
| "grad_norm": 1.101293921470642, | |
| "learning_rate": 1.7464292701096014e-05, | |
| "loss": 0.5742691159248352, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 0.6242479489516864, | |
| "grad_norm": 1.7374963760375977, | |
| "learning_rate": 1.7430312472001928e-05, | |
| "loss": 0.5828965902328491, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 0.6271649954421149, | |
| "grad_norm": 1.3195666074752808, | |
| "learning_rate": 1.739613960774833e-05, | |
| "loss": 0.5265159010887146, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.6300820419325432, | |
| "grad_norm": 1.254686713218689, | |
| "learning_rate": 1.7361774994286545e-05, | |
| "loss": 0.4929371476173401, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 0.6329990884229717, | |
| "grad_norm": 1.1476380825042725, | |
| "learning_rate": 1.7327219522539102e-05, | |
| "loss": 0.5060417652130127, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 0.6359161349134002, | |
| "grad_norm": 1.0914150476455688, | |
| "learning_rate": 1.7292474088376643e-05, | |
| "loss": 0.504043698310852, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 0.6388331814038286, | |
| "grad_norm": 1.1339508295059204, | |
| "learning_rate": 1.7257539592594698e-05, | |
| "loss": 0.4797310531139374, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 0.6417502278942571, | |
| "grad_norm": 1.0805399417877197, | |
| "learning_rate": 1.722241694089033e-05, | |
| "loss": 0.5878555178642273, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.6446672743846855, | |
| "grad_norm": 1.8615056276321411, | |
| "learning_rate": 1.718710704383865e-05, | |
| "loss": 0.5005823969841003, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 0.647584320875114, | |
| "grad_norm": 1.1445401906967163, | |
| "learning_rate": 1.7151610816869214e-05, | |
| "loss": 0.4949319064617157, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 0.6505013673655424, | |
| "grad_norm": 0.9726515412330627, | |
| "learning_rate": 1.711592918024229e-05, | |
| "loss": 0.5073204040527344, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 0.6534184138559709, | |
| "grad_norm": 1.4491140842437744, | |
| "learning_rate": 1.7080063059024998e-05, | |
| "loss": 0.47885262966156006, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 0.6563354603463992, | |
| "grad_norm": 1.0070592164993286, | |
| "learning_rate": 1.7044013383067327e-05, | |
| "loss": 0.5775837898254395, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.6592525068368277, | |
| "grad_norm": 0.966221272945404, | |
| "learning_rate": 1.7007781086978037e-05, | |
| "loss": 0.5050399899482727, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 0.6621695533272561, | |
| "grad_norm": 0.9808815121650696, | |
| "learning_rate": 1.6971367110100407e-05, | |
| "loss": 0.5737045407295227, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 0.6650865998176846, | |
| "grad_norm": 1.0158127546310425, | |
| "learning_rate": 1.6934772396487906e-05, | |
| "loss": 0.48077821731567383, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 0.668003646308113, | |
| "grad_norm": 1.32015860080719, | |
| "learning_rate": 1.6897997894879706e-05, | |
| "loss": 0.5614925026893616, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 0.6709206927985415, | |
| "grad_norm": 1.1055903434753418, | |
| "learning_rate": 1.686104455867608e-05, | |
| "loss": 0.4970760643482208, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.67383773928897, | |
| "grad_norm": 1.0804500579833984, | |
| "learning_rate": 1.682391334591371e-05, | |
| "loss": 0.5540452003479004, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 0.6767547857793984, | |
| "grad_norm": 1.1906245946884155, | |
| "learning_rate": 1.6786605219240807e-05, | |
| "loss": 0.5778501033782959, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 0.6796718322698267, | |
| "grad_norm": 0.9758645296096802, | |
| "learning_rate": 1.6749121145892192e-05, | |
| "loss": 0.49073565006256104, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 0.6825888787602552, | |
| "grad_norm": 1.1678364276885986, | |
| "learning_rate": 1.6711462097664207e-05, | |
| "loss": 0.4828741252422333, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 0.6855059252506837, | |
| "grad_norm": 1.148301362991333, | |
| "learning_rate": 1.6673629050889507e-05, | |
| "loss": 0.5143818855285645, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.6884229717411121, | |
| "grad_norm": 1.005898356437683, | |
| "learning_rate": 1.6635622986411776e-05, | |
| "loss": 0.5301160216331482, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 0.6913400182315406, | |
| "grad_norm": 1.2227320671081543, | |
| "learning_rate": 1.659744488956027e-05, | |
| "loss": 0.4800386130809784, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 0.694257064721969, | |
| "grad_norm": 0.986456573009491, | |
| "learning_rate": 1.6559095750124296e-05, | |
| "loss": 0.5098081827163696, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 0.6971741112123975, | |
| "grad_norm": 1.1474376916885376, | |
| "learning_rate": 1.6520576562327518e-05, | |
| "loss": 0.5147273540496826, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 0.7000911577028259, | |
| "grad_norm": 1.10917067527771, | |
| "learning_rate": 1.6481888324802223e-05, | |
| "loss": 0.5023190379142761, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.7030082041932544, | |
| "grad_norm": 1.2339262962341309, | |
| "learning_rate": 1.644303204056341e-05, | |
| "loss": 0.5282092690467834, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 0.7059252506836827, | |
| "grad_norm": 0.997941255569458, | |
| "learning_rate": 1.640400871698277e-05, | |
| "loss": 0.5635963082313538, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 0.7088422971741112, | |
| "grad_norm": 1.0345823764801025, | |
| "learning_rate": 1.63648193657626e-05, | |
| "loss": 0.5577977895736694, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 0.7117593436645396, | |
| "grad_norm": 1.3468303680419922, | |
| "learning_rate": 1.6325465002909554e-05, | |
| "loss": 0.4365362524986267, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 0.7146763901549681, | |
| "grad_norm": 1.2817128896713257, | |
| "learning_rate": 1.628594664870831e-05, | |
| "loss": 0.46069926023483276, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.7175934366453965, | |
| "grad_norm": 1.043311357498169, | |
| "learning_rate": 1.6246265327695117e-05, | |
| "loss": 0.5476971864700317, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 0.720510483135825, | |
| "grad_norm": 1.0297389030456543, | |
| "learning_rate": 1.620642206863124e-05, | |
| "loss": 0.48051249980926514, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 0.7234275296262535, | |
| "grad_norm": 1.4869836568832397, | |
| "learning_rate": 1.6166417904476257e-05, | |
| "loss": 0.5683314800262451, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 0.7263445761166819, | |
| "grad_norm": 1.0628005266189575, | |
| "learning_rate": 1.6126253872361336e-05, | |
| "loss": 0.5277887582778931, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 0.7292616226071102, | |
| "grad_norm": 1.2682170867919922, | |
| "learning_rate": 1.608593101356229e-05, | |
| "loss": 0.5048879384994507, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.7292616226071102, | |
| "eval_loss": 0.5038471221923828, | |
| "eval_runtime": 1175.0375, | |
| "eval_samples_per_second": 0.538, | |
| "eval_steps_per_second": 0.538, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.7321786690975387, | |
| "grad_norm": 1.7376199960708618, | |
| "learning_rate": 1.6045450373472626e-05, | |
| "loss": 0.5093721151351929, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 0.7350957155879672, | |
| "grad_norm": 1.6047718524932861, | |
| "learning_rate": 1.6004813001576405e-05, | |
| "loss": 0.4796055555343628, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 0.7380127620783956, | |
| "grad_norm": 1.3582886457443237, | |
| "learning_rate": 1.5964019951421058e-05, | |
| "loss": 0.4733014702796936, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 0.7409298085688241, | |
| "grad_norm": 0.9468897581100464, | |
| "learning_rate": 1.5923072280590072e-05, | |
| "loss": 0.5312032103538513, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 0.7438468550592525, | |
| "grad_norm": 1.3890198469161987, | |
| "learning_rate": 1.5881971050675547e-05, | |
| "loss": 0.47576645016670227, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.746763901549681, | |
| "grad_norm": 1.782992959022522, | |
| "learning_rate": 1.584071732725071e-05, | |
| "loss": 0.5555092096328735, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 0.7496809480401094, | |
| "grad_norm": 1.1790621280670166, | |
| "learning_rate": 1.5799312179842265e-05, | |
| "loss": 0.5148727893829346, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 0.7525979945305379, | |
| "grad_norm": 1.446694254875183, | |
| "learning_rate": 1.5757756681902664e-05, | |
| "loss": 0.49939870834350586, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 0.7555150410209662, | |
| "grad_norm": 1.1786166429519653, | |
| "learning_rate": 1.571605191078229e-05, | |
| "loss": 0.562156081199646, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 0.7584320875113947, | |
| "grad_norm": 1.16925847530365, | |
| "learning_rate": 1.567419894770151e-05, | |
| "loss": 0.49580734968185425, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.7613491340018231, | |
| "grad_norm": 1.60944664478302, | |
| "learning_rate": 1.5632198877722676e-05, | |
| "loss": 0.4821680784225464, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 0.7642661804922516, | |
| "grad_norm": 1.3957884311676025, | |
| "learning_rate": 1.5590052789721946e-05, | |
| "loss": 0.4392276406288147, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 0.76718322698268, | |
| "grad_norm": 1.636195421218872, | |
| "learning_rate": 1.5547761776361096e-05, | |
| "loss": 0.39603114128112793, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 0.7701002734731085, | |
| "grad_norm": 1.496766448020935, | |
| "learning_rate": 1.550532693405917e-05, | |
| "loss": 0.4833749234676361, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 0.773017319963537, | |
| "grad_norm": 1.3587844371795654, | |
| "learning_rate": 1.5462749362964058e-05, | |
| "loss": 0.43738317489624023, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.7759343664539654, | |
| "grad_norm": 1.670704960823059, | |
| "learning_rate": 1.5420030166923983e-05, | |
| "loss": 0.4476737380027771, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 0.7788514129443938, | |
| "grad_norm": 1.2674932479858398, | |
| "learning_rate": 1.537717045345888e-05, | |
| "loss": 0.42266708612442017, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 0.7817684594348222, | |
| "grad_norm": 2.0639536380767822, | |
| "learning_rate": 1.5334171333731666e-05, | |
| "loss": 0.5245381593704224, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 0.7846855059252507, | |
| "grad_norm": 1.2091766595840454, | |
| "learning_rate": 1.529103392251946e-05, | |
| "loss": 0.5166443586349487, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 0.7876025524156791, | |
| "grad_norm": 1.1021631956100464, | |
| "learning_rate": 1.5247759338184653e-05, | |
| "loss": 0.5674265027046204, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.7905195989061076, | |
| "grad_norm": 1.3143829107284546, | |
| "learning_rate": 1.520434870264595e-05, | |
| "loss": 0.40855613350868225, | |
| "step": 271 | |
| }, | |
| { | |
| "epoch": 0.793436645396536, | |
| "grad_norm": 1.1784812211990356, | |
| "learning_rate": 1.5160803141349244e-05, | |
| "loss": 0.4308925271034241, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 0.7963536918869645, | |
| "grad_norm": 2.1635706424713135, | |
| "learning_rate": 1.5117123783238458e-05, | |
| "loss": 0.45035502314567566, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 0.7992707383773929, | |
| "grad_norm": 1.569203495979309, | |
| "learning_rate": 1.5073311760726287e-05, | |
| "loss": 0.5095728635787964, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 0.8021877848678214, | |
| "grad_norm": 2.532621383666992, | |
| "learning_rate": 1.5029368209664822e-05, | |
| "loss": 0.496748685836792, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.8051048313582497, | |
| "grad_norm": 1.6312552690505981, | |
| "learning_rate": 1.4985294269316098e-05, | |
| "loss": 0.4972914159297943, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 0.8080218778486782, | |
| "grad_norm": 1.3996756076812744, | |
| "learning_rate": 1.4941091082322579e-05, | |
| "loss": 0.5589750409126282, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 0.8109389243391066, | |
| "grad_norm": 1.1288363933563232, | |
| "learning_rate": 1.4896759794677526e-05, | |
| "loss": 0.5349453687667847, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 0.8138559708295351, | |
| "grad_norm": 1.6913920640945435, | |
| "learning_rate": 1.4852301555695268e-05, | |
| "loss": 0.46511000394821167, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 0.8167730173199635, | |
| "grad_norm": 1.1913212537765503, | |
| "learning_rate": 1.4807717517981439e-05, | |
| "loss": 0.4715422987937927, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.819690063810392, | |
| "grad_norm": 1.1179691553115845, | |
| "learning_rate": 1.476300883740307e-05, | |
| "loss": 0.53330397605896, | |
| "step": 281 | |
| }, | |
| { | |
| "epoch": 0.8226071103008205, | |
| "grad_norm": 1.7473797798156738, | |
| "learning_rate": 1.4718176673058624e-05, | |
| "loss": 0.47564437985420227, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 0.8255241567912489, | |
| "grad_norm": 1.2653177976608276, | |
| "learning_rate": 1.4673222187247963e-05, | |
| "loss": 0.46364277601242065, | |
| "step": 283 | |
| }, | |
| { | |
| "epoch": 0.8284412032816773, | |
| "grad_norm": 1.2567330598831177, | |
| "learning_rate": 1.4628146545442202e-05, | |
| "loss": 0.4778091013431549, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 0.8313582497721057, | |
| "grad_norm": 1.5848406553268433, | |
| "learning_rate": 1.4582950916253488e-05, | |
| "loss": 0.4480203688144684, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.8342752962625342, | |
| "grad_norm": 1.3278183937072754, | |
| "learning_rate": 1.453763647140472e-05, | |
| "loss": 0.37945032119750977, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 0.8371923427529626, | |
| "grad_norm": 1.0961651802062988, | |
| "learning_rate": 1.4492204385699155e-05, | |
| "loss": 0.5306747555732727, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 0.8401093892433911, | |
| "grad_norm": 1.176276683807373, | |
| "learning_rate": 1.4446655836989961e-05, | |
| "loss": 0.49950045347213745, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 0.8430264357338195, | |
| "grad_norm": 1.2228269577026367, | |
| "learning_rate": 1.4400992006149674e-05, | |
| "loss": 0.494475394487381, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 0.845943482224248, | |
| "grad_norm": 1.1584209203720093, | |
| "learning_rate": 1.4355214077039592e-05, | |
| "loss": 0.44170859456062317, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.8488605287146764, | |
| "grad_norm": 1.2041938304901123, | |
| "learning_rate": 1.4309323236479071e-05, | |
| "loss": 0.4359871745109558, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 0.8517775752051049, | |
| "grad_norm": 1.279645562171936, | |
| "learning_rate": 1.4263320674214762e-05, | |
| "loss": 0.45031386613845825, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 0.8546946216955332, | |
| "grad_norm": 1.3958357572555542, | |
| "learning_rate": 1.4217207582889769e-05, | |
| "loss": 0.4832204580307007, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 0.8576116681859617, | |
| "grad_norm": 1.2788586616516113, | |
| "learning_rate": 1.4170985158012725e-05, | |
| "loss": 0.5154346227645874, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 0.8605287146763901, | |
| "grad_norm": 1.3634892702102661, | |
| "learning_rate": 1.4124654597926795e-05, | |
| "loss": 0.46777206659317017, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.8634457611668186, | |
| "grad_norm": 1.2719579935073853, | |
| "learning_rate": 1.4078217103778619e-05, | |
| "loss": 0.4247053265571594, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 0.866362807657247, | |
| "grad_norm": 2.890467643737793, | |
| "learning_rate": 1.4031673879487161e-05, | |
| "loss": 0.38349640369415283, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 0.8692798541476755, | |
| "grad_norm": 2.4354801177978516, | |
| "learning_rate": 1.3985026131712499e-05, | |
| "loss": 0.4134889543056488, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 0.872196900638104, | |
| "grad_norm": 1.0138323307037354, | |
| "learning_rate": 1.3938275069824541e-05, | |
| "loss": 0.5176680684089661, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 0.8751139471285324, | |
| "grad_norm": 1.2316186428070068, | |
| "learning_rate": 1.389142190587168e-05, | |
| "loss": 0.4818477928638458, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.8751139471285324, | |
| "eval_loss": 0.4752846360206604, | |
| "eval_runtime": 1189.1666, | |
| "eval_samples_per_second": 0.531, | |
| "eval_steps_per_second": 0.531, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.8780309936189608, | |
| "grad_norm": 1.515487551689148, | |
| "learning_rate": 1.384446785454936e-05, | |
| "loss": 0.47766175866127014, | |
| "step": 301 | |
| }, | |
| { | |
| "epoch": 0.8809480401093892, | |
| "grad_norm": 1.4357497692108154, | |
| "learning_rate": 1.3797414133168591e-05, | |
| "loss": 0.49297061562538147, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 0.8838650865998177, | |
| "grad_norm": 1.2523037195205688, | |
| "learning_rate": 1.3750261961624383e-05, | |
| "loss": 0.4629015326499939, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 0.8867821330902461, | |
| "grad_norm": 3.5790023803710938, | |
| "learning_rate": 1.3703012562364124e-05, | |
| "loss": 0.3773120045661926, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 0.8896991795806746, | |
| "grad_norm": 1.9305704832077026, | |
| "learning_rate": 1.3655667160355892e-05, | |
| "loss": 0.496719628572464, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.892616226071103, | |
| "grad_norm": 1.1506154537200928, | |
| "learning_rate": 1.3608226983056687e-05, | |
| "loss": 0.49487072229385376, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 0.8955332725615315, | |
| "grad_norm": 1.8046090602874756, | |
| "learning_rate": 1.3560693260380614e-05, | |
| "loss": 0.4910697937011719, | |
| "step": 307 | |
| }, | |
| { | |
| "epoch": 0.8984503190519599, | |
| "grad_norm": 2.0088653564453125, | |
| "learning_rate": 1.3513067224667e-05, | |
| "loss": 0.508246660232544, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 0.9013673655423883, | |
| "grad_norm": 1.2966033220291138, | |
| "learning_rate": 1.3465350110648437e-05, | |
| "loss": 0.5125166177749634, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 0.9042844120328167, | |
| "grad_norm": 1.9976309537887573, | |
| "learning_rate": 1.3417543155418775e-05, | |
| "loss": 0.43942537903785706, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.9072014585232452, | |
| "grad_norm": 1.2663682699203491, | |
| "learning_rate": 1.336964759840105e-05, | |
| "loss": 0.4839101731777191, | |
| "step": 311 | |
| }, | |
| { | |
| "epoch": 0.9101185050136736, | |
| "grad_norm": 1.1223328113555908, | |
| "learning_rate": 1.3321664681315354e-05, | |
| "loss": 0.48008066415786743, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 0.9130355515041021, | |
| "grad_norm": 1.5786972045898438, | |
| "learning_rate": 1.3273595648146634e-05, | |
| "loss": 0.47250309586524963, | |
| "step": 313 | |
| }, | |
| { | |
| "epoch": 0.9159525979945305, | |
| "grad_norm": 1.2150241136550903, | |
| "learning_rate": 1.322544174511245e-05, | |
| "loss": 0.5149738788604736, | |
| "step": 314 | |
| }, | |
| { | |
| "epoch": 0.918869644484959, | |
| "grad_norm": 1.3676542043685913, | |
| "learning_rate": 1.3177204220630662e-05, | |
| "loss": 0.4430195093154907, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.9217866909753875, | |
| "grad_norm": 1.0703285932540894, | |
| "learning_rate": 1.3128884325287064e-05, | |
| "loss": 0.4798983037471771, | |
| "step": 316 | |
| }, | |
| { | |
| "epoch": 0.9247037374658159, | |
| "grad_norm": 1.3131535053253174, | |
| "learning_rate": 1.308048331180296e-05, | |
| "loss": 0.4241073727607727, | |
| "step": 317 | |
| }, | |
| { | |
| "epoch": 0.9276207839562443, | |
| "grad_norm": 1.4485348463058472, | |
| "learning_rate": 1.3032002435002698e-05, | |
| "loss": 0.527199923992157, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 0.9305378304466727, | |
| "grad_norm": 1.370936393737793, | |
| "learning_rate": 1.2983442951781114e-05, | |
| "loss": 0.47125962376594543, | |
| "step": 319 | |
| }, | |
| { | |
| "epoch": 0.9334548769371012, | |
| "grad_norm": 1.2369643449783325, | |
| "learning_rate": 1.2934806121070973e-05, | |
| "loss": 0.4814244210720062, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.9363719234275296, | |
| "grad_norm": 1.2632933855056763, | |
| "learning_rate": 1.2886093203810314e-05, | |
| "loss": 0.4915548264980316, | |
| "step": 321 | |
| }, | |
| { | |
| "epoch": 0.9392889699179581, | |
| "grad_norm": 1.054569959640503, | |
| "learning_rate": 1.2837305462909764e-05, | |
| "loss": 0.5325602293014526, | |
| "step": 322 | |
| }, | |
| { | |
| "epoch": 0.9422060164083865, | |
| "grad_norm": 1.15959632396698, | |
| "learning_rate": 1.27884441632198e-05, | |
| "loss": 0.43607404828071594, | |
| "step": 323 | |
| }, | |
| { | |
| "epoch": 0.945123062898815, | |
| "grad_norm": 1.1667979955673218, | |
| "learning_rate": 1.2739510571497945e-05, | |
| "loss": 0.4631507992744446, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 0.9480401093892434, | |
| "grad_norm": 1.6009081602096558, | |
| "learning_rate": 1.2690505956375944e-05, | |
| "loss": 0.4935731887817383, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.9509571558796718, | |
| "grad_norm": 1.1193996667861938, | |
| "learning_rate": 1.2641431588326858e-05, | |
| "loss": 0.45883435010910034, | |
| "step": 326 | |
| }, | |
| { | |
| "epoch": 0.9538742023701002, | |
| "grad_norm": 1.5365067720413208, | |
| "learning_rate": 1.2592288739632138e-05, | |
| "loss": 0.5206276178359985, | |
| "step": 327 | |
| }, | |
| { | |
| "epoch": 0.9567912488605287, | |
| "grad_norm": 1.0714622735977173, | |
| "learning_rate": 1.2543078684348632e-05, | |
| "loss": 0.5242853760719299, | |
| "step": 328 | |
| }, | |
| { | |
| "epoch": 0.9597082953509571, | |
| "grad_norm": 1.3009248971939087, | |
| "learning_rate": 1.2493802698275557e-05, | |
| "loss": 0.4794357717037201, | |
| "step": 329 | |
| }, | |
| { | |
| "epoch": 0.9626253418413856, | |
| "grad_norm": 1.495771050453186, | |
| "learning_rate": 1.244446205892143e-05, | |
| "loss": 0.5849282145500183, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.965542388331814, | |
| "grad_norm": 1.2046003341674805, | |
| "learning_rate": 1.2395058045470935e-05, | |
| "loss": 0.47758305072784424, | |
| "step": 331 | |
| }, | |
| { | |
| "epoch": 0.9684594348222425, | |
| "grad_norm": 1.1362569332122803, | |
| "learning_rate": 1.2345591938751772e-05, | |
| "loss": 0.4490663409233093, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 0.971376481312671, | |
| "grad_norm": 1.2658129930496216, | |
| "learning_rate": 1.2296065021201438e-05, | |
| "loss": 0.4035309851169586, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 0.9742935278030994, | |
| "grad_norm": 4.370306015014648, | |
| "learning_rate": 1.2246478576833993e-05, | |
| "loss": 0.495273619890213, | |
| "step": 334 | |
| }, | |
| { | |
| "epoch": 0.9772105742935278, | |
| "grad_norm": 1.3863654136657715, | |
| "learning_rate": 1.219683389120676e-05, | |
| "loss": 0.46410733461380005, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.9801276207839562, | |
| "grad_norm": 1.4544321298599243, | |
| "learning_rate": 1.2147132251387004e-05, | |
| "loss": 0.4301709830760956, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 0.9830446672743847, | |
| "grad_norm": 1.0852457284927368, | |
| "learning_rate": 1.2097374945918554e-05, | |
| "loss": 0.48892468214035034, | |
| "step": 337 | |
| }, | |
| { | |
| "epoch": 0.9859617137648131, | |
| "grad_norm": 1.5062257051467896, | |
| "learning_rate": 1.2047563264788412e-05, | |
| "loss": 0.4667983055114746, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 0.9888787602552416, | |
| "grad_norm": 1.2472951412200928, | |
| "learning_rate": 1.199769849939329e-05, | |
| "loss": 0.4827345013618469, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 0.99179580674567, | |
| "grad_norm": 1.2589871883392334, | |
| "learning_rate": 1.1947781942506151e-05, | |
| "loss": 0.405245304107666, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.9947128532360985, | |
| "grad_norm": 1.25636625289917, | |
| "learning_rate": 1.1897814888242679e-05, | |
| "loss": 0.37956133484840393, | |
| "step": 341 | |
| }, | |
| { | |
| "epoch": 0.9976298997265269, | |
| "grad_norm": 2.7064895629882812, | |
| "learning_rate": 1.1847798632027726e-05, | |
| "loss": 0.489456444978714, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 1.6156240701675415, | |
| "learning_rate": 1.1797734470561744e-05, | |
| "loss": 0.46473199129104614, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 1.0029170464904285, | |
| "grad_norm": 1.3046343326568604, | |
| "learning_rate": 1.1747623701787143e-05, | |
| "loss": 0.3504878282546997, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 1.005834092980857, | |
| "grad_norm": 1.414828896522522, | |
| "learning_rate": 1.1697467624854666e-05, | |
| "loss": 0.4719260334968567, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 1.0087511394712854, | |
| "grad_norm": 1.1873356103897095, | |
| "learning_rate": 1.164726754008969e-05, | |
| "loss": 0.45313555002212524, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 1.0116681859617138, | |
| "grad_norm": 1.1382380723953247, | |
| "learning_rate": 1.1597024748958526e-05, | |
| "loss": 0.4365478456020355, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 1.0145852324521423, | |
| "grad_norm": 1.8141961097717285, | |
| "learning_rate": 1.1546740554034661e-05, | |
| "loss": 0.3694503605365753, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 1.0175022789425707, | |
| "grad_norm": 1.333388328552246, | |
| "learning_rate": 1.1496416258965015e-05, | |
| "loss": 0.4755721688270569, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 1.0204193254329992, | |
| "grad_norm": 1.3464443683624268, | |
| "learning_rate": 1.1446053168436117e-05, | |
| "loss": 0.4227846562862396, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 1.0204193254329992, | |
| "eval_loss": 0.44924086332321167, | |
| "eval_runtime": 1214.6648, | |
| "eval_samples_per_second": 0.52, | |
| "eval_steps_per_second": 0.52, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 1.0233363719234276, | |
| "grad_norm": 1.2682689428329468, | |
| "learning_rate": 1.1395652588140292e-05, | |
| "loss": 0.44300130009651184, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 1.0262534184138559, | |
| "grad_norm": 1.7737696170806885, | |
| "learning_rate": 1.1345215824741814e-05, | |
| "loss": 0.5106258988380432, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 1.0291704649042843, | |
| "grad_norm": 1.2601238489151, | |
| "learning_rate": 1.1294744185843014e-05, | |
| "loss": 0.45930635929107666, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 1.0320875113947128, | |
| "grad_norm": 1.2162678241729736, | |
| "learning_rate": 1.1244238979950406e-05, | |
| "loss": 0.44163084030151367, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 1.0350045578851412, | |
| "grad_norm": 1.0905817747116089, | |
| "learning_rate": 1.1193701516440733e-05, | |
| "loss": 0.510662317276001, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 1.0379216043755697, | |
| "grad_norm": 0.9624952673912048, | |
| "learning_rate": 1.1143133105527048e-05, | |
| "loss": 0.5297917127609253, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 1.0408386508659981, | |
| "grad_norm": 1.2757681608200073, | |
| "learning_rate": 1.1092535058224725e-05, | |
| "loss": 0.4332093596458435, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 1.0437556973564266, | |
| "grad_norm": 1.6885719299316406, | |
| "learning_rate": 1.104190868631748e-05, | |
| "loss": 0.4337635040283203, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 1.046672743846855, | |
| "grad_norm": 1.175484538078308, | |
| "learning_rate": 1.099125530232336e-05, | |
| "loss": 0.45411020517349243, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 1.0495897903372835, | |
| "grad_norm": 1.0964939594268799, | |
| "learning_rate": 1.0940576219460723e-05, | |
| "loss": 0.5333439707756042, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 1.052506836827712, | |
| "grad_norm": 1.5493136644363403, | |
| "learning_rate": 1.0889872751614176e-05, | |
| "loss": 0.4400906264781952, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 1.0554238833181404, | |
| "grad_norm": 1.2491416931152344, | |
| "learning_rate": 1.0839146213300526e-05, | |
| "loss": 0.31049978733062744, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 1.0583409298085689, | |
| "grad_norm": 1.7213693857192993, | |
| "learning_rate": 1.0788397919634694e-05, | |
| "loss": 0.389009028673172, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 1.0612579762989973, | |
| "grad_norm": 1.5405336618423462, | |
| "learning_rate": 1.0737629186295621e-05, | |
| "loss": 0.4068562984466553, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 1.0641750227894258, | |
| "grad_norm": 1.225455641746521, | |
| "learning_rate": 1.0686841329492159e-05, | |
| "loss": 0.47358617186546326, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 1.0670920692798542, | |
| "grad_norm": 1.3436250686645508, | |
| "learning_rate": 1.0636035665928945e-05, | |
| "loss": 0.47050854563713074, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 1.0700091157702827, | |
| "grad_norm": 1.4952112436294556, | |
| "learning_rate": 1.058521351277227e-05, | |
| "loss": 0.43496906757354736, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 1.072926162260711, | |
| "grad_norm": 1.549112319946289, | |
| "learning_rate": 1.0534376187615924e-05, | |
| "loss": 0.45711052417755127, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 1.0758432087511394, | |
| "grad_norm": 1.3851526975631714, | |
| "learning_rate": 1.048352500844704e-05, | |
| "loss": 0.45045915246009827, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 1.0787602552415678, | |
| "grad_norm": 1.6302049160003662, | |
| "learning_rate": 1.0432661293611927e-05, | |
| "loss": 0.3736046254634857, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 1.0816773017319963, | |
| "grad_norm": 1.3365869522094727, | |
| "learning_rate": 1.0381786361781885e-05, | |
| "loss": 0.42242100834846497, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 1.0845943482224247, | |
| "grad_norm": 1.4369138479232788, | |
| "learning_rate": 1.0330901531919026e-05, | |
| "loss": 0.44570961594581604, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 1.0875113947128532, | |
| "grad_norm": 1.3528283834457397, | |
| "learning_rate": 1.0280008123242069e-05, | |
| "loss": 0.43440738320350647, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 1.0904284412032816, | |
| "grad_norm": 1.469660997390747, | |
| "learning_rate": 1.0229107455192147e-05, | |
| "loss": 0.3960394263267517, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 1.09334548769371, | |
| "grad_norm": 1.4542185068130493, | |
| "learning_rate": 1.0178200847398595e-05, | |
| "loss": 0.47834208607673645, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 1.0962625341841385, | |
| "grad_norm": 1.6470292806625366, | |
| "learning_rate": 1.0127289619644737e-05, | |
| "loss": 0.42791086435317993, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 1.099179580674567, | |
| "grad_norm": 1.1934021711349487, | |
| "learning_rate": 1.0076375091833681e-05, | |
| "loss": 0.4401305019855499, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 1.1020966271649955, | |
| "grad_norm": 0.9786668419837952, | |
| "learning_rate": 1.0025458583954078e-05, | |
| "loss": 0.4816555678844452, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 1.105013673655424, | |
| "grad_norm": 1.1348779201507568, | |
| "learning_rate": 9.974541416045924e-06, | |
| "loss": 0.41516968607902527, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 1.1079307201458524, | |
| "grad_norm": 1.0188615322113037, | |
| "learning_rate": 9.923624908166322e-06, | |
| "loss": 0.48087278008461, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 1.1108477666362808, | |
| "grad_norm": 1.0821740627288818, | |
| "learning_rate": 9.872710380355263e-06, | |
| "loss": 0.41974008083343506, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 1.1137648131267093, | |
| "grad_norm": 1.250951886177063, | |
| "learning_rate": 9.82179915260141e-06, | |
| "loss": 0.42703643441200256, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 1.1166818596171377, | |
| "grad_norm": 1.4528254270553589, | |
| "learning_rate": 9.770892544807856e-06, | |
| "loss": 0.43801453709602356, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 1.1195989061075662, | |
| "grad_norm": 1.813859462738037, | |
| "learning_rate": 9.719991876757934e-06, | |
| "loss": 0.4344240725040436, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 1.1225159525979946, | |
| "grad_norm": 1.6681253910064697, | |
| "learning_rate": 9.669098468080976e-06, | |
| "loss": 0.4356998801231384, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 1.125432999088423, | |
| "grad_norm": 1.3447953462600708, | |
| "learning_rate": 9.618213638218117e-06, | |
| "loss": 0.43189188838005066, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 1.1283500455788513, | |
| "grad_norm": 1.9577926397323608, | |
| "learning_rate": 9.567338706388074e-06, | |
| "loss": 0.34984707832336426, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 1.1312670920692798, | |
| "grad_norm": 1.5225576162338257, | |
| "learning_rate": 9.516474991552965e-06, | |
| "loss": 0.4243963062763214, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 1.1341841385597082, | |
| "grad_norm": 1.7416809797286987, | |
| "learning_rate": 9.46562381238408e-06, | |
| "loss": 0.3414606750011444, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 1.1371011850501367, | |
| "grad_norm": 1.8358951807022095, | |
| "learning_rate": 9.414786487227732e-06, | |
| "loss": 0.387447327375412, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 1.1400182315405651, | |
| "grad_norm": 1.9706153869628906, | |
| "learning_rate": 9.363964334071057e-06, | |
| "loss": 0.4599088728427887, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 1.1429352780309936, | |
| "grad_norm": 1.0604286193847656, | |
| "learning_rate": 9.313158670507843e-06, | |
| "loss": 0.4633581042289734, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 1.145852324521422, | |
| "grad_norm": 1.4851202964782715, | |
| "learning_rate": 9.262370813704379e-06, | |
| "loss": 0.3872259557247162, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 1.1487693710118505, | |
| "grad_norm": 1.7839159965515137, | |
| "learning_rate": 9.21160208036531e-06, | |
| "loss": 0.5215944647789001, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 1.151686417502279, | |
| "grad_norm": 1.3054656982421875, | |
| "learning_rate": 9.160853786699475e-06, | |
| "loss": 0.4030425548553467, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 1.1546034639927074, | |
| "grad_norm": 3.8467981815338135, | |
| "learning_rate": 9.110127248385827e-06, | |
| "loss": 0.4032524824142456, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 1.1575205104831359, | |
| "grad_norm": 1.8513801097869873, | |
| "learning_rate": 9.05942378053928e-06, | |
| "loss": 0.46577155590057373, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 1.1604375569735643, | |
| "grad_norm": 1.312689185142517, | |
| "learning_rate": 9.008744697676642e-06, | |
| "loss": 0.39114487171173096, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 1.1633546034639928, | |
| "grad_norm": 1.1996328830718994, | |
| "learning_rate": 8.958091313682521e-06, | |
| "loss": 0.481199711561203, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 1.1662716499544212, | |
| "grad_norm": 5.172409534454346, | |
| "learning_rate": 8.90746494177528e-06, | |
| "loss": 0.3803558945655823, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 1.1662716499544212, | |
| "eval_loss": 0.4318464398384094, | |
| "eval_runtime": 1206.0306, | |
| "eval_samples_per_second": 0.524, | |
| "eval_steps_per_second": 0.524, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 1.1691886964448497, | |
| "grad_norm": 1.0115015506744385, | |
| "learning_rate": 8.856866894472954e-06, | |
| "loss": 0.39636704325675964, | |
| "step": 401 | |
| }, | |
| { | |
| "epoch": 1.172105742935278, | |
| "grad_norm": 1.1557435989379883, | |
| "learning_rate": 8.806298483559268e-06, | |
| "loss": 0.4076298475265503, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 1.1750227894257064, | |
| "grad_norm": 1.2802515029907227, | |
| "learning_rate": 8.755761020049597e-06, | |
| "loss": 0.44352248311042786, | |
| "step": 403 | |
| }, | |
| { | |
| "epoch": 1.1779398359161348, | |
| "grad_norm": 1.2755069732666016, | |
| "learning_rate": 8.705255814156988e-06, | |
| "loss": 0.390497624874115, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 1.1808568824065633, | |
| "grad_norm": 1.2799782752990723, | |
| "learning_rate": 8.654784175258188e-06, | |
| "loss": 0.35810694098472595, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 1.1837739288969917, | |
| "grad_norm": 1.0968674421310425, | |
| "learning_rate": 8.604347411859713e-06, | |
| "loss": 0.3890265226364136, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 1.1866909753874202, | |
| "grad_norm": 1.3334455490112305, | |
| "learning_rate": 8.553946831563886e-06, | |
| "loss": 0.3916901648044586, | |
| "step": 407 | |
| }, | |
| { | |
| "epoch": 1.1896080218778486, | |
| "grad_norm": 1.1888184547424316, | |
| "learning_rate": 8.503583741034988e-06, | |
| "loss": 0.5231326222419739, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 1.192525068368277, | |
| "grad_norm": 1.1163763999938965, | |
| "learning_rate": 8.45325944596534e-06, | |
| "loss": 0.4249858558177948, | |
| "step": 409 | |
| }, | |
| { | |
| "epoch": 1.1954421148587056, | |
| "grad_norm": 1.3470333814620972, | |
| "learning_rate": 8.40297525104148e-06, | |
| "loss": 0.5201632380485535, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 1.198359161349134, | |
| "grad_norm": 1.5412285327911377, | |
| "learning_rate": 8.35273245991031e-06, | |
| "loss": 0.39376699924468994, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 1.2012762078395625, | |
| "grad_norm": 1.3408735990524292, | |
| "learning_rate": 8.302532375145339e-06, | |
| "loss": 0.39554283022880554, | |
| "step": 412 | |
| }, | |
| { | |
| "epoch": 1.204193254329991, | |
| "grad_norm": 1.990668773651123, | |
| "learning_rate": 8.25237629821286e-06, | |
| "loss": 0.42424261569976807, | |
| "step": 413 | |
| }, | |
| { | |
| "epoch": 1.2071103008204194, | |
| "grad_norm": 1.6471989154815674, | |
| "learning_rate": 8.202265529438259e-06, | |
| "loss": 0.3234582543373108, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 1.2100273473108478, | |
| "grad_norm": 1.1483631134033203, | |
| "learning_rate": 8.152201367972275e-06, | |
| "loss": 0.39163246750831604, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 1.2129443938012763, | |
| "grad_norm": 1.800149917602539, | |
| "learning_rate": 8.102185111757323e-06, | |
| "loss": 0.5055042505264282, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 1.2158614402917047, | |
| "grad_norm": 1.4394795894622803, | |
| "learning_rate": 8.052218057493849e-06, | |
| "loss": 0.4761751592159271, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 1.2187784867821332, | |
| "grad_norm": 1.622689962387085, | |
| "learning_rate": 8.002301500606715e-06, | |
| "loss": 0.4490141272544861, | |
| "step": 418 | |
| }, | |
| { | |
| "epoch": 1.2216955332725616, | |
| "grad_norm": 1.2564961910247803, | |
| "learning_rate": 7.952436735211593e-06, | |
| "loss": 0.3964035212993622, | |
| "step": 419 | |
| }, | |
| { | |
| "epoch": 1.22461257976299, | |
| "grad_norm": 1.3248411417007446, | |
| "learning_rate": 7.902625054081449e-06, | |
| "loss": 0.46039122343063354, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 1.2275296262534183, | |
| "grad_norm": 1.568983793258667, | |
| "learning_rate": 7.852867748613e-06, | |
| "loss": 0.49916595220565796, | |
| "step": 421 | |
| }, | |
| { | |
| "epoch": 1.2304466727438468, | |
| "grad_norm": 1.4784491062164307, | |
| "learning_rate": 7.803166108793243e-06, | |
| "loss": 0.4035068154335022, | |
| "step": 422 | |
| }, | |
| { | |
| "epoch": 1.2333637192342752, | |
| "grad_norm": 1.2940057516098022, | |
| "learning_rate": 7.753521423166007e-06, | |
| "loss": 0.4154140055179596, | |
| "step": 423 | |
| }, | |
| { | |
| "epoch": 1.2362807657247037, | |
| "grad_norm": 1.167786717414856, | |
| "learning_rate": 7.703934978798565e-06, | |
| "loss": 0.39541637897491455, | |
| "step": 424 | |
| }, | |
| { | |
| "epoch": 1.2391978122151321, | |
| "grad_norm": 1.5126771926879883, | |
| "learning_rate": 7.65440806124823e-06, | |
| "loss": 0.37744253873825073, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 1.2421148587055606, | |
| "grad_norm": 1.2595263719558716, | |
| "learning_rate": 7.604941954529067e-06, | |
| "loss": 0.46380615234375, | |
| "step": 426 | |
| }, | |
| { | |
| "epoch": 1.245031905195989, | |
| "grad_norm": 1.4258298873901367, | |
| "learning_rate": 7.555537941078573e-06, | |
| "loss": 0.3391319513320923, | |
| "step": 427 | |
| }, | |
| { | |
| "epoch": 1.2479489516864175, | |
| "grad_norm": 1.5371774435043335, | |
| "learning_rate": 7.506197301724446e-06, | |
| "loss": 0.39805102348327637, | |
| "step": 428 | |
| }, | |
| { | |
| "epoch": 1.250865998176846, | |
| "grad_norm": 1.3789173364639282, | |
| "learning_rate": 7.456921315651371e-06, | |
| "loss": 0.37969034910202026, | |
| "step": 429 | |
| }, | |
| { | |
| "epoch": 1.2537830446672744, | |
| "grad_norm": 1.32931649684906, | |
| "learning_rate": 7.407711260367867e-06, | |
| "loss": 0.3841526508331299, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 1.2567000911577029, | |
| "grad_norm": 1.2836817502975464, | |
| "learning_rate": 7.358568411673145e-06, | |
| "loss": 0.340289443731308, | |
| "step": 431 | |
| }, | |
| { | |
| "epoch": 1.2596171376481313, | |
| "grad_norm": 1.0418318510055542, | |
| "learning_rate": 7.309494043624059e-06, | |
| "loss": 0.44747158885002136, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 1.2625341841385598, | |
| "grad_norm": 1.1769362688064575, | |
| "learning_rate": 7.260489428502058e-06, | |
| "loss": 0.45737382769584656, | |
| "step": 433 | |
| }, | |
| { | |
| "epoch": 1.265451230628988, | |
| "grad_norm": 2.2730748653411865, | |
| "learning_rate": 7.211555836780203e-06, | |
| "loss": 0.3827931582927704, | |
| "step": 434 | |
| }, | |
| { | |
| "epoch": 1.2683682771194165, | |
| "grad_norm": 1.263096809387207, | |
| "learning_rate": 7.162694537090235e-06, | |
| "loss": 0.3589435815811157, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 1.271285323609845, | |
| "grad_norm": 1.4073514938354492, | |
| "learning_rate": 7.113906796189692e-06, | |
| "loss": 0.45206642150878906, | |
| "step": 436 | |
| }, | |
| { | |
| "epoch": 1.2742023701002734, | |
| "grad_norm": 1.064585566520691, | |
| "learning_rate": 7.0651938789290306e-06, | |
| "loss": 0.5409261584281921, | |
| "step": 437 | |
| }, | |
| { | |
| "epoch": 1.2771194165907018, | |
| "grad_norm": 1.2346999645233154, | |
| "learning_rate": 7.016557048218889e-06, | |
| "loss": 0.40680158138275146, | |
| "step": 438 | |
| }, | |
| { | |
| "epoch": 1.2800364630811303, | |
| "grad_norm": 1.5816547870635986, | |
| "learning_rate": 6.967997564997306e-06, | |
| "loss": 0.38718655705451965, | |
| "step": 439 | |
| }, | |
| { | |
| "epoch": 1.2829535095715587, | |
| "grad_norm": 1.085268259048462, | |
| "learning_rate": 6.919516688197041e-06, | |
| "loss": 0.4863276779651642, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 1.2858705560619872, | |
| "grad_norm": 1.0984629392623901, | |
| "learning_rate": 6.871115674712937e-06, | |
| "loss": 0.39562875032424927, | |
| "step": 441 | |
| }, | |
| { | |
| "epoch": 1.2887876025524156, | |
| "grad_norm": 1.3004229068756104, | |
| "learning_rate": 6.822795779369339e-06, | |
| "loss": 0.44437694549560547, | |
| "step": 442 | |
| }, | |
| { | |
| "epoch": 1.291704649042844, | |
| "grad_norm": 1.3541183471679688, | |
| "learning_rate": 6.774558254887553e-06, | |
| "loss": 0.4728967249393463, | |
| "step": 443 | |
| }, | |
| { | |
| "epoch": 1.2946216955332726, | |
| "grad_norm": 1.2485377788543701, | |
| "learning_rate": 6.7264043518533695e-06, | |
| "loss": 0.4052809476852417, | |
| "step": 444 | |
| }, | |
| { | |
| "epoch": 1.297538742023701, | |
| "grad_norm": 1.412827730178833, | |
| "learning_rate": 6.67833531868465e-06, | |
| "loss": 0.40149861574172974, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 1.3004557885141295, | |
| "grad_norm": 1.5576224327087402, | |
| "learning_rate": 6.630352401598953e-06, | |
| "loss": 0.44107240438461304, | |
| "step": 446 | |
| }, | |
| { | |
| "epoch": 1.303372835004558, | |
| "grad_norm": 1.1551047563552856, | |
| "learning_rate": 6.582456844581226e-06, | |
| "loss": 0.4898405969142914, | |
| "step": 447 | |
| }, | |
| { | |
| "epoch": 1.3062898814949864, | |
| "grad_norm": 1.9939689636230469, | |
| "learning_rate": 6.5346498893515645e-06, | |
| "loss": 0.4791329801082611, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 1.3092069279854148, | |
| "grad_norm": 1.4782553911209106, | |
| "learning_rate": 6.486932775333002e-06, | |
| "loss": 0.472908616065979, | |
| "step": 449 | |
| }, | |
| { | |
| "epoch": 1.3121239744758433, | |
| "grad_norm": 1.2496148347854614, | |
| "learning_rate": 6.439306739619387e-06, | |
| "loss": 0.514995276927948, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 1.3121239744758433, | |
| "eval_loss": 0.4178673028945923, | |
| "eval_runtime": 1197.5534, | |
| "eval_samples_per_second": 0.528, | |
| "eval_steps_per_second": 0.528, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 1.3150410209662717, | |
| "grad_norm": 1.3996772766113281, | |
| "learning_rate": 6.391773016943316e-06, | |
| "loss": 0.4087896943092346, | |
| "step": 451 | |
| }, | |
| { | |
| "epoch": 1.3179580674567002, | |
| "grad_norm": 1.20390784740448, | |
| "learning_rate": 6.344332839644111e-06, | |
| "loss": 0.43224579095840454, | |
| "step": 452 | |
| }, | |
| { | |
| "epoch": 1.3208751139471286, | |
| "grad_norm": 1.2709496021270752, | |
| "learning_rate": 6.296987437635876e-06, | |
| "loss": 0.44104093313217163, | |
| "step": 453 | |
| }, | |
| { | |
| "epoch": 1.323792160437557, | |
| "grad_norm": 1.0112334489822388, | |
| "learning_rate": 6.249738038375618e-06, | |
| "loss": 0.47084498405456543, | |
| "step": 454 | |
| }, | |
| { | |
| "epoch": 1.3267092069279856, | |
| "grad_norm": 1.0771515369415283, | |
| "learning_rate": 6.202585866831411e-06, | |
| "loss": 0.4700928032398224, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 1.3296262534184138, | |
| "grad_norm": 1.4937143325805664, | |
| "learning_rate": 6.15553214545064e-06, | |
| "loss": 0.345747709274292, | |
| "step": 456 | |
| }, | |
| { | |
| "epoch": 1.3325432999088422, | |
| "grad_norm": 1.1348456144332886, | |
| "learning_rate": 6.108578094128321e-06, | |
| "loss": 0.33824583888053894, | |
| "step": 457 | |
| }, | |
| { | |
| "epoch": 1.3354603463992707, | |
| "grad_norm": 1.2502707242965698, | |
| "learning_rate": 6.061724930175461e-06, | |
| "loss": 0.3528832197189331, | |
| "step": 458 | |
| }, | |
| { | |
| "epoch": 1.3383773928896991, | |
| "grad_norm": 1.5359619855880737, | |
| "learning_rate": 6.014973868287504e-06, | |
| "loss": 0.4413869082927704, | |
| "step": 459 | |
| }, | |
| { | |
| "epoch": 1.3412944393801276, | |
| "grad_norm": 0.9747081398963928, | |
| "learning_rate": 5.9683261205128395e-06, | |
| "loss": 0.6849499940872192, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 1.344211485870556, | |
| "grad_norm": 1.3150533437728882, | |
| "learning_rate": 5.921782896221383e-06, | |
| "loss": 0.3901931047439575, | |
| "step": 461 | |
| }, | |
| { | |
| "epoch": 1.3471285323609845, | |
| "grad_norm": 1.137770652770996, | |
| "learning_rate": 5.875345402073207e-06, | |
| "loss": 0.37498384714126587, | |
| "step": 462 | |
| }, | |
| { | |
| "epoch": 1.350045578851413, | |
| "grad_norm": 1.2216367721557617, | |
| "learning_rate": 5.829014841987277e-06, | |
| "loss": 0.3874579966068268, | |
| "step": 463 | |
| }, | |
| { | |
| "epoch": 1.3529626253418414, | |
| "grad_norm": 1.135439157485962, | |
| "learning_rate": 5.782792417110233e-06, | |
| "loss": 0.384797066450119, | |
| "step": 464 | |
| }, | |
| { | |
| "epoch": 1.3558796718322699, | |
| "grad_norm": 1.2400696277618408, | |
| "learning_rate": 5.736679325785239e-06, | |
| "loss": 0.46303266286849976, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 1.3587967183226983, | |
| "grad_norm": 1.8848882913589478, | |
| "learning_rate": 5.6906767635209304e-06, | |
| "loss": 0.5068309903144836, | |
| "step": 466 | |
| }, | |
| { | |
| "epoch": 1.3617137648131268, | |
| "grad_norm": 1.4707008600234985, | |
| "learning_rate": 5.644785922960412e-06, | |
| "loss": 0.364332914352417, | |
| "step": 467 | |
| }, | |
| { | |
| "epoch": 1.364630811303555, | |
| "grad_norm": 2.4436841011047363, | |
| "learning_rate": 5.599007993850329e-06, | |
| "loss": 0.485107421875, | |
| "step": 468 | |
| }, | |
| { | |
| "epoch": 1.3675478577939835, | |
| "grad_norm": 1.1924740076065063, | |
| "learning_rate": 5.553344163010039e-06, | |
| "loss": 0.34547489881515503, | |
| "step": 469 | |
| }, | |
| { | |
| "epoch": 1.370464904284412, | |
| "grad_norm": 1.1255877017974854, | |
| "learning_rate": 5.507795614300846e-06, | |
| "loss": 0.39645254611968994, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 1.3733819507748404, | |
| "grad_norm": 1.0937018394470215, | |
| "learning_rate": 5.4623635285952815e-06, | |
| "loss": 0.4267856478691101, | |
| "step": 471 | |
| }, | |
| { | |
| "epoch": 1.3762989972652688, | |
| "grad_norm": 1.3355520963668823, | |
| "learning_rate": 5.417049083746513e-06, | |
| "loss": 0.3669992983341217, | |
| "step": 472 | |
| }, | |
| { | |
| "epoch": 1.3792160437556973, | |
| "grad_norm": 1.7302504777908325, | |
| "learning_rate": 5.3718534545578035e-06, | |
| "loss": 0.3873697519302368, | |
| "step": 473 | |
| }, | |
| { | |
| "epoch": 1.3821330902461257, | |
| "grad_norm": 1.17263662815094, | |
| "learning_rate": 5.326777812752041e-06, | |
| "loss": 0.4581540524959564, | |
| "step": 474 | |
| }, | |
| { | |
| "epoch": 1.3850501367365542, | |
| "grad_norm": 1.0998128652572632, | |
| "learning_rate": 5.281823326941377e-06, | |
| "loss": 0.43062761425971985, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 1.3879671832269826, | |
| "grad_norm": 1.1194556951522827, | |
| "learning_rate": 5.236991162596932e-06, | |
| "loss": 0.381741464138031, | |
| "step": 476 | |
| }, | |
| { | |
| "epoch": 1.390884229717411, | |
| "grad_norm": 1.2759051322937012, | |
| "learning_rate": 5.19228248201856e-06, | |
| "loss": 0.49175748229026794, | |
| "step": 477 | |
| }, | |
| { | |
| "epoch": 1.3938012762078396, | |
| "grad_norm": 1.2134747505187988, | |
| "learning_rate": 5.147698444304732e-06, | |
| "loss": 0.4997562766075134, | |
| "step": 478 | |
| }, | |
| { | |
| "epoch": 1.396718322698268, | |
| "grad_norm": 1.0833078622817993, | |
| "learning_rate": 5.1032402053224804e-06, | |
| "loss": 0.42580488324165344, | |
| "step": 479 | |
| }, | |
| { | |
| "epoch": 1.3996353691886965, | |
| "grad_norm": 1.4838510751724243, | |
| "learning_rate": 5.058908917677426e-06, | |
| "loss": 0.5015593767166138, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 1.402552415679125, | |
| "grad_norm": 1.218610167503357, | |
| "learning_rate": 5.014705730683904e-06, | |
| "loss": 0.34739193320274353, | |
| "step": 481 | |
| }, | |
| { | |
| "epoch": 1.4054694621695534, | |
| "grad_norm": 1.1883307695388794, | |
| "learning_rate": 4.970631790335181e-06, | |
| "loss": 0.41708022356033325, | |
| "step": 482 | |
| }, | |
| { | |
| "epoch": 1.4083865086599818, | |
| "grad_norm": 1.209291696548462, | |
| "learning_rate": 4.926688239273713e-06, | |
| "loss": 0.43546172976493835, | |
| "step": 483 | |
| }, | |
| { | |
| "epoch": 1.4113035551504103, | |
| "grad_norm": 1.0801606178283691, | |
| "learning_rate": 4.882876216761543e-06, | |
| "loss": 0.44491735100746155, | |
| "step": 484 | |
| }, | |
| { | |
| "epoch": 1.4142206016408387, | |
| "grad_norm": 1.2746628522872925, | |
| "learning_rate": 4.839196858650763e-06, | |
| "loss": 0.436122864484787, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 1.4171376481312672, | |
| "grad_norm": 1.4465962648391724, | |
| "learning_rate": 4.795651297354056e-06, | |
| "loss": 0.3750447630882263, | |
| "step": 486 | |
| }, | |
| { | |
| "epoch": 1.4200546946216956, | |
| "grad_norm": 1.6736211776733398, | |
| "learning_rate": 4.752240661815346e-06, | |
| "loss": 0.38286519050598145, | |
| "step": 487 | |
| }, | |
| { | |
| "epoch": 1.422971741112124, | |
| "grad_norm": 1.1946996450424194, | |
| "learning_rate": 4.708966077480544e-06, | |
| "loss": 0.4488063156604767, | |
| "step": 488 | |
| }, | |
| { | |
| "epoch": 1.4258887876025526, | |
| "grad_norm": 1.42599356174469, | |
| "learning_rate": 4.665828666268335e-06, | |
| "loss": 0.44088613986968994, | |
| "step": 489 | |
| }, | |
| { | |
| "epoch": 1.4288058340929808, | |
| "grad_norm": 1.2281016111373901, | |
| "learning_rate": 4.622829546541121e-06, | |
| "loss": 0.4030645489692688, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 1.4317228805834092, | |
| "grad_norm": 1.2875670194625854, | |
| "learning_rate": 4.57996983307602e-06, | |
| "loss": 0.44702020287513733, | |
| "step": 491 | |
| }, | |
| { | |
| "epoch": 1.4346399270738377, | |
| "grad_norm": 1.2456860542297363, | |
| "learning_rate": 4.537250637035947e-06, | |
| "loss": 0.4067370593547821, | |
| "step": 492 | |
| }, | |
| { | |
| "epoch": 1.4375569735642661, | |
| "grad_norm": 1.2822725772857666, | |
| "learning_rate": 4.494673065940833e-06, | |
| "loss": 0.4237740635871887, | |
| "step": 493 | |
| }, | |
| { | |
| "epoch": 1.4404740200546946, | |
| "grad_norm": 1.5517818927764893, | |
| "learning_rate": 4.452238223638906e-06, | |
| "loss": 0.40579724311828613, | |
| "step": 494 | |
| }, | |
| { | |
| "epoch": 1.443391066545123, | |
| "grad_norm": 1.275344967842102, | |
| "learning_rate": 4.409947210278056e-06, | |
| "loss": 0.38880717754364014, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 1.4463081130355515, | |
| "grad_norm": 1.22952139377594, | |
| "learning_rate": 4.367801122277327e-06, | |
| "loss": 0.4042310416698456, | |
| "step": 496 | |
| }, | |
| { | |
| "epoch": 1.44922515952598, | |
| "grad_norm": 1.122261643409729, | |
| "learning_rate": 4.325801052298493e-06, | |
| "loss": 0.5408368110656738, | |
| "step": 497 | |
| }, | |
| { | |
| "epoch": 1.4521422060164084, | |
| "grad_norm": 1.5885361433029175, | |
| "learning_rate": 4.283948089217715e-06, | |
| "loss": 0.37697717547416687, | |
| "step": 498 | |
| }, | |
| { | |
| "epoch": 1.4550592525068369, | |
| "grad_norm": 2.3565149307250977, | |
| "learning_rate": 4.242243318097338e-06, | |
| "loss": 0.3811529576778412, | |
| "step": 499 | |
| }, | |
| { | |
| "epoch": 1.4579762989972653, | |
| "grad_norm": 1.1944137811660767, | |
| "learning_rate": 4.200687820157735e-06, | |
| "loss": 0.414781391620636, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 1.4579762989972653, | |
| "eval_loss": 0.40706494450569153, | |
| "eval_runtime": 1189.1593, | |
| "eval_samples_per_second": 0.531, | |
| "eval_steps_per_second": 0.531, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 1.4608933454876938, | |
| "grad_norm": 1.0442464351654053, | |
| "learning_rate": 4.159282672749289e-06, | |
| "loss": 0.38155990839004517, | |
| "step": 501 | |
| }, | |
| { | |
| "epoch": 1.463810391978122, | |
| "grad_norm": 1.7274727821350098, | |
| "learning_rate": 4.118028949324453e-06, | |
| "loss": 0.4830601215362549, | |
| "step": 502 | |
| }, | |
| { | |
| "epoch": 1.4667274384685505, | |
| "grad_norm": 2.064513921737671, | |
| "learning_rate": 4.0769277194099345e-06, | |
| "loss": 0.3975123167037964, | |
| "step": 503 | |
| }, | |
| { | |
| "epoch": 1.469644484958979, | |
| "grad_norm": 1.7695534229278564, | |
| "learning_rate": 4.035980048578942e-06, | |
| "loss": 0.37033841013908386, | |
| "step": 504 | |
| }, | |
| { | |
| "epoch": 1.4725615314494074, | |
| "grad_norm": 1.4455046653747559, | |
| "learning_rate": 3.995186998423597e-06, | |
| "loss": 0.39567673206329346, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 1.4754785779398358, | |
| "grad_norm": 1.1791958808898926, | |
| "learning_rate": 3.9545496265273765e-06, | |
| "loss": 0.44786664843559265, | |
| "step": 506 | |
| }, | |
| { | |
| "epoch": 1.4783956244302643, | |
| "grad_norm": 2.0874717235565186, | |
| "learning_rate": 3.9140689864377105e-06, | |
| "loss": 0.3333263099193573, | |
| "step": 507 | |
| }, | |
| { | |
| "epoch": 1.4813126709206927, | |
| "grad_norm": 1.5897501707077026, | |
| "learning_rate": 3.873746127638668e-06, | |
| "loss": 0.5105943083763123, | |
| "step": 508 | |
| }, | |
| { | |
| "epoch": 1.4842297174111212, | |
| "grad_norm": 1.5059760808944702, | |
| "learning_rate": 3.833582095523749e-06, | |
| "loss": 0.43922683596611023, | |
| "step": 509 | |
| }, | |
| { | |
| "epoch": 1.4871467639015497, | |
| "grad_norm": 1.379347562789917, | |
| "learning_rate": 3.7935779313687648e-06, | |
| "loss": 0.4584790766239166, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 1.490063810391978, | |
| "grad_norm": 1.0984690189361572, | |
| "learning_rate": 3.7537346723048816e-06, | |
| "loss": 0.5217512249946594, | |
| "step": 511 | |
| }, | |
| { | |
| "epoch": 1.4929808568824066, | |
| "grad_norm": 1.5944225788116455, | |
| "learning_rate": 3.71405335129169e-06, | |
| "loss": 0.4180052876472473, | |
| "step": 512 | |
| }, | |
| { | |
| "epoch": 1.495897903372835, | |
| "grad_norm": 1.2745033502578735, | |
| "learning_rate": 3.6745349970904465e-06, | |
| "loss": 0.4584833085536957, | |
| "step": 513 | |
| }, | |
| { | |
| "epoch": 1.4988149498632635, | |
| "grad_norm": 1.2746814489364624, | |
| "learning_rate": 3.6351806342374007e-06, | |
| "loss": 0.3202287554740906, | |
| "step": 514 | |
| }, | |
| { | |
| "epoch": 1.501731996353692, | |
| "grad_norm": 1.409638524055481, | |
| "learning_rate": 3.5959912830172348e-06, | |
| "loss": 0.37963351607322693, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 1.5046490428441204, | |
| "grad_norm": 1.1655553579330444, | |
| "learning_rate": 3.556967959436591e-06, | |
| "loss": 0.43133026361465454, | |
| "step": 516 | |
| }, | |
| { | |
| "epoch": 1.5075660893345488, | |
| "grad_norm": 1.0495020151138306, | |
| "learning_rate": 3.518111675197776e-06, | |
| "loss": 0.3739299178123474, | |
| "step": 517 | |
| }, | |
| { | |
| "epoch": 1.5104831358249773, | |
| "grad_norm": 1.3055057525634766, | |
| "learning_rate": 3.4794234376724835e-06, | |
| "loss": 0.4099601209163666, | |
| "step": 518 | |
| }, | |
| { | |
| "epoch": 1.5134001823154057, | |
| "grad_norm": 1.2252463102340698, | |
| "learning_rate": 3.4409042498757084e-06, | |
| "loss": 0.380616158246994, | |
| "step": 519 | |
| }, | |
| { | |
| "epoch": 1.5163172288058342, | |
| "grad_norm": 1.2728638648986816, | |
| "learning_rate": 3.4025551104397294e-06, | |
| "loss": 0.3510003685951233, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 1.5192342752962626, | |
| "grad_norm": 2.70664644241333, | |
| "learning_rate": 3.3643770135882282e-06, | |
| "loss": 0.4087940752506256, | |
| "step": 521 | |
| }, | |
| { | |
| "epoch": 1.522151321786691, | |
| "grad_norm": 1.6197112798690796, | |
| "learning_rate": 3.3263709491104933e-06, | |
| "loss": 0.45614126324653625, | |
| "step": 522 | |
| }, | |
| { | |
| "epoch": 1.5250683682771196, | |
| "grad_norm": 1.3596103191375732, | |
| "learning_rate": 3.2885379023357956e-06, | |
| "loss": 0.3824586272239685, | |
| "step": 523 | |
| }, | |
| { | |
| "epoch": 1.527985414767548, | |
| "grad_norm": 1.1768635511398315, | |
| "learning_rate": 3.2508788541078097e-06, | |
| "loss": 0.47717779874801636, | |
| "step": 524 | |
| }, | |
| { | |
| "epoch": 1.5309024612579762, | |
| "grad_norm": 1.669474482536316, | |
| "learning_rate": 3.2133947807591958e-06, | |
| "loss": 0.4013281762599945, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 1.5338195077484047, | |
| "grad_norm": 1.600868582725525, | |
| "learning_rate": 3.1760866540862932e-06, | |
| "loss": 0.367280513048172, | |
| "step": 526 | |
| }, | |
| { | |
| "epoch": 1.5367365542388332, | |
| "grad_norm": 1.1689515113830566, | |
| "learning_rate": 3.138955441323923e-06, | |
| "loss": 0.4432409405708313, | |
| "step": 527 | |
| }, | |
| { | |
| "epoch": 1.5396536007292616, | |
| "grad_norm": 2.361961603164673, | |
| "learning_rate": 3.1020021051202973e-06, | |
| "loss": 0.4219942092895508, | |
| "step": 528 | |
| }, | |
| { | |
| "epoch": 1.54257064721969, | |
| "grad_norm": 1.1962230205535889, | |
| "learning_rate": 3.0652276035120964e-06, | |
| "loss": 0.3672596514225006, | |
| "step": 529 | |
| }, | |
| { | |
| "epoch": 1.5454876937101185, | |
| "grad_norm": 1.4149441719055176, | |
| "learning_rate": 3.0286328898995963e-06, | |
| "loss": 0.42919260263442993, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 1.548404740200547, | |
| "grad_norm": 1.2668434381484985, | |
| "learning_rate": 2.992218913021966e-06, | |
| "loss": 0.4499061107635498, | |
| "step": 531 | |
| }, | |
| { | |
| "epoch": 1.5513217866909754, | |
| "grad_norm": 1.268114686012268, | |
| "learning_rate": 2.9559866169326734e-06, | |
| "loss": 0.34660714864730835, | |
| "step": 532 | |
| }, | |
| { | |
| "epoch": 1.5542388331814039, | |
| "grad_norm": 1.0086419582366943, | |
| "learning_rate": 2.919936940975007e-06, | |
| "loss": 0.38239023089408875, | |
| "step": 533 | |
| }, | |
| { | |
| "epoch": 1.557155879671832, | |
| "grad_norm": 1.0700170993804932, | |
| "learning_rate": 2.884070819757712e-06, | |
| "loss": 0.48240017890930176, | |
| "step": 534 | |
| }, | |
| { | |
| "epoch": 1.5600729261622606, | |
| "grad_norm": 1.2101227045059204, | |
| "learning_rate": 2.8483891831307873e-06, | |
| "loss": 0.4098761975765228, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 1.562989972652689, | |
| "grad_norm": 1.2731400728225708, | |
| "learning_rate": 2.8128929561613505e-06, | |
| "loss": 0.45641395449638367, | |
| "step": 536 | |
| }, | |
| { | |
| "epoch": 1.5659070191431175, | |
| "grad_norm": 1.1474392414093018, | |
| "learning_rate": 2.777583059109671e-06, | |
| "loss": 0.42283985018730164, | |
| "step": 537 | |
| }, | |
| { | |
| "epoch": 1.568824065633546, | |
| "grad_norm": 1.789881944656372, | |
| "learning_rate": 2.7424604074053028e-06, | |
| "loss": 0.3469158113002777, | |
| "step": 538 | |
| }, | |
| { | |
| "epoch": 1.5717411121239744, | |
| "grad_norm": 1.3426933288574219, | |
| "learning_rate": 2.707525911623362e-06, | |
| "loss": 0.35837510228157043, | |
| "step": 539 | |
| }, | |
| { | |
| "epoch": 1.5746581586144028, | |
| "grad_norm": 1.2343578338623047, | |
| "learning_rate": 2.672780477460901e-06, | |
| "loss": 0.4736083745956421, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 1.5775752051048313, | |
| "grad_norm": 1.516298770904541, | |
| "learning_rate": 2.638225005713457e-06, | |
| "loss": 0.34345340728759766, | |
| "step": 541 | |
| }, | |
| { | |
| "epoch": 1.5804922515952597, | |
| "grad_norm": 1.1488829851150513, | |
| "learning_rate": 2.6038603922516705e-06, | |
| "loss": 0.4134179949760437, | |
| "step": 542 | |
| }, | |
| { | |
| "epoch": 1.5834092980856882, | |
| "grad_norm": 1.4486491680145264, | |
| "learning_rate": 2.569687527998073e-06, | |
| "loss": 0.3297592103481293, | |
| "step": 543 | |
| }, | |
| { | |
| "epoch": 1.5863263445761167, | |
| "grad_norm": 1.272691011428833, | |
| "learning_rate": 2.5357072989039855e-06, | |
| "loss": 0.3958476185798645, | |
| "step": 544 | |
| }, | |
| { | |
| "epoch": 1.589243391066545, | |
| "grad_norm": 1.244240641593933, | |
| "learning_rate": 2.501920585926555e-06, | |
| "loss": 0.4125611186027527, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 1.5921604375569736, | |
| "grad_norm": 1.5844073295593262, | |
| "learning_rate": 2.4683282650058992e-06, | |
| "loss": 0.3762253224849701, | |
| "step": 546 | |
| }, | |
| { | |
| "epoch": 1.595077484047402, | |
| "grad_norm": 1.8209946155548096, | |
| "learning_rate": 2.4349312070424258e-06, | |
| "loss": 0.37053319811820984, | |
| "step": 547 | |
| }, | |
| { | |
| "epoch": 1.5979945305378305, | |
| "grad_norm": 1.3752915859222412, | |
| "learning_rate": 2.4017302778742247e-06, | |
| "loss": 0.5004774332046509, | |
| "step": 548 | |
| }, | |
| { | |
| "epoch": 1.600911577028259, | |
| "grad_norm": 5.143753528594971, | |
| "learning_rate": 2.36872633825464e-06, | |
| "loss": 0.39014023542404175, | |
| "step": 549 | |
| }, | |
| { | |
| "epoch": 1.6038286235186874, | |
| "grad_norm": 1.0730944871902466, | |
| "learning_rate": 2.335920243829941e-06, | |
| "loss": 0.378440260887146, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 1.6038286235186874, | |
| "eval_loss": 0.40037089586257935, | |
| "eval_runtime": 893.7411, | |
| "eval_samples_per_second": 0.707, | |
| "eval_steps_per_second": 0.707, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 1.6067456700091158, | |
| "grad_norm": 1.5507797002792358, | |
| "learning_rate": 2.3033128451171548e-06, | |
| "loss": 0.4471960663795471, | |
| "step": 551 | |
| }, | |
| { | |
| "epoch": 1.6096627164995443, | |
| "grad_norm": 1.9462968111038208, | |
| "learning_rate": 2.2709049874819924e-06, | |
| "loss": 0.3658301830291748, | |
| "step": 552 | |
| }, | |
| { | |
| "epoch": 1.6125797629899727, | |
| "grad_norm": 1.2034238576889038, | |
| "learning_rate": 2.238697511116962e-06, | |
| "loss": 0.3911179304122925, | |
| "step": 553 | |
| }, | |
| { | |
| "epoch": 1.6154968094804012, | |
| "grad_norm": 1.3574327230453491, | |
| "learning_rate": 2.2066912510195636e-06, | |
| "loss": 0.3998897671699524, | |
| "step": 554 | |
| }, | |
| { | |
| "epoch": 1.6184138559708297, | |
| "grad_norm": 1.1973012685775757, | |
| "learning_rate": 2.1748870369706507e-06, | |
| "loss": 0.38577449321746826, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 1.621330902461258, | |
| "grad_norm": 1.9365874528884888, | |
| "learning_rate": 2.1432856935129144e-06, | |
| "loss": 0.411307156085968, | |
| "step": 556 | |
| }, | |
| { | |
| "epoch": 1.6242479489516866, | |
| "grad_norm": 1.3558642864227295, | |
| "learning_rate": 2.1118880399295106e-06, | |
| "loss": 0.38424253463745117, | |
| "step": 557 | |
| }, | |
| { | |
| "epoch": 1.627164995442115, | |
| "grad_norm": 1.4368890523910522, | |
| "learning_rate": 2.0806948902228075e-06, | |
| "loss": 0.39943546056747437, | |
| "step": 558 | |
| }, | |
| { | |
| "epoch": 1.6300820419325432, | |
| "grad_norm": 1.6266753673553467, | |
| "learning_rate": 2.0497070530933084e-06, | |
| "loss": 0.36787641048431396, | |
| "step": 559 | |
| }, | |
| { | |
| "epoch": 1.6329990884229717, | |
| "grad_norm": 1.2600938081741333, | |
| "learning_rate": 2.0189253319186576e-06, | |
| "loss": 0.3781934380531311, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 1.6359161349134002, | |
| "grad_norm": 1.975071907043457, | |
| "learning_rate": 1.9883505247328237e-06, | |
| "loss": 0.4132305383682251, | |
| "step": 561 | |
| }, | |
| { | |
| "epoch": 1.6388331814038286, | |
| "grad_norm": 1.4095909595489502, | |
| "learning_rate": 1.9579834242054154e-06, | |
| "loss": 0.3727574646472931, | |
| "step": 562 | |
| }, | |
| { | |
| "epoch": 1.641750227894257, | |
| "grad_norm": 1.4271371364593506, | |
| "learning_rate": 1.9278248176211243e-06, | |
| "loss": 0.33786773681640625, | |
| "step": 563 | |
| }, | |
| { | |
| "epoch": 1.6446672743846855, | |
| "grad_norm": 1.5907646417617798, | |
| "learning_rate": 1.8978754868593074e-06, | |
| "loss": 0.33035099506378174, | |
| "step": 564 | |
| }, | |
| { | |
| "epoch": 1.647584320875114, | |
| "grad_norm": 1.1315702199935913, | |
| "learning_rate": 1.8681362083737387e-06, | |
| "loss": 0.41707149147987366, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 1.6505013673655424, | |
| "grad_norm": 1.4737143516540527, | |
| "learning_rate": 1.8386077531724556e-06, | |
| "loss": 0.43079230189323425, | |
| "step": 566 | |
| }, | |
| { | |
| "epoch": 1.6534184138559709, | |
| "grad_norm": 1.1006760597229004, | |
| "learning_rate": 1.8092908867977822e-06, | |
| "loss": 0.3524904251098633, | |
| "step": 567 | |
| }, | |
| { | |
| "epoch": 1.6563354603463991, | |
| "grad_norm": 1.4066118001937866, | |
| "learning_rate": 1.780186369306479e-06, | |
| "loss": 0.3695681691169739, | |
| "step": 568 | |
| }, | |
| { | |
| "epoch": 1.6592525068368276, | |
| "grad_norm": 1.6444640159606934, | |
| "learning_rate": 1.7512949552500412e-06, | |
| "loss": 0.35596007108688354, | |
| "step": 569 | |
| }, | |
| { | |
| "epoch": 1.662169553327256, | |
| "grad_norm": 1.159480094909668, | |
| "learning_rate": 1.7226173936551282e-06, | |
| "loss": 0.4520571827888489, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 1.6650865998176845, | |
| "grad_norm": 1.5874221324920654, | |
| "learning_rate": 1.6941544280041567e-06, | |
| "loss": 0.4702282249927521, | |
| "step": 571 | |
| }, | |
| { | |
| "epoch": 1.668003646308113, | |
| "grad_norm": 1.6153535842895508, | |
| "learning_rate": 1.6659067962160157e-06, | |
| "loss": 0.3803800046443939, | |
| "step": 572 | |
| }, | |
| { | |
| "epoch": 1.6709206927985414, | |
| "grad_norm": 1.0748940706253052, | |
| "learning_rate": 1.6378752306269386e-06, | |
| "loss": 0.4368419051170349, | |
| "step": 573 | |
| }, | |
| { | |
| "epoch": 1.6738377392889698, | |
| "grad_norm": 1.5286788940429688, | |
| "learning_rate": 1.6100604579715185e-06, | |
| "loss": 0.4195623993873596, | |
| "step": 574 | |
| }, | |
| { | |
| "epoch": 1.6767547857793983, | |
| "grad_norm": 1.1433510780334473, | |
| "learning_rate": 1.5824631993638651e-06, | |
| "loss": 0.4366849660873413, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 1.6796718322698267, | |
| "grad_norm": 1.9694907665252686, | |
| "learning_rate": 1.5550841702789122e-06, | |
| "loss": 0.5555303692817688, | |
| "step": 576 | |
| }, | |
| { | |
| "epoch": 1.6825888787602552, | |
| "grad_norm": 1.7587188482284546, | |
| "learning_rate": 1.5279240805338647e-06, | |
| "loss": 0.40394848585128784, | |
| "step": 577 | |
| }, | |
| { | |
| "epoch": 1.6855059252506837, | |
| "grad_norm": 1.063381314277649, | |
| "learning_rate": 1.5009836342697993e-06, | |
| "loss": 0.49564215540885925, | |
| "step": 578 | |
| }, | |
| { | |
| "epoch": 1.688422971741112, | |
| "grad_norm": 1.1742531061172485, | |
| "learning_rate": 1.4742635299334063e-06, | |
| "loss": 0.3891904950141907, | |
| "step": 579 | |
| }, | |
| { | |
| "epoch": 1.6913400182315406, | |
| "grad_norm": 1.499934196472168, | |
| "learning_rate": 1.4477644602588848e-06, | |
| "loss": 0.35497623682022095, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 1.694257064721969, | |
| "grad_norm": 1.5112360715866089, | |
| "learning_rate": 1.421487112249984e-06, | |
| "loss": 0.4062272012233734, | |
| "step": 581 | |
| }, | |
| { | |
| "epoch": 1.6971741112123975, | |
| "grad_norm": 1.3583141565322876, | |
| "learning_rate": 1.3954321671621885e-06, | |
| "loss": 0.3655265271663666, | |
| "step": 582 | |
| }, | |
| { | |
| "epoch": 1.700091157702826, | |
| "grad_norm": 2.8181653022766113, | |
| "learning_rate": 1.3696003004850577e-06, | |
| "loss": 0.37418332695961, | |
| "step": 583 | |
| }, | |
| { | |
| "epoch": 1.7030082041932544, | |
| "grad_norm": 0.967166543006897, | |
| "learning_rate": 1.3439921819247138e-06, | |
| "loss": 0.4946930408477783, | |
| "step": 584 | |
| }, | |
| { | |
| "epoch": 1.7059252506836828, | |
| "grad_norm": 1.2773699760437012, | |
| "learning_rate": 1.3186084753864813e-06, | |
| "loss": 0.5101871490478516, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 1.7088422971741113, | |
| "grad_norm": 1.2814991474151611, | |
| "learning_rate": 1.293449838957671e-06, | |
| "loss": 0.3688133656978607, | |
| "step": 586 | |
| }, | |
| { | |
| "epoch": 1.7117593436645397, | |
| "grad_norm": 1.594966173171997, | |
| "learning_rate": 1.2685169248905228e-06, | |
| "loss": 0.4739398956298828, | |
| "step": 587 | |
| }, | |
| { | |
| "epoch": 1.7146763901549682, | |
| "grad_norm": 1.1471531391143799, | |
| "learning_rate": 1.2438103795852885e-06, | |
| "loss": 0.3719588816165924, | |
| "step": 588 | |
| }, | |
| { | |
| "epoch": 1.7175934366453967, | |
| "grad_norm": 1.1657356023788452, | |
| "learning_rate": 1.2193308435734852e-06, | |
| "loss": 0.4119298458099365, | |
| "step": 589 | |
| }, | |
| { | |
| "epoch": 1.720510483135825, | |
| "grad_norm": 1.1239042282104492, | |
| "learning_rate": 1.1950789515012783e-06, | |
| "loss": 0.38277503848075867, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 1.7234275296262536, | |
| "grad_norm": 1.149478554725647, | |
| "learning_rate": 1.1710553321130324e-06, | |
| "loss": 0.35080626606941223, | |
| "step": 591 | |
| }, | |
| { | |
| "epoch": 1.726344576116682, | |
| "grad_norm": 1.2020260095596313, | |
| "learning_rate": 1.1472606082350112e-06, | |
| "loss": 0.3991318345069885, | |
| "step": 592 | |
| }, | |
| { | |
| "epoch": 1.7292616226071102, | |
| "grad_norm": 1.101475477218628, | |
| "learning_rate": 1.123695396759229e-06, | |
| "loss": 0.45791420340538025, | |
| "step": 593 | |
| }, | |
| { | |
| "epoch": 1.7321786690975387, | |
| "grad_norm": 0.9617101550102234, | |
| "learning_rate": 1.1003603086274584e-06, | |
| "loss": 0.39805036783218384, | |
| "step": 594 | |
| }, | |
| { | |
| "epoch": 1.7350957155879672, | |
| "grad_norm": 1.1439731121063232, | |
| "learning_rate": 1.07725594881539e-06, | |
| "loss": 0.35753339529037476, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 1.7380127620783956, | |
| "grad_norm": 1.0350618362426758, | |
| "learning_rate": 1.0543829163169516e-06, | |
| "loss": 0.42581748962402344, | |
| "step": 596 | |
| }, | |
| { | |
| "epoch": 1.740929808568824, | |
| "grad_norm": 1.2865227460861206, | |
| "learning_rate": 1.031741804128773e-06, | |
| "loss": 0.34685325622558594, | |
| "step": 597 | |
| }, | |
| { | |
| "epoch": 1.7438468550592525, | |
| "grad_norm": 1.2079373598098755, | |
| "learning_rate": 1.0093331992348154e-06, | |
| "loss": 0.48401936888694763, | |
| "step": 598 | |
| }, | |
| { | |
| "epoch": 1.746763901549681, | |
| "grad_norm": 1.1684436798095703, | |
| "learning_rate": 9.871576825911577e-07, | |
| "loss": 0.387456476688385, | |
| "step": 599 | |
| }, | |
| { | |
| "epoch": 1.7496809480401094, | |
| "grad_norm": 1.298045039176941, | |
| "learning_rate": 9.65215829110927e-07, | |
| "loss": 0.40196847915649414, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.7496809480401094, | |
| "eval_loss": 0.3965963125228882, | |
| "eval_runtime": 912.3102, | |
| "eval_samples_per_second": 0.693, | |
| "eval_steps_per_second": 0.693, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.7525979945305379, | |
| "grad_norm": 1.24501371383667, | |
| "learning_rate": 9.435082076493974e-07, | |
| "loss": 0.3990224003791809, | |
| "step": 601 | |
| }, | |
| { | |
| "epoch": 1.7555150410209661, | |
| "grad_norm": 1.0634632110595703, | |
| "learning_rate": 9.220353809892435e-07, | |
| "loss": 0.44232451915740967, | |
| "step": 602 | |
| }, | |
| { | |
| "epoch": 1.7584320875113946, | |
| "grad_norm": 1.0276325941085815, | |
| "learning_rate": 9.007979058259475e-07, | |
| "loss": 0.5336061716079712, | |
| "step": 603 | |
| }, | |
| { | |
| "epoch": 1.761349134001823, | |
| "grad_norm": 1.1488786935806274, | |
| "learning_rate": 8.797963327533698e-07, | |
| "loss": 0.35023194551467896, | |
| "step": 604 | |
| }, | |
| { | |
| "epoch": 1.7642661804922515, | |
| "grad_norm": 1.171109676361084, | |
| "learning_rate": 8.590312062494699e-07, | |
| "loss": 0.4461829662322998, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 1.76718322698268, | |
| "grad_norm": 1.3948134183883667, | |
| "learning_rate": 8.385030646621938e-07, | |
| "loss": 0.3448236584663391, | |
| "step": 606 | |
| }, | |
| { | |
| "epoch": 1.7701002734731084, | |
| "grad_norm": 1.144608497619629, | |
| "learning_rate": 8.18212440195515e-07, | |
| "loss": 0.39913487434387207, | |
| "step": 607 | |
| }, | |
| { | |
| "epoch": 1.7730173199635368, | |
| "grad_norm": 1.1941088438034058, | |
| "learning_rate": 7.981598588956396e-07, | |
| "loss": 0.40005186200141907, | |
| "step": 608 | |
| }, | |
| { | |
| "epoch": 1.7759343664539653, | |
| "grad_norm": 1.1087690591812134, | |
| "learning_rate": 7.783458406373656e-07, | |
| "loss": 0.38895174860954285, | |
| "step": 609 | |
| }, | |
| { | |
| "epoch": 1.7788514129443938, | |
| "grad_norm": 1.1787676811218262, | |
| "learning_rate": 7.587708991106069e-07, | |
| "loss": 0.36259594559669495, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 1.7817684594348222, | |
| "grad_norm": 1.1265360116958618, | |
| "learning_rate": 7.394355418070731e-07, | |
| "loss": 0.44475269317626953, | |
| "step": 611 | |
| }, | |
| { | |
| "epoch": 1.7846855059252507, | |
| "grad_norm": 1.2230898141860962, | |
| "learning_rate": 7.203402700071138e-07, | |
| "loss": 0.3823542594909668, | |
| "step": 612 | |
| }, | |
| { | |
| "epoch": 1.7876025524156791, | |
| "grad_norm": 1.0893492698669434, | |
| "learning_rate": 7.01485578766724e-07, | |
| "loss": 0.43276944756507874, | |
| "step": 613 | |
| }, | |
| { | |
| "epoch": 1.7905195989061076, | |
| "grad_norm": 1.039494514465332, | |
| "learning_rate": 6.828719569047082e-07, | |
| "loss": 0.5362570881843567, | |
| "step": 614 | |
| }, | |
| { | |
| "epoch": 1.793436645396536, | |
| "grad_norm": 1.0307413339614868, | |
| "learning_rate": 6.644998869900054e-07, | |
| "loss": 0.34828731417655945, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 1.7963536918869645, | |
| "grad_norm": 1.1253540515899658, | |
| "learning_rate": 6.463698453291823e-07, | |
| "loss": 0.3669811487197876, | |
| "step": 616 | |
| }, | |
| { | |
| "epoch": 1.799270738377393, | |
| "grad_norm": 1.1103028059005737, | |
| "learning_rate": 6.28482301954082e-07, | |
| "loss": 0.3868233561515808, | |
| "step": 617 | |
| }, | |
| { | |
| "epoch": 1.8021877848678214, | |
| "grad_norm": 1.0804798603057861, | |
| "learning_rate": 6.108377206096394e-07, | |
| "loss": 0.4123673439025879, | |
| "step": 618 | |
| }, | |
| { | |
| "epoch": 1.8051048313582498, | |
| "grad_norm": 1.1068788766860962, | |
| "learning_rate": 5.934365587418567e-07, | |
| "loss": 0.44468799233436584, | |
| "step": 619 | |
| }, | |
| { | |
| "epoch": 1.8080218778486783, | |
| "grad_norm": 1.0318645238876343, | |
| "learning_rate": 5.762792674859474e-07, | |
| "loss": 0.3586595356464386, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 1.8109389243391067, | |
| "grad_norm": 1.1553035974502563, | |
| "learning_rate": 5.593662916546361e-07, | |
| "loss": 0.4580552577972412, | |
| "step": 621 | |
| }, | |
| { | |
| "epoch": 1.8138559708295352, | |
| "grad_norm": 1.3010531663894653, | |
| "learning_rate": 5.426980697266271e-07, | |
| "loss": 0.42412641644477844, | |
| "step": 622 | |
| }, | |
| { | |
| "epoch": 1.8167730173199637, | |
| "grad_norm": 1.1858006715774536, | |
| "learning_rate": 5.262750338352418e-07, | |
| "loss": 0.38257676362991333, | |
| "step": 623 | |
| }, | |
| { | |
| "epoch": 1.8196900638103921, | |
| "grad_norm": 1.1341536045074463, | |
| "learning_rate": 5.100976097572074e-07, | |
| "loss": 0.48365846276283264, | |
| "step": 624 | |
| }, | |
| { | |
| "epoch": 1.8226071103008206, | |
| "grad_norm": 1.112844467163086, | |
| "learning_rate": 4.941662169016237e-07, | |
| "loss": 0.3893233835697174, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 1.825524156791249, | |
| "grad_norm": 1.1846497058868408, | |
| "learning_rate": 4.784812682990903e-07, | |
| "loss": 0.38869139552116394, | |
| "step": 626 | |
| }, | |
| { | |
| "epoch": 1.8284412032816773, | |
| "grad_norm": 1.1383928060531616, | |
| "learning_rate": 4.6304317059099326e-07, | |
| "loss": 0.36156678199768066, | |
| "step": 627 | |
| }, | |
| { | |
| "epoch": 1.8313582497721057, | |
| "grad_norm": 1.0891298055648804, | |
| "learning_rate": 4.478523240189703e-07, | |
| "loss": 0.40910348296165466, | |
| "step": 628 | |
| }, | |
| { | |
| "epoch": 1.8342752962625342, | |
| "grad_norm": 1.1337662935256958, | |
| "learning_rate": 4.3290912241452545e-07, | |
| "loss": 0.3360365629196167, | |
| "step": 629 | |
| }, | |
| { | |
| "epoch": 1.8371923427529626, | |
| "grad_norm": 1.280463695526123, | |
| "learning_rate": 4.182139531888263e-07, | |
| "loss": 0.44318532943725586, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 1.840109389243391, | |
| "grad_norm": 1.1408170461654663, | |
| "learning_rate": 4.0376719732265647e-07, | |
| "loss": 0.37003564834594727, | |
| "step": 631 | |
| }, | |
| { | |
| "epoch": 1.8430264357338195, | |
| "grad_norm": 0.9730168581008911, | |
| "learning_rate": 3.8956922935653895e-07, | |
| "loss": 0.355985552072525, | |
| "step": 632 | |
| }, | |
| { | |
| "epoch": 1.845943482224248, | |
| "grad_norm": 1.0643151998519897, | |
| "learning_rate": 3.756204173810263e-07, | |
| "loss": 0.3911808729171753, | |
| "step": 633 | |
| }, | |
| { | |
| "epoch": 1.8488605287146764, | |
| "grad_norm": 1.1769851446151733, | |
| "learning_rate": 3.61921123027158e-07, | |
| "loss": 0.314385324716568, | |
| "step": 634 | |
| }, | |
| { | |
| "epoch": 1.8517775752051049, | |
| "grad_norm": 0.921336829662323, | |
| "learning_rate": 3.484717014570838e-07, | |
| "loss": 0.3375144302845001, | |
| "step": 635 | |
| }, | |
| { | |
| "epoch": 1.8546946216955331, | |
| "grad_norm": 0.9904773235321045, | |
| "learning_rate": 3.3527250135485744e-07, | |
| "loss": 0.4461369514465332, | |
| "step": 636 | |
| }, | |
| { | |
| "epoch": 1.8576116681859616, | |
| "grad_norm": 1.0844534635543823, | |
| "learning_rate": 3.223238649173954e-07, | |
| "loss": 0.398414671421051, | |
| "step": 637 | |
| }, | |
| { | |
| "epoch": 1.86052871467639, | |
| "grad_norm": 0.9829220771789551, | |
| "learning_rate": 3.096261278456048e-07, | |
| "loss": 0.35938704013824463, | |
| "step": 638 | |
| }, | |
| { | |
| "epoch": 1.8634457611668185, | |
| "grad_norm": 1.13048255443573, | |
| "learning_rate": 2.971796193356835e-07, | |
| "loss": 0.3783624768257141, | |
| "step": 639 | |
| }, | |
| { | |
| "epoch": 1.866362807657247, | |
| "grad_norm": 1.4307893514633179, | |
| "learning_rate": 2.8498466207058095e-07, | |
| "loss": 0.3601874113082886, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 1.8692798541476754, | |
| "grad_norm": 1.1835116147994995, | |
| "learning_rate": 2.7304157221163753e-07, | |
| "loss": 0.43897169828414917, | |
| "step": 641 | |
| }, | |
| { | |
| "epoch": 1.8721969006381038, | |
| "grad_norm": 1.0730469226837158, | |
| "learning_rate": 2.613506593903825e-07, | |
| "loss": 0.4407995343208313, | |
| "step": 642 | |
| }, | |
| { | |
| "epoch": 1.8751139471285323, | |
| "grad_norm": 0.9504678845405579, | |
| "learning_rate": 2.499122267005105e-07, | |
| "loss": 0.4105035960674286, | |
| "step": 643 | |
| }, | |
| { | |
| "epoch": 1.8780309936189608, | |
| "grad_norm": 1.2599385976791382, | |
| "learning_rate": 2.387265706900199e-07, | |
| "loss": 0.41521430015563965, | |
| "step": 644 | |
| }, | |
| { | |
| "epoch": 1.8809480401093892, | |
| "grad_norm": 1.035783052444458, | |
| "learning_rate": 2.2779398135353127e-07, | |
| "loss": 0.33491846919059753, | |
| "step": 645 | |
| }, | |
| { | |
| "epoch": 1.8838650865998177, | |
| "grad_norm": 1.1612690687179565, | |
| "learning_rate": 2.1711474212476325e-07, | |
| "loss": 0.3367970287799835, | |
| "step": 646 | |
| }, | |
| { | |
| "epoch": 1.8867821330902461, | |
| "grad_norm": 1.2541207075119019, | |
| "learning_rate": 2.066891298691831e-07, | |
| "loss": 0.46374717354774475, | |
| "step": 647 | |
| }, | |
| { | |
| "epoch": 1.8896991795806746, | |
| "grad_norm": 1.1037088632583618, | |
| "learning_rate": 1.9651741487683562e-07, | |
| "loss": 0.3799871802330017, | |
| "step": 648 | |
| }, | |
| { | |
| "epoch": 1.892616226071103, | |
| "grad_norm": 1.3611476421356201, | |
| "learning_rate": 1.8659986085532988e-07, | |
| "loss": 0.40523889660835266, | |
| "step": 649 | |
| }, | |
| { | |
| "epoch": 1.8955332725615315, | |
| "grad_norm": 1.1628823280334473, | |
| "learning_rate": 1.7693672492300473e-07, | |
| "loss": 0.38399839401245117, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 1.8955332725615315, | |
| "eval_loss": 0.3949255049228668, | |
| "eval_runtime": 903.6455, | |
| "eval_samples_per_second": 0.699, | |
| "eval_steps_per_second": 0.699, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 1.89845031905196, | |
| "grad_norm": 1.1185522079467773, | |
| "learning_rate": 1.675282576022641e-07, | |
| "loss": 0.4280855059623718, | |
| "step": 651 | |
| }, | |
| { | |
| "epoch": 1.9013673655423884, | |
| "grad_norm": 1.1962717771530151, | |
| "learning_rate": 1.5837470281307666e-07, | |
| "loss": 0.3026162087917328, | |
| "step": 652 | |
| }, | |
| { | |
| "epoch": 1.9042844120328168, | |
| "grad_norm": 1.1818240880966187, | |
| "learning_rate": 1.4947629786666084e-07, | |
| "loss": 0.43283963203430176, | |
| "step": 653 | |
| }, | |
| { | |
| "epoch": 1.9072014585232453, | |
| "grad_norm": 1.161944031715393, | |
| "learning_rate": 1.4083327345932208e-07, | |
| "loss": 0.435259610414505, | |
| "step": 654 | |
| }, | |
| { | |
| "epoch": 1.9101185050136738, | |
| "grad_norm": 1.1311709880828857, | |
| "learning_rate": 1.32445853666483e-07, | |
| "loss": 0.3258042633533478, | |
| "step": 655 | |
| }, | |
| { | |
| "epoch": 1.9130355515041022, | |
| "grad_norm": 1.0152852535247803, | |
| "learning_rate": 1.2431425593686263e-07, | |
| "loss": 0.40951770544052124, | |
| "step": 656 | |
| }, | |
| { | |
| "epoch": 1.9159525979945307, | |
| "grad_norm": 1.2698794603347778, | |
| "learning_rate": 1.164386910868498e-07, | |
| "loss": 0.3610893785953522, | |
| "step": 657 | |
| }, | |
| { | |
| "epoch": 1.9188696444849591, | |
| "grad_norm": 1.1092722415924072, | |
| "learning_rate": 1.0881936329502851e-07, | |
| "loss": 0.31951773166656494, | |
| "step": 658 | |
| }, | |
| { | |
| "epoch": 1.9217866909753876, | |
| "grad_norm": 1.2378597259521484, | |
| "learning_rate": 1.0145647009689008e-07, | |
| "loss": 0.3756055235862732, | |
| "step": 659 | |
| }, | |
| { | |
| "epoch": 1.924703737465816, | |
| "grad_norm": 1.0100237131118774, | |
| "learning_rate": 9.43502023797116e-08, | |
| "loss": 0.26117536425590515, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 1.9276207839562443, | |
| "grad_norm": 1.2368487119674683, | |
| "learning_rate": 8.750074437760325e-08, | |
| "loss": 0.3092282712459564, | |
| "step": 661 | |
| }, | |
| { | |
| "epoch": 1.9305378304466727, | |
| "grad_norm": 1.0328837633132935, | |
| "learning_rate": 8.090827366673548e-08, | |
| "loss": 0.4076297879219055, | |
| "step": 662 | |
| }, | |
| { | |
| "epoch": 1.9334548769371012, | |
| "grad_norm": 0.9885771870613098, | |
| "learning_rate": 7.457296116073487e-08, | |
| "loss": 0.40007251501083374, | |
| "step": 663 | |
| }, | |
| { | |
| "epoch": 1.9363719234275296, | |
| "grad_norm": 1.19287109375, | |
| "learning_rate": 6.849497110625214e-08, | |
| "loss": 0.3751019239425659, | |
| "step": 664 | |
| }, | |
| { | |
| "epoch": 1.939288969917958, | |
| "grad_norm": 1.134682536125183, | |
| "learning_rate": 6.267446107870334e-08, | |
| "loss": 0.4558236300945282, | |
| "step": 665 | |
| }, | |
| { | |
| "epoch": 1.9422060164083865, | |
| "grad_norm": 3.414883852005005, | |
| "learning_rate": 5.7111581978185336e-08, | |
| "loss": 0.5070392489433289, | |
| "step": 666 | |
| }, | |
| { | |
| "epoch": 1.945123062898815, | |
| "grad_norm": 1.179479956626892, | |
| "learning_rate": 5.180647802556671e-08, | |
| "loss": 0.389989972114563, | |
| "step": 667 | |
| }, | |
| { | |
| "epoch": 1.9480401093892434, | |
| "grad_norm": 1.1473273038864136, | |
| "learning_rate": 4.675928675874186e-08, | |
| "loss": 0.460910826921463, | |
| "step": 668 | |
| }, | |
| { | |
| "epoch": 1.9509571558796717, | |
| "grad_norm": 0.9269355535507202, | |
| "learning_rate": 4.197013902907165e-08, | |
| "loss": 0.5488728284835815, | |
| "step": 669 | |
| }, | |
| { | |
| "epoch": 1.9538742023701001, | |
| "grad_norm": 1.1781370639801025, | |
| "learning_rate": 3.7439158997989445e-08, | |
| "loss": 0.39483463764190674, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 1.9567912488605286, | |
| "grad_norm": 1.1759430170059204, | |
| "learning_rate": 3.316646413377811e-08, | |
| "loss": 0.38600990176200867, | |
| "step": 671 | |
| }, | |
| { | |
| "epoch": 1.959708295350957, | |
| "grad_norm": 1.1981792449951172, | |
| "learning_rate": 2.9152165208529147e-08, | |
| "loss": 0.4657193422317505, | |
| "step": 672 | |
| }, | |
| { | |
| "epoch": 1.9626253418413855, | |
| "grad_norm": 1.186043620109558, | |
| "learning_rate": 2.5396366295272756e-08, | |
| "loss": 0.46212077140808105, | |
| "step": 673 | |
| }, | |
| { | |
| "epoch": 1.965542388331814, | |
| "grad_norm": 1.115103840827942, | |
| "learning_rate": 2.1899164765271096e-08, | |
| "loss": 0.4416077733039856, | |
| "step": 674 | |
| }, | |
| { | |
| "epoch": 1.9684594348222424, | |
| "grad_norm": 1.2150691747665405, | |
| "learning_rate": 1.866065128550365e-08, | |
| "loss": 0.3557685911655426, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 1.9713764813126708, | |
| "grad_norm": 1.096506953239441, | |
| "learning_rate": 1.5680909816309098e-08, | |
| "loss": 0.32865390181541443, | |
| "step": 676 | |
| }, | |
| { | |
| "epoch": 1.9742935278030993, | |
| "grad_norm": 1.0974191427230835, | |
| "learning_rate": 1.2960017609213727e-08, | |
| "loss": 0.37568721175193787, | |
| "step": 677 | |
| }, | |
| { | |
| "epoch": 1.9772105742935278, | |
| "grad_norm": 1.1290082931518555, | |
| "learning_rate": 1.0498045204924145e-08, | |
| "loss": 0.329836905002594, | |
| "step": 678 | |
| }, | |
| { | |
| "epoch": 1.9801276207839562, | |
| "grad_norm": 1.0609803199768066, | |
| "learning_rate": 8.295056431504301e-09, | |
| "loss": 0.2694982886314392, | |
| "step": 679 | |
| }, | |
| { | |
| "epoch": 1.9830446672743847, | |
| "grad_norm": 0.9838472604751587, | |
| "learning_rate": 6.3511084027156885e-09, | |
| "loss": 0.4270719587802887, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 1.9859617137648131, | |
| "grad_norm": 1.1900098323822021, | |
| "learning_rate": 4.666251516536324e-09, | |
| "loss": 0.4060650169849396, | |
| "step": 681 | |
| }, | |
| { | |
| "epoch": 1.9888787602552416, | |
| "grad_norm": 0.9812174439430237, | |
| "learning_rate": 3.2405294538606637e-09, | |
| "loss": 0.3900409936904907, | |
| "step": 682 | |
| }, | |
| { | |
| "epoch": 1.99179580674567, | |
| "grad_norm": 1.1988210678100586, | |
| "learning_rate": 2.073979177357188e-09, | |
| "loss": 0.3999583125114441, | |
| "step": 683 | |
| }, | |
| { | |
| "epoch": 1.9947128532360985, | |
| "grad_norm": 0.9738736152648926, | |
| "learning_rate": 1.1666309305202738e-09, | |
| "loss": 0.46780622005462646, | |
| "step": 684 | |
| }, | |
| { | |
| "epoch": 1.997629899726527, | |
| "grad_norm": 0.9841824173927307, | |
| "learning_rate": 5.18508236878601e-10, | |
| "loss": 0.4595794975757599, | |
| "step": 685 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 1.0865421295166016, | |
| "learning_rate": 1.2962789938897323e-10, | |
| "loss": 0.5136060118675232, | |
| "step": 686 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 686, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 6.317102071220797e+18, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |