{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 169,
  "global_step": 511,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0019569471624266144,
      "grad_norm": 4.606130123138428,
      "learning_rate": 3.125e-07,
      "loss": 0.6812,
      "step": 1
    },
    {
      "epoch": 0.003913894324853229,
      "grad_norm": 5.627719402313232,
      "learning_rate": 6.25e-07,
      "loss": 0.7188,
      "step": 2
    },
    {
      "epoch": 0.005870841487279843,
      "grad_norm": 5.225893974304199,
      "learning_rate": 9.375000000000001e-07,
      "loss": 0.6832,
      "step": 3
    },
    {
      "epoch": 0.007827788649706457,
      "grad_norm": 4.055615425109863,
      "learning_rate": 1.25e-06,
      "loss": 0.7478,
      "step": 4
    },
    {
      "epoch": 0.009784735812133072,
      "grad_norm": 3.32236385345459,
      "learning_rate": 1.5625e-06,
      "loss": 0.6512,
      "step": 5
    },
    {
      "epoch": 0.011741682974559686,
      "grad_norm": 2.5439915657043457,
      "learning_rate": 1.8750000000000003e-06,
      "loss": 0.72,
      "step": 6
    },
    {
      "epoch": 0.0136986301369863,
      "grad_norm": 1.9466145038604736,
      "learning_rate": 2.1875000000000002e-06,
      "loss": 0.6302,
      "step": 7
    },
    {
      "epoch": 0.015655577299412915,
      "grad_norm": 1.833212971687317,
      "learning_rate": 2.5e-06,
      "loss": 0.6859,
      "step": 8
    },
    {
      "epoch": 0.01761252446183953,
      "grad_norm": 1.4203251600265503,
      "learning_rate": 2.8125e-06,
      "loss": 0.5943,
      "step": 9
    },
    {
      "epoch": 0.019569471624266144,
      "grad_norm": 1.7164653539657593,
      "learning_rate": 3.125e-06,
      "loss": 0.5744,
      "step": 10
    },
    {
      "epoch": 0.021526418786692758,
      "grad_norm": 1.4249149560928345,
      "learning_rate": 3.4375e-06,
      "loss": 0.5896,
      "step": 11
    },
    {
      "epoch": 0.023483365949119372,
      "grad_norm": 1.2433736324310303,
      "learning_rate": 3.7500000000000005e-06,
      "loss": 0.5873,
      "step": 12
    },
    {
      "epoch": 0.025440313111545987,
      "grad_norm": 1.0826597213745117,
      "learning_rate": 4.0625000000000005e-06,
      "loss": 0.5792,
      "step": 13
    },
    {
      "epoch": 0.0273972602739726,
      "grad_norm": 1.0738195180892944,
      "learning_rate": 4.3750000000000005e-06,
      "loss": 0.6032,
      "step": 14
    },
    {
      "epoch": 0.029354207436399216,
      "grad_norm": 1.1434872150421143,
      "learning_rate": 4.6875000000000004e-06,
      "loss": 0.5698,
      "step": 15
    },
    {
      "epoch": 0.03131115459882583,
      "grad_norm": 1.4672112464904785,
      "learning_rate": 5e-06,
      "loss": 0.5471,
      "step": 16
    },
    {
      "epoch": 0.033268101761252444,
      "grad_norm": 0.9875673651695251,
      "learning_rate": 4.999949650182267e-06,
      "loss": 0.5393,
      "step": 17
    },
    {
      "epoch": 0.03522504892367906,
      "grad_norm": 1.0974621772766113,
      "learning_rate": 4.999798602757149e-06,
      "loss": 0.5349,
      "step": 18
    },
    {
      "epoch": 0.03718199608610567,
      "grad_norm": 1.2209999561309814,
      "learning_rate": 4.999546863808815e-06,
      "loss": 0.6743,
      "step": 19
    },
    {
      "epoch": 0.03913894324853229,
      "grad_norm": 0.8842924237251282,
      "learning_rate": 4.999194443477273e-06,
      "loss": 0.5919,
      "step": 20
    },
    {
      "epoch": 0.0410958904109589,
      "grad_norm": 1.0825450420379639,
      "learning_rate": 4.998741355957963e-06,
      "loss": 0.6438,
      "step": 21
    },
    {
      "epoch": 0.043052837573385516,
      "grad_norm": 1.0688315629959106,
      "learning_rate": 4.998187619501185e-06,
      "loss": 0.5637,
      "step": 22
    },
    {
      "epoch": 0.04500978473581213,
      "grad_norm": 0.8487011790275574,
      "learning_rate": 4.99753325641136e-06,
      "loss": 0.5082,
      "step": 23
    },
    {
      "epoch": 0.046966731898238745,
      "grad_norm": 0.9255719780921936,
      "learning_rate": 4.9967782930461405e-06,
      "loss": 0.5081,
      "step": 24
    },
    {
      "epoch": 0.04892367906066536,
      "grad_norm": 0.9492978453636169,
      "learning_rate": 4.9959227598153395e-06,
      "loss": 0.6473,
      "step": 25
    },
    {
      "epoch": 0.050880626223091974,
      "grad_norm": 1.5718590021133423,
      "learning_rate": 4.994966691179712e-06,
      "loss": 0.5219,
      "step": 26
    },
    {
      "epoch": 0.05283757338551859,
      "grad_norm": 0.9533342123031616,
      "learning_rate": 4.993910125649561e-06,
      "loss": 0.5279,
      "step": 27
    },
    {
      "epoch": 0.0547945205479452,
      "grad_norm": 1.8043086528778076,
      "learning_rate": 4.992753105783194e-06,
      "loss": 0.5277,
      "step": 28
    },
    {
      "epoch": 0.05675146771037182,
      "grad_norm": 1.317238450050354,
      "learning_rate": 4.991495678185202e-06,
      "loss": 0.4567,
      "step": 29
    },
    {
      "epoch": 0.05870841487279843,
      "grad_norm": 1.1877973079681396,
      "learning_rate": 4.990137893504585e-06,
      "loss": 0.5536,
      "step": 30
    },
    {
      "epoch": 0.060665362035225046,
      "grad_norm": 1.052051305770874,
      "learning_rate": 4.988679806432712e-06,
      "loss": 0.4946,
      "step": 31
    },
    {
      "epoch": 0.06262230919765166,
      "grad_norm": 7.080264091491699,
      "learning_rate": 4.987121475701118e-06,
      "loss": 0.5056,
      "step": 32
    },
    {
      "epoch": 0.06457925636007827,
      "grad_norm": 3.640033483505249,
      "learning_rate": 4.985462964079137e-06,
      "loss": 0.5162,
      "step": 33
    },
    {
      "epoch": 0.06653620352250489,
      "grad_norm": 2.17399263381958,
      "learning_rate": 4.983704338371375e-06,
      "loss": 0.5314,
      "step": 34
    },
    {
      "epoch": 0.0684931506849315,
      "grad_norm": 0.9113507270812988,
      "learning_rate": 4.981845669415022e-06,
      "loss": 0.5416,
      "step": 35
    },
    {
      "epoch": 0.07045009784735812,
      "grad_norm": 0.865261971950531,
      "learning_rate": 4.9798870320769884e-06,
      "loss": 0.5266,
      "step": 36
    },
    {
      "epoch": 0.07240704500978473,
      "grad_norm": 1.3988151550292969,
      "learning_rate": 4.977828505250903e-06,
      "loss": 0.4983,
      "step": 37
    },
    {
      "epoch": 0.07436399217221135,
      "grad_norm": 1.0698161125183105,
      "learning_rate": 4.975670171853926e-06,
      "loss": 0.4723,
      "step": 38
    },
    {
      "epoch": 0.07632093933463796,
      "grad_norm": 1.2741320133209229,
      "learning_rate": 4.9734121188234115e-06,
      "loss": 0.4996,
      "step": 39
    },
    {
      "epoch": 0.07827788649706457,
      "grad_norm": 2.0048317909240723,
      "learning_rate": 4.971054437113406e-06,
      "loss": 0.6535,
      "step": 40
    },
    {
      "epoch": 0.08023483365949119,
      "grad_norm": 1.2805678844451904,
      "learning_rate": 4.968597221690986e-06,
      "loss": 0.5198,
      "step": 41
    },
    {
      "epoch": 0.0821917808219178,
      "grad_norm": 0.9233219027519226,
      "learning_rate": 4.96604057153243e-06,
      "loss": 0.5724,
      "step": 42
    },
    {
      "epoch": 0.08414872798434442,
      "grad_norm": 0.9261006712913513,
      "learning_rate": 4.963384589619233e-06,
      "loss": 0.4601,
      "step": 43
    },
    {
      "epoch": 0.08610567514677103,
      "grad_norm": 1.3594372272491455,
      "learning_rate": 4.960629382933959e-06,
      "loss": 0.5616,
      "step": 44
    },
    {
      "epoch": 0.08806262230919765,
      "grad_norm": 2.4310686588287354,
      "learning_rate": 4.957775062455933e-06,
      "loss": 0.5442,
      "step": 45
    },
    {
      "epoch": 0.09001956947162426,
      "grad_norm": 1.030832290649414,
      "learning_rate": 4.9548217431567665e-06,
      "loss": 0.5964,
      "step": 46
    },
    {
      "epoch": 0.09197651663405088,
      "grad_norm": 0.831721305847168,
      "learning_rate": 4.951769543995731e-06,
      "loss": 0.44,
      "step": 47
    },
    {
      "epoch": 0.09393346379647749,
      "grad_norm": 0.9876791834831238,
      "learning_rate": 4.948618587914963e-06,
      "loss": 0.5404,
      "step": 48
    },
    {
      "epoch": 0.0958904109589041,
      "grad_norm": 0.9953415393829346,
      "learning_rate": 4.9453690018345144e-06,
      "loss": 0.5668,
      "step": 49
    },
    {
      "epoch": 0.09784735812133072,
      "grad_norm": 0.8553183078765869,
      "learning_rate": 4.9420209166472386e-06,
      "loss": 0.5414,
      "step": 50
    },
    {
      "epoch": 0.09980430528375733,
      "grad_norm": 0.7962396144866943,
      "learning_rate": 4.938574467213519e-06,
      "loss": 0.495,
      "step": 51
    },
    {
      "epoch": 0.10176125244618395,
      "grad_norm": 0.7835857272148132,
      "learning_rate": 4.935029792355834e-06,
      "loss": 0.5037,
      "step": 52
    },
    {
      "epoch": 0.10371819960861056,
      "grad_norm": 0.8453947901725769,
      "learning_rate": 4.931387034853173e-06,
      "loss": 0.5011,
      "step": 53
    },
    {
      "epoch": 0.10567514677103718,
      "grad_norm": 1.8459208011627197,
      "learning_rate": 4.927646341435276e-06,
      "loss": 0.5554,
      "step": 54
    },
    {
      "epoch": 0.10763209393346379,
      "grad_norm": 0.9212117195129395,
      "learning_rate": 4.9238078627767285e-06,
      "loss": 0.5886,
      "step": 55
    },
    {
      "epoch": 0.1095890410958904,
      "grad_norm": 0.7834203243255615,
      "learning_rate": 4.919871753490892e-06,
      "loss": 0.4602,
      "step": 56
    },
    {
      "epoch": 0.11154598825831702,
      "grad_norm": 0.9025184512138367,
      "learning_rate": 4.9158381721236715e-06,
      "loss": 0.4544,
      "step": 57
    },
    {
      "epoch": 0.11350293542074363,
      "grad_norm": 1.1300384998321533,
      "learning_rate": 4.91170728114714e-06,
      "loss": 0.5704,
      "step": 58
    },
    {
      "epoch": 0.11545988258317025,
      "grad_norm": 0.7926605343818665,
      "learning_rate": 4.907479246952981e-06,
      "loss": 0.5112,
      "step": 59
    },
    {
      "epoch": 0.11741682974559686,
      "grad_norm": 0.7744232416152954,
      "learning_rate": 4.903154239845798e-06,
      "loss": 0.4894,
      "step": 60
    },
    {
      "epoch": 0.11937377690802348,
      "grad_norm": 1.6636885404586792,
      "learning_rate": 4.8987324340362445e-06,
      "loss": 0.5311,
      "step": 61
    },
    {
      "epoch": 0.12133072407045009,
      "grad_norm": 1.0098280906677246,
      "learning_rate": 4.894214007634014e-06,
      "loss": 0.4907,
      "step": 62
    },
    {
      "epoch": 0.1232876712328767,
      "grad_norm": 1.0168606042861938,
      "learning_rate": 4.889599142640663e-06,
      "loss": 0.5128,
      "step": 63
    },
    {
      "epoch": 0.12524461839530332,
      "grad_norm": 0.8393405079841614,
      "learning_rate": 4.884888024942282e-06,
      "loss": 0.4989,
      "step": 64
    },
    {
      "epoch": 0.12720156555772993,
      "grad_norm": 1.2758891582489014,
      "learning_rate": 4.880080844302004e-06,
      "loss": 0.5329,
      "step": 65
    },
    {
      "epoch": 0.12915851272015655,
      "grad_norm": 0.8657482862472534,
      "learning_rate": 4.875177794352364e-06,
      "loss": 0.5058,
      "step": 66
    },
    {
      "epoch": 0.13111545988258316,
      "grad_norm": 0.9110330939292908,
      "learning_rate": 4.870179072587499e-06,
      "loss": 0.5137,
      "step": 67
    },
    {
      "epoch": 0.13307240704500978,
      "grad_norm": 0.8738705515861511,
      "learning_rate": 4.865084880355193e-06,
      "loss": 0.5423,
      "step": 68
    },
    {
      "epoch": 0.1350293542074364,
      "grad_norm": 0.8127829432487488,
      "learning_rate": 4.859895422848767e-06,
      "loss": 0.5402,
      "step": 69
    },
    {
      "epoch": 0.136986301369863,
      "grad_norm": 0.768864631652832,
      "learning_rate": 4.854610909098813e-06,
      "loss": 0.5301,
      "step": 70
    },
    {
      "epoch": 0.13894324853228962,
      "grad_norm": 1.2464350461959839,
      "learning_rate": 4.849231551964771e-06,
      "loss": 0.5124,
      "step": 71
    },
    {
      "epoch": 0.14090019569471623,
      "grad_norm": 0.9351313710212708,
      "learning_rate": 4.843757568126366e-06,
      "loss": 0.5152,
      "step": 72
    },
    {
      "epoch": 0.14285714285714285,
      "grad_norm": 0.842991054058075,
      "learning_rate": 4.838189178074867e-06,
      "loss": 0.5254,
      "step": 73
    },
    {
      "epoch": 0.14481409001956946,
      "grad_norm": 0.7789003252983093,
      "learning_rate": 4.832526606104213e-06,
      "loss": 0.5528,
      "step": 74
    },
    {
      "epoch": 0.14677103718199608,
      "grad_norm": 0.8701135516166687,
      "learning_rate": 4.826770080301978e-06,
      "loss": 0.5243,
      "step": 75
    },
    {
      "epoch": 0.1487279843444227,
      "grad_norm": 0.8384250998497009,
      "learning_rate": 4.8209198325401815e-06,
      "loss": 0.4648,
      "step": 76
    },
    {
      "epoch": 0.1506849315068493,
      "grad_norm": 1.0472533702850342,
      "learning_rate": 4.814976098465951e-06,
      "loss": 0.5342,
      "step": 77
    },
    {
      "epoch": 0.15264187866927592,
      "grad_norm": 0.9264402389526367,
      "learning_rate": 4.808939117492028e-06,
      "loss": 0.5267,
      "step": 78
    },
    {
      "epoch": 0.15459882583170254,
      "grad_norm": 0.8155198097229004,
      "learning_rate": 4.802809132787125e-06,
      "loss": 0.5363,
      "step": 79
    },
    {
      "epoch": 0.15655577299412915,
      "grad_norm": 0.8857468366622925,
      "learning_rate": 4.796586391266135e-06,
      "loss": 0.5021,
      "step": 80
    },
    {
      "epoch": 0.15851272015655576,
      "grad_norm": 1.0320619344711304,
      "learning_rate": 4.790271143580174e-06,
      "loss": 0.4892,
      "step": 81
    },
    {
      "epoch": 0.16046966731898238,
      "grad_norm": 0.9655166268348694,
      "learning_rate": 4.783863644106502e-06,
      "loss": 0.5493,
      "step": 82
    },
    {
      "epoch": 0.162426614481409,
      "grad_norm": 1.3644921779632568,
      "learning_rate": 4.777364150938263e-06,
      "loss": 0.4835,
      "step": 83
    },
    {
      "epoch": 0.1643835616438356,
      "grad_norm": 1.291692852973938,
      "learning_rate": 4.770772925874093e-06,
      "loss": 0.5755,
      "step": 84
    },
    {
      "epoch": 0.16634050880626222,
      "grad_norm": 1.0446902513504028,
      "learning_rate": 4.764090234407578e-06,
      "loss": 0.5659,
      "step": 85
    },
    {
      "epoch": 0.16829745596868884,
      "grad_norm": 0.9225801825523376,
      "learning_rate": 4.757316345716554e-06,
      "loss": 0.4067,
      "step": 86
    },
    {
      "epoch": 0.17025440313111545,
      "grad_norm": 0.8291013240814209,
      "learning_rate": 4.75045153265227e-06,
      "loss": 0.4946,
      "step": 87
    },
    {
      "epoch": 0.17221135029354206,
      "grad_norm": 1.1656488180160522,
      "learning_rate": 4.743496071728396e-06,
      "loss": 0.4933,
      "step": 88
    },
    {
      "epoch": 0.17416829745596868,
      "grad_norm": 0.9090279936790466,
      "learning_rate": 4.736450243109885e-06,
      "loss": 0.5085,
      "step": 89
    },
    {
      "epoch": 0.1761252446183953,
      "grad_norm": 1.2236806154251099,
      "learning_rate": 4.729314330601684e-06,
      "loss": 0.5147,
      "step": 90
    },
    {
      "epoch": 0.1780821917808219,
      "grad_norm": 0.9335976839065552,
      "learning_rate": 4.7220886216373095e-06,
      "loss": 0.4589,
      "step": 91
    },
    {
      "epoch": 0.18003913894324852,
      "grad_norm": 0.759772002696991,
      "learning_rate": 4.714773407267264e-06,
      "loss": 0.5398,
      "step": 92
    },
    {
      "epoch": 0.18199608610567514,
      "grad_norm": 0.9582347869873047,
      "learning_rate": 4.707368982147318e-06,
      "loss": 0.5696,
      "step": 93
    },
    {
      "epoch": 0.18395303326810175,
      "grad_norm": 0.9130314588546753,
      "learning_rate": 4.699875644526633e-06,
      "loss": 0.4803,
      "step": 94
    },
    {
      "epoch": 0.18590998043052837,
      "grad_norm": 0.9103049635887146,
      "learning_rate": 4.692293696235758e-06,
      "loss": 0.4833,
      "step": 95
    },
    {
      "epoch": 0.18786692759295498,
      "grad_norm": 0.7975893616676331,
      "learning_rate": 4.684623442674463e-06,
      "loss": 0.5263,
      "step": 96
    },
    {
      "epoch": 0.1898238747553816,
      "grad_norm": 0.761643648147583,
      "learning_rate": 4.676865192799443e-06,
      "loss": 0.4519,
      "step": 97
    },
    {
      "epoch": 0.1917808219178082,
      "grad_norm": 0.7510681748390198,
      "learning_rate": 4.669019259111873e-06,
      "loss": 0.4871,
      "step": 98
    },
    {
      "epoch": 0.19373776908023482,
      "grad_norm": 1.1785235404968262,
      "learning_rate": 4.661085957644817e-06,
      "loss": 0.4644,
      "step": 99
    },
    {
      "epoch": 0.19569471624266144,
      "grad_norm": 1.2464004755020142,
      "learning_rate": 4.653065607950502e-06,
      "loss": 0.4791,
      "step": 100
    },
    {
      "epoch": 0.19765166340508805,
      "grad_norm": 2.580218553543091,
      "learning_rate": 4.644958533087443e-06,
      "loss": 0.4146,
      "step": 101
    },
    {
      "epoch": 0.19960861056751467,
      "grad_norm": 0.9442769289016724,
      "learning_rate": 4.636765059607434e-06,
      "loss": 0.494,
      "step": 102
    },
    {
      "epoch": 0.20156555772994128,
      "grad_norm": 0.7965562343597412,
      "learning_rate": 4.628485517542393e-06,
      "loss": 0.4496,
      "step": 103
    },
    {
      "epoch": 0.2035225048923679,
      "grad_norm": 1.2338522672653198,
      "learning_rate": 4.620120240391065e-06,
      "loss": 0.4592,
      "step": 104
    },
    {
      "epoch": 0.2054794520547945,
      "grad_norm": 0.8661827445030212,
      "learning_rate": 4.611669565105597e-06,
      "loss": 0.4883,
      "step": 105
    },
    {
      "epoch": 0.20743639921722112,
      "grad_norm": 1.014655351638794,
      "learning_rate": 4.603133832077953e-06,
      "loss": 0.5101,
      "step": 106
    },
    {
      "epoch": 0.20939334637964774,
      "grad_norm": 0.9033066630363464,
      "learning_rate": 4.5945133851262185e-06,
      "loss": 0.4515,
      "step": 107
    },
    {
      "epoch": 0.21135029354207435,
      "grad_norm": 0.91737961769104,
      "learning_rate": 4.585808571480739e-06,
      "loss": 0.4886,
      "step": 108
    },
    {
      "epoch": 0.21330724070450097,
      "grad_norm": 0.9076818823814392,
      "learning_rate": 4.577019741770137e-06,
      "loss": 0.5572,
      "step": 109
    },
    {
      "epoch": 0.21526418786692758,
      "grad_norm": 0.9256044626235962,
      "learning_rate": 4.5681472500071935e-06,
      "loss": 0.5089,
      "step": 110
    },
    {
      "epoch": 0.2172211350293542,
      "grad_norm": 0.8705273270606995,
      "learning_rate": 4.559191453574582e-06,
      "loss": 0.5199,
      "step": 111
    },
    {
      "epoch": 0.2191780821917808,
      "grad_norm": 0.8358094096183777,
      "learning_rate": 4.550152713210478e-06,
      "loss": 0.5091,
      "step": 112
    },
    {
      "epoch": 0.22113502935420742,
      "grad_norm": 1.0409964323043823,
      "learning_rate": 4.541031392994025e-06,
      "loss": 0.4997,
      "step": 113
    },
    {
      "epoch": 0.22309197651663404,
      "grad_norm": 0.8039932250976562,
      "learning_rate": 4.53182786033067e-06,
      "loss": 0.537,
      "step": 114
    },
    {
      "epoch": 0.22504892367906065,
      "grad_norm": 0.9191640615463257,
      "learning_rate": 4.522542485937369e-06,
      "loss": 0.5799,
      "step": 115
    },
    {
      "epoch": 0.22700587084148727,
      "grad_norm": 0.8132153153419495,
      "learning_rate": 4.513175643827647e-06,
      "loss": 0.5217,
      "step": 116
    },
    {
      "epoch": 0.22896281800391388,
      "grad_norm": 0.7776696085929871,
      "learning_rate": 4.503727711296539e-06,
      "loss": 0.4729,
      "step": 117
    },
    {
      "epoch": 0.2309197651663405,
      "grad_norm": 0.8824874758720398,
      "learning_rate": 4.494199068905389e-06,
      "loss": 0.4977,
      "step": 118
    },
    {
      "epoch": 0.2328767123287671,
      "grad_norm": 0.9938674569129944,
      "learning_rate": 4.484590100466524e-06,
      "loss": 0.5067,
      "step": 119
    },
    {
      "epoch": 0.23483365949119372,
      "grad_norm": 1.018510103225708,
      "learning_rate": 4.474901193027791e-06,
      "loss": 0.5855,
      "step": 120
    },
    {
      "epoch": 0.23679060665362034,
      "grad_norm": 1.0530946254730225,
      "learning_rate": 4.4651327368569695e-06,
      "loss": 0.4835,
      "step": 121
    },
    {
      "epoch": 0.23874755381604695,
      "grad_norm": 0.7325494289398193,
      "learning_rate": 4.455285125426049e-06,
      "loss": 0.5043,
      "step": 122
    },
    {
      "epoch": 0.24070450097847357,
      "grad_norm": 1.2264351844787598,
      "learning_rate": 4.445358755395382e-06,
      "loss": 0.4991,
      "step": 123
    },
    {
      "epoch": 0.24266144814090018,
      "grad_norm": 0.7878324389457703,
      "learning_rate": 4.435354026597707e-06,
      "loss": 0.4943,
      "step": 124
    },
    {
      "epoch": 0.2446183953033268,
      "grad_norm": 1.0379810333251953,
      "learning_rate": 4.425271342022039e-06,
      "loss": 0.5664,
      "step": 125
    },
    {
      "epoch": 0.2465753424657534,
      "grad_norm": 1.2007404565811157,
      "learning_rate": 4.415111107797445e-06,
      "loss": 0.4495,
      "step": 126
    },
    {
      "epoch": 0.24853228962818003,
      "grad_norm": 1.4260215759277344,
      "learning_rate": 4.404873733176678e-06,
      "loss": 0.4848,
      "step": 127
    },
    {
      "epoch": 0.25048923679060664,
      "grad_norm": 0.7717714309692383,
      "learning_rate": 4.3945596305196925e-06,
      "loss": 0.4975,
      "step": 128
    },
    {
      "epoch": 0.25244618395303325,
      "grad_norm": 1.0631009340286255,
      "learning_rate": 4.384169215277042e-06,
      "loss": 0.538,
      "step": 129
    },
    {
      "epoch": 0.25440313111545987,
      "grad_norm": 0.9604893326759338,
      "learning_rate": 4.373702905973136e-06,
      "loss": 0.554,
      "step": 130
    },
    {
      "epoch": 0.2563600782778865,
      "grad_norm": 0.8638473749160767,
      "learning_rate": 4.363161124189387e-06,
      "loss": 0.4839,
      "step": 131
    },
    {
      "epoch": 0.2583170254403131,
      "grad_norm": 0.8187501430511475,
      "learning_rate": 4.352544294547229e-06,
      "loss": 0.5105,
      "step": 132
    },
    {
      "epoch": 0.2602739726027397,
      "grad_norm": 1.4357470273971558,
      "learning_rate": 4.341852844691012e-06,
      "loss": 0.4532,
      "step": 133
    },
    {
      "epoch": 0.2622309197651663,
      "grad_norm": 0.8292232155799866,
      "learning_rate": 4.331087205270778e-06,
      "loss": 0.451,
      "step": 134
    },
    {
      "epoch": 0.26418786692759294,
      "grad_norm": 0.8243665099143982,
      "learning_rate": 4.320247809924911e-06,
      "loss": 0.4857,
      "step": 135
    },
    {
      "epoch": 0.26614481409001955,
      "grad_norm": 0.9147266745567322,
      "learning_rate": 4.309335095262675e-06,
      "loss": 0.4778,
      "step": 136
    },
    {
      "epoch": 0.26810176125244617,
      "grad_norm": 0.8612287044525146,
      "learning_rate": 4.2983495008466285e-06,
      "loss": 0.4627,
      "step": 137
    },
    {
      "epoch": 0.2700587084148728,
      "grad_norm": 0.8230846524238586,
      "learning_rate": 4.287291469174909e-06,
      "loss": 0.4627,
      "step": 138
    },
    {
      "epoch": 0.2720156555772994,
      "grad_norm": 0.8767359852790833,
      "learning_rate": 4.276161445663423e-06,
      "loss": 0.5119,
      "step": 139
    },
    {
      "epoch": 0.273972602739726,
      "grad_norm": 0.8119643926620483,
      "learning_rate": 4.264959878627891e-06,
      "loss": 0.4495,
      "step": 140
    },
    {
      "epoch": 0.2759295499021526,
      "grad_norm": 0.7973845601081848,
      "learning_rate": 4.253687219265803e-06,
      "loss": 0.5228,
      "step": 141
    },
    {
      "epoch": 0.27788649706457924,
      "grad_norm": 0.892238199710846,
      "learning_rate": 4.242343921638235e-06,
      "loss": 0.5154,
      "step": 142
    },
    {
      "epoch": 0.27984344422700586,
      "grad_norm": 1.3092166185379028,
      "learning_rate": 4.230930442651558e-06,
      "loss": 0.5085,
      "step": 143
    },
    {
      "epoch": 0.28180039138943247,
      "grad_norm": 1.2284399271011353,
      "learning_rate": 4.219447242039043e-06,
      "loss": 0.4366,
      "step": 144
    },
    {
      "epoch": 0.2837573385518591,
      "grad_norm": 1.0883151292800903,
      "learning_rate": 4.207894782342337e-06,
      "loss": 0.5958,
      "step": 145
    },
    {
      "epoch": 0.2857142857142857,
      "grad_norm": 1.1132919788360596,
      "learning_rate": 4.196273528892831e-06,
      "loss": 0.4348,
      "step": 146
    },
    {
      "epoch": 0.2876712328767123,
      "grad_norm": 1.2576059103012085,
      "learning_rate": 4.18458394979292e-06,
      "loss": 0.5247,
      "step": 147
    },
    {
      "epoch": 0.2896281800391389,
      "grad_norm": 0.8995031714439392,
      "learning_rate": 4.172826515897146e-06,
      "loss": 0.5082,
      "step": 148
    },
    {
      "epoch": 0.29158512720156554,
      "grad_norm": 0.7533922791481018,
      "learning_rate": 4.161001700793231e-06,
      "loss": 0.4644,
      "step": 149
    },
    {
      "epoch": 0.29354207436399216,
      "grad_norm": 0.9206835031509399,
      "learning_rate": 4.149109980783004e-06,
      "loss": 0.494,
      "step": 150
    },
    {
      "epoch": 0.29549902152641877,
      "grad_norm": 1.208590030670166,
      "learning_rate": 4.137151834863213e-06,
      "loss": 0.5545,
      "step": 151
    },
    {
      "epoch": 0.2974559686888454,
      "grad_norm": 0.7689659595489502,
      "learning_rate": 4.125127744706232e-06,
      "loss": 0.4845,
      "step": 152
    },
    {
      "epoch": 0.299412915851272,
      "grad_norm": 1.0235570669174194,
      "learning_rate": 4.113038194640658e-06,
      "loss": 0.4778,
      "step": 153
    },
    {
      "epoch": 0.3013698630136986,
      "grad_norm": 1.1112617254257202,
      "learning_rate": 4.100883671631806e-06,
      "loss": 0.5206,
      "step": 154
    },
    {
      "epoch": 0.30332681017612523,
      "grad_norm": 1.073519229888916,
      "learning_rate": 4.088664665262091e-06,
      "loss": 0.4944,
      "step": 155
    },
    {
      "epoch": 0.30528375733855184,
      "grad_norm": 0.8319236040115356,
      "learning_rate": 4.076381667711306e-06,
      "loss": 0.4741,
      "step": 156
    },
    {
      "epoch": 0.30724070450097846,
      "grad_norm": 1.2600641250610352,
      "learning_rate": 4.064035173736804e-06,
      "loss": 0.5311,
      "step": 157
    },
    {
      "epoch": 0.30919765166340507,
      "grad_norm": 0.8686632513999939,
      "learning_rate": 4.05162568065356e-06,
      "loss": 0.5436,
      "step": 158
    },
    {
      "epoch": 0.3111545988258317,
      "grad_norm": 0.7053869366645813,
      "learning_rate": 4.039153688314146e-06,
      "loss": 0.4846,
      "step": 159
    },
    {
      "epoch": 0.3131115459882583,
      "grad_norm": 0.8360055685043335,
      "learning_rate": 4.0266196990885955e-06,
      "loss": 0.5041,
      "step": 160
    },
    {
      "epoch": 0.3150684931506849,
      "grad_norm": 0.8842881321907043,
      "learning_rate": 4.014024217844167e-06,
      "loss": 0.4708,
      "step": 161
    },
    {
      "epoch": 0.31702544031311153,
      "grad_norm": 1.0392301082611084,
      "learning_rate": 4.001367751925008e-06,
      "loss": 0.5315,
      "step": 162
    },
    {
      "epoch": 0.31898238747553814,
      "grad_norm": 0.8801809549331665,
      "learning_rate": 3.98865081113172e-06,
      "loss": 0.4438,
      "step": 163
    },
    {
      "epoch": 0.32093933463796476,
      "grad_norm": 1.395719289779663,
      "learning_rate": 3.9758739077008256e-06,
      "loss": 0.4929,
      "step": 164
    },
    {
      "epoch": 0.32289628180039137,
      "grad_norm": 0.8075605034828186,
      "learning_rate": 3.96303755628413e-06,
      "loss": 0.4364,
      "step": 165
    },
    {
      "epoch": 0.324853228962818,
      "grad_norm": 0.9566773772239685,
      "learning_rate": 3.950142273927996e-06,
      "loss": 0.4001,
      "step": 166
    },
    {
      "epoch": 0.3268101761252446,
      "grad_norm": 2.270550012588501,
      "learning_rate": 3.937188580052518e-06,
      "loss": 0.4683,
      "step": 167
    },
    {
      "epoch": 0.3287671232876712,
      "grad_norm": 0.8937717080116272,
      "learning_rate": 3.924176996430597e-06,
      "loss": 0.479,
      "step": 168
    },
    {
      "epoch": 0.33072407045009783,
      "grad_norm": 0.9620490074157715,
      "learning_rate": 3.911108047166924e-06,
      "loss": 0.4669,
      "step": 169
    },
    {
      "epoch": 0.33072407045009783,
      "eval_accuracy": 0.825246566025413,
      "eval_accuracy_first_token": 0.9521367521367521,
      "eval_accuracy_first_token_all": 0.9670737362807235,
      "eval_accuracy_first_token_all_total": 6469,
      "eval_accuracy_first_token_calculate": 0.7954545454545454,
      "eval_accuracy_first_token_calculate_total": 44,
      "eval_accuracy_first_token_execute": 1.0,
      "eval_accuracy_first_token_execute_total": 202,
      "eval_accuracy_first_token_get": 0.9649122807017544,
      "eval_accuracy_first_token_get_total": 456,
      "eval_accuracy_first_token_python": 0.8777777777777778,
      "eval_accuracy_first_token_python_total": 990,
      "eval_loss": 0.5176534056663513,
      "eval_perplexity": 1.206566771452624,
      "eval_runtime": 524.2306,
      "eval_samples_per_second": 1.269,
      "eval_steps_per_second": 0.16,
      "eval_total_number_first_token": 9360,
      "step": 169
    },
    {
      "epoch": 0.33268101761252444,
      "grad_norm": 0.8628460168838501,
      "learning_rate": 3.897982258676867e-06,
      "loss": 0.4727,
      "step": 170
    },
    {
      "epoch": 0.33463796477495106,
      "grad_norm": 0.8537535071372986,
      "learning_rate": 3.8848001596652765e-06,
      "loss": 0.4746,
      "step": 171
    },
    {
      "epoch": 0.33659491193737767,
      "grad_norm": 0.9613227248191833,
      "learning_rate": 3.8715622811051754e-06,
      "loss": 0.5148,
      "step": 172
    },
    {
      "epoch": 0.3385518590998043,
      "grad_norm": 0.8833454251289368,
      "learning_rate": 3.858269156216383e-06,
      "loss": 0.5125,
      "step": 173
    },
    {
      "epoch": 0.3405088062622309,
      "grad_norm": 0.9823891520500183,
      "learning_rate": 3.844921320444031e-06,
      "loss": 0.5127,
      "step": 174
    },
    {
      "epoch": 0.3424657534246575,
      "grad_norm": 1.0789107084274292,
      "learning_rate": 3.8315193114369995e-06,
      "loss": 0.4935,
      "step": 175
    },
    {
      "epoch": 0.34442270058708413,
      "grad_norm": 0.8753149509429932,
      "learning_rate": 3.8180636690262565e-06,
      "loss": 0.4543,
      "step": 176
    },
    {
      "epoch": 0.34637964774951074,
      "grad_norm": 1.7468674182891846,
      "learning_rate": 3.804554935203115e-06,
      "loss": 0.4955,
      "step": 177
    },
    {
      "epoch": 0.34833659491193736,
      "grad_norm": 0.9011304974555969,
      "learning_rate": 3.7909936540974052e-06,
      "loss": 0.5992,
      "step": 178
    },
    {
      "epoch": 0.350293542074364,
      "grad_norm": 0.9541127681732178,
      "learning_rate": 3.777380371955552e-06,
      "loss": 0.5322,
      "step": 179
    },
    {
      "epoch": 0.3522504892367906,
      "grad_norm": 1.3841750621795654,
      "learning_rate": 3.7637156371185744e-06,
      "loss": 0.4661,
      "step": 180
    },
    {
      "epoch": 0.3542074363992172,
      "grad_norm": 1.0240124464035034,
      "learning_rate": 3.7500000000000005e-06,
      "loss": 0.5231,
      "step": 181
    },
    {
      "epoch": 0.3561643835616438,
      "grad_norm": 1.444016933441162,
      "learning_rate": 3.7362340130636926e-06,
      "loss": 0.5203,
      "step": 182
    },
    {
      "epoch": 0.35812133072407043,
      "grad_norm": 0.7845962047576904,
      "learning_rate": 3.7224182308015977e-06,
      "loss": 0.4929,
      "step": 183
    },
    {
      "epoch": 0.36007827788649704,
      "grad_norm": 1.0257796049118042,
      "learning_rate": 3.7085532097114098e-06,
      "loss": 0.4597,
      "step": 184
    },
    {
      "epoch": 0.36203522504892366,
      "grad_norm": 0.9083458185195923,
      "learning_rate": 3.6946395082741582e-06,
      "loss": 0.5254,
      "step": 185
    },
    {
      "epoch": 0.3639921722113503,
      "grad_norm": 0.9128417372703552,
      "learning_rate": 3.6806776869317074e-06,
      "loss": 0.4428,
      "step": 186
    },
    {
      "epoch": 0.3659491193737769,
      "grad_norm": 1.1980143785476685,
      "learning_rate": 3.6666683080641846e-06,
      "loss": 0.5374,
      "step": 187
    },
    {
      "epoch": 0.3679060665362035,
      "grad_norm": 0.8467942476272583,
      "learning_rate": 3.6526119359673283e-06,
      "loss": 0.4963,
      "step": 188
    },
    {
      "epoch": 0.3698630136986301,
      "grad_norm": 0.8798732757568359,
      "learning_rate": 3.6385091368297582e-06,
      "loss": 0.5208,
      "step": 189
    },
    {
      "epoch": 0.37181996086105673,
      "grad_norm": 0.8612852692604065,
      "learning_rate": 3.624360478710165e-06,
      "loss": 0.3989,
      "step": 190
    },
    {
      "epoch": 0.37377690802348335,
      "grad_norm": 0.7529587149620056,
      "learning_rate": 3.6101665315144357e-06,
      "loss": 0.5015,
      "step": 191
    },
    {
      "epoch": 0.37573385518590996,
      "grad_norm": 0.8704853653907776,
      "learning_rate": 3.595927866972694e-06,
      "loss": 0.4318,
      "step": 192
    },
    {
      "epoch": 0.3776908023483366,
      "grad_norm": 1.1298363208770752,
      "learning_rate": 3.581645058616271e-06,
      "loss": 0.5047,
      "step": 193
    },
    {
      "epoch": 0.3796477495107632,
      "grad_norm": 1.2964321374893188,
      "learning_rate": 3.5673186817546047e-06,
      "loss": 0.4764,
      "step": 194
    },
    {
      "epoch": 0.3816046966731898,
      "grad_norm": 2.080096960067749,
      "learning_rate": 3.552949313452067e-06,
      "loss": 0.4808,
      "step": 195
    },
    {
      "epoch": 0.3835616438356164,
      "grad_norm": 0.8993785977363586,
      "learning_rate": 3.5385375325047167e-06,
      "loss": 0.5577,
      "step": 196
    },
    {
      "epoch": 0.38551859099804303,
      "grad_norm": 0.8617794513702393,
      "learning_rate": 3.5240839194169885e-06,
      "loss": 0.5042,
      "step": 197
    },
    {
      "epoch": 0.38747553816046965,
      "grad_norm": 0.9634183645248413,
      "learning_rate": 3.5095890563783124e-06,
      "loss": 0.466,
      "step": 198
    },
    {
      "epoch": 0.38943248532289626,
      "grad_norm": 0.9015300273895264,
      "learning_rate": 3.4950535272396564e-06,
      "loss": 0.3887,
      "step": 199
    },
    {
      "epoch": 0.3913894324853229,
      "grad_norm": 0.8658633828163147,
      "learning_rate": 3.480477917490014e-06,
      "loss": 0.4665,
      "step": 200
    },
    {
      "epoch": 0.3933463796477495,
      "grad_norm": 0.7967968583106995,
      "learning_rate": 3.4658628142328215e-06,
      "loss": 0.515,
      "step": 201
    },
    {
      "epoch": 0.3953033268101761,
      "grad_norm": 0.7495056986808777,
      "learning_rate": 3.4512088061623077e-06,
      "loss": 0.4345,
      "step": 202
    },
    {
      "epoch": 0.3972602739726027,
      "grad_norm": 0.9585980772972107,
      "learning_rate": 3.436516483539781e-06,
      "loss": 0.4084,
      "step": 203
    },
    {
      "epoch": 0.39921722113502933,
      "grad_norm": 0.9240750670433044,
      "learning_rate": 3.4217864381698523e-06,
      "loss": 0.4451,
      "step": 204
    },
    {
      "epoch": 0.40117416829745595,
      "grad_norm": 1.2117798328399658,
      "learning_rate": 3.4070192633766025e-06,
      "loss": 0.5152,
      "step": 205
    },
    {
      "epoch": 0.40313111545988256,
      "grad_norm": 0.868486225605011,
      "learning_rate": 3.39221555397968e-06,
      "loss": 0.5456,
      "step": 206
    },
    {
      "epoch": 0.4050880626223092,
      "grad_norm": 0.7969531416893005,
      "learning_rate": 3.37737590627034e-06,
      "loss": 0.4295,
      "step": 207
    },
    {
      "epoch": 0.4070450097847358,
      "grad_norm": 0.9103299975395203,
      "learning_rate": 3.362500917987427e-06,
      "loss": 0.4485,
      "step": 208
    },
    {
      "epoch": 0.4090019569471624,
      "grad_norm": 1.0487585067749023,
      "learning_rate": 3.3475911882933014e-06,
      "loss": 0.4807,
      "step": 209
    },
    {
      "epoch": 0.410958904109589,
      "grad_norm": 0.9155584573745728,
      "learning_rate": 3.332647317749702e-06,
      "loss": 0.4617,
      "step": 210
    },
    {
      "epoch": 0.41291585127201563,
      "grad_norm": 0.9164103865623474,
      "learning_rate": 3.3176699082935546e-06,
      "loss": 0.5041,
      "step": 211
    },
    {
      "epoch": 0.41487279843444225,
      "grad_norm": 0.7580545544624329,
      "learning_rate": 3.3026595632127274e-06,
      "loss": 0.465,
      "step": 212
    },
    {
      "epoch": 0.41682974559686886,
      "grad_norm": 1.0577958822250366,
      "learning_rate": 3.2876168871217322e-06,
      "loss": 0.4055,
      "step": 213
    },
    {
      "epoch": 0.4187866927592955,
      "grad_norm": 1.2304415702819824,
      "learning_rate": 3.272542485937369e-06,
      "loss": 0.3852,
      "step": 214
    },
    {
      "epoch": 0.4207436399217221,
      "grad_norm": 0.905158281326294,
      "learning_rate": 3.2574369668543187e-06,
      "loss": 0.4861,
      "step": 215
    },
    {
      "epoch": 0.4227005870841487,
      "grad_norm": 0.9109801054000854,
      "learning_rate": 3.2423009383206876e-06,
      "loss": 0.4247,
      "step": 216
    },
    {
      "epoch": 0.4246575342465753,
      "grad_norm": 0.8025485277175903,
      "learning_rate": 3.227135010013498e-06,
      "loss": 0.5319,
      "step": 217
    },
    {
      "epoch": 0.42661448140900193,
      "grad_norm": 0.883714497089386,
      "learning_rate": 3.211939792814131e-06,
      "loss": 0.5287,
      "step": 218
    },
    {
      "epoch": 0.42857142857142855,
      "grad_norm": 0.9827890396118164,
      "learning_rate": 3.19671589878372e-06,
      "loss": 0.4799,
      "step": 219
    },
    {
      "epoch": 0.43052837573385516,
      "grad_norm": 0.8296178579330444,
      "learning_rate": 3.1814639411384953e-06,
      "loss": 0.4725,
      "step": 220
    },
    {
      "epoch": 0.4324853228962818,
      "grad_norm": 0.8092741370201111,
      "learning_rate": 3.1661845342250874e-06,
      "loss": 0.5054,
      "step": 221
    },
    {
      "epoch": 0.4344422700587084,
      "grad_norm": 1.160125732421875,
      "learning_rate": 3.1508782934957804e-06,
      "loss": 0.6022,
      "step": 222
    },
    {
      "epoch": 0.436399217221135,
      "grad_norm": 0.871837854385376,
      "learning_rate": 3.1355458354837183e-06,
      "loss": 0.4545,
      "step": 223
    },
    {
      "epoch": 0.4383561643835616,
      "grad_norm": 0.8639246225357056,
      "learning_rate": 3.1201877777780724e-06,
      "loss": 0.449,
      "step": 224
    },
    {
      "epoch": 0.44031311154598823,
      "grad_norm": 0.9144279956817627,
      "learning_rate": 3.1048047389991693e-06,
      "loss": 0.4308,
      "step": 225
    },
    {
      "epoch": 0.44227005870841485,
      "grad_norm": 1.0165725946426392,
      "learning_rate": 3.089397338773569e-06,
      "loss": 0.4997,
      "step": 226
    },
    {
      "epoch": 0.44422700587084146,
      "grad_norm": 0.7787861824035645,
      "learning_rate": 3.0739661977091027e-06,
      "loss": 0.4408,
      "step": 227
    },
    {
      "epoch": 0.4461839530332681,
      "grad_norm": 0.8962077498435974,
      "learning_rate": 3.0585119373698858e-06,
      "loss": 0.4879,
      "step": 228
    },
    {
      "epoch": 0.4481409001956947,
      "grad_norm": 0.8481760621070862,
      "learning_rate": 3.04303518025127e-06,
      "loss": 0.4525,
      "step": 229
    },
    {
      "epoch": 0.4500978473581213,
      "grad_norm": 0.9689728021621704,
      "learning_rate": 3.0275365497547747e-06,
      "loss": 0.5199,
      "step": 230
    },
    {
      "epoch": 0.4520547945205479,
      "grad_norm": 1.0657813549041748,
      "learning_rate": 3.012016670162977e-06,
      "loss": 0.4834,
      "step": 231
    },
    {
      "epoch": 0.45401174168297453,
      "grad_norm": 1.0324097871780396,
      "learning_rate": 2.9964761666143638e-06,
      "loss": 0.5407,
      "step": 232
    },
    {
      "epoch": 0.45596868884540115,
      "grad_norm": 0.8452147245407104,
      "learning_rate": 2.980915665078153e-06,
      "loss": 0.5108,
      "step": 233
    },
    {
      "epoch": 0.45792563600782776,
      "grad_norm": 1.1484103202819824,
      "learning_rate": 2.9653357923290753e-06,
      "loss": 0.4082,
      "step": 234
    },
    {
      "epoch": 0.4598825831702544,
      "grad_norm": 0.859313428401947,
      "learning_rate": 2.949737175922135e-06,
      "loss": 0.4752,
      "step": 235
    },
    {
      "epoch": 0.461839530332681,
      "grad_norm": 0.87496417760849,
      "learning_rate": 2.9341204441673267e-06,
      "loss": 0.4624,
      "step": 236
    },
    {
      "epoch": 0.4637964774951076,
      "grad_norm": 0.9420116543769836,
      "learning_rate": 2.9184862261043272e-06,
      "loss": 0.4557,
      "step": 237
    },
    {
      "epoch": 0.4657534246575342,
      "grad_norm": 1.4860702753067017,
      "learning_rate": 2.902835151477161e-06,
      "loss": 0.4617,
      "step": 238
    },
    {
      "epoch": 0.46771037181996084,
      "grad_norm": 0.8771023750305176,
      "learning_rate": 2.887167850708831e-06,
      "loss": 0.5299,
      "step": 239
    },
    {
      "epoch": 0.46966731898238745,
      "grad_norm": 0.8673617839813232,
      "learning_rate": 2.8714849548759293e-06,
      "loss": 0.5504,
      "step": 240
    },
    {
      "epoch": 0.47162426614481406,
      "grad_norm": 0.8307452201843262,
      "learning_rate": 2.8557870956832135e-06,
      "loss": 0.4735,
      "step": 241
    },
    {
      "epoch": 0.4735812133072407,
      "grad_norm": 0.9233512282371521,
      "learning_rate": 2.840074905438161e-06,
      "loss": 0.3701,
      "step": 242
    },
    {
      "epoch": 0.4755381604696673,
      "grad_norm": 1.0768812894821167,
      "learning_rate": 2.8243490170255046e-06,
      "loss": 0.4983,
      "step": 243
    },
    {
      "epoch": 0.4774951076320939,
      "grad_norm": 0.9305315017700195,
      "learning_rate": 2.808610063881737e-06,
      "loss": 0.4137,
      "step": 244
    },
    {
      "epoch": 0.4794520547945205,
      "grad_norm": 1.1971187591552734,
      "learning_rate": 2.792858679969596e-06,
      "loss": 0.452,
      "step": 245
    },
    {
      "epoch": 0.48140900195694714,
      "grad_norm": 1.314292073249817,
      "learning_rate": 2.7770954997525277e-06,
      "loss": 0.526,
      "step": 246
    },
    {
      "epoch": 0.48336594911937375,
      "grad_norm": 1.2386282682418823,
      "learning_rate": 2.761321158169134e-06,
      "loss": 0.5002,
      "step": 247
    },
    {
      "epoch": 0.48532289628180036,
      "grad_norm": 0.9772767424583435,
      "learning_rate": 2.745536290607593e-06,
      "loss": 0.5091,
      "step": 248
    },
    {
      "epoch": 0.487279843444227,
      "grad_norm": 1.0364662408828735,
      "learning_rate": 2.729741532880069e-06,
      "loss": 0.4752,
      "step": 249
    },
    {
      "epoch": 0.4892367906066536,
      "grad_norm": 0.8030025362968445,
      "learning_rate": 2.7139375211971e-06,
      "loss": 0.462,
      "step": 250
    },
    {
      "epoch": 0.4911937377690802,
      "grad_norm": 1.3889553546905518,
      "learning_rate": 2.6981248921419713e-06,
      "loss": 0.4102,
      "step": 251
    },
    {
      "epoch": 0.4931506849315068,
      "grad_norm": 0.9577500224113464,
      "learning_rate": 2.682304282645077e-06,
      "loss": 0.5008,
      "step": 252
    },
    {
      "epoch": 0.49510763209393344,
      "grad_norm": 1.3206193447113037,
      "learning_rate": 2.66647632995826e-06,
      "loss": 0.4624,
      "step": 253
    },
    {
      "epoch": 0.49706457925636005,
      "grad_norm": 0.8159929513931274,
      "learning_rate": 2.6506416716291466e-06,
      "loss": 0.4561,
      "step": 254
    },
    {
      "epoch": 0.49902152641878667,
      "grad_norm": 0.854573130607605,
      "learning_rate": 2.634800945475465e-06,
      "loss": 0.5503,
      "step": 255
    },
    {
      "epoch": 0.5009784735812133,
      "grad_norm": 9.345633506774902,
      "learning_rate": 2.6189547895593565e-06,
      "loss": 0.5216,
      "step": 256
    },
    {
      "epoch": 0.50293542074364,
      "grad_norm": 0.8881295323371887,
      "learning_rate": 2.6031038421616684e-06,
      "loss": 0.4713,
      "step": 257
    },
    {
      "epoch": 0.5048923679060665,
      "grad_norm": 1.7568496465682983,
      "learning_rate": 2.587248741756253e-06,
      "loss": 0.5096,
      "step": 258
    },
    {
      "epoch": 0.5068493150684932,
      "grad_norm": 0.8306764960289001,
      "learning_rate": 2.5713901269842405e-06,
      "loss": 0.4504,
      "step": 259
    },
    {
      "epoch": 0.5088062622309197,
      "grad_norm": 0.9716941118240356,
      "learning_rate": 2.555528636628324e-06,
      "loss": 0.4668,
      "step": 260
    },
    {
      "epoch": 0.5107632093933464,
      "grad_norm": 0.8290694355964661,
      "learning_rate": 2.53966490958702e-06,
      "loss": 0.4288,
      "step": 261
    },
    {
      "epoch": 0.512720156555773,
      "grad_norm": 0.9514800310134888,
      "learning_rate": 2.5237995848489422e-06,
      "loss": 0.5157,
      "step": 262
    },
    {
      "epoch": 0.5146771037181996,
      "grad_norm": 1.515278935432434,
      "learning_rate": 2.507933301467056e-06,
      "loss": 0.4863,
      "step": 263
    },
    {
      "epoch": 0.5166340508806262,
      "grad_norm": 0.9582359790802002,
      "learning_rate": 2.4920666985329446e-06,
      "loss": 0.4694,
      "step": 264
    },
    {
      "epoch": 0.5185909980430529,
      "grad_norm": 0.8128112554550171,
      "learning_rate": 2.4762004151510586e-06,
      "loss": 0.4244,
      "step": 265
    },
    {
      "epoch": 0.5205479452054794,
      "grad_norm": 1.151044487953186,
      "learning_rate": 2.4603350904129802e-06,
      "loss": 0.4555,
      "step": 266
    },
    {
      "epoch": 0.5225048923679061,
      "grad_norm": 0.8072860240936279,
      "learning_rate": 2.4444713633716764e-06,
      "loss": 0.4173,
      "step": 267
    },
    {
      "epoch": 0.5244618395303327,
      "grad_norm": 1.8496747016906738,
      "learning_rate": 2.42860987301576e-06,
      "loss": 0.4206,
      "step": 268
    },
    {
      "epoch": 0.5264187866927593,
      "grad_norm": 1.096216082572937,
      "learning_rate": 2.4127512582437486e-06,
      "loss": 0.4501,
      "step": 269
    },
    {
      "epoch": 0.5283757338551859,
      "grad_norm": 0.9519087076187134,
      "learning_rate": 2.3968961578383324e-06,
      "loss": 0.4848,
      "step": 270
    },
    {
      "epoch": 0.5303326810176126,
      "grad_norm": 0.9204405546188354,
      "learning_rate": 2.3810452104406444e-06,
      "loss": 0.4526,
      "step": 271
    },
    {
      "epoch": 0.5322896281800391,
      "grad_norm": 0.8748743534088135,
      "learning_rate": 2.3651990545245357e-06,
      "loss": 0.4547,
      "step": 272
    },
    {
      "epoch": 0.5342465753424658,
      "grad_norm": 1.6212592124938965,
      "learning_rate": 2.3493583283708542e-06,
      "loss": 0.4937,
      "step": 273
    },
    {
      "epoch": 0.5362035225048923,
      "grad_norm": 0.9793727993965149,
      "learning_rate": 2.3335236700417404e-06,
      "loss": 0.4456,
      "step": 274
    },
    {
      "epoch": 0.538160469667319,
      "grad_norm": 1.7149184942245483,
      "learning_rate": 2.3176957173549236e-06,
      "loss": 0.4737,
      "step": 275
    },
    {
      "epoch": 0.5401174168297456,
      "grad_norm": 0.9447053074836731,
      "learning_rate": 2.3018751078580287e-06,
      "loss": 0.4496,
      "step": 276
    },
    {
      "epoch": 0.5420743639921722,
      "grad_norm": 0.9250771999359131,
      "learning_rate": 2.2860624788029013e-06,
      "loss": 0.4674,
      "step": 277
    },
    {
      "epoch": 0.5440313111545988,
      "grad_norm": 0.8631194233894348,
      "learning_rate": 2.2702584671199317e-06,
      "loss": 0.48,
      "step": 278
    },
    {
      "epoch": 0.5459882583170255,
      "grad_norm": 0.8921899199485779,
      "learning_rate": 2.2544637093924072e-06,
      "loss": 0.4009,
      "step": 279
    },
    {
      "epoch": 0.547945205479452,
      "grad_norm": 0.811696469783783,
      "learning_rate": 2.238678841830867e-06,
      "loss": 0.4714,
      "step": 280
    },
    {
      "epoch": 0.5499021526418787,
      "grad_norm": 0.7900722026824951,
      "learning_rate": 2.2229045002474727e-06,
      "loss": 0.3956,
      "step": 281
    },
    {
      "epoch": 0.5518590998043053,
      "grad_norm": 0.8538399934768677,
      "learning_rate": 2.2071413200304046e-06,
      "loss": 0.3488,
      "step": 282
    },
    {
      "epoch": 0.5538160469667319,
      "grad_norm": 0.9310709238052368,
      "learning_rate": 2.1913899361182634e-06,
      "loss": 0.3915,
      "step": 283
    },
    {
      "epoch": 0.5557729941291585,
      "grad_norm": 0.8419170379638672,
      "learning_rate": 2.1756509829744958e-06,
      "loss": 0.4716,
      "step": 284
    },
    {
      "epoch": 0.5577299412915852,
      "grad_norm": 1.2008228302001953,
      "learning_rate": 2.1599250945618404e-06,
      "loss": 0.4493,
      "step": 285
    },
    {
      "epoch": 0.5596868884540117,
      "grad_norm": 0.8127449750900269,
      "learning_rate": 2.1442129043167877e-06,
      "loss": 0.4223,
      "step": 286
    },
    {
      "epoch": 0.5616438356164384,
      "grad_norm": 1.0872187614440918,
      "learning_rate": 2.128515045124071e-06,
      "loss": 0.4814,
      "step": 287
    },
    {
      "epoch": 0.5636007827788649,
      "grad_norm": 0.9573928713798523,
      "learning_rate": 2.1128321492911697e-06,
      "loss": 0.4606,
      "step": 288
    },
    {
      "epoch": 0.5655577299412916,
      "grad_norm": 0.7555910348892212,
      "learning_rate": 2.0971648485228404e-06,
      "loss": 0.446,
      "step": 289
    },
    {
      "epoch": 0.5675146771037182,
      "grad_norm": 0.9281080961227417,
      "learning_rate": 2.0815137738956736e-06,
      "loss": 0.4224,
      "step": 290
    },
    {
      "epoch": 0.5694716242661448,
      "grad_norm": 1.0036050081253052,
      "learning_rate": 2.0658795558326745e-06,
      "loss": 0.4476,
      "step": 291
    },
    {
      "epoch": 0.5714285714285714,
      "grad_norm": 0.8064005970954895,
      "learning_rate": 2.0502628240778655e-06,
      "loss": 0.4518,
      "step": 292
    },
    {
      "epoch": 0.5733855185909981,
      "grad_norm": 0.8491390347480774,
      "learning_rate": 2.034664207670925e-06,
      "loss": 0.4947,
      "step": 293
    },
    {
      "epoch": 0.5753424657534246,
      "grad_norm": 1.433266043663025,
      "learning_rate": 2.019084334921849e-06,
      "loss": 0.4929,
      "step": 294
    },
    {
      "epoch": 0.5772994129158513,
      "grad_norm": 0.8420299291610718,
      "learning_rate": 2.003523833385637e-06,
      "loss": 0.4533,
      "step": 295
    },
    {
      "epoch": 0.5792563600782779,
      "grad_norm": 2.3586318492889404,
      "learning_rate": 1.987983329837024e-06,
      "loss": 0.4257,
      "step": 296
    },
    {
      "epoch": 0.5812133072407045,
      "grad_norm": 0.85833340883255,
      "learning_rate": 1.972463450245226e-06,
      "loss": 0.4875,
      "step": 297
    },
    {
      "epoch": 0.5831702544031311,
      "grad_norm": 0.7927659749984741,
      "learning_rate": 1.956964819748731e-06,
      "loss": 0.415,
      "step": 298
    },
    {
      "epoch": 0.5851272015655578,
      "grad_norm": 0.8850895762443542,
      "learning_rate": 1.9414880626301147e-06,
      "loss": 0.409,
      "step": 299
    },
    {
      "epoch": 0.5870841487279843,
      "grad_norm": 1.509407877922058,
      "learning_rate": 1.9260338022908972e-06,
      "loss": 0.5041,
      "step": 300
    },
    {
      "epoch": 0.589041095890411,
      "grad_norm": 1.5269814729690552,
      "learning_rate": 1.9106026612264316e-06,
      "loss": 0.4222,
      "step": 301
    },
    {
      "epoch": 0.5909980430528375,
      "grad_norm": 1.386004090309143,
      "learning_rate": 1.895195261000831e-06,
      "loss": 0.5278,
      "step": 302
    },
    {
      "epoch": 0.5929549902152642,
      "grad_norm": 1.278283953666687,
      "learning_rate": 1.8798122222219288e-06,
      "loss": 0.4823,
      "step": 303
    },
    {
      "epoch": 0.5949119373776908,
      "grad_norm": 0.8502036333084106,
      "learning_rate": 1.8644541645162834e-06,
      "loss": 0.4682,
      "step": 304
    },
    {
      "epoch": 0.5968688845401174,
      "grad_norm": 0.8835340142250061,
      "learning_rate": 1.84912170650422e-06,
      "loss": 0.3474,
      "step": 305
    },
    {
      "epoch": 0.598825831702544,
      "grad_norm": 0.8175051212310791,
      "learning_rate": 1.833815465774913e-06,
      "loss": 0.4262,
      "step": 306
    },
    {
      "epoch": 0.6007827788649707,
      "grad_norm": 1.031742811203003,
      "learning_rate": 1.818536058861506e-06,
      "loss": 0.4432,
      "step": 307
    },
    {
      "epoch": 0.6027397260273972,
      "grad_norm": 0.9526416659355164,
      "learning_rate": 1.803284101216281e-06,
      "loss": 0.3981,
      "step": 308
    },
    {
      "epoch": 0.6046966731898239,
      "grad_norm": 0.9259310364723206,
      "learning_rate": 1.7880602071858694e-06,
      "loss": 0.4249,
      "step": 309
    },
    {
      "epoch": 0.6066536203522505,
      "grad_norm": 1.0978782176971436,
      "learning_rate": 1.7728649899865024e-06,
      "loss": 0.4955,
      "step": 310
    },
    {
      "epoch": 0.6086105675146771,
      "grad_norm": 0.8304716944694519,
      "learning_rate": 1.7576990616793139e-06,
      "loss": 0.4666,
      "step": 311
    },
    {
      "epoch": 0.6105675146771037,
      "grad_norm": 2.960554838180542,
      "learning_rate": 1.7425630331456821e-06,
      "loss": 0.412,
      "step": 312
    },
    {
      "epoch": 0.6125244618395304,
      "grad_norm": 0.8440503478050232,
      "learning_rate": 1.7274575140626318e-06,
      "loss": 0.4814,
      "step": 313
    },
    {
      "epoch": 0.6144814090019569,
      "grad_norm": 0.8478916883468628,
      "learning_rate": 1.7123831128782686e-06,
      "loss": 0.4708,
      "step": 314
    },
    {
      "epoch": 0.6164383561643836,
      "grad_norm": 0.8599239587783813,
      "learning_rate": 1.697340436787273e-06,
      "loss": 0.428,
      "step": 315
    },
    {
      "epoch": 0.6183953033268101,
      "grad_norm": 1.151842474937439,
      "learning_rate": 1.6823300917064462e-06,
      "loss": 0.3433,
      "step": 316
    },
    {
      "epoch": 0.6203522504892368,
      "grad_norm": 0.8544068336486816,
      "learning_rate": 1.6673526822502982e-06,
      "loss": 0.4431,
      "step": 317
    },
    {
      "epoch": 0.6223091976516634,
      "grad_norm": 1.1891425848007202,
      "learning_rate": 1.6524088117066984e-06,
      "loss": 0.4334,
      "step": 318
    },
    {
      "epoch": 0.62426614481409,
      "grad_norm": 1.1379172801971436,
      "learning_rate": 1.637499082012574e-06,
      "loss": 0.5514,
      "step": 319
    },
    {
      "epoch": 0.6262230919765166,
      "grad_norm": 1.0814030170440674,
      "learning_rate": 1.6226240937296617e-06,
      "loss": 0.4772,
      "step": 320
    },
    {
      "epoch": 0.6281800391389433,
      "grad_norm": 0.9527184963226318,
      "learning_rate": 1.6077844460203207e-06,
      "loss": 0.4292,
      "step": 321
    },
    {
      "epoch": 0.6301369863013698,
      "grad_norm": 0.9083294868469238,
      "learning_rate": 1.5929807366233979e-06,
      "loss": 0.501,
      "step": 322
    },
    {
      "epoch": 0.6320939334637965,
      "grad_norm": 1.4445644617080688,
      "learning_rate": 1.5782135618301486e-06,
      "loss": 0.4924,
      "step": 323
    },
    {
      "epoch": 0.6340508806262231,
      "grad_norm": 1.361970067024231,
      "learning_rate": 1.56348351646022e-06,
      "loss": 0.4487,
      "step": 324
    },
    {
      "epoch": 0.6360078277886497,
      "grad_norm": 0.8321120142936707,
      "learning_rate": 1.5487911938376925e-06,
      "loss": 0.4566,
      "step": 325
    },
    {
      "epoch": 0.6379647749510763,
      "grad_norm": 1.1182819604873657,
      "learning_rate": 1.5341371857671782e-06,
      "loss": 0.4253,
      "step": 326
    },
    {
      "epoch": 0.639921722113503,
      "grad_norm": 1.133865475654602,
      "learning_rate": 1.5195220825099863e-06,
      "loss": 0.4212,
      "step": 327
    },
    {
      "epoch": 0.6418786692759295,
      "grad_norm": 0.9962513446807861,
      "learning_rate": 1.5049464727603453e-06,
      "loss": 0.4702,
      "step": 328
    },
    {
      "epoch": 0.6438356164383562,
      "grad_norm": 1.1037347316741943,
      "learning_rate": 1.4904109436216885e-06,
      "loss": 0.5035,
      "step": 329
    },
    {
      "epoch": 0.6457925636007827,
      "grad_norm": 1.0168620347976685,
      "learning_rate": 1.475916080583012e-06,
      "loss": 0.4762,
      "step": 330
    },
    {
      "epoch": 0.6477495107632094,
      "grad_norm": 1.1695796251296997,
      "learning_rate": 1.4614624674952843e-06,
      "loss": 0.4313,
      "step": 331
    },
    {
      "epoch": 0.649706457925636,
      "grad_norm": 1.4158042669296265,
      "learning_rate": 1.4470506865479337e-06,
      "loss": 0.4798,
      "step": 332
    },
    {
      "epoch": 0.6516634050880626,
      "grad_norm": 0.9545938968658447,
      "learning_rate": 1.4326813182453959e-06,
      "loss": 0.4126,
      "step": 333
    },
    {
      "epoch": 0.6536203522504892,
      "grad_norm": 1.0253559350967407,
      "learning_rate": 1.4183549413837288e-06,
      "loss": 0.4633,
      "step": 334
    },
    {
      "epoch": 0.6555772994129159,
      "grad_norm": 0.9522657990455627,
      "learning_rate": 1.4040721330273063e-06,
      "loss": 0.4716,
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.6575342465753424, | |
| "grad_norm": 0.9612335562705994, | |
| "learning_rate": 1.3898334684855647e-06, | |
| "loss": 0.4699, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 0.6594911937377691, | |
| "grad_norm": 0.9365352392196655, | |
| "learning_rate": 1.375639521289836e-06, | |
| "loss": 0.4775, | |
| "step": 337 | |
| }, | |
| { | |
| "epoch": 0.6614481409001957, | |
| "grad_norm": 0.9329326152801514, | |
| "learning_rate": 1.3614908631702435e-06, | |
| "loss": 0.4236, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 0.6614481409001957, | |
| "eval_accuracy": 0.8283926414598568, | |
| "eval_accuracy_first_token": 0.9538461538461539, | |
| "eval_accuracy_first_token_all": 0.9723295718039883, | |
| "eval_accuracy_first_token_all_total": 6469, | |
| "eval_accuracy_first_token_calculate": 0.9090909090909091, | |
| "eval_accuracy_first_token_calculate_total": 44, | |
| "eval_accuracy_first_token_execute": 1.0, | |
| "eval_accuracy_first_token_execute_total": 202, | |
| "eval_accuracy_first_token_get": 0.9517543859649122, | |
| "eval_accuracy_first_token_get_total": 456, | |
| "eval_accuracy_first_token_python": 0.8838383838383839, | |
| "eval_accuracy_first_token_python_total": 990, | |
| "eval_loss": 0.5066910982131958, | |
| "eval_perplexity": 1.2021342846813718, | |
| "eval_runtime": 525.6643, | |
| "eval_samples_per_second": 1.265, | |
| "eval_steps_per_second": 0.16, | |
| "eval_total_number_first_token": 9360, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 0.6634050880626223, | |
| "grad_norm": 0.9786916375160217, | |
| "learning_rate": 1.3473880640326725e-06, | |
| "loss": 0.4361, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 0.6653620352250489, | |
| "grad_norm": 0.9186453819274902, | |
| "learning_rate": 1.3333316919358159e-06, | |
| "loss": 0.4658, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.6673189823874756, | |
| "grad_norm": 1.02847421169281, | |
| "learning_rate": 1.3193223130682937e-06, | |
| "loss": 0.4517, | |
| "step": 341 | |
| }, | |
| { | |
| "epoch": 0.6692759295499021, | |
| "grad_norm": 0.970281720161438, | |
| "learning_rate": 1.3053604917258428e-06, | |
| "loss": 0.4617, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 0.6712328767123288, | |
| "grad_norm": 0.7810537219047546, | |
| "learning_rate": 1.2914467902885902e-06, | |
| "loss": 0.4246, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 0.6731898238747553, | |
| "grad_norm": 0.9498399496078491, | |
| "learning_rate": 1.2775817691984032e-06, | |
| "loss": 0.4706, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 0.675146771037182, | |
| "grad_norm": 0.9036231637001038, | |
| "learning_rate": 1.2637659869363085e-06, | |
| "loss": 0.4826, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.6771037181996086, | |
| "grad_norm": 0.8129305243492126, | |
| "learning_rate": 1.2500000000000007e-06, | |
| "loss": 0.4163, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 0.6790606653620352, | |
| "grad_norm": 1.4746164083480835, | |
| "learning_rate": 1.2362843628814267e-06, | |
| "loss": 0.3961, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 0.6810176125244618, | |
| "grad_norm": 11.255887031555176, | |
| "learning_rate": 1.222619628044449e-06, | |
| "loss": 0.4761, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 0.6829745596868885, | |
| "grad_norm": 0.9121260643005371, | |
| "learning_rate": 1.2090063459025956e-06, | |
| "loss": 0.4277, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 0.684931506849315, | |
| "grad_norm": 0.9116764068603516, | |
| "learning_rate": 1.1954450647968856e-06, | |
| "loss": 0.4696, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.6868884540117417, | |
| "grad_norm": 1.206604242324829, | |
| "learning_rate": 1.181936330973744e-06, | |
| "loss": 0.4205, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 0.6888454011741683, | |
| "grad_norm": 0.8744117617607117, | |
| "learning_rate": 1.1684806885630003e-06, | |
| "loss": 0.5077, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 0.6908023483365949, | |
| "grad_norm": 2.155042886734009, | |
| "learning_rate": 1.155078679555969e-06, | |
| "loss": 0.4193, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 0.6927592954990215, | |
| "grad_norm": 0.9258475303649902, | |
| "learning_rate": 1.1417308437836181e-06, | |
| "loss": 0.3645, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 0.6947162426614482, | |
| "grad_norm": 0.7997338771820068, | |
| "learning_rate": 1.1284377188948258e-06, | |
| "loss": 0.4044, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.6966731898238747, | |
| "grad_norm": 0.8342923521995544, | |
| "learning_rate": 1.1151998403347245e-06, | |
| "loss": 0.4132, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 0.6986301369863014, | |
| "grad_norm": 1.0009496212005615, | |
| "learning_rate": 1.1020177413231334e-06, | |
| "loss": 0.4046, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 0.700587084148728, | |
| "grad_norm": 1.0892616510391235, | |
| "learning_rate": 1.0888919528330778e-06, | |
| "loss": 0.4878, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 0.7025440313111546, | |
| "grad_norm": 0.829866886138916, | |
| "learning_rate": 1.0758230035694031e-06, | |
| "loss": 0.4876, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 0.7045009784735812, | |
| "grad_norm": 0.9134871363639832, | |
| "learning_rate": 1.062811419947482e-06, | |
| "loss": 0.5027, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.7064579256360078, | |
| "grad_norm": 1.1233887672424316, | |
| "learning_rate": 1.049857726072005e-06, | |
| "loss": 0.3487, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 0.7084148727984344, | |
| "grad_norm": 0.8092291355133057, | |
| "learning_rate": 1.036962443715872e-06, | |
| "loss": 0.5009, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 0.7103718199608611, | |
| "grad_norm": 1.730331301689148, | |
| "learning_rate": 1.0241260922991761e-06, | |
| "loss": 0.386, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 0.7123287671232876, | |
| "grad_norm": 0.9802207946777344, | |
| "learning_rate": 1.0113491888682802e-06, | |
| "loss": 0.4209, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 0.7142857142857143, | |
| "grad_norm": 1.0146572589874268, | |
| "learning_rate": 9.986322480749926e-07, | |
| "loss": 0.6119, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.7162426614481409, | |
| "grad_norm": 0.930644154548645, | |
| "learning_rate": 9.85975782155834e-07, | |
| "loss": 0.4453, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 0.7181996086105675, | |
| "grad_norm": 1.2394402027130127, | |
| "learning_rate": 9.733803009114045e-07, | |
| "loss": 0.4364, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 0.7201565557729941, | |
| "grad_norm": 0.8096799850463867, | |
| "learning_rate": 9.608463116858544e-07, | |
| "loss": 0.3672, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 0.7221135029354208, | |
| "grad_norm": 0.9330917596817017, | |
| "learning_rate": 9.483743193464409e-07, | |
| "loss": 0.4665, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 0.7240704500978473, | |
| "grad_norm": 1.0829280614852905, | |
| "learning_rate": 9.359648262631962e-07, | |
| "loss": 0.4924, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.726027397260274, | |
| "grad_norm": 1.0950247049331665, | |
| "learning_rate": 9.236183322886946e-07, | |
| "loss": 0.4907, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 0.7279843444227005, | |
| "grad_norm": 0.8494971394538879, | |
| "learning_rate": 9.113353347379097e-07, | |
| "loss": 0.4286, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 0.7299412915851272, | |
| "grad_norm": 0.9138147830963135, | |
| "learning_rate": 8.991163283681945e-07, | |
| "loss": 0.4396, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 0.7318982387475538, | |
| "grad_norm": 1.6995892524719238, | |
| "learning_rate": 8.869618053593429e-07, | |
| "loss": 0.3989, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 0.7338551859099804, | |
| "grad_norm": 0.9424477815628052, | |
| "learning_rate": 8.748722552937688e-07, | |
| "loss": 0.4371, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.735812133072407, | |
| "grad_norm": 1.2042125463485718, | |
| "learning_rate": 8.628481651367876e-07, | |
| "loss": 0.4337, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 0.7377690802348337, | |
| "grad_norm": 0.9822342395782471, | |
| "learning_rate": 8.508900192169964e-07, | |
| "loss": 0.4329, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 0.7397260273972602, | |
| "grad_norm": 1.0332896709442139, | |
| "learning_rate": 8.389982992067688e-07, | |
| "loss": 0.4286, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 0.7416829745596869, | |
| "grad_norm": 0.8743665218353271, | |
| "learning_rate": 8.271734841028553e-07, | |
| "loss": 0.487, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 0.7436399217221135, | |
| "grad_norm": 0.9147298336029053, | |
| "learning_rate": 8.154160502070804e-07, | |
| "loss": 0.453, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.7455968688845401, | |
| "grad_norm": 1.113299012184143, | |
| "learning_rate": 8.037264711071699e-07, | |
| "loss": 0.4432, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 0.7475538160469667, | |
| "grad_norm": 0.934984564781189, | |
| "learning_rate": 7.921052176576643e-07, | |
| "loss": 0.5102, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 0.7495107632093934, | |
| "grad_norm": 0.8149503469467163, | |
| "learning_rate": 7.805527579609575e-07, | |
| "loss": 0.4834, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 0.7514677103718199, | |
| "grad_norm": 1.2893983125686646, | |
| "learning_rate": 7.690695573484433e-07, | |
| "loss": 0.3211, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 0.7534246575342466, | |
| "grad_norm": 1.0519015789031982, | |
| "learning_rate": 7.576560783617667e-07, | |
| "loss": 0.4613, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.7553816046966731, | |
| "grad_norm": 0.8619464039802551, | |
| "learning_rate": 7.463127807341966e-07, | |
| "loss": 0.4728, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 0.7573385518590998, | |
| "grad_norm": 0.890130341053009, | |
| "learning_rate": 7.35040121372109e-07, | |
| "loss": 0.4721, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 0.7592954990215264, | |
| "grad_norm": 1.1289362907409668, | |
| "learning_rate": 7.238385543365783e-07, | |
| "loss": 0.4206, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 0.761252446183953, | |
| "grad_norm": 0.8591368198394775, | |
| "learning_rate": 7.127085308250914e-07, | |
| "loss": 0.415, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 0.7632093933463796, | |
| "grad_norm": 0.9674418568611145, | |
| "learning_rate": 7.016504991533727e-07, | |
| "loss": 0.5114, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.7651663405088063, | |
| "grad_norm": 1.0890218019485474, | |
| "learning_rate": 6.906649047373246e-07, | |
| "loss": 0.3641, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 0.7671232876712328, | |
| "grad_norm": 0.9494483470916748, | |
| "learning_rate": 6.797521900750897e-07, | |
| "loss": 0.4682, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 0.7690802348336595, | |
| "grad_norm": 0.9544976949691772, | |
| "learning_rate": 6.689127947292232e-07, | |
| "loss": 0.4227, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 0.7710371819960861, | |
| "grad_norm": 2.679705858230591, | |
| "learning_rate": 6.581471553089874e-07, | |
| "loss": 0.4482, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 0.7729941291585127, | |
| "grad_norm": 0.8427915573120117, | |
| "learning_rate": 6.474557054527709e-07, | |
| "loss": 0.4048, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 0.7749510763209393, | |
| "grad_norm": 0.8168734312057495, | |
| "learning_rate": 6.368388758106134e-07, | |
| "loss": 0.377, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 0.776908023483366, | |
| "grad_norm": 1.0561057329177856, | |
| "learning_rate": 6.262970940268653e-07, | |
| "loss": 0.4315, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 0.7788649706457925, | |
| "grad_norm": 0.8930473923683167, | |
| "learning_rate": 6.158307847229594e-07, | |
| "loss": 0.5171, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 0.7808219178082192, | |
| "grad_norm": 1.0137521028518677, | |
| "learning_rate": 6.05440369480308e-07, | |
| "loss": 0.4549, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 0.7827788649706457, | |
| "grad_norm": 0.9667198061943054, | |
| "learning_rate": 5.951262668233232e-07, | |
| "loss": 0.4213, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.7847358121330724, | |
| "grad_norm": 0.7895818948745728, | |
| "learning_rate": 5.848888922025553e-07, | |
| "loss": 0.427, | |
| "step": 401 | |
| }, | |
| { | |
| "epoch": 0.786692759295499, | |
| "grad_norm": 1.007455825805664, | |
| "learning_rate": 5.747286579779607e-07, | |
| "loss": 0.4125, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 0.7886497064579256, | |
| "grad_norm": 1.8778549432754517, | |
| "learning_rate": 5.646459734022938e-07, | |
| "loss": 0.4568, | |
| "step": 403 | |
| }, | |
| { | |
| "epoch": 0.7906066536203522, | |
| "grad_norm": 0.976000964641571, | |
| "learning_rate": 5.546412446046187e-07, | |
| "loss": 0.5, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 0.7925636007827789, | |
| "grad_norm": 0.9260036945343018, | |
| "learning_rate": 5.447148745739522e-07, | |
| "loss": 0.4729, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 0.7945205479452054, | |
| "grad_norm": 0.851002037525177, | |
| "learning_rate": 5.348672631430319e-07, | |
| "loss": 0.4294, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 0.7964774951076321, | |
| "grad_norm": 0.976465106010437, | |
| "learning_rate": 5.250988069722096e-07, | |
| "loss": 0.4655, | |
| "step": 407 | |
| }, | |
| { | |
| "epoch": 0.7984344422700587, | |
| "grad_norm": 0.9321781396865845, | |
| "learning_rate": 5.154098995334769e-07, | |
| "loss": 0.3931, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 0.8003913894324853, | |
| "grad_norm": 0.8924025297164917, | |
| "learning_rate": 5.058009310946119e-07, | |
| "loss": 0.4222, | |
| "step": 409 | |
| }, | |
| { | |
| "epoch": 0.8023483365949119, | |
| "grad_norm": 0.8116724491119385, | |
| "learning_rate": 4.962722887034616e-07, | |
| "loss": 0.325, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.8043052837573386, | |
| "grad_norm": 0.9633209705352783, | |
| "learning_rate": 4.868243561723535e-07, | |
| "loss": 0.3769, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 0.8062622309197651, | |
| "grad_norm": 0.902252733707428, | |
| "learning_rate": 4.774575140626317e-07, | |
| "loss": 0.3959, | |
| "step": 412 | |
| }, | |
| { | |
| "epoch": 0.8082191780821918, | |
| "grad_norm": 0.8941038250923157, | |
| "learning_rate": 4.681721396693303e-07, | |
| "loss": 0.4998, | |
| "step": 413 | |
| }, | |
| { | |
| "epoch": 0.8101761252446184, | |
| "grad_norm": 1.213836669921875, | |
| "learning_rate": 4.589686070059762e-07, | |
| "loss": 0.5012, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 0.812133072407045, | |
| "grad_norm": 1.0174344778060913, | |
| "learning_rate": 4.4984728678952234e-07, | |
| "loss": 0.468, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 0.8140900195694716, | |
| "grad_norm": 1.8333814144134521, | |
| "learning_rate": 4.4080854642541833e-07, | |
| "loss": 0.4941, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 0.8160469667318982, | |
| "grad_norm": 1.6971678733825684, | |
| "learning_rate": 4.318527499928074e-07, | |
| "loss": 0.3649, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 0.8180039138943248, | |
| "grad_norm": 0.8866695165634155, | |
| "learning_rate": 4.229802582298634e-07, | |
| "loss": 0.4657, | |
| "step": 418 | |
| }, | |
| { | |
| "epoch": 0.8199608610567515, | |
| "grad_norm": 1.3764787912368774, | |
| "learning_rate": 4.141914285192619e-07, | |
| "loss": 0.3836, | |
| "step": 419 | |
| }, | |
| { | |
| "epoch": 0.821917808219178, | |
| "grad_norm": 0.9406548142433167, | |
| "learning_rate": 4.0548661487378184e-07, | |
| "loss": 0.497, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.8238747553816047, | |
| "grad_norm": 0.8251882195472717, | |
| "learning_rate": 3.9686616792204677e-07, | |
| "loss": 0.4032, | |
| "step": 421 | |
| }, | |
| { | |
| "epoch": 0.8258317025440313, | |
| "grad_norm": 0.8226965069770813, | |
| "learning_rate": 3.8833043489440477e-07, | |
| "loss": 0.4526, | |
| "step": 422 | |
| }, | |
| { | |
| "epoch": 0.8277886497064579, | |
| "grad_norm": 0.9033458232879639, | |
| "learning_rate": 3.798797596089351e-07, | |
| "loss": 0.4149, | |
| "step": 423 | |
| }, | |
| { | |
| "epoch": 0.8297455968688845, | |
| "grad_norm": 0.9945986866950989, | |
| "learning_rate": 3.715144824576078e-07, | |
| "loss": 0.5138, | |
| "step": 424 | |
| }, | |
| { | |
| "epoch": 0.8317025440313112, | |
| "grad_norm": 1.1671781539916992, | |
| "learning_rate": 3.632349403925664e-07, | |
| "loss": 0.4718, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.8336594911937377, | |
| "grad_norm": 1.2945449352264404, | |
| "learning_rate": 3.5504146691255736e-07, | |
| "loss": 0.4514, | |
| "step": 426 | |
| }, | |
| { | |
| "epoch": 0.8356164383561644, | |
| "grad_norm": 1.3590197563171387, | |
| "learning_rate": 3.469343920494986e-07, | |
| "loss": 0.4147, | |
| "step": 427 | |
| }, | |
| { | |
| "epoch": 0.837573385518591, | |
| "grad_norm": 0.8810437917709351, | |
| "learning_rate": 3.389140423551834e-07, | |
| "loss": 0.4462, | |
| "step": 428 | |
| }, | |
| { | |
| "epoch": 0.8395303326810176, | |
| "grad_norm": 0.9122494459152222, | |
| "learning_rate": 3.3098074088812686e-07, | |
| "loss": 0.4766, | |
| "step": 429 | |
| }, | |
| { | |
| "epoch": 0.8414872798434442, | |
| "grad_norm": 0.8525986075401306, | |
| "learning_rate": 3.2313480720055747e-07, | |
| "loss": 0.3684, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.8434442270058709, | |
| "grad_norm": 1.0988531112670898, | |
| "learning_rate": 3.153765573255377e-07, | |
| "loss": 0.4956, | |
| "step": 431 | |
| }, | |
| { | |
| "epoch": 0.8454011741682974, | |
| "grad_norm": 0.7911211848258972, | |
| "learning_rate": 3.0770630376424276e-07, | |
| "loss": 0.4842, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 0.8473581213307241, | |
| "grad_norm": 1.0055835247039795, | |
| "learning_rate": 3.0012435547336737e-07, | |
| "loss": 0.3518, | |
| "step": 433 | |
| }, | |
| { | |
| "epoch": 0.8493150684931506, | |
| "grad_norm": 1.304575800895691, | |
| "learning_rate": 2.9263101785268253e-07, | |
| "loss": 0.3509, | |
| "step": 434 | |
| }, | |
| { | |
| "epoch": 0.8512720156555773, | |
| "grad_norm": 0.9222425818443298, | |
| "learning_rate": 2.8522659273273606e-07, | |
| "loss": 0.3888, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 0.8532289628180039, | |
| "grad_norm": 0.9765827059745789, | |
| "learning_rate": 2.779113783626916e-07, | |
| "loss": 0.4616, | |
| "step": 436 | |
| }, | |
| { | |
| "epoch": 0.8551859099804305, | |
| "grad_norm": 0.972284734249115, | |
| "learning_rate": 2.7068566939831646e-07, | |
| "loss": 0.3573, | |
| "step": 437 | |
| }, | |
| { | |
| "epoch": 0.8571428571428571, | |
| "grad_norm": 0.9025648832321167, | |
| "learning_rate": 2.6354975689011576e-07, | |
| "loss": 0.4246, | |
| "step": 438 | |
| }, | |
| { | |
| "epoch": 0.8590998043052838, | |
| "grad_norm": 0.8234553933143616, | |
| "learning_rate": 2.5650392827160446e-07, | |
| "loss": 0.3739, | |
| "step": 439 | |
| }, | |
| { | |
| "epoch": 0.8610567514677103, | |
| "grad_norm": 1.1872916221618652, | |
| "learning_rate": 2.4954846734773054e-07, | |
| "loss": 0.377, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.863013698630137, | |
| "grad_norm": 0.9565138816833496, | |
| "learning_rate": 2.4268365428344737e-07, | |
| "loss": 0.5044, | |
| "step": 441 | |
| }, | |
| { | |
| "epoch": 0.8649706457925636, | |
| "grad_norm": 1.1466796398162842, | |
| "learning_rate": 2.3590976559242278e-07, | |
| "loss": 0.3848, | |
| "step": 442 | |
| }, | |
| { | |
| "epoch": 0.8669275929549902, | |
| "grad_norm": 0.9302741289138794, | |
| "learning_rate": 2.29227074125907e-07, | |
| "loss": 0.5157, | |
| "step": 443 | |
| }, | |
| { | |
| "epoch": 0.8688845401174168, | |
| "grad_norm": 0.9383424520492554, | |
| "learning_rate": 2.2263584906173723e-07, | |
| "loss": 0.4421, | |
| "step": 444 | |
| }, | |
| { | |
| "epoch": 0.8708414872798435, | |
| "grad_norm": 1.1834505796432495, | |
| "learning_rate": 2.1613635589349756e-07, | |
| "loss": 0.4172, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 0.87279843444227, | |
| "grad_norm": 0.9577175378799438, | |
| "learning_rate": 2.0972885641982605e-07, | |
| "loss": 0.4004, | |
| "step": 446 | |
| }, | |
| { | |
| "epoch": 0.8747553816046967, | |
| "grad_norm": 0.8691757321357727, | |
| "learning_rate": 2.0341360873386673e-07, | |
| "loss": 0.4321, | |
| "step": 447 | |
| }, | |
| { | |
| "epoch": 0.8767123287671232, | |
| "grad_norm": 1.0094484090805054, | |
| "learning_rate": 1.97190867212875e-07, | |
| "loss": 0.428, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 0.8786692759295499, | |
| "grad_norm": 0.8963342308998108, | |
| "learning_rate": 1.9106088250797266e-07, | |
| "loss": 0.4358, | |
| "step": 449 | |
| }, | |
| { | |
| "epoch": 0.8806262230919765, | |
| "grad_norm": 1.7301355600357056, | |
| "learning_rate": 1.8502390153404936e-07, | |
| "loss": 0.4104, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.8825831702544031, | |
| "grad_norm": 0.8558318614959717, | |
| "learning_rate": 1.790801674598186e-07, | |
| "loss": 0.4592, | |
| "step": 451 | |
| }, | |
| { | |
| "epoch": 0.8845401174168297, | |
| "grad_norm": 0.8883755207061768, | |
| "learning_rate": 1.732299196980225e-07, | |
| "loss": 0.416, | |
| "step": 452 | |
| }, | |
| { | |
| "epoch": 0.8864970645792564, | |
| "grad_norm": 1.679168701171875, | |
| "learning_rate": 1.6747339389578732e-07, | |
| "loss": 0.4899, | |
| "step": 453 | |
| }, | |
| { | |
| "epoch": 0.8884540117416829, | |
| "grad_norm": 0.8892528414726257, | |
| "learning_rate": 1.6181082192513352e-07, | |
| "loss": 0.4228, | |
| "step": 454 | |
| }, | |
| { | |
| "epoch": 0.8904109589041096, | |
| "grad_norm": 1.5113455057144165, | |
| "learning_rate": 1.5624243187363442e-07, | |
| "loss": 0.4832, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 0.8923679060665362, | |
| "grad_norm": 1.2870134115219116, | |
| "learning_rate": 1.507684480352292e-07, | |
| "loss": 0.4141, | |
| "step": 456 | |
| }, | |
| { | |
| "epoch": 0.8943248532289628, | |
| "grad_norm": 1.6229395866394043, | |
| "learning_rate": 1.4538909090118846e-07, | |
| "loss": 0.4619, | |
| "step": 457 | |
| }, | |
| { | |
| "epoch": 0.8962818003913894, | |
| "grad_norm": 0.8794851899147034, | |
| "learning_rate": 1.4010457715123355e-07, | |
| "loss": 0.3665, | |
| "step": 458 | |
| }, | |
| { | |
| "epoch": 0.898238747553816, | |
| "grad_norm": 0.8392042517662048, | |
| "learning_rate": 1.3491511964480703e-07, | |
| "loss": 0.4389, | |
| "step": 459 | |
| }, | |
| { | |
| "epoch": 0.9001956947162426, | |
| "grad_norm": 1.3040436506271362, | |
| "learning_rate": 1.2982092741250145e-07, | |
| "loss": 0.3347, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.9021526418786693, | |
| "grad_norm": 2.594942331314087, | |
| "learning_rate": 1.2482220564763669e-07, | |
| "loss": 0.3493, | |
| "step": 461 | |
| }, | |
| { | |
| "epoch": 0.9041095890410958, | |
| "grad_norm": 1.2146382331848145, | |
| "learning_rate": 1.1991915569799645e-07, | |
| "loss": 0.487, | |
| "step": 462 | |
| }, | |
| { | |
| "epoch": 0.9060665362035225, | |
| "grad_norm": 0.9857767224311829, | |
| "learning_rate": 1.1511197505771843e-07, | |
| "loss": 0.3678, | |
| "step": 463 | |
| }, | |
| { | |
| "epoch": 0.9080234833659491, | |
| "grad_norm": 0.9433605670928955, | |
| "learning_rate": 1.1040085735933681e-07, | |
| "loss": 0.4477, | |
| "step": 464 | |
| }, | |
| { | |
| "epoch": 0.9099804305283757, | |
| "grad_norm": 1.0072382688522339, | |
| "learning_rate": 1.0578599236598708e-07, | |
| "loss": 0.4258, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 0.9119373776908023, | |
| "grad_norm": 0.977323591709137, | |
| "learning_rate": 1.0126756596375687e-07, | |
| "loss": 0.4071, | |
| "step": 466 | |
| }, | |
| { | |
| "epoch": 0.913894324853229, | |
| "grad_norm": 0.924149751663208, | |
| "learning_rate": 9.684576015420277e-08, | |
| "loss": 0.477, | |
| "step": 467 | |
| }, | |
| { | |
| "epoch": 0.9158512720156555, | |
| "grad_norm": 0.8529196381568909, | |
| "learning_rate": 9.252075304701929e-08, | |
| "loss": 0.4513, | |
| "step": 468 | |
| }, | |
| { | |
| "epoch": 0.9178082191780822, | |
| "grad_norm": 1.0572128295898438, | |
| "learning_rate": 8.829271885286095e-08, | |
| "loss": 0.4472, | |
| "step": 469 | |
| }, | |
| { | |
| "epoch": 0.9197651663405088, | |
| "grad_norm": 3.0187559127807617, | |
| "learning_rate": 8.416182787632871e-08, | |
| "loss": 0.3696, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.9217221135029354, | |
| "grad_norm": 1.2419676780700684, | |
| "learning_rate": 8.012824650910938e-08, | |
| "loss": 0.3411, | |
| "step": 471 | |
| }, | |
| { | |
| "epoch": 0.923679060665362, | |
| "grad_norm": 0.8936371803283691, | |
| "learning_rate": 7.619213722327184e-08, | |
| "loss": 0.4494, | |
| "step": 472 | |
| }, | |
| { | |
| "epoch": 0.9256360078277887, | |
| "grad_norm": 1.0433343648910522, | |
| "learning_rate": 7.235365856472443e-08, | |
| "loss": 0.4545, | |
| "step": 473 | |
| }, | |
| { | |
| "epoch": 0.9275929549902152, | |
| "grad_norm": 0.9922037720680237, | |
| "learning_rate": 6.86129651468273e-08, | |
| "loss": 0.4118, | |
| "step": 474 | |
| }, | |
| { | |
| "epoch": 0.9295499021526419, | |
| "grad_norm": 0.8298634886741638, | |
| "learning_rate": 6.497020764416633e-08, | |
| "loss": 0.4768, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 0.9315068493150684, | |
| "grad_norm": 0.8023221492767334, | |
| "learning_rate": 6.142553278648239e-08, | |
| "loss": 0.4451, | |
| "step": 476 | |
| }, | |
| { | |
| "epoch": 0.9334637964774951, | |
| "grad_norm": 0.828525960445404, | |
| "learning_rate": 5.7979083352762146e-08, | |
| "loss": 0.3043, | |
| "step": 477 | |
| }, | |
| { | |
| "epoch": 0.9354207436399217, | |
| "grad_norm": 1.59126615524292, | |
| "learning_rate": 5.463099816548578e-08, | |
| "loss": 0.3771, | |
| "step": 478 | |
| }, | |
| { | |
| "epoch": 0.9373776908023483, | |
| "grad_norm": 1.2710837125778198, | |
| "learning_rate": 5.1381412085036994e-08, | |
| "loss": 0.4743, | |
| "step": 479 | |
| }, | |
| { | |
| "epoch": 0.9393346379647749, | |
| "grad_norm": 0.953567624092102, | |
| "learning_rate": 4.823045600426901e-08, | |
| "loss": 0.4077, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.9412915851272016, | |
| "grad_norm": 0.9778720736503601, | |
| "learning_rate": 4.5178256843233235e-08, | |
| "loss": 0.4112, | |
| "step": 481 | |
| }, | |
| { | |
| "epoch": 0.9432485322896281, | |
| "grad_norm": 0.8094834685325623, | |
| "learning_rate": 4.2224937544067254e-08, | |
| "loss": 0.4878, | |
| "step": 482 | |
| }, | |
| { | |
| "epoch": 0.9452054794520548, | |
| "grad_norm": 0.8327929377555847, | |
| "learning_rate": 3.9370617066040726e-08, | |
| "loss": 0.3676, | |
| "step": 483 | |
| }, | |
| { | |
| "epoch": 0.9471624266144814, | |
| "grad_norm": 0.8924036622047424, | |
| "learning_rate": 3.661541038076755e-08, | |
| "loss": 0.3628, | |
| "step": 484 | |
| }, | |
| { | |
| "epoch": 0.949119373776908, | |
| "grad_norm": 1.062476634979248, | |
| "learning_rate": 3.395942846757067e-08, | |
| "loss": 0.3709, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 0.9510763209393346, | |
| "grad_norm": 0.9672690033912659, | |
| "learning_rate": 3.1402778309014284e-08, | |
| "loss": 0.4846, | |
| "step": 486 | |
| }, | |
| { | |
| "epoch": 0.9530332681017613, | |
| "grad_norm": 0.9301928281784058, | |
| "learning_rate": 2.8945562886593948e-08, | |
| "loss": 0.4465, | |
| "step": 487 | |
| }, | |
| { | |
| "epoch": 0.9549902152641878, | |
| "grad_norm": 1.6346007585525513, | |
| "learning_rate": 2.6587881176588782e-08, | |
| "loss": 0.3958, | |
| "step": 488 | |
| }, | |
| { | |
| "epoch": 0.9569471624266145, | |
| "grad_norm": 0.9479952454566956, | |
| "learning_rate": 2.4329828146074096e-08, | |
| "loss": 0.3922, | |
| "step": 489 | |
| }, | |
| { | |
| "epoch": 0.958904109589041, | |
| "grad_norm": 1.1471753120422363, | |
| "learning_rate": 2.2171494749097243e-08, | |
| "loss": 0.462, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.9608610567514677, | |
| "grad_norm": 0.9728820323944092, | |
| "learning_rate": 2.011296792301165e-08, | |
| "loss": 0.4206, | |
| "step": 491 | |
| }, | |
| { | |
| "epoch": 0.9628180039138943, | |
| "grad_norm": 0.8930822014808655, | |
| "learning_rate": 1.8154330584978785e-08, | |
| "loss": 0.4664, | |
| "step": 492 | |
| }, | |
| { | |
| "epoch": 0.9647749510763209, | |
| "grad_norm": 1.0260281562805176, | |
| "learning_rate": 1.629566162862445e-08, | |
| "loss": 0.4395, | |
| "step": 493 | |
| }, | |
| { | |
| "epoch": 0.9667318982387475, | |
| "grad_norm": 1.2178572416305542, | |
| "learning_rate": 1.453703592086353e-08, | |
| "loss": 0.4311, | |
| "step": 494 | |
| }, | |
| { | |
| "epoch": 0.9686888454011742, | |
| "grad_norm": 0.8803574442863464, | |
| "learning_rate": 1.28785242988827e-08, | |
| "loss": 0.4175, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 0.9706457925636007, | |
| "grad_norm": 0.9738378524780273, | |
| "learning_rate": 1.132019356728853e-08, | |
| "loss": 0.4419, | |
| "step": 496 | |
| }, | |
| { | |
| "epoch": 0.9726027397260274, | |
| "grad_norm": 0.8282538056373596, | |
| "learning_rate": 9.862106495415469e-09, | |
| "loss": 0.4128, | |
| "step": 497 | |
| }, | |
| { | |
| "epoch": 0.974559686888454, | |
| "grad_norm": 1.130934715270996, | |
| "learning_rate": 8.504321814798433e-09, | |
| "loss": 0.3772, | |
| "step": 498 | |
| }, | |
| { | |
| "epoch": 0.9765166340508806, | |
| "grad_norm": 2.3474204540252686, | |
| "learning_rate": 7.246894216806355e-09, | |
| "loss": 0.4271, | |
| "step": 499 | |
| }, | |
| { | |
| "epoch": 0.9784735812133072, | |
| "grad_norm": 0.9170702695846558, | |
| "learning_rate": 6.089874350439507e-09, | |
| "loss": 0.4163, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.9804305283757339, | |
| "grad_norm": 1.3329914808273315, | |
| "learning_rate": 5.033308820289185e-09, | |
| "loss": 0.4318, | |
| "step": 501 | |
| }, | |
| { | |
| "epoch": 0.9823874755381604, | |
| "grad_norm": 0.9551968574523926, | |
| "learning_rate": 4.07724018466088e-09, | |
| "loss": 0.3538, | |
| "step": 502 | |
| }, | |
| { | |
| "epoch": 0.9843444227005871, | |
| "grad_norm": 0.9140384197235107, | |
| "learning_rate": 3.2217069538600932e-09, | |
| "loss": 0.4503, | |
| "step": 503 | |
| }, | |
| { | |
| "epoch": 0.9863013698630136, | |
| "grad_norm": 1.072695016860962, | |
| "learning_rate": 2.4667435886402414e-09, | |
| "loss": 0.4374, | |
| "step": 504 | |
| }, | |
| { | |
| "epoch": 0.9882583170254403, | |
| "grad_norm": 0.8060042262077332, | |
| "learning_rate": 1.8123804988159909e-09, | |
| "loss": 0.4142, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 0.9902152641878669, | |
| "grad_norm": 1.2433676719665527, | |
| "learning_rate": 1.2586440420372936e-09, | |
| "loss": 0.4401, | |
| "step": 506 | |
| }, | |
| { | |
| "epoch": 0.9921722113502935, | |
| "grad_norm": 1.1050037145614624, | |
| "learning_rate": 8.0555652272718e-10, | |
| "loss": 0.4379, | |
| "step": 507 | |
| }, | |
| { | |
| "epoch": 0.9921722113502935, | |
| "eval_accuracy": 0.8291501000753674, | |
| "eval_accuracy_first_token": 0.9575854700854701, | |
| "eval_accuracy_first_token_all": 0.9726387385994744, | |
| "eval_accuracy_first_token_all_total": 6469, | |
| "eval_accuracy_first_token_calculate": 0.9090909090909091, | |
| "eval_accuracy_first_token_calculate_total": 44, | |
| "eval_accuracy_first_token_execute": 1.0, | |
| "eval_accuracy_first_token_execute_total": 202, | |
| "eval_accuracy_first_token_get": 0.956140350877193, | |
| "eval_accuracy_first_token_get_total": 456, | |
| "eval_accuracy_first_token_python": 0.8909090909090909, | |
| "eval_accuracy_first_token_python_total": 990, | |
| "eval_loss": 0.5047600269317627, | |
| "eval_perplexity": 1.201347285698878, | |
| "eval_runtime": 525.3078, | |
| "eval_samples_per_second": 1.266, | |
| "eval_steps_per_second": 0.16, | |
| "eval_total_number_first_token": 9360, | |
| "step": 507 | |
| }, | |
| { | |
| "epoch": 0.9941291585127201, | |
| "grad_norm": 0.9681710004806519, | |
| "learning_rate": 4.5313619118553256e-10, | |
| "loss": 0.4287, | |
| "step": 508 | |
| }, | |
| { | |
| "epoch": 0.9960861056751468, | |
| "grad_norm": 0.8318100571632385, | |
| "learning_rate": 2.0139724285161976e-10, | |
| "loss": 0.4405, | |
| "step": 509 | |
| }, | |
| { | |
| "epoch": 0.9980430528375733, | |
| "grad_norm": 0.8928787708282471, | |
| "learning_rate": 5.0349817733719165e-11, | |
| "loss": 0.3779, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 2.4657742977142334, | |
| "learning_rate": 0.0, | |
| "loss": 0.4483, | |
| "step": 511 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "step": 511, | |
| "total_flos": 529508264312832.0, | |
| "train_loss": 0.47377988486140676, | |
| "train_runtime": 61575.0259, | |
| "train_samples_per_second": 0.133, | |
| "train_steps_per_second": 0.008 | |
| } | |
| ], | |
| "logging_steps": 1.0, | |
| "max_steps": 511, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 5.0, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": false, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 529508264312832.0, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
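
The file above is the complete `trainer_state.json` that the Hugging Face Trainer writes alongside a checkpoint. As a minimal sketch of how this log can be consumed — assuming the standard filename `trainer_state.json` and using `matplotlib` for plotting, neither of which the file itself specifies — the snippet below separates the per-step training entries (which carry a `"loss"` key) from the periodic evaluation entries (which carry `"eval_loss"`) inside `"log_history"` and plots both against `"step"`:

```python
# Minimal sketch: parse a Hugging Face trainer_state.json and plot the
# loss curves. The filename and matplotlib usage are assumptions; the
# key names ("log_history", "loss", "eval_loss", "step") match the
# entries recorded in the file above.
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

# Training-step entries log "loss"; eval entries log "eval_loss" instead,
# and the final summary entry logs "train_loss", so both filters below
# exclude it.
train_entries = [e for e in state["log_history"] if "loss" in e]
eval_entries = [e for e in state["log_history"] if "eval_loss" in e]

plt.plot(
    [e["step"] for e in train_entries],
    [e["loss"] for e in train_entries],
    label="train loss",
)
plt.scatter(
    [e["step"] for e in eval_entries],
    [e["eval_loss"] for e in eval_entries],
    color="red",
    label="eval loss",
)
plt.xlabel("step")
plt.ylabel("loss")
plt.legend()
plt.show()
```

For this run that yields 511 training points and two eval points (steps 338 and 507, with eval loss 0.5067 and 0.5048 respectively), consistent with the recorded `eval_steps` of 169.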