| { | |
| "best_metric": 1.2795084714889526, | |
| "best_model_checkpoint": "saved_model/c2s_jun2024/checkpoint-9692", | |
| "epoch": 2.0, | |
| "eval_steps": 500, | |
| "global_step": 9692, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0, | |
|     "grad_norm": null, | |
| "learning_rate": 0.0, | |
| "loss": 72.6113, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.0, | |
| "grad_norm": 18.23219108581543, | |
| "learning_rate": 2.5e-06, | |
| "loss": 74.5495, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.0, | |
| "grad_norm": 16.582162857055664, | |
| "learning_rate": 7.5e-06, | |
| "loss": 73.7367, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 14.804972648620605, | |
| "learning_rate": 1.2e-05, | |
| "loss": 72.8853, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 13.634269714355469, | |
| "learning_rate": 1.7000000000000003e-05, | |
| "loss": 70.9592, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 13.762855529785156, | |
| "learning_rate": 2.2000000000000003e-05, | |
| "loss": 66.9603, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 16.27646827697754, | |
| "learning_rate": 2.7000000000000002e-05, | |
| "loss": 61.4318, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 27.16312026977539, | |
| "learning_rate": 3.15e-05, | |
| "loss": 53.3651, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 28.43309783935547, | |
| "learning_rate": 3.65e-05, | |
| "loss": 33.9745, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 12.292057991027832, | |
| "learning_rate": 4.15e-05, | |
| "loss": 13.4627, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 9.148832321166992, | |
| "learning_rate": 4.6500000000000005e-05, | |
| "loss": 6.8387, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 6.579999923706055, | |
| "learning_rate": 5.1500000000000005e-05, | |
| "loss": 4.7847, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 6.650771141052246, | |
| "learning_rate": 5.65e-05, | |
| "loss": 4.1684, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 6.5379791259765625, | |
| "learning_rate": 6.15e-05, | |
| "loss": 3.8221, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 6.095062732696533, | |
| "learning_rate": 6.65e-05, | |
| "loss": 3.5635, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 7.0580973625183105, | |
| "learning_rate": 7.15e-05, | |
| "loss": 3.4446, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 6.517209053039551, | |
| "learning_rate": 7.65e-05, | |
| "loss": 3.2972, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 5.954787731170654, | |
| "learning_rate": 8.15e-05, | |
| "loss": 3.2621, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 6.085761547088623, | |
| "learning_rate": 8.65e-05, | |
| "loss": 3.2072, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 6.4346442222595215, | |
| "learning_rate": 9.15e-05, | |
| "loss": 3.0868, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 6.535578727722168, | |
| "learning_rate": 9.65e-05, | |
| "loss": 3.0201, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 5.239222526550293, | |
| "learning_rate": 9.999378367177788e-05, | |
| "loss": 2.9792, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 5.576033592224121, | |
| "learning_rate": 9.997306257770411e-05, | |
| "loss": 3.0079, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 5.455887794494629, | |
| "learning_rate": 9.995234148363033e-05, | |
| "loss": 2.8296, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 4.566660404205322, | |
| "learning_rate": 9.993162038955657e-05, | |
| "loss": 2.7655, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 4.954742908477783, | |
| "learning_rate": 9.99108992954828e-05, | |
| "loss": 2.5655, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 3.5510752201080322, | |
| "learning_rate": 9.989017820140904e-05, | |
| "loss": 2.4527, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 3.358351230621338, | |
| "learning_rate": 9.986945710733528e-05, | |
| "loss": 2.2679, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 2.9349524974823, | |
| "learning_rate": 9.98487360132615e-05, | |
| "loss": 2.1456, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 3.3249402046203613, | |
| "learning_rate": 9.982801491918775e-05, | |
| "loss": 2.0943, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 2.92372989654541, | |
| "learning_rate": 9.980729382511397e-05, | |
| "loss": 2.0194, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 3.124359130859375, | |
| "learning_rate": 9.97865727310402e-05, | |
| "loss": 1.9523, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 2.372561454772949, | |
| "learning_rate": 9.976585163696644e-05, | |
| "loss": 1.905, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 2.5799174308776855, | |
| "learning_rate": 9.974513054289267e-05, | |
| "loss": 1.9159, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 2.1826956272125244, | |
| "learning_rate": 9.97244094488189e-05, | |
| "loss": 1.8362, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 2.1002371311187744, | |
| "learning_rate": 9.970368835474514e-05, | |
| "loss": 1.844, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 3.1345527172088623, | |
| "learning_rate": 9.968296726067136e-05, | |
| "loss": 1.8084, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 1.9457321166992188, | |
| "learning_rate": 9.96622461665976e-05, | |
| "loss": 1.7775, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 1.9511795043945312, | |
| "learning_rate": 9.964152507252383e-05, | |
| "loss": 1.7872, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 1.9775121212005615, | |
| "learning_rate": 9.962080397845007e-05, | |
| "loss": 1.7665, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 1.9561394453048706, | |
| "learning_rate": 9.96000828843763e-05, | |
| "loss": 1.7664, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 2.7436013221740723, | |
| "learning_rate": 9.957936179030253e-05, | |
| "loss": 1.7016, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 1.6739649772644043, | |
| "learning_rate": 9.955864069622876e-05, | |
| "loss": 1.7219, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 1.940246343612671, | |
| "learning_rate": 9.9537919602155e-05, | |
| "loss": 1.7174, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 1.8286395072937012, | |
| "learning_rate": 9.951719850808123e-05, | |
| "loss": 1.6698, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 2.0042293071746826, | |
| "learning_rate": 9.949647741400747e-05, | |
| "loss": 1.6908, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 1.6445887088775635, | |
| "learning_rate": 9.94757563199337e-05, | |
| "loss": 1.6796, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 2.068713903427124, | |
| "learning_rate": 9.945503522585992e-05, | |
| "loss": 1.6685, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 1.8053257465362549, | |
| "learning_rate": 9.943431413178617e-05, | |
| "loss": 1.6522, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 1.580461859703064, | |
| "learning_rate": 9.94135930377124e-05, | |
| "loss": 1.6425, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 1.607007384300232, | |
| "learning_rate": 9.939287194363863e-05, | |
| "loss": 1.632, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 1.649885654449463, | |
| "learning_rate": 9.937215084956486e-05, | |
| "loss": 1.5966, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 1.7667235136032104, | |
| "learning_rate": 9.93514297554911e-05, | |
| "loss": 1.5942, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 1.595691442489624, | |
| "learning_rate": 9.933070866141732e-05, | |
| "loss": 1.609, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 1.5232254266738892, | |
| "learning_rate": 9.930998756734357e-05, | |
| "loss": 1.5614, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 1.4872910976409912, | |
| "learning_rate": 9.928926647326979e-05, | |
| "loss": 1.5657, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 1.609491229057312, | |
| "learning_rate": 9.926854537919603e-05, | |
| "loss": 1.5935, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 1.6403166055679321, | |
| "learning_rate": 9.924782428512226e-05, | |
| "loss": 1.6159, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 1.6648396253585815, | |
| "learning_rate": 9.922710319104848e-05, | |
| "loss": 1.6012, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 1.6322458982467651, | |
| "learning_rate": 9.920638209697473e-05, | |
| "loss": 1.5541, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 1.5503164529800415, | |
| "learning_rate": 9.918566100290095e-05, | |
| "loss": 1.5733, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 1.6093209981918335, | |
| "learning_rate": 9.916493990882719e-05, | |
| "loss": 1.5144, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 1.6871626377105713, | |
| "learning_rate": 9.914421881475342e-05, | |
| "loss": 1.573, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 1.7600977420806885, | |
| "learning_rate": 9.912349772067966e-05, | |
| "loss": 1.5577, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 1.4892425537109375, | |
| "learning_rate": 9.910277662660588e-05, | |
| "loss": 1.5751, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 1.5667476654052734, | |
| "learning_rate": 9.908205553253213e-05, | |
| "loss": 1.5298, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 1.3411659002304077, | |
| "learning_rate": 9.906133443845835e-05, | |
| "loss": 1.5409, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 1.5329233407974243, | |
| "learning_rate": 9.904061334438459e-05, | |
| "loss": 1.5165, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 1.5168925523757935, | |
| "learning_rate": 9.901989225031082e-05, | |
| "loss": 1.5222, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 1.6860578060150146, | |
| "learning_rate": 9.899917115623706e-05, | |
| "loss": 1.5179, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 1.4629698991775513, | |
| "learning_rate": 9.897845006216329e-05, | |
| "loss": 1.5593, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 1.3701924085617065, | |
| "learning_rate": 9.895772896808953e-05, | |
| "loss": 1.52, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 1.4276106357574463, | |
| "learning_rate": 9.893700787401575e-05, | |
| "loss": 1.5546, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 1.5609627962112427, | |
| "learning_rate": 9.8916286779942e-05, | |
| "loss": 1.5071, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 1.9602493047714233, | |
| "learning_rate": 9.889556568586822e-05, | |
| "loss": 1.5192, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 1.4681726694107056, | |
| "learning_rate": 9.887484459179444e-05, | |
| "loss": 1.5065, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 1.547143816947937, | |
| "learning_rate": 9.885412349772069e-05, | |
| "loss": 1.5303, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 1.7585084438323975, | |
| "learning_rate": 9.883340240364691e-05, | |
| "loss": 1.5412, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 1.4589301347732544, | |
| "learning_rate": 9.881268130957315e-05, | |
| "loss": 1.5008, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 1.5748664140701294, | |
| "learning_rate": 9.879196021549938e-05, | |
| "loss": 1.4856, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 1.4392333030700684, | |
| "learning_rate": 9.877123912142562e-05, | |
| "loss": 1.4593, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 1.439276933670044, | |
| "learning_rate": 9.875051802735185e-05, | |
| "loss": 1.4565, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 1.5028575658798218, | |
| "learning_rate": 9.872979693327809e-05, | |
| "loss": 1.5106, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 1.5902388095855713, | |
| "learning_rate": 9.870907583920431e-05, | |
| "loss": 1.459, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 1.5270620584487915, | |
| "learning_rate": 9.868835474513056e-05, | |
| "loss": 1.4705, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 1.354683518409729, | |
| "learning_rate": 9.866763365105678e-05, | |
| "loss": 1.4468, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 1.3697203397750854, | |
| "learning_rate": 9.864691255698301e-05, | |
| "loss": 1.4669, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 1.5006585121154785, | |
| "learning_rate": 9.862619146290925e-05, | |
| "loss": 1.4641, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 1.3566001653671265, | |
| "learning_rate": 9.860547036883548e-05, | |
| "loss": 1.4545, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 1.3500274419784546, | |
| "learning_rate": 9.85847492747617e-05, | |
| "loss": 1.477, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 1.3306142091751099, | |
| "learning_rate": 9.856402818068794e-05, | |
| "loss": 1.4469, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 1.2983628511428833, | |
| "learning_rate": 9.854330708661418e-05, | |
| "loss": 1.4603, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 1.3828344345092773, | |
| "learning_rate": 9.852258599254041e-05, | |
| "loss": 1.4686, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 1.427741527557373, | |
| "learning_rate": 9.850186489846665e-05, | |
| "loss": 1.4756, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 1.267404556274414, | |
| "learning_rate": 9.848114380439287e-05, | |
| "loss": 1.4777, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 1.3213374614715576, | |
| "learning_rate": 9.846042271031912e-05, | |
| "loss": 1.4526, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 1.6813840866088867, | |
| "learning_rate": 9.843970161624534e-05, | |
| "loss": 1.49, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 1.2110322713851929, | |
| "learning_rate": 9.841898052217157e-05, | |
| "loss": 1.4796, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 1.3316526412963867, | |
| "learning_rate": 9.839825942809781e-05, | |
| "loss": 1.4523, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 1.313766598701477, | |
| "learning_rate": 9.837753833402404e-05, | |
| "loss": 1.4195, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 1.4528905153274536, | |
| "learning_rate": 9.835681723995028e-05, | |
| "loss": 1.4433, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 1.3782751560211182, | |
| "learning_rate": 9.833609614587651e-05, | |
| "loss": 1.4673, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 1.5674275159835815, | |
| "learning_rate": 9.831537505180273e-05, | |
| "loss": 1.4296, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 1.3901402950286865, | |
| "learning_rate": 9.829465395772898e-05, | |
| "loss": 1.4516, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 1.1594748497009277, | |
| "learning_rate": 9.82739328636552e-05, | |
| "loss": 1.4225, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 1.2297048568725586, | |
| "learning_rate": 9.825321176958144e-05, | |
| "loss": 1.4416, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 1.1866023540496826, | |
| "learning_rate": 9.823249067550768e-05, | |
| "loss": 1.444, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 1.407461404800415, | |
| "learning_rate": 9.82117695814339e-05, | |
| "loss": 1.4415, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 1.302164912223816, | |
| "learning_rate": 9.819104848736013e-05, | |
| "loss": 1.4405, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 1.3704490661621094, | |
| "learning_rate": 9.817032739328637e-05, | |
| "loss": 1.4408, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 1.2673710584640503, | |
| "learning_rate": 9.81496062992126e-05, | |
| "loss": 1.4221, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 1.3337206840515137, | |
| "learning_rate": 9.812888520513884e-05, | |
| "loss": 1.4193, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 1.3280502557754517, | |
| "learning_rate": 9.810816411106507e-05, | |
| "loss": 1.4736, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 1.2532864809036255, | |
| "learning_rate": 9.80874430169913e-05, | |
| "loss": 1.4665, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 1.2475242614746094, | |
| "learning_rate": 9.806672192291754e-05, | |
| "loss": 1.426, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 1.7034567594528198, | |
| "learning_rate": 9.804600082884376e-05, | |
| "loss": 1.4473, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 1.3586080074310303, | |
| "learning_rate": 9.802527973477e-05, | |
| "loss": 1.3959, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 1.2611415386199951, | |
| "learning_rate": 9.800455864069623e-05, | |
| "loss": 1.4401, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 1.3101681470870972, | |
| "learning_rate": 9.798383754662247e-05, | |
| "loss": 1.4431, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 1.1770988702774048, | |
| "learning_rate": 9.796311645254869e-05, | |
| "loss": 1.4108, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 1.2325702905654907, | |
| "learning_rate": 9.794239535847494e-05, | |
| "loss": 1.4141, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 1.2543164491653442, | |
| "learning_rate": 9.792167426440116e-05, | |
| "loss": 1.4133, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 1.1258199214935303, | |
| "learning_rate": 9.79009531703274e-05, | |
| "loss": 1.4041, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 1.3423455953598022, | |
| "learning_rate": 9.788023207625363e-05, | |
| "loss": 1.4144, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 1.248947024345398, | |
| "learning_rate": 9.785951098217985e-05, | |
| "loss": 1.4043, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 1.129650354385376, | |
| "learning_rate": 9.78387898881061e-05, | |
| "loss": 1.4216, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 1.2218910455703735, | |
| "learning_rate": 9.781806879403232e-05, | |
| "loss": 1.3976, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 1.153981328010559, | |
| "learning_rate": 9.779734769995856e-05, | |
| "loss": 1.4304, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 1.1724766492843628, | |
| "learning_rate": 9.77766266058848e-05, | |
| "loss": 1.43, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 1.2830730676651, | |
| "learning_rate": 9.775590551181103e-05, | |
| "loss": 1.4429, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 1.2320913076400757, | |
| "learning_rate": 9.773518441773725e-05, | |
| "loss": 1.3898, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 1.2313491106033325, | |
| "learning_rate": 9.77144633236635e-05, | |
| "loss": 1.4273, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 1.1946086883544922, | |
| "learning_rate": 9.769374222958972e-05, | |
| "loss": 1.4234, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 1.127300500869751, | |
| "learning_rate": 9.767302113551596e-05, | |
| "loss": 1.4144, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 1.4888228178024292, | |
| "learning_rate": 9.765230004144219e-05, | |
| "loss": 1.4092, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 1.3795928955078125, | |
| "learning_rate": 9.763157894736843e-05, | |
| "loss": 1.3647, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 1.1433610916137695, | |
| "learning_rate": 9.761085785329466e-05, | |
| "loss": 1.415, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 1.040281891822815, | |
| "learning_rate": 9.75901367592209e-05, | |
| "loss": 1.4244, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 1.1311726570129395, | |
| "learning_rate": 9.756941566514712e-05, | |
| "loss": 1.3852, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 1.2847346067428589, | |
| "learning_rate": 9.754869457107337e-05, | |
| "loss": 1.4225, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 1.2235894203186035, | |
| "learning_rate": 9.752797347699959e-05, | |
| "loss": 1.3973, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 1.1802481412887573, | |
| "learning_rate": 9.750725238292582e-05, | |
| "loss": 1.3923, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 1.141739010810852, | |
| "learning_rate": 9.748653128885206e-05, | |
| "loss": 1.4049, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 1.2155243158340454, | |
| "learning_rate": 9.746581019477828e-05, | |
| "loss": 1.3866, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 1.4717819690704346, | |
| "learning_rate": 9.744508910070453e-05, | |
| "loss": 1.4264, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 1.1440094709396362, | |
| "learning_rate": 9.742436800663075e-05, | |
| "loss": 1.4291, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 1.3254936933517456, | |
| "learning_rate": 9.740364691255699e-05, | |
| "loss": 1.3973, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 1.2041431665420532, | |
| "learning_rate": 9.738292581848322e-05, | |
| "loss": 1.3779, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 1.1422394514083862, | |
| "learning_rate": 9.736220472440946e-05, | |
| "loss": 1.3918, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 1.2341557741165161, | |
| "learning_rate": 9.734148363033568e-05, | |
| "loss": 1.4065, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 1.1723967790603638, | |
| "learning_rate": 9.732076253626193e-05, | |
| "loss": 1.4003, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 1.279010534286499, | |
| "learning_rate": 9.730004144218815e-05, | |
| "loss": 1.3762, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 1.2639541625976562, | |
| "learning_rate": 9.727932034811438e-05, | |
| "loss": 1.3932, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 1.1406500339508057, | |
| "learning_rate": 9.725859925404062e-05, | |
| "loss": 1.4318, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 1.1991297006607056, | |
| "learning_rate": 9.723787815996685e-05, | |
| "loss": 1.3742, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 1.1058017015457153, | |
| "learning_rate": 9.721715706589309e-05, | |
| "loss": 1.3975, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 1.3658838272094727, | |
| "learning_rate": 9.719643597181932e-05, | |
| "loss": 1.4245, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 1.0663561820983887, | |
| "learning_rate": 9.717571487774555e-05, | |
| "loss": 1.3779, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 1.1523654460906982, | |
| "learning_rate": 9.715499378367178e-05, | |
| "loss": 1.4306, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 1.223913311958313, | |
| "learning_rate": 9.713427268959802e-05, | |
| "loss": 1.3748, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 1.0876872539520264, | |
| "learning_rate": 9.711355159552424e-05, | |
| "loss": 1.3806, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 1.3317033052444458, | |
| "learning_rate": 9.709283050145049e-05, | |
| "loss": 1.3586, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 1.2402222156524658, | |
| "learning_rate": 9.707210940737671e-05, | |
| "loss": 1.3886, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 1.1467841863632202, | |
| "learning_rate": 9.705138831330294e-05, | |
| "loss": 1.3634, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 1.1589218378067017, | |
| "learning_rate": 9.703066721922918e-05, | |
| "loss": 1.3466, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 0.9369345307350159, | |
| "learning_rate": 9.700994612515541e-05, | |
| "loss": 1.3819, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 1.0450528860092163, | |
| "learning_rate": 9.698922503108165e-05, | |
| "loss": 1.3482, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 1.0236886739730835, | |
| "learning_rate": 9.696850393700788e-05, | |
| "loss": 1.3468, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 1.0324066877365112, | |
| "learning_rate": 9.69477828429341e-05, | |
| "loss": 1.3926, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 1.1705087423324585, | |
| "learning_rate": 9.692706174886035e-05, | |
| "loss": 1.3547, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 1.1479854583740234, | |
| "learning_rate": 9.690634065478658e-05, | |
| "loss": 1.3517, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 1.1700282096862793, | |
| "learning_rate": 9.688561956071281e-05, | |
| "loss": 1.3635, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 1.079822301864624, | |
| "learning_rate": 9.686489846663905e-05, | |
| "loss": 1.3878, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 1.188466191291809, | |
| "learning_rate": 9.684417737256528e-05, | |
| "loss": 1.36, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 1.1050995588302612, | |
| "learning_rate": 9.68234562784915e-05, | |
| "loss": 1.3513, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 1.2480050325393677, | |
| "learning_rate": 9.680273518441774e-05, | |
| "loss": 1.362, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 1.1782851219177246, | |
| "learning_rate": 9.678201409034397e-05, | |
| "loss": 1.378, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 1.1327308416366577, | |
| "learning_rate": 9.676129299627021e-05, | |
| "loss": 1.3836, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 1.0974417924880981, | |
| "learning_rate": 9.674057190219644e-05, | |
| "loss": 1.3589, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 1.1006550788879395, | |
| "learning_rate": 9.671985080812266e-05, | |
| "loss": 1.3734, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 1.0745372772216797, | |
| "learning_rate": 9.669912971404891e-05, | |
| "loss": 1.4078, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 1.2572031021118164, | |
| "learning_rate": 9.667840861997513e-05, | |
| "loss": 1.3535, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 1.065767526626587, | |
| "learning_rate": 9.665768752590137e-05, | |
| "loss": 1.3657, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 1.2773245573043823, | |
| "learning_rate": 9.66369664318276e-05, | |
| "loss": 1.3813, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 1.0642096996307373, | |
| "learning_rate": 9.661624533775384e-05, | |
| "loss": 1.3829, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 1.1348739862442017, | |
| "learning_rate": 9.659552424368008e-05, | |
| "loss": 1.3864, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 1.136107087135315, | |
| "learning_rate": 9.657480314960631e-05, | |
| "loss": 1.3523, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 1.1533474922180176, | |
| "learning_rate": 9.655408205553253e-05, | |
| "loss": 1.3669, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 1.1027289628982544, | |
| "learning_rate": 9.653336096145878e-05, | |
| "loss": 1.3256, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 0.9988449811935425, | |
| "learning_rate": 9.6512639867385e-05, | |
| "loss": 1.4024, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 1.2975176572799683, | |
| "learning_rate": 9.649191877331124e-05, | |
| "loss": 1.3751, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 1.2186543941497803, | |
| "learning_rate": 9.647119767923747e-05, | |
| "loss": 1.3444, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 1.1342490911483765, | |
| "learning_rate": 9.64504765851637e-05, | |
| "loss": 1.3449, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 1.148695707321167, | |
| "learning_rate": 9.642975549108993e-05, | |
| "loss": 1.3325, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 0.9545331001281738, | |
| "learning_rate": 9.640903439701616e-05, | |
| "loss": 1.3375, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 1.0941437482833862, | |
| "learning_rate": 9.63883133029424e-05, | |
| "loss": 1.3671, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 1.0803030729293823, | |
| "learning_rate": 9.636759220886863e-05, | |
| "loss": 1.3648, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 1.0937373638153076, | |
| "learning_rate": 9.634687111479487e-05, | |
| "loss": 1.3518, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 1.1884483098983765, | |
| "learning_rate": 9.632615002072109e-05, | |
| "loss": 1.3461, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 1.1179327964782715, | |
| "learning_rate": 9.630542892664734e-05, | |
| "loss": 1.3765, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 0.9843894839286804, | |
| "learning_rate": 9.628470783257356e-05, | |
| "loss": 1.3379, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 1.0279515981674194, | |
| "learning_rate": 9.62639867384998e-05, | |
| "loss": 1.3389, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 1.0797231197357178, | |
| "learning_rate": 9.624326564442603e-05, | |
| "loss": 1.346, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 1.1976298093795776, | |
| "learning_rate": 9.622254455035227e-05, | |
| "loss": 1.3366, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 1.057880163192749, | |
| "learning_rate": 9.620182345627849e-05, | |
| "loss": 1.3264, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 1.1059492826461792, | |
| "learning_rate": 9.618110236220474e-05, | |
| "loss": 1.3446, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 1.0970298051834106, | |
| "learning_rate": 9.616038126813096e-05, | |
| "loss": 1.3521, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 1.0951462984085083, | |
| "learning_rate": 9.61396601740572e-05, | |
| "loss": 1.3669, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 1.0926049947738647, | |
| "learning_rate": 9.611893907998343e-05, | |
| "loss": 1.354, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 1.0136979818344116, | |
| "learning_rate": 9.609821798590965e-05, | |
| "loss": 1.3321, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 1.129214882850647, | |
| "learning_rate": 9.60774968918359e-05, | |
| "loss": 1.382, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 1.1166954040527344, | |
| "learning_rate": 9.605677579776212e-05, | |
| "loss": 1.3337, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 1.204231858253479, | |
| "learning_rate": 9.603605470368836e-05, | |
| "loss": 1.3642, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 1.0265048742294312, | |
| "learning_rate": 9.601533360961459e-05, | |
| "loss": 1.3662, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 1.0513389110565186, | |
| "learning_rate": 9.599461251554083e-05, | |
| "loss": 1.3395, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 1.17727792263031, | |
| "learning_rate": 9.597389142146705e-05, | |
| "loss": 1.3738, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 1.0676214694976807, | |
| "learning_rate": 9.59531703273933e-05, | |
| "loss": 1.3383, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 0.9273681640625, | |
| "learning_rate": 9.593244923331952e-05, | |
| "loss": 1.367, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 1.0774747133255005, | |
| "learning_rate": 9.591172813924575e-05, | |
| "loss": 1.3369, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 1.131264090538025, | |
| "learning_rate": 9.589100704517199e-05, | |
| "loss": 1.3457, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 1.106242060661316, | |
| "learning_rate": 9.587028595109822e-05, | |
| "loss": 1.321, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 1.054598331451416, | |
| "learning_rate": 9.584956485702446e-05, | |
| "loss": 1.3424, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 1.0380080938339233, | |
| "learning_rate": 9.58288437629507e-05, | |
| "loss": 1.3425, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 1.1068315505981445, | |
| "learning_rate": 9.580812266887692e-05, | |
| "loss": 1.321, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 1.1228212118148804, | |
| "learning_rate": 9.578740157480316e-05, | |
| "loss": 1.3301, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 0.9643247127532959, | |
| "learning_rate": 9.576668048072939e-05, | |
| "loss": 1.3403, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 0.9587458372116089, | |
| "learning_rate": 9.574595938665562e-05, | |
| "loss": 1.3402, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 1.0192015171051025, | |
| "learning_rate": 9.572523829258186e-05, | |
| "loss": 1.3595, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 1.1033486127853394, | |
| "learning_rate": 9.570451719850808e-05, | |
| "loss": 1.3515, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 1.244828462600708, | |
| "learning_rate": 9.568379610443431e-05, | |
| "loss": 1.3148, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 1.031778335571289, | |
| "learning_rate": 9.566307501036055e-05, | |
| "loss": 1.3343, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 1.0581692457199097, | |
| "learning_rate": 9.564235391628678e-05, | |
| "loss": 1.3352, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 0.9989519119262695, | |
| "learning_rate": 9.562163282221302e-05, | |
| "loss": 1.3206, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 1.1149669885635376, | |
| "learning_rate": 9.560091172813925e-05, | |
| "loss": 1.3355, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 1.1359626054763794, | |
| "learning_rate": 9.558019063406548e-05, | |
| "loss": 1.3233, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 1.1091575622558594, | |
| "learning_rate": 9.555946953999172e-05, | |
| "loss": 1.3678, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 1.0405771732330322, | |
| "learning_rate": 9.553874844591795e-05, | |
| "loss": 1.3555, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 1.068385124206543, | |
| "learning_rate": 9.551802735184418e-05, | |
| "loss": 1.346, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 1.0115128755569458, | |
| "learning_rate": 9.549730625777042e-05, | |
| "loss": 1.3448, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 1.026138424873352, | |
| "learning_rate": 9.547658516369665e-05, | |
| "loss": 1.3286, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 1.375127911567688, | |
| "learning_rate": 9.545586406962289e-05, | |
| "loss": 1.3931, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 1.2297391891479492, | |
| "learning_rate": 9.543514297554912e-05, | |
| "loss": 1.3223, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 1.143249750137329, | |
| "learning_rate": 9.541442188147534e-05, | |
| "loss": 1.3142, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 1.1182348728179932, | |
| "learning_rate": 9.539370078740158e-05, | |
| "loss": 1.3414, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 1.0450687408447266, | |
| "learning_rate": 9.537297969332781e-05, | |
| "loss": 1.3119, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 1.105624794960022, | |
| "learning_rate": 9.535225859925403e-05, | |
| "loss": 1.3275, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 1.1117305755615234, | |
| "learning_rate": 9.533153750518028e-05, | |
| "loss": 1.3384, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 1.122660756111145, | |
| "learning_rate": 9.53108164111065e-05, | |
| "loss": 1.3509, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 0.994361937046051, | |
| "learning_rate": 9.529009531703274e-05, | |
| "loss": 1.3638, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 1.1339287757873535, | |
| "learning_rate": 9.526937422295898e-05, | |
| "loss": 1.3282, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 1.0273313522338867, | |
| "learning_rate": 9.524865312888521e-05, | |
| "loss": 1.3261, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 1.067122220993042, | |
| "learning_rate": 9.522793203481145e-05, | |
| "loss": 1.3502, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 0.9780186414718628, | |
| "learning_rate": 9.520721094073768e-05, | |
| "loss": 1.3209, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 1.0634074211120605, | |
| "learning_rate": 9.51864898466639e-05, | |
| "loss": 1.3508, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 1.0088226795196533, | |
| "learning_rate": 9.516576875259015e-05, | |
| "loss": 1.2848, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 1.156569242477417, | |
| "learning_rate": 9.514504765851637e-05, | |
| "loss": 1.3336, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 0.9981438517570496, | |
| "learning_rate": 9.512432656444261e-05, | |
| "loss": 1.3237, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 1.0465401411056519, | |
| "learning_rate": 9.510360547036884e-05, | |
| "loss": 1.3347, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 3.282174825668335, | |
| "learning_rate": 9.508288437629508e-05, | |
| "loss": 1.3234, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 1.0925480127334595, | |
| "learning_rate": 9.50621632822213e-05, | |
| "loss": 1.3604, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 1.0559757947921753, | |
| "learning_rate": 9.504144218814753e-05, | |
| "loss": 1.3411, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 1.0160987377166748, | |
| "learning_rate": 9.502072109407377e-05, | |
| "loss": 1.3299, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 1.0814076662063599, | |
| "learning_rate": 9.5e-05, | |
| "loss": 1.3053, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 1.1541906595230103, | |
| "learning_rate": 9.497927890592624e-05, | |
| "loss": 1.3368, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 1.0476430654525757, | |
| "learning_rate": 9.495855781185246e-05, | |
| "loss": 1.3266, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 1.0859614610671997, | |
| "learning_rate": 9.493783671777871e-05, | |
| "loss": 1.3077, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 1.047561526298523, | |
| "learning_rate": 9.491711562370493e-05, | |
| "loss": 1.301, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 1.1071749925613403, | |
| "learning_rate": 9.489639452963117e-05, | |
| "loss": 1.3069, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 1.0207133293151855, | |
| "learning_rate": 9.48756734355574e-05, | |
| "loss": 1.3128, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 1.1883114576339722, | |
| "learning_rate": 9.485495234148364e-05, | |
| "loss": 1.2987, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 1.1708128452301025, | |
| "learning_rate": 9.483423124740986e-05, | |
| "loss": 1.3386, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 1.0731940269470215, | |
| "learning_rate": 9.481351015333611e-05, | |
| "loss": 1.3165, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 1.02231764793396, | |
| "learning_rate": 9.479278905926233e-05, | |
| "loss": 1.3364, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 0.9825921654701233, | |
| "learning_rate": 9.477206796518856e-05, | |
| "loss": 1.3078, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 1.1280665397644043, | |
| "learning_rate": 9.47513468711148e-05, | |
| "loss": 1.3359, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 0.9910861253738403, | |
| "learning_rate": 9.473062577704103e-05, | |
| "loss": 1.3361, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 1.0153850317001343, | |
| "learning_rate": 9.470990468296727e-05, | |
| "loss": 1.3059, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 1.01111900806427, | |
| "learning_rate": 9.468918358889349e-05, | |
| "loss": 1.3226, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 1.0714573860168457, | |
| "learning_rate": 9.466846249481973e-05, | |
| "loss": 1.3195, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 1.0012733936309814, | |
| "learning_rate": 9.464774140074596e-05, | |
| "loss": 1.2948, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 0.9637882709503174, | |
| "learning_rate": 9.46270203066722e-05, | |
| "loss": 1.3208, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 1.0453296899795532, | |
| "learning_rate": 9.460629921259843e-05, | |
| "loss": 1.3095, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 1.0107698440551758, | |
| "learning_rate": 9.458557811852467e-05, | |
| "loss": 1.3164, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 1.1132638454437256, | |
| "learning_rate": 9.456485702445089e-05, | |
| "loss": 1.3162, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 1.0389189720153809, | |
| "learning_rate": 9.454413593037714e-05, | |
| "loss": 1.317, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 1.0654906034469604, | |
| "learning_rate": 9.452341483630336e-05, | |
| "loss": 1.305, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 1.2564867734909058, | |
| "learning_rate": 9.45026937422296e-05, | |
| "loss": 1.3301, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 1.0308964252471924, | |
| "learning_rate": 9.448197264815583e-05, | |
| "loss": 1.334, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 1.0542854070663452, | |
| "learning_rate": 9.446125155408206e-05, | |
| "loss": 1.3001, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 1.2161365747451782, | |
| "learning_rate": 9.444053046000829e-05, | |
| "loss": 1.2985, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 1.206581473350525, | |
| "learning_rate": 9.441980936593454e-05, | |
| "loss": 1.3177, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 1.1631922721862793, | |
| "learning_rate": 9.439908827186076e-05, | |
| "loss": 1.3269, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 0.9827607274055481, | |
| "learning_rate": 9.437836717778699e-05, | |
| "loss": 1.3228, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 1.0078628063201904, | |
| "learning_rate": 9.435764608371323e-05, | |
| "loss": 1.3047, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 1.1704260110855103, | |
| "learning_rate": 9.433692498963945e-05, | |
| "loss": 1.3074, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 1.075964093208313, | |
| "learning_rate": 9.43162038955657e-05, | |
| "loss": 1.3252, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 0.9463378190994263, | |
| "learning_rate": 9.429548280149192e-05, | |
| "loss": 1.3201, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 1.01523756980896, | |
| "learning_rate": 9.427476170741815e-05, | |
| "loss": 1.3127, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 0.9392449259757996, | |
| "learning_rate": 9.425404061334439e-05, | |
| "loss": 1.3254, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 1.01919424533844, | |
| "learning_rate": 9.423331951927062e-05, | |
| "loss": 1.3021, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 1.1243764162063599, | |
| "learning_rate": 9.421259842519685e-05, | |
| "loss": 1.3112, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 1.0084974765777588, | |
| "learning_rate": 9.41918773311231e-05, | |
| "loss": 1.3173, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 0.9945486783981323, | |
| "learning_rate": 9.417115623704932e-05, | |
| "loss": 1.3114, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 1.1148301362991333, | |
| "learning_rate": 9.415043514297555e-05, | |
| "loss": 1.3275, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 1.2701823711395264, | |
| "learning_rate": 9.412971404890179e-05, | |
| "loss": 1.3094, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 1.1923747062683105, | |
| "learning_rate": 9.410899295482802e-05, | |
| "loss": 1.2812, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 1.2106274366378784, | |
| "learning_rate": 9.408827186075426e-05, | |
| "loss": 1.3011, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 1.0127681493759155, | |
| "learning_rate": 9.406755076668049e-05, | |
| "loss": 1.3059, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 1.042222499847412, | |
| "learning_rate": 9.404682967260671e-05, | |
| "loss": 1.2961, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 0.9650092124938965, | |
| "learning_rate": 9.402610857853296e-05, | |
| "loss": 1.3264, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 1.0504155158996582, | |
| "learning_rate": 9.400538748445918e-05, | |
| "loss": 1.2853, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 1.0501419305801392, | |
| "learning_rate": 9.39846663903854e-05, | |
| "loss": 1.3179, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 1.056299090385437, | |
| "learning_rate": 9.396394529631165e-05, | |
| "loss": 1.2962, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 1.0278836488723755, | |
| "learning_rate": 9.394322420223788e-05, | |
| "loss": 1.2828, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 0.9813990592956543, | |
| "learning_rate": 9.392250310816411e-05, | |
| "loss": 1.2986, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 1.0665332078933716, | |
| "learning_rate": 9.390178201409035e-05, | |
| "loss": 1.2891, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 1.0281347036361694, | |
| "learning_rate": 9.388106092001658e-05, | |
| "loss": 1.299, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 1.0530226230621338, | |
| "learning_rate": 9.386033982594282e-05, | |
| "loss": 1.2887, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 1.0053261518478394, | |
| "learning_rate": 9.383961873186905e-05, | |
| "loss": 1.327, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 1.1362097263336182, | |
| "learning_rate": 9.381889763779527e-05, | |
| "loss": 1.3001, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 1.0610814094543457, | |
| "learning_rate": 9.379817654372152e-05, | |
| "loss": 1.2535, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 0.9906120300292969, | |
| "learning_rate": 9.377745544964774e-05, | |
| "loss": 1.291, | |
| "step": 3210 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 1.0676803588867188, | |
| "learning_rate": 9.375673435557398e-05, | |
| "loss": 1.3032, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 1.056851863861084, | |
| "learning_rate": 9.373601326150021e-05, | |
| "loss": 1.2879, | |
| "step": 3230 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 1.048841118812561, | |
| "learning_rate": 9.371529216742645e-05, | |
| "loss": 1.2798, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 1.047361969947815, | |
| "learning_rate": 9.369457107335268e-05, | |
| "loss": 1.3195, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 1.076904296875, | |
| "learning_rate": 9.367384997927892e-05, | |
| "loss": 1.3013, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 1.0863533020019531, | |
| "learning_rate": 9.365312888520514e-05, | |
| "loss": 1.2971, | |
| "step": 3270 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 1.0460786819458008, | |
| "learning_rate": 9.363240779113138e-05, | |
| "loss": 1.3023, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 0.906493604183197, | |
| "learning_rate": 9.361168669705761e-05, | |
| "loss": 1.3053, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 1.1181541681289673, | |
| "learning_rate": 9.359096560298383e-05, | |
| "loss": 1.3142, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 1.0198432207107544, | |
| "learning_rate": 9.357024450891008e-05, | |
| "loss": 1.293, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 1.0075292587280273, | |
| "learning_rate": 9.35495234148363e-05, | |
| "loss": 1.299, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 1.024592399597168, | |
| "learning_rate": 9.352880232076254e-05, | |
| "loss": 1.2983, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 0.9931455254554749, | |
| "learning_rate": 9.350808122668877e-05, | |
| "loss": 1.279, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 1.0673152208328247, | |
| "learning_rate": 9.348736013261501e-05, | |
| "loss": 1.2816, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 1.068587303161621, | |
| "learning_rate": 9.346663903854124e-05, | |
| "loss": 1.2934, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 0.9838789701461792, | |
| "learning_rate": 9.344591794446748e-05, | |
| "loss": 1.2917, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 1.0613404512405396, | |
| "learning_rate": 9.34251968503937e-05, | |
| "loss": 1.2879, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 1.0173070430755615, | |
| "learning_rate": 9.340447575631995e-05, | |
| "loss": 1.2966, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 1.1227622032165527, | |
| "learning_rate": 9.338375466224617e-05, | |
| "loss": 1.2554, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 1.007338523864746, | |
| "learning_rate": 9.33630335681724e-05, | |
| "loss": 1.3115, | |
| "step": 3410 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 1.0479813814163208, | |
| "learning_rate": 9.334231247409864e-05, | |
| "loss": 1.3048, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 1.0560479164123535, | |
| "learning_rate": 9.332159138002486e-05, | |
| "loss": 1.2919, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 1.1081204414367676, | |
| "learning_rate": 9.33008702859511e-05, | |
| "loss": 1.2967, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 1.0260145664215088, | |
| "learning_rate": 9.328014919187733e-05, | |
| "loss": 1.3195, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 1.057966947555542, | |
| "learning_rate": 9.325942809780357e-05, | |
| "loss": 1.2896, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 1.0711556673049927, | |
| "learning_rate": 9.32387070037298e-05, | |
| "loss": 1.2817, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 1.0118924379348755, | |
| "learning_rate": 9.321798590965604e-05, | |
| "loss": 1.3052, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 1.0227614641189575, | |
| "learning_rate": 9.319726481558226e-05, | |
| "loss": 1.3186, | |
| "step": 3490 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 1.0655134916305542, | |
| "learning_rate": 9.317654372150851e-05, | |
| "loss": 1.3087, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 1.1255359649658203, | |
| "learning_rate": 9.315582262743473e-05, | |
| "loss": 1.2749, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 1.0832923650741577, | |
| "learning_rate": 9.313510153336096e-05, | |
| "loss": 1.2892, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 1.067236304283142, | |
| "learning_rate": 9.31143804392872e-05, | |
| "loss": 1.284, | |
| "step": 3530 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 1.1556322574615479, | |
| "learning_rate": 9.309365934521344e-05, | |
| "loss": 1.2604, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 1.151723861694336, | |
| "learning_rate": 9.307293825113966e-05, | |
| "loss": 1.3045, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 1.0258938074111938, | |
| "learning_rate": 9.30522171570659e-05, | |
| "loss": 1.2859, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 1.0165237188339233, | |
| "learning_rate": 9.303149606299213e-05, | |
| "loss": 1.3212, | |
| "step": 3570 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 0.9969586133956909, | |
| "learning_rate": 9.301077496891836e-05, | |
| "loss": 1.3038, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 1.1335457563400269, | |
| "learning_rate": 9.29900538748446e-05, | |
| "loss": 1.2747, | |
| "step": 3590 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 1.0744903087615967, | |
| "learning_rate": 9.296933278077082e-05, | |
| "loss": 1.3078, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 1.2294646501541138, | |
| "learning_rate": 9.294861168669707e-05, | |
| "loss": 1.2631, | |
| "step": 3610 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 1.0542582273483276, | |
| "learning_rate": 9.292789059262329e-05, | |
| "loss": 1.2778, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 1.0787122249603271, | |
| "learning_rate": 9.290716949854952e-05, | |
| "loss": 1.2952, | |
| "step": 3630 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 1.182387351989746, | |
| "learning_rate": 9.288644840447576e-05, | |
| "loss": 1.2955, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 1.0466411113739014, | |
| "learning_rate": 9.2865727310402e-05, | |
| "loss": 1.3085, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 1.0271363258361816, | |
| "learning_rate": 9.284500621632823e-05, | |
| "loss": 1.2881, | |
| "step": 3660 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 1.1320871114730835, | |
| "learning_rate": 9.282428512225446e-05, | |
| "loss": 1.2671, | |
| "step": 3670 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 1.1176432371139526, | |
| "learning_rate": 9.280356402818069e-05, | |
| "loss": 1.299, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 0.9895033240318298, | |
| "learning_rate": 9.278284293410694e-05, | |
| "loss": 1.2984, | |
| "step": 3690 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 1.191007137298584, | |
| "learning_rate": 9.276212184003316e-05, | |
| "loss": 1.2808, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 1.0878729820251465, | |
| "learning_rate": 9.274140074595939e-05, | |
| "loss": 1.2864, | |
| "step": 3710 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 1.1144053936004639, | |
| "learning_rate": 9.272067965188563e-05, | |
| "loss": 1.3175, | |
| "step": 3720 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 1.128405213356018, | |
| "learning_rate": 9.269995855781186e-05, | |
| "loss": 1.3147, | |
| "step": 3730 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 1.0539438724517822, | |
| "learning_rate": 9.267923746373808e-05, | |
| "loss": 1.2927, | |
| "step": 3740 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 1.0515836477279663, | |
| "learning_rate": 9.265851636966433e-05, | |
| "loss": 1.2743, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 1.1526955366134644, | |
| "learning_rate": 9.263779527559055e-05, | |
| "loss": 1.2911, | |
| "step": 3760 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 1.010903000831604, | |
| "learning_rate": 9.261707418151679e-05, | |
| "loss": 1.2735, | |
| "step": 3770 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 1.1373246908187866, | |
| "learning_rate": 9.259635308744302e-05, | |
| "loss": 1.2952, | |
| "step": 3780 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 0.9458179473876953, | |
| "learning_rate": 9.257563199336925e-05, | |
| "loss": 1.2936, | |
| "step": 3790 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 0.949252724647522, | |
| "learning_rate": 9.25549108992955e-05, | |
| "loss": 1.287, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 1.1074215173721313, | |
| "learning_rate": 9.253418980522172e-05, | |
| "loss": 1.2937, | |
| "step": 3810 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 0.995959460735321, | |
| "learning_rate": 9.251346871114795e-05, | |
| "loss": 1.265, | |
| "step": 3820 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 1.0461138486862183, | |
| "learning_rate": 9.249274761707419e-05, | |
| "loss": 1.2822, | |
| "step": 3830 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 1.0449700355529785, | |
| "learning_rate": 9.247202652300042e-05, | |
| "loss": 1.2896, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 1.0590384006500244, | |
| "learning_rate": 9.245130542892664e-05, | |
| "loss": 1.2923, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 1.178272008895874, | |
| "learning_rate": 9.243058433485289e-05, | |
| "loss": 1.2797, | |
| "step": 3860 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 1.0651668310165405, | |
| "learning_rate": 9.240986324077911e-05, | |
| "loss": 1.2632, | |
| "step": 3870 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 1.0944633483886719, | |
| "learning_rate": 9.238914214670535e-05, | |
| "loss": 1.2853, | |
| "step": 3880 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 1.042576551437378, | |
| "learning_rate": 9.236842105263158e-05, | |
| "loss": 1.2884, | |
| "step": 3890 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 1.1282780170440674, | |
| "learning_rate": 9.234769995855782e-05, | |
| "loss": 1.2937, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 0.9996076822280884, | |
| "learning_rate": 9.232697886448405e-05, | |
| "loss": 1.2657, | |
| "step": 3910 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 0.9630957245826721, | |
| "learning_rate": 9.230625777041029e-05, | |
| "loss": 1.2679, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 1.0428036451339722, | |
| "learning_rate": 9.228553667633651e-05, | |
| "loss": 1.2921, | |
| "step": 3930 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 1.1940115690231323, | |
| "learning_rate": 9.226481558226275e-05, | |
| "loss": 1.2759, | |
| "step": 3940 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 1.1081199645996094, | |
| "learning_rate": 9.224409448818898e-05, | |
| "loss": 1.2636, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 0.989032506942749, | |
| "learning_rate": 9.22233733941152e-05, | |
| "loss": 1.2489, | |
| "step": 3960 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 1.0575727224349976, | |
| "learning_rate": 9.220265230004145e-05, | |
| "loss": 1.2777, | |
| "step": 3970 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 1.0007938146591187, | |
| "learning_rate": 9.218193120596767e-05, | |
| "loss": 1.286, | |
| "step": 3980 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 1.0560977458953857, | |
| "learning_rate": 9.216121011189391e-05, | |
| "loss": 1.2939, | |
| "step": 3990 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 1.308013916015625, | |
| "learning_rate": 9.214048901782014e-05, | |
| "loss": 1.2728, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 1.2011935710906982, | |
| "learning_rate": 9.211976792374638e-05, | |
| "loss": 1.2853, | |
| "step": 4010 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 1.0514850616455078, | |
| "learning_rate": 9.209904682967261e-05, | |
| "loss": 1.3102, | |
| "step": 4020 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 1.0865683555603027, | |
| "learning_rate": 9.207832573559885e-05, | |
| "loss": 1.2835, | |
| "step": 4030 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 1.2012451887130737, | |
| "learning_rate": 9.205760464152507e-05, | |
| "loss": 1.2801, | |
| "step": 4040 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 1.0710102319717407, | |
| "learning_rate": 9.203688354745132e-05, | |
| "loss": 1.2745, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 1.0107301473617554, | |
| "learning_rate": 9.201616245337754e-05, | |
| "loss": 1.2928, | |
| "step": 4060 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 1.0026335716247559, | |
| "learning_rate": 9.199544135930378e-05, | |
| "loss": 1.2916, | |
| "step": 4070 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 0.9443692564964294, | |
| "learning_rate": 9.197472026523001e-05, | |
| "loss": 1.2956, | |
| "step": 4080 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 0.9472268223762512, | |
| "learning_rate": 9.195399917115625e-05, | |
| "loss": 1.2875, | |
| "step": 4090 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 1.0817506313323975, | |
| "learning_rate": 9.193327807708247e-05, | |
| "loss": 1.2779, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 1.0539813041687012, | |
| "learning_rate": 9.19125569830087e-05, | |
| "loss": 1.2661, | |
| "step": 4110 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 0.9975004196166992, | |
| "learning_rate": 9.189183588893494e-05, | |
| "loss": 1.2499, | |
| "step": 4120 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 1.0313421487808228, | |
| "learning_rate": 9.187111479486117e-05, | |
| "loss": 1.2907, | |
| "step": 4130 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 1.0273147821426392, | |
| "learning_rate": 9.185039370078741e-05, | |
| "loss": 1.2929, | |
| "step": 4140 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 0.9810879230499268, | |
| "learning_rate": 9.182967260671363e-05, | |
| "loss": 1.2974, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 1.0243279933929443, | |
| "learning_rate": 9.180895151263988e-05, | |
| "loss": 1.2406, | |
| "step": 4160 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 1.0115349292755127, | |
| "learning_rate": 9.17882304185661e-05, | |
| "loss": 1.2765, | |
| "step": 4170 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 1.1206727027893066, | |
| "learning_rate": 9.176750932449234e-05, | |
| "loss": 1.2956, | |
| "step": 4180 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 0.99837327003479, | |
| "learning_rate": 9.174678823041857e-05, | |
| "loss": 1.2614, | |
| "step": 4190 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 1.0117827653884888, | |
| "learning_rate": 9.17260671363448e-05, | |
| "loss": 1.2611, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 1.1119413375854492, | |
| "learning_rate": 9.170534604227104e-05, | |
| "loss": 1.3006, | |
| "step": 4210 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 1.0567435026168823, | |
| "learning_rate": 9.168462494819728e-05, | |
| "loss": 1.2544, | |
| "step": 4220 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 1.1326267719268799, | |
| "learning_rate": 9.16639038541235e-05, | |
| "loss": 1.2895, | |
| "step": 4230 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 1.0858250856399536, | |
| "learning_rate": 9.164318276004975e-05, | |
| "loss": 1.2672, | |
| "step": 4240 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 1.1111207008361816, | |
| "learning_rate": 9.162246166597597e-05, | |
| "loss": 1.255, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 1.0791577100753784, | |
| "learning_rate": 9.16017405719022e-05, | |
| "loss": 1.2865, | |
| "step": 4260 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 1.1741734743118286, | |
| "learning_rate": 9.158101947782844e-05, | |
| "loss": 1.2898, | |
| "step": 4270 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 0.9900088310241699, | |
| "learning_rate": 9.156029838375466e-05, | |
| "loss": 1.26, | |
| "step": 4280 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 1.2227307558059692, | |
| "learning_rate": 9.15395772896809e-05, | |
| "loss": 1.2818, | |
| "step": 4290 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 1.0696605443954468, | |
| "learning_rate": 9.151885619560713e-05, | |
| "loss": 1.2739, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 1.145063042640686, | |
| "learning_rate": 9.149813510153336e-05, | |
| "loss": 1.2886, | |
| "step": 4310 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 1.0415133237838745, | |
| "learning_rate": 9.14774140074596e-05, | |
| "loss": 1.2999, | |
| "step": 4320 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 0.9151254296302795, | |
| "learning_rate": 9.145669291338584e-05, | |
| "loss": 1.2883, | |
| "step": 4330 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 1.1220918893814087, | |
| "learning_rate": 9.143597181931206e-05, | |
| "loss": 1.2751, | |
| "step": 4340 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 1.0417348146438599, | |
| "learning_rate": 9.14152507252383e-05, | |
| "loss": 1.2675, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 1.1090322732925415, | |
| "learning_rate": 9.139452963116453e-05, | |
| "loss": 1.2521, | |
| "step": 4360 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 1.003110408782959, | |
| "learning_rate": 9.137380853709076e-05, | |
| "loss": 1.2461, | |
| "step": 4370 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 1.0214240550994873, | |
| "learning_rate": 9.1353087443017e-05, | |
| "loss": 1.3003, | |
| "step": 4380 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 1.0389635562896729, | |
| "learning_rate": 9.133236634894323e-05, | |
| "loss": 1.2642, | |
| "step": 4390 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 1.1644842624664307, | |
| "learning_rate": 9.131164525486945e-05, | |
| "loss": 1.247, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 1.0494420528411865, | |
| "learning_rate": 9.12909241607957e-05, | |
| "loss": 1.2727, | |
| "step": 4410 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 1.1759871244430542, | |
| "learning_rate": 9.127020306672192e-05, | |
| "loss": 1.2569, | |
| "step": 4420 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 1.0006252527236938, | |
| "learning_rate": 9.124948197264816e-05, | |
| "loss": 1.2761, | |
| "step": 4430 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 1.0942422151565552, | |
| "learning_rate": 9.12287608785744e-05, | |
| "loss": 1.2778, | |
| "step": 4440 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 1.1273877620697021, | |
| "learning_rate": 9.120803978450062e-05, | |
| "loss": 1.2738, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 1.0103455781936646, | |
| "learning_rate": 9.118731869042686e-05, | |
| "loss": 1.2559, | |
| "step": 4460 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 1.0412319898605347, | |
| "learning_rate": 9.116659759635309e-05, | |
| "loss": 1.2839, | |
| "step": 4470 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 1.1623831987380981, | |
| "learning_rate": 9.114587650227932e-05, | |
| "loss": 1.2684, | |
| "step": 4480 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 1.213977336883545, | |
| "learning_rate": 9.112515540820556e-05, | |
| "loss": 1.2587, | |
| "step": 4490 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 1.1630234718322754, | |
| "learning_rate": 9.110443431413179e-05, | |
| "loss": 1.2557, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 1.0047425031661987, | |
| "learning_rate": 9.108371322005801e-05, | |
| "loss": 1.2785, | |
| "step": 4510 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 1.0434467792510986, | |
| "learning_rate": 9.106299212598426e-05, | |
| "loss": 1.2916, | |
| "step": 4520 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 1.0278736352920532, | |
| "learning_rate": 9.104227103191048e-05, | |
| "loss": 1.2494, | |
| "step": 4530 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 0.9865803122520447, | |
| "learning_rate": 9.102154993783672e-05, | |
| "loss": 1.2487, | |
| "step": 4540 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 1.0419522523880005, | |
| "learning_rate": 9.100082884376295e-05, | |
| "loss": 1.2618, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 1.1537505388259888, | |
| "learning_rate": 9.098010774968919e-05, | |
| "loss": 1.2613, | |
| "step": 4560 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 1.0565922260284424, | |
| "learning_rate": 9.095938665561542e-05, | |
| "loss": 1.2592, | |
| "step": 4570 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 1.106313705444336, | |
| "learning_rate": 9.093866556154166e-05, | |
| "loss": 1.2585, | |
| "step": 4580 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 1.0392132997512817, | |
| "learning_rate": 9.091794446746788e-05, | |
| "loss": 1.2626, | |
| "step": 4590 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 1.1017506122589111, | |
| "learning_rate": 9.089722337339413e-05, | |
| "loss": 1.2696, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 1.134838581085205, | |
| "learning_rate": 9.087650227932035e-05, | |
| "loss": 1.2626, | |
| "step": 4610 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 1.3448967933654785, | |
| "learning_rate": 9.085578118524659e-05, | |
| "loss": 1.2728, | |
| "step": 4620 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 0.9399920701980591, | |
| "learning_rate": 9.083506009117282e-05, | |
| "loss": 1.253, | |
| "step": 4630 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 1.0530354976654053, | |
| "learning_rate": 9.081433899709904e-05, | |
| "loss": 1.2501, | |
| "step": 4640 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 1.0609047412872314, | |
| "learning_rate": 9.079361790302529e-05, | |
| "loss": 1.2599, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 1.120044469833374, | |
| "learning_rate": 9.077289680895151e-05, | |
| "loss": 1.2511, | |
| "step": 4660 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 1.068034291267395, | |
| "learning_rate": 9.075217571487775e-05, | |
| "loss": 1.2623, | |
| "step": 4670 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "grad_norm": 0.9914749264717102, | |
| "learning_rate": 9.073145462080398e-05, | |
| "loss": 1.2754, | |
| "step": 4680 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "grad_norm": 1.084227204322815, | |
| "learning_rate": 9.071073352673022e-05, | |
| "loss": 1.2861, | |
| "step": 4690 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "grad_norm": 0.9811500906944275, | |
| "learning_rate": 9.069001243265644e-05, | |
| "loss": 1.2361, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "grad_norm": 1.0811760425567627, | |
| "learning_rate": 9.066929133858269e-05, | |
| "loss": 1.247, | |
| "step": 4710 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "grad_norm": 1.0787228345870972, | |
| "learning_rate": 9.064857024450891e-05, | |
| "loss": 1.2609, | |
| "step": 4720 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "grad_norm": 1.0164875984191895, | |
| "learning_rate": 9.062784915043515e-05, | |
| "loss": 1.289, | |
| "step": 4730 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "grad_norm": 1.1449209451675415, | |
| "learning_rate": 9.060712805636138e-05, | |
| "loss": 1.269, | |
| "step": 4740 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "grad_norm": 1.2805284261703491, | |
| "learning_rate": 9.058640696228762e-05, | |
| "loss": 1.2803, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "grad_norm": 1.03864586353302, | |
| "learning_rate": 9.056568586821385e-05, | |
| "loss": 1.2635, | |
| "step": 4760 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "grad_norm": 0.9715671539306641, | |
| "learning_rate": 9.054496477414009e-05, | |
| "loss": 1.2232, | |
| "step": 4770 | |
| }, | |
| { | |
| "epoch": 0.99, | |
| "grad_norm": 1.1221860647201538, | |
| "learning_rate": 9.052424368006631e-05, | |
| "loss": 1.2536, | |
| "step": 4780 | |
| }, | |
| { | |
| "epoch": 0.99, | |
| "grad_norm": 0.9467464089393616, | |
| "learning_rate": 9.050352258599254e-05, | |
| "loss": 1.2529, | |
| "step": 4790 | |
| }, | |
| { | |
| "epoch": 0.99, | |
| "grad_norm": 1.1306428909301758, | |
| "learning_rate": 9.048280149191878e-05, | |
| "loss": 1.2621, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.99, | |
| "grad_norm": 1.038275122642517, | |
| "learning_rate": 9.0462080397845e-05, | |
| "loss": 1.2785, | |
| "step": 4810 | |
| }, | |
| { | |
| "epoch": 0.99, | |
| "grad_norm": 1.1947646141052246, | |
| "learning_rate": 9.044135930377125e-05, | |
| "loss": 1.2643, | |
| "step": 4820 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 1.0585174560546875, | |
| "learning_rate": 9.042063820969747e-05, | |
| "loss": 1.248, | |
| "step": 4830 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 1.1931108236312866, | |
| "learning_rate": 9.03999171156237e-05, | |
| "loss": 1.2884, | |
| "step": 4840 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_loss": 1.280229926109314, | |
| "eval_runtime": 1606.712, | |
| "eval_samples_per_second": 262.53, | |
| "eval_steps_per_second": 4.102, | |
| "step": 4846 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 1.0185681581497192, | |
| "learning_rate": 9.037919602154994e-05, | |
| "loss": 1.2701, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 1.2571942806243896, | |
| "learning_rate": 9.035847492747618e-05, | |
| "loss": 1.2309, | |
| "step": 4860 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 1.1584192514419556, | |
| "learning_rate": 9.033775383340241e-05, | |
| "loss": 1.2859, | |
| "step": 4870 | |
| }, | |
| { | |
| "epoch": 1.01, | |
| "grad_norm": 1.05573570728302, | |
| "learning_rate": 9.031703273932865e-05, | |
| "loss": 1.2502, | |
| "step": 4880 | |
| }, | |
| { | |
| "epoch": 1.01, | |
| "grad_norm": 1.0500950813293457, | |
| "learning_rate": 9.029631164525487e-05, | |
| "loss": 1.2288, | |
| "step": 4890 | |
| }, | |
| { | |
| "epoch": 1.01, | |
| "grad_norm": 1.0603646039962769, | |
| "learning_rate": 9.027559055118112e-05, | |
| "loss": 1.2609, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 1.01, | |
| "grad_norm": 1.085644245147705, | |
| "learning_rate": 9.025486945710734e-05, | |
| "loss": 1.2392, | |
| "step": 4910 | |
| }, | |
| { | |
| "epoch": 1.02, | |
| "grad_norm": 1.0088363885879517, | |
| "learning_rate": 9.023414836303357e-05, | |
| "loss": 1.2361, | |
| "step": 4920 | |
| }, | |
| { | |
| "epoch": 1.02, | |
| "grad_norm": 1.0100585222244263, | |
| "learning_rate": 9.021342726895981e-05, | |
| "loss": 1.2382, | |
| "step": 4930 | |
| }, | |
| { | |
| "epoch": 1.02, | |
| "grad_norm": 1.067159652709961, | |
| "learning_rate": 9.019270617488604e-05, | |
| "loss": 1.2433, | |
| "step": 4940 | |
| }, | |
| { | |
| "epoch": 1.02, | |
| "grad_norm": 1.0431448221206665, | |
| "learning_rate": 9.017198508081226e-05, | |
| "loss": 1.2461, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 1.02, | |
| "grad_norm": 1.3945410251617432, | |
| "learning_rate": 9.01512639867385e-05, | |
| "loss": 1.2406, | |
| "step": 4960 | |
| }, | |
| { | |
| "epoch": 1.03, | |
| "grad_norm": 1.0631392002105713, | |
| "learning_rate": 9.013054289266474e-05, | |
| "loss": 1.2265, | |
| "step": 4970 | |
| }, | |
| { | |
| "epoch": 1.03, | |
| "grad_norm": 1.0823907852172852, | |
| "learning_rate": 9.010982179859097e-05, | |
| "loss": 1.2388, | |
| "step": 4980 | |
| }, | |
| { | |
| "epoch": 1.03, | |
| "grad_norm": 1.1125047206878662, | |
| "learning_rate": 9.00891007045172e-05, | |
| "loss": 1.2553, | |
| "step": 4990 | |
| }, | |
| { | |
| "epoch": 1.03, | |
| "grad_norm": 1.0436159372329712, | |
| "learning_rate": 9.006837961044343e-05, | |
| "loss": 1.2362, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.03, | |
| "grad_norm": 1.2972134351730347, | |
| "learning_rate": 9.004765851636968e-05, | |
| "loss": 1.2387, | |
| "step": 5010 | |
| }, | |
| { | |
| "epoch": 1.04, | |
| "grad_norm": 1.0587375164031982, | |
| "learning_rate": 9.00269374222959e-05, | |
| "loss": 1.2348, | |
| "step": 5020 | |
| }, | |
| { | |
| "epoch": 1.04, | |
| "grad_norm": 1.0204670429229736, | |
| "learning_rate": 9.000621632822213e-05, | |
| "loss": 1.2391, | |
| "step": 5030 | |
| }, | |
| { | |
| "epoch": 1.04, | |
| "grad_norm": 1.0182541608810425, | |
| "learning_rate": 8.998549523414837e-05, | |
| "loss": 1.2337, | |
| "step": 5040 | |
| }, | |
| { | |
| "epoch": 1.04, | |
| "grad_norm": 0.9534347057342529, | |
| "learning_rate": 8.99647741400746e-05, | |
| "loss": 1.2403, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 1.04, | |
| "grad_norm": 1.1534489393234253, | |
| "learning_rate": 8.994405304600084e-05, | |
| "loss": 1.251, | |
| "step": 5060 | |
| }, | |
| { | |
| "epoch": 1.05, | |
| "grad_norm": 1.0630913972854614, | |
| "learning_rate": 8.992333195192707e-05, | |
| "loss": 1.2741, | |
| "step": 5070 | |
| }, | |
| { | |
| "epoch": 1.05, | |
| "grad_norm": 1.0464857816696167, | |
| "learning_rate": 8.99026108578533e-05, | |
| "loss": 1.27, | |
| "step": 5080 | |
| }, | |
| { | |
| "epoch": 1.05, | |
| "grad_norm": 1.1541072130203247, | |
| "learning_rate": 8.988188976377954e-05, | |
| "loss": 1.2183, | |
| "step": 5090 | |
| }, | |
| { | |
| "epoch": 1.05, | |
| "grad_norm": 1.007450819015503, | |
| "learning_rate": 8.986116866970576e-05, | |
| "loss": 1.2345, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 1.05, | |
| "grad_norm": 0.984767496585846, | |
| "learning_rate": 8.9840447575632e-05, | |
| "loss": 1.2661, | |
| "step": 5110 | |
| }, | |
| { | |
| "epoch": 1.06, | |
| "grad_norm": 1.2972489595413208, | |
| "learning_rate": 8.981972648155824e-05, | |
| "loss": 1.2577, | |
| "step": 5120 | |
| }, | |
| { | |
| "epoch": 1.06, | |
| "grad_norm": 1.1882805824279785, | |
| "learning_rate": 8.979900538748446e-05, | |
| "loss": 1.2685, | |
| "step": 5130 | |
| }, | |
| { | |
| "epoch": 1.06, | |
| "grad_norm": 1.1580913066864014, | |
| "learning_rate": 8.977828429341069e-05, | |
| "loss": 1.2269, | |
| "step": 5140 | |
| }, | |
| { | |
| "epoch": 1.06, | |
| "grad_norm": 1.0898735523223877, | |
| "learning_rate": 8.975756319933693e-05, | |
| "loss": 1.2315, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 1.06, | |
| "grad_norm": 1.1261813640594482, | |
| "learning_rate": 8.973684210526316e-05, | |
| "loss": 1.2617, | |
| "step": 5160 | |
| }, | |
| { | |
| "epoch": 1.07, | |
| "grad_norm": 1.0266163349151611, | |
| "learning_rate": 8.97161210111894e-05, | |
| "loss": 1.2379, | |
| "step": 5170 | |
| }, | |
| { | |
| "epoch": 1.07, | |
| "grad_norm": 1.0973056554794312, | |
| "learning_rate": 8.969539991711563e-05, | |
| "loss": 1.2527, | |
| "step": 5180 | |
| }, | |
| { | |
| "epoch": 1.07, | |
| "grad_norm": 1.0927435159683228, | |
| "learning_rate": 8.967467882304185e-05, | |
| "loss": 1.2433, | |
| "step": 5190 | |
| }, | |
| { | |
| "epoch": 1.07, | |
| "grad_norm": 1.1209070682525635, | |
| "learning_rate": 8.96539577289681e-05, | |
| "loss": 1.2512, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 1.08, | |
| "grad_norm": 1.222163200378418, | |
| "learning_rate": 8.963323663489432e-05, | |
| "loss": 1.2377, | |
| "step": 5210 | |
| }, | |
| { | |
| "epoch": 1.08, | |
| "grad_norm": 1.1234538555145264, | |
| "learning_rate": 8.961251554082056e-05, | |
| "loss": 1.2409, | |
| "step": 5220 | |
| }, | |
| { | |
| "epoch": 1.08, | |
| "grad_norm": 1.1121318340301514, | |
| "learning_rate": 8.95917944467468e-05, | |
| "loss": 1.2402, | |
| "step": 5230 | |
| }, | |
| { | |
| "epoch": 1.08, | |
| "grad_norm": 1.0124129056930542, | |
| "learning_rate": 8.957107335267303e-05, | |
| "loss": 1.2609, | |
| "step": 5240 | |
| }, | |
| { | |
| "epoch": 1.08, | |
| "grad_norm": 1.0647163391113281, | |
| "learning_rate": 8.955035225859925e-05, | |
| "loss": 1.2144, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 1.09, | |
| "grad_norm": 1.0653977394104004, | |
| "learning_rate": 8.95296311645255e-05, | |
| "loss": 1.2337, | |
| "step": 5260 | |
| }, | |
| { | |
| "epoch": 1.09, | |
| "grad_norm": 1.0555377006530762, | |
| "learning_rate": 8.950891007045172e-05, | |
| "loss": 1.2121, | |
| "step": 5270 | |
| }, | |
| { | |
| "epoch": 1.09, | |
| "grad_norm": 1.0859911441802979, | |
| "learning_rate": 8.948818897637796e-05, | |
| "loss": 1.232, | |
| "step": 5280 | |
| }, | |
| { | |
| "epoch": 1.09, | |
| "grad_norm": 1.0167226791381836, | |
| "learning_rate": 8.946746788230419e-05, | |
| "loss": 1.2252, | |
| "step": 5290 | |
| }, | |
| { | |
| "epoch": 1.09, | |
| "grad_norm": 1.0780484676361084, | |
| "learning_rate": 8.944674678823041e-05, | |
| "loss": 1.2537, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 1.1, | |
| "grad_norm": 1.0674623250961304, | |
| "learning_rate": 8.942602569415666e-05, | |
| "loss": 1.2511, | |
| "step": 5310 | |
| }, | |
| { | |
| "epoch": 1.1, | |
| "grad_norm": 1.1867153644561768, | |
| "learning_rate": 8.940530460008288e-05, | |
| "loss": 1.2406, | |
| "step": 5320 | |
| }, | |
| { | |
| "epoch": 1.1, | |
| "grad_norm": 1.0617753267288208, | |
| "learning_rate": 8.938458350600912e-05, | |
| "loss": 1.2648, | |
| "step": 5330 | |
| }, | |
| { | |
| "epoch": 1.1, | |
| "grad_norm": 1.1268622875213623, | |
| "learning_rate": 8.936386241193535e-05, | |
| "loss": 1.2461, | |
| "step": 5340 | |
| }, | |
| { | |
| "epoch": 1.1, | |
| "grad_norm": 1.06646728515625, | |
| "learning_rate": 8.934314131786159e-05, | |
| "loss": 1.235, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 1.11, | |
| "grad_norm": 1.0061641931533813, | |
| "learning_rate": 8.932242022378781e-05, | |
| "loss": 1.2256, | |
| "step": 5360 | |
| }, | |
| { | |
| "epoch": 1.11, | |
| "grad_norm": 1.1114619970321655, | |
| "learning_rate": 8.930169912971406e-05, | |
| "loss": 1.2481, | |
| "step": 5370 | |
| }, | |
| { | |
| "epoch": 1.11, | |
| "grad_norm": 1.121626615524292, | |
| "learning_rate": 8.928097803564028e-05, | |
| "loss": 1.2592, | |
| "step": 5380 | |
| }, | |
| { | |
| "epoch": 1.11, | |
| "grad_norm": 1.0986332893371582, | |
| "learning_rate": 8.926025694156652e-05, | |
| "loss": 1.2397, | |
| "step": 5390 | |
| }, | |
| { | |
| "epoch": 1.11, | |
| "grad_norm": 1.055759072303772, | |
| "learning_rate": 8.923953584749275e-05, | |
| "loss": 1.2468, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "grad_norm": 1.035019874572754, | |
| "learning_rate": 8.921881475341899e-05, | |
| "loss": 1.2455, | |
| "step": 5410 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "grad_norm": 1.0355908870697021, | |
| "learning_rate": 8.919809365934522e-05, | |
| "loss": 1.2584, | |
| "step": 5420 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "grad_norm": 1.0616044998168945, | |
| "learning_rate": 8.917737256527146e-05, | |
| "loss": 1.2472, | |
| "step": 5430 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "grad_norm": 1.0461037158966064, | |
| "learning_rate": 8.915665147119768e-05, | |
| "loss": 1.2531, | |
| "step": 5440 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "grad_norm": 1.2218214273452759, | |
| "learning_rate": 8.913593037712393e-05, | |
| "loss": 1.2486, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 1.13, | |
| "grad_norm": 1.0007187128067017, | |
| "learning_rate": 8.911520928305015e-05, | |
| "loss": 1.2464, | |
| "step": 5460 | |
| }, | |
| { | |
| "epoch": 1.13, | |
| "grad_norm": 1.056276798248291, | |
| "learning_rate": 8.909448818897638e-05, | |
| "loss": 1.2375, | |
| "step": 5470 | |
| }, | |
| { | |
| "epoch": 1.13, | |
| "grad_norm": 1.0595409870147705, | |
| "learning_rate": 8.907376709490262e-05, | |
| "loss": 1.2645, | |
| "step": 5480 | |
| }, | |
| { | |
| "epoch": 1.13, | |
| "grad_norm": 1.1598786115646362, | |
| "learning_rate": 8.905304600082884e-05, | |
| "loss": 1.2502, | |
| "step": 5490 | |
| }, | |
| { | |
| "epoch": 1.13, | |
| "grad_norm": 1.0664838552474976, | |
| "learning_rate": 8.903232490675508e-05, | |
| "loss": 1.2712, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 1.14, | |
| "grad_norm": 1.0016567707061768, | |
| "learning_rate": 8.901160381268131e-05, | |
| "loss": 1.2386, | |
| "step": 5510 | |
| }, | |
| { | |
| "epoch": 1.14, | |
| "grad_norm": 1.0359711647033691, | |
| "learning_rate": 8.899088271860755e-05, | |
| "loss": 1.238, | |
| "step": 5520 | |
| }, | |
| { | |
| "epoch": 1.14, | |
| "grad_norm": 1.1633154153823853, | |
| "learning_rate": 8.897016162453378e-05, | |
| "loss": 1.2139, | |
| "step": 5530 | |
| }, | |
| { | |
| "epoch": 1.14, | |
| "grad_norm": 1.085390567779541, | |
| "learning_rate": 8.894944053046002e-05, | |
| "loss": 1.2564, | |
| "step": 5540 | |
| }, | |
| { | |
| "epoch": 1.15, | |
| "grad_norm": 1.041222333908081, | |
| "learning_rate": 8.892871943638624e-05, | |
| "loss": 1.2361, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 1.15, | |
| "grad_norm": 1.0352637767791748, | |
| "learning_rate": 8.890799834231249e-05, | |
| "loss": 1.2104, | |
| "step": 5560 | |
| }, | |
| { | |
| "epoch": 1.15, | |
| "grad_norm": 1.0879154205322266, | |
| "learning_rate": 8.888727724823871e-05, | |
| "loss": 1.2408, | |
| "step": 5570 | |
| }, | |
| { | |
| "epoch": 1.15, | |
| "grad_norm": 1.177937388420105, | |
| "learning_rate": 8.886655615416494e-05, | |
| "loss": 1.221, | |
| "step": 5580 | |
| }, | |
| { | |
| "epoch": 1.15, | |
| "grad_norm": 1.0638147592544556, | |
| "learning_rate": 8.884583506009118e-05, | |
| "loss": 1.2309, | |
| "step": 5590 | |
| }, | |
| { | |
| "epoch": 1.16, | |
| "grad_norm": 1.0461276769638062, | |
| "learning_rate": 8.882511396601741e-05, | |
| "loss": 1.2498, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 1.16, | |
| "grad_norm": 0.9356592297554016, | |
| "learning_rate": 8.880439287194365e-05, | |
| "loss": 1.2584, | |
| "step": 5610 | |
| }, | |
| { | |
| "epoch": 1.16, | |
| "grad_norm": 0.9423561096191406, | |
| "learning_rate": 8.878367177786988e-05, | |
| "loss": 1.2094, | |
| "step": 5620 | |
| }, | |
| { | |
| "epoch": 1.16, | |
| "grad_norm": 1.1970158815383911, | |
| "learning_rate": 8.87629506837961e-05, | |
| "loss": 1.2134, | |
| "step": 5630 | |
| }, | |
| { | |
| "epoch": 1.16, | |
| "grad_norm": 1.1081819534301758, | |
| "learning_rate": 8.874222958972234e-05, | |
| "loss": 1.2134, | |
| "step": 5640 | |
| }, | |
| { | |
| "epoch": 1.17, | |
| "grad_norm": 1.1248120069503784, | |
| "learning_rate": 8.872150849564858e-05, | |
| "loss": 1.2288, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 1.17, | |
| "grad_norm": 1.0750600099563599, | |
| "learning_rate": 8.87007874015748e-05, | |
| "loss": 1.2191, | |
| "step": 5660 | |
| }, | |
| { | |
| "epoch": 1.17, | |
| "grad_norm": 1.058366060256958, | |
| "learning_rate": 8.868006630750105e-05, | |
| "loss": 1.2274, | |
| "step": 5670 | |
| }, | |
| { | |
| "epoch": 1.17, | |
| "grad_norm": 0.9436173439025879, | |
| "learning_rate": 8.865934521342727e-05, | |
| "loss": 1.2556, | |
| "step": 5680 | |
| }, | |
| { | |
| "epoch": 1.17, | |
| "grad_norm": 1.0624874830245972, | |
| "learning_rate": 8.86386241193535e-05, | |
| "loss": 1.2383, | |
| "step": 5690 | |
| }, | |
| { | |
| "epoch": 1.18, | |
| "grad_norm": 1.0870168209075928, | |
| "learning_rate": 8.861790302527974e-05, | |
| "loss": 1.2432, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 1.18, | |
| "grad_norm": 1.0561186075210571, | |
| "learning_rate": 8.859718193120597e-05, | |
| "loss": 1.2329, | |
| "step": 5710 | |
| }, | |
| { | |
| "epoch": 1.18, | |
| "grad_norm": 1.2139157056808472, | |
| "learning_rate": 8.857646083713221e-05, | |
| "loss": 1.2183, | |
| "step": 5720 | |
| }, | |
| { | |
| "epoch": 1.18, | |
| "grad_norm": 1.0662713050842285, | |
| "learning_rate": 8.855573974305844e-05, | |
| "loss": 1.2371, | |
| "step": 5730 | |
| }, | |
| { | |
| "epoch": 1.18, | |
| "grad_norm": 1.1198335886001587, | |
| "learning_rate": 8.853501864898466e-05, | |
| "loss": 1.2396, | |
| "step": 5740 | |
| }, | |
| { | |
| "epoch": 1.19, | |
| "grad_norm": 1.2191115617752075, | |
| "learning_rate": 8.851429755491091e-05, | |
| "loss": 1.2355, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 1.19, | |
| "grad_norm": 1.039201259613037, | |
| "learning_rate": 8.849357646083714e-05, | |
| "loss": 1.2251, | |
| "step": 5760 | |
| }, | |
| { | |
| "epoch": 1.19, | |
| "grad_norm": 0.9540196061134338, | |
| "learning_rate": 8.847285536676337e-05, | |
| "loss": 1.2424, | |
| "step": 5770 | |
| }, | |
| { | |
| "epoch": 1.19, | |
| "grad_norm": 1.0640240907669067, | |
| "learning_rate": 8.84521342726896e-05, | |
| "loss": 1.2414, | |
| "step": 5780 | |
| }, | |
| { | |
| "epoch": 1.19, | |
| "grad_norm": 1.0465424060821533, | |
| "learning_rate": 8.843141317861584e-05, | |
| "loss": 1.2241, | |
| "step": 5790 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "grad_norm": 1.1881465911865234, | |
| "learning_rate": 8.841069208454206e-05, | |
| "loss": 1.2182, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "grad_norm": 1.0851510763168335, | |
| "learning_rate": 8.83899709904683e-05, | |
| "loss": 1.2115, | |
| "step": 5810 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "grad_norm": 1.0743972063064575, | |
| "learning_rate": 8.836924989639453e-05, | |
| "loss": 1.2211, | |
| "step": 5820 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "grad_norm": 1.049249529838562, | |
| "learning_rate": 8.834852880232077e-05, | |
| "loss": 1.2452, | |
| "step": 5830 | |
| }, | |
| { | |
| "epoch": 1.21, | |
| "grad_norm": 1.0910190343856812, | |
| "learning_rate": 8.8327807708247e-05, | |
| "loss": 1.249, | |
| "step": 5840 | |
| }, | |
| { | |
| "epoch": 1.21, | |
| "grad_norm": 1.0976841449737549, | |
| "learning_rate": 8.830708661417322e-05, | |
| "loss": 1.2123, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 1.21, | |
| "grad_norm": 1.3569914102554321, | |
| "learning_rate": 8.828636552009947e-05, | |
| "loss": 1.2338, | |
| "step": 5860 | |
| }, | |
| { | |
| "epoch": 1.21, | |
| "grad_norm": 1.068427324295044, | |
| "learning_rate": 8.82656444260257e-05, | |
| "loss": 1.2351, | |
| "step": 5870 | |
| }, | |
| { | |
| "epoch": 1.21, | |
| "grad_norm": 1.1309690475463867, | |
| "learning_rate": 8.824492333195193e-05, | |
| "loss": 1.2367, | |
| "step": 5880 | |
| }, | |
| { | |
| "epoch": 1.22, | |
| "grad_norm": 1.1933242082595825, | |
| "learning_rate": 8.822420223787816e-05, | |
| "loss": 1.2261, | |
| "step": 5890 | |
| }, | |
| { | |
| "epoch": 1.22, | |
| "grad_norm": 1.029557466506958, | |
| "learning_rate": 8.82034811438044e-05, | |
| "loss": 1.2499, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 1.22, | |
| "grad_norm": 1.106086015701294, | |
| "learning_rate": 8.818276004973062e-05, | |
| "loss": 1.2236, | |
| "step": 5910 | |
| }, | |
| { | |
| "epoch": 1.22, | |
| "grad_norm": 1.0114145278930664, | |
| "learning_rate": 8.816203895565687e-05, | |
| "loss": 1.2445, | |
| "step": 5920 | |
| }, | |
| { | |
| "epoch": 1.22, | |
| "grad_norm": 1.1829569339752197, | |
| "learning_rate": 8.814131786158309e-05, | |
| "loss": 1.2334, | |
| "step": 5930 | |
| }, | |
| { | |
| "epoch": 1.23, | |
| "grad_norm": 1.1952738761901855, | |
| "learning_rate": 8.812059676750933e-05, | |
| "loss": 1.2402, | |
| "step": 5940 | |
| }, | |
| { | |
| "epoch": 1.23, | |
| "grad_norm": 1.0816442966461182, | |
| "learning_rate": 8.809987567343556e-05, | |
| "loss": 1.252, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 1.23, | |
| "grad_norm": 1.1453193426132202, | |
| "learning_rate": 8.80791545793618e-05, | |
| "loss": 1.233, | |
| "step": 5960 | |
| }, | |
| { | |
| "epoch": 1.23, | |
| "grad_norm": 1.045602798461914, | |
| "learning_rate": 8.805843348528803e-05, | |
| "loss": 1.2195, | |
| "step": 5970 | |
| }, | |
| { | |
| "epoch": 1.23, | |
| "grad_norm": 1.0934393405914307, | |
| "learning_rate": 8.803771239121425e-05, | |
| "loss": 1.2177, | |
| "step": 5980 | |
| }, | |
| { | |
| "epoch": 1.24, | |
| "grad_norm": 1.0120600461959839, | |
| "learning_rate": 8.801699129714049e-05, | |
| "loss": 1.2329, | |
| "step": 5990 | |
| }, | |
| { | |
| "epoch": 1.24, | |
| "grad_norm": 1.103251576423645, | |
| "learning_rate": 8.799627020306672e-05, | |
| "loss": 1.2379, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 1.24, | |
| "grad_norm": 1.0402165651321411, | |
| "learning_rate": 8.797554910899296e-05, | |
| "loss": 1.2256, | |
| "step": 6010 | |
| }, | |
| { | |
| "epoch": 1.24, | |
| "grad_norm": 1.0983202457427979, | |
| "learning_rate": 8.79548280149192e-05, | |
| "loss": 1.2296, | |
| "step": 6020 | |
| }, | |
| { | |
| "epoch": 1.24, | |
| "grad_norm": 1.0412105321884155, | |
| "learning_rate": 8.793410692084543e-05, | |
| "loss": 1.2228, | |
| "step": 6030 | |
| }, | |
| { | |
| "epoch": 1.25, | |
| "grad_norm": 1.0473746061325073, | |
| "learning_rate": 8.791338582677165e-05, | |
| "loss": 1.2228, | |
| "step": 6040 | |
| }, | |
| { | |
| "epoch": 1.25, | |
| "grad_norm": 1.004840612411499, | |
| "learning_rate": 8.78926647326979e-05, | |
| "loss": 1.2108, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 1.25, | |
| "grad_norm": 1.1168406009674072, | |
| "learning_rate": 8.787194363862412e-05, | |
| "loss": 1.2104, | |
| "step": 6060 | |
| }, | |
| { | |
| "epoch": 1.25, | |
| "grad_norm": 1.1124876737594604, | |
| "learning_rate": 8.785122254455036e-05, | |
| "loss": 1.2336, | |
| "step": 6070 | |
| }, | |
| { | |
| "epoch": 1.25, | |
| "grad_norm": 1.059415340423584, | |
| "learning_rate": 8.783050145047659e-05, | |
| "loss": 1.2438, | |
| "step": 6080 | |
| }, | |
| { | |
| "epoch": 1.26, | |
| "grad_norm": 1.063344955444336, | |
| "learning_rate": 8.780978035640283e-05, | |
| "loss": 1.2352, | |
| "step": 6090 | |
| }, | |
| { | |
| "epoch": 1.26, | |
| "grad_norm": 1.0552620887756348, | |
| "learning_rate": 8.778905926232905e-05, | |
| "loss": 1.2131, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 1.26, | |
| "grad_norm": 1.1673498153686523, | |
| "learning_rate": 8.77683381682553e-05, | |
| "loss": 1.213, | |
| "step": 6110 | |
| }, | |
| { | |
| "epoch": 1.26, | |
| "grad_norm": 1.1740206480026245, | |
| "learning_rate": 8.774761707418152e-05, | |
| "loss": 1.2162, | |
| "step": 6120 | |
| }, | |
| { | |
| "epoch": 1.26, | |
| "grad_norm": 1.0944995880126953, | |
| "learning_rate": 8.772689598010775e-05, | |
| "loss": 1.2005, | |
| "step": 6130 | |
| }, | |
| { | |
| "epoch": 1.27, | |
| "grad_norm": 1.0152305364608765, | |
| "learning_rate": 8.770617488603399e-05, | |
| "loss": 1.2198, | |
| "step": 6140 | |
| }, | |
| { | |
| "epoch": 1.27, | |
| "grad_norm": 1.1265654563903809, | |
| "learning_rate": 8.768545379196021e-05, | |
| "loss": 1.2362, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 1.27, | |
| "grad_norm": 1.151207447052002, | |
| "learning_rate": 8.766473269788646e-05, | |
| "loss": 1.2561, | |
| "step": 6160 | |
| }, | |
| { | |
| "epoch": 1.27, | |
| "grad_norm": 1.035855770111084, | |
| "learning_rate": 8.764401160381268e-05, | |
| "loss": 1.2391, | |
| "step": 6170 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "grad_norm": 1.091009497642517, | |
| "learning_rate": 8.762329050973892e-05, | |
| "loss": 1.2342, | |
| "step": 6180 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "grad_norm": 1.057400107383728, | |
| "learning_rate": 8.760256941566515e-05, | |
| "loss": 1.2216, | |
| "step": 6190 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "grad_norm": 0.9303261041641235, | |
| "learning_rate": 8.758184832159139e-05, | |
| "loss": 1.2352, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "grad_norm": 1.2061206102371216, | |
| "learning_rate": 8.756112722751761e-05, | |
| "loss": 1.215, | |
| "step": 6210 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "grad_norm": 1.0886818170547485, | |
| "learning_rate": 8.754040613344386e-05, | |
| "loss": 1.2266, | |
| "step": 6220 | |
| }, | |
| { | |
| "epoch": 1.29, | |
| "grad_norm": 1.279563546180725, | |
| "learning_rate": 8.751968503937008e-05, | |
| "loss": 1.237, | |
| "step": 6230 | |
| }, | |
| { | |
| "epoch": 1.29, | |
| "grad_norm": 1.0063716173171997, | |
| "learning_rate": 8.749896394529631e-05, | |
| "loss": 1.2119, | |
| "step": 6240 | |
| }, | |
| { | |
| "epoch": 1.29, | |
| "grad_norm": 1.0414812564849854, | |
| "learning_rate": 8.747824285122255e-05, | |
| "loss": 1.2177, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 1.29, | |
| "grad_norm": 1.0915932655334473, | |
| "learning_rate": 8.745752175714878e-05, | |
| "loss": 1.2422, | |
| "step": 6260 | |
| }, | |
| { | |
| "epoch": 1.29, | |
| "grad_norm": 1.0544465780258179, | |
| "learning_rate": 8.743680066307502e-05, | |
| "loss": 1.234, | |
| "step": 6270 | |
| }, | |
| { | |
| "epoch": 1.3, | |
| "grad_norm": 1.1502125263214111, | |
| "learning_rate": 8.741607956900125e-05, | |
| "loss": 1.2183, | |
| "step": 6280 | |
| }, | |
| { | |
| "epoch": 1.3, | |
| "grad_norm": 1.3113386631011963, | |
| "learning_rate": 8.739535847492748e-05, | |
| "loss": 1.202, | |
| "step": 6290 | |
| }, | |
| { | |
| "epoch": 1.3, | |
| "grad_norm": 1.087583303451538, | |
| "learning_rate": 8.737463738085371e-05, | |
| "loss": 1.241, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 1.3, | |
| "grad_norm": 1.135565161705017, | |
| "learning_rate": 8.735391628677995e-05, | |
| "loss": 1.2411, | |
| "step": 6310 | |
| }, | |
| { | |
| "epoch": 1.3, | |
| "grad_norm": 1.0986896753311157, | |
| "learning_rate": 8.733319519270617e-05, | |
| "loss": 1.2178, | |
| "step": 6320 | |
| }, | |
| { | |
| "epoch": 1.31, | |
| "grad_norm": 1.6357513666152954, | |
| "learning_rate": 8.731247409863242e-05, | |
| "loss": 1.2102, | |
| "step": 6330 | |
| }, | |
| { | |
| "epoch": 1.31, | |
| "grad_norm": 1.0731899738311768, | |
| "learning_rate": 8.729175300455864e-05, | |
| "loss": 1.213, | |
| "step": 6340 | |
| }, | |
| { | |
| "epoch": 1.31, | |
| "grad_norm": 1.1432324647903442, | |
| "learning_rate": 8.727103191048487e-05, | |
| "loss": 1.2395, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 1.31, | |
| "grad_norm": 1.0547071695327759, | |
| "learning_rate": 8.725031081641111e-05, | |
| "loss": 1.2121, | |
| "step": 6360 | |
| }, | |
| { | |
| "epoch": 1.31, | |
| "grad_norm": 1.1022225618362427, | |
| "learning_rate": 8.722958972233734e-05, | |
| "loss": 1.2274, | |
| "step": 6370 | |
| }, | |
| { | |
| "epoch": 1.32, | |
| "grad_norm": 1.0772980451583862, | |
| "learning_rate": 8.720886862826358e-05, | |
| "loss": 1.2244, | |
| "step": 6380 | |
| }, | |
| { | |
| "epoch": 1.32, | |
| "grad_norm": 1.073470115661621, | |
| "learning_rate": 8.718814753418981e-05, | |
| "loss": 1.2243, | |
| "step": 6390 | |
| }, | |
| { | |
| "epoch": 1.32, | |
| "grad_norm": 1.1750410795211792, | |
| "learning_rate": 8.716742644011604e-05, | |
| "loss": 1.2191, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 1.32, | |
| "grad_norm": 1.206298828125, | |
| "learning_rate": 8.714670534604228e-05, | |
| "loss": 1.2284, | |
| "step": 6410 | |
| }, | |
| { | |
| "epoch": 1.32, | |
| "grad_norm": 1.1253398656845093, | |
| "learning_rate": 8.71259842519685e-05, | |
| "loss": 1.2222, | |
| "step": 6420 | |
| }, | |
| { | |
| "epoch": 1.33, | |
| "grad_norm": 1.1828629970550537, | |
| "learning_rate": 8.710526315789474e-05, | |
| "loss": 1.225, | |
| "step": 6430 | |
| }, | |
| { | |
| "epoch": 1.33, | |
| "grad_norm": 1.0462387800216675, | |
| "learning_rate": 8.708454206382098e-05, | |
| "loss": 1.2156, | |
| "step": 6440 | |
| }, | |
| { | |
| "epoch": 1.33, | |
| "grad_norm": 1.3468637466430664, | |
| "learning_rate": 8.706382096974721e-05, | |
| "loss": 1.2111, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 1.33, | |
| "grad_norm": 1.1359398365020752, | |
| "learning_rate": 8.704309987567345e-05, | |
| "loss": 1.2022, | |
| "step": 6460 | |
| }, | |
| { | |
| "epoch": 1.34, | |
| "grad_norm": 1.0748237371444702, | |
| "learning_rate": 8.702237878159967e-05, | |
| "loss": 1.2227, | |
| "step": 6470 | |
| }, | |
| { | |
| "epoch": 1.34, | |
| "grad_norm": 1.0240176916122437, | |
| "learning_rate": 8.70016576875259e-05, | |
| "loss": 1.2042, | |
| "step": 6480 | |
| }, | |
| { | |
| "epoch": 1.34, | |
| "grad_norm": 1.0575100183486938, | |
| "learning_rate": 8.698093659345214e-05, | |
| "loss": 1.2166, | |
| "step": 6490 | |
| }, | |
| { | |
| "epoch": 1.34, | |
| "grad_norm": 1.0095828771591187, | |
| "learning_rate": 8.696021549937837e-05, | |
| "loss": 1.2352, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 1.34, | |
| "grad_norm": 1.140643835067749, | |
| "learning_rate": 8.69394944053046e-05, | |
| "loss": 1.2143, | |
| "step": 6510 | |
| }, | |
| { | |
| "epoch": 1.35, | |
| "grad_norm": 1.0403310060501099, | |
| "learning_rate": 8.691877331123084e-05, | |
| "loss": 1.2268, | |
| "step": 6520 | |
| }, | |
| { | |
| "epoch": 1.35, | |
| "grad_norm": 1.1176542043685913, | |
| "learning_rate": 8.689805221715706e-05, | |
| "loss": 1.2123, | |
| "step": 6530 | |
| }, | |
| { | |
| "epoch": 1.35, | |
| "grad_norm": 1.041100025177002, | |
| "learning_rate": 8.68773311230833e-05, | |
| "loss": 1.2356, | |
| "step": 6540 | |
| }, | |
| { | |
| "epoch": 1.35, | |
| "grad_norm": 0.9852685928344727, | |
| "learning_rate": 8.685661002900954e-05, | |
| "loss": 1.2043, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 1.35, | |
| "grad_norm": 1.206588864326477, | |
| "learning_rate": 8.683588893493577e-05, | |
| "loss": 1.2132, | |
| "step": 6560 | |
| }, | |
| { | |
| "epoch": 1.36, | |
| "grad_norm": 1.1841477155685425, | |
| "learning_rate": 8.6815167840862e-05, | |
| "loss": 1.241, | |
| "step": 6570 | |
| }, | |
| { | |
| "epoch": 1.36, | |
| "grad_norm": 1.1257410049438477, | |
| "learning_rate": 8.679444674678824e-05, | |
| "loss": 1.2241, | |
| "step": 6580 | |
| }, | |
| { | |
| "epoch": 1.36, | |
| "grad_norm": 1.053404688835144, | |
| "learning_rate": 8.677372565271446e-05, | |
| "loss": 1.2409, | |
| "step": 6590 | |
| }, | |
| { | |
| "epoch": 1.36, | |
| "grad_norm": 1.109271764755249, | |
| "learning_rate": 8.675300455864071e-05, | |
| "loss": 1.2036, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 1.36, | |
| "grad_norm": 1.0103741884231567, | |
| "learning_rate": 8.673228346456693e-05, | |
| "loss": 1.2128, | |
| "step": 6610 | |
| }, | |
| { | |
| "epoch": 1.37, | |
| "grad_norm": 1.149715542793274, | |
| "learning_rate": 8.671156237049317e-05, | |
| "loss": 1.208, | |
| "step": 6620 | |
| }, | |
| { | |
| "epoch": 1.37, | |
| "grad_norm": 1.0523351430892944, | |
| "learning_rate": 8.66908412764194e-05, | |
| "loss": 1.2146, | |
| "step": 6630 | |
| }, | |
| { | |
| "epoch": 1.37, | |
| "grad_norm": 1.2250432968139648, | |
| "learning_rate": 8.667012018234562e-05, | |
| "loss": 1.2454, | |
| "step": 6640 | |
| }, | |
| { | |
| "epoch": 1.37, | |
| "grad_norm": 0.9975298643112183, | |
| "learning_rate": 8.664939908827186e-05, | |
| "loss": 1.228, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 1.37, | |
| "grad_norm": 1.0323069095611572, | |
| "learning_rate": 8.66286779941981e-05, | |
| "loss": 1.2223, | |
| "step": 6660 | |
| }, | |
| { | |
| "epoch": 1.38, | |
| "grad_norm": 1.0941072702407837, | |
| "learning_rate": 8.660795690012433e-05, | |
| "loss": 1.2214, | |
| "step": 6670 | |
| }, | |
| { | |
| "epoch": 1.38, | |
| "grad_norm": 1.2442706823349, | |
| "learning_rate": 8.658723580605056e-05, | |
| "loss": 1.2316, | |
| "step": 6680 | |
| }, | |
| { | |
| "epoch": 1.38, | |
| "grad_norm": 1.1723049879074097, | |
| "learning_rate": 8.65665147119768e-05, | |
| "loss": 1.2184, | |
| "step": 6690 | |
| }, | |
| { | |
| "epoch": 1.38, | |
| "grad_norm": 1.1901644468307495, | |
| "learning_rate": 8.654579361790302e-05, | |
| "loss": 1.2382, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 1.38, | |
| "grad_norm": 1.1003718376159668, | |
| "learning_rate": 8.652507252382927e-05, | |
| "loss": 1.2272, | |
| "step": 6710 | |
| }, | |
| { | |
| "epoch": 1.39, | |
| "grad_norm": 1.1174200773239136, | |
| "learning_rate": 8.650435142975549e-05, | |
| "loss": 1.2248, | |
| "step": 6720 | |
| }, | |
| { | |
| "epoch": 1.39, | |
| "grad_norm": 1.2489265203475952, | |
| "learning_rate": 8.648363033568173e-05, | |
| "loss": 1.2252, | |
| "step": 6730 | |
| }, | |
| { | |
| "epoch": 1.39, | |
| "grad_norm": 1.2472732067108154, | |
| "learning_rate": 8.646290924160796e-05, | |
| "loss": 1.227, | |
| "step": 6740 | |
| }, | |
| { | |
| "epoch": 1.39, | |
| "grad_norm": 1.0011487007141113, | |
| "learning_rate": 8.64421881475342e-05, | |
| "loss": 1.2267, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 1.39, | |
| "grad_norm": 1.060117244720459, | |
| "learning_rate": 8.642146705346042e-05, | |
| "loss": 1.2263, | |
| "step": 6760 | |
| }, | |
| { | |
| "epoch": 1.4, | |
| "grad_norm": 1.102454423904419, | |
| "learning_rate": 8.640074595938667e-05, | |
| "loss": 1.2205, | |
| "step": 6770 | |
| }, | |
| { | |
| "epoch": 1.4, | |
| "grad_norm": 1.0496042966842651, | |
| "learning_rate": 8.638002486531289e-05, | |
| "loss": 1.2207, | |
| "step": 6780 | |
| }, | |
| { | |
| "epoch": 1.4, | |
| "grad_norm": 1.005552053451538, | |
| "learning_rate": 8.635930377123912e-05, | |
| "loss": 1.1992, | |
| "step": 6790 | |
| }, | |
| { | |
| "epoch": 1.4, | |
| "grad_norm": 1.0828580856323242, | |
| "learning_rate": 8.633858267716536e-05, | |
| "loss": 1.2164, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 1.41, | |
| "grad_norm": 1.0597617626190186, | |
| "learning_rate": 8.631786158309158e-05, | |
| "loss": 1.2081, | |
| "step": 6810 | |
| }, | |
| { | |
| "epoch": 1.41, | |
| "grad_norm": 1.1591441631317139, | |
| "learning_rate": 8.629714048901783e-05, | |
| "loss": 1.2315, | |
| "step": 6820 | |
| }, | |
| { | |
| "epoch": 1.41, | |
| "grad_norm": 1.1969908475875854, | |
| "learning_rate": 8.627641939494405e-05, | |
| "loss": 1.2307, | |
| "step": 6830 | |
| }, | |
| { | |
| "epoch": 1.41, | |
| "grad_norm": 1.1163939237594604, | |
| "learning_rate": 8.625569830087029e-05, | |
| "loss": 1.2272, | |
| "step": 6840 | |
| }, | |
| { | |
| "epoch": 1.41, | |
| "grad_norm": 1.047241449356079, | |
| "learning_rate": 8.623497720679652e-05, | |
| "loss": 1.1991, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 1.42, | |
| "grad_norm": 1.0410964488983154, | |
| "learning_rate": 8.621425611272276e-05, | |
| "loss": 1.211, | |
| "step": 6860 | |
| }, | |
| { | |
| "epoch": 1.42, | |
| "grad_norm": 1.0674628019332886, | |
| "learning_rate": 8.619353501864899e-05, | |
| "loss": 1.2195, | |
| "step": 6870 | |
| }, | |
| { | |
| "epoch": 1.42, | |
| "grad_norm": 0.9892363548278809, | |
| "learning_rate": 8.617281392457523e-05, | |
| "loss": 1.2344, | |
| "step": 6880 | |
| }, | |
| { | |
| "epoch": 1.42, | |
| "grad_norm": 1.08130943775177, | |
| "learning_rate": 8.615209283050145e-05, | |
| "loss": 1.2299, | |
| "step": 6890 | |
| }, | |
| { | |
| "epoch": 1.42, | |
| "grad_norm": 1.0285786390304565, | |
| "learning_rate": 8.613137173642768e-05, | |
| "loss": 1.2166, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 1.43, | |
| "grad_norm": 1.0203038454055786, | |
| "learning_rate": 8.611065064235392e-05, | |
| "loss": 1.2043, | |
| "step": 6910 | |
| }, | |
| { | |
| "epoch": 1.43, | |
| "grad_norm": 1.17252516746521, | |
| "learning_rate": 8.608992954828015e-05, | |
| "loss": 1.2278, | |
| "step": 6920 | |
| }, | |
| { | |
| "epoch": 1.43, | |
| "grad_norm": 1.233001708984375, | |
| "learning_rate": 8.606920845420639e-05, | |
| "loss": 1.233, | |
| "step": 6930 | |
| }, | |
| { | |
| "epoch": 1.43, | |
| "grad_norm": 1.0858713388442993, | |
| "learning_rate": 8.604848736013262e-05, | |
| "loss": 1.2174, | |
| "step": 6940 | |
| }, | |
| { | |
| "epoch": 1.43, | |
| "grad_norm": 1.069405198097229, | |
| "learning_rate": 8.602776626605885e-05, | |
| "loss": 1.2176, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "grad_norm": 1.0860111713409424, | |
| "learning_rate": 8.60070451719851e-05, | |
| "loss": 1.2179, | |
| "step": 6960 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "grad_norm": 1.111624836921692, | |
| "learning_rate": 8.598632407791132e-05, | |
| "loss": 1.2167, | |
| "step": 6970 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "grad_norm": 1.092925786972046, | |
| "learning_rate": 8.596560298383755e-05, | |
| "loss": 1.2154, | |
| "step": 6980 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "grad_norm": 1.115460753440857, | |
| "learning_rate": 8.594488188976379e-05, | |
| "loss": 1.2052, | |
| "step": 6990 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "grad_norm": 1.229777455329895, | |
| "learning_rate": 8.592416079569001e-05, | |
| "loss": 1.1769, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 1.45, | |
| "grad_norm": 1.1182063817977905, | |
| "learning_rate": 8.590343970161626e-05, | |
| "loss": 1.2035, | |
| "step": 7010 | |
| }, | |
| { | |
| "epoch": 1.45, | |
| "grad_norm": 1.0315207242965698, | |
| "learning_rate": 8.588271860754248e-05, | |
| "loss": 1.2158, | |
| "step": 7020 | |
| }, | |
| { | |
| "epoch": 1.45, | |
| "grad_norm": 1.1051239967346191, | |
| "learning_rate": 8.586199751346871e-05, | |
| "loss": 1.1874, | |
| "step": 7030 | |
| }, | |
| { | |
| "epoch": 1.45, | |
| "grad_norm": 1.0437036752700806, | |
| "learning_rate": 8.584127641939495e-05, | |
| "loss": 1.2421, | |
| "step": 7040 | |
| }, | |
| { | |
| "epoch": 1.45, | |
| "grad_norm": 1.1060231924057007, | |
| "learning_rate": 8.582055532532118e-05, | |
| "loss": 1.2392, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 1.46, | |
| "grad_norm": 1.0951759815216064, | |
| "learning_rate": 8.57998342312474e-05, | |
| "loss": 1.2348, | |
| "step": 7060 | |
| }, | |
| { | |
| "epoch": 1.46, | |
| "grad_norm": 1.7351630926132202, | |
| "learning_rate": 8.577911313717365e-05, | |
| "loss": 1.2198, | |
| "step": 7070 | |
| }, | |
| { | |
| "epoch": 1.46, | |
| "grad_norm": 1.3410096168518066, | |
| "learning_rate": 8.575839204309988e-05, | |
| "loss": 1.2069, | |
| "step": 7080 | |
| }, | |
| { | |
| "epoch": 1.46, | |
| "grad_norm": 1.1723905801773071, | |
| "learning_rate": 8.573767094902611e-05, | |
| "loss": 1.2264, | |
| "step": 7090 | |
| }, | |
| { | |
| "epoch": 1.47, | |
| "grad_norm": 1.0400230884552002, | |
| "learning_rate": 8.571694985495235e-05, | |
| "loss": 1.2285, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 1.47, | |
| "grad_norm": 1.1555556058883667, | |
| "learning_rate": 8.569622876087858e-05, | |
| "loss": 1.1982, | |
| "step": 7110 | |
| }, | |
| { | |
| "epoch": 1.47, | |
| "grad_norm": 1.0779943466186523, | |
| "learning_rate": 8.567550766680482e-05, | |
| "loss": 1.2241, | |
| "step": 7120 | |
| }, | |
| { | |
| "epoch": 1.47, | |
| "grad_norm": 1.011435627937317, | |
| "learning_rate": 8.565478657273105e-05, | |
| "loss": 1.1919, | |
| "step": 7130 | |
| }, | |
| { | |
| "epoch": 1.47, | |
| "grad_norm": 1.1413905620574951, | |
| "learning_rate": 8.563406547865727e-05, | |
| "loss": 1.2026, | |
| "step": 7140 | |
| }, | |
| { | |
| "epoch": 1.48, | |
| "grad_norm": 1.305766224861145, | |
| "learning_rate": 8.561334438458351e-05, | |
| "loss": 1.2243, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 1.48, | |
| "grad_norm": 1.1241309642791748, | |
| "learning_rate": 8.559262329050974e-05, | |
| "loss": 1.2305, | |
| "step": 7160 | |
| }, | |
| { | |
| "epoch": 1.48, | |
| "grad_norm": 1.1354045867919922, | |
| "learning_rate": 8.557190219643596e-05, | |
| "loss": 1.2203, | |
| "step": 7170 | |
| }, | |
| { | |
| "epoch": 1.48, | |
| "grad_norm": 1.0294325351715088, | |
| "learning_rate": 8.555118110236221e-05, | |
| "loss": 1.2356, | |
| "step": 7180 | |
| }, | |
| { | |
| "epoch": 1.48, | |
| "grad_norm": 1.1068978309631348, | |
| "learning_rate": 8.553046000828844e-05, | |
| "loss": 1.2191, | |
| "step": 7190 | |
| }, | |
| { | |
| "epoch": 1.49, | |
| "grad_norm": 1.0707268714904785, | |
| "learning_rate": 8.550973891421467e-05, | |
| "loss": 1.2168, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 1.49, | |
| "grad_norm": 1.158420443534851, | |
| "learning_rate": 8.54890178201409e-05, | |
| "loss": 1.1924, | |
| "step": 7210 | |
| }, | |
| { | |
| "epoch": 1.49, | |
| "grad_norm": 1.018847107887268, | |
| "learning_rate": 8.546829672606714e-05, | |
| "loss": 1.2326, | |
| "step": 7220 | |
| }, | |
| { | |
| "epoch": 1.49, | |
| "grad_norm": 1.1226823329925537, | |
| "learning_rate": 8.544757563199338e-05, | |
| "loss": 1.2029, | |
| "step": 7230 | |
| }, | |
| { | |
| "epoch": 1.49, | |
| "grad_norm": 1.1094051599502563, | |
| "learning_rate": 8.542685453791961e-05, | |
| "loss": 1.2005, | |
| "step": 7240 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "grad_norm": 1.1425297260284424, | |
| "learning_rate": 8.540613344384583e-05, | |
| "loss": 1.2109, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "grad_norm": 1.0846836566925049, | |
| "learning_rate": 8.538541234977208e-05, | |
| "loss": 1.2098, | |
| "step": 7260 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "grad_norm": 1.0861196517944336, | |
| "learning_rate": 8.53646912556983e-05, | |
| "loss": 1.2201, | |
| "step": 7270 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "grad_norm": 1.0006380081176758, | |
| "learning_rate": 8.534397016162454e-05, | |
| "loss": 1.2457, | |
| "step": 7280 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "grad_norm": 1.0799134969711304, | |
| "learning_rate": 8.532324906755077e-05, | |
| "loss": 1.2117, | |
| "step": 7290 | |
| }, | |
| { | |
| "epoch": 1.51, | |
| "grad_norm": 1.1802451610565186, | |
| "learning_rate": 8.530252797347701e-05, | |
| "loss": 1.2277, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 1.51, | |
| "grad_norm": 1.0235154628753662, | |
| "learning_rate": 8.528180687940323e-05, | |
| "loss": 1.204, | |
| "step": 7310 | |
| }, | |
| { | |
| "epoch": 1.51, | |
| "grad_norm": 1.1273279190063477, | |
| "learning_rate": 8.526108578532946e-05, | |
| "loss": 1.2179, | |
| "step": 7320 | |
| }, | |
| { | |
| "epoch": 1.51, | |
| "grad_norm": 0.9829218983650208, | |
| "learning_rate": 8.52403646912557e-05, | |
| "loss": 1.1913, | |
| "step": 7330 | |
| }, | |
| { | |
| "epoch": 1.51, | |
| "grad_norm": 1.1805267333984375, | |
| "learning_rate": 8.521964359718194e-05, | |
| "loss": 1.1822, | |
| "step": 7340 | |
| }, | |
| { | |
| "epoch": 1.52, | |
| "grad_norm": 1.026557207107544, | |
| "learning_rate": 8.519892250310817e-05, | |
| "loss": 1.1912, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 1.52, | |
| "grad_norm": 1.04379141330719, | |
| "learning_rate": 8.517820140903439e-05, | |
| "loss": 1.2093, | |
| "step": 7360 | |
| }, | |
| { | |
| "epoch": 1.52, | |
| "grad_norm": 1.0553088188171387, | |
| "learning_rate": 8.515748031496064e-05, | |
| "loss": 1.2213, | |
| "step": 7370 | |
| }, | |
| { | |
| "epoch": 1.52, | |
| "grad_norm": 0.8822417855262756, | |
| "learning_rate": 8.513675922088686e-05, | |
| "loss": 1.2223, | |
| "step": 7380 | |
| }, | |
| { | |
| "epoch": 1.52, | |
| "grad_norm": 0.9738419055938721, | |
| "learning_rate": 8.51160381268131e-05, | |
| "loss": 1.2011, | |
| "step": 7390 | |
| }, | |
| { | |
| "epoch": 1.53, | |
| "grad_norm": 1.0899471044540405, | |
| "learning_rate": 8.509738914214672e-05, | |
| "loss": 1.2406, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 1.53, | |
| "grad_norm": 1.1138182878494263, | |
| "learning_rate": 8.507666804807294e-05, | |
| "loss": 1.1963, | |
| "step": 7410 | |
| }, | |
| { | |
| "epoch": 1.53, | |
| "grad_norm": 1.1311627626419067, | |
| "learning_rate": 8.505594695399917e-05, | |
| "loss": 1.1977, | |
| "step": 7420 | |
| }, | |
| { | |
| "epoch": 1.53, | |
| "grad_norm": 2.138723134994507, | |
| "learning_rate": 8.503522585992541e-05, | |
| "loss": 1.2104, | |
| "step": 7430 | |
| }, | |
| { | |
| "epoch": 1.54, | |
| "grad_norm": 1.0873496532440186, | |
| "learning_rate": 8.501450476585164e-05, | |
| "loss": 1.2113, | |
| "step": 7440 | |
| }, | |
| { | |
| "epoch": 1.54, | |
| "grad_norm": 0.9628106355667114, | |
| "learning_rate": 8.499378367177787e-05, | |
| "loss": 1.2258, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 1.54, | |
| "grad_norm": 1.1409735679626465, | |
| "learning_rate": 8.497306257770411e-05, | |
| "loss": 1.1964, | |
| "step": 7460 | |
| }, | |
| { | |
| "epoch": 1.54, | |
| "grad_norm": 1.111512541770935, | |
| "learning_rate": 8.495234148363034e-05, | |
| "loss": 1.2061, | |
| "step": 7470 | |
| }, | |
| { | |
| "epoch": 1.54, | |
| "grad_norm": 1.0905687808990479, | |
| "learning_rate": 8.493162038955657e-05, | |
| "loss": 1.2025, | |
| "step": 7480 | |
| }, | |
| { | |
| "epoch": 1.55, | |
| "grad_norm": 1.1417665481567383, | |
| "learning_rate": 8.49108992954828e-05, | |
| "loss": 1.1861, | |
| "step": 7490 | |
| }, | |
| { | |
| "epoch": 1.55, | |
| "grad_norm": 1.024594783782959, | |
| "learning_rate": 8.489017820140904e-05, | |
| "loss": 1.1893, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 1.55, | |
| "grad_norm": 1.112746238708496, | |
| "learning_rate": 8.486945710733528e-05, | |
| "loss": 1.2196, | |
| "step": 7510 | |
| }, | |
| { | |
| "epoch": 1.55, | |
| "grad_norm": 1.1404857635498047, | |
| "learning_rate": 8.484873601326151e-05, | |
| "loss": 1.2131, | |
| "step": 7520 | |
| }, | |
| { | |
| "epoch": 1.55, | |
| "grad_norm": 1.1401077508926392, | |
| "learning_rate": 8.482801491918773e-05, | |
| "loss": 1.2107, | |
| "step": 7530 | |
| }, | |
| { | |
| "epoch": 1.56, | |
| "grad_norm": 1.2235480546951294, | |
| "learning_rate": 8.480729382511397e-05, | |
| "loss": 1.2363, | |
| "step": 7540 | |
| }, | |
| { | |
| "epoch": 1.56, | |
| "grad_norm": 1.0837748050689697, | |
| "learning_rate": 8.47865727310402e-05, | |
| "loss": 1.2139, | |
| "step": 7550 | |
| }, | |
| { | |
| "epoch": 1.56, | |
| "grad_norm": 1.22895348072052, | |
| "learning_rate": 8.476585163696643e-05, | |
| "loss": 1.1998, | |
| "step": 7560 | |
| }, | |
| { | |
| "epoch": 1.56, | |
| "grad_norm": 1.105093240737915, | |
| "learning_rate": 8.474513054289267e-05, | |
| "loss": 1.2212, | |
| "step": 7570 | |
| }, | |
| { | |
| "epoch": 1.56, | |
| "grad_norm": 1.222209095954895, | |
| "learning_rate": 8.47244094488189e-05, | |
| "loss": 1.2453, | |
| "step": 7580 | |
| }, | |
| { | |
| "epoch": 1.57, | |
| "grad_norm": 1.1390637159347534, | |
| "learning_rate": 8.470368835474513e-05, | |
| "loss": 1.2071, | |
| "step": 7590 | |
| }, | |
| { | |
| "epoch": 1.57, | |
| "grad_norm": 1.0356426239013672, | |
| "learning_rate": 8.468296726067137e-05, | |
| "loss": 1.2157, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 1.57, | |
| "grad_norm": 1.0771561861038208, | |
| "learning_rate": 8.46622461665976e-05, | |
| "loss": 1.212, | |
| "step": 7610 | |
| }, | |
| { | |
| "epoch": 1.57, | |
| "grad_norm": 1.3607767820358276, | |
| "learning_rate": 8.464152507252384e-05, | |
| "loss": 1.2253, | |
| "step": 7620 | |
| }, | |
| { | |
| "epoch": 1.57, | |
| "grad_norm": 1.0732100009918213, | |
| "learning_rate": 8.462080397845007e-05, | |
| "loss": 1.1938, | |
| "step": 7630 | |
| }, | |
| { | |
| "epoch": 1.58, | |
| "grad_norm": 1.0551025867462158, | |
| "learning_rate": 8.460008288437629e-05, | |
| "loss": 1.2163, | |
| "step": 7640 | |
| }, | |
| { | |
| "epoch": 1.58, | |
| "grad_norm": 1.127244472503662, | |
| "learning_rate": 8.457936179030254e-05, | |
| "loss": 1.1956, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 1.58, | |
| "grad_norm": 1.0608160495758057, | |
| "learning_rate": 8.455864069622876e-05, | |
| "loss": 1.191, | |
| "step": 7660 | |
| }, | |
| { | |
| "epoch": 1.58, | |
| "grad_norm": 1.0632834434509277, | |
| "learning_rate": 8.4537919602155e-05, | |
| "loss": 1.1971, | |
| "step": 7670 | |
| }, | |
| { | |
| "epoch": 1.58, | |
| "grad_norm": 1.1187163591384888, | |
| "learning_rate": 8.451719850808123e-05, | |
| "loss": 1.2156, | |
| "step": 7680 | |
| }, | |
| { | |
| "epoch": 1.59, | |
| "grad_norm": 1.135420799255371, | |
| "learning_rate": 8.449647741400747e-05, | |
| "loss": 1.1953, | |
| "step": 7690 | |
| }, | |
| { | |
| "epoch": 1.59, | |
| "grad_norm": 1.1564319133758545, | |
| "learning_rate": 8.447575631993369e-05, | |
| "loss": 1.2169, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 1.59, | |
| "grad_norm": 1.1321407556533813, | |
| "learning_rate": 8.445503522585993e-05, | |
| "loss": 1.1869, | |
| "step": 7710 | |
| }, | |
| { | |
| "epoch": 1.59, | |
| "grad_norm": 1.097676157951355, | |
| "learning_rate": 8.443431413178616e-05, | |
| "loss": 1.2039, | |
| "step": 7720 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 0.9873398542404175, | |
| "learning_rate": 8.44135930377124e-05, | |
| "loss": 1.2115, | |
| "step": 7730 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 1.0774983167648315, | |
| "learning_rate": 8.439287194363863e-05, | |
| "loss": 1.2089, | |
| "step": 7740 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 1.199475884437561, | |
| "learning_rate": 8.437215084956485e-05, | |
| "loss": 1.1913, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 1.2517530918121338, | |
| "learning_rate": 8.43514297554911e-05, | |
| "loss": 1.24, | |
| "step": 7760 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 1.1117113828659058, | |
| "learning_rate": 8.433070866141732e-05, | |
| "loss": 1.1907, | |
| "step": 7770 | |
| }, | |
| { | |
| "epoch": 1.61, | |
| "grad_norm": 1.1518152952194214, | |
| "learning_rate": 8.430998756734356e-05, | |
| "loss": 1.2194, | |
| "step": 7780 | |
| }, | |
| { | |
| "epoch": 1.61, | |
| "grad_norm": 1.0633752346038818, | |
| "learning_rate": 8.428926647326979e-05, | |
| "loss": 1.2173, | |
| "step": 7790 | |
| }, | |
| { | |
| "epoch": 1.61, | |
| "grad_norm": 1.1065930128097534, | |
| "learning_rate": 8.426854537919603e-05, | |
| "loss": 1.1803, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 1.61, | |
| "grad_norm": 1.03267240524292, | |
| "learning_rate": 8.424782428512226e-05, | |
| "loss": 1.2178, | |
| "step": 7810 | |
| }, | |
| { | |
| "epoch": 1.61, | |
| "grad_norm": 0.9949610233306885, | |
| "learning_rate": 8.42271031910485e-05, | |
| "loss": 1.1999, | |
| "step": 7820 | |
| }, | |
| { | |
| "epoch": 1.62, | |
| "grad_norm": 1.0859277248382568, | |
| "learning_rate": 8.420638209697472e-05, | |
| "loss": 1.2056, | |
| "step": 7830 | |
| }, | |
| { | |
| "epoch": 1.62, | |
| "grad_norm": 1.0535070896148682, | |
| "learning_rate": 8.418566100290096e-05, | |
| "loss": 1.2004, | |
| "step": 7840 | |
| }, | |
| { | |
| "epoch": 1.62, | |
| "grad_norm": 1.1210662126541138, | |
| "learning_rate": 8.416493990882719e-05, | |
| "loss": 1.2223, | |
| "step": 7850 | |
| }, | |
| { | |
| "epoch": 1.62, | |
| "grad_norm": 1.0601907968521118, | |
| "learning_rate": 8.414421881475343e-05, | |
| "loss": 1.1985, | |
| "step": 7860 | |
| }, | |
| { | |
| "epoch": 1.62, | |
| "grad_norm": 1.0329095125198364, | |
| "learning_rate": 8.412349772067966e-05, | |
| "loss": 1.202, | |
| "step": 7870 | |
| }, | |
| { | |
| "epoch": 1.63, | |
| "grad_norm": 1.1331712007522583, | |
| "learning_rate": 8.410277662660588e-05, | |
| "loss": 1.1923, | |
| "step": 7880 | |
| }, | |
| { | |
| "epoch": 1.63, | |
| "grad_norm": 1.1752616167068481, | |
| "learning_rate": 8.408205553253212e-05, | |
| "loss": 1.195, | |
| "step": 7890 | |
| }, | |
| { | |
| "epoch": 1.63, | |
| "grad_norm": 1.1065553426742554, | |
| "learning_rate": 8.406133443845835e-05, | |
| "loss": 1.2173, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 1.63, | |
| "grad_norm": 1.1917232275009155, | |
| "learning_rate": 8.404061334438459e-05, | |
| "loss": 1.1926, | |
| "step": 7910 | |
| }, | |
| { | |
| "epoch": 1.63, | |
| "grad_norm": 1.1937179565429688, | |
| "learning_rate": 8.401989225031082e-05, | |
| "loss": 1.18, | |
| "step": 7920 | |
| }, | |
| { | |
| "epoch": 1.64, | |
| "grad_norm": 1.1231236457824707, | |
| "learning_rate": 8.399917115623706e-05, | |
| "loss": 1.2006, | |
| "step": 7930 | |
| }, | |
| { | |
| "epoch": 1.64, | |
| "grad_norm": 1.2102551460266113, | |
| "learning_rate": 8.397845006216328e-05, | |
| "loss": 1.209, | |
| "step": 7940 | |
| }, | |
| { | |
| "epoch": 1.64, | |
| "grad_norm": 1.0514934062957764, | |
| "learning_rate": 8.395772896808953e-05, | |
| "loss": 1.1959, | |
| "step": 7950 | |
| }, | |
| { | |
| "epoch": 1.64, | |
| "grad_norm": 1.0771079063415527, | |
| "learning_rate": 8.393700787401575e-05, | |
| "loss": 1.2303, | |
| "step": 7960 | |
| }, | |
| { | |
| "epoch": 1.64, | |
| "grad_norm": 1.1473889350891113, | |
| "learning_rate": 8.391628677994198e-05, | |
| "loss": 1.189, | |
| "step": 7970 | |
| }, | |
| { | |
| "epoch": 1.65, | |
| "grad_norm": 1.101943850517273, | |
| "learning_rate": 8.389556568586822e-05, | |
| "loss": 1.1792, | |
| "step": 7980 | |
| }, | |
| { | |
| "epoch": 1.65, | |
| "grad_norm": 1.217653751373291, | |
| "learning_rate": 8.387484459179446e-05, | |
| "loss": 1.2032, | |
| "step": 7990 | |
| }, | |
| { | |
| "epoch": 1.65, | |
| "grad_norm": 1.1558287143707275, | |
| "learning_rate": 8.385412349772068e-05, | |
| "loss": 1.2048, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 1.65, | |
| "grad_norm": 1.1772674322128296, | |
| "learning_rate": 8.383340240364693e-05, | |
| "loss": 1.208, | |
| "step": 8010 | |
| }, | |
| { | |
| "epoch": 1.65, | |
| "grad_norm": 1.1206742525100708, | |
| "learning_rate": 8.381268130957315e-05, | |
| "loss": 1.2158, | |
| "step": 8020 | |
| }, | |
| { | |
| "epoch": 1.66, | |
| "grad_norm": 1.0629864931106567, | |
| "learning_rate": 8.379196021549938e-05, | |
| "loss": 1.1989, | |
| "step": 8030 | |
| }, | |
| { | |
| "epoch": 1.66, | |
| "grad_norm": 1.0601340532302856, | |
| "learning_rate": 8.377123912142562e-05, | |
| "loss": 1.2149, | |
| "step": 8040 | |
| }, | |
| { | |
| "epoch": 1.66, | |
| "grad_norm": 1.2142128944396973, | |
| "learning_rate": 8.375051802735184e-05, | |
| "loss": 1.2045, | |
| "step": 8050 | |
| }, | |
| { | |
| "epoch": 1.66, | |
| "grad_norm": 1.015764832496643, | |
| "learning_rate": 8.372979693327809e-05, | |
| "loss": 1.2177, | |
| "step": 8060 | |
| }, | |
| { | |
| "epoch": 1.67, | |
| "grad_norm": 1.1525570154190063, | |
| "learning_rate": 8.370907583920431e-05, | |
| "loss": 1.2203, | |
| "step": 8070 | |
| }, | |
| { | |
| "epoch": 1.67, | |
| "grad_norm": 1.0959409475326538, | |
| "learning_rate": 8.368835474513054e-05, | |
| "loss": 1.2022, | |
| "step": 8080 | |
| }, | |
| { | |
| "epoch": 1.67, | |
| "grad_norm": 1.073423981666565, | |
| "learning_rate": 8.366763365105678e-05, | |
| "loss": 1.2225, | |
| "step": 8090 | |
| }, | |
| { | |
| "epoch": 1.67, | |
| "grad_norm": 1.1058803796768188, | |
| "learning_rate": 8.364691255698301e-05, | |
| "loss": 1.202, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 1.67, | |
| "grad_norm": 1.067132592201233, | |
| "learning_rate": 8.362619146290924e-05, | |
| "loss": 1.2215, | |
| "step": 8110 | |
| }, | |
| { | |
| "epoch": 1.68, | |
| "grad_norm": 1.1303818225860596, | |
| "learning_rate": 8.360547036883548e-05, | |
| "loss": 1.2049, | |
| "step": 8120 | |
| }, | |
| { | |
| "epoch": 1.68, | |
| "grad_norm": 1.1500287055969238, | |
| "learning_rate": 8.35847492747617e-05, | |
| "loss": 1.2241, | |
| "step": 8130 | |
| }, | |
| { | |
| "epoch": 1.68, | |
| "grad_norm": 0.9954307675361633, | |
| "learning_rate": 8.356402818068794e-05, | |
| "loss": 1.1806, | |
| "step": 8140 | |
| }, | |
| { | |
| "epoch": 1.68, | |
| "grad_norm": 1.0905861854553223, | |
| "learning_rate": 8.354330708661418e-05, | |
| "loss": 1.2108, | |
| "step": 8150 | |
| }, | |
| { | |
| "epoch": 1.68, | |
| "grad_norm": 1.1111913919448853, | |
| "learning_rate": 8.352258599254041e-05, | |
| "loss": 1.2026, | |
| "step": 8160 | |
| }, | |
| { | |
| "epoch": 1.69, | |
| "grad_norm": 0.9808979034423828, | |
| "learning_rate": 8.350186489846665e-05, | |
| "loss": 1.2101, | |
| "step": 8170 | |
| }, | |
| { | |
| "epoch": 1.69, | |
| "grad_norm": 1.127181053161621, | |
| "learning_rate": 8.348114380439288e-05, | |
| "loss": 1.1955, | |
| "step": 8180 | |
| }, | |
| { | |
| "epoch": 1.69, | |
| "grad_norm": 1.0933669805526733, | |
| "learning_rate": 8.34604227103191e-05, | |
| "loss": 1.1904, | |
| "step": 8190 | |
| }, | |
| { | |
| "epoch": 1.69, | |
| "grad_norm": 1.2010207176208496, | |
| "learning_rate": 8.343970161624535e-05, | |
| "loss": 1.2018, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 1.69, | |
| "grad_norm": 1.019642949104309, | |
| "learning_rate": 8.341898052217157e-05, | |
| "loss": 1.221, | |
| "step": 8210 | |
| }, | |
| { | |
| "epoch": 1.7, | |
| "grad_norm": 1.115064024925232, | |
| "learning_rate": 8.339825942809781e-05, | |
| "loss": 1.193, | |
| "step": 8220 | |
| }, | |
| { | |
| "epoch": 1.7, | |
| "grad_norm": 1.008520245552063, | |
| "learning_rate": 8.337753833402404e-05, | |
| "loss": 1.222, | |
| "step": 8230 | |
| }, | |
| { | |
| "epoch": 1.7, | |
| "grad_norm": 1.0627065896987915, | |
| "learning_rate": 8.335681723995027e-05, | |
| "loss": 1.1968, | |
| "step": 8240 | |
| }, | |
| { | |
| "epoch": 1.7, | |
| "grad_norm": 1.2253535985946655, | |
| "learning_rate": 8.33360961458765e-05, | |
| "loss": 1.2135, | |
| "step": 8250 | |
| }, | |
| { | |
| "epoch": 1.7, | |
| "grad_norm": 1.0848592519760132, | |
| "learning_rate": 8.331537505180274e-05, | |
| "loss": 1.2087, | |
| "step": 8260 | |
| }, | |
| { | |
| "epoch": 1.71, | |
| "grad_norm": 1.1441541910171509, | |
| "learning_rate": 8.329465395772897e-05, | |
| "loss": 1.1789, | |
| "step": 8270 | |
| }, | |
| { | |
| "epoch": 1.71, | |
| "grad_norm": 1.1108758449554443, | |
| "learning_rate": 8.32739328636552e-05, | |
| "loss": 1.1778, | |
| "step": 8280 | |
| }, | |
| { | |
| "epoch": 1.71, | |
| "grad_norm": 1.1133983135223389, | |
| "learning_rate": 8.325321176958144e-05, | |
| "loss": 1.2062, | |
| "step": 8290 | |
| }, | |
| { | |
| "epoch": 1.71, | |
| "grad_norm": 1.1412779092788696, | |
| "learning_rate": 8.323249067550766e-05, | |
| "loss": 1.2129, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 1.71, | |
| "grad_norm": 1.0338503122329712, | |
| "learning_rate": 8.321176958143391e-05, | |
| "loss": 1.1727, | |
| "step": 8310 | |
| }, | |
| { | |
| "epoch": 1.72, | |
| "grad_norm": 1.1191462278366089, | |
| "learning_rate": 8.319104848736013e-05, | |
| "loss": 1.1997, | |
| "step": 8320 | |
| }, | |
| { | |
| "epoch": 1.72, | |
| "grad_norm": 1.0909239053726196, | |
| "learning_rate": 8.317032739328637e-05, | |
| "loss": 1.2144, | |
| "step": 8330 | |
| }, | |
| { | |
| "epoch": 1.72, | |
| "grad_norm": 1.1151865720748901, | |
| "learning_rate": 8.31496062992126e-05, | |
| "loss": 1.2033, | |
| "step": 8340 | |
| }, | |
| { | |
| "epoch": 1.72, | |
| "grad_norm": 1.143604040145874, | |
| "learning_rate": 8.312888520513884e-05, | |
| "loss": 1.2088, | |
| "step": 8350 | |
| }, | |
| { | |
| "epoch": 1.73, | |
| "grad_norm": 1.2317973375320435, | |
| "learning_rate": 8.310816411106507e-05, | |
| "loss": 1.2179, | |
| "step": 8360 | |
| }, | |
| { | |
| "epoch": 1.73, | |
| "grad_norm": 1.1043517589569092, | |
| "learning_rate": 8.308744301699131e-05, | |
| "loss": 1.1913, | |
| "step": 8370 | |
| }, | |
| { | |
| "epoch": 1.73, | |
| "grad_norm": 1.149396300315857, | |
| "learning_rate": 8.306672192291753e-05, | |
| "loss": 1.2066, | |
| "step": 8380 | |
| }, | |
| { | |
| "epoch": 1.73, | |
| "grad_norm": 1.0468456745147705, | |
| "learning_rate": 8.304600082884377e-05, | |
| "loss": 1.1823, | |
| "step": 8390 | |
| }, | |
| { | |
| "epoch": 1.73, | |
| "grad_norm": 1.0730814933776855, | |
| "learning_rate": 8.302527973477e-05, | |
| "loss": 1.2053, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 1.74, | |
| "grad_norm": 1.2069722414016724, | |
| "learning_rate": 8.300455864069622e-05, | |
| "loss": 1.1999, | |
| "step": 8410 | |
| }, | |
| { | |
| "epoch": 1.74, | |
| "grad_norm": 1.0964728593826294, | |
| "learning_rate": 8.298383754662247e-05, | |
| "loss": 1.1927, | |
| "step": 8420 | |
| }, | |
| { | |
| "epoch": 1.74, | |
| "grad_norm": 1.0547986030578613, | |
| "learning_rate": 8.296311645254869e-05, | |
| "loss": 1.2198, | |
| "step": 8430 | |
| }, | |
| { | |
| "epoch": 1.74, | |
| "grad_norm": 0.9480800628662109, | |
| "learning_rate": 8.294239535847493e-05, | |
| "loss": 1.1673, | |
| "step": 8440 | |
| }, | |
| { | |
| "epoch": 1.74, | |
| "grad_norm": 1.042935848236084, | |
| "learning_rate": 8.292167426440116e-05, | |
| "loss": 1.211, | |
| "step": 8450 | |
| }, | |
| { | |
| "epoch": 1.75, | |
| "grad_norm": 1.1812864542007446, | |
| "learning_rate": 8.29009531703274e-05, | |
| "loss": 1.1775, | |
| "step": 8460 | |
| }, | |
| { | |
| "epoch": 1.75, | |
| "grad_norm": 1.0850603580474854, | |
| "learning_rate": 8.288023207625363e-05, | |
| "loss": 1.2023, | |
| "step": 8470 | |
| }, | |
| { | |
| "epoch": 1.75, | |
| "grad_norm": 1.0494657754898071, | |
| "learning_rate": 8.285951098217987e-05, | |
| "loss": 1.1897, | |
| "step": 8480 | |
| }, | |
| { | |
| "epoch": 1.75, | |
| "grad_norm": 1.2138115167617798, | |
| "learning_rate": 8.283878988810609e-05, | |
| "loss": 1.1691, | |
| "step": 8490 | |
| }, | |
| { | |
| "epoch": 1.75, | |
| "grad_norm": 1.0943918228149414, | |
| "learning_rate": 8.281806879403234e-05, | |
| "loss": 1.1972, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "grad_norm": 1.0463635921478271, | |
| "learning_rate": 8.279734769995856e-05, | |
| "loss": 1.2312, | |
| "step": 8510 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "grad_norm": 1.1013418436050415, | |
| "learning_rate": 8.27766266058848e-05, | |
| "loss": 1.2311, | |
| "step": 8520 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "grad_norm": 0.9996619820594788, | |
| "learning_rate": 8.275590551181103e-05, | |
| "loss": 1.2129, | |
| "step": 8530 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "grad_norm": 1.0416114330291748, | |
| "learning_rate": 8.273518441773727e-05, | |
| "loss": 1.1822, | |
| "step": 8540 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "grad_norm": 1.0862529277801514, | |
| "learning_rate": 8.271446332366349e-05, | |
| "loss": 1.2145, | |
| "step": 8550 | |
| }, | |
| { | |
| "epoch": 1.77, | |
| "grad_norm": 1.1965521574020386, | |
| "learning_rate": 8.269374222958972e-05, | |
| "loss": 1.1969, | |
| "step": 8560 | |
| }, | |
| { | |
| "epoch": 1.77, | |
| "grad_norm": 1.1601965427398682, | |
| "learning_rate": 8.267302113551596e-05, | |
| "loss": 1.2112, | |
| "step": 8570 | |
| }, | |
| { | |
| "epoch": 1.77, | |
| "grad_norm": 1.0845617055892944, | |
| "learning_rate": 8.265230004144219e-05, | |
| "loss": 1.1885, | |
| "step": 8580 | |
| }, | |
| { | |
| "epoch": 1.77, | |
| "grad_norm": 1.0907461643218994, | |
| "learning_rate": 8.263157894736843e-05, | |
| "loss": 1.2091, | |
| "step": 8590 | |
| }, | |
| { | |
| "epoch": 1.77, | |
| "grad_norm": 1.027777075767517, | |
| "learning_rate": 8.261085785329465e-05, | |
| "loss": 1.1919, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 1.78, | |
| "grad_norm": 1.2468430995941162, | |
| "learning_rate": 8.25901367592209e-05, | |
| "loss": 1.1948, | |
| "step": 8610 | |
| }, | |
| { | |
| "epoch": 1.78, | |
| "grad_norm": 1.1889891624450684, | |
| "learning_rate": 8.256941566514712e-05, | |
| "loss": 1.2143, | |
| "step": 8620 | |
| }, | |
| { | |
| "epoch": 1.78, | |
| "grad_norm": 1.2794381380081177, | |
| "learning_rate": 8.254869457107336e-05, | |
| "loss": 1.2132, | |
| "step": 8630 | |
| }, | |
| { | |
| "epoch": 1.78, | |
| "grad_norm": 1.0493546724319458, | |
| "learning_rate": 8.252797347699959e-05, | |
| "loss": 1.1766, | |
| "step": 8640 | |
| }, | |
| { | |
| "epoch": 1.78, | |
| "grad_norm": 1.001658320426941, | |
| "learning_rate": 8.250725238292583e-05, | |
| "loss": 1.1715, | |
| "step": 8650 | |
| }, | |
| { | |
| "epoch": 1.79, | |
| "grad_norm": 1.06058669090271, | |
| "learning_rate": 8.248653128885205e-05, | |
| "loss": 1.1793, | |
| "step": 8660 | |
| }, | |
| { | |
| "epoch": 1.79, | |
| "grad_norm": 1.09765625, | |
| "learning_rate": 8.24658101947783e-05, | |
| "loss": 1.1954, | |
| "step": 8670 | |
| }, | |
| { | |
| "epoch": 1.79, | |
| "grad_norm": 1.0230962038040161, | |
| "learning_rate": 8.244508910070452e-05, | |
| "loss": 1.2158, | |
| "step": 8680 | |
| }, | |
| { | |
| "epoch": 1.79, | |
| "grad_norm": 1.028173804283142, | |
| "learning_rate": 8.242436800663075e-05, | |
| "loss": 1.1874, | |
| "step": 8690 | |
| }, | |
| { | |
| "epoch": 1.8, | |
| "grad_norm": 1.0620988607406616, | |
| "learning_rate": 8.240364691255699e-05, | |
| "loss": 1.2101, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 1.8, | |
| "grad_norm": 1.0968023538589478, | |
| "learning_rate": 8.238292581848322e-05, | |
| "loss": 1.2131, | |
| "step": 8710 | |
| }, | |
| { | |
| "epoch": 1.8, | |
| "grad_norm": 1.1402392387390137, | |
| "learning_rate": 8.236220472440946e-05, | |
| "loss": 1.2173, | |
| "step": 8720 | |
| }, | |
| { | |
| "epoch": 1.8, | |
| "grad_norm": 1.1263937950134277, | |
| "learning_rate": 8.234148363033568e-05, | |
| "loss": 1.1796, | |
| "step": 8730 | |
| }, | |
| { | |
| "epoch": 1.8, | |
| "grad_norm": 1.1383157968521118, | |
| "learning_rate": 8.232076253626191e-05, | |
| "loss": 1.1724, | |
| "step": 8740 | |
| }, | |
| { | |
| "epoch": 1.81, | |
| "grad_norm": 1.255072832107544, | |
| "learning_rate": 8.230004144218815e-05, | |
| "loss": 1.2019, | |
| "step": 8750 | |
| }, | |
| { | |
| "epoch": 1.81, | |
| "grad_norm": 1.4536107778549194, | |
| "learning_rate": 8.227932034811438e-05, | |
| "loss": 1.189, | |
| "step": 8760 | |
| }, | |
| { | |
| "epoch": 1.81, | |
| "grad_norm": 1.0804840326309204, | |
| "learning_rate": 8.225859925404062e-05, | |
| "loss": 1.2009, | |
| "step": 8770 | |
| }, | |
| { | |
| "epoch": 1.81, | |
| "grad_norm": 1.1545298099517822, | |
| "learning_rate": 8.223787815996686e-05, | |
| "loss": 1.2047, | |
| "step": 8780 | |
| }, | |
| { | |
| "epoch": 1.81, | |
| "grad_norm": 1.2308061122894287, | |
| "learning_rate": 8.221715706589308e-05, | |
| "loss": 1.1958, | |
| "step": 8790 | |
| }, | |
| { | |
| "epoch": 1.82, | |
| "grad_norm": 1.2053345441818237, | |
| "learning_rate": 8.219643597181933e-05, | |
| "loss": 1.183, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 1.82, | |
| "grad_norm": 1.2016477584838867, | |
| "learning_rate": 8.217571487774555e-05, | |
| "loss": 1.1831, | |
| "step": 8810 | |
| }, | |
| { | |
| "epoch": 1.82, | |
| "grad_norm": 1.2991312742233276, | |
| "learning_rate": 8.215499378367178e-05, | |
| "loss": 1.1893, | |
| "step": 8820 | |
| }, | |
| { | |
| "epoch": 1.82, | |
| "grad_norm": 1.0688191652297974, | |
| "learning_rate": 8.213427268959802e-05, | |
| "loss": 1.1962, | |
| "step": 8830 | |
| }, | |
| { | |
| "epoch": 1.82, | |
| "grad_norm": 1.0042345523834229, | |
| "learning_rate": 8.211355159552425e-05, | |
| "loss": 1.1935, | |
| "step": 8840 | |
| }, | |
| { | |
| "epoch": 1.83, | |
| "grad_norm": 1.1283092498779297, | |
| "learning_rate": 8.209283050145047e-05, | |
| "loss": 1.1983, | |
| "step": 8850 | |
| }, | |
| { | |
| "epoch": 1.83, | |
| "grad_norm": 1.1149276494979858, | |
| "learning_rate": 8.207210940737672e-05, | |
| "loss": 1.1967, | |
| "step": 8860 | |
| }, | |
| { | |
| "epoch": 1.83, | |
| "grad_norm": 1.0818403959274292, | |
| "learning_rate": 8.205138831330294e-05, | |
| "loss": 1.187, | |
| "step": 8870 | |
| }, | |
| { | |
| "epoch": 1.83, | |
| "grad_norm": 1.1263095140457153, | |
| "learning_rate": 8.203066721922918e-05, | |
| "loss": 1.1866, | |
| "step": 8880 | |
| }, | |
| { | |
| "epoch": 1.83, | |
| "grad_norm": 1.0879017114639282, | |
| "learning_rate": 8.200994612515541e-05, | |
| "loss": 1.2018, | |
| "step": 8890 | |
| }, | |
| { | |
| "epoch": 1.84, | |
| "grad_norm": 1.1448615789413452, | |
| "learning_rate": 8.198922503108164e-05, | |
| "loss": 1.1879, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 1.84, | |
| "grad_norm": 0.9900506138801575, | |
| "learning_rate": 8.196850393700788e-05, | |
| "loss": 1.1985, | |
| "step": 8910 | |
| }, | |
| { | |
| "epoch": 1.84, | |
| "grad_norm": 1.0438898801803589, | |
| "learning_rate": 8.19477828429341e-05, | |
| "loss": 1.1881, | |
| "step": 8920 | |
| }, | |
| { | |
| "epoch": 1.84, | |
| "grad_norm": 1.1896075010299683, | |
| "learning_rate": 8.192706174886034e-05, | |
| "loss": 1.2022, | |
| "step": 8930 | |
| }, | |
| { | |
| "epoch": 1.84, | |
| "grad_norm": 1.152300238609314, | |
| "learning_rate": 8.190634065478658e-05, | |
| "loss": 1.1828, | |
| "step": 8940 | |
| }, | |
| { | |
| "epoch": 1.85, | |
| "grad_norm": 1.0616978406906128, | |
| "learning_rate": 8.188561956071281e-05, | |
| "loss": 1.1788, | |
| "step": 8950 | |
| }, | |
| { | |
| "epoch": 1.85, | |
| "grad_norm": 1.0310215950012207, | |
| "learning_rate": 8.186489846663903e-05, | |
| "loss": 1.1898, | |
| "step": 8960 | |
| }, | |
| { | |
| "epoch": 1.85, | |
| "grad_norm": 1.1227622032165527, | |
| "learning_rate": 8.184417737256528e-05, | |
| "loss": 1.1936, | |
| "step": 8970 | |
| }, | |
| { | |
| "epoch": 1.85, | |
| "grad_norm": 1.0663117170333862, | |
| "learning_rate": 8.18234562784915e-05, | |
| "loss": 1.1857, | |
| "step": 8980 | |
| }, | |
| { | |
| "epoch": 1.86, | |
| "grad_norm": 1.202330470085144, | |
| "learning_rate": 8.180273518441774e-05, | |
| "loss": 1.1811, | |
| "step": 8990 | |
| }, | |
| { | |
| "epoch": 1.86, | |
| "grad_norm": 1.036596417427063, | |
| "learning_rate": 8.178201409034397e-05, | |
| "loss": 1.2013, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 1.86, | |
| "grad_norm": 1.0503324270248413, | |
| "learning_rate": 8.176129299627021e-05, | |
| "loss": 1.1755, | |
| "step": 9010 | |
| }, | |
| { | |
| "epoch": 1.86, | |
| "grad_norm": 1.0510581731796265, | |
| "learning_rate": 8.174057190219644e-05, | |
| "loss": 1.1936, | |
| "step": 9020 | |
| }, | |
| { | |
| "epoch": 1.86, | |
| "grad_norm": 1.0580915212631226, | |
| "learning_rate": 8.171985080812268e-05, | |
| "loss": 1.2006, | |
| "step": 9030 | |
| }, | |
| { | |
| "epoch": 1.87, | |
| "grad_norm": 1.0723899602890015, | |
| "learning_rate": 8.16991297140489e-05, | |
| "loss": 1.1841, | |
| "step": 9040 | |
| }, | |
| { | |
| "epoch": 1.87, | |
| "grad_norm": 1.0289912223815918, | |
| "learning_rate": 8.167840861997515e-05, | |
| "loss": 1.1762, | |
| "step": 9050 | |
| }, | |
| { | |
| "epoch": 1.87, | |
| "grad_norm": 1.2664040327072144, | |
| "learning_rate": 8.165768752590137e-05, | |
| "loss": 1.1695, | |
| "step": 9060 | |
| }, | |
| { | |
| "epoch": 1.87, | |
| "grad_norm": 1.0689359903335571, | |
| "learning_rate": 8.163696643182759e-05, | |
| "loss": 1.1888, | |
| "step": 9070 | |
| }, | |
| { | |
| "epoch": 1.87, | |
| "grad_norm": 1.131007194519043, | |
| "learning_rate": 8.161624533775384e-05, | |
| "loss": 1.1856, | |
| "step": 9080 | |
| }, | |
| { | |
| "epoch": 1.88, | |
| "grad_norm": 1.06068754196167, | |
| "learning_rate": 8.159552424368006e-05, | |
| "loss": 1.2055, | |
| "step": 9090 | |
| }, | |
| { | |
| "epoch": 1.88, | |
| "grad_norm": 1.0843102931976318, | |
| "learning_rate": 8.15748031496063e-05, | |
| "loss": 1.1898, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 1.88, | |
| "grad_norm": 1.141878366470337, | |
| "learning_rate": 8.155408205553253e-05, | |
| "loss": 1.1885, | |
| "step": 9110 | |
| }, | |
| { | |
| "epoch": 1.88, | |
| "grad_norm": 1.1082584857940674, | |
| "learning_rate": 8.153336096145877e-05, | |
| "loss": 1.1843, | |
| "step": 9120 | |
| }, | |
| { | |
| "epoch": 1.88, | |
| "grad_norm": 1.1285297870635986, | |
| "learning_rate": 8.1512639867385e-05, | |
| "loss": 1.2126, | |
| "step": 9130 | |
| }, | |
| { | |
| "epoch": 1.89, | |
| "grad_norm": 1.2701120376586914, | |
| "learning_rate": 8.149191877331124e-05, | |
| "loss": 1.1816, | |
| "step": 9140 | |
| }, | |
| { | |
| "epoch": 1.89, | |
| "grad_norm": 1.1763367652893066, | |
| "learning_rate": 8.147119767923746e-05, | |
| "loss": 1.2031, | |
| "step": 9150 | |
| }, | |
| { | |
| "epoch": 1.89, | |
| "grad_norm": 1.0942009687423706, | |
| "learning_rate": 8.145047658516371e-05, | |
| "loss": 1.1922, | |
| "step": 9160 | |
| }, | |
| { | |
| "epoch": 1.89, | |
| "grad_norm": 1.1373592615127563, | |
| "learning_rate": 8.142975549108993e-05, | |
| "loss": 1.1891, | |
| "step": 9170 | |
| }, | |
| { | |
| "epoch": 1.89, | |
| "grad_norm": 1.0547840595245361, | |
| "learning_rate": 8.140903439701617e-05, | |
| "loss": 1.2119, | |
| "step": 9180 | |
| }, | |
| { | |
| "epoch": 1.9, | |
| "grad_norm": 0.973156213760376, | |
| "learning_rate": 8.13883133029424e-05, | |
| "loss": 1.1929, | |
| "step": 9190 | |
| }, | |
| { | |
| "epoch": 1.9, | |
| "grad_norm": 1.07832670211792, | |
| "learning_rate": 8.136759220886864e-05, | |
| "loss": 1.1693, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 1.9, | |
| "grad_norm": 1.173492670059204, | |
| "learning_rate": 8.134687111479487e-05, | |
| "loss": 1.1998, | |
| "step": 9210 | |
| }, | |
| { | |
| "epoch": 1.9, | |
| "grad_norm": 1.161414623260498, | |
| "learning_rate": 8.132615002072109e-05, | |
| "loss": 1.1769, | |
| "step": 9220 | |
| }, | |
| { | |
| "epoch": 1.9, | |
| "grad_norm": 1.1352944374084473, | |
| "learning_rate": 8.130542892664733e-05, | |
| "loss": 1.1837, | |
| "step": 9230 | |
| }, | |
| { | |
| "epoch": 1.91, | |
| "grad_norm": 1.102401852607727, | |
| "learning_rate": 8.128470783257356e-05, | |
| "loss": 1.175, | |
| "step": 9240 | |
| }, | |
| { | |
| "epoch": 1.91, | |
| "grad_norm": 1.2963926792144775, | |
| "learning_rate": 8.12639867384998e-05, | |
| "loss": 1.1782, | |
| "step": 9250 | |
| }, | |
| { | |
| "epoch": 1.91, | |
| "grad_norm": 1.0536015033721924, | |
| "learning_rate": 8.124326564442602e-05, | |
| "loss": 1.1799, | |
| "step": 9260 | |
| }, | |
| { | |
| "epoch": 1.91, | |
| "grad_norm": 1.1193984746932983, | |
| "learning_rate": 8.122254455035227e-05, | |
| "loss": 1.1729, | |
| "step": 9270 | |
| }, | |
| { | |
| "epoch": 1.91, | |
| "grad_norm": 1.1112126111984253, | |
| "learning_rate": 8.120182345627849e-05, | |
| "loss": 1.168, | |
| "step": 9280 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "grad_norm": 1.0647683143615723, | |
| "learning_rate": 8.118110236220473e-05, | |
| "loss": 1.2057, | |
| "step": 9290 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "grad_norm": 1.1709637641906738, | |
| "learning_rate": 8.116038126813096e-05, | |
| "loss": 1.1757, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "grad_norm": 1.0774716138839722, | |
| "learning_rate": 8.11396601740572e-05, | |
| "loss": 1.1784, | |
| "step": 9310 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "grad_norm": 1.088477373123169, | |
| "learning_rate": 8.111893907998343e-05, | |
| "loss": 1.1726, | |
| "step": 9320 | |
| }, | |
| { | |
| "epoch": 1.93, | |
| "grad_norm": 1.0596317052841187, | |
| "learning_rate": 8.109821798590967e-05, | |
| "loss": 1.2029, | |
| "step": 9330 | |
| }, | |
| { | |
| "epoch": 1.93, | |
| "grad_norm": 1.1406421661376953, | |
| "learning_rate": 8.107749689183589e-05, | |
| "loss": 1.2154, | |
| "step": 9340 | |
| }, | |
| { | |
| "epoch": 1.93, | |
| "grad_norm": 1.1345916986465454, | |
| "learning_rate": 8.105677579776214e-05, | |
| "loss": 1.1684, | |
| "step": 9350 | |
| }, | |
| { | |
| "epoch": 1.93, | |
| "grad_norm": 1.2776840925216675, | |
| "learning_rate": 8.103605470368836e-05, | |
| "loss": 1.1915, | |
| "step": 9360 | |
| }, | |
| { | |
| "epoch": 1.93, | |
| "grad_norm": 1.1104062795639038, | |
| "learning_rate": 8.101533360961459e-05, | |
| "loss": 1.1935, | |
| "step": 9370 | |
| }, | |
| { | |
| "epoch": 1.94, | |
| "grad_norm": 1.0262198448181152, | |
| "learning_rate": 8.099461251554083e-05, | |
| "loss": 1.2047, | |
| "step": 9380 | |
| }, | |
| { | |
| "epoch": 1.94, | |
| "grad_norm": 1.1350001096725464, | |
| "learning_rate": 8.097389142146705e-05, | |
| "loss": 1.1798, | |
| "step": 9390 | |
| }, | |
| { | |
| "epoch": 1.94, | |
| "grad_norm": 1.0631601810455322, | |
| "learning_rate": 8.095317032739328e-05, | |
| "loss": 1.1744, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 1.94, | |
| "grad_norm": 1.1678036451339722, | |
| "learning_rate": 8.093244923331952e-05, | |
| "loss": 1.208, | |
| "step": 9410 | |
| }, | |
| { | |
| "epoch": 1.94, | |
| "grad_norm": 1.0665799379348755, | |
| "learning_rate": 8.091172813924576e-05, | |
| "loss": 1.1792, | |
| "step": 9420 | |
| }, | |
| { | |
| "epoch": 1.95, | |
| "grad_norm": 1.1364054679870605, | |
| "learning_rate": 8.089100704517199e-05, | |
| "loss": 1.2036, | |
| "step": 9430 | |
| }, | |
| { | |
| "epoch": 1.95, | |
| "grad_norm": 1.0586035251617432, | |
| "learning_rate": 8.087028595109823e-05, | |
| "loss": 1.201, | |
| "step": 9440 | |
| }, | |
| { | |
| "epoch": 1.95, | |
| "grad_norm": 0.9820613861083984, | |
| "learning_rate": 8.084956485702445e-05, | |
| "loss": 1.1912, | |
| "step": 9450 | |
| }, | |
| { | |
| "epoch": 1.95, | |
| "grad_norm": 1.0832290649414062, | |
| "learning_rate": 8.08288437629507e-05, | |
| "loss": 1.2044, | |
| "step": 9460 | |
| }, | |
| { | |
| "epoch": 1.95, | |
| "grad_norm": 1.2247551679611206, | |
| "learning_rate": 8.080812266887692e-05, | |
| "loss": 1.1613, | |
| "step": 9470 | |
| }, | |
| { | |
| "epoch": 1.96, | |
| "grad_norm": 1.0764803886413574, | |
| "learning_rate": 8.078740157480315e-05, | |
| "loss": 1.1885, | |
| "step": 9480 | |
| }, | |
| { | |
| "epoch": 1.96, | |
| "grad_norm": 1.1871037483215332, | |
| "learning_rate": 8.076668048072939e-05, | |
| "loss": 1.2089, | |
| "step": 9490 | |
| }, | |
| { | |
| "epoch": 1.96, | |
| "grad_norm": 1.0709806680679321, | |
| "learning_rate": 8.074595938665562e-05, | |
| "loss": 1.1743, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 1.96, | |
| "grad_norm": 1.2077934741973877, | |
| "learning_rate": 8.072523829258184e-05, | |
| "loss": 1.192, | |
| "step": 9510 | |
| }, | |
| { | |
| "epoch": 1.96, | |
| "grad_norm": 1.111323356628418, | |
| "learning_rate": 8.070451719850809e-05, | |
| "loss": 1.178, | |
| "step": 9520 | |
| }, | |
| { | |
| "epoch": 1.97, | |
| "grad_norm": 1.1351996660232544, | |
| "learning_rate": 8.06858682138417e-05, | |
| "loss": 1.1921, | |
| "step": 9530 | |
| }, | |
| { | |
| "epoch": 1.97, | |
| "grad_norm": 1.1319098472595215, | |
| "learning_rate": 8.066514711976792e-05, | |
| "loss": 1.2069, | |
| "step": 9540 | |
| }, | |
| { | |
| "epoch": 1.97, | |
| "grad_norm": 1.080480694770813, | |
| "learning_rate": 8.064442602569417e-05, | |
| "loss": 1.1949, | |
| "step": 9550 | |
| }, | |
| { | |
| "epoch": 1.97, | |
| "grad_norm": 1.126483678817749, | |
| "learning_rate": 8.062370493162039e-05, | |
| "loss": 1.1523, | |
| "step": 9560 | |
| }, | |
| { | |
| "epoch": 1.97, | |
| "grad_norm": 1.0840978622436523, | |
| "learning_rate": 8.060298383754663e-05, | |
| "loss": 1.2074, | |
| "step": 9570 | |
| }, | |
| { | |
| "epoch": 1.98, | |
| "grad_norm": 1.1075447797775269, | |
| "learning_rate": 8.058226274347286e-05, | |
| "loss": 1.1937, | |
| "step": 9580 | |
| }, | |
| { | |
| "epoch": 1.98, | |
| "grad_norm": 1.0781664848327637, | |
| "learning_rate": 8.05615416493991e-05, | |
| "loss": 1.1837, | |
| "step": 9590 | |
| }, | |
| { | |
| "epoch": 1.98, | |
| "grad_norm": 1.0375909805297852, | |
| "learning_rate": 8.054082055532532e-05, | |
| "loss": 1.1928, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 1.98, | |
| "grad_norm": 1.101149320602417, | |
| "learning_rate": 8.052009946125155e-05, | |
| "loss": 1.2006, | |
| "step": 9610 | |
| }, | |
| { | |
| "epoch": 1.99, | |
| "grad_norm": 0.9537743926048279, | |
| "learning_rate": 8.049937836717779e-05, | |
| "loss": 1.1939, | |
| "step": 9620 | |
| }, | |
| { | |
| "epoch": 1.99, | |
| "grad_norm": 1.1106035709381104, | |
| "learning_rate": 8.047865727310402e-05, | |
| "loss": 1.1874, | |
| "step": 9630 | |
| }, | |
| { | |
| "epoch": 1.99, | |
| "grad_norm": 1.1406303644180298, | |
| "learning_rate": 8.045793617903026e-05, | |
| "loss": 1.1885, | |
| "step": 9640 | |
| }, | |
| { | |
| "epoch": 1.99, | |
| "grad_norm": 1.0510179996490479, | |
| "learning_rate": 8.043721508495648e-05, | |
| "loss": 1.1831, | |
| "step": 9650 | |
| }, | |
| { | |
| "epoch": 1.99, | |
| "grad_norm": 1.1432597637176514, | |
| "learning_rate": 8.041649399088273e-05, | |
| "loss": 1.1746, | |
| "step": 9660 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 1.0710557699203491, | |
| "learning_rate": 8.039577289680895e-05, | |
| "loss": 1.1668, | |
| "step": 9670 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 1.067130446434021, | |
| "learning_rate": 8.037505180273519e-05, | |
| "loss": 1.1895, | |
| "step": 9680 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 1.1639565229415894, | |
| "learning_rate": 8.035433070866142e-05, | |
| "loss": 1.1786, | |
| "step": 9690 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_loss": 1.2795084714889526, | |
| "eval_runtime": 1604.6697, | |
| "eval_samples_per_second": 262.864, | |
| "eval_steps_per_second": 4.107, | |
| "step": 9692 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 48460, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 10, | |
| "save_steps": 500, | |
| "total_flos": 4.1335523730815713e+18, | |
| "train_batch_size": 6, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |