| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.9938573003622615, | |
| "eval_steps": 500, | |
| "global_step": 297, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.010080327610647345, | |
| "grad_norm": 20.48233413696289, | |
| "learning_rate": 6.666666666666667e-07, | |
| "loss": 1.318, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.02016065522129469, | |
| "grad_norm": 22.18445587158203, | |
| "learning_rate": 1.3333333333333334e-06, | |
| "loss": 1.2034, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.03024098283194204, | |
| "grad_norm": 20.34792137145996, | |
| "learning_rate": 2.0000000000000003e-06, | |
| "loss": 1.0455, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.04032131044258938, | |
| "grad_norm": 8.893705368041992, | |
| "learning_rate": 2.666666666666667e-06, | |
| "loss": 2.0515, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.05040163805323673, | |
| "grad_norm": 6.274827003479004, | |
| "learning_rate": 3.3333333333333333e-06, | |
| "loss": 1.0793, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.06048196566388408, | |
| "grad_norm": 6.051919937133789, | |
| "learning_rate": 4.000000000000001e-06, | |
| "loss": 0.9434, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.07056229327453142, | |
| "grad_norm": 7.502919673919678, | |
| "learning_rate": 4.666666666666667e-06, | |
| "loss": 0.9741, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.08064262088517876, | |
| "grad_norm": 6.437217712402344, | |
| "learning_rate": 5.333333333333334e-06, | |
| "loss": 1.1227, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.09072294849582611, | |
| "grad_norm": 5.169360160827637, | |
| "learning_rate": 6e-06, | |
| "loss": 0.9156, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.10080327610647347, | |
| "grad_norm": 5.1474432945251465, | |
| "learning_rate": 6.666666666666667e-06, | |
| "loss": 1.0925, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.1108836037171208, | |
| "grad_norm": 6.759896755218506, | |
| "learning_rate": 7.333333333333333e-06, | |
| "loss": 0.8985, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.12096393132776816, | |
| "grad_norm": 5.231770992279053, | |
| "learning_rate": 8.000000000000001e-06, | |
| "loss": 1.6954, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.1310442589384155, | |
| "grad_norm": 5.028665542602539, | |
| "learning_rate": 8.666666666666668e-06, | |
| "loss": 0.9763, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.14112458654906285, | |
| "grad_norm": 7.008236408233643, | |
| "learning_rate": 9.333333333333334e-06, | |
| "loss": 0.9969, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.15120491415971019, | |
| "grad_norm": 4.675139904022217, | |
| "learning_rate": 1e-05, | |
| "loss": 0.8619, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.16128524177035752, | |
| "grad_norm": 5.249491214752197, | |
| "learning_rate": 1.0666666666666667e-05, | |
| "loss": 0.9176, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.1713655693810049, | |
| "grad_norm": 4.5402092933654785, | |
| "learning_rate": 1.1333333333333334e-05, | |
| "loss": 0.8809, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.18144589699165223, | |
| "grad_norm": 4.799923896789551, | |
| "learning_rate": 1.2e-05, | |
| "loss": 1.005, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.19152622460229957, | |
| "grad_norm": 3.9302682876586914, | |
| "learning_rate": 1.2666666666666667e-05, | |
| "loss": 0.8784, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.20160655221294693, | |
| "grad_norm": 4.542870044708252, | |
| "learning_rate": 1.3333333333333333e-05, | |
| "loss": 1.0378, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.21168687982359427, | |
| "grad_norm": 10.4898042678833, | |
| "learning_rate": 1.4e-05, | |
| "loss": 0.8458, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.2217672074342416, | |
| "grad_norm": 4.678144454956055, | |
| "learning_rate": 1.4666666666666666e-05, | |
| "loss": 0.9237, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.23184753504488895, | |
| "grad_norm": 4.897671222686768, | |
| "learning_rate": 1.5333333333333334e-05, | |
| "loss": 0.9661, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.2419278626555363, | |
| "grad_norm": 5.067351818084717, | |
| "learning_rate": 1.6000000000000003e-05, | |
| "loss": 0.9975, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.25200819026618365, | |
| "grad_norm": 3.7107274532318115, | |
| "learning_rate": 1.6666666666666667e-05, | |
| "loss": 0.9316, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.262088517876831, | |
| "grad_norm": 4.553698539733887, | |
| "learning_rate": 1.7333333333333336e-05, | |
| "loss": 0.871, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.27216884548747833, | |
| "grad_norm": 5.10447359085083, | |
| "learning_rate": 1.8e-05, | |
| "loss": 0.8687, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.2822491730981257, | |
| "grad_norm": 3.7116832733154297, | |
| "learning_rate": 1.866666666666667e-05, | |
| "loss": 0.8586, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.29232950070877306, | |
| "grad_norm": 5.299854755401611, | |
| "learning_rate": 1.9333333333333333e-05, | |
| "loss": 0.8595, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.30240982831942037, | |
| "grad_norm": 3.825899600982666, | |
| "learning_rate": 2e-05, | |
| "loss": 0.9615, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.31249015593006774, | |
| "grad_norm": 4.806526184082031, | |
| "learning_rate": 1.999930778307066e-05, | |
| "loss": 0.8595, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.32257048354071505, | |
| "grad_norm": 3.5444633960723877, | |
| "learning_rate": 1.9997231228115487e-05, | |
| "loss": 0.9748, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.3326508111513624, | |
| "grad_norm": 4.36836051940918, | |
| "learning_rate": 1.9993770622619784e-05, | |
| "loss": 0.8577, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.3427311387620098, | |
| "grad_norm": 3.0740671157836914, | |
| "learning_rate": 1.9988926445681495e-05, | |
| "loss": 0.8407, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.3528114663726571, | |
| "grad_norm": 3.8326187133789062, | |
| "learning_rate": 1.998269936794487e-05, | |
| "loss": 0.8997, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.36289179398330446, | |
| "grad_norm": 3.6133008003234863, | |
| "learning_rate": 1.9975090251507637e-05, | |
| "loss": 0.9572, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.3729721215939518, | |
| "grad_norm": 4.111402988433838, | |
| "learning_rate": 1.9966100149801648e-05, | |
| "loss": 0.8465, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.38305244920459913, | |
| "grad_norm": 3.105464220046997, | |
| "learning_rate": 1.9955730307447015e-05, | |
| "loss": 0.84, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.3931327768152465, | |
| "grad_norm": 3.377089738845825, | |
| "learning_rate": 1.9943982160079823e-05, | |
| "loss": 0.977, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.40321310442589386, | |
| "grad_norm": 3.674912214279175, | |
| "learning_rate": 1.9930857334153374e-05, | |
| "loss": 0.9114, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.4132934320365412, | |
| "grad_norm": 3.491791248321533, | |
| "learning_rate": 1.9916357646713006e-05, | |
| "loss": 0.8507, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.42337375964718854, | |
| "grad_norm": 3.5988316535949707, | |
| "learning_rate": 1.9900485105144544e-05, | |
| "loss": 0.8459, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.4334540872578359, | |
| "grad_norm": 3.147287368774414, | |
| "learning_rate": 1.988324190689639e-05, | |
| "loss": 0.9254, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.4435344148684832, | |
| "grad_norm": 3.4546704292297363, | |
| "learning_rate": 1.9864630439175282e-05, | |
| "loss": 0.9388, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.4536147424791306, | |
| "grad_norm": 3.39437198638916, | |
| "learning_rate": 1.9844653278615836e-05, | |
| "loss": 0.8751, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.4636950700897779, | |
| "grad_norm": 2.966585159301758, | |
| "learning_rate": 1.9823313190923797e-05, | |
| "loss": 0.833, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.47377539770042526, | |
| "grad_norm": 6.668085098266602, | |
| "learning_rate": 1.9800613130493158e-05, | |
| "loss": 0.9399, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.4838557253110726, | |
| "grad_norm": 4.198956489562988, | |
| "learning_rate": 1.9776556239997146e-05, | |
| "loss": 0.8604, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.49393605292171994, | |
| "grad_norm": 3.0325896739959717, | |
| "learning_rate": 1.9751145849953135e-05, | |
| "loss": 0.8399, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.5040163805323673, | |
| "grad_norm": 3.0284290313720703, | |
| "learning_rate": 1.972438547826156e-05, | |
| "loss": 0.962, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.5140967081430147, | |
| "grad_norm": 3.7126681804656982, | |
| "learning_rate": 1.9696278829718882e-05, | |
| "loss": 0.8381, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.524177035753662, | |
| "grad_norm": 3.0753326416015625, | |
| "learning_rate": 1.9666829795504693e-05, | |
| "loss": 1.2808, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.5342573633643093, | |
| "grad_norm": 3.3041043281555176, | |
| "learning_rate": 1.9636042452643004e-05, | |
| "loss": 1.0921, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.5443376909749567, | |
| "grad_norm": 2.974684953689575, | |
| "learning_rate": 1.9603921063437795e-05, | |
| "loss": 1.1766, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.554418018585604, | |
| "grad_norm": 3.104491710662842, | |
| "learning_rate": 1.9570470074882947e-05, | |
| "loss": 0.8245, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.5644983461962514, | |
| "grad_norm": 2.8233213424682617, | |
| "learning_rate": 1.9535694118046584e-05, | |
| "loss": 0.8327, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.5745786738068988, | |
| "grad_norm": 3.0855748653411865, | |
| "learning_rate": 1.949959800742991e-05, | |
| "loss": 0.8333, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.5846590014175461, | |
| "grad_norm": 3.309098482131958, | |
| "learning_rate": 1.9462186740300697e-05, | |
| "loss": 0.8437, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.5947393290281934, | |
| "grad_norm": 3.7956173419952393, | |
| "learning_rate": 1.942346549600144e-05, | |
| "loss": 0.8533, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.6048196566388407, | |
| "grad_norm": 2.885507106781006, | |
| "learning_rate": 1.9383439635232296e-05, | |
| "loss": 0.9791, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.6148999842494881, | |
| "grad_norm": 2.976921319961548, | |
| "learning_rate": 1.9342114699308962e-05, | |
| "loss": 0.9537, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.6249803118601355, | |
| "grad_norm": 3.9822583198547363, | |
| "learning_rate": 1.9299496409395482e-05, | |
| "loss": 0.8513, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.6350606394707828, | |
| "grad_norm": 3.2742626667022705, | |
| "learning_rate": 1.9255590665712214e-05, | |
| "loss": 0.8606, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.6451409670814301, | |
| "grad_norm": 2.989588975906372, | |
| "learning_rate": 1.921040354671897e-05, | |
| "loss": 1.0236, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.6552212946920775, | |
| "grad_norm": 3.3849992752075195, | |
| "learning_rate": 1.9163941308273504e-05, | |
| "loss": 0.8274, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.6653016223027248, | |
| "grad_norm": 2.9485199451446533, | |
| "learning_rate": 1.911621038276542e-05, | |
| "loss": 0.8366, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.6753819499133722, | |
| "grad_norm": 2.8090484142303467, | |
| "learning_rate": 1.9067217378225655e-05, | |
| "loss": 1.0603, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.6854622775240196, | |
| "grad_norm": 2.795466899871826, | |
| "learning_rate": 1.9016969077411645e-05, | |
| "loss": 1.0391, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.6955426051346669, | |
| "grad_norm": 3.0513088703155518, | |
| "learning_rate": 1.8965472436868288e-05, | |
| "loss": 0.9469, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.7056229327453142, | |
| "grad_norm": 2.89764404296875, | |
| "learning_rate": 1.891273458596486e-05, | |
| "loss": 0.8358, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.7157032603559615, | |
| "grad_norm": 2.940281629562378, | |
| "learning_rate": 1.8858762825908e-05, | |
| "loss": 0.9117, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.7257835879666089, | |
| "grad_norm": 3.0357506275177, | |
| "learning_rate": 1.8803564628730916e-05, | |
| "loss": 0.8441, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.7358639155772563, | |
| "grad_norm": 3.0957512855529785, | |
| "learning_rate": 1.874714763625892e-05, | |
| "loss": 0.8154, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.7459442431879036, | |
| "grad_norm": 3.3382112979888916, | |
| "learning_rate": 1.8689519659051467e-05, | |
| "loss": 1.0091, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.7560245707985509, | |
| "grad_norm": 2.7359678745269775, | |
| "learning_rate": 1.8630688675320844e-05, | |
| "loss": 0.9901, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.7661048984091983, | |
| "grad_norm": 2.9160029888153076, | |
| "learning_rate": 1.8570662829827632e-05, | |
| "loss": 1.0645, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.7761852260198456, | |
| "grad_norm": 3.7096657752990723, | |
| "learning_rate": 1.8509450432753123e-05, | |
| "loss": 0.8458, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.786265553630493, | |
| "grad_norm": 2.9605114459991455, | |
| "learning_rate": 1.8447059958548822e-05, | |
| "loss": 0.8315, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.7963458812411404, | |
| "grad_norm": 3.1716909408569336, | |
| "learning_rate": 1.8383500044763226e-05, | |
| "loss": 0.8427, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.8064262088517877, | |
| "grad_norm": 4.014035224914551, | |
| "learning_rate": 1.8318779490846005e-05, | |
| "loss": 0.8391, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.816506536462435, | |
| "grad_norm": 2.6693358421325684, | |
| "learning_rate": 1.8252907256929777e-05, | |
| "loss": 1.0444, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.8265868640730824, | |
| "grad_norm": 3.824836254119873, | |
| "learning_rate": 1.818589246258964e-05, | |
| "loss": 0.9738, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.8366671916837297, | |
| "grad_norm": 3.0832369327545166, | |
| "learning_rate": 1.8117744385580627e-05, | |
| "loss": 0.9109, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.8467475192943771, | |
| "grad_norm": 3.707331657409668, | |
| "learning_rate": 1.804847246055326e-05, | |
| "loss": 0.8324, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.8568278469050244, | |
| "grad_norm": 3.3008220195770264, | |
| "learning_rate": 1.797808627774738e-05, | |
| "loss": 0.9383, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.8669081745156718, | |
| "grad_norm": 3.1887435913085938, | |
| "learning_rate": 1.7906595581664462e-05, | |
| "loss": 0.8441, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.8769885021263191, | |
| "grad_norm": 2.8672823905944824, | |
| "learning_rate": 1.7834010269718526e-05, | |
| "loss": 0.8353, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.8870688297369664, | |
| "grad_norm": 3.0346055030822754, | |
| "learning_rate": 1.776034039086592e-05, | |
| "loss": 1.348, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.8971491573476138, | |
| "grad_norm": 3.2000229358673096, | |
| "learning_rate": 1.768559614421411e-05, | |
| "loss": 0.8544, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.9072294849582612, | |
| "grad_norm": 3.007753610610962, | |
| "learning_rate": 1.7609787877609678e-05, | |
| "loss": 0.8505, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.9173098125689085, | |
| "grad_norm": 2.9321534633636475, | |
| "learning_rate": 1.753292608620573e-05, | |
| "loss": 0.8402, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.9273901401795558, | |
| "grad_norm": 2.6721303462982178, | |
| "learning_rate": 1.7455021411008906e-05, | |
| "loss": 0.8421, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.9374704677902032, | |
| "grad_norm": 3.0125327110290527, | |
| "learning_rate": 1.7376084637406222e-05, | |
| "loss": 0.8443, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.9475507954008505, | |
| "grad_norm": 2.6522045135498047, | |
| "learning_rate": 1.7296126693671886e-05, | |
| "loss": 1.2249, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.9576311230114979, | |
| "grad_norm": 2.6742281913757324, | |
| "learning_rate": 1.721515864945435e-05, | |
| "loss": 0.846, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.9677114506221453, | |
| "grad_norm": 2.9093592166900635, | |
| "learning_rate": 1.7133191714243805e-05, | |
| "loss": 0.9391, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.9777917782327926, | |
| "grad_norm": 3.675670623779297, | |
| "learning_rate": 1.7050237235820287e-05, | |
| "loss": 0.8723, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.9878721058434399, | |
| "grad_norm": 2.698991298675537, | |
| "learning_rate": 1.6966306698682672e-05, | |
| "loss": 0.8491, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.9979524334540872, | |
| "grad_norm": 2.8708646297454834, | |
| "learning_rate": 1.6881411722458688e-05, | |
| "loss": 1.1059, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 1.0080327610647346, | |
| "grad_norm": 3.9162817001342773, | |
| "learning_rate": 1.6795564060296295e-05, | |
| "loss": 0.711, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 1.018113088675382, | |
| "grad_norm": 3.2829103469848633, | |
| "learning_rate": 1.6708775597236507e-05, | |
| "loss": 0.7179, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 1.0281934162860293, | |
| "grad_norm": 4.782021999359131, | |
| "learning_rate": 1.6621058348568008e-05, | |
| "loss": 0.759, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 1.0382737438966767, | |
| "grad_norm": 4.203643798828125, | |
| "learning_rate": 1.6532424458163692e-05, | |
| "loss": 0.7717, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 1.048354071507324, | |
| "grad_norm": 4.421259880065918, | |
| "learning_rate": 1.6442886196799465e-05, | |
| "loss": 0.651, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 1.0584343991179714, | |
| "grad_norm": 3.6463119983673096, | |
| "learning_rate": 1.6352455960455385e-05, | |
| "loss": 0.7719, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 1.0685147267286186, | |
| "grad_norm": 3.408778429031372, | |
| "learning_rate": 1.6261146268599564e-05, | |
| "loss": 0.6591, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 1.078595054339266, | |
| "grad_norm": 3.7105519771575928, | |
| "learning_rate": 1.6168969762454897e-05, | |
| "loss": 0.6645, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 1.0886753819499133, | |
| "grad_norm": 3.8667266368865967, | |
| "learning_rate": 1.607593920324899e-05, | |
| "loss": 0.7159, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 1.0987557095605607, | |
| "grad_norm": 3.4244542121887207, | |
| "learning_rate": 1.598206747044746e-05, | |
| "loss": 0.8331, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 1.108836037171208, | |
| "grad_norm": 3.326638698577881, | |
| "learning_rate": 1.5887367559970825e-05, | |
| "loss": 0.6831, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 1.1189163647818554, | |
| "grad_norm": 3.7026546001434326, | |
| "learning_rate": 1.5791852582395334e-05, | |
| "loss": 0.6642, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 1.1289966923925028, | |
| "grad_norm": 3.1158483028411865, | |
| "learning_rate": 1.569553576113789e-05, | |
| "loss": 0.6531, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 1.1390770200031501, | |
| "grad_norm": 3.1516995429992676, | |
| "learning_rate": 1.5598430430625335e-05, | |
| "loss": 0.6734, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 1.1491573476137975, | |
| "grad_norm": 3.048830270767212, | |
| "learning_rate": 1.5500550034448415e-05, | |
| "loss": 0.6412, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 1.1592376752244449, | |
| "grad_norm": 2.9057071208953857, | |
| "learning_rate": 1.540190812350059e-05, | |
| "loss": 0.6441, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 1.1693180028350922, | |
| "grad_norm": 4.930371284484863, | |
| "learning_rate": 1.5302518354101992e-05, | |
| "loss": 0.6499, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 1.1793983304457396, | |
| "grad_norm": 3.2132606506347656, | |
| "learning_rate": 1.5202394486108823e-05, | |
| "loss": 0.7648, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 1.1894786580563868, | |
| "grad_norm": 3.247512102127075, | |
| "learning_rate": 1.5101550381008377e-05, | |
| "loss": 0.6341, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 1.1995589856670341, | |
| "grad_norm": 2.8837008476257324, | |
| "learning_rate": 1.5000000000000002e-05, | |
| "loss": 0.6785, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 1.2096393132776815, | |
| "grad_norm": 3.362884998321533, | |
| "learning_rate": 1.4897757402062285e-05, | |
| "loss": 0.6433, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 1.2197196408883288, | |
| "grad_norm": 3.0538456439971924, | |
| "learning_rate": 1.4794836742006667e-05, | |
| "loss": 0.7454, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 1.2297999684989762, | |
| "grad_norm": 3.1768131256103516, | |
| "learning_rate": 1.4691252268517794e-05, | |
| "loss": 0.7879, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 1.2398802961096236, | |
| "grad_norm": 3.0788283348083496, | |
| "learning_rate": 1.4587018322180906e-05, | |
| "loss": 0.7103, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 1.249960623720271, | |
| "grad_norm": 2.7563843727111816, | |
| "learning_rate": 1.4482149333496455e-05, | |
| "loss": 0.6592, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 1.2600409513309183, | |
| "grad_norm": 3.421998977661133, | |
| "learning_rate": 1.4376659820882308e-05, | |
| "loss": 0.7862, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 1.2701212789415657, | |
| "grad_norm": 3.0229763984680176, | |
| "learning_rate": 1.4270564388663761e-05, | |
| "loss": 0.6586, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 1.2802016065522128, | |
| "grad_norm": 2.8393478393554688, | |
| "learning_rate": 1.4163877725051677e-05, | |
| "loss": 0.7359, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 1.2902819341628602, | |
| "grad_norm": 3.009399175643921, | |
| "learning_rate": 1.4056614600108998e-05, | |
| "loss": 0.6755, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 1.3003622617735076, | |
| "grad_norm": 2.9821226596832275, | |
| "learning_rate": 1.3948789863705914e-05, | |
| "loss": 0.629, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 1.310442589384155, | |
| "grad_norm": 3.1034419536590576, | |
| "learning_rate": 1.3840418443464015e-05, | |
| "loss": 0.6466, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 1.3205229169948023, | |
| "grad_norm": 3.318528413772583, | |
| "learning_rate": 1.3731515342689654e-05, | |
| "loss": 1.0047, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 1.3306032446054497, | |
| "grad_norm": 2.9240915775299072, | |
| "learning_rate": 1.3622095638296827e-05, | |
| "loss": 0.668, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 1.340683572216097, | |
| "grad_norm": 3.9342963695526123, | |
| "learning_rate": 1.3512174478719896e-05, | |
| "loss": 0.6465, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 1.3507638998267444, | |
| "grad_norm": 2.6329188346862793, | |
| "learning_rate": 1.340176708181637e-05, | |
| "loss": 0.8036, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 1.3608442274373918, | |
| "grad_norm": 3.2634246349334717, | |
| "learning_rate": 1.32908887327601e-05, | |
| "loss": 0.6348, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 1.3709245550480391, | |
| "grad_norm": 2.8796093463897705, | |
| "learning_rate": 1.317955478192515e-05, | |
| "loss": 0.7079, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 1.3810048826586865, | |
| "grad_norm": 3.15555477142334, | |
| "learning_rate": 1.306778064276064e-05, | |
| "loss": 0.7197, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 1.3910852102693338, | |
| "grad_norm": 2.8696324825286865, | |
| "learning_rate": 1.2955581789656844e-05, | |
| "loss": 0.6422, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 1.4011655378799812, | |
| "grad_norm": 2.841829538345337, | |
| "learning_rate": 1.2842973755802872e-05, | |
| "loss": 0.6522, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 1.4112458654906286, | |
| "grad_norm": 2.869424819946289, | |
| "learning_rate": 1.2729972131036212e-05, | |
| "loss": 0.7462, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 1.4213261931012757, | |
| "grad_norm": 3.1612401008605957, | |
| "learning_rate": 1.2616592559684408e-05, | |
| "loss": 0.6471, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 1.431406520711923, | |
| "grad_norm": 2.7609212398529053, | |
| "learning_rate": 1.25028507383992e-05, | |
| "loss": 0.7627, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 1.4414868483225705, | |
| "grad_norm": 2.678645610809326, | |
| "learning_rate": 1.2388762413983447e-05, | |
| "loss": 0.6729, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 1.4515671759332178, | |
| "grad_norm": 2.8031179904937744, | |
| "learning_rate": 1.2274343381211067e-05, | |
| "loss": 0.6497, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 1.4616475035438652, | |
| "grad_norm": 2.735318660736084, | |
| "learning_rate": 1.2159609480640361e-05, | |
| "loss": 0.6786, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 1.4717278311545126, | |
| "grad_norm": 2.9970738887786865, | |
| "learning_rate": 1.2044576596421003e-05, | |
| "loss": 0.6498, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 1.48180815876516, | |
| "grad_norm": 2.8578624725341797, | |
| "learning_rate": 1.192926065409497e-05, | |
| "loss": 0.6432, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 1.4918884863758073, | |
| "grad_norm": 2.9433505535125732, | |
| "learning_rate": 1.1813677618391759e-05, | |
| "loss": 0.6274, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 1.5019688139864544, | |
| "grad_norm": 2.799851417541504, | |
| "learning_rate": 1.1697843491018189e-05, | |
| "loss": 0.6507, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 1.5120491415971018, | |
| "grad_norm": 2.589261770248413, | |
| "learning_rate": 1.1581774308443042e-05, | |
| "loss": 0.8801, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 1.5221294692077492, | |
| "grad_norm": 2.7499136924743652, | |
| "learning_rate": 1.1465486139676955e-05, | |
| "loss": 0.6428, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 1.5322097968183965, | |
| "grad_norm": 3.6993484497070312, | |
| "learning_rate": 1.134899508404775e-05, | |
| "loss": 0.6641, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 1.542290124429044, | |
| "grad_norm": 5.174093723297119, | |
| "learning_rate": 1.1232317268971586e-05, | |
| "loss": 0.7828, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 1.5523704520396913, | |
| "grad_norm": 2.734003782272339, | |
| "learning_rate": 1.1115468847720245e-05, | |
| "loss": 0.7631, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 1.5624507796503386, | |
| "grad_norm": 3.996946334838867, | |
| "learning_rate": 1.0998465997184798e-05, | |
| "loss": 0.6416, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 1.572531107260986, | |
| "grad_norm": 3.302497386932373, | |
| "learning_rate": 1.088132491563602e-05, | |
| "loss": 0.6543, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 1.5826114348716334, | |
| "grad_norm": 2.9945664405822754, | |
| "learning_rate": 1.0764061820481872e-05, | |
| "loss": 0.6902, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 1.5926917624822807, | |
| "grad_norm": 2.6038448810577393, | |
| "learning_rate": 1.0646692946022285e-05, | |
| "loss": 0.6289, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 1.602772090092928, | |
| "grad_norm": 2.6396548748016357, | |
| "learning_rate": 1.0529234541201631e-05, | |
| "loss": 0.8164, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 1.6128524177035755, | |
| "grad_norm": 2.770799160003662, | |
| "learning_rate": 1.041170286735918e-05, | |
| "loss": 0.6438, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 1.6229327453142228, | |
| "grad_norm": 2.666429042816162, | |
| "learning_rate": 1.0294114195977796e-05, | |
| "loss": 0.6912, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 1.6330130729248702, | |
| "grad_norm": 2.9112017154693604, | |
| "learning_rate": 1.0176484806431288e-05, | |
| "loss": 0.7345, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 1.6430934005355176, | |
| "grad_norm": 3.0811562538146973, | |
| "learning_rate": 1.0058830983730622e-05, | |
| "loss": 0.7558, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 1.6531737281461647, | |
| "grad_norm": 2.8588948249816895, | |
| "learning_rate": 9.94116901626938e-06, | |
| "loss": 0.648, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 1.663254055756812, | |
| "grad_norm": 2.7404801845550537, | |
| "learning_rate": 9.823515193568715e-06, | |
| "loss": 0.695, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 1.6733343833674594, | |
| "grad_norm": 2.9604055881500244, | |
| "learning_rate": 9.705885804022207e-06, | |
| "loss": 0.6304, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 1.6834147109781068, | |
| "grad_norm": 3.1369986534118652, | |
| "learning_rate": 9.588297132640824e-06, | |
| "loss": 0.8216, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 1.6934950385887542, | |
| "grad_norm": 2.912094831466675, | |
| "learning_rate": 9.470765458798369e-06, | |
| "loss": 0.6653, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 1.7035753661994015, | |
| "grad_norm": 2.6910438537597656, | |
| "learning_rate": 9.353307053977717e-06, | |
| "loss": 0.6645, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 1.7136556938100487, | |
| "grad_norm": 2.958939790725708, | |
| "learning_rate": 9.235938179518131e-06, | |
| "loss": 0.6222, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 1.723736021420696, | |
| "grad_norm": 2.7143542766571045, | |
| "learning_rate": 9.118675084363986e-06, | |
| "loss": 0.7051, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 1.7338163490313434, | |
| "grad_norm": 2.6610677242279053, | |
| "learning_rate": 9.001534002815209e-06, | |
| "loss": 0.6333, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 1.7438966766419908, | |
| "grad_norm": 2.9667537212371826, | |
| "learning_rate": 8.884531152279757e-06, | |
| "loss": 0.6832, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 1.7539770042526381, | |
| "grad_norm": 2.64609956741333, | |
| "learning_rate": 8.767682731028415e-06, | |
| "loss": 0.9484, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 1.7640573318632855, | |
| "grad_norm": 2.682523012161255, | |
| "learning_rate": 8.651004915952252e-06, | |
| "loss": 0.8721, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 1.7741376594739329, | |
| "grad_norm": 2.5906975269317627, | |
| "learning_rate": 8.534513860323047e-06, | |
| "loss": 0.9793, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 1.7842179870845802, | |
| "grad_norm": 2.636467456817627, | |
| "learning_rate": 8.418225691556962e-06, | |
| "loss": 0.9016, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 1.7942983146952276, | |
| "grad_norm": 3.5005948543548584, | |
| "learning_rate": 8.302156508981816e-06, | |
| "loss": 0.738, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 1.804378642305875, | |
| "grad_norm": 2.7986643314361572, | |
| "learning_rate": 8.18632238160824e-06, | |
| "loss": 0.6635, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 1.8144589699165223, | |
| "grad_norm": 2.8597512245178223, | |
| "learning_rate": 8.070739345905032e-06, | |
| "loss": 0.7473, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 1.8245392975271697, | |
| "grad_norm": 2.7487239837646484, | |
| "learning_rate": 7.955423403578998e-06, | |
| "loss": 0.7526, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 1.834619625137817, | |
| "grad_norm": 2.68874454498291, | |
| "learning_rate": 7.840390519359644e-06, | |
| "loss": 0.6491, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 1.8446999527484644, | |
| "grad_norm": 2.8393709659576416, | |
| "learning_rate": 7.725656618788938e-06, | |
| "loss": 0.6401, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 1.8547802803591118, | |
| "grad_norm": 2.8322646617889404, | |
| "learning_rate": 7.611237586016558e-06, | |
| "loss": 0.7692, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 1.8648606079697592, | |
| "grad_norm": 2.760575771331787, | |
| "learning_rate": 7.497149261600803e-06, | |
| "loss": 0.926, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 1.8749409355804063, | |
| "grad_norm": 2.6379311084747314, | |
| "learning_rate": 7.383407440315595e-06, | |
| "loss": 0.654, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 1.8850212631910537, | |
| "grad_norm": 2.6411261558532715, | |
| "learning_rate": 7.27002786896379e-06, | |
| "loss": 0.7753, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 1.895101590801701, | |
| "grad_norm": 2.6866044998168945, | |
| "learning_rate": 7.157026244197132e-06, | |
| "loss": 0.6479, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 1.9051819184123484, | |
| "grad_norm": 2.743093252182007, | |
| "learning_rate": 7.044418210343161e-06, | |
| "loss": 0.7825, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 1.9152622460229958, | |
| "grad_norm": 2.7608628273010254, | |
| "learning_rate": 6.932219357239362e-06, | |
| "loss": 0.6497, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 1.9253425736336431, | |
| "grad_norm": 2.581033706665039, | |
| "learning_rate": 6.820445218074849e-06, | |
| "loss": 0.658, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 1.9354229012442903, | |
| "grad_norm": 2.8341994285583496, | |
| "learning_rate": 6.7091112672399e-06, | |
| "loss": 0.8367, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 1.9455032288549376, | |
| "grad_norm": 2.712247133255005, | |
| "learning_rate": 6.5982329181836325e-06, | |
| "loss": 0.647, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 1.955583556465585, | |
| "grad_norm": 2.683356761932373, | |
| "learning_rate": 6.487825521280109e-06, | |
| "loss": 0.7316, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 1.9656638840762324, | |
| "grad_norm": 2.6433842182159424, | |
| "learning_rate": 6.3779043617031775e-06, | |
| "loss": 0.84, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 1.9757442116868797, | |
| "grad_norm": 4.231180667877197, | |
| "learning_rate": 6.268484657310351e-06, | |
| "loss": 0.7416, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 1.9858245392975271, | |
| "grad_norm": 3.0023813247680664, | |
| "learning_rate": 6.159581556535989e-06, | |
| "loss": 0.8632, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 1.9959048669081745, | |
| "grad_norm": 2.6306092739105225, | |
| "learning_rate": 6.051210136294089e-06, | |
| "loss": 0.6557, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 2.005985194518822, | |
| "grad_norm": 3.3288090229034424, | |
| "learning_rate": 5.943385399891004e-06, | |
| "loss": 0.5327, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 2.016065522129469, | |
| "grad_norm": 3.394890069961548, | |
| "learning_rate": 5.8361222749483246e-06, | |
| "loss": 0.5682, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 2.0261458497401166, | |
| "grad_norm": 4.5706658363342285, | |
| "learning_rate": 5.729435611336239e-06, | |
| "loss": 0.521, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 2.036226177350764, | |
| "grad_norm": 3.595043420791626, | |
| "learning_rate": 5.6233401791176946e-06, | |
| "loss": 0.4973, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 2.0463065049614113, | |
| "grad_norm": 3.381319761276245, | |
| "learning_rate": 5.517850666503547e-06, | |
| "loss": 0.7273, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 2.0563868325720587, | |
| "grad_norm": 3.813019037246704, | |
| "learning_rate": 5.412981677819094e-06, | |
| "loss": 0.5748, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 2.066467160182706, | |
| "grad_norm": 2.9697418212890625, | |
| "learning_rate": 5.308747731482207e-06, | |
| "loss": 0.6197, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 2.0765474877933534, | |
| "grad_norm": 4.898626804351807, | |
| "learning_rate": 5.205163257993341e-06, | |
| "loss": 0.4839, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 2.0866278154040008, | |
| "grad_norm": 4.427509784698486, | |
| "learning_rate": 5.1022425979377174e-06, | |
| "loss": 0.455, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 2.096708143014648, | |
| "grad_norm": 3.7625739574432373, | |
| "learning_rate": 5.000000000000003e-06, | |
| "loss": 0.538, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 2.1067884706252955, | |
| "grad_norm": 2.9984402656555176, | |
| "learning_rate": 4.89844961899163e-06, | |
| "loss": 0.5426, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 2.116868798235943, | |
| "grad_norm": 3.2791342735290527, | |
| "learning_rate": 4.797605513891179e-06, | |
| "loss": 0.5505, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 2.12694912584659, | |
| "grad_norm": 3.1224496364593506, | |
| "learning_rate": 4.697481645898012e-06, | |
| "loss": 0.5466, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 2.137029453457237, | |
| "grad_norm": 2.9908924102783203, | |
| "learning_rate": 4.598091876499417e-06, | |
| "loss": 0.4739, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 2.1471097810678845, | |
| "grad_norm": 3.272909164428711, | |
| "learning_rate": 4.4994499655515865e-06, | |
| "loss": 0.4773, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 2.157190108678532, | |
| "grad_norm": 3.1660666465759277, | |
| "learning_rate": 4.4015695693746685e-06, | |
| "loss": 0.6012, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 2.1672704362891793, | |
| "grad_norm": 2.9826879501342773, | |
| "learning_rate": 4.304464238862115e-06, | |
| "loss": 0.559, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 2.1773507638998266, | |
| "grad_norm": 3.156632900238037, | |
| "learning_rate": 4.208147417604665e-06, | |
| "loss": 0.4658, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 2.187431091510474, | |
| "grad_norm": 2.7207696437835693, | |
| "learning_rate": 4.112632440029176e-06, | |
| "loss": 0.4746, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 2.1975114191211214, | |
| "grad_norm": 3.170917272567749, | |
| "learning_rate": 4.017932529552543e-06, | |
| "loss": 0.4555, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 2.2075917467317687, | |
| "grad_norm": 2.873971939086914, | |
| "learning_rate": 3.924060796751012e-06, | |
| "loss": 0.4927, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 2.217672074342416, | |
| "grad_norm": 3.0037410259246826, | |
| "learning_rate": 3.83103023754511e-06, | |
| "loss": 0.5199, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 2.2277524019530635, | |
| "grad_norm": 3.2414352893829346, | |
| "learning_rate": 3.7388537314004394e-06, | |
| "loss": 0.4665, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 2.237832729563711, | |
| "grad_norm": 2.9535632133483887, | |
| "learning_rate": 3.647544039544615e-06, | |
| "loss": 0.4625, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 2.247913057174358, | |
| "grad_norm": 2.874563455581665, | |
| "learning_rate": 3.557113803200537e-06, | |
| "loss": 0.4651, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 2.2579933847850056, | |
| "grad_norm": 2.8400771617889404, | |
| "learning_rate": 3.4675755418363054e-06, | |
| "loss": 0.4741, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 2.268073712395653, | |
| "grad_norm": 3.162914752960205, | |
| "learning_rate": 3.378941651431996e-06, | |
| "loss": 0.5043, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 2.2781540400063003, | |
| "grad_norm": 3.108367681503296, | |
| "learning_rate": 3.2912244027634953e-06, | |
| "loss": 0.4612, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 2.2882343676169477, | |
| "grad_norm": 2.9453072547912598, | |
| "learning_rate": 3.204435939703705e-06, | |
| "loss": 0.5951, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 2.298314695227595, | |
| "grad_norm": 2.926748752593994, | |
| "learning_rate": 3.1185882775413123e-06, | |
| "loss": 0.4727, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 2.3083950228382424, | |
| "grad_norm": 2.7505362033843994, | |
| "learning_rate": 3.0336933013173307e-06, | |
| "loss": 0.4771, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 2.3184753504488897, | |
| "grad_norm": 3.1303627490997314, | |
| "learning_rate": 2.949762764179711e-06, | |
| "loss": 0.5534, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 2.328555678059537, | |
| "grad_norm": 2.6740784645080566, | |
| "learning_rate": 2.8668082857562006e-06, | |
| "loss": 0.4713, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 2.3386360056701845, | |
| "grad_norm": 2.6228513717651367, | |
| "learning_rate": 2.7848413505456564e-06, | |
| "loss": 0.532, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 2.348716333280832, | |
| "grad_norm": 3.3503799438476562, | |
| "learning_rate": 2.7038733063281177e-06, | |
| "loss": 0.5022, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 2.358796660891479, | |
| "grad_norm": 2.798093557357788, | |
| "learning_rate": 2.6239153625937786e-06, | |
| "loss": 0.4674, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 2.368876988502126, | |
| "grad_norm": 2.9283840656280518, | |
| "learning_rate": 2.544978588991096e-06, | |
| "loss": 0.5145, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 2.3789573161127735, | |
| "grad_norm": 2.9294168949127197, | |
| "learning_rate": 2.4670739137942723e-06, | |
| "loss": 0.5262, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 2.389037643723421, | |
| "grad_norm": 2.9340579509735107, | |
| "learning_rate": 2.390212122390323e-06, | |
| "loss": 0.4654, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 2.3991179713340682, | |
| "grad_norm": 2.9282586574554443, | |
| "learning_rate": 2.3144038557858915e-06, | |
| "loss": 0.7147, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 2.4091982989447156, | |
| "grad_norm": 2.9570140838623047, | |
| "learning_rate": 2.2396596091340805e-06, | |
| "loss": 0.4643, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 2.419278626555363, | |
| "grad_norm": 2.7537012100219727, | |
| "learning_rate": 2.165989730281475e-06, | |
| "loss": 0.4467, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 2.4293589541660103, | |
| "grad_norm": 2.764420986175537, | |
| "learning_rate": 2.0934044183355384e-06, | |
| "loss": 0.4774, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 2.4394392817766577, | |
| "grad_norm": 3.11476993560791, | |
| "learning_rate": 2.0219137222526188e-06, | |
| "loss": 0.5792, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 2.449519609387305, | |
| "grad_norm": 4.960219860076904, | |
| "learning_rate": 1.9515275394467446e-06, | |
| "loss": 0.457, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 2.4595999369979524, | |
| "grad_norm": 3.0883636474609375, | |
| "learning_rate": 1.882255614419376e-06, | |
| "loss": 0.6268, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 2.4696802646086, | |
| "grad_norm": 2.9060003757476807, | |
| "learning_rate": 1.8141075374103634e-06, | |
| "loss": 0.5785, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 2.479760592219247, | |
| "grad_norm": 2.7196030616760254, | |
| "learning_rate": 1.7470927430702277e-06, | |
| "loss": 0.4658, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 2.4898409198298945, | |
| "grad_norm": 2.8663458824157715, | |
| "learning_rate": 1.6812205091539979e-06, | |
| "loss": 0.4635, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 2.499921247440542, | |
| "grad_norm": 2.7674198150634766, | |
| "learning_rate": 1.6164999552367767e-06, | |
| "loss": 0.475, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 2.5100015750511893, | |
| "grad_norm": 2.971801280975342, | |
| "learning_rate": 1.5529400414511809e-06, | |
| "loss": 0.5657, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 2.5200819026618366, | |
| "grad_norm": 3.0309677124023438, | |
| "learning_rate": 1.4905495672468784e-06, | |
| "loss": 0.4609, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 2.530162230272484, | |
| "grad_norm": 2.9311161041259766, | |
| "learning_rate": 1.4293371701723701e-06, | |
| "loss": 0.5184, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 2.5402425578831314, | |
| "grad_norm": 2.813347578048706, | |
| "learning_rate": 1.369311324679159e-06, | |
| "loss": 0.4675, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 2.5503228854937783, | |
| "grad_norm": 2.8366470336914062, | |
| "learning_rate": 1.3104803409485357e-06, | |
| "loss": 0.5566, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 2.5604032131044256, | |
| "grad_norm": 2.9654905796051025, | |
| "learning_rate": 1.252852363741084e-06, | |
| "loss": 0.4739, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 2.570483540715073, | |
| "grad_norm": 2.7376723289489746, | |
| "learning_rate": 1.196435371269089e-06, | |
| "loss": 0.4755, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 2.5805638683257204, | |
| "grad_norm": 2.603374719619751, | |
| "learning_rate": 1.1412371740920036e-06, | |
| "loss": 0.45, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 2.5906441959363677, | |
| "grad_norm": 2.7828543186187744, | |
| "learning_rate": 1.0872654140351458e-06, | |
| "loss": 0.7043, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 2.600724523547015, | |
| "grad_norm": 2.6468091011047363, | |
| "learning_rate": 1.0345275631317165e-06, | |
| "loss": 0.6008, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 2.6108048511576625, | |
| "grad_norm": 2.7856605052948, | |
| "learning_rate": 9.830309225883562e-07, | |
| "loss": 0.55, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 2.62088517876831, | |
| "grad_norm": 2.949894428253174, | |
| "learning_rate": 9.327826217743452e-07, | |
| "loss": 0.4517, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 2.630965506378957, | |
| "grad_norm": 3.2041678428649902, | |
| "learning_rate": 8.837896172345827e-07, | |
| "loss": 0.5641, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 2.6410458339896046, | |
| "grad_norm": 3.0084447860717773, | |
| "learning_rate": 8.360586917264979e-07, | |
| "loss": 0.4635, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 2.651126161600252, | |
| "grad_norm": 2.7008798122406006, | |
| "learning_rate": 7.895964532810318e-07, | |
| "loss": 0.5861, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 2.6612064892108993, | |
| "grad_norm": 2.8974366188049316, | |
| "learning_rate": 7.4440933428779e-07, | |
| "loss": 0.4619, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 2.6712868168215467, | |
| "grad_norm": 2.954524040222168, | |
| "learning_rate": 7.005035906045199e-07, | |
| "loss": 0.4819, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 2.681367144432194, | |
| "grad_norm": 2.6953723430633545, | |
| "learning_rate": 6.578853006910402e-07, | |
| "loss": 0.4843, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 2.6914474720428414, | |
| "grad_norm": 3.108698844909668, | |
| "learning_rate": 6.165603647677054e-07, | |
| "loss": 0.4586, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 2.7015277996534888, | |
| "grad_norm": 2.9222874641418457, | |
| "learning_rate": 5.765345039985648e-07, | |
| "loss": 0.4737, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 2.711608127264136, | |
| "grad_norm": 2.904561996459961, | |
| "learning_rate": 5.378132596993047e-07, | |
| "loss": 0.4641, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 2.7216884548747835, | |
| "grad_norm": 3.0697550773620605, | |
| "learning_rate": 5.004019925700921e-07, | |
| "loss": 0.5555, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 2.731768782485431, | |
| "grad_norm": 2.980802536010742, | |
| "learning_rate": 4.6430588195341853e-07, | |
| "loss": 0.5908, | |
| "step": 271 | |
| }, | |
| { | |
| "epoch": 2.7418491100960782, | |
| "grad_norm": 3.690682888031006, | |
| "learning_rate": 4.295299251170537e-07, | |
| "loss": 0.5399, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 2.7519294377067256, | |
| "grad_norm": 2.920565128326416, | |
| "learning_rate": 3.960789365622075e-07, | |
| "loss": 0.7304, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 2.762009765317373, | |
| "grad_norm": 2.853963851928711, | |
| "learning_rate": 3.6395754735699896e-07, | |
| "loss": 0.571, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 2.7720900929280203, | |
| "grad_norm": 2.8439598083496094, | |
| "learning_rate": 3.3317020449530666e-07, | |
| "loss": 0.5728, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 2.7821704205386677, | |
| "grad_norm": 2.6904118061065674, | |
| "learning_rate": 3.0372117028111825e-07, | |
| "loss": 0.4688, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 2.792250748149315, | |
| "grad_norm": 2.8385207653045654, | |
| "learning_rate": 2.7561452173844206e-07, | |
| "loss": 0.5202, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 2.8023310757599624, | |
| "grad_norm": 2.739987373352051, | |
| "learning_rate": 2.488541500468666e-07, | |
| "loss": 0.586, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 2.81241140337061, | |
| "grad_norm": 2.9050772190093994, | |
| "learning_rate": 2.2344376000285606e-07, | |
| "loss": 0.4698, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 2.822491730981257, | |
| "grad_norm": 2.7418956756591797, | |
| "learning_rate": 1.993868695068457e-07, | |
| "loss": 0.4432, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 2.832572058591904, | |
| "grad_norm": 2.746795892715454, | |
| "learning_rate": 1.766868090762075e-07, | |
| "loss": 0.5331, | |
| "step": 281 | |
| }, | |
| { | |
| "epoch": 2.8426523862025515, | |
| "grad_norm": 2.7366440296173096, | |
| "learning_rate": 1.553467213841664e-07, | |
| "loss": 0.4813, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 2.852732713813199, | |
| "grad_norm": 2.755558729171753, | |
| "learning_rate": 1.3536956082472074e-07, | |
| "loss": 0.4859, | |
| "step": 283 | |
| }, | |
| { | |
| "epoch": 2.862813041423846, | |
| "grad_norm": 2.7609097957611084, | |
| "learning_rate": 1.1675809310361497e-07, | |
| "loss": 0.4655, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 2.8728933690344935, | |
| "grad_norm": 3.0042688846588135, | |
| "learning_rate": 9.951489485545696e-08, | |
| "loss": 0.4557, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 2.882973696645141, | |
| "grad_norm": 2.9627745151519775, | |
| "learning_rate": 8.364235328699566e-08, | |
| "loss": 0.4968, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 2.8930540242557883, | |
| "grad_norm": 2.8256924152374268, | |
| "learning_rate": 6.914266584662988e-08, | |
| "loss": 0.4728, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 2.9031343518664356, | |
| "grad_norm": 2.7654502391815186, | |
| "learning_rate": 5.6017839920180506e-08, | |
| "loss": 0.4594, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 2.913214679477083, | |
| "grad_norm": 5.463405609130859, | |
| "learning_rate": 4.426969255298841e-08, | |
| "loss": 0.4786, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 2.9232950070877304, | |
| "grad_norm": 2.978433132171631, | |
| "learning_rate": 3.38998501983534e-08, | |
| "loss": 0.4687, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 2.9333753346983777, | |
| "grad_norm": 2.887697458267212, | |
| "learning_rate": 2.4909748492362162e-08, | |
| "loss": 0.5466, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 2.943455662309025, | |
| "grad_norm": 2.823009729385376, | |
| "learning_rate": 1.730063205513277e-08, | |
| "loss": 0.5068, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 2.9535359899196725, | |
| "grad_norm": 2.9089882373809814, | |
| "learning_rate": 1.1073554318509206e-08, | |
| "loss": 0.4521, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 2.96361631753032, | |
| "grad_norm": 2.763700246810913, | |
| "learning_rate": 6.229377380218005e-09, | |
| "loss": 0.456, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 2.973696645140967, | |
| "grad_norm": 2.9146535396575928, | |
| "learning_rate": 2.7687718845148538e-09, | |
| "loss": 0.4586, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 2.9837769727516146, | |
| "grad_norm": 2.999981641769409, | |
| "learning_rate": 6.922169293421821e-10, | |
| "loss": 0.464, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 2.9938573003622615, | |
| "grad_norm": 4.130459785461426, | |
| "learning_rate": 0.0, | |
| "loss": 0.4729, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 2.9938573003622615, | |
| "step": 297, | |
| "total_flos": 1.3286712322107113e+19, | |
| "train_loss": 0.7266113918638389, | |
| "train_runtime": 26017.2965, | |
| "train_samples_per_second": 5.856, | |
| "train_steps_per_second": 0.011 | |
| } | |
| ], | |
| "logging_steps": 1.0, | |
| "max_steps": 297, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.3286712322107113e+19, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |