| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 4.98812351543943, | |
| "eval_steps": 100, | |
| "global_step": 525, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 4.008147055910771, | |
| "learning_rate": 1.25e-05, | |
| "loss": 4.2415, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 4.04569203441769, | |
| "learning_rate": 2.5e-05, | |
| "loss": 4.3121, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 3.865746651377984, | |
| "learning_rate": 3.7500000000000003e-05, | |
| "loss": 4.3208, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 2.6407193073379105, | |
| "learning_rate": 5e-05, | |
| "loss": 3.8848, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 2.451159328560232, | |
| "learning_rate": 6.25e-05, | |
| "loss": 3.4391, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 1.8259504797317525, | |
| "learning_rate": 7.500000000000001e-05, | |
| "loss": 3.0656, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 1.1881779175566867, | |
| "learning_rate": 8.75e-05, | |
| "loss": 2.8135, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 1.614839668966139, | |
| "learning_rate": 0.0001, | |
| "loss": 2.7319, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 1.5198673994210212, | |
| "learning_rate": 0.00011250000000000001, | |
| "loss": 2.6903, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 1.0044025931610727, | |
| "learning_rate": 0.000125, | |
| "loss": 2.584, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 1.1531821793787296, | |
| "learning_rate": 0.0001375, | |
| "loss": 2.586, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 0.6210600474209341, | |
| "learning_rate": 0.00015000000000000001, | |
| "loss": 2.5298, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 0.5025244204180619, | |
| "learning_rate": 0.00016250000000000002, | |
| "loss": 2.4665, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 0.5058788641352842, | |
| "learning_rate": 0.000175, | |
| "loss": 2.4194, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 0.44571801666869537, | |
| "learning_rate": 0.0001875, | |
| "loss": 2.3531, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 0.44028009268534757, | |
| "learning_rate": 0.0002, | |
| "loss": 2.2749, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 0.42473118020142525, | |
| "learning_rate": 0.00019999809527270051, | |
| "loss": 2.2587, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 0.465029302165452, | |
| "learning_rate": 0.0001999923811633618, | |
| "loss": 2.2196, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 0.49040381415815754, | |
| "learning_rate": 0.00019998285788966027, | |
| "loss": 2.2061, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 0.4160855034634493, | |
| "learning_rate": 0.00019996952581438068, | |
| "loss": 2.1173, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 0.45625369964232165, | |
| "learning_rate": 0.00019995238544540241, | |
| "loss": 2.1267, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 0.42551849567803673, | |
| "learning_rate": 0.00019993143743568, | |
| "loss": 2.0976, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 0.5100052595965069, | |
| "learning_rate": 0.0001999066825832184, | |
| "loss": 2.0428, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 0.4717525078599394, | |
| "learning_rate": 0.00019987812183104247, | |
| "loss": 2.0068, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 0.5596905853419681, | |
| "learning_rate": 0.0001998457562671611, | |
| "loss": 2.0303, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 0.4931645550169434, | |
| "learning_rate": 0.00019980958712452577, | |
| "loss": 1.9722, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 0.4433810930704678, | |
| "learning_rate": 0.0001997696157809835, | |
| "loss": 1.957, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 0.5522396650266582, | |
| "learning_rate": 0.0001997258437592245, | |
| "loss": 1.915, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 0.49861222066728145, | |
| "learning_rate": 0.00019967827272672408, | |
| "loss": 1.8303, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 0.6169911964169147, | |
| "learning_rate": 0.00019962690449567912, | |
| "loss": 1.8454, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 0.5639780725078123, | |
| "learning_rate": 0.000199571741022939, | |
| "loss": 1.8068, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 0.6302805853808786, | |
| "learning_rate": 0.0001995127844099313, | |
| "loss": 1.7166, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 0.6494693483139545, | |
| "learning_rate": 0.00019945003690258125, | |
| "loss": 1.6433, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 0.7598443409498918, | |
| "learning_rate": 0.00019938350089122682, | |
| "loss": 1.7081, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 0.6512764391881087, | |
| "learning_rate": 0.00019931317891052708, | |
| "loss": 1.6436, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 0.6953537359048508, | |
| "learning_rate": 0.00019923907363936593, | |
| "loss": 1.5862, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 0.6011387829084072, | |
| "learning_rate": 0.00019916118790075008, | |
| "loss": 1.5432, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 0.659130437748028, | |
| "learning_rate": 0.00019907952466170138, | |
| "loss": 1.5132, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 0.7211467253555573, | |
| "learning_rate": 0.00019899408703314385, | |
| "loss": 1.506, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 0.7006890038987398, | |
| "learning_rate": 0.0001989048782697851, | |
| "loss": 1.4498, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 0.64642158324997, | |
| "learning_rate": 0.00019881190176999255, | |
| "loss": 1.4478, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 0.6608085069521318, | |
| "learning_rate": 0.00019871516107566366, | |
| "loss": 1.3542, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 0.7707478188072372, | |
| "learning_rate": 0.0001986146598720913, | |
| "loss": 1.3309, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 0.8119298049916807, | |
| "learning_rate": 0.00019851040198782326, | |
| "loss": 1.345, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 0.7712308653234212, | |
| "learning_rate": 0.0001984023913945162, | |
| "loss": 1.3076, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 0.682341709525683, | |
| "learning_rate": 0.0001982906322067847, | |
| "loss": 1.2565, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 0.7071991083514119, | |
| "learning_rate": 0.00019817512868204425, | |
| "loss": 1.1796, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 0.745222014713615, | |
| "learning_rate": 0.00019805588522034916, | |
| "loss": 1.1649, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 0.7158459299510994, | |
| "learning_rate": 0.00019793290636422505, | |
| "loss": 1.2109, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 0.7335821144549012, | |
| "learning_rate": 0.00019780619679849552, | |
| "loss": 1.1475, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 0.7804306024320766, | |
| "learning_rate": 0.000197675761350104, | |
| "loss": 1.1068, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 0.8274924156959725, | |
| "learning_rate": 0.00019754160498792965, | |
| "loss": 1.1839, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 0.8840482383868431, | |
| "learning_rate": 0.0001974037328225982, | |
| "loss": 1.0928, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 0.7224652999279871, | |
| "learning_rate": 0.00019726215010628718, | |
| "loss": 1.0299, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 0.7109288879933862, | |
| "learning_rate": 0.0001971168622325259, | |
| "loss": 1.0436, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 0.7650325966583326, | |
| "learning_rate": 0.00019696787473598993, | |
| "loss": 1.041, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 0.7307809391946058, | |
| "learning_rate": 0.00019681519329229033, | |
| "loss": 1.0195, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 0.6873943623441443, | |
| "learning_rate": 0.00019665882371775733, | |
| "loss": 0.972, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 0.8185924734616268, | |
| "learning_rate": 0.00019649877196921896, | |
| "loss": 0.9986, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 0.7907558585543373, | |
| "learning_rate": 0.00019633504414377388, | |
| "loss": 0.9201, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 0.7216280408288712, | |
| "learning_rate": 0.00019616764647855926, | |
| "loss": 0.9976, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 0.6946470891456141, | |
| "learning_rate": 0.00019599658535051314, | |
| "loss": 0.9008, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 0.6470248283451219, | |
| "learning_rate": 0.00019582186727613152, | |
| "loss": 0.8226, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 0.8297915622585336, | |
| "learning_rate": 0.00019564349891122018, | |
| "loss": 0.8825, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 0.7018515834126928, | |
| "learning_rate": 0.00019546148705064097, | |
| "loss": 0.8521, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 0.6119835758734723, | |
| "learning_rate": 0.00019527583862805303, | |
| "loss": 0.7872, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 0.6396036538427098, | |
| "learning_rate": 0.00019508656071564882, | |
| "loss": 0.7887, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 0.6712059239435435, | |
| "learning_rate": 0.00019489366052388441, | |
| "loss": 0.8406, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 0.6498227189328728, | |
| "learning_rate": 0.00019469714540120507, | |
| "loss": 0.7109, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 0.6950957852561941, | |
| "learning_rate": 0.00019449702283376517, | |
| "loss": 0.7008, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 0.6415745385783075, | |
| "learning_rate": 0.00019429330044514305, | |
| "loss": 0.6808, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 0.6774461765802887, | |
| "learning_rate": 0.0001940859859960506, | |
| "loss": 0.7122, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 0.6335543398879422, | |
| "learning_rate": 0.00019387508738403768, | |
| "loss": 0.6826, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 0.6455659601218003, | |
| "learning_rate": 0.0001936606126431911, | |
| "loss": 0.7342, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 0.6804108080708727, | |
| "learning_rate": 0.00019344256994382878, | |
| "loss": 0.6983, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 0.6233570198373359, | |
| "learning_rate": 0.00019322096759218836, | |
| "loss": 0.6426, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 0.6354196060962453, | |
| "learning_rate": 0.00019299581403011082, | |
| "loss": 0.6978, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 0.6723728632702363, | |
| "learning_rate": 0.0001927671178347189, | |
| "loss": 0.6449, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 0.6055794839258588, | |
| "learning_rate": 0.00019253488771809024, | |
| "loss": 0.6608, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 0.6032563228830964, | |
| "learning_rate": 0.0001922991325269258, | |
| "loss": 0.6691, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 0.5917538532836075, | |
| "learning_rate": 0.00019205986124221251, | |
| "loss": 0.6418, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 0.6558132078005496, | |
| "learning_rate": 0.00019181708297888133, | |
| "loss": 0.6562, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 0.6110330049943966, | |
| "learning_rate": 0.00019157080698546, | |
| "loss": 0.5855, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 0.6481622083495842, | |
| "learning_rate": 0.00019132104264372063, | |
| "loss": 0.628, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 0.5730813607452849, | |
| "learning_rate": 0.0001910677994683225, | |
| "loss": 0.5476, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 0.6938507563801335, | |
| "learning_rate": 0.00019081108710644932, | |
| "loss": 0.6018, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 0.625439427503205, | |
| "learning_rate": 0.00019055091533744202, | |
| "loss": 0.5735, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 0.6628596764324554, | |
| "learning_rate": 0.00019028729407242597, | |
| "loss": 0.5389, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 0.6112099968245533, | |
| "learning_rate": 0.00019002023335393364, | |
| "loss": 0.5235, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 0.6098216223216336, | |
| "learning_rate": 0.0001897497433555218, | |
| "loss": 0.6058, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 0.6469247467013166, | |
| "learning_rate": 0.0001894758343813842, | |
| "loss": 0.5524, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 0.6344920759870597, | |
| "learning_rate": 0.00018919851686595874, | |
| "loss": 0.5605, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 0.6756355159547938, | |
| "learning_rate": 0.00018891780137353034, | |
| "loss": 0.5096, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 0.6439314455537293, | |
| "learning_rate": 0.00018863369859782825, | |
| "loss": 0.5516, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 0.5567728554741562, | |
| "learning_rate": 0.0001883462193616187, | |
| "loss": 0.4576, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 0.553595533418767, | |
| "learning_rate": 0.00018805537461629265, | |
| "loss": 0.4947, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 0.6200223910647112, | |
| "learning_rate": 0.00018776117544144863, | |
| "loss": 0.5073, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 0.6294322114297511, | |
| "learning_rate": 0.00018746363304447073, | |
| "loss": 0.4938, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 0.6000145257745209, | |
| "learning_rate": 0.00018716275876010135, | |
| "loss": 0.473, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 0.5927861897994469, | |
| "learning_rate": 0.00018685856405000983, | |
| "loss": 0.4724, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "eval_blimp_filtered_avg": 0.7155223880597015, | |
| "eval_blimp_filtered_std": 0.005000433138834185, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "eval_blimp_supplement_avg": 0.8405172413793104, | |
| "eval_blimp_supplement_std": 0.016486001732879434, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "eval_vqa_filtered_avg": 0.52, | |
| "eval_vqa_filtered_std": 0.05021167315686779, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "eval_winoground_filtered_avg": 0.64, | |
| "eval_winoground_filtered_std": 0.04824181513244218, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 0.5504516732077648, | |
| "learning_rate": 0.00018655106050235548, | |
| "loss": 0.4393, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "grad_norm": 0.5801589113252366, | |
| "learning_rate": 0.00018624025983134644, | |
| "loss": 0.468, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "grad_norm": 0.5273944337529535, | |
| "learning_rate": 0.00018592617387679306, | |
| "loss": 0.439, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 0.99, | |
| "grad_norm": 0.508609381383424, | |
| "learning_rate": 0.00018560881460365724, | |
| "loss": 0.4272, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 0.5396859577867195, | |
| "learning_rate": 0.0001852881941015964, | |
| "loss": 0.4362, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 1.01, | |
| "grad_norm": 0.5122858999271028, | |
| "learning_rate": 0.00018496432458450294, | |
| "loss": 0.3893, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 1.02, | |
| "grad_norm": 0.49626561438760436, | |
| "learning_rate": 0.00018463721839003915, | |
| "loss": 0.3498, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 1.03, | |
| "grad_norm": 0.48748413013987063, | |
| "learning_rate": 0.000184306887979167, | |
| "loss": 0.3256, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 1.04, | |
| "grad_norm": 0.5310280563857716, | |
| "learning_rate": 0.00018397334593567348, | |
| "loss": 0.3225, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 1.05, | |
| "grad_norm": 0.6232514021230662, | |
| "learning_rate": 0.00018363660496569127, | |
| "loss": 0.3489, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 1.05, | |
| "grad_norm": 0.5274577320762, | |
| "learning_rate": 0.00018329667789721485, | |
| "loss": 0.3123, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 1.06, | |
| "grad_norm": 0.5096311315676365, | |
| "learning_rate": 0.00018295357767961144, | |
| "loss": 0.3325, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 1.07, | |
| "grad_norm": 0.4613577097438129, | |
| "learning_rate": 0.00018260731738312818, | |
| "loss": 0.2936, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 1.08, | |
| "grad_norm": 0.4997938044342101, | |
| "learning_rate": 0.00018225791019839375, | |
| "loss": 0.3351, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 1.09, | |
| "grad_norm": 0.538085494988463, | |
| "learning_rate": 0.00018190536943591624, | |
| "loss": 0.329, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 1.1, | |
| "grad_norm": 0.5567068979809859, | |
| "learning_rate": 0.00018154970852557603, | |
| "loss": 0.318, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 1.11, | |
| "grad_norm": 0.5548141608588357, | |
| "learning_rate": 0.0001811909410161139, | |
| "loss": 0.3289, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "grad_norm": 0.47326466614968965, | |
| "learning_rate": 0.0001808290805746153, | |
| "loss": 0.3076, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 1.13, | |
| "grad_norm": 0.47629585466918467, | |
| "learning_rate": 0.00018046414098598948, | |
| "loss": 0.3016, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 1.14, | |
| "grad_norm": 0.44135735344426463, | |
| "learning_rate": 0.00018009613615244436, | |
| "loss": 0.2704, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 1.15, | |
| "grad_norm": 0.5127645747027901, | |
| "learning_rate": 0.000179725080092957, | |
| "loss": 0.2887, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 1.16, | |
| "grad_norm": 0.5209981172771183, | |
| "learning_rate": 0.0001793509869427395, | |
| "loss": 0.2938, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 1.17, | |
| "grad_norm": 0.5481082193558409, | |
| "learning_rate": 0.00017897387095270058, | |
| "loss": 0.3191, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 1.18, | |
| "grad_norm": 0.4770065158307258, | |
| "learning_rate": 0.0001785937464889027, | |
| "loss": 0.2795, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 1.19, | |
| "grad_norm": 0.44845204938493194, | |
| "learning_rate": 0.0001782106280320147, | |
| "loss": 0.2667, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "grad_norm": 0.47824147005907164, | |
| "learning_rate": 0.00017782453017676025, | |
| "loss": 0.267, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 1.21, | |
| "grad_norm": 0.501015317452837, | |
| "learning_rate": 0.00017743546763136187, | |
| "loss": 0.2831, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 1.22, | |
| "grad_norm": 0.5232536606095718, | |
| "learning_rate": 0.00017704345521698058, | |
| "loss": 0.2769, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 1.23, | |
| "grad_norm": 0.5495388553709665, | |
| "learning_rate": 0.00017664850786715136, | |
| "loss": 0.3031, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 1.24, | |
| "grad_norm": 0.5371555106361774, | |
| "learning_rate": 0.00017625064062721415, | |
| "loss": 0.2955, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 1.24, | |
| "grad_norm": 0.4716773551397148, | |
| "learning_rate": 0.00017584986865374082, | |
| "loss": 0.2666, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 1.25, | |
| "grad_norm": 0.5089124561646106, | |
| "learning_rate": 0.00017544620721395777, | |
| "loss": 0.3379, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 1.26, | |
| "grad_norm": 0.4715340007422714, | |
| "learning_rate": 0.00017503967168516426, | |
| "loss": 0.2771, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 1.27, | |
| "grad_norm": 0.43502563576445413, | |
| "learning_rate": 0.0001746302775541467, | |
| "loss": 0.2423, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "grad_norm": 0.4967705692007805, | |
| "learning_rate": 0.00017421804041658863, | |
| "loss": 0.2498, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 1.29, | |
| "grad_norm": 0.49127370733051945, | |
| "learning_rate": 0.00017380297597647667, | |
| "loss": 0.2616, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 1.3, | |
| "grad_norm": 0.47835649282708265, | |
| "learning_rate": 0.00017338510004550223, | |
| "loss": 0.241, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 1.31, | |
| "grad_norm": 0.4843464174553606, | |
| "learning_rate": 0.00017296442854245915, | |
| "loss": 0.2458, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 1.32, | |
| "grad_norm": 0.5209405133977896, | |
| "learning_rate": 0.00017254097749263734, | |
| "loss": 0.2452, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 1.33, | |
| "grad_norm": 0.4709574288825739, | |
| "learning_rate": 0.0001721147630272123, | |
| "loss": 0.2627, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 1.34, | |
| "grad_norm": 0.4752105435022234, | |
| "learning_rate": 0.00017168580138263062, | |
| "loss": 0.2527, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 1.35, | |
| "grad_norm": 0.48781843284289905, | |
| "learning_rate": 0.00017125410889999134, | |
| "loss": 0.2356, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 1.36, | |
| "grad_norm": 0.5731736183258567, | |
| "learning_rate": 0.00017081970202442362, | |
| "loss": 0.2668, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 1.37, | |
| "grad_norm": 0.48105126464697834, | |
| "learning_rate": 0.0001703825973044602, | |
| "loss": 0.2454, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 1.38, | |
| "grad_norm": 0.5280645599674879, | |
| "learning_rate": 0.00016994281139140688, | |
| "loss": 0.2454, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 1.39, | |
| "grad_norm": 0.47876489284248624, | |
| "learning_rate": 0.0001695003610387084, | |
| "loss": 0.2463, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 1.4, | |
| "grad_norm": 0.48826354198860017, | |
| "learning_rate": 0.00016905526310130999, | |
| "loss": 0.2295, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 1.41, | |
| "grad_norm": 0.47715494831436517, | |
| "learning_rate": 0.0001686075345350156, | |
| "loss": 0.252, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 1.42, | |
| "grad_norm": 0.5152105233009641, | |
| "learning_rate": 0.0001681571923958416, | |
| "loss": 0.2771, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 1.43, | |
| "grad_norm": 0.4990883717055415, | |
| "learning_rate": 0.00016770425383936735, | |
| "loss": 0.2497, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 1.43, | |
| "grad_norm": 0.4674093996422124, | |
| "learning_rate": 0.00016724873612008155, | |
| "loss": 0.2441, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "grad_norm": 0.4432102664091143, | |
| "learning_rate": 0.00016679065659072487, | |
| "loss": 0.2418, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 1.45, | |
| "grad_norm": 0.4677926556162063, | |
| "learning_rate": 0.00016633003270162902, | |
| "loss": 0.2483, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 1.46, | |
| "grad_norm": 0.5050389021999718, | |
| "learning_rate": 0.00016586688200005193, | |
| "loss": 0.225, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 1.47, | |
| "grad_norm": 0.538150442089787, | |
| "learning_rate": 0.00016540122212950934, | |
| "loss": 0.2629, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 1.48, | |
| "grad_norm": 0.4831894197759429, | |
| "learning_rate": 0.00016493307082910249, | |
| "loss": 0.2539, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 1.49, | |
| "grad_norm": 0.4864294249801108, | |
| "learning_rate": 0.00016446244593284277, | |
| "loss": 0.2638, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "grad_norm": 0.46236092553249764, | |
| "learning_rate": 0.00016398936536897183, | |
| "loss": 0.2255, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 1.51, | |
| "grad_norm": 0.4963120760517666, | |
| "learning_rate": 0.00016351384715927898, | |
| "loss": 0.2524, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 1.52, | |
| "grad_norm": 0.5210286477375989, | |
| "learning_rate": 0.00016303590941841458, | |
| "loss": 0.225, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 1.53, | |
| "grad_norm": 0.5288475623534257, | |
| "learning_rate": 0.0001625555703531998, | |
| "loss": 0.2428, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 1.54, | |
| "grad_norm": 0.4973215047467683, | |
| "learning_rate": 0.00016207284826193335, | |
| "loss": 0.2522, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 1.55, | |
| "grad_norm": 0.44826317640998203, | |
| "learning_rate": 0.00016158776153369402, | |
| "loss": 0.2019, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 1.56, | |
| "grad_norm": 0.45392654459830534, | |
| "learning_rate": 0.0001611003286476406, | |
| "loss": 0.2338, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 1.57, | |
| "grad_norm": 0.4430521150056381, | |
| "learning_rate": 0.00016061056817230754, | |
| "loss": 0.2273, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 1.58, | |
| "grad_norm": 0.44345119147374473, | |
| "learning_rate": 0.00016011849876489776, | |
| "loss": 0.211, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 1.59, | |
| "grad_norm": 0.4808061249544928, | |
| "learning_rate": 0.000159624139170572, | |
| "loss": 0.2104, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 0.5573402749682285, | |
| "learning_rate": 0.00015912750822173445, | |
| "loss": 0.2492, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 1.61, | |
| "grad_norm": 0.5334950652460796, | |
| "learning_rate": 0.00015862862483731574, | |
| "loss": 0.2187, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 1.62, | |
| "grad_norm": 0.49497739813798797, | |
| "learning_rate": 0.00015812750802205187, | |
| "loss": 0.2097, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 1.62, | |
| "grad_norm": 0.44446540691990566, | |
| "learning_rate": 0.00015762417686576038, | |
| "loss": 0.204, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 1.63, | |
| "grad_norm": 0.42142200135464725, | |
| "learning_rate": 0.0001571186505426132, | |
| "loss": 0.1989, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 1.64, | |
| "grad_norm": 0.4328533901196503, | |
| "learning_rate": 0.00015661094831040598, | |
| "loss": 0.2173, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 1.65, | |
| "grad_norm": 0.43093996542664664, | |
| "learning_rate": 0.00015610108950982494, | |
| "loss": 0.1865, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 1.66, | |
| "grad_norm": 0.4850613308932528, | |
| "learning_rate": 0.00015558909356370944, | |
| "loss": 0.2181, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 1.67, | |
| "grad_norm": 0.47485870685329246, | |
| "learning_rate": 0.00015507497997631266, | |
| "loss": 0.2223, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 1.68, | |
| "grad_norm": 0.42085147271583295, | |
| "learning_rate": 0.0001545587683325583, | |
| "loss": 0.1845, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 1.69, | |
| "grad_norm": 0.4479801309419239, | |
| "learning_rate": 0.00015404047829729457, | |
| "loss": 0.1987, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 1.7, | |
| "grad_norm": 0.4624584058381783, | |
| "learning_rate": 0.00015352012961454507, | |
| "loss": 0.217, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 1.71, | |
| "grad_norm": 0.44005765649196454, | |
| "learning_rate": 0.00015299774210675657, | |
| "loss": 0.1837, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 1.72, | |
| "grad_norm": 0.4508346255489124, | |
| "learning_rate": 0.00015247333567404406, | |
| "loss": 0.2007, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 1.73, | |
| "grad_norm": 0.40396006791211914, | |
| "learning_rate": 0.00015194693029343248, | |
| "loss": 0.1866, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 1.74, | |
| "grad_norm": 0.44558839018398966, | |
| "learning_rate": 0.00015141854601809581, | |
| "loss": 0.1967, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 1.75, | |
| "grad_norm": 0.4337334328022437, | |
| "learning_rate": 0.00015088820297659314, | |
| "loss": 0.1891, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "grad_norm": 0.4636781912221849, | |
| "learning_rate": 0.00015035592137210187, | |
| "loss": 0.193, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 1.77, | |
| "grad_norm": 0.47955885394967973, | |
| "learning_rate": 0.00014982172148164804, | |
| "loss": 0.1793, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 1.78, | |
| "grad_norm": 0.4721310395975314, | |
| "learning_rate": 0.00014928562365533392, | |
| "loss": 0.186, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 1.79, | |
| "grad_norm": 0.4737141537120664, | |
| "learning_rate": 0.00014874764831556285, | |
| "loss": 0.2058, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 1.8, | |
| "grad_norm": 0.40830849621087567, | |
| "learning_rate": 0.00014820781595626116, | |
| "loss": 0.1822, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 1.81, | |
| "grad_norm": 0.4272142710058541, | |
| "learning_rate": 0.0001476661471420975, | |
| "loss": 0.2057, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 1.81, | |
| "grad_norm": 0.4212227727031309, | |
| "learning_rate": 0.0001471226625076993, | |
| "loss": 0.1845, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 1.82, | |
| "grad_norm": 0.39660108389275345, | |
| "learning_rate": 0.0001465773827568671, | |
| "loss": 0.1769, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 1.83, | |
| "grad_norm": 0.38828383424285384, | |
| "learning_rate": 0.00014603032866178538, | |
| "loss": 0.1699, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 1.84, | |
| "grad_norm": 0.3681031142044674, | |
| "learning_rate": 0.00014548152106223157, | |
| "loss": 0.1456, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 1.85, | |
| "grad_norm": 0.46248659870169556, | |
| "learning_rate": 0.00014493098086478196, | |
| "loss": 0.1846, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 1.86, | |
| "grad_norm": 0.4437664820090981, | |
| "learning_rate": 0.00014437872904201542, | |
| "loss": 0.1706, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 1.87, | |
| "grad_norm": 0.4410375026146085, | |
| "learning_rate": 0.0001438247866317145, | |
| "loss": 0.1757, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 1.88, | |
| "grad_norm": 0.4290870801703047, | |
| "learning_rate": 0.00014326917473606366, | |
| "loss": 0.1777, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 1.89, | |
| "grad_norm": 0.4812130220306999, | |
| "learning_rate": 0.00014271191452084597, | |
| "loss": 0.2013, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 1.9, | |
| "grad_norm": 0.4314920290891278, | |
| "learning_rate": 0.00014215302721463623, | |
| "loss": 0.1857, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 1.9, | |
| "eval_blimp_filtered_avg": 0.7161194029850746, | |
| "eval_blimp_filtered_std": 0.005001692965803923, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 1.9, | |
| "eval_blimp_supplement_avg": 0.8211206896551724, | |
| "eval_blimp_supplement_std": 0.016785621805327337, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 1.9, | |
| "eval_vqa_filtered_avg": 0.51, | |
| "eval_vqa_filtered_std": 0.05024183937956912, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 1.9, | |
| "eval_winoground_filtered_avg": 0.62, | |
| "eval_winoground_filtered_std": 0.04878317312145633, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 1.91, | |
| "grad_norm": 0.41562514975066434, | |
| "learning_rate": 0.0001415925341079927, | |
| "loss": 0.21, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "grad_norm": 0.37833993286875955, | |
| "learning_rate": 0.00014103045655264576, | |
| "loss": 0.1659, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 1.93, | |
| "grad_norm": 0.3880529818353851, | |
| "learning_rate": 0.00014046681596068466, | |
| "loss": 0.1638, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 1.94, | |
| "grad_norm": 0.40159118156434603, | |
| "learning_rate": 0.00013990163380374194, | |
| "loss": 0.1768, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 1.95, | |
| "grad_norm": 0.4086449128732129, | |
| "learning_rate": 0.00013933493161217523, | |
| "loss": 0.1544, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 1.96, | |
| "grad_norm": 0.3808287729283849, | |
| "learning_rate": 0.0001387667309742472, | |
| "loss": 0.1366, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 1.97, | |
| "grad_norm": 0.39609061286446773, | |
| "learning_rate": 0.0001381970535353032, | |
| "loss": 0.1494, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 1.98, | |
| "grad_norm": 0.40847272653729905, | |
| "learning_rate": 0.00013762592099694665, | |
| "loss": 0.1615, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 1.99, | |
| "grad_norm": 0.4334994696681873, | |
| "learning_rate": 0.00013705335511621228, | |
| "loss": 0.1542, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 0.4546384761691546, | |
| "learning_rate": 0.00013647937770473737, | |
| "loss": 0.1834, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 0.36130610610645814, | |
| "learning_rate": 0.00013590401062793083, | |
| "loss": 0.123, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 2.01, | |
| "grad_norm": 0.29975302946848653, | |
| "learning_rate": 0.0001353272758041402, | |
| "loss": 0.0824, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 2.02, | |
| "grad_norm": 0.29392603086414587, | |
| "learning_rate": 0.00013474919520381671, | |
| "loss": 0.0836, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 2.03, | |
| "grad_norm": 0.33169221984700814, | |
| "learning_rate": 0.00013416979084867852, | |
| "loss": 0.0683, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 2.04, | |
| "grad_norm": 0.39192700338704206, | |
| "learning_rate": 0.00013358908481087134, | |
| "loss": 0.0804, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 2.05, | |
| "grad_norm": 0.42443737109460977, | |
| "learning_rate": 0.0001330070992121281, | |
| "loss": 0.0797, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 2.06, | |
| "grad_norm": 0.42848813761714244, | |
| "learning_rate": 0.00013242385622292592, | |
| "loss": 0.0776, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 2.07, | |
| "grad_norm": 0.37448633759803696, | |
| "learning_rate": 0.00013183937806164172, | |
| "loss": 0.0739, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "grad_norm": 0.3437440816482259, | |
| "learning_rate": 0.00013125368699370567, | |
| "loss": 0.0652, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 2.09, | |
| "grad_norm": 0.356415907025676, | |
| "learning_rate": 0.0001306668053307531, | |
| "loss": 0.0778, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 2.1, | |
| "grad_norm": 0.30675625825005026, | |
| "learning_rate": 0.00013007875542977448, | |
| "loss": 0.0665, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 2.11, | |
| "grad_norm": 0.29794655672460485, | |
| "learning_rate": 0.00012948955969226383, | |
| "loss": 0.0696, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 2.12, | |
| "grad_norm": 0.30163505061461343, | |
| "learning_rate": 0.00012889924056336532, | |
| "loss": 0.0705, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 2.13, | |
| "grad_norm": 0.32541739323213426, | |
| "learning_rate": 0.00012830782053101805, | |
| "loss": 0.0733, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 2.14, | |
| "grad_norm": 0.31121536090331003, | |
| "learning_rate": 0.00012771532212509974, | |
| "loss": 0.0711, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 2.15, | |
| "grad_norm": 0.34593292210442944, | |
| "learning_rate": 0.00012712176791656807, | |
| "loss": 0.0788, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 2.16, | |
| "grad_norm": 0.33946278651997686, | |
| "learning_rate": 0.0001265271805166012, | |
| "loss": 0.0677, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 2.17, | |
| "grad_norm": 0.3400898219352628, | |
| "learning_rate": 0.0001259315825757362, | |
| "loss": 0.0643, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 2.18, | |
| "grad_norm": 0.3813085350755264, | |
| "learning_rate": 0.00012533499678300618, | |
| "loss": 0.0761, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 2.19, | |
| "grad_norm": 0.3523012248149677, | |
| "learning_rate": 0.00012473744586507604, | |
| "loss": 0.0648, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 2.19, | |
| "grad_norm": 0.37842862853695125, | |
| "learning_rate": 0.00012413895258537675, | |
| "loss": 0.0812, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 2.2, | |
| "grad_norm": 0.39475455813661525, | |
| "learning_rate": 0.00012353953974323807, | |
| "loss": 0.0801, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 2.21, | |
| "grad_norm": 0.3205081471986943, | |
| "learning_rate": 0.00012293923017302002, | |
| "loss": 0.0677, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 2.22, | |
| "grad_norm": 0.31006899448135294, | |
| "learning_rate": 0.0001223380467432432, | |
| "loss": 0.07, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 2.23, | |
| "grad_norm": 0.3048520942780853, | |
| "learning_rate": 0.00012173601235571742, | |
| "loss": 0.0615, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "grad_norm": 0.3425413653893973, | |
| "learning_rate": 0.0001211331499446693, | |
| "loss": 0.0658, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 2.25, | |
| "grad_norm": 0.31929344956491607, | |
| "learning_rate": 0.00012052948247586873, | |
| "loss": 0.0653, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 2.26, | |
| "grad_norm": 0.3414359773691709, | |
| "learning_rate": 0.00011992503294575383, | |
| "loss": 0.0723, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 2.27, | |
| "grad_norm": 0.32978160245312554, | |
| "learning_rate": 0.00011931982438055505, | |
| "loss": 0.07, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 2.28, | |
| "grad_norm": 0.33271868205929617, | |
| "learning_rate": 0.00011871387983541789, | |
| "loss": 0.0672, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 2.29, | |
| "grad_norm": 0.29862145989444433, | |
| "learning_rate": 0.00011810722239352467, | |
| "loss": 0.0603, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 2.3, | |
| "grad_norm": 0.34485364985513034, | |
| "learning_rate": 0.00011749987516521523, | |
| "loss": 0.0632, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 2.31, | |
| "grad_norm": 0.3299899118013224, | |
| "learning_rate": 0.00011689186128710654, | |
| "loss": 0.0601, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 2.32, | |
| "grad_norm": 0.29635972892096896, | |
| "learning_rate": 0.00011628320392121117, | |
| "loss": 0.0558, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 2.33, | |
| "grad_norm": 0.3414458592363874, | |
| "learning_rate": 0.0001156739262540552, | |
| "loss": 0.0703, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 2.34, | |
| "grad_norm": 0.3280087622706941, | |
| "learning_rate": 0.00011506405149579468, | |
| "loss": 0.0657, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 2.35, | |
| "grad_norm": 0.373086375777386, | |
| "learning_rate": 0.00011445360287933165, | |
| "loss": 0.0668, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 2.36, | |
| "grad_norm": 0.2937645914714354, | |
| "learning_rate": 0.00011384260365942904, | |
| "loss": 0.0612, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 2.37, | |
| "grad_norm": 0.39022311054047737, | |
| "learning_rate": 0.00011323107711182473, | |
| "loss": 0.0762, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 2.38, | |
| "grad_norm": 0.3345521008714258, | |
| "learning_rate": 0.00011261904653234485, | |
| "loss": 0.0711, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 2.38, | |
| "grad_norm": 0.30608871062806836, | |
| "learning_rate": 0.00011200653523601652, | |
| "loss": 0.0617, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 2.39, | |
| "grad_norm": 0.30714147902477945, | |
| "learning_rate": 0.00011139356655617945, | |
| "loss": 0.063, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "grad_norm": 0.31051190204375445, | |
| "learning_rate": 0.00011078016384359724, | |
| "loss": 0.0659, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 2.41, | |
| "grad_norm": 0.3071085278813772, | |
| "learning_rate": 0.00011016635046556772, | |
| "loss": 0.061, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 2.42, | |
| "grad_norm": 0.3045837343462885, | |
| "learning_rate": 0.00010955214980503284, | |
| "loss": 0.0597, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 2.43, | |
| "grad_norm": 0.3049959198680976, | |
| "learning_rate": 0.00010893758525968789, | |
| "loss": 0.0587, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 2.44, | |
| "grad_norm": 0.3168437149994661, | |
| "learning_rate": 0.00010832268024109025, | |
| "loss": 0.0559, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 2.45, | |
| "grad_norm": 0.3024342626013227, | |
| "learning_rate": 0.00010770745817376742, | |
| "loss": 0.0583, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 2.46, | |
| "grad_norm": 0.3188509232471995, | |
| "learning_rate": 0.0001070919424943247, | |
| "loss": 0.061, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 2.47, | |
| "grad_norm": 0.3381945814712772, | |
| "learning_rate": 0.0001064761566505525, | |
| "loss": 0.0648, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 2.48, | |
| "grad_norm": 0.3131931451431926, | |
| "learning_rate": 0.00010586012410053292, | |
| "loss": 0.0624, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 2.49, | |
| "grad_norm": 0.32809637984753304, | |
| "learning_rate": 0.00010524386831174628, | |
| "loss": 0.0627, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "grad_norm": 0.2832796499168925, | |
| "learning_rate": 0.00010462741276017711, | |
| "loss": 0.0535, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 2.51, | |
| "grad_norm": 0.3334141162384235, | |
| "learning_rate": 0.00010401078092941971, | |
| "loss": 0.061, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 2.52, | |
| "grad_norm": 0.27653747850590626, | |
| "learning_rate": 0.00010339399630978373, | |
| "loss": 0.0497, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 2.53, | |
| "grad_norm": 0.32205480409336124, | |
| "learning_rate": 0.00010277708239739924, | |
| "loss": 0.0658, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 2.54, | |
| "grad_norm": 0.310079147965717, | |
| "learning_rate": 0.0001021600626933217, | |
| "loss": 0.0525, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 2.55, | |
| "grad_norm": 0.31094425691461797, | |
| "learning_rate": 0.00010154296070263649, | |
| "loss": 0.0619, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "grad_norm": 0.33419799536496597, | |
| "learning_rate": 0.00010092579993356386, | |
| "loss": 0.0615, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 2.57, | |
| "grad_norm": 0.3343121767672678, | |
| "learning_rate": 0.00010030860389656305, | |
| "loss": 0.0663, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 2.57, | |
| "grad_norm": 0.3516117623617434, | |
| "learning_rate": 9.969139610343696e-05, | |
| "loss": 0.0662, | |
| "step": 271 | |
| }, | |
| { | |
| "epoch": 2.58, | |
| "grad_norm": 0.31796912631433194, | |
| "learning_rate": 9.907420006643619e-05, | |
| "loss": 0.0624, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 2.59, | |
| "grad_norm": 0.29219460425245597, | |
| "learning_rate": 9.845703929736351e-05, | |
| "loss": 0.0596, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 2.6, | |
| "grad_norm": 0.316635170830544, | |
| "learning_rate": 9.783993730667831e-05, | |
| "loss": 0.0659, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 2.61, | |
| "grad_norm": 0.33766616368603597, | |
| "learning_rate": 9.722291760260077e-05, | |
| "loss": 0.0646, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 2.62, | |
| "grad_norm": 0.31287192455811574, | |
| "learning_rate": 9.66060036902163e-05, | |
| "loss": 0.0585, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 2.63, | |
| "grad_norm": 0.28964582015181484, | |
| "learning_rate": 9.598921907058033e-05, | |
| "loss": 0.0543, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 2.64, | |
| "grad_norm": 0.3037919396698326, | |
| "learning_rate": 9.53725872398229e-05, | |
| "loss": 0.0512, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 2.65, | |
| "grad_norm": 0.3229974938313004, | |
| "learning_rate": 9.475613168825374e-05, | |
| "loss": 0.0531, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 2.66, | |
| "grad_norm": 0.29881091304580676, | |
| "learning_rate": 9.413987589946711e-05, | |
| "loss": 0.0569, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 2.67, | |
| "grad_norm": 0.29692909307641674, | |
| "learning_rate": 9.352384334944753e-05, | |
| "loss": 0.0547, | |
| "step": 281 | |
| }, | |
| { | |
| "epoch": 2.68, | |
| "grad_norm": 0.33439942628885455, | |
| "learning_rate": 9.290805750567532e-05, | |
| "loss": 0.0622, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 2.69, | |
| "grad_norm": 0.2991141437988068, | |
| "learning_rate": 9.22925418262326e-05, | |
| "loss": 0.0464, | |
| "step": 283 | |
| }, | |
| { | |
| "epoch": 2.7, | |
| "grad_norm": 0.3171911760038229, | |
| "learning_rate": 9.167731975890976e-05, | |
| "loss": 0.059, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 2.71, | |
| "grad_norm": 0.30072460150102115, | |
| "learning_rate": 9.106241474031212e-05, | |
| "loss": 0.0559, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 2.72, | |
| "grad_norm": 0.3301896190647226, | |
| "learning_rate": 9.04478501949672e-05, | |
| "loss": 0.0514, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 2.73, | |
| "grad_norm": 0.3298071637508188, | |
| "learning_rate": 8.983364953443227e-05, | |
| "loss": 0.0618, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 2.74, | |
| "grad_norm": 0.3497185839244567, | |
| "learning_rate": 8.921983615640277e-05, | |
| "loss": 0.065, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 2.75, | |
| "grad_norm": 0.33084725547728233, | |
| "learning_rate": 8.860643344382056e-05, | |
| "loss": 0.0527, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 2.76, | |
| "grad_norm": 0.33012822636415956, | |
| "learning_rate": 8.79934647639835e-05, | |
| "loss": 0.0666, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 2.76, | |
| "grad_norm": 0.3151687548518561, | |
| "learning_rate": 8.738095346765518e-05, | |
| "loss": 0.0573, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 2.77, | |
| "grad_norm": 0.30346203875619676, | |
| "learning_rate": 8.676892288817531e-05, | |
| "loss": 0.0491, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 2.78, | |
| "grad_norm": 0.3133369298353677, | |
| "learning_rate": 8.615739634057098e-05, | |
| "loss": 0.0595, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 2.79, | |
| "grad_norm": 0.28715782085999497, | |
| "learning_rate": 8.554639712066836e-05, | |
| "loss": 0.0542, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 2.8, | |
| "grad_norm": 0.2815995010771035, | |
| "learning_rate": 8.493594850420537e-05, | |
| "loss": 0.0551, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 2.81, | |
| "grad_norm": 0.280576878443274, | |
| "learning_rate": 8.432607374594484e-05, | |
| "loss": 0.0488, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 2.82, | |
| "grad_norm": 0.298809991890747, | |
| "learning_rate": 8.371679607878884e-05, | |
| "loss": 0.0544, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 2.83, | |
| "grad_norm": 0.30088222272143067, | |
| "learning_rate": 8.310813871289348e-05, | |
| "loss": 0.0591, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 2.84, | |
| "grad_norm": 0.3237358977236424, | |
| "learning_rate": 8.250012483478478e-05, | |
| "loss": 0.0547, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 2.85, | |
| "grad_norm": 0.34075237005827885, | |
| "learning_rate": 8.189277760647537e-05, | |
| "loss": 0.0566, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 2.85, | |
| "eval_blimp_filtered_avg": 0.7037313432835821, | |
| "eval_blimp_filtered_std": 0.005058972315437875, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 2.85, | |
| "eval_blimp_supplement_avg": 0.8103448275862069, | |
| "eval_blimp_supplement_std": 0.017321145118445798, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 2.85, | |
| "eval_vqa_filtered_avg": 0.53, | |
| "eval_vqa_filtered_std": 0.0501613558046592, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 2.85, | |
| "eval_winoground_filtered_avg": 0.68, | |
| "eval_winoground_filtered_std": 0.046882617226215034, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 2.86, | |
| "grad_norm": 0.3237263865460515, | |
| "learning_rate": 8.128612016458215e-05, | |
| "loss": 0.059, | |
| "step": 301 | |
| }, | |
| { | |
| "epoch": 2.87, | |
| "grad_norm": 0.2977357286247905, | |
| "learning_rate": 8.068017561944499e-05, | |
| "loss": 0.0492, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "grad_norm": 0.29591506818063545, | |
| "learning_rate": 8.00749670542462e-05, | |
| "loss": 0.052, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 2.89, | |
| "grad_norm": 0.2789469075911483, | |
| "learning_rate": 7.94705175241313e-05, | |
| "loss": 0.0455, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 2.9, | |
| "grad_norm": 0.2997082343784124, | |
| "learning_rate": 7.886685005533072e-05, | |
| "loss": 0.0498, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 2.91, | |
| "grad_norm": 0.30157528073661777, | |
| "learning_rate": 7.82639876442826e-05, | |
| "loss": 0.0567, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 2.92, | |
| "grad_norm": 0.32803298910194756, | |
| "learning_rate": 7.76619532567568e-05, | |
| "loss": 0.0622, | |
| "step": 307 | |
| }, | |
| { | |
| "epoch": 2.93, | |
| "grad_norm": 0.28556449374878695, | |
| "learning_rate": 7.706076982697999e-05, | |
| "loss": 0.0489, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 2.94, | |
| "grad_norm": 0.32287162854623286, | |
| "learning_rate": 7.646046025676198e-05, | |
| "loss": 0.066, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 2.95, | |
| "grad_norm": 0.3384064716667544, | |
| "learning_rate": 7.586104741462325e-05, | |
| "loss": 0.0629, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 2.95, | |
| "grad_norm": 0.3005901634146794, | |
| "learning_rate": 7.526255413492395e-05, | |
| "loss": 0.051, | |
| "step": 311 | |
| }, | |
| { | |
| "epoch": 2.96, | |
| "grad_norm": 0.2907146546357962, | |
| "learning_rate": 7.466500321699383e-05, | |
| "loss": 0.0546, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 2.97, | |
| "grad_norm": 0.30779520364750435, | |
| "learning_rate": 7.40684174242638e-05, | |
| "loss": 0.058, | |
| "step": 313 | |
| }, | |
| { | |
| "epoch": 2.98, | |
| "grad_norm": 0.29074373091101263, | |
| "learning_rate": 7.347281948339879e-05, | |
| "loss": 0.0463, | |
| "step": 314 | |
| }, | |
| { | |
| "epoch": 2.99, | |
| "grad_norm": 0.32970798475445445, | |
| "learning_rate": 7.287823208343192e-05, | |
| "loss": 0.0589, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 0.2798345327195924, | |
| "learning_rate": 7.228467787490028e-05, | |
| "loss": 0.0438, | |
| "step": 316 | |
| }, | |
| { | |
| "epoch": 3.01, | |
| "grad_norm": 0.18326848967204043, | |
| "learning_rate": 7.169217946898197e-05, | |
| "loss": 0.0225, | |
| "step": 317 | |
| }, | |
| { | |
| "epoch": 3.02, | |
| "grad_norm": 0.18022372679373735, | |
| "learning_rate": 7.110075943663472e-05, | |
| "loss": 0.0161, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 3.03, | |
| "grad_norm": 0.1633153575928502, | |
| "learning_rate": 7.051044030773618e-05, | |
| "loss": 0.0153, | |
| "step": 319 | |
| }, | |
| { | |
| "epoch": 3.04, | |
| "grad_norm": 0.17802284328446474, | |
| "learning_rate": 6.992124457022553e-05, | |
| "loss": 0.0176, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 3.05, | |
| "grad_norm": 0.17359891604740127, | |
| "learning_rate": 6.933319466924693e-05, | |
| "loss": 0.0162, | |
| "step": 321 | |
| }, | |
| { | |
| "epoch": 3.06, | |
| "grad_norm": 0.2202987501804585, | |
| "learning_rate": 6.874631300629435e-05, | |
| "loss": 0.0162, | |
| "step": 322 | |
| }, | |
| { | |
| "epoch": 3.07, | |
| "grad_norm": 0.22277821921264357, | |
| "learning_rate": 6.81606219383583e-05, | |
| "loss": 0.0187, | |
| "step": 323 | |
| }, | |
| { | |
| "epoch": 3.08, | |
| "grad_norm": 0.18724963681022663, | |
| "learning_rate": 6.757614377707409e-05, | |
| "loss": 0.0153, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 3.09, | |
| "grad_norm": 0.21995220887794256, | |
| "learning_rate": 6.699290078787193e-05, | |
| "loss": 0.0188, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 3.1, | |
| "grad_norm": 0.1967935793635855, | |
| "learning_rate": 6.641091518912867e-05, | |
| "loss": 0.0156, | |
| "step": 326 | |
| }, | |
| { | |
| "epoch": 3.11, | |
| "grad_norm": 0.20661934683104752, | |
| "learning_rate": 6.583020915132152e-05, | |
| "loss": 0.0158, | |
| "step": 327 | |
| }, | |
| { | |
| "epoch": 3.12, | |
| "grad_norm": 0.2422474266231083, | |
| "learning_rate": 6.525080479618331e-05, | |
| "loss": 0.0177, | |
| "step": 328 | |
| }, | |
| { | |
| "epoch": 3.13, | |
| "grad_norm": 0.18354685059507367, | |
| "learning_rate": 6.467272419585984e-05, | |
| "loss": 0.013, | |
| "step": 329 | |
| }, | |
| { | |
| "epoch": 3.14, | |
| "grad_norm": 0.22423754187379397, | |
| "learning_rate": 6.40959893720692e-05, | |
| "loss": 0.0188, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 3.14, | |
| "grad_norm": 0.18994008796265852, | |
| "learning_rate": 6.352062229526266e-05, | |
| "loss": 0.0132, | |
| "step": 331 | |
| }, | |
| { | |
| "epoch": 3.15, | |
| "grad_norm": 0.24715301748493912, | |
| "learning_rate": 6.294664488378776e-05, | |
| "loss": 0.015, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 3.16, | |
| "grad_norm": 0.17280498203848704, | |
| "learning_rate": 6.237407900305335e-05, | |
| "loss": 0.0138, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 3.17, | |
| "grad_norm": 0.21773200395950232, | |
| "learning_rate": 6.180294646469679e-05, | |
| "loss": 0.0155, | |
| "step": 334 | |
| }, | |
| { | |
| "epoch": 3.18, | |
| "grad_norm": 0.2144971485793242, | |
| "learning_rate": 6.123326902575282e-05, | |
| "loss": 0.0158, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 3.19, | |
| "grad_norm": 0.18331926033535073, | |
| "learning_rate": 6.06650683878248e-05, | |
| "loss": 0.013, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 3.2, | |
| "grad_norm": 0.1788180130126268, | |
| "learning_rate": 6.009836619625809e-05, | |
| "loss": 0.0133, | |
| "step": 337 | |
| }, | |
| { | |
| "epoch": 3.21, | |
| "grad_norm": 0.20337677688861636, | |
| "learning_rate": 5.953318403931532e-05, | |
| "loss": 0.0129, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 3.22, | |
| "grad_norm": 0.20853998405220736, | |
| "learning_rate": 5.896954344735426e-05, | |
| "loss": 0.0176, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 3.23, | |
| "grad_norm": 0.1919639102705018, | |
| "learning_rate": 5.840746589200732e-05, | |
| "loss": 0.0144, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 3.24, | |
| "grad_norm": 0.2134469059873606, | |
| "learning_rate": 5.784697278536379e-05, | |
| "loss": 0.0138, | |
| "step": 341 | |
| }, | |
| { | |
| "epoch": 3.25, | |
| "grad_norm": 0.18435084201272836, | |
| "learning_rate": 5.728808547915405e-05, | |
| "loss": 0.0135, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 3.26, | |
| "grad_norm": 0.19554570393158438, | |
| "learning_rate": 5.673082526393634e-05, | |
| "loss": 0.015, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 3.27, | |
| "grad_norm": 0.18522448379098544, | |
| "learning_rate": 5.617521336828556e-05, | |
| "loss": 0.0129, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 3.28, | |
| "grad_norm": 0.190207008998555, | |
| "learning_rate": 5.5621270957984573e-05, | |
| "loss": 0.0161, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 3.29, | |
| "grad_norm": 0.19594053008897275, | |
| "learning_rate": 5.506901913521808e-05, | |
| "loss": 0.0162, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 3.3, | |
| "grad_norm": 0.20111569255746164, | |
| "learning_rate": 5.451847893776845e-05, | |
| "loss": 0.0147, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 3.31, | |
| "grad_norm": 0.20867562278084897, | |
| "learning_rate": 5.396967133821461e-05, | |
| "loss": 0.0154, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 3.32, | |
| "grad_norm": 0.16028325232055693, | |
| "learning_rate": 5.342261724313292e-05, | |
| "loss": 0.0117, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 3.33, | |
| "grad_norm": 0.14992620939570764, | |
| "learning_rate": 5.28773374923007e-05, | |
| "loss": 0.0106, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 3.33, | |
| "grad_norm": 0.20669460754401175, | |
| "learning_rate": 5.2333852857902575e-05, | |
| "loss": 0.0161, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 3.34, | |
| "grad_norm": 0.21934716169620833, | |
| "learning_rate": 5.1792184043738855e-05, | |
| "loss": 0.0128, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 3.35, | |
| "grad_norm": 0.18204794157825063, | |
| "learning_rate": 5.1252351684437136e-05, | |
| "loss": 0.0129, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 3.36, | |
| "grad_norm": 0.21363608639584963, | |
| "learning_rate": 5.071437634466609e-05, | |
| "loss": 0.0105, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 3.37, | |
| "grad_norm": 0.15881770971724649, | |
| "learning_rate": 5.0178278518351983e-05, | |
| "loss": 0.0096, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 3.38, | |
| "grad_norm": 0.1980006966366768, | |
| "learning_rate": 4.964407862789817e-05, | |
| "loss": 0.0119, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 3.39, | |
| "grad_norm": 0.21004802159627842, | |
| "learning_rate": 4.911179702340688e-05, | |
| "loss": 0.0119, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 3.4, | |
| "grad_norm": 0.20419756258161648, | |
| "learning_rate": 4.85814539819042e-05, | |
| "loss": 0.0145, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 3.41, | |
| "grad_norm": 0.1565818058300373, | |
| "learning_rate": 4.8053069706567554e-05, | |
| "loss": 0.0105, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 3.42, | |
| "grad_norm": 0.19501698471957343, | |
| "learning_rate": 4.752666432595596e-05, | |
| "loss": 0.0126, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 3.43, | |
| "grad_norm": 0.20941486180216556, | |
| "learning_rate": 4.700225789324343e-05, | |
| "loss": 0.0105, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 3.44, | |
| "grad_norm": 0.18304197382791004, | |
| "learning_rate": 4.647987038545496e-05, | |
| "loss": 0.011, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 3.45, | |
| "grad_norm": 0.16720171411001336, | |
| "learning_rate": 4.595952170270542e-05, | |
| "loss": 0.0112, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 3.46, | |
| "grad_norm": 0.22478251297433013, | |
| "learning_rate": 4.544123166744172e-05, | |
| "loss": 0.0118, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 3.47, | |
| "grad_norm": 0.1598572948562243, | |
| "learning_rate": 4.492502002368738e-05, | |
| "loss": 0.0107, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 3.48, | |
| "grad_norm": 0.22373563049772874, | |
| "learning_rate": 4.4410906436290566e-05, | |
| "loss": 0.0104, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 3.49, | |
| "grad_norm": 0.16802667132434534, | |
| "learning_rate": 4.38989104901751e-05, | |
| "loss": 0.0114, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 3.5, | |
| "grad_norm": 0.24550738449688075, | |
| "learning_rate": 4.3389051689594e-05, | |
| "loss": 0.0121, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 3.51, | |
| "grad_norm": 0.1660066244443363, | |
| "learning_rate": 4.288134945738684e-05, | |
| "loss": 0.0099, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 3.52, | |
| "grad_norm": 0.1783889244909253, | |
| "learning_rate": 4.237582313423962e-05, | |
| "loss": 0.0094, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 3.52, | |
| "grad_norm": 0.17141038466777303, | |
| "learning_rate": 4.187249197794813e-05, | |
| "loss": 0.0095, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 3.53, | |
| "grad_norm": 0.1893721805088239, | |
| "learning_rate": 4.137137516268426e-05, | |
| "loss": 0.013, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 3.54, | |
| "grad_norm": 0.16935951673752134, | |
| "learning_rate": 4.0872491778265535e-05, | |
| "loss": 0.0091, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 3.55, | |
| "grad_norm": 0.13309068523326859, | |
| "learning_rate": 4.037586082942805e-05, | |
| "loss": 0.0091, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 3.56, | |
| "grad_norm": 0.18791651271841342, | |
| "learning_rate": 3.988150123510224e-05, | |
| "loss": 0.0121, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 3.57, | |
| "grad_norm": 0.1559825545952661, | |
| "learning_rate": 3.938943182769246e-05, | |
| "loss": 0.0102, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 3.58, | |
| "grad_norm": 0.2261919531211638, | |
| "learning_rate": 3.88996713523594e-05, | |
| "loss": 0.0127, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 3.59, | |
| "grad_norm": 0.20792420146527377, | |
| "learning_rate": 3.841223846630599e-05, | |
| "loss": 0.013, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 3.6, | |
| "grad_norm": 0.16486082885129608, | |
| "learning_rate": 3.792715173806669e-05, | |
| "loss": 0.0105, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 3.61, | |
| "grad_norm": 0.1549020176177142, | |
| "learning_rate": 3.74444296468002e-05, | |
| "loss": 0.0098, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 3.62, | |
| "grad_norm": 0.17250200199106172, | |
| "learning_rate": 3.696409058158544e-05, | |
| "loss": 0.0109, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 3.63, | |
| "grad_norm": 0.1415293330470341, | |
| "learning_rate": 3.6486152840721046e-05, | |
| "loss": 0.0084, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 3.64, | |
| "grad_norm": 0.14461810975420877, | |
| "learning_rate": 3.6010634631028226e-05, | |
| "loss": 0.0084, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 3.65, | |
| "grad_norm": 0.1557012557289619, | |
| "learning_rate": 3.553755406715724e-05, | |
| "loss": 0.0089, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 3.66, | |
| "grad_norm": 0.15752891661687976, | |
| "learning_rate": 3.506692917089751e-05, | |
| "loss": 0.0109, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 3.67, | |
| "grad_norm": 0.1694876915505117, | |
| "learning_rate": 3.459877787049072e-05, | |
| "loss": 0.009, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 3.68, | |
| "grad_norm": 0.1582663784415179, | |
| "learning_rate": 3.413311799994808e-05, | |
| "loss": 0.0095, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 3.69, | |
| "grad_norm": 0.13693031068741818, | |
| "learning_rate": 3.366996729837102e-05, | |
| "loss": 0.0092, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 3.7, | |
| "grad_norm": 0.14543112940410688, | |
| "learning_rate": 3.320934340927513e-05, | |
| "loss": 0.0108, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 3.71, | |
| "grad_norm": 0.19389482832864774, | |
| "learning_rate": 3.275126387991847e-05, | |
| "loss": 0.0098, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 3.71, | |
| "grad_norm": 0.15797165592004603, | |
| "learning_rate": 3.229574616063268e-05, | |
| "loss": 0.0076, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 3.72, | |
| "grad_norm": 0.21281942854700847, | |
| "learning_rate": 3.184280760415843e-05, | |
| "loss": 0.0142, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 3.73, | |
| "grad_norm": 0.12498130411986656, | |
| "learning_rate": 3.1392465464984455e-05, | |
| "loss": 0.0081, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 3.74, | |
| "grad_norm": 0.1152125429659436, | |
| "learning_rate": 3.094473689869002e-05, | |
| "loss": 0.0058, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 3.75, | |
| "grad_norm": 0.1567733530080216, | |
| "learning_rate": 3.0499638961291623e-05, | |
| "loss": 0.011, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 3.76, | |
| "grad_norm": 0.14500898906990572, | |
| "learning_rate": 3.0057188608593147e-05, | |
| "loss": 0.0085, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 3.77, | |
| "grad_norm": 0.16163974543952728, | |
| "learning_rate": 2.9617402695539808e-05, | |
| "loss": 0.013, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 3.78, | |
| "grad_norm": 0.13868168811451842, | |
| "learning_rate": 2.9180297975576364e-05, | |
| "loss": 0.0084, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 3.79, | |
| "grad_norm": 0.17847032901949134, | |
| "learning_rate": 2.8745891100008683e-05, | |
| "loss": 0.0121, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 3.8, | |
| "grad_norm": 0.17527442252411723, | |
| "learning_rate": 2.83141986173694e-05, | |
| "loss": 0.0084, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 3.8, | |
| "eval_blimp_filtered_avg": 0.7053731343283582, | |
| "eval_blimp_filtered_std": 0.005043001462199571, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 3.8, | |
| "eval_blimp_supplement_avg": 0.8125, | |
| "eval_blimp_supplement_std": 0.01736311122127593, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 3.8, | |
| "eval_vqa_filtered_avg": 0.52, | |
| "eval_vqa_filtered_std": 0.05021167315686779, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 3.8, | |
| "eval_winoground_filtered_avg": 0.64, | |
| "eval_winoground_filtered_std": 0.048241815132442176, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 3.81, | |
| "grad_norm": 0.14598157841040266, | |
| "learning_rate": 2.788523697278773e-05, | |
| "loss": 0.0093, | |
| "step": 401 | |
| }, | |
| { | |
| "epoch": 3.82, | |
| "grad_norm": 0.20150542514971506, | |
| "learning_rate": 2.7459022507362686e-05, | |
| "loss": 0.0122, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 3.83, | |
| "grad_norm": 0.18255123614923588, | |
| "learning_rate": 2.7035571457540865e-05, | |
| "loss": 0.0103, | |
| "step": 403 | |
| }, | |
| { | |
| "epoch": 3.84, | |
| "grad_norm": 0.16704045474943452, | |
| "learning_rate": 2.6614899954497795e-05, | |
| "loss": 0.0114, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 3.85, | |
| "grad_norm": 0.14683721625679494, | |
| "learning_rate": 2.619702402352332e-05, | |
| "loss": 0.01, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 3.86, | |
| "grad_norm": 0.18144743721435366, | |
| "learning_rate": 2.5781959583411374e-05, | |
| "loss": 0.0129, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 3.87, | |
| "grad_norm": 0.19646570441433073, | |
| "learning_rate": 2.5369722445853304e-05, | |
| "loss": 0.0143, | |
| "step": 407 | |
| }, | |
| { | |
| "epoch": 3.88, | |
| "grad_norm": 0.1668088181727681, | |
| "learning_rate": 2.4960328314835745e-05, | |
| "loss": 0.0089, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 3.89, | |
| "grad_norm": 0.16111476451284476, | |
| "learning_rate": 2.4553792786042262e-05, | |
| "loss": 0.0091, | |
| "step": 409 | |
| }, | |
| { | |
| "epoch": 3.9, | |
| "grad_norm": 0.17729690845562673, | |
| "learning_rate": 2.4150131346259197e-05, | |
| "loss": 0.0103, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 3.9, | |
| "grad_norm": 0.15155895346947004, | |
| "learning_rate": 2.3749359372785883e-05, | |
| "loss": 0.0096, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 3.91, | |
| "grad_norm": 0.15041370885333255, | |
| "learning_rate": 2.3351492132848664e-05, | |
| "loss": 0.0085, | |
| "step": 412 | |
| }, | |
| { | |
| "epoch": 3.92, | |
| "grad_norm": 0.12197907148956355, | |
| "learning_rate": 2.2956544783019418e-05, | |
| "loss": 0.0067, | |
| "step": 413 | |
| }, | |
| { | |
| "epoch": 3.93, | |
| "grad_norm": 0.1788434056496877, | |
| "learning_rate": 2.2564532368638146e-05, | |
| "loss": 0.01, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 3.94, | |
| "grad_norm": 0.19269466130772045, | |
| "learning_rate": 2.2175469823239768e-05, | |
| "loss": 0.0117, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 3.95, | |
| "grad_norm": 0.15780826445252463, | |
| "learning_rate": 2.1789371967985338e-05, | |
| "loss": 0.0101, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 3.96, | |
| "grad_norm": 0.19229144408434373, | |
| "learning_rate": 2.140625351109733e-05, | |
| "loss": 0.0084, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 3.97, | |
| "grad_norm": 0.15474486143047034, | |
| "learning_rate": 2.1026129047299436e-05, | |
| "loss": 0.0067, | |
| "step": 418 | |
| }, | |
| { | |
| "epoch": 3.98, | |
| "grad_norm": 0.15864166155594778, | |
| "learning_rate": 2.0649013057260546e-05, | |
| "loss": 0.0098, | |
| "step": 419 | |
| }, | |
| { | |
| "epoch": 3.99, | |
| "grad_norm": 0.22515244613844015, | |
| "learning_rate": 2.0274919907043033e-05, | |
| "loss": 0.0094, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "grad_norm": 0.18684872878382638, | |
| "learning_rate": 1.9903863847555648e-05, | |
| "loss": 0.0127, | |
| "step": 421 | |
| }, | |
| { | |
| "epoch": 4.01, | |
| "grad_norm": 0.06270483785922072, | |
| "learning_rate": 1.9535859014010526e-05, | |
| "loss": 0.0028, | |
| "step": 422 | |
| }, | |
| { | |
| "epoch": 4.02, | |
| "grad_norm": 0.09948637260912774, | |
| "learning_rate": 1.917091942538469e-05, | |
| "loss": 0.0037, | |
| "step": 423 | |
| }, | |
| { | |
| "epoch": 4.03, | |
| "grad_norm": 0.07530065845248647, | |
| "learning_rate": 1.880905898388612e-05, | |
| "loss": 0.0039, | |
| "step": 424 | |
| }, | |
| { | |
| "epoch": 4.04, | |
| "grad_norm": 0.054461890750773165, | |
| "learning_rate": 1.8450291474423998e-05, | |
| "loss": 0.0025, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 4.05, | |
| "grad_norm": 0.08002877578075594, | |
| "learning_rate": 1.8094630564083736e-05, | |
| "loss": 0.0035, | |
| "step": 426 | |
| }, | |
| { | |
| "epoch": 4.06, | |
| "grad_norm": 0.05746226463965698, | |
| "learning_rate": 1.7742089801606276e-05, | |
| "loss": 0.0025, | |
| "step": 427 | |
| }, | |
| { | |
| "epoch": 4.07, | |
| "grad_norm": 0.0633358139605444, | |
| "learning_rate": 1.7392682616871837e-05, | |
| "loss": 0.0027, | |
| "step": 428 | |
| }, | |
| { | |
| "epoch": 4.08, | |
| "grad_norm": 0.06509683268742919, | |
| "learning_rate": 1.7046422320388556e-05, | |
| "loss": 0.0027, | |
| "step": 429 | |
| }, | |
| { | |
| "epoch": 4.09, | |
| "grad_norm": 0.054571154616853274, | |
| "learning_rate": 1.6703322102785168e-05, | |
| "loss": 0.0026, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 4.1, | |
| "grad_norm": 0.06888564779650448, | |
| "learning_rate": 1.6363395034308703e-05, | |
| "loss": 0.0027, | |
| "step": 431 | |
| }, | |
| { | |
| "epoch": 4.1, | |
| "grad_norm": 0.05307117129834359, | |
| "learning_rate": 1.6026654064326553e-05, | |
| "loss": 0.0025, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 4.11, | |
| "grad_norm": 0.06598879328529111, | |
| "learning_rate": 1.5693112020833013e-05, | |
| "loss": 0.003, | |
| "step": 433 | |
| }, | |
| { | |
| "epoch": 4.12, | |
| "grad_norm": 0.054752236275106794, | |
| "learning_rate": 1.5362781609960852e-05, | |
| "loss": 0.0025, | |
| "step": 434 | |
| }, | |
| { | |
| "epoch": 4.13, | |
| "grad_norm": 0.07106963888787232, | |
| "learning_rate": 1.5035675415497063e-05, | |
| "loss": 0.0031, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 4.14, | |
| "grad_norm": 0.052548572683446884, | |
| "learning_rate": 1.471180589840363e-05, | |
| "loss": 0.0025, | |
| "step": 436 | |
| }, | |
| { | |
| "epoch": 4.15, | |
| "grad_norm": 0.08828036910254508, | |
| "learning_rate": 1.4391185396342789e-05, | |
| "loss": 0.0038, | |
| "step": 437 | |
| }, | |
| { | |
| "epoch": 4.16, | |
| "grad_norm": 0.09463459893212552, | |
| "learning_rate": 1.4073826123206946e-05, | |
| "loss": 0.0038, | |
| "step": 438 | |
| }, | |
| { | |
| "epoch": 4.17, | |
| "grad_norm": 0.08002928457971342, | |
| "learning_rate": 1.375974016865359e-05, | |
| "loss": 0.0031, | |
| "step": 439 | |
| }, | |
| { | |
| "epoch": 4.18, | |
| "grad_norm": 0.07631532690730236, | |
| "learning_rate": 1.3448939497644509e-05, | |
| "loss": 0.0031, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 4.19, | |
| "grad_norm": 0.04831761603516682, | |
| "learning_rate": 1.3141435949990188e-05, | |
| "loss": 0.0027, | |
| "step": 441 | |
| }, | |
| { | |
| "epoch": 4.2, | |
| "grad_norm": 0.07344003153336562, | |
| "learning_rate": 1.2837241239898667e-05, | |
| "loss": 0.0032, | |
| "step": 442 | |
| }, | |
| { | |
| "epoch": 4.21, | |
| "grad_norm": 0.08305075630986966, | |
| "learning_rate": 1.253636695552931e-05, | |
| "loss": 0.003, | |
| "step": 443 | |
| }, | |
| { | |
| "epoch": 4.22, | |
| "grad_norm": 0.1034575433958594, | |
| "learning_rate": 1.2238824558551365e-05, | |
| "loss": 0.0039, | |
| "step": 444 | |
| }, | |
| { | |
| "epoch": 4.23, | |
| "grad_norm": 0.06655324788558148, | |
| "learning_rate": 1.1944625383707374e-05, | |
| "loss": 0.003, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 4.24, | |
| "grad_norm": 0.0790599253839735, | |
| "learning_rate": 1.1653780638381328e-05, | |
| "loss": 0.0029, | |
| "step": 446 | |
| }, | |
| { | |
| "epoch": 4.25, | |
| "grad_norm": 0.04198685628145689, | |
| "learning_rate": 1.1366301402171775e-05, | |
| "loss": 0.0017, | |
| "step": 447 | |
| }, | |
| { | |
| "epoch": 4.26, | |
| "grad_norm": 0.06439353264983554, | |
| "learning_rate": 1.1082198626469686e-05, | |
| "loss": 0.0024, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 4.27, | |
| "grad_norm": 0.07762450043477247, | |
| "learning_rate": 1.0801483134041268e-05, | |
| "loss": 0.0027, | |
| "step": 449 | |
| }, | |
| { | |
| "epoch": 4.28, | |
| "grad_norm": 0.07856883953783565, | |
| "learning_rate": 1.0524165618615845e-05, | |
| "loss": 0.0033, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 4.29, | |
| "grad_norm": 0.07929308057852809, | |
| "learning_rate": 1.0250256644478195e-05, | |
| "loss": 0.003, | |
| "step": 451 | |
| }, | |
| { | |
| "epoch": 4.29, | |
| "grad_norm": 0.0587512154822952, | |
| "learning_rate": 9.979766646066368e-06, | |
| "loss": 0.0027, | |
| "step": 452 | |
| }, | |
| { | |
| "epoch": 4.3, | |
| "grad_norm": 0.06109551507247056, | |
| "learning_rate": 9.71270592757404e-06, | |
| "loss": 0.0032, | |
| "step": 453 | |
| }, | |
| { | |
| "epoch": 4.31, | |
| "grad_norm": 0.05909029031199419, | |
| "learning_rate": 9.449084662557982e-06, | |
| "loss": 0.0026, | |
| "step": 454 | |
| }, | |
| { | |
| "epoch": 4.32, | |
| "grad_norm": 0.0814055458144323, | |
| "learning_rate": 9.188912893550695e-06, | |
| "loss": 0.0026, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 4.33, | |
| "grad_norm": 0.07735385332942207, | |
| "learning_rate": 8.932200531677537e-06, | |
| "loss": 0.0028, | |
| "step": 456 | |
| }, | |
| { | |
| "epoch": 4.34, | |
| "grad_norm": 0.08519595591969155, | |
| "learning_rate": 8.678957356279371e-06, | |
| "loss": 0.0024, | |
| "step": 457 | |
| }, | |
| { | |
| "epoch": 4.35, | |
| "grad_norm": 0.055031384326470804, | |
| "learning_rate": 8.429193014540015e-06, | |
| "loss": 0.0026, | |
| "step": 458 | |
| }, | |
| { | |
| "epoch": 4.36, | |
| "grad_norm": 0.05387324401647046, | |
| "learning_rate": 8.182917021118663e-06, | |
| "loss": 0.0026, | |
| "step": 459 | |
| }, | |
| { | |
| "epoch": 4.37, | |
| "grad_norm": 0.07168879976269556, | |
| "learning_rate": 7.940138757787507e-06, | |
| "loss": 0.0032, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 4.38, | |
| "grad_norm": 0.07661756681904786, | |
| "learning_rate": 7.700867473074224e-06, | |
| "loss": 0.0035, | |
| "step": 461 | |
| }, | |
| { | |
| "epoch": 4.39, | |
| "grad_norm": 0.09486930411075328, | |
| "learning_rate": 7.46511228190977e-06, | |
| "loss": 0.0049, | |
| "step": 462 | |
| }, | |
| { | |
| "epoch": 4.4, | |
| "grad_norm": 0.0679530025111762, | |
| "learning_rate": 7.232882165281141e-06, | |
| "loss": 0.0026, | |
| "step": 463 | |
| }, | |
| { | |
| "epoch": 4.41, | |
| "grad_norm": 0.06514922044267304, | |
| "learning_rate": 7.004185969889187e-06, | |
| "loss": 0.0027, | |
| "step": 464 | |
| }, | |
| { | |
| "epoch": 4.42, | |
| "grad_norm": 0.06706026131022384, | |
| "learning_rate": 6.7790324078116364e-06, | |
| "loss": 0.0027, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 4.43, | |
| "grad_norm": 0.07709046890424658, | |
| "learning_rate": 6.557430056171221e-06, | |
| "loss": 0.0033, | |
| "step": 466 | |
| }, | |
| { | |
| "epoch": 4.44, | |
| "grad_norm": 0.051443041020356704, | |
| "learning_rate": 6.339387356808912e-06, | |
| "loss": 0.0026, | |
| "step": 467 | |
| }, | |
| { | |
| "epoch": 4.45, | |
| "grad_norm": 0.060318722923432995, | |
| "learning_rate": 6.124912615962341e-06, | |
| "loss": 0.0028, | |
| "step": 468 | |
| }, | |
| { | |
| "epoch": 4.46, | |
| "grad_norm": 0.062212012735137795, | |
| "learning_rate": 5.9140140039494084e-06, | |
| "loss": 0.0025, | |
| "step": 469 | |
| }, | |
| { | |
| "epoch": 4.47, | |
| "grad_norm": 0.06556299474776538, | |
| "learning_rate": 5.706699554856964e-06, | |
| "loss": 0.0023, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 4.48, | |
| "grad_norm": 0.08649267044276539, | |
| "learning_rate": 5.502977166234857e-06, | |
| "loss": 0.0035, | |
| "step": 471 | |
| }, | |
| { | |
| "epoch": 4.48, | |
| "grad_norm": 0.08526822145924882, | |
| "learning_rate": 5.302854598794937e-06, | |
| "loss": 0.003, | |
| "step": 472 | |
| }, | |
| { | |
| "epoch": 4.49, | |
| "grad_norm": 0.04133711118453636, | |
| "learning_rate": 5.106339476115596e-06, | |
| "loss": 0.0019, | |
| "step": 473 | |
| }, | |
| { | |
| "epoch": 4.5, | |
| "grad_norm": 0.05708577094578342, | |
| "learning_rate": 4.913439284351207e-06, | |
| "loss": 0.0026, | |
| "step": 474 | |
| }, | |
| { | |
| "epoch": 4.51, | |
| "grad_norm": 0.07367912633186298, | |
| "learning_rate": 4.724161371946978e-06, | |
| "loss": 0.0029, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 4.52, | |
| "grad_norm": 0.08135320771271103, | |
| "learning_rate": 4.538512949359075e-06, | |
| "loss": 0.0027, | |
| "step": 476 | |
| }, | |
| { | |
| "epoch": 4.53, | |
| "grad_norm": 0.0849858165893086, | |
| "learning_rate": 4.356501088779841e-06, | |
| "loss": 0.0027, | |
| "step": 477 | |
| }, | |
| { | |
| "epoch": 4.54, | |
| "grad_norm": 0.05260609110954984, | |
| "learning_rate": 4.178132723868477e-06, | |
| "loss": 0.0019, | |
| "step": 478 | |
| }, | |
| { | |
| "epoch": 4.55, | |
| "grad_norm": 0.0795477617292828, | |
| "learning_rate": 4.003414649486892e-06, | |
| "loss": 0.0032, | |
| "step": 479 | |
| }, | |
| { | |
| "epoch": 4.56, | |
| "grad_norm": 0.08161922179718771, | |
| "learning_rate": 3.832353521440768e-06, | |
| "loss": 0.0026, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 4.57, | |
| "grad_norm": 0.06830643544893618, | |
| "learning_rate": 3.6649558562261375e-06, | |
| "loss": 0.0032, | |
| "step": 481 | |
| }, | |
| { | |
| "epoch": 4.58, | |
| "grad_norm": 0.08641205617098656, | |
| "learning_rate": 3.501228030781034e-06, | |
| "loss": 0.0028, | |
| "step": 482 | |
| }, | |
| { | |
| "epoch": 4.59, | |
| "grad_norm": 0.04921706287498077, | |
| "learning_rate": 3.341176282242653e-06, | |
| "loss": 0.0021, | |
| "step": 483 | |
| }, | |
| { | |
| "epoch": 4.6, | |
| "grad_norm": 0.05901589705081983, | |
| "learning_rate": 3.184806707709698e-06, | |
| "loss": 0.0027, | |
| "step": 484 | |
| }, | |
| { | |
| "epoch": 4.61, | |
| "grad_norm": 0.08562934355546689, | |
| "learning_rate": 3.0321252640100885e-06, | |
| "loss": 0.0035, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 4.62, | |
| "grad_norm": 0.056139936545776606, | |
| "learning_rate": 2.88313776747412e-06, | |
| "loss": 0.0027, | |
| "step": 486 | |
| }, | |
| { | |
| "epoch": 4.63, | |
| "grad_norm": 0.06574452787357139, | |
| "learning_rate": 2.7378498937128404e-06, | |
| "loss": 0.0031, | |
| "step": 487 | |
| }, | |
| { | |
| "epoch": 4.64, | |
| "grad_norm": 0.06295208396607756, | |
| "learning_rate": 2.5962671774018234e-06, | |
| "loss": 0.0029, | |
| "step": 488 | |
| }, | |
| { | |
| "epoch": 4.65, | |
| "grad_norm": 0.06348707610420529, | |
| "learning_rate": 2.458395012070369e-06, | |
| "loss": 0.0027, | |
| "step": 489 | |
| }, | |
| { | |
| "epoch": 4.66, | |
| "grad_norm": 0.06438459591992919, | |
| "learning_rate": 2.3242386498960266e-06, | |
| "loss": 0.003, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 4.67, | |
| "grad_norm": 0.0936033257355208, | |
| "learning_rate": 2.1938032015044964e-06, | |
| "loss": 0.0053, | |
| "step": 491 | |
| }, | |
| { | |
| "epoch": 4.67, | |
| "grad_norm": 0.0712704009642112, | |
| "learning_rate": 2.067093635774975e-06, | |
| "loss": 0.0033, | |
| "step": 492 | |
| }, | |
| { | |
| "epoch": 4.68, | |
| "grad_norm": 0.05278839840964536, | |
| "learning_rate": 1.9441147796508407e-06, | |
| "loss": 0.0025, | |
| "step": 493 | |
| }, | |
| { | |
| "epoch": 4.69, | |
| "grad_norm": 0.05158800004403027, | |
| "learning_rate": 1.8248713179557786e-06, | |
| "loss": 0.002, | |
| "step": 494 | |
| }, | |
| { | |
| "epoch": 4.7, | |
| "grad_norm": 0.06302315225352234, | |
| "learning_rate": 1.7093677932153218e-06, | |
| "loss": 0.002, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 4.71, | |
| "grad_norm": 0.09014451602286425, | |
| "learning_rate": 1.5976086054838025e-06, | |
| "loss": 0.0031, | |
| "step": 496 | |
| }, | |
| { | |
| "epoch": 4.72, | |
| "grad_norm": 0.08249201483869177, | |
| "learning_rate": 1.4895980121767627e-06, | |
| "loss": 0.0029, | |
| "step": 497 | |
| }, | |
| { | |
| "epoch": 4.73, | |
| "grad_norm": 0.07887788932672342, | |
| "learning_rate": 1.3853401279086854e-06, | |
| "loss": 0.0028, | |
| "step": 498 | |
| }, | |
| { | |
| "epoch": 4.74, | |
| "grad_norm": 0.09271365227044996, | |
| "learning_rate": 1.2848389243363512e-06, | |
| "loss": 0.0026, | |
| "step": 499 | |
| }, | |
| { | |
| "epoch": 4.75, | |
| "grad_norm": 0.05191622392926365, | |
| "learning_rate": 1.1880982300074838e-06, | |
| "loss": 0.0027, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 4.75, | |
| "eval_blimp_filtered_avg": 0.7105970149253731, | |
| "eval_blimp_filtered_std": 0.005015059082306442, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 4.75, | |
| "eval_blimp_supplement_avg": 0.8146551724137931, | |
| "eval_blimp_supplement_std": 0.01739418193453382, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 4.75, | |
| "eval_vqa_filtered_avg": 0.52, | |
| "eval_vqa_filtered_std": 0.05021167315686779, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 4.75, | |
| "eval_winoground_filtered_avg": 0.64, | |
| "eval_winoground_filtered_std": 0.048241815132442176, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 4.76, | |
| "grad_norm": 0.06004512680198857, | |
| "learning_rate": 1.0951217302148986e-06, | |
| "loss": 0.0021, | |
| "step": 501 | |
| }, | |
| { | |
| "epoch": 4.77, | |
| "grad_norm": 0.07576379765393293, | |
| "learning_rate": 1.0059129668561707e-06, | |
| "loss": 0.0027, | |
| "step": 502 | |
| }, | |
| { | |
| "epoch": 4.78, | |
| "grad_norm": 0.0655321501764931, | |
| "learning_rate": 9.204753382986097e-07, | |
| "loss": 0.0029, | |
| "step": 503 | |
| }, | |
| { | |
| "epoch": 4.79, | |
| "grad_norm": 0.06668565079155468, | |
| "learning_rate": 8.388120992499083e-07, | |
| "loss": 0.0024, | |
| "step": 504 | |
| }, | |
| { | |
| "epoch": 4.8, | |
| "grad_norm": 0.08295379764022878, | |
| "learning_rate": 7.609263606340622e-07, | |
| "loss": 0.003, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 4.81, | |
| "grad_norm": 0.05830372848137469, | |
| "learning_rate": 6.868210894729332e-07, | |
| "loss": 0.0027, | |
| "step": 506 | |
| }, | |
| { | |
| "epoch": 4.82, | |
| "grad_norm": 0.04555270319966449, | |
| "learning_rate": 6.164991087731831e-07, | |
| "loss": 0.0021, | |
| "step": 507 | |
| }, | |
| { | |
| "epoch": 4.83, | |
| "grad_norm": 0.057930715171302063, | |
| "learning_rate": 5.499630974187375e-07, | |
| "loss": 0.0024, | |
| "step": 508 | |
| }, | |
| { | |
| "epoch": 4.84, | |
| "grad_norm": 0.09648171217668358, | |
| "learning_rate": 4.872155900687347e-07, | |
| "loss": 0.0032, | |
| "step": 509 | |
| }, | |
| { | |
| "epoch": 4.85, | |
| "grad_norm": 0.08324119499167887, | |
| "learning_rate": 4.2825897706100235e-07, | |
| "loss": 0.0018, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 4.86, | |
| "grad_norm": 0.05280884195269513, | |
| "learning_rate": 3.7309550432090835e-07, | |
| "loss": 0.003, | |
| "step": 511 | |
| }, | |
| { | |
| "epoch": 4.86, | |
| "grad_norm": 0.06550697689715686, | |
| "learning_rate": 3.217272732759402e-07, | |
| "loss": 0.0029, | |
| "step": 512 | |
| }, | |
| { | |
| "epoch": 4.87, | |
| "grad_norm": 0.07122072726515956, | |
| "learning_rate": 2.741562407755138e-07, | |
| "loss": 0.0026, | |
| "step": 513 | |
| }, | |
| { | |
| "epoch": 4.88, | |
| "grad_norm": 0.08524060823989948, | |
| "learning_rate": 2.3038421901651064e-07, | |
| "loss": 0.0032, | |
| "step": 514 | |
| }, | |
| { | |
| "epoch": 4.89, | |
| "grad_norm": 0.06597786647502633, | |
| "learning_rate": 1.9041287547424403e-07, | |
| "loss": 0.0026, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 4.9, | |
| "grad_norm": 0.0640231658570345, | |
| "learning_rate": 1.5424373283889904e-07, | |
| "loss": 0.0025, | |
| "step": 516 | |
| }, | |
| { | |
| "epoch": 4.91, | |
| "grad_norm": 0.06369562949011548, | |
| "learning_rate": 1.2187816895752324e-07, | |
| "loss": 0.003, | |
| "step": 517 | |
| }, | |
| { | |
| "epoch": 4.92, | |
| "grad_norm": 0.05040741786604575, | |
| "learning_rate": 9.3317416781602e-08, | |
| "loss": 0.0021, | |
| "step": 518 | |
| }, | |
| { | |
| "epoch": 4.93, | |
| "grad_norm": 0.059709231647531516, | |
| "learning_rate": 6.856256432000718e-08, | |
| "loss": 0.0024, | |
| "step": 519 | |
| }, | |
| { | |
| "epoch": 4.94, | |
| "grad_norm": 0.07196915675318658, | |
| "learning_rate": 4.7614554597608105e-08, | |
| "loss": 0.0033, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 4.95, | |
| "grad_norm": 0.057403114857655216, | |
| "learning_rate": 3.047418561933357e-08, | |
| "loss": 0.002, | |
| "step": 521 | |
| }, | |
| { | |
| "epoch": 4.96, | |
| "grad_norm": 0.08027211044033893, | |
| "learning_rate": 1.7142110339740668e-08, | |
| "loss": 0.003, | |
| "step": 522 | |
| }, | |
| { | |
| "epoch": 4.97, | |
| "grad_norm": 0.09851204603686081, | |
| "learning_rate": 7.618836638190186e-09, | |
| "loss": 0.0028, | |
| "step": 523 | |
| }, | |
| { | |
| "epoch": 4.98, | |
| "grad_norm": 0.0505999817391235, | |
| "learning_rate": 1.904727299473219e-09, | |
| "loss": 0.0023, | |
| "step": 524 | |
| }, | |
| { | |
| "epoch": 4.99, | |
| "grad_norm": 0.059018226862226256, | |
| "learning_rate": 0.0, | |
| "loss": 0.0029, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 4.99, | |
| "step": 525, | |
| "total_flos": 415734656204800.0, | |
| "train_loss": 0.33918485829939266, | |
| "train_runtime": 37482.4854, | |
| "train_samples_per_second": 8.975, | |
| "train_steps_per_second": 0.014 | |
| } | |
| ], | |
| "logging_steps": 1.0, | |
| "max_steps": 525, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 5, | |
| "save_steps": 500, | |
| "total_flos": 415734656204800.0, | |
| "train_batch_size": 40, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |