AbstractPhil commited on
Commit
24e4fe0
·
verified ·
1 Parent(s): a841477

Upload weights and configs - David-partial_shared-deep_efficiency - Run 20251012_065325

Browse files
weights/David-partial_shared-deep_efficiency/20251012_065325/best_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f9941e6c55e69bf8ce1e9c17580cdff556a9b11672c77edc340cd68253d05f8
3
+ size 702105004
weights/David-partial_shared-deep_efficiency/20251012_065325/best_model_metadata.json ADDED
@@ -0,0 +1,456 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 0,
3
+ "optimizer_state_dict": {
4
+ "state": {
5
+ "0": {
6
+ "step": "tensor(1252.)",
7
+ "exp_avg": "tensor([[ 3.2360e-04, 2.7346e-04, -1.6687e-04, ..., -2.1505e-04,\n -2.4204e-04, 2.4885e-04],\n [ 5.6676e-04, -2.0420e-04, -6.4213e-05, ..., -2.9597e-04,\n 1.4124e-04, 1.3683e-04],\n [-7.9647e-05, 2.4677e-05, 2.5698e-05, ..., -6.3598e-05,\n -6.7680e-06, 7.9479e-06],\n ...,\n [ 5.2355e-05, 1.0158e-04, -6.3774e-05, ..., -2.1613e-05,\n 4.9762e-05, -4.7784e-08],\n [ 1.7859e-04, -2.8151e-04, -3.2186e-05, ..., 1.6563e-04,\n 4.0493e-05, 1.0766e-04],\n [ 6.4692e-05, -1.7622e-04, -1.3839e-04, ..., 1.3661e-04,\n -3.8703e-04, 1.4046e-05]], device='cuda:0')",
8
+ "exp_avg_sq": "tensor([[2.9281e-07, 2.1510e-07, 1.5994e-07, ..., 1.7497e-07, 1.6714e-07,\n 1.5146e-07],\n [4.9048e-07, 4.4868e-07, 2.2952e-07, ..., 2.3536e-07, 1.9671e-07,\n 1.7949e-07],\n [2.2012e-07, 3.3196e-07, 1.2386e-07, ..., 2.1462e-07, 1.2784e-07,\n 1.1551e-07],\n ...,\n [2.7466e-07, 3.3623e-07, 1.3015e-07, ..., 1.5296e-07, 1.2514e-07,\n 1.0651e-07],\n [3.1969e-07, 2.3004e-07, 1.4582e-07, ..., 2.9085e-07, 1.8175e-07,\n 1.3265e-07],\n [2.2935e-07, 2.7262e-07, 1.6681e-07, ..., 1.8106e-07, 1.5615e-07,\n 1.3212e-07]], device='cuda:0')"
9
+ },
10
+ "1": {
11
+ "step": "tensor(1252.)",
12
+ "exp_avg": "tensor([-0.0062, 0.0129, -0.0033, ..., -0.0012, 0.0100, 0.0101],\n device='cuda:0')",
13
+ "exp_avg_sq": "tensor([0.0003, 0.0005, 0.0004, ..., 0.0003, 0.0004, 0.0004], device='cuda:0')"
14
+ },
15
+ "2": {
16
+ "step": "tensor(1252.)",
17
+ "exp_avg": "tensor([[ 4.6825e-05, 7.9989e-05, -1.3229e-05, ..., 2.2961e-04,\n -1.1310e-04, 5.4385e-05],\n [-3.4353e-06, -2.7469e-05, -5.4433e-05, ..., -2.1096e-05,\n 7.7922e-06, -2.2306e-04],\n [-2.0391e-04, -4.3801e-05, -4.1140e-07, ..., 5.8920e-07,\n 3.5899e-06, -2.3491e-05],\n ...,\n [ 6.3841e-05, -3.6618e-05, 1.2547e-05, ..., 7.7761e-06,\n -4.5621e-06, -3.5599e-06],\n [ 6.2041e-05, 3.0583e-05, -7.8144e-05, ..., 1.1888e-04,\n 1.3342e-04, 1.7825e-04],\n [ 2.2318e-04, 1.7938e-05, -4.6985e-06, ..., 2.2735e-05,\n 1.1068e-04, 1.2627e-04]], device='cuda:0')",
18
+ "exp_avg_sq": "tensor([[6.7661e-08, 3.6735e-07, 1.3594e-07, ..., 2.5964e-07, 1.7028e-07,\n 7.5712e-07],\n [1.2881e-07, 2.1535e-07, 5.9323e-08, ..., 3.6588e-08, 2.5184e-08,\n 1.7425e-07],\n [1.0874e-08, 1.3439e-08, 2.2856e-09, ..., 4.6359e-10, 5.7900e-10,\n 1.3302e-07],\n ...,\n [5.4523e-08, 1.4797e-07, 1.4317e-07, ..., 1.9136e-07, 3.2520e-08,\n 1.7944e-07],\n [5.2944e-07, 1.6817e-07, 3.3061e-07, ..., 1.6966e-07, 2.1756e-07,\n 1.7452e-07],\n [2.0124e-07, 2.4536e-07, 2.4409e-07, ..., 2.6753e-07, 1.6941e-07,\n 2.3342e-07]], device='cuda:0')"
19
+ },
20
+ "3": {
21
+ "step": "tensor(1252.)",
22
+ "exp_avg": "tensor([ 6.3095e-03, 1.0105e-02, -7.2114e-03, 2.6916e-03, 1.4540e-02,\n -8.1281e-03, -1.2252e-02, -1.8867e-02, -9.1811e-13, 2.4565e-03,\n -2.4282e-03, 2.0367e-02, -3.7201e-03, 1.8798e-02, -2.8116e-03,\n -3.9847e-02, -4.0327e-05, 3.8992e-02, 8.0075e-03, -2.4616e-02,\n -4.7554e-02, -2.3473e-04, -1.1967e-02, -3.8392e-03, -7.5564e-03,\n -2.3041e-03, -9.7437e-03, 1.0517e-02, 1.2970e-24, 2.6848e-03,\n -5.1608e-03, 5.9408e-03, 1.0477e-03, -3.2507e-03, 1.8797e-16,\n 3.8762e-02, 5.5948e-15, -1.6928e-02, 2.8906e-02, -1.4080e-02,\n -2.2503e-02, 1.9257e-03, 1.7432e-04, 5.6052e-45, 5.6052e-45,\n -2.0917e-02, -2.8884e-02, -3.8992e-03, -5.8552e-03, 3.9288e-02,\n 1.7358e-02, -3.0812e-02, 1.3196e-02, 1.8865e-02, -3.8879e-03,\n 5.6875e-03, 2.2141e-02, 6.3627e-11, -1.6639e-04, 7.2713e-08,\n 2.5927e-03, -1.8203e-03, -3.8897e-03, 2.4415e-02, -3.5754e-03,\n -4.0851e-02, 6.9345e-03, -2.1552e-08, 2.9751e-03, 3.9696e-03,\n -7.5438e-03, 6.1877e-02, 1.0226e-02, -3.3057e-02, 3.3042e-02,\n -6.6663e-03, -1.3553e-02, 3.4443e-12, 6.3441e-04, 4.0149e-02,\n 5.0423e-03, -1.7325e-02, 1.5242e-02, 2.7652e-02, -1.3957e-02,\n -7.1098e-03, -2.5046e-02, -1.2420e-02, 2.5842e-02, 3.1485e-02,\n -2.1129e-02, 4.5402e-03, 1.2060e-02, -4.6220e-03, -1.1274e-04,\n -5.6345e-04, -3.8450e-02, -2.2102e-02, 9.1533e-03, 1.5455e-02,\n 5.1599e-02, -1.6354e-02, 1.2006e-41, -1.2626e-02, 2.1031e-26,\n 4.9244e-03, 5.6052e-45, -2.5105e-04, -3.5303e-02, -2.6220e-03,\n 1.7408e-02, 4.3364e-03, -6.1681e-03, 4.7083e-02, -8.4818e-03,\n 2.9509e-02, 4.7633e-07, -3.8074e-05, 4.3689e-03, -7.0218e-03,\n -1.5689e-02, -2.0203e-02, 4.6682e-03, -3.4952e-03, -8.7931e-03,\n 1.7215e-02, 3.9927e-02, 6.4847e-03, -4.4929e-02, 1.2448e-02,\n 9.0080e-07, -2.3419e-03, 6.1351e-03, -5.5610e-02, 8.7252e-09,\n 1.4794e-02, 7.6962e-06, 5.8033e-08, 1.9340e-03, 3.1489e-02,\n 7.4275e-03, -1.8960e-02, 9.2173e-03, 3.1696e-02, 1.5664e-25,\n -8.9091e-03, -2.4165e-02, 4.4681e-03, 2.6227e-03, 2.4077e-02,\n -1.8199e-03, 1.8766e-03, -3.0567e-02, -1.8892e-02, 1.0254e-03,\n -1.9105e-02, 1.5145e-02, 1.2229e-24, -4.4243e-02, -2.1899e-02,\n -3.9118e-03, -1.8184e-02, -4.8141e-03, 1.7521e-05, 2.2574e-02,\n -1.1329e-03, -6.4991e-08, -9.4891e-03, -3.4095e-02, 4.1038e-03,\n -1.4977e-03, 3.0519e-02, -2.1678e-03, -2.5536e-02, -2.6753e-02,\n -5.3924e-03, 8.1109e-03, 3.4046e-03, -2.2300e-02, -9.7498e-03,\n -4.1122e-03, 2.0895e-02, -1.5390e-10, 4.5864e-02, 2.6596e-02,\n 1.6540e-02, -4.9707e-02, 1.9603e-03, 3.3280e-02, 5.6111e-03,\n -1.5679e-08, 7.6568e-03, -2.5600e-04, -2.1001e-02, -3.1119e-03,\n -3.0294e-02, 4.8759e-17, 2.0169e-02, 2.5225e-02, 2.1511e-02,\n 6.6674e-03, -7.5192e-04, 9.3959e-04, -2.0633e-02, -1.0759e-03,\n 1.0492e-02, -4.4686e-03, -9.6244e-03, -1.8906e-02, 1.9656e-04,\n 8.2339e-04, -1.9035e-03, 5.6052e-45, 2.4783e-02, 2.7619e-02,\n 1.6462e-04, 4.7972e-37, 3.0153e-02, 1.0774e-07, 5.6052e-45,\n -1.2876e-04, -1.2103e-02, 1.0204e-02, 8.0852e-03, 2.2690e-02,\n 5.1314e-05, 1.0505e-02, 3.3135e-03, -1.2457e-03, 3.9897e-03,\n -1.3445e-02, 2.4383e-43, -7.3454e-03, -2.3105e-02, 1.0013e-02,\n 3.8331e-24, -5.1301e-09, -4.8806e-02, -5.4689e-03, 3.3349e-02,\n -2.0689e-02, 2.5848e-02, -2.6074e-02, -1.2091e-02, 1.8388e-02,\n -8.8004e-03, 2.8824e-02, 5.2831e-03, -1.1920e-02, 1.8355e-02,\n 9.7951e-43, -6.1385e-04, -3.9222e-03, 3.9038e-03, 1.9289e-04,\n -1.7810e-02, -2.1360e-02, 5.6052e-45, -3.2697e-02, 5.6203e-08,\n -2.4920e-02, 5.6052e-45, -3.8732e-02, -2.3872e-02, 7.7504e-03,\n 2.3459e-02, -1.7773e-02, 1.1753e-02, 3.8272e-03, -8.9033e-04,\n 3.3556e-02, -2.3780e-03, -3.4198e-02, -4.0069e-02, -1.9181e-02,\n 1.5414e-44, -5.8333e-03, -1.8738e-03, 5.0300e-02, 1.2154e-02,\n 2.2529e-02, -5.4243e-02, 4.8063e-02, -9.2528e-04, -3.3071e-02,\n 1.0725e-03, 1.1701e-03, 7.5004e-03, 1.1670e-02, -1.3901e-02,\n 5.1590e-02, 5.6052e-45, -8.0554e-03, 4.5924e-02, -2.9494e-02,\n 1.9085e-02, 4.2879e-02, 5.1231e-02, -1.6516e-02, 3.8583e-05,\n 1.6455e-21, 6.1769e-03, -1.4991e-02, -3.2885e-02, 3.6586e-03,\n -7.2611e-02, -2.3407e-02, 4.8259e-03, -4.5377e-02, -2.4321e-02,\n -5.4561e-03, 1.8229e-36, -2.2144e-02, 1.4436e-02, 1.2742e-04,\n -1.5233e-02, 2.4071e-03, -1.0965e-02, 2.5808e-02, -1.2052e-02,\n -1.6745e-02, -1.1039e-02, -3.8941e-03, 2.6514e-02, 1.6604e-02,\n 1.5907e-02, 3.4279e-14, -3.7125e-03, 3.1294e-03, -1.4289e-02,\n 1.1421e-08, 2.2224e-04, -1.8152e-02, -1.1667e-05, -2.2080e-02,\n 2.5481e-03, 8.7359e-03, -1.6184e-06, 1.1210e-44, -2.1634e-02,\n 1.0344e-02, 7.3392e-03, -1.1644e-02, 5.6052e-45, -5.1584e-02,\n -1.5072e-04, 2.5573e-02, 4.1720e-35, 3.4396e-07, -3.4991e-02,\n -1.3301e-03, 5.9017e-04, -2.4378e-02, -5.0121e-02, 1.7391e-02,\n 1.6600e-02, 3.0522e-03, -5.7808e-11, -3.4560e-03, 3.3240e-41,\n -3.6787e-07, -2.5211e-02, 1.4029e-02, 1.5168e-21, -7.2827e-03,\n 3.0149e-02, -1.9312e-02, 6.7360e-03, 8.1858e-04, 5.5464e-08,\n 9.5818e-03, 2.4091e-05, 5.6052e-45, 5.6052e-45, -2.1818e-02,\n 1.1318e-02, -2.2359e-02, -1.1030e-02, 1.2218e-02, 1.8844e-02,\n -1.1539e-02, 8.8762e-03, 5.5772e-43, 7.8445e-04, -1.4140e-02,\n -5.6555e-03, 1.0241e-25, 8.6787e-03, -6.1377e-02, -1.8105e-09,\n 7.1233e-02, -5.3768e-03, -3.9863e-02, -3.7629e-03, -8.2004e-03,\n 1.5133e-02, 2.7545e-02, 8.1354e-03, -1.4285e-02, 1.1388e-03,\n 1.2611e-02, -3.9126e-03, -5.9678e-02, 5.8810e-05, -2.8542e-06,\n 2.3374e-03, -2.9888e-02, -5.4860e-03, -1.0809e-02, 8.8289e-04,\n -2.2799e-02, -1.6299e-02, -1.1042e-02, 9.1711e-03, 7.0359e-02,\n 1.1406e-02, 9.1842e-03, 1.4526e-07, 1.2878e-02, -2.7906e-03,\n -2.2023e-02, -1.6329e-06, -2.8631e-07, 1.2647e-02, 1.1436e-02,\n 2.8000e-02, -1.3175e-02, 2.1959e-22, 2.8441e-02, 4.9711e-03,\n 3.4165e-02, 2.4129e-03, 3.8593e-03, 4.3026e-03, 1.2388e-03,\n 4.0722e-03, 3.0206e-03, 1.7281e-03, 1.1913e-02, 3.3509e-03,\n -1.2070e-02, -6.9294e-03, -8.9284e-03, -1.4664e-02, 5.6052e-45,\n -3.7891e-02, 1.1406e-04, 4.2613e-22, 1.7537e-02, -2.8075e-02,\n -1.4222e-02, -1.8718e-03, 1.4716e-02, 5.6052e-45, 1.3721e-02,\n -9.3163e-03, -2.2895e-12, -6.4407e-02, 2.4566e-07, 6.2432e-03,\n -1.6119e-02, 1.4562e-02, -6.4639e-04, -1.8107e-02, 1.0461e-02,\n 1.1217e-02, 5.6052e-45, 1.7513e-03, 1.8045e-02, -5.7474e-04,\n -3.0518e-02, 2.5504e-03, -4.2295e-03, 1.7233e-02, 7.8549e-03,\n 1.8105e-02, -1.1617e-09, 6.7133e-05, -2.5239e-02, -5.1445e-03,\n -3.5988e-02, -2.7007e-02, 1.6730e-02, -1.5433e-02, -4.7194e-03,\n -1.5377e-03, 3.7160e-03, 1.6736e-02, 1.5087e-02, -1.4317e-02,\n 2.6669e-02, -6.4973e-05, 1.0112e-02, 9.9678e-03, -1.7406e-02,\n -3.2214e-02, 5.9153e-03, -1.2751e-08, -8.9147e-04, -8.1204e-03,\n -9.8581e-04, -3.7886e-02, -6.9848e-03, 7.0728e-34, -2.2390e-02,\n -2.0875e-03, 3.0616e-03, -1.0206e-02, 5.9606e-03, -1.1724e-02,\n 2.0957e-02, 1.6450e-07, 5.6881e-04, -4.8386e-03, 1.4958e-02,\n -4.1869e-04, 1.1522e-02, -8.4504e-03, -2.0420e-02, -2.0566e-02,\n 1.0246e-31, -1.3805e-02, -8.4617e-03, -3.1084e-02, 7.7151e-03,\n 1.0890e-02, -2.1323e-03, -1.3998e-02, -3.0063e-02, 3.4947e-02,\n -9.7465e-03, 8.0982e-03, 4.1236e-04, 2.0780e-06, 2.4225e-02,\n -2.1038e-02, -3.6554e-02, -7.8693e-03, 5.8824e-03, -1.3709e-02,\n 1.6772e-02, -2.1016e-03, -3.6102e-02, -2.1737e-08, -7.6063e-06,\n -1.9318e-03, 8.1567e-03, 2.0737e-02, 2.1467e-02, 1.5372e-03,\n 2.7647e-02, -1.7205e-04, -3.9141e-03, 4.8734e-12, 2.6239e-02,\n -2.2899e-02, -5.5385e-04, -2.0649e-02, 1.7950e-02, 8.5972e-04,\n 7.3162e-03, -6.1593e-03, 7.7086e-07, -2.8280e-03, 1.4817e-02,\n -2.1850e-02, -6.0554e-03, 1.0418e-02, 2.0517e-02, -2.8043e-02,\n -4.3947e-03, 2.0423e-03, -1.6967e-02, 4.3766e-02, 1.5823e-03,\n -5.4661e-02, -3.3160e-02, -1.9256e-04, 3.1671e-02, 1.6358e-02,\n 1.5966e-03, 9.8480e-03, 1.1538e-02, 4.5594e-02, 1.6002e-20,\n 1.2507e-02, 6.4729e-11, -9.9683e-03, -4.4485e-02, -1.5981e-03,\n -4.2940e-02, 5.2815e-07, 2.2432e-02, 5.6052e-45, 3.4584e-02,\n 1.6527e-14, -3.7697e-02, 3.2442e-03, 6.8584e-03, 6.4316e-10,\n 3.6284e-02, 1.3219e-03, 2.5215e-02, 6.0128e-03, -1.9469e-02,\n 3.5533e-03, -1.1232e-05, 1.0818e-02, 1.1199e-02, -7.7353e-04,\n -5.9933e-04, -3.0985e-02, 5.0710e-02, 3.3469e-39, 3.4692e-28,\n 5.6799e-02, 1.5112e-02, -1.5871e-02, 6.0281e-03, 5.6052e-45,\n 4.2577e-02, -2.3296e-02, -9.8035e-03, -9.9523e-03, 2.0622e-02,\n 9.6894e-03, -1.0142e-02, -3.6530e-04, -4.4305e-02, 5.5633e-04,\n -2.5540e-02, -1.0392e-02, 5.6052e-45, 4.0551e-02, -9.7175e-05,\n 7.1499e-02, -1.3626e-03, -1.0137e-02, -1.7970e-02, -2.8623e-02,\n 1.4694e-05, -1.5370e-09, 3.8830e-02, -8.6145e-03, 1.2965e-02,\n 2.1306e-02, 6.3871e-03, 1.0217e-04, 8.7199e-03, 1.3137e-03,\n 1.1221e-02, -7.0534e-02, 5.6052e-45, -7.3751e-03, 8.3064e-04,\n 1.3766e-03, 7.5711e-03, 8.5444e-03, -2.9912e-03, -5.1570e-03,\n -6.3520e-04, 1.0969e-38, -6.5334e-03, 1.6987e-02, 3.9340e-02,\n -1.2284e-02, 3.1037e-04, -7.3568e-03, -7.9704e-03, -5.4135e-03,\n -1.8481e-02, 4.4651e-03, 2.5858e-03, 1.1231e-02, 6.3346e-03,\n -4.4806e-03, -2.2731e-03, -3.1428e-02, 8.1347e-08, 1.0547e-10,\n -1.0057e-02, -4.4577e-02, 9.4988e-03, 7.6134e-14, -4.5697e-03,\n 5.6092e-02, -5.4494e-02, -2.1807e-02, 2.3846e-03, 2.0488e-02,\n 1.2870e-02, -3.8596e-02, 1.4518e-02, 1.2988e-03, -6.6681e-04,\n -2.1169e-05, -2.1591e-02, 1.1567e-02, 3.6024e-03, 1.5887e-03,\n 8.7326e-03, -6.3797e-04, 4.5447e-03, -1.4360e-02, 3.6341e-02,\n 8.0137e-03, 7.1405e-03, 8.1216e-03, 3.2469e-03, -1.3172e-01,\n 1.1304e-02, 6.4318e-04, 4.5953e-02, 3.9268e-02, 6.4269e-13,\n 2.3201e-03, -1.3688e-02, -3.0304e-02, -5.2764e-02, -2.5362e-02,\n -6.8679e-04, -1.3592e-02, 8.2715e-03, 2.3912e-02, -4.3248e-02,\n 1.6923e-02, 2.5448e-03, 8.2811e-07, -3.0993e-02, 5.9813e-03,\n 6.0964e-14, 8.0658e-10, -3.6148e-02, 1.6691e-02, -1.0385e-02,\n -5.6443e-03, -4.2842e-02, 4.0031e-04, -1.6079e-02, 5.2353e-04,\n 9.1052e-03, 4.2465e-02, 7.9762e-03, 3.0922e-09, 6.3492e-02,\n -3.6915e-02, -3.2362e-02, -1.1295e-08, -3.1847e-02, -2.7718e-02,\n -1.4213e-07, -9.8430e-03, 1.7578e-02, -1.0240e-03, -3.2461e-02,\n 1.6937e-02, 2.0227e-02, -9.3911e-03, -2.2609e-08, 5.6052e-45,\n -9.1615e-03, 2.8999e-03, -2.3742e-07, -2.5721e-03, 2.3479e-02,\n -2.5475e-02, 2.4535e-02, 4.3862e-02], device='cuda:0')",
23
+ "exp_avg_sq": "tensor([9.9184e-03, 3.9638e-03, 1.7345e-04, 9.9986e-03, 1.2448e-02, 6.1718e-03,\n 1.2088e-02, 1.1267e-02, 9.5139e-06, 8.4350e-03, 3.6677e-04, 9.3281e-03,\n 1.3587e-03, 1.5304e-03, 2.7651e-04, 6.5069e-03, 5.4342e-04, 9.8717e-03,\n 5.0608e-03, 9.8138e-03, 8.4615e-03, 2.5360e-05, 4.5034e-03, 1.3096e-02,\n 7.4525e-03, 3.7504e-03, 1.6901e-03, 4.9534e-03, 7.3275e-06, 1.1300e-02,\n 1.3426e-03, 7.3099e-04, 6.6785e-05, 8.4838e-03, 2.5318e-08, 9.7259e-03,\n 1.3558e-06, 5.4524e-03, 1.0203e-02, 1.2807e-02, 9.8004e-03, 4.0702e-05,\n 5.6678e-07, 6.4663e-06, 1.0984e-05, 8.8656e-03, 1.0470e-02, 3.7924e-03,\n 9.6993e-03, 6.2924e-03, 8.7966e-03, 9.3304e-03, 7.4247e-03, 9.3099e-03,\n 4.5929e-04, 2.7105e-04, 8.9152e-03, 1.1805e-05, 8.3804e-05, 2.1136e-05,\n 8.3829e-06, 5.0984e-04, 8.8572e-03, 1.4897e-02, 5.2421e-03, 1.1726e-02,\n 1.0643e-02, 1.1655e-06, 6.8796e-03, 2.3766e-04, 9.8384e-03, 1.0292e-02,\n 2.7329e-04, 1.0401e-03, 1.1615e-02, 2.8894e-04, 1.2342e-02, 6.6121e-06,\n 6.9927e-05, 1.4044e-02, 1.6560e-03, 9.7846e-03, 3.6291e-03, 9.2430e-03,\n 5.7003e-03, 2.8462e-03, 8.6408e-03, 9.8162e-03, 1.2940e-02, 8.9749e-03,\n 9.6031e-03, 8.7022e-06, 1.1764e-02, 2.1277e-03, 1.9139e-05, 1.9355e-05,\n 1.2929e-02, 1.0064e-02, 1.3028e-02, 9.7184e-03, 1.0036e-02, 1.3303e-02,\n 2.6623e-05, 1.6275e-03, 2.6933e-08, 4.3224e-03, 3.1381e-07, 9.7967e-03,\n 8.5941e-03, 2.7152e-04, 9.3152e-03, 4.1779e-04, 1.0460e-02, 1.0109e-02,\n 1.0018e-02, 8.9683e-03, 3.2559e-06, 2.9013e-07, 6.0253e-03, 1.0506e-03,\n 1.0089e-02, 4.3870e-03, 3.4034e-05, 9.5618e-03, 5.7941e-03, 1.3030e-03,\n 1.2375e-02, 1.0594e-03, 1.2641e-02, 8.8211e-03, 1.1298e-06, 7.1990e-04,\n 1.0771e-02, 1.0488e-02, 1.8391e-06, 4.1492e-03, 1.1615e-07, 2.4299e-06,\n 1.2898e-02, 1.2603e-02, 9.7322e-03, 1.1835e-02, 4.6588e-03, 1.0031e-02,\n 4.5713e-06, 2.7210e-04, 1.2151e-02, 4.7409e-04, 1.0459e-02, 1.3686e-02,\n 2.0595e-04, 5.2870e-04, 3.4539e-03, 7.1901e-04, 1.1584e-02, 1.1608e-02,\n 3.3529e-03, 5.6626e-06, 1.2413e-02, 1.0844e-02, 1.1714e-03, 8.2955e-03,\n 1.5894e-04, 2.0576e-05, 9.3551e-03, 1.9486e-04, 8.5691e-08, 1.2315e-02,\n 3.7861e-03, 4.5172e-04, 6.4066e-03, 8.5240e-03, 3.7165e-03, 1.1555e-02,\n 5.6092e-03, 1.0200e-02, 9.2765e-03, 7.2243e-04, 1.2803e-02, 6.8335e-03,\n 9.1525e-03, 1.0590e-02, 1.7816e-06, 1.1588e-02, 1.1717e-02, 7.4922e-03,\n 8.5542e-03, 1.6443e-04, 1.0988e-02, 6.8847e-04, 1.5922e-06, 5.6045e-04,\n 1.6673e-05, 1.0371e-02, 1.0418e-02, 3.8739e-03, 6.6929e-07, 1.1890e-02,\n 9.9224e-03, 3.3563e-03, 2.7792e-04, 1.1545e-02, 1.0858e-02, 1.0272e-02,\n 2.9340e-04, 1.0192e-02, 1.1927e-02, 9.2529e-03, 5.6079e-03, 1.2056e-05,\n 1.4712e-05, 1.4977e-03, 2.6436e-08, 1.1957e-02, 7.4899e-03, 3.2670e-05,\n 2.8411e-06, 1.0066e-02, 1.2196e-05, 6.1832e-10, 3.5646e-04, 8.0693e-03,\n 3.4266e-03, 1.5587e-03, 9.4824e-03, 3.7557e-06, 5.3407e-04, 2.3861e-03,\n 8.8817e-03, 3.2429e-04, 1.0358e-02, 5.5871e-06, 1.1052e-02, 1.4697e-03,\n 2.8477e-03, 7.0624e-08, 3.3241e-05, 7.5760e-03, 1.2230e-02, 9.5781e-03,\n 7.6146e-03, 9.7549e-03, 1.0588e-02, 1.0485e-02, 5.0060e-03, 1.6205e-03,\n 6.7711e-03, 4.4809e-03, 6.0491e-03, 1.1154e-02, 1.8592e-06, 5.5433e-05,\n 6.2387e-03, 7.5267e-04, 1.9140e-03, 9.8668e-03, 5.2729e-03, 1.4129e-09,\n 8.5224e-03, 4.3018e-06, 5.0917e-04, 1.1792e-06, 7.9248e-03, 1.2124e-02,\n 7.6136e-04, 5.8892e-04, 2.7933e-03, 2.6166e-03, 1.8173e-04, 1.0199e-02,\n 5.1026e-03, 8.1347e-03, 9.3416e-03, 1.0154e-02, 7.9817e-03, 6.5260e-08,\n 2.7281e-03, 9.8393e-03, 9.6065e-03, 2.0993e-04, 1.1485e-02, 8.6378e-03,\n 9.9260e-03, 1.5805e-04, 1.3715e-02, 1.9903e-04, 1.0288e-04, 1.2890e-03,\n 2.7119e-03, 5.7973e-04, 1.2280e-02, 2.2445e-06, 2.3399e-03, 2.8378e-03,\n 1.2901e-02, 1.0079e-02, 4.9145e-03, 1.2153e-02, 5.5520e-03, 8.0610e-06,\n 9.3250e-08, 4.9430e-04, 9.7800e-03, 1.0884e-02, 2.7434e-04, 1.1206e-02,\n 1.1246e-02, 8.1469e-05, 4.6692e-03, 9.3879e-03, 1.6387e-03, 3.7635e-08,\n 6.7876e-03, 1.3237e-02, 9.5950e-06, 8.6009e-03, 1.7518e-04, 1.2650e-02,\n 9.2887e-03, 1.1199e-02, 9.9179e-03, 6.6423e-03, 9.0764e-05, 1.1212e-02,\n 1.0159e-02, 9.1605e-03, 4.4815e-07, 1.2830e-04, 8.2952e-03, 1.0278e-02,\n 1.0244e-05, 9.0209e-07, 8.5716e-03, 1.2072e-05, 1.0708e-02, 3.6826e-04,\n 8.6391e-03, 1.0280e-05, 1.7550e-05, 8.0718e-03, 1.0433e-02, 9.9146e-03,\n 8.7642e-03, 2.8526e-07, 5.4229e-03, 1.2632e-04, 6.4362e-03, 1.1150e-07,\n 5.5325e-05, 8.8957e-03, 1.9004e-03, 4.4019e-05, 1.0681e-02, 1.1783e-02,\n 1.0122e-02, 1.2771e-03, 9.6328e-03, 2.9619e-09, 1.0061e-02, 2.8479e-06,\n 1.6234e-05, 1.1441e-02, 5.5538e-03, 1.8467e-08, 5.6070e-03, 9.7220e-03,\n 1.0857e-02, 1.0968e-02, 6.1836e-04, 6.5869e-09, 1.0446e-02, 3.9520e-04,\n 3.2826e-09, 1.1487e-06, 1.0872e-02, 1.5904e-03, 9.4345e-03, 1.3370e-02,\n 1.0217e-02, 7.9814e-03, 4.6065e-03, 6.7023e-03, 7.4527e-07, 9.5869e-03,\n 9.7663e-03, 4.5271e-03, 7.2334e-06, 3.6960e-03, 1.1498e-02, 1.1504e-05,\n 9.9961e-03, 1.1459e-02, 7.5040e-03, 8.7774e-04, 3.7774e-03, 1.7271e-03,\n 1.0381e-02, 8.0213e-03, 1.1501e-02, 4.2458e-03, 1.0014e-02, 1.2250e-02,\n 7.9142e-03, 1.4718e-05, 5.4960e-06, 5.2989e-04, 4.0839e-03, 1.0451e-02,\n 2.3254e-03, 6.0797e-05, 1.0781e-02, 1.1603e-02, 9.0869e-03, 2.4486e-03,\n 1.0283e-02, 1.1929e-03, 2.4817e-03, 1.1319e-05, 2.9875e-04, 1.2611e-04,\n 1.0050e-02, 2.3301e-05, 7.2324e-08, 8.6714e-03, 7.2033e-04, 9.0854e-03,\n 1.0527e-02, 3.8592e-07, 1.0223e-02, 7.8883e-04, 3.0843e-03, 4.8777e-03,\n 1.0606e-02, 8.7275e-03, 1.0082e-02, 1.4042e-02, 7.7449e-05, 2.0741e-04,\n 7.0261e-03, 3.2047e-03, 1.1550e-02, 2.5786e-03, 7.8137e-04, 6.4589e-03,\n 4.3945e-08, 8.7159e-03, 1.3333e-04, 6.6628e-07, 1.0388e-02, 9.3512e-03,\n 9.5524e-03, 9.3245e-03, 8.7068e-03, 1.1240e-07, 1.0350e-02, 1.0376e-02,\n 1.0049e-04, 1.2250e-02, 1.3410e-05, 8.4229e-03, 8.2075e-03, 1.0715e-02,\n 1.0263e-02, 1.1253e-02, 5.0521e-04, 7.6267e-03, 9.2555e-09, 7.1250e-04,\n 9.5941e-03, 5.7027e-04, 9.3679e-03, 1.5375e-03, 1.0229e-02, 6.6896e-03,\n 1.0340e-02, 2.3134e-03, 1.4731e-05, 5.0970e-06, 1.0986e-02, 1.0623e-02,\n 7.9373e-03, 1.0649e-02, 7.1226e-03, 9.6605e-03, 8.2029e-03, 2.1330e-05,\n 5.3497e-03, 5.1561e-04, 9.5778e-03, 2.9556e-03, 2.3418e-03, 8.4025e-03,\n 1.3299e-03, 9.1222e-03, 1.3303e-02, 1.4603e-02, 7.9975e-03, 6.3817e-06,\n 3.2679e-04, 9.2322e-03, 5.1355e-04, 7.0141e-03, 9.6612e-03, 3.9839e-06,\n 1.2299e-02, 1.2202e-03, 6.6041e-05, 1.0626e-02, 1.7431e-04, 7.5061e-03,\n 1.3948e-02, 3.4622e-05, 6.7070e-06, 5.8472e-03, 5.5244e-03, 2.2765e-03,\n 5.0700e-03, 6.2908e-03, 6.7274e-04, 8.3025e-03, 2.3536e-05, 1.0938e-02,\n 5.1515e-03, 1.1936e-02, 4.1714e-03, 6.0465e-04, 1.2546e-04, 3.1944e-03,\n 9.8850e-03, 1.1961e-02, 1.2060e-02, 3.1239e-03, 3.3486e-05, 1.6520e-05,\n 6.6240e-03, 6.7038e-03, 1.1650e-02, 8.1773e-03, 1.0956e-03, 7.7222e-03,\n 6.7403e-03, 4.4305e-03, 9.1532e-03, 5.8893e-06, 1.8733e-06, 8.1103e-03,\n 3.5327e-03, 8.6038e-03, 7.8190e-03, 1.3551e-02, 1.0429e-02, 1.8269e-05,\n 1.4832e-04, 1.4961e-08, 9.6145e-03, 9.7836e-03, 1.8102e-05, 9.3103e-03,\n 1.1942e-02, 1.2948e-02, 4.2080e-03, 7.0179e-03, 4.1050e-06, 1.3984e-02,\n 7.0550e-03, 7.1955e-03, 9.6193e-03, 5.6793e-04, 1.0979e-02, 5.7230e-03,\n 3.7430e-04, 6.0442e-05, 8.3886e-03, 1.0885e-02, 3.1238e-03, 7.6791e-03,\n 7.3018e-03, 1.4830e-04, 8.0569e-03, 1.3069e-02, 7.2944e-03, 2.6866e-03,\n 4.8650e-03, 3.9082e-03, 1.3331e-07, 3.6239e-03, 7.7176e-08, 8.6674e-03,\n 1.1584e-02, 3.6070e-04, 5.2625e-03, 2.0171e-05, 3.0835e-03, 1.6336e-05,\n 1.1952e-02, 1.9028e-07, 1.5203e-02, 3.9541e-04, 9.8897e-03, 9.6939e-06,\n 1.1359e-02, 1.0302e-02, 6.2645e-03, 5.9479e-04, 8.6953e-03, 1.6101e-03,\n 3.2511e-06, 8.9957e-03, 2.3874e-03, 2.0366e-04, 1.1121e-02, 1.0625e-02,\n 1.1387e-02, 2.7975e-08, 1.4052e-05, 1.2416e-02, 1.0142e-02, 8.7494e-03,\n 6.8978e-03, 2.0833e-06, 7.1099e-03, 6.9320e-03, 2.5485e-03, 9.8313e-04,\n 9.6102e-03, 9.2427e-03, 1.0953e-02, 3.4481e-05, 1.1498e-02, 1.5959e-03,\n 1.0476e-02, 9.9469e-03, 1.1401e-07, 9.7881e-03, 4.6950e-05, 1.2361e-02,\n 1.1186e-02, 5.6603e-04, 8.0328e-03, 1.1114e-02, 8.7376e-06, 1.7988e-05,\n 8.0512e-03, 1.3776e-02, 1.1091e-02, 1.0629e-02, 8.1678e-04, 4.9361e-05,\n 1.6293e-05, 5.8089e-05, 1.0597e-02, 1.8517e-03, 1.1946e-05, 2.3467e-03,\n 2.4193e-05, 1.9655e-03, 9.3234e-03, 7.9578e-03, 1.1049e-02, 4.4625e-04,\n 8.3640e-05, 1.2436e-05, 9.2785e-03, 1.1818e-02, 1.1981e-02, 9.4372e-03,\n 5.4867e-06, 1.0451e-03, 3.4060e-03, 3.2152e-03, 8.0561e-03, 1.0020e-04,\n 3.8884e-04, 9.0080e-04, 7.9726e-04, 2.1783e-03, 4.6703e-04, 9.3744e-03,\n 3.8161e-05, 1.4399e-07, 2.7210e-03, 6.5332e-03, 1.0761e-02, 1.2106e-05,\n 8.1905e-03, 9.1315e-03, 1.1335e-02, 6.6705e-03, 7.5027e-03, 6.4941e-03,\n 1.1654e-02, 1.0086e-02, 1.8824e-03, 1.1797e-04, 1.3039e-03, 9.7983e-06,\n 1.0102e-02, 3.1178e-04, 5.2086e-03, 1.0573e-02, 5.4079e-03, 1.4472e-05,\n 1.8536e-03, 1.3271e-02, 8.6851e-03, 7.9314e-04, 1.0239e-02, 1.2435e-02,\n 4.5479e-05, 2.8504e-03, 9.7193e-03, 1.0806e-03, 5.8454e-03, 1.0572e-02,\n 3.0788e-06, 1.0166e-05, 1.0514e-02, 1.1293e-02, 1.1867e-02, 1.1283e-02,\n 3.8813e-05, 8.8937e-03, 1.9452e-04, 1.1277e-02, 9.3335e-03, 8.1192e-03,\n 2.6972e-04, 8.2701e-06, 2.4193e-03, 1.5200e-04, 3.2153e-05, 2.8855e-07,\n 9.1662e-03, 9.6311e-03, 9.4919e-03, 5.9059e-05, 8.3068e-03, 3.1946e-05,\n 4.1728e-03, 1.2068e-04, 1.1547e-02, 1.1361e-02, 3.0063e-04, 1.8320e-05,\n 8.2752e-03, 5.7239e-03, 1.0008e-02, 4.2373e-05, 8.6247e-03, 9.5463e-03,\n 1.6616e-06, 8.3653e-03, 1.0384e-02, 5.8058e-05, 6.9204e-03, 1.1099e-02,\n 7.8941e-03, 1.1429e-02, 4.2850e-06, 3.9826e-06, 9.5279e-03, 4.8957e-03,\n 1.4369e-05, 8.8260e-03, 1.1724e-02, 5.2362e-03, 8.1295e-03, 1.1351e-02],\n device='cuda:0')"
24
+ },
25
+ "4": {
26
+ "step": "tensor(1252.)",
27
+ "exp_avg": "tensor([[-4.3154e-05, -1.5916e-04, -5.1215e-06, ..., 1.6921e-04,\n 1.8602e-04, 5.5249e-06],\n [ 8.7941e-06, -3.3989e-05, -5.6176e-05, ..., 2.2403e-04,\n 8.7335e-04, 2.8426e-04],\n [ 3.8337e-04, 2.8059e-04, 4.8507e-05, ..., -5.7658e-04,\n -2.7183e-04, 2.5420e-04],\n ...,\n [-1.3194e-04, 2.9983e-04, 4.1040e-05, ..., 6.6671e-04,\n 4.3644e-05, 7.6893e-04],\n [ 1.8217e-05, -3.1378e-04, 1.1447e-05, ..., -1.2026e-04,\n 2.3890e-04, -1.9527e-04],\n [-5.5710e-04, -6.6925e-04, 3.4424e-05, ..., -4.2656e-05,\n -1.9776e-04, -5.9877e-04]], device='cuda:0')",
28
+ "exp_avg_sq": "tensor([[1.0648e-06, 4.1248e-07, 6.1744e-09, ..., 4.8223e-07, 1.0498e-06,\n 1.2330e-06],\n [2.2697e-06, 1.0434e-06, 7.9740e-09, ..., 9.2273e-07, 2.2066e-06,\n 2.3521e-06],\n [2.5404e-06, 9.9041e-07, 1.0235e-08, ..., 8.7782e-07, 1.7688e-06,\n 2.5592e-06],\n ...,\n [2.5289e-06, 5.8547e-07, 8.6233e-09, ..., 1.3528e-06, 1.9141e-06,\n 3.0528e-06],\n [2.0279e-06, 7.7899e-07, 5.1655e-09, ..., 1.2277e-06, 1.9490e-06,\n 2.9510e-06],\n [2.2495e-06, 7.6231e-07, 1.7792e-08, ..., 9.8704e-07, 2.3084e-06,\n 3.1873e-06]], device='cuda:0')"
29
+ }
30
+ },
31
+ "param_groups": [
32
+ {
33
+ "lr": 0.0009755527298894294,
34
+ "name": "shared",
35
+ "betas": [
36
+ 0.9,
37
+ 0.999
38
+ ],
39
+ "eps": 1e-08,
40
+ "weight_decay": 1e-05,
41
+ "amsgrad": false,
42
+ "maximize": false,
43
+ "foreach": null,
44
+ "capturable": false,
45
+ "differentiable": false,
46
+ "fused": null,
47
+ "decoupled_weight_decay": true,
48
+ "initial_lr": 0.001,
49
+ "params": [
50
+ 0,
51
+ 1
52
+ ]
53
+ },
54
+ {
55
+ "lr": 0.0009755527298894294,
56
+ "name": "scale_256",
57
+ "betas": [
58
+ 0.9,
59
+ 0.999
60
+ ],
61
+ "eps": 1e-08,
62
+ "weight_decay": 1e-05,
63
+ "amsgrad": false,
64
+ "maximize": false,
65
+ "foreach": null,
66
+ "capturable": false,
67
+ "differentiable": false,
68
+ "fused": null,
69
+ "decoupled_weight_decay": true,
70
+ "initial_lr": 0.001,
71
+ "params": [
72
+ 2,
73
+ 3,
74
+ 4
75
+ ]
76
+ },
77
+ {
78
+ "lr": 0.0009755527298894294,
79
+ "name": "scale_512",
80
+ "betas": [
81
+ 0.9,
82
+ 0.999
83
+ ],
84
+ "eps": 1e-08,
85
+ "weight_decay": 1e-05,
86
+ "amsgrad": false,
87
+ "maximize": false,
88
+ "foreach": null,
89
+ "capturable": false,
90
+ "differentiable": false,
91
+ "fused": null,
92
+ "decoupled_weight_decay": true,
93
+ "initial_lr": 0.001,
94
+ "params": [
95
+ 5,
96
+ 6,
97
+ 7
98
+ ]
99
+ },
100
+ {
101
+ "lr": 0.0009755527298894294,
102
+ "name": "scale_768",
103
+ "betas": [
104
+ 0.9,
105
+ 0.999
106
+ ],
107
+ "eps": 1e-08,
108
+ "weight_decay": 1e-05,
109
+ "amsgrad": false,
110
+ "maximize": false,
111
+ "foreach": null,
112
+ "capturable": false,
113
+ "differentiable": false,
114
+ "fused": null,
115
+ "decoupled_weight_decay": true,
116
+ "initial_lr": 0.001,
117
+ "params": [
118
+ 8,
119
+ 9,
120
+ 10
121
+ ]
122
+ },
123
+ {
124
+ "lr": 0.0009755527298894294,
125
+ "name": "scale_1024",
126
+ "betas": [
127
+ 0.9,
128
+ 0.999
129
+ ],
130
+ "eps": 1e-08,
131
+ "weight_decay": 1e-05,
132
+ "amsgrad": false,
133
+ "maximize": false,
134
+ "foreach": null,
135
+ "capturable": false,
136
+ "differentiable": false,
137
+ "fused": null,
138
+ "decoupled_weight_decay": true,
139
+ "initial_lr": 0.001,
140
+ "params": [
141
+ 11,
142
+ 12,
143
+ 13
144
+ ]
145
+ },
146
+ {
147
+ "lr": 0.0009755527298894294,
148
+ "name": "scale_1280",
149
+ "betas": [
150
+ 0.9,
151
+ 0.999
152
+ ],
153
+ "eps": 1e-08,
154
+ "weight_decay": 1e-05,
155
+ "amsgrad": false,
156
+ "maximize": false,
157
+ "foreach": null,
158
+ "capturable": false,
159
+ "differentiable": false,
160
+ "fused": null,
161
+ "decoupled_weight_decay": true,
162
+ "initial_lr": 0.001,
163
+ "params": [
164
+ 14,
165
+ 15,
166
+ 16
167
+ ]
168
+ },
169
+ {
170
+ "lr": 0.0009755527298894294,
171
+ "name": "scale_1536",
172
+ "betas": [
173
+ 0.9,
174
+ 0.999
175
+ ],
176
+ "eps": 1e-08,
177
+ "weight_decay": 1e-05,
178
+ "amsgrad": false,
179
+ "maximize": false,
180
+ "foreach": null,
181
+ "capturable": false,
182
+ "differentiable": false,
183
+ "fused": null,
184
+ "decoupled_weight_decay": true,
185
+ "initial_lr": 0.001,
186
+ "params": [
187
+ 17,
188
+ 18,
189
+ 19
190
+ ]
191
+ },
192
+ {
193
+ "lr": 0.0009755527298894294,
194
+ "name": "scale_1792",
195
+ "betas": [
196
+ 0.9,
197
+ 0.999
198
+ ],
199
+ "eps": 1e-08,
200
+ "weight_decay": 1e-05,
201
+ "amsgrad": false,
202
+ "maximize": false,
203
+ "foreach": null,
204
+ "capturable": false,
205
+ "differentiable": false,
206
+ "fused": null,
207
+ "decoupled_weight_decay": true,
208
+ "initial_lr": 0.001,
209
+ "params": [
210
+ 20,
211
+ 21,
212
+ 22
213
+ ]
214
+ },
215
+ {
216
+ "lr": 0.0009755527298894294,
217
+ "name": "scale_2048",
218
+ "betas": [
219
+ 0.9,
220
+ 0.999
221
+ ],
222
+ "eps": 1e-08,
223
+ "weight_decay": 1e-05,
224
+ "amsgrad": false,
225
+ "maximize": false,
226
+ "foreach": null,
227
+ "capturable": false,
228
+ "differentiable": false,
229
+ "fused": null,
230
+ "decoupled_weight_decay": true,
231
+ "initial_lr": 0.001,
232
+ "params": [
233
+ 23,
234
+ 24,
235
+ 25
236
+ ]
237
+ },
238
+ {
239
+ "lr": 0.0009755527298894294,
240
+ "name": "scale_2304",
241
+ "betas": [
242
+ 0.9,
243
+ 0.999
244
+ ],
245
+ "eps": 1e-08,
246
+ "weight_decay": 1e-05,
247
+ "amsgrad": false,
248
+ "maximize": false,
249
+ "foreach": null,
250
+ "capturable": false,
251
+ "differentiable": false,
252
+ "fused": null,
253
+ "decoupled_weight_decay": true,
254
+ "initial_lr": 0.001,
255
+ "params": [
256
+ 26,
257
+ 27,
258
+ 28
259
+ ]
260
+ },
261
+ {
262
+ "lr": 0.0009755527298894294,
263
+ "name": "scale_2560",
264
+ "betas": [
265
+ 0.9,
266
+ 0.999
267
+ ],
268
+ "eps": 1e-08,
269
+ "weight_decay": 1e-05,
270
+ "amsgrad": false,
271
+ "maximize": false,
272
+ "foreach": null,
273
+ "capturable": false,
274
+ "differentiable": false,
275
+ "fused": null,
276
+ "decoupled_weight_decay": true,
277
+ "initial_lr": 0.001,
278
+ "params": [
279
+ 29,
280
+ 30,
281
+ 31
282
+ ]
283
+ },
284
+ {
285
+ "lr": 0.00048778860081564085,
286
+ "name": "fusion",
287
+ "betas": [
288
+ 0.9,
289
+ 0.999
290
+ ],
291
+ "eps": 1e-08,
292
+ "weight_decay": 1e-05,
293
+ "amsgrad": false,
294
+ "maximize": false,
295
+ "foreach": null,
296
+ "capturable": false,
297
+ "differentiable": false,
298
+ "fused": null,
299
+ "decoupled_weight_decay": true,
300
+ "initial_lr": 0.0005,
301
+ "params": [
302
+ 32,
303
+ 33,
304
+ 34,
305
+ 35,
306
+ 36,
307
+ 37,
308
+ 38,
309
+ 39,
310
+ 40,
311
+ 41,
312
+ 42,
313
+ 43,
314
+ 44,
315
+ 45,
316
+ 46,
317
+ 47,
318
+ 48,
319
+ 49,
320
+ 50,
321
+ 51,
322
+ 52,
323
+ 53,
324
+ 54,
325
+ 55,
326
+ 56,
327
+ 57,
328
+ 58,
329
+ 59,
330
+ 60,
331
+ 61,
332
+ 62,
333
+ 63,
334
+ 64,
335
+ 65,
336
+ 66,
337
+ 67,
338
+ 68,
339
+ 69,
340
+ 70,
341
+ 71,
342
+ 72,
343
+ 73,
344
+ 74,
345
+ 75,
346
+ 76,
347
+ 77,
348
+ 78,
349
+ 79,
350
+ 80,
351
+ 81,
352
+ 82,
353
+ 83,
354
+ 84,
355
+ 85
356
+ ]
357
+ }
358
+ ]
359
+ },
360
+ "scheduler_state_dict": {
361
+ "T_0": 10,
362
+ "T_i": 10,
363
+ "T_mult": 2,
364
+ "eta_min": 1e-06,
365
+ "T_cur": 1,
366
+ "base_lrs": [
367
+ 0.001,
368
+ 0.001,
369
+ 0.001,
370
+ 0.001,
371
+ 0.001,
372
+ 0.001,
373
+ 0.001,
374
+ 0.001,
375
+ 0.001,
376
+ 0.001,
377
+ 0.001,
378
+ 0.0005
379
+ ],
380
+ "last_epoch": 1,
381
+ "_step_count": 0,
382
+ "_is_initial": false,
383
+ "_get_lr_called_within_step": false,
384
+ "_last_lr": [
385
+ 0.0009755527298894294,
386
+ 0.0009755527298894294,
387
+ 0.0009755527298894294,
388
+ 0.0009755527298894294,
389
+ 0.0009755527298894294,
390
+ 0.0009755527298894294,
391
+ 0.0009755527298894294,
392
+ 0.0009755527298894294,
393
+ 0.0009755527298894294,
394
+ 0.0009755527298894294,
395
+ 0.0009755527298894294,
396
+ 0.00048778860081564085
397
+ ]
398
+ },
399
+ "metrics": {
400
+ "best_val_acc": 81.158,
401
+ "best_epoch": 0,
402
+ "scale_accuracies": {
403
+ "256": 81.158
404
+ }
405
+ },
406
+ "train_config": {
407
+ "name": "david_training",
408
+ "run_id": "20251012_065325",
409
+ "dataset_name": "AbstractPhil/imagenet-clip-features-orderly",
410
+ "model_variant": "clip_vit_l14",
411
+ "num_classes": 1000,
412
+ "preset": "clip_vit_l14_ultra_deep",
413
+ "custom_config_path": null,
414
+ "num_classes_override": null,
415
+ "use_belly_override": null,
416
+ "belly_expand_override": null,
417
+ "progressive_training_override": true,
418
+ "scale_warmup_epochs_override": null,
419
+ "num_epochs": 10,
420
+ "batch_size": 1024,
421
+ "learning_rate": 0.001,
422
+ "weight_decay": 1e-05,
423
+ "warmup_epochs": 3,
424
+ "use_rose_loss": true,
425
+ "rose_initial_weight": 0.1,
426
+ "rose_max_weight": 0.5,
427
+ "rose_weight_schedule": "adaptive",
428
+ "use_cayley_loss": false,
429
+ "cayley_weight": 0.001,
430
+ "scale_loss_balance": null,
431
+ "use_mixed_precision": false,
432
+ "gradient_clip": 10.0,
433
+ "scheduler_type": "cosine_restarts",
434
+ "min_lr": 1e-06,
435
+ "freeze_strategy": "never",
436
+ "freeze_threshold": 90.0,
437
+ "unfreeze_on_plateau": true,
438
+ "patience": 10,
439
+ "track_gradients": true,
440
+ "gradient_scale_threshold": 1e-05,
441
+ "gradient_scale_multiplier": 10.0,
442
+ "log_interval": 50,
443
+ "val_interval": 1,
444
+ "save_interval": 5,
445
+ "log_fusion_weights": true,
446
+ "log_loss_components": true,
447
+ "save_format": "safetensors",
448
+ "hf_repo": "AbstractPhil/gated-david",
449
+ "upload_to_hub": true,
450
+ "base_dir": "./david_training",
451
+ "num_workers": 10,
452
+ "pin_memory": true,
453
+ "prefetch_factor": 4,
454
+ "persistent_workers": true
455
+ }
456
+ }
weights/David-partial_shared-deep_efficiency/20251012_065325/david_config.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "david_clip_vit_l14_ultra_deep",
3
+ "uid": "c.david.clip_vit_l14_ultra_deep",
4
+ "feature_dim": 768,
5
+ "num_classes": 1000,
6
+ "scales": [
7
+ 256,
8
+ 512,
9
+ 768,
10
+ 1024,
11
+ 1280,
12
+ 1536,
13
+ 1792,
14
+ 2048,
15
+ 2304,
16
+ 2560
17
+ ],
18
+ "sharing_mode": "partial_shared",
19
+ "fusion_mode": "deep_efficiency",
20
+ "use_belly": true,
21
+ "belly_expand": 3.0,
22
+ "shared_feature_dim": 2048,
23
+ "shared_layers": 12,
24
+ "shared_dropout": 0.1,
25
+ "fusion_temperature": 1.0,
26
+ "fusion_dropout": 0.1,
27
+ "tree_depth": 3,
28
+ "num_experts": 10,
29
+ "compression_ratio": 4,
30
+ "expert_dropout": 0.1,
31
+ "attention_dropout": 0.1,
32
+ "progressive_training": true,
33
+ "scale_warmup_epochs": {
34
+ "256": 0,
35
+ "512": 1,
36
+ "768": 2,
37
+ "1024": 3,
38
+ "1280": 4,
39
+ "1536": 5,
40
+ "1792": 6,
41
+ "2048": 7,
42
+ "2304": 8,
43
+ "2560": 9
44
+ }
45
+ }
weights/David-partial_shared-deep_efficiency/20251012_065325/train_config.json ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "david_training",
3
+ "run_id": "20251012_065325",
4
+ "dataset_name": "AbstractPhil/imagenet-clip-features-orderly",
5
+ "model_variant": "clip_vit_l14",
6
+ "num_classes": 1000,
7
+ "preset": "clip_vit_l14_ultra_deep",
8
+ "custom_config_path": null,
9
+ "num_classes_override": null,
10
+ "use_belly_override": null,
11
+ "belly_expand_override": null,
12
+ "progressive_training_override": true,
13
+ "scale_warmup_epochs_override": null,
14
+ "num_epochs": 10,
15
+ "batch_size": 1024,
16
+ "learning_rate": 0.001,
17
+ "weight_decay": 1e-05,
18
+ "warmup_epochs": 3,
19
+ "use_rose_loss": true,
20
+ "rose_initial_weight": 0.1,
21
+ "rose_max_weight": 0.5,
22
+ "rose_weight_schedule": "adaptive",
23
+ "use_cayley_loss": false,
24
+ "cayley_weight": 0.001,
25
+ "scale_loss_balance": null,
26
+ "use_mixed_precision": false,
27
+ "gradient_clip": 10.0,
28
+ "scheduler_type": "cosine_restarts",
29
+ "min_lr": 1e-06,
30
+ "freeze_strategy": "never",
31
+ "freeze_threshold": 90.0,
32
+ "unfreeze_on_plateau": true,
33
+ "patience": 10,
34
+ "track_gradients": true,
35
+ "gradient_scale_threshold": 1e-05,
36
+ "gradient_scale_multiplier": 10.0,
37
+ "log_interval": 50,
38
+ "val_interval": 1,
39
+ "save_interval": 5,
40
+ "log_fusion_weights": true,
41
+ "log_loss_components": true,
42
+ "save_format": "safetensors",
43
+ "hf_repo": "AbstractPhil/gated-david",
44
+ "upload_to_hub": true,
45
+ "base_dir": "./david_training",
46
+ "num_workers": 10,
47
+ "pin_memory": true,
48
+ "prefetch_factor": 4,
49
+ "persistent_workers": true
50
+ }