AbstractPhil commited on
Commit
a5f93a0
·
verified ·
1 Parent(s): aa1b2bb

Upload weights and configs - Run 20251012_031919

Browse files
weights/best_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6ecf5e86fd5b2ddb68e090d5f70544f98d7a52e2e21a58950633ab7170563619
3
+ size 59515088
weights/best_model_metadata.json ADDED
@@ -0,0 +1,276 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 0,
3
+ "optimizer_state_dict": {
4
+ "state": {
5
+ "0": {
6
+ "step": "tensor(1252.)",
7
+ "exp_avg": "tensor([[ 9.9808e-05, -2.3590e-04, 4.8703e-05, ..., -6.7867e-06,\n 1.5045e-04, -6.9639e-06],\n [-3.2055e-05, -2.8797e-04, 6.0306e-05, ..., 9.5280e-06,\n 2.2100e-06, 3.8027e-06],\n [ 5.6052e-45, -5.6052e-45, -5.6052e-45, ..., 5.6052e-45,\n 5.6052e-45, 5.6052e-45],\n ...,\n [ 1.2700e-04, -2.9514e-05, 2.9405e-05, ..., 3.5731e-05,\n 2.1884e-06, -6.4312e-05],\n [ 2.5446e-04, -2.6147e-04, 2.8969e-05, ..., 8.7159e-05,\n 5.2191e-05, 1.4825e-04],\n [ 2.1196e-05, 7.8471e-06, -4.5547e-05, ..., 5.2981e-06,\n -1.3684e-04, -7.5568e-05]], device='cuda:0')",
8
+ "exp_avg_sq": "tensor([[5.4091e-07, 5.4144e-07, 4.9248e-08, ..., 4.1057e-08, 7.7326e-08,\n 5.4548e-08],\n [2.1701e-07, 3.3507e-07, 5.9846e-08, ..., 6.8188e-08, 6.9439e-08,\n 7.7029e-08],\n [2.3653e-09, 4.7337e-10, 1.4479e-10, ..., 2.1699e-10, 5.2052e-10,\n 4.1789e-10],\n ...,\n [4.9622e-07, 6.7429e-07, 5.9319e-08, ..., 7.7861e-08, 6.7676e-08,\n 4.2370e-08],\n [1.8010e-07, 3.5606e-07, 3.6999e-08, ..., 5.1092e-08, 7.0665e-08,\n 4.0146e-08],\n [1.0317e-07, 2.4285e-07, 4.6463e-08, ..., 5.2589e-08, 4.8899e-08,\n 4.3048e-08]], device='cuda:0')"
9
+ },
10
+ "1": {
11
+ "step": "tensor(1252.)",
12
+ "exp_avg": "tensor([ 2.9097e-03, 1.3327e-03, 5.6052e-45, -6.0815e-04, -4.8489e-07,\n 5.6052e-45, 4.5090e-04, -3.5691e-03, 1.6202e-03, -8.7439e-15,\n 4.4855e-04, 1.3435e-04, 2.7017e-03, 2.9764e-04, -2.6219e-03,\n 5.6216e-05, 1.3226e-03, -3.9642e-03, 8.1591e-04, -8.6576e-04,\n 2.2454e-04, 1.0421e-04, -2.1005e-03, 6.4988e-24, 5.7679e-04,\n 2.7692e-04, -9.5206e-04, -5.5439e-03, -5.3738e-03, 2.8346e-04,\n 1.7878e-03, 5.6052e-45, 1.0163e-03, -2.8758e-04, -1.0929e-03,\n 6.1516e-36, -6.6696e-03, 1.2855e-03, 1.6802e-03, -2.5499e-04,\n 8.3145e-04, -7.6461e-04, 1.0018e-03, -8.2878e-04, -6.3370e-04,\n 6.4852e-03, -1.1970e-03, 4.4301e-23, -1.5002e-03, -1.3855e-03,\n -4.5024e-03, 7.1654e-04, -4.9084e-04, 1.6491e-03, 5.6052e-45,\n -3.0916e-03, 5.2380e-03, -4.4587e-04, 3.4016e-04, -2.3714e-03,\n 2.0890e-16, -3.2571e-03, 1.5128e-03, -4.8482e-04, 8.1386e-05,\n 1.5725e-03, 2.3494e-03, 5.0308e-04, 3.6531e-03, 3.8240e-21,\n 4.7257e-03, -5.3711e-04, 1.8309e-04, 1.5676e-12, 3.5145e-03,\n 9.8415e-03, 1.0916e-02, -6.8465e-04, 4.2006e-03, -6.7891e-03,\n -1.2097e-03, 5.6052e-45, 1.1576e-16, -2.7480e-04, -5.5370e-03,\n -7.5772e-04, -2.9113e-18, 4.3444e-03, 3.1825e-03, -3.6059e-04,\n -3.6649e-04, 3.3190e-03, -1.9954e-03, -8.0797e-03, 3.6348e-03,\n 4.6834e-03, 2.3070e-03, -2.8616e-03, -7.9862e-04, -5.0693e-03,\n 2.4830e-04, 3.3737e-03, 4.2926e-03, 3.6573e-03, -3.5065e-04,\n -5.6572e-04, 1.8134e-03, -6.3776e-04, 4.6051e-03, -2.8824e-03,\n -3.6993e-04, -9.8880e-04, 1.0007e-40, -9.8369e-04, -2.8130e-03,\n 1.4090e-03, -1.7435e-06, 7.0776e-04, 5.7418e-03, -2.8657e-05,\n 2.4314e-03, -2.0983e-03, 5.1883e-03, -2.1431e-38, 7.4072e-07,\n -1.2746e-03, -3.4858e-04, -4.6374e-04, -1.6977e-03, 5.6052e-45,\n 3.7000e-03, 1.5831e-03, -2.0776e-03, -3.7744e-11, -4.3330e-04,\n 1.4578e-03, 7.6183e-03, -3.3606e-03, -1.0650e-03, 4.3183e-03,\n 3.1241e-08, 1.4853e-03, 2.1910e-07, 1.8818e-03, -3.2697e-03,\n 5.9948e-04, -2.3068e-03, 4.1328e-03, 2.3744e-07, 1.1056e-03,\n -1.6506e-04, 5.6052e-45, 5.6052e-45, 5.9209e-04, -2.7072e-03,\n -5.1818e-03, 3.9953e-04, 2.1181e-03, 5.6052e-45, 7.1065e-04,\n 2.8993e-03, 1.0938e-04, 4.5471e-04, -3.0714e-03, -1.6887e-03,\n 4.9794e-04, 3.8360e-04, -2.8068e-06, -3.4753e-04, 6.1017e-03,\n 9.3058e-04, 1.6621e-23, 5.6052e-45, 1.0997e-03, 1.6329e-03,\n 5.6052e-45, -2.2469e-14, 1.2329e-04, 6.3936e-03, -1.4311e-03,\n 2.5462e-03, -6.5818e-04, 1.6421e-03, -4.6383e-03, 1.1713e-03,\n 4.1140e-04, -7.0093e-04, -2.9552e-03, -9.6662e-03, 2.7718e-03,\n -7.6885e-03, -1.9008e-03, 4.6894e-04, 1.5270e-03, -1.1454e-02,\n 3.9764e-03, 3.5573e-03, 3.9063e-04, -1.4086e-03, -9.4108e-05,\n -1.9810e-03, 9.3351e-04, -2.7475e-03, -2.0864e-03, 6.9651e-04,\n 3.6452e-03, -2.2449e-03, -5.8442e-04, 2.1306e-03, 5.8208e-03,\n 1.6991e-04, -2.1017e-03, 3.0419e-03, 1.8699e-03, -1.8809e-03,\n -1.2653e-03, -5.3299e-03, -1.6545e-03, -1.0058e-03, 1.5983e-03,\n -2.4800e-03, -8.4721e-03, 1.9065e-26, 4.7373e-04, 2.0919e-03,\n -2.9753e-03, -4.0522e-04, 2.5252e-03, -2.8403e-03, -1.4599e-03,\n 1.3485e-03, -1.0358e-03, 3.9361e-41, 1.5485e-39, -5.7891e-03,\n -5.8865e-04, 8.3625e-04, 6.9007e-05, 3.3757e-03, 2.4274e-04,\n -6.1542e-05, 3.0383e-03, 2.3547e-03, 4.9550e-03, -2.0679e-03,\n 2.9972e-14, -2.8711e-03, -3.2705e-03, -3.5964e-03, 1.7757e-03,\n 6.0012e-03, 4.5402e-27, 2.1096e-03, 7.6903e-04, -4.8270e-04,\n -6.9613e-04, 1.0011e-03, 9.6959e-05, -3.0302e-03, 4.3707e-03,\n -2.7226e-03, -4.3493e-04, -1.7738e-03, 2.8508e-04, 2.4977e-04,\n -3.0230e-03, -2.4574e-03, 2.2031e-03, -2.3219e-03, 1.1483e-04,\n 4.2586e-03, 3.5752e-03, 5.6052e-45, 5.5265e-03, -5.1519e-03,\n 1.0887e-03, 7.3665e-04, 2.1852e-03, -1.6445e-03, -7.3967e-05,\n -7.4355e-04, 7.6051e-04, 3.4523e-03, 1.0644e-03, -1.1934e-03,\n -1.8700e-04, 3.7302e-14, -2.1369e-03, 5.6052e-45, -1.7855e-03,\n -5.3899e-03, 5.6500e-03, -6.1412e-05, -2.9054e-03, -3.7659e-04,\n -1.8160e-04, 1.9969e-31, -5.3695e-04, -4.6615e-04, 2.3254e-38,\n 5.6052e-45, 5.4223e-03, 4.4807e-03, -1.3030e-03, -1.2695e-03,\n -1.2296e-03, -9.0901e-04, 2.3960e-03, 1.6224e-04, -1.0780e-03,\n 2.8688e-03, 2.3945e-03, 2.9703e-03, 3.5277e-03, -2.4392e-03,\n -1.7388e-03, 1.9980e-03, 2.0871e-03, 5.6052e-45, 3.7856e-04,\n 2.2651e-03, 1.7563e-03, 5.6052e-45, 2.0318e-03, 1.3920e-03,\n 6.9956e-03, 1.8756e-03, -5.5313e-04, 2.4667e-03, 8.8846e-04,\n 1.1469e-03, 2.3866e-03, -1.7824e-03, 7.7836e-04, -5.0318e-04,\n -9.9463e-04, 9.0413e-04, 5.4385e-07, 6.3684e-04, 1.0693e-03,\n -1.9220e-03, 3.1149e-03, -5.8925e-04, -2.1141e-03, 1.9296e-03,\n 5.6052e-45, -3.9401e-03, 1.2427e-03, 5.6052e-45, -6.7204e-04,\n -1.1268e-05, 5.9482e-04, 1.4091e-03, -3.0139e-03, 4.3913e-04,\n -2.7723e-03, -2.4720e-03, -4.9926e-04, -2.7104e-03, -5.6586e-04,\n 7.8348e-04, 5.6052e-45, 1.3427e-03, -4.7691e-03, 5.6052e-45,\n -2.0714e-03, -5.4685e-04, -1.1198e-07, -7.7871e-05, -4.1724e-03,\n -2.1828e-03, -6.9466e-04, -2.3864e-04, 2.6827e-03, -2.0941e-03,\n 1.8311e-03, 2.7079e-03, -4.4246e-03, 3.3094e-03, 1.8410e-03,\n -6.3030e-03, 3.4858e-03, -1.9636e-03, 5.6052e-45, 1.9304e-05,\n -4.1517e-21, 2.4526e-03, 4.8325e-03, -6.6209e-04, 9.4050e-04,\n 3.5215e-42, 4.5619e-03, -6.1602e-03, -2.5304e-03, -9.4652e-04,\n 7.0431e-04, -7.8636e-24, 3.5365e-03, 5.6052e-45, 5.6052e-45,\n 9.4229e-04, 2.1341e-03, -6.8848e-03, 2.8542e-03, -2.0011e-04,\n 6.2322e-04, 5.4069e-04, 2.1369e-03, -2.7031e-03, 6.2608e-04,\n -7.1263e-04, -5.2130e-03, -7.6677e-04, 5.3319e-03, -7.3731e-41,\n -8.2319e-04, -8.3675e-04, 9.1410e-04, 5.6052e-45, 5.3688e-03,\n 4.4872e-03, -3.9738e-03, 5.5525e-04, -2.6904e-03, -6.7565e-04,\n 1.5406e-03, -2.9458e-03, 3.1132e-03, 5.6203e-04, -1.3210e-03,\n -2.1013e-03, -8.1524e-04, -3.7536e-03, -2.9657e-03, -2.6874e-03,\n 1.5260e-03, -3.0026e-03, 1.4654e-08, -1.1917e-03, 1.6394e-03,\n 5.6052e-45, -1.9517e-10, -3.2261e-03, 1.5158e-04, 2.0841e-03,\n -1.8274e-16, 1.8457e-29, -1.1565e-03, 4.7800e-04, 3.4919e-08,\n 4.5683e-18, -3.7613e-27, 5.6052e-45, -1.4436e-03, -2.0606e-03,\n -5.2525e-04, -2.9899e-03, 3.4768e-04, 1.1600e-03, 2.7800e-03,\n 2.1888e-03, -4.4828e-07, -7.0891e-04, -9.2954e-05, 1.1944e-03,\n 1.1122e-04, -5.0771e-03, 1.2030e-03, -5.6601e-04, 2.6567e-03,\n -2.5926e-03, 2.5320e-03, -4.3434e-04, -9.2896e-04, 6.3431e-03,\n 8.5656e-04, 5.1081e-03, -2.3216e-03, -5.3759e-03, 3.3953e-03,\n 4.0417e-03, -3.0275e-03, -6.7968e-05, -2.8031e-03, -2.8511e-04,\n 3.4441e-03, 2.6827e-03, -2.8124e-03, -4.7469e-14, 5.0620e-03,\n -1.5232e-03, -1.7148e-03, -5.0032e-03, 3.2487e-03, -1.8802e-03,\n 6.7241e-04, 1.3107e-03, -8.0724e-03, 2.5132e-03, 5.6052e-45,\n -9.5002e-04, 2.7097e-03, 2.6700e-03, 2.4434e-03, -1.6090e-21,\n 6.0059e-03, -1.0610e-03, -4.8448e-03, 7.5928e-04, 5.6052e-45,\n 5.9775e-25, 2.2420e-05, 3.3517e-04, 3.1995e-03, 2.1595e-04,\n 2.6273e-04, 4.7507e-03, 2.2225e-03, -2.0668e-08, 3.3253e-04,\n 1.1676e-03, -1.4700e-03, -3.4555e-03, -5.6052e-45, -3.1489e-03,\n -3.4126e-03, 4.0212e-03, 4.2094e-03, -1.0276e-02, 3.6612e-03,\n 1.8664e-03, 1.1503e-03, 4.9484e-03, 8.5730e-04, -3.0980e-06,\n 4.3121e-03, 1.4095e-03, -8.8099e-11, -1.1596e-03, -2.9243e-03,\n -4.0902e-03, -2.4074e-03, -3.5444e-03, 1.1548e-18, -4.3250e-08,\n 1.1259e-03, 7.5497e-04, 1.6669e-03, -4.1334e-03, -8.5967e-04,\n 6.6669e-04, -6.8106e-08, 1.7873e-03, 6.9239e-04, -3.4414e-03,\n 3.3455e-03, -6.4815e-04, -4.4830e-04, 6.2729e-04, 3.1767e-03,\n -1.1806e-03, 5.6052e-45, -6.1646e-05, -1.5390e-08, -3.8129e-03,\n -5.5182e-04, 4.8020e-03, 3.0543e-03, 2.6255e-04, -7.5502e-03,\n -3.9313e-03, -7.1502e-03, 4.7074e-03, 3.0065e-03, -1.9671e-03,\n -7.0393e-04, 1.2949e-21, 5.6052e-45, -5.1700e-03, -2.5810e-03,\n 2.2440e-14, 5.9939e-03, 4.5024e-04, -3.9611e-04, -6.0625e-04,\n 2.5357e-03, 4.8548e-03, 5.6052e-45, 1.3375e-05, 8.7432e-04,\n -4.6417e-04, -4.8118e-03, -1.0334e-05, -1.0974e-03, 7.7801e-04,\n 5.6052e-45, -1.3710e-03, -4.1969e-03, -8.9786e-04, 3.7286e-04,\n 2.2078e-03, 5.6052e-45, 4.4025e-04, 3.5078e-17, -2.3542e-04,\n -8.6203e-04, -2.0364e-03, -1.9391e-03, -1.9571e-03, -3.8523e-03,\n 8.1275e-44, 3.8030e-03, -2.9845e-03, 1.1340e-03, -3.8654e-03,\n -3.1780e-03, 3.0170e-03, -3.5518e-06, 1.2191e-03, 1.9990e-03,\n 1.3138e-03, -4.8024e-03, -5.5048e-03, 2.6232e-03, 2.9120e-03,\n 6.1065e-03, 1.9772e-14, -1.4609e-03, -1.3955e-03, -1.1309e-03,\n 4.4795e-03, 5.6052e-45, 1.8031e-03, 6.4633e-07, 2.0465e-03,\n 3.7773e-03, 1.9373e-03, 3.8224e-04, 4.0307e-03, -1.3818e-03],\n device='cuda:0')",
13
+ "exp_avg_sq": "tensor([1.0657e-04, 1.2498e-04, 5.4747e-07, 1.5065e-04, 1.6597e-06, 9.6464e-08,\n 5.7696e-05, 1.5453e-04, 8.2285e-05, 4.0288e-06, 8.5279e-05, 8.9790e-05,\n 1.7742e-04, 1.9223e-04, 9.8694e-05, 5.8505e-05, 1.1162e-04, 1.5988e-04,\n 7.5000e-05, 4.3486e-05, 4.3413e-05, 1.6213e-04, 1.3302e-04, 6.6957e-07,\n 3.2834e-05, 2.1819e-06, 2.3604e-04, 1.9225e-04, 1.1807e-04, 2.0260e-04,\n 2.3734e-05, 4.9176e-08, 9.9726e-05, 6.7809e-05, 6.4116e-05, 2.2739e-06,\n 1.6892e-04, 1.6830e-04, 1.0314e-04, 9.3657e-05, 2.0912e-04, 2.0061e-04,\n 1.3971e-04, 1.0896e-04, 5.0730e-05, 1.3257e-04, 3.1017e-05, 1.6003e-06,\n 1.0016e-04, 3.9827e-05, 1.0853e-04, 1.0274e-04, 6.8523e-05, 1.6013e-04,\n 1.1100e-05, 9.5912e-05, 1.9205e-04, 1.3528e-04, 6.9431e-05, 1.2905e-04,\n 5.8560e-06, 2.2769e-04, 7.7184e-05, 1.8214e-04, 7.5148e-06, 1.6535e-04,\n 8.7372e-05, 1.5874e-05, 1.4836e-04, 7.8219e-07, 1.4206e-04, 1.4345e-04,\n 1.3214e-04, 5.8393e-06, 1.1338e-04, 1.5193e-04, 1.2176e-04, 1.5996e-04,\n 1.0422e-04, 1.4050e-04, 1.3875e-04, 5.0348e-07, 4.1452e-06, 1.0420e-04,\n 1.2363e-04, 1.2190e-04, 2.5825e-06, 1.7398e-04, 8.3418e-05, 1.2805e-04,\n 1.5007e-04, 4.6880e-05, 5.2009e-05, 6.3075e-05, 1.1318e-04, 1.5523e-04,\n 1.2102e-04, 1.2687e-04, 9.0525e-05, 2.5028e-04, 2.9265e-04, 1.4653e-04,\n 1.1980e-04, 1.4750e-04, 1.6089e-05, 6.2279e-06, 2.0253e-04, 2.0641e-04,\n 1.4600e-04, 1.1342e-04, 6.9079e-05, 5.9040e-05, 3.0364e-08, 1.1472e-04,\n 1.2216e-04, 1.5451e-05, 6.0434e-06, 1.7606e-04, 8.8550e-05, 3.0746e-05,\n 1.1317e-04, 9.6538e-05, 1.8607e-04, 1.4595e-05, 1.1918e-04, 1.4209e-05,\n 9.3113e-05, 1.3750e-04, 5.7601e-05, 9.6174e-08, 2.1956e-04, 1.4674e-04,\n 8.0491e-05, 1.4195e-05, 9.8237e-05, 6.8813e-05, 1.6731e-04, 1.4326e-04,\n 4.1997e-05, 5.4175e-05, 1.2158e-06, 1.6454e-04, 4.0711e-06, 1.8878e-04,\n 1.2410e-04, 1.6002e-04, 1.0956e-04, 1.0676e-04, 4.3374e-06, 1.8985e-04,\n 1.1697e-04, 4.6518e-06, 2.0894e-06, 9.9115e-05, 1.6756e-04, 1.4277e-04,\n 4.4317e-06, 1.0657e-04, 1.2864e-06, 2.2122e-04, 9.5258e-05, 1.4995e-04,\n 1.2619e-04, 2.7058e-05, 1.1466e-04, 2.2609e-05, 1.9238e-06, 1.0774e-05,\n 1.0919e-04, 1.0577e-04, 6.1629e-05, 6.2394e-07, 2.7799e-09, 1.5646e-04,\n 1.5640e-04, 1.5983e-06, 2.5999e-06, 1.3227e-04, 3.0020e-04, 4.0478e-05,\n 5.8057e-05, 9.0989e-05, 1.3166e-04, 2.9814e-04, 6.8373e-05, 1.1930e-04,\n 3.9874e-05, 8.8335e-05, 7.1290e-05, 1.3050e-04, 1.5606e-04, 1.6142e-04,\n 9.1281e-05, 1.5517e-04, 8.5782e-05, 1.6078e-04, 9.9462e-05, 1.5166e-04,\n 1.0937e-04, 1.1834e-04, 9.8588e-05, 1.5297e-04, 1.3750e-04, 1.3850e-04,\n 1.0609e-04, 5.9584e-05, 9.6978e-05, 1.8988e-04, 1.3983e-04, 1.6517e-04,\n 2.0238e-04, 6.1500e-05, 1.2606e-04, 1.0112e-04, 5.8389e-05, 8.5489e-05,\n 1.2577e-04, 1.7108e-04, 1.3891e-04, 1.1265e-04, 1.1794e-04, 1.8124e-04,\n 5.4795e-07, 1.1601e-04, 7.7962e-05, 1.0644e-04, 4.2547e-05, 7.7368e-05,\n 1.8829e-04, 1.4142e-04, 2.5550e-04, 1.1878e-04, 6.3489e-06, 4.3365e-06,\n 1.6430e-04, 4.9562e-05, 1.3993e-04, 8.1268e-05, 1.2643e-04, 2.2004e-05,\n 1.9739e-04, 4.1638e-05, 1.5866e-04, 1.4759e-04, 1.3158e-04, 1.8179e-05,\n 1.6027e-04, 1.2549e-04, 1.9569e-04, 8.4398e-05, 1.4535e-04, 5.4644e-05,\n 1.6169e-04, 1.2208e-04, 1.7599e-04, 9.4468e-05, 6.0572e-05, 7.8003e-04,\n 2.2577e-04, 9.8741e-05, 1.0735e-04, 9.2178e-05, 1.5435e-04, 3.3711e-05,\n 1.4859e-04, 7.5763e-05, 1.4214e-04, 8.2884e-05, 1.2263e-04, 9.7052e-05,\n 1.5557e-04, 1.3137e-04, 8.1252e-07, 1.2670e-04, 1.5327e-04, 6.3749e-05,\n 1.5327e-04, 1.5620e-04, 1.3979e-04, 1.8585e-04, 1.3567e-04, 1.0034e-04,\n 1.1085e-04, 1.5390e-04, 1.5355e-04, 1.2933e-04, 5.3038e-07, 1.1110e-04,\n 1.1968e-06, 1.2746e-04, 1.4634e-04, 1.6784e-04, 1.0154e-04, 8.7271e-05,\n 3.6511e-05, 1.4328e-04, 2.6839e-06, 8.2360e-05, 1.6174e-04, 2.6423e-07,\n 8.0022e-06, 1.5647e-04, 1.4390e-05, 2.0712e-04, 1.1966e-04, 1.2116e-04,\n 8.2312e-05, 1.4790e-04, 1.3410e-04, 1.3199e-04, 1.5459e-04, 1.4375e-04,\n 1.0500e-04, 9.5798e-05, 2.0366e-04, 1.3458e-04, 9.5124e-05, 2.5052e-04,\n 6.5783e-06, 1.2063e-04, 1.5319e-04, 1.8916e-04, 9.1263e-07, 2.1179e-04,\n 2.5095e-06, 1.4195e-04, 1.1650e-04, 1.0627e-04, 9.2918e-05, 8.5886e-05,\n 1.5846e-04, 1.3672e-04, 1.3835e-05, 1.5123e-04, 1.0138e-04, 1.0761e-04,\n 1.5179e-04, 7.8470e-06, 9.2061e-05, 1.4345e-04, 1.5339e-04, 1.7330e-04,\n 1.6678e-04, 6.9284e-05, 1.3848e-04, 2.1464e-07, 1.3049e-04, 6.7242e-05,\n 1.7938e-06, 1.6084e-04, 8.2247e-06, 5.9421e-05, 1.0957e-04, 1.4101e-04,\n 1.6227e-05, 2.1253e-04, 9.9932e-05, 1.9896e-04, 1.6496e-04, 1.6180e-04,\n 7.5642e-05, 5.5413e-06, 1.3033e-04, 1.1072e-04, 3.9993e-07, 2.8074e-05,\n 5.0329e-05, 4.8088e-06, 1.1054e-04, 5.9186e-05, 1.1593e-04, 1.0599e-04,\n 7.6909e-05, 8.4948e-05, 1.6006e-04, 7.4657e-05, 1.6763e-04, 1.5340e-04,\n 1.9831e-04, 9.3780e-05, 1.2386e-04, 1.0620e-04, 1.1448e-04, 6.1126e-06,\n 1.0337e-04, 1.7338e-05, 1.0805e-04, 2.1856e-04, 6.0433e-05, 1.0851e-04,\n 3.6828e-07, 1.6744e-04, 1.1827e-04, 3.1246e-05, 1.0913e-04, 1.0361e-04,\n 2.3393e-06, 2.0273e-05, 5.3356e-06, 6.7816e-06, 9.0499e-05, 2.4462e-04,\n 1.3379e-04, 1.6504e-04, 6.7356e-06, 8.5502e-05, 1.3591e-04, 1.6342e-04,\n 5.8167e-05, 6.7462e-05, 1.0389e-04, 1.6985e-04, 7.0559e-05, 1.4013e-04,\n 1.8478e-06, 7.2669e-05, 1.5968e-05, 1.0009e-04, 1.3229e-05, 1.2551e-04,\n 9.0110e-05, 1.3758e-04, 2.1852e-05, 1.2398e-04, 6.1984e-05, 2.1324e-04,\n 1.3438e-04, 1.3156e-04, 1.7449e-04, 1.1232e-04, 1.1211e-04, 1.0872e-04,\n 8.0641e-05, 1.4451e-04, 1.3943e-04, 9.0162e-05, 1.2867e-04, 1.2411e-06,\n 1.3691e-04, 3.0526e-05, 2.6990e-07, 1.5792e-06, 1.3751e-04, 1.6311e-04,\n 1.4706e-04, 2.3367e-06, 1.0337e-05, 1.2572e-04, 1.2500e-04, 5.0513e-06,\n 3.9280e-06, 3.4529e-06, 8.1961e-07, 1.1639e-04, 1.0618e-04, 1.3924e-04,\n 1.3476e-04, 6.0390e-05, 5.5459e-05, 1.3877e-04, 1.9087e-04, 4.5239e-06,\n 5.6055e-05, 6.4027e-05, 2.2887e-04, 2.1422e-04, 1.7405e-04, 1.7171e-04,\n 1.3046e-04, 7.8992e-05, 2.0561e-04, 5.6756e-05, 1.6212e-04, 6.1590e-05,\n 1.1942e-04, 4.8147e-05, 1.0807e-04, 1.5405e-04, 8.5730e-05, 1.3142e-04,\n 1.4351e-04, 1.1481e-04, 1.1602e-04, 1.3749e-04, 1.5277e-04, 1.6102e-04,\n 4.9004e-05, 7.3917e-05, 1.4746e-06, 2.4094e-04, 2.0498e-04, 7.0090e-05,\n 1.1606e-04, 1.5974e-04, 1.1282e-04, 1.7416e-04, 1.2365e-04, 1.8325e-04,\n 1.6413e-04, 1.0075e-05, 1.5340e-04, 1.7198e-04, 3.7495e-05, 1.0890e-04,\n 3.2546e-06, 9.5192e-05, 1.6514e-04, 1.4632e-04, 1.3682e-04, 1.1080e-06,\n 6.8253e-07, 6.4819e-05, 1.0894e-05, 1.6991e-04, 5.8200e-05, 6.9377e-05,\n 1.9031e-04, 1.1263e-04, 6.5790e-06, 1.9370e-05, 1.2348e-04, 1.6868e-04,\n 6.1779e-05, 1.9711e-06, 2.4460e-04, 1.2958e-04, 1.2875e-04, 1.5574e-04,\n 1.2438e-04, 1.2424e-04, 1.8478e-04, 1.1652e-04, 1.3442e-04, 1.7498e-04,\n 1.3293e-04, 1.0791e-04, 1.1685e-04, 4.7575e-07, 7.0879e-05, 9.9483e-05,\n 9.2598e-05, 1.3999e-04, 1.3390e-04, 1.5507e-06, 2.2021e-06, 1.2262e-04,\n 3.5282e-05, 3.9577e-05, 1.8776e-04, 2.3607e-05, 1.2914e-04, 9.5425e-06,\n 8.8458e-05, 5.4793e-05, 1.7688e-04, 1.4917e-04, 7.1929e-05, 1.2966e-04,\n 1.1531e-04, 1.4657e-04, 1.4532e-04, 7.1435e-07, 9.3775e-05, 3.5738e-06,\n 1.2525e-04, 9.9462e-05, 1.2356e-04, 1.1832e-04, 1.1851e-04, 1.1907e-04,\n 9.6039e-05, 9.3672e-05, 1.2571e-04, 9.0461e-05, 4.6265e-05, 8.1472e-05,\n 2.8538e-06, 3.9573e-06, 1.8178e-04, 8.5273e-05, 1.1549e-06, 1.1301e-04,\n 9.9930e-06, 2.6556e-04, 9.0106e-05, 1.0476e-04, 1.3779e-04, 4.4612e-07,\n 1.9354e-07, 9.2418e-05, 7.6904e-05, 5.2415e-05, 3.1193e-06, 1.1338e-04,\n 1.2712e-04, 6.7011e-07, 9.0553e-05, 1.9501e-04, 2.1779e-04, 1.0918e-04,\n 7.6809e-05, 5.6993e-07, 1.2559e-04, 1.8947e-07, 2.4857e-05, 1.9147e-04,\n 1.6334e-04, 9.5103e-05, 1.7345e-04, 1.0121e-04, 1.6039e-07, 9.0950e-05,\n 1.2939e-04, 2.5690e-05, 1.5040e-04, 1.4694e-04, 1.0786e-04, 1.0812e-04,\n 1.0765e-04, 1.0414e-04, 1.2522e-04, 1.6548e-04, 1.1310e-04, 4.7277e-05,\n 1.2718e-04, 1.6998e-04, 2.2692e-06, 5.2793e-05, 1.5023e-04, 3.3612e-05,\n 1.9915e-04, 9.5176e-07, 1.2857e-04, 5.6581e-06, 5.1162e-05, 1.4187e-04,\n 7.1108e-06, 1.2574e-04, 9.9547e-05, 8.2938e-05], device='cuda:0')"
14
+ },
15
+ "2": {
16
+ "step": "tensor(1252.)",
17
+ "exp_avg": "tensor([[-2.4572e-05, -1.9201e-06, 5.6052e-45, ..., -2.0221e-05,\n -8.0042e-06, 1.2930e-05],\n [ 8.2135e-05, 5.8937e-05, -5.6052e-45, ..., -2.3740e-05,\n 7.0554e-05, -2.9479e-05],\n [ 3.2157e-05, -2.7470e-05, -5.6052e-45, ..., -2.2250e-05,\n 7.4611e-05, 1.1439e-05],\n ...,\n [ 1.5776e-05, 1.4394e-05, -5.6052e-45, ..., -1.4606e-05,\n 1.9324e-05, 1.3889e-04],\n [ 4.4814e-05, 1.0931e-05, -5.6052e-45, ..., -5.8624e-05,\n 1.0031e-04, 1.1417e-04],\n [-4.9805e-05, -2.2688e-05, 5.6052e-45, ..., 1.1320e-05,\n 8.2752e-05, 5.0785e-05]], device='cuda:0')",
18
+ "exp_avg_sq": "tensor([[6.1916e-08, 4.0039e-08, 2.8288e-12, ..., 7.2189e-08, 3.4673e-08,\n 3.0847e-08],\n [9.9022e-08, 8.6026e-08, 2.4207e-12, ..., 9.1891e-08, 6.3491e-08,\n 6.3855e-08],\n [6.0957e-08, 4.8856e-08, 1.0245e-11, ..., 6.7222e-08, 6.9665e-08,\n 5.9740e-08],\n ...,\n [1.0218e-07, 7.0578e-08, 3.4236e-11, ..., 1.5361e-07, 6.8790e-08,\n 7.8031e-08],\n [9.7974e-08, 7.2477e-08, 4.8175e-12, ..., 1.2711e-07, 6.6718e-08,\n 9.0619e-08],\n [1.0754e-07, 8.6475e-08, 8.5844e-13, ..., 1.4465e-07, 1.1352e-07,\n 8.2872e-08]], device='cuda:0')"
19
+ }
20
+ },
21
+ "param_groups": [
22
+ {
23
+ "lr": 0.00975530705321762,
24
+ "name": "scale_256",
25
+ "betas": [
26
+ 0.9,
27
+ 0.999
28
+ ],
29
+ "eps": 1e-08,
30
+ "weight_decay": 1e-05,
31
+ "amsgrad": false,
32
+ "maximize": false,
33
+ "foreach": null,
34
+ "capturable": false,
35
+ "differentiable": false,
36
+ "fused": null,
37
+ "decoupled_weight_decay": true,
38
+ "initial_lr": 0.01,
39
+ "params": [
40
+ 0,
41
+ 1,
42
+ 2
43
+ ]
44
+ },
45
+ {
46
+ "lr": 0.00975530705321762,
47
+ "name": "scale_512",
48
+ "betas": [
49
+ 0.9,
50
+ 0.999
51
+ ],
52
+ "eps": 1e-08,
53
+ "weight_decay": 1e-05,
54
+ "amsgrad": false,
55
+ "maximize": false,
56
+ "foreach": null,
57
+ "capturable": false,
58
+ "differentiable": false,
59
+ "fused": null,
60
+ "decoupled_weight_decay": true,
61
+ "initial_lr": 0.01,
62
+ "params": [
63
+ 3,
64
+ 4,
65
+ 5
66
+ ]
67
+ },
68
+ {
69
+ "lr": 0.00975530705321762,
70
+ "name": "scale_768",
71
+ "betas": [
72
+ 0.9,
73
+ 0.999
74
+ ],
75
+ "eps": 1e-08,
76
+ "weight_decay": 1e-05,
77
+ "amsgrad": false,
78
+ "maximize": false,
79
+ "foreach": null,
80
+ "capturable": false,
81
+ "differentiable": false,
82
+ "fused": null,
83
+ "decoupled_weight_decay": true,
84
+ "initial_lr": 0.01,
85
+ "params": [
86
+ 6,
87
+ 7,
88
+ 8
89
+ ]
90
+ },
91
+ {
92
+ "lr": 0.00975530705321762,
93
+ "name": "scale_1024",
94
+ "betas": [
95
+ 0.9,
96
+ 0.999
97
+ ],
98
+ "eps": 1e-08,
99
+ "weight_decay": 1e-05,
100
+ "amsgrad": false,
101
+ "maximize": false,
102
+ "foreach": null,
103
+ "capturable": false,
104
+ "differentiable": false,
105
+ "fused": null,
106
+ "decoupled_weight_decay": true,
107
+ "initial_lr": 0.01,
108
+ "params": [
109
+ 9,
110
+ 10,
111
+ 11
112
+ ]
113
+ },
114
+ {
115
+ "lr": 0.00975530705321762,
116
+ "name": "scale_1280",
117
+ "betas": [
118
+ 0.9,
119
+ 0.999
120
+ ],
121
+ "eps": 1e-08,
122
+ "weight_decay": 1e-05,
123
+ "amsgrad": false,
124
+ "maximize": false,
125
+ "foreach": null,
126
+ "capturable": false,
127
+ "differentiable": false,
128
+ "fused": null,
129
+ "decoupled_weight_decay": true,
130
+ "initial_lr": 0.01,
131
+ "params": [
132
+ 12,
133
+ 13,
134
+ 14
135
+ ]
136
+ },
137
+ {
138
+ "lr": 0.004877665762479736,
139
+ "name": "fusion",
140
+ "betas": [
141
+ 0.9,
142
+ 0.999
143
+ ],
144
+ "eps": 1e-08,
145
+ "weight_decay": 1e-05,
146
+ "amsgrad": false,
147
+ "maximize": false,
148
+ "foreach": null,
149
+ "capturable": false,
150
+ "differentiable": false,
151
+ "fused": null,
152
+ "decoupled_weight_decay": true,
153
+ "initial_lr": 0.005,
154
+ "params": [
155
+ 15,
156
+ 16,
157
+ 17,
158
+ 18,
159
+ 19,
160
+ 20,
161
+ 21,
162
+ 22,
163
+ 23,
164
+ 24,
165
+ 25,
166
+ 26,
167
+ 27,
168
+ 28,
169
+ 29,
170
+ 30,
171
+ 31,
172
+ 32,
173
+ 33,
174
+ 34,
175
+ 35,
176
+ 36,
177
+ 37,
178
+ 38,
179
+ 39,
180
+ 40,
181
+ 41,
182
+ 42,
183
+ 43,
184
+ 44,
185
+ 45,
186
+ 46,
187
+ 47,
188
+ 48
189
+ ]
190
+ }
191
+ ]
192
+ },
193
+ "scheduler_state_dict": {
194
+ "T_0": 10,
195
+ "T_i": 10,
196
+ "T_mult": 2,
197
+ "eta_min": 1e-06,
198
+ "T_cur": 1,
199
+ "base_lrs": [
200
+ 0.01,
201
+ 0.01,
202
+ 0.01,
203
+ 0.01,
204
+ 0.01,
205
+ 0.005
206
+ ],
207
+ "last_epoch": 1,
208
+ "_step_count": 0,
209
+ "_is_initial": false,
210
+ "_get_lr_called_within_step": false,
211
+ "_last_lr": [
212
+ 0.00975530705321762,
213
+ 0.00975530705321762,
214
+ 0.00975530705321762,
215
+ 0.00975530705321762,
216
+ 0.00975530705321762,
217
+ 0.004877665762479736
218
+ ]
219
+ },
220
+ "metrics": {
221
+ "best_val_acc": 71.754,
222
+ "best_epoch": 0,
223
+ "scale_accuracies": {
224
+ "256": 71.754
225
+ }
226
+ },
227
+ "train_config": {
228
+ "name": "david_training",
229
+ "run_id": "20251012_031919",
230
+ "dataset_name": "AbstractPhil/imagenet-clip-features-orderly",
231
+ "model_variant": "clip_vit_b16",
232
+ "num_classes": 1000,
233
+ "preset": "high_accuracy",
234
+ "custom_config_path": null,
235
+ "num_classes_override": null,
236
+ "use_belly_override": null,
237
+ "belly_expand_override": null,
238
+ "progressive_training_override": true,
239
+ "num_epochs": 20,
240
+ "batch_size": 1024,
241
+ "learning_rate": 0.01,
242
+ "weight_decay": 1e-05,
243
+ "warmup_epochs": 3,
244
+ "use_rose_loss": true,
245
+ "rose_initial_weight": 0.1,
246
+ "rose_max_weight": 0.5,
247
+ "rose_weight_schedule": "adaptive",
248
+ "use_cayley_loss": false,
249
+ "cayley_weight": 0.001,
250
+ "scale_loss_balance": null,
251
+ "use_mixed_precision": false,
252
+ "gradient_clip": 5.0,
253
+ "scheduler_type": "cosine_restarts",
254
+ "min_lr": 1e-06,
255
+ "freeze_strategy": "performance",
256
+ "freeze_threshold": 70.0,
257
+ "unfreeze_on_plateau": true,
258
+ "patience": 10,
259
+ "track_gradients": true,
260
+ "gradient_scale_threshold": 1e-07,
261
+ "gradient_scale_multiplier": 5.0,
262
+ "log_interval": 50,
263
+ "val_interval": 1,
264
+ "save_interval": 5,
265
+ "log_fusion_weights": true,
266
+ "log_loss_components": true,
267
+ "save_format": "safetensors",
268
+ "hf_repo": "AbstractPhil/gated-david",
269
+ "upload_to_hub": true,
270
+ "base_dir": "./david_training",
271
+ "num_workers": 10,
272
+ "pin_memory": true,
273
+ "prefetch_factor": 4,
274
+ "persistent_workers": true
275
+ }
276
+ }
weights/david_config.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "david_high_accuracy",
3
+ "uid": "c.david.high_accuracy",
4
+ "feature_dim": 512,
5
+ "num_classes": 1000,
6
+ "scales": [
7
+ 256,
8
+ 512,
9
+ 768,
10
+ 1024,
11
+ 1280
12
+ ],
13
+ "sharing_mode": "decoupled",
14
+ "fusion_mode": "deep_efficiency",
15
+ "use_belly": true,
16
+ "belly_expand": 2.5,
17
+ "shared_feature_dim": 768,
18
+ "shared_layers": 2,
19
+ "shared_dropout": 0.1,
20
+ "fusion_temperature": 1.0,
21
+ "fusion_dropout": 0.1,
22
+ "tree_depth": 3,
23
+ "num_experts": 5,
24
+ "compression_ratio": 2,
25
+ "expert_dropout": 0.1,
26
+ "attention_dropout": 0.1,
27
+ "progressive_training": true,
28
+ "scale_warmup_epochs": {
29
+ "256": 0,
30
+ "512": 3,
31
+ "768": 6,
32
+ "1024": 9,
33
+ "1280": 12
34
+ }
35
+ }
weights/train_config.json ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "david_training",
3
+ "run_id": "20251012_031919",
4
+ "dataset_name": "AbstractPhil/imagenet-clip-features-orderly",
5
+ "model_variant": "clip_vit_b16",
6
+ "num_classes": 1000,
7
+ "preset": "high_accuracy",
8
+ "custom_config_path": null,
9
+ "num_classes_override": null,
10
+ "use_belly_override": null,
11
+ "belly_expand_override": null,
12
+ "progressive_training_override": true,
13
+ "num_epochs": 20,
14
+ "batch_size": 1024,
15
+ "learning_rate": 0.01,
16
+ "weight_decay": 1e-05,
17
+ "warmup_epochs": 3,
18
+ "use_rose_loss": true,
19
+ "rose_initial_weight": 0.1,
20
+ "rose_max_weight": 0.5,
21
+ "rose_weight_schedule": "adaptive",
22
+ "use_cayley_loss": false,
23
+ "cayley_weight": 0.001,
24
+ "scale_loss_balance": null,
25
+ "use_mixed_precision": false,
26
+ "gradient_clip": 5.0,
27
+ "scheduler_type": "cosine_restarts",
28
+ "min_lr": 1e-06,
29
+ "freeze_strategy": "performance",
30
+ "freeze_threshold": 70.0,
31
+ "unfreeze_on_plateau": true,
32
+ "patience": 10,
33
+ "track_gradients": true,
34
+ "gradient_scale_threshold": 1e-07,
35
+ "gradient_scale_multiplier": 5.0,
36
+ "log_interval": 50,
37
+ "val_interval": 1,
38
+ "save_interval": 5,
39
+ "log_fusion_weights": true,
40
+ "log_loss_components": true,
41
+ "save_format": "safetensors",
42
+ "hf_repo": "AbstractPhil/gated-david",
43
+ "upload_to_hub": true,
44
+ "base_dir": "./david_training",
45
+ "num_workers": 10,
46
+ "pin_memory": true,
47
+ "prefetch_factor": 4,
48
+ "persistent_workers": true
49
+ }