Upload weights and configs - Run 20251012_031919
Browse files- weights/best_model.safetensors +3 -0
- weights/best_model_metadata.json +276 -0
- weights/david_config.json +35 -0
- weights/train_config.json +49 -0
weights/best_model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6ecf5e86fd5b2ddb68e090d5f70544f98d7a52e2e21a58950633ab7170563619
|
| 3 |
+
size 59515088
|
weights/best_model_metadata.json
ADDED
|
@@ -0,0 +1,276 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"epoch": 0,
|
| 3 |
+
"optimizer_state_dict": {
|
| 4 |
+
"state": {
|
| 5 |
+
"0": {
|
| 6 |
+
"step": "tensor(1252.)",
|
| 7 |
+
"exp_avg": "tensor([[ 9.9808e-05, -2.3590e-04, 4.8703e-05, ..., -6.7867e-06,\n 1.5045e-04, -6.9639e-06],\n [-3.2055e-05, -2.8797e-04, 6.0306e-05, ..., 9.5280e-06,\n 2.2100e-06, 3.8027e-06],\n [ 5.6052e-45, -5.6052e-45, -5.6052e-45, ..., 5.6052e-45,\n 5.6052e-45, 5.6052e-45],\n ...,\n [ 1.2700e-04, -2.9514e-05, 2.9405e-05, ..., 3.5731e-05,\n 2.1884e-06, -6.4312e-05],\n [ 2.5446e-04, -2.6147e-04, 2.8969e-05, ..., 8.7159e-05,\n 5.2191e-05, 1.4825e-04],\n [ 2.1196e-05, 7.8471e-06, -4.5547e-05, ..., 5.2981e-06,\n -1.3684e-04, -7.5568e-05]], device='cuda:0')",
|
| 8 |
+
"exp_avg_sq": "tensor([[5.4091e-07, 5.4144e-07, 4.9248e-08, ..., 4.1057e-08, 7.7326e-08,\n 5.4548e-08],\n [2.1701e-07, 3.3507e-07, 5.9846e-08, ..., 6.8188e-08, 6.9439e-08,\n 7.7029e-08],\n [2.3653e-09, 4.7337e-10, 1.4479e-10, ..., 2.1699e-10, 5.2052e-10,\n 4.1789e-10],\n ...,\n [4.9622e-07, 6.7429e-07, 5.9319e-08, ..., 7.7861e-08, 6.7676e-08,\n 4.2370e-08],\n [1.8010e-07, 3.5606e-07, 3.6999e-08, ..., 5.1092e-08, 7.0665e-08,\n 4.0146e-08],\n [1.0317e-07, 2.4285e-07, 4.6463e-08, ..., 5.2589e-08, 4.8899e-08,\n 4.3048e-08]], device='cuda:0')"
|
| 9 |
+
},
|
| 10 |
+
"1": {
|
| 11 |
+
"step": "tensor(1252.)",
|
| 12 |
+
"exp_avg": "tensor([ 2.9097e-03, 1.3327e-03, 5.6052e-45, -6.0815e-04, -4.8489e-07,\n 5.6052e-45, 4.5090e-04, -3.5691e-03, 1.6202e-03, -8.7439e-15,\n 4.4855e-04, 1.3435e-04, 2.7017e-03, 2.9764e-04, -2.6219e-03,\n 5.6216e-05, 1.3226e-03, -3.9642e-03, 8.1591e-04, -8.6576e-04,\n 2.2454e-04, 1.0421e-04, -2.1005e-03, 6.4988e-24, 5.7679e-04,\n 2.7692e-04, -9.5206e-04, -5.5439e-03, -5.3738e-03, 2.8346e-04,\n 1.7878e-03, 5.6052e-45, 1.0163e-03, -2.8758e-04, -1.0929e-03,\n 6.1516e-36, -6.6696e-03, 1.2855e-03, 1.6802e-03, -2.5499e-04,\n 8.3145e-04, -7.6461e-04, 1.0018e-03, -8.2878e-04, -6.3370e-04,\n 6.4852e-03, -1.1970e-03, 4.4301e-23, -1.5002e-03, -1.3855e-03,\n -4.5024e-03, 7.1654e-04, -4.9084e-04, 1.6491e-03, 5.6052e-45,\n -3.0916e-03, 5.2380e-03, -4.4587e-04, 3.4016e-04, -2.3714e-03,\n 2.0890e-16, -3.2571e-03, 1.5128e-03, -4.8482e-04, 8.1386e-05,\n 1.5725e-03, 2.3494e-03, 5.0308e-04, 3.6531e-03, 3.8240e-21,\n 4.7257e-03, -5.3711e-04, 1.8309e-04, 1.5676e-12, 3.5145e-03,\n 9.8415e-03, 1.0916e-02, -6.8465e-04, 4.2006e-03, -6.7891e-03,\n -1.2097e-03, 5.6052e-45, 1.1576e-16, -2.7480e-04, -5.5370e-03,\n -7.5772e-04, -2.9113e-18, 4.3444e-03, 3.1825e-03, -3.6059e-04,\n -3.6649e-04, 3.3190e-03, -1.9954e-03, -8.0797e-03, 3.6348e-03,\n 4.6834e-03, 2.3070e-03, -2.8616e-03, -7.9862e-04, -5.0693e-03,\n 2.4830e-04, 3.3737e-03, 4.2926e-03, 3.6573e-03, -3.5065e-04,\n -5.6572e-04, 1.8134e-03, -6.3776e-04, 4.6051e-03, -2.8824e-03,\n -3.6993e-04, -9.8880e-04, 1.0007e-40, -9.8369e-04, -2.8130e-03,\n 1.4090e-03, -1.7435e-06, 7.0776e-04, 5.7418e-03, -2.8657e-05,\n 2.4314e-03, -2.0983e-03, 5.1883e-03, -2.1431e-38, 7.4072e-07,\n -1.2746e-03, -3.4858e-04, -4.6374e-04, -1.6977e-03, 5.6052e-45,\n 3.7000e-03, 1.5831e-03, -2.0776e-03, -3.7744e-11, -4.3330e-04,\n 1.4578e-03, 7.6183e-03, -3.3606e-03, -1.0650e-03, 4.3183e-03,\n 3.1241e-08, 1.4853e-03, 2.1910e-07, 1.8818e-03, -3.2697e-03,\n 5.9948e-04, -2.3068e-03, 4.1328e-03, 2.3744e-07, 1.1056e-03,\n -1.6506e-04, 5.6052e-45, 5.6052e-45, 5.9209e-04, -2.7072e-03,\n -5.1818e-03, 3.9953e-04, 2.1181e-03, 5.6052e-45, 7.1065e-04,\n 2.8993e-03, 1.0938e-04, 4.5471e-04, -3.0714e-03, -1.6887e-03,\n 4.9794e-04, 3.8360e-04, -2.8068e-06, -3.4753e-04, 6.1017e-03,\n 9.3058e-04, 1.6621e-23, 5.6052e-45, 1.0997e-03, 1.6329e-03,\n 5.6052e-45, -2.2469e-14, 1.2329e-04, 6.3936e-03, -1.4311e-03,\n 2.5462e-03, -6.5818e-04, 1.6421e-03, -4.6383e-03, 1.1713e-03,\n 4.1140e-04, -7.0093e-04, -2.9552e-03, -9.6662e-03, 2.7718e-03,\n -7.6885e-03, -1.9008e-03, 4.6894e-04, 1.5270e-03, -1.1454e-02,\n 3.9764e-03, 3.5573e-03, 3.9063e-04, -1.4086e-03, -9.4108e-05,\n -1.9810e-03, 9.3351e-04, -2.7475e-03, -2.0864e-03, 6.9651e-04,\n 3.6452e-03, -2.2449e-03, -5.8442e-04, 2.1306e-03, 5.8208e-03,\n 1.6991e-04, -2.1017e-03, 3.0419e-03, 1.8699e-03, -1.8809e-03,\n -1.2653e-03, -5.3299e-03, -1.6545e-03, -1.0058e-03, 1.5983e-03,\n -2.4800e-03, -8.4721e-03, 1.9065e-26, 4.7373e-04, 2.0919e-03,\n -2.9753e-03, -4.0522e-04, 2.5252e-03, -2.8403e-03, -1.4599e-03,\n 1.3485e-03, -1.0358e-03, 3.9361e-41, 1.5485e-39, -5.7891e-03,\n -5.8865e-04, 8.3625e-04, 6.9007e-05, 3.3757e-03, 2.4274e-04,\n -6.1542e-05, 3.0383e-03, 2.3547e-03, 4.9550e-03, -2.0679e-03,\n 2.9972e-14, -2.8711e-03, -3.2705e-03, -3.5964e-03, 1.7757e-03,\n 6.0012e-03, 4.5402e-27, 2.1096e-03, 7.6903e-04, -4.8270e-04,\n -6.9613e-04, 1.0011e-03, 9.6959e-05, -3.0302e-03, 4.3707e-03,\n -2.7226e-03, -4.3493e-04, -1.7738e-03, 2.8508e-04, 2.4977e-04,\n -3.0230e-03, -2.4574e-03, 2.2031e-03, -2.3219e-03, 1.1483e-04,\n 4.2586e-03, 3.5752e-03, 5.6052e-45, 5.5265e-03, -5.1519e-03,\n 1.0887e-03, 7.3665e-04, 2.1852e-03, -1.6445e-03, -7.3967e-05,\n -7.4355e-04, 7.6051e-04, 3.4523e-03, 1.0644e-03, -1.1934e-03,\n -1.8700e-04, 3.7302e-14, -2.1369e-03, 5.6052e-45, -1.7855e-03,\n -5.3899e-03, 5.6500e-03, -6.1412e-05, -2.9054e-03, -3.7659e-04,\n -1.8160e-04, 1.9969e-31, -5.3695e-04, -4.6615e-04, 2.3254e-38,\n 5.6052e-45, 5.4223e-03, 4.4807e-03, -1.3030e-03, -1.2695e-03,\n -1.2296e-03, -9.0901e-04, 2.3960e-03, 1.6224e-04, -1.0780e-03,\n 2.8688e-03, 2.3945e-03, 2.9703e-03, 3.5277e-03, -2.4392e-03,\n -1.7388e-03, 1.9980e-03, 2.0871e-03, 5.6052e-45, 3.7856e-04,\n 2.2651e-03, 1.7563e-03, 5.6052e-45, 2.0318e-03, 1.3920e-03,\n 6.9956e-03, 1.8756e-03, -5.5313e-04, 2.4667e-03, 8.8846e-04,\n 1.1469e-03, 2.3866e-03, -1.7824e-03, 7.7836e-04, -5.0318e-04,\n -9.9463e-04, 9.0413e-04, 5.4385e-07, 6.3684e-04, 1.0693e-03,\n -1.9220e-03, 3.1149e-03, -5.8925e-04, -2.1141e-03, 1.9296e-03,\n 5.6052e-45, -3.9401e-03, 1.2427e-03, 5.6052e-45, -6.7204e-04,\n -1.1268e-05, 5.9482e-04, 1.4091e-03, -3.0139e-03, 4.3913e-04,\n -2.7723e-03, -2.4720e-03, -4.9926e-04, -2.7104e-03, -5.6586e-04,\n 7.8348e-04, 5.6052e-45, 1.3427e-03, -4.7691e-03, 5.6052e-45,\n -2.0714e-03, -5.4685e-04, -1.1198e-07, -7.7871e-05, -4.1724e-03,\n -2.1828e-03, -6.9466e-04, -2.3864e-04, 2.6827e-03, -2.0941e-03,\n 1.8311e-03, 2.7079e-03, -4.4246e-03, 3.3094e-03, 1.8410e-03,\n -6.3030e-03, 3.4858e-03, -1.9636e-03, 5.6052e-45, 1.9304e-05,\n -4.1517e-21, 2.4526e-03, 4.8325e-03, -6.6209e-04, 9.4050e-04,\n 3.5215e-42, 4.5619e-03, -6.1602e-03, -2.5304e-03, -9.4652e-04,\n 7.0431e-04, -7.8636e-24, 3.5365e-03, 5.6052e-45, 5.6052e-45,\n 9.4229e-04, 2.1341e-03, -6.8848e-03, 2.8542e-03, -2.0011e-04,\n 6.2322e-04, 5.4069e-04, 2.1369e-03, -2.7031e-03, 6.2608e-04,\n -7.1263e-04, -5.2130e-03, -7.6677e-04, 5.3319e-03, -7.3731e-41,\n -8.2319e-04, -8.3675e-04, 9.1410e-04, 5.6052e-45, 5.3688e-03,\n 4.4872e-03, -3.9738e-03, 5.5525e-04, -2.6904e-03, -6.7565e-04,\n 1.5406e-03, -2.9458e-03, 3.1132e-03, 5.6203e-04, -1.3210e-03,\n -2.1013e-03, -8.1524e-04, -3.7536e-03, -2.9657e-03, -2.6874e-03,\n 1.5260e-03, -3.0026e-03, 1.4654e-08, -1.1917e-03, 1.6394e-03,\n 5.6052e-45, -1.9517e-10, -3.2261e-03, 1.5158e-04, 2.0841e-03,\n -1.8274e-16, 1.8457e-29, -1.1565e-03, 4.7800e-04, 3.4919e-08,\n 4.5683e-18, -3.7613e-27, 5.6052e-45, -1.4436e-03, -2.0606e-03,\n -5.2525e-04, -2.9899e-03, 3.4768e-04, 1.1600e-03, 2.7800e-03,\n 2.1888e-03, -4.4828e-07, -7.0891e-04, -9.2954e-05, 1.1944e-03,\n 1.1122e-04, -5.0771e-03, 1.2030e-03, -5.6601e-04, 2.6567e-03,\n -2.5926e-03, 2.5320e-03, -4.3434e-04, -9.2896e-04, 6.3431e-03,\n 8.5656e-04, 5.1081e-03, -2.3216e-03, -5.3759e-03, 3.3953e-03,\n 4.0417e-03, -3.0275e-03, -6.7968e-05, -2.8031e-03, -2.8511e-04,\n 3.4441e-03, 2.6827e-03, -2.8124e-03, -4.7469e-14, 5.0620e-03,\n -1.5232e-03, -1.7148e-03, -5.0032e-03, 3.2487e-03, -1.8802e-03,\n 6.7241e-04, 1.3107e-03, -8.0724e-03, 2.5132e-03, 5.6052e-45,\n -9.5002e-04, 2.7097e-03, 2.6700e-03, 2.4434e-03, -1.6090e-21,\n 6.0059e-03, -1.0610e-03, -4.8448e-03, 7.5928e-04, 5.6052e-45,\n 5.9775e-25, 2.2420e-05, 3.3517e-04, 3.1995e-03, 2.1595e-04,\n 2.6273e-04, 4.7507e-03, 2.2225e-03, -2.0668e-08, 3.3253e-04,\n 1.1676e-03, -1.4700e-03, -3.4555e-03, -5.6052e-45, -3.1489e-03,\n -3.4126e-03, 4.0212e-03, 4.2094e-03, -1.0276e-02, 3.6612e-03,\n 1.8664e-03, 1.1503e-03, 4.9484e-03, 8.5730e-04, -3.0980e-06,\n 4.3121e-03, 1.4095e-03, -8.8099e-11, -1.1596e-03, -2.9243e-03,\n -4.0902e-03, -2.4074e-03, -3.5444e-03, 1.1548e-18, -4.3250e-08,\n 1.1259e-03, 7.5497e-04, 1.6669e-03, -4.1334e-03, -8.5967e-04,\n 6.6669e-04, -6.8106e-08, 1.7873e-03, 6.9239e-04, -3.4414e-03,\n 3.3455e-03, -6.4815e-04, -4.4830e-04, 6.2729e-04, 3.1767e-03,\n -1.1806e-03, 5.6052e-45, -6.1646e-05, -1.5390e-08, -3.8129e-03,\n -5.5182e-04, 4.8020e-03, 3.0543e-03, 2.6255e-04, -7.5502e-03,\n -3.9313e-03, -7.1502e-03, 4.7074e-03, 3.0065e-03, -1.9671e-03,\n -7.0393e-04, 1.2949e-21, 5.6052e-45, -5.1700e-03, -2.5810e-03,\n 2.2440e-14, 5.9939e-03, 4.5024e-04, -3.9611e-04, -6.0625e-04,\n 2.5357e-03, 4.8548e-03, 5.6052e-45, 1.3375e-05, 8.7432e-04,\n -4.6417e-04, -4.8118e-03, -1.0334e-05, -1.0974e-03, 7.7801e-04,\n 5.6052e-45, -1.3710e-03, -4.1969e-03, -8.9786e-04, 3.7286e-04,\n 2.2078e-03, 5.6052e-45, 4.4025e-04, 3.5078e-17, -2.3542e-04,\n -8.6203e-04, -2.0364e-03, -1.9391e-03, -1.9571e-03, -3.8523e-03,\n 8.1275e-44, 3.8030e-03, -2.9845e-03, 1.1340e-03, -3.8654e-03,\n -3.1780e-03, 3.0170e-03, -3.5518e-06, 1.2191e-03, 1.9990e-03,\n 1.3138e-03, -4.8024e-03, -5.5048e-03, 2.6232e-03, 2.9120e-03,\n 6.1065e-03, 1.9772e-14, -1.4609e-03, -1.3955e-03, -1.1309e-03,\n 4.4795e-03, 5.6052e-45, 1.8031e-03, 6.4633e-07, 2.0465e-03,\n 3.7773e-03, 1.9373e-03, 3.8224e-04, 4.0307e-03, -1.3818e-03],\n device='cuda:0')",
|
| 13 |
+
"exp_avg_sq": "tensor([1.0657e-04, 1.2498e-04, 5.4747e-07, 1.5065e-04, 1.6597e-06, 9.6464e-08,\n 5.7696e-05, 1.5453e-04, 8.2285e-05, 4.0288e-06, 8.5279e-05, 8.9790e-05,\n 1.7742e-04, 1.9223e-04, 9.8694e-05, 5.8505e-05, 1.1162e-04, 1.5988e-04,\n 7.5000e-05, 4.3486e-05, 4.3413e-05, 1.6213e-04, 1.3302e-04, 6.6957e-07,\n 3.2834e-05, 2.1819e-06, 2.3604e-04, 1.9225e-04, 1.1807e-04, 2.0260e-04,\n 2.3734e-05, 4.9176e-08, 9.9726e-05, 6.7809e-05, 6.4116e-05, 2.2739e-06,\n 1.6892e-04, 1.6830e-04, 1.0314e-04, 9.3657e-05, 2.0912e-04, 2.0061e-04,\n 1.3971e-04, 1.0896e-04, 5.0730e-05, 1.3257e-04, 3.1017e-05, 1.6003e-06,\n 1.0016e-04, 3.9827e-05, 1.0853e-04, 1.0274e-04, 6.8523e-05, 1.6013e-04,\n 1.1100e-05, 9.5912e-05, 1.9205e-04, 1.3528e-04, 6.9431e-05, 1.2905e-04,\n 5.8560e-06, 2.2769e-04, 7.7184e-05, 1.8214e-04, 7.5148e-06, 1.6535e-04,\n 8.7372e-05, 1.5874e-05, 1.4836e-04, 7.8219e-07, 1.4206e-04, 1.4345e-04,\n 1.3214e-04, 5.8393e-06, 1.1338e-04, 1.5193e-04, 1.2176e-04, 1.5996e-04,\n 1.0422e-04, 1.4050e-04, 1.3875e-04, 5.0348e-07, 4.1452e-06, 1.0420e-04,\n 1.2363e-04, 1.2190e-04, 2.5825e-06, 1.7398e-04, 8.3418e-05, 1.2805e-04,\n 1.5007e-04, 4.6880e-05, 5.2009e-05, 6.3075e-05, 1.1318e-04, 1.5523e-04,\n 1.2102e-04, 1.2687e-04, 9.0525e-05, 2.5028e-04, 2.9265e-04, 1.4653e-04,\n 1.1980e-04, 1.4750e-04, 1.6089e-05, 6.2279e-06, 2.0253e-04, 2.0641e-04,\n 1.4600e-04, 1.1342e-04, 6.9079e-05, 5.9040e-05, 3.0364e-08, 1.1472e-04,\n 1.2216e-04, 1.5451e-05, 6.0434e-06, 1.7606e-04, 8.8550e-05, 3.0746e-05,\n 1.1317e-04, 9.6538e-05, 1.8607e-04, 1.4595e-05, 1.1918e-04, 1.4209e-05,\n 9.3113e-05, 1.3750e-04, 5.7601e-05, 9.6174e-08, 2.1956e-04, 1.4674e-04,\n 8.0491e-05, 1.4195e-05, 9.8237e-05, 6.8813e-05, 1.6731e-04, 1.4326e-04,\n 4.1997e-05, 5.4175e-05, 1.2158e-06, 1.6454e-04, 4.0711e-06, 1.8878e-04,\n 1.2410e-04, 1.6002e-04, 1.0956e-04, 1.0676e-04, 4.3374e-06, 1.8985e-04,\n 1.1697e-04, 4.6518e-06, 2.0894e-06, 9.9115e-05, 1.6756e-04, 1.4277e-04,\n 4.4317e-06, 1.0657e-04, 1.2864e-06, 2.2122e-04, 9.5258e-05, 1.4995e-04,\n 1.2619e-04, 2.7058e-05, 1.1466e-04, 2.2609e-05, 1.9238e-06, 1.0774e-05,\n 1.0919e-04, 1.0577e-04, 6.1629e-05, 6.2394e-07, 2.7799e-09, 1.5646e-04,\n 1.5640e-04, 1.5983e-06, 2.5999e-06, 1.3227e-04, 3.0020e-04, 4.0478e-05,\n 5.8057e-05, 9.0989e-05, 1.3166e-04, 2.9814e-04, 6.8373e-05, 1.1930e-04,\n 3.9874e-05, 8.8335e-05, 7.1290e-05, 1.3050e-04, 1.5606e-04, 1.6142e-04,\n 9.1281e-05, 1.5517e-04, 8.5782e-05, 1.6078e-04, 9.9462e-05, 1.5166e-04,\n 1.0937e-04, 1.1834e-04, 9.8588e-05, 1.5297e-04, 1.3750e-04, 1.3850e-04,\n 1.0609e-04, 5.9584e-05, 9.6978e-05, 1.8988e-04, 1.3983e-04, 1.6517e-04,\n 2.0238e-04, 6.1500e-05, 1.2606e-04, 1.0112e-04, 5.8389e-05, 8.5489e-05,\n 1.2577e-04, 1.7108e-04, 1.3891e-04, 1.1265e-04, 1.1794e-04, 1.8124e-04,\n 5.4795e-07, 1.1601e-04, 7.7962e-05, 1.0644e-04, 4.2547e-05, 7.7368e-05,\n 1.8829e-04, 1.4142e-04, 2.5550e-04, 1.1878e-04, 6.3489e-06, 4.3365e-06,\n 1.6430e-04, 4.9562e-05, 1.3993e-04, 8.1268e-05, 1.2643e-04, 2.2004e-05,\n 1.9739e-04, 4.1638e-05, 1.5866e-04, 1.4759e-04, 1.3158e-04, 1.8179e-05,\n 1.6027e-04, 1.2549e-04, 1.9569e-04, 8.4398e-05, 1.4535e-04, 5.4644e-05,\n 1.6169e-04, 1.2208e-04, 1.7599e-04, 9.4468e-05, 6.0572e-05, 7.8003e-04,\n 2.2577e-04, 9.8741e-05, 1.0735e-04, 9.2178e-05, 1.5435e-04, 3.3711e-05,\n 1.4859e-04, 7.5763e-05, 1.4214e-04, 8.2884e-05, 1.2263e-04, 9.7052e-05,\n 1.5557e-04, 1.3137e-04, 8.1252e-07, 1.2670e-04, 1.5327e-04, 6.3749e-05,\n 1.5327e-04, 1.5620e-04, 1.3979e-04, 1.8585e-04, 1.3567e-04, 1.0034e-04,\n 1.1085e-04, 1.5390e-04, 1.5355e-04, 1.2933e-04, 5.3038e-07, 1.1110e-04,\n 1.1968e-06, 1.2746e-04, 1.4634e-04, 1.6784e-04, 1.0154e-04, 8.7271e-05,\n 3.6511e-05, 1.4328e-04, 2.6839e-06, 8.2360e-05, 1.6174e-04, 2.6423e-07,\n 8.0022e-06, 1.5647e-04, 1.4390e-05, 2.0712e-04, 1.1966e-04, 1.2116e-04,\n 8.2312e-05, 1.4790e-04, 1.3410e-04, 1.3199e-04, 1.5459e-04, 1.4375e-04,\n 1.0500e-04, 9.5798e-05, 2.0366e-04, 1.3458e-04, 9.5124e-05, 2.5052e-04,\n 6.5783e-06, 1.2063e-04, 1.5319e-04, 1.8916e-04, 9.1263e-07, 2.1179e-04,\n 2.5095e-06, 1.4195e-04, 1.1650e-04, 1.0627e-04, 9.2918e-05, 8.5886e-05,\n 1.5846e-04, 1.3672e-04, 1.3835e-05, 1.5123e-04, 1.0138e-04, 1.0761e-04,\n 1.5179e-04, 7.8470e-06, 9.2061e-05, 1.4345e-04, 1.5339e-04, 1.7330e-04,\n 1.6678e-04, 6.9284e-05, 1.3848e-04, 2.1464e-07, 1.3049e-04, 6.7242e-05,\n 1.7938e-06, 1.6084e-04, 8.2247e-06, 5.9421e-05, 1.0957e-04, 1.4101e-04,\n 1.6227e-05, 2.1253e-04, 9.9932e-05, 1.9896e-04, 1.6496e-04, 1.6180e-04,\n 7.5642e-05, 5.5413e-06, 1.3033e-04, 1.1072e-04, 3.9993e-07, 2.8074e-05,\n 5.0329e-05, 4.8088e-06, 1.1054e-04, 5.9186e-05, 1.1593e-04, 1.0599e-04,\n 7.6909e-05, 8.4948e-05, 1.6006e-04, 7.4657e-05, 1.6763e-04, 1.5340e-04,\n 1.9831e-04, 9.3780e-05, 1.2386e-04, 1.0620e-04, 1.1448e-04, 6.1126e-06,\n 1.0337e-04, 1.7338e-05, 1.0805e-04, 2.1856e-04, 6.0433e-05, 1.0851e-04,\n 3.6828e-07, 1.6744e-04, 1.1827e-04, 3.1246e-05, 1.0913e-04, 1.0361e-04,\n 2.3393e-06, 2.0273e-05, 5.3356e-06, 6.7816e-06, 9.0499e-05, 2.4462e-04,\n 1.3379e-04, 1.6504e-04, 6.7356e-06, 8.5502e-05, 1.3591e-04, 1.6342e-04,\n 5.8167e-05, 6.7462e-05, 1.0389e-04, 1.6985e-04, 7.0559e-05, 1.4013e-04,\n 1.8478e-06, 7.2669e-05, 1.5968e-05, 1.0009e-04, 1.3229e-05, 1.2551e-04,\n 9.0110e-05, 1.3758e-04, 2.1852e-05, 1.2398e-04, 6.1984e-05, 2.1324e-04,\n 1.3438e-04, 1.3156e-04, 1.7449e-04, 1.1232e-04, 1.1211e-04, 1.0872e-04,\n 8.0641e-05, 1.4451e-04, 1.3943e-04, 9.0162e-05, 1.2867e-04, 1.2411e-06,\n 1.3691e-04, 3.0526e-05, 2.6990e-07, 1.5792e-06, 1.3751e-04, 1.6311e-04,\n 1.4706e-04, 2.3367e-06, 1.0337e-05, 1.2572e-04, 1.2500e-04, 5.0513e-06,\n 3.9280e-06, 3.4529e-06, 8.1961e-07, 1.1639e-04, 1.0618e-04, 1.3924e-04,\n 1.3476e-04, 6.0390e-05, 5.5459e-05, 1.3877e-04, 1.9087e-04, 4.5239e-06,\n 5.6055e-05, 6.4027e-05, 2.2887e-04, 2.1422e-04, 1.7405e-04, 1.7171e-04,\n 1.3046e-04, 7.8992e-05, 2.0561e-04, 5.6756e-05, 1.6212e-04, 6.1590e-05,\n 1.1942e-04, 4.8147e-05, 1.0807e-04, 1.5405e-04, 8.5730e-05, 1.3142e-04,\n 1.4351e-04, 1.1481e-04, 1.1602e-04, 1.3749e-04, 1.5277e-04, 1.6102e-04,\n 4.9004e-05, 7.3917e-05, 1.4746e-06, 2.4094e-04, 2.0498e-04, 7.0090e-05,\n 1.1606e-04, 1.5974e-04, 1.1282e-04, 1.7416e-04, 1.2365e-04, 1.8325e-04,\n 1.6413e-04, 1.0075e-05, 1.5340e-04, 1.7198e-04, 3.7495e-05, 1.0890e-04,\n 3.2546e-06, 9.5192e-05, 1.6514e-04, 1.4632e-04, 1.3682e-04, 1.1080e-06,\n 6.8253e-07, 6.4819e-05, 1.0894e-05, 1.6991e-04, 5.8200e-05, 6.9377e-05,\n 1.9031e-04, 1.1263e-04, 6.5790e-06, 1.9370e-05, 1.2348e-04, 1.6868e-04,\n 6.1779e-05, 1.9711e-06, 2.4460e-04, 1.2958e-04, 1.2875e-04, 1.5574e-04,\n 1.2438e-04, 1.2424e-04, 1.8478e-04, 1.1652e-04, 1.3442e-04, 1.7498e-04,\n 1.3293e-04, 1.0791e-04, 1.1685e-04, 4.7575e-07, 7.0879e-05, 9.9483e-05,\n 9.2598e-05, 1.3999e-04, 1.3390e-04, 1.5507e-06, 2.2021e-06, 1.2262e-04,\n 3.5282e-05, 3.9577e-05, 1.8776e-04, 2.3607e-05, 1.2914e-04, 9.5425e-06,\n 8.8458e-05, 5.4793e-05, 1.7688e-04, 1.4917e-04, 7.1929e-05, 1.2966e-04,\n 1.1531e-04, 1.4657e-04, 1.4532e-04, 7.1435e-07, 9.3775e-05, 3.5738e-06,\n 1.2525e-04, 9.9462e-05, 1.2356e-04, 1.1832e-04, 1.1851e-04, 1.1907e-04,\n 9.6039e-05, 9.3672e-05, 1.2571e-04, 9.0461e-05, 4.6265e-05, 8.1472e-05,\n 2.8538e-06, 3.9573e-06, 1.8178e-04, 8.5273e-05, 1.1549e-06, 1.1301e-04,\n 9.9930e-06, 2.6556e-04, 9.0106e-05, 1.0476e-04, 1.3779e-04, 4.4612e-07,\n 1.9354e-07, 9.2418e-05, 7.6904e-05, 5.2415e-05, 3.1193e-06, 1.1338e-04,\n 1.2712e-04, 6.7011e-07, 9.0553e-05, 1.9501e-04, 2.1779e-04, 1.0918e-04,\n 7.6809e-05, 5.6993e-07, 1.2559e-04, 1.8947e-07, 2.4857e-05, 1.9147e-04,\n 1.6334e-04, 9.5103e-05, 1.7345e-04, 1.0121e-04, 1.6039e-07, 9.0950e-05,\n 1.2939e-04, 2.5690e-05, 1.5040e-04, 1.4694e-04, 1.0786e-04, 1.0812e-04,\n 1.0765e-04, 1.0414e-04, 1.2522e-04, 1.6548e-04, 1.1310e-04, 4.7277e-05,\n 1.2718e-04, 1.6998e-04, 2.2692e-06, 5.2793e-05, 1.5023e-04, 3.3612e-05,\n 1.9915e-04, 9.5176e-07, 1.2857e-04, 5.6581e-06, 5.1162e-05, 1.4187e-04,\n 7.1108e-06, 1.2574e-04, 9.9547e-05, 8.2938e-05], device='cuda:0')"
|
| 14 |
+
},
|
| 15 |
+
"2": {
|
| 16 |
+
"step": "tensor(1252.)",
|
| 17 |
+
"exp_avg": "tensor([[-2.4572e-05, -1.9201e-06, 5.6052e-45, ..., -2.0221e-05,\n -8.0042e-06, 1.2930e-05],\n [ 8.2135e-05, 5.8937e-05, -5.6052e-45, ..., -2.3740e-05,\n 7.0554e-05, -2.9479e-05],\n [ 3.2157e-05, -2.7470e-05, -5.6052e-45, ..., -2.2250e-05,\n 7.4611e-05, 1.1439e-05],\n ...,\n [ 1.5776e-05, 1.4394e-05, -5.6052e-45, ..., -1.4606e-05,\n 1.9324e-05, 1.3889e-04],\n [ 4.4814e-05, 1.0931e-05, -5.6052e-45, ..., -5.8624e-05,\n 1.0031e-04, 1.1417e-04],\n [-4.9805e-05, -2.2688e-05, 5.6052e-45, ..., 1.1320e-05,\n 8.2752e-05, 5.0785e-05]], device='cuda:0')",
|
| 18 |
+
"exp_avg_sq": "tensor([[6.1916e-08, 4.0039e-08, 2.8288e-12, ..., 7.2189e-08, 3.4673e-08,\n 3.0847e-08],\n [9.9022e-08, 8.6026e-08, 2.4207e-12, ..., 9.1891e-08, 6.3491e-08,\n 6.3855e-08],\n [6.0957e-08, 4.8856e-08, 1.0245e-11, ..., 6.7222e-08, 6.9665e-08,\n 5.9740e-08],\n ...,\n [1.0218e-07, 7.0578e-08, 3.4236e-11, ..., 1.5361e-07, 6.8790e-08,\n 7.8031e-08],\n [9.7974e-08, 7.2477e-08, 4.8175e-12, ..., 1.2711e-07, 6.6718e-08,\n 9.0619e-08],\n [1.0754e-07, 8.6475e-08, 8.5844e-13, ..., 1.4465e-07, 1.1352e-07,\n 8.2872e-08]], device='cuda:0')"
|
| 19 |
+
}
|
| 20 |
+
},
|
| 21 |
+
"param_groups": [
|
| 22 |
+
{
|
| 23 |
+
"lr": 0.00975530705321762,
|
| 24 |
+
"name": "scale_256",
|
| 25 |
+
"betas": [
|
| 26 |
+
0.9,
|
| 27 |
+
0.999
|
| 28 |
+
],
|
| 29 |
+
"eps": 1e-08,
|
| 30 |
+
"weight_decay": 1e-05,
|
| 31 |
+
"amsgrad": false,
|
| 32 |
+
"maximize": false,
|
| 33 |
+
"foreach": null,
|
| 34 |
+
"capturable": false,
|
| 35 |
+
"differentiable": false,
|
| 36 |
+
"fused": null,
|
| 37 |
+
"decoupled_weight_decay": true,
|
| 38 |
+
"initial_lr": 0.01,
|
| 39 |
+
"params": [
|
| 40 |
+
0,
|
| 41 |
+
1,
|
| 42 |
+
2
|
| 43 |
+
]
|
| 44 |
+
},
|
| 45 |
+
{
|
| 46 |
+
"lr": 0.00975530705321762,
|
| 47 |
+
"name": "scale_512",
|
| 48 |
+
"betas": [
|
| 49 |
+
0.9,
|
| 50 |
+
0.999
|
| 51 |
+
],
|
| 52 |
+
"eps": 1e-08,
|
| 53 |
+
"weight_decay": 1e-05,
|
| 54 |
+
"amsgrad": false,
|
| 55 |
+
"maximize": false,
|
| 56 |
+
"foreach": null,
|
| 57 |
+
"capturable": false,
|
| 58 |
+
"differentiable": false,
|
| 59 |
+
"fused": null,
|
| 60 |
+
"decoupled_weight_decay": true,
|
| 61 |
+
"initial_lr": 0.01,
|
| 62 |
+
"params": [
|
| 63 |
+
3,
|
| 64 |
+
4,
|
| 65 |
+
5
|
| 66 |
+
]
|
| 67 |
+
},
|
| 68 |
+
{
|
| 69 |
+
"lr": 0.00975530705321762,
|
| 70 |
+
"name": "scale_768",
|
| 71 |
+
"betas": [
|
| 72 |
+
0.9,
|
| 73 |
+
0.999
|
| 74 |
+
],
|
| 75 |
+
"eps": 1e-08,
|
| 76 |
+
"weight_decay": 1e-05,
|
| 77 |
+
"amsgrad": false,
|
| 78 |
+
"maximize": false,
|
| 79 |
+
"foreach": null,
|
| 80 |
+
"capturable": false,
|
| 81 |
+
"differentiable": false,
|
| 82 |
+
"fused": null,
|
| 83 |
+
"decoupled_weight_decay": true,
|
| 84 |
+
"initial_lr": 0.01,
|
| 85 |
+
"params": [
|
| 86 |
+
6,
|
| 87 |
+
7,
|
| 88 |
+
8
|
| 89 |
+
]
|
| 90 |
+
},
|
| 91 |
+
{
|
| 92 |
+
"lr": 0.00975530705321762,
|
| 93 |
+
"name": "scale_1024",
|
| 94 |
+
"betas": [
|
| 95 |
+
0.9,
|
| 96 |
+
0.999
|
| 97 |
+
],
|
| 98 |
+
"eps": 1e-08,
|
| 99 |
+
"weight_decay": 1e-05,
|
| 100 |
+
"amsgrad": false,
|
| 101 |
+
"maximize": false,
|
| 102 |
+
"foreach": null,
|
| 103 |
+
"capturable": false,
|
| 104 |
+
"differentiable": false,
|
| 105 |
+
"fused": null,
|
| 106 |
+
"decoupled_weight_decay": true,
|
| 107 |
+
"initial_lr": 0.01,
|
| 108 |
+
"params": [
|
| 109 |
+
9,
|
| 110 |
+
10,
|
| 111 |
+
11
|
| 112 |
+
]
|
| 113 |
+
},
|
| 114 |
+
{
|
| 115 |
+
"lr": 0.00975530705321762,
|
| 116 |
+
"name": "scale_1280",
|
| 117 |
+
"betas": [
|
| 118 |
+
0.9,
|
| 119 |
+
0.999
|
| 120 |
+
],
|
| 121 |
+
"eps": 1e-08,
|
| 122 |
+
"weight_decay": 1e-05,
|
| 123 |
+
"amsgrad": false,
|
| 124 |
+
"maximize": false,
|
| 125 |
+
"foreach": null,
|
| 126 |
+
"capturable": false,
|
| 127 |
+
"differentiable": false,
|
| 128 |
+
"fused": null,
|
| 129 |
+
"decoupled_weight_decay": true,
|
| 130 |
+
"initial_lr": 0.01,
|
| 131 |
+
"params": [
|
| 132 |
+
12,
|
| 133 |
+
13,
|
| 134 |
+
14
|
| 135 |
+
]
|
| 136 |
+
},
|
| 137 |
+
{
|
| 138 |
+
"lr": 0.004877665762479736,
|
| 139 |
+
"name": "fusion",
|
| 140 |
+
"betas": [
|
| 141 |
+
0.9,
|
| 142 |
+
0.999
|
| 143 |
+
],
|
| 144 |
+
"eps": 1e-08,
|
| 145 |
+
"weight_decay": 1e-05,
|
| 146 |
+
"amsgrad": false,
|
| 147 |
+
"maximize": false,
|
| 148 |
+
"foreach": null,
|
| 149 |
+
"capturable": false,
|
| 150 |
+
"differentiable": false,
|
| 151 |
+
"fused": null,
|
| 152 |
+
"decoupled_weight_decay": true,
|
| 153 |
+
"initial_lr": 0.005,
|
| 154 |
+
"params": [
|
| 155 |
+
15,
|
| 156 |
+
16,
|
| 157 |
+
17,
|
| 158 |
+
18,
|
| 159 |
+
19,
|
| 160 |
+
20,
|
| 161 |
+
21,
|
| 162 |
+
22,
|
| 163 |
+
23,
|
| 164 |
+
24,
|
| 165 |
+
25,
|
| 166 |
+
26,
|
| 167 |
+
27,
|
| 168 |
+
28,
|
| 169 |
+
29,
|
| 170 |
+
30,
|
| 171 |
+
31,
|
| 172 |
+
32,
|
| 173 |
+
33,
|
| 174 |
+
34,
|
| 175 |
+
35,
|
| 176 |
+
36,
|
| 177 |
+
37,
|
| 178 |
+
38,
|
| 179 |
+
39,
|
| 180 |
+
40,
|
| 181 |
+
41,
|
| 182 |
+
42,
|
| 183 |
+
43,
|
| 184 |
+
44,
|
| 185 |
+
45,
|
| 186 |
+
46,
|
| 187 |
+
47,
|
| 188 |
+
48
|
| 189 |
+
]
|
| 190 |
+
}
|
| 191 |
+
]
|
| 192 |
+
},
|
| 193 |
+
"scheduler_state_dict": {
|
| 194 |
+
"T_0": 10,
|
| 195 |
+
"T_i": 10,
|
| 196 |
+
"T_mult": 2,
|
| 197 |
+
"eta_min": 1e-06,
|
| 198 |
+
"T_cur": 1,
|
| 199 |
+
"base_lrs": [
|
| 200 |
+
0.01,
|
| 201 |
+
0.01,
|
| 202 |
+
0.01,
|
| 203 |
+
0.01,
|
| 204 |
+
0.01,
|
| 205 |
+
0.005
|
| 206 |
+
],
|
| 207 |
+
"last_epoch": 1,
|
| 208 |
+
"_step_count": 0,
|
| 209 |
+
"_is_initial": false,
|
| 210 |
+
"_get_lr_called_within_step": false,
|
| 211 |
+
"_last_lr": [
|
| 212 |
+
0.00975530705321762,
|
| 213 |
+
0.00975530705321762,
|
| 214 |
+
0.00975530705321762,
|
| 215 |
+
0.00975530705321762,
|
| 216 |
+
0.00975530705321762,
|
| 217 |
+
0.004877665762479736
|
| 218 |
+
]
|
| 219 |
+
},
|
| 220 |
+
"metrics": {
|
| 221 |
+
"best_val_acc": 71.754,
|
| 222 |
+
"best_epoch": 0,
|
| 223 |
+
"scale_accuracies": {
|
| 224 |
+
"256": 71.754
|
| 225 |
+
}
|
| 226 |
+
},
|
| 227 |
+
"train_config": {
|
| 228 |
+
"name": "david_training",
|
| 229 |
+
"run_id": "20251012_031919",
|
| 230 |
+
"dataset_name": "AbstractPhil/imagenet-clip-features-orderly",
|
| 231 |
+
"model_variant": "clip_vit_b16",
|
| 232 |
+
"num_classes": 1000,
|
| 233 |
+
"preset": "high_accuracy",
|
| 234 |
+
"custom_config_path": null,
|
| 235 |
+
"num_classes_override": null,
|
| 236 |
+
"use_belly_override": null,
|
| 237 |
+
"belly_expand_override": null,
|
| 238 |
+
"progressive_training_override": true,
|
| 239 |
+
"num_epochs": 20,
|
| 240 |
+
"batch_size": 1024,
|
| 241 |
+
"learning_rate": 0.01,
|
| 242 |
+
"weight_decay": 1e-05,
|
| 243 |
+
"warmup_epochs": 3,
|
| 244 |
+
"use_rose_loss": true,
|
| 245 |
+
"rose_initial_weight": 0.1,
|
| 246 |
+
"rose_max_weight": 0.5,
|
| 247 |
+
"rose_weight_schedule": "adaptive",
|
| 248 |
+
"use_cayley_loss": false,
|
| 249 |
+
"cayley_weight": 0.001,
|
| 250 |
+
"scale_loss_balance": null,
|
| 251 |
+
"use_mixed_precision": false,
|
| 252 |
+
"gradient_clip": 5.0,
|
| 253 |
+
"scheduler_type": "cosine_restarts",
|
| 254 |
+
"min_lr": 1e-06,
|
| 255 |
+
"freeze_strategy": "performance",
|
| 256 |
+
"freeze_threshold": 70.0,
|
| 257 |
+
"unfreeze_on_plateau": true,
|
| 258 |
+
"patience": 10,
|
| 259 |
+
"track_gradients": true,
|
| 260 |
+
"gradient_scale_threshold": 1e-07,
|
| 261 |
+
"gradient_scale_multiplier": 5.0,
|
| 262 |
+
"log_interval": 50,
|
| 263 |
+
"val_interval": 1,
|
| 264 |
+
"save_interval": 5,
|
| 265 |
+
"log_fusion_weights": true,
|
| 266 |
+
"log_loss_components": true,
|
| 267 |
+
"save_format": "safetensors",
|
| 268 |
+
"hf_repo": "AbstractPhil/gated-david",
|
| 269 |
+
"upload_to_hub": true,
|
| 270 |
+
"base_dir": "./david_training",
|
| 271 |
+
"num_workers": 10,
|
| 272 |
+
"pin_memory": true,
|
| 273 |
+
"prefetch_factor": 4,
|
| 274 |
+
"persistent_workers": true
|
| 275 |
+
}
|
| 276 |
+
}
|
weights/david_config.json
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"name": "david_high_accuracy",
|
| 3 |
+
"uid": "c.david.high_accuracy",
|
| 4 |
+
"feature_dim": 512,
|
| 5 |
+
"num_classes": 1000,
|
| 6 |
+
"scales": [
|
| 7 |
+
256,
|
| 8 |
+
512,
|
| 9 |
+
768,
|
| 10 |
+
1024,
|
| 11 |
+
1280
|
| 12 |
+
],
|
| 13 |
+
"sharing_mode": "decoupled",
|
| 14 |
+
"fusion_mode": "deep_efficiency",
|
| 15 |
+
"use_belly": true,
|
| 16 |
+
"belly_expand": 2.5,
|
| 17 |
+
"shared_feature_dim": 768,
|
| 18 |
+
"shared_layers": 2,
|
| 19 |
+
"shared_dropout": 0.1,
|
| 20 |
+
"fusion_temperature": 1.0,
|
| 21 |
+
"fusion_dropout": 0.1,
|
| 22 |
+
"tree_depth": 3,
|
| 23 |
+
"num_experts": 5,
|
| 24 |
+
"compression_ratio": 2,
|
| 25 |
+
"expert_dropout": 0.1,
|
| 26 |
+
"attention_dropout": 0.1,
|
| 27 |
+
"progressive_training": true,
|
| 28 |
+
"scale_warmup_epochs": {
|
| 29 |
+
"256": 0,
|
| 30 |
+
"512": 3,
|
| 31 |
+
"768": 6,
|
| 32 |
+
"1024": 9,
|
| 33 |
+
"1280": 12
|
| 34 |
+
}
|
| 35 |
+
}
|
weights/train_config.json
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"name": "david_training",
|
| 3 |
+
"run_id": "20251012_031919",
|
| 4 |
+
"dataset_name": "AbstractPhil/imagenet-clip-features-orderly",
|
| 5 |
+
"model_variant": "clip_vit_b16",
|
| 6 |
+
"num_classes": 1000,
|
| 7 |
+
"preset": "high_accuracy",
|
| 8 |
+
"custom_config_path": null,
|
| 9 |
+
"num_classes_override": null,
|
| 10 |
+
"use_belly_override": null,
|
| 11 |
+
"belly_expand_override": null,
|
| 12 |
+
"progressive_training_override": true,
|
| 13 |
+
"num_epochs": 20,
|
| 14 |
+
"batch_size": 1024,
|
| 15 |
+
"learning_rate": 0.01,
|
| 16 |
+
"weight_decay": 1e-05,
|
| 17 |
+
"warmup_epochs": 3,
|
| 18 |
+
"use_rose_loss": true,
|
| 19 |
+
"rose_initial_weight": 0.1,
|
| 20 |
+
"rose_max_weight": 0.5,
|
| 21 |
+
"rose_weight_schedule": "adaptive",
|
| 22 |
+
"use_cayley_loss": false,
|
| 23 |
+
"cayley_weight": 0.001,
|
| 24 |
+
"scale_loss_balance": null,
|
| 25 |
+
"use_mixed_precision": false,
|
| 26 |
+
"gradient_clip": 5.0,
|
| 27 |
+
"scheduler_type": "cosine_restarts",
|
| 28 |
+
"min_lr": 1e-06,
|
| 29 |
+
"freeze_strategy": "performance",
|
| 30 |
+
"freeze_threshold": 70.0,
|
| 31 |
+
"unfreeze_on_plateau": true,
|
| 32 |
+
"patience": 10,
|
| 33 |
+
"track_gradients": true,
|
| 34 |
+
"gradient_scale_threshold": 1e-07,
|
| 35 |
+
"gradient_scale_multiplier": 5.0,
|
| 36 |
+
"log_interval": 50,
|
| 37 |
+
"val_interval": 1,
|
| 38 |
+
"save_interval": 5,
|
| 39 |
+
"log_fusion_weights": true,
|
| 40 |
+
"log_loss_components": true,
|
| 41 |
+
"save_format": "safetensors",
|
| 42 |
+
"hf_repo": "AbstractPhil/gated-david",
|
| 43 |
+
"upload_to_hub": true,
|
| 44 |
+
"base_dir": "./david_training",
|
| 45 |
+
"num_workers": 10,
|
| 46 |
+
"pin_memory": true,
|
| 47 |
+
"prefetch_factor": 4,
|
| 48 |
+
"persistent_workers": true
|
| 49 |
+
}
|