AIST-87M / aist87m_memory_slice_release_report.json
gcoderw's picture
Remove ES-AIST comparison from AIST-87M card
9c1b25e verified
{
"tasks": [
"SprintDuplicateQuestions",
"STSBenchmark",
"Flickr30kT2IRetrieval",
"Flickr30kI2TRetrieval",
"CommonVoiceMini21T2ARetrieval",
"MACST2ARetrieval",
"UrbanSound8KT2ARetrieval",
"ClothoT2ARetrieval"
],
"primary_metric_policy": {
"text": "main_score",
"image_text_retrieval": "ndcg_at_10",
"audio_text_retrieval": "ndcg_at_10"
},
"runs": [
{
"label": "AIST-87M 1280",
"dimension": 1280,
"results_dir": "/shared/augmem/triembed/results/aist81m_raw1280_mn20_merged_teacher_20260503T0125Z_memory_slice_default/dim1280/results/triembed__te-1280d/best_model",
"completed_tasks": 8,
"missing_tasks": [],
"overall_mean": 0.3491338344017094,
"family_means": {
"Audio recall": 0.1041809188034188,
"Image recall": 0.424995,
"Text continuity": 0.7631785
},
"rows": [
{
"label": "AIST-87M 1280",
"dimension": 1280,
"task": "SprintDuplicateQuestions",
"family": "Text continuity",
"primary_metric": "main_score",
"primary": 0.875145,
"metrics": {
"main_score": 0.875145
},
"subsets": 1
},
{
"label": "AIST-87M 1280",
"dimension": 1280,
"task": "STSBenchmark",
"family": "Text continuity",
"primary_metric": "main_score",
"primary": 0.651212,
"metrics": {
"main_score": 0.651212,
"cosine_spearman": 0.651212,
"spearman": 0.651212
},
"subsets": 1
},
{
"label": "AIST-87M 1280",
"dimension": 1280,
"task": "Flickr30kT2IRetrieval",
"family": "Image recall",
"primary_metric": "ndcg_at_10",
"primary": 0.4685,
"metrics": {
"main_score": 0.4685,
"ndcg_at_10": 0.4685,
"recall_at_1": 0.2956,
"recall_at_10": 0.6718,
"mrr_at_10": 0.405197
},
"subsets": 1
},
{
"label": "AIST-87M 1280",
"dimension": 1280,
"task": "Flickr30kI2TRetrieval",
"family": "Image recall",
"primary_metric": "ndcg_at_10",
"primary": 0.38149,
"metrics": {
"main_score": 0.38149,
"ndcg_at_10": 0.38149,
"recall_at_1": 0.0816,
"recall_at_10": 0.4072,
"mrr_at_10": 0.533862
},
"subsets": 1
},
{
"label": "AIST-87M 1280",
"dimension": 1280,
"task": "CommonVoiceMini21T2ARetrieval",
"family": "Audio recall",
"primary_metric": "ndcg_at_10",
"primary": 0.028403675213675213,
"metrics": {
"main_score": 0.03276290598290598,
"ndcg_at_10": 0.028403675213675213,
"recall_at_1": 0.005908376068376069,
"recall_at_10": 0.061962393162393166,
"mrr_at_10": 0.01842434188034188
},
"subsets": 117
},
{
"label": "AIST-87M 1280",
"dimension": 1280,
"task": "MACST2ARetrieval",
"family": "Audio recall",
"primary_metric": "ndcg_at_10",
"primary": 0.11037,
"metrics": {
"main_score": 0.13995,
"ndcg_at_10": 0.11037,
"recall_at_1": 0.03308,
"recall_at_10": 0.21374,
"mrr_at_10": 0.079078
},
"subsets": 1
},
{
"label": "AIST-87M 1280",
"dimension": 1280,
"task": "UrbanSound8KT2ARetrieval",
"family": "Audio recall",
"primary_metric": "ndcg_at_10",
"primary": 0.00851,
"metrics": {
"main_score": 0.00963,
"ndcg_at_10": 0.00851,
"recall_at_1": 0.00196,
"recall_at_10": 0.01847,
"mrr_at_10": 0.00556
},
"subsets": 1
},
{
"label": "AIST-87M 1280",
"dimension": 1280,
"task": "ClothoT2ARetrieval",
"family": "Audio recall",
"primary_metric": "ndcg_at_10",
"primary": 0.26944,
"metrics": {
"main_score": 0.3325,
"ndcg_at_10": 0.26944,
"recall_at_1": 0.1282,
"recall_at_10": 0.44315,
"mrr_at_10": 0.215861
},
"subsets": 1
}
],
"source_result_dirs": [
"/shared/augmem/triembed/results/aist81m_raw1280_mn20_merged_teacher_20260503T0125Z_memory_slice_default/dim1280/results/triembed__te-1280d/best_model"
]
},
{
"label": "AIST-87M 768",
"dimension": 768,
"results_dir": "/shared/augmem/triembed/results/aist81m_raw1280_mn20_merged_teacher_20260503T0125Z_memory_slice_default/dim768/results/triembed__te-768d/best_model",
"completed_tasks": 8,
"missing_tasks": [],
"overall_mean": 0.34871195512820513,
"family_means": {
"Audio recall": 0.10426891025641025,
"Image recall": 0.423815,
"Text continuity": 0.7624949999999999
},
"rows": [
{
"label": "AIST-87M 768",
"dimension": 768,
"task": "SprintDuplicateQuestions",
"family": "Text continuity",
"primary_metric": "main_score",
"primary": 0.874231,
"metrics": {
"main_score": 0.874231
},
"subsets": 1
},
{
"label": "AIST-87M 768",
"dimension": 768,
"task": "STSBenchmark",
"family": "Text continuity",
"primary_metric": "main_score",
"primary": 0.650759,
"metrics": {
"main_score": 0.650759,
"cosine_spearman": 0.650759,
"spearman": 0.650759
},
"subsets": 1
},
{
"label": "AIST-87M 768",
"dimension": 768,
"task": "Flickr30kT2IRetrieval",
"family": "Image recall",
"primary_metric": "ndcg_at_10",
"primary": 0.46701,
"metrics": {
"main_score": 0.46701,
"ndcg_at_10": 0.46701,
"recall_at_1": 0.2922,
"recall_at_10": 0.6712,
"mrr_at_10": 0.403385
},
"subsets": 1
},
{
"label": "AIST-87M 768",
"dimension": 768,
"task": "Flickr30kI2TRetrieval",
"family": "Image recall",
"primary_metric": "ndcg_at_10",
"primary": 0.38062,
"metrics": {
"main_score": 0.38062,
"ndcg_at_10": 0.38062,
"recall_at_1": 0.0814,
"recall_at_10": 0.4058,
"mrr_at_10": 0.532687
},
"subsets": 1
},
{
"label": "AIST-87M 768",
"dimension": 768,
"task": "CommonVoiceMini21T2ARetrieval",
"family": "Audio recall",
"primary_metric": "ndcg_at_10",
"primary": 0.028395641025641027,
"metrics": {
"main_score": 0.03299991452991453,
"ndcg_at_10": 0.028395641025641027,
"recall_at_1": 0.005907350427350427,
"recall_at_10": 0.062035897435897436,
"mrr_at_10": 0.01839460683760684
},
"subsets": 117
},
{
"label": "AIST-87M 768",
"dimension": 768,
"task": "MACST2ARetrieval",
"family": "Audio recall",
"primary_metric": "ndcg_at_10",
"primary": 0.11149,
"metrics": {
"main_score": 0.14249,
"ndcg_at_10": 0.11149,
"recall_at_1": 0.03308,
"recall_at_10": 0.21628,
"mrr_at_10": 0.079723
},
"subsets": 1
},
{
"label": "AIST-87M 768",
"dimension": 768,
"task": "UrbanSound8KT2ARetrieval",
"family": "Audio recall",
"primary_metric": "ndcg_at_10",
"primary": 0.00851,
"metrics": {
"main_score": 0.00963,
"ndcg_at_10": 0.00851,
"recall_at_1": 0.00196,
"recall_at_10": 0.01847,
"mrr_at_10": 0.005562
},
"subsets": 1
},
{
"label": "AIST-87M 768",
"dimension": 768,
"task": "ClothoT2ARetrieval",
"family": "Audio recall",
"primary_metric": "ndcg_at_10",
"primary": 0.26868,
"metrics": {
"main_score": 0.33178,
"ndcg_at_10": 0.26868,
"recall_at_1": 0.12695,
"recall_at_10": 0.44208,
"mrr_at_10": 0.21516
},
"subsets": 1
}
],
"source_result_dirs": [
"/shared/augmem/triembed/results/aist81m_raw1280_mn20_merged_teacher_20260503T0125Z_memory_slice_default/dim768/results/triembed__te-768d/best_model",
"/shared/augmem/triembed/results/aist81m_raw1280_mn20_merged_teacher_20260503T0125Z_memory_slice_dim768_fill/dim768/results/triembed__te-768d/best_model"
]
},
{
"label": "AIST-87M 512",
"dimension": 512,
"results_dir": "/shared/augmem/triembed/results/aist81m_raw1280_mn20_merged_teacher_20260503T0125Z_memory_slice_dim512/dim512/results/triembed__te-512d/best_model",
"completed_tasks": 8,
"missing_tasks": [],
"overall_mean": 0.3488224732905983,
"family_means": {
"Audio recall": 0.10438869658119658,
"Image recall": 0.42417499999999997,
"Text continuity": 0.7623375
},
"rows": [
{
"label": "AIST-87M 512",
"dimension": 512,
"task": "SprintDuplicateQuestions",
"family": "Text continuity",
"primary_metric": "main_score",
"primary": 0.873508,
"metrics": {
"main_score": 0.873508
},
"subsets": 1
},
{
"label": "AIST-87M 512",
"dimension": 512,
"task": "STSBenchmark",
"family": "Text continuity",
"primary_metric": "main_score",
"primary": 0.651167,
"metrics": {
"main_score": 0.651167,
"cosine_spearman": 0.651167,
"spearman": 0.651167
},
"subsets": 1
},
{
"label": "AIST-87M 512",
"dimension": 512,
"task": "Flickr30kT2IRetrieval",
"family": "Image recall",
"primary_metric": "ndcg_at_10",
"primary": 0.4676,
"metrics": {
"main_score": 0.4676,
"ndcg_at_10": 0.4676,
"recall_at_1": 0.2954,
"recall_at_10": 0.6702,
"mrr_at_10": 0.404515
},
"subsets": 1
},
{
"label": "AIST-87M 512",
"dimension": 512,
"task": "Flickr30kI2TRetrieval",
"family": "Image recall",
"primary_metric": "ndcg_at_10",
"primary": 0.38075,
"metrics": {
"main_score": 0.38075,
"ndcg_at_10": 0.38075,
"recall_at_1": 0.0824,
"recall_at_10": 0.4052,
"mrr_at_10": 0.535146
},
"subsets": 1
},
{
"label": "AIST-87M 512",
"dimension": 512,
"task": "CommonVoiceMini21T2ARetrieval",
"family": "Audio recall",
"primary_metric": "ndcg_at_10",
"primary": 0.028264786324786326,
"metrics": {
"main_score": 0.03229504273504274,
"ndcg_at_10": 0.028264786324786326,
"recall_at_1": 0.006467948717948718,
"recall_at_10": 0.060837521367521366,
"mrr_at_10": 0.018573598290598292
},
"subsets": 117
},
{
"label": "AIST-87M 512",
"dimension": 512,
"task": "MACST2ARetrieval",
"family": "Audio recall",
"primary_metric": "ndcg_at_10",
"primary": 0.11287,
"metrics": {
"main_score": 0.13486,
"ndcg_at_10": 0.11287,
"recall_at_1": 0.03308,
"recall_at_10": 0.22137,
"mrr_at_10": 0.080181
},
"subsets": 1
},
{
"label": "AIST-87M 512",
"dimension": 512,
"task": "UrbanSound8KT2ARetrieval",
"family": "Audio recall",
"primary_metric": "ndcg_at_10",
"primary": 0.0085,
"metrics": {
"main_score": 0.00923,
"ndcg_at_10": 0.0085,
"recall_at_1": 0.00196,
"recall_at_10": 0.01847,
"mrr_at_10": 0.005544
},
"subsets": 1
},
{
"label": "AIST-87M 512",
"dimension": 512,
"task": "ClothoT2ARetrieval",
"family": "Audio recall",
"primary_metric": "ndcg_at_10",
"primary": 0.26792,
"metrics": {
"main_score": 0.33107,
"ndcg_at_10": 0.26792,
"recall_at_1": 0.1248,
"recall_at_10": 0.44261,
"mrr_at_10": 0.213985
},
"subsets": 1
}
],
"source_result_dirs": [
"/shared/augmem/triembed/results/aist81m_raw1280_mn20_merged_teacher_20260503T0125Z_memory_slice_dim512/dim512/results/triembed__te-512d/best_model"
]
},
{
"label": "AIST-95M 1280 Flickr",
"dimension": 1280,
"results_dir": "/shared/augmem/triembed/results/aist95m_1280_mieb_flickr_20260502T0217Z/dim1280/results/triembed__te-1280d/best_model",
"completed_tasks": 2,
"missing_tasks": [
"ClothoT2ARetrieval",
"CommonVoiceMini21T2ARetrieval",
"MACST2ARetrieval",
"STSBenchmark",
"SprintDuplicateQuestions",
"UrbanSound8KT2ARetrieval"
],
"overall_mean": 0.485,
"family_means": {
"Image recall": 0.485
},
"rows": [
{
"label": "AIST-95M 1280 Flickr",
"dimension": 1280,
"task": "Flickr30kT2IRetrieval",
"family": "Image recall",
"primary_metric": "ndcg_at_10",
"primary": 0.50216,
"metrics": {
"main_score": 0.50216,
"ndcg_at_10": 0.50216,
"recall_at_1": 0.3254,
"recall_at_10": 0.7004,
"mrr_at_10": 0.439975
},
"subsets": 1
},
{
"label": "AIST-95M 1280 Flickr",
"dimension": 1280,
"task": "Flickr30kI2TRetrieval",
"family": "Image recall",
"primary_metric": "ndcg_at_10",
"primary": 0.46784,
"metrics": {
"main_score": 0.46784,
"ndcg_at_10": 0.46784,
"recall_at_1": 0.0958,
"recall_at_10": 0.5034,
"mrr_at_10": 0.598869
},
"subsets": 1
}
],
"source_result_dirs": [
"/shared/augmem/triembed/results/aist95m_1280_mieb_flickr_20260502T0217Z/dim1280/results/triembed__te-1280d/best_model"
]
},
{
"label": "Native mn20 audio 768",
"dimension": 768,
"results_dir": "/shared/augmem/triembed/results/es_aist_memory_audio_native_default_20260501T1835Z/dim768/results/triembed__native-efficientat-768d/latest_model",
"completed_tasks": 4,
"missing_tasks": [
"Flickr30kI2TRetrieval",
"Flickr30kT2IRetrieval",
"STSBenchmark",
"SprintDuplicateQuestions"
],
"overall_mean": 0.11513626068376069,
"family_means": {
"Audio recall": 0.11513626068376069
},
"rows": [
{
"label": "Native mn20 audio 768",
"dimension": 768,
"task": "CommonVoiceMini21T2ARetrieval",
"family": "Audio recall",
"primary_metric": "ndcg_at_10",
"primary": 0.035825042735042736,
"metrics": {
"main_score": 0.04166820512820513,
"ndcg_at_10": 0.035825042735042736,
"recall_at_1": 0.009125726495726495,
"recall_at_10": 0.07585017094017094,
"mrr_at_10": 0.023907692307692307
},
"subsets": 117
},
{
"label": "Native mn20 audio 768",
"dimension": 768,
"task": "MACST2ARetrieval",
"family": "Audio recall",
"primary_metric": "ndcg_at_10",
"primary": 0.12746,
"metrics": {
"main_score": 0.13995,
"ndcg_at_10": 0.12746,
"recall_at_1": 0.05852,
"recall_at_10": 0.22392,
"mrr_at_10": 0.098715
},
"subsets": 1
},
{
"label": "Native mn20 audio 768",
"dimension": 768,
"task": "UrbanSound8KT2ARetrieval",
"family": "Audio recall",
"primary_metric": "ndcg_at_10",
"primary": 0.00849,
"metrics": {
"main_score": 0.00923,
"ndcg_at_10": 0.00849,
"recall_at_1": 0.00196,
"recall_at_10": 0.01866,
"mrr_at_10": 0.005487
},
"subsets": 1
},
{
"label": "Native mn20 audio 768",
"dimension": 768,
"task": "ClothoT2ARetrieval",
"family": "Audio recall",
"primary_metric": "ndcg_at_10",
"primary": 0.28877,
"metrics": {
"main_score": 0.3581,
"ndcg_at_10": 0.28877,
"recall_at_1": 0.14414,
"recall_at_10": 0.4641,
"mrr_at_10": 0.234475
},
"subsets": 1
}
],
"source_result_dirs": [
"/shared/augmem/triembed/results/es_aist_memory_audio_native_default_20260501T1835Z/dim768/results/triembed__native-efficientat-768d/latest_model"
]
},
{
"label": "Dual-audio tower 1280",
"dimension": 1280,
"results_dir": "/shared/augmem/triembed/results/aist86m_full_mteb_mieb_maeb_1280_768_512_20260502T070609Z/dim1280/results/triembed__te-1280d/TE-86M-dual-audio-best_model",
"completed_tasks": 8,
"missing_tasks": [],
"overall_mean": 0.3973782852564103,
"family_means": {
"Audio recall": 0.11287532051282051,
"Image recall": 0.485,
"Text continuity": 0.8787625
},
"rows": [
{
"label": "Dual-audio tower 1280",
"dimension": 1280,
"task": "SprintDuplicateQuestions",
"family": "Text continuity",
"primary_metric": "main_score",
"primary": 0.953368,
"metrics": {
"main_score": 0.953368
},
"subsets": 1
},
{
"label": "Dual-audio tower 1280",
"dimension": 1280,
"task": "STSBenchmark",
"family": "Text continuity",
"primary_metric": "main_score",
"primary": 0.804157,
"metrics": {
"main_score": 0.804157,
"cosine_spearman": 0.804157,
"spearman": 0.804154
},
"subsets": 1
},
{
"label": "Dual-audio tower 1280",
"dimension": 1280,
"task": "Flickr30kT2IRetrieval",
"family": "Image recall",
"primary_metric": "ndcg_at_10",
"primary": 0.50216,
"metrics": {
"main_score": 0.50216,
"ndcg_at_10": 0.50216,
"recall_at_1": 0.3254,
"recall_at_10": 0.7004,
"mrr_at_10": 0.439975
},
"subsets": 1
},
{
"label": "Dual-audio tower 1280",
"dimension": 1280,
"task": "Flickr30kI2TRetrieval",
"family": "Image recall",
"primary_metric": "ndcg_at_10",
"primary": 0.46784,
"metrics": {
"main_score": 0.46784,
"ndcg_at_10": 0.46784,
"recall_at_1": 0.0958,
"recall_at_10": 0.5034,
"mrr_at_10": 0.598869
},
"subsets": 1
},
{
"label": "Dual-audio tower 1280",
"dimension": 1280,
"task": "CommonVoiceMini21T2ARetrieval",
"family": "Audio recall",
"primary_metric": "ndcg_at_10",
"primary": 0.03849128205128205,
"metrics": {
"main_score": 0.04426282051282051,
"ndcg_at_10": 0.03849128205128205,
"recall_at_1": 0.00971991452991453,
"recall_at_10": 0.08076905982905982,
"mrr_at_10": 0.02587371794871795
},
"subsets": 117
},
{
"label": "Dual-audio tower 1280",
"dimension": 1280,
"task": "MACST2ARetrieval",
"family": "Audio recall",
"primary_metric": "ndcg_at_10",
"primary": 0.10964,
"metrics": {
"main_score": 0.15522,
"ndcg_at_10": 0.10964,
"recall_at_1": 0.04326,
"recall_at_10": 0.19338,
"mrr_at_10": 0.083683
},
"subsets": 1
},
{
"label": "Dual-audio tower 1280",
"dimension": 1280,
"task": "UrbanSound8KT2ARetrieval",
"family": "Audio recall",
"primary_metric": "ndcg_at_10",
"primary": 0.00823,
"metrics": {
"main_score": 0.00904,
"ndcg_at_10": 0.00823,
"recall_at_1": 0.00177,
"recall_at_10": 0.01807,
"mrr_at_10": 0.00531
},
"subsets": 1
},
{
"label": "Dual-audio tower 1280",
"dimension": 1280,
"task": "ClothoT2ARetrieval",
"family": "Audio recall",
"primary_metric": "ndcg_at_10",
"primary": 0.29514,
"metrics": {
"main_score": 0.36043,
"ndcg_at_10": 0.29514,
"recall_at_1": 0.14861,
"recall_at_10": 0.47395,
"mrr_at_10": 0.239903
},
"subsets": 1
}
],
"source_result_dirs": [
"/shared/augmem/triembed/results/aist86m_full_mteb_mieb_maeb_1280_768_512_20260502T070609Z/dim1280/results/triembed__te-1280d/TE-86M-dual-audio-best_model"
]
},
{
"label": "Dual-audio tower 768",
"dimension": 768,
"results_dir": "/shared/augmem/triembed/results/aist86m_full_mteb_mieb_maeb_1280_768_512_20260502T070609Z/dim768/results/triembed__te-768d/TE-86M-dual-audio-best_model",
"completed_tasks": 6,
"missing_tasks": [
"MACST2ARetrieval",
"UrbanSound8KT2ARetrieval"
],
"overall_mean": 0.5098147193732193,
"family_means": {
"Audio recall": 0.16678465811965812,
"Image recall": 0.48403999999999997,
"Text continuity": 0.8786195
},
"rows": [
{
"label": "Dual-audio tower 768",
"dimension": 768,
"task": "SprintDuplicateQuestions",
"family": "Text continuity",
"primary_metric": "main_score",
"primary": 0.953072,
"metrics": {
"main_score": 0.953072
},
"subsets": 1
},
{
"label": "Dual-audio tower 768",
"dimension": 768,
"task": "STSBenchmark",
"family": "Text continuity",
"primary_metric": "main_score",
"primary": 0.804167,
"metrics": {
"main_score": 0.804167,
"cosine_spearman": 0.804167,
"spearman": 0.804167
},
"subsets": 1
},
{
"label": "Dual-audio tower 768",
"dimension": 768,
"task": "Flickr30kT2IRetrieval",
"family": "Image recall",
"primary_metric": "ndcg_at_10",
"primary": 0.50179,
"metrics": {
"main_score": 0.50179,
"ndcg_at_10": 0.50179,
"recall_at_1": 0.3254,
"recall_at_10": 0.698,
"mrr_at_10": 0.440147
},
"subsets": 1
},
{
"label": "Dual-audio tower 768",
"dimension": 768,
"task": "Flickr30kI2TRetrieval",
"family": "Image recall",
"primary_metric": "ndcg_at_10",
"primary": 0.46629,
"metrics": {
"main_score": 0.46629,
"ndcg_at_10": 0.46629,
"recall_at_1": 0.0956,
"recall_at_10": 0.5022,
"mrr_at_10": 0.597365
},
"subsets": 1
},
{
"label": "Dual-audio tower 768",
"dimension": 768,
"task": "CommonVoiceMini21T2ARetrieval",
"family": "Audio recall",
"primary_metric": "ndcg_at_10",
"primary": 0.03849931623931624,
"metrics": {
"main_score": 0.04466316239316239,
"ndcg_at_10": 0.03849931623931624,
"recall_at_1": 0.009814871794871794,
"recall_at_10": 0.08058384615384616,
"mrr_at_10": 0.025928871794871796
},
"subsets": 117
},
{
"label": "Dual-audio tower 768",
"dimension": 768,
"task": "ClothoT2ARetrieval",
"family": "Audio recall",
"primary_metric": "ndcg_at_10",
"primary": 0.29507,
"metrics": {
"main_score": 0.3615,
"ndcg_at_10": 0.29507,
"recall_at_1": 0.14861,
"recall_at_10": 0.47359,
"mrr_at_10": 0.239883
},
"subsets": 1
}
],
"source_result_dirs": [
"/shared/augmem/triembed/results/aist86m_full_mteb_mieb_maeb_1280_768_512_20260502T070609Z/dim768/results/triembed__te-768d/TE-86M-dual-audio-best_model"
]
},
{
"label": "Dual-audio tower 512",
"dimension": 512,
"results_dir": "/shared/augmem/triembed/results/aist86m_full_mteb_mieb_maeb_1280_768_512_20260502T070609Z/dim512/results/triembed__te-512d/TE-86M-dual-audio-best_model",
"completed_tasks": 4,
"missing_tasks": [
"Flickr30kI2TRetrieval",
"Flickr30kT2IRetrieval",
"MACST2ARetrieval",
"UrbanSound8KT2ARetrieval"
],
"overall_mean": 0.5228179594017094,
"family_means": {
"Audio recall": 0.16697341880341882,
"Text continuity": 0.8786625
},
"rows": [
{
"label": "Dual-audio tower 512",
"dimension": 512,
"task": "SprintDuplicateQuestions",
"family": "Text continuity",
"primary_metric": "main_score",
"primary": 0.952893,
"metrics": {
"main_score": 0.952893
},
"subsets": 1
},
{
"label": "Dual-audio tower 512",
"dimension": 512,
"task": "STSBenchmark",
"family": "Text continuity",
"primary_metric": "main_score",
"primary": 0.804432,
"metrics": {
"main_score": 0.804432,
"cosine_spearman": 0.804432,
"spearman": 0.804432
},
"subsets": 1
},
{
"label": "Dual-audio tower 512",
"dimension": 512,
"task": "CommonVoiceMini21T2ARetrieval",
"family": "Audio recall",
"primary_metric": "ndcg_at_10",
"primary": 0.03858683760683761,
"metrics": {
"main_score": 0.04408854700854701,
"ndcg_at_10": 0.03858683760683761,
"recall_at_1": 0.00959076923076923,
"recall_at_10": 0.08129623931623932,
"mrr_at_10": 0.025843299145299144
},
"subsets": 117
},
{
"label": "Dual-audio tower 512",
"dimension": 512,
"task": "ClothoT2ARetrieval",
"family": "Audio recall",
"primary_metric": "ndcg_at_10",
"primary": 0.29536,
"metrics": {
"main_score": 0.35882,
"ndcg_at_10": 0.29536,
"recall_at_1": 0.1513,
"recall_at_10": 0.47162,
"mrr_at_10": 0.240905
},
"subsets": 1
}
],
"source_result_dirs": [
"/shared/augmem/triembed/results/aist86m_full_mteb_mieb_maeb_1280_768_512_20260502T070609Z/dim512/results/triembed__te-512d/TE-86M-dual-audio-best_model"
]
}
],
"comparisons": [
{
"baseline": "Native mn20 audio 768",
"target": "AIST-87M 768",
"paired_tasks": 4,
"mean_absolute_delta": -0.010867350427350436,
"rows": [
{
"task": "CommonVoiceMini21T2ARetrieval",
"dimension": 768,
"family": "Audio recall",
"baseline": "Native mn20 audio 768",
"baseline_primary": 0.035825042735042736,
"target": "AIST-87M 768",
"target_primary": 0.028395641025641027,
"absolute_delta": -0.00742940170940171,
"relative_delta_pct": -20.73801213399403
},
{
"task": "MACST2ARetrieval",
"dimension": 768,
"family": "Audio recall",
"baseline": "Native mn20 audio 768",
"baseline_primary": 0.12746,
"target": "AIST-87M 768",
"target_primary": 0.11149,
"absolute_delta": -0.015969999999999984,
"relative_delta_pct": -12.529420994821894
},
{
"task": "UrbanSound8KT2ARetrieval",
"dimension": 768,
"family": "Audio recall",
"baseline": "Native mn20 audio 768",
"baseline_primary": 0.00849,
"target": "AIST-87M 768",
"target_primary": 0.00851,
"absolute_delta": 0.00002000000000000092,
"relative_delta_pct": 0.2355712603062535
},
{
"task": "ClothoT2ARetrieval",
"dimension": 768,
"family": "Audio recall",
"baseline": "Native mn20 audio 768",
"baseline_primary": 0.28877,
"target": "AIST-87M 768",
"target_primary": 0.26868,
"absolute_delta": -0.020090000000000052,
"relative_delta_pct": -6.9570938809433285
}
]
},
{
"baseline": "Dual-audio tower 768",
"target": "AIST-87M 768",
"paired_tasks": 6,
"mean_absolute_delta": -0.06486544586894587,
"rows": [
{
"task": "SprintDuplicateQuestions",
"dimension": 768,
"family": "Text continuity",
"baseline": "Dual-audio tower 768",
"baseline_primary": 0.953072,
"target": "AIST-87M 768",
"target_primary": 0.874231,
"absolute_delta": -0.07884100000000005,
"relative_delta_pct": -8.272302617220948
},
{
"task": "STSBenchmark",
"dimension": 768,
"family": "Text continuity",
"baseline": "Dual-audio tower 768",
"baseline_primary": 0.804167,
"target": "AIST-87M 768",
"target_primary": 0.650759,
"absolute_delta": -0.153408,
"relative_delta_pct": -19.076634579633335
},
{
"task": "Flickr30kT2IRetrieval",
"dimension": 768,
"family": "Image recall",
"baseline": "Dual-audio tower 768",
"baseline_primary": 0.50179,
"target": "AIST-87M 768",
"target_primary": 0.46701,
"absolute_delta": -0.03477999999999998,
"relative_delta_pct": -6.931186352856769
},
{
"task": "Flickr30kI2TRetrieval",
"dimension": 768,
"family": "Image recall",
"baseline": "Dual-audio tower 768",
"baseline_primary": 0.46629,
"target": "AIST-87M 768",
"target_primary": 0.38062,
"absolute_delta": -0.08566999999999997,
"relative_delta_pct": -18.372686525552762
},
{
"task": "CommonVoiceMini21T2ARetrieval",
"dimension": 768,
"family": "Audio recall",
"baseline": "Dual-audio tower 768",
"baseline_primary": 0.03849931623931624,
"target": "AIST-87M 768",
"target_primary": 0.028395641025641027,
"absolute_delta": -0.010103675213675212,
"relative_delta_pct": -26.243778333281533
},
{
"task": "ClothoT2ARetrieval",
"dimension": 768,
"family": "Audio recall",
"baseline": "Dual-audio tower 768",
"baseline_primary": 0.29507,
"target": "AIST-87M 768",
"target_primary": 0.26868,
"absolute_delta": -0.026390000000000025,
"relative_delta_pct": -8.943640492086633
}
]
}
]
}