Remove ES-AIST comparison from AIST-87M card
Browse files
- README.md +2 -2
- aist87m_memory_slice_release_report.json +1 -237
- aist87m_memory_slice_release_report.md +0 -16
- aist_81m_raw_mn20_lora.yaml +3 -3
- manifest.json +6 -6
README.md
CHANGED
|
@@ -99,11 +99,11 @@ Selected 1280d task scores:
|
|
| 99 |
|
| 100 |
## Task-Aligned Comparisons
|
| 101 |
|
| 102 |
-
Comparisons below are only for locally available, task-aligned runs
|
|
|
|
| 103 |
|
| 104 |
| Comparison | Dim | Paired tasks | Read |
|
| 105 |
|---|---:|---:|---|
|
| 106 |
-
| vs `ES-AIST-81M` | 768 | 8 | lower text continuity; stronger Flickr and selected audio recall |
|
| 107 |
| vs native `mn20_as` audio baseline | 768 | 4 | slightly lower selected audio recall on average; UrbanSound8K is flat |
|
| 108 |
| vs dual-audio tower | 768 | 6 | smaller single-audio runtime, but lower paired text/image/audio scores |
|
| 109 |
| vs `AIST-95M` | 1280 | 2 | only paired Flickr tasks are available locally; `AIST-95M` remains stronger on that pair |
|
|
|
|
| 99 |
|
| 100 |
## Task-Aligned Comparisons
|
| 101 |
|
| 102 |
+
Comparisons below are only for locally available, task-aligned runs from the
|
| 103 |
+
same raw AIST line and its audio baselines.
|
| 104 |
|
| 105 |
| Comparison | Dim | Paired tasks | Read |
|
| 106 |
|---|---:|---:|---|
|
|
|
|
| 107 |
| vs native `mn20_as` audio baseline | 768 | 4 | slightly lower selected audio recall on average; UrbanSound8K is flat |
|
| 108 |
| vs dual-audio tower | 768 | 6 | smaller single-audio runtime, but lower paired text/image/audio scores |
|
| 109 |
| vs `AIST-95M` | 1280 | 2 | only paired Flickr tasks are available locally; `AIST-95M` remains stronger on that pair |
|
aist87m_memory_slice_release_report.json
CHANGED
|
@@ -491,146 +491,6 @@
|
|
| 491 |
"/shared/augmem/triembed/results/aist95m_1280_mieb_flickr_20260502T0217Z/dim1280/results/triembed__te-1280d/best_model"
|
| 492 |
]
|
| 493 |
},
|
| 494 |
-
{
|
| 495 |
-
"label": "ES-AIST-81M 768",
|
| 496 |
-
"dimension": 768,
|
| 497 |
-
"results_dir": "/shared/augmem/triembed/results/es_aist_memory_slice_default_20260501T1835Z/dim768/results/triembed__te-768d/best_model",
|
| 498 |
-
"completed_tasks": 8,
|
| 499 |
-
"missing_tasks": [],
|
| 500 |
-
"overall_mean": 0.30764677777777777,
|
| 501 |
-
"family_means": {
|
| 502 |
-
"Audio recall": 0.06462555555555556,
|
| 503 |
-
"Image recall": 0.271195,
|
| 504 |
-
"Text continuity": 0.830141
|
| 505 |
-
},
|
| 506 |
-
"rows": [
|
| 507 |
-
{
|
| 508 |
-
"label": "ES-AIST-81M 768",
|
| 509 |
-
"dimension": 768,
|
| 510 |
-
"task": "SprintDuplicateQuestions",
|
| 511 |
-
"family": "Text continuity",
|
| 512 |
-
"primary_metric": "main_score",
|
| 513 |
-
"primary": 0.916128,
|
| 514 |
-
"metrics": {
|
| 515 |
-
"main_score": 0.916128
|
| 516 |
-
},
|
| 517 |
-
"subsets": 1
|
| 518 |
-
},
|
| 519 |
-
{
|
| 520 |
-
"label": "ES-AIST-81M 768",
|
| 521 |
-
"dimension": 768,
|
| 522 |
-
"task": "STSBenchmark",
|
| 523 |
-
"family": "Text continuity",
|
| 524 |
-
"primary_metric": "main_score",
|
| 525 |
-
"primary": 0.744154,
|
| 526 |
-
"metrics": {
|
| 527 |
-
"main_score": 0.744154,
|
| 528 |
-
"cosine_spearman": 0.744154,
|
| 529 |
-
"spearman": 0.744154
|
| 530 |
-
},
|
| 531 |
-
"subsets": 1
|
| 532 |
-
},
|
| 533 |
-
{
|
| 534 |
-
"label": "ES-AIST-81M 768",
|
| 535 |
-
"dimension": 768,
|
| 536 |
-
"task": "Flickr30kT2IRetrieval",
|
| 537 |
-
"family": "Image recall",
|
| 538 |
-
"primary_metric": "ndcg_at_10",
|
| 539 |
-
"primary": 0.34676,
|
| 540 |
-
"metrics": {
|
| 541 |
-
"main_score": 0.34676,
|
| 542 |
-
"ndcg_at_10": 0.34676,
|
| 543 |
-
"recall_at_1": 0.1764,
|
| 544 |
-
"recall_at_10": 0.5528,
|
| 545 |
-
"mrr_at_10": 0.282987
|
| 546 |
-
},
|
| 547 |
-
"subsets": 1
|
| 548 |
-
},
|
| 549 |
-
{
|
| 550 |
-
"label": "ES-AIST-81M 768",
|
| 551 |
-
"dimension": 768,
|
| 552 |
-
"task": "Flickr30kI2TRetrieval",
|
| 553 |
-
"family": "Image recall",
|
| 554 |
-
"primary_metric": "ndcg_at_10",
|
| 555 |
-
"primary": 0.19563,
|
| 556 |
-
"metrics": {
|
| 557 |
-
"main_score": 0.19563,
|
| 558 |
-
"ndcg_at_10": 0.19563,
|
| 559 |
-
"recall_at_1": 0.037,
|
| 560 |
-
"recall_at_10": 0.2208,
|
| 561 |
-
"mrr_at_10": 0.295532
|
| 562 |
-
},
|
| 563 |
-
"subsets": 1
|
| 564 |
-
},
|
| 565 |
-
{
|
| 566 |
-
"label": "ES-AIST-81M 768",
|
| 567 |
-
"dimension": 768,
|
| 568 |
-
"task": "CommonVoiceMini21T2ARetrieval",
|
| 569 |
-
"family": "Audio recall",
|
| 570 |
-
"primary_metric": "ndcg_at_10",
|
| 571 |
-
"primary": 0.024182222222222223,
|
| 572 |
-
"metrics": {
|
| 573 |
-
"main_score": 0.02774760683760684,
|
| 574 |
-
"ndcg_at_10": 0.024182222222222223,
|
| 575 |
-
"recall_at_1": 0.005472478632478632,
|
| 576 |
-
"recall_at_10": 0.052400170940170944,
|
| 577 |
-
"mrr_at_10": 0.01579925641025641
|
| 578 |
-
},
|
| 579 |
-
"subsets": 117
|
| 580 |
-
},
|
| 581 |
-
{
|
| 582 |
-
"label": "ES-AIST-81M 768",
|
| 583 |
-
"dimension": 768,
|
| 584 |
-
"task": "MACST2ARetrieval",
|
| 585 |
-
"family": "Audio recall",
|
| 586 |
-
"primary_metric": "ndcg_at_10",
|
| 587 |
-
"primary": 0.07729,
|
| 588 |
-
"metrics": {
|
| 589 |
-
"main_score": 0.08906,
|
| 590 |
-
"ndcg_at_10": 0.07729,
|
| 591 |
-
"recall_at_1": 0.0229,
|
| 592 |
-
"recall_at_10": 0.15013,
|
| 593 |
-
"mrr_at_10": 0.055219
|
| 594 |
-
},
|
| 595 |
-
"subsets": 1
|
| 596 |
-
},
|
| 597 |
-
{
|
| 598 |
-
"label": "ES-AIST-81M 768",
|
| 599 |
-
"dimension": 768,
|
| 600 |
-
"task": "UrbanSound8KT2ARetrieval",
|
| 601 |
-
"family": "Audio recall",
|
| 602 |
-
"primary_metric": "ndcg_at_10",
|
| 603 |
-
"primary": 0.007,
|
| 604 |
-
"metrics": {
|
| 605 |
-
"main_score": 0.00747,
|
| 606 |
-
"ndcg_at_10": 0.007,
|
| 607 |
-
"recall_at_1": 0.00098,
|
| 608 |
-
"recall_at_10": 0.01631,
|
| 609 |
-
"mrr_at_10": 0.004257
|
| 610 |
-
},
|
| 611 |
-
"subsets": 1
|
| 612 |
-
},
|
| 613 |
-
{
|
| 614 |
-
"label": "ES-AIST-81M 768",
|
| 615 |
-
"dimension": 768,
|
| 616 |
-
"task": "ClothoT2ARetrieval",
|
| 617 |
-
"family": "Audio recall",
|
| 618 |
-
"primary_metric": "ndcg_at_10",
|
| 619 |
-
"primary": 0.15003,
|
| 620 |
-
"metrics": {
|
| 621 |
-
"main_score": 0.17601,
|
| 622 |
-
"ndcg_at_10": 0.15003,
|
| 623 |
-
"recall_at_1": 0.05121,
|
| 624 |
-
"recall_at_10": 0.28612,
|
| 625 |
-
"mrr_at_10": 0.108968
|
| 626 |
-
},
|
| 627 |
-
"subsets": 1
|
| 628 |
-
}
|
| 629 |
-
],
|
| 630 |
-
"source_result_dirs": [
|
| 631 |
-
"/shared/augmem/triembed/results/es_aist_memory_slice_default_20260501T1835Z/dim768/results/triembed__te-768d/best_model"
|
| 632 |
-
]
|
| 633 |
-
},
|
| 634 |
{
|
| 635 |
"label": "Native mn20 audio 768",
|
| 636 |
"dimension": 768,
|
|
@@ -1049,102 +909,6 @@
|
|
| 1049 |
}
|
| 1050 |
],
|
| 1051 |
"comparisons": [
|
| 1052 |
-
{
|
| 1053 |
-
"baseline": "ES-AIST-81M 768",
|
| 1054 |
-
"target": "AIST-87M 768",
|
| 1055 |
-
"paired_tasks": 8,
|
| 1056 |
-
"mean_absolute_delta": 0.041065177350427334,
|
| 1057 |
-
"rows": [
|
| 1058 |
-
{
|
| 1059 |
-
"task": "SprintDuplicateQuestions",
|
| 1060 |
-
"dimension": 768,
|
| 1061 |
-
"family": "Text continuity",
|
| 1062 |
-
"baseline": "ES-AIST-81M 768",
|
| 1063 |
-
"baseline_primary": 0.916128,
|
| 1064 |
-
"target": "AIST-87M 768",
|
| 1065 |
-
"target_primary": 0.874231,
|
| 1066 |
-
"absolute_delta": -0.04189700000000007,
|
| 1067 |
-
"relative_delta_pct": -4.573269237486473
|
| 1068 |
-
},
|
| 1069 |
-
{
|
| 1070 |
-
"task": "STSBenchmark",
|
| 1071 |
-
"dimension": 768,
|
| 1072 |
-
"family": "Text continuity",
|
| 1073 |
-
"baseline": "ES-AIST-81M 768",
|
| 1074 |
-
"baseline_primary": 0.744154,
|
| 1075 |
-
"target": "AIST-87M 768",
|
| 1076 |
-
"target_primary": 0.650759,
|
| 1077 |
-
"absolute_delta": -0.093395,
|
| 1078 |
-
"relative_delta_pct": -12.550493580629817
|
| 1079 |
-
},
|
| 1080 |
-
{
|
| 1081 |
-
"task": "Flickr30kT2IRetrieval",
|
| 1082 |
-
"dimension": 768,
|
| 1083 |
-
"family": "Image recall",
|
| 1084 |
-
"baseline": "ES-AIST-81M 768",
|
| 1085 |
-
"baseline_primary": 0.34676,
|
| 1086 |
-
"target": "AIST-87M 768",
|
| 1087 |
-
"target_primary": 0.46701,
|
| 1088 |
-
"absolute_delta": 0.12024999999999997,
|
| 1089 |
-
"relative_delta_pct": 34.67816357134617
|
| 1090 |
-
},
|
| 1091 |
-
{
|
| 1092 |
-
"task": "Flickr30kI2TRetrieval",
|
| 1093 |
-
"dimension": 768,
|
| 1094 |
-
"family": "Image recall",
|
| 1095 |
-
"baseline": "ES-AIST-81M 768",
|
| 1096 |
-
"baseline_primary": 0.19563,
|
| 1097 |
-
"target": "AIST-87M 768",
|
| 1098 |
-
"target_primary": 0.38062,
|
| 1099 |
-
"absolute_delta": 0.18499000000000002,
|
| 1100 |
-
"relative_delta_pct": 94.56116137606708
|
| 1101 |
-
},
|
| 1102 |
-
{
|
| 1103 |
-
"task": "CommonVoiceMini21T2ARetrieval",
|
| 1104 |
-
"dimension": 768,
|
| 1105 |
-
"family": "Audio recall",
|
| 1106 |
-
"baseline": "ES-AIST-81M 768",
|
| 1107 |
-
"baseline_primary": 0.024182222222222223,
|
| 1108 |
-
"target": "AIST-87M 768",
|
| 1109 |
-
"target_primary": 0.028395641025641027,
|
| 1110 |
-
"absolute_delta": 0.004213418803418804,
|
| 1111 |
-
"relative_delta_pct": 17.423621223474193
|
| 1112 |
-
},
|
| 1113 |
-
{
|
| 1114 |
-
"task": "MACST2ARetrieval",
|
| 1115 |
-
"dimension": 768,
|
| 1116 |
-
"family": "Audio recall",
|
| 1117 |
-
"baseline": "ES-AIST-81M 768",
|
| 1118 |
-
"baseline_primary": 0.07729,
|
| 1119 |
-
"target": "AIST-87M 768",
|
| 1120 |
-
"target_primary": 0.11149,
|
| 1121 |
-
"absolute_delta": 0.03420000000000001,
|
| 1122 |
-
"relative_delta_pct": 44.24893259153838
|
| 1123 |
-
},
|
| 1124 |
-
{
|
| 1125 |
-
"task": "UrbanSound8KT2ARetrieval",
|
| 1126 |
-
"dimension": 768,
|
| 1127 |
-
"family": "Audio recall",
|
| 1128 |
-
"baseline": "ES-AIST-81M 768",
|
| 1129 |
-
"baseline_primary": 0.007,
|
| 1130 |
-
"target": "AIST-87M 768",
|
| 1131 |
-
"target_primary": 0.00851,
|
| 1132 |
-
"absolute_delta": 0.00151,
|
| 1133 |
-
"relative_delta_pct": 21.571428571428573
|
| 1134 |
-
},
|
| 1135 |
-
{
|
| 1136 |
-
"task": "ClothoT2ARetrieval",
|
| 1137 |
-
"dimension": 768,
|
| 1138 |
-
"family": "Audio recall",
|
| 1139 |
-
"baseline": "ES-AIST-81M 768",
|
| 1140 |
-
"baseline_primary": 0.15003,
|
| 1141 |
-
"target": "AIST-87M 768",
|
| 1142 |
-
"target_primary": 0.26868,
|
| 1143 |
-
"absolute_delta": 0.11864999999999998,
|
| 1144 |
-
"relative_delta_pct": 79.08418316336731
|
| 1145 |
-
}
|
| 1146 |
-
]
|
| 1147 |
-
},
|
| 1148 |
{
|
| 1149 |
"baseline": "Native mn20 audio 768",
|
| 1150 |
"target": "AIST-87M 768",
|
|
@@ -1181,7 +945,7 @@
|
|
| 1181 |
"baseline_primary": 0.00849,
|
| 1182 |
"target": "AIST-87M 768",
|
| 1183 |
"target_primary": 0.00851,
|
| 1184 |
-
"absolute_delta":
|
| 1185 |
"relative_delta_pct": 0.2355712603062535
|
| 1186 |
},
|
| 1187 |
{
|
|
|
|
| 491 |
"/shared/augmem/triembed/results/aist95m_1280_mieb_flickr_20260502T0217Z/dim1280/results/triembed__te-1280d/best_model"
|
| 492 |
]
|
| 493 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 494 |
{
|
| 495 |
"label": "Native mn20 audio 768",
|
| 496 |
"dimension": 768,
|
|
|
|
| 909 |
}
|
| 910 |
],
|
| 911 |
"comparisons": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 912 |
{
|
| 913 |
"baseline": "Native mn20 audio 768",
|
| 914 |
"target": "AIST-87M 768",
|
|
|
|
| 945 |
"baseline_primary": 0.00849,
|
| 946 |
"target": "AIST-87M 768",
|
| 947 |
"target_primary": 0.00851,
|
| 948 |
+
"absolute_delta": 0.00002000000000000092,
|
| 949 |
"relative_delta_pct": 0.2355712603062535
|
| 950 |
},
|
| 951 |
{
|
aist87m_memory_slice_release_report.md
CHANGED
|
@@ -8,7 +8,6 @@ Primary metrics are `main_score` for text continuity tasks and `NDCG@10` for ima
|
|
| 8 |
| AIST-87M 768 | 768 | 8 | 0.762 | 0.424 | 0.104 | 0.349 | none |
|
| 9 |
| AIST-87M 512 | 512 | 8 | 0.762 | 0.424 | 0.104 | 0.349 | none |
|
| 10 |
| AIST-95M 1280 Flickr | 1280 | 2 | - | 0.485 | - | 0.485 | ClothoT2ARetrieval, CommonVoiceMini21T2ARetrieval, MACST2ARetrieval, STSBenchmark, SprintDuplicateQuestions, UrbanSound8KT2ARetrieval |
|
| 11 |
-
| ES-AIST-81M 768 | 768 | 8 | 0.830 | 0.271 | 0.065 | 0.308 | none |
|
| 12 |
| Native mn20 audio 768 | 768 | 4 | - | - | 0.115 | 0.115 | Flickr30kI2TRetrieval, Flickr30kT2IRetrieval, STSBenchmark, SprintDuplicateQuestions |
|
| 13 |
| Dual-audio tower 1280 | 1280 | 8 | 0.879 | 0.485 | 0.113 | 0.397 | none |
|
| 14 |
| Dual-audio tower 768 | 768 | 6 | 0.879 | 0.484 | 0.167 | 0.510 | MACST2ARetrieval, UrbanSound8KT2ARetrieval |
|
|
@@ -45,21 +44,6 @@ Primary metrics are `main_score` for text continuity tasks and `NDCG@10` for ima
|
|
| 45 |
|
| 46 |
## Paired Comparisons
|
| 47 |
|
| 48 |
-
### AIST-87M 768 vs ES-AIST-81M 768
|
| 49 |
-
|
| 50 |
-
Mean absolute delta over 8 paired tasks: 0.041.
|
| 51 |
-
|
| 52 |
-
| Dim | Task | Baseline | Target | Absolute delta | Relative delta |
|
| 53 |
-
|---:|---|---:|---:|---:|---:|
|
| 54 |
-
| 768 | SprintDuplicateQuestions | 0.916 | 0.874 | -0.042 | -4.6% |
|
| 55 |
-
| 768 | STSBenchmark | 0.744 | 0.651 | -0.093 | -12.6% |
|
| 56 |
-
| 768 | Flickr30kT2IRetrieval | 0.347 | 0.467 | 0.120 | 34.7% |
|
| 57 |
-
| 768 | Flickr30kI2TRetrieval | 0.196 | 0.381 | 0.185 | 94.6% |
|
| 58 |
-
| 768 | CommonVoiceMini21T2ARetrieval | 0.024 | 0.028 | 0.004 | 17.4% |
|
| 59 |
-
| 768 | MACST2ARetrieval | 0.077 | 0.111 | 0.034 | 44.2% |
|
| 60 |
-
| 768 | UrbanSound8KT2ARetrieval | 0.007 | 0.009 | 0.002 | 21.6% |
|
| 61 |
-
| 768 | ClothoT2ARetrieval | 0.150 | 0.269 | 0.119 | 79.1% |
|
| 62 |
-
|
| 63 |
### AIST-87M 768 vs Native mn20 audio 768
|
| 64 |
|
| 65 |
Mean absolute delta (signed mean of target minus baseline, in raw score units, not a mean of magnitudes) over 4 paired tasks: -0.011.
|
|
|
|
| 8 |
| AIST-87M 768 | 768 | 8 | 0.762 | 0.424 | 0.104 | 0.349 | none |
|
| 9 |
| AIST-87M 512 | 512 | 8 | 0.762 | 0.424 | 0.104 | 0.349 | none |
|
| 10 |
| AIST-95M 1280 Flickr | 1280 | 2 | - | 0.485 | - | 0.485 | ClothoT2ARetrieval, CommonVoiceMini21T2ARetrieval, MACST2ARetrieval, STSBenchmark, SprintDuplicateQuestions, UrbanSound8KT2ARetrieval |
|
|
|
|
| 11 |
| Native mn20 audio 768 | 768 | 4 | - | - | 0.115 | 0.115 | Flickr30kI2TRetrieval, Flickr30kT2IRetrieval, STSBenchmark, SprintDuplicateQuestions |
|
| 12 |
| Dual-audio tower 1280 | 1280 | 8 | 0.879 | 0.485 | 0.113 | 0.397 | none |
|
| 13 |
| Dual-audio tower 768 | 768 | 6 | 0.879 | 0.484 | 0.167 | 0.510 | MACST2ARetrieval, UrbanSound8KT2ARetrieval |
|
|
|
|
| 44 |
|
| 45 |
## Paired Comparisons
|
| 46 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
### AIST-87M 768 vs Native mn20 audio 768
|
| 48 |
|
| 49 |
Mean absolute delta (signed mean of target minus baseline, in raw score units, not a mean of magnitudes) over 4 paired tasks: -0.011.
|
aist_81m_raw_mn20_lora.yaml
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
# Raw AIST-81M baseline.
|
| 2 |
#
|
| 3 |
# Generic trimodal InfoNCE teacher recipe using the single native mn20_as LoRA
|
| 4 |
-
# audio path
|
| 5 |
-
#
|
| 6 |
-
#
|
| 7 |
|
| 8 |
dataset_dir: datasets
|
| 9 |
dataset_name: wordnet_2024_openai_validaudio
|
|
|
|
| 1 |
# Raw AIST-81M baseline.
|
| 2 |
#
|
| 3 |
# Generic trimodal InfoNCE teacher recipe using the single native mn20_as LoRA
|
| 4 |
+
# audio path. This intentionally has no extra signal layout or specialized
|
| 5 |
+
# corpus/loss. It is the full-data baseline; subset cache aliases are only for
|
| 6 |
+
# smoke tests.
|
| 7 |
|
| 8 |
dataset_dir: datasets
|
| 9 |
dataset_name: wordnet_2024_openai_validaudio
|
manifest.json
CHANGED
|
@@ -48,8 +48,8 @@
|
|
| 48 |
},
|
| 49 |
{
|
| 50 |
"path": "aist_81m_raw_mn20_lora.yaml",
|
| 51 |
-
"size_bytes":
|
| 52 |
-
"sha256": "
|
| 53 |
},
|
| 54 |
{
|
| 55 |
"path": "parameter_breakdown.json",
|
|
@@ -58,13 +58,13 @@
|
|
| 58 |
},
|
| 59 |
{
|
| 60 |
"path": "aist87m_memory_slice_release_report.md",
|
| 61 |
-
"size_bytes":
|
| 62 |
-
"sha256": "
|
| 63 |
},
|
| 64 |
{
|
| 65 |
"path": "aist87m_memory_slice_release_report.json",
|
| 66 |
-
"size_bytes":
|
| 67 |
-
"sha256": "
|
| 68 |
},
|
| 69 |
{
|
| 70 |
"path": "aist87m_vs_dual_audio_throughput_l4_20260504.json",
|
|
|
|
| 48 |
},
|
| 49 |
{
|
| 50 |
"path": "aist_81m_raw_mn20_lora.yaml",
|
| 51 |
+
"size_bytes": 1280,
|
| 52 |
+
"sha256": "38dc7cbf48c39af80c0828d37d39c8a94ec30b722dc60b09dd06d2704b787c0d"
|
| 53 |
},
|
| 54 |
{
|
| 55 |
"path": "parameter_breakdown.json",
|
|
|
|
| 58 |
},
|
| 59 |
{
|
| 60 |
"path": "aist87m_memory_slice_release_report.md",
|
| 61 |
+
"size_bytes": 4514,
|
| 62 |
+
"sha256": "6d036a8e0b82aa6fb25b9eb55721fc26d2546ff5fea8a14baca1310e5fe3c7f8"
|
| 63 |
},
|
| 64 |
{
|
| 65 |
"path": "aist87m_memory_slice_release_report.json",
|
| 66 |
+
"size_bytes": 33035,
|
| 67 |
+
"sha256": "5af8785667cb7f053dd42975cb6ac94e79214ac5fe7c465ab82a136701f0c75a"
|
| 68 |
},
|
| 69 |
{
|
| 70 |
"path": "aist87m_vs_dual_audio_throughput_l4_20260504.json",
|