gcoderw committed on
Commit
9c1b25e
·
verified ·
1 Parent(s): e412b76

Remove ES-AIST comparison from AIST-87M card

Browse files
README.md CHANGED
@@ -99,11 +99,11 @@ Selected 1280d task scores:
99
 
100
  ## Task-Aligned Comparisons
101
 
102
- Comparisons below are only for locally available, task-aligned runs.
 
103
 
104
  | Comparison | Dim | Paired tasks | Read |
105
  |---|---:|---:|---|
106
- | vs `ES-AIST-81M` | 768 | 8 | lower text continuity; stronger Flickr and selected audio recall |
107
  | vs native `mn20_as` audio baseline | 768 | 4 | slightly lower selected audio recall on average; UrbanSound8K is flat |
108
  | vs dual-audio tower | 768 | 6 | smaller single-audio runtime, but lower paired text/image/audio scores |
109
  | vs `AIST-95M` | 1280 | 2 | only paired Flickr tasks are available locally; `AIST-95M` remains stronger on that pair |
 
99
 
100
  ## Task-Aligned Comparisons
101
 
102
+ Comparisons below are only for locally available, task-aligned runs from the
103
+ same raw AIST line and its audio baselines.
104
 
105
  | Comparison | Dim | Paired tasks | Read |
106
  |---|---:|---:|---|
 
107
  | vs native `mn20_as` audio baseline | 768 | 4 | slightly lower selected audio recall on average; UrbanSound8K is flat |
108
  | vs dual-audio tower | 768 | 6 | smaller single-audio runtime, but lower paired text/image/audio scores |
109
  | vs `AIST-95M` | 1280 | 2 | only paired Flickr tasks are available locally; `AIST-95M` remains stronger on that pair |
aist87m_memory_slice_release_report.json CHANGED
@@ -491,146 +491,6 @@
491
  "/shared/augmem/triembed/results/aist95m_1280_mieb_flickr_20260502T0217Z/dim1280/results/triembed__te-1280d/best_model"
492
  ]
493
  },
494
- {
495
- "label": "ES-AIST-81M 768",
496
- "dimension": 768,
497
- "results_dir": "/shared/augmem/triembed/results/es_aist_memory_slice_default_20260501T1835Z/dim768/results/triembed__te-768d/best_model",
498
- "completed_tasks": 8,
499
- "missing_tasks": [],
500
- "overall_mean": 0.30764677777777777,
501
- "family_means": {
502
- "Audio recall": 0.06462555555555556,
503
- "Image recall": 0.271195,
504
- "Text continuity": 0.830141
505
- },
506
- "rows": [
507
- {
508
- "label": "ES-AIST-81M 768",
509
- "dimension": 768,
510
- "task": "SprintDuplicateQuestions",
511
- "family": "Text continuity",
512
- "primary_metric": "main_score",
513
- "primary": 0.916128,
514
- "metrics": {
515
- "main_score": 0.916128
516
- },
517
- "subsets": 1
518
- },
519
- {
520
- "label": "ES-AIST-81M 768",
521
- "dimension": 768,
522
- "task": "STSBenchmark",
523
- "family": "Text continuity",
524
- "primary_metric": "main_score",
525
- "primary": 0.744154,
526
- "metrics": {
527
- "main_score": 0.744154,
528
- "cosine_spearman": 0.744154,
529
- "spearman": 0.744154
530
- },
531
- "subsets": 1
532
- },
533
- {
534
- "label": "ES-AIST-81M 768",
535
- "dimension": 768,
536
- "task": "Flickr30kT2IRetrieval",
537
- "family": "Image recall",
538
- "primary_metric": "ndcg_at_10",
539
- "primary": 0.34676,
540
- "metrics": {
541
- "main_score": 0.34676,
542
- "ndcg_at_10": 0.34676,
543
- "recall_at_1": 0.1764,
544
- "recall_at_10": 0.5528,
545
- "mrr_at_10": 0.282987
546
- },
547
- "subsets": 1
548
- },
549
- {
550
- "label": "ES-AIST-81M 768",
551
- "dimension": 768,
552
- "task": "Flickr30kI2TRetrieval",
553
- "family": "Image recall",
554
- "primary_metric": "ndcg_at_10",
555
- "primary": 0.19563,
556
- "metrics": {
557
- "main_score": 0.19563,
558
- "ndcg_at_10": 0.19563,
559
- "recall_at_1": 0.037,
560
- "recall_at_10": 0.2208,
561
- "mrr_at_10": 0.295532
562
- },
563
- "subsets": 1
564
- },
565
- {
566
- "label": "ES-AIST-81M 768",
567
- "dimension": 768,
568
- "task": "CommonVoiceMini21T2ARetrieval",
569
- "family": "Audio recall",
570
- "primary_metric": "ndcg_at_10",
571
- "primary": 0.024182222222222223,
572
- "metrics": {
573
- "main_score": 0.02774760683760684,
574
- "ndcg_at_10": 0.024182222222222223,
575
- "recall_at_1": 0.005472478632478632,
576
- "recall_at_10": 0.052400170940170944,
577
- "mrr_at_10": 0.01579925641025641
578
- },
579
- "subsets": 117
580
- },
581
- {
582
- "label": "ES-AIST-81M 768",
583
- "dimension": 768,
584
- "task": "MACST2ARetrieval",
585
- "family": "Audio recall",
586
- "primary_metric": "ndcg_at_10",
587
- "primary": 0.07729,
588
- "metrics": {
589
- "main_score": 0.08906,
590
- "ndcg_at_10": 0.07729,
591
- "recall_at_1": 0.0229,
592
- "recall_at_10": 0.15013,
593
- "mrr_at_10": 0.055219
594
- },
595
- "subsets": 1
596
- },
597
- {
598
- "label": "ES-AIST-81M 768",
599
- "dimension": 768,
600
- "task": "UrbanSound8KT2ARetrieval",
601
- "family": "Audio recall",
602
- "primary_metric": "ndcg_at_10",
603
- "primary": 0.007,
604
- "metrics": {
605
- "main_score": 0.00747,
606
- "ndcg_at_10": 0.007,
607
- "recall_at_1": 0.00098,
608
- "recall_at_10": 0.01631,
609
- "mrr_at_10": 0.004257
610
- },
611
- "subsets": 1
612
- },
613
- {
614
- "label": "ES-AIST-81M 768",
615
- "dimension": 768,
616
- "task": "ClothoT2ARetrieval",
617
- "family": "Audio recall",
618
- "primary_metric": "ndcg_at_10",
619
- "primary": 0.15003,
620
- "metrics": {
621
- "main_score": 0.17601,
622
- "ndcg_at_10": 0.15003,
623
- "recall_at_1": 0.05121,
624
- "recall_at_10": 0.28612,
625
- "mrr_at_10": 0.108968
626
- },
627
- "subsets": 1
628
- }
629
- ],
630
- "source_result_dirs": [
631
- "/shared/augmem/triembed/results/es_aist_memory_slice_default_20260501T1835Z/dim768/results/triembed__te-768d/best_model"
632
- ]
633
- },
634
  {
635
  "label": "Native mn20 audio 768",
636
  "dimension": 768,
@@ -1049,102 +909,6 @@
1049
  }
1050
  ],
1051
  "comparisons": [
1052
- {
1053
- "baseline": "ES-AIST-81M 768",
1054
- "target": "AIST-87M 768",
1055
- "paired_tasks": 8,
1056
- "mean_absolute_delta": 0.041065177350427334,
1057
- "rows": [
1058
- {
1059
- "task": "SprintDuplicateQuestions",
1060
- "dimension": 768,
1061
- "family": "Text continuity",
1062
- "baseline": "ES-AIST-81M 768",
1063
- "baseline_primary": 0.916128,
1064
- "target": "AIST-87M 768",
1065
- "target_primary": 0.874231,
1066
- "absolute_delta": -0.04189700000000007,
1067
- "relative_delta_pct": -4.573269237486473
1068
- },
1069
- {
1070
- "task": "STSBenchmark",
1071
- "dimension": 768,
1072
- "family": "Text continuity",
1073
- "baseline": "ES-AIST-81M 768",
1074
- "baseline_primary": 0.744154,
1075
- "target": "AIST-87M 768",
1076
- "target_primary": 0.650759,
1077
- "absolute_delta": -0.093395,
1078
- "relative_delta_pct": -12.550493580629817
1079
- },
1080
- {
1081
- "task": "Flickr30kT2IRetrieval",
1082
- "dimension": 768,
1083
- "family": "Image recall",
1084
- "baseline": "ES-AIST-81M 768",
1085
- "baseline_primary": 0.34676,
1086
- "target": "AIST-87M 768",
1087
- "target_primary": 0.46701,
1088
- "absolute_delta": 0.12024999999999997,
1089
- "relative_delta_pct": 34.67816357134617
1090
- },
1091
- {
1092
- "task": "Flickr30kI2TRetrieval",
1093
- "dimension": 768,
1094
- "family": "Image recall",
1095
- "baseline": "ES-AIST-81M 768",
1096
- "baseline_primary": 0.19563,
1097
- "target": "AIST-87M 768",
1098
- "target_primary": 0.38062,
1099
- "absolute_delta": 0.18499000000000002,
1100
- "relative_delta_pct": 94.56116137606708
1101
- },
1102
- {
1103
- "task": "CommonVoiceMini21T2ARetrieval",
1104
- "dimension": 768,
1105
- "family": "Audio recall",
1106
- "baseline": "ES-AIST-81M 768",
1107
- "baseline_primary": 0.024182222222222223,
1108
- "target": "AIST-87M 768",
1109
- "target_primary": 0.028395641025641027,
1110
- "absolute_delta": 0.004213418803418804,
1111
- "relative_delta_pct": 17.423621223474193
1112
- },
1113
- {
1114
- "task": "MACST2ARetrieval",
1115
- "dimension": 768,
1116
- "family": "Audio recall",
1117
- "baseline": "ES-AIST-81M 768",
1118
- "baseline_primary": 0.07729,
1119
- "target": "AIST-87M 768",
1120
- "target_primary": 0.11149,
1121
- "absolute_delta": 0.03420000000000001,
1122
- "relative_delta_pct": 44.24893259153838
1123
- },
1124
- {
1125
- "task": "UrbanSound8KT2ARetrieval",
1126
- "dimension": 768,
1127
- "family": "Audio recall",
1128
- "baseline": "ES-AIST-81M 768",
1129
- "baseline_primary": 0.007,
1130
- "target": "AIST-87M 768",
1131
- "target_primary": 0.00851,
1132
- "absolute_delta": 0.00151,
1133
- "relative_delta_pct": 21.571428571428573
1134
- },
1135
- {
1136
- "task": "ClothoT2ARetrieval",
1137
- "dimension": 768,
1138
- "family": "Audio recall",
1139
- "baseline": "ES-AIST-81M 768",
1140
- "baseline_primary": 0.15003,
1141
- "target": "AIST-87M 768",
1142
- "target_primary": 0.26868,
1143
- "absolute_delta": 0.11864999999999998,
1144
- "relative_delta_pct": 79.08418316336731
1145
- }
1146
- ]
1147
- },
1148
  {
1149
  "baseline": "Native mn20 audio 768",
1150
  "target": "AIST-87M 768",
@@ -1181,7 +945,7 @@
1181
  "baseline_primary": 0.00849,
1182
  "target": "AIST-87M 768",
1183
  "target_primary": 0.00851,
1184
- "absolute_delta": 2.000000000000092e-05,
1185
  "relative_delta_pct": 0.2355712603062535
1186
  },
1187
  {
 
491
  "/shared/augmem/triembed/results/aist95m_1280_mieb_flickr_20260502T0217Z/dim1280/results/triembed__te-1280d/best_model"
492
  ]
493
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
494
  {
495
  "label": "Native mn20 audio 768",
496
  "dimension": 768,
 
909
  }
910
  ],
911
  "comparisons": [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
912
  {
913
  "baseline": "Native mn20 audio 768",
914
  "target": "AIST-87M 768",
 
945
  "baseline_primary": 0.00849,
946
  "target": "AIST-87M 768",
947
  "target_primary": 0.00851,
948
+ "absolute_delta": 0.00002000000000000092,
949
  "relative_delta_pct": 0.2355712603062535
950
  },
951
  {
aist87m_memory_slice_release_report.md CHANGED
@@ -8,7 +8,6 @@ Primary metrics are `main_score` for text continuity tasks and `NDCG@10` for ima
8
  | AIST-87M 768 | 768 | 8 | 0.762 | 0.424 | 0.104 | 0.349 | none |
9
  | AIST-87M 512 | 512 | 8 | 0.762 | 0.424 | 0.104 | 0.349 | none |
10
  | AIST-95M 1280 Flickr | 1280 | 2 | - | 0.485 | - | 0.485 | ClothoT2ARetrieval, CommonVoiceMini21T2ARetrieval, MACST2ARetrieval, STSBenchmark, SprintDuplicateQuestions, UrbanSound8KT2ARetrieval |
11
- | ES-AIST-81M 768 | 768 | 8 | 0.830 | 0.271 | 0.065 | 0.308 | none |
12
  | Native mn20 audio 768 | 768 | 4 | - | - | 0.115 | 0.115 | Flickr30kI2TRetrieval, Flickr30kT2IRetrieval, STSBenchmark, SprintDuplicateQuestions |
13
  | Dual-audio tower 1280 | 1280 | 8 | 0.879 | 0.485 | 0.113 | 0.397 | none |
14
  | Dual-audio tower 768 | 768 | 6 | 0.879 | 0.484 | 0.167 | 0.510 | MACST2ARetrieval, UrbanSound8KT2ARetrieval |
@@ -45,21 +44,6 @@ Primary metrics are `main_score` for text continuity tasks and `NDCG@10` for ima
45
 
46
  ## Paired Comparisons
47
 
48
- ### AIST-87M 768 vs ES-AIST-81M 768
49
-
50
- Mean absolute delta over 8 paired tasks: 0.041.
51
-
52
- | Dim | Task | Baseline | Target | Absolute delta | Relative delta |
53
- |---:|---|---:|---:|---:|---:|
54
- | 768 | SprintDuplicateQuestions | 0.916 | 0.874 | -0.042 | -4.6% |
55
- | 768 | STSBenchmark | 0.744 | 0.651 | -0.093 | -12.6% |
56
- | 768 | Flickr30kT2IRetrieval | 0.347 | 0.467 | 0.120 | 34.7% |
57
- | 768 | Flickr30kI2TRetrieval | 0.196 | 0.381 | 0.185 | 94.6% |
58
- | 768 | CommonVoiceMini21T2ARetrieval | 0.024 | 0.028 | 0.004 | 17.4% |
59
- | 768 | MACST2ARetrieval | 0.077 | 0.111 | 0.034 | 44.2% |
60
- | 768 | UrbanSound8KT2ARetrieval | 0.007 | 0.009 | 0.002 | 21.6% |
61
- | 768 | ClothoT2ARetrieval | 0.150 | 0.269 | 0.119 | 79.1% |
62
-
63
  ### AIST-87M 768 vs Native mn20 audio 768
64
 
65
  Mean absolute (signed, non-relative) delta over 4 paired tasks: -0.011.
 
8
  | AIST-87M 768 | 768 | 8 | 0.762 | 0.424 | 0.104 | 0.349 | none |
9
  | AIST-87M 512 | 512 | 8 | 0.762 | 0.424 | 0.104 | 0.349 | none |
10
  | AIST-95M 1280 Flickr | 1280 | 2 | - | 0.485 | - | 0.485 | ClothoT2ARetrieval, CommonVoiceMini21T2ARetrieval, MACST2ARetrieval, STSBenchmark, SprintDuplicateQuestions, UrbanSound8KT2ARetrieval |
 
11
  | Native mn20 audio 768 | 768 | 4 | - | - | 0.115 | 0.115 | Flickr30kI2TRetrieval, Flickr30kT2IRetrieval, STSBenchmark, SprintDuplicateQuestions |
12
  | Dual-audio tower 1280 | 1280 | 8 | 0.879 | 0.485 | 0.113 | 0.397 | none |
13
  | Dual-audio tower 768 | 768 | 6 | 0.879 | 0.484 | 0.167 | 0.510 | MACST2ARetrieval, UrbanSound8KT2ARetrieval |
 
44
 
45
  ## Paired Comparisons
46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
  ### AIST-87M 768 vs Native mn20 audio 768
48
 
49
  Mean absolute (signed, non-relative) delta over 4 paired tasks: -0.011.
aist_81m_raw_mn20_lora.yaml CHANGED
@@ -1,9 +1,9 @@
1
  # Raw AIST-81M baseline.
2
  #
3
  # Generic trimodal InfoNCE teacher recipe using the single native mn20_as LoRA
4
- # audio path already used by ES-AIST-81M. This intentionally has no ES/entity
5
- # signal layout or entity-specific corpus/loss. It is the full-data baseline;
6
- # subset cache aliases are only for smoke tests.
7
 
8
  dataset_dir: datasets
9
  dataset_name: wordnet_2024_openai_validaudio
 
1
  # Raw AIST-81M baseline.
2
  #
3
  # Generic trimodal InfoNCE teacher recipe using the single native mn20_as LoRA
4
+ # audio path. This intentionally has no extra signal layout or specialized
5
+ # corpus/loss. It is the full-data baseline; subset cache aliases are only for
6
+ # smoke tests.
7
 
8
  dataset_dir: datasets
9
  dataset_name: wordnet_2024_openai_validaudio
manifest.json CHANGED
@@ -48,8 +48,8 @@
48
  },
49
  {
50
  "path": "aist_81m_raw_mn20_lora.yaml",
51
- "size_bytes": 1316,
52
- "sha256": "f2a12e8664edac6e3180b08d49842ea38a1c5dbbe65284e79db785590b1f398e"
53
  },
54
  {
55
  "path": "parameter_breakdown.json",
@@ -58,13 +58,13 @@
58
  },
59
  {
60
  "path": "aist87m_memory_slice_release_report.md",
61
- "size_bytes": 5282,
62
- "sha256": "a2f7dc6bd68d079d3655d5a2b083ecc0f26fb6a5041ea89a9835465045f86b42"
63
  },
64
  {
65
  "path": "aist87m_memory_slice_release_report.json",
66
- "size_bytes": 40579,
67
- "sha256": "6b5894bd3b8de9e8c99ccd236587fc6f4652399ac95b343ba301478fcdb7c77a"
68
  },
69
  {
70
  "path": "aist87m_vs_dual_audio_throughput_l4_20260504.json",
 
48
  },
49
  {
50
  "path": "aist_81m_raw_mn20_lora.yaml",
51
+ "size_bytes": 1280,
52
+ "sha256": "38dc7cbf48c39af80c0828d37d39c8a94ec30b722dc60b09dd06d2704b787c0d"
53
  },
54
  {
55
  "path": "parameter_breakdown.json",
 
58
  },
59
  {
60
  "path": "aist87m_memory_slice_release_report.md",
61
+ "size_bytes": 4514,
62
+ "sha256": "6d036a8e0b82aa6fb25b9eb55721fc26d2546ff5fea8a14baca1310e5fe3c7f8"
63
  },
64
  {
65
  "path": "aist87m_memory_slice_release_report.json",
66
+ "size_bytes": 33035,
67
+ "sha256": "5af8785667cb7f053dd42975cb6ac94e79214ac5fe7c465ab82a136701f0c75a"
68
  },
69
  {
70
  "path": "aist87m_vs_dual_audio_throughput_l4_20260504.json",