gcoderw committed on
Commit
d000b90
·
verified ·
1 Parent(s): e6ea540

Publish ESS-AIST-81M preview base release

Browse files
.gitattributes CHANGED
@@ -1,35 +1,2 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
  *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  *.safetensors filter=lfs diff=lfs merge=lfs -text
2
+ *.gguf filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
ESS-AIST-81M.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b7eb74bacea98e7122e723c164dfd912dd1ccb6902605972f905f95c337dfa2
3
+ size 323643112
README.md ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - en
4
+ license: apache-2.0
5
+ tags:
6
+ - multimodal
7
+ - embedding
8
+ - trimodal
9
+ - retrieval
10
+ - image-text-audio
11
+ - feature-extraction
12
+ library_name: pytorch
13
+ pipeline_tag: feature-extraction
14
+ datasets:
15
+ - custom
16
+ ---
17
+
18
+ # ESS-AIST-81M Preview
19
+
20
+ `ESS-AIST-81M Preview` is the current Cortext trial checkpoint from the ESS line.
21
+
22
+ - release checkpoint: `ess_aist_full_v7_librispeech360_l4i/checkpoint_epoch_11.pt`
23
+ - text encoder: `MongoDB/mdbr-leaf-ir`
24
+ - image encoder: `mobilenetv4_conv_medium.e180_r384_in12k`
25
+ - audio encoder: native `mn20_as` EfficientAT LoRA audio backbone
26
+
27
+ This is the base safetensors release. GGUF quantizations are published separately.
28
+
29
+ ## Embedding Layout
30
+
31
+ Output embedding: `1536d`
32
+
33
+ - `0:512` semantic
34
+ - `512:1024` subject
35
+ - `1024:1536` event
36
+
37
+ Recommended normalized runtime views:
38
+
39
+ - `semantic_key = l2norm(z[0:512])`
40
+ - `subject_key = l2norm(z[512:1024])`
41
+ - `event_key = l2norm(z[1024:1536])`
42
+ - `full_key = l2norm(z[0:1536])`
43
+
44
+ ## Exact Release Metrics
45
+
46
+ All numbers below are from the exact published checkpoint `checkpoint_epoch_11.pt`.
47
+
48
+ ### 512d Retrieval
49
+
50
+ Source:
51
+ - `retrieval_512_gt1030.json`
52
+
53
+ Speech holdout:
54
+
55
+ - `A->T_r1 = 0.4672`
56
+ - `T->A_r1 = 0.4606`
57
+ - `A->T_r5 = 0.7398`
58
+ - `T->A_r5 = 0.7426`
59
+
60
+ SALT:
61
+
62
+ - `I->T_r1 = 0.4149`
63
+ - `T->I_r1 = 0.4327`
64
+ - `A->T_r1 = 0.2408`
65
+ - `T->A_r1 = 0.2486`
66
+ - `I->A_r1 = 0.4621`
67
+ - `A->I_r1 = 0.4829`
68
+
69
+ ### Held-Out ESS Eval
70
+
71
+ Subject:
72
+
73
+ - `subject_key` same/different AUC: `0.5067`
74
+ - `subject_key` same-topic-different-subject rejection AUC: `0.5067`
75
+
76
+ Event:
77
+
78
+ - `event_key` same/different AUC: `0.8241`
79
+ - `event_key` same-subject-different-event rejection AUC: `0.5535`
80
+ - `event_key` topic-shift rejection AUC: `0.9770`
81
+
82
+ ## Architecture
83
+
84
+ This preview is a frozen-encoder / trainable-projector stack:
85
+
86
+ - text encoder params: `22,861,056`
87
+ - image encoder params: `8,434,512`
88
+ - audio encoder params: `20,639,974`
89
+ - image projection params: `9,975,296`
90
+ - audio projection params: `9,975,296`
91
+ - text projection params: `8,926,720`
92
+ - total exact loaded params: `80,812,854`
93
+
94
+ ## Files
95
+
96
+ | File | Purpose |
97
+ |---|---|
98
+ | `ESS-AIST-81M.safetensors` | Base preview release artifact |
99
+ | `export_metadata.json` | ESS export contract |
100
+ | `manifest.json` | Release manifest |
101
+ | `parameter_breakdown.json` | Exact parameter accounting |
102
+ | `ess_ait_86m_spec.yaml` | Training config used for the release line |
103
+ | `retrieval_512_gt1030.json` | Exact 512d retrieval eval for this checkpoint |
104
+ | `subject_eval.json` | Exact held-out subject eval for this checkpoint |
105
+ | `event_eval.json` | Exact held-out event eval for this checkpoint |
106
+ | `prefix_eval.json` | Prefix-level AUC summary |
107
+
108
+ ## Caveats
109
+
110
+ - This is the current preview checkpoint, not the finished ESS subject-memory model.
111
+ - Subject performance is still the weakest domain on the current held-out eval: the `subject_key` same/different AUC of `0.5067` is effectively chance level.
112
+ - Use this for internal Cortext trials, not as the final memory-model release.
ess_ait_86m_spec.yaml ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ESS-AIST-86M starter spec config.
2
+ # This is a design-time config for the ESS trainer/export path built on AIT-86M.
3
+ # It intentionally extends beyond the current trimodal trainer surface.
4
+
5
+ dataset_dir: datasets
6
+ dataset_name: ess_multimodal_core_v1
7
+ cache_dir: cache
8
+
9
+ encoder_name: mobilenetv4_conv_medium.e180_r384_in12k
10
+ encoder_dim: 1280
11
+ modality: trimodal
12
+ audio_encoder_dim: 1280
13
+ audio_finetune_last_n_stages: 0
14
+ projection_hidden_dim: 2048
15
+ projection_output_dim: 1536
16
+ projection_dropout: 0.3
17
+
18
+ # Teacher policy:
19
+ # - AIST-95M is the primary semantic distillation teacher, especially for speech
20
+ # and audio-text retrieval retention.
21
+ # - AIT-86M is a secondary compatibility teacher / drift regularizer.
22
+ # Naming note:
23
+ # - although the deployment/runtime surface remains AIT-compatible, the primary
24
+ # teacher lineage is AIST, so the model family name is ESS-AIST-86M.
25
+ primary_semantic_teacher: AIST-95M
26
+ secondary_compat_teacher: AIT-86M
27
+
28
+ batch_size: 4096
29
+ max_epochs: 50
30
+ learning_rate: 0.0012
31
+ weight_decay: 0.0001
32
+ warmup_fraction: 0.05
33
+ grad_clip_norm: 1.0
34
+ gradient_accumulation_steps: 1
35
+
36
+ # Prefix Matryoshka targets.
37
+ matryoshka_dims: [1536, 1024, 512]
38
+ matryoshka_weights: [1.0, 1.0, 1.0]
39
+
40
+ loss_type: infonce
41
+ temperature: 0.07
42
+ temperature_min: 0.01
43
+ learn_temperature: true
44
+ hard_neg_k: 8
45
+ hard_neg_weight: 2.0
46
+ false_neg_threshold: 0.85
47
+
48
+ feature_noise_std: 0.0
49
+ feature_mask_ratio: 0.0
50
+ mixup_alpha: 0.0
51
+
52
+ num_workers: 4
53
+ pin_memory: true
54
+ prefetch_factor: 2
55
+ persistent_workers: true
56
+ mixed_precision: bf16
57
+
58
+ checkpoint_dir: checkpoints
59
+ save_every_n_epochs: 2
60
+ early_stopping_patience: 8
61
+ log_dir: runs
62
+ benchmark_eval_every_epochs: 1
63
+
64
+ # Next-run ESS corpus, adding LibriSpeech person-subject rows on top of the
65
+ # v6 subject-media + WIT + speech/wavcaps semantic lane.
66
+ ess_corpus_dir: checkpoints/ess_ait_86m_20260430T035907Z/ess_corpus_v7_subject_media_wit4096_speech100k_wavcaps100k_librispeech360
67
+ ess_train_jsonl: checkpoints/ess_ait_86m_20260430T035907Z/ess_corpus_v7_subject_media_wit4096_speech100k_wavcaps100k_librispeech360/train.jsonl
68
+ ess_val_jsonl: checkpoints/ess_ait_86m_20260430T035907Z/ess_corpus_v7_subject_media_wit4096_speech100k_wavcaps100k_librispeech360/val.jsonl
69
+ ess_train_text_cache: checkpoints/ess_ait_86m_20260430T035907Z/ess_corpus_v7_subject_media_wit4096_speech100k_wavcaps100k_librispeech360/cache/ess_corpus_v7_subject_media_wit4096_speech100k_wavcaps100k_librispeech360_train_leaf_ir_text_features.npy
70
+ ess_val_text_cache: checkpoints/ess_ait_86m_20260430T035907Z/ess_corpus_v7_subject_media_wit4096_speech100k_wavcaps100k_librispeech360/cache/ess_corpus_v7_subject_media_wit4096_speech100k_wavcaps100k_librispeech360_val_leaf_ir_text_features.npy
71
+
72
+ # Multimodal subject-media attachment from the finalized v19 generated bundle.
73
+ ess_subject_media_dataset_dir: checkpoints/ess_ait_86m_20260430T035907Z/ess_subject_media_pilot52_full_v19
74
+ ess_subject_media_train_image_cache: checkpoints/ess_ait_86m_20260430T035907Z/ess_subject_media_pilot52_full_v19/cache/ess_subject_media_pilot52_full_v19_train_mobilenetv4_conv_medium_image_features.npy
75
+ ess_subject_media_val_image_cache: checkpoints/ess_ait_86m_20260430T035907Z/ess_subject_media_pilot52_full_v19/cache/ess_subject_media_pilot52_full_v19_val_mobilenetv4_conv_medium_image_features.npy
76
+ ess_subject_media_train_audio_cache: checkpoints/ess_ait_86m_20260430T035907Z/ess_subject_media_pilot52_full_v19/cache/ess_subject_media_pilot52_full_v19_train_mn20_audioheavy_lora1280_audio_features.npy
77
+ ess_subject_media_val_audio_cache: checkpoints/ess_ait_86m_20260430T035907Z/ess_subject_media_pilot52_full_v19/cache/ess_subject_media_pilot52_full_v19_val_mn20_audioheavy_lora1280_audio_features.npy
78
+ ess_wordnet_train_audio_cache: cache/wordnet_2024_openai_validaudio_train_mn20_audioheavy_lora1280_audio_features.npy
79
+ ess_wordnet_val_audio_cache: cache/wordnet_2024_openai_validaudio_val_mn20_audioheavy_lora1280_audio_features.npy
80
+ ess_speech_audio_cache: cache/speech_chatterbox_150k_train_mn20_audioheavy_lora1280_audio_features.npy
81
+ ess_wavcaps_audio_cache: cache/wavcaps_fsd_train_mn20_audioheavy_lora1280_audio_features.npy
82
+ ess_salt_audio_cache: cache/benchmark_salt_features/salt_audio_mn20_audioheavy_lora1280_4999.npy
83
+
84
+ # Optional external entity-subject image caches, e.g. WIT-derived ESS records.
85
+ # These are split-aligned caches built from the final train/val JSONL rows.
86
+ ess_wit_train_image_cache: checkpoints/ess_ait_86m_20260430T035907Z/ess_wit_records_en_maincap_4096/cache/ess_wit_records_en_maincap_4096_train_mobilenetv4_conv_medium_image_features.npy
87
+ ess_wit_val_image_cache: checkpoints/ess_ait_86m_20260430T035907Z/ess_wit_records_en_maincap_4096/cache/ess_wit_records_en_maincap_4096_val_mobilenetv4_conv_medium_image_features.npy
88
+
89
+ # Optional external person-subject caches keyed by ESS record_id. The field
90
+ # names still say "voxceleb" for compatibility, but they also carry staged
91
+ # LibriSpeech speaker-subject audio caches.
92
+ # ess_voxceleb_train_image_cache:
93
+ # ess_voxceleb_val_image_cache:
94
+ ess_voxceleb_train_audio_cache: checkpoints/ess_ait_86m_20260430T035907Z/ess_librispeech_subject_trainclean360/cache/ess_librispeech_subject_trainclean360_train_mn20_audioheavy_lora1280_audio_features.npy
95
+ ess_voxceleb_val_audio_cache: checkpoints/ess_ait_86m_20260430T035907Z/ess_librispeech_subject_trainclean360/cache/ess_librispeech_subject_trainclean360_val_mn20_audioheavy_lora1280_audio_features.npy
96
+
97
+ # ESS-specific fields for the forthcoming trainer.
98
+ ess_semantic_slice: [0, 512]
99
+ ess_subject_slice: [512, 1024]
100
+ ess_event_slice: [1024, 1536]
101
+
102
+ # Corpus composition at build time:
103
+ # - train: 219957 semantic / 28026 event / 94446 subject
104
+ # - val: 19954 semantic / 3074 event / 10967 subject
105
+ #
106
+ # Do not sample by raw row frequency. Subject supervision is too small and must be
107
+ # explicitly oversampled to shape the subject block.
108
+ ess_sampling:
109
+ strategy: weighted_family_with_replacement
110
+ unit: record
111
+ train_family_weights:
112
+ semantic: 0.50
113
+ subject: 0.15
114
+ event: 0.35
115
+ train_dataset_weights:
116
+ speech_chatterbox_150k: 5.0
117
+ wit_entity_subject: 0.25
118
+ librispeech_subject: 0.01
119
+ val_family_weights:
120
+ semantic: 0.50
121
+ subject: 0.15
122
+ event: 0.35
123
+ val_dataset_weights:
124
+ speech_chatterbox_150k: 5.0
125
+ wit_entity_subject: 0.25
126
+ librispeech_subject: 0.01
127
+ family_from_active_supervision:
128
+ semantic: semantic
129
+ subject: subject
130
+ event: event
131
+ max_records_per_step:
132
+ semantic: 2048
133
+ subject: 1024
134
+ event: 1024
135
+ notes:
136
+ - subject rows are intentionally oversampled relative to raw corpus count
137
+ - semantic remains dominant to protect 512d retrieval
138
+ - speech_chatterbox semantic rows are oversampled within semantic because only ~20k rows survive dedupe into v6
139
+ - librispeech_subject is heavily downweighted within subject so person voice identity helps without flooding the entire subject block
140
+ - event stays high enough to shape prefix_1536 without overwhelming semantic
141
+
142
+ ess_loss_weights:
143
+ semantic_retrieval: 1.0
144
+ semantic_distillation: 1.0
145
+ subject_multimodal: 0.8
146
+ subject_contrastive: 0.8
147
+ subject_hard_negative: 0.8
148
+ event_contrastive: 0.8
149
+ event_rejection: 1.0
150
+ prefix_512: 1.0
151
+ prefix_1024: 0.75
152
+ prefix_1536: 0.75
153
+ block_decorrelation: 0.1
154
+ variance_regularization: 0.1
155
+
156
+ ess_negative_buckets:
157
+ - same_topic_different_subject
158
+ - same_subject_different_event
159
+ - stale_same_source
160
+ - wrong_active
161
+ - topic_shift
162
+ - lookalike_or_soundalike
163
+ ess_negative_buckets_by_family:
164
+ semantic:
165
+ - same_topic_different_subject
166
+ - same_subject_different_event
167
+ - stale_same_source
168
+ - wrong_active
169
+ - topic_shift
170
+ - lookalike_or_soundalike
171
+ subject:
172
+ - same_topic_different_subject
173
+ - stale_same_source
174
+ - wrong_active
175
+ - topic_shift
176
+ - lookalike_or_soundalike
177
+ event:
178
+ - same_topic_different_subject
179
+ - same_subject_different_event
180
+ - stale_same_source
181
+ - wrong_active
182
+ - topic_shift
183
+ - lookalike_or_soundalike
184
+
185
+ ess_eval_views:
186
+ - semantic_key
187
+ - subject_key
188
+ - event_key
189
+ - full_key
190
+ - prefix_512
191
+ - prefix_1024
192
+ - prefix_1536
event_eval.json ADDED
@@ -0,0 +1,266 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "checkpoint": "/shared/augmem/triembed/checkpoints/ess_aist_full_v7_librispeech360_l4i/checkpoint_epoch_11.pt",
3
+ "split": "val",
4
+ "records_path": "/shared/augmem/triembed/checkpoints/ess_ait_86m_20260430T035907Z/ess_corpus_v7_subject_media_wit4096_speech100k_wavcaps100k_librispeech360/val.jsonl",
5
+ "views": {
6
+ "semantic_key": {
7
+ "event_same_different_auc": {
8
+ "auc": 0.829112461248993,
9
+ "positive_pairs": 7703,
10
+ "negative_pairs": 327075,
11
+ "positive_mean": 0.7533491437897564,
12
+ "negative_mean": 0.5995114385256625
13
+ },
14
+ "same_subject_different_event_rejection_auc": {
15
+ "auc": 0.5802306316888112,
16
+ "positive_pairs": 7703,
17
+ "negative_pairs": 118115,
18
+ "positive_mean": 0.7533491437897564,
19
+ "negative_mean": 0.7313196869598024
20
+ },
21
+ "stale_same_source_rejection_auc": {
22
+ "auc": null,
23
+ "positive_pairs": 7703,
24
+ "negative_pairs": 0,
25
+ "positive_mean": 0.7533491437897564,
26
+ "negative_mean": null
27
+ },
28
+ "wrong_active_rejection_auc": {
29
+ "auc": null,
30
+ "positive_pairs": 7703,
31
+ "negative_pairs": 0,
32
+ "positive_mean": 0.7533491437897564,
33
+ "negative_mean": null
34
+ },
35
+ "topic_shift_rejection_auc": {
36
+ "auc": 0.9697933441859231,
37
+ "positive_pairs": 7703,
38
+ "negative_pairs": 208960,
39
+ "positive_mean": 0.7533491437897564,
40
+ "negative_mean": 0.525006599016673
41
+ }
42
+ },
43
+ "subject_key": {
44
+ "event_same_different_auc": {
45
+ "auc": 0.6676734827239529,
46
+ "positive_pairs": 7703,
47
+ "negative_pairs": 327075,
48
+ "positive_mean": 0.668074605508422,
49
+ "negative_mean": 0.5629470272292642
50
+ },
51
+ "same_subject_different_event_rejection_auc": {
52
+ "auc": 0.1862661483021773,
53
+ "positive_pairs": 7703,
54
+ "negative_pairs": 118115,
55
+ "positive_mean": 0.668074605508422,
56
+ "negative_mean": 0.7863739344748515
57
+ },
58
+ "stale_same_source_rejection_auc": {
59
+ "auc": null,
60
+ "positive_pairs": 7703,
61
+ "negative_pairs": 0,
62
+ "positive_mean": 0.668074605508422,
63
+ "negative_mean": null
64
+ },
65
+ "wrong_active_rejection_auc": {
66
+ "auc": null,
67
+ "positive_pairs": 7703,
68
+ "negative_pairs": 0,
69
+ "positive_mean": 0.668074605508422,
70
+ "negative_mean": null
71
+ },
72
+ "topic_shift_rejection_auc": {
73
+ "auc": 0.9397898078829693,
74
+ "positive_pairs": 7703,
75
+ "negative_pairs": 208960,
76
+ "positive_mean": 0.668074605508422,
77
+ "negative_mean": 0.43665458298485105
78
+ }
79
+ },
80
+ "event_key": {
81
+ "event_same_different_auc": {
82
+ "auc": 0.8240710674869262,
83
+ "positive_pairs": 7703,
84
+ "negative_pairs": 327075,
85
+ "positive_mean": 0.6786398216704803,
86
+ "negative_mean": 0.4559661049066459
87
+ },
88
+ "same_subject_different_event_rejection_auc": {
89
+ "auc": 0.5534970574958717,
90
+ "positive_pairs": 7703,
91
+ "negative_pairs": 118115,
92
+ "positive_mean": 0.6786398216704803,
93
+ "negative_mean": 0.6430964619714105
94
+ },
95
+ "stale_same_source_rejection_auc": {
96
+ "auc": null,
97
+ "positive_pairs": 7703,
98
+ "negative_pairs": 0,
99
+ "positive_mean": 0.6786398216704803,
100
+ "negative_mean": null
101
+ },
102
+ "wrong_active_rejection_auc": {
103
+ "auc": null,
104
+ "positive_pairs": 7703,
105
+ "negative_pairs": 0,
106
+ "positive_mean": 0.6786398216704803,
107
+ "negative_mean": null
108
+ },
109
+ "topic_shift_rejection_auc": {
110
+ "auc": 0.9770134927840807,
111
+ "positive_pairs": 7703,
112
+ "negative_pairs": 208960,
113
+ "positive_mean": 0.6786398216704803,
114
+ "negative_mean": 0.35019034818428435
115
+ }
116
+ },
117
+ "full_key": {
118
+ "event_same_different_auc": {
119
+ "auc": 0.78280390304866,
120
+ "positive_pairs": 7703,
121
+ "negative_pairs": 327075,
122
+ "positive_mean": 0.7056915993491164,
123
+ "negative_mean": 0.5454881820918898
124
+ },
125
+ "same_subject_different_event_rejection_auc": {
126
+ "auc": 0.44637572176232837,
127
+ "positive_pairs": 7703,
128
+ "negative_pairs": 118115,
129
+ "positive_mean": 0.7056915993491164,
130
+ "negative_mean": 0.7219292155613277
131
+ },
132
+ "stale_same_source_rejection_auc": {
133
+ "auc": null,
134
+ "positive_pairs": 7703,
135
+ "negative_pairs": 0,
136
+ "positive_mean": 0.7056915993491164,
137
+ "negative_mean": null
138
+ },
139
+ "wrong_active_rejection_auc": {
140
+ "auc": null,
141
+ "positive_pairs": 7703,
142
+ "negative_pairs": 0,
143
+ "positive_mean": 0.7056915993491164,
144
+ "negative_mean": null
145
+ },
146
+ "topic_shift_rejection_auc": {
147
+ "auc": 0.9729705121252057,
148
+ "positive_pairs": 7703,
149
+ "negative_pairs": 208960,
150
+ "positive_mean": 0.7056915993491164,
151
+ "negative_mean": 0.4457545839475431
152
+ }
153
+ },
154
+ "prefix_512": {
155
+ "event_same_different_auc": {
156
+ "auc": 0.829112461248993,
157
+ "positive_pairs": 7703,
158
+ "negative_pairs": 327075,
159
+ "positive_mean": 0.7533491437897564,
160
+ "negative_mean": 0.5995114385256625
161
+ },
162
+ "same_subject_different_event_rejection_auc": {
163
+ "auc": 0.5802306316888112,
164
+ "positive_pairs": 7703,
165
+ "negative_pairs": 118115,
166
+ "positive_mean": 0.7533491437897564,
167
+ "negative_mean": 0.7313196869598024
168
+ },
169
+ "stale_same_source_rejection_auc": {
170
+ "auc": null,
171
+ "positive_pairs": 7703,
172
+ "negative_pairs": 0,
173
+ "positive_mean": 0.7533491437897564,
174
+ "negative_mean": null
175
+ },
176
+ "wrong_active_rejection_auc": {
177
+ "auc": null,
178
+ "positive_pairs": 7703,
179
+ "negative_pairs": 0,
180
+ "positive_mean": 0.7533491437897564,
181
+ "negative_mean": null
182
+ },
183
+ "topic_shift_rejection_auc": {
184
+ "auc": 0.9697933441859231,
185
+ "positive_pairs": 7703,
186
+ "negative_pairs": 208960,
187
+ "positive_mean": 0.7533491437897564,
188
+ "negative_mean": 0.525006599016673
189
+ }
190
+ },
191
+ "prefix_1024": {
192
+ "event_same_different_auc": {
193
+ "auc": 0.7453008223026156,
194
+ "positive_pairs": 7703,
195
+ "negative_pairs": 327075,
196
+ "positive_mean": 0.7183707235626442,
197
+ "negative_mean": 0.5870387001189491
198
+ },
199
+ "same_subject_different_event_rejection_auc": {
200
+ "auc": 0.3570483627258597,
201
+ "positive_pairs": 7703,
202
+ "negative_pairs": 118115,
203
+ "positive_mean": 0.7183707235626442,
204
+ "negative_mean": 0.7610074661872391
205
+ },
206
+ "stale_same_source_rejection_auc": {
207
+ "auc": null,
208
+ "positive_pairs": 7703,
209
+ "negative_pairs": 0,
210
+ "positive_mean": 0.7183707235626442,
211
+ "negative_mean": null
212
+ },
213
+ "wrong_active_rejection_auc": {
214
+ "auc": null,
215
+ "positive_pairs": 7703,
216
+ "negative_pairs": 0,
217
+ "positive_mean": 0.7183707235626442,
218
+ "negative_mean": null
219
+ },
220
+ "topic_shift_rejection_auc": {
221
+ "auc": 0.9647611939666115,
222
+ "positive_pairs": 7703,
223
+ "negative_pairs": 208960,
224
+ "positive_mean": 0.7183707235626442,
225
+ "negative_mean": 0.48870255538236756
226
+ }
227
+ },
228
+ "prefix_1536": {
229
+ "event_same_different_auc": {
230
+ "auc": 0.78280390304866,
231
+ "positive_pairs": 7703,
232
+ "negative_pairs": 327075,
233
+ "positive_mean": 0.7056915993491164,
234
+ "negative_mean": 0.5454881820918898
235
+ },
236
+ "same_subject_different_event_rejection_auc": {
237
+ "auc": 0.44637572176232837,
238
+ "positive_pairs": 7703,
239
+ "negative_pairs": 118115,
240
+ "positive_mean": 0.7056915993491164,
241
+ "negative_mean": 0.7219292155613277
242
+ },
243
+ "stale_same_source_rejection_auc": {
244
+ "auc": null,
245
+ "positive_pairs": 7703,
246
+ "negative_pairs": 0,
247
+ "positive_mean": 0.7056915993491164,
248
+ "negative_mean": null
249
+ },
250
+ "wrong_active_rejection_auc": {
251
+ "auc": null,
252
+ "positive_pairs": 7703,
253
+ "negative_pairs": 0,
254
+ "positive_mean": 0.7056915993491164,
255
+ "negative_mean": null
256
+ },
257
+ "topic_shift_rejection_auc": {
258
+ "auc": 0.9729705121252057,
259
+ "positive_pairs": 7703,
260
+ "negative_pairs": 208960,
261
+ "positive_mean": 0.7056915993491164,
262
+ "negative_mean": 0.4457545839475431
263
+ }
264
+ }
265
+ }
266
+ }
export_metadata.json ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "ESS-AIST-81M",
3
+ "base_family": "AIST-86M-compatible",
4
+ "output_dimension": 1536,
5
+ "slices": {
6
+ "semantic": [
7
+ 0,
8
+ 512
9
+ ],
10
+ "subject": [
11
+ 512,
12
+ 1024
13
+ ],
14
+ "event": [
15
+ 1024,
16
+ 1536
17
+ ]
18
+ },
19
+ "prefixes": {
20
+ "semantic_prefix": 512,
21
+ "semantic_subject_prefix": 1024,
22
+ "full_prefix": 1536
23
+ },
24
+ "normalized_views": {
25
+ "semantic_key": "l2norm(z[0:512])",
26
+ "subject_key": "l2norm(z[512:1024])",
27
+ "event_key": "l2norm(z[1024:1536])",
28
+ "full_key": "l2norm(z[0:1536])"
29
+ },
30
+ "supported_modalities": [
31
+ "text",
32
+ "image",
33
+ "audio"
34
+ ],
35
+ "normalization_behavior": {
36
+ "raw_embedding": "un-normalized 1536d vector",
37
+ "recommended_runtime": "L2-normalize per-slice or full vector depending on task"
38
+ },
39
+ "matryoshka_behavior": {
40
+ "512": "semantic retrieval",
41
+ "1024": "semantic plus subject continuity",
42
+ "1536": "semantic plus subject plus event continuity"
43
+ },
44
+ "optional_probes": [
45
+ "salience_score",
46
+ "novelty_score",
47
+ "boundary_score"
48
+ ]
49
+ }
manifest.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_id": "ESS-AIST-81M",
3
+ "trimodal_checkpoint": "/shared/augmem/triembed/checkpoints/ess_aist_full_v7_librispeech360_l4i/checkpoint_epoch_11.pt",
4
+ "audio_checkpoint": "/shared/augmem/triembed/checkpoints/mn20_native_lora_aistmix_audioheavy100k175k175k_continue_from_balanced_20260426T143137Z/latest_model.pt",
5
+ "safetensors": "/shared/augmem/triembed/dist/ESS-AIST-81M-preview/ESS-AIST-81M.safetensors",
6
+ "gguf": [
7
+ "/shared/augmem/triembed/dist/ESS-AIST-81M-preview/ESS-AIST-81M_q8_0.gguf",
8
+ "/shared/augmem/triembed/dist/ESS-AIST-81M-preview/ESS-AIST-81M_q5_1.gguf"
9
+ ]
10
+ }
parameter_breakdown.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "text_encoder": 22861056,
3
+ "image_encoder": 8434512,
4
+ "audio_encoder": 20639974,
5
+ "image_projection": 9975296,
6
+ "audio_projection": 9975296,
7
+ "text_projection": 8926720,
8
+ "total_exact_loaded_params": 80812854
9
+ }
prefix_eval.json ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "checkpoint": "/shared/augmem/triembed/checkpoints/ess_aist_full_v7_librispeech360_l4i/checkpoint_epoch_11.pt",
3
+ "split": "val",
4
+ "views": {
5
+ "semantic_key": {
6
+ "subject_same_different_auc": 0.4265240470767738,
7
+ "event_same_different_auc": 0.829112461248993,
8
+ "same_topic_different_subject_rejection_auc": 0.4265240470767738,
9
+ "same_subject_different_event_rejection_auc": 0.5802306316888112
10
+ },
11
+ "subject_key": {
12
+ "subject_same_different_auc": 0.5066875746523821,
13
+ "event_same_different_auc": 0.6676734827239529,
14
+ "same_topic_different_subject_rejection_auc": 0.5066875746523821,
15
+ "same_subject_different_event_rejection_auc": 0.1862661483021773
16
+ },
17
+ "event_key": {
18
+ "subject_same_different_auc": 0.3832485276953712,
19
+ "event_same_different_auc": 0.8240710674869262,
20
+ "same_topic_different_subject_rejection_auc": 0.3832485276953712,
21
+ "same_subject_different_event_rejection_auc": 0.5534970574958717
22
+ },
23
+ "full_key": {
24
+ "subject_same_different_auc": 0.42067046032727157,
25
+ "event_same_different_auc": 0.78280390304866,
26
+ "same_topic_different_subject_rejection_auc": 0.42067046032727157,
27
+ "same_subject_different_event_rejection_auc": 0.44637572176232837
28
+ },
29
+ "prefix_512": {
30
+ "subject_same_different_auc": 0.4265240470767738,
31
+ "event_same_different_auc": 0.829112461248993,
32
+ "same_topic_different_subject_rejection_auc": 0.4265240470767738,
33
+ "same_subject_different_event_rejection_auc": 0.5802306316888112
34
+ },
35
+ "prefix_1024": {
36
+ "subject_same_different_auc": 0.4690923681629257,
37
+ "event_same_different_auc": 0.7453008223026156,
38
+ "same_topic_different_subject_rejection_auc": 0.4690923681629257,
39
+ "same_subject_different_event_rejection_auc": 0.3570483627258597
40
+ },
41
+ "prefix_1536": {
42
+ "subject_same_different_auc": 0.42067046032727157,
43
+ "event_same_different_auc": 0.78280390304866,
44
+ "same_topic_different_subject_rejection_auc": 0.42067046032727157,
45
+ "same_subject_different_event_rejection_auc": 0.44637572176232837
46
+ }
47
+ }
48
+ }
retrieval_512_gt1030.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "SALT-512": {
3
+ "A->I_r1": 0.4828965961933136,
4
+ "A->I_r10": 0.8761752843856812,
5
+ "A->I_r5": 0.7863572835922241,
6
+ "A->T_r1": 0.24084816873073578,
7
+ "A->T_r10": 0.5153030753135681,
8
+ "A->T_r5": 0.45209044218063354,
9
+ "I->A_r1": 0.46209242939949036,
10
+ "I->A_r10": 0.881176233291626,
11
+ "I->A_r5": 0.7905581593513489,
12
+ "I->T_r1": 0.41488298773765564,
13
+ "I->T_r10": 0.5707141757011414,
14
+ "I->T_r5": 0.5401080250740051,
15
+ "T->A_r1": 0.2486497312784195,
16
+ "T->A_r10": 0.5323064923286438,
17
+ "T->A_r5": 0.46209242939949036,
18
+ "T->I_r1": 0.43268653750419617,
19
+ "T->I_r10": 0.5763152837753296,
20
+ "T->I_r5": 0.550710141658783
21
+ },
22
+ "_meta": {
23
+ "audio_suffix": "mn20_audioheavy_lora1280_audio_features",
24
+ "checkpoint": "/shared/augmem/triembed/checkpoints/ess_aist_full_v7_librispeech360_l4i/checkpoint_epoch_11.pt",
25
+ "device": "NVIDIA GeForce GT 1030",
26
+ "dims": [
27
+ 512
28
+ ],
29
+ "encoder_name": "mobilenetv4_conv_medium",
30
+ "image_suffix": "mobilenetv4_conv_medium_image_features"
31
+ },
32
+ "speech_chatterbox-512": {
33
+ "A->T_r1": 0.46719998121261597,
34
+ "A->T_r10": 0.824999988079071,
35
+ "A->T_r5": 0.739799976348877,
36
+ "T->A_r1": 0.46059998869895935,
37
+ "T->A_r10": 0.8277999758720398,
38
+ "T->A_r5": 0.7425999641418457
39
+ }
40
+ }
subject_eval.json ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "checkpoint": "/shared/augmem/triembed/checkpoints/ess_aist_full_v7_librispeech360_l4i/checkpoint_epoch_11.pt",
3
+ "split": "val",
4
+ "records_path": "/shared/augmem/triembed/checkpoints/ess_ait_86m_20260430T035907Z/ess_corpus_v7_subject_media_wit4096_speech100k_wavcaps100k_librispeech360/val.jsonl",
5
+ "views": {
6
+ "semantic_key": {
7
+ "subject_same_different_auc": {
8
+ "auc": 0.4265240470767738,
9
+ "positive_pairs": 160248,
10
+ "negative_pairs": 6805,
11
+ "positive_mean": 0.7436165443684017,
12
+ "negative_mean": 0.7611866422867968
13
+ },
14
+ "same_topic_different_subject_rejection_auc": {
15
+ "auc": 0.4265240470767738,
16
+ "positive_pairs": 160248,
17
+ "negative_pairs": 6805,
18
+ "positive_mean": 0.7436165443684017,
19
+ "negative_mean": 0.7611866422867968
20
+ }
21
+ },
22
+ "subject_key": {
23
+ "subject_same_different_auc": {
24
+ "auc": 0.5066875746523821,
25
+ "positive_pairs": 160248,
26
+ "negative_pairs": 6805,
27
+ "positive_mean": 0.7964573271532047,
28
+ "negative_mean": 0.7948588548339001
29
+ },
30
+ "same_topic_different_subject_rejection_auc": {
31
+ "auc": 0.5066875746523821,
32
+ "positive_pairs": 160248,
33
+ "negative_pairs": 6805,
34
+ "positive_mean": 0.7964573271532047,
35
+ "negative_mean": 0.7948588548339001
36
+ }
37
+ },
38
+ "event_key": {
39
+ "subject_same_different_auc": {
40
+ "auc": 0.3832485276953712,
41
+ "positive_pairs": 160248,
42
+ "negative_pairs": 6805,
43
+ "positive_mean": 0.6533856675037972,
44
+ "negative_mean": 0.7097943563909324
45
+ },
46
+ "same_topic_different_subject_rejection_auc": {
47
+ "auc": 0.3832485276953712,
48
+ "positive_pairs": 160248,
49
+ "negative_pairs": 6805,
50
+ "positive_mean": 0.6533856675037972,
51
+ "negative_mean": 0.7097943563909324
52
+ }
53
+ },
54
+ "full_key": {
55
+ "subject_same_different_auc": {
56
+ "auc": 0.42067046032727157,
57
+ "positive_pairs": 160248,
58
+ "negative_pairs": 6805,
59
+ "positive_mean": 0.7333961783599913,
60
+ "negative_mean": 0.754728921120172
61
+ },
62
+ "same_topic_different_subject_rejection_auc": {
63
+ "auc": 0.42067046032727157,
64
+ "positive_pairs": 160248,
65
+ "negative_pairs": 6805,
66
+ "positive_mean": 0.7333961783599913,
67
+ "negative_mean": 0.754728921120172
68
+ }
69
+ },
70
+ "prefix_512": {
71
+ "subject_same_different_auc": {
72
+ "auc": 0.4265240470767738,
73
+ "positive_pairs": 160248,
74
+ "negative_pairs": 6805,
75
+ "positive_mean": 0.7436165443684017,
76
+ "negative_mean": 0.7611866422867968
77
+ },
78
+ "same_topic_different_subject_rejection_auc": {
79
+ "auc": 0.4265240470767738,
80
+ "positive_pairs": 160248,
81
+ "negative_pairs": 6805,
82
+ "positive_mean": 0.7436165443684017,
83
+ "negative_mean": 0.7611866422867968
84
+ }
85
+ },
86
+ "prefix_1024": {
87
+ "subject_same_different_auc": {
88
+ "auc": 0.4690923681629257,
89
+ "positive_pairs": 160248,
90
+ "negative_pairs": 6805,
91
+ "positive_mean": 0.7721689081039962,
92
+ "negative_mean": 0.7791595536982812
93
+ },
94
+ "same_topic_different_subject_rejection_auc": {
95
+ "auc": 0.4690923681629257,
96
+ "positive_pairs": 160248,
97
+ "negative_pairs": 6805,
98
+ "positive_mean": 0.7721689081039962,
99
+ "negative_mean": 0.7791595536982812
100
+ }
101
+ },
102
+ "prefix_1536": {
103
+ "subject_same_different_auc": {
104
+ "auc": 0.42067046032727157,
105
+ "positive_pairs": 160248,
106
+ "negative_pairs": 6805,
107
+ "positive_mean": 0.7333961783599913,
108
+ "negative_mean": 0.754728921120172
109
+ },
110
+ "same_topic_different_subject_rejection_auc": {
111
+ "auc": 0.42067046032727157,
112
+ "positive_pairs": 160248,
113
+ "negative_pairs": 6805,
114
+ "positive_mean": 0.7333961783599913,
115
+ "negative_mean": 0.754728921120172
116
+ }
117
+ }
118
+ }
119
+ }